libgspeech
The library passes serialized protos across the C boundary; the proto fields are documented below.
Proto file hosting location
https://libgspeech.git.corp.google.com/libgspeech-resources
// libgspeech.proto defines the messages passed to and from `libgspeech`.
// Background:
//
// `libgspeech` is an engine that serves Speech `LanguagePack`(s). A
// `LanguagePack` contains configurations for a set of computational graphs and
// graph resources. `libgspeech.(so|dll|dylib)` is a library that links all the
// operations necessary to run a graph and a graph execution engine. Different
// graphs have different features; for example, `libgspeech` emits
// SpeechEvent::VADEvent(s) if it is compiled with the VADEvent ops and is
// executing a graph that contains that feature.
// This background is meant to provide context as to why your `LanguagePack`
// might not have all the features seen in this proto. This proto covers the
// functionality of all possible graphs `libgspeech` can serve.
// The `libgspeech` binary is large if not tuned for a specific user, because
// it links in all the ops for every graph it might execute.
syntax = "proto2";
package google.libgspeech;
option java_multiple_files = true;
option java_package = "com.google.libgspeech";
// `SpeechInitConfig` configures the initialization of the Speech pipeline.
// Initialization is a heavy operation and should be done sparingly,
// for example when changing language.
message SpeechInitConfig {
// `AudioConfig` contains information about what audio will be pushed into
// Speech.
message AudioConfig {
// `SampleType` has information about the sample width and type.
enum SampleType {
UNKNOWN = 0;
INT16 = 1;
INT32 = 2;
FLOAT16 = 3;
FLOAT32 = 4;
}
// `sample_rate_hz` is the sampling frequency of the audio in hertz.
optional int32 sample_rate_hz = 1;
// `channel_count` is the number of channels in the audio.
optional int32 channel_count = 2;
optional SampleType sample_type = 3;
}
// `ASRConfig` configures the recognizer.
message ASRConfig {
// `language_pack` is a path to a directory holding an inference graph and
// resources for ASR.
optional string language_pack = 1;
}
// `LangIdConfig` configures the LangId pipeline. The LangId pipeline can
// be used to determine the language spoken by a user.
message LangIdConfig {
// `language_pack` is a path to a directory holding an inference graph
// and resources for LangId processing.
optional string language_pack = 1;
}
optional ASRConfig asr_config = 1;
optional AudioConfig audio_config = 2;
optional LangIdConfig lang_id_config = 3;
}
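// An illustrative `SpeechInitConfig` in text-format proto, initializing ASR
// and LangId for 16 kHz mono 16-bit audio (the `language_pack` paths are
// hypothetical placeholders, not real `LanguagePack` locations):
//
//   asr_config { language_pack: "/path/to/asr_language_pack" }
//   audio_config {
//     sample_rate_hz: 16000
//     channel_count: 1
//     sample_type: INT16
//   }
//   lang_id_config { language_pack: "/path/to/langid_language_pack" }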
// `SpeechStartConfig` configures the start of the Speech pipeline.
message SpeechStartConfig {
// Speech adaptation configuration to improve the accuracy of speech
// recognition.
message SpeechAdaptation {
// A set of words or phrases that represents a common concept likely to
// appear in your audio, for example a list of passenger ship names.
// CustomClass items can be substituted into placeholders that you set in
// PhraseSet phrases.
message CustomClass {
// A unique id to be referenced in phrases. Must match regex
// "[A-Za-z0-9_]+" (quotes are not part of the regex).
optional string custom_class_id = 1;
// A collection of class items.
repeated string items = 2;
}
// Phrase sets containing words and phrase "hints" so that the speech
// recognition is more likely to recognize them. This can be used to improve
// the accuracy for specific words and phrases, for example, if specific
// commands are typically spoken by the user. This can also be used to add
// additional words to the vocabulary of the recognizer.
//
// List items can also include pre-built or custom classes containing groups
// of words that represent common concepts that occur in natural language.
// For example, rather than providing a phrase hint for every month of the
// year (e.g. "i was born in january", "i was born in february", ...), using
// the pre-built `$MONTH` class improves the likelihood of correctly
// transcribing audio that includes months (e.g. "i was born in $MONTH"). To
// refer to pre-built classes, use the class' symbol prepended with `$`, e.g.
// `$MONTH`. To refer to custom classes that were defined inline in the
// request, set the class's `custom_class_id` to a unique string. Then use
// the class' id wrapped in `${...}`, e.g. "${my_months}".
message PhraseSet {
// A list of words and phrases.
repeated string phrases = 1;
}
// A collection of custom classes. Refer to the defined class in phrase
// hints by its unique `custom_class_id`.
repeated CustomClass custom_classes = 1;
// A collection of phrase sets that provides "hints" to the speech recognizer
// to favor specific words and phrases in the results. Any phrase set can
// use any custom class.
repeated PhraseSet phrase_sets = 2;
}
optional SpeechAdaptation speech_adaptation = 1;
}
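// An illustrative `SpeechStartConfig` in text-format proto, defining a custom
// class and a phrase set that references it by id (the id, items, and phrase
// are made-up examples):
//
//   speech_adaptation {
//     custom_classes {
//       custom_class_id: "my_months"
//       items: "january"
//       items: "february"
//     }
//     phrase_sets { phrases: "i was born in ${my_months}" }
//   }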
// `SpeechEvent` is the message type in Google Speech responses.
message SpeechEvent {
// `RecognitionEvent` holds transcription information.
message RecognitionEvent {
// Gives an indication as to why a result was finalized.
enum IsFinalReason {
UNSET = 0;
// END_OF_SPEECH implies that `is_final` was emitted because the end of
// human speech was detected. This can occur at an utterance boundary
// or when a speaker stops speaking.
END_OF_SPEECH = 1;
// END_OF_AUDIO implies that the client marked the audio stream as
// finished, which triggered a final response.
END_OF_AUDIO = 2;
}
message Result {
// Transcript result from the recognition engine.
optional string transcript = 1;
// Confidence the recognizer has in the transcript.
// Currently, this is only provided for the transcript with the highest
// likelihood.
optional float confidence = 2;
}
// Recognition results from the recognition engine. The results are ordered
// by likelihood, with the 0th result having the highest likelihood.
repeated Result results = 1;
// Indicates that this result is finalized.
// - A full transcript is the concatenation of all finalized results.
// - If this field is false, modified versions of the result can
// be emitted in the future.
optional bool is_final = 2;
// Indicates why a result was finalized.
optional IsFinalReason is_final_reason = 3;
}
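// An illustrative sequence of `SpeechEvent`(s) carrying `RecognitionEvent`(s),
// in text-format proto: a partial result followed by its finalized revision.
// The full transcript is the concatenation of all finalized results (the
// transcripts and confidence are made-up):
//
//   recognition_event {
//     results { transcript: "hello wor" }
//     is_final: false
//   }
//   recognition_event {
//     results { transcript: "hello world" confidence: 0.92 }
//     is_final: true
//     is_final_reason: END_OF_SPEECH
//   }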
// `VADEvent` holds voice activity information.
message VADEvent {
enum VADType {
UNKNOWN = 0;
// `START_OF_SPEECH` indicates that the classifier detected the start of
// human speech.
START_OF_SPEECH = 1;
// `END_OF_SPEECH` indicates that the classifier detected the end of
// human speech.
END_OF_SPEECH = 2;
}
optional VADType vad_type = 1;
}
// `AudioEvent` holds audio information.
message AudioEvent {
enum AudioEventType {
UNKNOWN = 0;
// `END_OF_AUDIO` indicates that Speech is done processing the audio.
END_OF_AUDIO = 2;
}
optional AudioEventType audio_event_type = 1;
}
// `LangIdEvent` holds results from the execution of the `LangId` graph.
// If a LangId graph is executed, `libgspeech` periodically emits
// `LangIdEvent`(s). A `LangIdEvent` provides the likelihood that
// some chunk of audio is spoken in a certain language, expressed through
// the confidence and the ordering of the languages.
//
// One use case aggregates the top language predictions across events. For
// a single `LangIdEvent`, the top language prediction is the first entry
// in the `results` list. The most prevalent top language prediction,
// weighted by its confidence, can be used to predict the language spoken
// over some interval.
message LangIdEvent {
message Result {
// Language recognized by the LangId model.
optional string language = 1;
// Confidence that the recognized language is correct.
optional float confidence = 2;
}
// Results are ordered from the highest likelihood to the lowest. Currently,
// only the highest-likelihood result contains a confidence value.
repeated Result results = 1;
}
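// An illustrative `SpeechEvent` carrying a `LangIdEvent`, in text-format
// proto. The top prediction is the first entry in `results`, and only it
// carries a confidence (the language codes and values are made-up):
//
//   lang_id_event {
//     results { language: "en-us" confidence: 0.93 }
//     results { language: "de-de" }
//   }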
// Each SpeechEvent is one of the following events:
oneof speech_event {
RecognitionEvent recognition_event = 1;
VADEvent vad_event = 2;
AudioEvent audio_event = 3;
LangIdEvent lang_id_event = 4;
}
}
// Polling for Speech events yields this message, holding all available
// Speech events.
message SpeechEvents {
repeated SpeechEvent speech_events = 1;
}
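// An illustrative `SpeechEvents` batch, as a single poll might return it, in
// text-format proto (the event contents are made-up):
//
//   speech_events { vad_event { vad_type: START_OF_SPEECH } }
//   speech_events {
//     recognition_event {
//       results { transcript: "hello" }
//       is_final: false
//     }
//   }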