API

The `libgspeech` library passes serialized protos across the C boundary. The fields of those protos are documented below.

The proto file is hosted at:

https://libgspeech.git.corp.google.com/libgspeech-resources
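
A minimal sketch of the proto side of that exchange, using the messages defined below. The generated header name follows from compiling libgspeech.proto with protoc; the language-pack path is illustrative, and the actual C entry points of `libgspeech` are not shown:

// Builds a `SpeechInitConfig` and serializes it to the raw bytes that are
// handed across the C boundary.
#include <string>

#include "libgspeech.pb.h"  // generated from libgspeech.proto

std::string BuildInitConfigBytes() {
  google::libgspeech::SpeechInitConfig config;

  // Describe the audio that will be pushed into Speech.
  auto* audio = config.mutable_audio_config();
  audio->set_sample_rate_hz(16000);
  audio->set_channel_count(1);
  audio->set_sample_type(
      google::libgspeech::SpeechInitConfig::AudioConfig::INT16);

  // Point the recognizer at a `LanguagePack` directory (path is illustrative).
  config.mutable_asr_config()->set_language_pack("/path/to/language_pack");

  // These bytes are what crosses the C boundary.
  return config.SerializeAsString();
}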


// libgspeech.proto defines the messages passed to and from `libgspeech`.

// Background:
//
// `libgspeech` is an engine that serves Speech `LanguagePack`(s). A
// `LanguagePack` contains configurations for a set of computational graphs and
// graph resources. `libgspeech.(so|dll|dylib)` is a library that links all the
// operations necessary to run a graph and a graph execution engine. Different
// graphs have different features, for example: `libgspeech` emits
// SpeechEvent::VADEvent(s) if it is compiled with VADEvent ops and is
// executing a graph that contains that feature.

// This background is meant to provide context as to why your `LanguagePack`
// might not have all the features seen in this proto. This proto covers the
// functionality of all possible graphs `libgspeech` can serve.

// The `libgspeech` binary is large if not tuned for a specific user, because
// it links in all ops for all possible graphs it might execute.

syntax = "proto2";

package google.libgspeech;

option java_multiple_files = true;
option java_package = "com.google.libgspeech";

// `SpeechInitConfig` configures the initialization of the Speech pipeline.
// Initialization is a heavy operation and should only be performed sparingly,
// for example when changing language.
message SpeechInitConfig {
  // `AudioConfig` contains information about what audio will be pushed into
  // Speech.
  message AudioConfig {
    // `SampleType` has information about the sample width and type.
    enum SampleType {
      UNKNOWN = 0;
      INT16 = 1;
      INT32 = 2;
      FLOAT16 = 3;
      FLOAT32 = 4;
    }

    // `sample_rate_hz` is the sampling frequency of the audio in hertz.
    optional int32 sample_rate_hz = 1;
    // `channel_count` is the number of channels in the audio.
    optional int32 channel_count = 2;
    optional SampleType sample_type = 3;
  }
  // `ASRConfig` configures the recognizer.
  message ASRConfig {
    // `language_pack` is a path to a directory holding an inference graph and
    // resources for ASR.
    optional string language_pack = 1;
  }

  // `LangIdConfig` configures the LangId pipeline. The LangId pipeline can
  // be used to determine the language spoken by a user.
  message LangIdConfig {
    // `language_pack` is a path to a directory holding an inference graph
    // and resources for LangId processing.
    optional string language_pack = 1;
  }

  optional ASRConfig asr_config = 1;
  optional AudioConfig audio_config = 2;
  optional LangIdConfig lang_id_config = 3;
}
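
// As a sketch, a `SpeechInitConfig` might look like this in text format
// (all values are illustrative):
//
//   audio_config {
//     sample_rate_hz: 16000
//     channel_count: 1
//     sample_type: INT16
//   }
//   asr_config {
//     language_pack: "/path/to/asr_language_pack"
//   }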

// `SpeechStartConfig` configures the start of the Speech pipeline.
message SpeechStartConfig {
  // Speech adaptation configuration to improve the accuracy of speech
  // recognition.
  message SpeechAdaptation {
    // A set of words or phrases that represents a common concept likely to
    // appear in your audio, for example a list of passenger ship names.
    // CustomClass items can be substituted into placeholders that you set in
    // PhraseSet phrases.
    message CustomClass {
      // A unique id to be referenced in phrases. Must match regex
      // "[A-Za-z0-9_]+" (quotes are not part of the regex).
      optional string custom_class_id = 1;
      // A collection of class items.
      repeated string items = 2;
    }

    // Phrase sets containing words and phrase "hints" so that the speech
    // recognizer is more likely to recognize them. This can be used to improve
    // the accuracy for specific words and phrases, for example, if specific
    // commands are typically spoken by the user. This can also be used to add
    // additional words to the vocabulary of the recognizer.
    //
    // List items can also include pre-built or custom classes containing groups
    // of words that represent common concepts that occur in natural language.
    // For example, rather than providing a phrase hint for every month of the
    // year (e.g. "i was born in january", "i was born in february", ...),
    // using the pre-built `$MONTH` class improves the likelihood of correctly
    // transcribing audio that includes months (e.g. "i was born in $MONTH"). To
    // refer to pre-built classes, use the class' symbol prepended with `$` e.g.
    // `$MONTH`. To refer to custom classes that were defined inline in the
    // request, set the class's `custom_class_id` to a unique string. Then use
    // the class' id wrapped in `${...}` e.g. "${my_months}".
    message PhraseSet {
      // A list of words and phrases.
      repeated string phrases = 1;
    }

    // A collection of custom classes. Refer to the defined class in phrase
    // hints by its unique `custom_class_id`.
    repeated CustomClass custom_classes = 1;

    // A collection of phrase sets that provides "hints" to the speech
    // recognizer to favor specific words and phrases in the results. Any
    // phrase set can use any custom class.
    repeated PhraseSet phrase_sets = 2;
  }

  optional SpeechAdaptation speech_adaptation = 1;
}
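
// As a sketch, a `SpeechStartConfig` that uses a custom class in a phrase
// hint might look like this in text format (ids and items are illustrative):
//
//   speech_adaptation {
//     custom_classes {
//       custom_class_id: "my_months"
//       items: "january"
//       items: "february"
//     }
//     phrase_sets {
//       phrases: "i was born in ${my_months}"
//     }
//   }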

// `SpeechEvent` is the message type in Google Speech responses.
message SpeechEvent {
  // `RecognitionEvent` holds transcription information.
  message RecognitionEvent {
    // Indicates why a result was finalized.
    enum IsFinalReason {
      UNSET = 0;
      // END_OF_SPEECH implies that `is_final` was emitted because the end of
      // human speech was detected. This can occur at an utterance boundary
      // or when a speaker stops speaking.
      END_OF_SPEECH = 1;
      // END_OF_AUDIO implies that the client marked the audio stream as
      // finished, which triggered a final response.
      END_OF_AUDIO = 2;
    }

    message Result {
      // Transcript result from the recognition engine.
      optional string transcript = 1;

      // Confidence the recognizer has in the transcript.
      // Currently, this is only provided for the transcript with the highest
      // likelihood.
      optional float confidence = 2;
    }

    // Recognition results from the Recognition engine. The results are ordered
    // by likelihood, where the 0th result has the highest likelihood.
    repeated Result results = 1;

    // Indicates that this result is finalized.
    // - A full transcript is the concatenation of all finalized results.
    // - If this field is false, modified versions of the result can
    // be emitted in the future.
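    // For example, the finalized results "how are you" and "doing" concatenate
    // to the full transcript "how are you doing".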
    optional bool is_final = 2;
    // Indicates why a result was finalized.
    optional IsFinalReason is_final_reason = 3;
  }

  // `VADEvent` holds voice activity information.
  message VADEvent {
    enum VADType {
      UNKNOWN = 0;
      // `START_OF_SPEECH` indicates that the classifier detected the start of
      // human speech.
      START_OF_SPEECH = 1;
      // `END_OF_SPEECH` indicates that the classifier detected the end of
      // human speech.
      END_OF_SPEECH = 2;
    }

    optional VADType vad_type = 1;
  }

  // `AudioEvent` holds audio information.
  message AudioEvent {
    enum AudioEventType {
      UNKNOWN = 0;
      // `END_OF_AUDIO` indicates that Speech is done processing the audio.
      END_OF_AUDIO = 2;
    }

    optional AudioEventType audio_event_type = 1;
  }

  // `LangIdEvent` holds results from the execution of the `LangId` graph.
  // If a LangId graph is executed, `libgspeech` periodically emits
  // `LangIdEvent`(s). A `LangIdEvent` provides the likelihood that
  // some chunk of audio is spoken in a certain language, expressed through
  // the confidence and the ordering of the languages.
  //
  // One use case aggregates all of the top language predictions. For a single
  // `LangIdEvent`, the top language prediction is the first entry in the
  // `results` list. The most prevalent top language prediction, weighted by
  // its confidence, can be used to predict the language spoken over some
  // interval.
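  // For example, if "en-us" is the top prediction in two successive events
  // with confidences 0.9 and 0.8, and "fr-fr" is the top prediction once with
  // confidence 0.6, then "en-us" is the stronger prediction for that interval.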
  message LangIdEvent {
    message Result {
      // Language recognized by the LangId model.
      optional string language = 1;
      // Confidence that the recognized language is correct.
      optional float confidence = 2;
    }

    // Results are ordered from the highest likelihood to the lowest. Currently,
    // only the highest-likelihood result contains a confidence value.
    repeated Result results = 1;
  }

  // Each SpeechEvent is one of the following events:
  oneof speech_event {
    RecognitionEvent recognition_event = 1;
    VADEvent vad_event = 2;
    AudioEvent audio_event = 3;
    LangIdEvent lang_id_event = 4;
  }
}

// Polling for Speech events yields this message, holding all available
// Speech events.
message SpeechEvents {
  repeated SpeechEvent speech_events = 1;
}
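
Polling returns these messages as serialized bytes. A minimal sketch of decoding them and dispatching on the `speech_event` oneof, assuming standard protoc-generated C++ code from libgspeech.proto (acquisition of the buffer from the library is not shown):

// Parses a polled buffer into `SpeechEvents` and handles each event.
#include <cstddef>
#include <iostream>

#include "libgspeech.pb.h"  // generated from libgspeech.proto

void HandleSpeechEvents(const void* data, size_t size) {
  google::libgspeech::SpeechEvents events;
  if (!events.ParseFromArray(data, static_cast<int>(size))) {
    return;  // malformed buffer
  }
  for (const auto& event : events.speech_events()) {
    switch (event.speech_event_case()) {
      case google::libgspeech::SpeechEvent::kRecognitionEvent: {
        const auto& recognition = event.recognition_event();
        // The 0th result has the highest likelihood.
        if (recognition.results_size() > 0) {
          std::cout << recognition.results(0).transcript()
                    << (recognition.is_final() ? " [final]" : "") << "\n";
        }
        break;
      }
      case google::libgspeech::SpeechEvent::kVadEvent:
      case google::libgspeech::SpeechEvent::kAudioEvent:
      case google::libgspeech::SpeechEvent::kLangIdEvent:
      default:
        break;  // other event types elided in this sketch
    }
  }
}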