// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";

package google.assistant.embedded.v1alpha1;
import "google/api/annotations.proto";
import "google/rpc/status.proto";
option go_package = "google.golang.org/genproto/googleapis/assistant/embedded/v1alpha1;embedded";
option java_multiple_files = true;
option java_outer_classname = "AssistantProto";
option java_package = "com.google.assistant.embedded.v1alpha1";
// Service that implements Google Assistant API.
service EmbeddedAssistant {
  // Initiates or continues a conversation with the embedded assistant service.
  // Each call performs one round-trip, sending an audio request to the service
  // and receiving the audio response. Uses bidirectional streaming to receive
  // results, such as the `END_OF_UTTERANCE` event, while sending audio.
  //
  // A conversation is one or more gRPC connections, each consisting of several
  // streamed requests and responses.
  // For example, the user says *Add to my shopping list* and the assistant
  // responds *What do you want to add?*. The sequence of streamed requests and
  // responses in the first gRPC message could be:
  //
  // * ConverseRequest.config
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseResponse.event_type.END_OF_UTTERANCE
  // * ConverseResponse.result.microphone_mode.DIALOG_FOLLOW_ON
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  //
  // The user then says *bagels* and the assistant responds
  // *OK, I've added bagels to your shopping list*. This is sent as another gRPC
  // connection call to the `Converse` method, again with streamed requests and
  // responses, such as:
  //
  // * ConverseRequest.config
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseResponse.event_type.END_OF_UTTERANCE
  // * ConverseResponse.result.microphone_mode.CLOSE_MICROPHONE
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  //
  // Although the precise order of responses is not guaranteed, sequential
  // ConverseResponse.audio_out messages will always contain sequential portions
  // of audio.
  rpc Converse(stream ConverseRequest) returns (stream ConverseResponse);
}
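// The following sketch shows one way a Go client might drive a single
// `Converse` round-trip. It is illustrative only: it assumes stubs generated
// from this file under the declared `go_package` (names such as
// `NewEmbeddedAssistantClient` and the `ConverseRequest_Config` /
// `ConverseRequest_AudioIn` oneof wrappers follow the usual protoc-gen-go
// conventions) and an already-dialed `*grpc.ClientConn`. A production client
// would receive concurrently and stop sending audio once it observes the
// `END_OF_UTTERANCE` event.
//
//     import (
//         "context"
//         "io"
//
//         embedded "google.golang.org/genproto/googleapis/assistant/embedded/v1alpha1"
//         "google.golang.org/grpc"
//     )
//
//     func converseOnce(ctx context.Context, conn *grpc.ClientConn,
//         cfg *embedded.ConverseConfig, audioChunks <-chan []byte) error {
//         client := embedded.NewEmbeddedAssistantClient(conn)
//         stream, err := client.Converse(ctx)
//         if err != nil {
//             return err
//         }
//         // The first request carries only the config message.
//         if err := stream.Send(&embedded.ConverseRequest{
//             ConverseRequest: &embedded.ConverseRequest_Config{Config: cfg},
//         }); err != nil {
//             return err
//         }
//         // Every subsequent request carries only audio_in data.
//         for chunk := range audioChunks {
//             if err := stream.Send(&embedded.ConverseRequest{
//                 ConverseRequest: &embedded.ConverseRequest_AudioIn{AudioIn: chunk},
//             }); err != nil {
//                 return err
//             }
//         }
//         // Half-close the sending side, then drain responses until the
//         // server closes the stream.
//         if err := stream.CloseSend(); err != nil {
//             return err
//         }
//         for {
//             resp, err := stream.Recv()
//             if err == io.EOF {
//                 return nil
//             }
//             if err != nil {
//                 return err
//             }
//             _ = resp // Handle event_type, audio_out, result and error here.
//         }
//     }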
// Specifies how to process the `ConverseRequest` messages.
message ConverseConfig {
  // *Required* Specifies how to process the subsequent incoming audio.
  AudioInConfig audio_in_config = 1;

  // *Required* Specifies how to format the audio that will be returned.
  AudioOutConfig audio_out_config = 2;

  // *Required* Represents the current dialog state.
  ConverseState converse_state = 3;
}
// Specifies how to process the `audio_in` data that will be provided in
// subsequent requests. For recommended settings, see the Google Assistant SDK
// [best practices](https://developers.google.com/assistant/sdk/develop/grpc/best-practices/audio).
message AudioInConfig {
  // Audio encoding of the data sent in the audio message.
  // Audio must be one-channel (mono). The only language supported is "en-US".
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    // This encoding includes no header, only the raw audio bytes.
    LINEAR16 = 1;

    // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
    // Codec) is the recommended encoding because it is
    // lossless--therefore recognition is not compromised--and
    // requires only about half the bandwidth of `LINEAR16`. This encoding
    // includes the `FLAC` stream header followed by audio data. It supports
    // 16-bit and 24-bit samples, however, not all fields in `STREAMINFO` are
    // supported.
    FLAC = 2;
  }

  // *Required* Encoding of audio data sent in all `audio_in` messages.
  Encoding encoding = 1;

  // *Required* Sample rate (in Hertz) of the audio data sent in all `audio_in`
  // messages. Valid values are from 16000-24000, but 16000 is optimal.
  // For best results, set the sampling rate of the audio source to 16000 Hz.
  // If that's not possible, use the native sample rate of the audio source
  // (instead of re-sampling).
  int32 sample_rate_hertz = 2;
}
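// For example, a device capturing raw 16 kHz mono PCM might (in Go, using the
// generated types from this file; an illustrative sketch, not part of the API)
// configure the input as:
//
//     audioIn := &embedded.AudioInConfig{
//         Encoding:        embedded.AudioInConfig_LINEAR16,
//         SampleRateHertz: 16000,
//     }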
// Specifies the desired format for the server to use when it returns
// `audio_out` messages.
message AudioOutConfig {
  // Audio encoding of the data returned in the audio message. All encodings are
  // raw audio bytes with no header, except as indicated below.
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    LINEAR16 = 1;

    // MP3 audio encoding. The sample rate is encoded in the payload.
    MP3 = 2;

    // Opus-encoded audio wrapped in an ogg container. The result will be a
    // file which can be played natively on Android and in some browsers (such
    // as Chrome). The quality of the encoding is considerably higher than MP3
    // while using the same bitrate. The sample rate is encoded in the payload.
    OPUS_IN_OGG = 3;
  }

  // *Required* The encoding of audio data to be returned in all `audio_out`
  // messages.
  Encoding encoding = 1;

  // *Required* The sample rate in Hertz of the audio data returned in
  // `audio_out` messages. Valid values are: 16000-24000.
  int32 sample_rate_hertz = 2;

  // *Required* Current volume setting of the device's audio output.
  // Valid values are 1 to 100 (corresponding to 1% to 100%).
  int32 volume_percentage = 3;
}
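// For example, a client that plays Ogg Opus and is currently at half volume
// might (illustrative Go, generated types assumed) request:
//
//     audioOut := &embedded.AudioOutConfig{
//         Encoding:         embedded.AudioOutConfig_OPUS_IN_OGG,
//         SampleRateHertz:  16000,
//         VolumePercentage: 50,
//     }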
// Provides information about the current dialog state.
message ConverseState {
  // *Required* The `conversation_state` value returned in the prior
  // `ConverseResponse`. Omit (do not set the field) if there was no prior
  // `ConverseResponse`. If there was a prior `ConverseResponse`, do not omit
  // this field; doing so will end that conversation (and this new request will
  // start a new conversation).
  bytes conversation_state = 1;
}
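// A sketch of threading the state into the next request (illustrative Go;
// `audioIn` and `audioOut` are from the examples above, and `lastState` is
// whatever bytes were saved from the previous ConverseResult, left empty for a
// brand-new conversation -- in proto3 an empty bytes field is equivalent to
// omitting it):
//
//     cfg := &embedded.ConverseConfig{
//         AudioInConfig:  audioIn,
//         AudioOutConfig: audioOut,
//         ConverseState:  &embedded.ConverseState{ConversationState: lastState},
//     }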
// The audio containing the assistant's response to the query. Sequential chunks
// of audio data are received in sequential `ConverseResponse` messages.
message AudioOut {
  // *Output-only* The audio data containing the assistant's response to the
  // query. Sequential chunks of audio data are received in sequential
  // `ConverseResponse` messages.
  bytes audio_data = 1;
}
// The semantic result for the user's spoken query.
message ConverseResult {
  // Possible states of the microphone after a `Converse` RPC completes.
  enum MicrophoneMode {
    // No mode specified.
    MICROPHONE_MODE_UNSPECIFIED = 0;

    // The service is not expecting a follow-on question from the user.
    // The microphone should remain off until the user re-activates it.
    CLOSE_MICROPHONE = 1;

    // The service is expecting a follow-on question from the user. The
    // microphone should be re-opened when the `AudioOut` playback completes
    // (by starting a new `Converse` RPC call to send the new audio).
    DIALOG_FOLLOW_ON = 2;
  }

  // *Output-only* The recognized transcript of what the user said.
  string spoken_request_text = 1;

  // *Output-only* The text of the assistant's spoken response. This is only
  // returned for an IFTTT action.
  string spoken_response_text = 2;

  // *Output-only* State information for subsequent `ConverseRequest`. This
  // value should be saved in the client and returned in the
  // `conversation_state` with the next `ConverseRequest`. (The client does not
  // need to interpret or otherwise use this value.) There is no need to save
  // this information across device restarts.
  bytes conversation_state = 3;

  // *Output-only* Specifies the mode of the microphone after this `Converse`
  // RPC is processed.
  MicrophoneMode microphone_mode = 4;

  // *Output-only* Updated volume level. The value will be 0 or omitted
  // (indicating no change) unless a voice command such as "Increase the volume"
  // or "Set volume level 4" was recognized, in which case the value will be
  // between 1 and 100 (corresponding to the new volume level of 1% to 100%).
  // Typically, a client should use this volume level when playing the
  // `audio_out` data, and retain this value as the current volume level and
  // supply it in the `AudioOutConfig` of the next `ConverseRequest`. (Some
  // clients may also implement other ways to allow the current volume level to
  // be changed, for example, by providing a knob that the user can turn.)
  int32 volume_percentage = 5;
}
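// A sketch of acting on the result fields (illustrative Go; `currentVolume`
// and `lastState` are assumed client-side state):
//
//     if result.VolumePercentage != 0 {
//         currentVolume = result.VolumePercentage // also supply in the next AudioOutConfig
//     }
//     lastState = result.ConversationState // echo back via ConverseState next time
//     if result.MicrophoneMode == embedded.ConverseResult_DIALOG_FOLLOW_ON {
//         // Re-open the microphone by starting a new Converse RPC once
//         // audio_out playback has finished.
//     }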
// The top-level message sent by the client. Clients must send at least two, and
// typically numerous `ConverseRequest` messages. The first message must
// contain a `config` message and must not contain `audio_in` data. All
// subsequent messages must contain `audio_in` data and must not contain a
// `config` message.
message ConverseRequest {
  // Exactly one of these fields must be specified in each `ConverseRequest`.
  oneof converse_request {
    // The `config` message provides information to the recognizer that
    // specifies how to process the request.
    // The first `ConverseRequest` message must contain a `config` message.
    ConverseConfig config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are sent
    // in sequential `ConverseRequest` messages. The first `ConverseRequest`
    // message must not contain `audio_in` data and all subsequent
    // `ConverseRequest` messages must contain `audio_in` data. The audio bytes
    // must be encoded as specified in `AudioInConfig`.
    // Audio must be sent at approximately real-time (16000 samples per second).
    // An error will be returned if audio is sent significantly faster or
    // slower.
    bytes audio_in = 2;
  }
}
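// One way to keep the upload near real time (illustrative Go; assumes the
// `stream` and `embedded` names from the earlier sketch, the standard `time`
// package, and a hypothetical `split` helper that cuts the PCM buffer into
// fixed-size chunks): with 16-bit LINEAR16 audio at 16000 Hz, 3200 bytes hold
// 100 ms of audio, so send one such chunk per 100 ms tick:
//
//     ticker := time.NewTicker(100 * time.Millisecond)
//     defer ticker.Stop()
//     for _, chunk := range split(pcm, 3200) {
//         <-ticker.C
//         if err := stream.Send(&embedded.ConverseRequest{
//             ConverseRequest: &embedded.ConverseRequest_AudioIn{AudioIn: chunk},
//         }); err != nil {
//             return err
//         }
//     }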
// The top-level message received by the client. A series of one or more
// `ConverseResponse` messages are streamed back to the client.
message ConverseResponse {
  // Indicates the type of event.
  enum EventType {
    // No event specified.
    EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the server
    // will not process additional audio (although it may subsequently return
    // additional results). The client should stop sending additional audio
    // data, half-close the gRPC connection, and wait for any additional results
    // until the server closes the gRPC connection.
    END_OF_UTTERANCE = 1;
  }

  // Exactly one of these fields will be populated in each `ConverseResponse`.
  oneof converse_response {
    // *Output-only* If set, returns a [google.rpc.Status][google.rpc.Status]
    // message that specifies the error for the operation. If an error occurs
    // during processing, this message will be set and there will be no further
    // messages returned.
    google.rpc.Status error = 1;

    // *Output-only* Indicates the type of event.
    EventType event_type = 2;

    // *Output-only* The audio containing the assistant's response to the query.
    AudioOut audio_out = 3;

    // *Output-only* The semantic result for the user's spoken query.
    ConverseResult result = 5;
  }
}
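// A sketch of handling each streamed response (illustrative Go; the oneof
// getters follow protoc-gen-go conventions, `log` is the standard package, and
// `stopSendingAudio`, `play`, and `handleResult` are hypothetical client-side
// helpers):
//
//     if resp.GetEventType() == embedded.ConverseResponse_END_OF_UTTERANCE {
//         stopSendingAudio() // stop capture and half-close the stream
//     }
//     if audio := resp.GetAudioOut(); audio != nil {
//         play(audio.AudioData)
//     }
//     if result := resp.GetResult(); result != nil {
//         handleResult(result) // save conversation_state, check microphone_mode
//     }
//     if status := resp.GetError(); status != nil {
//         log.Printf("converse error: %v", status)
//     }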