// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";

package google.cloud.dialogflow.v2;
19 import "google/api/annotations.proto";
20 import "google/cloud/dialogflow/v2/context.proto";
21 import "google/cloud/dialogflow/v2/intent.proto";
22 import "google/cloud/dialogflow/v2/session_entity_type.proto";
23 import "google/protobuf/struct.proto";
24 import "google/rpc/status.proto";
25 import "google/type/latlng.proto";
27 option cc_enable_arenas = true;
28 option csharp_namespace = "Google.Cloud.Dialogflow.V2";
29 option go_package = "google.golang.org/genproto/googleapis/cloud/dialogflow/v2;dialogflow";
30 option java_multiple_files = true;
31 option java_outer_classname = "SessionProto";
32 option java_package = "com.google.cloud.dialogflow.v2";
33 option objc_class_prefix = "DF";
// A session represents an interaction with a user. You retrieve user input
// and pass it to the
// [DetectIntent][google.cloud.dialogflow.v2.Sessions.DetectIntent] (or
// [StreamingDetectIntent][google.cloud.dialogflow.v2.Sessions.StreamingDetectIntent])
// method to determine user intent and respond.
service Sessions {
  // Processes a natural language query and returns structured, actionable data
  // as a result. This method is not idempotent, because it may cause contexts
  // and session entity types to be updated, which in turn might affect
  // results of future queries.
  rpc DetectIntent(DetectIntentRequest) returns (DetectIntentResponse) {
    option (google.api.http) = {
      post: "/v2/{session=projects/*/agent/sessions/*}:detectIntent"
      body: "*"
    };
  }
  // Processes a natural language query in audio format in a streaming fashion
  // and returns structured, actionable data as a result. This method is only
  // available via the gRPC API (not REST).
  rpc StreamingDetectIntent(stream StreamingDetectIntentRequest)
      returns (stream StreamingDetectIntentResponse);
}
// The request to detect a user's intent.
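//
// A minimal illustrative request in protobuf text format; the project,
// session, and query values below are hypothetical:
//
//     session: "projects/my-project/agent/sessions/1234567890"
//     query_input {
//       text {
//         text: "book a room"
//         language_code: "en-US"
//       }
//     }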
message DetectIntentRequest {
  // Required. The name of the session this query is sent to. Format:
  // `projects/<Project ID>/agent/sessions/<Session ID>`. It's up to the API
  // caller to choose an appropriate session ID. It can be a random number or
  // some type of user identifier (preferably hashed). The length of the
  // session ID must not exceed 36 bytes.
  string session = 1;

  // Optional. The parameters of this query.
  QueryParameters query_params = 2;

  // Required. The input specification. It can be set to:
  //
  // 1. an audio config
  //    which instructs the speech recognizer how to process the speech audio,
  //
  // 2. a conversational query in the form of text, or
  //
  // 3. an event that specifies which intent to trigger.
  QueryInput query_input = 3;

  // Optional. The natural language speech audio to be processed. This field
  // should be populated if and only if `query_input` is set to an input audio
  // config. A single request can contain up to 1 minute of speech audio data.
  bytes input_audio = 5;
}

// The message returned from the DetectIntent method.
message DetectIntentResponse {
  // The unique identifier of the response. It can be used to
  // locate a response in the training example set or for reporting issues.
  string response_id = 1;

  // The results of the conversational query or event processing.
  QueryResult query_result = 2;

  // Specifies the status of the webhook request. `webhook_status` is never
  // populated in webhook requests.
  google.rpc.Status webhook_status = 3;
}

// Represents the parameters of the conversational query.
message QueryParameters {
  // Optional. The time zone of this conversational query from the
  // [time zone database](https://www.iana.org/time-zones), e.g.,
  // America/New_York, Europe/Paris. If not provided, the time zone specified
  // in agent settings is used.
  string time_zone = 1;

  // Optional. The geo location of this conversational query.
  google.type.LatLng geo_location = 2;

  // Optional. The collection of contexts to be activated before this query is
  // executed.
  repeated Context contexts = 3;

  // Optional. Specifies whether to delete all contexts in the current session
  // before the new ones are activated.
  bool reset_contexts = 4;

  // Optional. The collection of session entity types to replace or extend
  // developer entities with for this query only. The entity synonyms apply
  // to all languages.
  repeated SessionEntityType session_entity_types = 5;

  // Optional. This field can be used to pass custom data into the webhook
  // associated with the agent. Arbitrary JSON objects are supported.
  google.protobuf.Struct payload = 6;
}

// Represents the query input. It can contain one of:
//
// 1. An audio config which
//    instructs the speech recognizer how to process the speech audio.
//
// 2. A conversational query in the form of text.
//
// 3. An event that specifies which intent to trigger.
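//
// A minimal illustrative event query in protobuf text format; the event name
// below is only an example:
//
//     event {
//       name: "welcome_event"
//       language_code: "en-US"
//     }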
message QueryInput {
  // Required. The input specification.
  oneof input {
    // Instructs the speech recognizer how to process the speech audio.
    InputAudioConfig audio_config = 1;

    // The natural language text to be processed.
    TextInput text = 2;

    // The event to be processed.
    EventInput event = 3;
  }
}

// Represents the result of a conversational query or event processing.
message QueryResult {
  // The original conversational query text:
  //
  // - If natural language text was provided as input, `query_text` contains
  //   a copy of the input.
  // - If natural language speech audio was provided as input, `query_text`
  //   contains the speech recognition result. If the speech recognizer
  //   produced multiple alternatives, a particular one is picked.
  // - If an event was provided as input, `query_text` is not set.
  string query_text = 1;

  // The language code that was used during intent detection.
  // See [Language Support](https://dialogflow.com/docs/reference/language)
  // for a list of the currently supported language codes.
  string language_code = 15;

  // The Speech recognition confidence between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. The default of 0.0 is a sentinel value indicating that the
  // confidence was not set.
  //
  // You should not rely on this field as it isn't guaranteed to be accurate,
  // or even set. In particular, this field isn't set in webhook calls and for
  // StreamingDetectIntent since the streaming endpoint has separate confidence
  // estimates per portion of the audio in StreamingRecognitionResult.
  float speech_recognition_confidence = 2;

  // The action name from the matched intent.
  string action = 3;

  // The collection of extracted parameters.
  google.protobuf.Struct parameters = 4;

  // This field is set to:
  //
  // - `false` if the matched intent has required parameters and not all of
  //   the required parameter values have been collected.
  // - `true` if all required parameter values have been collected, or if the
  //   matched intent doesn't contain any required parameters.
  bool all_required_params_present = 5;

  // The text to be pronounced to the user or shown on the screen.
  string fulfillment_text = 6;

  // The collection of rich messages to present to the user.
  repeated Intent.Message fulfillment_messages = 7;

  // If the query was fulfilled by a webhook call, this field is set to the
  // value of the `source` field returned in the webhook response.
  string webhook_source = 8;

  // If the query was fulfilled by a webhook call, this field is set to the
  // value of the `payload` field returned in the webhook response.
  google.protobuf.Struct webhook_payload = 9;

  // The collection of output contexts. If applicable,
  // `output_contexts.parameters` contains entries with name
  // `<parameter name>.original` containing the original parameter values
  // before the query.
  repeated Context output_contexts = 10;

  // The intent that matched the conversational query. Some, but not all,
  // fields are filled in this message, including but not limited to `name`,
  // `display_name` and `webhook_state`.
  Intent intent = 11;

  // The intent detection confidence. Values range from 0.0
  // (completely uncertain) to 1.0 (completely certain).
  float intent_detection_confidence = 12;

  // The free-form diagnostic info. For example, this field
  // could contain webhook call latency.
  google.protobuf.Struct diagnostic_info = 14;
}

// The top-level message sent by the client to the
// `StreamingDetectIntent` method.
//
// Multiple request messages should be sent in order:
//
// 1. The first message must contain `session`, `query_input` plus optionally
//    `query_params` and/or `single_utterance`. The message must not contain
//    `input_audio`.
//
// 2. If `query_input` was set to a streaming input audio config,
//    all subsequent messages must contain only `input_audio`.
//    Otherwise, finish the request stream.
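//
// An illustrative request sequence in protobuf text format, one message per
// block; the project, session, and audio values below are hypothetical:
//
//     # First message: configuration only, no audio.
//     session: "projects/my-project/agent/sessions/1234567890"
//     query_input {
//       audio_config {
//         audio_encoding: AUDIO_ENCODING_LINEAR_16
//         sample_rate_hertz: 16000
//         language_code: "en-US"
//       }
//     }
//     single_utterance: true
//
//     # Subsequent messages: audio chunks only.
//     input_audio: "<first chunk of audio bytes>"
//     input_audio: "<next chunk of audio bytes>"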
message StreamingDetectIntentRequest {
  // Required. The name of the session the query is sent to.
  // Format of the session name:
  // `projects/<Project ID>/agent/sessions/<Session ID>`. It's up to the API
  // caller to choose an appropriate <Session ID>. It can be a random number
  // or some type of user identifier (preferably hashed). The length of the
  // session ID must not exceed 36 characters.
  string session = 1;

  // Optional. The parameters of this query.
  QueryParameters query_params = 2;

  // Required. The input specification. It can be set to:
  //
  // 1. an audio config which instructs the speech recognizer how to process
  //    the speech audio,
  //
  // 2. a conversational query in the form of text, or
  //
  // 3. an event that specifies which intent to trigger.
  QueryInput query_input = 3;

  // Optional. If `false` (default), recognition does not cease until the
  // client closes the stream.
  // If `true`, the recognizer will detect a single spoken utterance in the
  // input audio. Recognition ceases when it detects that the audio's voice has
  // stopped or paused. In this case, once a detected intent is received, the
  // client should close the stream and start a new request with a new stream
  // as needed.
  // This setting is ignored when `query_input` is a piece of text or an event.
  bool single_utterance = 4;

  // Optional. The input audio content to be recognized. Must be sent if
  // `query_input` was set to a streaming input audio config. The complete
  // audio over all streaming messages must not exceed 1 minute.
  bytes input_audio = 6;
}

// The top-level message returned from the
// `StreamingDetectIntent` method.
//
// Multiple response messages can be returned in order:
//
// 1. If the input was set to streaming audio, the first one or more messages
//    contain `recognition_result`. Each `recognition_result` represents a
//    more complete transcript of what the user said. The last
//    `recognition_result` has `is_final` set to `true`.
//
// 2. The next message contains `response_id`, `query_result`
//    and optionally `webhook_status` if a webhook was called.
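//
// An illustrative response sequence in protobuf text format, one message per
// block; the transcript and identifier values below are hypothetical:
//
//     recognition_result { message_type: TRANSCRIPT transcript: "book a" }
//
//     recognition_result {
//       message_type: TRANSCRIPT
//       transcript: "book a room"
//       is_final: true
//     }
//
//     response_id: "a1b2c3d4"
//     query_result { query_text: "book a room" }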
message StreamingDetectIntentResponse {
  // The unique identifier of the response. It can be used to
  // locate a response in the training example set or for reporting issues.
  string response_id = 1;

  // The result of speech recognition.
  StreamingRecognitionResult recognition_result = 2;

  // The result of the conversational query or event processing.
  QueryResult query_result = 3;

  // Specifies the status of the webhook request.
  google.rpc.Status webhook_status = 4;
}

// Contains a speech recognition result corresponding to a portion of the audio
// that is currently being processed or an indication that this is the end
// of the single requested utterance.
//
// Example:
//
// 1. transcript: "tube"
//
// 2. transcript: "to be a"
//
// 3. transcript: "to be"
//
// 4. transcript: "to be or not to be"
//    is_final: true
//
// 5. transcript: " that's"
//
// 6. transcript: " that is"
//
// 7. message_type: `END_OF_SINGLE_UTTERANCE`
//
// 8. transcript: " that is the question"
//    is_final: true
//
// Only two of the responses contain final results (#4 and #8, indicated by
// `is_final: true`). Concatenating these generates the full transcript: "to be
// or not to be that is the question".
//
// In each response we populate:
//
// * for `TRANSCRIPT`: `transcript` and possibly `is_final`.
//
// * for `END_OF_SINGLE_UTTERANCE`: only `message_type`.
message StreamingRecognitionResult {
  // Type of the response message.
  enum MessageType {
    // Not specified. Should never be used.
    MESSAGE_TYPE_UNSPECIFIED = 0;

    // Message contains a (possibly partial) transcript.
    TRANSCRIPT = 1;

    // Event that indicates the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the server
    // will not process additional audio (although it may subsequently return
    // additional results). The client should stop sending additional audio
    // data, half-close the gRPC connection, and wait for any additional
    // results until the server closes the gRPC connection. This message is
    // only sent if `single_utterance` was set to `true`, and is not used
    // otherwise.
    END_OF_SINGLE_UTTERANCE = 2;
  }

  // Type of the result message.
  MessageType message_type = 1;

  // Transcript text representing the words that the user spoke.
  // Populated if and only if `message_type` = `TRANSCRIPT`.
  string transcript = 2;

  // If `false`, the `StreamingRecognitionResult` represents an
  // interim result that may change. If `true`, the recognizer will not return
  // any further hypotheses about this piece of the audio. May only be
  // populated for `message_type` = `TRANSCRIPT`.
  bool is_final = 3;

  // The Speech confidence between 0.0 and 1.0 for the current portion of
  // audio. A higher number indicates an estimated greater likelihood that the
  // recognized words are correct. The default of 0.0 is a sentinel value
  // indicating that confidence was not set.
  //
  // This field is typically only provided if `is_final` is true, and you
  // should not rely on it being accurate or even set.
  float confidence = 4;
}

// Instructs the speech recognizer how to process the audio content.
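//
// A minimal illustrative config in protobuf text format; the values below are
// hypothetical:
//
//     audio_encoding: AUDIO_ENCODING_LINEAR_16
//     sample_rate_hertz: 16000
//     language_code: "en-US"
//     phrase_hints: "book a room"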
message InputAudioConfig {
  // Required. Audio encoding of the audio content to process.
  AudioEncoding audio_encoding = 1;

  // Required. Sample rate (in Hertz) of the audio content sent in the query.
  // Refer to [Cloud Speech API documentation](/speech/docs/basics) for more
  // details.
  int32 sample_rate_hertz = 2;

  // Required. The language of the supplied audio. Dialogflow does not do
  // translations. See [Language
  // Support](https://dialogflow.com/docs/languages) for a list of the
  // currently supported language codes. Note that queries in the same session
  // do not necessarily need to specify the same language.
  string language_code = 3;

  // Optional. The collection of phrase hints which are used to boost accuracy
  // of speech recognition.
  // Refer to [Cloud Speech API documentation](/speech/docs/basics#phrase-hints)
  // for more details.
  repeated string phrase_hints = 4;
}

// Represents the natural language text to be processed.
message TextInput {
  // Required. The UTF-8 encoded natural language text to be processed.
  // Text length must not exceed 256 bytes.
  string text = 1;

  // Required. The language of this conversational query. See [Language
  // Support](https://dialogflow.com/docs/languages) for a list of the
  // currently supported language codes. Note that queries in the same session
  // do not necessarily need to specify the same language.
  string language_code = 2;
}

// Events allow for matching intents by event name instead of the natural
// language input. For instance, input `<event: { name: "welcome_event",
// parameters: { name: "Sam" } }>` can trigger a personalized welcome response.
// The parameter `name` may be used by the agent in the response:
// `"Hello #welcome_event.name! What can I do for you today?"`.
message EventInput {
  // Required. The unique identifier of the event.
  string name = 1;

  // Optional. The collection of parameters associated with the event.
  google.protobuf.Struct parameters = 2;

  // Required. The language of this query. See [Language
  // Support](https://dialogflow.com/docs/languages) for a list of the
  // currently supported language codes. Note that queries in the same session
  // do not necessarily need to specify the same language.
  string language_code = 3;
}

// Audio encoding of the audio content sent in the conversational query request.
// Refer to the [Cloud Speech API documentation](/speech/docs/basics) for more
// details.
enum AudioEncoding {
  // Not specified.
  AUDIO_ENCODING_UNSPECIFIED = 0;

  // Uncompressed 16-bit signed little-endian samples (Linear PCM).
  AUDIO_ENCODING_LINEAR_16 = 1;

  // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
  // Codec) is the recommended encoding because it is lossless (therefore
  // recognition is not compromised) and requires only about half the
  // bandwidth of `LINEAR16`. `FLAC` stream encoding supports 16-bit and
  // 24-bit samples; however, not all fields in `STREAMINFO` are supported.
  AUDIO_ENCODING_FLAC = 2;

  // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  AUDIO_ENCODING_MULAW = 3;

  // Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
  AUDIO_ENCODING_AMR = 4;

  // Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
  AUDIO_ENCODING_AMR_WB = 5;

  // Opus encoded audio frames in Ogg container
  // ([OggOpus](https://wiki.xiph.org/OggOpus)).
  // `sample_rate_hertz` must be 16000.
  AUDIO_ENCODING_OGG_OPUS = 6;

  // Although the use of lossy encodings is not recommended, if a very low
  // bitrate encoding is required, `OGG_OPUS` is highly preferred over
  // Speex encoding. The [Speex](https://speex.org/) encoding supported by
  // the Dialogflow API has a header byte in each block, as in MIME type
  // `audio/x-speex-with-header-byte`.
  // It is a variant of the RTP Speex encoding defined in
  // [RFC 5574](https://tools.ietf.org/html/rfc5574).
  // The stream is a sequence of blocks, one block per RTP packet. Each block
  // starts with a byte containing the length of the block, in bytes, followed
  // by one or more frames of Speex data, padded to an integral number of
  // bytes (octets) as specified in RFC 5574. In other words, each RTP header
  // is replaced with a single byte containing the block length. Only Speex
  // wideband is supported. `sample_rate_hertz` must be 16000.
  AUDIO_ENCODING_SPEEX_WITH_HEADER_BYTE = 7;
}