// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";

package google.cloud.dialogflow.v2;
19 import "google/api/annotations.proto";
20 import "google/cloud/dialogflow/v2/context.proto";
21 import "google/cloud/dialogflow/v2/intent.proto";
22 import "google/cloud/dialogflow/v2/session_entity_type.proto";
23 import "google/protobuf/struct.proto";
24 import "google/rpc/status.proto";
25 import "google/type/latlng.proto";
27 option cc_enable_arenas = true;
28 option csharp_namespace = "Google.Cloud.Dialogflow.V2";
29 option go_package = "google.golang.org/genproto/googleapis/cloud/dialogflow/v2;dialogflow";
30 option java_multiple_files = true;
31 option java_outer_classname = "SessionProto";
32 option java_package = "com.google.cloud.dialogflow.v2";
33 option objc_class_prefix = "DF";
// A session represents an interaction with a user. You retrieve user input
// and pass it to the
// [DetectIntent][google.cloud.dialogflow.v2.Sessions.DetectIntent] (or
// [StreamingDetectIntent][google.cloud.dialogflow.v2.Sessions.StreamingDetectIntent])
// method to determine user intent and respond.
service Sessions {
  // Processes a natural language query and returns structured, actionable data
  // as a result. This method is not idempotent, because it may cause contexts
  // and session entity types to be updated, which in turn might affect
  // results of future queries.
  rpc DetectIntent(DetectIntentRequest) returns (DetectIntentResponse) {
    option (google.api.http) = {
      post: "/v2/{session=projects/*/agent/sessions/*}:detectIntent"
      body: "*"
    };
  }
  // Processes a natural language query in audio format in a streaming fashion
  // and returns structured, actionable data as a result. This method is only
  // available via the gRPC API (not REST).
  rpc StreamingDetectIntent(stream StreamingDetectIntentRequest)
      returns (stream StreamingDetectIntentResponse);
}
// The request to detect a user's intent.
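//
// A minimal illustrative request in protobuf text format; the project,
// session, and query values below are hypothetical:
//
//     session: "projects/my-project/agent/sessions/1234567890"
//     query_input {
//       text {
//         text: "book a room"
//         language_code: "en-US"
//       }
//     }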
message DetectIntentRequest {
  // Required. The name of the session this query is sent to. Format:
  // `projects/<Project ID>/agent/sessions/<Session ID>`. It's up to the API
  // caller to choose an appropriate session ID. It can be a random number or
  // some type of user identifier (preferably hashed). The length of the
  // session ID must not exceed 36 bytes.
  string session = 1;

  // Optional. The parameters of this query.
  QueryParameters query_params = 2;

  // Required. The input specification. It can be set to:
  //
  // 1. an audio config
  //    which instructs the speech recognizer how to process the speech audio,
  //
  // 2. a conversational query in the form of text, or
  //
  // 3. an event that specifies which intent to trigger.
  QueryInput query_input = 3;

  // Optional. The natural language speech audio to be processed. This field
  // should be populated if and only if `query_input` is set to an input audio
  // config. A single request can contain up to 1 minute of speech audio data.
  bytes input_audio = 5;
}

// The message returned from the DetectIntent method.
message DetectIntentResponse {
  // The unique identifier of the response. It can be used to
  // locate a response in the training example set or for reporting issues.
  string response_id = 1;

  // The results of the conversational query or event processing.
  QueryResult query_result = 2;

  // Specifies the status of the webhook request. `webhook_status` is never
  // populated in webhook requests.
  google.rpc.Status webhook_status = 3;
}

// Represents the parameters of the conversational query.
message QueryParameters {
  // Optional. The time zone of this conversational query from the
  // [time zone database](https://www.iana.org/time-zones), e.g.,
  // America/New_York, Europe/Paris. If not provided, the time zone specified
  // in agent settings is used.
  string time_zone = 1;

  // Optional. The geo location of this conversational query.
  google.type.LatLng geo_location = 2;

  // Optional. The collection of contexts to be activated before this query is
  // executed.
  repeated Context contexts = 3;

  // Optional. Specifies whether to delete all contexts in the current session
  // before the new ones are activated.
  bool reset_contexts = 4;

  // Optional. The collection of session entity types to replace or extend
  // developer entities with for this query only. The entity synonyms apply
  // to all languages.
  repeated SessionEntityType session_entity_types = 5;

  // Optional. This field can be used to pass custom data into the webhook
  // associated with the agent. Arbitrary JSON objects are supported.
  google.protobuf.Struct payload = 6;
}

// Represents the query input. It can contain one of:
//
// 1. An audio config which
//    instructs the speech recognizer how to process the speech audio.
//
// 2. A conversational query in the form of text.
//
// 3. An event that specifies which intent to trigger.
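//
// A minimal illustrative event query in protobuf text format; the event name
// below is only an example:
//
//     event {
//       name: "welcome_event"
//       language_code: "en-US"
//     }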
message QueryInput {
  // Required. The input specification.
  oneof input {
    // Instructs the speech recognizer how to process the speech audio.
    InputAudioConfig audio_config = 1;

    // The natural language text to be processed.
    TextInput text = 2;

    // The event to be processed.
    EventInput event = 3;
  }
}

// Represents the result of a conversational query or event processing.
message QueryResult {
  // The original conversational query text:
  //
  // - If natural language text was provided as input, `query_text` contains
  //   a copy of the input.
  // - If natural language speech audio was provided as input, `query_text`
  //   contains the speech recognition result. If the speech recognizer
  //   produced multiple alternatives, a particular one is picked.
  // - If an event was provided as input, `query_text` is not set.
  string query_text = 1;

  // The language code that was used during intent detection.
  // See [Language Support](https://dialogflow.com/docs/reference/language)
  // for a list of the currently supported language codes.
  string language_code = 15;

  // The Speech recognition confidence between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. The default of 0.0 is a sentinel value indicating that the
  // confidence was not set.
  //
  // You should not rely on this field as it isn't guaranteed to be accurate,
  // or even set. In particular, this field isn't set in webhook calls and for
  // StreamingDetectIntent since the streaming endpoint has separate confidence
  // estimates per portion of the audio in StreamingRecognitionResult.
  float speech_recognition_confidence = 2;

  // The action name from the matched intent.
  string action = 3;

  // The collection of extracted parameters.
  google.protobuf.Struct parameters = 4;

  // This field is set to:
  //
  // - `false` if the matched intent has required parameters and not all of
  //   the required parameter values have been collected.
  // - `true` if all required parameter values have been collected, or if the
  //   matched intent doesn't contain any required parameters.
  bool all_required_params_present = 5;

  // The text to be pronounced to the user or shown on the screen.
  string fulfillment_text = 6;

  // The collection of rich messages to present to the user.
  repeated Intent.Message fulfillment_messages = 7;

  // If the query was fulfilled by a webhook call, this field is set to the
  // value of the `source` field returned in the webhook response.
  string webhook_source = 8;

  // If the query was fulfilled by a webhook call, this field is set to the
  // value of the `payload` field returned in the webhook response.
  google.protobuf.Struct webhook_payload = 9;

  // The collection of output contexts. If applicable,
  // `output_contexts.parameters` contains entries with name
  // `<parameter name>.original` containing the original parameter values
  // before the query.
  repeated Context output_contexts = 10;

  // The intent that matched the conversational query. Some, but not all,
  // fields are filled in this message, including but not limited to `name`,
  // `display_name` and `webhook_state`.
  Intent intent = 11;

  // The intent detection confidence. Values range from 0.0
  // (completely uncertain) to 1.0 (completely certain).
  float intent_detection_confidence = 12;

  // The free-form diagnostic info. For example, this field
  // could contain webhook call latency.
  google.protobuf.Struct diagnostic_info = 14;
}

// The top-level message sent by the client to the
// `StreamingDetectIntent` method.
//
// Multiple request messages should be sent in order:
//
// 1. The first message must contain `session`, `query_input` plus optionally
//    `query_params` and/or `single_utterance`. The message must not contain
//    `input_audio`.
//
// 2. If `query_input` was set to a streaming input audio config,
//    all subsequent messages must contain only `input_audio`.
//    Otherwise, finish the request stream.
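//
// An illustrative request sequence in protobuf text format, one message per
// block; the project, session, and audio values below are hypothetical:
//
//     # First message: configuration only, no audio.
//     session: "projects/my-project/agent/sessions/1234567890"
//     query_input {
//       audio_config {
//         audio_encoding: AUDIO_ENCODING_LINEAR_16
//         sample_rate_hertz: 16000
//         language_code: "en-US"
//       }
//     }
//     single_utterance: true
//
//     # Subsequent messages: audio chunks only.
//     input_audio: "<first chunk of audio bytes>"
//     input_audio: "<next chunk of audio bytes>"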
message StreamingDetectIntentRequest {
  // Required. The name of the session the query is sent to.
  // Format of the session name:
  // `projects/<Project ID>/agent/sessions/<Session ID>`. It's up to the API
  // caller to choose an appropriate <Session ID>. It can be a random number
  // or some type of user identifier (preferably hashed). The length of the
  // session ID must not exceed 36 characters.
  string session = 1;

  // Optional. The parameters of this query.
  QueryParameters query_params = 2;

  // Required. The input specification. It can be set to:
  //
  // 1. an audio config which instructs the speech recognizer how to process
  //    the speech audio,
  //
  // 2. a conversational query in the form of text, or
  //
  // 3. an event that specifies which intent to trigger.
  QueryInput query_input = 3;

  // Optional. If `false` (default), recognition does not cease until the
  // client closes the stream.
  // If `true`, the recognizer will detect a single spoken utterance in the
  // input audio. Recognition ceases when it detects that the audio's voice has
  // stopped or paused. In this case, once a detected intent is received, the
  // client should close the stream and start a new request with a new stream
  // as needed.
  // This setting is ignored when `query_input` is a piece of text or an event.
  bool single_utterance = 4;

  // Optional. The input audio content to be recognized. Must be sent if
  // `query_input` was set to a streaming input audio config. The complete
  // audio over all streaming messages must not exceed 1 minute.
  bytes input_audio = 6;
}

// The top-level message returned from the
// `StreamingDetectIntent` method.
//
// Multiple response messages can be returned in order:
//
// 1. If the input was set to streaming audio, the first one or more messages
//    contain `recognition_result`. Each `recognition_result` represents a
//    more complete transcript of what the user said. The last
//    `recognition_result` has `is_final` set to `true`.
//
// 2. The next message contains `response_id`, `query_result`
//    and optionally `webhook_status` if a webhook was called.
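//
// An illustrative response sequence in protobuf text format, one message per
// block; the transcript and identifier values below are hypothetical:
//
//     recognition_result { message_type: TRANSCRIPT transcript: "book a" }
//
//     recognition_result {
//       message_type: TRANSCRIPT
//       transcript: "book a room"
//       is_final: true
//     }
//
//     response_id: "a1b2c3d4"
//     query_result { query_text: "book a room" }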
message StreamingDetectIntentResponse {
  // The unique identifier of the response. It can be used to
  // locate a response in the training example set or for reporting issues.
  string response_id = 1;

  // The result of speech recognition.
  StreamingRecognitionResult recognition_result = 2;

  // The result of the conversational query or event processing.
  QueryResult query_result = 3;

  // Specifies the status of the webhook request.
  google.rpc.Status webhook_status = 4;
}

// Contains a speech recognition result corresponding to a portion of the audio
// that is currently being processed or an indication that this is the end
// of the single requested utterance.
//
// Example:
//
// 1. transcript: "tube"
//
// 2. transcript: "to be a"
//
// 3. transcript: "to be"
//
// 4. transcript: "to be or not to be"
//    is_final: true
//
// 5. transcript: " that's"
//
// 6. transcript: " that is"
//
// 7. message_type: `END_OF_SINGLE_UTTERANCE`
//
// 8. transcript: " that is the question"
//    is_final: true
//
// Only two of the responses contain final results (#4 and #8, indicated by
// `is_final: true`). Concatenating these generates the full transcript: "to be
// or not to be that is the question".
//
// In each response we populate:
//
// * for `TRANSCRIPT`: `transcript` and possibly `is_final`.
//
// * for `END_OF_SINGLE_UTTERANCE`: only `message_type`.
message StreamingRecognitionResult {
  // Type of the response message.
  enum MessageType {
    // Not specified. Should never be used.
    MESSAGE_TYPE_UNSPECIFIED = 0;

    // Message contains a (possibly partial) transcript.
    TRANSCRIPT = 1;

    // Event that indicates the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the server
    // will not process additional audio (although it may subsequently return
    // additional results). The client should stop sending additional audio
    // data, half-close the gRPC connection, and wait for any additional
    // results until the server closes the gRPC connection. This message is
    // only sent if `single_utterance` was set to `true`, and is not used
    // otherwise.
    END_OF_SINGLE_UTTERANCE = 2;
  }

  // Type of the result message.
  MessageType message_type = 1;

  // Transcript text representing the words that the user spoke.
  // Populated if and only if `message_type` = `TRANSCRIPT`.
  string transcript = 2;

  // If `false`, the `StreamingRecognitionResult` represents an
  // interim result that may change. If `true`, the recognizer will not return
  // any further hypotheses about this piece of the audio. May only be
  // populated for `message_type` = `TRANSCRIPT`.
  bool is_final = 3;

  // The Speech confidence between 0.0 and 1.0 for the current portion of
  // audio. A higher number indicates an estimated greater likelihood that the
  // recognized words are correct. The default of 0.0 is a sentinel value
  // indicating that confidence was not set.
  //
  // This field is typically only provided if `is_final` is true, and you
  // should not rely on it being accurate or even set.
  float confidence = 4;
}

// Instructs the speech recognizer how to process the audio content.
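//
// A minimal illustrative config in protobuf text format; the values below are
// hypothetical:
//
//     audio_encoding: AUDIO_ENCODING_LINEAR_16
//     sample_rate_hertz: 16000
//     language_code: "en-US"
//     phrase_hints: "book a room"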
message InputAudioConfig {
  // Required. Audio encoding of the audio content to process.
  AudioEncoding audio_encoding = 1;

  // Required. Sample rate (in Hertz) of the audio content sent in the query.
  // Refer to [Cloud Speech API documentation](/speech/docs/basics) for more
  // details.
  int32 sample_rate_hertz = 2;

  // Required. The language of the supplied audio. Dialogflow does not do
  // translations. See [Language
  // Support](https://dialogflow.com/docs/languages) for a list of the
  // currently supported language codes. Note that queries in the same session
  // do not necessarily need to specify the same language.
  string language_code = 3;

  // Optional. The collection of phrase hints which are used to boost accuracy
  // of speech recognition.
  // Refer to [Cloud Speech API documentation](/speech/docs/basics#phrase-hints)
  // for more details.
  repeated string phrase_hints = 4;
}

// Represents the natural language text to be processed.
message TextInput {
  // Required. The UTF-8 encoded natural language text to be processed.
  // Text length must not exceed 256 bytes.
  string text = 1;

  // Required. The language of this conversational query. See [Language
  // Support](https://dialogflow.com/docs/languages) for a list of the
  // currently supported language codes. Note that queries in the same session
  // do not necessarily need to specify the same language.
  string language_code = 2;
}

// Events allow for matching intents by event name instead of the natural
// language input. For instance, input `<event: { name: "welcome_event",
// parameters: { name: "Sam" } }>` can trigger a personalized welcome response.
// The parameter `name` may be used by the agent in the response:
// `"Hello #welcome_event.name! What can I do for you today?"`.
message EventInput {
  // Required. The unique identifier of the event.
  string name = 1;

  // Optional. The collection of parameters associated with the event.
  google.protobuf.Struct parameters = 2;

  // Required. The language of this query. See [Language
  // Support](https://dialogflow.com/docs/languages) for a list of the
  // currently supported language codes. Note that queries in the same session
  // do not necessarily need to specify the same language.
  string language_code = 3;
}

// Audio encoding of the audio content sent in the conversational query request.
// Refer to the [Cloud Speech API documentation](/speech/docs/basics) for more
// details.
enum AudioEncoding {
  // Not specified.
  AUDIO_ENCODING_UNSPECIFIED = 0;

  // Uncompressed 16-bit signed little-endian samples (Linear PCM).
  AUDIO_ENCODING_LINEAR_16 = 1;

  // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
  // Codec) is the recommended encoding because it is lossless (therefore
  // recognition is not compromised) and requires only about half the
  // bandwidth of `LINEAR16`. `FLAC` stream encoding supports 16-bit and
  // 24-bit samples; however, not all fields in `STREAMINFO` are supported.
  AUDIO_ENCODING_FLAC = 2;

  // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  AUDIO_ENCODING_MULAW = 3;

  // Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
  AUDIO_ENCODING_AMR = 4;

  // Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
  AUDIO_ENCODING_AMR_WB = 5;

  // Opus encoded audio frames in Ogg container
  // ([OggOpus](https://wiki.xiph.org/OggOpus)).
  // `sample_rate_hertz` must be 16000.
  AUDIO_ENCODING_OGG_OPUS = 6;

  // Although the use of lossy encodings is not recommended, if a very low
  // bitrate encoding is required, `OGG_OPUS` is highly preferred over
  // Speex encoding. The [Speex](https://speex.org/) encoding supported by
  // the Dialogflow API has a header byte in each block, as in MIME type
  // `audio/x-speex-with-header-byte`.
  // It is a variant of the RTP Speex encoding defined in
  // [RFC 5574](https://tools.ietf.org/html/rfc5574).
  // The stream is a sequence of blocks, one block per RTP packet. Each block
  // starts with a byte containing the length of the block, in bytes, followed
  // by one or more frames of Speex data, padded to an integral number of
  // bytes (octets) as specified in RFC 5574. In other words, each RTP header
  // is replaced with a single byte containing the block length. Only Speex
  // wideband is supported. `sample_rate_hertz` must be 16000.
  AUDIO_ENCODING_SPEEX_WITH_HEADER_BYTE = 7;
}