// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.videointelligence.v1p1beta1;

import "google/api/annotations.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.VideoIntelligence.V1P1Beta1";
option go_package = "google.golang.org/genproto/googleapis/cloud/videointelligence/v1p1beta1;videointelligence";
option java_multiple_files = true;
option java_outer_classname = "VideoIntelligenceServiceProto";
option java_package = "com.google.cloud.videointelligence.v1p1beta1";
option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1p1beta1";

// Service that implements Google Cloud Video Intelligence API.
service VideoIntelligenceService {
  // Performs asynchronous video annotation. Progress and results can be
  // retrieved through the `google.longrunning.Operations` interface.
  // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
  // `Operation.response` contains `AnnotateVideoResponse` (results).
  rpc AnnotateVideo(AnnotateVideoRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1p1beta1/videos:annotate"
      body: "*"
    };
  }
}
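
// Illustrative sketch, not part of the API definition: the RPC returns a
// `google.longrunning.Operation` right away, which the client polls through
// the standard `Operations.GetOperation` method. In the proto3 JSON mapping,
// a finished operation could look roughly like this (the `@type` URLs follow
// the usual `type.googleapis.com/<full message name>` convention; the
// operation name is elided):
//
//   {
//     "name": "...",
//     "metadata": {
//       "@type": "type.googleapis.com/google.cloud.videointelligence.v1p1beta1.AnnotateVideoProgress",
//       ...
//     },
//     "done": true,
//     "response": {
//       "@type": "type.googleapis.com/google.cloud.videointelligence.v1p1beta1.AnnotateVideoResponse",
//       ...
//     }
//   }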

// Video annotation request.
message AnnotateVideoRequest {
  // Input video location. Currently, only
  // [Google Cloud Storage](https://cloud.google.com/storage/) URIs are
  // supported, which must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request URIs](/storage/docs/reference-uris). A video
  // URI may include wildcards in `object-id`, and thus identify multiple
  // videos. Supported wildcards: '*' to match 0 or more characters;
  // '?' to match 1 character. If unset, the input video should be embedded
  // in the request as `input_content`. If set, `input_content` should be unset.
  string input_uri = 1;

  // The video data bytes.
  // If unset, the input video(s) should be specified via `input_uri`.
  // If set, `input_uri` should be unset.
  bytes input_content = 6;

  // Requested video annotation features.
  repeated Feature features = 2;

  // Additional video context and/or feature-specific parameters.
  VideoContext video_context = 3;

  // Optional location where the output (in JSON format) should be stored.
  // Currently, only [Google Cloud Storage](https://cloud.google.com/storage/)
  // URIs are supported, which must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request URIs](/storage/docs/reference-uris).
  string output_uri = 4;

  // Optional cloud region where annotation should take place. Supported cloud
  // regions: `us-east1`, `us-west1`, `europe-west1`, `asia-east1`. If no region
  // is specified, a region will be determined based on video file location.
  string location_id = 5;
}
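
// Illustrative example of a complete request in the proto3 JSON mapping
// (bucket, object, and region values are placeholders, not real resources):
//
//   {
//     "inputUri": "gs://example-bucket/example-video.mp4",
//     "features": ["LABEL_DETECTION", "EXPLICIT_CONTENT_DETECTION"],
//     "videoContext": {
//       "labelDetectionConfig": { "labelDetectionMode": "SHOT_MODE" }
//     },
//     "outputUri": "gs://example-bucket/output/annotations.json",
//     "locationId": "us-east1"
//   }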

// Video context and/or feature-specific parameters.
message VideoContext {
  // Video segments to annotate. The segments may overlap and are not required
  // to be contiguous or span the whole video. If unspecified, each video is
  // treated as a single segment.
  repeated VideoSegment segments = 1;

  // Config for LABEL_DETECTION.
  LabelDetectionConfig label_detection_config = 2;

  // Config for SHOT_CHANGE_DETECTION.
  ShotChangeDetectionConfig shot_change_detection_config = 3;

  // Config for EXPLICIT_CONTENT_DETECTION.
  ExplicitContentDetectionConfig explicit_content_detection_config = 4;

  // Config for SPEECH_TRANSCRIPTION.
  SpeechTranscriptionConfig speech_transcription_config = 6;
}
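
// Illustrative example (values are placeholders): a `videoContext` that
// restricts annotation to the first thirty seconds of each input video by
// supplying one explicit segment:
//
//   "videoContext": {
//     "segments": [ { "startTimeOffset": "0s", "endTimeOffset": "30s" } ]
//   }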

// Config for LABEL_DETECTION.
message LabelDetectionConfig {
  // What labels should be detected with LABEL_DETECTION, in addition to
  // video-level labels or segment-level labels.
  // If unspecified, defaults to `SHOT_MODE`.
  LabelDetectionMode label_detection_mode = 1;

  // Whether the video has been shot from a stationary (i.e. non-moving)
  // camera. When set to true, might improve detection accuracy for moving
  // objects. Should be used with `SHOT_AND_FRAME_MODE` enabled.
  bool stationary_camera = 2;

  // Model to use for label detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 3;
}
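
// Illustrative sketch, not prescribed configuration: per the comment on
// `stationary_camera`, the flag is meant to be combined with
// `SHOT_AND_FRAME_MODE`:
//
//   "labelDetectionConfig": {
//     "labelDetectionMode": "SHOT_AND_FRAME_MODE",
//     "stationaryCamera": true
//   }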

// Config for SHOT_CHANGE_DETECTION.
message ShotChangeDetectionConfig {
  // Model to use for shot change detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for EXPLICIT_CONTENT_DETECTION.
message ExplicitContentDetectionConfig {
  // Model to use for explicit content detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Video segment.
message VideoSegment {
  // Time-offset, relative to the beginning of the video,
  // corresponding to the start of the segment (inclusive).
  google.protobuf.Duration start_time_offset = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the end of the segment (inclusive).
  google.protobuf.Duration end_time_offset = 2;
}
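
// Illustrative note: in the proto3 JSON mapping, `google.protobuf.Duration`
// values serialize as decimal seconds with an "s" suffix, so a segment
// spanning 01:00 through 02:00.5 of the video (end inclusive) would be:
//
//   { "startTimeOffset": "60s", "endTimeOffset": "120.5s" }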

// Video segment level annotation results for label detection.
message LabelSegment {
  // Video segment where a label was detected.
  VideoSegment segment = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Video frame level annotation results for label detection.
message LabelFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Detected entity from video analysis.
message Entity {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string entity_id = 1;

  // Textual description, e.g. `Fixed-gear bicycle`.
  string description = 2;

  // Language code for `description` in BCP-47 format.
  string language_code = 3;
}

// Label annotation.
message LabelAnnotation {
  // Detected entity.
  Entity entity = 1;

  // Common categories for the detected entity.
  // E.g. when the label is `Terrier` the category is likely `dog`. In some
  // cases there might be more than one category, e.g. `Terrier` could also be
  // a `pet`.
  repeated Entity category_entities = 2;

  // All video segments where a label was detected.
  repeated LabelSegment segments = 3;

  // All video frames where a label was detected.
  repeated LabelFrame frames = 4;
}
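
// Illustrative example (entity ID, confidence, and segment bounds invented
// for illustration): the `Terrier` case described above could serialize as:
//
//   {
//     "entity": { "entityId": "/m/012345", "description": "Terrier",
//                 "languageCode": "en-US" },
//     "categoryEntities": [ { "description": "dog" },
//                           { "description": "pet" } ],
//     "segments": [ {
//       "segment": { "startTimeOffset": "0s", "endTimeOffset": "30s" },
//       "confidence": 0.9
//     } ]
//   }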

// Video frame level annotation results for explicit content.
message ExplicitContentFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Likelihood of pornographic content.
  Likelihood pornography_likelihood = 2;
}

// Explicit content annotation (based on per-frame visual signals only).
// If no explicit content has been detected in a frame, no annotations are
// present for that frame.
message ExplicitContentAnnotation {
  // All video frames where explicit content was detected.
  repeated ExplicitContentFrame frames = 1;
}

// Annotation results for a single video.
message VideoAnnotationResults {
  // Output only. Video file location in
  // [Google Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Label annotations on video level or user-specified segment level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation segment_label_annotations = 2;

  // Label annotations on shot level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation shot_label_annotations = 3;

  // Label annotations on frame level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation frame_label_annotations = 4;

  // Shot annotations. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 6;

  // Explicit content annotation.
  ExplicitContentAnnotation explicit_annotation = 7;

  // Speech transcription.
  repeated SpeechTranscription speech_transcriptions = 11;

  // Output only. If set, indicates an error. Note that for a single
  // `AnnotateVideoRequest` some videos may succeed and some may fail.
  google.rpc.Status error = 9;
}

// Video annotation response. Included in the `response`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoResponse {
  // Annotation results for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationResults annotation_results = 1;
}

// Annotation progress for a single video.
message VideoAnnotationProgress {
  // Output only. Video file location in
  // [Google Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Output only. Approximate percentage processed thus far. Guaranteed to be
  // 100 when fully processed.
  int32 progress_percent = 2;

  // Output only. Time when the request was received.
  google.protobuf.Timestamp start_time = 3;

  // Output only. Time of the most recent update.
  google.protobuf.Timestamp update_time = 4;
}

// Video annotation progress. Included in the `metadata`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoProgress {
  // Progress metadata for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationProgress annotation_progress = 1;
}

// Config for SPEECH_TRANSCRIPTION.
message SpeechTranscriptionConfig {
  // *Required* The language of the supplied audio as a
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
  // Example: "en-US".
  // See [Language Support](https://cloud.google.com/speech/docs/languages)
  // for a list of the currently supported language codes.
  string language_code = 1;

  // *Optional* Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
  // within each `SpeechRecognitionResult`. The server may return fewer than
  // `max_alternatives`. Valid values are `0`-`30`. A value of `0` or `1` will
  // return a maximum of one. If omitted, the server will return a maximum of
  // one.
  int32 max_alternatives = 2;

  // *Optional* If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered word
  // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
  // won't be filtered out.
  bool filter_profanity = 3;

  // *Optional* A means to provide context to assist the speech recognition.
  repeated SpeechContext speech_contexts = 4;

  // *Optional* If `true`, adds punctuation to recognition result hypotheses.
  // This feature is only available in select languages. Setting this for
  // requests in other languages has no effect at all. The default `false` value
  // does not add punctuation to result hypotheses. NOTE: "This is currently
  // offered as an experimental service, complimentary to all users. In the
  // future this may be exclusively available as a premium feature."
  bool enable_automatic_punctuation = 5;

  // *Optional* For file formats, such as MXF or MKV, supporting multiple audio
  // tracks, specify up to two tracks. Default: track 0.
  repeated int32 audio_tracks = 6;
}
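
// Illustrative example (language and phrase hints are placeholders): a
// transcription config requesting punctuated English results with up to two
// hypotheses and a small vocabulary hint:
//
//   "speechTranscriptionConfig": {
//     "languageCode": "en-US",
//     "maxAlternatives": 2,
//     "speechContexts": [ { "phrases": ["Video Intelligence"] } ],
//     "enableAutomaticPunctuation": true,
//     "audioTracks": [0]
//   }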

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
  // *Optional* A list of strings containing words and phrases "hints" so that
  // the speech recognition is more likely to recognize them. This can be used
  // to improve the accuracy for specific words and phrases, for example, if
  // specific commands are typically spoken by the user. This can also be used
  // to add additional words to the vocabulary of the recognizer. See
  // [usage limits](https://cloud.google.com/speech/limits#content).
  repeated string phrases = 1;
}

// A speech recognition result corresponding to a portion of the audio.
message SpeechTranscription {
  // Output only. May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  // These alternatives are ordered in terms of accuracy, with the top (first)
  // alternative being the most probable, as ranked by the recognizer.
  repeated SpeechRecognitionAlternative alternatives = 1;
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // Output only. Transcript text representing the words that the user spoke.
  string transcript = 1;

  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is typically provided only for the top hypothesis, and
  // only for `is_final=true` results. Clients should not rely on the
  // `confidence` field as it is not guaranteed to be accurate or consistent.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 2;

  // Output only. A list of word-specific information for each recognized word.
  repeated WordInfo words = 3;
}

// Word-specific information for recognized words. Word information is only
// included in the response when certain request parameters are set, such
// as `enable_word_time_offsets`.
message WordInfo {
  // Output only. Time offset relative to the beginning of the audio, and
  // corresponding to the start of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration start_time = 1;

  // Output only. Time offset relative to the beginning of the audio, and
  // corresponding to the end of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration end_time = 2;

  // Output only. The word corresponding to this set of information.
  string word = 3;
}

// Video annotation feature.
enum Feature {
  // Unspecified.
  FEATURE_UNSPECIFIED = 0;

  // Label detection. Detect objects, such as dog or flower.
  LABEL_DETECTION = 1;

  // Shot change detection.
  SHOT_CHANGE_DETECTION = 2;

  // Explicit content detection.
  EXPLICIT_CONTENT_DETECTION = 3;

  // Speech transcription.
  SPEECH_TRANSCRIPTION = 6;
}

// Label detection mode.
enum LabelDetectionMode {
  // Unspecified.
  LABEL_DETECTION_MODE_UNSPECIFIED = 0;

  // Detect shot-level labels.
  SHOT_MODE = 1;

  // Detect frame-level labels.
  FRAME_MODE = 2;

  // Detect both shot-level and frame-level labels.
  SHOT_AND_FRAME_MODE = 3;
}

// Bucketized representation of likelihood.
enum Likelihood {
  // Unspecified likelihood.
  LIKELIHOOD_UNSPECIFIED = 0;