// Copyright 2018 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.videointelligence.v1;

import "google/api/annotations.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.VideoIntelligence.V1";
option go_package = "google.golang.org/genproto/googleapis/cloud/videointelligence/v1;videointelligence";
option java_multiple_files = true;
option java_outer_classname = "VideoIntelligenceServiceProto";
option java_package = "com.google.cloud.videointelligence.v1";
option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1";

// Service that implements Google Cloud Video Intelligence API.
service VideoIntelligenceService {
  // Performs asynchronous video annotation. Progress and results can be
  // retrieved through the `google.longrunning.Operations` interface.
  // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
  // `Operation.response` contains `AnnotateVideoResponse` (results).
  rpc AnnotateVideo(AnnotateVideoRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/videos:annotate"
      body: "*"
    };
  }
}
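
// Illustrative request/response flow (not part of the API definition; the
// operation name and the GetOperation path below are hypothetical
// placeholders):
//
//   POST /v1/videos:annotate
//     -> { "name": "projects/my-project/locations/us-east1/operations/123" }
//   GET /v1/<operation name>  (google.longrunning.Operations.GetOperation)
//     -> { "metadata": <AnnotateVideoProgress>, "done": true,
//          "response": <AnnotateVideoResponse> }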

// Video annotation request.
message AnnotateVideoRequest {
  // Input video location. Currently, only
  // [Google Cloud Storage](https://cloud.google.com/storage/) URIs are
  // supported, which must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request URIs](/storage/docs/reference-uris). A video
  // URI may include wildcards in `object-id`, and thus identify multiple
  // videos. Supported wildcards: '*' to match 0 or more characters;
  // '?' to match 1 character. If unset, the input video should be embedded
  // in the request as `input_content`. If set, `input_content` should be unset.
  string input_uri = 1;
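
  // Illustrative wildcard usage (bucket and object names are hypothetical):
  //   gs://my-bucket/videos/*.mp4   matches all `.mp4` objects under `videos/`
  //   gs://my-bucket/clip-?.mp4     matches `clip-1.mp4`, `clip-a.mp4`, etc.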

  // The video data bytes.
  // If unset, the input video(s) should be specified via `input_uri`.
  // If set, `input_uri` should be unset.
  bytes input_content = 6;

  // Requested video annotation features.
  repeated Feature features = 2;

  // Additional video context and/or feature-specific parameters.
  VideoContext video_context = 3;

  // Optional location where the output (in JSON format) should be stored.
  // Currently, only [Google Cloud Storage](https://cloud.google.com/storage/)
  // URIs are supported, which must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request URIs](/storage/docs/reference-uris).
  string output_uri = 4;

  // Optional cloud region where annotation should take place. Supported cloud
  // regions: `us-east1`, `us-west1`, `europe-west1`, `asia-east1`. If no region
  // is specified, a region will be determined based on video file location.
  string location_id = 5;
}
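
// A minimal `AnnotateVideoRequest` sketch in JSON form (URIs and feature
// choices are illustrative):
//
//   {
//     "inputUri": "gs://my-bucket/example.mp4",
//     "features": ["LABEL_DETECTION", "SHOT_CHANGE_DETECTION"],
//     "outputUri": "gs://my-bucket/output.json"
//   }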

// Video context and/or feature-specific parameters.
message VideoContext {
  // Video segments to annotate. The segments may overlap and are not required
  // to be contiguous or span the whole video. If unspecified, each video is
  // treated as a single segment.
  repeated VideoSegment segments = 1;

  // Config for LABEL_DETECTION.
  LabelDetectionConfig label_detection_config = 2;

  // Config for SHOT_CHANGE_DETECTION.
  ShotChangeDetectionConfig shot_change_detection_config = 3;

  // Config for EXPLICIT_CONTENT_DETECTION.
  ExplicitContentDetectionConfig explicit_content_detection_config = 4;

  // Config for FACE_DETECTION.
  FaceDetectionConfig face_detection_config = 5;

  // Config for SPEECH_TRANSCRIPTION.
  SpeechTranscriptionConfig speech_transcription_config = 6;

  // Config for TEXT_DETECTION.
  TextDetectionConfig text_detection_config = 8;
}

// Config for LABEL_DETECTION.
message LabelDetectionConfig {
  // What labels should be detected with LABEL_DETECTION, in addition to
  // video-level labels or segment-level labels.
  // If unspecified, defaults to `SHOT_MODE`.
  LabelDetectionMode label_detection_mode = 1;

  // Whether the video has been shot from a stationary (i.e. non-moving)
  // camera. When set to true, might improve detection accuracy for moving
  // objects. Should be used with `SHOT_AND_FRAME_MODE` enabled.
  bool stationary_camera = 2;

  // Model to use for label detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 3;
}
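
// Illustrative `LabelDetectionConfig` in JSON form (values are examples, not
// defaults):
//
//   {
//     "labelDetectionMode": "SHOT_AND_FRAME_MODE",
//     "stationaryCamera": true,
//     "model": "builtin/stable"
//   }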

// Config for SHOT_CHANGE_DETECTION.
message ShotChangeDetectionConfig {
  // Model to use for shot change detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for EXPLICIT_CONTENT_DETECTION.
message ExplicitContentDetectionConfig {
  // Model to use for explicit content detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for FACE_DETECTION.
message FaceDetectionConfig {
  // Model to use for face detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;

  // Whether bounding boxes should be included in the face annotation output.
  bool include_bounding_boxes = 2;
}

// Config for TEXT_DETECTION.
message TextDetectionConfig {
  // Language hint can be specified if the language to be detected is known a
  // priori. It can increase the accuracy of the detection. Language hint must
  // be language code in BCP-47 format.
  //
  // Automatic language detection is performed if no hint is provided.
  repeated string language_hints = 1;
}
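
// Illustrative hints (any valid BCP-47 codes may be supplied):
//
//   { "languageHints": ["en-US", "fr-FR"] }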

// Video segment.
message VideoSegment {
  // Time-offset, relative to the beginning of the video,
  // corresponding to the start of the segment (inclusive).
  google.protobuf.Duration start_time_offset = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the end of the segment (inclusive).
  google.protobuf.Duration end_time_offset = 2;
}
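
// A segment covering the first 30 seconds of a video, using the standard JSON
// encoding of `google.protobuf.Duration` (illustrative values):
//
//   { "startTimeOffset": "0s", "endTimeOffset": "30s" }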

// Video segment level annotation results for label detection.
message LabelSegment {
  // Video segment where a label was detected.
  VideoSegment segment = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Video frame level annotation results for label detection.
message LabelFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Detected entity from video analysis.
message Entity {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string entity_id = 1;

  // Textual description, e.g. `Fixed-gear bicycle`.
  string description = 2;

  // Language code for `description` in BCP-47 format.
  string language_code = 3;
}

// Label annotation.
message LabelAnnotation {
  // Detected entity.
  Entity entity = 1;

  // Common categories for the detected entity.
  // E.g. when the label is `Terrier` the category is likely `dog`. And in some
  // cases there might be more than one category, e.g. `Terrier` could also be
  // a `pet`.
  repeated Entity category_entities = 2;

  // All video segments where a label was detected.
  repeated LabelSegment segments = 3;

  // All video frames where a label was detected.
  repeated LabelFrame frames = 4;
}

// Video frame level annotation results for explicit content.
message ExplicitContentFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Likelihood of the pornography content.
  Likelihood pornography_likelihood = 2;
}

// Explicit content annotation (based on per-frame visual signals only).
// If no explicit content has been detected in a frame, no annotations are
// present for that frame.
message ExplicitContentAnnotation {
  // All video frames where explicit content was detected.
  repeated ExplicitContentFrame frames = 1;
}

// Normalized bounding box.
// The normalized vertex coordinates are relative to the original image.
// Range: [0, 1].
message NormalizedBoundingBox {
  // Left X coordinate.
  float left = 1;

  // Top Y coordinate.
  float top = 2;

  // Right X coordinate.
  float right = 3;

  // Bottom Y coordinate.
  float bottom = 4;
}
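
// For intuition, a box covering the central quarter of the frame would be
// (illustrative values):
//
//   { "left": 0.25, "top": 0.25, "right": 0.75, "bottom": 0.75 }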

// Video segment level annotation results for face detection.
message FaceSegment {
  // Video segment where a face was detected.
  VideoSegment segment = 1;
}

// Video frame level annotation results for face detection.
message FaceFrame {
  // Normalized Bounding boxes in a frame.
  // There can be more than one box if the same face is detected in multiple
  // locations within the current frame.
  repeated NormalizedBoundingBox normalized_bounding_boxes = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the video frame for this location.
  google.protobuf.Duration time_offset = 2;
}

// Face annotation.
message FaceAnnotation {
  // Thumbnail of a representative face view (in JPEG format).
  bytes thumbnail = 1;

  // All video segments where a face was detected.
  repeated FaceSegment segments = 2;

  // All video frames where a face was detected.
  repeated FaceFrame frames = 3;
}

// Annotation results for a single video.
message VideoAnnotationResults {
  // Video file location in
  // [Google Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Label annotations on video level or user specified segment level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation segment_label_annotations = 2;

  // Label annotations on shot level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation shot_label_annotations = 3;

  // Label annotations on frame level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation frame_label_annotations = 4;

  // Face annotations. There is exactly one element for each unique face.
  repeated FaceAnnotation face_annotations = 5;

  // Shot annotations. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 6;

  // Explicit content annotation.
  ExplicitContentAnnotation explicit_annotation = 7;

  // Speech transcription.
  repeated SpeechTranscription speech_transcriptions = 11;

  // OCR text detection and tracking.
  // Annotations for the list of detected text snippets. Each snippet has a
  // list of frame information associated with it.
  repeated TextAnnotation text_annotations = 12;

  // Annotations for the list of objects detected and tracked in the video.
  repeated ObjectTrackingAnnotation object_annotations = 14;

  // If set, indicates an error. Note that for a single `AnnotateVideoRequest`
  // some videos may succeed and some may fail.
  google.rpc.Status error = 9;
}

// Video annotation response. Included in the `response`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoResponse {
  // Annotation results for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationResults annotation_results = 1;
}

// Annotation progress for a single video.
message VideoAnnotationProgress {
  // Video file location in
  // [Google Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Approximate percentage processed thus far. Guaranteed to be
  // 100 when fully processed.
  int32 progress_percent = 2;

  // Time when the request was received.
  google.protobuf.Timestamp start_time = 3;

  // Time of the most recent update.
  google.protobuf.Timestamp update_time = 4;
}

// Video annotation progress. Included in the `metadata`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoProgress {
  // Progress metadata for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationProgress annotation_progress = 1;
}

// Config for SPEECH_TRANSCRIPTION.
message SpeechTranscriptionConfig {
  // *Required* The language of the supplied audio as a
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
  // Example: "en-US".
  // See [Language Support](https://cloud.google.com/speech/docs/languages)
  // for a list of the currently supported language codes.
  string language_code = 1;

  // *Optional* Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
  // within each `SpeechTranscription`. The server may return fewer than
  // `max_alternatives`. Valid values are `0`-`30`. A value of `0` or `1` will
  // return a maximum of one. If omitted, will return a maximum of one.
  int32 max_alternatives = 2;

  // *Optional* If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered word
  // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
  // won't be filtered out.
  bool filter_profanity = 3;

  // *Optional* A means to provide context to assist the speech recognition.
  repeated SpeechContext speech_contexts = 4;

  // *Optional* If 'true', adds punctuation to recognition result hypotheses.
  // This feature is only available in select languages. Setting this for
  // requests in other languages has no effect at all. The default 'false' value
  // does not add punctuation to result hypotheses. NOTE: "This is currently
  // offered as an experimental service, complimentary to all users. In the
  // future this may be exclusively available as a premium feature."
  bool enable_automatic_punctuation = 5;

  // *Optional* For file formats, such as MXF or MKV, supporting multiple audio
  // tracks, specify up to two tracks. Default: track 0.
  repeated int32 audio_tracks = 6;

  // *Optional* If 'true', enables speaker detection for each recognized word in
  // the top alternative of the recognition result using a speaker_tag provided
  // in the WordInfo.
  // Note: When this is true, we send all the words from the beginning of the
  // audio for the top alternative in every consecutive response.
  // This is done in order to improve our speaker tags as our models learn to
  // identify the speakers in the conversation over time.
  bool enable_speaker_diarization = 7;

  // *Optional*
  // If set, specifies the estimated number of speakers in the conversation.
  // If not set, defaults to '2'.
  // Ignored unless enable_speaker_diarization is set to true.
  int32 diarization_speaker_count = 8;

  // *Optional* If `true`, the top result includes a list of words and the
  // confidence for those words. If `false`, no word-level confidence
  // information is returned. The default is `false`.
  bool enable_word_confidence = 9;
}
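
// Illustrative `SpeechTranscriptionConfig` in JSON form (values chosen only
// for the example):
//
//   {
//     "languageCode": "en-US",
//     "maxAlternatives": 2,
//     "enableAutomaticPunctuation": true,
//     "enableSpeakerDiarization": true,
//     "diarizationSpeakerCount": 2
//   }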

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
  // *Optional* A list of strings containing words and phrases "hints" so that
  // the speech recognition is more likely to recognize them. This can be used
  // to improve the accuracy for specific words and phrases, for example, if
  // specific commands are typically spoken by the user. This can also be used
  // to add additional words to the vocabulary of the recognizer. See
  // [usage limits](https://cloud.google.com/speech/limits#content).
  repeated string phrases = 1;
}
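
// For example, hypothetical hints for a cooking video:
//
//   { "phrases": ["sous vide", "mise en place"] }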

// A speech recognition result corresponding to a portion of the audio.
message SpeechTranscription {
  // May contain one or more recognition hypotheses (up to the maximum specified
  // in `max_alternatives`). These alternatives are ordered in terms of
  // accuracy, with the top (first) alternative being the most probable, as
  // ranked by the recognizer.
  repeated SpeechRecognitionAlternative alternatives = 1;

  // Output only. The
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
  // language in this result. This language code was detected to have the most
  // likelihood of being spoken in the audio.
  string language_code = 2;
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // Transcript text representing the words that the user spoke.
  string transcript = 1;

  // The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is typically provided only for the top hypothesis, and
  // only for `is_final=true` results. Clients should not rely on the
  // `confidence` field as it is not guaranteed to be accurate or consistent.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 2;

  // A list of word-specific information for each recognized word.
  repeated WordInfo words = 3;
}

// Word-specific information for recognized words. Word information is only
// included in the response when certain request parameters are set, such
// as `enable_word_time_offsets`.
message WordInfo {
  // Time offset relative to the beginning of the audio, and
  // corresponding to the start of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration start_time = 1;

  // Time offset relative to the beginning of the audio, and
  // corresponding to the end of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration end_time = 2;

  // The word corresponding to this set of information.
  string word = 3;

  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is set only for the top alternative.
  // This field is not guaranteed to be accurate and users should not rely on it
  // to be always provided.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 4;

  // Output only. A distinct integer value is assigned for every speaker within
  // the audio. This field specifies which one of those speakers was detected to
  // have spoken this word. Value ranges from 1 up to diarization_speaker_count,
  // and is only set if speaker diarization is enabled.
  int32 speaker_tag = 5;
}

// A vertex represents a 2D point in the image.
// NOTE: the normalized vertex coordinates are relative to the original image
// and range from 0 to 1.
message NormalizedVertex {
  // X coordinate.
  float x = 1;

  // Y coordinate.
  float y = 2;
}

// Normalized bounding polygon for text (that might not be aligned with axis).
// Contains list of the corner points in clockwise order starting from
// top-left corner. For example, for a rectangular bounding box:
// When the text is horizontal it might look like:
//         0----1
//         |    |
//         3----2
//
// When it's clockwise rotated 180 degrees around the top-left corner it
// becomes:
//         2----3
//         |    |
//         1----0
//
// and the vertex order will still be (0, 1, 2, 3). Note that values can be less
// than 0, or greater than 1 due to trigonometric calculations for location of
// the box.
message NormalizedBoundingPoly {
  // Normalized vertices of the bounding polygon.
  repeated NormalizedVertex vertices = 1;
}
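
// An axis-aligned box over the top-left quadrant expressed as a polygon,
// clockwise from the top-left vertex (illustrative values):
//
//   { "vertices": [ { "x": 0.0, "y": 0.0 }, { "x": 0.5, "y": 0.0 },
//                   { "x": 0.5, "y": 0.5 }, { "x": 0.0, "y": 0.5 } ] }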

// Video segment level annotation results for text detection.
message TextSegment {
  // Video segment where a text snippet was detected.
  VideoSegment segment = 1;

  // Confidence for the track of detected text. It is calculated as the highest
  // over all frames where OCR detected text appears.
  float confidence = 2;

  // Information related to the frames where OCR detected text appears.
  repeated TextFrame frames = 3;
}

// Video frame level annotation results for text annotation (OCR).
// Contains information regarding timestamp and bounding box locations for the
// frames containing detected OCR text snippets.
message TextFrame {
  // Bounding polygon of the detected text for this frame.
  NormalizedBoundingPoly rotated_bounding_box = 1;

  // Timestamp of this frame.
  google.protobuf.Duration time_offset = 2;
}

// Annotations related to one detected OCR text snippet. This will contain the
// corresponding text, confidence value, and frame level information for each
// detected text snippet.
message TextAnnotation {
  // The detected text.
  string text = 1;

  // All video segments where OCR detected text appears.
  repeated TextSegment segments = 2;
}

// Video frame level annotations for object detection and tracking. This field
// stores per frame location, time offset, and confidence.
message ObjectTrackingFrame {
  // The normalized bounding box location of this object track for the frame.
  NormalizedBoundingBox normalized_bounding_box = 1;

  // The timestamp of the frame in microseconds.
  google.protobuf.Duration time_offset = 2;
}

// Annotations corresponding to one tracked object.
message ObjectTrackingAnnotation {
  // Different representation of tracking info in non-streaming batch
  // and streaming modes.
  oneof track_info {
    // Non-streaming batch mode ONLY.
    // Each object track corresponds to one video segment where it appears.
    VideoSegment segment = 3;

    // Streaming mode ONLY.
    // In streaming mode, we do not know the end time of a tracked object
    // before it is completed. Hence, there is no VideoSegment info returned.
    // Instead, we provide a unique identifiable integer track_id so that
    // the customers can correlate the results of the ongoing
    // ObjectTrackingAnnotation of the same track_id over time.
    int64 track_id = 5;
  }

  // Entity to specify the object category that this track is labeled as.
  Entity entity = 1;

  // Object category's labeling confidence of this track.
  float confidence = 4;

  // Information corresponding to all frames where this object track appears.
  // Non-streaming batch mode: it may be one or multiple ObjectTrackingFrame
  // messages in frames.
  // Streaming mode: it can only be one ObjectTrackingFrame message in frames.
  repeated ObjectTrackingFrame frames = 2;
}
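
// Sketch of the two mutually exclusive `track_info` shapes (values are
// illustrative; in JSON, an int64 `trackId` is encoded as a string):
//
//   Batch:     { "segment": { "startTimeOffset": "0s", "endTimeOffset": "5s" }, ... }
//   Streaming: { "trackId": "17", ... }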

// Video annotation feature.
enum Feature {
  // Unspecified.
  FEATURE_UNSPECIFIED = 0;

  // Label detection. Detect objects, such as dog or flower.
  LABEL_DETECTION = 1;

  // Shot change detection.
  SHOT_CHANGE_DETECTION = 2;

  // Explicit content detection.
  EXPLICIT_CONTENT_DETECTION = 3;

  // Human face detection and tracking.
  FACE_DETECTION = 4;

  // Speech transcription.
  SPEECH_TRANSCRIPTION = 6;

  // OCR text detection and tracking.
  TEXT_DETECTION = 7;

  // Object detection and tracking.
  OBJECT_TRACKING = 9;
}

// Label detection mode.
enum LabelDetectionMode {
  // Unspecified.
  LABEL_DETECTION_MODE_UNSPECIFIED = 0;

  // Detect shot-level labels.
  SHOT_MODE = 1;

  // Detect frame-level labels.
  FRAME_MODE = 2;

  // Detect both shot-level and frame-level labels.
  SHOT_AND_FRAME_MODE = 3;
}

// Bucketized representation of likelihood.
enum Likelihood {
  // Unspecified likelihood.
  LIKELIHOOD_UNSPECIFIED = 0;

  // Very unlikely.
  VERY_UNLIKELY = 1;

  // Unlikely.
  UNLIKELY = 2;

  // Possible.
  POSSIBLE = 3;

  // Likely.
  LIKELY = 4;

  // Very likely.
  VERY_LIKELY = 5;
}