1 // Copyright 2018 Google LLC.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
18 package google.cloud.videointelligence.v1p3beta1;
20 import "google/api/annotations.proto";
21 import "google/longrunning/operations.proto";
22 import "google/protobuf/duration.proto";
23 import "google/protobuf/timestamp.proto";
24 import "google/rpc/status.proto";
26 option csharp_namespace = "Google.Cloud.VideoIntelligence.V1P3Beta1";
27 option go_package = "google.golang.org/genproto/googleapis/cloud/videointelligence/v1p3beta1;videointelligence";
28 option java_multiple_files = true;
29 option java_outer_classname = "VideoIntelligenceServiceProto";
30 option java_package = "com.google.cloud.videointelligence.v1p3beta1";
31 option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1p3beta1";
33 // Service that implements Google Cloud Video Intelligence API.
34 service VideoIntelligenceService {
35 // Performs asynchronous video annotation. Progress and results can be
36 // retrieved through the `google.longrunning.Operations` interface.
37 // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
38 // `Operation.response` contains `AnnotateVideoResponse` (results).
39 rpc AnnotateVideo(AnnotateVideoRequest)
40 returns (google.longrunning.Operation) {
41 option (google.api.http) = {
42 post: "/v1p3beta1/videos:annotate"
// NOTE(review): the remainder of the google.api.http rule (request body
// binding) and the closing braces of the rpc and service are not visible
// in this excerpt — confirm against the upstream file.
48 // Service that implements Google Cloud Video Intelligence Streaming API.
49 service StreamingVideoIntelligenceService {
50 // Performs video annotation with bidirectional streaming: emitting results
51 // while sending video/audio bytes.
52 // This method is only available via the gRPC API (not REST).
53 rpc StreamingAnnotateVideo(stream StreamingAnnotateVideoRequest)
54 returns (stream StreamingAnnotateVideoResponse);
// NOTE(review): the service's closing "}" is not visible in this excerpt.
57 // Video annotation request.
58 message AnnotateVideoRequest {
59 // Input video location. Currently, only
60 // [Google Cloud Storage](https://cloud.google.com/storage/) URIs are
61 // supported, which must be specified in the following format:
62 // `gs://bucket-id/object-id` (other URI formats return
63 // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
64 // more information, see [Request URIs](/storage/docs/reference-uris). A video
65 // URI may include wildcards in `object-id`, and thus identify multiple
66 // videos. Supported wildcards: '*' to match 0 or more characters;
67 // '?' to match 1 character. If unset, the input video should be embedded
68 // in the request as `input_content`. If set, `input_content` should be unset.
// NOTE(review): the `input_uri` field declaration this comment documents is
// not visible in this excerpt — confirm against the upstream file.
71 // The video data bytes.
72 // If unset, the input video(s) should be specified via `input_uri`.
73 // If set, `input_uri` should be unset.
74 bytes input_content = 6;
76 // Requested video annotation features.
77 repeated Feature features = 2;
79 // Additional video context and/or feature-specific parameters.
80 VideoContext video_context = 3;
82 // Optional location where the output (in JSON format) should be stored.
83 // Currently, only [Google Cloud Storage](https://cloud.google.com/storage/)
84 // URIs are supported, which must be specified in the following format:
85 // `gs://bucket-id/object-id` (other URI formats return
86 // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
87 // more information, see [Request URIs](/storage/docs/reference-uris).
88 string output_uri = 4;
90 // Optional cloud region where annotation should take place. Supported cloud
91 // regions: `us-east1`, `us-west1`, `europe-west1`, `asia-east1`. If no region
92 // is specified, a region will be determined based on video file location.
93 string location_id = 5;
// NOTE(review): the message's closing "}" is not visible in this excerpt.
96 // Video context and/or feature-specific parameters.
97 message VideoContext {
98 // Video segments to annotate. The segments may overlap and are not required
99 // to be contiguous or span the whole video. If unspecified, each video is
100 // treated as a single segment.
101 repeated VideoSegment segments = 1;
103 // Config for LABEL_DETECTION.
104 LabelDetectionConfig label_detection_config = 2;
106 // Config for SHOT_CHANGE_DETECTION.
107 ShotChangeDetectionConfig shot_change_detection_config = 3;
109 // Config for EXPLICIT_CONTENT_DETECTION.
110 ExplicitContentDetectionConfig explicit_content_detection_config = 4;
112 // Config for TEXT_DETECTION.
113 TextDetectionConfig text_detection_config = 8;
// NOTE(review): field numbers 5-7 are skipped above — presumably used by
// other feature configs upstream; do not reuse them. Closing "}" is not
// visible in this excerpt.
116 // Config for LABEL_DETECTION.
117 message LabelDetectionConfig {
118 // What labels should be detected with LABEL_DETECTION, in addition to
119 // video-level labels or segment-level labels.
120 // If unspecified, defaults to `SHOT_MODE`.
121 LabelDetectionMode label_detection_mode = 1;
123 // Whether the video has been shot from a stationary (i.e. non-moving) camera.
124 // When set to true, might improve detection accuracy for moving objects.
125 // Should be used with `SHOT_AND_FRAME_MODE` enabled.
126 bool stationary_camera = 2;
128 // Model to use for label detection.
129 // Supported values: "builtin/stable" (the default if unset) and
// NOTE(review): the rest of this comment, the `model` field declaration it
// documents, and the closing "}" are not visible in this excerpt.
134 // Config for SHOT_CHANGE_DETECTION.
135 message ShotChangeDetectionConfig {
136 // Model to use for shot change detection.
137 // Supported values: "builtin/stable" (the default if unset) and
// NOTE(review): the rest of this comment, the `model` field declaration it
// documents, and the closing "}" are not visible in this excerpt.
142 // Config for EXPLICIT_CONTENT_DETECTION.
143 message ExplicitContentDetectionConfig {
144 // Model to use for explicit content detection.
145 // Supported values: "builtin/stable" (the default if unset) and
// NOTE(review): the rest of this comment, the `model` field declaration it
// documents, and the closing "}" are not visible in this excerpt.
150 // Config for TEXT_DETECTION.
151 message TextDetectionConfig {
152 // Language hint can be specified if the language to be detected is known a
153 // priori. It can increase the accuracy of the detection. Language hint must
154 // be language code in BCP-47 format.
156 // Automatic language detection is performed if no hint is provided.
157 repeated string language_hints = 1;
// NOTE(review): the message's closing "}" is not visible in this excerpt.
// Video segment, expressed as a [start, end] time range within the video.
161 message VideoSegment {
162 // Time-offset, relative to the beginning of the video,
163 // corresponding to the start of the segment (inclusive).
164 google.protobuf.Duration start_time_offset = 1;
166 // Time-offset, relative to the beginning of the video,
167 // corresponding to the end of the segment (inclusive).
168 google.protobuf.Duration end_time_offset = 2;
// NOTE(review): the message's closing "}" is not visible in this excerpt.
171 // Video segment level annotation results for label detection.
172 message LabelSegment {
173 // Video segment where a label was detected.
174 VideoSegment segment = 1;
176 // Confidence that the label is accurate. Range: [0, 1].
177 float confidence = 2;
// NOTE(review): the message's closing "}" is not visible in this excerpt.
180 // Video frame level annotation results for label detection.
// NOTE(review): the `message LabelFrame {` declaration line is not visible
// in this excerpt; the fields below presumably belong to it — confirm
// against the upstream file.
182 // Time-offset, relative to the beginning of the video, corresponding to the
183 // video frame for this location.
184 google.protobuf.Duration time_offset = 1;
186 // Confidence that the label is accurate. Range: [0, 1].
187 float confidence = 2;
190 // Detected entity from video analysis.
// NOTE(review): the `message Entity {` declaration line is not visible in
// this excerpt; the fields below presumably belong to it — confirm against
// the upstream file.
192 // Opaque entity ID. Some IDs may be available in
193 // [Google Knowledge Graph Search
194 // API](https://developers.google.com/knowledge-graph/).
195 string entity_id = 1;
197 // Textual description, e.g. `Fixed-gear bicycle`.
198 string description = 2;
200 // Language code for `description` in BCP-47 format.
201 string language_code = 3;
// Label annotation: one detected label with its entity, categories, and the
// segments/frames where it appears.
205 message LabelAnnotation {
// NOTE(review): the `Entity entity = 1;` field (and its comment) is not
// visible in this excerpt — confirm against the upstream file.
209 // Common categories for the detected entity.
210 // E.g. when the label is `Terrier` the category is likely `dog`. And in some
211 // cases there might be more than one category, e.g. `Terrier` could also be
213 repeated Entity category_entities = 2;
215 // All video segments where a label was detected.
216 repeated LabelSegment segments = 3;
218 // All video frames where a label was detected.
219 repeated LabelFrame frames = 4;
222 // Video frame level annotation results for explicit content.
223 message ExplicitContentFrame {
224 // Time-offset, relative to the beginning of the video, corresponding to the
225 // video frame for this location.
226 google.protobuf.Duration time_offset = 1;
228 // Likelihood of the pornography content.
229 Likelihood pornography_likelihood = 2;
// NOTE(review): the message's closing "}" is not visible in this excerpt.
232 // Explicit content annotation (based on per-frame visual signals only).
233 // If no explicit content has been detected in a frame, no annotations are
234 // present for that frame.
235 message ExplicitContentAnnotation {
236 // All video frames where explicit content was detected.
237 repeated ExplicitContentFrame frames = 1;
// NOTE(review): the message's closing "}" is not visible in this excerpt.
240 // Normalized bounding box.
241 // The normalized vertex coordinates are relative to the original image.
243 message NormalizedBoundingBox {
244 // Left X coordinate.
// NOTE(review): the field declarations of this message (presumably float
// left/top/right/bottom coordinates) and the "Top Y coordinate" comment are
// not visible in this excerpt — confirm against the upstream file.
250 // Right X coordinate.
253 // Bottom Y coordinate.
257 // Annotation results for a single video.
258 message VideoAnnotationResults {
259 // Video file location in
260 // [Google Cloud Storage](https://cloud.google.com/storage/).
261 string input_uri = 1;
263 // Label annotations on video level or user specified segment level.
264 // There is exactly one element for each unique label.
265 repeated LabelAnnotation segment_label_annotations = 2;
267 // Label annotations on shot level.
268 // There is exactly one element for each unique label.
269 repeated LabelAnnotation shot_label_annotations = 3;
271 // Label annotations on frame level.
272 // There is exactly one element for each unique label.
273 repeated LabelAnnotation frame_label_annotations = 4;
275 // Shot annotations. Each shot is represented as a video segment.
276 repeated VideoSegment shot_annotations = 6;
278 // Explicit content annotation.
279 ExplicitContentAnnotation explicit_annotation = 7;
281 // OCR text detection and tracking.
282 // Annotations for list of detected text snippets. Each will have list of
283 // frame information associated with it.
284 repeated TextAnnotation text_annotations = 12;
286 // Annotations for list of objects detected and tracked in video.
287 repeated ObjectTrackingAnnotation object_annotations = 14;
289 // If set, indicates an error. Note that for a single `AnnotateVideoRequest`
290 // some videos may succeed and some may fail.
291 google.rpc.Status error = 9;
// NOTE(review): field numbers 5, 8, 10-11, and 13 are skipped above —
// presumably used or reserved upstream; do not reuse them. Closing "}" is
// not visible in this excerpt.
294 // Video annotation response. Included in the `response`
295 // field of the `Operation` returned by the `GetOperation`
296 // call of the `google::longrunning::Operations` service.
297 message AnnotateVideoResponse {
298 // Annotation results for all videos specified in `AnnotateVideoRequest`.
299 repeated VideoAnnotationResults annotation_results = 1;
// NOTE(review): the message's closing "}" is not visible in this excerpt.
302 // Annotation progress for a single video.
303 message VideoAnnotationProgress {
304 // Video file location in
305 // [Google Cloud Storage](https://cloud.google.com/storage/).
306 string input_uri = 1;
308 // Approximate percentage processed thus far. Guaranteed to be
309 // 100 when fully processed.
310 int32 progress_percent = 2;
312 // Time when the request was received.
313 google.protobuf.Timestamp start_time = 3;
315 // Time of the most recent update.
316 google.protobuf.Timestamp update_time = 4;
// NOTE(review): the message's closing "}" is not visible in this excerpt.
319 // Video annotation progress. Included in the `metadata`
320 // field of the `Operation` returned by the `GetOperation`
321 // call of the `google::longrunning::Operations` service.
322 message AnnotateVideoProgress {
323 // Progress metadata for all videos specified in `AnnotateVideoRequest`.
324 repeated VideoAnnotationProgress annotation_progress = 1;
// NOTE(review): the message's closing "}" is not visible in this excerpt.
327 // A vertex represents a 2D point in the image.
328 // NOTE: the normalized vertex coordinates are relative to the original image
329 // and range from 0 to 1.
330 message NormalizedVertex {
// NOTE(review): the field declarations of this message (presumably float x
// and y coordinates) and the closing "}" are not visible in this excerpt —
// confirm against the upstream file.
338 // Normalized bounding polygon for text (that might not be aligned with axis).
339 // Contains list of the corner points in clockwise order starting from
340 // top-left corner. For example, for a rectangular bounding box:
341 // When the text is horizontal it might look like:
// NOTE(review): the ASCII-art diagram lines of this comment are not visible
// in this excerpt — confirm against the upstream file.
346 // When it's clockwise rotated 180 degrees around the top-left corner it
352 // and the vertex order will still be (0, 1, 2, 3). Note that values can be less
353 // than 0, or greater than 1 due to trigonometric calculations for location of
355 message NormalizedBoundingPoly {
356 // Normalized vertices of the bounding polygon.
357 repeated NormalizedVertex vertices = 1;
// NOTE(review): the message's closing "}" is not visible in this excerpt.
360 // Video segment level annotation results for text detection.
361 message TextSegment {
362 // Video segment where a text snippet was detected.
363 VideoSegment segment = 1;
365 // Confidence for the track of detected text. It is calculated as the highest
366 // over all frames where OCR detected text appears.
367 float confidence = 2;
369 // Information related to the frames where OCR detected text appears.
370 repeated TextFrame frames = 3;
// NOTE(review): the message's closing "}" is not visible in this excerpt.
373 // Video frame level annotation results for text annotation (OCR).
374 // Contains information regarding timestamp and bounding box locations for the
375 // frames containing detected OCR text snippets.
// NOTE(review): the `message TextFrame {` declaration line is not visible in
// this excerpt; the fields below presumably belong to it — confirm against
// the upstream file.
377 // Bounding polygon of the detected text for this frame.
378 NormalizedBoundingPoly rotated_bounding_box = 1;
380 // Timestamp of this frame.
381 google.protobuf.Duration time_offset = 2;
384 // Annotations related to one detected OCR text snippet. This will contain the
385 // corresponding text, confidence value, and frame level information for each
387 message TextAnnotation {
388 // The detected text.
// NOTE(review): the field declaration this comment documents (presumably
// `string text = 1;`) is not visible in this excerpt — confirm against the
// upstream file.
391 // All video segments where OCR detected text appears.
392 repeated TextSegment segments = 2;
395 // Video frame level annotations for object detection and tracking. This field
396 // stores per frame location, time offset, and confidence.
397 message ObjectTrackingFrame {
398 // The normalized bounding box location of this object track for the frame.
399 NormalizedBoundingBox normalized_bounding_box = 1;
401 // The timestamp of the frame in microseconds.
402 google.protobuf.Duration time_offset = 2;
// NOTE(review): the message's closing "}" is not visible in this excerpt.
405 // Annotations corresponding to one tracked object.
406 message ObjectTrackingAnnotation {
407 // Entity to specify the object category that this track is labeled as.
// NOTE(review): the `Entity entity` field declaration this comment documents
// is not visible in this excerpt — confirm against the upstream file.
410 // Object category's labeling confidence of this track.
411 float confidence = 4;
413 // Information corresponding to all frames where this object track appears.
414 // Non-streaming batch mode: it may be one or multiple ObjectTrackingFrame
415 // messages in frames.
416 // Streaming mode: it can only be one ObjectTrackingFrame message in frames.
417 repeated ObjectTrackingFrame frames = 2;
419 // Different representation of tracking info in non-streaming batch
420 // and streaming modes.
// NOTE(review): a `oneof` declaration line presumably belongs here (the two
// mutually exclusive members below) but is not visible in this excerpt —
// confirm against the upstream file.
422 // Non-streaming batch mode ONLY.
423 // Each object track corresponds to one video segment where it appears.
424 VideoSegment segment = 3;
425 // Streaming mode ONLY.
426 // In streaming mode, we do not know the end time of a tracked object
427 // before it is completed. Hence, there is no VideoSegment info returned.
428 // Instead, we provide a unique identifiable integer track_id so that
429 // the customers can correlate the results of the ongoing
430 // ObjectTrackAnnotation of the same track_id over time.
// NOTE(review): the `track_id` field declaration this comment documents is
// not visible in this excerpt — confirm against the upstream file.
435 // The top-level message sent by the client for the `StreamingAnnotateVideo`
436 // method. Multiple `StreamingAnnotateVideoRequest` messages are sent.
437 // The first message must only contain a `StreamingVideoConfig` message.
438 // All subsequent messages must only contain `input_content` data.
439 message StreamingAnnotateVideoRequest {
440 // *Required* The streaming request, which is either a streaming config or
442 oneof streaming_request {
443 // Provides information to the annotator, specifying how to process the
444 // request. The first `StreamingAnnotateVideoRequest` message must only
445 // contain a `video_config` message.
446 StreamingVideoConfig video_config = 1;
448 // The video data to be annotated. Chunks of video data are sequentially
449 // sent in `StreamingAnnotateVideoRequest` messages. Except the initial
450 // `StreamingAnnotateVideoRequest` message containing only
451 // `video_config`, all subsequent `StreamingAnnotateVideoRequest`
452 // messages must only contain `input_content` field.
453 bytes input_content = 2;
// NOTE(review): the closing braces of the oneof and the message are not
// visible in this excerpt.
457 // `StreamingAnnotateVideoResponse` is the only message returned to the client
458 // by `StreamingAnnotateVideo`. A series of zero or more
459 // `StreamingAnnotateVideoResponse` messages are streamed back to the client.
460 message StreamingAnnotateVideoResponse {
461 // If set, returns a [google.rpc.Status][] message that
462 // specifies the error for the operation.
463 google.rpc.Status error = 1;
465 // Streaming annotation results.
466 StreamingVideoAnnotationResults annotation_results = 2;
468 // GCS (Google Cloud Storage) URI that stores annotation results of one
469 // streaming session. It is a directory that can hold multiple files in
470 // JSON format. Example uri format:
471 // gs://bucket_id/object_id/cloud_project_name-session_id
472 string annotation_results_uri = 3;
// NOTE(review): the message's closing "}" is not visible in this excerpt.
475 // Config for EXPLICIT_CONTENT_DETECTION in streaming mode.
476 message StreamingExplicitContentDetectionConfig {
477 // No customized config support.
// NOTE(review): intentionally an empty placeholder message (fields may be
// added later without breaking callers); closing "}" is not visible in this
// excerpt.
480 // Config for LABEL_DETECTION in streaming mode.
481 message StreamingLabelDetectionConfig {
482 // Whether the video has been captured from a stationary (i.e. non-moving)
483 // camera. When set to true, might improve detection accuracy for moving
484 // objects. Default: false.
485 bool stationary_camera = 1;
// NOTE(review): the message's closing "}" is not visible in this excerpt.
488 // Config for STREAMING_OBJECT_TRACKING.
489 message StreamingObjectTrackingConfig {
490 // No customized config support.
// NOTE(review): intentionally an empty placeholder message; closing "}" is
// not visible in this excerpt.
493 // Config for SHOT_CHANGE_DETECTION in streaming mode.
494 message StreamingShotChangeDetectionConfig {
495 // No customized config support.
// NOTE(review): intentionally an empty placeholder message; closing "}" is
// not visible in this excerpt.
498 // Config for streaming storage option.
499 message StreamingStorageConfig {
500 // Enable streaming storage. Default: false.
501 bool enable_storage_annotation_result = 1;
503 // GCS URI to store all annotation results for one client. Client should
504 // specify this field as the top-level storage directory. Annotation results
505 // of different sessions will be put into different sub-directories denoted
506 // by project_name and session_id. All sub-directories will be auto generated
507 // by program and will be made accessible to client in response proto.
508 // URIs must be specified in the following format: `gs://bucket-id/object-id`
509 // `bucket-id` should be a valid GCS bucket created by client and bucket
510 // permission shall also be configured properly. `object-id` can be arbitrary
511 // string that makes sense to client. Other URI formats will return error and
512 // cause GCS write failure.
513 string annotation_result_storage_directory = 3;
// NOTE(review): field number 2 is skipped above — presumably used or
// reserved upstream; do not reuse it. Closing "}" is not visible in this
// excerpt.
516 // Streaming annotation results corresponding to a portion of the video
517 // that is currently being processed.
518 message StreamingVideoAnnotationResults {
519 // Shot annotation results. Each shot is represented as a video segment.
520 repeated VideoSegment shot_annotations = 1;
522 // Label annotation results.
523 repeated LabelAnnotation label_annotations = 2;
525 // Explicit content detection results.
526 ExplicitContentAnnotation explicit_annotation = 3;
528 // Object tracking results.
529 repeated ObjectTrackingAnnotation object_annotations = 4;
// NOTE(review): the message's closing "}" is not visible in this excerpt.
532 // Provides information to the annotator that specifies how to process the
534 message StreamingVideoConfig {
535 // Requested annotation feature.
536 StreamingFeature feature = 1;
538 // Config for requested annotation feature.
539 oneof streaming_config {
540 // Config for SHOT_CHANGE_DETECTION.
541 StreamingShotChangeDetectionConfig shot_change_detection_config = 2;
543 // Config for LABEL_DETECTION.
544 StreamingLabelDetectionConfig label_detection_config = 3;
546 // Config for STREAMING_EXPLICIT_CONTENT_DETECTION.
547 StreamingExplicitContentDetectionConfig explicit_content_detection_config =
// NOTE(review): the continuation line carrying this field's number
// (presumably `4;`) and the oneof's closing brace are not visible in this
// excerpt — confirm against the upstream file.
550 // Config for STREAMING_OBJECT_TRACKING.
551 StreamingObjectTrackingConfig object_tracking_config = 5;
554 // Streaming storage option. By default: storage is disabled.
555 StreamingStorageConfig storage_config = 30;
// NOTE(review): the message's closing "}" is not visible in this excerpt.
558 // Video annotation feature.
// NOTE(review): the `enum Feature {` declaration line is not visible in this
// excerpt; the values below presumably belong to it — confirm against the
// upstream file.
561 FEATURE_UNSPECIFIED = 0;
563 // Label detection. Detect objects, such as dog or flower.
// NOTE(review): the value line this comment documents (presumably
// `LABEL_DETECTION = 1;`) is not visible in this excerpt.
566 // Shot change detection.
567 SHOT_CHANGE_DETECTION = 2;
569 // Explicit content detection.
570 EXPLICIT_CONTENT_DETECTION = 3;
572 // OCR text detection and tracking.
575 // Object detection and tracking.
// NOTE(review): the value lines for the two features documented just above,
// and the enum's closing "}", are not visible in this excerpt.
579 // Label detection mode.
580 enum LabelDetectionMode {
// Unspecified; the default zero value, never a real mode.
582 LABEL_DETECTION_MODE_UNSPECIFIED = 0;
584 // Detect shot-level labels.
// NOTE(review): the value line this comment documents (presumably
// `SHOT_MODE = 1;`) is not visible in this excerpt.
587 // Detect frame-level labels.
// NOTE(review): the value line this comment documents (presumably
// `FRAME_MODE = 2;`) is not visible in this excerpt.
590 // Detect both shot-level and frame-level labels.
591 SHOT_AND_FRAME_MODE = 3;
// NOTE(review): the enum's closing "}" is not visible in this excerpt.
594 // Bucketized representation of likelihood.
// NOTE(review): the `enum Likelihood {` declaration line, all non-zero
// values, and the closing "}" are not visible in this excerpt — confirm
// against the upstream file.
596 // Unspecified likelihood.
597 LIKELIHOOD_UNSPECIFIED = 0;
615 // Streaming video annotation feature.
616 enum StreamingFeature {
// Unspecified; the default zero value, never a real feature.
618 STREAMING_FEATURE_UNSPECIFIED = 0;
619 // Label detection. Detect objects, such as dog or flower.
620 STREAMING_LABEL_DETECTION = 1;
621 // Shot change detection.
622 STREAMING_SHOT_CHANGE_DETECTION = 2;
623 // Explicit content detection.
624 STREAMING_EXPLICIT_CONTENT_DETECTION = 3;
625 // Object detection and tracking.
626 STREAMING_OBJECT_TRACKING = 4;
// NOTE(review): the enum's closing "}" falls past the end of this excerpt.