// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";

package google.cloud.vision.v1p1beta1;

import "google/api/annotations.proto";
import "google/cloud/vision/v1p1beta1/geometry.proto";
import "google/cloud/vision/v1p1beta1/text_annotation.proto";
import "google/cloud/vision/v1p1beta1/web_detection.proto";
import "google/rpc/status.proto";
import "google/type/color.proto";
import "google/type/latlng.proto";

option cc_enable_arenas = true;
option go_package = "google.golang.org/genproto/googleapis/cloud/vision/v1p1beta1;vision";
option java_multiple_files = true;
option java_outer_classname = "ImageAnnotatorProto";
option java_package = "com.google.cloud.vision.v1p1beta1";
// Service that performs Google Cloud Vision API detection tasks over client
// images, such as face, landmark, logo, label, and text detection. The
// ImageAnnotator service returns detected entities from the images.
service ImageAnnotator {
  // Run image detection and annotation for a batch of images.
  rpc BatchAnnotateImages(BatchAnnotateImagesRequest)
      returns (BatchAnnotateImagesResponse) {
    // NOTE(review): `body: "*"` restored per the standard googleapis
    // custom-method HTTP binding — confirm against the published API.
    option (google.api.http) = {
      post: "/v1p1beta1/images:annotate"
      body: "*"
    };
  }
}
47 // Users describe the type of Google Cloud Vision API tasks to perform over
48 // images by using *Feature*s. Each Feature indicates a type of image
49 // detection task to perform. Features encode the Cloud Vision API
50 // vertical to operate on and the number of top-scoring results to return.
52 // Type of image feature.
54 // Unspecified feature type.
57 // Run face detection.
60 // Run landmark detection.
61 LANDMARK_DETECTION = 2;
63 // Run logo detection.
66 // Run label detection.
72 // Run dense text document OCR. Takes precedence when both
73 // DOCUMENT_TEXT_DETECTION and TEXT_DETECTION are present.
74 DOCUMENT_TEXT_DETECTION = 11;
76 // Run computer vision models to compute image safe-search properties.
77 SAFE_SEARCH_DETECTION = 6;
79 // Compute a set of image properties, such as the image's dominant colors.
92 // Maximum number of results of this type.
93 int32 max_results = 2;
95 // Model to use for the feature.
96 // Supported values: "builtin/stable" (the default if unset) and
// External image source (Google Cloud Storage image location).
message ImageSource {
  // NOTE: For new code `image_uri` below is preferred.
  // Google Cloud Storage image URI, which must be in the following form:
  // `gs://bucket_name/object_name` (for details, see
  // [Google Cloud Storage Request
  // URIs](https://cloud.google.com/storage/docs/reference-uris)).
  // NOTE: Cloud Storage object versioning is not supported.
  string gcs_image_uri = 1;

  // Image URI which supports:
  // 1) Google Cloud Storage image URI, which must be in the following form:
  // `gs://bucket_name/object_name` (for details, see
  // [Google Cloud Storage Request
  // URIs](https://cloud.google.com/storage/docs/reference-uris)).
  // NOTE: Cloud Storage object versioning is not supported.
  // 2) Publicly accessible image HTTP/HTTPS URL.
  // This is preferred over the legacy `gcs_image_uri` above. When both
  // `gcs_image_uri` and `image_uri` are specified, `image_uri` takes
  // precedence.
  string image_uri = 2;
}
// Client image to perform Google Cloud Vision API tasks over.
message Image {
  // Image content, represented as a stream of bytes.
  // Note: as with all `bytes` fields, protobuffers use a pure binary
  // representation, whereas JSON representations use base64.
  // NOTE(review): field reconstructed from its comment and the `content`
  // reference below — confirm name/number against the published API.
  bytes content = 1;

  // Google Cloud Storage image location. If both `content` and `source`
  // are provided for an image, `content` takes precedence and is
  // used to perform the image annotation request.
  ImageSource source = 2;
}
137 // A face annotation object contains the results of face detection.
138 message FaceAnnotation {
139 // A face-specific landmark (for example, a face feature).
141 // Face landmark (feature) type.
142 // Left and right are defined from the vantage of the viewer of the image
143 // without considering mirror projections typical of photos. So, `LEFT_EYE`,
144 // typically, is the person's right eye.
146 // Unknown face landmark detected. Should not be filled.
147 UNKNOWN_LANDMARK = 0;
155 // Left of left eyebrow.
156 LEFT_OF_LEFT_EYEBROW = 3;
158 // Right of left eyebrow.
159 RIGHT_OF_LEFT_EYEBROW = 4;
161 // Left of right eyebrow.
162 LEFT_OF_RIGHT_EYEBROW = 5;
164 // Right of right eyebrow.
165 RIGHT_OF_RIGHT_EYEBROW = 6;
167 // Midpoint between eyes.
168 MIDPOINT_BETWEEN_EYES = 7;
188 // Nose, bottom right.
189 NOSE_BOTTOM_RIGHT = 14;
191 // Nose, bottom left.
192 NOSE_BOTTOM_LEFT = 15;
194 // Nose, bottom center.
195 NOSE_BOTTOM_CENTER = 16;
197 // Left eye, top boundary.
198 LEFT_EYE_TOP_BOUNDARY = 17;
200 // Left eye, right corner.
201 LEFT_EYE_RIGHT_CORNER = 18;
203 // Left eye, bottom boundary.
204 LEFT_EYE_BOTTOM_BOUNDARY = 19;
206 // Left eye, left corner.
207 LEFT_EYE_LEFT_CORNER = 20;
209 // Right eye, top boundary.
210 RIGHT_EYE_TOP_BOUNDARY = 21;
212 // Right eye, right corner.
213 RIGHT_EYE_RIGHT_CORNER = 22;
215 // Right eye, bottom boundary.
216 RIGHT_EYE_BOTTOM_BOUNDARY = 23;
218 // Right eye, left corner.
219 RIGHT_EYE_LEFT_CORNER = 24;
221 // Left eyebrow, upper midpoint.
222 LEFT_EYEBROW_UPPER_MIDPOINT = 25;
224 // Right eyebrow, upper midpoint.
225 RIGHT_EYEBROW_UPPER_MIDPOINT = 26;
228 LEFT_EAR_TRAGION = 27;
230 // Right ear tragion.
231 RIGHT_EAR_TRAGION = 28;
237 RIGHT_EYE_PUPIL = 30;
239 // Forehead glabella.
240 FOREHEAD_GLABELLA = 31;
246 CHIN_LEFT_GONION = 33;
248 // Chin right gonion.
249 CHIN_RIGHT_GONION = 34;
252 // Face landmark type.
255 // Face landmark position.
256 Position position = 4;
259 // The bounding polygon around the face. The coordinates of the bounding box
260 // are in the original image's scale, as returned in `ImageParams`.
261 // The bounding box is computed to "frame" the face in accordance with human
262 // expectations. It is based on the landmarker results.
263 // Note that one or more x and/or y coordinates may not be generated in the
264 // `BoundingPoly` (the polygon will be unbounded) if only a partial face
265 // appears in the image to be annotated.
266 BoundingPoly bounding_poly = 1;
268 // The `fd_bounding_poly` bounding polygon is tighter than the
269 // `boundingPoly`, and encloses only the skin part of the face. Typically, it
270 // is used to eliminate the face from any image analysis that detects the
271 // "amount of skin" visible in an image. It is not based on the
272 // landmarker results, only on the initial face detection, hence
273 // the <code>fd</code> (face detection) prefix.
274 BoundingPoly fd_bounding_poly = 2;
276 // Detected face landmarks.
277 repeated Landmark landmarks = 3;
279 // Roll angle, which indicates the amount of clockwise/anti-clockwise rotation
280 // of the face relative to the image vertical about the axis perpendicular to
281 // the face. Range [-180,180].
282 float roll_angle = 4;
284 // Yaw angle, which indicates the leftward/rightward angle that the face is
285 // pointing relative to the vertical plane perpendicular to the image. Range
289 // Pitch angle, which indicates the upwards/downwards angle that the face is
290 // pointing relative to the image's horizontal plane. Range [-180,180].
291 float tilt_angle = 6;
293 // Detection confidence. Range [0, 1].
294 float detection_confidence = 7;
296 // Face landmarking confidence. Range [0, 1].
297 float landmarking_confidence = 8;
300 Likelihood joy_likelihood = 9;
302 // Sorrow likelihood.
303 Likelihood sorrow_likelihood = 10;
306 Likelihood anger_likelihood = 11;
308 // Surprise likelihood.
309 Likelihood surprise_likelihood = 12;
311 // Under-exposed likelihood.
312 Likelihood under_exposed_likelihood = 13;
314 // Blurred likelihood.
315 Likelihood blurred_likelihood = 14;
317 // Headwear likelihood.
318 Likelihood headwear_likelihood = 15;
// Detected entity location information.
message LocationInfo {
  // lat/long location coordinates.
  google.type.LatLng lat_lng = 1;
}
// A `Property` consists of a user-supplied name/value pair.
message Property {
  // Name of the property.
  // NOTE(review): field reconstructed from its comment — confirm
  // name/number against the published API.
  string name = 1;

  // Value of the property.
  // NOTE(review): field reconstructed from its comment — confirm
  // name/number against the published API.
  string value = 2;

  // Value of numeric properties.
  uint64 uint64_value = 3;
}
// Set of detected entity features.
message EntityAnnotation {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  // NOTE(review): field reconstructed from its comment — confirm
  // name/number against the published API.
  string mid = 1;

  // The language code for the locale in which the entity textual
  // `description` is expressed.
  // NOTE(review): field reconstructed from its comment — confirm
  // name/number against the published API.
  string locale = 2;

  // Entity textual description, expressed in its `locale` language.
  string description = 3;

  // Overall score of the result. Range [0, 1].
  // NOTE(review): field reconstructed from its comment — confirm
  // name/number against the published API.
  float score = 4;

  // The accuracy of the entity detection in an image.
  // For example, for an image in which the "Eiffel Tower" entity is detected,
  // this field represents the confidence that there is a tower in the query
  // image. Range [0, 1].
  float confidence = 5;

  // The relevancy of the ICA (Image Content Annotation) label to the
  // image. For example, the relevancy of "tower" is likely higher to an image
  // containing the detected "Eiffel Tower" than to an image containing a
  // detected distant towering building, even though the confidence that
  // there is a tower in each image may be the same. Range [0, 1].
  float topicality = 6;

  // Image region to which this entity belongs. Not produced
  // for `LABEL_DETECTION` features.
  BoundingPoly bounding_poly = 7;

  // The location information for the detected entity. Multiple
  // `LocationInfo` elements can be present because one location may
  // indicate the location of the scene in the image, and another location
  // may indicate the location of the place where the image was taken.
  // Location information is usually present for landmarks.
  repeated LocationInfo locations = 8;

  // Some entities may have optional user-supplied `Property` (name/value)
  // fields, such as a score or string that qualifies the entity.
  repeated Property properties = 9;
}
// Set of features pertaining to the image, computed by computer vision
// methods over safe-search verticals (for example, adult, spoof, medical,
// violence).
message SafeSearchAnnotation {
  // Represents the adult content likelihood for the image. Adult content may
  // contain elements such as nudity, pornographic images or cartoons, or
  // sexual activities.
  Likelihood adult = 1;

  // Spoof likelihood. The likelihood that a modification
  // was made to the image's canonical version to make it appear
  // funny or offensive.
  Likelihood spoof = 2;

  // Likelihood that this is a medical image.
  Likelihood medical = 3;

  // Likelihood that this image contains violent content.
  Likelihood violence = 4;

  // Likelihood that the request image contains racy content. Racy content may
  // include (but is not limited to) skimpy or sheer clothing, strategically
  // covered nudity, lewd or provocative poses, or close-ups of sensitive
  // body areas.
  // NOTE(review): field reconstructed from its comment — confirm
  // name/number against the published API.
  Likelihood racy = 9;
}
// Rectangle determined by min and max `LatLng` pairs.
message LatLongRect {
  // Min lat/long pair.
  google.type.LatLng min_lat_lng = 1;

  // Max lat/long pair.
  google.type.LatLng max_lat_lng = 2;
}
// Color information consists of RGB channels, score, and the fraction of
// the image that the color occupies in the image.
message ColorInfo {
  // RGB components of the color.
  google.type.Color color = 1;

  // Image-specific score for this color. Value in range [0, 1].
  // NOTE(review): field reconstructed from its comment — confirm
  // name/number against the published API.
  float score = 2;

  // The fraction of pixels the color occupies in the image.
  // Value in range [0, 1].
  float pixel_fraction = 3;
}
// Set of dominant colors and their corresponding scores.
message DominantColorsAnnotation {
  // RGB color values with their score and pixel fraction.
  repeated ColorInfo colors = 1;
}
// Stores image properties, such as dominant colors.
message ImageProperties {
  // If present, dominant colors completed successfully.
  DominantColorsAnnotation dominant_colors = 1;
}
// Single crop hint that is used to generate a new crop when serving an image.
message CropHint {
  // The bounding polygon for the crop region. The coordinates of the bounding
  // box are in the original image's scale, as returned in `ImageParams`.
  BoundingPoly bounding_poly = 1;

  // Confidence of this being a salient region. Range [0, 1].
  float confidence = 2;

  // Fraction of importance of this salient region with respect to the original
  // image.
  float importance_fraction = 3;
}
// Set of crop hints that are used to generate new crops when serving images.
message CropHintsAnnotation {
  // Crop hint results.
  repeated CropHint crop_hints = 1;
}
// Parameters for crop hints annotation request.
message CropHintsParams {
  // Aspect ratios in floats, representing the ratio of the width to the height
  // of the image. For example, if the desired aspect ratio is 4/3, the
  // corresponding float value should be 1.33333. If not specified, the
  // best possible crop is returned. The number of provided aspect ratios is
  // limited to a maximum of 16; any aspect ratios provided after the 16th are
  // ignored.
  repeated float aspect_ratios = 1;
}
// Parameters for web detection request.
message WebDetectionParams {
  // Whether to include results derived from the geo information in the image.
  bool include_geo_results = 2;
}
// Image context and/or feature-specific parameters.
message ImageContext {
  // lat/long rectangle that specifies the location of the image.
  LatLongRect lat_long_rect = 1;

  // List of languages to use for TEXT_DETECTION. In most cases, an empty value
  // yields the best results since it enables automatic language detection. For
  // languages based on the Latin alphabet, setting `language_hints` is not
  // needed. In rare cases, when the language of the text in the image is known,
  // setting a hint will help get better results (although it will be a
  // significant hindrance if the hint is wrong). Text detection returns an
  // error if one or more of the specified languages is not one of the
  // [supported languages](/vision/docs/languages).
  repeated string language_hints = 2;

  // Parameters for crop hints annotation request.
  CropHintsParams crop_hints_params = 4;

  // Parameters for web detection.
  WebDetectionParams web_detection_params = 6;
}
// Request for performing Google Cloud Vision API tasks over a user-provided
// image, with user-requested features.
message AnnotateImageRequest {
  // The image to be processed.
  // NOTE(review): field reconstructed from its comment — confirm
  // name/number against the published API.
  Image image = 1;

  // Requested features.
  repeated Feature features = 2;

  // Additional context that may accompany the image.
  ImageContext image_context = 3;
}
// Response to an image annotation request.
message AnnotateImageResponse {
  // If present, face detection has completed successfully.
  repeated FaceAnnotation face_annotations = 1;

  // If present, landmark detection has completed successfully.
  repeated EntityAnnotation landmark_annotations = 2;

  // If present, logo detection has completed successfully.
  repeated EntityAnnotation logo_annotations = 3;

  // If present, label detection has completed successfully.
  repeated EntityAnnotation label_annotations = 4;

  // If present, text (OCR) detection has completed successfully.
  repeated EntityAnnotation text_annotations = 5;

  // If present, text (OCR) detection or document (OCR) text detection has
  // completed successfully.
  // This annotation provides the structural hierarchy for the OCR detected
  // text.
  TextAnnotation full_text_annotation = 12;

  // If present, safe-search annotation has completed successfully.
  SafeSearchAnnotation safe_search_annotation = 6;

  // If present, image properties were extracted successfully.
  ImageProperties image_properties_annotation = 8;

  // If present, crop hints have completed successfully.
  CropHintsAnnotation crop_hints_annotation = 11;

  // If present, web detection has completed successfully.
  WebDetection web_detection = 13;

  // If set, represents the error message for the operation.
  // Note that filled-in image annotations are guaranteed to be
  // correct, even when `error` is set.
  google.rpc.Status error = 9;
}
// Multiple image annotation requests are batched into a single service call.
message BatchAnnotateImagesRequest {
  // Individual image annotation requests for this batch.
  repeated AnnotateImageRequest requests = 1;
}
// Response to a batch image annotation request.
message BatchAnnotateImagesResponse {
  // Individual responses to image annotation requests within the batch.
  repeated AnnotateImageResponse responses = 1;
}
572 // A bucketized representation of likelihood, which is intended to give clients
573 // highly stable results across model upgrades.
575 // Unknown likelihood.
578 // It is very unlikely that the image belongs to the specified vertical.
581 // It is unlikely that the image belongs to the specified vertical.
584 // It is possible that the image belongs to the specified vertical.
587 // It is likely that the image belongs to the specified vertical.
590 // It is very likely that the image belongs to the specified vertical.