// Copyright 2018 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

syntax = "proto3";

package google.cloud.videointelligence.v1p3beta1;

import "google/api/annotations.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.VideoIntelligence.V1P3Beta1";
option go_package = "google.golang.org/genproto/googleapis/cloud/videointelligence/v1p3beta1;videointelligence";
option java_multiple_files = true;
option java_outer_classname = "VideoIntelligenceServiceProto";
option java_package = "com.google.cloud.videointelligence.v1p3beta1";
option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1p3beta1";

// Service that implements the Google Cloud Video Intelligence API.
service VideoIntelligenceService {
  // Performs asynchronous video annotation. Progress and results can be
  // retrieved through the `google.longrunning.Operations` interface.
  // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
  // `Operation.response` contains `AnnotateVideoResponse` (results).
  rpc AnnotateVideo(AnnotateVideoRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1p3beta1/videos:annotate"
      body: "*"
    };
  }
}

// Service that implements the Google Cloud Video Intelligence Streaming API.
service StreamingVideoIntelligenceService {
  // Performs video annotation with bidirectional streaming: emitting results
  // while sending video/audio bytes.
  // This method is only available via the gRPC API (not REST).
  rpc StreamingAnnotateVideo(stream StreamingAnnotateVideoRequest)
      returns (stream StreamingAnnotateVideoResponse);
}

// Video annotation request.
message AnnotateVideoRequest {
  // Input video location. Currently, only
  // [Google Cloud Storage](https://cloud.google.com/storage/) URIs are
  // supported, which must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request URIs](/storage/docs/reference-uris). A video
  // URI may include wildcards in `object-id`, and thus identify multiple
  // videos. Supported wildcards: '*' to match 0 or more characters;
  // '?' to match 1 character. If unset, the input video should be embedded
  // in the request as `input_content`. If set, `input_content` should be unset.
  string input_uri = 1;

  // The video data bytes.
  // If unset, the input video(s) should be specified via `input_uri`.
  // If set, `input_uri` should be unset.
  bytes input_content = 6;

  // Requested video annotation features.
  repeated Feature features = 2;

  // Additional video context and/or feature-specific parameters.
  VideoContext video_context = 3;

  // Optional location where the output (in JSON format) should be stored.
  // Currently, only [Google Cloud Storage](https://cloud.google.com/storage/)
  // URIs are supported, which must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request URIs](/storage/docs/reference-uris).
  string output_uri = 4;

  // Optional cloud region where annotation should take place. Supported cloud
  // regions: `us-east1`, `us-west1`, `europe-west1`, `asia-east1`. If no region
  // is specified, a region will be determined based on video file location.
  string location_id = 5;
}
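
// Example: a minimal `AnnotateVideoRequest`, sketched in textproto form. The
// bucket, object names, and feature choices below are illustrative only:
//
//   input_uri: "gs://my-bucket/my-video.mp4"
//   features: LABEL_DETECTION
//   features: SHOT_CHANGE_DETECTION
//   output_uri: "gs://my-bucket/my-video-results.json"
//   location_id: "us-east1"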

// Video context and/or feature-specific parameters.
message VideoContext {
  // Video segments to annotate. The segments may overlap and are not required
  // to be contiguous or span the whole video. If unspecified, each video is
  // treated as a single segment.
  repeated VideoSegment segments = 1;

  // Config for LABEL_DETECTION.
  LabelDetectionConfig label_detection_config = 2;

  // Config for SHOT_CHANGE_DETECTION.
  ShotChangeDetectionConfig shot_change_detection_config = 3;

  // Config for EXPLICIT_CONTENT_DETECTION.
  ExplicitContentDetectionConfig explicit_content_detection_config = 4;

  // Config for SPEECH_TRANSCRIPTION.
  SpeechTranscriptionConfig speech_transcription_config = 6;

  // Config for TEXT_DETECTION.
  TextDetectionConfig text_detection_config = 8;

  // Config for OBJECT_TRACKING.
  ObjectTrackingConfig object_tracking_config = 13;
}
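
// Example: a `VideoContext` that restricts annotation to the first minute of
// the video and tunes label detection, sketched in textproto form with
// illustrative values:
//
//   segments {
//     start_time_offset { seconds: 0 }
//     end_time_offset { seconds: 60 }
//   }
//   label_detection_config {
//     label_detection_mode: SHOT_AND_FRAME_MODE
//     stationary_camera: true
//   }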

// Config for LABEL_DETECTION.
message LabelDetectionConfig {
  // What labels should be detected with LABEL_DETECTION, in addition to
  // video-level labels or segment-level labels.
  // If unspecified, defaults to `SHOT_MODE`.
  LabelDetectionMode label_detection_mode = 1;

  // Whether the video has been shot from a stationary (i.e. non-moving)
  // camera. When set to true, might improve detection accuracy for moving
  // objects. Should be used with `SHOT_AND_FRAME_MODE` enabled.
  bool stationary_camera = 2;

  // Model to use for label detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 3;

  // The confidence threshold used to filter labels from frame-level
  // detection. If not set, it defaults to 0.4. The valid range for this
  // threshold is [0.1, 0.9]. Any value outside this range will be clipped.
  // Note: for best results, keep the default threshold. We will update the
  // default threshold every time we release a new model.
  float frame_confidence_threshold = 4;

  // The confidence threshold used to filter labels from video-level and
  // shot-level detections. If not set, it defaults to 0.3. The valid range
  // for this threshold is [0.1, 0.9]. Any value outside this range will be
  // clipped.
  // Note: for best results, keep the default threshold. We will update the
  // default threshold every time we release a new model.
  float video_confidence_threshold = 5;
}
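
// Example: a `LabelDetectionConfig` with the documented defaults spelled out
// explicitly, sketched in textproto form; the values shown are illustrative:
//
//   label_detection_mode: SHOT_MODE
//   model: "builtin/stable"
//   frame_confidence_threshold: 0.4
//   video_confidence_threshold: 0.3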

// Config for SHOT_CHANGE_DETECTION.
message ShotChangeDetectionConfig {
  // Model to use for shot change detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for OBJECT_TRACKING.
message ObjectTrackingConfig {
  // Model to use for object tracking.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for EXPLICIT_CONTENT_DETECTION.
message ExplicitContentDetectionConfig {
  // Model to use for explicit content detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for TEXT_DETECTION.
message TextDetectionConfig {
  // Language hints can be specified if the language to be detected is known a
  // priori. They can increase the accuracy of the detection. Each hint must
  // be a language code in BCP-47 format.
  //
  // Automatic language detection is performed if no hint is provided.
  repeated string language_hints = 1;

  // Model to use for text detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 2;
}

// Video segment.
message VideoSegment {
  // Time-offset, relative to the beginning of the video,
  // corresponding to the start of the segment (inclusive).
  google.protobuf.Duration start_time_offset = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the end of the segment (inclusive).
  google.protobuf.Duration end_time_offset = 2;
}
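
// Example: a segment covering 2.5s through 10s of the video, sketched in
// textproto form. The well-known `Duration` type carries whole seconds in
// `seconds` and the fractional remainder in `nanos`:
//
//   start_time_offset { seconds: 2 nanos: 500000000 }
//   end_time_offset { seconds: 10 }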

// Video segment level annotation results for label detection.
message LabelSegment {
  // Video segment where a label was detected.
  VideoSegment segment = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Video frame level annotation results for label detection.
message LabelFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Detected entity from video analysis.
message Entity {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string entity_id = 1;

  // Textual description, e.g. `Fixed-gear bicycle`.
  string description = 2;

  // Language code for `description` in BCP-47 format.
  string language_code = 3;
}

// Label annotation.
message LabelAnnotation {
  // Detected entity.
  Entity entity = 1;

  // Common categories for the detected entity.
  // For example, when the label is `Terrier`, the category is likely `dog`.
  // In some cases there may be more than one category; for example, `Terrier`
  // could also be a `pet`.
  repeated Entity category_entities = 2;

  // All video segments where a label was detected.
  repeated LabelSegment segments = 3;

  // All video frames where a label was detected.
  repeated LabelFrame frames = 4;
}

// Video frame level annotation results for explicit content.
message ExplicitContentFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Likelihood of the pornography content.
  Likelihood pornography_likelihood = 2;
}

// Explicit content annotation (based on per-frame visual signals only).
// If no explicit content has been detected in a frame, no annotations are
// present for that frame.
message ExplicitContentAnnotation {
  // All video frames where explicit content was detected.
  repeated ExplicitContentFrame frames = 1;
}

// Normalized bounding box.
// The normalized vertex coordinates are relative to the original image.
// Range: [0, 1].
message NormalizedBoundingBox {
  // Left X coordinate.
  float left = 1;

  // Top Y coordinate.
  float top = 2;

  // Right X coordinate.
  float right = 3;

  // Bottom Y coordinate.
  float bottom = 4;
}
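
// Example: a box in the lower-right quadrant of the frame, sketched in
// textproto form. Coordinates are fractions of the frame's width (for
// `left`/`right`) and height (for `top`/`bottom`); the values are
// illustrative:
//
//   left: 0.5
//   top: 0.5
//   right: 0.9
//   bottom: 0.9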

// For tracking-related features, such as LOGO_RECOGNITION, FACE_DETECTION,
// CELEBRITY_RECOGNITION, and PERSON_DETECTION:
// an object at `time_offset`, with attributes, located by
// `normalized_bounding_box`.
message TimestampedObject {
  // Normalized bounding box in a frame where the object is located.
  NormalizedBoundingBox normalized_bounding_box = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the video frame for this object.
  google.protobuf.Duration time_offset = 2;

  // Optional. The attributes of the object in the bounding box.
  repeated DetectedAttribute attributes = 3;
}

// A track of an object instance.
message Track {
  // Video segment of a track.
  VideoSegment segment = 1;

  // The object, with its timestamp and attributes, for each frame in the
  // track.
  repeated TimestampedObject timestamped_objects = 2;

  // Optional. Attributes at the track level.
  repeated DetectedAttribute attributes = 3;

  // Optional. The confidence score of the tracked object.
  float confidence = 4;
}

// A generic detected attribute represented by name in string format.
message DetectedAttribute {
  // The name of the attribute, e.g. glasses, dark_glasses, mouth_open, etc.
  // A full list of supported type names will be provided in the
  // documentation.
  string name = 1;

  // Detected attribute confidence. Range: [0, 1].
  float confidence = 2;

  // Text value of the detection result. For example, the value for "HairColor"
  // can be "black", "blonde", etc.
  string value = 3;
}

// Annotation results for a single video.
message VideoAnnotationResults {
  // Video file location in
  // [Google Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Label annotations on video level or user-specified segment level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation segment_label_annotations = 2;

  // Label annotations on shot level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation shot_label_annotations = 3;

  // Label annotations on frame level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation frame_label_annotations = 4;

  // Shot annotations. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 6;

  // Explicit content annotation.
  ExplicitContentAnnotation explicit_annotation = 7;

  // Speech transcription.
  repeated SpeechTranscription speech_transcriptions = 11;

  // OCR text detection and tracking.
  // Annotations for a list of detected text snippets. Each has a list of
  // frame information associated with it.
  repeated TextAnnotation text_annotations = 12;

  // Annotations for a list of objects detected and tracked in the video.
  repeated ObjectTrackingAnnotation object_annotations = 14;

  // Annotations for a list of logos detected, tracked, and recognized in the
  // video.
  repeated LogoRecognitionAnnotation logo_recognition_annotations = 19;

  // If set, indicates an error. Note that for a single `AnnotateVideoRequest`
  // some videos may succeed and some may fail.
  google.rpc.Status error = 9;
}

// Video annotation response. Included in the `response`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoResponse {
  // Annotation results for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationResults annotation_results = 1;
}

// Annotation progress for a single video.
message VideoAnnotationProgress {
  // Video file location in
  // [Google Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Approximate percentage processed thus far. Guaranteed to be
  // 100 when fully processed.
  int32 progress_percent = 2;

  // Time when the request was received.
  google.protobuf.Timestamp start_time = 3;

  // Time of the most recent update.
  google.protobuf.Timestamp update_time = 4;
}

// Video annotation progress. Included in the `metadata`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoProgress {
  // Progress metadata for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationProgress annotation_progress = 1;
}

// Config for SPEECH_TRANSCRIPTION.
message SpeechTranscriptionConfig {
  // *Required* The language of the supplied audio as a
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
  // Example: "en-US".
  // See [Language Support](https://cloud.google.com/speech/docs/languages)
  // for a list of the currently supported language codes.
  string language_code = 1;

  // *Optional* Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative`
  // messages within each `SpeechTranscription`. The server may return fewer
  // than `max_alternatives`. Valid values are `0`-`30`. A value of `0` or `1`
  // will return a maximum of one; if omitted, a maximum of one is returned.
  int32 max_alternatives = 2;

  // *Optional* If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered
  // word with asterisks, e.g. "f***". If set to `false` or omitted,
  // profanities won't be filtered out.
  bool filter_profanity = 3;

  // *Optional* A means to provide context to assist the speech recognition.
  repeated SpeechContext speech_contexts = 4;

  // *Optional* If `true`, adds punctuation to recognition result hypotheses.
  // This feature is only available in select languages. Setting this for
  // requests in other languages has no effect at all. The default `false`
  // value does not add punctuation to result hypotheses. NOTE: "This is
  // currently offered as an experimental service, complimentary to all users.
  // In the future this may be exclusively available as a premium feature."
  bool enable_automatic_punctuation = 5;

  // *Optional* For file formats, such as MXF or MKV, that support multiple
  // audio tracks, specify up to two tracks. Default: track 0.
  repeated int32 audio_tracks = 6;

  // *Optional* If `true`, enables speaker detection for each recognized word
  // in the top alternative of the recognition result, using a `speaker_tag`
  // provided in the `WordInfo`.
  // Note: When this is true, we send all the words from the beginning of the
  // audio for the top alternative in every consecutive response.
  // This is done in order to improve our speaker tags, as our models learn to
  // identify the speakers in the conversation over time.
  bool enable_speaker_diarization = 7;

  // *Optional*
  // If set, specifies the estimated number of speakers in the conversation.
  // If not set, defaults to `2`.
  // Ignored unless `enable_speaker_diarization` is set to `true`.
  int32 diarization_speaker_count = 8;

  // *Optional* If `true`, the top result includes a list of words and the
  // confidence for those words. If `false`, no word-level confidence
  // information is returned. The default is `false`.
  bool enable_word_confidence = 9;
}
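
// Example: a `SpeechTranscriptionConfig` for diarized English transcription,
// sketched in textproto form; the phrase hint and speaker count below are
// illustrative only:
//
//   language_code: "en-US"
//   enable_automatic_punctuation: true
//   enable_speaker_diarization: true
//   diarization_speaker_count: 2
//   speech_contexts { phrases: "Cloud Video Intelligence" }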

// Provides "hints" to the speech recognizer to favor specific words and
// phrases in the results.
message SpeechContext {
  // *Optional* A list of strings containing word and phrase "hints" so that
  // the speech recognition is more likely to recognize them. This can be used
  // to improve the accuracy for specific words and phrases, for example, if
  // specific commands are typically spoken by the user. This can also be used
  // to add additional words to the vocabulary of the recognizer. See
  // [usage limits](https://cloud.google.com/speech/limits#content).
  repeated string phrases = 1;
}

// A speech recognition result corresponding to a portion of the audio.
message SpeechTranscription {
  // May contain one or more recognition hypotheses (up to the maximum
  // specified in `max_alternatives`). These alternatives are ordered in terms
  // of accuracy, with the top (first) alternative being the most probable, as
  // ranked by the recognizer.
  repeated SpeechRecognitionAlternative alternatives = 1;

  // Output only. The
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of
  // the language in this result. This language code was detected to have the
  // highest likelihood of being spoken in the audio.
  string language_code = 2;
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // Transcript text representing the words that the user spoke.
  string transcript = 1;

  // The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is typically provided only for the top hypothesis,
  // and only for `is_final=true` results. Clients should not rely on the
  // `confidence` field as it is not guaranteed to be accurate or consistent.
  // The default of 0.0 is a sentinel value indicating `confidence` was not
  // set.
  float confidence = 2;

  // A list of word-specific information for each recognized word.
  repeated WordInfo words = 3;
}

// Word-specific information for recognized words. Word information is only
// included in the response when certain request parameters are set, such
// as `enable_word_time_offsets`.
message WordInfo {
  // Time offset relative to the beginning of the audio, and
  // corresponding to the start of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is
  // an experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration start_time = 1;

  // Time offset relative to the beginning of the audio, and
  // corresponding to the end of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is
  // an experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration end_time = 2;

  // The word corresponding to this set of information.
  string word = 3;

  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is set only for the top alternative.
  // This field is not guaranteed to be accurate and users should not rely on
  // it to be always provided.
  // The default of 0.0 is a sentinel value indicating `confidence` was not
  // set.
  float confidence = 4;

  // Output only. A distinct integer value is assigned for every speaker
  // within the audio. This field specifies which one of those speakers was
  // detected to have spoken this word. Values range from 1 up to
  // `diarization_speaker_count`, and are only set if speaker diarization is
  // enabled.
  int32 speaker_tag = 5;
}

// A vertex represents a 2D point in the image.
// NOTE: the normalized vertex coordinates are relative to the original image
// and range from 0 to 1.
message NormalizedVertex {
  // X coordinate.
  float x = 1;

  // Y coordinate.
  float y = 2;
}

// Normalized bounding polygon for text (that might not be aligned with axis).
// Contains the list of corner points in clockwise order starting from the
// top-left corner. For example, for a rectangular bounding box:
// When the text is horizontal it might look like:
//         0----1
//         |    |
//         3----2
//
// When it's clockwise-rotated 180 degrees around the top-left corner it
// becomes:
//         2----3
//         |    |
//         1----0
//
// and the vertex order will still be (0, 1, 2, 3). Note that values can be
// less than 0, or greater than 1, due to trigonometric calculations for the
// location of the box.
message NormalizedBoundingPoly {
  // Normalized vertices of the bounding polygon.
  repeated NormalizedVertex vertices = 1;
}
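
// Example: the horizontal text box from the first diagram above, sketched in
// textproto form with illustrative coordinates. The vertices appear in the
// (0, 1, 2, 3) order, clockwise from the top-left corner:
//
//   vertices { x: 0.1 y: 0.1 }
//   vertices { x: 0.9 y: 0.1 }
//   vertices { x: 0.9 y: 0.2 }
//   vertices { x: 0.1 y: 0.2 }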

// Video segment level annotation results for text detection.
message TextSegment {
  // Video segment where a text snippet was detected.
  VideoSegment segment = 1;

  // Confidence for the track of detected text. It is calculated as the
  // highest over all frames where OCR detected text appears.
  float confidence = 2;

  // Information related to the frames where OCR detected text appears.
  repeated TextFrame frames = 3;
}

// Video frame level annotation results for text annotation (OCR).
// Contains information regarding timestamp and bounding box locations for the
// frames containing detected OCR text snippets.
message TextFrame {
  // Bounding polygon of the detected text for this frame.
  NormalizedBoundingPoly rotated_bounding_box = 1;

  // Timestamp of this frame.
  google.protobuf.Duration time_offset = 2;
}

// Annotations related to one detected OCR text snippet. This will contain the
// corresponding text, confidence value, and frame level information for each
// detection.
message TextAnnotation {
  // The detected text.
  string text = 1;

  // All video segments where OCR detected text appears.
  repeated TextSegment segments = 2;
}

// Video frame level annotations for object detection and tracking. This field
// stores per frame location, time offset, and confidence.
message ObjectTrackingFrame {
  // The normalized bounding box location of this object track for the frame.
  NormalizedBoundingBox normalized_bounding_box = 1;

  // The time-offset of the frame, relative to the beginning of the video.
  google.protobuf.Duration time_offset = 2;
}

// Annotations corresponding to one tracked object.
message ObjectTrackingAnnotation {
  // Entity to specify the object category that this track is labeled as.
  Entity entity = 1;

  // Object category's labeling confidence of this track.
  float confidence = 4;

  // Information corresponding to all frames where this object track appears.
  // Non-streaming batch mode: there may be one or more ObjectTrackingFrame
  // messages in `frames`.
  // Streaming mode: `frames` can contain only one ObjectTrackingFrame
  // message.
  repeated ObjectTrackingFrame frames = 2;

  // Different representation of tracking info in non-streaming batch
  // and streaming modes.
  oneof track_info {
    // Non-streaming batch mode ONLY.
    // Each object track corresponds to one video segment where it appears.
    VideoSegment segment = 3;

    // Streaming mode ONLY.
    // In streaming mode, we do not know the end time of a tracked object
    // before it is completed. Hence, there is no VideoSegment info returned.
    // Instead, we provide a unique, identifiable integer `track_id` so that
    // customers can correlate the results of the ongoing
    // ObjectTrackingAnnotation of the same `track_id` over time.
    int64 track_id = 5;
  }
}

// Annotation corresponding to one detected, tracked, and recognized logo
// class.
message LogoRecognitionAnnotation {
  // Entity category information to specify the logo class that all the logo
  // tracks within this LogoRecognitionAnnotation are recognized as.
  Entity entity = 1;

  // All logo tracks where the recognized logo appears. Each track corresponds
  // to one logo instance appearing in consecutive frames.
  repeated Track tracks = 2;

  // All video segments where the recognized logo appears. There might be
  // multiple instances of the same logo class appearing in one VideoSegment.
  repeated VideoSegment segments = 3;
}

// The top-level message sent by the client for the `StreamingAnnotateVideo`
// method. Multiple `StreamingAnnotateVideoRequest` messages are sent.
// The first message must only contain a `StreamingVideoConfig` message.
// All subsequent messages must only contain `input_content` data.
message StreamingAnnotateVideoRequest {
  // *Required* The streaming request, which is either a streaming config or
  // video content.
  oneof streaming_request {
    // Provides information to the annotator, specifying how to process the
    // request. The first `StreamingAnnotateVideoRequest` message must only
    // contain a `video_config` message.
    StreamingVideoConfig video_config = 1;

    // The video data to be annotated. Chunks of video data are sequentially
    // sent in `StreamingAnnotateVideoRequest` messages. Except for the
    // initial `StreamingAnnotateVideoRequest` message containing only
    // `video_config`, all subsequent `StreamingAnnotateVideoRequest` messages
    // must only contain the `input_content` field.
    // Note: as with all bytes fields, protocol buffers use a pure binary
    // representation (not base64).
    bytes input_content = 2;
  }
}
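
// Example: the shape of the request stream, sketched as two textproto
// messages; the feature choice is illustrative and the byte content is a
// placeholder, not real media data:
//
//   # 1st message: configuration only.
//   video_config {
//     feature: STREAMING_LABEL_DETECTION
//     label_detection_config { stationary_camera: false }
//   }
//
//   # 2nd and later messages: raw media bytes only.
//   input_content: "<chunk of video bytes>"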

// `StreamingAnnotateVideoResponse` is the only message returned to the client
// by `StreamingAnnotateVideo`. A series of zero or more
// `StreamingAnnotateVideoResponse` messages are streamed back to the client.
message StreamingAnnotateVideoResponse {
  // If set, returns a [google.rpc.Status][google.rpc.Status] message that
  // specifies the error for the operation.
  google.rpc.Status error = 1;

  // Streaming annotation results.
  StreamingVideoAnnotationResults annotation_results = 2;

  // GCS URI that stores annotation results of one streaming session.
  // It is a directory that can hold multiple files in JSON format.
  // Example URI format:
  // gs://bucket_id/object_id/cloud_project_name-session_id
  string annotation_results_uri = 3;
}

// Config for AUTOML_CLASSIFICATION in streaming mode.
message StreamingAutomlClassificationConfig {
  // Resource name of AutoML model.
  // Format: `projects/{project_id}/locations/{location_id}/models/{model_id}`
  string model_name = 1;
}

// Config for AUTOML_OBJECT_TRACKING in streaming mode.
message StreamingAutomlObjectTrackingConfig {
  // Resource name of AutoML model.
  // Format: `projects/{project_id}/locations/{location_id}/models/{model_id}`
  string model_name = 1;
}

// Config for EXPLICIT_CONTENT_DETECTION in streaming mode.
message StreamingExplicitContentDetectionConfig {
  // No customized config support.
}

// Config for LABEL_DETECTION in streaming mode.
message StreamingLabelDetectionConfig {
  // Whether the video has been captured from a stationary (i.e. non-moving)
  // camera. When set to true, might improve detection accuracy for moving
  // objects. Default: false.
  bool stationary_camera = 1;
}

// Config for OBJECT_TRACKING in streaming mode.
message StreamingObjectTrackingConfig {
  // No customized config support.
}

// Config for SHOT_CHANGE_DETECTION in streaming mode.
message StreamingShotChangeDetectionConfig {
  // No customized config support.
}

// Config for the streaming storage option.
message StreamingStorageConfig {
  // Enable streaming storage. Default: false.
  bool enable_storage_annotation_result = 1;

  // GCS URI to store all annotation results for one client. The client should
  // specify this field as the top-level storage directory. Annotation results
  // of different sessions will be put into different sub-directories denoted
  // by project_name and session_id. All sub-directories will be
  // auto-generated and made accessible to the client in the response proto.
  // URIs must be specified in the following format:
  // `gs://bucket-id/object-id`. `bucket-id` should be a valid GCS bucket
  // created by the client, with bucket permissions configured properly.
  // `object-id` can be an arbitrary string that makes sense to the client.
  // Other URI formats will return an error and cause the GCS write to fail.
  string annotation_result_storage_directory = 3;
}
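
// Example: a `StreamingStorageConfig` that persists results, sketched in
// textproto form; the bucket and directory names are illustrative only:
//
//   enable_storage_annotation_result: true
//   annotation_result_storage_directory: "gs://my-bucket/streaming-results"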

// Streaming annotation results corresponding to a portion of the video
// that is currently being processed.
message StreamingVideoAnnotationResults {
  // Shot annotation results. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 1;

  // Label annotation results.
  repeated LabelAnnotation label_annotations = 2;

  // Explicit content annotation results.
  ExplicitContentAnnotation explicit_annotation = 3;

  // Object tracking results.
  repeated ObjectTrackingAnnotation object_annotations = 4;
}

// Provides information to the annotator that specifies how to process the
// request.
message StreamingVideoConfig {
  // Requested annotation feature.
  StreamingFeature feature = 1;

  // Config for the requested annotation feature.
  oneof streaming_config {
    // Config for STREAMING_SHOT_CHANGE_DETECTION.
    StreamingShotChangeDetectionConfig shot_change_detection_config = 2;

    // Config for STREAMING_LABEL_DETECTION.
    StreamingLabelDetectionConfig label_detection_config = 3;

    // Config for STREAMING_EXPLICIT_CONTENT_DETECTION.
    StreamingExplicitContentDetectionConfig explicit_content_detection_config =
        4;

    // Config for STREAMING_OBJECT_TRACKING.
    StreamingObjectTrackingConfig object_tracking_config = 5;

    // Config for STREAMING_AUTOML_CLASSIFICATION.
    StreamingAutomlClassificationConfig automl_classification_config = 21;

    // Config for STREAMING_AUTOML_OBJECT_TRACKING.
    StreamingAutomlObjectTrackingConfig automl_object_tracking_config = 22;
  }

  // Streaming storage option. By default, storage is disabled.
  StreamingStorageConfig storage_config = 30;
}
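
// Example: a complete `StreamingVideoConfig` for shot change detection with
// result storage enabled, sketched in textproto form; the bucket name is
// illustrative only:
//
//   feature: STREAMING_SHOT_CHANGE_DETECTION
//   shot_change_detection_config {}
//   storage_config {
//     enable_storage_annotation_result: true
//     annotation_result_storage_directory: "gs://my-bucket/streaming-results"
//   }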

// Video annotation feature.
enum Feature {
  // Unspecified.
  FEATURE_UNSPECIFIED = 0;

  // Label detection. Detect objects, such as dog or flower.
  LABEL_DETECTION = 1;

  // Shot change detection.
  SHOT_CHANGE_DETECTION = 2;

  // Explicit content detection.
  EXPLICIT_CONTENT_DETECTION = 3;

  // Speech transcription.
  SPEECH_TRANSCRIPTION = 6;

  // OCR text detection and tracking.
  TEXT_DETECTION = 7;

  // Object detection and tracking.
  OBJECT_TRACKING = 9;

  // Logo detection, tracking, and recognition.
  LOGO_RECOGNITION = 12;
}

// Label detection mode.
enum LabelDetectionMode {
  // Unspecified.
  LABEL_DETECTION_MODE_UNSPECIFIED = 0;

  // Detect shot-level labels.
  SHOT_MODE = 1;

  // Detect frame-level labels.
  FRAME_MODE = 2;

  // Detect both shot-level and frame-level labels.
  SHOT_AND_FRAME_MODE = 3;
}

// Bucketized representation of likelihood.
enum Likelihood {
  // Unspecified likelihood.
  LIKELIHOOD_UNSPECIFIED = 0;

  // Very unlikely.
  VERY_UNLIKELY = 1;

  // Unlikely.
  UNLIKELY = 2;

  // Possible.
  POSSIBLE = 3;

  // Likely.
  LIKELY = 4;

  // Very likely.
  VERY_LIKELY = 5;
}

// Streaming video annotation feature.
enum StreamingFeature {
  // Unspecified.
  STREAMING_FEATURE_UNSPECIFIED = 0;

  // Label detection. Detect objects, such as dog or flower.
  STREAMING_LABEL_DETECTION = 1;

  // Shot change detection.
  STREAMING_SHOT_CHANGE_DETECTION = 2;

  // Explicit content detection.
  STREAMING_EXPLICIT_CONTENT_DETECTION = 3;

  // Object detection and tracking.
  STREAMING_OBJECT_TRACKING = 4;

  // Video classification based on AutoML model.
  STREAMING_AUTOML_CLASSIFICATION = 21;

  // Object detection and tracking based on AutoML model.
  STREAMING_AUTOML_OBJECT_TRACKING = 22;
}