|
| 1 | +// Copyright 2018 Google Inc. |
| 2 | +// |
| 3 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +// you may not use this file except in compliance with the License. |
| 5 | +// You may obtain a copy of the License at |
| 6 | +// |
| 7 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +// |
| 9 | +// Unless required by applicable law or agreed to in writing, software |
| 10 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +// See the License for the specific language governing permissions and |
| 13 | +// limitations under the License. |
| 14 | + |
| 15 | +syntax = "proto3"; |
| 16 | + |
| 17 | +package google.cloud.texttospeech.v1beta1; |
| 18 | + |
| 19 | +import "google/api/annotations.proto"; |
| 20 | + |
| 21 | +option cc_enable_arenas = true; |
| 22 | +option go_package = "google.golang.org/genproto/googleapis/cloud/texttospeech/v1beta1;texttospeech"; |
| 23 | +option java_multiple_files = true; |
| 24 | +option java_outer_classname = "TextToSpeechProto"; |
| 25 | +option java_package = "com.google.cloud.texttospeech.v1beta1"; |
| 26 | + |
| 27 | + |
| 28 | +// Service that implements the Google Cloud Text-to-Speech API. |
| 29 | +service TextToSpeech { |
| 30 | + // Returns the list of [Voice][google.cloud.texttospeech.v1beta1.Voice] |
| 31 | + // entries supported for synthesis. Bound to HTTP GET /v1beta1/voices. |
| 32 | + rpc ListVoices(ListVoicesRequest) returns (ListVoicesResponse) { |
| 33 | + option (google.api.http) = { |
| 34 | + get: "/v1beta1/voices" |
| 35 | + }; |
| 36 | + } |
| 37 | + |
| 38 | + // Synthesizes speech synchronously: the response is returned only after all |
| 39 | + // text input has been processed. Bound to HTTP POST /v1beta1/text:synthesize. |
| 40 | + rpc SynthesizeSpeech(SynthesizeSpeechRequest) returns (SynthesizeSpeechResponse) { |
| 41 | + option (google.api.http) = { |
| 42 | + post: "/v1beta1/text:synthesize" |
| 43 | + body: "*" |
| 44 | + }; |
| 45 | + } |
| 46 | +} |
| 47 | + |
| 48 | +// The top-level message sent by the client for the `ListVoices` method. |
| 49 | +message ListVoicesRequest { |
| 50 | + // Optional (but recommended) |
| 51 | + // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. If |
| 52 | + // specified, the `ListVoices` call returns only voices that can be used to |
| 53 | + // synthesize this language_code. For example: specifying "en-NZ" returns the |
| 54 | + // supported "en-*" voices; specifying "no" returns the supported "no-*" |
| 55 | + // (Norwegian) and "nb-*" (Norwegian Bokmal) voices; specifying "zh" also |
| 56 | + // returns the supported "cmn-*" voices; and specifying "zh-hk" also returns |
| 57 | + // the supported "yue-*" voices. |
| 58 | + string language_code = 1; |
| 59 | +} |
| 60 | + |
| 61 | +// The message returned to the client by the `ListVoices` method. |
| 62 | +message ListVoicesResponse { |
| 63 | + // The list of voices. |
| 64 | + repeated Voice voices = 1; |
| 65 | +} |
| 66 | + |
| 67 | +// Description of a voice supported by the TTS service. |
| 68 | +message Voice { |
| 69 | + // The languages that this voice supports, expressed as |
| 70 | + // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags (e.g. |
| 71 | + // "en-US", "es-419", "cmn-tw"). |
| 72 | + repeated string language_codes = 1; |
| 73 | + |
| 74 | + // The name of this voice. Each distinct voice has a unique name. |
| 75 | + string name = 2; |
| 76 | + |
| 77 | + // The gender of this voice. |
| 78 | + SsmlVoiceGender ssml_gender = 3; |
| 79 | + |
| 80 | + // The natural sample rate (in hertz) for this voice. |
| 81 | + int32 natural_sample_rate_hertz = 4; |
| 82 | +} |
| 83 | + |
| 84 | +// The top-level message sent by the client for the `SynthesizeSpeech` method. |
| 85 | +message SynthesizeSpeechRequest { |
| 86 | + // Required. The input to synthesize, supplied as either plain text or SSML. |
| 87 | + SynthesisInput input = 1; |
| 88 | + |
| 89 | + // Required. The desired voice of the synthesized audio. |
| 90 | + VoiceSelectionParams voice = 2; |
| 91 | + |
| 92 | + // Required. The configuration of the synthesized audio. |
| 93 | + AudioConfig audio_config = 3; |
| 94 | +} |
| 95 | + |
| 96 | +// Contains the text input to be synthesized. Exactly one of `text` or `ssml` |
| 97 | +// must be supplied; supplying both or neither returns |
| 98 | +// [google.rpc.Code.INVALID_ARGUMENT][]. The input size is limited to 5000 |
| 99 | +// characters. |
| 100 | +message SynthesisInput { |
| 101 | + // The input source, which is either plain text or SSML. |
| 102 | + oneof input_source { |
| 103 | + // The raw (plain) text to be synthesized. |
| 104 | + string text = 1; |
| 105 | + |
| 106 | + // The SSML document to be synthesized. The SSML document must be valid |
| 107 | + // and well-formed; otherwise the RPC fails and returns |
| 108 | + // [google.rpc.Code.INVALID_ARGUMENT][]. For more information, see |
| 109 | + // [SSML](/speech/text-to-speech/docs/ssml). |
| 110 | + string ssml = 2; |
| 111 | + } |
| 112 | +} |
| 113 | + |
| 114 | +// Description of which voice to use for a synthesis request. |
| 115 | +message VoiceSelectionParams { |
| 116 | + // Required. The language (and optionally also the region) of the voice, |
| 117 | + // expressed as a [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) |
| 118 | + // language tag, e.g. "en-US". This should not include a script tag (e.g. use |
| 119 | + // "cmn-cn" rather than "cmn-Hant-cn"), because the script will be inferred |
| 120 | + // from the input provided in the SynthesisInput. The TTS service |
| 121 | + // will use this parameter to help choose an appropriate voice. Note that |
| 122 | + // the TTS service may choose a voice with a slightly different language code |
| 123 | + // than the one selected; it may substitute a different region |
| 124 | + // (e.g. using "en-US" rather than "en-CA" if there isn't a Canadian voice |
| 125 | + // available), or even a different language, e.g. using "nb" (Norwegian |
| 126 | + // Bokmal) instead of "no" (Norwegian). |
| 127 | + string language_code = 1; |
| 128 | + |
| 129 | + // Optional. The name of the voice. If not set, the service will choose a |
| 130 | + // voice based on the other parameters such as language_code and gender. |
| 131 | + string name = 2; |
| 132 | + |
| 133 | + // Optional. The preferred gender of the voice. If not set, the service will |
| 134 | + // choose a voice based on the other parameters such as language_code and |
| 135 | + // name. Note that this is only a preference, not a requirement; if a |
| 136 | + // voice of the appropriate gender is not available, the synthesizer should |
| 137 | + // substitute a voice with a different gender rather than failing the request. |
| 138 | + SsmlVoiceGender ssml_gender = 3; |
| 139 | +} |
| 140 | + |
| 141 | +// Description of audio data to be synthesized. |
| 142 | +message AudioConfig { |
| 143 | + // Required. The format of the requested audio byte stream. |
| 144 | + AudioEncoding audio_encoding = 1; |
| 145 | + |
| 146 | + // Optional. Speaking rate/speed, in the range [0.25, 4.0]. 1.0 is the normal |
| 147 | + // native speed supported by the specific voice; 2.0 is twice as fast, and |
| 148 | + // 0.5 is half as fast. If unset (0.0), defaults to the native 1.0 speed. Any |
| 149 | + // other value < 0.25 or > 4.0 will return an error. |
| 150 | + double speaking_rate = 2; |
| 151 | + |
| 152 | + // Optional. Speaking pitch, in the range [-20.0, 20.0]. 20 means an increase |
| 153 | + // of 20 semitones from the original pitch; -20 means a decrease of 20 |
| 154 | + // semitones from the original pitch. |
| 155 | + double pitch = 3; |
| 156 | + |
| 157 | + // Optional. Volume gain (in dB) of the normal native volume supported by the |
| 158 | + // specific voice, in the range [-96.0, 16.0]. If unset, or set to a value of |
| 159 | + // 0.0 (dB), will play at normal native signal amplitude. A value of -6.0 (dB) |
| 160 | + // will play at approximately half the amplitude of the normal native signal |
| 161 | + // amplitude. A value of +6.0 (dB) will play at approximately twice the |
| 162 | + // amplitude of the normal native signal amplitude. It is strongly recommended |
| 163 | + // not to exceed +10 (dB), as there is usually no effective increase in |
| 164 | + // loudness for any value greater than that. |
| 165 | + double volume_gain_db = 4; |
| 166 | + |
| 167 | + // Optional. The synthesis sample rate (in hertz) for this audio. If this is |
| 168 | + // different from the voice's natural sample rate, then the synthesizer will |
| 169 | + // honor this request by converting to the desired sample rate (which might |
| 170 | + // result in worse audio quality), unless the specified sample rate is not |
| 171 | + // supported for the encoding chosen, in which case it will fail the request |
| 172 | + // and return [google.rpc.Code.INVALID_ARGUMENT][]. |
| 173 | + int32 sample_rate_hertz = 5; |
| 174 | + |
| 175 | + // Identifiers that select the 'audio effects' profiles applied to the |
| 176 | + // (post-synthesized) text-to-speech audio. |
| 177 | + // Effects are applied on top of each other in the order they are given. |
| 178 | + repeated string effects_profile_id = 6; |
| 179 | +} |
| 180 | + |
| 181 | +// The message returned to the client by the `SynthesizeSpeech` method. |
| 182 | +message SynthesizeSpeechResponse { |
| 183 | + // The audio data bytes encoded as specified in the request, including the |
| 184 | + // header (for LINEAR16 audio, the WAV header is included). Note: as |
| 185 | + // with all bytes fields, protocol buffers use a pure binary representation, |
| 186 | + // whereas JSON representations use base64. |
| 187 | + bytes audio_content = 1; |
| 188 | +} |
| 189 | + |
| 190 | +// Gender of the voice, as described in the |
| 191 | +// [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice). |
| 192 | +enum SsmlVoiceGender { |
| 193 | + // An unspecified gender. |
| 194 | + // In VoiceSelectionParams, this means that the client doesn't care which |
| 195 | + // gender the selected voice will have. In the Voice entries of a |
| 196 | + // ListVoicesResponse, this may mean that the voice doesn't fit any of the |
| 197 | + // other categories in this enum, or that the gender of the voice isn't known. |
| 198 | + SSML_VOICE_GENDER_UNSPECIFIED = 0; |
| 199 | + |
| 200 | + // A male voice. |
| 201 | + MALE = 1; |
| 202 | + |
| 203 | + // A female voice. |
| 204 | + FEMALE = 2; |
| 205 | + |
| 206 | + // A gender-neutral voice. |
| 207 | + NEUTRAL = 3; |
| 208 | +} |
| 209 | + |
| 210 | +// Configuration to set up the audio encoder. The encoding determines the |
| 211 | +// format of the output audio. |
| 212 | +enum AudioEncoding { |
| 213 | + // Not specified. Results in a [google.rpc.Code.INVALID_ARGUMENT][] error. |
| 214 | + AUDIO_ENCODING_UNSPECIFIED = 0; |
| 215 | + |
| 216 | + // Uncompressed 16-bit signed little-endian samples (Linear PCM). |
| 217 | + // Audio content returned as LINEAR16 also contains a WAV header. |
| 218 | + LINEAR16 = 1; |
| 219 | + |
| 220 | + // MP3 audio. |
| 221 | + MP3 = 2; |
| 222 | + |
| 223 | + // Opus-encoded audio wrapped in an Ogg container. The result will be a |
| 224 | + // file which can be played natively on Android, and in browsers (at least |
| 225 | + // Chrome and Firefox). The quality of the encoding is considerably higher |
| 226 | + // than MP3 while using approximately the same bitrate. |
| 227 | + OGG_OPUS = 3; |
| 228 | +} |
0 commit comments