GoogleVoiceStreaming.cs (file created)
using Google.Apis.Auth.OAuth2;
using Google.Cloud.Speech.V1;
using Google.Protobuf.Collections;
using Grpc.Auth;
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;
using UnityEngine;
using UnityEngine.Events;
#if PLATFORM_ANDROID
using UnityEngine.Android;
#endif

public class GoogleVoiceStreaming : MonoBehaviour
{
    [Serializable]
    public class TranscriptEvent : UnityEvent<StreamingRecognitionResult> { }
    public TranscriptEvent OnResultEvent;
    public TranscriptEvent OnFinalResultEvent;

    float SILENCE_THRESHOLD = 0.01f;
    AudioClip _audioClip = null;
    bool _recording = false;
    int _recordingHZ = 22050;
    bool _stream_started = false;
    Task ListenTask;
    float t = 0;

    RecognitionConfig _config;
    SpeechClient _speech;
    SpeechClient.StreamingRecognizeStream _streamingCall;
#if PLATFORM_ANDROID
    GameObject dialog = null;
#endif

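    // Complete the gRPC stream on teardown so the service sees a clean end-of-stream.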
    ~GoogleVoiceStreaming()
    {
        if (_stream_started)
        {
            _streamingCall.WriteCompleteAsync().Wait();
        }
    }

    public void OnDestroy()
    {
        if (_stream_started)
        {
            _streamingCall.WriteCompleteAsync().Wait();
        }
    }

    void OnDisable()
    {
        Debug.Log("Disabled");
        if (_stream_started)
        {
            EndRecording();
        }
    }

    void OnActive()
    {
    }

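    // Recognition is toggled through Active: setting it true opens the streaming
    // call and starts the microphone coroutine; setting it false completes the
    // stream. StartActive controls whether recognition begins on Start().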
    public bool StartActive = false;
    public bool _active = false;
    public bool Active
    {
        set
        {
            if (value && !_recording)
            {
                _active = true;
                BeginRecognition();
            }
            else if (!value && _recording)
            {
                _active = false;
                EndRecording();
            }
        }
    }

    public void Start()
    {
#if PLATFORM_ANDROID
        if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
        {
            Permission.RequestUserPermission(Permission.Microphone);
            dialog = new GameObject();
        }
#endif
        if (_speech == null)
        {
            CreateChannel();
            Active = StartActive;
        }

        Debug.Log(_speech);
    }

    public void Update()
    {
        // Track how long the stream has been running; if the run time exceeds
        // 340 s, restart the stream.
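        // (Streaming recognition sessions are time-limited on the server side,
        // roughly five minutes, so long-lived streams are restarted proactively
        // before the service closes them.)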
        if (_recording)
        {
            t += Time.deltaTime;
            if (t > 340)
            {
                EndRecording();
                BeginRecognition();
                t = 0;
            }
        }
    }

    void OnGUI()
    {
#if PLATFORM_ANDROID
        if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
        {
            // The user denied permission to use the microphone.
            // Display a message explaining why you need it with Yes/No buttons.
            // If the user says yes then present the request again.
            // Display a dialog here.
            dialog.AddComponent<PermissionsRationaleDialog>();
            return;
        }
        else if (dialog != null)
        {
            Destroy(dialog);
        }
#endif

        // Now you can do things with the microphone.
    }

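    // Creates the credentials, gRPC channel, and SpeechClient. This assumes a
    // service-account JSON key bundled as a TextAsset named "key" in a
    // Resources folder (for example Assets/Resources/key.json); adjust the
    // asset name to match your project. Note that SampleRateHertz must match
    // _recordingHZ.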
    public void CreateChannel()
    {
        var key = Resources.Load<TextAsset>("key");
        _config = new RecognitionConfig()
        {
            Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
            SampleRateHertz = 22050,
            // BCP-47 language tag; the API expects a region-qualified code
            // such as "en-US" rather than a bare "en".
            LanguageCode = "en-US",
        };

        var credentials = GoogleCredential.FromJson(key.text)
            .CreateScoped();
        var channel = new Grpc.Core.Channel(
            SpeechClient.DefaultEndpoint.Host,
            credentials.ToChannelCredentials()
        );
        _speech = SpeechClient.Create(channel);
    }

    public void BeginRecognition()
    {
        if (_speech == null)
        {
            CreateChannel();
        }
        // Open the streaming call (shared with the silence-restart path below).
        StartStream();
        _recording = true;
        StartCoroutine(AudioListener());
    }

    // Ends the recording of audio and completes the streaming call.
    public void EndRecording()
    {
        if (_stream_started)
        {
            _streamingCall.WriteCompleteAsync();
            _stream_started = false;
        }
        _recording = false;
        // Stop the capture device so it is not left running between sessions.
        Microphone.End(null);
    }

    /// <summary>
    /// This method provides the coroutine needed for recording audio from the
    /// microphone to a buffer that can be sent to Google Speech. Since it uses
    /// a ring buffer, once the buffer is full it loops to the start and
    /// overwrites old data. The buffer is streamed in chunks of half its
    /// length, which reduces recording latency.
    /// </summary>
    /// <returns></returns>
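    /// <remarks>With a one-second clip at 22050 Hz, each half-buffer chunk is
    /// 11025 samples, roughly half a second of audio.</remarks>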
    IEnumerator AudioListener()
    {
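        // Begin looping capture into a one-second clip; with loop = true Unity
        // keeps writing into this clip as a ring buffer.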
        _audioClip = Microphone.Start(null, true, 1, _recordingHZ);

        yield return null;

        if (_audioClip == null)
        {
            yield break;
        }

        float[] samples = null;
        bool inFirstHalf = true;
        int midPoint = _audioClip.samples / 2;

        while (_recording && _audioClip != null)
        {
            int writePos = Microphone.GetPosition(null);

            if (writePos > _audioClip.samples || !Microphone.IsRecording(null))
            {
                Debug.Log("Failed to get Microphone");
                yield break;
            }
            // Check if we were in the first half but have crossed the mid point
            // (upper half full), or if we were in the last half but have wrapped
            // to the start (lower half full).
            if ((inFirstHalf && writePos >= midPoint) ||
                (!inFirstHalf && writePos < midPoint))
            {
                samples = new float[midPoint];
                _audioClip.GetData(samples, inFirstHalf ? 0 : midPoint);

                SendAudio(samples);
                inFirstHalf = !inFirstHalf;
            }
            else
            {
                // Else wait the amount of time needed to fill the current half.
                int remaining = inFirstHalf ? (midPoint - writePos) : (_audioClip.samples - writePos);
                float timeRemaining = remaining / (float)_recordingHZ;
                yield return new WaitForSeconds(timeRemaining);
            }
        }
    }

    /// <summary>
    /// This function sends the audio to Google. However, if the audio is below
    /// SILENCE_THRESHOLD then the audio is not sent.
    /// </summary>
    /// <param name="samples"></param>
    public void SendAudio(float[] samples)
    {
        // If the samples are all below SILENCE_THRESHOLD, the audio is not sent
        // to Google for processing, and the streaming call is closed. This is
        // done to prevent the Google Cloud Speech API from crashing the program:
        // the API expects a continuous stream of real-time audio, and not
        // sending audio is considered an error. Therefore, the stream needs to
        // be closed and later restarted.
        if (DetectSilence(samples, SILENCE_THRESHOLD))
        {
            if (_stream_started)
            {
                _streamingCall.WriteCompleteAsync().Wait();
                ListenTask.Wait();
                _stream_started = false;
            }
        }
        else
        {
            // Start the stream if it has been closed.
            if (!_stream_started)
            {
                StartStream();
            }
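            // AudioUtils.FloatsToLinear16 is a project-local helper (not shown
            // in this file); presumably it converts the [-1, 1] float samples
            // into 16-bit little-endian PCM, matching the Linear16 encoding
            // declared in _config.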
            var bytes = AudioUtils.FloatsToLinear16(samples);
            _streamingCall.WriteAsync(
                new StreamingRecognizeRequest()
                {
                    AudioContent = Google.Protobuf.ByteString
                        .CopyFrom(bytes, 0, bytes.Length)
                });
        }
    }

    /// <summary>
    /// This function starts the streaming recognizer. It blocks until the
    /// connection to Google's API has been established.
    /// </summary>
    private void StartStream()
    {
        _streamingCall = _speech.StreamingRecognize();
        _streamingCall.WriteAsync(
            new StreamingRecognizeRequest()
            {
                StreamingConfig = new StreamingRecognitionConfig()
                {
                    Config = _config,
                    InterimResults = true,
                }
            }).Wait();
        _stream_started = true;
        // Run the response reader on the thread pool; ListenTask completes once
        // the response stream has been fully drained, so ListenTask.Wait() in
        // SendAudio actually waits for the final transcripts.
        ListenTask = Task.Run(() => listen());
    }

    /// <summary>
    /// To avoid making wasted calls to Google's Speech API, this function is
    /// used to detect whether the audio is quieter than the threshold.
    /// </summary>
    /// <param name="samples">The buffer of audio samples as floating points</param>
    /// <param name="threshold">The threshold below which audio is treated as silence</param>
    /// <returns>True if the peak level of the buffer is below the threshold</returns>
    static private bool DetectSilence(float[] samples, float threshold)
    {
        float maxLevel = Mathf.Max(
            Mathf.Abs(Mathf.Min(samples)),
            Mathf.Abs(Mathf.Max(samples))
        );
        return maxLevel < threshold;
    }

    /// <summary>
    /// This function runs in the background to read responses from the
    /// ResponseStream. It must run asynchronously, otherwise an error will be
    /// raised.
    ///
    /// This function invokes two events: one for a partial result and one for
    /// a final result. Callbacks can be attached to these events to react to
    /// incoming transcripts.
    /// </summary>
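    /// <remarks>The events are invoked on a thread-pool thread, not Unity's
    /// main thread; listeners must marshal back to the main thread before
    /// touching Unity objects.</remarks>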
    private async Task listen()
    {
        while (await _streamingCall.ResponseStream.MoveNext(default))
        {
            foreach (var result in _streamingCall.ResponseStream.Current.Results)
            {
                OnResultEvent.Invoke(result);
                if (result.IsFinal)
                {
                    OnFinalResultEvent.Invoke(result);
                }
            }
        }
    }

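    // Suspend recognition while the application is paused and restore the
    // previous state on resume.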
    private bool resumeWhenUnpaused = false;
    private void OnApplicationPause(bool pause)
    {
        if (pause)
        {
            resumeWhenUnpaused = _active;
            Active = false;
        }
        else
        {
            Active = resumeWhenUnpaused;
        }
    }
}
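
// Illustrative sketch (not part of the original file): a minimal consumer that
// logs transcripts. "TranscriptLogger" is a hypothetical name; attach it to the
// same GameObject as GoogleVoiceStreaming, or wire the events in the Inspector
// instead of calling AddListener.
public class TranscriptLogger : MonoBehaviour
{
    void Start()
    {
        var streaming = GetComponent<GoogleVoiceStreaming>();
        // Log only finalized transcripts; use OnResultEvent for interim ones.
        streaming.OnFinalResultEvent.AddListener(result =>
            Debug.Log(result.Alternatives[0].Transcript));
    }
}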