Last active 1747790553

Revision ed90540e81a0be4fce3a91e74a1ad35868eb1112

GoogleVoiceStreaming.cs
using Google.Apis.Auth.OAuth2;
using Google.Cloud.Speech.V1;
using Google.Protobuf.Collections;
using Grpc.Auth;
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;
using UnityEngine;
using UnityEngine.Events;
#if PLATFORM_ANDROID
using UnityEngine.Android;
#endif

public class GoogleVoiceStreaming : MonoBehaviour
{
    [Serializable]
    public class TranscriptEvent : UnityEvent<StreamingRecognitionResult> { }
    public TranscriptEvent OnResultEvent;
    public TranscriptEvent OnFinalResultEvent;

    float SILENCE_THRESHOLD = 0.01f;
    AudioClip _audioClip = null;
    bool _recording = false;
    int _recordingHZ = 22050;
    bool _stream_started = false;
    Task ListenTask;
    float t = 0; // Seconds the current stream has been open; see Update().

    RecognitionConfig _config;
    SpeechClient _speech;
    SpeechClient.StreamingRecognizeStream _streamingCall;
#if PLATFORM_ANDROID
    GameObject dialog = null;
#endif

    // Close the stream if the object is finalized while a stream is still open.
    ~GoogleVoiceStreaming()
    {
        if (_stream_started)
        {
            _streamingCall.WriteCompleteAsync().Wait();
        }
    }

    public void OnDestroy()
    {
        if (_stream_started)
        {
            _streamingCall.WriteCompleteAsync().Wait();
        }
    }

    void OnDisable()
    {
        Debug.Log("Disabled");
        if (_stream_started)
        {
            EndRecording();
        }
    }

    void OnActive()
    {
    }

    public bool StartActive = false;
    public bool _active = false;
    public bool Active
    {
        set
        {
            if (value && !_recording)
            {
                _active = true;
                BeginRecognition();
            }
            else if (!value && _recording)
            {
                _active = false;
                EndRecording();
            }
        }
    }
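
    // Example (hypothetical) usage from another component: attach a listener to the
    // transcript events, then enable recognition through the Active property.
    //
    //     var stt = GetComponent<GoogleVoiceStreaming>();
    //     stt.OnFinalResultEvent.AddListener(
    //         r => Debug.Log(r.Alternatives[0].Transcript));
    //     stt.Active = true;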

    public void Start()
    {
#if PLATFORM_ANDROID
        if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
        {
            Permission.RequestUserPermission(Permission.Microphone);
            dialog = new GameObject();
        }
#endif
        if (_speech == null)
        {
            CreateChannel();
            Active = StartActive;
        }

        Debug.Log(_speech);
    }

    public void Update()
    {
        // Track how long the stream has been running;
        // if the run time exceeds 340 s, restart the stream.
        if (_recording)
        {
            t += Time.deltaTime;
            if (t > 340)
            {
                EndRecording();
                BeginRecognition();
                t = 0;
            }
        }
    }

    void OnGUI()
    {
#if PLATFORM_ANDROID
        if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
        {
            // The user denied permission to use the microphone.
            // Display a dialog explaining why it is needed, with a button to
            // present the request again (see the hypothetical
            // PermissionsRationaleDialog sketch after this class).
            dialog.AddComponent<PermissionsRationaleDialog>();
            return;
        }
        else if (dialog != null)
        {
            Destroy(dialog);
        }
#endif

        // Now you can do things with the microphone.
    }

    public void CreateChannel()
    {
        // Loads a service-account JSON key stored as the TextAsset "key"
        // (e.g. Assets/Resources/key.json).
        var key = Resources.Load<TextAsset>("key");
        _config = new RecognitionConfig()
        {
            Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
            SampleRateHertz = 22050,
            LanguageCode = "en",
        };

        var credentials = GoogleCredential.FromJson(key.text)
            .CreateScoped();
        var channel = new Grpc.Core.Channel(
            SpeechClient.DefaultEndpoint.Host,
            credentials.ToChannelCredentials()
        );
        _speech = SpeechClient.Create(channel);
    }

    public void BeginRecognition()
    {
        if (_speech == null)
        {
            CreateChannel();
        }
        _streamingCall = _speech.StreamingRecognize();
        // The first request on the stream carries the configuration only.
        _streamingCall.WriteAsync(
            new StreamingRecognizeRequest()
            {
                StreamingConfig = new StreamingRecognitionConfig()
                {
                    Config = _config,
                    InterimResults = true,
                }
            }
        ).Wait();
        _recording = true;
        _stream_started = true;
        StartCoroutine(AudioListener());
        // listen() is async void, so RunSynchronously returns at its first await;
        // the response loop then continues in the background.
        ListenTask = new Task(() => listen());
        ListenTask.RunSynchronously();
    }

    // Ends the recording of audio.
    public void EndRecording()
    {
        if (_stream_started)
        {
            _streamingCall.WriteCompleteAsync();
            _stream_started = false;
            _recording = false;
        }
    }

    /// <summary>
    /// This method provides the coroutine needed for recording audio from the microphone
    /// into a buffer that can be sent to Google Speech. Since it uses a ring buffer, once
    /// the buffer is full it loops back to the start and overwrites old data. The buffer
    /// is streamed in chunks of half its length, which reduces recording latency.
    /// </summary>
    /// <returns></returns>
    IEnumerator AudioListener()
    {
        _audioClip = Microphone.Start(null, true, 1, _recordingHZ);

        yield return null;

        if (_audioClip == null)
        {
            yield break;
        }

        float[] samples = null;
        bool inFirstHalf = true;
        int midPoint = _audioClip.samples / 2;

        while (_recording && _audioClip != null)
        {
            int writePos = Microphone.GetPosition(null);

            if (writePos > _audioClip.samples || !Microphone.IsRecording(null))
            {
                Debug.Log("Failed to get Microphone");
                yield break;
            }
            // Check if we were in the first half but have crossed the midpoint (upper half full),
            // or if we were in the second half but have wrapped to the start (lower half full).
            if ((inFirstHalf && writePos >= midPoint) ||
                (!inFirstHalf && writePos < midPoint))
            {
                samples = new float[midPoint];
                _audioClip.GetData(samples, inFirstHalf ? 0 : midPoint);

                SendAudio(samples);
                inFirstHalf = !inFirstHalf;
            }
            else
            {
                // Otherwise wait roughly the time it takes to fill the current half.
                int remaining = inFirstHalf ? (midPoint - writePos) : (_audioClip.samples - writePos);
                float timeRemaining = remaining / (float)_recordingHZ;
                yield return new WaitForSeconds(timeRemaining);
            }
        }
    }

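    // Illustration of the half-buffer streaming above (1 s looping clip):
    //
    //     _audioClip:  [ first half | second half ]
    //     write pos crosses midPoint -> send samples[0 .. midPoint)
    //     write pos wraps past end   -> send samples[midPoint .. samples)
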
    /// <summary>
    /// This function sends the audio to Google. However, if the audio is below
    /// SILENCE_THRESHOLD then it does not send the audio.
    /// </summary>
    /// <param name="samples"></param>
    public void SendAudio(float[] samples)
    {
        // If the samples are all below SILENCE_THRESHOLD, the audio is not sent to
        // Google for processing, and the streaming call is closed. This is done to
        // keep the Google Cloud Speech API from crashing the program: the API expects
        // a continuous stream of real-time audio, and not sending audio is considered
        // an error. Therefore, the stream needs to be closed and later restarted.
        if (DetectSilence(samples, SILENCE_THRESHOLD))
        {
            if (_stream_started)
            {
                _streamingCall.WriteCompleteAsync().Wait();
                ListenTask.Wait();
                _stream_started = false;
            }
        }
        else
        {
            // Start the stream if it has been closed.
            if (!_stream_started)
            {
                StartStream();
            }
            var bytes = AudioUtils.FloatsToLinear16(samples);
            _streamingCall.WriteAsync(
                new StreamingRecognizeRequest()
                {
                    AudioContent = Google.Protobuf.ByteString
                        .CopyFrom(bytes, 0, bytes.Length)
                });
        }
    }

    /// <summary>
    /// This function starts the streaming recognizer. It blocks until the
    /// connection to Google's API has been established.
    /// </summary>
    private void StartStream()
    {
        _streamingCall = _speech.StreamingRecognize();
        _streamingCall.WriteAsync(
            new StreamingRecognizeRequest()
            {
                StreamingConfig = new StreamingRecognitionConfig()
                {
                    Config = _config,
                    InterimResults = true,
                }
            }).Wait();
        _stream_started = true;
        ListenTask = new Task(() => listen());
        ListenTask.RunSynchronously();
    }

    /// <summary>
    /// To avoid making wasted calls to Google's Speech API, this function is used to
    /// detect whether the audio is quieter than the threshold.
    /// </summary>
    /// <param name="samples">The buffer of audio samples as floating points</param>
    /// <param name="threshold">The threshold below which audio is treated as silence</param>
    /// <returns></returns>
    static private bool DetectSilence(float[] samples, float threshold)
    {
        float maxLevel = Mathf.Max(
            Mathf.Abs(Mathf.Min(samples)),
            Mathf.Abs(Mathf.Max(samples))
        );
        return maxLevel < threshold;
    }

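    /// <summary>
    /// A hypothetical RMS-based variant of DetectSilence (not called anywhere in this
    /// class): averaging energy over the whole buffer is less easily fooled by a
    /// single click or pop than the peak check above.
    /// </summary>
    static private bool DetectSilenceRms(float[] samples, float threshold)
    {
        float sum = 0f;
        for (int i = 0; i < samples.Length; i++)
        {
            sum += samples[i] * samples[i];
        }
        return Mathf.Sqrt(sum / samples.Length) < threshold;
    }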

    /// <summary>
    /// This function runs in the background to read responses from the ResponseStream.
    /// It must run asynchronously, otherwise an error will be raised.
    ///
    /// This function invokes two events: one for a partial result and one for a final
    /// result. Callbacks can be attached to these events to react to incoming transcripts.
    /// </summary>
    private async void listen()
    {
        while (await _streamingCall.ResponseStream.MoveNext(default))
        {
            foreach (var result in _streamingCall.ResponseStream.Current.Results)
            {
                OnResultEvent.Invoke(result);
                if (result.IsFinal)
                {
                    OnFinalResultEvent.Invoke(result);
                }
            }
        }
    }

    private bool resumeWhenUnpaused = false;

    // When the app is paused, stop streaming; when it resumes, restore the
    // previous Active state.
    private void OnApplicationPause(bool pause)
    {
        if (pause)
        {
            resumeWhenUnpaused = _active;
            Active = false;
        }
        else
        {
            Active = resumeWhenUnpaused;
        }
    }
}
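
// The two helpers referenced above are not defined in this file. The sketches below
// are hypothetical stand-ins so the listing is self-contained; the real project may
// implement them differently.

// AudioUtils.FloatsToLinear16 (used by SendAudio): a minimal sketch, assuming LINEAR16
// means 16-bit signed little-endian PCM, which matches the RecognitionConfig above.
public static class AudioUtils
{
    // Converts [-1, 1] float samples to 16-bit signed PCM bytes (little-endian).
    public static byte[] FloatsToLinear16(float[] samples)
    {
        var bytes = new byte[samples.Length * 2];
        for (int i = 0; i < samples.Length; i++)
        {
            // Clamp to avoid overflow, then scale to the signed 16-bit range.
            short s = (short)(Mathf.Clamp(samples[i], -1f, 1f) * short.MaxValue);
            bytes[2 * i] = (byte)(s & 0xFF);
            bytes[2 * i + 1] = (byte)((s >> 8) & 0xFF);
        }
        return bytes;
    }
}

// PermissionsRationaleDialog (used by OnGUI): Unity's Android permission example uses
// a component of this name that draws a rationale dialog and re-requests the
// permission. A hypothetical minimal version:
#if PLATFORM_ANDROID
public class PermissionsRationaleDialog : MonoBehaviour
{
    const int kDialogWidth = 300;
    const int kDialogHeight = 100;

    void OnGUI()
    {
        var rect = new Rect(
            (Screen.width - kDialogWidth) / 2,
            (Screen.height - kDialogHeight) / 2,
            kDialogWidth, kDialogHeight);
        GUI.ModalWindow(0, rect, DrawDialog, "Microphone permission");
    }

    void DrawDialog(int windowId)
    {
        GUILayout.Label("Speech recognition needs access to the microphone.");
        if (GUILayout.Button("Ask again"))
        {
            Permission.RequestUserPermission(Permission.Microphone);
        }
    }
}
#endif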