GoogleVoiceStreaming.cs
using Google.Apis.Auth.OAuth2;
using Google.Cloud.Speech.V1;
using Google.Protobuf.Collections;
using Grpc.Auth;
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;
using UnityEngine;
using UnityEngine.Events;
#if PLATFORM_ANDROID
using UnityEngine.Android;
#endif

public class GoogleVoiceStreaming : MonoBehaviour
{
    [Serializable]
    public class TranscriptEvent : UnityEvent<StreamingRecognitionResult> { }
    public TranscriptEvent OnResultEvent;      // fired for every result, interim or final
    public TranscriptEvent OnFinalResultEvent; // fired only for final results

    float SILENCE_THRESHOLD = 0.01f; // peak amplitude below which a buffer counts as silence
    AudioClip _audioClip = null;
    bool _recording = false;
    int _recordingHZ = 22050;
    bool _stream_started = false;
    Task ListenTask;
    float t = 0; // seconds the current stream has been running

    RecognitionConfig _config;
    SpeechClient _speech;
    SpeechClient.StreamingRecognizeStream _streamingCall;
#if PLATFORM_ANDROID
    GameObject dialog = null;
#endif

    ~GoogleVoiceStreaming()
    {
        if (_stream_started)
        {
            _streamingCall.WriteCompleteAsync().Wait();
        }
    }

    public void OnDestroy()
    {
        if (_stream_started)
        {
            _streamingCall.WriteCompleteAsync().Wait();
        }
    }

    void OnDisable()
    {
        Debug.Log("Disabled");
        if (_stream_started)
        {
            EndRecording();
        }
    }

    // Unused stub; OnActive is not a Unity message.
    void OnActive()
    {
    }

    public bool StartActive = false;
    public bool _active = false;

    // Setting Active to true starts recognition; setting it to false stops it.
    public bool Active
    {
        set
        {
            if (value && !_recording)
            {
                _active = true;
                BeginRecognition();
            }
            else if (!value && _recording)
            {
                _active = false;
                EndRecording();
            }
        }
    }

    public void Start()
    {
#if PLATFORM_ANDROID
        if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
        {
            Permission.RequestUserPermission(Permission.Microphone);
            dialog = new GameObject();
        }
#endif
        if (_speech == null)
        {
            CreateChannel();
            Active = StartActive;
        }

        Debug.Log(_speech);
    }

    public void Update()
    {
        // Count how long the stream has been running; if the run time
        // exceeds 340 s, restart the stream (a single streaming session
        // cannot run indefinitely, so it is cycled proactively).
        if (_recording)
        {
            t += Time.deltaTime;
            if (t > 340)
            {
                EndRecording();
                BeginRecognition();
                t = 0;
            }
        }
    }

    void OnGUI()
    {
#if PLATFORM_ANDROID
        if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
        {
            // The user denied permission to use the microphone.
            // Display a message explaining why you need it with Yes/No buttons.
            // If the user says yes then present the request again.
            // PermissionsRationaleDialog comes from Unity's Android-permission
            // example and is not included in this file.
            dialog.AddComponent<PermissionsRationaleDialog>();
            return;
        }
        else if (dialog != null)
        {
            Destroy(dialog);
        }
#endif

        // Now you can do things with the microphone.
    }

    public void CreateChannel()
    {
        // The service-account key is loaded from a TextAsset named "key"
        // in a Resources folder.
        var key = Resources.Load<TextAsset>("key");
        _config = new RecognitionConfig()
        {
            Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
            SampleRateHertz = _recordingHZ,
            LanguageCode = "en-US", // BCP-47 language tag
        };

        var credentials = GoogleCredential.FromJson(key.text)
            .CreateScoped();
        var channel = new Grpc.Core.Channel(
            SpeechClient.DefaultEndpoint.Host,
            credentials.ToChannelCredentials()
        );
        _speech = SpeechClient.Create(channel);
    }

    public void BeginRecognition()
    {
        if (_speech == null)
        {
            CreateChannel();
        }
        // StartStream sends the configuration request and starts the
        // background response reader.
        StartStream();
        _recording = true;
        StartCoroutine(AudioListener());
    }

    // Ends the recording of audio and closes the write side of the stream.
    public void EndRecording()
    {
        if (_stream_started)
        {
            _streamingCall.WriteCompleteAsync();
            _stream_started = false;
            _recording = false;
        }
    }

    /// <summary>
    /// This coroutine records audio from the microphone into a buffer that can
    /// be sent to Google Speech. Since it uses a ring buffer, once the buffer
    /// is full it loops to the start and overwrites old data. The buffer is
    /// streamed in chunks of half its length, which reduces recording latency.
    /// </summary>
    /// <returns></returns>
    IEnumerator AudioListener()
    {
        // One-second looping clip: the microphone writes into it continuously.
        _audioClip = Microphone.Start(null, true, 1, _recordingHZ);

        yield return null;

        if (_audioClip == null)
        {
            yield break;
        }

        float[] samples = null;
        bool inFirstHalf = true;
        int midPoint = _audioClip.samples / 2;

        while (_recording && _audioClip != null)
        {
            int writePos = Microphone.GetPosition(null);

            if (writePos > _audioClip.samples || !Microphone.IsRecording(null))
            {
                Debug.Log("Failed to get Microphone");
                yield break;
            }
            // Check if we were in the first half but have crossed the mid point
            // (upper half full), or if we were in the last half but have
            // wrapped to the start (lower half full).
            if ((inFirstHalf && writePos >= midPoint) ||
                (!inFirstHalf && writePos < midPoint))
            {
                samples = new float[midPoint];
                _audioClip.GetData(samples, inFirstHalf ? 0 : midPoint);

                SendAudio(samples);
                inFirstHalf = !inFirstHalf;
            }
            else
            {
                // Otherwise wait roughly the time needed to fill the current
                // half (e.g. 11025 remaining samples at 22050 Hz = 0.5 s).
                int remaining = inFirstHalf ? (midPoint - writePos) : (_audioClip.samples - writePos);
                float timeRemaining = remaining / (float)_recordingHZ;
                yield return new WaitForSeconds(timeRemaining);
            }
        }
    }

    /// <summary>
    /// Sends the audio to Google. However, if the audio is below
    /// SILENCE_THRESHOLD the audio is not sent.
    /// </summary>
    /// <param name="samples"></param>
    public void SendAudio(float[] samples)
    {
        // If the samples are all below SILENCE_THRESHOLD, the audio is not
        // sent to Google for processing, and the streaming call is closed.
        // This is done to prevent the Google Cloud Speech API from crashing
        // the program: the API expects a continuous stream of real-time
        // audio, and not sending audio is considered an error. Therefore the
        // stream needs to be closed and later restarted.
        if (DetectSilence(samples, SILENCE_THRESHOLD))
        {
            if (_stream_started)
            {
                _streamingCall.WriteCompleteAsync().Wait();
                ListenTask.Wait();
                _stream_started = false;
            }
        }
        else
        {
            // Start the stream if it has been closed.
            if (!_stream_started)
            {
                StartStream();
            }
            // AudioUtils is not part of this file; a sketch of what
            // FloatsToLinear16 is assumed to do appears after the class.
            var bytes = AudioUtils.FloatsToLinear16(samples);
            _streamingCall.WriteAsync(
                new StreamingRecognizeRequest()
                {
                    AudioContent = Google.Protobuf.ByteString
                        .CopyFrom(bytes, 0, bytes.Length)
                });
        }
    }

    /// <summary>
    /// Starts the streaming recognizer. This method blocks until the initial
    /// configuration request has been written to Google's API.
    /// </summary>
    private void StartStream()
    {
        _streamingCall = _speech.StreamingRecognize();
        _streamingCall.WriteAsync(
            new StreamingRecognizeRequest()
            {
                StreamingConfig = new StreamingRecognitionConfig()
                {
                    Config = _config,
                    InterimResults = true,
                }
            }).Wait();
        _stream_started = true;
        // Run the response reader on the thread pool so that ListenTask.Wait()
        // on the main thread cannot deadlock on Unity's synchronization
        // context, and so the Task actually tracks the reader's lifetime.
        ListenTask = Task.Run(() => listen());
    }

    /// <summary>
    /// To avoid making wasted calls to Google's Speech API, this function
    /// detects whether the audio is quieter than the threshold.
    /// </summary>
    /// <param name="samples">The buffer of audio samples as floating points</param>
    /// <param name="threshold">The threshold below which audio is treated as silence</param>
    /// <returns></returns>
    static private bool DetectSilence(float[] samples, float threshold)
    {
        // Peak level: the largest absolute sample value in the buffer.
        float maxLevel = Mathf.Max(
            Mathf.Abs(Mathf.Min(samples)),
            Mathf.Abs(Mathf.Max(samples))
        );
        return maxLevel < threshold;
    }
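
    /// <summary>
    /// Alternative sketch (not used above, shown for comparison): an RMS-based
    /// check averages energy over the whole buffer, so one stray sample spike
    /// is less likely to defeat silence detection than the peak check in
    /// DetectSilence.
    /// </summary>
    static private bool DetectSilenceRms(float[] samples, float threshold)
    {
        float sumOfSquares = 0f;
        foreach (float s in samples)
        {
            sumOfSquares += s * s;
        }
        // Root-mean-square level of the buffer.
        return Mathf.Sqrt(sumOfSquares / samples.Length) < threshold;
    }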

    /// <summary>
    /// Runs in the background to read responses from the ResponseStream. This
    /// must run asynchronously, otherwise an error will be raised.
    ///
    /// This method invokes two events: one for every partial result and one
    /// for final results only. Callbacks can be attached to these events to
    /// react to incoming transcripts. Note that the callbacks fire on a
    /// thread-pool thread, not on Unity's main thread.
    /// </summary>
    private async Task listen()
    {
        while (await _streamingCall.ResponseStream.MoveNext(default))
        {
            foreach (var result in _streamingCall.ResponseStream.Current.Results)
            {
                OnResultEvent.Invoke(result);
                if (result.IsFinal)
                {
                    OnFinalResultEvent.Invoke(result);
                }
            }
        }
    }

    // Suspend recognition while the app is paused and resume it afterwards.
    private bool resumeWhenUnpaused = false;
    private void OnApplicationPause(bool pause)
    {
        if (pause)
        {
            resumeWhenUnpaused = _active;
            Active = false;
        }
        else
        {
            Active = resumeWhenUnpaused;
        }
    }
}
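
// ---------------------------------------------------------------------------
// AudioUtils is referenced in SendAudio but is not included in this gist.
// What follows is a minimal sketch, assuming FloatsToLinear16 converts
// Unity's [-1, 1] float samples to 16-bit little-endian PCM (the LINEAR16
// encoding declared in the RecognitionConfig). Omit it if your project
// already defines AudioUtils.
// ---------------------------------------------------------------------------
public static class AudioUtils
{
    public static byte[] FloatsToLinear16(float[] samples)
    {
        var bytes = new byte[samples.Length * 2];
        for (int i = 0; i < samples.Length; i++)
        {
            // Clamp, scale to the signed 16-bit range, and write little-endian.
            short s = (short)(Mathf.Clamp(samples[i], -1f, 1f) * short.MaxValue);
            bytes[i * 2] = (byte)(s & 0xff);
            bytes[i * 2 + 1] = (byte)((s >> 8) & 0xff);
        }
        return bytes;
    }
}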
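
// ---------------------------------------------------------------------------
// Example usage (not part of the original file): a hypothetical listener that
// logs final transcripts. Attach it next to GoogleVoiceStreaming and assign
// the recognizer reference in the inspector, or wire HandleFinalResult to
// OnFinalResultEvent directly in the inspector instead.
// ---------------------------------------------------------------------------
public class TranscriptLogger : MonoBehaviour
{
    public GoogleVoiceStreaming recognizer;

    void Start()
    {
        recognizer.OnFinalResultEvent.AddListener(HandleFinalResult);
    }

    public void HandleFinalResult(StreamingRecognitionResult result)
    {
        // A result carries one or more alternatives, best first.
        if (result.Alternatives.Count > 0)
        {
            Debug.Log("Final transcript: " + result.Alternatives[0].Transcript);
        }
    }
}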