GoogleVoiceStreaming.cs
using Google.Apis.Auth.OAuth2;
using Google.Cloud.Speech.V1;
using Google.Protobuf.Collections;
using Grpc.Auth;
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;
using UnityEngine;
using UnityEngine.Events;
#if PLATFORM_ANDROID
using UnityEngine.Android;
#endif
public class GoogleVoiceStreaming : MonoBehaviour
{
[Serializable]
public class TranscriptEvent : UnityEvent<StreamingRecognitionResult> { }
public TranscriptEvent OnResultEvent;
public TranscriptEvent OnFinalResultEvent;
float SILENCE_THRESHOLD = 0.01f;
AudioClip _audioClip = null;
bool _recording = false;
int _recordingHZ = 22050;
bool _stream_started = false;
Task ListenTask;
float t = 0;
RecognitionConfig _config;
SpeechClient _speech;
SpeechClient.StreamingRecognizeStream _streamingCall;
#if PLATFORM_ANDROID
GameObject dialog = null;
#endif
~GoogleVoiceStreaming () {
if (_stream_started)
{
_streamingCall.WriteCompleteAsync().Wait();
}
}
public void OnDestroy()
{
if (_stream_started)
{
_streamingCall.WriteCompleteAsync().Wait();
}
}
void OnDisable()
{
Debug.Log("Disabled");
if (_stream_started)
{
EndRecording();
}
}
void OnActive ()
{
}
public bool StartActive = false;
public bool _active = false;
public bool Active {
set {
if (value && !_recording)
{
_active = true;
BeginRecognition();
}
else if (!value && _recording)
{
_active = false;
EndRecording();
}
}
}
public void Start()
{
#if PLATFORM_ANDROID
if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
{
Permission.RequestUserPermission(Permission.Microphone);
dialog = new GameObject();
}
#endif
if (_speech == null)
{
CreateChannel();
Active = StartActive;
}
Debug.Log(_speech);
}
public void Update()
{
// Count how long the stream has been running;
// if the run time exceeds 340 s, restart the stream.
if (_recording)
{
t += Time.deltaTime;
if (t > 340)
{
EndRecording();
BeginRecognition();
t = 0;
}
}
}
void OnGUI()
{
#if PLATFORM_ANDROID
if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
{
// The user denied permission to use the microphone.
// Display a message explaining why you need it with Yes/No buttons.
// If the user says yes then present the request again
// Display a dialog here.
dialog.AddComponent<PermissionsRationaleDialog>();
return;
}
else if (dialog != null)
{
Destroy(dialog);
}
#endif
// Now you can do things with the microphone
}
public void CreateChannel()
{
var key = Resources.Load<TextAsset>("key");
_config = new RecognitionConfig()
{
Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
SampleRateHertz = 22050,
LanguageCode = "en",
};
var credentials = GoogleCredential.FromJson(key.text)
.CreateScoped();
var channel = new Grpc.Core.Channel(
SpeechClient.DefaultEndpoint.Host,
credentials.ToChannelCredentials()
);
_speech = SpeechClient.Create(channel);
}
public void BeginRecognition()
{
if (_speech == null)
{
CreateChannel();
}
_streamingCall = _speech.StreamingRecognize();
_streamingCall.WriteAsync(
new StreamingRecognizeRequest()
{
StreamingConfig = new StreamingRecognitionConfig()
{
Config = _config,
InterimResults = true,
}
}
).Wait();
_recording = true;
_stream_started = true;
StartCoroutine(AudioListener());
// Start the background task that reads streaming responses from Google.
ListenTask = new Task(() => listen());
ListenTask.RunSynchronously();
}
// Ends audio recording and closes the streaming call.
public void EndRecording()
{
if (_stream_started)
{
_streamingCall.WriteCompleteAsync();
_stream_started = false;
_recording = false;
}
}
/// <summary>
/// This coroutine records audio from the microphone into a buffer that can be
/// sent to Google Speech. The microphone writes into a ring buffer, so once the
/// buffer is full it wraps back to the start and overwrites old data. The buffer
/// is streamed in chunks of half its length, which reduces recording latency.
/// </summary>
/// <returns></returns>
IEnumerator AudioListener()
{
_audioClip = Microphone.Start(null, true, 1, _recordingHZ);
yield return null;
if (_audioClip == null)
{
yield break;
}
float[] samples = null;
bool inFirstHalf = true;
int midPoint = _audioClip.samples / 2;
while (_recording && _audioClip != null)
{
int writePos = Microphone.GetPosition(null);
if (writePos > _audioClip.samples || !Microphone.IsRecording(null))
{
Debug.Log("Failed to get Microphone");
yield break;
}
// Check if we were in the first half, but have crossed the mid point (upper half full)
// or if we were in the last half, but have wrapped to the start (lower half full)
if ((inFirstHalf && writePos >= midPoint) ||
(!inFirstHalf && writePos < midPoint))
{
samples = new float[midPoint];
_audioClip.GetData(samples, inFirstHalf ? 0 : midPoint);
SendAudio(samples);
inFirstHalf = !inFirstHalf;
}
else
{
// Else wait the amount of time to fill the buffer.
int remaining = inFirstHalf ? (midPoint - writePos) : (_audioClip.samples - writePos);
float timeRemaining = remaining / (float)_recordingHZ;
yield return new WaitForSeconds(timeRemaining);
}
}
}
/// <summary>
/// Sends the audio to Google. However, if the audio level is below
/// SILENCE_THRESHOLD, the audio is not sent.
/// </summary>
/// <param name="samples">The buffer of audio samples as floats</param>
public void SendAudio(float[] samples)
{
// If the samples are all below SILENCE_THRESHOLD, the audio is not sent to
// Google for processing and the streaming call is closed. This is done to
// prevent the Google Cloud Speech API from crashing the program: the API
// expects a continuous stream of real-time audio, and not sending audio is
// considered an error. Therefore, the stream needs to be closed and later
// restarted.
if (DetectSilence(samples, SILENCE_THRESHOLD))
{
if (_stream_started)
{
_streamingCall.WriteCompleteAsync().Wait();
ListenTask.Wait();
_stream_started = false;
}
}
else
{
// Start the stream if it has been closed.
if (!_stream_started)
{
StartStream();
}
var bytes = AudioUtils.FloatsToLinear16(samples);
_streamingCall.WriteAsync(
new StreamingRecognizeRequest()
{
AudioContent = Google.Protobuf.ByteString
.CopyFrom(bytes, 0, bytes.Length)
});
}
}
/// <summary>
/// This function starts the streaming recognizer. This function blocks until it
/// receives the connection to Google's API.
/// </summary>
private void StartStream()
{
_streamingCall = _speech.StreamingRecognize();
_streamingCall.WriteAsync(
new StreamingRecognizeRequest()
{
StreamingConfig = new StreamingRecognitionConfig()
{
Config = _config,
InterimResults = true,
}
}).Wait();
_stream_started = true;
ListenTask = new Task(() => listen());
ListenTask.RunSynchronously();
}
/// <summary>
/// To avoid making wasted calls to Google's Speech API, this function is used to
/// detect if the audio is quieter than the threshold.
/// </summary>
/// <param name="samples">The buffer of audio samples as floating points</param>
/// <param name="threshold">The threashold to be treated as silence</param>
/// <returns></returns>
static private bool DetectSilence(float[] samples, float threshold)
{
float maxLevel = Mathf.Max(
Mathf.Abs(Mathf.Min(samples)),
Mathf.Abs(Mathf.Max(samples))
);
return maxLevel < threshold;
}
/// <summary>
/// This function runs in the background to read recognition responses from the
/// ResponseStream. It must run asynchronously, otherwise an error will be raised.
///
/// It invokes two events: one for a partial result and one for a final result.
/// Callbacks can be attached to these events to react to incoming transcripts.
/// </summary>
private async void listen()
{
while (await _streamingCall.ResponseStream.MoveNext(default))
{
foreach (var result in _streamingCall.ResponseStream.Current.Results)
{
OnResultEvent.Invoke(result);
if (result.IsFinal)
{
OnFinalResultEvent.Invoke(result);
}
}
}
}
private bool resumeWhenUnpaused = false;
private void OnApplicationPause(bool pause)
{
if (pause)
{
resumeWhenUnpaused = _active;
Active = false;
}
else
{
Active = resumeWhenUnpaused;
}
}
}
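// ---------------------------------------------------------------------------
// The two helpers below are referenced above but are not included in this
// paste. These are minimal sketches of what they are assumed to do, not the
// original implementations.
// ---------------------------------------------------------------------------

// Assumed helper: converts Unity's float samples in [-1, 1] into 16-bit
// little-endian PCM bytes, matching the LINEAR16 encoding declared in
// RecognitionConfig above.
public static class AudioUtils
{
    public static byte[] FloatsToLinear16(float[] samples)
    {
        var bytes = new byte[samples.Length * 2];
        for (int i = 0; i < samples.Length; i++)
        {
            // Clamp to [-1, 1] and scale to the signed 16-bit range.
            short value = (short)(Mathf.Clamp(samples[i], -1f, 1f) * short.MaxValue);
            bytes[i * 2] = (byte)(value & 0xFF);
            bytes[i * 2 + 1] = (byte)((value >> 8) & 0xFF);
        }
        return bytes;
    }
}

// Assumed helper: the permission-rationale dialog added in OnGUI above. Unity's
// Android permission examples use a component with this name; this sketch simply
// explains why the microphone is needed and lets the user re-trigger the request.
public class PermissionsRationaleDialog : MonoBehaviour
{
#if PLATFORM_ANDROID
    const int width = 375;
    const int height = 120;

    void OnGUI()
    {
        var rect = new Rect((Screen.width - width) / 2, (Screen.height - height) / 2, width, height);
        GUILayout.BeginArea(rect, GUI.skin.box);
        GUILayout.Label("Microphone access is needed to stream speech to Google Cloud Speech.");
        if (GUILayout.Button("Ask again"))
        {
            Permission.RequestUserPermission(Permission.Microphone);
        }
        if (GUILayout.Button("No thanks"))
        {
            Destroy(this);
        }
        GUILayout.EndArea();
    }
#endif
}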
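// ---------------------------------------------------------------------------
// Example usage (not part of the original paste): a separate script that
// subscribes to the transcript events at runtime and logs interim and final
// transcripts. The component and field names here are assumptions made for
// illustration only.
// ---------------------------------------------------------------------------
using Google.Cloud.Speech.V1;
using UnityEngine;

public class TranscriptLogger : MonoBehaviour
{
    // Assign the GoogleVoiceStreaming component in the Inspector.
    public GoogleVoiceStreaming voice;

    void OnEnable()
    {
        voice.OnResultEvent.AddListener(OnInterim);
        voice.OnFinalResultEvent.AddListener(OnFinal);
    }

    void OnDisable()
    {
        voice.OnResultEvent.RemoveListener(OnInterim);
        voice.OnFinalResultEvent.RemoveListener(OnFinal);
    }

    void OnInterim(StreamingRecognitionResult result)
    {
        if (result.Alternatives.Count > 0)
            Debug.Log("Interim: " + result.Alternatives[0].Transcript);
    }

    void OnFinal(StreamingRecognitionResult result)
    {
        if (result.Alternatives.Count > 0)
            Debug.Log("Final: " + result.Alternatives[0].Transcript);
    }
}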