// GoogleVoiceStreaming.cs
using Google.Apis.Auth.OAuth2;
using Google.Cloud.Speech.V1;
using Google.Protobuf.Collections;
using Grpc.Auth;
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;
using UnityEngine;
using UnityEngine.Events;
#if PLATFORM_ANDROID
using UnityEngine.Android;
#endif
public class GoogleVoiceStreaming : MonoBehaviour
{
[Serializable]
public class TranscriptEvent : UnityEvent<StreamingRecognitionResult> { }
public TranscriptEvent OnResultEvent;
public TranscriptEvent OnFinalResultEvent;
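// Example (hypothetical handler): listeners can be attached in the Inspector or
// from code to react to incoming transcripts, e.g.
//   voice.OnFinalResultEvent.AddListener(
//       r => Debug.Log(r.Alternatives[0].Transcript));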
float SILENCE_THRESHOLD = 0.01f;
AudioClip _audioClip = null;
bool _recording = false;
int _recordingHZ = 22050;
bool _stream_started = false;
Task ListenTask;
float t = 0; // Seconds the current stream has been open.
RecognitionConfig _config;
SpeechClient _speech;
SpeechClient.StreamingRecognizeStream _streamingCall;
#if PLATFORM_ANDROID
GameObject dialog = null;
#endif
~GoogleVoiceStreaming () {
if (_stream_started)
{
_streamingCall.WriteCompleteAsync().Wait();
}
}
public void OnDestroy()
{
if (_stream_started)
{
_streamingCall.WriteCompleteAsync().Wait();
}
}
void OnDisable()
{
Debug.Log("Disabled");
if (_stream_started)
{
EndRecording();
}
}
void OnActive ()
{
}
public bool StartActive = false;
public bool _active = false;
public bool Active {
set {
if (value && !_recording)
{
_active = true;
BeginRecognition();
}
else if (!value && _recording)
{
_active = false;
EndRecording();
}
}
}
public void Start()
{
#if PLATFORM_ANDROID
if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
{
Permission.RequestUserPermission(Permission.Microphone);
dialog = new GameObject();
}
#endif
if (_speech == null)
{
CreateChannel();
Active = StartActive;
}
Debug.Log(_speech);
}
public void Update()
{
// Track how long the stream has been running;
// if the run time exceeds 340 s, end the stream and start a new one.
if (_recording)
{
t += Time.deltaTime;
if (t > 340)
{
EndRecording();
BeginRecognition();
t = 0;
}
}
}
void OnGUI()
{
#if PLATFORM_ANDROID
if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
{
// The user denied permission to use the microphone. Show a dialog explaining
// why it is needed so the request can be presented again if they agree.
// PermissionsRationaleDialog is assumed to be a separate MonoBehaviour, as in
// Unity's Android permission example.
dialog.AddComponent<PermissionsRationaleDialog>();
return;
}
else if (dialog != null)
{
Destroy(dialog);
}
#endif
// Now you can do things with the microphone
}
public void CreateChannel()
{
var key = Resources.Load<TextAsset>("key");
_config = new RecognitionConfig()
{
Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
SampleRateHertz = 22050,
LanguageCode = "en",
};
var credentials = GoogleCredential.FromJson(key.text)
.CreateScoped();
var channel = new Grpc.Core.Channel(
SpeechClient.DefaultEndpoint.Host,
credentials.ToChannelCredentials()
);
_speech = SpeechClient.Create(channel);
}
public void BeginRecognition()
{
if (_speech == null)
{
CreateChannel();
}
_streamingCall = _speech.StreamingRecognize();
_streamingCall.WriteAsync(
new StreamingRecognizeRequest()
{
StreamingConfig = new StreamingRecognitionConfig()
{
Config = _config,
InterimResults = true,
}
}
).Wait();
_recording = true;
_stream_started = true;
StartCoroutine(AudioListener());
// Note: listen() is 'async void', so this Task completes as soon as listen()
// reaches its first await; RunSynchronously just starts the response reader
// on the calling thread.
ListenTask = new Task(() => listen());
ListenTask.RunSynchronously();
}
// Ends the recording of audio and completes the streaming call.
public void EndRecording()
{
if (_stream_started)
{
_streamingCall.WriteCompleteAsync();
_stream_started = false;
_recording = false;
}
}
/// <summary>
/// This method provides the coroutine that records audio from the microphone
/// into a buffer that can be sent to Google Speech. Since it uses a ring buffer,
/// once the buffer is full it loops back to the start and overwrites old data.
/// The buffer is streamed in chunks of half its length, which reduces recording
/// latency.
/// </summary>
/// <returns></returns>
IEnumerator AudioListener()
{
_audioClip = Microphone.Start(null, true, 1, _recordingHZ);
yield return null;
if (_audioClip == null)
{
yield break;
}
float[] samples = null;
bool inFirstHalf = true;
int midPoint = _audioClip.samples / 2;
while (_recording && _audioClip != null)
{
int writePos = Microphone.GetPosition(null);
if (writePos > _audioClip.samples || !Microphone.IsRecording(null))
{
Debug.Log("Failed to get Microphone");
yield break;
}
// Check if we were in the first half, but have crossed the mid point (upper half full)
// or if we were in the last half, but have wrapped to the start (lower half full)
if ((inFirstHalf && writePos >= midPoint) ||
(!inFirstHalf && writePos < midPoint))
{
samples = new float[midPoint];
_audioClip.GetData(samples, inFirstHalf ? 0 : midPoint);
SendAudio(samples);
inFirstHalf = !inFirstHalf;
}
else
{
// Else wait the amount of time to fill the buffer.
int remaining = inFirstHalf ? (midPoint - writePos) : (_audioClip.samples - writePos);
float timeRemaining = remaining / (float)_recordingHZ;
yield return new WaitForSeconds(timeRemaining);
}
}
}
/// <summary>
/// This function sends the audio to Google. However, if the audio is below
/// SILENCE_THRESHOLD then it does not send the audio.
/// </summary>
/// <param name="samples"></param>
public void SendAudio(float[] samples)
{
// If all samples are below SILENCE_THRESHOLD, the audio is not sent to Google
// for processing and the streaming call is closed. This is done to keep the
// Google Cloud Speech API from erroring out: it expects a continuous stream of
// real-time audio, and not sending audio is considered an error. The stream is
// therefore closed here and restarted later when sound returns.
if (DetectSilence(samples, SILENCE_THRESHOLD))
{
if (_stream_started)
{
_streamingCall.WriteCompleteAsync().Wait();
ListenTask.Wait();
_stream_started = false;
}
}
else
{
// Start the stream if it has been closed.
if (!_stream_started)
{
StartStream();
}
var bytes = AudioUtils.FloatsToLinear16(samples);
_streamingCall.WriteAsync(
new StreamingRecognizeRequest()
{
AudioContent = Google.Protobuf.ByteString
.CopyFrom(bytes, 0, bytes.Length)
});
}
}
/// <summary>
/// This function starts the streaming recognizer. This function blocks until it
/// receives the connection to Google's API.
/// </summary>
private void StartStream()
{
_streamingCall = _speech.StreamingRecognize();
_streamingCall.WriteAsync(
new StreamingRecognizeRequest()
{
StreamingConfig = new StreamingRecognitionConfig()
{
Config = _config,
InterimResults = true,
}
}).Wait();
_stream_started = true;
// As in BeginRecognition, this completes as soon as listen() reaches its first await.
ListenTask = new Task(() => listen());
ListenTask.RunSynchronously();
}
/// <summary>
/// To avoid making wasted calls to Google's Speech API, this function is used to
/// detect if the audio is quieter than the threshold.
/// </summary>
/// <param name="samples">The buffer of audio samples as floating points</param>
/// <param name="threshold">The threashold to be treated as silence</param>
/// <returns></returns>
static private bool DetectSilence(float[] samples, float threshold)
{
float maxLevel = Mathf.Max(
Mathf.Abs(Mathf.Min(samples)),
Mathf.Abs(Mathf.Max(samples))
);
return maxLevel < threshold;
}
/// <summary>
/// This function runs in the background to read recognition results from the
/// ResponseStream. It must run asynchronously, otherwise an error will be raised.
///
/// This function invokes two events: one for a partial result and one for a final result.
/// Callbacks can be attached to these events to react to incoming transcripts.
/// </summary>
private async void listen()
{
while (await _streamingCall.ResponseStream.MoveNext(default))
{
foreach (var result in _streamingCall.ResponseStream.Current.Results)
{
OnResultEvent.Invoke(result);
if (result.IsFinal)
{
OnFinalResultEvent.Invoke(result);
}
}
}
}
private bool resumeWhenUnpaused = false;
private void OnApplicationPause(bool pause)
{
if (pause)
{
resumeWhenUnpaused = _active;
Active = false;
}
else
{
Active = resumeWhenUnpaused;
}
}
}
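
// AudioUtils.FloatsToLinear16, used in SendAudio above, is not defined in this
// file. The sketch below shows what it is assumed to do (convert [-1, 1] float
// samples to little-endian 16-bit PCM, matching the LINEAR16 encoding declared
// in the RecognitionConfig); the project's own implementation may differ.
public static class AudioUtils
{
    public static byte[] FloatsToLinear16(float[] samples)
    {
        // Two bytes per sample, little-endian.
        var bytes = new byte[samples.Length * 2];
        for (int i = 0; i < samples.Length; i++)
        {
            // Clamp to the valid range and scale to a signed 16-bit value.
            short value = (short)(Mathf.Clamp(samples[i], -1f, 1f) * short.MaxValue);
            bytes[i * 2] = (byte)(value & 0xFF);
            bytes[i * 2 + 1] = (byte)((value >> 8) & 0xFF);
        }
        return bytes;
    }
}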