// GoogleVoiceStreaming.cs
using Google.Apis.Auth.OAuth2;
using Google.Cloud.Speech.V1;
using Google.Protobuf.Collections;
using Grpc.Auth;
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;
using UnityEngine;
using UnityEngine.Events;
#if PLATFORM_ANDROID
using UnityEngine.Android;
#endif
public class GoogleVoiceStreaming : MonoBehaviour
{
[Serializable]
public class TranscriptEvent : UnityEvent<StreamingRecognitionResult> { }
public TranscriptEvent OnResultEvent;
public TranscriptEvent OnFinalResultEvent;
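// Example (hypothetical handler): listeners can be attached in the Inspector or
// from code to react to incoming transcripts, e.g.
//   voice.OnFinalResultEvent.AddListener(
//       r => Debug.Log(r.Alternatives[0].Transcript));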
float SILENCE_THRESHOLD = 0.01f;
AudioClip _audioClip = null;
bool _recording = false;
int _recordingHZ = 22050;
bool _stream_started = false;
Task ListenTask;
float t = 0; // Seconds the current stream has been open.
RecognitionConfig _config;
SpeechClient _speech;
SpeechClient.StreamingRecognizeStream _streamingCall;
#if PLATFORM_ANDROID
GameObject dialog = null;
#endif
~GoogleVoiceStreaming () {
if (_stream_started)
{
_streamingCall.WriteCompleteAsync().Wait();
}
}
public void OnDestroy()
{
if (_stream_started)
{
_streamingCall.WriteCompleteAsync().Wait();
}
}
void OnDisable()
{
Debug.Log("Disabled");
if (_stream_started)
{
EndRecording();
}
}
void OnActive ()
{
}
public bool StartActive = false;
public bool _active = false;
public bool Active {
set {
if (value && !_recording)
{
_active = true;
BeginRecognition();
}
else if (!value && _recording)
{
_active = false;
EndRecording();
}
}
}
public void Start()
{
#if PLATFORM_ANDROID
if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
{
Permission.RequestUserPermission(Permission.Microphone);
dialog = new GameObject();
}
#endif
if (_speech == null)
{
CreateChannel();
Active = StartActive;
}
Debug.Log(_speech);
}
public void Update()
{
// Track how long the stream has been running;
// if the run time exceeds 340 s, end the stream and start a new one.
if (_recording)
{
t += Time.deltaTime;
if (t > 340)
{
EndRecording();
BeginRecognition();
t = 0;
}
}
}
void OnGUI()
{
#if PLATFORM_ANDROID
if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
{
// The user denied permission to use the microphone. Show a dialog explaining
// why it is needed so the request can be presented again if they agree.
// PermissionsRationaleDialog is assumed to be a separate MonoBehaviour, as in
// Unity's Android permission example.
dialog.AddComponent<PermissionsRationaleDialog>();
return;
}
else if (dialog != null)
{
Destroy(dialog);
}
#endif
// Now you can do things with the microphone
}
public void CreateChannel()
{
var key = Resources.Load<TextAsset>("key");
_config = new RecognitionConfig()
{
Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
SampleRateHertz = 22050,
LanguageCode = "en",
};
var credentials = GoogleCredential.FromJson(key.text)
.CreateScoped();
var channel = new Grpc.Core.Channel(
SpeechClient.DefaultEndpoint.Host,
credentials.ToChannelCredentials()
);
_speech = SpeechClient.Create(channel);
}
public void BeginRecognition()
{
if (_speech == null)
{
CreateChannel();
}
_streamingCall = _speech.StreamingRecognize();
_streamingCall.WriteAsync(
new StreamingRecognizeRequest()
{
StreamingConfig = new StreamingRecognitionConfig()
{
Config = _config,
InterimResults = true,
}
}
).Wait();
_recording = true;
_stream_started = true;
StartCoroutine(AudioListener());
// Note: listen() is 'async void', so this Task completes as soon as listen()
// reaches its first await; RunSynchronously just starts the response reader
// on the calling thread.
ListenTask = new Task(() => listen());
ListenTask.RunSynchronously();
}
// Ends the recording of audio and completes the streaming call.
public void EndRecording()
{
if (_stream_started)
{
_streamingCall.WriteCompleteAsync();
_stream_started = false;
_recording = false;
}
}
/// <summary>
/// This method provides the coroutine that records audio from the microphone
/// into a buffer that can be sent to Google Speech. Since it uses a ring buffer,
/// once the buffer is full it loops back to the start and overwrites old data.
/// The buffer is streamed in chunks of half its length, which reduces recording
/// latency.
/// </summary>
/// <returns></returns>
IEnumerator AudioListener()
{
_audioClip = Microphone.Start(null, true, 1, _recordingHZ);
yield return null;
if (_audioClip == null)
{
yield break;
}
float[] samples = null;
bool inFirstHalf = true;
int midPoint = _audioClip.samples / 2;
while (_recording && _audioClip != null)
{
int writePos = Microphone.GetPosition(null);
if (writePos > _audioClip.samples || !Microphone.IsRecording(null))
{
Debug.Log("Failed to get Microphone");
yield break;
}
// Check if we were in the first half, but have crossed the mid point (upper half full)
// or if we were in the last half, but have wrapped to the start (lower half full)
if ((inFirstHalf && writePos >= midPoint) ||
(!inFirstHalf && writePos < midPoint))
{
samples = new float[midPoint];
_audioClip.GetData(samples, inFirstHalf ? 0 : midPoint);
SendAudio(samples);
inFirstHalf = !inFirstHalf;
}
else
{
// Else wait the amount of time to fill the buffer.
int remaining = inFirstHalf ? (midPoint - writePos) : (_audioClip.samples - writePos);
float timeRemaining = remaining / (float)_recordingHZ;
yield return new WaitForSeconds(timeRemaining);
}
}
}
/// <summary>
/// This function sends the audio to Google. However, if the audio is below
/// SILENCE_THRESHOLD then it does not send the audio.
/// </summary>
/// <param name="samples"></param>
public void SendAudio(float[] samples)
{
// If all samples are below SILENCE_THRESHOLD, the audio is not sent to Google
// for processing and the streaming call is closed. This is done to keep the
// Google Cloud Speech API from erroring out: it expects a continuous stream of
// real-time audio, and not sending audio is considered an error. The stream is
// therefore closed here and restarted later when sound returns.
if (DetectSilence(samples, SILENCE_THRESHOLD))
{
if (_stream_started)
{
_streamingCall.WriteCompleteAsync().Wait();
ListenTask.Wait();
_stream_started = false;
}
}
else
{
// Start the stream if it has been closed.
if (!_stream_started)
{
StartStream();
}
var bytes = AudioUtils.FloatsToLinear16(samples);
_streamingCall.WriteAsync(
new StreamingRecognizeRequest()
{
AudioContent = Google.Protobuf.ByteString
.CopyFrom(bytes, 0, bytes.Length)
});
}
}
/// <summary>
/// This function starts the streaming recognizer. This function blocks until it
/// receives the connection to Google's API.
/// </summary>
private void StartStream()
{
_streamingCall = _speech.StreamingRecognize();
_streamingCall.WriteAsync(
new StreamingRecognizeRequest()
{
StreamingConfig = new StreamingRecognitionConfig()
{
Config = _config,
InterimResults = true,
}
}).Wait();
_stream_started = true;
// As in BeginRecognition, this completes as soon as listen() reaches its first await.
ListenTask = new Task(() => listen());
ListenTask.RunSynchronously();
}
/// <summary>
/// To avoid making wasted calls to Google's Speech API, this function is used to
/// detect if the audio is quieter than the threshold.
/// </summary>
/// <param name="samples">The buffer of audio samples as floating points</param>
/// <param name="threshold">The threashold to be treated as silence</param>
/// <returns></returns>
static private bool DetectSilence(float[] samples, float threshold)
{
float maxLevel = Mathf.Max(
Mathf.Abs(Mathf.Min(samples)),
Mathf.Abs(Mathf.Max(samples))
);
return maxLevel < threshold;
}
/// <summary>
/// This function runs in the background to read recognition results from the
/// ResponseStream. It must run asynchronously, otherwise an error will be raised.
///
/// This function invokes two events: one for a partial result and one for a final result.
/// Callbacks can be attached to these events to react to incoming transcripts.
/// </summary>
private async void listen()
{
while (await _streamingCall.ResponseStream.MoveNext(default))
{
foreach (var result in _streamingCall.ResponseStream.Current.Results)
{
OnResultEvent.Invoke(result);
if (result.IsFinal)
{
OnFinalResultEvent.Invoke(result);
}
}
}
}
private bool resumeWhenUnpaused = false;
private void OnApplicationPause(bool pause)
{
if (pause)
{
resumeWhenUnpaused = _active;
Active = false;
}
else
{
Active = resumeWhenUnpaused;
}
}
}
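
// AudioUtils.FloatsToLinear16, used in SendAudio above, is not defined in this
// file. The sketch below shows what it is assumed to do (convert [-1, 1] float
// samples to little-endian 16-bit PCM, matching the LINEAR16 encoding declared
// in the RecognitionConfig); the project's own implementation may differ.
public static class AudioUtils
{
    public static byte[] FloatsToLinear16(float[] samples)
    {
        // Two bytes per sample, little-endian.
        var bytes = new byte[samples.Length * 2];
        for (int i = 0; i < samples.Length; i++)
        {
            // Clamp to the valid range and scale to a signed 16-bit value.
            short value = (short)(Mathf.Clamp(samples[i], -1f, 1f) * short.MaxValue);
            bytes[i * 2] = (byte)(value & 0xFF);
            bytes[i * 2 + 1] = (byte)((value >> 8) & 0xFF);
        }
        return bytes;
    }
}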