using Google.Apis.Auth.OAuth2;
using Google.Cloud.Speech.V1;
using Google.Protobuf.Collections;
using Grpc.Auth;
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;
using UnityEngine;
using UnityEngine.Events;
#if PLATFORM_ANDROID
using UnityEngine.Android;
#endif

public class GoogleVoiceStreaming : MonoBehaviour
{
    // UnityEvent carrying a streaming recognition result. The generic type argument
    // was missing in the source; it is restored here so that Invoke(result) in
    // listen() compiles.
    [Serializable]
    public class TranscriptEvent : UnityEvent<StreamingRecognitionResult> { }

    public TranscriptEvent OnResultEvent;
    public TranscriptEvent OnFinalResultEvent;

    float SILENCE_THRESHOLD = 0.01f;

    AudioClip _audioClip = null;
    bool _recording = false;
    int _recordingHZ = 22050;
    bool _stream_started = false;
    Task ListenTask;
    float t = 0;

    RecognitionConfig _config;
    SpeechClient _speech;
    SpeechClient.StreamingRecognizeStream _streamingCall;

#if PLATFORM_ANDROID
    GameObject dialog = null;
#endif

    ~GoogleVoiceStreaming()
    {
        if (_stream_started)
        {
            _streamingCall.WriteCompleteAsync().Wait();
        }
    }

    public void OnDestroy()
    {
        if (_stream_started)
        {
            _streamingCall.WriteCompleteAsync().Wait();
        }
    }

    void OnDisable()
    {
        Debug.Log("Disabled");
        if (_stream_started)
        {
            EndRecording();
        }
    }

    void OnActive() { }

    public bool StartActive = false;
    public bool _active = false;

    public bool Active
    {
        set
        {
            if (value && !_recording)
            {
                _active = true;
                BeginRecognition();
            }
            else if (!value && _recording)
            {
                _active = false;
                EndRecording();
            }
        }
    }

    public void Start()
    {
#if PLATFORM_ANDROID
        if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
        {
            Permission.RequestUserPermission(Permission.Microphone);
            dialog = new GameObject();
        }
#endif
        if (_speech == null)
        {
            CreateChannel();
            Active = StartActive;
        }
        Debug.Log(_speech);
    }

    public void Update()
    {
        // Track how long the stream has been running.
        // If the run time exceeds 340 s, restart the stream.
        if (_recording)
        {
            t += Time.deltaTime;
            if (t > 340)
            {
                EndRecording();
                BeginRecognition();
                t = 0;
            }
        }
    }

    void OnGUI()
    {
#if PLATFORM_ANDROID
        if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
        {
            // The user denied permission to use the microphone.
            // Display a message explaining why it is needed, with Yes/No buttons.
            // If the user says yes, present the request again.
            // Display a dialog here. The component type argument was lost from the
            // source; attach your own rationale-dialog component, e.g.
            // dialog.AddComponent<PermissionRationaleDialog>();
            return;
        }
        else if (dialog != null)
        {
            Destroy(dialog);
        }
#endif
        // Now you can do things with the microphone.
    }

    public void CreateChannel()
    {
        // The service-account key is expected as a TextAsset named "key" in Resources.
        var key = Resources.Load<TextAsset>("key");

        _config = new RecognitionConfig()
        {
            Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
            SampleRateHertz = 22050,
            LanguageCode = "en",
        };

        var credentials = GoogleCredential.FromJson(key.text).CreateScoped();
        var channel = new Grpc.Core.Channel(
            SpeechClient.DefaultEndpoint.Host,
            credentials.ToChannelCredentials()
        );
        _speech = SpeechClient.Create(channel);
    }

    public void BeginRecognition()
    {
        if (_speech == null)
        {
            CreateChannel();
        }

        _streamingCall = _speech.StreamingRecognize();

        // The first request on the stream carries the configuration, not audio.
        _streamingCall.WriteAsync(
            new StreamingRecognizeRequest()
            {
                StreamingConfig = new StreamingRecognitionConfig()
                {
                    Config = _config,
                    InterimResults = true,
                }
            }
        ).Wait();

        _recording = true;
        _stream_started = true;
        StartCoroutine(AudioListener());
        ListenTask = new Task(() => listen());
        ListenTask.RunSynchronously();
    }

    // Function ends the recording of audio.
    public void EndRecording()
    {
        if (_stream_started)
        {
            _streamingCall.WriteCompleteAsync();
            _stream_started = false;
            _recording = false;
        }
    }

    /// <summary>
    /// This method provides the coroutine needed for recording audio from the
    /// microphone to a buffer that can be sent to Google Speech.
    /// Since it uses a ring buffer, once the buffer is full it loops back to the
    /// start and overwrites old data. The buffer is streamed in chunks of
    /// 1/2 * buffer.length, which reduces recording latency.
    /// </summary>
    /// <returns>The coroutine enumerator.</returns>
    IEnumerator AudioListener()
    {
        _audioClip = Microphone.Start(null, true, 1, _recordingHZ);
        yield return null;

        if (_audioClip == null)
        {
            yield break;
        }

        float[] samples = null;
        bool inFirstHalf = true;
        int midPoint = _audioClip.samples / 2;

        while (_recording && _audioClip != null)
        {
            int writePos = Microphone.GetPosition(null);
            if (writePos > _audioClip.samples || !Microphone.IsRecording(null))
            {
                Debug.Log("Failed to get Microphone");
                yield break;
            }

            // Check if we were in the first half but have crossed the mid point (upper half full),
            // or if we were in the second half but have wrapped back to the start (lower half full).
            if ((inFirstHalf && writePos >= midPoint) || (!inFirstHalf && writePos < midPoint))
            {
                samples = new float[midPoint];
                _audioClip.GetData(samples, inFirstHalf ? 0 : midPoint);
                SendAudio(samples);
                inFirstHalf = !inFirstHalf;
            }
            else
            {
                // Otherwise, wait long enough for the current half of the buffer to fill.
                int remaining = inFirstHalf ? (midPoint - writePos) : (_audioClip.samples - writePos);
                float timeRemaining = remaining / (float)_recordingHZ;
                yield return new WaitForSeconds(timeRemaining);
            }
        }
    }

    /// <summary>
    /// This function sends the audio to Google. However, if the audio is below
    /// SILENCE_THRESHOLD, it does not send the audio.
    /// </summary>
    /// <param name="samples">The buffer of audio samples as floats.</param>
    public void SendAudio(float[] samples)
    {
        // If the samples are all below SILENCE_THRESHOLD, the audio is not sent to
        // Google for processing and the streaming call is closed. This is done to
        // prevent the Google Cloud Speech API from crashing the program: the API
        // expects a continuous stream of real-time audio, and not sending audio is
        // considered an error. Therefore, the stream is closed and restarted later.
        if (DetectSilence(samples, SILENCE_THRESHOLD))
        {
            if (_stream_started)
            {
                _streamingCall.WriteCompleteAsync().Wait();
                ListenTask.Wait();
                _stream_started = false;
            }
        }
        else
        {
            // Restart the stream if it has been closed.
            if (!_stream_started)
            {
                StartStream();
            }

            var bytes = AudioUtils.FloatsToLinear16(samples);
            _streamingCall.WriteAsync(
                new StreamingRecognizeRequest()
                {
                    AudioContent = Google.Protobuf.ByteString.CopyFrom(bytes, 0, bytes.Length)
                });
        }
    }

    /// <summary>
    /// This function starts the streaming recognizer. It blocks until the
    /// connection to Google's API is established.
    /// </summary>
    private void StartStream()
    {
        _streamingCall = _speech.StreamingRecognize();
        _streamingCall.WriteAsync(
            new StreamingRecognizeRequest()
            {
                StreamingConfig = new StreamingRecognitionConfig()
                {
                    Config = _config,
                    InterimResults = true,
                }
            }).Wait();

        _stream_started = true;
        ListenTask = new Task(() => listen());
        ListenTask.RunSynchronously();
    }

    /// <summary>
    /// To avoid making wasted calls to Google's Speech API, this function detects
    /// whether the audio is quieter than the threshold.
    /// </summary>
    /// <param name="samples">The buffer of audio samples as floats.</param>
    /// <param name="threshold">The threshold below which audio is treated as silence.</param>
    /// <returns>True if every sample is below the threshold.</returns>
    static private bool DetectSilence(float[] samples, float threshold)
    {
        float maxLevel = Mathf.Max(
            Mathf.Abs(Mathf.Min(samples)),
            Mathf.Abs(Mathf.Max(samples))
        );
        return maxLevel < threshold;
    }

    /// <summary>
    /// This function runs in the background to read responses from the ResponseStream.
    /// It must run asynchronously, otherwise an error is raised.
    ///
    /// This function invokes two events: one for a partial result and one for a final result.
    /// Callbacks can be attached to these events to react to incoming transcripts.
    /// </summary>
    private async void listen()
    {
        while (await _streamingCall.ResponseStream.MoveNext(default))
        {
            foreach (var result in _streamingCall.ResponseStream.Current.Results)
            {
                OnResultEvent.Invoke(result);
                if (result.IsFinal)
                {
                    OnFinalResultEvent.Invoke(result);
                }
            }
        }
    }

    private bool resumeWhenUnpaused = false;

    private void OnApplicationPause(bool pause)
    {
        if (pause)
        {
            resumeWhenUnpaused = _active;
            Active = false;
        }
        else
        {
            Active = resumeWhenUnpaused;
        }
    }
}
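
// SendAudio above calls AudioUtils.FloatsToLinear16, which is not defined in this
// file. The class below is a minimal sketch of what such a helper might look like,
// assuming a standard conversion from float samples in [-1, 1] to 16-bit
// little-endian PCM (LINEAR16). It is illustrative only and may differ from the
// project's actual AudioUtils; omit it if the project already provides one.
public static class AudioUtils
{
    public static byte[] FloatsToLinear16(float[] samples)
    {
        // Two bytes per sample: 16-bit signed PCM, little-endian.
        var bytes = new byte[samples.Length * 2];
        for (int i = 0; i < samples.Length; i++)
        {
            // Clamp to [-1, 1] and scale to the signed 16-bit range.
            float clamped = Mathf.Clamp(samples[i], -1f, 1f);
            short value = (short)(clamped * short.MaxValue);
            bytes[i * 2] = (byte)(value & 0xFF);
            bytes[i * 2 + 1] = (byte)((value >> 8) & 0xFF);
        }
        return bytes;
    }
}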
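
// A minimal usage sketch (assumed, not part of the original project): a component
// that subscribes to GoogleVoiceStreaming's final-result event and logs the best
// transcript. The Alternatives and Transcript members come from
// Google.Cloud.Speech.V1.StreamingRecognitionResult. Attach this to a GameObject
// and assign the GoogleVoiceStreaming instance in the Inspector, or wire the
// events there directly instead of in code.
public class TranscriptLogger : MonoBehaviour
{
    public GoogleVoiceStreaming voice;

    void OnEnable()
    {
        // Interim results also arrive on OnResultEvent; here we only handle final ones.
        voice.OnFinalResultEvent.AddListener(HandleFinalResult);
    }

    void OnDisable()
    {
        voice.OnFinalResultEvent.RemoveListener(HandleFinalResult);
    }

    void HandleFinalResult(StreamingRecognitionResult result)
    {
        if (result.Alternatives.Count > 0)
        {
            Debug.Log("Transcript: " + result.Alternatives[0].Transcript);
        }
    }
}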