GoogleVoiceStreaming.cs
using Google.Apis.Auth.OAuth2;
using Google.Cloud.Speech.V1;
using Google.Protobuf.Collections;
using Grpc.Auth;
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;
using UnityEngine;
using UnityEngine.Events;
#if PLATFORM_ANDROID
using UnityEngine.Android;
#endif

public class GoogleVoiceStreaming : MonoBehaviour
{
    [Serializable]
    public class TranscriptEvent : UnityEvent<StreamingRecognitionResult> { }
    public TranscriptEvent OnResultEvent;      // fired for every result, interim or final
    public TranscriptEvent OnFinalResultEvent; // fired only for final results

    float SILENCE_THRESHOLD = 0.01f; // peak amplitude below which a buffer counts as silence
    AudioClip _audioClip = null;
    bool _recording = false;
    int _recordingHZ = 22050;
    bool _stream_started = false;
    Task ListenTask;
    float t = 0; // seconds the current stream has been running

    RecognitionConfig _config;
    SpeechClient _speech;
    SpeechClient.StreamingRecognizeStream _streamingCall;
#if PLATFORM_ANDROID
    GameObject dialog = null;
#endif

    ~GoogleVoiceStreaming()
    {
        if (_stream_started)
        {
            _streamingCall.WriteCompleteAsync().Wait();
        }
    }

    public void OnDestroy()
    {
        if (_stream_started)
        {
            _streamingCall.WriteCompleteAsync().Wait();
        }
    }

    void OnDisable()
    {
        Debug.Log("Disabled");
        if (_stream_started)
        {
            EndRecording();
        }
    }

    // Unused stub; OnActive is not a Unity message.
    void OnActive()
    {
    }

    public bool StartActive = false;
    public bool _active = false;

    // Setting Active to true starts recognition; setting it to false stops it.
    public bool Active
    {
        set
        {
            if (value && !_recording)
            {
                _active = true;
                BeginRecognition();
            }
            else if (!value && _recording)
            {
                _active = false;
                EndRecording();
            }
        }
    }

    public void Start()
    {
#if PLATFORM_ANDROID
        if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
        {
            Permission.RequestUserPermission(Permission.Microphone);
            dialog = new GameObject();
        }
#endif
        if (_speech == null)
        {
            CreateChannel();
            Active = StartActive;
        }

        Debug.Log(_speech);
    }

    public void Update()
    {
        // Count how long the stream has been running; if the run time
        // exceeds 340 s, restart the stream (a single streaming session
        // cannot run indefinitely, so it is cycled proactively).
        if (_recording)
        {
            t += Time.deltaTime;
            if (t > 340)
            {
                EndRecording();
                BeginRecognition();
                t = 0;
            }
        }
    }

    void OnGUI()
    {
#if PLATFORM_ANDROID
        if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
        {
            // The user denied permission to use the microphone.
            // Display a message explaining why you need it with Yes/No buttons.
            // If the user says yes then present the request again.
            // PermissionsRationaleDialog comes from Unity's Android-permission
            // example and is not included in this file.
            dialog.AddComponent<PermissionsRationaleDialog>();
            return;
        }
        else if (dialog != null)
        {
            Destroy(dialog);
        }
#endif

        // Now you can do things with the microphone.
    }

    public void CreateChannel()
    {
        // The service-account key is loaded from a TextAsset named "key"
        // in a Resources folder.
        var key = Resources.Load<TextAsset>("key");
        _config = new RecognitionConfig()
        {
            Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
            SampleRateHertz = _recordingHZ,
            LanguageCode = "en-US", // BCP-47 language tag
        };

        var credentials = GoogleCredential.FromJson(key.text)
            .CreateScoped();
        var channel = new Grpc.Core.Channel(
            SpeechClient.DefaultEndpoint.Host,
            credentials.ToChannelCredentials()
        );
        _speech = SpeechClient.Create(channel);
    }

    public void BeginRecognition()
    {
        if (_speech == null)
        {
            CreateChannel();
        }
        // StartStream sends the configuration request and starts the
        // background response reader.
        StartStream();
        _recording = true;
        StartCoroutine(AudioListener());
    }

    // Ends the recording of audio and closes the write side of the stream.
    public void EndRecording()
    {
        if (_stream_started)
        {
            _streamingCall.WriteCompleteAsync();
            _stream_started = false;
            _recording = false;
        }
    }

    /// <summary>
    /// This coroutine records audio from the microphone into a buffer that can
    /// be sent to Google Speech. Since it uses a ring buffer, once the buffer
    /// is full it loops to the start and overwrites old data. The buffer is
    /// streamed in chunks of half its length, which reduces recording latency.
    /// </summary>
    /// <returns></returns>
    IEnumerator AudioListener()
    {
        // One-second looping clip: the microphone writes into it continuously.
        _audioClip = Microphone.Start(null, true, 1, _recordingHZ);

        yield return null;

        if (_audioClip == null)
        {
            yield break;
        }

        float[] samples = null;
        bool inFirstHalf = true;
        int midPoint = _audioClip.samples / 2;

        while (_recording && _audioClip != null)
        {
            int writePos = Microphone.GetPosition(null);

            if (writePos > _audioClip.samples || !Microphone.IsRecording(null))
            {
                Debug.Log("Failed to get Microphone");
                yield break;
            }
            // Check if we were in the first half but have crossed the mid point
            // (upper half full), or if we were in the last half but have
            // wrapped to the start (lower half full).
            if ((inFirstHalf && writePos >= midPoint) ||
                (!inFirstHalf && writePos < midPoint))
            {
                samples = new float[midPoint];
                _audioClip.GetData(samples, inFirstHalf ? 0 : midPoint);

                SendAudio(samples);
                inFirstHalf = !inFirstHalf;
            }
            else
            {
                // Otherwise wait roughly the time needed to fill the current
                // half (e.g. 11025 remaining samples at 22050 Hz = 0.5 s).
                int remaining = inFirstHalf ? (midPoint - writePos) : (_audioClip.samples - writePos);
                float timeRemaining = remaining / (float)_recordingHZ;
                yield return new WaitForSeconds(timeRemaining);
            }
        }
    }

    /// <summary>
    /// Sends the audio to Google. However, if the audio is below
    /// SILENCE_THRESHOLD the audio is not sent.
    /// </summary>
    /// <param name="samples"></param>
    public void SendAudio(float[] samples)
    {
        // If the samples are all below SILENCE_THRESHOLD, the audio is not
        // sent to Google for processing, and the streaming call is closed.
        // This is done to prevent the Google Cloud Speech API from crashing
        // the program: the API expects a continuous stream of real-time
        // audio, and not sending audio is considered an error. Therefore the
        // stream needs to be closed and later restarted.
        if (DetectSilence(samples, SILENCE_THRESHOLD))
        {
            if (_stream_started)
            {
                _streamingCall.WriteCompleteAsync().Wait();
                ListenTask.Wait();
                _stream_started = false;
            }
        }
        else
        {
            // Start the stream if it has been closed.
            if (!_stream_started)
            {
                StartStream();
            }
            // AudioUtils is not part of this file; a sketch of what
            // FloatsToLinear16 is assumed to do appears after the class.
            var bytes = AudioUtils.FloatsToLinear16(samples);
            _streamingCall.WriteAsync(
                new StreamingRecognizeRequest()
                {
                    AudioContent = Google.Protobuf.ByteString
                        .CopyFrom(bytes, 0, bytes.Length)
                });
        }
    }

    /// <summary>
    /// Starts the streaming recognizer. This method blocks until the initial
    /// configuration request has been written to Google's API.
    /// </summary>
    private void StartStream()
    {
        _streamingCall = _speech.StreamingRecognize();
        _streamingCall.WriteAsync(
            new StreamingRecognizeRequest()
            {
                StreamingConfig = new StreamingRecognitionConfig()
                {
                    Config = _config,
                    InterimResults = true,
                }
            }).Wait();
        _stream_started = true;
        // Run the response reader on the thread pool so that ListenTask.Wait()
        // on the main thread cannot deadlock on Unity's synchronization
        // context, and so the Task actually tracks the reader's lifetime.
        ListenTask = Task.Run(() => listen());
    }

    /// <summary>
    /// To avoid making wasted calls to Google's Speech API, this function
    /// detects whether the audio is quieter than the threshold.
    /// </summary>
    /// <param name="samples">The buffer of audio samples as floating points</param>
    /// <param name="threshold">The threshold below which audio is treated as silence</param>
    /// <returns></returns>
    static private bool DetectSilence(float[] samples, float threshold)
    {
        // Peak level: the largest absolute sample value in the buffer.
        float maxLevel = Mathf.Max(
            Mathf.Abs(Mathf.Min(samples)),
            Mathf.Abs(Mathf.Max(samples))
        );
        return maxLevel < threshold;
    }
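
    /// <summary>
    /// Alternative sketch (not used above, shown for comparison): an RMS-based
    /// check averages energy over the whole buffer, so one stray sample spike
    /// is less likely to defeat silence detection than the peak check in
    /// DetectSilence.
    /// </summary>
    static private bool DetectSilenceRms(float[] samples, float threshold)
    {
        float sumOfSquares = 0f;
        foreach (float s in samples)
        {
            sumOfSquares += s * s;
        }
        // Root-mean-square level of the buffer.
        return Mathf.Sqrt(sumOfSquares / samples.Length) < threshold;
    }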

    /// <summary>
    /// Runs in the background to read responses from the ResponseStream. This
    /// must run asynchronously, otherwise an error will be raised.
    ///
    /// This method invokes two events: one for every partial result and one
    /// for final results only. Callbacks can be attached to these events to
    /// react to incoming transcripts. Note that the callbacks fire on a
    /// thread-pool thread, not on Unity's main thread.
    /// </summary>
    private async Task listen()
    {
        while (await _streamingCall.ResponseStream.MoveNext(default))
        {
            foreach (var result in _streamingCall.ResponseStream.Current.Results)
            {
                OnResultEvent.Invoke(result);
                if (result.IsFinal)
                {
                    OnFinalResultEvent.Invoke(result);
                }
            }
        }
    }

    // Suspend recognition while the app is paused and resume it afterwards.
    private bool resumeWhenUnpaused = false;
    private void OnApplicationPause(bool pause)
    {
        if (pause)
        {
            resumeWhenUnpaused = _active;
            Active = false;
        }
        else
        {
            Active = resumeWhenUnpaused;
        }
    }
}
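
// ---------------------------------------------------------------------------
// AudioUtils is referenced in SendAudio but is not included in this gist.
// What follows is a minimal sketch, assuming FloatsToLinear16 converts
// Unity's [-1, 1] float samples to 16-bit little-endian PCM (the LINEAR16
// encoding declared in the RecognitionConfig). Omit it if your project
// already defines AudioUtils.
// ---------------------------------------------------------------------------
public static class AudioUtils
{
    public static byte[] FloatsToLinear16(float[] samples)
    {
        var bytes = new byte[samples.Length * 2];
        for (int i = 0; i < samples.Length; i++)
        {
            // Clamp, scale to the signed 16-bit range, and write little-endian.
            short s = (short)(Mathf.Clamp(samples[i], -1f, 1f) * short.MaxValue);
            bytes[i * 2] = (byte)(s & 0xff);
            bytes[i * 2 + 1] = (byte)((s >> 8) & 0xff);
        }
        return bytes;
    }
}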
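
// ---------------------------------------------------------------------------
// Example usage (not part of the original file): a hypothetical listener that
// logs final transcripts. Attach it next to GoogleVoiceStreaming and assign
// the recognizer reference in the inspector, or wire HandleFinalResult to
// OnFinalResultEvent directly in the inspector instead.
// ---------------------------------------------------------------------------
public class TranscriptLogger : MonoBehaviour
{
    public GoogleVoiceStreaming recognizer;

    void Start()
    {
        recognizer.OnFinalResultEvent.AddListener(HandleFinalResult);
    }

    public void HandleFinalResult(StreamingRecognitionResult result)
    {
        // A result carries one or more alternatives, best first.
        if (result.Alternatives.Count > 0)
        {
            Debug.Log("Final transcript: " + result.Alternatives[0].Transcript);
        }
    }
}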