web-speech-cognitive-services
Comparing version 4.0.1-master.4a38b83 to 4.0.1-master.54dc22a
@@ -35,2 +35,7 @@ # Changelog | ||
- Use option `speechSynthesisDeploymentId` | ||
- Speech synthesis: Fix [#48](https://github.com/compulim/web-speech-cognitive-services/issues/48), support output format through `outputFormat` option, in PR [#49](https://github.com/compulim/web-speech-cognitive-services/pull/49) | ||
- `*`: Fix [#47](https://github.com/compulim/web-speech-cognitive-services/issues/47), add `enableTelemetry` option for disabling telemetry data collection in the Speech SDK, in PR [#51](https://github.com/compulim/web-speech-cognitive-services/pull/51) | ||
- `*`: Fix [#53](https://github.com/compulim/web-speech-cognitive-services/issues/53), added ESLint, in PR [#54](https://github.com/compulim/web-speech-cognitive-services/pull/54) | ||
- Speech synthesis: Fix [#39](https://github.com/compulim/web-speech-cognitive-services/issues/39), support SSML utterance, in PR [#57](https://github.com/compulim/web-speech-cognitive-services/pull/57) | ||
- Speech recognition: Fix [#59](https://github.com/compulim/web-speech-cognitive-services/issues/59), support `stop()` function by finalizing partial speech, in PR [#60](https://github.com/compulim/web-speech-cognitive-services/pull/60) | ||
@@ -55,3 +60,8 @@ ### Changed | ||
- Bumped to [microsoft-cognitiveservices-speech-sdk@1.6.0](https://www.npmjs.com/package/microsoft-cognitiveservices-speech-sdk), in PR [#22](https://github.com/compulim/web-speech-cognitive-services/pull/22) | ||
- Fix [#55](https://github.com/compulim/web-speech-cognitive-services/issues/55) and [#63](https://github.com/compulim/web-speech-cognitive-services/issues/63). Moved to the [WHATWG `EventTarget` interface](https://dom.spec.whatwg.org/#interface-eventtarget), in PR [#56](https://github.com/compulim/web-speech-cognitive-services/pulls/56) and PR [#64](https://github.com/compulim/web-speech-cognitive-services/pulls/64) | ||
### Fixed | ||
- Fix [#45](https://github.com/compulim/web-speech-cognitive-services/issues/45). Speech synthesis should emit "start" and "error" if the synthesized audio clip cannot be fetched over the network, in PR [#46](https://github.com/compulim/web-speech-cognitive-services/issues/46) | ||
## [4.0.0] - 2018-12-10 | ||
@@ -58,0 +68,0 @@ ### Added |
@@ -46,3 +46,3 @@ "use strict"; | ||
var VERSION = "4.0.1-master.4a38b83"; | ||
var VERSION = "4.0.1-master.54dc22a"; | ||
@@ -49,0 +49,0 @@ function buildSpeechResult(transcript, confidence, isFinal) { |
@@ -51,7 +51,6 @@ "use strict"; | ||
; | ||
var meta = document.createElement('meta'); | ||
meta.setAttribute('name', 'web-speech-cognitive-services'); | ||
meta.setAttribute('content', "version=".concat("4.0.1-master.4a38b83")); | ||
meta.setAttribute('content', "version=".concat("4.0.1-master.54dc22a")); | ||
document.head.appendChild(meta); | ||
//# sourceMappingURL=SpeechServices.js.map |
@@ -41,3 +41,3 @@ "use strict"; | ||
if (!(res.status !== 200)) { | ||
if (res.ok) { | ||
_context.next = 6; | ||
@@ -44,0 +44,0 @@ break; |
"use strict"; | ||
var _interopRequireWildcard = require("@babel/runtime/helpers/interopRequireWildcard"); | ||
Object.defineProperty(exports, "__esModule", { | ||
@@ -10,6 +8,13 @@ value: true | ||
var CognitiveServicesSpeechSDK = _interopRequireWildcard(require("microsoft-cognitiveservices-speech-sdk/distrib/lib/microsoft.cognitiveservices.speech.sdk")); | ||
var _microsoftCognitiveservicesSpeech = require("microsoft-cognitiveservices-speech-sdk/distrib/lib/microsoft.cognitiveservices.speech.sdk"); | ||
var _default = CognitiveServicesSpeechSDK; | ||
// We are only importing what we need. | ||
var _default = { | ||
AudioConfig: _microsoftCognitiveservicesSpeech.AudioConfig, | ||
OutputFormat: _microsoftCognitiveservicesSpeech.OutputFormat, | ||
ResultReason: _microsoftCognitiveservicesSpeech.ResultReason, | ||
SpeechConfig: _microsoftCognitiveservicesSpeech.SpeechConfig, | ||
SpeechRecognizer: _microsoftCognitiveservicesSpeech.SpeechRecognizer | ||
}; | ||
exports.default = _default; | ||
//# sourceMappingURL=SpeechSDK.js.map |
@@ -45,6 +45,6 @@ "use strict"; | ||
return resultList; | ||
} else { | ||
return []; | ||
} | ||
return []; | ||
} | ||
//# sourceMappingURL=cognitiveServiceEventResultToWebSpeechRecognitionResultList.js.map |
@@ -12,4 +12,2 @@ "use strict"; | ||
var _defineProperty2 = _interopRequireDefault(require("@babel/runtime/helpers/defineProperty")); | ||
var _regenerator = _interopRequireDefault(require("@babel/runtime/regenerator")); | ||
@@ -19,4 +17,2 @@ | ||
var _classCallCheck2 = _interopRequireDefault(require("@babel/runtime/helpers/classCallCheck")); | ||
var _createClass2 = _interopRequireDefault(require("@babel/runtime/helpers/createClass")); | ||
@@ -30,2 +26,8 @@ | ||
var _defineProperty2 = _interopRequireDefault(require("@babel/runtime/helpers/defineProperty")); | ||
var _classCallCheck2 = _interopRequireDefault(require("@babel/runtime/helpers/classCallCheck")); | ||
var _eventTargetShim = require("event-target-shim"); | ||
var _cognitiveServiceEventResultToWebSpeechRecognitionResultList = _interopRequireDefault(require("./cognitiveServiceEventResultToWebSpeechRecognitionResultList")); | ||
@@ -35,4 +37,2 @@ | ||
var _DOMEventEmitter2 = _interopRequireDefault(require("../../Util/DOMEventEmitter")); | ||
var _SpeechGrammarList = _interopRequireDefault(require("./SpeechGrammarList")); | ||
@@ -120,14 +120,32 @@ | ||
var SpeechRecognitionEvent = function SpeechRecognitionEvent(type) { | ||
var _ref2 = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {}, | ||
data = _ref2.data, | ||
emma = _ref2.emma, | ||
interpretation = _ref2.interpretation, | ||
resultIndex = _ref2.resultIndex, | ||
results = _ref2.results; | ||
(0, _classCallCheck2.default)(this, SpeechRecognitionEvent); | ||
this.data = data; | ||
this.emma = emma; | ||
this.interpretation = interpretation; | ||
this.resultIndex = resultIndex; | ||
this.results = results; | ||
this.type = type; | ||
}; | ||
var _default = function _default() { | ||
var _ref2 = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : {}, | ||
_ref2$audioConfig = _ref2.audioConfig, | ||
audioConfig = _ref2$audioConfig === void 0 ? AudioConfig.fromDefaultMicrophoneInput() : _ref2$audioConfig, | ||
authorizationToken = _ref2.authorizationToken, | ||
referenceGrammars = _ref2.referenceGrammars, | ||
_ref2$region = _ref2.region, | ||
region = _ref2$region === void 0 ? 'westus' : _ref2$region, | ||
speechRecognitionEndpointId = _ref2.speechRecognitionEndpointId, | ||
subscriptionKey = _ref2.subscriptionKey, | ||
_ref2$textNormalizati = _ref2.textNormalization, | ||
textNormalization = _ref2$textNormalizati === void 0 ? 'display' : _ref2$textNormalizati; | ||
var _ref3 = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : {}, | ||
_ref3$audioConfig = _ref3.audioConfig, | ||
audioConfig = _ref3$audioConfig === void 0 ? AudioConfig.fromDefaultMicrophoneInput() : _ref3$audioConfig, | ||
authorizationToken = _ref3.authorizationToken, | ||
enableTelemetry = _ref3.enableTelemetry, | ||
referenceGrammars = _ref3.referenceGrammars, | ||
_ref3$region = _ref3.region, | ||
region = _ref3$region === void 0 ? 'westus' : _ref3$region, | ||
speechRecognitionEndpointId = _ref3.speechRecognitionEndpointId, | ||
subscriptionKey = _ref3.subscriptionKey, | ||
_ref3$textNormalizati = _ref3.textNormalization, | ||
textNormalization = _ref3$textNormalizati === void 0 ? 'display' : _ref3$textNormalizati; | ||
@@ -142,6 +160,37 @@ if (!authorizationToken && !subscriptionKey) { | ||
var _onAudibleChunk; | ||
var muted; // We modify the "attach" function and detect when an audible chunk is read. | ||
// We will only modify the "attach" function once. | ||
audioConfig.attach = improviseAsync(audioConfig.attach.bind(audioConfig), function (reader) { | ||
return _objectSpread({}, reader, { | ||
read: improviseAsync(reader.read.bind(reader), function (chunk) { | ||
// The magic number 150 is measured by: | ||
// 1. Set microphone volume to 0 | ||
// 2. Observe the amplitude (100-110) for the first few chunks | ||
// (This is short static caught when turning on the microphone) | ||
// 3. Set the number a bit higher than the observation | ||
if (averageAmplitude(chunk.buffer) > 150) { | ||
_onAudibleChunk && _onAudibleChunk(); | ||
} | ||
if (muted) { | ||
return { | ||
buffer: new ArrayBuffer(0), | ||
isEnd: true, | ||
timeReceived: Date.now() | ||
}; | ||
} | ||
return chunk; | ||
}) | ||
}); | ||
}); | ||
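// Illustrative sketch only (not part of the library): one way an amplitude
// helper such as "averageAmplitude" above might be implemented, assuming the
// chunk buffer holds 16-bit PCM samples. The name and details are hypothetical.
function exampleAverageAmplitude(arrayBuffer) {
  var samples = new Int16Array(arrayBuffer);
  var total = 0;

  // Average the absolute sample values; louder audio yields a larger number,
  // which is then compared against the measured threshold of 150 above.
  for (var i = 0; i < samples.length; i++) {
    total += Math.abs(samples[i]);
  }

  return samples.length ? total / samples.length : 0;
}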
SpeechRecognizer.enableTelemetry(enableTelemetry); | ||
var SpeechRecognition = | ||
/*#__PURE__*/ | ||
function (_DOMEventEmitter) { | ||
(0, _inherits2.default)(SpeechRecognition, _DOMEventEmitter); | ||
function (_EventTarget) { | ||
(0, _inherits2.default)(SpeechRecognition, _EventTarget); | ||
@@ -152,3 +201,3 @@ function SpeechRecognition() { | ||
(0, _classCallCheck2.default)(this, SpeechRecognition); | ||
_this = (0, _possibleConstructorReturn2.default)(this, (0, _getPrototypeOf2.default)(SpeechRecognition).call(this, ['audiostart', 'soundstart', 'speechstart', 'speechend', 'soundend', 'audioend', 'result', 'nomatch', 'error', 'start', 'end', 'cognitiveservices'])); | ||
_this = (0, _possibleConstructorReturn2.default)(this, (0, _getPrototypeOf2.default)(SpeechRecognition).call(this)); | ||
_this._continuous = false; | ||
@@ -238,4 +287,6 @@ _this._interimResults = false; | ||
value: function emitCognitiveServices(type, event) { | ||
this.emit('cognitiveservices', _objectSpread({}, event, { | ||
subType: type | ||
this.dispatchEvent(new SpeechRecognitionEvent('cognitiveservices', { | ||
data: _objectSpread({}, event, { | ||
type: type | ||
}) | ||
})); | ||
@@ -252,6 +303,6 @@ } | ||
this._startOnce().catch(function (err) { | ||
_this2.emit('error', { | ||
_this2.dispatchEvent(new ErrorEvent('error', { | ||
error: err, | ||
message: err && err.message | ||
}); | ||
})); | ||
}); | ||
@@ -279,23 +330,11 @@ } | ||
queue = (0, _createPromiseQueue.default)(); | ||
// We modify "attach" function and detect when the first chunk is read. | ||
recognizer.audioConfig.attach = improviseAsync(recognizer.audioConfig.attach.bind(recognizer.audioConfig), function (reader) { | ||
var firstAudibleChunkEmitted; | ||
return _objectSpread({}, reader, { | ||
read: improviseAsync(reader.read.bind(reader), function (chunk) { | ||
// The magic number 150 is measured by: | ||
// 1. Set microphone volume to 0 | ||
// 2. Observe the amplitude (100-110) for the first few chunks | ||
// (This is short static caught when turning on the microphone) | ||
// 3. Set the number a bit higher than the observation | ||
if (!firstAudibleChunkEmitted && averageAmplitude(chunk.buffer) > 150) { | ||
queue.push({ | ||
firstAudibleChunk: {} | ||
}); | ||
firstAudibleChunkEmitted = true; | ||
} | ||
muted = false; | ||
return chunk; | ||
}) | ||
_onAudibleChunk = function onAudibleChunk() { | ||
queue.push({ | ||
firstAudibleChunk: {} | ||
}); | ||
}); | ||
_onAudibleChunk = null; | ||
}; | ||
_recognizer$audioConf = recognizer.audioConfig.events.attach(function (event) { | ||
@@ -315,7 +354,7 @@ var name = event.name; | ||
recognizer.canceled = function (_, _ref3) { | ||
var errorDetails = _ref3.errorDetails, | ||
offset = _ref3.offset, | ||
reason = _ref3.reason, | ||
sessionId = _ref3.sessionId; | ||
recognizer.canceled = function (_, _ref4) { | ||
var errorDetails = _ref4.errorDetails, | ||
offset = _ref4.offset, | ||
reason = _ref4.reason, | ||
sessionId = _ref4.sessionId; | ||
queue.push({ | ||
@@ -331,6 +370,6 @@ canceled: { | ||
recognizer.recognized = function (_, _ref4) { | ||
var offset = _ref4.offset, | ||
result = _ref4.result, | ||
sessionId = _ref4.sessionId; | ||
recognizer.recognized = function (_, _ref5) { | ||
var offset = _ref5.offset, | ||
result = _ref5.result, | ||
sessionId = _ref5.sessionId; | ||
queue.push({ | ||
@@ -345,6 +384,6 @@ recognized: { | ||
recognizer.recognizing = function (_, _ref5) { | ||
var offset = _ref5.offset, | ||
result = _ref5.result, | ||
sessionId = _ref5.sessionId; | ||
recognizer.recognizing = function (_, _ref6) { | ||
var offset = _ref6.offset, | ||
result = _ref6.result, | ||
sessionId = _ref6.sessionId; | ||
queue.push({ | ||
@@ -359,4 +398,4 @@ recognizing: { | ||
recognizer.sessionStarted = function (_, _ref6) { | ||
var sessionId = _ref6.sessionId; | ||
recognizer.sessionStarted = function (_, _ref7) { | ||
var sessionId = _ref7.sessionId; | ||
queue.push({ | ||
@@ -369,4 +408,4 @@ sessionStarted: { | ||
recognizer.sessionStopped = function (_, _ref7) { | ||
var sessionId = _ref7.sessionId; | ||
recognizer.sessionStopped = function (_, _ref8) { | ||
var sessionId = _ref8.sessionId; | ||
// "sessionStopped" is never fired, probably because we are using startContinuousRecognitionAsync instead of recognizeOnceAsync. | ||
@@ -380,5 +419,5 @@ queue.push({ | ||
recognizer.speechStartDetected = function (_, _ref8) { | ||
var offset = _ref8.offset, | ||
sessionId = _ref8.sessionId; | ||
recognizer.speechStartDetected = function (_, _ref9) { | ||
var offset = _ref9.offset, | ||
sessionId = _ref9.sessionId; | ||
queue.push({ | ||
@@ -392,4 +431,4 @@ speechStartDetected: { | ||
recognizer.speechEndDetected = function (_, _ref9) { | ||
var sessionId = _ref9.sessionId; | ||
recognizer.speechEndDetected = function (_, _ref10) { | ||
var sessionId = _ref10.sessionId; | ||
// "speechEndDetected" is never fired, probably because we are using startContinuousRecognitionAsync instead of recognizeOnceAsync. | ||
@@ -408,6 +447,6 @@ queue.push({ | ||
phrases && phrases.length && dynamicGrammar.addPhrase(phrases); | ||
_context3.next = 19; | ||
_context3.next = 20; | ||
return cognitiveServicesAsyncToPromise(recognizer.startContinuousRecognitionAsync.bind(recognizer))(); | ||
case 19: | ||
case 20: | ||
this.abort = function () { | ||
@@ -446,3 +485,3 @@ return queue.push({ | ||
if (!/Permission\sdenied/.test(errorMessage || '')) { | ||
if (!/Permission[\t-\r \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000\uFEFF]denied/.test(errorMessage || '')) { | ||
_context2.next = 9; | ||
@@ -461,3 +500,3 @@ break; | ||
if (!loop) { | ||
_this3.emit('start'); | ||
_this3.dispatchEvent(new SpeechRecognitionEvent('start')); | ||
} | ||
@@ -472,5 +511,5 @@ | ||
if (!audioStarted) { | ||
_this3.emit('audiostart'); | ||
_this3.dispatchEvent(new SpeechRecognitionEvent('audiostart')); | ||
_this3.emit('audioend'); | ||
_this3.dispatchEvent(new SpeechRecognitionEvent('audioend')); | ||
} | ||
@@ -493,3 +532,3 @@ | ||
if (!(abort || stop)) { | ||
_context2.next = 22; | ||
_context2.next = 23; | ||
break; | ||
@@ -503,44 +542,48 @@ } | ||
}; | ||
} else if (finalizedResults.length) { | ||
finalEvent = { | ||
results: finalizedResults, | ||
type: 'result' | ||
}; | ||
} else { | ||
// When we mute and return { isEnd: true }, Speech Services will send us a "recognized" event. | ||
muted = true; | ||
} | ||
stopping = true; | ||
_context2.next = 20; | ||
if (!abort) { | ||
_context2.next = 21; | ||
break; | ||
} | ||
_context2.next = 21; | ||
return cognitiveServicesAsyncToPromise(recognizer.stopContinuousRecognitionAsync.bind(recognizer))(); | ||
case 20: | ||
_context2.next = 42; | ||
case 21: | ||
_context2.next = 43; | ||
break; | ||
case 22: | ||
case 23: | ||
if (!audioSourceReady) { | ||
_context2.next = 27; | ||
_context2.next = 28; | ||
break; | ||
} | ||
_this3.emit('audiostart'); | ||
_this3.dispatchEvent(new SpeechRecognitionEvent('audiostart')); | ||
audioStarted = true; | ||
_context2.next = 42; | ||
_context2.next = 43; | ||
break; | ||
case 27: | ||
case 28: | ||
if (!firstAudibleChunk) { | ||
_context2.next = 32; | ||
_context2.next = 33; | ||
break; | ||
} | ||
_this3.emit('soundstart'); | ||
_this3.dispatchEvent(new SpeechRecognitionEvent('soundstart')); | ||
soundStarted = true; | ||
_context2.next = 42; | ||
_context2.next = 43; | ||
break; | ||
case 32: | ||
case 33: | ||
if (!audioSourceOff) { | ||
_context2.next = 41; | ||
_context2.next = 42; | ||
break; | ||
@@ -550,9 +593,9 @@ } | ||
stopping = true; | ||
speechStarted && _this3.emit('speechend'); | ||
soundStarted && _this3.emit('soundend'); | ||
audioStarted && _this3.emit('audioend'); | ||
speechStarted && _this3.dispatchEvent(new SpeechRecognitionEvent('speechend')); | ||
soundStarted && _this3.dispatchEvent(new SpeechRecognitionEvent('soundend')); | ||
audioStarted && _this3.dispatchEvent(new SpeechRecognitionEvent('audioend')); | ||
audioStarted = soundStarted = speechStarted = false; | ||
return _context2.abrupt("return", "break"); | ||
case 41: | ||
case 42: | ||
if (recognized && recognized.result && recognized.result.reason === ResultReason.NoMatch) { | ||
@@ -566,3 +609,3 @@ finalEvent = { | ||
// Unconfirmed prevention of quirks | ||
_this3.emit('audiostart'); | ||
_this3.dispatchEvent(new SpeechRecognitionEvent('audiostart')); | ||
@@ -573,3 +616,3 @@ audioStarted = true; | ||
if (!soundStarted) { | ||
_this3.emit('soundstart'); | ||
_this3.dispatchEvent(new SpeechRecognitionEvent('soundstart')); | ||
@@ -580,3 +623,3 @@ soundStarted = true; | ||
if (!speechStarted) { | ||
_this3.emit('speechstart'); | ||
_this3.dispatchEvent(new SpeechRecognitionEvent('speechstart')); | ||
@@ -595,16 +638,17 @@ speechStarted = true; | ||
finalizedResults = [].concat((0, _toConsumableArray2.default)(finalizedResults), [result]); | ||
_this3.continuous && _this3.emit('result', { | ||
_this3.continuous && _this3.dispatchEvent(new SpeechRecognitionEvent('result', { | ||
results: finalizedResults | ||
}); | ||
})); | ||
} | ||
finalEvent = { | ||
results: finalizedResults, | ||
type: 'result' | ||
}; | ||
if (!_this3.continuous) { | ||
finalEvent = { | ||
results: finalizedResults, | ||
type: 'result' | ||
}; | ||
recognizer.stopContinuousRecognitionAsync(); | ||
} | ||
} else if (recognizing) { | ||
_this3.interimResults && _this3.emit('result', { | ||
_this3.interimResults && _this3.dispatchEvent(new SpeechRecognitionEvent('result', { | ||
results: [].concat((0, _toConsumableArray2.default)(finalizedResults), [(0, _cognitiveServiceEventResultToWebSpeechRecognitionResultList.default)(recognizing.result, { | ||
@@ -614,7 +658,7 @@ maxAlternatives: _this3.maxAlternatives, | ||
})]) | ||
}); | ||
})); | ||
} | ||
} | ||
case 42: | ||
case 43: | ||
case "end": | ||
@@ -628,36 +672,38 @@ return _context2.stop(); | ||
case 24: | ||
case 25: | ||
if (!(!stopping || audioStarted)) { | ||
_context3.next = 32; | ||
_context3.next = 33; | ||
break; | ||
} | ||
return _context3.delegateYield(_loop(loop), "t0", 26); | ||
return _context3.delegateYield(_loop(loop), "t0", 27); | ||
case 26: | ||
case 27: | ||
_ret = _context3.t0; | ||
if (!(_ret === "break")) { | ||
_context3.next = 29; | ||
_context3.next = 30; | ||
break; | ||
} | ||
return _context3.abrupt("break", 32); | ||
return _context3.abrupt("break", 33); | ||
case 29: | ||
case 30: | ||
loop++; | ||
_context3.next = 24; | ||
_context3.next = 25; | ||
break; | ||
case 32: | ||
case 33: | ||
_onAudibleChunk = null; | ||
if (speechStarted) { | ||
this.emit('speechend'); | ||
this.dispatchEvent(new SpeechRecognitionEvent('speechend')); | ||
} | ||
if (soundStarted) { | ||
this.emit('soundend'); | ||
this.dispatchEvent(new SpeechRecognitionEvent('soundend')); | ||
} | ||
if (audioStarted) { | ||
this.emit('audioend'); | ||
this.dispatchEvent(new SpeechRecognitionEvent('audioend')); | ||
} | ||
@@ -673,3 +719,7 @@ | ||
this.emit(finalEvent.type, finalEvent); | ||
if (finalEvent.type === 'error') { | ||
this.dispatchEvent(new ErrorEvent('error', finalEvent)); | ||
} else { | ||
this.dispatchEvent(new SpeechRecognitionEvent(finalEvent.type, finalEvent)); | ||
} | ||
} // Even though there is no "start" event emitted, we will still emit "end" event | ||
@@ -679,7 +729,7 @@ // This is mainly for "microphone blocked" story. | ||
this.emit('end'); | ||
this.dispatchEvent(new SpeechRecognitionEvent('end')); | ||
detachAudioConfigEvent(); | ||
recognizer.dispose(); | ||
case 39: | ||
case 41: | ||
case "end": | ||
@@ -747,7 +797,20 @@ return _context3.stop(); | ||
return SpeechRecognition; | ||
}(_DOMEventEmitter2.default); | ||
}(_eventTargetShim.EventTarget); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechRecognition.prototype, 'audioend'); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechRecognition.prototype, 'audiostart'); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechRecognition.prototype, 'cognitiveservices'); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechRecognition.prototype, 'end'); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechRecognition.prototype, 'error'); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechRecognition.prototype, 'nomatch'); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechRecognition.prototype, 'result'); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechRecognition.prototype, 'soundend'); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechRecognition.prototype, 'soundstart'); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechRecognition.prototype, 'speechend'); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechRecognition.prototype, 'speechstart'); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechRecognition.prototype, 'start'); | ||
return { | ||
SpeechGrammarList: _SpeechGrammarList.default, | ||
SpeechRecognition: SpeechRecognition | ||
SpeechRecognition: SpeechRecognition, | ||
SpeechRecognitionEvent: SpeechRecognitionEvent | ||
}; | ||
@@ -754,0 +817,0 @@ }; |
@@ -14,2 +14,3 @@ "use strict"; | ||
/* eslint class-methods-use-this: "off" */ | ||
var _default = | ||
@@ -16,0 +17,0 @@ /*#__PURE__*/ |
@@ -18,2 +18,3 @@ "use strict"; | ||
/* eslint no-await-in-loop: "off" */ | ||
var _default = | ||
@@ -20,0 +21,0 @@ /*#__PURE__*/ |
@@ -22,2 +22,3 @@ "use strict"; | ||
/* eslint no-await-in-loop: "off" */ | ||
var _default = | ||
@@ -24,0 +25,0 @@ /*#__PURE__*/ |
@@ -8,2 +8,3 @@ "use strict"; | ||
/* eslint no-magic-numbers: ["error", { "ignore": [0, 1, 100] }] */ | ||
// Cognitive Services does not support unsigned percentage | ||
@@ -22,4 +23,3 @@ // It must be converted into +/- first. | ||
function buildSSML(_ref) { | ||
var gender = _ref.gender, | ||
lang = _ref.lang, | ||
var lang = _ref.lang, | ||
_ref$pitch = _ref.pitch, | ||
@@ -32,4 +32,4 @@ pitch = _ref$pitch === void 0 ? 1 : _ref$pitch, | ||
volume = _ref.volume; | ||
return "<speak version=\"1.0\" xml:lang=\"".concat(lang, "\">\n <voice xml:lang=\"").concat(lang, "\" xml:gender=\"").concat(gender, "\" name=\"").concat(voice, "\">\n <prosody pitch=\"").concat(relativePercentage(pitch), "\" rate=\"").concat(relativePercentage(rate), "\" volume=\"").concat(relativePercentage(volume), "\">\n ").concat(text, "\n </prosody>\n </voice>\n</speak>"); | ||
return "<speak version=\"1.0\" xml:lang=\"".concat(lang, "\">\n <voice xml:lang=\"").concat(lang, "\" name=\"").concat(voice, "\">\n <prosody pitch=\"").concat(relativePercentage(pitch), "\" rate=\"").concat(relativePercentage(rate), "\" volume=\"").concat(relativePercentage(volume), "\">\n ").concat(text, "\n </prosody>\n </voice>\n</speak>"); | ||
} | ||
//# sourceMappingURL=buildSSML.js.map |
@@ -24,8 +24,10 @@ "use strict"; | ||
var _eventTargetShim = require("event-target-shim"); | ||
var _memoizeOne = _interopRequireDefault(require("memoize-one")); | ||
var _onErrorResumeNext = _interopRequireDefault(require("on-error-resume-next")); | ||
var _AudioContextQueue = _interopRequireDefault(require("./AudioContextQueue")); | ||
var _DOMEventEmitter2 = _interopRequireDefault(require("../../Util/DOMEventEmitter")); | ||
var _fetchAuthorizationToken = _interopRequireDefault(require("../fetchAuthorizationToken")); | ||
@@ -35,6 +37,8 @@ | ||
var _SpeechSynthesisEvent = _interopRequireDefault(require("./SpeechSynthesisEvent")); | ||
var _SpeechSynthesisUtterance = _interopRequireDefault(require("./SpeechSynthesisUtterance")); | ||
// Supported output format can be found at https://docs.microsoft.com/en-us/azure/cognitive-services/Speech/API-Reference-REST/BingVoiceOutput#Subscription | ||
var DEFAULT_OUTPUT_FORMAT = 'audio-16khz-128kbitrate-mono-mp3'; | ||
// Supported output format can be found at https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs | ||
var DEFAULT_OUTPUT_FORMAT = 'audio-24khz-160kbitrate-mono-mp3'; | ||
var TOKEN_EXPIRATION = 600000; | ||
@@ -53,2 +57,4 @@ var TOKEN_EARLY_RENEWAL = 60000; | ||
speechSynthesisDeploymentId = _ref.speechSynthesisDeploymentId, | ||
_ref$speechSynthesisO = _ref.speechSynthesisOutputFormat, | ||
speechSynthesisOutputFormat = _ref$speechSynthesisO === void 0 ? DEFAULT_OUTPUT_FORMAT : _ref$speechSynthesisO, | ||
subscriptionKey = _ref.subscriptionKey; | ||
@@ -82,4 +88,4 @@ | ||
/*#__PURE__*/ | ||
function (_DOMEventEmitter) { | ||
(0, _inherits2.default)(SpeechSynthesis, _DOMEventEmitter); | ||
function (_EventTarget) { | ||
(0, _inherits2.default)(SpeechSynthesis, _EventTarget); | ||
@@ -90,4 +96,3 @@ function SpeechSynthesis() { | ||
(0, _classCallCheck2.default)(this, SpeechSynthesis); | ||
_this = (0, _possibleConstructorReturn2.default)(this, (0, _getPrototypeOf2.default)(SpeechSynthesis).call(this, ['voiceschanged'])); | ||
_this.outputFormat = DEFAULT_OUTPUT_FORMAT; | ||
_this = (0, _possibleConstructorReturn2.default)(this, (0, _getPrototypeOf2.default)(SpeechSynthesis).call(this)); | ||
_this.queue = new _AudioContextQueue.default({ | ||
@@ -133,38 +138,14 @@ audioContext: audioContext, | ||
return new Promise( | ||
/*#__PURE__*/ | ||
function () { | ||
var _ref3 = (0, _asyncToGenerator2.default)( | ||
/*#__PURE__*/ | ||
_regenerator.default.mark(function _callee(resolve, reject) { | ||
return _regenerator.default.wrap(function _callee$(_context) { | ||
while (1) { | ||
switch (_context.prev = _context.next) { | ||
case 0: | ||
utterance.addEventListener('end', resolve); | ||
utterance.addEventListener('error', reject); | ||
_context.next = 4; | ||
return getAuthorizationTokenPromise; | ||
return new Promise(function (resolve, reject) { | ||
utterance.addEventListener('end', resolve); | ||
utterance.addEventListener('error', reject); | ||
utterance.preload({ | ||
authorizationTokenPromise: getAuthorizationTokenPromise, | ||
deploymentId: speechSynthesisDeploymentId, | ||
outputFormat: speechSynthesisOutputFormat, | ||
region: region | ||
}); | ||
case 4: | ||
utterance.authorizationToken = _context.sent; | ||
utterance.deploymentId = speechSynthesisDeploymentId; | ||
utterance.region = region; | ||
utterance.outputFormat = _this2.outputFormat; | ||
utterance.preload(); | ||
_this2.queue.push(utterance); | ||
case 10: | ||
case "end": | ||
return _context.stop(); | ||
} | ||
} | ||
}, _callee); | ||
})); | ||
return function (_x, _x2) { | ||
return _ref3.apply(this, arguments); | ||
}; | ||
}()); | ||
_this2.queue.push(utterance); | ||
}); | ||
} | ||
@@ -177,2 +158,4 @@ }, { | ||
_regenerator.default.mark(function _callee2() { | ||
var _this3 = this; | ||
return _regenerator.default.wrap(function _callee2$(_context2) { | ||
@@ -183,27 +166,46 @@ while (1) { | ||
if (speechSynthesisDeploymentId) { | ||
_context2.next = 12; | ||
_context2.next = 3; | ||
break; | ||
} | ||
_context2.t0 = _fetchVoices.default; | ||
_context2.next = 4; | ||
return getAuthorizationTokenPromise; | ||
_context2.next = 3; | ||
return (0, _onErrorResumeNext.default)( | ||
/*#__PURE__*/ | ||
(0, _asyncToGenerator2.default)( | ||
/*#__PURE__*/ | ||
_regenerator.default.mark(function _callee() { | ||
return _regenerator.default.wrap(function _callee$(_context) { | ||
while (1) { | ||
switch (_context.prev = _context.next) { | ||
case 0: | ||
_context.t0 = _fetchVoices.default; | ||
_context.next = 3; | ||
return getAuthorizationTokenPromise; | ||
case 4: | ||
_context2.t1 = _context2.sent; | ||
_context2.t2 = speechSynthesisDeploymentId; | ||
_context2.t3 = region; | ||
_context2.t4 = { | ||
authorizationToken: _context2.t1, | ||
deploymentId: _context2.t2, | ||
region: _context2.t3 | ||
}; | ||
_context2.next = 10; | ||
return (0, _context2.t0)(_context2.t4); | ||
case 3: | ||
_context.t1 = _context.sent; | ||
_context.t2 = speechSynthesisDeploymentId; | ||
_context.t3 = region; | ||
_context.t4 = { | ||
authorizationToken: _context.t1, | ||
deploymentId: _context.t2, | ||
region: _context.t3 | ||
}; | ||
_context.next = 9; | ||
return (0, _context.t0)(_context.t4); | ||
case 10: | ||
this.voices = _context2.sent; | ||
this.emit('voiceschanged'); | ||
case 9: | ||
_this3.voices = _context.sent; | ||
case 12: | ||
_this3.dispatchEvent(new _SpeechSynthesisEvent.default('voiceschanged')); | ||
case 11: | ||
case "end": | ||
return _context.stop(); | ||
} | ||
} | ||
}, _callee); | ||
}))); | ||
case 3: | ||
case "end": | ||
@@ -213,3 +215,3 @@ return _context2.stop(); | ||
} | ||
}, _callee2, this); | ||
}, _callee2); | ||
})); | ||
@@ -230,6 +232,8 @@ | ||
return SpeechSynthesis; | ||
}(_DOMEventEmitter2.default); | ||
}(_eventTargetShim.EventTarget); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechSynthesis.prototype, 'voiceschanged'); | ||
return { | ||
speechSynthesis: new SpeechSynthesis(), | ||
SpeechSynthesisEvent: _SpeechSynthesisEvent.default, | ||
SpeechSynthesisUtterance: _SpeechSynthesisUtterance.default | ||
@@ -236,0 +240,0 @@ }; |
@@ -18,2 +18,4 @@ "use strict"; | ||
var _isSSML = _interopRequireDefault(require("./isSSML")); | ||
var DEFAULT_LANGUAGE = 'en-US'; | ||
@@ -33,3 +35,3 @@ var DEFAULT_VOICE = 'Microsoft Server Speech Text to Speech Voice (en-US, JessaRUS)'; | ||
_regenerator.default.mark(function _callee(_ref) { | ||
var authorizationToken, deploymentId, _ref$lang, lang, outputFormat, pitch, rate, region, text, _ref$voice, voice, volume, ssml, url, res; | ||
var authorizationTokenPromise, deploymentId, _ref$lang, lang, outputFormat, pitch, rate, region, text, _ref$voice, voice, volume, authorizationToken, ssml, url, res; | ||
@@ -40,3 +42,3 @@ return _regenerator.default.wrap(function _callee$(_context) { | ||
case 0: | ||
authorizationToken = _ref.authorizationToken, deploymentId = _ref.deploymentId, _ref$lang = _ref.lang, lang = _ref$lang === void 0 ? DEFAULT_LANGUAGE : _ref$lang, outputFormat = _ref.outputFormat, pitch = _ref.pitch, rate = _ref.rate, region = _ref.region, text = _ref.text, _ref$voice = _ref.voice, voice = _ref$voice === void 0 ? DEFAULT_VOICE : _ref$voice, volume = _ref.volume; | ||
authorizationTokenPromise = _ref.authorizationTokenPromise, deploymentId = _ref.deploymentId, _ref$lang = _ref.lang, lang = _ref$lang === void 0 ? DEFAULT_LANGUAGE : _ref$lang, outputFormat = _ref.outputFormat, pitch = _ref.pitch, rate = _ref.rate, region = _ref.region, text = _ref.text, _ref$voice = _ref.voice, voice = _ref$voice === void 0 ? DEFAULT_VOICE : _ref$voice, volume = _ref.volume; | ||
@@ -51,3 +53,8 @@ if (text) { | ||
case 3: | ||
ssml = (0, _buildSSML.default)({ | ||
_context.next = 5; | ||
return authorizationTokenPromise; | ||
case 5: | ||
authorizationToken = _context.sent; | ||
ssml = (0, _isSSML.default)(text) ? text : (0, _buildSSML.default)({ | ||
lang: lang, | ||
@@ -62,3 +69,3 @@ pitch: pitch, | ||
url = deploymentId ? SYNTHESIS_CUSTOM_VOICE_URL_TEMPLATE.replace(/\{region\}/, encodeURI(region)).replace(/\{deploymentId\}/, encodeURI(deploymentId)) : SYNTHESIS_URL_TEMPLATE.replace(/\{region\}/, encodeURI(region)); | ||
_context.next = 7; | ||
_context.next = 10; | ||
return fetch(url, { | ||
@@ -74,7 +81,7 @@ headers: { | ||
case 7: | ||
case 10: | ||
res = _context.sent; | ||
if (!(res.status !== 200)) { | ||
_context.next = 10; | ||
if (res.ok) { | ||
_context.next = 13; | ||
break; | ||
@@ -85,6 +92,6 @@ } | ||
case 10: | ||
case 13: | ||
return _context.abrupt("return", res.arrayBuffer()); | ||
case 11: | ||
case 14: | ||
case "end": | ||
@@ -91,0 +98,0 @@ return _context.stop(); |
@@ -16,2 +16,3 @@ "use strict"; | ||
/* eslint no-magic-numbers: ["error", { "ignore": [0, 1, -1] }] */ | ||
function _default(_x) { | ||
@@ -18,0 +19,0 @@ return _ref2.apply(this, arguments); |
@@ -10,2 +10,6 @@ "use strict"; | ||
var _regenerator = _interopRequireDefault(require("@babel/runtime/regenerator")); | ||
var _asyncToGenerator2 = _interopRequireDefault(require("@babel/runtime/helpers/asyncToGenerator")); | ||
var _classCallCheck2 = _interopRequireDefault(require("@babel/runtime/helpers/classCallCheck")); | ||
@@ -21,14 +25,15 @@ | ||
var _regenerator = _interopRequireDefault(require("@babel/runtime/regenerator")); | ||
var _eventTargetShim = require("event-target-shim"); | ||
var _asyncToGenerator2 = _interopRequireDefault(require("@babel/runtime/helpers/asyncToGenerator")); | ||
var _eventAsPromise = _interopRequireDefault(require("event-as-promise")); | ||
var _DOMEventEmitter2 = _interopRequireDefault(require("../../Util/DOMEventEmitter")); | ||
var _fetchSpeechData = _interopRequireDefault(require("./fetchSpeechData")); | ||
var _SpeechSynthesisEvent = _interopRequireDefault(require("./SpeechSynthesisEvent")); | ||
var _SpeechSynthesisVoice = _interopRequireDefault(require("./SpeechSynthesisVoice")); | ||
var _subscribeEvent = _interopRequireDefault(require("./subscribeEvent")); | ||
/* eslint no-empty: ["error", { "allowEmptyCatch": true }] */ | ||
function asyncDecodeAudioData(audioContext, arrayBuffer) { | ||
@@ -43,55 +48,23 @@ return new Promise(function (resolve, reject) { | ||
function playDecoded(audioContext, audioBuffer, source) { | ||
return new Promise( | ||
/*#__PURE__*/ | ||
function () { | ||
var _ref = (0, _asyncToGenerator2.default)( | ||
/*#__PURE__*/ | ||
_regenerator.default.mark(function _callee(resolve, reject) { | ||
var audioContextClosed, sourceEnded, unsubscribe; | ||
return _regenerator.default.wrap(function _callee$(_context) { | ||
while (1) { | ||
switch (_context.prev = _context.next) { | ||
case 0: | ||
audioContextClosed = new _eventAsPromise.default(); | ||
sourceEnded = new _eventAsPromise.default(); | ||
unsubscribe = (0, _subscribeEvent.default)(audioContext, 'statechange', function (_ref2) { | ||
var state = _ref2.target.state; | ||
return state === 'closed' && audioContextClosed.eventListener(); | ||
}); | ||
_context.prev = 3; | ||
source.buffer = audioBuffer; // "ended" may not fire if the underlying AudioContext is closed prematurely | ||
return new Promise(function (resolve, reject) { | ||
var audioContextClosed = new _eventAsPromise.default(); | ||
var sourceEnded = new _eventAsPromise.default(); | ||
var unsubscribe = (0, _subscribeEvent.default)(audioContext, 'statechange', function (_ref) { | ||
var state = _ref.target.state; | ||
return state === 'closed' && audioContextClosed.eventListener(); | ||
}); | ||
source.onended = sourceEnded.eventListener; | ||
source.connect(audioContext.destination); | ||
source.start(0); | ||
_context.next = 10; | ||
return Promise.race([audioContextClosed.upcoming(), sourceEnded.upcoming()]); | ||
try { | ||
source.buffer = audioBuffer; // "ended" may not fire if the underlying AudioContext is closed prematurely | ||
case 10: | ||
resolve(); | ||
_context.next = 16; | ||
break; | ||
case 13: | ||
_context.prev = 13; | ||
_context.t0 = _context["catch"](3); | ||
reject(_context.t0); | ||
case 16: | ||
_context.prev = 16; | ||
unsubscribe(); | ||
return _context.finish(16); | ||
case 19: | ||
case "end": | ||
return _context.stop(); | ||
} | ||
} | ||
}, _callee, null, [[3, 13, 16, 19]]); | ||
})); | ||
return function (_x, _x2) { | ||
return _ref.apply(this, arguments); | ||
}; | ||
}()); | ||
source.onended = sourceEnded.eventListener; | ||
source.connect(audioContext.destination); | ||
source.start(0); | ||
Promise.race([audioContextClosed.upcoming(), sourceEnded.upcoming()]).then(resolve); | ||
} catch (err) { | ||
reject(err); | ||
} finally { | ||
unsubscribe(); | ||
} | ||
}); | ||
} | ||
@@ -101,4 +74,4 @@ | ||
/*#__PURE__*/ | ||
function (_DOMEventEmitter) { | ||
(0, _inherits2.default)(_default, _DOMEventEmitter); | ||
function (_EventTarget) { | ||
(0, _inherits2.default)(_default, _EventTarget); | ||
@@ -109,3 +82,3 @@ function _default(text) { | ||
(0, _classCallCheck2.default)(this, _default); | ||
_this = (0, _possibleConstructorReturn2.default)(this, (0, _getPrototypeOf2.default)(_default).call(this, ['boundary', 'end', 'error', 'mark', 'pause', 'resume', 'start'])); | ||
_this = (0, _possibleConstructorReturn2.default)(this, (0, _getPrototypeOf2.default)(_default).call(this)); | ||
_this._lang = null; | ||
@@ -132,31 +105,44 @@ _this._pitch = 1; | ||
/*#__PURE__*/ | ||
_regenerator.default.mark(function _callee2() { | ||
return _regenerator.default.wrap(function _callee2$(_context2) { | ||
_regenerator.default.mark(function _callee(_ref2) { | ||
var authorizationTokenPromise, deploymentId, outputFormat, region; | ||
return _regenerator.default.wrap(function _callee$(_context) { | ||
while (1) { | ||
switch (_context2.prev = _context2.next) { | ||
switch (_context.prev = _context.next) { | ||
case 0: | ||
authorizationTokenPromise = _ref2.authorizationTokenPromise, deploymentId = _ref2.deploymentId, outputFormat = _ref2.outputFormat, region = _ref2.region; | ||
this.arrayBufferPromise = (0, _fetchSpeechData.default)({ | ||
authorizationToken: this.authorizationToken, | ||
deploymentId: this.deploymentId, | ||
authorizationTokenPromise: authorizationTokenPromise, | ||
deploymentId: deploymentId, | ||
lang: this.lang || window.navigator.language, | ||
outputFormat: this.outputFormat, | ||
outputFormat: outputFormat, | ||
pitch: this.pitch, | ||
rate: this.rate, | ||
region: this.region, | ||
region: region, | ||
text: this.text, | ||
voice: this.voice && this.voice.voiceURI, | ||
volume: this.volume | ||
}); | ||
_context2.next = 3; | ||
}); // We need to call "await" to make sure the Promise is running. | ||
// We will ignore the rejected result; it is handled in play() later. | ||
_context.prev = 2; | ||
_context.next = 5; | ||
return this.arrayBufferPromise; | ||
case 3: | ||
case 5: | ||
_context.next = 9; | ||
break; | ||
case 7: | ||
_context.prev = 7; | ||
_context.t0 = _context["catch"](2); | ||
case 9: | ||
case "end": | ||
return _context2.stop(); | ||
return _context.stop(); | ||
} | ||
} | ||
}, _callee2, this); | ||
}, _callee, this, [[2, 7]]); | ||
})); | ||
function preload() { | ||
function preload(_x) { | ||
return _preload.apply(this, arguments); | ||
@@ -172,26 +158,27 @@ } | ||
/*#__PURE__*/ | ||
_regenerator.default.mark(function _callee3(audioContext) { | ||
_regenerator.default.mark(function _callee2(audioContext) { | ||
var source, audioBuffer; | ||
return _regenerator.default.wrap(function _callee3$(_context3) { | ||
return _regenerator.default.wrap(function _callee2$(_context2) { | ||
while (1) { | ||
switch (_context3.prev = _context3.next) { | ||
switch (_context2.prev = _context2.next) { | ||
case 0: | ||
_context3.prev = 0; | ||
// HACK: iOS requires bufferSourceNode to be constructed before decoding data | ||
_context2.prev = 0; | ||
// We should emit "start" event even if preload() failed. | ||
this.dispatchEvent(new _SpeechSynthesisEvent.default('start')); // HACK: iOS requires bufferSourceNode to be constructed before decoding data. | ||
source = audioContext.createBufferSource(); | ||
_context3.t0 = asyncDecodeAudioData; | ||
_context3.t1 = audioContext; | ||
_context3.next = 6; | ||
_context2.t0 = asyncDecodeAudioData; | ||
_context2.t1 = audioContext; | ||
_context2.next = 7; | ||
return this.arrayBufferPromise; | ||
case 6: | ||
_context3.t2 = _context3.sent; | ||
_context3.next = 9; | ||
return (0, _context3.t0)(_context3.t1, _context3.t2); | ||
case 7: | ||
_context2.t2 = _context2.sent; | ||
_context2.next = 10; | ||
return (0, _context2.t0)(_context2.t1, _context2.t2); | ||
case 9: | ||
audioBuffer = _context3.sent; | ||
this.emit('start'); | ||
case 10: | ||
audioBuffer = _context2.sent; | ||
this._playingSource = source; | ||
_context3.next = 14; | ||
_context2.next = 14; | ||
return playDecoded(audioContext, audioBuffer, source); | ||
@@ -201,23 +188,22 @@ | ||
this._playingSource = null; | ||
this.emit('end'); | ||
_context3.next = 21; | ||
this.dispatchEvent(new _SpeechSynthesisEvent.default('end')); | ||
_context2.next = 21; | ||
break; | ||
case 18: | ||
_context3.prev = 18; | ||
_context3.t3 = _context3["catch"](0); | ||
this.emit('error', { | ||
error: _context3.t3, | ||
type: 'error' | ||
}); | ||
_context2.prev = 18; | ||
_context2.t3 = _context2["catch"](0); | ||
this.dispatchEvent(new ErrorEvent('error', { | ||
error: _context2.t3 | ||
})); | ||
case 21: | ||
case "end": | ||
return _context3.stop(); | ||
return _context2.stop(); | ||
} | ||
} | ||
}, _callee3, this, [[0, 18]]); | ||
}, _callee2, this, [[0, 18]]); | ||
})); | ||
function play(_x3) { | ||
function play(_x2) { | ||
return _play.apply(this, arguments); | ||
@@ -275,5 +261,12 @@ } | ||
return _default; | ||
}(_DOMEventEmitter2.default); | ||
}(_eventTargetShim.EventTarget); | ||
exports.default = _default; | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechSynthesisUtterance.prototype, 'boundary'); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechSynthesisUtterance.prototype, 'end'); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechSynthesisUtterance.prototype, 'error'); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechSynthesisUtterance.prototype, 'mark'); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechSynthesisUtterance.prototype, 'pause'); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechSynthesisUtterance.prototype, 'resume'); | ||
(0, _eventTargetShim.defineEventAttribute)(SpeechSynthesisUtterance.prototype, 'start'); | ||
//# sourceMappingURL=SpeechSynthesisUtterance.js.map |
@@ -22,4 +22,6 @@ "use strict"; | ||
(0, _classCallCheck2.default)(this, _default); | ||
this._default = false; | ||
this._gender = gender; | ||
this._lang = lang; | ||
this._localService = false; | ||
this._name = voiceURI; | ||
@@ -32,3 +34,3 @@ this._voiceURI = voiceURI; | ||
get: function get() { | ||
return false; | ||
return this._default; | ||
} | ||
@@ -48,3 +50,3 @@ }, { | ||
get: function get() { | ||
return false; | ||
return this._localService; | ||
} | ||
@@ -51,0 +53,0 @@ }, { |
@@ -32,3 +32,8 @@ // The MIT License (MIT) | ||
module.exports = function () { | ||
Object.defineProperty(exports, "__esModule", { | ||
value: true | ||
}); | ||
exports.default = createDeferred; | ||
function createDeferred() { | ||
var ret = {}; | ||
@@ -40,3 +45,3 @@ ret.promise = new Promise(function (resolve, reject) { | ||
return ret; | ||
}; | ||
} | ||
//# sourceMappingURL=createDeferred.js.map |
@@ -30,5 +30,5 @@ "use strict"; | ||
return Promise.resolve(queue.shift()); | ||
} else { | ||
return (shiftDeferred || (shiftDeferred = (0, _createDeferred.default)())).promise; | ||
} | ||
return (shiftDeferred || (shiftDeferred = (0, _createDeferred.default)())).promise; | ||
}; | ||
@@ -35,0 +35,0 @@ |
{ | ||
"name": "web-speech-cognitive-services", | ||
"version": "4.0.1-master.4a38b83", | ||
"version": "4.0.1-master.54dc22a", | ||
"description": "Polyfill Web Speech API with Cognitive Services Speech-to-Text service", | ||
@@ -34,2 +34,4 @@ "keywords": [ | ||
"clean": "rimraf lib", | ||
"eslint": "eslint src/**/*.js", | ||
"prepublishOnly": "npm run eslint && npm run build", | ||
"start": "npm run build -- --verbose --watch", | ||
@@ -73,8 +75,11 @@ "test": "jest" | ||
"base64-arraybuffer": "^0.2.0", | ||
"eslint": "^6.1.0", | ||
"event-as-promise": "^1.0.5", | ||
"event-target-shim": "^5.0.1", | ||
"events": "^3.0.0", | ||
"memoize-one": "^5.0.5", | ||
"microsoft-cognitiveservices-speech-sdk": "1.6.0", | ||
"on-error-resume-next": "^1.1.0", | ||
"simple-update-in": "^2.1.0" | ||
} | ||
} |
README.md
@@ -9,16 +9,18 @@ # web-speech-cognitive-services | ||
# Demo | ||
# Description | ||
Try out our demo at https://compulim.github.io/web-speech-cognitive-services?s=your-subscription-key. | ||
Speech technologies enable a lot of interesting scenarios, including intelligent personal assistants, and provide alternative inputs for assistive technologies. | ||
We use [`react-dictate-button`](https://github.com/compulim/react-dictate-button/) and [`react-say`](https://github.com/compulim/react-say/) to quickly setup the playground. | ||
Although the W3C has standardized speech technologies in the browser, speech-to-text and text-to-speech support is still scarce. However, cloud-based speech technologies are very mature. | ||
# Background | ||
This polyfill provides W3C [Speech Recognition](https://developer.mozilla.org/en-US/docs/Web/API/SpeechRecognition) and [Speech Synthesis](https://developer.mozilla.org/en-US/docs/Web/API/SpeechSynthesis) API in browser by using [Azure Cognitive Services Speech Services](https://azure.microsoft.com/en-us/services/cognitive-services/speech-services/). This will bring speech technologies to all modern first-party browsers available on both PC and mobile platforms. | ||
The Web Speech API is not widely adopted across popular browsers and platforms. Polyfilling the API using cloud services is a great way to enable wider adoption. Notably, the Web Speech API in Google Chrome is also backed by cloud services. | ||
# Demo | ||
The Microsoft Azure [Cognitive Services Speech Services](https://azure.microsoft.com/en-us/services/cognitive-services/speech-services/) service provides speech recognition with great accuracy. Unfortunately, its APIs are not based on the Web Speech API. | ||
> Before getting started, please obtain a Cognitive Services subscription key from your Azure subscription. | ||
This package polyfills the Web Speech API by turning the Cognitive Services Speech Services API into the Web Speech API. We test this package with popular combinations of platforms and browsers. | ||
Try out our demo at https://compulim.github.io/web-speech-cognitive-services. If you don't have a subscription key, you can still try it out in a browser with built-in speech support. | ||
We use [`react-dictate-button`](https://github.com/compulim/react-dictate-button/) and [`react-say`](https://github.com/compulim/react-say/) to quickly setup the playground. | ||
## Browser requirements | ||
@@ -28,4 +30,6 @@ | ||
Speech synthesis requires Web Audio API. For Safari, user gesture (click or tap) is required to play audio clips using Web Audio API. To ready the Web Audio API to use without user gesture, you can synthesize an empty string. | ||
### Special requirement for Safari | ||
Speech synthesis requires the Web Audio API. On Safari, a user gesture (click or tap) is required to play audio clips using the Web Audio API. To get the Web Audio API ready for use without a user gesture, you can synthesize an empty string, which will not trigger any network call but will play a short, hardcoded, empty audio clip. If you already have a "primed" `AudioContext` object, you can also pass it as an option. | ||
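For example, you could prime the Web Audio API inside a user gesture handler with a sketch like the following (the button selector is hypothetical, and `speechSynthesis` is assumed to be the ponyfill created as shown in the next section):

```jsx
// Prime the Web Audio API inside a user gesture by synthesizing an empty string.
// This does not trigger a network call; it only plays a short, silent clip.
document.querySelector('#prime-button').addEventListener('click', () => {
  speechSynthesis.speak(new SpeechSynthesisUtterance(''));
});
```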
# How to use | ||
@@ -53,3 +57,3 @@ | ||
const { speechSynthesis, SpeechSynthesisUtterance } = window.WebSpeechCognitiveServices.create({ | ||
region: 'westus2', | ||
region: 'westus', | ||
subscriptionKey: 'YOUR_SUBSCRIPTION_KEY' | ||
@@ -73,2 +77,4 @@ }); | ||
The `voiceschanged` event comes shortly after you create the ponyfill. You will need to wait until the event arrives before you can choose a voice for your utterance. | ||
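A minimal sketch that waits for the event before speaking could look like this (it assumes the standard `getVoices()` method is available on the ponyfill, and the voice name is only an example):

```jsx
speechSynthesis.addEventListener('voiceschanged', () => {
  const voices = speechSynthesis.getVoices();
  const utterance = new SpeechSynthesisUtterance('Hello, World!');

  // Pick any voice returned by the ponyfill; "JessaRUS" is only an example.
  utterance.voice = voices.find(({ name }) => /JessaRUS/u.test(name));
  speechSynthesis.speak(utterance);
});
```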
## Install from NPM | ||
@@ -88,2 +94,112 @@ | ||
## Options | ||
The following table lists all options supported by the adapter. | ||
<table> | ||
<thead> | ||
<tr> | ||
<th>Name and type</th> | ||
<th>Default value</th> | ||
<th>Description</th> | ||
</tr> | ||
</thead> | ||
<tbody> | ||
<tr> | ||
<td><code>audioConfig: <a href="https://docs.microsoft.com/en-us/javascript/api/microsoft-cognitiveservices-speech-sdk/audioconfig?view=azure-node-latest">AudioConfig</a></code></td> | ||
<td><code><a href="https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/how-to-select-audio-input-devices#audio-device-ids-in-javascript">fromDefaultMicrophoneInput()</a></code></td> | ||
<td> | ||
<a href="https://docs.microsoft.com/en-us/javascript/api/microsoft-cognitiveservices-speech-sdk/audioconfig?view=azure-node-latest"><code>AudioConfig</code></a> object to use with speech recognition. Please refer to <a href="https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/how-to-select-audio-input-devices#audio-device-ids-in-javascript">this article</a> for details on selecting different audio devices. | ||
</td> | ||
</tr> | ||
<tr> | ||
<td> | ||
<code>audioContext: <a href="https://developer.mozilla.org/en-US/docs/Web/API/AudioContext">AudioContext</a></code> | ||
</td> | ||
<td><code>undefined</code></td> | ||
<td> | ||
The audio context that speech is synthesized on. If this is <code>undefined</code>, the <code>AudioContext</code> object will be created on first synthesis. | ||
</td> | ||
</tr> | ||
<tr> | ||
<td> | ||
<code>authorizationToken: (</code><br /> | ||
<code> string ||</code><br /> | ||
<code> Promise<string> ||</code><br /> | ||
<code> () => string ||</code><br /> | ||
<code> () => Promise<string></code><br /> | ||
<code>)</code> | ||
</td> | ||
<td>(Requires either<br /><code>authorizationToken</code> or<br /><code>subscriptionKey</code>)</td> | ||
<td> | ||
Authorization token from Cognitive Services. Please refer to <a href="https://docs.microsoft.com/en-us/azure/cognitive-services/authentication">this article</a> to obtain an authorization token. | ||
</td> | ||
</tr> | ||
<tr> | ||
<td><code>enableTelemetry</code></td> | ||
<td><code>undefined</code></td> | ||
<td>Pass-through option to enable or disable telemetry for Speech SDK recognizer as <a href="https://github.com/Microsoft/cognitive-services-speech-sdk-js#data--telemetry">outlined in Speech SDK</a>. This adapter does not collect any telemetry.<br /><br />By default, Speech SDK will collect telemetry unless this is set to <code>false</code>.</td> | ||
</tr> | ||
<tr> | ||
<td><code>ponyfill.AudioContext: <a href="https://developer.mozilla.org/en-US/docs/Web/API/AudioContext">AudioContext</a></code></td> | ||
<td><code>window.AudioContext ||</code><br /><code>window.webkitAudioContext</code></td> | ||
<td> | ||
Ponyfill for Web Audio API.<br /><br /> | ||
Currently, only Web Audio API can be ponyfilled. We may expand to WebRTC for audio recording in the future. | ||
</td> | ||
</tr> | ||
<tr> | ||
<td><code>referenceGrammars: string[]</code></td> | ||
<td><code>undefined</code></td> | ||
<td>Reference grammar IDs to send for speech recognition.</td> | ||
</tr> | ||
<tr> | ||
<td><code>region: string</code></td> | ||
<td><code>"westus"</code></td> | ||
<td> | ||
Azure region of Cognitive Services to use. | ||
</td> | ||
</tr> | ||
<tr> | ||
<td><code>speechRecognitionEndpointId: string</code></td> | ||
<td><code>undefined</code></td> | ||
<td> | ||
Endpoint ID for <a href="https://azure.microsoft.com/en-us/services/cognitive-services/custom-speech-service/">Custom Speech service</a>. | ||
</td> | ||
</tr> | ||
<tr> | ||
<td><code>speechSynthesisDeploymentId: string</code></td> | ||
<td><code>undefined</code></td> | ||
<td> | ||
Deployment ID for <a href="https://speech.microsoft.com/customvoice">Custom Voice service</a>.<br /><br /> | ||
When you are using Custom Voice, you will need to specify your voice model name through <a href="https://developer.mozilla.org/en-US/docs/Web/API/SpeechSynthesisVoice"><code>SpeechSynthesisVoice.voiceURI</code></a>. Please refer to the <a href="#custom-voice-support">"Custom Voice support"</a> section for details. | ||
</td> | ||
</tr> | ||
<tr> | ||
<td><code>speechSynthesisOutputFormat: string</code></td> | ||
<td><code>audio-24khz-160kbitrate-mono-mp3</code></td> | ||
<td>Audio format for speech synthesis. Please refer to <a href="https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs">this article</a> for list of supported formats.</td> | ||
</tr> | ||
<tr> | ||
<td><code>subscriptionKey: string</code></td> | ||
<td>(Requires either<br /><code>authorizationToken</code> or<br /><code>subscriptionKey</code>)</td> | ||
<td> | ||
Subscription key to use. This is not recommended for production use as the subscription key will be leaked in the browser. | ||
</td> | ||
</tr> | ||
<tr> | ||
<td><code>textNormalization: string</code></td> | ||
<td><code>"display"</code></td> | ||
<td> | ||
Supported text normalization options:<br /><br /> | ||
<ul> | ||
<li><code>"display"</code></li> | ||
<li><code>"itn"</code> (inverse text normalization)</li> | ||
<li><code>"lexical"</code></li> | ||
<li><code>"maskeditn"</code> (masked ITN)</li> | ||
</ul> | ||
</td> | ||
</tr> | ||
</tbody> | ||
</table> | ||
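As a quick illustration, here is a sketch that passes a few of the options above when creating the ponyfill (all values are placeholders):

```jsx
import createPonyfill from 'web-speech-cognitive-services/lib/SpeechServices';

const ponyfill = await createPonyfill({
  enableTelemetry: false,
  region: 'westus',
  speechSynthesisOutputFormat: 'audio-24khz-160kbitrate-mono-mp3',
  subscriptionKey: 'YOUR_SUBSCRIPTION_KEY',
  textNormalization: 'itn'
});

const { SpeechRecognition, speechSynthesis, SpeechSynthesisUtterance } = ponyfill;
```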
# Code snippets | ||
@@ -145,35 +261,2 @@ | ||
### Speech priming (a.k.a. grammars) | ||
> This section is currently not implemented with new Speech SDK. We are leaving the section here for future reference. | ||
You can prime the speech recognition by giving a list of words. | ||
Since Cognitive Services does not works with weighted grammars, we built another `SpeechGrammarList` to better fit the scenario. | ||
```jsx | ||
import createPonyfill from 'web-speech-cognitive-services/lib/SpeechServices'; | ||
const { | ||
SpeechGrammarList, | ||
SpeechRecognition | ||
} = await createPonyfill({ | ||
region: 'westus', | ||
subscriptionKey: 'YOUR_SUBSCRIPTION_KEY' | ||
}); | ||
const recognition = new SpeechRecognition(); | ||
recognition.grammars = new SpeechGrammarList(); | ||
recognition.grammars.words = ['Tuen Mun', 'Yuen Long']; | ||
recognition.onresult = ({ results }) => { | ||
console.log(results); | ||
}; | ||
recognition.start(); | ||
``` | ||
> Note: you can also pass `grammars` to `react-dictate-button` via `extra` props. | ||
## Speech synthesis (text-to-speech) | ||
@@ -263,3 +346,3 @@ | ||
You can also provide an async function that will fetch the authorization token on-demand. You should cache the authorization token for subsequent request. | ||
You can also provide an async function that will fetch the authorization token on-demand. You should cache the authorization token for subsequent requests. For simplicity, this code snippet does not cache the result; a caching sketch follows the snippet below. | ||
@@ -270,3 +353,3 @@ ```jsx | ||
const ponyfill = await createPonyfill({ | ||
authorizationToken: fetch('https://example.com/your-token').then(res => res.text()), | ||
authorizationToken: () => fetch('https://example.com/your-token').then(res => res.text()), | ||
region: 'westus', | ||
@@ -286,2 +369,31 @@ }); | ||
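If you do want caching, a minimal sketch could look like this (the five-minute lifetime is an assumption; keep it comfortably below your token's actual validity, which is typically about 10 minutes):

```jsx
const TOKEN_LIFETIME = 300000; // Assumed 5 minutes.

let cachedToken;
let lastFetchedAt = 0;

const ponyfill = await createPonyfill({
  authorizationToken: async () => {
    // Re-fetch the token only when the cached one is missing or too old.
    if (!cachedToken || Date.now() - lastFetchedAt > TOKEN_LIFETIME) {
      cachedToken = await fetch('https://example.com/your-token').then(res => res.text());
      lastFetchedAt = Date.now();
    }

    return cachedToken;
  },
  region: 'westus'
});
```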
## Biasing towards some words for recognition | ||
In some cases, you may want the speech recognition engine to be biased towards "Bellevue", because it is not trivial for the engine to distinguish between "Bellevue", "Bellview", and "Bellvue" (without "e"). By giving a list of words, the speech recognition engine will be more biased towards your choice of words. | ||
Since Cognitive Services does not work with weighted grammars, we built another `SpeechGrammarList` to better fit the scenario. | ||
```jsx | ||
import createPonyfill from 'web-speech-cognitive-services/lib/SpeechServices'; | ||
const { | ||
SpeechGrammarList, | ||
SpeechRecognition | ||
} = await createPonyfill({ | ||
region: 'westus', | ||
subscriptionKey: 'YOUR_SUBSCRIPTION_KEY' | ||
}); | ||
const recognition = new SpeechRecognition(); | ||
recognition.grammars = new SpeechGrammarList(); | ||
recognition.grammars.phrases = ['Tuen Mun', 'Yuen Long']; | ||
recognition.onresult = ({ results }) => { | ||
console.log(results); | ||
}; | ||
recognition.start(); | ||
``` | ||
## Custom Speech support | ||
@@ -338,14 +450,6 @@ | ||
* Although Google Chrome supports grammar lists, it seems the grammar list is not used at all | ||
* Continuous mode does not work | ||
* Speech synthesis | ||
* `onboundary`, `onmark`, `onpause`, and `onresume` are not supported/fired | ||
* `pause` will pause immediately and do not pause on word breaks | ||
* `pause` will pause immediately and does not pause on word breaks, due to the lack of boundary events | ||
## Quirks | ||
* Speech recognition | ||
* Dictation mode | ||
* If `stop()` is called before first `recognized` event, there will be no final result | ||
* Cognitive Services stop recognition immediately after `stop()` is called | ||
# Roadmap | ||
@@ -352,0 +456,0 @@ |
+ Added eslint@^6.1.0
+ Added event-target-shim@^5.0.1
+ Added on-error-resume-next@^1.1.0