@ricky0123/vad-web - npm Package Compare versions

Comparing version 0.0.14 to 0.0.15


dist/_common/frame-processor.d.ts

@@ -31,2 +31,6 @@ import { SpeechProbabilities } from "./models";

 minSpeechFrames: number;
+/**
+ * If true, when the user pauses the VAD, it may trigger `onSpeechEnd`.
+ */
+submitUserSpeechOnPause: boolean;
 }
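
The new `submitUserSpeechOnPause` option lets a pause flush any speech that is still buffered instead of silently discarding it. A minimal usage sketch in TypeScript, assuming the package's `MicVAD` entry point; the option and callback names come from this diff, while the surrounding wiring is illustrative:

import { MicVAD } from "@ricky0123/vad-web"

const vad = await MicVAD.new({
  // New in 0.0.15: pausing may submit the in-flight speech segment.
  submitUserSpeechOnPause: true,
  onSpeechEnd: (audio) => {
    // audio is a Float32Array of 16 kHz samples for the finished segment.
    console.log("segment samples:", audio.length)
  },
})

vad.start()
// With the option enabled, pause() may fire onSpeechEnd (or onVADMisfire
// for segments shorter than minSpeechFrames) instead of dropping audio.
vad.pause()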

@@ -60,3 +64,12 @@ export declare const defaultFrameProcessorOptions: FrameProcessorOptions;

 reset: () => void;
-pause: () => void;
+pause: () => {
+    msg: Message;
+    audio: Float32Array;
+} | {
+    msg: Message;
+    audio?: undefined;
+} | {
+    msg?: undefined;
+    audio?: undefined;
+};
 resume: () => void;

@@ -63,0 +76,0 @@ endSegment: () => {

@@ -18,2 +18,3 @@ "use strict";

 minSpeechFrames: 3,
+submitUserSpeechOnPause: false,
 };

@@ -68,3 +69,9 @@ function validateOptions(options) {

 this.active = false;
-this.reset();
+if (this.options.submitUserSpeechOnPause) {
+    return this.endSegment();
+}
+else {
+    this.reset();
+    return {};
+}
 };
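
`pause()` now returns the same event object as `process()` and `endSegment()` rather than `void`, so a caller can forward one last `SpeechEnd` or `VADMisfire`. A hedged sketch of consuming that return value; `Message` and the event fields are from this diff, and `handleEndedSegment` is a hypothetical consumer:

// Assumes frameProcessor was built with submitUserSpeechOnPause: true.
const ev = frameProcessor.pause()
if (ev.msg === Message.SpeechEnd && ev.audio !== undefined) {
  handleEndedSegment(ev.audio) // hypothetical: persist or transcribe the audio
} else if (ev.msg === Message.VADMisfire) {
  // The buffered segment had fewer than minSpeechFrames speech frames.
}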

@@ -71,0 +78,0 @@ this.resume = () => {

@@ -33,2 +33,3 @@ "use strict";

 minSpeechFrames: this.options.minSpeechFrames,
+submitUserSpeechOnPause: this.options.submitUserSpeechOnPause,
 });
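
The non-real-time constructor forwards the new option to its frame processor as well, so `NonRealTimeVAD` accepts it too (defaulting to `false`). A sketch of that path, using the `NonRealTimeVAD.new` and `run` APIs visible in the bundle diff below; `samples` and `sampleRate` are caller-supplied:

import { NonRealTimeVAD } from "@ricky0123/vad-web"

const vad = await NonRealTimeVAD.new({ submitUserSpeechOnPause: false })
// run() yields one segment per detected utterance; start and end are
// millisecond offsets into the resampled 16 kHz stream.
for await (const { audio, start, end } of vad.run(samples, sampleRate)) {
  console.log(`speech from ${start} ms to ${end} ms`, audio.length)
}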

@@ -35,0 +36,0 @@ this.frameProcessor.resume();


dist/bundle.dev.js

@@ -29,3 +29,3 @@ /*

eval("\n/*\nSome of this code, together with the default options found in index.ts,\nwere taken (or took inspiration) from https://github.com/snakers4/silero-vad\n*/\nObject.defineProperty(exports, \"__esModule\", ({ value: true }));\nexports.FrameProcessor = exports.validateOptions = exports.defaultFrameProcessorOptions = void 0;\nconst messages_1 = __webpack_require__(/*! ./messages */ \"./dist/_common/messages.js\");\nconst logging_1 = __webpack_require__(/*! ./logging */ \"./dist/_common/logging.js\");\nconst RECOMMENDED_FRAME_SAMPLES = [512, 1024, 1536];\nexports.defaultFrameProcessorOptions = {\n positiveSpeechThreshold: 0.5,\n negativeSpeechThreshold: 0.5 - 0.15,\n preSpeechPadFrames: 1,\n redemptionFrames: 8,\n frameSamples: 1536,\n minSpeechFrames: 3,\n};\nfunction validateOptions(options) {\n if (!RECOMMENDED_FRAME_SAMPLES.includes(options.frameSamples)) {\n logging_1.log.warn(\"You are using an unusual frame size\");\n }\n if (options.positiveSpeechThreshold < 0 ||\n options.negativeSpeechThreshold > 1) {\n logging_1.log.error(\"postiveSpeechThreshold should be a number between 0 and 1\");\n }\n if (options.negativeSpeechThreshold < 0 ||\n options.negativeSpeechThreshold > options.positiveSpeechThreshold) {\n logging_1.log.error(\"negativeSpeechThreshold should be between 0 and postiveSpeechThreshold\");\n }\n if (options.preSpeechPadFrames < 0) {\n logging_1.log.error(\"preSpeechPadFrames should be positive\");\n }\n if (options.redemptionFrames < 0) {\n logging_1.log.error(\"preSpeechPadFrames should be positive\");\n }\n}\nexports.validateOptions = validateOptions;\nconst concatArrays = (arrays) => {\n const sizes = arrays.reduce((out, next) => {\n out.push(out.at(-1) + next.length);\n return out;\n }, [0]);\n const outArray = new Float32Array(sizes.at(-1));\n arrays.forEach((arr, index) => {\n const place = sizes[index];\n outArray.set(arr, place);\n });\n return outArray;\n};\nclass FrameProcessor {\n constructor(modelProcessFunc, modelResetFunc, options) {\n this.modelProcessFunc = modelProcessFunc;\n this.modelResetFunc = modelResetFunc;\n this.options = options;\n this.speaking = false;\n this.redemptionCounter = 0;\n this.active = false;\n this.reset = () => {\n this.speaking = false;\n this.audioBuffer = [];\n this.modelResetFunc();\n this.redemptionCounter = 0;\n };\n this.pause = () => {\n this.active = false;\n this.reset();\n };\n this.resume = () => {\n this.active = true;\n };\n this.endSegment = () => {\n const audioBuffer = this.audioBuffer;\n this.audioBuffer = [];\n const speaking = this.speaking;\n this.reset();\n const speechFrameCount = audioBuffer.reduce((acc, item) => {\n return acc + +item.isSpeech;\n }, 0);\n if (speaking) {\n if (speechFrameCount >= this.options.minSpeechFrames) {\n const audio = concatArrays(audioBuffer.map((item) => item.frame));\n return { msg: messages_1.Message.SpeechEnd, audio };\n }\n else {\n return { msg: messages_1.Message.VADMisfire };\n }\n }\n return {};\n };\n this.process = async (frame) => {\n if (!this.active) {\n return {};\n }\n const probs = await this.modelProcessFunc(frame);\n this.audioBuffer.push({\n frame,\n isSpeech: probs.isSpeech >= this.options.positiveSpeechThreshold,\n });\n if (probs.isSpeech >= this.options.positiveSpeechThreshold &&\n this.redemptionCounter) {\n this.redemptionCounter = 0;\n }\n if (probs.isSpeech >= this.options.positiveSpeechThreshold &&\n !this.speaking) {\n this.speaking = true;\n return { probs, msg: messages_1.Message.SpeechStart };\n }\n if (probs.isSpeech < 
this.options.negativeSpeechThreshold &&\n this.speaking &&\n ++this.redemptionCounter >= this.options.redemptionFrames) {\n this.redemptionCounter = 0;\n this.speaking = false;\n const audioBuffer = this.audioBuffer;\n this.audioBuffer = [];\n const speechFrameCount = audioBuffer.reduce((acc, item) => {\n return acc + +item.isSpeech;\n }, 0);\n if (speechFrameCount >= this.options.minSpeechFrames) {\n const audio = concatArrays(audioBuffer.map((item) => item.frame));\n return { probs, msg: messages_1.Message.SpeechEnd, audio };\n }\n else {\n return { probs, msg: messages_1.Message.VADMisfire };\n }\n }\n if (!this.speaking) {\n while (this.audioBuffer.length > this.options.preSpeechPadFrames) {\n this.audioBuffer.shift();\n }\n }\n return { probs };\n };\n this.audioBuffer = [];\n this.reset();\n }\n}\nexports.FrameProcessor = FrameProcessor;\n//# sourceMappingURL=frame-processor.js.map\n\n//# sourceURL=webpack://vad/./dist/_common/frame-processor.js?");
eval("\n/*\nSome of this code, together with the default options found in index.ts,\nwere taken (or took inspiration) from https://github.com/snakers4/silero-vad\n*/\nObject.defineProperty(exports, \"__esModule\", ({ value: true }));\nexports.FrameProcessor = exports.validateOptions = exports.defaultFrameProcessorOptions = void 0;\nconst messages_1 = __webpack_require__(/*! ./messages */ \"./dist/_common/messages.js\");\nconst logging_1 = __webpack_require__(/*! ./logging */ \"./dist/_common/logging.js\");\nconst RECOMMENDED_FRAME_SAMPLES = [512, 1024, 1536];\nexports.defaultFrameProcessorOptions = {\n positiveSpeechThreshold: 0.5,\n negativeSpeechThreshold: 0.5 - 0.15,\n preSpeechPadFrames: 1,\n redemptionFrames: 8,\n frameSamples: 1536,\n minSpeechFrames: 3,\n submitUserSpeechOnPause: false,\n};\nfunction validateOptions(options) {\n if (!RECOMMENDED_FRAME_SAMPLES.includes(options.frameSamples)) {\n logging_1.log.warn(\"You are using an unusual frame size\");\n }\n if (options.positiveSpeechThreshold < 0 ||\n options.negativeSpeechThreshold > 1) {\n logging_1.log.error(\"postiveSpeechThreshold should be a number between 0 and 1\");\n }\n if (options.negativeSpeechThreshold < 0 ||\n options.negativeSpeechThreshold > options.positiveSpeechThreshold) {\n logging_1.log.error(\"negativeSpeechThreshold should be between 0 and postiveSpeechThreshold\");\n }\n if (options.preSpeechPadFrames < 0) {\n logging_1.log.error(\"preSpeechPadFrames should be positive\");\n }\n if (options.redemptionFrames < 0) {\n logging_1.log.error(\"preSpeechPadFrames should be positive\");\n }\n}\nexports.validateOptions = validateOptions;\nconst concatArrays = (arrays) => {\n const sizes = arrays.reduce((out, next) => {\n out.push(out.at(-1) + next.length);\n return out;\n }, [0]);\n const outArray = new Float32Array(sizes.at(-1));\n arrays.forEach((arr, index) => {\n const place = sizes[index];\n outArray.set(arr, place);\n });\n return outArray;\n};\nclass FrameProcessor {\n constructor(modelProcessFunc, modelResetFunc, options) {\n this.modelProcessFunc = modelProcessFunc;\n this.modelResetFunc = modelResetFunc;\n this.options = options;\n this.speaking = false;\n this.redemptionCounter = 0;\n this.active = false;\n this.reset = () => {\n this.speaking = false;\n this.audioBuffer = [];\n this.modelResetFunc();\n this.redemptionCounter = 0;\n };\n this.pause = () => {\n this.active = false;\n if (this.options.submitUserSpeechOnPause) {\n return this.endSegment();\n }\n else {\n this.reset();\n return {};\n }\n };\n this.resume = () => {\n this.active = true;\n };\n this.endSegment = () => {\n const audioBuffer = this.audioBuffer;\n this.audioBuffer = [];\n const speaking = this.speaking;\n this.reset();\n const speechFrameCount = audioBuffer.reduce((acc, item) => {\n return acc + +item.isSpeech;\n }, 0);\n if (speaking) {\n if (speechFrameCount >= this.options.minSpeechFrames) {\n const audio = concatArrays(audioBuffer.map((item) => item.frame));\n return { msg: messages_1.Message.SpeechEnd, audio };\n }\n else {\n return { msg: messages_1.Message.VADMisfire };\n }\n }\n return {};\n };\n this.process = async (frame) => {\n if (!this.active) {\n return {};\n }\n const probs = await this.modelProcessFunc(frame);\n this.audioBuffer.push({\n frame,\n isSpeech: probs.isSpeech >= this.options.positiveSpeechThreshold,\n });\n if (probs.isSpeech >= this.options.positiveSpeechThreshold &&\n this.redemptionCounter) {\n this.redemptionCounter = 0;\n }\n if (probs.isSpeech >= this.options.positiveSpeechThreshold &&\n 
!this.speaking) {\n this.speaking = true;\n return { probs, msg: messages_1.Message.SpeechStart };\n }\n if (probs.isSpeech < this.options.negativeSpeechThreshold &&\n this.speaking &&\n ++this.redemptionCounter >= this.options.redemptionFrames) {\n this.redemptionCounter = 0;\n this.speaking = false;\n const audioBuffer = this.audioBuffer;\n this.audioBuffer = [];\n const speechFrameCount = audioBuffer.reduce((acc, item) => {\n return acc + +item.isSpeech;\n }, 0);\n if (speechFrameCount >= this.options.minSpeechFrames) {\n const audio = concatArrays(audioBuffer.map((item) => item.frame));\n return { probs, msg: messages_1.Message.SpeechEnd, audio };\n }\n else {\n return { probs, msg: messages_1.Message.VADMisfire };\n }\n }\n if (!this.speaking) {\n while (this.audioBuffer.length > this.options.preSpeechPadFrames) {\n this.audioBuffer.shift();\n }\n }\n return { probs };\n };\n this.audioBuffer = [];\n this.reset();\n }\n}\nexports.FrameProcessor = FrameProcessor;\n//# sourceMappingURL=frame-processor.js.map\n\n//# sourceURL=webpack://vad/./dist/_common/frame-processor.js?");

@@ -80,3 +80,3 @@ /***/ }),

eval("\nObject.defineProperty(exports, \"__esModule\", ({ value: true }));\nexports.PlatformAgnosticNonRealTimeVAD = exports.defaultNonRealTimeVADOptions = void 0;\nconst frame_processor_1 = __webpack_require__(/*! ./frame-processor */ \"./dist/_common/frame-processor.js\");\nconst messages_1 = __webpack_require__(/*! ./messages */ \"./dist/_common/messages.js\");\nconst models_1 = __webpack_require__(/*! ./models */ \"./dist/_common/models.js\");\nconst resampler_1 = __webpack_require__(/*! ./resampler */ \"./dist/_common/resampler.js\");\nexports.defaultNonRealTimeVADOptions = {\n ...frame_processor_1.defaultFrameProcessorOptions,\n};\nclass PlatformAgnosticNonRealTimeVAD {\n static async _new(modelFetcher, ort, options = {}) {\n const vad = new this(modelFetcher, ort, {\n ...exports.defaultNonRealTimeVADOptions,\n ...options,\n });\n await vad.init();\n return vad;\n }\n constructor(modelFetcher, ort, options) {\n this.modelFetcher = modelFetcher;\n this.ort = ort;\n this.options = options;\n this.init = async () => {\n const model = await models_1.Silero.new(this.ort, this.modelFetcher);\n this.frameProcessor = new frame_processor_1.FrameProcessor(model.process, model.reset_state, {\n frameSamples: this.options.frameSamples,\n positiveSpeechThreshold: this.options.positiveSpeechThreshold,\n negativeSpeechThreshold: this.options.negativeSpeechThreshold,\n redemptionFrames: this.options.redemptionFrames,\n preSpeechPadFrames: this.options.preSpeechPadFrames,\n minSpeechFrames: this.options.minSpeechFrames,\n });\n this.frameProcessor.resume();\n };\n this.run = async function* (inputAudio, sampleRate) {\n const resamplerOptions = {\n nativeSampleRate: sampleRate,\n targetSampleRate: 16000,\n targetFrameSize: this.options.frameSamples,\n };\n const resampler = new resampler_1.Resampler(resamplerOptions);\n const frames = resampler.process(inputAudio);\n let start, end;\n for (const i of [...Array(frames.length)].keys()) {\n const f = frames[i];\n const { msg, audio } = await this.frameProcessor.process(f);\n switch (msg) {\n case messages_1.Message.SpeechStart:\n start = (i * this.options.frameSamples) / 16;\n break;\n case messages_1.Message.SpeechEnd:\n end = ((i + 1) * this.options.frameSamples) / 16;\n // @ts-ignore\n yield { audio, start, end };\n break;\n default:\n break;\n }\n }\n const { msg, audio } = this.frameProcessor.endSegment();\n if (msg == messages_1.Message.SpeechEnd) {\n yield {\n audio,\n // @ts-ignore\n start,\n end: (frames.length * this.options.frameSamples) / 16,\n };\n }\n };\n (0, frame_processor_1.validateOptions)(options);\n }\n}\nexports.PlatformAgnosticNonRealTimeVAD = PlatformAgnosticNonRealTimeVAD;\n//# sourceMappingURL=non-real-time-vad.js.map\n\n//# sourceURL=webpack://vad/./dist/_common/non-real-time-vad.js?");
eval("\nObject.defineProperty(exports, \"__esModule\", ({ value: true }));\nexports.PlatformAgnosticNonRealTimeVAD = exports.defaultNonRealTimeVADOptions = void 0;\nconst frame_processor_1 = __webpack_require__(/*! ./frame-processor */ \"./dist/_common/frame-processor.js\");\nconst messages_1 = __webpack_require__(/*! ./messages */ \"./dist/_common/messages.js\");\nconst models_1 = __webpack_require__(/*! ./models */ \"./dist/_common/models.js\");\nconst resampler_1 = __webpack_require__(/*! ./resampler */ \"./dist/_common/resampler.js\");\nexports.defaultNonRealTimeVADOptions = {\n ...frame_processor_1.defaultFrameProcessorOptions,\n};\nclass PlatformAgnosticNonRealTimeVAD {\n static async _new(modelFetcher, ort, options = {}) {\n const vad = new this(modelFetcher, ort, {\n ...exports.defaultNonRealTimeVADOptions,\n ...options,\n });\n await vad.init();\n return vad;\n }\n constructor(modelFetcher, ort, options) {\n this.modelFetcher = modelFetcher;\n this.ort = ort;\n this.options = options;\n this.init = async () => {\n const model = await models_1.Silero.new(this.ort, this.modelFetcher);\n this.frameProcessor = new frame_processor_1.FrameProcessor(model.process, model.reset_state, {\n frameSamples: this.options.frameSamples,\n positiveSpeechThreshold: this.options.positiveSpeechThreshold,\n negativeSpeechThreshold: this.options.negativeSpeechThreshold,\n redemptionFrames: this.options.redemptionFrames,\n preSpeechPadFrames: this.options.preSpeechPadFrames,\n minSpeechFrames: this.options.minSpeechFrames,\n submitUserSpeechOnPause: this.options.submitUserSpeechOnPause,\n });\n this.frameProcessor.resume();\n };\n this.run = async function* (inputAudio, sampleRate) {\n const resamplerOptions = {\n nativeSampleRate: sampleRate,\n targetSampleRate: 16000,\n targetFrameSize: this.options.frameSamples,\n };\n const resampler = new resampler_1.Resampler(resamplerOptions);\n const frames = resampler.process(inputAudio);\n let start, end;\n for (const i of [...Array(frames.length)].keys()) {\n const f = frames[i];\n const { msg, audio } = await this.frameProcessor.process(f);\n switch (msg) {\n case messages_1.Message.SpeechStart:\n start = (i * this.options.frameSamples) / 16;\n break;\n case messages_1.Message.SpeechEnd:\n end = ((i + 1) * this.options.frameSamples) / 16;\n // @ts-ignore\n yield { audio, start, end };\n break;\n default:\n break;\n }\n }\n const { msg, audio } = this.frameProcessor.endSegment();\n if (msg == messages_1.Message.SpeechEnd) {\n yield {\n audio,\n // @ts-ignore\n start,\n end: (frames.length * this.options.frameSamples) / 16,\n };\n }\n };\n (0, frame_processor_1.validateOptions)(options);\n }\n}\nexports.PlatformAgnosticNonRealTimeVAD = PlatformAgnosticNonRealTimeVAD;\n//# sourceMappingURL=non-real-time-vad.js.map\n\n//# sourceURL=webpack://vad/./dist/_common/non-real-time-vad.js?");

@@ -141,3 +141,3 @@ /***/ }),

eval("\nvar __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {\n if (k2 === undefined) k2 = k;\n var desc = Object.getOwnPropertyDescriptor(m, k);\n if (!desc || (\"get\" in desc ? !m.__esModule : desc.writable || desc.configurable)) {\n desc = { enumerable: true, get: function() { return m[k]; } };\n }\n Object.defineProperty(o, k2, desc);\n}) : (function(o, m, k, k2) {\n if (k2 === undefined) k2 = k;\n o[k2] = m[k];\n}));\nvar __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {\n Object.defineProperty(o, \"default\", { enumerable: true, value: v });\n}) : function(o, v) {\n o[\"default\"] = v;\n});\nvar __importStar = (this && this.__importStar) || function (mod) {\n if (mod && mod.__esModule) return mod;\n var result = {};\n if (mod != null) for (var k in mod) if (k !== \"default\" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);\n __setModuleDefault(result, mod);\n return result;\n};\nObject.defineProperty(exports, \"__esModule\", ({ value: true }));\nexports.AudioNodeVAD = exports.MicVAD = exports.defaultRealTimeVADOptions = void 0;\nconst ort = __importStar(__webpack_require__(/*! onnxruntime-web */ \"onnxruntime-web\"));\nconst _common_1 = __webpack_require__(/*! ./_common */ \"./dist/_common/index.js\");\nconst asset_path_1 = __webpack_require__(/*! ./asset-path */ \"./dist/asset-path.js\");\nconst default_model_fetcher_1 = __webpack_require__(/*! ./default-model-fetcher */ \"./dist/default-model-fetcher.js\");\nexports.defaultRealTimeVADOptions = {\n ..._common_1.defaultFrameProcessorOptions,\n onFrameProcessed: (probabilities) => { },\n onVADMisfire: () => {\n _common_1.log.debug(\"VAD misfire\");\n },\n onSpeechStart: () => {\n _common_1.log.debug(\"Detected speech start\");\n },\n onSpeechEnd: () => {\n _common_1.log.debug(\"Detected speech end\");\n },\n workletURL: (0, asset_path_1.assetPath)(\"vad.worklet.bundle.min.js\"),\n modelURL: (0, asset_path_1.assetPath)(\"silero_vad.onnx\"),\n modelFetcher: default_model_fetcher_1.defaultModelFetcher,\n stream: undefined,\n};\nclass MicVAD {\n static async new(options = {}) {\n const fullOptions = {\n ...exports.defaultRealTimeVADOptions,\n ...options,\n };\n (0, _common_1.validateOptions)(fullOptions);\n let stream;\n if (fullOptions.stream === undefined)\n stream = await navigator.mediaDevices.getUserMedia({\n audio: {\n ...fullOptions.additionalAudioConstraints,\n channelCount: 1,\n echoCancellation: true,\n autoGainControl: true,\n noiseSuppression: true,\n },\n });\n else\n stream = fullOptions.stream;\n const audioContext = new AudioContext();\n const sourceNode = new MediaStreamAudioSourceNode(audioContext, {\n mediaStream: stream,\n });\n const audioNodeVAD = await AudioNodeVAD.new(audioContext, fullOptions);\n audioNodeVAD.receive(sourceNode);\n return new MicVAD(fullOptions, audioContext, stream, audioNodeVAD, sourceNode);\n }\n constructor(options, audioContext, stream, audioNodeVAD, sourceNode, listening = false) {\n this.options = options;\n this.audioContext = audioContext;\n this.stream = stream;\n this.audioNodeVAD = audioNodeVAD;\n this.sourceNode = sourceNode;\n this.listening = listening;\n this.pause = () => {\n this.audioNodeVAD.pause();\n this.listening = false;\n };\n this.start = () => {\n this.audioNodeVAD.start();\n this.listening = true;\n };\n this.destroy = () => {\n if (this.listening) {\n this.pause();\n }\n this.sourceNode.disconnect();\n this.audioNodeVAD.destroy();\n 
this.audioContext.close();\n };\n }\n}\nexports.MicVAD = MicVAD;\nclass AudioNodeVAD {\n static async new(ctx, options = {}) {\n const fullOptions = {\n ...exports.defaultRealTimeVADOptions,\n ...options,\n };\n (0, _common_1.validateOptions)(fullOptions);\n await ctx.audioWorklet.addModule(fullOptions.workletURL);\n const vadNode = new AudioWorkletNode(ctx, \"vad-helper-worklet\", {\n processorOptions: {\n frameSamples: fullOptions.frameSamples,\n },\n });\n const model = await _common_1.Silero.new(ort, () => fullOptions.modelFetcher(fullOptions.modelURL));\n const frameProcessor = new _common_1.FrameProcessor(model.process, model.reset_state, {\n frameSamples: fullOptions.frameSamples,\n positiveSpeechThreshold: fullOptions.positiveSpeechThreshold,\n negativeSpeechThreshold: fullOptions.negativeSpeechThreshold,\n redemptionFrames: fullOptions.redemptionFrames,\n preSpeechPadFrames: fullOptions.preSpeechPadFrames,\n minSpeechFrames: fullOptions.minSpeechFrames,\n });\n const audioNodeVAD = new AudioNodeVAD(ctx, fullOptions, frameProcessor, vadNode);\n vadNode.port.onmessage = async (ev) => {\n switch (ev.data?.message) {\n case _common_1.Message.AudioFrame:\n const buffer = ev.data.data;\n const frame = new Float32Array(buffer);\n await audioNodeVAD.processFrame(frame);\n break;\n default:\n break;\n }\n };\n return audioNodeVAD;\n }\n constructor(ctx, options, frameProcessor, entryNode) {\n this.ctx = ctx;\n this.options = options;\n this.frameProcessor = frameProcessor;\n this.entryNode = entryNode;\n this.pause = () => {\n this.frameProcessor.pause();\n };\n this.start = () => {\n this.frameProcessor.resume();\n };\n this.receive = (node) => {\n node.connect(this.entryNode);\n };\n this.processFrame = async (frame) => {\n const { probs, msg, audio } = await this.frameProcessor.process(frame);\n if (probs !== undefined) {\n this.options.onFrameProcessed(probs);\n }\n switch (msg) {\n case _common_1.Message.SpeechStart:\n this.options.onSpeechStart();\n break;\n case _common_1.Message.VADMisfire:\n this.options.onVADMisfire();\n break;\n case _common_1.Message.SpeechEnd:\n this.options.onSpeechEnd(audio);\n break;\n default:\n break;\n }\n };\n this.destroy = () => {\n this.entryNode.port.postMessage({\n message: _common_1.Message.SpeechStop,\n });\n this.entryNode.disconnect();\n };\n }\n}\nexports.AudioNodeVAD = AudioNodeVAD;\n//# sourceMappingURL=real-time-vad.js.map\n\n//# sourceURL=webpack://vad/./dist/real-time-vad.js?");
eval("\nvar __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {\n if (k2 === undefined) k2 = k;\n var desc = Object.getOwnPropertyDescriptor(m, k);\n if (!desc || (\"get\" in desc ? !m.__esModule : desc.writable || desc.configurable)) {\n desc = { enumerable: true, get: function() { return m[k]; } };\n }\n Object.defineProperty(o, k2, desc);\n}) : (function(o, m, k, k2) {\n if (k2 === undefined) k2 = k;\n o[k2] = m[k];\n}));\nvar __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {\n Object.defineProperty(o, \"default\", { enumerable: true, value: v });\n}) : function(o, v) {\n o[\"default\"] = v;\n});\nvar __importStar = (this && this.__importStar) || function (mod) {\n if (mod && mod.__esModule) return mod;\n var result = {};\n if (mod != null) for (var k in mod) if (k !== \"default\" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);\n __setModuleDefault(result, mod);\n return result;\n};\nObject.defineProperty(exports, \"__esModule\", ({ value: true }));\nexports.AudioNodeVAD = exports.MicVAD = exports.defaultRealTimeVADOptions = void 0;\nconst ort = __importStar(__webpack_require__(/*! onnxruntime-web */ \"onnxruntime-web\"));\nconst _common_1 = __webpack_require__(/*! ./_common */ \"./dist/_common/index.js\");\nconst asset_path_1 = __webpack_require__(/*! ./asset-path */ \"./dist/asset-path.js\");\nconst default_model_fetcher_1 = __webpack_require__(/*! ./default-model-fetcher */ \"./dist/default-model-fetcher.js\");\nexports.defaultRealTimeVADOptions = {\n ..._common_1.defaultFrameProcessorOptions,\n onFrameProcessed: (probabilities) => { },\n onVADMisfire: () => {\n _common_1.log.debug(\"VAD misfire\");\n },\n onSpeechStart: () => {\n _common_1.log.debug(\"Detected speech start\");\n },\n onSpeechEnd: () => {\n _common_1.log.debug(\"Detected speech end\");\n },\n workletURL: (0, asset_path_1.assetPath)(\"vad.worklet.bundle.min.js\"),\n modelURL: (0, asset_path_1.assetPath)(\"silero_vad.onnx\"),\n modelFetcher: default_model_fetcher_1.defaultModelFetcher,\n stream: undefined,\n};\nclass MicVAD {\n static async new(options = {}) {\n const fullOptions = {\n ...exports.defaultRealTimeVADOptions,\n ...options,\n };\n (0, _common_1.validateOptions)(fullOptions);\n let stream;\n if (fullOptions.stream === undefined)\n stream = await navigator.mediaDevices.getUserMedia({\n audio: {\n ...fullOptions.additionalAudioConstraints,\n channelCount: 1,\n echoCancellation: true,\n autoGainControl: true,\n noiseSuppression: true,\n },\n });\n else\n stream = fullOptions.stream;\n const audioContext = new AudioContext();\n const sourceNode = new MediaStreamAudioSourceNode(audioContext, {\n mediaStream: stream,\n });\n const audioNodeVAD = await AudioNodeVAD.new(audioContext, fullOptions);\n audioNodeVAD.receive(sourceNode);\n return new MicVAD(fullOptions, audioContext, stream, audioNodeVAD, sourceNode);\n }\n constructor(options, audioContext, stream, audioNodeVAD, sourceNode, listening = false) {\n this.options = options;\n this.audioContext = audioContext;\n this.stream = stream;\n this.audioNodeVAD = audioNodeVAD;\n this.sourceNode = sourceNode;\n this.listening = listening;\n this.pause = () => {\n this.audioNodeVAD.pause();\n this.listening = false;\n };\n this.start = () => {\n this.audioNodeVAD.start();\n this.listening = true;\n };\n this.destroy = () => {\n if (this.listening) {\n this.pause();\n }\n this.sourceNode.disconnect();\n this.audioNodeVAD.destroy();\n 
this.audioContext.close();\n };\n }\n}\nexports.MicVAD = MicVAD;\nclass AudioNodeVAD {\n static async new(ctx, options = {}) {\n const fullOptions = {\n ...exports.defaultRealTimeVADOptions,\n ...options,\n };\n (0, _common_1.validateOptions)(fullOptions);\n await ctx.audioWorklet.addModule(fullOptions.workletURL);\n const vadNode = new AudioWorkletNode(ctx, \"vad-helper-worklet\", {\n processorOptions: {\n frameSamples: fullOptions.frameSamples,\n },\n });\n const model = await _common_1.Silero.new(ort, () => fullOptions.modelFetcher(fullOptions.modelURL));\n const frameProcessor = new _common_1.FrameProcessor(model.process, model.reset_state, {\n frameSamples: fullOptions.frameSamples,\n positiveSpeechThreshold: fullOptions.positiveSpeechThreshold,\n negativeSpeechThreshold: fullOptions.negativeSpeechThreshold,\n redemptionFrames: fullOptions.redemptionFrames,\n preSpeechPadFrames: fullOptions.preSpeechPadFrames,\n minSpeechFrames: fullOptions.minSpeechFrames,\n submitUserSpeechOnPause: fullOptions.submitUserSpeechOnPause,\n });\n const audioNodeVAD = new AudioNodeVAD(ctx, fullOptions, frameProcessor, vadNode);\n vadNode.port.onmessage = async (ev) => {\n switch (ev.data?.message) {\n case _common_1.Message.AudioFrame:\n const buffer = ev.data.data;\n const frame = new Float32Array(buffer);\n await audioNodeVAD.processFrame(frame);\n break;\n default:\n break;\n }\n };\n return audioNodeVAD;\n }\n constructor(ctx, options, frameProcessor, entryNode) {\n this.ctx = ctx;\n this.options = options;\n this.frameProcessor = frameProcessor;\n this.entryNode = entryNode;\n this.pause = () => {\n const ev = this.frameProcessor.pause();\n this.handleFrameProcessorEvent(ev);\n };\n this.start = () => {\n this.frameProcessor.resume();\n };\n this.receive = (node) => {\n node.connect(this.entryNode);\n };\n this.processFrame = async (frame) => {\n const ev = await this.frameProcessor.process(frame);\n this.handleFrameProcessorEvent(ev);\n };\n this.handleFrameProcessorEvent = (ev) => {\n if (ev.probs !== undefined) {\n this.options.onFrameProcessed(ev.probs);\n }\n switch (ev.msg) {\n case _common_1.Message.SpeechStart:\n this.options.onSpeechStart();\n break;\n case _common_1.Message.VADMisfire:\n this.options.onVADMisfire();\n break;\n case _common_1.Message.SpeechEnd:\n this.options.onSpeechEnd(ev.audio);\n break;\n default:\n break;\n }\n };\n this.destroy = () => {\n this.entryNode.port.postMessage({\n message: _common_1.Message.SpeechStop,\n });\n this.entryNode.disconnect();\n };\n }\n}\nexports.AudioNodeVAD = AudioNodeVAD;\n//# sourceMappingURL=real-time-vad.js.map\n\n//# sourceURL=webpack://vad/./dist/real-time-vad.js?");

@@ -144,0 +144,0 @@ /***/ }),

@@ -1,1 +0,1 @@

!function(e,t){"object"==typeof exports&&"object"==typeof module?module.exports=t(require("onnxruntime-web")):"function"==typeof define&&define.amd?define(["onnxruntime-web"],t):"object"==typeof exports?exports.vad=t(require("onnxruntime-web")):e.vad=t(e.ort)}(self,(e=>(()=>{"use strict";var t={428:(e,t,s)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.FrameProcessor=t.validateOptions=t.defaultFrameProcessorOptions=void 0;const o=s(294),r=s(842),i=[512,1024,1536];t.defaultFrameProcessorOptions={positiveSpeechThreshold:.5,negativeSpeechThreshold:.35,preSpeechPadFrames:1,redemptionFrames:8,frameSamples:1536,minSpeechFrames:3},t.validateOptions=function(e){i.includes(e.frameSamples)||r.log.warn("You are using an unusual frame size"),(e.positiveSpeechThreshold<0||e.negativeSpeechThreshold>1)&&r.log.error("postiveSpeechThreshold should be a number between 0 and 1"),(e.negativeSpeechThreshold<0||e.negativeSpeechThreshold>e.positiveSpeechThreshold)&&r.log.error("negativeSpeechThreshold should be between 0 and postiveSpeechThreshold"),e.preSpeechPadFrames<0&&r.log.error("preSpeechPadFrames should be positive"),e.redemptionFrames<0&&r.log.error("preSpeechPadFrames should be positive")};const n=e=>{const t=e.reduce(((e,t)=>(e.push(e.at(-1)+t.length),e)),[0]),s=new Float32Array(t.at(-1));return e.forEach(((e,o)=>{const r=t[o];s.set(e,r)})),s};t.FrameProcessor=class{constructor(e,t,s){this.modelProcessFunc=e,this.modelResetFunc=t,this.options=s,this.speaking=!1,this.redemptionCounter=0,this.active=!1,this.reset=()=>{this.speaking=!1,this.audioBuffer=[],this.modelResetFunc(),this.redemptionCounter=0},this.pause=()=>{this.active=!1,this.reset()},this.resume=()=>{this.active=!0},this.endSegment=()=>{const e=this.audioBuffer;this.audioBuffer=[];const t=this.speaking;this.reset();const s=e.reduce(((e,t)=>e+ +t.isSpeech),0);if(t){if(s>=this.options.minSpeechFrames){const t=n(e.map((e=>e.frame)));return{msg:o.Message.SpeechEnd,audio:t}}return{msg:o.Message.VADMisfire}}return{}},this.process=async e=>{if(!this.active)return{};const t=await this.modelProcessFunc(e);if(this.audioBuffer.push({frame:e,isSpeech:t.isSpeech>=this.options.positiveSpeechThreshold}),t.isSpeech>=this.options.positiveSpeechThreshold&&this.redemptionCounter&&(this.redemptionCounter=0),t.isSpeech>=this.options.positiveSpeechThreshold&&!this.speaking)return this.speaking=!0,{probs:t,msg:o.Message.SpeechStart};if(t.isSpeech<this.options.negativeSpeechThreshold&&this.speaking&&++this.redemptionCounter>=this.options.redemptionFrames){this.redemptionCounter=0,this.speaking=!1;const e=this.audioBuffer;if(this.audioBuffer=[],e.reduce(((e,t)=>e+ +t.isSpeech),0)>=this.options.minSpeechFrames){const s=n(e.map((e=>e.frame)));return{probs:t,msg:o.Message.SpeechEnd,audio:s}}return{probs:t,msg:o.Message.VADMisfire}}if(!this.speaking)for(;this.audioBuffer.length>this.options.preSpeechPadFrames;)this.audioBuffer.shift();return{probs:t}},this.audioBuffer=[],this.reset()}}},14:function(e,t,s){var o=this&&this.__createBinding||(Object.create?function(e,t,s,o){void 0===o&&(o=s);var r=Object.getOwnPropertyDescriptor(t,s);r&&!("get"in r?!t.__esModule:r.writable||r.configurable)||(r={enumerable:!0,get:function(){return t[s]}}),Object.defineProperty(e,o,r)}:function(e,t,s,o){void 0===o&&(o=s),e[o]=t[s]}),r=this&&this.__setModuleDefault||(Object.create?function(e,t){Object.defineProperty(e,"default",{enumerable:!0,value:t})}:function(e,t){e.default=t}),i=this&&this.__importStar||function(e){if(e&&e.__esModule)return e;var t={};if(null!=e)for(var s in 
e)"default"!==s&&Object.prototype.hasOwnProperty.call(e,s)&&o(t,e,s);return r(t,e),t},n=this&&this.__exportStar||function(e,t){for(var s in e)"default"===s||Object.prototype.hasOwnProperty.call(t,s)||o(t,e,s)};Object.defineProperty(t,"__esModule",{value:!0}),t.utils=void 0;const a=i(s(26));t.utils={minFramesForTargetMS:a.minFramesForTargetMS,arrayBufferToBase64:a.arrayBufferToBase64,encodeWAV:a.encodeWAV},n(s(405),t),n(s(428),t),n(s(294),t),n(s(842),t),n(s(260),t),n(s(724),t)},842:(e,t)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.log=t.LOG_PREFIX=void 0,t.LOG_PREFIX="[VAD]";const s=["error","debug","warn"].reduce(((e,s)=>(e[s]=function(e){return(...s)=>{console[e](t.LOG_PREFIX,...s)}}(s),e)),{});t.log=s},294:(e,t)=>{var s;Object.defineProperty(t,"__esModule",{value:!0}),t.Message=void 0,function(e){e.AudioFrame="AUDIO_FRAME",e.SpeechStart="SPEECH_START",e.VADMisfire="VAD_MISFIRE",e.SpeechEnd="SPEECH_END",e.SpeechStop="SPEECH_STOP"}(s||(t.Message=s={}))},260:(e,t,s)=>{var o;Object.defineProperty(t,"__esModule",{value:!0}),t.Silero=void 0;const r=s(842);class i{constructor(e,t){this.ort=e,this.modelFetcher=t,this.init=async()=>{r.log.debug("initializing vad");const e=await this.modelFetcher();this._session=await this.ort.InferenceSession.create(e),this._sr=new this.ort.Tensor("int64",[16000n]),this.reset_state(),r.log.debug("vad is initialized")},this.reset_state=()=>{const e=Array(128).fill(0);this._h=new this.ort.Tensor("float32",e,[2,1,64]),this._c=new this.ort.Tensor("float32",e,[2,1,64])},this.process=async e=>{const t={input:new this.ort.Tensor("float32",e,[1,e.length]),h:this._h,c:this._c,sr:this._sr},s=await this._session.run(t);this._h=s.hn,this._c=s.cn;const[o]=s.output.data;return{notSpeech:1-o,isSpeech:o}}}}t.Silero=i,o=i,i.new=async(e,t)=>{const s=new o(e,t);return await s.init(),s}},405:(e,t,s)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.PlatformAgnosticNonRealTimeVAD=t.defaultNonRealTimeVADOptions=void 0;const o=s(428),r=s(294),i=s(260),n=s(724);t.defaultNonRealTimeVADOptions={...o.defaultFrameProcessorOptions},t.PlatformAgnosticNonRealTimeVAD=class{static async _new(e,s,o={}){const r=new this(e,s,{...t.defaultNonRealTimeVADOptions,...o});return await r.init(),r}constructor(e,t,s){this.modelFetcher=e,this.ort=t,this.options=s,this.init=async()=>{const e=await i.Silero.new(this.ort,this.modelFetcher);this.frameProcessor=new o.FrameProcessor(e.process,e.reset_state,{frameSamples:this.options.frameSamples,positiveSpeechThreshold:this.options.positiveSpeechThreshold,negativeSpeechThreshold:this.options.negativeSpeechThreshold,redemptionFrames:this.options.redemptionFrames,preSpeechPadFrames:this.options.preSpeechPadFrames,minSpeechFrames:this.options.minSpeechFrames}),this.frameProcessor.resume()},this.run=async function*(e,t){const s={nativeSampleRate:t,targetSampleRate:16e3,targetFrameSize:this.options.frameSamples},o=new n.Resampler(s).process(e);let i,a;for(const e of[...Array(o.length)].keys()){const t=o[e],{msg:s,audio:n}=await this.frameProcessor.process(t);switch(s){case r.Message.SpeechStart:i=e*this.options.frameSamples/16;break;case r.Message.SpeechEnd:a=(e+1)*this.options.frameSamples/16,yield{audio:n,start:i,end:a}}}const{msg:c,audio:h}=this.frameProcessor.endSegment();c==r.Message.SpeechEnd&&(yield{audio:h,start:i,end:o.length*this.options.frameSamples/16})},(0,o.validateOptions)(s)}}},724:(e,t,s)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.Resampler=void 0;const 
o=s(842);t.Resampler=class{constructor(e){this.options=e,this.process=e=>{const t=[];for(const t of e)this.inputBuffer.push(t);for(;this.inputBuffer.length*this.options.targetSampleRate/this.options.nativeSampleRate>this.options.targetFrameSize;){const e=new Float32Array(this.options.targetFrameSize);let s=0,o=0;for(;s<this.options.targetFrameSize;){let t=0,r=0;for(;o<Math.min(this.inputBuffer.length,(s+1)*this.options.nativeSampleRate/this.options.targetSampleRate);)t+=this.inputBuffer[o],r++,o++;e[s]=t/r,s++}this.inputBuffer=this.inputBuffer.slice(o),t.push(e)}return t},e.nativeSampleRate<16e3&&o.log.error("nativeSampleRate is too low. Should have 16000 = targetSampleRate <= nativeSampleRate"),this.inputBuffer=[]}}},26:(e,t)=>{function s(e,t,s){for(var o=0;o<s.length;o++)e.setUint8(t+o,s.charCodeAt(o))}Object.defineProperty(t,"__esModule",{value:!0}),t.encodeWAV=t.arrayBufferToBase64=t.minFramesForTargetMS=void 0,t.minFramesForTargetMS=function(e,t,s=16e3){return Math.ceil(e*s/1e3/t)},t.arrayBufferToBase64=function(e){for(var t="",s=new Uint8Array(e),o=s.byteLength,r=0;r<o;r++)t+=String.fromCharCode(s[r]);return btoa(t)},t.encodeWAV=function(e,t=3,o=16e3,r=1,i=32){var n=i/8,a=r*n,c=new ArrayBuffer(44+e.length*n),h=new DataView(c);return s(h,0,"RIFF"),h.setUint32(4,36+e.length*n,!0),s(h,8,"WAVE"),s(h,12,"fmt "),h.setUint32(16,16,!0),h.setUint16(20,t,!0),h.setUint16(22,r,!0),h.setUint32(24,o,!0),h.setUint32(28,o*a,!0),h.setUint16(32,a,!0),h.setUint16(34,i,!0),s(h,36,"data"),h.setUint32(40,e.length*n,!0),1===t?function(e,t,s){for(var o=0;o<s.length;o++,t+=2){var r=Math.max(-1,Math.min(1,s[o]));e.setInt16(t,r<0?32768*r:32767*r,!0)}}(h,44,e):function(e,t,s){for(var o=0;o<s.length;o++,t+=4)e.setFloat32(t,s[o],!0)}(h,44,e),c}},485:(e,t)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.assetPath=void 0;const s="undefined"!=typeof window&&void 0!==window.document?window.document.currentScript:null;let o="";s&&(o=s.src.replace(/#.*$/,"").replace(/\?.*$/,"").replace(/\/[^\/]+$/,"/")),t.assetPath=e=>o+e},973:(e,t)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.defaultModelFetcher=void 0,t.defaultModelFetcher=e=>fetch(e).then((e=>e.arrayBuffer()))},590:function(e,t,s){var o=this&&this.__createBinding||(Object.create?function(e,t,s,o){void 0===o&&(o=s);var r=Object.getOwnPropertyDescriptor(t,s);r&&!("get"in r?!t.__esModule:r.writable||r.configurable)||(r={enumerable:!0,get:function(){return t[s]}}),Object.defineProperty(e,o,r)}:function(e,t,s,o){void 0===o&&(o=s),e[o]=t[s]}),r=this&&this.__setModuleDefault||(Object.create?function(e,t){Object.defineProperty(e,"default",{enumerable:!0,value:t})}:function(e,t){e.default=t}),i=this&&this.__importStar||function(e){if(e&&e.__esModule)return e;var t={};if(null!=e)for(var s in e)"default"!==s&&Object.prototype.hasOwnProperty.call(e,s)&&o(t,e,s);return r(t,e),t};Object.defineProperty(t,"__esModule",{value:!0}),t.defaultRealTimeVADOptions=t.AudioNodeVAD=t.MicVAD=t.NonRealTimeVAD=t.Message=t.FrameProcessor=t.utils=t.defaultNonRealTimeVADOptions=void 0;const n=i(s(656)),a=s(14);Object.defineProperty(t,"FrameProcessor",{enumerable:!0,get:function(){return a.FrameProcessor}}),Object.defineProperty(t,"Message",{enumerable:!0,get:function(){return a.Message}});const c=s(787),h=s(973),d=s(485);t.defaultNonRealTimeVADOptions={modelURL:(0,d.assetPath)("silero_vad.onnx"),modelFetcher:h.defaultModelFetcher};class l extends a.PlatformAgnosticNonRealTimeVAD{static async new(e={}){const{modelURL:s,modelFetcher:o}={...t.defaultNonRealTimeVADOptions,...e};return 
await this._new((()=>o(s)),n,e)}}t.NonRealTimeVAD=l,t.utils={audioFileToArray:c.audioFileToArray,...a.utils};var u=s(746);Object.defineProperty(t,"MicVAD",{enumerable:!0,get:function(){return u.MicVAD}}),Object.defineProperty(t,"AudioNodeVAD",{enumerable:!0,get:function(){return u.AudioNodeVAD}}),Object.defineProperty(t,"defaultRealTimeVADOptions",{enumerable:!0,get:function(){return u.defaultRealTimeVADOptions}})},746:function(e,t,s){var o=this&&this.__createBinding||(Object.create?function(e,t,s,o){void 0===o&&(o=s);var r=Object.getOwnPropertyDescriptor(t,s);r&&!("get"in r?!t.__esModule:r.writable||r.configurable)||(r={enumerable:!0,get:function(){return t[s]}}),Object.defineProperty(e,o,r)}:function(e,t,s,o){void 0===o&&(o=s),e[o]=t[s]}),r=this&&this.__setModuleDefault||(Object.create?function(e,t){Object.defineProperty(e,"default",{enumerable:!0,value:t})}:function(e,t){e.default=t}),i=this&&this.__importStar||function(e){if(e&&e.__esModule)return e;var t={};if(null!=e)for(var s in e)"default"!==s&&Object.prototype.hasOwnProperty.call(e,s)&&o(t,e,s);return r(t,e),t};Object.defineProperty(t,"__esModule",{value:!0}),t.AudioNodeVAD=t.MicVAD=t.defaultRealTimeVADOptions=void 0;const n=i(s(656)),a=s(14),c=s(485),h=s(973);t.defaultRealTimeVADOptions={...a.defaultFrameProcessorOptions,onFrameProcessed:e=>{},onVADMisfire:()=>{a.log.debug("VAD misfire")},onSpeechStart:()=>{a.log.debug("Detected speech start")},onSpeechEnd:()=>{a.log.debug("Detected speech end")},workletURL:(0,c.assetPath)("vad.worklet.bundle.min.js"),modelURL:(0,c.assetPath)("silero_vad.onnx"),modelFetcher:h.defaultModelFetcher,stream:void 0};class d{static async new(e={}){const s={...t.defaultRealTimeVADOptions,...e};let o;(0,a.validateOptions)(s),o=void 0===s.stream?await navigator.mediaDevices.getUserMedia({audio:{...s.additionalAudioConstraints,channelCount:1,echoCancellation:!0,autoGainControl:!0,noiseSuppression:!0}}):s.stream;const r=new AudioContext,i=new MediaStreamAudioSourceNode(r,{mediaStream:o}),n=await l.new(r,s);return n.receive(i),new d(s,r,o,n,i)}constructor(e,t,s,o,r,i=!1){this.options=e,this.audioContext=t,this.stream=s,this.audioNodeVAD=o,this.sourceNode=r,this.listening=i,this.pause=()=>{this.audioNodeVAD.pause(),this.listening=!1},this.start=()=>{this.audioNodeVAD.start(),this.listening=!0},this.destroy=()=>{this.listening&&this.pause(),this.sourceNode.disconnect(),this.audioNodeVAD.destroy(),this.audioContext.close()}}}t.MicVAD=d;class l{static async new(e,s={}){const o={...t.defaultRealTimeVADOptions,...s};(0,a.validateOptions)(o),await e.audioWorklet.addModule(o.workletURL);const r=new AudioWorkletNode(e,"vad-helper-worklet",{processorOptions:{frameSamples:o.frameSamples}}),i=await a.Silero.new(n,(()=>o.modelFetcher(o.modelURL))),c=new a.FrameProcessor(i.process,i.reset_state,{frameSamples:o.frameSamples,positiveSpeechThreshold:o.positiveSpeechThreshold,negativeSpeechThreshold:o.negativeSpeechThreshold,redemptionFrames:o.redemptionFrames,preSpeechPadFrames:o.preSpeechPadFrames,minSpeechFrames:o.minSpeechFrames}),h=new l(e,o,c,r);return r.port.onmessage=async e=>{if(e.data?.message===a.Message.AudioFrame){const t=e.data.data,s=new Float32Array(t);await h.processFrame(s)}},h}constructor(e,t,s,o){this.ctx=e,this.options=t,this.frameProcessor=s,this.entryNode=o,this.pause=()=>{this.frameProcessor.pause()},this.start=()=>{this.frameProcessor.resume()},this.receive=e=>{e.connect(this.entryNode)},this.processFrame=async e=>{const{probs:t,msg:s,audio:o}=await this.frameProcessor.process(e);switch(void 
0!==t&&this.options.onFrameProcessed(t),s){case a.Message.SpeechStart:this.options.onSpeechStart();break;case a.Message.VADMisfire:this.options.onVADMisfire();break;case a.Message.SpeechEnd:this.options.onSpeechEnd(o)}},this.destroy=()=>{this.entryNode.port.postMessage({message:a.Message.SpeechStop}),this.entryNode.disconnect()}}}t.AudioNodeVAD=l},787:(e,t)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.audioFileToArray=void 0,t.audioFileToArray=async function(e){const t=new OfflineAudioContext(1,1,44100),s=new FileReader;let o=null;if(await new Promise((r=>{s.addEventListener("loadend",(e=>{const i=s.result;t.decodeAudioData(i,(e=>{o=e,t.startRendering().then((e=>{console.log("Rendering completed successfully"),r()})).catch((e=>{console.error(`Rendering failed: ${e}`)}))}),(e=>{console.log(`Error with decoding audio data: ${e}`)}))})),s.readAsArrayBuffer(e)})),null===o)throw Error("some shit");let r=o,i=new Float32Array(r.length);for(let e=0;e<r.length;e++)for(let t=0;t<r.numberOfChannels;t++)i[e]+=r.getChannelData(t)[e];return{audio:i,sampleRate:r.sampleRate}}},656:t=>{t.exports=e}},s={};return function e(o){var r=s[o];if(void 0!==r)return r.exports;var i=s[o]={exports:{}};return t[o].call(i.exports,i,i.exports,e),i.exports}(590)})()));
!function(e,t){"object"==typeof exports&&"object"==typeof module?module.exports=t(require("onnxruntime-web")):"function"==typeof define&&define.amd?define(["onnxruntime-web"],t):"object"==typeof exports?exports.vad=t(require("onnxruntime-web")):e.vad=t(e.ort)}(self,(e=>(()=>{"use strict";var t={428:(e,t,s)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.FrameProcessor=t.validateOptions=t.defaultFrameProcessorOptions=void 0;const o=s(294),r=s(842),i=[512,1024,1536];t.defaultFrameProcessorOptions={positiveSpeechThreshold:.5,negativeSpeechThreshold:.35,preSpeechPadFrames:1,redemptionFrames:8,frameSamples:1536,minSpeechFrames:3,submitUserSpeechOnPause:!1},t.validateOptions=function(e){i.includes(e.frameSamples)||r.log.warn("You are using an unusual frame size"),(e.positiveSpeechThreshold<0||e.negativeSpeechThreshold>1)&&r.log.error("postiveSpeechThreshold should be a number between 0 and 1"),(e.negativeSpeechThreshold<0||e.negativeSpeechThreshold>e.positiveSpeechThreshold)&&r.log.error("negativeSpeechThreshold should be between 0 and postiveSpeechThreshold"),e.preSpeechPadFrames<0&&r.log.error("preSpeechPadFrames should be positive"),e.redemptionFrames<0&&r.log.error("preSpeechPadFrames should be positive")};const n=e=>{const t=e.reduce(((e,t)=>(e.push(e.at(-1)+t.length),e)),[0]),s=new Float32Array(t.at(-1));return e.forEach(((e,o)=>{const r=t[o];s.set(e,r)})),s};t.FrameProcessor=class{constructor(e,t,s){this.modelProcessFunc=e,this.modelResetFunc=t,this.options=s,this.speaking=!1,this.redemptionCounter=0,this.active=!1,this.reset=()=>{this.speaking=!1,this.audioBuffer=[],this.modelResetFunc(),this.redemptionCounter=0},this.pause=()=>(this.active=!1,this.options.submitUserSpeechOnPause?this.endSegment():(this.reset(),{})),this.resume=()=>{this.active=!0},this.endSegment=()=>{const e=this.audioBuffer;this.audioBuffer=[];const t=this.speaking;this.reset();const s=e.reduce(((e,t)=>e+ +t.isSpeech),0);if(t){if(s>=this.options.minSpeechFrames){const t=n(e.map((e=>e.frame)));return{msg:o.Message.SpeechEnd,audio:t}}return{msg:o.Message.VADMisfire}}return{}},this.process=async e=>{if(!this.active)return{};const t=await this.modelProcessFunc(e);if(this.audioBuffer.push({frame:e,isSpeech:t.isSpeech>=this.options.positiveSpeechThreshold}),t.isSpeech>=this.options.positiveSpeechThreshold&&this.redemptionCounter&&(this.redemptionCounter=0),t.isSpeech>=this.options.positiveSpeechThreshold&&!this.speaking)return this.speaking=!0,{probs:t,msg:o.Message.SpeechStart};if(t.isSpeech<this.options.negativeSpeechThreshold&&this.speaking&&++this.redemptionCounter>=this.options.redemptionFrames){this.redemptionCounter=0,this.speaking=!1;const e=this.audioBuffer;if(this.audioBuffer=[],e.reduce(((e,t)=>e+ +t.isSpeech),0)>=this.options.minSpeechFrames){const s=n(e.map((e=>e.frame)));return{probs:t,msg:o.Message.SpeechEnd,audio:s}}return{probs:t,msg:o.Message.VADMisfire}}if(!this.speaking)for(;this.audioBuffer.length>this.options.preSpeechPadFrames;)this.audioBuffer.shift();return{probs:t}},this.audioBuffer=[],this.reset()}}},14:function(e,t,s){var o=this&&this.__createBinding||(Object.create?function(e,t,s,o){void 0===o&&(o=s);var r=Object.getOwnPropertyDescriptor(t,s);r&&!("get"in r?!t.__esModule:r.writable||r.configurable)||(r={enumerable:!0,get:function(){return t[s]}}),Object.defineProperty(e,o,r)}:function(e,t,s,o){void 
0===o&&(o=s),e[o]=t[s]}),r=this&&this.__setModuleDefault||(Object.create?function(e,t){Object.defineProperty(e,"default",{enumerable:!0,value:t})}:function(e,t){e.default=t}),i=this&&this.__importStar||function(e){if(e&&e.__esModule)return e;var t={};if(null!=e)for(var s in e)"default"!==s&&Object.prototype.hasOwnProperty.call(e,s)&&o(t,e,s);return r(t,e),t},n=this&&this.__exportStar||function(e,t){for(var s in e)"default"===s||Object.prototype.hasOwnProperty.call(t,s)||o(t,e,s)};Object.defineProperty(t,"__esModule",{value:!0}),t.utils=void 0;const a=i(s(26));t.utils={minFramesForTargetMS:a.minFramesForTargetMS,arrayBufferToBase64:a.arrayBufferToBase64,encodeWAV:a.encodeWAV},n(s(405),t),n(s(428),t),n(s(294),t),n(s(842),t),n(s(260),t),n(s(724),t)},842:(e,t)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.log=t.LOG_PREFIX=void 0,t.LOG_PREFIX="[VAD]";const s=["error","debug","warn"].reduce(((e,s)=>(e[s]=function(e){return(...s)=>{console[e](t.LOG_PREFIX,...s)}}(s),e)),{});t.log=s},294:(e,t)=>{var s;Object.defineProperty(t,"__esModule",{value:!0}),t.Message=void 0,function(e){e.AudioFrame="AUDIO_FRAME",e.SpeechStart="SPEECH_START",e.VADMisfire="VAD_MISFIRE",e.SpeechEnd="SPEECH_END",e.SpeechStop="SPEECH_STOP"}(s||(t.Message=s={}))},260:(e,t,s)=>{var o;Object.defineProperty(t,"__esModule",{value:!0}),t.Silero=void 0;const r=s(842);class i{constructor(e,t){this.ort=e,this.modelFetcher=t,this.init=async()=>{r.log.debug("initializing vad");const e=await this.modelFetcher();this._session=await this.ort.InferenceSession.create(e),this._sr=new this.ort.Tensor("int64",[16000n]),this.reset_state(),r.log.debug("vad is initialized")},this.reset_state=()=>{const e=Array(128).fill(0);this._h=new this.ort.Tensor("float32",e,[2,1,64]),this._c=new this.ort.Tensor("float32",e,[2,1,64])},this.process=async e=>{const t={input:new this.ort.Tensor("float32",e,[1,e.length]),h:this._h,c:this._c,sr:this._sr},s=await this._session.run(t);this._h=s.hn,this._c=s.cn;const[o]=s.output.data;return{notSpeech:1-o,isSpeech:o}}}}t.Silero=i,o=i,i.new=async(e,t)=>{const s=new o(e,t);return await s.init(),s}},405:(e,t,s)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.PlatformAgnosticNonRealTimeVAD=t.defaultNonRealTimeVADOptions=void 0;const o=s(428),r=s(294),i=s(260),n=s(724);t.defaultNonRealTimeVADOptions={...o.defaultFrameProcessorOptions},t.PlatformAgnosticNonRealTimeVAD=class{static async _new(e,s,o={}){const r=new this(e,s,{...t.defaultNonRealTimeVADOptions,...o});return await r.init(),r}constructor(e,t,s){this.modelFetcher=e,this.ort=t,this.options=s,this.init=async()=>{const e=await i.Silero.new(this.ort,this.modelFetcher);this.frameProcessor=new o.FrameProcessor(e.process,e.reset_state,{frameSamples:this.options.frameSamples,positiveSpeechThreshold:this.options.positiveSpeechThreshold,negativeSpeechThreshold:this.options.negativeSpeechThreshold,redemptionFrames:this.options.redemptionFrames,preSpeechPadFrames:this.options.preSpeechPadFrames,minSpeechFrames:this.options.minSpeechFrames,submitUserSpeechOnPause:this.options.submitUserSpeechOnPause}),this.frameProcessor.resume()},this.run=async function*(e,t){const s={nativeSampleRate:t,targetSampleRate:16e3,targetFrameSize:this.options.frameSamples},o=new n.Resampler(s).process(e);let i,a;for(const e of[...Array(o.length)].keys()){const t=o[e],{msg:s,audio:n}=await this.frameProcessor.process(t);switch(s){case r.Message.SpeechStart:i=e*this.options.frameSamples/16;break;case 
r.Message.SpeechEnd:a=(e+1)*this.options.frameSamples/16,yield{audio:n,start:i,end:a}}}const{msg:c,audio:h}=this.frameProcessor.endSegment();c==r.Message.SpeechEnd&&(yield{audio:h,start:i,end:o.length*this.options.frameSamples/16})},(0,o.validateOptions)(s)}}},724:(e,t,s)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.Resampler=void 0;const o=s(842);t.Resampler=class{constructor(e){this.options=e,this.process=e=>{const t=[];for(const t of e)this.inputBuffer.push(t);for(;this.inputBuffer.length*this.options.targetSampleRate/this.options.nativeSampleRate>this.options.targetFrameSize;){const e=new Float32Array(this.options.targetFrameSize);let s=0,o=0;for(;s<this.options.targetFrameSize;){let t=0,r=0;for(;o<Math.min(this.inputBuffer.length,(s+1)*this.options.nativeSampleRate/this.options.targetSampleRate);)t+=this.inputBuffer[o],r++,o++;e[s]=t/r,s++}this.inputBuffer=this.inputBuffer.slice(o),t.push(e)}return t},e.nativeSampleRate<16e3&&o.log.error("nativeSampleRate is too low. Should have 16000 = targetSampleRate <= nativeSampleRate"),this.inputBuffer=[]}}},26:(e,t)=>{function s(e,t,s){for(var o=0;o<s.length;o++)e.setUint8(t+o,s.charCodeAt(o))}Object.defineProperty(t,"__esModule",{value:!0}),t.encodeWAV=t.arrayBufferToBase64=t.minFramesForTargetMS=void 0,t.minFramesForTargetMS=function(e,t,s=16e3){return Math.ceil(e*s/1e3/t)},t.arrayBufferToBase64=function(e){for(var t="",s=new Uint8Array(e),o=s.byteLength,r=0;r<o;r++)t+=String.fromCharCode(s[r]);return btoa(t)},t.encodeWAV=function(e,t=3,o=16e3,r=1,i=32){var n=i/8,a=r*n,c=new ArrayBuffer(44+e.length*n),h=new DataView(c);return s(h,0,"RIFF"),h.setUint32(4,36+e.length*n,!0),s(h,8,"WAVE"),s(h,12,"fmt "),h.setUint32(16,16,!0),h.setUint16(20,t,!0),h.setUint16(22,r,!0),h.setUint32(24,o,!0),h.setUint32(28,o*a,!0),h.setUint16(32,a,!0),h.setUint16(34,i,!0),s(h,36,"data"),h.setUint32(40,e.length*n,!0),1===t?function(e,t,s){for(var o=0;o<s.length;o++,t+=2){var r=Math.max(-1,Math.min(1,s[o]));e.setInt16(t,r<0?32768*r:32767*r,!0)}}(h,44,e):function(e,t,s){for(var o=0;o<s.length;o++,t+=4)e.setFloat32(t,s[o],!0)}(h,44,e),c}},485:(e,t)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.assetPath=void 0;const s="undefined"!=typeof window&&void 0!==window.document?window.document.currentScript:null;let o="";s&&(o=s.src.replace(/#.*$/,"").replace(/\?.*$/,"").replace(/\/[^\/]+$/,"/")),t.assetPath=e=>o+e},973:(e,t)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.defaultModelFetcher=void 0,t.defaultModelFetcher=e=>fetch(e).then((e=>e.arrayBuffer()))},590:function(e,t,s){var o=this&&this.__createBinding||(Object.create?function(e,t,s,o){void 0===o&&(o=s);var r=Object.getOwnPropertyDescriptor(t,s);r&&!("get"in r?!t.__esModule:r.writable||r.configurable)||(r={enumerable:!0,get:function(){return t[s]}}),Object.defineProperty(e,o,r)}:function(e,t,s,o){void 0===o&&(o=s),e[o]=t[s]}),r=this&&this.__setModuleDefault||(Object.create?function(e,t){Object.defineProperty(e,"default",{enumerable:!0,value:t})}:function(e,t){e.default=t}),i=this&&this.__importStar||function(e){if(e&&e.__esModule)return e;var t={};if(null!=e)for(var s in e)"default"!==s&&Object.prototype.hasOwnProperty.call(e,s)&&o(t,e,s);return r(t,e),t};Object.defineProperty(t,"__esModule",{value:!0}),t.defaultRealTimeVADOptions=t.AudioNodeVAD=t.MicVAD=t.NonRealTimeVAD=t.Message=t.FrameProcessor=t.utils=t.defaultNonRealTimeVADOptions=void 0;const n=i(s(656)),a=s(14);Object.defineProperty(t,"FrameProcessor",{enumerable:!0,get:function(){return 
a.FrameProcessor}}),Object.defineProperty(t,"Message",{enumerable:!0,get:function(){return a.Message}});const c=s(787),h=s(973),u=s(485);t.defaultNonRealTimeVADOptions={modelURL:(0,u.assetPath)("silero_vad.onnx"),modelFetcher:h.defaultModelFetcher};class d extends a.PlatformAgnosticNonRealTimeVAD{static async new(e={}){const{modelURL:s,modelFetcher:o}={...t.defaultNonRealTimeVADOptions,...e};return await this._new((()=>o(s)),n,e)}}t.NonRealTimeVAD=d,t.utils={audioFileToArray:c.audioFileToArray,...a.utils};var l=s(746);Object.defineProperty(t,"MicVAD",{enumerable:!0,get:function(){return l.MicVAD}}),Object.defineProperty(t,"AudioNodeVAD",{enumerable:!0,get:function(){return l.AudioNodeVAD}}),Object.defineProperty(t,"defaultRealTimeVADOptions",{enumerable:!0,get:function(){return l.defaultRealTimeVADOptions}})},746:function(e,t,s){var o=this&&this.__createBinding||(Object.create?function(e,t,s,o){void 0===o&&(o=s);var r=Object.getOwnPropertyDescriptor(t,s);r&&!("get"in r?!t.__esModule:r.writable||r.configurable)||(r={enumerable:!0,get:function(){return t[s]}}),Object.defineProperty(e,o,r)}:function(e,t,s,o){void 0===o&&(o=s),e[o]=t[s]}),r=this&&this.__setModuleDefault||(Object.create?function(e,t){Object.defineProperty(e,"default",{enumerable:!0,value:t})}:function(e,t){e.default=t}),i=this&&this.__importStar||function(e){if(e&&e.__esModule)return e;var t={};if(null!=e)for(var s in e)"default"!==s&&Object.prototype.hasOwnProperty.call(e,s)&&o(t,e,s);return r(t,e),t};Object.defineProperty(t,"__esModule",{value:!0}),t.AudioNodeVAD=t.MicVAD=t.defaultRealTimeVADOptions=void 0;const n=i(s(656)),a=s(14),c=s(485),h=s(973);t.defaultRealTimeVADOptions={...a.defaultFrameProcessorOptions,onFrameProcessed:e=>{},onVADMisfire:()=>{a.log.debug("VAD misfire")},onSpeechStart:()=>{a.log.debug("Detected speech start")},onSpeechEnd:()=>{a.log.debug("Detected speech end")},workletURL:(0,c.assetPath)("vad.worklet.bundle.min.js"),modelURL:(0,c.assetPath)("silero_vad.onnx"),modelFetcher:h.defaultModelFetcher,stream:void 0};class u{static async new(e={}){const s={...t.defaultRealTimeVADOptions,...e};let o;(0,a.validateOptions)(s),o=void 0===s.stream?await navigator.mediaDevices.getUserMedia({audio:{...s.additionalAudioConstraints,channelCount:1,echoCancellation:!0,autoGainControl:!0,noiseSuppression:!0}}):s.stream;const r=new AudioContext,i=new MediaStreamAudioSourceNode(r,{mediaStream:o}),n=await d.new(r,s);return n.receive(i),new u(s,r,o,n,i)}constructor(e,t,s,o,r,i=!1){this.options=e,this.audioContext=t,this.stream=s,this.audioNodeVAD=o,this.sourceNode=r,this.listening=i,this.pause=()=>{this.audioNodeVAD.pause(),this.listening=!1},this.start=()=>{this.audioNodeVAD.start(),this.listening=!0},this.destroy=()=>{this.listening&&this.pause(),this.sourceNode.disconnect(),this.audioNodeVAD.destroy(),this.audioContext.close()}}}t.MicVAD=u;class d{static async new(e,s={}){const o={...t.defaultRealTimeVADOptions,...s};(0,a.validateOptions)(o),await e.audioWorklet.addModule(o.workletURL);const r=new AudioWorkletNode(e,"vad-helper-worklet",{processorOptions:{frameSamples:o.frameSamples}}),i=await a.Silero.new(n,(()=>o.modelFetcher(o.modelURL))),c=new a.FrameProcessor(i.process,i.reset_state,{frameSamples:o.frameSamples,positiveSpeechThreshold:o.positiveSpeechThreshold,negativeSpeechThreshold:o.negativeSpeechThreshold,redemptionFrames:o.redemptionFrames,preSpeechPadFrames:o.preSpeechPadFrames,minSpeechFrames:o.minSpeechFrames,submitUserSpeechOnPause:o.submitUserSpeechOnPause}),h=new d(e,o,c,r);return r.port.onmessage=async 
e=>{if(e.data?.message===a.Message.AudioFrame){const t=e.data.data,s=new Float32Array(t);await h.processFrame(s)}},h}constructor(e,t,s,o){this.ctx=e,this.options=t,this.frameProcessor=s,this.entryNode=o,this.pause=()=>{const e=this.frameProcessor.pause();this.handleFrameProcessorEvent(e)},this.start=()=>{this.frameProcessor.resume()},this.receive=e=>{e.connect(this.entryNode)},this.processFrame=async e=>{const t=await this.frameProcessor.process(e);this.handleFrameProcessorEvent(t)},this.handleFrameProcessorEvent=e=>{switch(void 0!==e.probs&&this.options.onFrameProcessed(e.probs),e.msg){case a.Message.SpeechStart:this.options.onSpeechStart();break;case a.Message.VADMisfire:this.options.onVADMisfire();break;case a.Message.SpeechEnd:this.options.onSpeechEnd(e.audio)}},this.destroy=()=>{this.entryNode.port.postMessage({message:a.Message.SpeechStop}),this.entryNode.disconnect()}}}t.AudioNodeVAD=d},787:(e,t)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.audioFileToArray=void 0,t.audioFileToArray=async function(e){const t=new OfflineAudioContext(1,1,44100),s=new FileReader;let o=null;if(await new Promise((r=>{s.addEventListener("loadend",(e=>{const i=s.result;t.decodeAudioData(i,(e=>{o=e,t.startRendering().then((e=>{console.log("Rendering completed successfully"),r()})).catch((e=>{console.error(`Rendering failed: ${e}`)}))}),(e=>{console.log(`Error with decoding audio data: ${e}`)}))})),s.readAsArrayBuffer(e)})),null===o)throw Error("some shit");let r=o,i=new Float32Array(r.length);for(let e=0;e<r.length;e++)for(let t=0;t<r.numberOfChannels;t++)i[e]+=r.getChannelData(t)[e];return{audio:i,sampleRate:r.sampleRate}}},656:t=>{t.exports=e}},s={};return function e(o){var r=s[o];if(void 0!==r)return r.exports;var i=s[o]={exports:{}};return t[o].call(i.exports,i,i.exports,e),i.exports}(590)})()));
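
The bundle diffs above boil down to one headline change in 0.0.15: a new `submitUserSpeechOnPause` frame processor option (default `false`). When enabled, pausing the VAD flushes any in-progress speech segment instead of silently discarding it. A minimal usage sketch, assuming an async context; the option and callback names are taken from the diff, but the surrounding setup is illustrative:

import { MicVAD } from "@ricky0123/vad-web"

const vad = await MicVAD.new({
  // New in 0.0.15: pause() may now submit the current segment.
  submitUserSpeechOnPause: true,
  onSpeechEnd: (audio) => {
    // audio is a Float32Array of 16 kHz samples; with the flag set, this
    // can also fire when vad.pause() is called mid-utterance.
    console.log(`captured ${audio.length} samples`)
  },
  onVADMisfire: () => {
    // Fires instead of onSpeechEnd when the flushed segment is shorter
    // than minSpeechFrames.
  },
})
vad.start()
// ... later, e.g. when the user clicks "stop":
vad.pause()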

@@ -1,2 +0,2 @@

import { SpeechProbabilities, FrameProcessor, FrameProcessorOptions } from "./_common";
import { Message, SpeechProbabilities, FrameProcessor, FrameProcessorOptions } from "./_common";
interface RealTimeVADCallbacks {

@@ -61,2 +61,7 @@ /** Callback to run after each frame. The size (number of samples) of a frame is given by `frameSamples`. */

processFrame: (frame: Float32Array) => Promise<void>;
handleFrameProcessorEvent: (ev: Partial<{
probs: SpeechProbabilities;
msg: Message;
audio: Float32Array;
}>) => void;
destroy: () => void;

@@ -63,0 +68,0 @@ }
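
The typings above show that the new `handleFrameProcessorEvent` method accepts a `Partial<{probs, msg, audio}>`, so any combination of fields may be absent. A sketch of consuming such an event defensively; the local `FrameProcessorEvent` type restates the declared shape for illustration rather than importing it, and the string literals match the `Message` enum values visible in the bundle:

type FrameProcessorEvent = Partial<{
  probs: { isSpeech: number; notSpeech: number } // shape of SpeechProbabilities
  msg: "SPEECH_START" | "SPEECH_END" | "VAD_MISFIRE"
  audio: Float32Array
}>

function describeEvent(ev: FrameProcessorEvent): string {
  if (ev.msg === "SPEECH_END" && ev.audio !== undefined) {
    return `speech ended with ${ev.audio.length} samples`
  }
  if (ev.msg === "SPEECH_START") return "speech started"
  if (ev.msg === "VAD_MISFIRE") return "misfire: segment too short"
  // A bare per-frame result carries only probabilities.
  return ev.probs ? `p(speech) = ${ev.probs.isSpeech.toFixed(2)}` : "no-op"
}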

@@ -123,2 +123,3 @@ "use strict";

minSpeechFrames: fullOptions.minSpeechFrames,
submitUserSpeechOnPause: fullOptions.submitUserSpeechOnPause,
});

@@ -145,3 +146,4 @@ const audioNodeVAD = new AudioNodeVAD(ctx, fullOptions, frameProcessor, vadNode);

this.pause = () => {
this.frameProcessor.pause();
const ev = this.frameProcessor.pause();
this.handleFrameProcessorEvent(ev);
};

@@ -155,7 +157,10 @@ this.start = () => {

this.processFrame = async (frame) => {
const { probs, msg, audio } = await this.frameProcessor.process(frame);
if (probs !== undefined) {
this.options.onFrameProcessed(probs);
const ev = await this.frameProcessor.process(frame);
this.handleFrameProcessorEvent(ev);
};
this.handleFrameProcessorEvent = (ev) => {
if (ev.probs !== undefined) {
this.options.onFrameProcessed(ev.probs);
}
switch (msg) {
switch (ev.msg) {
case _common_1.Message.SpeechStart:

@@ -168,3 +173,3 @@ this.options.onSpeechStart();

case _common_1.Message.SpeechEnd:
this.options.onSpeechEnd(audio);
this.options.onSpeechEnd(ev.audio);
break;

@@ -171,0 +176,0 @@ default:
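
The change above is mechanical but meaningful: both `processFrame` and `pause` now route their results through a single `handleFrameProcessorEvent` dispatcher, which is why `onSpeechEnd` and `onVADMisfire` can now fire on pause. A self-contained sketch of that routing pattern (not the library's code; names are simplified):

type VADEvent = Partial<{
  probs: unknown
  msg: "SPEECH_START" | "SPEECH_END" | "VAD_MISFIRE"
  audio: Float32Array
}>

interface Processor {
  pause(): VADEvent
  process(frame: Float32Array): Promise<VADEvent>
}

interface Callbacks {
  onSpeechStart(): void
  onSpeechEnd(audio: Float32Array): void
  onVADMisfire(): void
}

class EventRouter {
  constructor(private processor: Processor, private cb: Callbacks) {}

  // pause() results now flow through the same dispatcher as frame results.
  pause() {
    this.dispatch(this.processor.pause())
  }

  async processFrame(frame: Float32Array) {
    this.dispatch(await this.processor.process(frame))
  }

  private dispatch(ev: VADEvent) {
    switch (ev.msg) {
      case "SPEECH_START":
        this.cb.onSpeechStart()
        break
      case "VAD_MISFIRE":
        this.cb.onVADMisfire()
        break
      case "SPEECH_END":
        if (ev.audio !== undefined) this.cb.onSpeechEnd(ev.audio)
        break
    }
  }
}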

@@ -19,3 +19,3 @@ /*

eval("\n/*\nSome of this code, together with the default options found in index.ts,\nwere taken (or took inspiration) from https://github.com/snakers4/silero-vad\n*/\nObject.defineProperty(exports, \"__esModule\", ({ value: true }));\nexports.FrameProcessor = exports.validateOptions = exports.defaultFrameProcessorOptions = void 0;\nconst messages_1 = __webpack_require__(/*! ./messages */ \"./dist/_common/messages.js\");\nconst logging_1 = __webpack_require__(/*! ./logging */ \"./dist/_common/logging.js\");\nconst RECOMMENDED_FRAME_SAMPLES = [512, 1024, 1536];\nexports.defaultFrameProcessorOptions = {\n positiveSpeechThreshold: 0.5,\n negativeSpeechThreshold: 0.5 - 0.15,\n preSpeechPadFrames: 1,\n redemptionFrames: 8,\n frameSamples: 1536,\n minSpeechFrames: 3,\n};\nfunction validateOptions(options) {\n if (!RECOMMENDED_FRAME_SAMPLES.includes(options.frameSamples)) {\n logging_1.log.warn(\"You are using an unusual frame size\");\n }\n if (options.positiveSpeechThreshold < 0 ||\n options.negativeSpeechThreshold > 1) {\n logging_1.log.error(\"postiveSpeechThreshold should be a number between 0 and 1\");\n }\n if (options.negativeSpeechThreshold < 0 ||\n options.negativeSpeechThreshold > options.positiveSpeechThreshold) {\n logging_1.log.error(\"negativeSpeechThreshold should be between 0 and postiveSpeechThreshold\");\n }\n if (options.preSpeechPadFrames < 0) {\n logging_1.log.error(\"preSpeechPadFrames should be positive\");\n }\n if (options.redemptionFrames < 0) {\n logging_1.log.error(\"preSpeechPadFrames should be positive\");\n }\n}\nexports.validateOptions = validateOptions;\nconst concatArrays = (arrays) => {\n const sizes = arrays.reduce((out, next) => {\n out.push(out.at(-1) + next.length);\n return out;\n }, [0]);\n const outArray = new Float32Array(sizes.at(-1));\n arrays.forEach((arr, index) => {\n const place = sizes[index];\n outArray.set(arr, place);\n });\n return outArray;\n};\nclass FrameProcessor {\n constructor(modelProcessFunc, modelResetFunc, options) {\n this.modelProcessFunc = modelProcessFunc;\n this.modelResetFunc = modelResetFunc;\n this.options = options;\n this.speaking = false;\n this.redemptionCounter = 0;\n this.active = false;\n this.reset = () => {\n this.speaking = false;\n this.audioBuffer = [];\n this.modelResetFunc();\n this.redemptionCounter = 0;\n };\n this.pause = () => {\n this.active = false;\n this.reset();\n };\n this.resume = () => {\n this.active = true;\n };\n this.endSegment = () => {\n const audioBuffer = this.audioBuffer;\n this.audioBuffer = [];\n const speaking = this.speaking;\n this.reset();\n const speechFrameCount = audioBuffer.reduce((acc, item) => {\n return acc + +item.isSpeech;\n }, 0);\n if (speaking) {\n if (speechFrameCount >= this.options.minSpeechFrames) {\n const audio = concatArrays(audioBuffer.map((item) => item.frame));\n return { msg: messages_1.Message.SpeechEnd, audio };\n }\n else {\n return { msg: messages_1.Message.VADMisfire };\n }\n }\n return {};\n };\n this.process = async (frame) => {\n if (!this.active) {\n return {};\n }\n const probs = await this.modelProcessFunc(frame);\n this.audioBuffer.push({\n frame,\n isSpeech: probs.isSpeech >= this.options.positiveSpeechThreshold,\n });\n if (probs.isSpeech >= this.options.positiveSpeechThreshold &&\n this.redemptionCounter) {\n this.redemptionCounter = 0;\n }\n if (probs.isSpeech >= this.options.positiveSpeechThreshold &&\n !this.speaking) {\n this.speaking = true;\n return { probs, msg: messages_1.Message.SpeechStart };\n }\n if (probs.isSpeech < 
this.options.negativeSpeechThreshold &&\n this.speaking &&\n ++this.redemptionCounter >= this.options.redemptionFrames) {\n this.redemptionCounter = 0;\n this.speaking = false;\n const audioBuffer = this.audioBuffer;\n this.audioBuffer = [];\n const speechFrameCount = audioBuffer.reduce((acc, item) => {\n return acc + +item.isSpeech;\n }, 0);\n if (speechFrameCount >= this.options.minSpeechFrames) {\n const audio = concatArrays(audioBuffer.map((item) => item.frame));\n return { probs, msg: messages_1.Message.SpeechEnd, audio };\n }\n else {\n return { probs, msg: messages_1.Message.VADMisfire };\n }\n }\n if (!this.speaking) {\n while (this.audioBuffer.length > this.options.preSpeechPadFrames) {\n this.audioBuffer.shift();\n }\n }\n return { probs };\n };\n this.audioBuffer = [];\n this.reset();\n }\n}\nexports.FrameProcessor = FrameProcessor;\n//# sourceMappingURL=frame-processor.js.map\n\n//# sourceURL=webpack://@ricky0123/vad-web/./dist/_common/frame-processor.js?");
eval("\n/*\nSome of this code, together with the default options found in index.ts,\nwere taken (or took inspiration) from https://github.com/snakers4/silero-vad\n*/\nObject.defineProperty(exports, \"__esModule\", ({ value: true }));\nexports.FrameProcessor = exports.validateOptions = exports.defaultFrameProcessorOptions = void 0;\nconst messages_1 = __webpack_require__(/*! ./messages */ \"./dist/_common/messages.js\");\nconst logging_1 = __webpack_require__(/*! ./logging */ \"./dist/_common/logging.js\");\nconst RECOMMENDED_FRAME_SAMPLES = [512, 1024, 1536];\nexports.defaultFrameProcessorOptions = {\n positiveSpeechThreshold: 0.5,\n negativeSpeechThreshold: 0.5 - 0.15,\n preSpeechPadFrames: 1,\n redemptionFrames: 8,\n frameSamples: 1536,\n minSpeechFrames: 3,\n submitUserSpeechOnPause: false,\n};\nfunction validateOptions(options) {\n if (!RECOMMENDED_FRAME_SAMPLES.includes(options.frameSamples)) {\n logging_1.log.warn(\"You are using an unusual frame size\");\n }\n if (options.positiveSpeechThreshold < 0 ||\n options.negativeSpeechThreshold > 1) {\n logging_1.log.error(\"postiveSpeechThreshold should be a number between 0 and 1\");\n }\n if (options.negativeSpeechThreshold < 0 ||\n options.negativeSpeechThreshold > options.positiveSpeechThreshold) {\n logging_1.log.error(\"negativeSpeechThreshold should be between 0 and postiveSpeechThreshold\");\n }\n if (options.preSpeechPadFrames < 0) {\n logging_1.log.error(\"preSpeechPadFrames should be positive\");\n }\n if (options.redemptionFrames < 0) {\n logging_1.log.error(\"preSpeechPadFrames should be positive\");\n }\n}\nexports.validateOptions = validateOptions;\nconst concatArrays = (arrays) => {\n const sizes = arrays.reduce((out, next) => {\n out.push(out.at(-1) + next.length);\n return out;\n }, [0]);\n const outArray = new Float32Array(sizes.at(-1));\n arrays.forEach((arr, index) => {\n const place = sizes[index];\n outArray.set(arr, place);\n });\n return outArray;\n};\nclass FrameProcessor {\n constructor(modelProcessFunc, modelResetFunc, options) {\n this.modelProcessFunc = modelProcessFunc;\n this.modelResetFunc = modelResetFunc;\n this.options = options;\n this.speaking = false;\n this.redemptionCounter = 0;\n this.active = false;\n this.reset = () => {\n this.speaking = false;\n this.audioBuffer = [];\n this.modelResetFunc();\n this.redemptionCounter = 0;\n };\n this.pause = () => {\n this.active = false;\n if (this.options.submitUserSpeechOnPause) {\n return this.endSegment();\n }\n else {\n this.reset();\n return {};\n }\n };\n this.resume = () => {\n this.active = true;\n };\n this.endSegment = () => {\n const audioBuffer = this.audioBuffer;\n this.audioBuffer = [];\n const speaking = this.speaking;\n this.reset();\n const speechFrameCount = audioBuffer.reduce((acc, item) => {\n return acc + +item.isSpeech;\n }, 0);\n if (speaking) {\n if (speechFrameCount >= this.options.minSpeechFrames) {\n const audio = concatArrays(audioBuffer.map((item) => item.frame));\n return { msg: messages_1.Message.SpeechEnd, audio };\n }\n else {\n return { msg: messages_1.Message.VADMisfire };\n }\n }\n return {};\n };\n this.process = async (frame) => {\n if (!this.active) {\n return {};\n }\n const probs = await this.modelProcessFunc(frame);\n this.audioBuffer.push({\n frame,\n isSpeech: probs.isSpeech >= this.options.positiveSpeechThreshold,\n });\n if (probs.isSpeech >= this.options.positiveSpeechThreshold &&\n this.redemptionCounter) {\n this.redemptionCounter = 0;\n }\n if (probs.isSpeech >= this.options.positiveSpeechThreshold &&\n 
!this.speaking) {\n this.speaking = true;\n return { probs, msg: messages_1.Message.SpeechStart };\n }\n if (probs.isSpeech < this.options.negativeSpeechThreshold &&\n this.speaking &&\n ++this.redemptionCounter >= this.options.redemptionFrames) {\n this.redemptionCounter = 0;\n this.speaking = false;\n const audioBuffer = this.audioBuffer;\n this.audioBuffer = [];\n const speechFrameCount = audioBuffer.reduce((acc, item) => {\n return acc + +item.isSpeech;\n }, 0);\n if (speechFrameCount >= this.options.minSpeechFrames) {\n const audio = concatArrays(audioBuffer.map((item) => item.frame));\n return { probs, msg: messages_1.Message.SpeechEnd, audio };\n }\n else {\n return { probs, msg: messages_1.Message.VADMisfire };\n }\n }\n if (!this.speaking) {\n while (this.audioBuffer.length > this.options.preSpeechPadFrames) {\n this.audioBuffer.shift();\n }\n }\n return { probs };\n };\n this.audioBuffer = [];\n this.reset();\n }\n}\nexports.FrameProcessor = FrameProcessor;\n//# sourceMappingURL=frame-processor.js.map\n\n//# sourceURL=webpack://@ricky0123/vad-web/./dist/_common/frame-processor.js?");
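
For readability, here is a de-minified sketch of the new `FrameProcessor.pause()` path shown in the bundle above; the state shape is abbreviated, but the branch structure mirrors the diff:

type SegmentResult = { msg?: string; audio?: Float32Array }

interface ProcessorState {
  active: boolean
  options: { submitUserSpeechOnPause: boolean }
  endSegment(): SegmentResult // SpeechEnd + audio, VADMisfire, or {}
  reset(): void
}

function pause(state: ProcessorState): SegmentResult {
  state.active = false
  if (state.options.submitUserSpeechOnPause) {
    // Flush the buffer: SpeechEnd (with audio) if the segment contained at
    // least minSpeechFrames speech frames, otherwise VADMisfire.
    return state.endSegment()
  }
  // Pre-0.0.15 behavior: drop buffered audio and report nothing.
  state.reset()
  return {}
}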

@@ -70,3 +70,3 @@ /***/ }),

eval("\nObject.defineProperty(exports, \"__esModule\", ({ value: true }));\nexports.PlatformAgnosticNonRealTimeVAD = exports.defaultNonRealTimeVADOptions = void 0;\nconst frame_processor_1 = __webpack_require__(/*! ./frame-processor */ \"./dist/_common/frame-processor.js\");\nconst messages_1 = __webpack_require__(/*! ./messages */ \"./dist/_common/messages.js\");\nconst models_1 = __webpack_require__(/*! ./models */ \"./dist/_common/models.js\");\nconst resampler_1 = __webpack_require__(/*! ./resampler */ \"./dist/_common/resampler.js\");\nexports.defaultNonRealTimeVADOptions = {\n ...frame_processor_1.defaultFrameProcessorOptions,\n};\nclass PlatformAgnosticNonRealTimeVAD {\n static async _new(modelFetcher, ort, options = {}) {\n const vad = new this(modelFetcher, ort, {\n ...exports.defaultNonRealTimeVADOptions,\n ...options,\n });\n await vad.init();\n return vad;\n }\n constructor(modelFetcher, ort, options) {\n this.modelFetcher = modelFetcher;\n this.ort = ort;\n this.options = options;\n this.init = async () => {\n const model = await models_1.Silero.new(this.ort, this.modelFetcher);\n this.frameProcessor = new frame_processor_1.FrameProcessor(model.process, model.reset_state, {\n frameSamples: this.options.frameSamples,\n positiveSpeechThreshold: this.options.positiveSpeechThreshold,\n negativeSpeechThreshold: this.options.negativeSpeechThreshold,\n redemptionFrames: this.options.redemptionFrames,\n preSpeechPadFrames: this.options.preSpeechPadFrames,\n minSpeechFrames: this.options.minSpeechFrames,\n });\n this.frameProcessor.resume();\n };\n this.run = async function* (inputAudio, sampleRate) {\n const resamplerOptions = {\n nativeSampleRate: sampleRate,\n targetSampleRate: 16000,\n targetFrameSize: this.options.frameSamples,\n };\n const resampler = new resampler_1.Resampler(resamplerOptions);\n const frames = resampler.process(inputAudio);\n let start, end;\n for (const i of [...Array(frames.length)].keys()) {\n const f = frames[i];\n const { msg, audio } = await this.frameProcessor.process(f);\n switch (msg) {\n case messages_1.Message.SpeechStart:\n start = (i * this.options.frameSamples) / 16;\n break;\n case messages_1.Message.SpeechEnd:\n end = ((i + 1) * this.options.frameSamples) / 16;\n // @ts-ignore\n yield { audio, start, end };\n break;\n default:\n break;\n }\n }\n const { msg, audio } = this.frameProcessor.endSegment();\n if (msg == messages_1.Message.SpeechEnd) {\n yield {\n audio,\n // @ts-ignore\n start,\n end: (frames.length * this.options.frameSamples) / 16,\n };\n }\n };\n (0, frame_processor_1.validateOptions)(options);\n }\n}\nexports.PlatformAgnosticNonRealTimeVAD = PlatformAgnosticNonRealTimeVAD;\n//# sourceMappingURL=non-real-time-vad.js.map\n\n//# sourceURL=webpack://@ricky0123/vad-web/./dist/_common/non-real-time-vad.js?");
eval("\nObject.defineProperty(exports, \"__esModule\", ({ value: true }));\nexports.PlatformAgnosticNonRealTimeVAD = exports.defaultNonRealTimeVADOptions = void 0;\nconst frame_processor_1 = __webpack_require__(/*! ./frame-processor */ \"./dist/_common/frame-processor.js\");\nconst messages_1 = __webpack_require__(/*! ./messages */ \"./dist/_common/messages.js\");\nconst models_1 = __webpack_require__(/*! ./models */ \"./dist/_common/models.js\");\nconst resampler_1 = __webpack_require__(/*! ./resampler */ \"./dist/_common/resampler.js\");\nexports.defaultNonRealTimeVADOptions = {\n ...frame_processor_1.defaultFrameProcessorOptions,\n};\nclass PlatformAgnosticNonRealTimeVAD {\n static async _new(modelFetcher, ort, options = {}) {\n const vad = new this(modelFetcher, ort, {\n ...exports.defaultNonRealTimeVADOptions,\n ...options,\n });\n await vad.init();\n return vad;\n }\n constructor(modelFetcher, ort, options) {\n this.modelFetcher = modelFetcher;\n this.ort = ort;\n this.options = options;\n this.init = async () => {\n const model = await models_1.Silero.new(this.ort, this.modelFetcher);\n this.frameProcessor = new frame_processor_1.FrameProcessor(model.process, model.reset_state, {\n frameSamples: this.options.frameSamples,\n positiveSpeechThreshold: this.options.positiveSpeechThreshold,\n negativeSpeechThreshold: this.options.negativeSpeechThreshold,\n redemptionFrames: this.options.redemptionFrames,\n preSpeechPadFrames: this.options.preSpeechPadFrames,\n minSpeechFrames: this.options.minSpeechFrames,\n submitUserSpeechOnPause: this.options.submitUserSpeechOnPause,\n });\n this.frameProcessor.resume();\n };\n this.run = async function* (inputAudio, sampleRate) {\n const resamplerOptions = {\n nativeSampleRate: sampleRate,\n targetSampleRate: 16000,\n targetFrameSize: this.options.frameSamples,\n };\n const resampler = new resampler_1.Resampler(resamplerOptions);\n const frames = resampler.process(inputAudio);\n let start, end;\n for (const i of [...Array(frames.length)].keys()) {\n const f = frames[i];\n const { msg, audio } = await this.frameProcessor.process(f);\n switch (msg) {\n case messages_1.Message.SpeechStart:\n start = (i * this.options.frameSamples) / 16;\n break;\n case messages_1.Message.SpeechEnd:\n end = ((i + 1) * this.options.frameSamples) / 16;\n // @ts-ignore\n yield { audio, start, end };\n break;\n default:\n break;\n }\n }\n const { msg, audio } = this.frameProcessor.endSegment();\n if (msg == messages_1.Message.SpeechEnd) {\n yield {\n audio,\n // @ts-ignore\n start,\n end: (frames.length * this.options.frameSamples) / 16,\n };\n }\n };\n (0, frame_processor_1.validateOptions)(options);\n }\n}\nexports.PlatformAgnosticNonRealTimeVAD = PlatformAgnosticNonRealTimeVAD;\n//# sourceMappingURL=non-real-time-vad.js.map\n\n//# sourceURL=webpack://@ricky0123/vad-web/./dist/_common/non-real-time-vad.js?");

@@ -73,0 +73,0 @@ /***/ }),

@@ -1,1 +0,1 @@

(()=>{"use strict";var e={428:(e,t,s)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.FrameProcessor=t.validateOptions=t.defaultFrameProcessorOptions=void 0;const i=s(294),r=s(842),o=[512,1024,1536];t.defaultFrameProcessorOptions={positiveSpeechThreshold:.5,negativeSpeechThreshold:.35,preSpeechPadFrames:1,redemptionFrames:8,frameSamples:1536,minSpeechFrames:3},t.validateOptions=function(e){o.includes(e.frameSamples)||r.log.warn("You are using an unusual frame size"),(e.positiveSpeechThreshold<0||e.negativeSpeechThreshold>1)&&r.log.error("postiveSpeechThreshold should be a number between 0 and 1"),(e.negativeSpeechThreshold<0||e.negativeSpeechThreshold>e.positiveSpeechThreshold)&&r.log.error("negativeSpeechThreshold should be between 0 and postiveSpeechThreshold"),e.preSpeechPadFrames<0&&r.log.error("preSpeechPadFrames should be positive"),e.redemptionFrames<0&&r.log.error("preSpeechPadFrames should be positive")};const n=e=>{const t=e.reduce(((e,t)=>(e.push(e.at(-1)+t.length),e)),[0]),s=new Float32Array(t.at(-1));return e.forEach(((e,i)=>{const r=t[i];s.set(e,r)})),s};t.FrameProcessor=class{constructor(e,t,s){this.modelProcessFunc=e,this.modelResetFunc=t,this.options=s,this.speaking=!1,this.redemptionCounter=0,this.active=!1,this.reset=()=>{this.speaking=!1,this.audioBuffer=[],this.modelResetFunc(),this.redemptionCounter=0},this.pause=()=>{this.active=!1,this.reset()},this.resume=()=>{this.active=!0},this.endSegment=()=>{const e=this.audioBuffer;this.audioBuffer=[];const t=this.speaking;this.reset();const s=e.reduce(((e,t)=>e+ +t.isSpeech),0);if(t){if(s>=this.options.minSpeechFrames){const t=n(e.map((e=>e.frame)));return{msg:i.Message.SpeechEnd,audio:t}}return{msg:i.Message.VADMisfire}}return{}},this.process=async e=>{if(!this.active)return{};const t=await this.modelProcessFunc(e);if(this.audioBuffer.push({frame:e,isSpeech:t.isSpeech>=this.options.positiveSpeechThreshold}),t.isSpeech>=this.options.positiveSpeechThreshold&&this.redemptionCounter&&(this.redemptionCounter=0),t.isSpeech>=this.options.positiveSpeechThreshold&&!this.speaking)return this.speaking=!0,{probs:t,msg:i.Message.SpeechStart};if(t.isSpeech<this.options.negativeSpeechThreshold&&this.speaking&&++this.redemptionCounter>=this.options.redemptionFrames){this.redemptionCounter=0,this.speaking=!1;const e=this.audioBuffer;if(this.audioBuffer=[],e.reduce(((e,t)=>e+ +t.isSpeech),0)>=this.options.minSpeechFrames){const s=n(e.map((e=>e.frame)));return{probs:t,msg:i.Message.SpeechEnd,audio:s}}return{probs:t,msg:i.Message.VADMisfire}}if(!this.speaking)for(;this.audioBuffer.length>this.options.preSpeechPadFrames;)this.audioBuffer.shift();return{probs:t}},this.audioBuffer=[],this.reset()}}},14:function(e,t,s){var i=this&&this.__createBinding||(Object.create?function(e,t,s,i){void 0===i&&(i=s);var r=Object.getOwnPropertyDescriptor(t,s);r&&!("get"in r?!t.__esModule:r.writable||r.configurable)||(r={enumerable:!0,get:function(){return t[s]}}),Object.defineProperty(e,i,r)}:function(e,t,s,i){void 0===i&&(i=s),e[i]=t[s]}),r=this&&this.__setModuleDefault||(Object.create?function(e,t){Object.defineProperty(e,"default",{enumerable:!0,value:t})}:function(e,t){e.default=t}),o=this&&this.__importStar||function(e){if(e&&e.__esModule)return e;var t={};if(null!=e)for(var s in e)"default"!==s&&Object.prototype.hasOwnProperty.call(e,s)&&i(t,e,s);return r(t,e),t},n=this&&this.__exportStar||function(e,t){for(var s in e)"default"===s||Object.prototype.hasOwnProperty.call(t,s)||i(t,e,s)};Object.defineProperty(t,"__esModule",{value:!0}),t.utils=void 
0;const a=o(s(26));t.utils={minFramesForTargetMS:a.minFramesForTargetMS,arrayBufferToBase64:a.arrayBufferToBase64,encodeWAV:a.encodeWAV},n(s(405),t),n(s(428),t),n(s(294),t),n(s(842),t),n(s(260),t),n(s(724),t)},842:(e,t)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.log=t.LOG_PREFIX=void 0,t.LOG_PREFIX="[VAD]";const s=["error","debug","warn"].reduce(((e,s)=>(e[s]=function(e){return(...s)=>{console[e](t.LOG_PREFIX,...s)}}(s),e)),{});t.log=s},294:(e,t)=>{var s;Object.defineProperty(t,"__esModule",{value:!0}),t.Message=void 0,function(e){e.AudioFrame="AUDIO_FRAME",e.SpeechStart="SPEECH_START",e.VADMisfire="VAD_MISFIRE",e.SpeechEnd="SPEECH_END",e.SpeechStop="SPEECH_STOP"}(s||(t.Message=s={}))},260:(e,t,s)=>{var i;Object.defineProperty(t,"__esModule",{value:!0}),t.Silero=void 0;const r=s(842);class o{constructor(e,t){this.ort=e,this.modelFetcher=t,this.init=async()=>{r.log.debug("initializing vad");const e=await this.modelFetcher();this._session=await this.ort.InferenceSession.create(e),this._sr=new this.ort.Tensor("int64",[16000n]),this.reset_state(),r.log.debug("vad is initialized")},this.reset_state=()=>{const e=Array(128).fill(0);this._h=new this.ort.Tensor("float32",e,[2,1,64]),this._c=new this.ort.Tensor("float32",e,[2,1,64])},this.process=async e=>{const t={input:new this.ort.Tensor("float32",e,[1,e.length]),h:this._h,c:this._c,sr:this._sr},s=await this._session.run(t);this._h=s.hn,this._c=s.cn;const[i]=s.output.data;return{notSpeech:1-i,isSpeech:i}}}}t.Silero=o,i=o,o.new=async(e,t)=>{const s=new i(e,t);return await s.init(),s}},405:(e,t,s)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.PlatformAgnosticNonRealTimeVAD=t.defaultNonRealTimeVADOptions=void 0;const i=s(428),r=s(294),o=s(260),n=s(724);t.defaultNonRealTimeVADOptions={...i.defaultFrameProcessorOptions},t.PlatformAgnosticNonRealTimeVAD=class{static async _new(e,s,i={}){const r=new this(e,s,{...t.defaultNonRealTimeVADOptions,...i});return await r.init(),r}constructor(e,t,s){this.modelFetcher=e,this.ort=t,this.options=s,this.init=async()=>{const e=await o.Silero.new(this.ort,this.modelFetcher);this.frameProcessor=new i.FrameProcessor(e.process,e.reset_state,{frameSamples:this.options.frameSamples,positiveSpeechThreshold:this.options.positiveSpeechThreshold,negativeSpeechThreshold:this.options.negativeSpeechThreshold,redemptionFrames:this.options.redemptionFrames,preSpeechPadFrames:this.options.preSpeechPadFrames,minSpeechFrames:this.options.minSpeechFrames}),this.frameProcessor.resume()},this.run=async function*(e,t){const s={nativeSampleRate:t,targetSampleRate:16e3,targetFrameSize:this.options.frameSamples},i=new n.Resampler(s).process(e);let o,a;for(const e of[...Array(i.length)].keys()){const t=i[e],{msg:s,audio:n}=await this.frameProcessor.process(t);switch(s){case r.Message.SpeechStart:o=e*this.options.frameSamples/16;break;case r.Message.SpeechEnd:a=(e+1)*this.options.frameSamples/16,yield{audio:n,start:o,end:a}}}const{msg:h,audio:p}=this.frameProcessor.endSegment();h==r.Message.SpeechEnd&&(yield{audio:p,start:o,end:i.length*this.options.frameSamples/16})},(0,i.validateOptions)(s)}}},724:(e,t,s)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.Resampler=void 0;const i=s(842);t.Resampler=class{constructor(e){this.options=e,this.process=e=>{const t=[];for(const t of e)this.inputBuffer.push(t);for(;this.inputBuffer.length*this.options.targetSampleRate/this.options.nativeSampleRate>this.options.targetFrameSize;){const e=new Float32Array(this.options.targetFrameSize);let 
s=0,i=0;for(;s<this.options.targetFrameSize;){let t=0,r=0;for(;i<Math.min(this.inputBuffer.length,(s+1)*this.options.nativeSampleRate/this.options.targetSampleRate);)t+=this.inputBuffer[i],r++,i++;e[s]=t/r,s++}this.inputBuffer=this.inputBuffer.slice(i),t.push(e)}return t},e.nativeSampleRate<16e3&&i.log.error("nativeSampleRate is too low. Should have 16000 = targetSampleRate <= nativeSampleRate"),this.inputBuffer=[]}}},26:(e,t)=>{function s(e,t,s){for(var i=0;i<s.length;i++)e.setUint8(t+i,s.charCodeAt(i))}Object.defineProperty(t,"__esModule",{value:!0}),t.encodeWAV=t.arrayBufferToBase64=t.minFramesForTargetMS=void 0,t.minFramesForTargetMS=function(e,t,s=16e3){return Math.ceil(e*s/1e3/t)},t.arrayBufferToBase64=function(e){for(var t="",s=new Uint8Array(e),i=s.byteLength,r=0;r<i;r++)t+=String.fromCharCode(s[r]);return btoa(t)},t.encodeWAV=function(e,t=3,i=16e3,r=1,o=32){var n=o/8,a=r*n,h=new ArrayBuffer(44+e.length*n),p=new DataView(h);return s(p,0,"RIFF"),p.setUint32(4,36+e.length*n,!0),s(p,8,"WAVE"),s(p,12,"fmt "),p.setUint32(16,16,!0),p.setUint16(20,t,!0),p.setUint16(22,r,!0),p.setUint32(24,i,!0),p.setUint32(28,i*a,!0),p.setUint16(32,a,!0),p.setUint16(34,o,!0),s(p,36,"data"),p.setUint32(40,e.length*n,!0),1===t?function(e,t,s){for(var i=0;i<s.length;i++,t+=2){var r=Math.max(-1,Math.min(1,s[i]));e.setInt16(t,r<0?32768*r:32767*r,!0)}}(p,44,e):function(e,t,s){for(var i=0;i<s.length;i++,t+=4)e.setFloat32(t,s[i],!0)}(p,44,e),h}}},t={};function s(i){var r=t[i];if(void 0!==r)return r.exports;var o=t[i]={exports:{}};return e[i].call(o.exports,o,o.exports,s),o.exports}(()=>{const e=s(14);class t extends AudioWorkletProcessor{constructor(t){super(),this._initialized=!1,this._stopProcessing=!1,this.init=async()=>{e.log.debug("initializing worklet"),this.resampler=new e.Resampler({nativeSampleRate:sampleRate,targetSampleRate:16e3,targetFrameSize:this.options.frameSamples}),this._initialized=!0,e.log.debug("initialized worklet")},this.options=t.processorOptions,this.port.onmessage=t=>{t.data.message===e.Message.SpeechStop&&(this._stopProcessing=!0)},this.init()}process(t,s,i){if(this._stopProcessing)return!1;const r=t[0][0];if(this._initialized&&r instanceof Float32Array){const t=this.resampler.process(r);for(const s of t)this.port.postMessage({message:e.Message.AudioFrame,data:s.buffer},[s.buffer])}return!0}}registerProcessor("vad-helper-worklet",t)})()})();
(()=>{"use strict";var e={428:(e,t,s)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.FrameProcessor=t.validateOptions=t.defaultFrameProcessorOptions=void 0;const i=s(294),r=s(842),o=[512,1024,1536];t.defaultFrameProcessorOptions={positiveSpeechThreshold:.5,negativeSpeechThreshold:.35,preSpeechPadFrames:1,redemptionFrames:8,frameSamples:1536,minSpeechFrames:3,submitUserSpeechOnPause:!1},t.validateOptions=function(e){o.includes(e.frameSamples)||r.log.warn("You are using an unusual frame size"),(e.positiveSpeechThreshold<0||e.negativeSpeechThreshold>1)&&r.log.error("postiveSpeechThreshold should be a number between 0 and 1"),(e.negativeSpeechThreshold<0||e.negativeSpeechThreshold>e.positiveSpeechThreshold)&&r.log.error("negativeSpeechThreshold should be between 0 and postiveSpeechThreshold"),e.preSpeechPadFrames<0&&r.log.error("preSpeechPadFrames should be positive"),e.redemptionFrames<0&&r.log.error("preSpeechPadFrames should be positive")};const n=e=>{const t=e.reduce(((e,t)=>(e.push(e.at(-1)+t.length),e)),[0]),s=new Float32Array(t.at(-1));return e.forEach(((e,i)=>{const r=t[i];s.set(e,r)})),s};t.FrameProcessor=class{constructor(e,t,s){this.modelProcessFunc=e,this.modelResetFunc=t,this.options=s,this.speaking=!1,this.redemptionCounter=0,this.active=!1,this.reset=()=>{this.speaking=!1,this.audioBuffer=[],this.modelResetFunc(),this.redemptionCounter=0},this.pause=()=>(this.active=!1,this.options.submitUserSpeechOnPause?this.endSegment():(this.reset(),{})),this.resume=()=>{this.active=!0},this.endSegment=()=>{const e=this.audioBuffer;this.audioBuffer=[];const t=this.speaking;this.reset();const s=e.reduce(((e,t)=>e+ +t.isSpeech),0);if(t){if(s>=this.options.minSpeechFrames){const t=n(e.map((e=>e.frame)));return{msg:i.Message.SpeechEnd,audio:t}}return{msg:i.Message.VADMisfire}}return{}},this.process=async e=>{if(!this.active)return{};const t=await this.modelProcessFunc(e);if(this.audioBuffer.push({frame:e,isSpeech:t.isSpeech>=this.options.positiveSpeechThreshold}),t.isSpeech>=this.options.positiveSpeechThreshold&&this.redemptionCounter&&(this.redemptionCounter=0),t.isSpeech>=this.options.positiveSpeechThreshold&&!this.speaking)return this.speaking=!0,{probs:t,msg:i.Message.SpeechStart};if(t.isSpeech<this.options.negativeSpeechThreshold&&this.speaking&&++this.redemptionCounter>=this.options.redemptionFrames){this.redemptionCounter=0,this.speaking=!1;const e=this.audioBuffer;if(this.audioBuffer=[],e.reduce(((e,t)=>e+ +t.isSpeech),0)>=this.options.minSpeechFrames){const s=n(e.map((e=>e.frame)));return{probs:t,msg:i.Message.SpeechEnd,audio:s}}return{probs:t,msg:i.Message.VADMisfire}}if(!this.speaking)for(;this.audioBuffer.length>this.options.preSpeechPadFrames;)this.audioBuffer.shift();return{probs:t}},this.audioBuffer=[],this.reset()}}},14:function(e,t,s){var i=this&&this.__createBinding||(Object.create?function(e,t,s,i){void 0===i&&(i=s);var r=Object.getOwnPropertyDescriptor(t,s);r&&!("get"in r?!t.__esModule:r.writable||r.configurable)||(r={enumerable:!0,get:function(){return t[s]}}),Object.defineProperty(e,i,r)}:function(e,t,s,i){void 0===i&&(i=s),e[i]=t[s]}),r=this&&this.__setModuleDefault||(Object.create?function(e,t){Object.defineProperty(e,"default",{enumerable:!0,value:t})}:function(e,t){e.default=t}),o=this&&this.__importStar||function(e){if(e&&e.__esModule)return e;var t={};if(null!=e)for(var s in e)"default"!==s&&Object.prototype.hasOwnProperty.call(e,s)&&i(t,e,s);return r(t,e),t},n=this&&this.__exportStar||function(e,t){for(var s in 
e)"default"===s||Object.prototype.hasOwnProperty.call(t,s)||i(t,e,s)};Object.defineProperty(t,"__esModule",{value:!0}),t.utils=void 0;const a=o(s(26));t.utils={minFramesForTargetMS:a.minFramesForTargetMS,arrayBufferToBase64:a.arrayBufferToBase64,encodeWAV:a.encodeWAV},n(s(405),t),n(s(428),t),n(s(294),t),n(s(842),t),n(s(260),t),n(s(724),t)},842:(e,t)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.log=t.LOG_PREFIX=void 0,t.LOG_PREFIX="[VAD]";const s=["error","debug","warn"].reduce(((e,s)=>(e[s]=function(e){return(...s)=>{console[e](t.LOG_PREFIX,...s)}}(s),e)),{});t.log=s},294:(e,t)=>{var s;Object.defineProperty(t,"__esModule",{value:!0}),t.Message=void 0,function(e){e.AudioFrame="AUDIO_FRAME",e.SpeechStart="SPEECH_START",e.VADMisfire="VAD_MISFIRE",e.SpeechEnd="SPEECH_END",e.SpeechStop="SPEECH_STOP"}(s||(t.Message=s={}))},260:(e,t,s)=>{var i;Object.defineProperty(t,"__esModule",{value:!0}),t.Silero=void 0;const r=s(842);class o{constructor(e,t){this.ort=e,this.modelFetcher=t,this.init=async()=>{r.log.debug("initializing vad");const e=await this.modelFetcher();this._session=await this.ort.InferenceSession.create(e),this._sr=new this.ort.Tensor("int64",[16000n]),this.reset_state(),r.log.debug("vad is initialized")},this.reset_state=()=>{const e=Array(128).fill(0);this._h=new this.ort.Tensor("float32",e,[2,1,64]),this._c=new this.ort.Tensor("float32",e,[2,1,64])},this.process=async e=>{const t={input:new this.ort.Tensor("float32",e,[1,e.length]),h:this._h,c:this._c,sr:this._sr},s=await this._session.run(t);this._h=s.hn,this._c=s.cn;const[i]=s.output.data;return{notSpeech:1-i,isSpeech:i}}}}t.Silero=o,i=o,o.new=async(e,t)=>{const s=new i(e,t);return await s.init(),s}},405:(e,t,s)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.PlatformAgnosticNonRealTimeVAD=t.defaultNonRealTimeVADOptions=void 0;const i=s(428),r=s(294),o=s(260),n=s(724);t.defaultNonRealTimeVADOptions={...i.defaultFrameProcessorOptions},t.PlatformAgnosticNonRealTimeVAD=class{static async _new(e,s,i={}){const r=new this(e,s,{...t.defaultNonRealTimeVADOptions,...i});return await r.init(),r}constructor(e,t,s){this.modelFetcher=e,this.ort=t,this.options=s,this.init=async()=>{const e=await o.Silero.new(this.ort,this.modelFetcher);this.frameProcessor=new i.FrameProcessor(e.process,e.reset_state,{frameSamples:this.options.frameSamples,positiveSpeechThreshold:this.options.positiveSpeechThreshold,negativeSpeechThreshold:this.options.negativeSpeechThreshold,redemptionFrames:this.options.redemptionFrames,preSpeechPadFrames:this.options.preSpeechPadFrames,minSpeechFrames:this.options.minSpeechFrames,submitUserSpeechOnPause:this.options.submitUserSpeechOnPause}),this.frameProcessor.resume()},this.run=async function*(e,t){const s={nativeSampleRate:t,targetSampleRate:16e3,targetFrameSize:this.options.frameSamples},i=new n.Resampler(s).process(e);let o,a;for(const e of[...Array(i.length)].keys()){const t=i[e],{msg:s,audio:n}=await this.frameProcessor.process(t);switch(s){case r.Message.SpeechStart:o=e*this.options.frameSamples/16;break;case r.Message.SpeechEnd:a=(e+1)*this.options.frameSamples/16,yield{audio:n,start:o,end:a}}}const{msg:h,audio:p}=this.frameProcessor.endSegment();h==r.Message.SpeechEnd&&(yield{audio:p,start:o,end:i.length*this.options.frameSamples/16})},(0,i.validateOptions)(s)}}},724:(e,t,s)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.Resampler=void 0;const i=s(842);t.Resampler=class{constructor(e){this.options=e,this.process=e=>{const t=[];for(const t of 
e)this.inputBuffer.push(t);for(;this.inputBuffer.length*this.options.targetSampleRate/this.options.nativeSampleRate>this.options.targetFrameSize;){const e=new Float32Array(this.options.targetFrameSize);let s=0,i=0;for(;s<this.options.targetFrameSize;){let t=0,r=0;for(;i<Math.min(this.inputBuffer.length,(s+1)*this.options.nativeSampleRate/this.options.targetSampleRate);)t+=this.inputBuffer[i],r++,i++;e[s]=t/r,s++}this.inputBuffer=this.inputBuffer.slice(i),t.push(e)}return t},e.nativeSampleRate<16e3&&i.log.error("nativeSampleRate is too low. Should have 16000 = targetSampleRate <= nativeSampleRate"),this.inputBuffer=[]}}},26:(e,t)=>{function s(e,t,s){for(var i=0;i<s.length;i++)e.setUint8(t+i,s.charCodeAt(i))}Object.defineProperty(t,"__esModule",{value:!0}),t.encodeWAV=t.arrayBufferToBase64=t.minFramesForTargetMS=void 0,t.minFramesForTargetMS=function(e,t,s=16e3){return Math.ceil(e*s/1e3/t)},t.arrayBufferToBase64=function(e){for(var t="",s=new Uint8Array(e),i=s.byteLength,r=0;r<i;r++)t+=String.fromCharCode(s[r]);return btoa(t)},t.encodeWAV=function(e,t=3,i=16e3,r=1,o=32){var n=o/8,a=r*n,h=new ArrayBuffer(44+e.length*n),p=new DataView(h);return s(p,0,"RIFF"),p.setUint32(4,36+e.length*n,!0),s(p,8,"WAVE"),s(p,12,"fmt "),p.setUint32(16,16,!0),p.setUint16(20,t,!0),p.setUint16(22,r,!0),p.setUint32(24,i,!0),p.setUint32(28,i*a,!0),p.setUint16(32,a,!0),p.setUint16(34,o,!0),s(p,36,"data"),p.setUint32(40,e.length*n,!0),1===t?function(e,t,s){for(var i=0;i<s.length;i++,t+=2){var r=Math.max(-1,Math.min(1,s[i]));e.setInt16(t,r<0?32768*r:32767*r,!0)}}(p,44,e):function(e,t,s){for(var i=0;i<s.length;i++,t+=4)e.setFloat32(t,s[i],!0)}(p,44,e),h}}},t={};function s(i){var r=t[i];if(void 0!==r)return r.exports;var o=t[i]={exports:{}};return e[i].call(o.exports,o,o.exports,s),o.exports}(()=>{const e=s(14);class t extends AudioWorkletProcessor{constructor(t){super(),this._initialized=!1,this._stopProcessing=!1,this.init=async()=>{e.log.debug("initializing worklet"),this.resampler=new e.Resampler({nativeSampleRate:sampleRate,targetSampleRate:16e3,targetFrameSize:this.options.frameSamples}),this._initialized=!0,e.log.debug("initialized worklet")},this.options=t.processorOptions,this.port.onmessage=t=>{t.data.message===e.Message.SpeechStop&&(this._stopProcessing=!0)},this.init()}process(t,s,i){if(this._stopProcessing)return!1;const r=t[0][0];if(this._initialized&&r instanceof Float32Array){const t=this.resampler.process(r);for(const s of t)this.port.postMessage({message:e.Message.AudioFrame,data:s.buffer},[s.buffer])}return!0}}registerProcessor("vad-helper-worklet",t)})()})();
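
The worklet bundle also carries the package's `Resampler`, which downsamples to 16 kHz by averaging the native-rate samples that map onto each output sample. A standalone sketch of that scheme (simplified: the real class buffers leftover input between calls and emits fixed-size frames):

function downsampleByAveraging(
  input: Float32Array,
  nativeSampleRate: number,
  targetSampleRate = 16000,
): Float32Array {
  const outLength = Math.floor((input.length * targetSampleRate) / nativeSampleRate)
  const out = new Float32Array(outLength)
  let read = 0
  for (let i = 0; i < outLength; i++) {
    let sum = 0
    let count = 0
    // Consume every input sample that falls before the position of the
    // (i+1)-th output sample in the input stream.
    const end = Math.min(input.length, ((i + 1) * nativeSampleRate) / targetSampleRate)
    while (read < end) {
      sum += input[read]
      count++
      read++
    }
    out[i] = count > 0 ? sum / count : 0
  }
  return out
}

// e.g. 48 kHz -> 16 kHz: each output sample is the mean of 3 input samples.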

@@ -15,3 +15,3 @@ {

"homepage": "https://github.com/ricky0123/vad",
"version": "0.0.14",
"version": "0.0.15",
"license": "ISC",

@@ -18,0 +18,0 @@ "main": "dist/index.js",

