@livekit/agents-plugin-silero
Comparing version 0.4.6 to 0.5.0
dist/index.js
@@ -1,5 +1,9 @@
 // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-export { VAD, VADStream } from './vad.js';
+import { VAD, VADStream } from "./vad.js";
+export {
+  VAD,
+  VADStream
+};
 //# sourceMappingURL=index.js.map
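Both versions export the same public surface from the entry point; the 0.5.0 build (now bundled by tsup, per the package.json changes below) merely swaps the direct re-export for an import-then-export. Downstream imports are unaffected. A minimal consumer, using the package name from package.json below:

```ts
// Works identically against 0.4.6 and 0.5.0.
import { VAD, VADStream } from '@livekit/agents-plugin-silero';
```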
dist/onnx_model.js
@@ -1,69 +1,68 @@
 // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-import { InferenceSession, Tensor } from 'onnxruntime-node';
-export const newInferenceSession = (forceCPU) => {
-    return InferenceSession.create(new URL('silero_vad.onnx', import.meta.url).pathname, {
-        interOpNumThreads: 1,
-        intraOpNumThreads: 1,
-        executionMode: 'sequential',
-        executionProviders: forceCPU ? [{ name: 'cpu' }] : undefined,
-    });
+import { fileURLToPath } from "node:url";
+import { InferenceSession, Tensor } from "onnxruntime-node";
+const newInferenceSession = (forceCPU) => {
+  return InferenceSession.create(fileURLToPath(new URL("silero_vad.onnx", import.meta.url).href), {
+    interOpNumThreads: 1,
+    intraOpNumThreads: 1,
+    executionMode: "sequential",
+    executionProviders: forceCPU ? [{ name: "cpu" }] : void 0
+  });
 };
-export class OnnxModel {
-    #session;
-    #sampleRate;
-    #windowSizeSamples;
-    #contextSize;
-    #sampleRateNd;
-    #context;
-    // #state: Float32Array;
-    #rnnState;
-    #inputBuffer;
-    constructor(session, sampleRate) {
-        this.#session = session;
-        this.#sampleRate = sampleRate;
-        switch (sampleRate) {
-            case 8000:
-                this.#windowSizeSamples = 256;
-                this.#contextSize = 32;
-                break;
-            case 16000:
-                this.#windowSizeSamples = 512;
-                this.#contextSize = 64;
-                break;
-        }
-        this.#sampleRateNd = BigInt64Array.from([BigInt(sampleRate)]);
-        this.#context = new Float32Array(this.#contextSize);
-        this.#rnnState = new Float32Array(2 * 1 * 128);
-        this.#inputBuffer = new Float32Array(this.#contextSize + this.#windowSizeSamples);
-    }
-    get sampleRate() {
-        return this.#sampleRate;
-    }
-    get windowSizeSamples() {
-        return this.#windowSizeSamples;
-    }
-    get contextSize() {
-        return this.#contextSize;
-    }
-    async run(x) {
-        this.#inputBuffer.set(this.#context, 0);
-        this.#inputBuffer.set(x, this.#contextSize);
-        return await this.#session
-            .run({
-            input: new Tensor('float32', this.#inputBuffer, [
-                1,
-                this.#contextSize + this.#windowSizeSamples,
-            ]),
-            state: new Tensor('float32', this.#rnnState, [2, 1, 128]),
-            sr: new Tensor('int64', this.#sampleRateNd),
-        })
-            .then((result) => {
-            // this.#state = result.output.data as Float32Array,
-            this.#context = this.#inputBuffer.subarray(0, this.#contextSize);
-            return result.output.data.at(0);
-        });
-    }
+class OnnxModel {
+  #session;
+  #sampleRate;
+  #windowSizeSamples;
+  #contextSize;
+  #sampleRateNd;
+  #context;
+  // #state: Float32Array;
+  #rnnState;
+  #inputBuffer;
+  constructor(session, sampleRate) {
+    this.#session = session;
+    this.#sampleRate = sampleRate;
+    switch (sampleRate) {
+      case 8e3:
+        this.#windowSizeSamples = 256;
+        this.#contextSize = 32;
+        break;
+      case 16e3:
+        this.#windowSizeSamples = 512;
+        this.#contextSize = 64;
+        break;
+    }
+    this.#sampleRateNd = BigInt64Array.from([BigInt(sampleRate)]);
+    this.#context = new Float32Array(this.#contextSize);
+    this.#rnnState = new Float32Array(2 * 1 * 128);
+    this.#inputBuffer = new Float32Array(this.#contextSize + this.#windowSizeSamples);
+  }
+  get sampleRate() {
+    return this.#sampleRate;
+  }
+  get windowSizeSamples() {
+    return this.#windowSizeSamples;
+  }
+  get contextSize() {
+    return this.#contextSize;
+  }
+  async run(x) {
+    this.#inputBuffer.set(this.#context, 0);
+    this.#inputBuffer.set(x, this.#contextSize);
+    return await this.#session.run({
+      input: new Tensor("float32", this.#inputBuffer, [
+        1,
+        this.#contextSize + this.#windowSizeSamples
+      ]),
+      state: new Tensor("float32", this.#rnnState, [2, 1, 128]),
+      sr: new Tensor("int64", this.#sampleRateNd)
+    }).then((result) => {
+      this.#context = this.#inputBuffer.subarray(0, this.#contextSize);
+      return result.output.data.at(0);
+    });
+  }
 }
+export {
+  OnnxModel,
+  newInferenceSession
+};
 //# sourceMappingURL=onnx_model.js.map
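The substantive fix in this file is the path resolution. `new URL(...).pathname` keeps percent-encoding and, on Windows, a leading slash (`/C:/...`), which `InferenceSession.create` cannot open; `fileURLToPath` decodes the URL into a platform-correct native path. A small sketch of the difference, assuming a hypothetical Windows install path containing a space:

```ts
import { fileURLToPath } from 'node:url';

const url = new URL('file:///C:/Program%20Files/app/silero_vad.onnx');

console.log(url.pathname);
// "/C:/Program%20Files/app/silero_vad.onnx"
// (percent-encoded, leading slash: unusable as a Windows fs path)

console.log(fileURLToPath(url));
// "C:\\Program Files\\app\\silero_vad.onnx" on Windows
// (decoded, native separators; on POSIX it returns the decoded path instead)
```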
dist/vad.d.ts
@@ -51,3 +51,3 @@ /// <reference path="../src/onnxruntime.d.ts" />
   */
-  static load(opts?: VADOptions): Promise<VAD>;
+  static load(opts?: Partial<VADOptions>): Promise<VAD>;
   stream(): VADStream;
@@ -54,0 +54,0 @@ }
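The only declaration change is the widened `load` signature: `Partial<VADOptions>` lets callers override a single option without restating the rest, which the 0.4.6 type did not allow. A usage sketch (the option name is taken from `defaultVADOptions` in dist/vad.js below):

```ts
import { VAD } from '@livekit/agents-plugin-silero';

// 0.5.0: any subset of VADOptions type-checks; omitted fields
// fall back to the plugin's defaults at runtime.
const vad = await VAD.load({ activationThreshold: 0.6 });
const stream = vad.stream();
```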
dist/vad.js
@@ -1,250 +1,267 @@
 // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-import { ExpFilter, VADEventType, VADStream as baseStream, VAD as baseVAD, log, mergeFrames, } from '@livekit/agents';
-import { AudioFrame, AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node';
-import { OnnxModel, newInferenceSession } from './onnx_model.js';
-const SLOW_INFERENCE_THRESHOLD = 200; // late by 200ms
+import {
+  ExpFilter,
+  VADEventType,
+  VADStream as baseStream,
+  VAD as baseVAD,
+  log,
+  mergeFrames
+} from "@livekit/agents";
+import { AudioFrame, AudioResampler, AudioResamplerQuality } from "@livekit/rtc-node";
+import { OnnxModel, newInferenceSession } from "./onnx_model.js";
+const SLOW_INFERENCE_THRESHOLD = 200;
 const defaultVADOptions = {
-    minSpeechDuration: 50,
-    minSilenceDuration: 250,
-    prefixPaddingDuration: 500,
-    maxBufferedSpeech: 60000,
-    activationThreshold: 0.5,
-    sampleRate: 16000,
-    forceCPU: true,
+  minSpeechDuration: 50,
+  minSilenceDuration: 250,
+  prefixPaddingDuration: 500,
+  maxBufferedSpeech: 6e4,
+  activationThreshold: 0.5,
+  sampleRate: 16e3,
+  forceCPU: true
 };
-export class VAD extends baseVAD {
-    #session;
-    #opts;
-    constructor(session, opts) {
-        super({ updateInterval: 32 });
-        this.#session = session;
-        this.#opts = opts;
-    }
-    /**
-     * Load and initialize the Silero VAD model.
-     *
-     * This method loads the ONNX model and prepares it for inference. When options are not provided,
-     * sane defaults are used.
-     *
-     * @remarks
-     * This method may take time to load the model into memory.
-     * It is recommended to call this method inside your prewarm mechanism.
-     *
-     * @example
-     * ```ts
-     * export default defineAgent({
-     *   prewarm: async (proc: JobProcess) => {
-     *     proc.userData.vad = await VAD.load();
-     *   },
-     *   entry: async (ctx: JobContext) => {
-     *     const vad = ctx.proc.userData.vad! as VAD;
-     *     // the rest of your agent logic
-     *   },
-     * });
-     * ```
-     *
-     * @param options -
-     * @returns Promise\<{@link VAD}\>: An instance of the VAD class ready for streaming.
-     */
-    static async load(opts = defaultVADOptions) {
-        const session = await newInferenceSession(opts.forceCPU);
-        return new VAD(session, opts);
-    }
-    stream() {
-        return new VADStream(this.#opts, new OnnxModel(this.#session, this.#opts.sampleRate));
-    }
+class VAD extends baseVAD {
+  #session;
+  #opts;
+  constructor(session, opts) {
+    super({ updateInterval: 32 });
+    this.#session = session;
+    this.#opts = opts;
+  }
+  /**
+   * Load and initialize the Silero VAD model.
+   *
+   * This method loads the ONNX model and prepares it for inference. When options are not provided,
+   * sane defaults are used.
+   *
+   * @remarks
+   * This method may take time to load the model into memory.
+   * It is recommended to call this method inside your prewarm mechanism.
+   *
+   * @example
+   * ```ts
+   * export default defineAgent({
+   *   prewarm: async (proc: JobProcess) => {
+   *     proc.userData.vad = await VAD.load();
+   *   },
+   *   entry: async (ctx: JobContext) => {
+   *     const vad = ctx.proc.userData.vad! as VAD;
+   *     // the rest of your agent logic
+   *   },
+   * });
+   * ```
+   *
+   * @param options -
+   * @returns Promise\<{@link VAD}\>: An instance of the VAD class ready for streaming.
+   */
+  static async load(opts = {}) {
+    const mergedOpts = { ...defaultVADOptions, ...opts };
+    const session = await newInferenceSession(mergedOpts.forceCPU);
+    return new VAD(session, mergedOpts);
+  }
+  stream() {
+    return new VADStream(this.#opts, new OnnxModel(this.#session, this.#opts.sampleRate));
+  }
 }
-export class VADStream extends baseStream {
-    #opts;
-    #model;
-    #task;
-    #expFilter = new ExpFilter(0.35);
-    #extraInferenceTime = 0;
-    #logger = log();
-    constructor(opts, model) {
-        super();
-        this.#opts = opts;
-        this.#model = model;
-        this.#task = new Promise(async () => {
-            let inferenceData = new Float32Array(this.#model.windowSizeSamples);
-            // a copy is exposed to the user in END_OF_SPEECH
-            let speechBuffer = null;
-            let speechBufferMaxReached = false;
-            let speechBufferIndex = 0;
-            // "pub" means public, these values are exposed to the users through events
-            let pubSpeaking = false;
-            let pubSpeechDuration = 0;
-            let pubSilenceDuration = 0;
-            let pubCurrentSample = 0;
-            let pubTimestamp = 0;
-            let pubSampleRate = 0;
-            let pubPrefixPaddingSamples = 0; // size in samples of padding data
-            let speechThresholdDuration = 0;
-            let silenceThresholdDuration = 0;
-            let inputFrames = [];
-            let inferenceFrames = [];
-            let resampler = null;
-            // used to avoid drift when the sampleRate ratio is not an integer
-            let inputCopyRemainingFrac = 0.0;
-            for await (const frame of this.input) {
-                if (typeof frame === 'symbol') {
-                    continue; // ignore flush sentinel for now
-                }
-                if (!pubSampleRate || !speechBuffer) {
-                    pubSampleRate = frame.sampleRate;
-                    pubPrefixPaddingSamples = Math.trunc((this.#opts.prefixPaddingDuration * pubSampleRate) / 1000);
-                    speechBuffer = new Int16Array(this.#opts.maxBufferedSpeech * pubSampleRate + pubPrefixPaddingSamples);
-                    if (this.#opts.sampleRate !== pubSampleRate) {
-                        // resampling needed: the input sample rate isn't the same as the model's
-                        // sample rate used for inference
-                        resampler = new AudioResampler(pubSampleRate, this.#opts.sampleRate, 1, AudioResamplerQuality.QUICK);
-                    }
-                }
-                else if (frame.sampleRate !== pubSampleRate) {
-                    this.#logger.error('a frame with a different sample rate was already published');
-                    continue;
-                }
-                inputFrames.push(frame);
-                if (resampler) {
-                    inferenceFrames.push(...resampler.push(frame));
-                }
-                else {
-                    inferenceFrames.push(frame);
-                }
-                while (true) {
-                    const startTime = process.hrtime.bigint();
-                    const availableInferenceSamples = inferenceFrames
-                        .map((x) => x.samplesPerChannel)
-                        .reduce((acc, x) => acc + x, 0);
-                    if (availableInferenceSamples < this.#model.windowSizeSamples) {
-                        break; // not enough samples to run inference
-                    }
-                    const inputFrame = mergeFrames(inputFrames);
-                    const inferenceFrame = mergeFrames(inferenceFrames);
-                    // convert data to f32
-                    inferenceData = Float32Array.from(inferenceFrame.data.subarray(0, this.#model.windowSizeSamples), (x) => x / 32767);
-                    const p = await this.#model
-                        .run(inferenceData)
-                        .then((data) => this.#expFilter.apply(1, data));
-                    const windowDuration = (this.#model.windowSizeSamples / this.#opts.sampleRate) * 1000;
-                    pubCurrentSample += this.#model.windowSizeSamples;
-                    pubTimestamp += windowDuration;
-                    const resamplingRatio = pubSampleRate / this.#model.sampleRate;
-                    const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;
-                    const toCopyInt = Math.trunc(toCopy);
-                    inputCopyRemainingFrac = toCopy - toCopyInt;
-                    // copy the inference window to the speech buffer
-                    const availableSpace = speechBuffer.length - speechBufferIndex;
-                    const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);
-                    if (toCopyBuffer > 0) {
-                        speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
-                        speechBufferIndex += toCopyBuffer;
-                    }
-                    else if (!speechBufferMaxReached) {
-                        speechBufferMaxReached = true;
-                        this.#logger.warn('maxBufferedSpeech reached, ignoring further data for the current speech input');
-                    }
-                    const inferenceDuration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));
-                    this.#extraInferenceTime = Math.max(0, this.#extraInferenceTime + inferenceDuration - windowDuration);
-                    if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {
-                        this.#logger
-                            .child({ delay: this.#extraInferenceTime })
-                            .warn('inference is slower than realtime');
-                    }
-                    if (pubSpeaking) {
-                        pubSpeechDuration += inferenceDuration;
-                    }
-                    else {
-                        pubSilenceDuration += inferenceDuration;
-                    }
-                    this.queue.put({
-                        type: VADEventType.INFERENCE_DONE,
-                        samplesIndex: pubCurrentSample,
-                        timestamp: pubTimestamp,
-                        silenceDuration: pubSilenceDuration,
-                        speechDuration: pubSpeechDuration,
-                        probability: p,
-                        inferenceDuration,
-                        frames: [
-                            new AudioFrame(inputFrame.data.subarray(0, toCopyInt), pubSampleRate, 1, toCopyInt),
-                        ],
-                        speaking: pubSpeaking,
-                    });
-                    const resetWriteCursor = () => {
-                        if (!speechBuffer)
-                            throw new Error('speechBuffer is empty');
-                        if (speechBufferIndex <= pubPrefixPaddingSamples) {
-                            return;
-                        }
-                        const paddingData = speechBuffer.subarray(speechBufferIndex - pubPrefixPaddingSamples, speechBufferIndex);
-                        speechBuffer.set(paddingData, 0);
-                        speechBufferIndex = pubPrefixPaddingSamples;
-                        speechBufferMaxReached = false;
-                    };
-                    const copySpeechBuffer = () => {
-                        if (!speechBuffer)
-                            throw new Error('speechBuffer is empty');
-                        return new AudioFrame(speechBuffer.subarray(0, speechBufferIndex), pubSampleRate, 1, speechBufferIndex);
-                    };
-                    if (p > this.#opts.activationThreshold) {
-                        speechThresholdDuration += windowDuration;
-                        silenceThresholdDuration = 0;
-                        if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {
-                            pubSpeaking = true;
-                            pubSilenceDuration = 0;
-                            pubSpeechDuration = speechThresholdDuration;
-                            this.queue.put({
-                                type: VADEventType.START_OF_SPEECH,
-                                samplesIndex: pubCurrentSample,
-                                timestamp: pubTimestamp,
-                                silenceDuration: pubSilenceDuration,
-                                speechDuration: pubSpeechDuration,
-                                probability: p,
-                                inferenceDuration,
-                                frames: [copySpeechBuffer()],
-                                speaking: pubSpeaking,
-                            });
-                        }
-                    }
-                    else {
-                        silenceThresholdDuration += windowDuration;
-                        speechThresholdDuration = 0;
-                        if (!pubSpeaking) {
-                            resetWriteCursor();
-                        }
-                        if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {
-                            pubSpeaking = false;
-                            pubSpeechDuration = 0;
-                            pubSilenceDuration = silenceThresholdDuration;
-                            this.queue.put({
-                                type: VADEventType.END_OF_SPEECH,
-                                samplesIndex: pubCurrentSample,
-                                timestamp: pubTimestamp,
-                                silenceDuration: pubSilenceDuration,
-                                speechDuration: pubSpeechDuration,
-                                probability: p,
-                                inferenceDuration,
-                                frames: [copySpeechBuffer()],
-                                speaking: pubSpeaking,
-                            });
-                            resetWriteCursor();
-                        }
-                    }
-                    inputFrames = [];
-                    inferenceFrames = [];
-                    if (inputFrame.data.length > toCopyInt) {
-                        const data = inputFrame.data.subarray(toCopyInt);
-                        inputFrames.push(new AudioFrame(data, pubSampleRate, 1, Math.trunc(data.length / 2)));
-                    }
-                    if (inferenceFrame.data.length > this.#model.windowSizeSamples) {
-                        const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);
-                        inferenceFrames.push(new AudioFrame(data, this.#opts.sampleRate, 1, Math.trunc(data.length / 2)));
-                    }
-                }
-            }
-        });
-    }
-}
+class VADStream extends baseStream {
+  #opts;
+  #model;
+  #task;
+  #expFilter = new ExpFilter(0.35);
+  #extraInferenceTime = 0;
+  #logger = log();
+  constructor(opts, model) {
+    super();
+    this.#opts = opts;
+    this.#model = model;
+    this.#task = new Promise(async () => {
+      let inferenceData = new Float32Array(this.#model.windowSizeSamples);
+      let speechBuffer = null;
+      let speechBufferMaxReached = false;
+      let speechBufferIndex = 0;
+      let pubSpeaking = false;
+      let pubSpeechDuration = 0;
+      let pubSilenceDuration = 0;
+      let pubCurrentSample = 0;
+      let pubTimestamp = 0;
+      let pubSampleRate = 0;
+      let pubPrefixPaddingSamples = 0;
+      let speechThresholdDuration = 0;
+      let silenceThresholdDuration = 0;
+      let inputFrames = [];
+      let inferenceFrames = [];
+      let resampler = null;
+      let inputCopyRemainingFrac = 0;
+      for await (const frame of this.input) {
+        if (typeof frame === "symbol") {
+          continue;
+        }
+        if (!pubSampleRate || !speechBuffer) {
+          pubSampleRate = frame.sampleRate;
+          pubPrefixPaddingSamples = Math.trunc(
+            this.#opts.prefixPaddingDuration * pubSampleRate / 1e3
+          );
+          speechBuffer = new Int16Array(
+            this.#opts.maxBufferedSpeech * pubSampleRate + pubPrefixPaddingSamples
+          );
+          if (this.#opts.sampleRate !== pubSampleRate) {
+            resampler = new AudioResampler(
+              pubSampleRate,
+              this.#opts.sampleRate,
+              1,
+              AudioResamplerQuality.QUICK
+              // VAD doesn't need high quality
+            );
+          }
+        } else if (frame.sampleRate !== pubSampleRate) {
+          this.#logger.error("a frame with a different sample rate was already published");
+          continue;
+        }
+        inputFrames.push(frame);
+        if (resampler) {
+          inferenceFrames.push(...resampler.push(frame));
+        } else {
+          inferenceFrames.push(frame);
+        }
+        while (true) {
+          const startTime = process.hrtime.bigint();
+          const availableInferenceSamples = inferenceFrames.map((x) => x.samplesPerChannel).reduce((acc, x) => acc + x, 0);
+          if (availableInferenceSamples < this.#model.windowSizeSamples) {
+            break;
+          }
+          const inputFrame = mergeFrames(inputFrames);
+          const inferenceFrame = mergeFrames(inferenceFrames);
+          inferenceData = Float32Array.from(
+            inferenceFrame.data.subarray(0, this.#model.windowSizeSamples),
+            (x) => x / 32767
+          );
+          const p = await this.#model.run(inferenceData).then((data) => this.#expFilter.apply(1, data));
+          const windowDuration = this.#model.windowSizeSamples / this.#opts.sampleRate * 1e3;
+          pubCurrentSample += this.#model.windowSizeSamples;
+          pubTimestamp += windowDuration;
+          const resamplingRatio = pubSampleRate / this.#model.sampleRate;
+          const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;
+          const toCopyInt = Math.trunc(toCopy);
+          inputCopyRemainingFrac = toCopy - toCopyInt;
+          const availableSpace = speechBuffer.length - speechBufferIndex;
+          const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);
+          if (toCopyBuffer > 0) {
+            speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
+            speechBufferIndex += toCopyBuffer;
+          } else if (!speechBufferMaxReached) {
+            speechBufferMaxReached = true;
+            this.#logger.warn(
+              "maxBufferedSpeech reached, ignoring further data for the current speech input"
+            );
+          }
+          const inferenceDuration = Number((process.hrtime.bigint() - startTime) / BigInt(1e6));
+          this.#extraInferenceTime = Math.max(
+            0,
+            this.#extraInferenceTime + inferenceDuration - windowDuration
+          );
+          if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {
+            this.#logger.child({ delay: this.#extraInferenceTime }).warn("inference is slower than realtime");
+          }
+          if (pubSpeaking) {
+            pubSpeechDuration += inferenceDuration;
+          } else {
+            pubSilenceDuration += inferenceDuration;
+          }
+          this.queue.put({
+            type: VADEventType.INFERENCE_DONE,
+            samplesIndex: pubCurrentSample,
+            timestamp: pubTimestamp,
+            silenceDuration: pubSilenceDuration,
+            speechDuration: pubSpeechDuration,
+            probability: p,
+            inferenceDuration,
+            frames: [
+              new AudioFrame(inputFrame.data.subarray(0, toCopyInt), pubSampleRate, 1, toCopyInt)
+            ],
+            speaking: pubSpeaking
+          });
+          const resetWriteCursor = () => {
+            if (!speechBuffer) throw new Error("speechBuffer is empty");
+            if (speechBufferIndex <= pubPrefixPaddingSamples) {
+              return;
+            }
+            const paddingData = speechBuffer.subarray(
+              speechBufferIndex - pubPrefixPaddingSamples,
+              speechBufferIndex
+            );
+            speechBuffer.set(paddingData, 0);
+            speechBufferIndex = pubPrefixPaddingSamples;
+            speechBufferMaxReached = false;
+          };
+          const copySpeechBuffer = () => {
+            if (!speechBuffer) throw new Error("speechBuffer is empty");
+            return new AudioFrame(
+              speechBuffer.subarray(0, speechBufferIndex),
+              pubSampleRate,
+              1,
+              speechBufferIndex
+            );
+          };
+          if (p > this.#opts.activationThreshold) {
+            speechThresholdDuration += windowDuration;
+            silenceThresholdDuration = 0;
+            if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {
+              pubSpeaking = true;
+              pubSilenceDuration = 0;
+              pubSpeechDuration = speechThresholdDuration;
+              this.queue.put({
+                type: VADEventType.START_OF_SPEECH,
+                samplesIndex: pubCurrentSample,
+                timestamp: pubTimestamp,
+                silenceDuration: pubSilenceDuration,
+                speechDuration: pubSpeechDuration,
+                probability: p,
+                inferenceDuration,
+                frames: [copySpeechBuffer()],
+                speaking: pubSpeaking
+              });
+            }
+          } else {
+            silenceThresholdDuration += windowDuration;
+            speechThresholdDuration = 0;
+            if (!pubSpeaking) {
+              resetWriteCursor();
+            }
+            if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {
+              pubSpeaking = false;
+              pubSpeechDuration = 0;
+              pubSilenceDuration = silenceThresholdDuration;
+              this.queue.put({
+                type: VADEventType.END_OF_SPEECH,
+                samplesIndex: pubCurrentSample,
+                timestamp: pubTimestamp,
+                silenceDuration: pubSilenceDuration,
+                speechDuration: pubSpeechDuration,
+                probability: p,
+                inferenceDuration,
+                frames: [copySpeechBuffer()],
+                speaking: pubSpeaking
+              });
+              resetWriteCursor();
+            }
+          }
+          inputFrames = [];
+          inferenceFrames = [];
+          if (inputFrame.data.length > toCopyInt) {
+            const data = inputFrame.data.subarray(toCopyInt);
+            inputFrames.push(new AudioFrame(data, pubSampleRate, 1, Math.trunc(data.length / 2)));
+          }
+          if (inferenceFrame.data.length > this.#model.windowSizeSamples) {
+            const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);
+            inferenceFrames.push(
+              new AudioFrame(data, this.#opts.sampleRate, 1, Math.trunc(data.length / 2))
+            );
+          }
+        }
+      }
+    });
+  }
+}
+export {
+  VAD,
+  VADStream
+};
 //# sourceMappingURL=vad.js.map
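The loop above reads `AudioFrame`s from `this.input` and publishes events through `this.queue`; the consumer-facing side of that queue is the async-iterable `VADStream`. A hedged sketch of driving it (the `push` method and async iteration come from the `baseStream` class in `@livekit/agents`, which this file only imports, so treat those exact names as assumptions):

```ts
import { VADEventType } from '@livekit/agents';
import { VAD } from '@livekit/agents-plugin-silero';

const vad = await VAD.load();
const stream = vad.stream();

// Elsewhere, feed capture frames into the stream, e.g.
// stream.push(frame) for each incoming AudioFrame (assumed API).

for await (const ev of stream) {
  if (ev.type === VADEventType.START_OF_SPEECH) {
    console.log(`speech started at sample ${ev.samplesIndex}`);
  } else if (ev.type === VADEventType.END_OF_SPEECH) {
    // ev.frames[0] holds the buffered speech, including up to
    // prefixPaddingDuration of audio from before the trigger.
    console.log(`speech ended after ${ev.speechDuration} ms`);
  }
}
```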
package.json
@@ -1,7 +1,15 @@
 {
   "name": "@livekit/agents-plugin-silero",
-  "version": "0.4.6",
+  "version": "0.5.0",
   "description": "Silero voice activity detection LiveKit Node Agents",
   "main": "dist/index.js",
+  "require": "dist/index.cjs",
   "types": "dist/index.d.ts",
+  "exports": {
+    ".": {
+      "types": "./dist/index.d.ts",
+      "import": "./dist/index.js",
+      "require": "./dist/index.cjs"
+    }
+  },
   "author": "LiveKit",
@@ -13,11 +21,13 @@ "type": "module",
     "dist",
-    "src"
+    "src",
+    "README.md"
   ],
   "devDependencies": {
-    "@microsoft/api-extractor": "^7.35.0",
-    "@livekit/rtc-node": "^0.11.1",
+    "@livekit/agents": "^x",
+    "@livekit/rtc-node": "^0.12.1",
     "@types/ws": "^8.5.10",
     "onnxruntime-common": "^1.19.2",
-    "typescript": "^5.0.0",
-    "@livekit/agents": "^0.4.6"
+    "tsup": "^8.3.5",
+    "typescript": "^5.0.0"
   },
@@ -29,7 +39,7 @@ "dependencies": {
   "peerDependencies": {
-    "@livekit/rtc-node": "^0.11.1",
-    "@livekit/agents": "^0.4.6"
+    "@livekit/rtc-node": "^0.12.1",
+    "@livekit/agents": "^0.5.0"
   },
   "scripts": {
-    "build": "tsc && cp src/*.onnx dist/",
+    "build": "tsup --onSuccess \"tsc --declaration --emitDeclarationOnly\" && cp src/silero_vad.onnx dist/",
     "clean": "rm -rf dist",
@@ -36,0 +46,0 @@ "clean:build": "pnpm clean && pnpm build",
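Both peer dependencies moved in lockstep with the plugin, so a consumer upgrading to 0.5.0 should bump the whole stack together; roughly, a consumer manifest would carry (a sketch, with versions taken from the peerDependencies above):

```json
{
  "dependencies": {
    "@livekit/agents": "^0.5.0",
    "@livekit/rtc-node": "^0.12.1",
    "@livekit/agents-plugin-silero": "^0.5.0"
  }
}
```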
src/onnx_model.ts
@@ -1,4 +1,5 @@
 // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
+import { fileURLToPath } from 'node:url';
 import { InferenceSession, Tensor } from 'onnxruntime-node';
@@ -9,3 +10,3 @@
 export const newInferenceSession = (forceCPU: boolean) => {
-  return InferenceSession.create(new URL('silero_vad.onnx', import.meta.url).pathname, {
+  return InferenceSession.create(fileURLToPath(new URL('silero_vad.onnx', import.meta.url).href), {
     interOpNumThreads: 1,
@@ -12,0 +13,0 @@ intraOpNumThreads: 1,
src/vad.ts
@@ -82,5 +82,6 @@ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
   */
-  static async load(opts = defaultVADOptions): Promise<VAD> {
-    const session = await newInferenceSession(opts.forceCPU);
-    return new VAD(session, opts);
+  static async load(opts: Partial<VADOptions> = {}): Promise<VAD> {
+    const mergedOpts: VADOptions = { ...defaultVADOptions, ...opts };
+    const session = await newInferenceSession(mergedOpts.forceCPU);
+    return new VAD(session, mergedOpts);
   }
@@ -87,0 +88,0 @@
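This is the behavioral half of the `Partial<VADOptions>` change. With `opts = defaultVADOptions` as a default parameter, a caller-supplied object replaced the defaults wholesale, so passing a partial object left the remaining fields `undefined` at runtime; the spread merge layers overrides on top of the defaults instead. A minimal standalone sketch of the two patterns, using hypothetical two-field defaults:

```ts
const defaults = { sampleRate: 16000, activationThreshold: 0.5 };

// 0.4.6 pattern: the default parameter only applies when the argument is omitted.
const loadOld = (opts = defaults) => opts;
loadOld({ activationThreshold: 0.6 });
// => { activationThreshold: 0.6 } — sampleRate is lost

// 0.5.0 pattern: caller overrides are layered over the defaults.
const loadNew = (opts = {}) => ({ ...defaults, ...opts });
loadNew({ activationThreshold: 0.6 });
// => { sampleRate: 16000, activationThreshold: 0.6 }
```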