edge-tts-node
Comparing version 1.3.1 to 1.3.5
@@ -1,3 +0,1 @@
-/// <reference types="node" />
-/// <reference types="node" />
 import { OUTPUT_FORMAT } from "./OUTPUT_FORMAT";
@@ -41,3 +39,3 @@ import { Readable } from "stream";
     static OUTPUT_FORMAT: typeof OUTPUT_FORMAT;
-    private static TRUSTED_CLIENT_TOKEN;
+    static TRUSTED_CLIENT_TOKEN: string;
     private static VOICES_URL;
@@ -56,2 +54,4 @@ private static SYNTH_URL;
     private readonly _agent;
+    private _arraybuffer;
+    private state;
     private _log;
@@ -67,55 +67,13 @@ /**
     private _initClient;
+    private _pushData;
     private _pushAudioData;
     private _SSMLTemplate;
-    /**
-     * Fetch the list of voices available in Microsoft Edge.
-     * These, however, are not all: the complete list of voices supported by this module [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview).
-     */
     getVoices(): Promise<Voice[]>;
-    /**
-     * Sets the required information for the speech to be synthesised and inits a new WebSocket connection.
-     * Must be called at least once before text can be synthesised.
-     * Saved in this instance. Can be called at any time to update the metadata.
-     *
-     * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
-     * @param outputFormat any {@link OUTPUT_FORMAT}
-     * @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName`
-     */
+    setConfig(conf: any): void;
     setMetadata(voiceName: string, outputFormat: OUTPUT_FORMAT, voiceLocale?: string): Promise<void>;
     private _metadataCheck;
-    /**
-     * Close the WebSocket connection.
-     */
     close(): void;
-    /**
-     * Writes raw audio synthesised from text to a file. Uses a basic {@link _SSMLTemplate SSML template}.
-     *
-     * @param path a valid output path, including a filename and file extension.
-     * @param input the input to synthesise
-     * @param options (optional) {@link ProsodyOptions}
-     * @returns {Promise<string>} - a `Promise` with the full filepath
-     */
     toFile(path: string, input: string, options?: ProsodyOptions): Promise<string>;
-    /**
-     * Writes raw audio synthesised from text in real-time to a {@link Readable}. Uses a basic {@link _SSMLTemplate SSML template}.
-     *
-     * @param input the text to synthesise. Can include SSML elements.
-     * @param options (optional) {@link ProsodyOptions}
-     * @returns {Readable} - a `stream.Readable` with the audio data
-     */
     toStream(input: string, options?: ProsodyOptions): Readable;
-    /**
-     * Writes raw audio synthesised from text to a file. Has no SSML template. Basic SSML should be provided in the request.
-     *
-     * @param path a valid output path, including a filename and file extension.
-     * @param requestSSML the SSML to send. SSML elements are required for it to work.
-     * @returns {Promise<string>} - a `Promise` with the full filepath
-     */
     rawToFile(path: string, requestSSML: string): Promise<string>;
-    /**
-     * Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template. Basic SSML should be provided in the request.
-     *
-     * @param requestSSML the SSML to send. SSML elements are required for it to work.
-     * @returns {Readable} - a `stream.Readable` with the audio data
-     */
     rawToStream(requestSSML: string): Readable;
@@ -122,0 +80,0 @@ private _rawSSMLRequestToFile;
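The typings diff above tells the 1.3.5 story in miniature: the JSDoc is stripped from the published declarations, `TRUSTED_CLIENT_TOKEN` becomes public, and a new `setConfig` method plus `_arraybuffer`/`state` fields appear. A minimal sketch of how the declared surface fits together; `arraybuffer` is the only option `setConfig` is shown to read in the compiled JavaScript that follows:

```js
import { MsEdgeTTS, OUTPUT_FORMAT } from "edge-tts-node";

(async () => {
  const tts = new MsEdgeTTS();
  // New in 1.3.5: opt in to ArrayBuffer WebSocket frames before connecting.
  tts.setConfig({ arraybuffer: true });
  await tts.setMetadata("en-US-AriaNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS);
  const voices = await tts.getVoices(); // Promise<Voice[]>, per the declaration above
  console.log(voices.length, "voices available");
})();
```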
@@ -33,6 +33,7 @@ "use strict";
 const buffer_1 = require("buffer");
 const crypto_1 = require("crypto");
+const randombytes_1 = __importDefault(require("randombytes"));
 const OUTPUT_FORMAT_1 = require("./OUTPUT_FORMAT");
 const stream_1 = require("stream");
 const fs = __importStar(require("fs"));
 const utils_1 = require("./utils");
 class ProsodyOptions {
@@ -60,3 +61,3 @@ /**
 class MsEdgeTTS {
-    static wordBoundaryEnabled = false;
+    static wordBoundaryEnabled = true;
     static OUTPUT_FORMAT = OUTPUT_FORMAT_1.OUTPUT_FORMAT;
@@ -77,4 +78,10 @@ static TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
     _agent;
+    _arraybuffer = false;
+    state = {
+        offsetCompensation: 0,
+        lastDurationOffset: 0
+    };
     _log(...o) {
         if (this._enableLogger) {
+            o.unshift('edgetts:');
             console.log(...o);
@@ -103,3 +110,3 @@ }
         this._ws.send(message, () => {
-            this._log("<- sent message: ", message);
+            //this._log("<- sent message: ", message);
         });
@@ -111,3 +118,4 @@ }
             : new isomorphic_ws_1.default(MsEdgeTTS.SYNTH_URL, { agent: this._agent });
-        this._ws.binaryType = "arraybuffer";
+        if (this._arraybuffer)
+            this._ws.binaryType = "arraybuffer";
         return new Promise((resolve, reject) => {
@@ -133,20 +141,77 @@ this._ws.onopen = () => {
         this._ws.onmessage = (m) => {
-            const buffer = buffer_1.Buffer.from(m.data);
-            const message = buffer.toString();
-            const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)[1];
-            if (message.includes("Path:turn.start")) {
-                // start of turn, ignore
+            this._log("type:::::::: ", typeof m.data);
+            let mdata = m.data;
+            if (typeof mdata === 'string') {
+                const encodedData = buffer_1.Buffer.from(mdata, 'utf8');
+                const message = mdata;
+                const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)[1];
+                let [headers, data] = (0, utils_1.getHeadersAndData)(encodedData, encodedData.indexOf("\r\n\r\n"));
+                const path = headers['Path'];
+                if (path === "audio.metadata") {
+                    let parsedMetadata = (0, utils_1.parseMetadata)(data, this.state["offsetCompensation"]);
+                    this._pushData(parsedMetadata, requestId);
+                    // Update the last duration offset, used for the next SSML request
+                    this.state["lastDurationOffset"] = parsedMetadata["offset"] + parsedMetadata["duration"];
+                }
+                else if (path === "turn.end") {
+                    this.state["offsetCompensation"] = this.state["lastDurationOffset"];
+                    this.state["offsetCompensation"] += 8750000;
+                }
+                else if (path !== "response" && path !== "turn.start") {
+                    // The path is neither "response" nor "turn.start"
+                    throw new Error("Unknown path received"); // unknown response type
+                }
             }
-            else if (message.includes("Path:turn.end")) {
-                // end of turn, close stream
-                this._streams[requestId].push(null);
+            else if (buffer_1.Buffer.isBuffer(mdata)) {
+                const message = mdata.toString();
+                const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)[1];
+                const headerLength = mdata.readUInt16BE(0);
+                if (headerLength > mdata.length) {
+                    throw new Error("The header length is greater than the length of the data.");
+                }
+                // Parse the headers and data from the binary message.
+                let [headers, data] = (0, utils_1.getHeadersAndData)(mdata, headerLength);
+                if (headers['Path'] !== 'audio') {
+                    throw new Error("Received binary message, but the path is not audio.");
+                }
+                const contentType = headers['Content-Type'];
+                if (contentType !== 'audio/mpeg' && contentType !== undefined) {
+                    throw new Error("Received binary message, but with an unexpected Content-Type.");
+                }
+                // We only allow no Content-Type if there is no data.
+                if (contentType === undefined) {
+                    if (data.length === 0) {
+                        return;
+                    }
+                    // If the data is not empty, then we need to raise an exception.
+                    throw new Error("Received binary message with no Content-Type, but with data.");
+                }
+                // If the data is empty now, then we need to raise an exception.
+                if (data.length === 0) {
+                    throw new Error("Received binary message, but it is missing the audio data.");
+                }
+                this._pushData({ type: "audio", data: data }, requestId);
             }
-            else if (message.includes("Path:response")) {
-                // context response, ignore
-            }
-            else if (message.includes("Path:audio") && m.data instanceof ArrayBuffer) {
-                this._pushAudioData(buffer, requestId);
-            }
             else {
-                this._log("UNKNOWN MESSAGE", message);
+                mdata = buffer_1.Buffer.isBuffer(mdata) ? mdata : mdata['data'];
+                const buffer = buffer_1.Buffer.from(mdata);
+                const message = buffer.toString();
+                const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)[1];
+                this._log(message.includes("Path:audio"), buffer_1.Buffer.isBuffer(mdata), mdata instanceof ArrayBuffer);
+                if (message.includes("Path:turn.start")) {
+                    // start of turn, ignore
+                }
+                else if (message.includes("Path:turn.end")) {
+                    // end of turn, close stream
+                    this._streams[requestId].push(null);
+                }
+                else if (message.includes("Path:response")) {
+                    // context response, ignore
+                }
+                else if (message.includes("Path:audio") && buffer_1.Buffer.isBuffer(mdata)) {
+                    this._pushAudioData(buffer, requestId);
+                }
+                else {
+                    //this._log("UNKNOWN MESSAGE", message);
+                }
             }
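The rewritten handler distinguishes three frame shapes: text frames (headers terminated by a blank line, then a JSON body such as word-boundary metadata, which matters now that `wordBoundaryEnabled` defaults to `true`), binary `Buffer` frames (a two-byte big-endian header-length prefix, then headers, then raw audio), and a legacy fallback. The `8750000` added to `offsetCompensation` after each `turn.end` is in the 100-nanosecond ticks Azure uses for offsets, i.e. a 0.875 s gap between turns. Below is a simplified sketch of that framing; `getHeadersAndData` is the module's own helper and isn't shown in this diff, so the split points here are an assumption based on how it is called above:

```js
// Hedged sketch of the frame layout the handler parses; not the module's exact helper.
const { Buffer } = require("buffer");

function parseFrame(raw) {
  let headerStart = 0, headerEnd, body;
  if (typeof raw === "string") {
    raw = Buffer.from(raw, "utf8");
    headerEnd = raw.indexOf("\r\n\r\n");   // text frames: a blank line ends the headers
    body = raw.subarray(headerEnd + 4);    // JSON body (e.g. audio.metadata payload)
  } else {
    headerEnd = 2 + raw.readUInt16BE(0);   // binary frames: 2-byte big-endian length prefix
    headerStart = 2;
    body = raw.subarray(headerEnd);        // raw audio bytes (Content-Type: audio/mpeg)
  }
  const headers = {};
  for (const line of raw.subarray(headerStart, headerEnd).toString("utf8").split("\r\n")) {
    const colon = line.indexOf(":");
    if (colon > 0) headers[line.slice(0, colon)] = line.slice(colon + 1);
  }
  return { headers, body }; // headers.Path: turn.start | audio.metadata | audio | turn.end
}
```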
@@ -165,2 +230,6 @@ };
     }
+    _pushData(data, requestId) {
+        data = typeof data == "string" ? data : JSON.stringify(data);
+        this._streams[requestId].push(data, 'utf8');
+    }
     _pushAudioData(audioBuffer, requestId) {
@@ -170,3 +239,3 @@ const audioStartIndex = audioBuffer.indexOf(MsEdgeTTS.BINARY_DELIM) + MsEdgeTTS.BINARY_DELIM.length;
         this._streams[requestId].push(audioData);
-        this._log("received audio chunk, size: ", audioData?.length);
+        this._log("_pushAudioData: received audio chunk, size: ", audioData?.length);
     }
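The legacy `_pushAudioData` path, still used by the fallback branch above, does not use the binary length prefix; it scans for `MsEdgeTTS.BINARY_DELIM` and pushes everything after it. A sketch under the assumption that the delimiter is the `Path:audio\r\n` header terminator; the constant's actual value is not shown in this diff:

```js
// Assumption: BINARY_DELIM marks the end of the headers in a binary audio frame.
const BINARY_DELIM = "Path:audio\r\n"; // hypothetical value, not confirmed by this diff

function extractAudio(buffer) {
  const start = buffer.indexOf(BINARY_DELIM) + BINARY_DELIM.length;
  return buffer.subarray(start); // everything after the delimiter is audio payload
}
```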
@@ -184,6 +253,2 @@ _SSMLTemplate(input, options = {}) {
     }
-    /**
-     * Fetch the list of voices available in Microsoft Edge.
-     * These, however, are not all: the complete list of voices supported by this module [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview).
-     */
     getVoices() {
@@ -196,11 +261,5 @@ return new Promise((resolve, reject) => {
     }
-    /**
-     * Sets the required information for the speech to be synthesised and inits a new WebSocket connection.
-     * Must be called at least once before text can be synthesised.
-     * Saved in this instance. Can be called at any time to update the metadata.
-     *
-     * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
-     * @param outputFormat any {@link OUTPUT_FORMAT}
-     * @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName`
-     */
+    setConfig(conf) {
+        this._arraybuffer = conf["arraybuffer"] ?? false;
+    }
     async setMetadata(voiceName, outputFormat, voiceLocale) {
@@ -232,26 +291,8 @@ const oldVoice = this._voice;
     }
-    /**
-     * Close the WebSocket connection.
-     */
     close() {
         this._ws.close();
     }
-    /**
-     * Writes raw audio synthesised from text to a file. Uses a basic {@link _SSMLTemplate SSML template}.
-     *
-     * @param path a valid output path, including a filename and file extension.
-     * @param input the input to synthesise
-     * @param options (optional) {@link ProsodyOptions}
-     * @returns {Promise<string>} - a `Promise` with the full filepath
-     */
     toFile(path, input, options) {
         return this._rawSSMLRequestToFile(path, this._SSMLTemplate(input, options));
     }
-    /**
-     * Writes raw audio synthesised from text in real-time to a {@link Readable}. Uses a basic {@link _SSMLTemplate SSML template}.
-     *
-     * @param input the text to synthesise. Can include SSML elements.
-     * @param options (optional) {@link ProsodyOptions}
-     * @returns {Readable} - a `stream.Readable` with the audio data
-     */
     toStream(input, options) {
@@ -261,18 +302,5 @@ const { stream } = this._rawSSMLRequest(this._SSMLTemplate(input, options));
     }
-    /**
-     * Writes raw audio synthesised from text to a file. Has no SSML template. Basic SSML should be provided in the request.
-     *
-     * @param path a valid output path, including a filename and file extension.
-     * @param requestSSML the SSML to send. SSML elements are required for it to work.
-     * @returns {Promise<string>} - a `Promise` with the full filepath
-     */
     rawToFile(path, requestSSML) {
         return this._rawSSMLRequestToFile(path, requestSSML);
     }
-    /**
-     * Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template. Basic SSML should be provided in the request.
-     *
-     * @param requestSSML the SSML to send. SSML elements are required for it to work.
-     * @returns {Readable} - a `stream.Readable` with the audio data
-     */
     rawToStream(requestSSML) {
@@ -303,3 +331,3 @@ const { stream } = this._rawSSMLRequest(requestSSML);
         this._metadataCheck();
-        const requestId = (0, crypto_1.randomBytes)(16).toString("hex");
+        const requestId = (0, randombytes_1.default)(16).toString("hex");
         const request = `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n
@@ -313,2 +341,3 @@ ` + requestSSML.trim();
             destroy(error, callback) {
+                self._log("+_+_+_+__+_", error);
                 delete self._streams[requestId];
@@ -315,0 +344,0 @@ callback(error);
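The other functional change in the compiled JavaScript is the request-id source: `crypto.randomBytes` is swapped for the `randombytes` package (added to `dependencies` below), which delegates to `crypto` under Node but also works in browser bundles alongside the existing `stream-browserify` and `process` shims. Both forms yield a 32-character hex id:

```js
const randombytes = require("randombytes");

// Equivalent to require("crypto").randomBytes(16).toString("hex") under Node,
// but bundler-friendly in the browser.
const requestId = randombytes(16).toString("hex");
console.log(requestId); // 32 hex characters
```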
 {
     "name": "edge-tts-node",
-    "version": "1.3.1",
+    "version": "1.3.5",
     "description": "An Azure Speech Service module that uses the Microsoft Edge Read Aloud API.",
@@ -40,2 +40,3 @@ "author": "Migushthe2nd <Migushthe2nd@users.noreply.github.com>",
     "process": "^0.11.10",
+    "randombytes": "^2.1.0",
     "stream-browserify": "^3.0.0",
@@ -42,0 +43,0 @@ "ws": "^8.14.1"
@@ -1,4 +0,5 @@
-# MsEdgeTTS
-[![npm version](https://badge.fury.io/js/msedge-tts.svg)](https://badge.fury.io/js/msedge-tts)
+# edge-tts-node
+[![npm version](https://badge.fury.io/js/edge-tts-node.svg)](https://badge.fury.io/js/edge-tts-node)
 A simple Azure Speech Service module that uses the Microsoft Edge Read Aloud API.
@@ -31,15 +32,18 @@
 ```js
-import {MsEdgeTTS, OUTPUT_FORMAT} from "msedge-tts";
+import { MsEdgeTTS, OUTPUT_FORMAT } from "edge-tts-node";
 const tts = new MsEdgeTTS();
-await tts.setMetadata("en-IE-ConnorNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS);
+await tts.setMetadata(
+  "en-IE-ConnorNeural",
+  OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS
+);
 const readable = tts.toStream("Hi, how are you?");
 readable.on("data", (data) => {
-    console.log("DATA RECEIVED", data);
-    // raw audio file data
+  console.log("DATA RECEIVED", data);
+  // raw audio file data
 });
 readable.on("close", () => {
-    console.log("STREAM CLOSED");
+  console.log("STREAM CLOSED");
 });
@@ -51,8 +55,11 @@ ```
 ```js
-import {MsEdgeTTS, OUTPUT_FORMAT} from "msedge-tts";
+import { MsEdgeTTS, OUTPUT_FORMAT } from "edge-tts-node";
 (async () => {
-    const tts = new MsEdgeTTS();
-    await tts.setMetadata("en-US-AriaNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS);
-    const filePath = await tts.toFile("./example_audio.webm", "Hi, how are you?");
+  const tts = new MsEdgeTTS();
+  await tts.setMetadata(
+    "en-US-AriaNeural",
+    OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS
+  );
+  const filePath = await tts.toFile("./example_audio.webm", "Hi, how are you?");
 })();
@@ -62,9 +69,17 @@ ```
 ### Change voice rate, pitch and volume
 ```js
-import {MsEdgeTTS, OUTPUT_FORMAT} from "msedge-tts";
+import { MsEdgeTTS, OUTPUT_FORMAT } from "edge-tts-node";
 (async () => {
-    const tts = new MsEdgeTTS();
-    await tts.setMetadata("en-US-AriaNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS);
-    const filePath = await tts.toFile("./example_audio.webm", "Hi, how are you?", {rate: 0.5, pitch: "+200Hz"});
+  const tts = new MsEdgeTTS();
+  await tts.setMetadata(
+    "en-US-AriaNeural",
+    OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS
+  );
+  const filePath = await tts.toFile(
+    "./example_audio.webm",
+    "Hi, how are you?",
+    { rate: 0.5, pitch: "+200Hz" }
+  );
 })();
@@ -74,11 +89,18 @@ ```
 ### Use an alternative HTTP Agent
 Use a custom http.Agent implementation like [https-proxy-agent](https://github.com/TooTallNate/proxy-agents) or [socks-proxy-agent](https://github.com/TooTallNate/proxy-agents/tree/main/packages/socks-proxy-agent).
 ```js
-import {SocksProxyAgent} from 'socks-proxy-agent';
+import { SocksProxyAgent } from "socks-proxy-agent";
 (async () => {
-    const agent = new SocksProxyAgent("socks://your-name%40gmail.com:abcdef12345124@br41.nordvpn.com")
-    const tts = new MsEdgeTTS(agent);
-    await tts.setMetadata("en-US-AriaNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS);
-    const filePath = await tts.toFile("./example_audio.webm", "Hi, how are you?");
+  const agent = new SocksProxyAgent(
+    "socks://your-name%40gmail.com:abcdef12345124@br41.nordvpn.com"
+  );
+  const tts = new MsEdgeTTS(agent);
+  await tts.setMetadata(
+    "en-US-AriaNeural",
+    OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS
+  );
+  const filePath = await tts.toFile("./example_audio.webm", "Hi, how are you?");
 })();
@@ -85,0 +107,0 @@ ```
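All of the README examples go through the module's built-in SSML template. The typings at the top of this diff also expose `rawToFile`/`rawToStream`, which send your SSML as-is. A hedged sketch, not part of the package's README; the `<speak>`/`<voice>` structure follows the standard W3C synthesis namespace, and the exact markup the service accepts may differ:

```js
import { MsEdgeTTS, OUTPUT_FORMAT } from "edge-tts-node";

(async () => {
  const tts = new MsEdgeTTS();
  await tts.setMetadata("en-US-AriaNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS);
  // rawToStream skips the SSML template, so the request must be a complete SSML document.
  const ssml = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
  <voice name="en-US-AriaNeural">Hi, how are you?</voice>
</speak>`;
  const readable = tts.rawToStream(ssml);
  readable.on("data", (chunk) => console.log("CHUNK", chunk.length));
})();
```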
+ Added randombytes@^2.1.0