Skip to content

文字转语音

听说微软的文字转语音是声音最接近真人的人工智能引擎，随后在 GitHub 上找到了一个开源软件 tts-vue。它的大致原理是爬取微软演示界面使用的 API，然后自行封装；由于微软的演示接口有字数限制，需要把长文本拆分成多次请求，最后再把各次返回的结果拼接到一起。

ts 文件 转 js

shell
# 自动在当前目录下 生成同名的 js 文件
tsc xxx.ts

核心源码 剔除引用库

typescript
/**
 * Produces the timestamp placed in the `X-Timestamp` header of outgoing
 * messages.
 *
 * @returns The current instant formatted by `Date.prototype.toISOString`
 *          (UTC, millisecond precision, trailing `Z`).
 */
function getXTime(): string {
  const now = new Date();
  return now.toISOString();
}

/**
 * Sends one message over an already-open socket.
 *
 * `send` is invoked synchronously, before this function returns. The
 * returned promise resolves to `true` once `send` has returned, and rejects
 * with whatever `send` throws.
 *
 * @param connect Socket-like object exposing `send(msg)`.
 * @param msg     Text payload to transmit.
 * @returns A promise resolving to `true`.
 */
async function wssSend(connect: any, msg: string): Promise<unknown> {
  connect.send(msg);
  return true;
}

/**
 * Opens a WebSocket to `url` and waits until it is usable.
 *
 * The original implementation only listened for `open`, so a failed
 * connection left the caller awaiting forever. This version also listens for
 * `error` and for a `close` that arrives before `open`, and rejects in both
 * cases. A promise settles only once, so events after the first are ignored.
 *
 * @param url WebSocket endpoint (`ws://` or `wss://`).
 * @returns A promise resolving to the open socket.
 * @throws Rejects with an `Error` if the connection fails or is closed
 *         before it opens. The URL is deliberately left out of the message
 *         because it may carry credentials in its query string.
 */
function wssConnect(url: string): Promise<WebSocket> {
  return new Promise<WebSocket>((resolve, reject) => {
    const socket = new WebSocket(url);
    socket.addEventListener('open', () => {
      resolve(socket);
    });
    socket.addEventListener('error', () => {
      reject(new Error('WebSocket connection failed'));
    });
    socket.addEventListener('close', () => {
      reject(new Error('WebSocket closed before the connection opened'));
    });
  });
}

/** ASCII bytes of the header line that immediately precedes the audio payload in a binary frame. */
const TTS_AUDIO_MARKER = Uint8Array.from(
  Array.from('Path:audio\r\n', (ch) => ch.charCodeAt(0)),
);

/** Builds the SSML document that is sent to the service. */
function buildTtsSsml(
  inps: any,
  voice: string,
  express: string,
  role: string,
  rate: number,
  pitch: number,
): string {
  // NOTE(review): inputValue, voice, express and role are interpolated into
  // the XML without escaping. If any of them can come from untrusted input,
  // escape them first — confirm with the callers whether markup in
  // inputValue is intentional.
  if (String(inps.activeIndex) !== '1') {
    // Every mode other than '1' uses a fixed voice and neutral prosody; the
    // voice/express/role/rate/pitch arguments are ignored here.
    return `<speak xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" xmlns:emo="http://www.w3.org/2009/10/emotionml" version="1.0" xml:lang="en-US"><voice name="zh-CN-XiaoxiaoNeural"><prosody rate="0%" pitch="0%">
		
		${inps.inputValue}
		</prosody></voice></speak>`;
  }

  // 'General' / 'Default' are the "nothing selected" sentinels. An empty
  // string (the parameter default) is treated the same way so that no empty
  // style="" / role="" attribute is emitted.
  const styleAttr =
    express !== '' && express !== 'General' ? 'style="' + express + '"' : '';
  const roleAttr =
    role !== '' && role !== 'Default' ? 'role="' + role + '"' : '';

  return `
    <speak xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" xmlns:emo="http://www.w3.org/2009/10/emotionml" version="1.0" xml:lang="en-US">
        <voice name="${voice}">
            <mstts:express-as  ${styleAttr} ${roleAttr}>
                <prosody rate="${rate}%" pitch="${pitch}%">
                ${inps.inputValue}
                </prosody>
            </mstts:express-as>
        </voice>
    </speak>
    `;
}

/** Creates the 32-character upper-case hex id used as connection id and request id. */
function createTtsRequestId(): string {
  if (typeof crypto !== 'undefined' && typeof crypto.randomUUID === 'function') {
    // A regex replace is used instead of replaceAll, which needs ES2021.
    return crypto.randomUUID().toUpperCase().replace(/-/g, '');
  }
  // Fallback for runtimes without crypto.randomUUID. The id only labels the
  // session, so Math.random is sufficient.
  let id = '';
  for (let i = 0; i < 32; i++) {
    id += Math.floor(Math.random() * 16).toString(16);
  }
  return id.toUpperCase();
}

/** Returns the bytes following the `Path:audio` header line, or an empty array if the frame has none. */
function extractTtsAudioPayload(frame: Uint8Array): Uint8Array {
  const marker = TTS_AUDIO_MARKER;
  for (let start = 0; start + marker.length <= frame.length; start++) {
    let matched = true;
    for (let j = 0; j < marker.length; j++) {
      if (frame[start + j] !== marker[j]) {
        matched = false;
        break;
      }
    }
    if (matched) {
      return frame.subarray(start + marker.length);
    }
  }
  return new Uint8Array(0);
}

/** Joins the collected audio chunks into one contiguous byte array. */
function concatTtsChunks(chunks: Uint8Array[]): Uint8Array {
  const total = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
  const merged = new Uint8Array(total);
  let offset = 0;
  for (const chunk of chunks) {
    merged.set(chunk, offset);
    offset += chunk.length;
  }
  return merged;
}

/** Listens on the socket until the `Path:turn.end` text message and resolves with all audio received. */
function collectTtsAudio(connect: any): Promise<Uint8Array> {
  return new Promise<Uint8Array>((resolve, reject) => {
    const chunks: Uint8Array[] = [];

    connect.addEventListener('message', (event: { data: unknown }) => {
      console.log('Message from server ', event.data);
      const data = event.data;

      if (typeof data === 'string') {
        if (data.includes('Path:turn.end')) {
          console.log('已完成');
          resolve(concatTtsChunks(chunks));
          connect.close();
        }
        return;
      }

      if (data instanceof ArrayBuffer) {
        chunks.push(extractTtsAudioPayload(new Uint8Array(data)));
      } else if (ArrayBuffer.isView(data)) {
        chunks.push(
          extractTtsAudioPayload(
            new Uint8Array(data.buffer, data.byteOffset, data.byteLength),
          ),
        );
      }
    });

    connect.addEventListener('error', () => {
      reject(new Error('WebSocket error while receiving synthesized audio'));
    });
    // After a successful turn.end the promise is already resolved, so this
    // rejection is a no-op; it only matters for a premature close.
    connect.addEventListener('close', () => {
      reject(new Error('WebSocket closed before the synthesis finished'));
    });
  });
}

/**
 * Synthesizes speech for `inps.inputValue` over the speech-service WebSocket
 * endpoint and returns the audio bytes.
 *
 * Flow: build the SSML, open the socket, send the `speech.config`,
 * `synthesis.context` and `ssml` messages, then collect the payload of every
 * binary frame until a text message containing `Path:turn.end` arrives.
 *
 * Changes from the original listing:
 * - The returned promise now settles. The original only logged incoming
 *   messages, so callers awaited forever and the socket was never closed.
 * - The request id no longer calls a possibly-undefined `randomUUID`, and no
 *   longer needs ES2021 `replaceAll`.
 * - Empty `express` / `role` values no longer emit `style=""` / `role=""`.
 *
 * @param inps    Object with `activeIndex` and `inputValue`. When
 *                `activeIndex` is `'1'` the voice/style/prosody arguments are
 *                applied; otherwise a fixed `zh-CN-XiaoxiaoNeural` template
 *                is used.
 * @param voice   Voice name for the `<voice>` element (mode `'1'` only).
 * @param express Speaking style; `'General'` or `''` omits the attribute.
 * @param role    Role-play value; `'Default'` or `''` omits the attribute.
 * @param rate    Prosody rate in percent.
 * @param pitch   Prosody pitch in percent.
 * @returns The concatenated audio bytes; the requested output format is
 *          `audio-24khz-160kbitrate-mono-mp3`.
 * @throws Rejects if a send fails, the socket errors, or it closes before
 *         `turn.end` is received.
 */
async function getTTSData(
  inps: any,
  voice: string = '',
  express: string = '',
  role: string = '',
  rate = 0,
  pitch = 0,
): Promise<Uint8Array> {
  const SSML = buildTtsSsml(inps, voice, express, role, rate, pitch);
  console.log(SSML);

  // No token is fetched: the Authorization query parameter below is the
  // literal text "bearer undefined", and the token call the listing once had
  // here is disabled.
  const requestId = createTtsRequestId();

  console.log('创建webscoket连接...');
  console.log(requestId);
  const connect: any = await wssConnect(
    `wss://eastus.api.speech.microsoft.com/cognitiveservices/websocket/v1?TrafficType=AzureDemo&Authorization=bearer%20undefined&X-ConnectionId=${requestId}`,
  );
  // Receive binary frames as ArrayBuffer so they can be sliced synchronously
  // and in arrival order.
  connect.binaryType = 'arraybuffer';

  try {
    console.log('第1次上报...');
    const configMessage = `Path: speech.config\r\nX-RequestId: ${requestId}\r\nX-Timestamp: ${getXTime()}\r\nContent-Type: application/json\r\n\r\n{"context":{"system":{"name":"SpeechSDK","version":"1.19.0","build":"JavaScript","lang":"JavaScript","os":{"platform":"Browser/Linux x86_64","name":"Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0","version":"5.0 (X11)"}}}}`;
    await wssSend(connect, configMessage);

    console.log('第2次上报...');
    const contextMessage = `Path: synthesis.context\r\nX-RequestId: ${requestId}\r\nX-Timestamp: ${getXTime()}\r\nContent-Type: application/json\r\n\r\n{"synthesis":{"audio":{"metadataOptions":{"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":false},"outputFormat":"audio-24khz-160kbitrate-mono-mp3"}}}`;
    await wssSend(connect, contextMessage);

    console.log('第3次上报...');
    const ssmlMessage = `Path: ssml\r\nX-RequestId: ${requestId}\r\nX-Timestamp: ${getXTime()}\r\nContent-Type: application/ssml+xml\r\n\r\n${SSML}`;
    await wssSend(connect, ssmlMessage);
  } catch (err: unknown) {
    // Do not leak the connection when one of the sends fails.
    connect.close();
    throw err;
  }

  return collectTtsAudio(connect);
}
// Expose getTTSData as this module's default export.
export default getTTSData;