IPAServer/api/espeak.js

const axios = require("axios")
const { getTag } = require("@sozialhelden/ietf-language-tags")
const bodyParser = require('body-parser')
const app = require('express')()
const fs = require('fs')
const tmp = require('tmp')
const exec = require('child_process').exec;
const _ = require('lodash');
const builder = require('xmlbuilder');
require("dotenv").config()
const azureSpeech = require("microsoft-cognitiveservices-speech-sdk")
const {SpeechSynthesisOutputFormat} = require("microsoft-cognitiveservices-speech-sdk");
// Azure Speech config: credentials come from the environment; synthesized
// audio is returned as Ogg/Opus.
const speechConfig = azureSpeech.SpeechConfig.fromSubscription(process.env.AZURE_KEY, process.env.AZURE_REGION)
speechConfig.speechSynthesisOutputFormat = SpeechSynthesisOutputFormat.Ogg24Khz16BitMonoOpus
// espeak-ng lives at a different path on Windows than on Linux.
const eSpeakExecutable = process.platform === "win32" ? `"C:\\Program Files\\eSpeak NG\\espeak-ng.exe"` : `/usr/bin/espeak-ng`
app.use(bodyParser.json())
// POST /get — run the source text through espeak-ng and return its IPA transcription.
app.post('/get', (req, res, next) => {
  tmp.file(function _tempFileCreated(err, path, fd, cleanupCallback) {
    if (err) return next(err)
    // Commas inserted at line breaks make espeak-ng pause between lines.
    fs.writeFile(path, req.body.sourceText.replace(/\n/g, ",\n"), {encoding: "utf-8"}, writeErr => {
      if (writeErr) return next(writeErr)
      // -v voice, -q no audio output, -f read input from file, --ipa emit IPA.
      exec(`${eSpeakExecutable} -v"${req.body.selectedLanguage}" -q -f "${path}" --ipa`, (error, stdout, stderr) => {
        // stderr is passed through to the client so voice errors surface in the UI.
        res.json({out: stdout.trim(), stderr: stderr.trim()})
        cleanupCallback()
      })
    })
  })
})
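
// Illustrative request/response for /get (the transcription is approximate and
// depends on the installed espeak-ng version and voices):
//
//   POST /get
//   { "sourceText": "hello world", "selectedLanguage": "en" }
//
//   → { "out": "həlˈəʊ wˈɜːld", "stderr": "" }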
// GET /getLanguages — fetch the Azure voice list and group it by locale,
// shaped for a <select> with <optgroup>s.
app.get('/getLanguages', async (req, res, next) => {
  try {
    const languages = (await axios.get(`https://${process.env.AZURE_REGION}.tts.speech.microsoft.com/cognitiveservices/voices/list`, {
      headers: {
        "Ocp-Apim-Subscription-Key": process.env.AZURE_KEY
      }
    })).data
    const grouped = _.groupBy(languages, x => x.Locale)
    const obj = []
    for (const [key, value] of Object.entries(grouped)) {
      // Resolve the locale (e.g. "en-US") to human-readable language/region names.
      const tag = getTag(key)
      obj.push({
        label: `${tag.language.Description[0]} (${tag.region.Description[0]})`,
        options: value.map(x => ({
          text: `${x.DisplayName} (${x.LocalName}, ${x.Gender}, ${x.VoiceType})`,
          value: x.ShortName
        })).sort((a, b) => a.text > b.text ? 1 : -1)
      })
    }
    res.json(obj.sort((a, b) => a.label > b.label ? 1 : -1))
  } catch (e) {
    next(e)
  }
})
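
// Illustrative shape of one /getLanguages group (the voice shown is an
// example, not an exhaustive or guaranteed list):
//
//   {
//     "label": "English (United States)",
//     "options": [
//       { "text": "Jenny (Jenny, Female, Neural)", "value": "en-US-JennyNeural" }
//     ]
//   }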
// POST /speak — wrap each line of IPA in an SSML <phoneme> element and have
// Azure synthesize it.
app.post('/speak', async (req, res, next) => {
  const synthesizer = new azureSpeech.SpeechSynthesizer(speechConfig)
  const splitLines = req.body.targetText.split("\n")
  // Reduce e.g. "en-US-JennyNeural" to the bare locale "en-US".
  const language = req.body.selectedSpeechLanguage.split("-").slice(0, 2).join("-")
  const root = builder.create("speak")
  root.att("version", "1.0")
  root.att("xmlns", "http://www.w3.org/2001/10/synthesis")
  root.att("xml:lang", language)
  const voice = root.ele("voice").att("name", req.body.selectedSpeechLanguage)
  for (const line of splitLines) {
    voice.ele("phoneme").att("alphabet", "ipa").att("ph", line).t(line)
  }
  const xml = root.toString()
  synthesizer.speakSsmlAsync(xml, result => {
    synthesizer.close()
    res.contentType("audio/ogg; codecs=opus").send(Buffer.from(result.audioData))
  }, error => {
    synthesizer.close()
    next(error)
  })
})
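
// For a one-line targetText "həlˈəʊ" and voice "en-US-JennyNeural", the SSML
// sent to Azure looks roughly like (whitespace added for readability):
//
//   <?xml version="1.0"?>
//   <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
//     <voice name="en-US-JennyNeural">
//       <phoneme alphabet="ipa" ph="həlˈəʊ">həlˈəʊ</phoneme>
//     </voice>
//   </speak>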
module.exports = app
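
// Usage sketch (assumption: the exported app is mounted by a host server or a
// serverless runtime; the mount path below is hypothetical):
//
//   const espeak = require('./api/espeak')
//   const host = require('express')()
//   host.use('/api/espeak', espeak)
//   host.listen(3000)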