// IPAServer/api/espeak.js

import axios from "axios";
import {getTag} from "@sozialhelden/ietf-language-tags";

const bodyParser = require('body-parser')
const app = require('express')()
const fs = require('fs')
const tmp = require('tmp')
const exec = require('child_process').exec;
const _ = require('lodash');

const builder = require('xmlbuilder');


// Load environment configuration first: the AZURE_KEY / AZURE_REGION values
// read from process.env below must be available before they are used.
require("dotenv").config()

// Azure Speech SDK plus one shared SpeechConfig built from the subscription
// key and region taken from the environment.
const azureSpeech = require("microsoft-cognitiveservices-speech-sdk")
const {SpeechSynthesisOutputFormat} = require("microsoft-cognitiveservices-speech-sdk");
const speechConfig = azureSpeech.SpeechConfig.fromSubscription(process.env.AZURE_KEY, process.env.AZURE_REGION)
// Synthesized audio is requested as Ogg/Opus (24 kHz, 16-bit, mono), which is
// the format the /speak route advertises in its "audio/ogg; codecs=opus" response.
speechConfig.speechSynthesisOutputFormat = SpeechSynthesisOutputFormat.Ogg24Khz16BitMonoOpus

// Hard-coded location of the eSpeak NG binary per platform. The Windows path
// contains spaces, so it is stored already wrapped in double quotes.
const eSpeakExecutable = process.platform === "win32" ? `"C:\\Program Files\\eSpeak NG\\espeak-ng.exe"` : `/usr/bin/espeak-ng`

// Parse JSON request bodies so the route handlers can read fields from req.body.
app.use(bodyParser.json())
// execFile runs the binary directly with an argument array (no shell), so
// request-supplied values can never be interpreted as shell syntax.
const { execFile } = require('child_process');

/**
 * POST /get — transcribe text to IPA using eSpeak NG.
 *
 * Request body (JSON):
 *   - sourceText {string}        text to transcribe
 *   - selectedLanguage {string}  eSpeak voice name, passed to the `-v` option
 *
 * Response (JSON): `{ out, stderr }` holding the trimmed stdout and stderr of
 * espeak-ng. If the process fails without writing anything to stderr, the
 * failure message is reported in `stderr` instead of being dropped.
 *
 * Responds with 400 when either field is missing or not a string. Failures to
 * create or write the temporary input file are forwarded to `next`.
 */
app.post('/get', (req, res, next) => {
  const { sourceText, selectedLanguage } = req.body || {};
  if (typeof sourceText !== 'string' || typeof selectedLanguage !== 'string') {
    res.status(400).json({ error: 'sourceText and selectedLanguage must be strings' });
    return;
  }

  // The configured executable may be stored pre-quoted for shell usage;
  // execFile needs the bare path.
  const executable = eSpeakExecutable.replace(/^"(.*)"$/, '$1');

  tmp.file(function _tempFileCreated(err, path, fd, cleanupCallback) {
    if (err) {
      // Throwing here would escape Express (we are inside an async callback)
      // and take the process down; hand the error to the error middleware.
      next(err);
      return;
    }

    // A comma is appended at every line break before the text is handed to
    // eSpeak — presumably so each line is read as its own clause (TODO confirm).
    const espeakInput = sourceText.replace(/\n/g, ",\n");

    fs.writeFile(path, espeakInput, { encoding: "utf-8" }, (writeErr) => {
      if (writeErr) {
        cleanupCallback();
        next(writeErr);
        return;
      }

      const args = [`-v${selectedLanguage}`, '-q', '-f', path, '--ipa'];
      execFile(executable, args, (error, stdout, stderr) => {
        // Remove the temp file before responding so it is released even if
        // sending the response throws.
        cleanupCallback();

        const errorOutput = stderr.trim() || (error ? error.message : '');
        res.json({ out: stdout.trim(), stderr: errorOutput });
      });
    });
  });
});

/**
 * GET /getLanguages — list the Azure text-to-speech voices, grouped by locale.
 *
 * Fetches the voice list for the configured Azure region and responds with a
 * JSON array sorted by label:
 *   [{ label: "<language> (<region>)",
 *      options: [{ text: "<DisplayName> (<LocalName>, <Gender>, <VoiceType>)",
 *                  value: "<ShortName>" }, ...] }, ...]
 * Options inside each group are sorted by `text`.
 *
 * Any failure (network error, unexpected payload) is forwarded to `next`.
 */
app.get('/getLanguages', async (req, res, next) => {
  // Plain code-unit string ordering that also returns 0 for equal values, so
  // Array.prototype.sort receives a consistent comparator.
  const compareStrings = (a, b) => {
    if (a < b) return -1;
    if (a > b) return 1;
    return 0;
  };

  // Build the group label for a locale. A locale the tag library cannot parse,
  // or one that has no region subtag, must not break the whole listing, so
  // fall back to showing the raw locale code instead.
  const describeLocale = (locale) => {
    let tag;
    try {
      tag = getTag(locale);
    } catch (e) {
      return locale;
    }
    const languageName = tag && tag.language && tag.language.Description && tag.language.Description[0];
    const regionName = tag && tag.region && tag.region.Description && tag.region.Description[0];
    if (!languageName) {
      return locale;
    }
    return `${languageName} (${regionName || locale})`;
  };

  try {
    const response = await axios.get(`https://${process.env.AZURE_REGION}.tts.speech.microsoft.com/cognitiveservices/voices/list`, {
      headers: {
        "Ocp-Apim-Subscription-Key": process.env.AZURE_KEY
      }
    });
    const voicesByLocale = _.groupBy(response.data, x => x.Locale);

    const groups = Object.entries(voicesByLocale).map(([locale, voices]) => ({
      label: describeLocale(locale),
      options: voices
        .map(x => ({
          text: `${x.DisplayName} (${x.LocalName}, ${x.Gender}, ${x.VoiceType})`,
          value: x.ShortName
        }))
        .sort((a, b) => compareStrings(a.text, b.text))
    }));

    res.json(groups.sort((a, b) => compareStrings(a.label, b.label)));
  } catch (e) {
    next(e);
  }
});

/**
 * POST /speak — synthesize IPA text to audio with Azure Speech.
 *
 * Request body (JSON):
 *   - targetText {string}              IPA text, one phrase per line
 *   - selectedSpeechLanguage {string}  Azure voice name; its first two
 *                                      dash-separated parts are used as the
 *                                      SSML `xml:lang`
 *
 * Every line of `targetText` becomes one SSML `<phoneme alphabet="ipa">`
 * element whose `ph` attribute and text content are both that line.
 *
 * Response: the synthesized audio with content type "audio/ogg; codecs=opus".
 * Responds with 400 when either field is missing or not a string; SSML
 * construction errors and synthesis failures are forwarded to `next`.
 */
app.post('/speak', (req, res, next) => {
  const { targetText, selectedSpeechLanguage } = req.body || {};
  if (typeof targetText !== 'string' || typeof selectedSpeechLanguage !== 'string') {
    res.status(400).json({ error: 'targetText and selectedSpeechLanguage must be strings' });
    return;
  }

  let xml;
  let synthesizer;
  try {
    const language = selectedSpeechLanguage.split("-").slice(0,2).join("-");

    const root = builder.create("speak");
    root.att("version","1.0");
    root.att("xmlns", "http://www.w3.org/2001/10/synthesis");
    root.att("xml:lang", language);
    const voice = root.ele("voice").att("name", selectedSpeechLanguage);

    for (const line of targetText.split("\n")) {
      voice.ele("phoneme").att("alphabet","ipa").att("ph",line).t(line);
    }

    xml = root.toString();
    synthesizer = new azureSpeech.SpeechSynthesizer(speechConfig);
  } catch (e) {
    next(e);
    return;
  }

  synthesizer.speakSsmlAsync(xml, result => {
    // The synthesizer holds a service connection; release it once the request is done.
    synthesizer.close();

    // A cancelled/failed synthesis carries no audio; report it instead of
    // letting Buffer.from(undefined) throw inside this callback.
    if (result.reason !== azureSpeech.ResultReason.SynthesizingAudioCompleted || !result.audioData) {
      next(new Error(result.errorDetails || "Speech synthesis failed"));
      return;
    }

    res.contentType("audio/ogg; codecs=opus").send(Buffer.from(result.audioData));
  }, error => {
    synthesizer.close();
    // The SDK reports errors here as plain strings; wrap them for Express.
    next(error instanceof Error ? error : new Error(String(error)));
  });
});

// Export the Express app with the routes registered above.
module.exports = app