mstts-tiny.agi

#!/usr/bin/env perl

#
# Script that uses Azure Cognitive Services for speech synthesis.
#
# Copyright (C) 2018, Lefteris Zafiris <zaf@fastmail.com>
#
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the COPYING file
# at the top of the source tree.
#
# -----
# Usage
# -----
# agi(mstts.agi,"text",[language],[intkey],[gender]): This will invoke the Azure TTS
# engine, render the text string to speech and play it back to the user.
# If 'intkey' is set the script will wait for user input. Any given interrupt keys will
# cause the playback to immediately terminate and the dialplan to proceed to the
# matching extension (this is mainly for use in IVR, see README for examples).
#
# The script contacts the Microsoft Azure TTS service in order to get the voice data,
# which it then stores in a local cache (by default /tmp/) for future use.
#
# Parameters like default language, caching and cache dir
# can be set up by altering the following variables:
# Default language: $lang
# Speaker gender:   $gender
# Cache:            $usecache
# Cache directory:  $cachedir
#

use warnings;
use strict;
use utf8;
use Encode qw(decode encode);
use File::Basename qw(basename);
use File::Temp qw(tempfile);
use File::Copy qw(move);
use File::Path qw(mkpath);
use Digest::MD5 qw(md5_hex);
use HTTP::Tiny;
$| = 1;

# ------------------------------ #
#   User defined parameters:     #
# ------------------------------ #

# Azure API subscription key, sent when requesting an access token.
my $key = "";

# Language used when the dialplan does not pass one.
my $lang = 'en-US';

# Speaker gender used when the dialplan does not pass one.
my $gender = 'Female';

# Service region, see
# https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/regions#rest-apis
my $region = "westus";

# Set to 0 to disable the cache mechanism.
my $usecache = 1;

# Directory holding the cached audio files.
my $cachedir = '/tmp';

# Set to 1 for verbose debugging messages.
my $debug = 0;

# ------------------------------ #

# State shared by the main flow and the subroutines below.
my (%AGI, @text, @result);
my ($atoken, $fh, $tmpname, $http);
my $intkey  = "";
my $tmpdir  = "/tmp";      # where freshly fetched audio is written
my $maxlen  = 4096;        # upper bound for the length of a cache file path
my $timeout = 10;          # timeout handed to HTTP::Tiny
my $url     = join("", "https://", $region, ".tts.speech.microsoft.com/cognitiveservices/v1");
my %lang_list = get_lang_list();

# Store AGI input: Text, Language, Interrupt, Gender #
@AGI{qw(arg_1 arg_2 arg_3 arg_4)} = @ARGV;

# Collect the "agi_<name>: <value>" lines from STDIN up to the first empty line #
while (defined(my $envline = <STDIN>)) {
	chomp $envline;
	last if ($envline eq '');
	if ($envline =~ /^agi_(\w+)\:\s+(.*)$/) {
		$AGI{$1} = $2;
	}
}

# Prefix used by every log line, built from the AGI request name #
my $name = " -- $AGI{request}:";

# Sanitizing input text #
# A missing text argument is rejected up front instead of being pushed through
# decode() and the substitutions below as an undefined value.
fatal_log("No text passed for synthesis.") if (!defined $AGI{arg_1});
$AGI{arg_1} = decode('utf8', $AGI{arg_1});
for ($AGI{arg_1}) {
	# Blank out backslash, pipe, star, tilde, angle/round/square/curly brackets,
	# caret and control characters, then collapse and trim whitespace.
	s/[\\|*~<>^\(\)\[\]\{\}[:cntrl:]]/ /g;
	s/\s+/ /g;
	s/^\s|\s$//g;
	fatal_log("No text passed for synthesis.") if (!length);
	# Split into chunks of at most 1000 characters, preferably ending at the end
	# of the text, at a punctuation mark or at whitespace. The last alternative
	# is a hard split: without it a run of more than 1000 characters with no
	# ASCII punctuation or whitespace (e.g. CJK text) makes the match skip
	# ahead, silently dropping the leading characters.
	@text = /.{1,1000}$|.{1,1000}[.,?!:;]|.{1,1000}\s|.{1,1000}/g;
}

# Setting up language, gender and interrupt keys #

# Language: must look like "xx-YY", anything else is fatal #
if (length($AGI{arg_2})) {
	fatal_log("Invalid language setting.") if ($AGI{arg_2} !~ /^[a-z]{2}-[A-Z]{2}$/);
	$lang = $AGI{arg_2};
}

# Interrupt keys: "any" enables all keys, otherwise only digits, * and # are accepted #
if (length($AGI{arg_3})) {
	if ($AGI{arg_3} eq "any") {
		$intkey = "0123456789#*";
	} elsif ($AGI{arg_3} =~ /^[0-9*#]+$/) {
		$intkey = $AGI{arg_3};
	}
}

# Gender: "m" or "f", any other value is reported and the default is kept #
if (length($AGI{arg_4})) {
	my %gender_for = ('m' => 'Male', 'f' => 'Female');
	if (exists $gender_for{$AGI{arg_4}}) {
		$gender = $gender_for{$AGI{arg_4}};
	} else {
		console_log("Invalid Gender setting, using default.");
	}
}

# The chosen pair must have a voice in the language list #
unless (exists $lang_list{"$lang-$gender"}) {
	fatal_log("Unsupported language/gender combination.");
}

# Check cache path size: dir length + md5 + file extension #
if ($usecache) {
	my $pathlen = length($cachedir) + 32 + 6;
	if ($pathlen >= $maxlen) {
		console_log("Cache path size exceeds limit. Disabling cache.");
		$usecache = 0;
	} elsif (!-d $cachedir) {
		# Create the cache directory on first use #
		mkpath($cachedir);
	}
}

# Answer channel if not already answered #
# A channel status of 4 makes the script send ANSWER; a non-zero ANSWER result is fatal.
print "CHANNEL STATUS\n";
@result = checkresponse();
if ($result[0] == 4) {
	print "ANSWER\n";
	@result = checkresponse();
	fatal_log("Failed to answer channel.") unless ($result[0] == 0);
}

# Detecting format based on the channel codec. #
# $fexten is the file extension handed to Asterisk, $format the matching
# value for the X-Microsoft-OutputFormat request header.
my $fexten = detect_format();
my $format = ($fexten eq 'sln16')
	? 'raw-16khz-16bit-mono-pcm'
	: 'raw-8khz-8bit-mono-mulaw';

# Synthesize speech using the Azure TTS REST API #
# Every chunk of @text is played from the cache when a readable file exists;
# otherwise it is requested from the service, written to a temporary file,
# played back and (with caching enabled) moved into the cache. A positive
# playback result (a key was pressed) stops the loop, a negative one aborts.
foreach my $line (@text) {
	$line = encode('utf8', $line);
	$line =~ s/^\s+|\s+$//g;
	next if (length($line) == 0);
	if ($debug) {
		console_log(
			"Text passed for synthesis: $line, Language: $lang, Gender: $gender",
			"Interrupt keys: $intkey, Format: $fexten",
			"Caching: $usecache, Cache dir: $cachedir"
		);
	}
	my $filename;
	if ($usecache) {
		# The cache key is built from the unescaped text #
		$filename = md5_hex("$line.$lang.$gender.$format");
		# Stream file from cache if it exists #
		if (-r "$cachedir/$filename.$fexten") {
			console_log("File already in cache.") if ($debug);
			my $res = playback("$cachedir/$filename", $intkey);
			last if ($res > 0);
			die if ($res < 0);
			next;
		}
	}
	# Initialize User agent #
	if (!$http) {
		$http = HTTP::Tiny->new(
			agent      => 'Asterisk Bing TTS module',
			timeout    => $timeout,
			verify_SSL => 1,
		);
	}
	# Get access token #
	$atoken = get_access_token() if (!$atoken);
	fatal_log("No access token could be obtained. Aborting.") if (!$atoken);

	# Handle interrupts #
	$SIG{'INT'} = \&int_handler;
	$SIG{'HUP'} = \&int_handler;

	# The text becomes XML character data, so it must not contain markup.
	# A bare '&' (as in "AT&T") would make the SSML document malformed;
	# ampersands that already start an entity reference are left alone so
	# pre-escaped input is not escaped twice.
	my $ssml_text = $line;
	$ssml_text =~ s/&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9A-Fa-f]+);)/&amp;/g;
	$ssml_text =~ s/</&lt;/g;
	$ssml_text =~ s/>/&gt;/g;

	my $data = "<speak version='1.0' xml:lang='" . $lang . "'><voice xml:lang='". $lang
	. "' xml:gender='" . $gender . "' name='" . $lang_list{"$lang-$gender"}
	. "'>" . $ssml_text ."</voice></speak>";
	my %headers = (
		"Content-type" => "application/ssml+xml",
		"X-Microsoft-OutputFormat" => $format,
		"Authorization" => $atoken,
	);
	my %options = (
		"headers" => \%headers,
		"content" => $data,
	);
	my $response = $http->request('POST', $url, \%options);
	fatal_log("Failed to fetch file:", $response->{status}, $response->{reason}) unless ($response->{success});

	($fh, $tmpname) = tempfile("bingtts_XXXXXXXX", SUFFIX => ".$fexten", DIR => $tmpdir, UNLINK => 1);
	binmode $fh;
	# A failed or short write must not be played back or cached as valid audio #
	my $write_ok = print {$fh} $response->{content};
	$write_ok = 0 if (!close $fh);
	fatal_log("Failed to write audio data to $tmpname") if (!$write_ok);

	# Playback and save file in cache #
	my $res = playback("$tmpdir/" . basename($tmpname, ".$fexten"), $intkey);
	die if ($res < 0);
	if ($usecache) {
		console_log("Saving file $filename to cache") if ($debug);
		# A cache failure is reported but does not interrupt the call #
		move($tmpname, "$cachedir/$filename.$fexten")
			or console_log("Failed to save file $filename to cache:", $!);
	} else {
		unlink $tmpname;
	}
	last if ($res > 0);
}
exit;

sub get_access_token {
# Obtaining an Access Token #
# Returns the value for the Authorization header ("Bearer <token>") or an
# empty string when no token could be obtained. Dies through fatal_log()
# when a new token is needed and no API key is configured.
#
# When the AST_SPOOL_DIR environment variable is set, the token is kept in
# "<spool>/tmp/bing-tts.token" as two lines, "expire:<epoch>" and
# "token:<value>", and reused until 10 seconds before it expires.
	my $token;
	my $expire = 0;
	my $spool  = $ENV{'AST_SPOOL_DIR'};
	# Without a spool directory there is no token file at all: the script
	# never writes one in that case, so it must not read one either.
	my $tkfile = $spool ? $spool . "/tmp/bing-tts.token" : undef;

	if (defined $tkfile && -f $tkfile && open(my $in, "<", $tkfile)) {
		console_log("Reading access token from file.") if ($debug);
		while (my $entry = <$in>) {
			$expire = $1 if ($entry =~ /^expire:(\d+)$/);
			$token  = $1 if ($entry =~ /^token:(.+)$/);
		}
		close $in;
	}
	# $expire stays 0 when the file had no expiry line, which forces a refresh #
	if (!$token or time > $expire - 10) {
		$token = '';
		fatal_log("No API Key provided.") if (!$key);
		console_log("Getting a new access token.") if ($debug);
		my %headers = (
			"Ocp-Apim-Subscription-Key" => $key,
			"Content-Length" => 0,
		);
		my %options = ( 'headers' => \%headers );
		my $response = $http->request(
			'POST',
			"https://" . $region . ".api.cognitive.microsoft.com/sts/v1.0/issueToken",
			\%options
		);
		if ($response->{success}) {
			$token = "Bearer " . $response->{content};
			# New tokens are treated as valid for 600 seconds #
			$expire = time + 600;
			if (defined $tkfile) {
				# Write to a temporary file first and move it into place #
				my ($out, $tempfl) = tempfile();
				my $stored = print {$out} "expire:$expire\n", "token:$token\n";
				$stored = 0 if (!close $out);
				$stored = 0 if ($stored && !move($tempfl, $tkfile));
				if (!$stored) {
					# The token itself is still usable for this call #
					console_log("Failed to store access token in $tkfile") if ($debug);
					unlink $tempfl;
				}
			}
		} else {
			console_log("Failed to get Access Token:", $response->{status}, $response->{reason}) if ($debug);
		}
	}
	return $token;
}

sub checkresponse {
# Read one AGI reply from STDIN #
# Returns (result, data) for a "200 result=<n> <data>" reply, where data may
# be empty, and (-1, -1) for anything else, including a closed STDIN.
	my $input = <STDIN>;
	my @values;

	# EOF: there is nothing to parse and nothing more to read #
	if (!defined $input) {
		warn "$name Unexpected result: no response received\n";
		return (-1, -1);
	}

	chomp $input;
	if ($input =~ /^200 result=(-?\d+)\s?(.*)$/) {
		@values = ("$1", "$2");
	} else {
		if ($input =~ /^520-Invalid/) {
			# The usage text following "520-Invalid" ends with a line that
			# starts with "520 "; consume all of it so the next reply read
			# belongs to the next command.
			while (defined(my $more = <STDIN>)) {
				$input .= $more;
				last if ($more =~ /^520 /);
			}
			chomp $input;
		}
		warn "$name Unexpected result: $input\n";
		@values = (-1, -1);
	}
	return @values;
}

sub playback {
# Stream a sound file to the channel #
# $file is the path without extension, $keys the digits allowed to interrupt.
# When the result is the code of a word character, * or #, that character is
# set as the extension and the priority is set to 1. Returns the result
# reported for STREAM FILE.
	my ($file, $keys) = @_;

	print "STREAM FILE $file \"$keys\"\n";
	my ($code) = checkresponse();
	my $digit = ($code >= 32) ? chr($code) : undef;

	if (defined $digit && $digit =~ /[\w*#]/) {
		console_log("Got digit ", $digit) if ($debug);
		print "SET EXTENSION ", $digit, "\n";
		checkresponse();
		print "SET PRIORITY 1\n";
		checkresponse();
	} elsif ($code == -1) {
		console_log("Failed to play $file");
	}
	return $code;
}

sub detect_format {
# Detect the sound format used #
# Returns 'ulaw' when the channel's native audio format is one of the listed
# narrowband codecs and 'sln16' in every other case, including a failed query.
	my $format = 'ulaw';
	print "GET FULL VARIABLE \${CHANNEL(audionativeformat)}\n";
	my @reply = checkresponse();
	# The variable value arrives wrapped in parentheses, e.g. "(ulaw)", so it
	# has to be unwrapped before the anchored comparison can ever match.
	my $native = defined $reply[1] ? $reply[1] : '';
	$native =~ s/^\(+|\)+$//g;
	# When several formats are reported ("ulaw|alaw") the first one decides #
	my ($codec) = split(/\|/, $native);
	$codec = '' if (!defined $codec);
	$format = 'sln16' if ($codec !~ /^(?:slin|g729|gsm|ulaw|alaw|ilbc|g726|g723)$/);
	return $format;
}

# https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#text-to-speech
sub get_lang_list {
# Map "<language>-<gender>" to the voice name sent in the SSML request #
# Each key appears exactly once. Where the service offers several voices for
# one language/gender pair, the voice in use is the entry and the others are
# kept in comments; a hash can hold only one value per key, so listing them
# as additional entries would silently discard all but the last one.
	my %list = (
		"ar-EG-Female"	=> "Microsoft Server Speech Text to Speech Voice (ar-EG, Hoda)",
		"ar-SA-Male"	=> "Microsoft Server Speech Text to Speech Voice (ar-SA, Naayf)",
		"bg-BG-Male"	=> "Microsoft Server Speech Text to Speech Voice (bg-BG, Ivan)",
		"ca-ES-Female"	=> "Microsoft Server Speech Text to Speech Voice (ca-ES, HerenaRUS)",
		"cs-CZ-Male"	=> "Microsoft Server Speech Text to Speech Voice (cs-CZ, Jakub)",
		"da-DK-Female"	=> "Microsoft Server Speech Text to Speech Voice (da-DK, HelleRUS)",
		"de-AT-Male"	=> "Microsoft Server Speech Text to Speech Voice (de-AT, Michael)",
		"de-CH-Male"	=> "Microsoft Server Speech Text to Speech Voice (de-CH, Karsten)",
		# alternative: (de-DE, Hedda)
		"de-DE-Female"	=> "Microsoft Server Speech Text to Speech Voice (de-DE, HeddaRUS)",
		"de-DE-Male"	=> "Microsoft Server Speech Text to Speech Voice (de-DE, Stefan, Apollo)",
		"el-GR-Male"	=> "Microsoft Server Speech Text to Speech Voice (el-GR, Stefanos)",
		# alternative: (en-AU, Catherine)
		"en-AU-Female"	=> "Microsoft Server Speech Text to Speech Voice (en-AU, HayleyRUS)",
		# alternative: (en-CA, Linda)
		"en-CA-Female"	=> "Microsoft Server Speech Text to Speech Voice (en-CA, HeatherRUS)",
		# alternative: (en-GB, Susan, Apollo)
		"en-GB-Female"	=> "Microsoft Server Speech Text to Speech Voice (en-GB, HazelRUS)",
		"en-GB-Male"	=> "Microsoft Server Speech Text to Speech Voice (en-GB, George, Apollo)",
		"en-IE-Male"	=> "Microsoft Server Speech Text to Speech Voice (en-IE, Sean)",
		# alternative: (en-IN, Heera, Apollo)
		"en-IN-Female"	=> "Microsoft Server Speech Text to Speech Voice (en-IN, PriyaRUS)",
		"en-IN-Male"	=> "Microsoft Server Speech Text to Speech Voice (en-IN, Ravi, Apollo)",
		# alternative: (en-US, ZiraRUS)
		"en-US-Female"	=> "Microsoft Server Speech Text to Speech Voice (en-US, JessaRUS)",
		"en-US-Male"	=> "Microsoft Server Speech Text to Speech Voice (en-US, BenjaminRUS)",
		# alternative: (es-ES, Laura, Apollo)
		"es-ES-Female"	=> "Microsoft Server Speech Text to Speech Voice (es-ES, HelenaRUS)",
		"es-ES-Male"	=> "Microsoft Server Speech Text to Speech Voice (es-ES, Pablo, Apollo)",
		"es-MX-Female"	=> "Microsoft Server Speech Text to Speech Voice (es-MX, HildaRUS)",
		"es-MX-Male"	=> "Microsoft Server Speech Text to Speech Voice (es-MX, Raul, Apollo)",
		"fi-FI-Female"	=> "Microsoft Server Speech Text to Speech Voice (fi-FI, HeidiRUS)",
		# alternative: (fr-CA, Caroline)
		"fr-CA-Female"	=> "Microsoft Server Speech Text to Speech Voice (fr-CA, HarmonieRUS)",
		"fr-CH-Male"	=> "Microsoft Server Speech Text to Speech Voice (fr-CH, Guillaume)",
		# alternative: (fr-FR, Julie, Apollo)
		"fr-FR-Female"	=> "Microsoft Server Speech Text to Speech Voice (fr-FR, HortenseRUS)",
		"fr-FR-Male"	=> "Microsoft Server Speech Text to Speech Voice (fr-FR, Paul, Apollo)",
		"he-IL-Male"	=> "Microsoft Server Speech Text to Speech Voice (he-IL, Asaf)",
		# alternative: (hi-IN, Kalpana, Apollo)
		"hi-IN-Female"	=> "Microsoft Server Speech Text to Speech Voice (hi-IN, Kalpana)",
		"hi-IN-Male"	=> "Microsoft Server Speech Text to Speech Voice (hi-IN, Hemant)",
		"hr-HR-Male"	=> "Microsoft Server Speech Text to Speech Voice (hr-HR, Matej)",
		"hu-HU-Male"	=> "Microsoft Server Speech Text to Speech Voice (hu-HU, Szabolcs)",
		"id-ID-Male"	=> "Microsoft Server Speech Text to Speech Voice (id-ID, Andika)",
		"it-IT-Male"	=> "Microsoft Server Speech Text to Speech Voice (it-IT, Cosimo, Apollo)",
		# NOTE(review): the two ja-JP voices in use below carry names that look
		# like they belong to other languages - confirm them against the
		# service's voice list.
		# alternatives: (ja-JP, Ayumi, Apollo), (ja-JP, HarukaRUS)
		"ja-JP-Female"	=> "Microsoft Server Speech Text to Speech Voice (ja-JP, LuciaRUS)",
		# alternative: (ja-JP, Ichiro, Apollo)
		"ja-JP-Male"	=> "Microsoft Server Speech Text to Speech Voice (ja-JP, EkaterinaRUS)",
		"ko-KR-Female"	=> "Microsoft Server Speech Text to Speech Voice (ko-KR, HeamiRUS)",
		"ms-MY-Male"	=> "Microsoft Server Speech Text to Speech Voice (ms-MY, Rizwan)",
		"nb-NO-Female"	=> "Microsoft Server Speech Text to Speech Voice (nb-NO, HuldaRUS)",
		"nl-NL-Female"	=> "Microsoft Server Speech Text to Speech Voice (nl-NL, HannaRUS)",
		"pl-PL-Female"	=> "Microsoft Server Speech Text to Speech Voice (pl-PL, PaulinaRUS)",
		"pt-BR-Female"	=> "Microsoft Server Speech Text to Speech Voice (pt-BR, HeloisaRUS)",
		"pt-BR-Male"	=> "Microsoft Server Speech Text to Speech Voice (pt-BR, Daniel, Apollo)",
		"pt-PT-Female"	=> "Microsoft Server Speech Text to Speech Voice (pt-PT, HeliaRUS)",
		"ro-RO-Male"	=> "Microsoft Server Speech Text to Speech Voice (ro-RO, Andrei)",
		"ru-RU-Female"	=> "Microsoft Server Speech Text to Speech Voice (ru-RU, Irina, Apollo)",
		"ru-RU-Male"	=> "Microsoft Server Speech Text to Speech Voice (ru-RU, Pavel, Apollo)",
		"sk-SK-Male"	=> "Microsoft Server Speech Text to Speech Voice (sk-SK, Filip)",
		"sl-SI-Male"	=> "Microsoft Server Speech Text to Speech Voice (sl-SI, Lado)",
		"sv-SE-Female"	=> "Microsoft Server Speech Text to Speech Voice (sv-SE, HedvigRUS)",
		"ta-IN-Male"	=> "Microsoft Server Speech Text to Speech Voice (ta-IN, Valluvar)",
		"th-TH-Male"	=> "Microsoft Server Speech Text to Speech Voice (th-TH, Pattara)",
		"tr-TR-Female"	=> "Microsoft Server Speech Text to Speech Voice (tr-TR, SedaRUS)",
		"vi-VN-Male"	=> "Microsoft Server Speech Text to Speech Voice (vi-VN, An)",
		# alternative: (zh-CN, HuihuiRUS)
		"zh-CN-Female"	=> "Microsoft Server Speech Text to Speech Voice (zh-CN, Yaoyao, Apollo)",
		"zh-CN-Male"	=> "Microsoft Server Speech Text to Speech Voice (zh-CN, Kangkang, Apollo)",
		# alternative: (zh-HK, Tracy, Apollo)
		"zh-HK-Female"	=> "Microsoft Server Speech Text to Speech Voice (zh-HK, TracyRUS)",
		"zh-HK-Male"	=> "Microsoft Server Speech Text to Speech Voice (zh-HK, Danny, Apollo)",
		# alternative: (zh-TW, Yating, Apollo)
		"zh-TW-Female"	=> "Microsoft Server Speech Text to Speech Voice (zh-TW, HanHanRUS)",
		"zh-TW-Male"	=> "Microsoft Server Speech Text to Speech Voice (zh-TW, Zhiwei, Apollo)",
	);
	return %list;
}

sub console_log {
# Write a message to STDERR and send it to Asterisk as a NOOP command #
# The arguments are joined with single spaces and prefixed with $name.
	my $text = "$name " . join(" ", @_);
	warn $text . "\n";
	print "NOOP \"" . $text . "\"\n";
	checkresponse();
}

sub fatal_log {
# Log the given message parts through console_log(), then abort the script #
	my @parts = @_;
	console_log(@parts);
	die;
}

sub int_handler {
# Handler installed for INT and HUP by the main loop: terminate through die #
	my $notice = "$name Interrupt signal received, terminating...\n";
	die $notice;
}

END {
	# Remove every file whose name starts with the last temporary file name #
	if ($tmpname) {
		if ($debug) {
			warn "$name Cleaning temp files.\n";
		}
		my @leftovers = glob "$tmpname*";
		unlink @leftovers;
	}
}