Sample synthesis client for Neural TTSaaS

Neural TTSaaS offers a fully functional Python client that you may download and use on Linux or Windows to synthesize speech using the Synthesizer gRPC API for Neural TTSaaS.

Note:

This is the same client provided with TTSaaS. When used with Neural TTSaaS, the client includes different SSML sample input and sends a header that directs the request to the Neural TTSaaS service.

To run this client, you need:

Python 3.6 or later.
The generated Python stub files from gRPC setup.
Your Mix client ID and secret, obtained as described in Prerequisites from Mix.
The Python client files: sample-synthesis-client.zip

Download the zip file to Linux or Windows and extract the files into the same directory as the nuance directory, which contains your proto files and Python stubs.

On Linux, give the shell script execute permission with chmod +x. For example:

unzip sample-synthesis-client.zip
chmod +x run-client.sh

Python synthesis client: client.py

import argparse
import time
import logging
import grpc
import os
import urllib
import urllib.request
import base64
import json
from importlib.machinery import SourceFileLoader
import threading
from google.protobuf import text_format
from google.protobuf import json_format

from nuance.tts.v1.synthesizer_pb2 import *
from nuance.tts.v1.synthesizer_pb2_grpc import *

# Per-thread state: each thread keeps its own synthesis counter and, once
# run_one_file has set it, the base name of the flow file it is running.
thread_context = threading.local()
# This assignment only initializes the counter for the thread that loads the module;
# run_one_file resets it for whichever thread runs a flow.
thread_context.num_synthesis = 0

# Cached OAuth 2.0 token state; get_oauth2_token reads and writes it while holding oauth_mutex.
oauth_mutex = threading.Lock()
# The cached token is refreshed once fewer than this many seconds remain before it expires.
oauth_token_expiry_threshhold_seconds = 30
# time.monotonic() value at which the cached token expires (0 until a token is fetched).
oauth_token_expiry_seconds = 0
oauth_token = None

# NOTE(review): run() assigns a local variable with this name; this module-level
# value does not appear to be read anywhere in the visible code.
num_iterations = 0

# Running totals used to report the average first-chunk latency.
total_first_chunk_latency = 0
total_synthesis = 0

# Parsed command-line arguments; populated by parse_args().
args = None

# Generates the .wav file header for a given set of parameters
def generate_wav_header(sampleRate, bitsPerSample, channels, datasize, formattype):
    """Return the 44-byte RIFF/WAVE header for an audio payload.

    Args:
        sampleRate: Samples per second.
        bitsPerSample: Bits used to store one sample.
        channels: Number of audio channels.
        datasize: Size of the audio payload in bytes.
        formattype: WAVE format code written to the "fmt " chunk (1 is PCM).

    Returns:
        The header as bytes; all integers are stored little-endian.
    """
    def little_endian(value, width):
        return value.to_bytes(width, 'little')

    byte_rate = sampleRate * channels * bitsPerSample // 8
    block_align = channels * bitsPerSample // 8

    header_fields = (
        b"RIFF",                            # marks the file as RIFF
        little_endian(datasize + 36, 4),    # file size excluding this field and the RIFF marker
        b"WAVE",                            # file type
        b"fmt ",                            # format chunk marker
        little_endian(16, 4),               # length of the format data that follows
        little_endian(formattype, 2),       # format type
        little_endian(channels, 2),         # channel count
        little_endian(sampleRate, 4),       # sample rate
        little_endian(byte_rate, 4),        # bytes of audio per second
        little_endian(block_align, 2),      # bytes per sample frame (all channels)
        little_endian(bitsPerSample, 2),    # bits per sample
        b"data",                            # data chunk marker
        little_endian(datasize, 4),         # data size in bytes
    )
    return b"".join(header_fields)

def send_http_request_with_json_response(url, headers, data, method):
    """Send one HTTP request and return the response body parsed as JSON.

    Args:
        url: Target URL.
        headers: Dict of request headers.
        data: Request body as bytes, or None.
        method: HTTP method name, for example "GET" or "POST".

    Returns:
        The object produced by json.loads on the UTF-8 decoded response body.
    """
    http_request = urllib.request.Request(url=url, headers=headers, data=data, method=method)

    with urllib.request.urlopen(http_request) as http_response:
        body_text = http_response.read().decode('utf-8')

    return json.loads(body_text)

def get_oauth2_token():
    """Return a valid OAuth 2.0 access token, fetching a new one when needed.

    The token is cached in module globals and reused until it is within
    oauth_token_expiry_threshhold_seconds of expiring. All cache access
    happens while holding oauth_mutex.

    Returns:
        The access token string, or None when no --oauthURL was supplied.

    Raises:
        Exception: If the token endpoint answers with an HTTP error status.
    """
    global oauth_token
    global oauth_token_expiry_seconds

    if args.oauthURL is None:
        return None

    with oauth_mutex:
        # Read the clock only after the lock is held: a thread that waited here
        # while another thread refreshed the token must not compare against a stale time.
        current_time = time.monotonic()
        if oauth_token and oauth_token_expiry_seconds - oauth_token_expiry_threshhold_seconds > current_time:
            log.debug('OAuth token is still valid')
            return oauth_token

        log.info("Obtaining auth token (Client ID: {}, URL: {})".format(args.clientID, args.oauthURL))

        # Client credentials grant: the ID and secret go in an HTTP Basic Authorization header.
        encoded_credentials = base64.standard_b64encode(
            "{}:{}".format(args.clientID, args.clientSecret).encode()).decode('utf-8')
        headers = {'Authorization': "Basic {}".format(encoded_credentials)}
        form_fields = {
            'grant_type': 'client_credentials',
            'scope': args.oauthScope,
        }
        request = urllib.request.Request(
            url=args.oauthURL, headers=headers,
            data=urllib.parse.urlencode(form_fields).encode(), method='POST')

        try:
            with urllib.request.urlopen(request) as response:
                json_response = json.loads(response.read().decode('utf-8'))
        except urllib.error.HTTPError as err:
            # Chain the original error so the traceback keeps the HTTP details.
            raise Exception("Failed to obtain authentication token. Status: {}, Error: {}".format(
                err.code, err.read().decode())) from err

        oauth_token = json_response["access_token"]
        oauth_token_expiry_seconds = time.monotonic() + json_response["expires_in"]

        log.debug("Token TTL: %d" % json_response["expires_in"])
        return oauth_token


def send_get_voices_request(grpc_client, request, get_all_voices=False):
    """Send a GetVoices request over gRPC or HTTP and log the response.

    Args:
        grpc_client: Synthesizer stub used when the request goes over gRPC.
        request: The GetVoicesRequest message to send.
        get_all_voices: When true, or when --getAllVoices was given, the
            GetAllVoices RPC is called instead of GetVoices (gRPC only).
    """
    log.info("Sending GetVoices request")

    client_span = None
    get_voices_span = None
    metadata = []
    http_headers = {}

    try:
        if args.jaeger:
            log.debug("Injecting Jaeger span context into request")
            client_span = tracer.start_span("Client.gRPC")
            get_voices_span = tracer.start_span(
                "Client.GetVoices", child_of=client_span)
            carrier = dict()
            tracer.inject(get_voices_span.context,
                          opentracing.propagation.Format.TEXT_MAP, carrier)
            metadata.append(('uber-trace-id', carrier['uber-trace-id']))

        # Request headers are mirrored into http_headers so the HTTP path sends
        # the same routing information as the gRPC path, as send_synthesis_request does.
        if args.clientRequestID:
            metadata.append(('x-client-request-id', args.clientRequestID))
            http_headers['x-client-request-id'] = args.clientRequestID

        if args.neural:
            log.info("Adding x-nuance-tts-neural header")
            metadata.append(('x-nuance-tts-neural', 'true'))
            http_headers['x-nuance-tts-neural'] = 'true'

        if args.sendHTTP:
            current_oauth_token = get_oauth2_token()
            if current_oauth_token:
                http_headers['Authorization'] = 'Bearer {}'.format(current_oauth_token)

            request_body = json_format.MessageToJson(request).encode()
            json_response = send_http_request_with_json_response(
                url=args.serverUrl + "/api/v1/voices/", data=request_body,
                headers=http_headers, method="GET")
            log.info(json.dumps(json_response, sort_keys=True,
                                indent=2, separators=(',', ': ')))
        else:
            # Honor the explicit parameter as well as the command-line switch.
            if get_all_voices or args.getAllVoices:
                get_voices_rpc = grpc_client.GetAllVoices
            else:
                get_voices_rpc = grpc_client.GetVoices

            response, call = get_voices_rpc.with_call(
                request=request, timeout=args.timeoutSeconds, metadata=metadata)
            log.info(text_format.MessageToString(response))
            for key, value in call.trailing_metadata():
                log.info('Synthesis client received trailing metadata: key=%s value=%s' % (key, value))
    finally:
        # Finish the spans even when the request raised, so they are still reported.
        if get_voices_span:
            get_voices_span.finish()
        if client_span:
            client_span.finish()



def process_synthesis_response(request, response, start, synthesis_span, client_span, return_package, request_info):
    """Log one synthesis response and save any audio it carries.

    Args:
        request: The SynthesisRequest that produced this response.
        response: A streamed or unary response message, or, with --sendHTTP,
            the JSON response already parsed into a dict.
        start: time.monotonic() value taken just before the request was sent.
        synthesis_span: Tracing span; accepted but not used here.
        client_span: Tracing span; accepted but not used here.
        return_package: Mutable per-request state with the keys
            "received_first_audio_chunk", "num_chunks", "currentaudiolen",
            "audio_file" and "audio_file_name".
        request_info: Dict with the keys "sampleRate", "bitsPerSample",
            "channels", "audioformat", "extension" and "num_iterations".

    Returns:
        The updated return_package, or None when a unary or HTTP response
        reports a status code other than 200.
    """
    global total_first_chunk_latency

    single_response = args.sendUnary or args.sendHTTP

    if args.sendHTTP:
        # A response without audio (for example an error) may omit the "audio" key,
        # so read it defensively instead of failing before the status is checked.
        encoded_audio = response.get("audio")
        response = json_format.Parse(json.dumps(response), UnarySynthesisResponse())
        if encoded_audio is not None:
            response.audio = base64.b64decode(encoded_audio)

    if single_response and response.status.code != 200:
        if response.HasField("events"):
            log.info("Received events")
            log.info(text_format.MessageToString(response.events))
        log.error("Received status response: FAILED")
        log.error("Code: {}, Message: {}".format(
            response.status.code, response.status.message))
        log.error('Error: {}'.format(response.status.details))
        return None

    if single_response or response.HasField("audio"):
        log.info("Received audio: %d bytes" % len(response.audio))
        if not return_package["received_first_audio_chunk"]:
            return_package["received_first_audio_chunk"] = True
            latency = time.monotonic() - start
            log.info("First chunk latency: {} seconds".format(latency))
            total_first_chunk_latency = total_first_chunk_latency + latency
            log.info("Average first-chunk latency (over {} synthesis requests): {} seconds".format(
                total_synthesis, total_first_chunk_latency/(total_synthesis)))

        audio_format = request.audio_params.audio_format
        audio_file = return_package["audio_file"]

        if args.saveAudio:
            if args.saveAudioAsWav:
                if audio_format.HasField("ogg_opus") or audio_format.HasField("opus"):
                    log.warning(
                        "Cannot save wave format for Opus, ignoring")
                else:
                    # The total length is only known as chunks arrive, so the header
                    # at the start of the file is rewritten with the new size each time.
                    return_package["currentaudiolen"] += len(response.audio)
                    waveheader = generate_wav_header(
                        request_info["sampleRate"], request_info["bitsPerSample"], request_info["channels"],
                        return_package["currentaudiolen"], request_info["audioformat"])
                    if audio_file is not None:
                        audio_file.seek(0, 0)
                        audio_file.write(waveheader)
                        audio_file.seek(0, 2)
            if audio_file is not None:
                audio_file.write(response.audio)

        if args.saveAudioChunks:
            if audio_format.HasField("ogg_opus"):
                log.warning(
                    "Cannot save separate audio chunks for Ogg Opus, ignoring")
            else:
                return_package["num_chunks"] = return_package["num_chunks"] + 1
                chunk_file_name = "%s_i%d_s%d_c%d.%s" % (
                    thread_context.file, request_info["num_iterations"], thread_context.num_synthesis,
                    return_package["num_chunks"], request_info["extension"])
                chunk_file_name = os.path.join(args.audioDir, chunk_file_name)
                # The context manager closes the chunk file even if a write fails.
                with open(chunk_file_name, "wb") as chunk_audio_file:
                    if args.saveAudioAsWav:
                        if audio_format.HasField("opus"):
                            log.warning(
                                "Cannot save audio chunks as wav for Opus, ignoring")
                        else:
                            # Each chunk file gets its own header sized for that chunk alone.
                            waveheader = generate_wav_header(
                                request_info["sampleRate"], request_info["bitsPerSample"], request_info["channels"],
                                len(response.audio), request_info["audioformat"])
                            chunk_audio_file.write(waveheader)
                    chunk_audio_file.write(response.audio)
                log.info("Wrote audio chunk to %s" % chunk_file_name)

    if response.HasField("events"):
        log.info("Received events")
        log.info(text_format.MessageToString(response.events))

    if response.HasField("status"):
        if response.status.code == 200:
            log.info("Received status response: SUCCESS")
        else:
            log.error("Received status response: FAILED")
            log.error("Code: {}, Message: {}".format(
                response.status.code, response.status.message))
            log.error('Error: {}'.format(response.status.details))
    return return_package


def send_synthesis_request(grpc_client, request, num_iterations, metadata=None):
    """Send one synthesis request and save the returned audio when requested.

    The request goes out as a unary gRPC call (--sendUnary), through the HTTP
    API (--sendHTTP), or as a streaming gRPC call (the default). Responses are
    handed to process_synthesis_response.

    Args:
        grpc_client: Synthesizer stub used for the gRPC paths.
        request: The SynthesisRequest message to send.
        num_iterations: Current iteration number; used in output file names.
        metadata: Optional list of (key, value) pairs to send as gRPC metadata
            in addition to the headers this function adds.
    """
    global total_synthesis
    total_synthesis = total_synthesis + 1

    audio_file = None
    audio_file_name = ""
    extension = ""
    sampleRate = 0
    bitsPerSample = 0
    channels = 1
    audioformat = 0
    # Copy the caller's metadata so the headers appended below never modify the caller's list.
    metadata = list(metadata) if metadata else []
    http_headers = {}
    client_span = None
    synthesis_span = None
    completed = False

    thread_context.num_synthesis = thread_context.num_synthesis + 1

    audio_format = request.audio_params.audio_format

    if args.saveAudio or args.saveAudioChunks:
        # Choose the file extension and the values needed for a WAV header.
        if audio_format.HasField("pcm"):
            extension = "pcm"
            sampleRate = audio_format.pcm.sample_rate_hz
            bitsPerSample = 16
            audioformat = 1
        elif audio_format.HasField("alaw"):
            extension = "alaw"
            bitsPerSample = 8
            sampleRate = 8000
            audioformat = 6
        elif audio_format.HasField("ulaw"):
            extension = "ulaw"
            bitsPerSample = 8
            sampleRate = 8000
            audioformat = 7
        elif audio_format.HasField("ogg_opus"):
            extension = "ogg"
        elif audio_format.HasField("opus"):
            extension = "opus"
        else:
            # No audio format set in the request: fall back to 22050 Hz 16-bit PCM values.
            extension = "pcm"
            sampleRate = 22050
            bitsPerSample = 16
            audioformat = 1

        if args.saveAudioAsWav:
            if audio_format.HasField("ogg_opus") or audio_format.HasField("opus"):
                log.warning("Cannot set to wav format for Ogg Opus, ignoring")
            else:
                extension = "wav"

    try:
        if args.saveAudio:
            if audio_format.HasField("opus"):
                log.warning("Cannot save whole audio for Opus, ignoring")
            else:
                audio_file_name = "%s_i%d_s%d.%s" % (
                    thread_context.file, num_iterations, thread_context.num_synthesis, extension)
                audio_file_name = os.path.join(args.audioDir, audio_file_name)
                audio_file = open(audio_file_name, "wb")

        if args.appid:
            metadata.append(('x-nuance-client-id', args.appid))
            http_headers['x-nuance-client-id'] = args.appid

        if args.neural:
            log.info("Adding x-nuance-tts-neural header")
            metadata.append(('x-nuance-tts-neural', 'true'))
            http_headers['x-nuance-tts-neural'] = 'true'

        if args.clientRequestID:
            metadata.append(('x-client-request-id', args.clientRequestID))
            http_headers['x-client-request-id'] = args.clientRequestID

        if args.jaeger:
            log.debug("Injecting Jaeger span context into request")
            client_span = tracer.start_span("Client.gRPC")
            if args.sendUnary or args.sendHTTP:
                synthesis_span = tracer.start_span(
                    "Client.UnarySynthesize", child_of=client_span)
            else:
                synthesis_span = tracer.start_span(
                    "Client.Synthesize", child_of=client_span)
            carrier = dict()
            tracer.inject(synthesis_span.context,
                          opentracing.propagation.Format.TEXT_MAP, carrier)
            metadata.append(('uber-trace-id', carrier['uber-trace-id']))

        request_info = {"sampleRate": sampleRate, "bitsPerSample": bitsPerSample, "channels": channels,
                        "audioformat": audioformat, "extension": extension, "num_iterations": num_iterations}
        return_package = {"received_first_audio_chunk": False, "num_chunks": 0, "currentaudiolen": 0,
                          "audio_file": audio_file, "audio_file_name": audio_file_name}

        start = time.monotonic()

        if args.sendUnary:
            # Log before the blocking call so the message marks when the request is sent.
            log.info("Sending Unary Synthesis request")
            response, call = grpc_client.UnarySynthesize.with_call(
                request=request, timeout=args.timeoutSeconds, metadata=metadata)
            return_package = process_synthesis_response(
                request, response, start, synthesis_span, client_span, return_package, request_info)
            for key, value in call.trailing_metadata():
                log.info('Synthesis client received trailing metadata: key=%s value=%s' % (key, value))
        elif args.sendHTTP:
            current_oauth_token = get_oauth2_token()
            if current_oauth_token:
                http_headers['Authorization'] = 'Bearer {}'.format(current_oauth_token)

            log.info("Sending HTTP Synthesis request")
            json_response = send_http_request_with_json_response(
                url=args.serverUrl + "/api/v1/synthesize/",
                data=json_format.MessageToJson(request).encode(), headers=http_headers, method="POST")
            if json_response:
                return_package = process_synthesis_response(
                    request, json_response, start, synthesis_span, client_span, return_package, request_info)
            else:
                log.error("Failed to get response from server!")
        else:
            log.info("Sending Synthesis request")
            responses = grpc_client.Synthesize(request=request, timeout=args.timeoutSeconds, metadata=metadata)
            for response in responses:
                return_package = process_synthesis_response(
                    request, response, start, synthesis_span, client_span, return_package, request_info)
            for key, value in responses.trailing_metadata():
                log.info('Synthesis client received trailing metadata: key=%s value=%s' % (key, value))

        # process_synthesis_response returns None for a failed unary/HTTP response.
        completed = bool(return_package)
    finally:
        # Close the file through the local handle: it must be released even when
        # the call raised or the response handler returned None.
        if audio_file is not None:
            audio_file.close()
            if completed:
                log.info("Wrote audio to %s" % audio_file_name)

        if synthesis_span:
            synthesis_span.finish()
        if client_span:
            client_span.finish()


def parse_args(argv=None):
    """Parse the command line into the module-level args namespace.

    Args:
        argv: Optional list of argument strings. When None, argparse reads
            the process command line.

    Returns:
        The parsed argparse.Namespace, which is also stored in the global args.
    """
    global args
    parser = argparse.ArgumentParser(
        prog="client.py",
        usage="%(prog)s [-options]",
        add_help=False,
        formatter_class=lambda prog: argparse.HelpFormatter(
            prog, max_help_position=45, width=100)
    )

    options = parser.add_argument_group("options")
    options.add_argument("-h", "--help", action="help",
                         help="Show this help message and exit")
    options.add_argument("--appid", metavar="appID:client-id", nargs="?", help="Client ID or group name, prefixed with appID:")
    # Hidden option: a ready-made access token used instead of the OAuth flow.
    options.add_argument("--token", nargs="?", help=argparse.SUPPRESS)
    options.add_argument("-f", "--files", metavar="file", nargs="+",
                         help="List of flow files to execute sequentially, default=['flow.py']", default=['flow.py'])
    options.add_argument("-p", "--parallel", action="store_true",
                         help="Run each flow in a separate thread")
    options.add_argument("-i", "--iterations", metavar="num", nargs="?",
                         help="Number of times to run the list of files, default=1", default=1, type=int)
    options.add_argument("--infinite", action="store_true",
                         help="Run all files infinitely (overrides number of iterations)")
    options.add_argument("-t", "--timeoutSeconds", metavar="num", nargs="?",
                         help="Timeout in seconds for every RPC call, default=30", default=30, type=int)
    options.add_argument("-s", "--serverUrl", metavar="url", nargs="?",
                         help="NVC server URL, default=localhost:8080", default='localhost:8080')
    options.add_argument("--oauthURL", metavar="url", nargs="?",
                         help="OAuth 2.0 URL")
    options.add_argument("--clientRequestID", metavar="id", nargs="?",
                         help="Client-generated request ID")
    # These three values are not URLs, so their metavars name what they actually hold.
    options.add_argument("--clientID", metavar="id", nargs="?",
                         help="OAuth 2.0 Client ID")
    options.add_argument("--clientSecret", metavar="secret", nargs="?",
                         help="OAuth 2.0 Client Secret")
    options.add_argument("--oauthScope", metavar="scope", nargs="?",
                         help="OAuth 2.0 Scope, default=tts", default='tts')
    options.add_argument("--secure", action="store_true",
                         help="Connect to the server using a secure gRPC channel")
    options.add_argument("--rootCerts",  metavar="file", nargs="?",
                         help="Root certificates when using a secure gRPC channel")
    options.add_argument("--privateKey",  metavar="file", nargs="?",
                         help="Certificate private key when using a secure gRPC channel")
    options.add_argument("--certChain",  metavar="file", nargs="?",
                         help="Certificate chain when using a secure gRPC channel")
    options.add_argument("--audioDir", metavar="dir", nargs="?",
                         help="Audio output directory, default=./audio", default='./audio')
    options.add_argument("--saveAudio", action="store_true",
                         help="Save whole audio to disk")
    options.add_argument("--saveAudioChunks", action="store_true",
                         help="Save each individual audio chunk to disk")
    options.add_argument("--saveAudioAsWav", action="store_true",
                         help="Save each audio file in the WAVE format")
    options.add_argument("--jaeger", metavar="addr", nargs="?", const='udp://localhost:6831',
                         help="Send UDP opentrace spans, default addr=udp://localhost:6831")
    options.add_argument("--sendUnary", action="store_true",
                         help="Receive one response (UnarySynthesize) instead of a stream of responses (Synthesize)")
    options.add_argument("--sendHTTP", action="store_true",
                         help="Send the requests using the HTTP-to-gRPC API")
    options.add_argument("--maxReceiveSizeMB", metavar="megabytes", nargs="?",
                         help="Maximum length of gRPC server response in megabytes, default=50 MB", default=50, type=int)
    # Hidden option: call GetAllVoices instead of GetVoices.
    options.add_argument("--getAllVoices", action="store_true",
                         help=argparse.SUPPRESS)
    options.add_argument("--neural", action="store_true",
                         help="Send the request to Neural TTS, if available.")
    args = parser.parse_args(argv)
    return args


def initialize_tracing():
    """Set up the global Jaeger tracer when --jaeger was given; otherwise do nothing.

    The opentracing module and the tracer are stored in module globals so the
    request functions can start spans.

    Raises:
        Exception: If the agent address has no network location, hostname or port.
    """
    global opentracing
    global tracer

    if not args.jaeger:
        return

    print("Enabling Jaeger traces")
    # Imported here so the tracing packages are only required when tracing is enabled.
    import opentracing
    import jaeger_client
    from urllib.parse import urlparse

    address = args.jaeger
    parsed = urlparse(address)
    # The checks stay sequential: each attribute is only read once the previous check passed.
    if not parsed.netloc:
        raise Exception(f"invalid jaeger agent address: {address}")
    if not parsed.hostname:
        raise Exception(f"missing hostname in jaeger agent address: {address}")
    if not parsed.port:
        raise Exception(f"missing port in jaeger agent address: {address}")

    sampler_settings = {'type': 'const', 'param': 1}
    agent_settings = {
        'reporting_host': parsed.hostname,
        'reporting_port': parsed.port
    }
    settings = {
        'sampler': sampler_settings,
        'local_agent': agent_settings,
        'logging': True
    }
    jaeger_config = jaeger_client.Config(
        config=settings, service_name='NVCClient', validate=True)
    tracer = jaeger_config.initialize_tracer()


def _read_binary_file(path):
    """Return the contents of the file at path as bytes, closing the file afterwards."""
    with open(path, 'rb') as binary_file:
        return binary_file.read()


def create_channel():
    """Create the gRPC channel to args.serverUrl.

    With --secure, an SSL channel is built from the optional root
    certificates, certificate chain and private key, and the access token
    (from --token or the OAuth endpoint) is attached as call credentials when
    one is available. Without --secure, an insecure channel is created and
    no call credentials are attached.

    Returns:
        The gRPC channel.
    """
    call_credentials = None
    channel_options = [
        ('grpc.max_receive_message_length', args.maxReceiveSizeMB * 1024 * 1024)]

    if args.token:
        log.debug('Adding CallCredentials using token parameter')
        call_credentials = grpc.access_token_call_credentials(args.token)
    else:
        current_oauth_token = get_oauth2_token()
        if current_oauth_token:
            log.debug('Adding CallCredentials from OAuth endpoint')
            call_credentials = grpc.access_token_call_credentials(current_oauth_token)

    if not args.secure:
        log.debug("Creating insecure gRPC channel")
        return grpc.insecure_channel(args.serverUrl, options=channel_options)

    log.debug("Creating secure gRPC channel")
    root_certificates = None
    certificate_chain = None
    private_key = None
    # The helper closes each credential file as soon as it has been read.
    if args.rootCerts:
        log.debug("Adding root certs")
        root_certificates = _read_binary_file(args.rootCerts)
    if args.certChain:
        log.debug("Adding cert chain")
        certificate_chain = _read_binary_file(args.certChain)
    if args.privateKey:
        log.debug("Adding private key")
        private_key = _read_binary_file(args.privateKey)

    channel_credentials = grpc.ssl_channel_credentials(
        root_certificates=root_certificates, private_key=private_key, certificate_chain=certificate_chain)
    if call_credentials is not None:
        channel_credentials = grpc.composite_channel_credentials(
            channel_credentials, call_credentials)
    return grpc.secure_channel(args.serverUrl, credentials=channel_credentials, options=channel_options)


def worker_thread(file, num_iterations, list_of_requests):
    """Thread entry point: run one flow file in the calling thread."""
    run_one_file(file=file, num_iterations=num_iterations, list_of_requests=list_of_requests)

def run_one_file(file, num_iterations, list_of_requests):
    """Run every entry of one flow file over a freshly created channel.

    Args:
        file: Path of the flow file; its base name is used in output file names.
        num_iterations: Current iteration number, passed on to synthesis requests.
        list_of_requests: Entries to process in order. GetVoicesRequest and
            SynthesisRequest messages are sent; a number is a pause in seconds;
            anything else is ignored.
    """
    # Reset the per-thread synthesis counter before the channel is created.
    thread_context.num_synthesis = 0

    with create_channel() as channel:
        grpc_client = SynthesizerStub(channel=channel)
        log.info("Running file [%s]" % file)
        log.debug(list_of_requests)

        thread_context.file = os.path.basename(file)

        for entry in list_of_requests:
            if isinstance(entry, GetVoicesRequest):
                send_get_voices_request(grpc_client, entry)
                continue
            if isinstance(entry, SynthesisRequest):
                send_synthesis_request(grpc_client, entry, num_iterations)
                continue
            if isinstance(entry, (int, float)):
                log.info("Waiting for {} seconds".format(entry))
                time.sleep(entry)

        log.info("Done running file [%s]" % file)


def run():
    """Run the client: parse arguments, then execute every flow file.

    Each flow file is loaded as a Python module and must define
    list_of_requests. Files run one after another, or each in its own thread
    with --parallel, for the requested number of iterations.

    Raises:
        Exception: If a flow file does not define list_of_requests.
    """
    parse_args()

    log_level = logging.DEBUG
    global log
    log = logging.getLogger('')
    logging.basicConfig(
        format='%(asctime)s (%(thread)d) %(levelname)-5s %(message)s', level=log_level)

    if args.oauthURL:
        if args.clientID is None:
            log.error("OAuth 2.0 URL was supplied but client ID is missing")
            return
        elif args.clientSecret is None:
            log.error("OAuth 2.0 URL was supplied but client secret is missing")
            return

    initialize_tracing()
    get_oauth2_token()

    if args.saveAudio or args.saveAudioChunks:
        if not os.path.exists(args.audioDir):
            log.info("Audio directory: {}".format(args.audioDir))
        # makedirs creates missing parent directories, and exist_ok avoids failing
        # when the directory appears between the check above and this call.
        os.makedirs(args.audioDir, exist_ok=True)

    if args.infinite:
        log.info("Setting iterations to infinity")
        # An iteration count that is never reached in practice.
        args.iterations = 100**100

    for i in range(args.iterations):
        num_iterations = i + 1
        log.info("Iteration #{} out of {}".format(num_iterations, args.iterations))
        threads = []
        for file in args.files:
            absolute_path = os.path.abspath(file)
            module_name = os.path.splitext(absolute_path)[0]
            module = SourceFileLoader(module_name, absolute_path).load_module()
            # getattr with a default is required here: plain attribute access raises
            # AttributeError for a missing variable and would bypass this message.
            list_of_requests = getattr(module, "list_of_requests", None)
            if list_of_requests is None:
                raise Exception(
                    "Error importing [%s]: variable list_of_requests not defined" % file)
            if args.parallel:
                log.info("Running flows in parallel")
                thread = threading.Thread(target=worker_thread, args=[file, num_iterations, list_of_requests])
                threads.append(thread)
                thread.start()
            else:
                run_one_file(file, num_iterations, list_of_requests)
        for thread in threads:
            thread.join()
        log.info("Iteration #{} complete".format(num_iterations))

    if total_synthesis > 0:
        log.info("Average first-chunk latency (over {} synthesis requests): {} seconds".format(
            total_synthesis, total_first_chunk_latency/(total_synthesis)))

    if args.jaeger:
        tracer.close()
        # Need to give time to tracer to flush the spans: https://github.com/jaegertracing/jaeger-client-python/issues/50
        time.sleep(2)
    log.info("Done")


# Run the client only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    run()

These are the resulting client files, in the same directory as the nuance directory.

├── client.py
├── flow.py
├── flow-multi.py
├── run-client.sh
├── run-client.bat
└── nuance
    └── tts
        └── v1 
            ├── synthesizer_pb2_grpc.py
            ├── synthesizer_pb2.py
            └── synthesizer.proto

You can use the client to check for available voices and/or request synthesis. Here are a few scenarios you can try.

Get help

For a quick check that the client is working, and to see the arguments it accepts, run it on Linux or Windows using the help (-h or --help) option.

See the results below and notice:

-s or --serverUrl: The URL of the service. The sample run script specifies the Mix service, tts.api.nuance.com, on its default port, 443.
Authorization: Include --oauthURL, --clientID, and --clientSecret. Alternatively, use the (hidden) --token argument. See Authorize.
--neural: Include this argument to send the request to Neural TTSaaS. The client adds the x-nuance-tts-neural header as it calls the service, which directs the request to Neural TTSaaS instead of TTSaaS.
-f or --files: The name of the input file to use for the request. The default is flow.py.

The results are the same on Linux and Windows:

python3 client.py --help 

usage: client.py [-options]

options:
  -h, --help                       Show this help message and exit
  --appid [appID:client-id]        Not used
  -f file [file ...], --files file [file ...] List of flow files to execute sequentially,
                                              default=['flow.py']
  -p, --parallel                   Run each flow in a separate thread
  -i [num], --iterations [num]     Number of times to run the list of files, default=1
  --infinite                       Run all files infinitely (overrides number of
                                   iterations)
  -t [num], --timeoutSeconds [num] Timeout in seconds for every RPC call, default=30
  -s [url], --serverUrl [url]      NVC server URL, default=localhost:8080
  --oauthURL [url]                 OAuth 2.0 URL
  --clientRequestID [id]           Client-generated request ID
  --clientID [url]                 OAuth 2.0 Client ID
  --clientSecret [url]             OAuth 2.0 Client Secret
  --oauthScope [url]               OAuth 2.0 Scope, default=tts
  --secure                         Connect to the server using a secure gRPC channel
  --rootCerts [file]               Not used
  --privateKey [file]              Not used
  --certChain [file]               Not used
  --audioDir [dir]                 Audio output directory, default=./audio
  --saveAudio                      Save whole audio to disk
  --saveAudioChunks                Save each individual audio chunk to disk
  --saveAudioAsWav                 Save each audio file in the WAVE format
  --jaeger [addr]                  Not used
  --sendUnary                      Not used
  --sendHTTP                       Not used
  --maxReceiveSizeMB [megabytes]   Maximum length of gRPC server response in megabytes,
                                   default=50 MB
  --neural                         Send the request to Neural TTS, if available.

Input files

The sample client includes two input files, flow.py and flow-multi.py. These files provide an easy way to customize the client without editing the main client.py file.

You’ll learn more about these input files in the following sections.

Synthesize text input

In this first scenario, use the default input file to ask Neural TTSaaS to synthesize a text string using SynthesisRequest and save the resulting audio in a wave file.

Edit the run script, run-client.sh or run-client.bat, to add your Mix client ID and secret. (See Authorize for details.)

Linux: run-client.sh
Windows: run-client.bat

#!/bin/bash

# Replace the placeholders with your own values. Keep the quotes so that
# special characters in the ID or secret reach the client unchanged.
CLIENT_ID="<Mix client ID, starting with appID:>"
SECRET="<Mix client secret>"
# Change colons (:) to %3A in client ID
CLIENT_ID=${CLIENT_ID//:/%3A}

python3 client.py --oauthURL https://auth.crt.nuance.com/oauth2/token \
--clientID "$CLIENT_ID" --clientSecret "$SECRET" \
--secure --serverUrl tts.api.nuance.com:443 --neural --saveAudio --saveAudioAsWav

 @echo off
setlocal enabledelayedexpansion

rem Replace the placeholders with your own values. The quoted form of set
rem keeps special characters in the value literal.
set "CLIENT_ID=<Mix client ID, starting with appID:>"
set "SECRET=<Mix client secret>"
rem Change colons (:) to %3A in client ID
set CLIENT_ID=!CLIENT_ID::=%%3A!

python client.py --oauthURL https://auth.crt.nuance.com/oauth2/token ^
--clientID %CLIENT_ID% --clientSecret %SECRET% ^
--secure --serverUrl tts.api.nuance.com:443 --neural --saveAudio --saveAudioAsWav

Notice the --neural argument. This adds the x-nuance-tts-neural header, which directs the request to Neural TTSaaS instead of TTSaaS.

Also notice the --saveAudio and --saveAudioAsWav arguments. These save the synthesized result as a wave file. There is no need to include the --files argument since flow.py is the default input filename.

Open the input file, flow.py, and notice the two sections:

# GetVoices request asks for information about the JennyNeural voice.
# Synthesis request requests the same voice and provides input text to synthesize.

from nuance.tts.v1.synthesizer_pb2 import *

list_of_requests = []

# GetVoices request
request = GetVoicesRequest()
request.voice.name = "en-US-JennyNeural" 
#request.voice.language = "en-US"
#request.voice.gender = EnumGender.FEMALE

# Add request to list
list_of_requests.append(request)

# ---

# Synthesis request
request = SynthesisRequest()

request.voice.name = "en-US-JennyNeural" 

pcm = PCM(sample_rate_hz=16000)
request.audio_params.audio_format.pcm.CopyFrom(pcm)

request.input.text.text = "This is a test, a very simple test."

#request.input.ssml.text = . . .

#request.user_id = "MyApplicationUser"

#request.client_data["company"] = "My Company"
#request.client_data["user"] = "My User Name"   

# Add request to list
list_of_requests.append(request)

# ---

Run the client using the script or batch file:
- Linux
- Windows
```
./run-client.sh
```
```
run-client.bat
```

The client first sends a GetVoicesRequest, which returns information about the JennyNeural voice.

It then sends a SynthesisRequest to turn the text into speech using the same voice, and creates a file named flow.py_i1_s1.wav in the default --audioDir location, ./audio. The WAV file contains the voice of Jenny saying “This is a test, a very simple test.”

These are the results. Some lines are omitted for brevity.

2023-10-26 16:48:16,111 (139817866266432) INFO  Obtaining auth token 
2023-10-26 16:48:16,476 (139817866266432) DEBUG Creating secure gRPC channel
2023-10-26 16:48:16,483 (139817866266432) INFO  Running file [flow.py]
2023-10-26 16:48:16,483 (139817866266432) DEBUG [voice {
  name: "en-US-JennyNeural"
}
, voice {
  name: "en-US-JennyNeural"
}
input {
  text {
    text: "This is a test, a very simple test."
  }
}
]
2023-10-26 16:48:16,483 (139817866266432) INFO  Sending GetVoices request
2023-10-26 16:48:16,483 (139817866266432) INFO  Adding x-nuance-tts-neural header
2023-10-26 16:48:16,615 (139817866266432) INFO  voices {
  name: "en-US-JennyNeural"
  model: "neural"
  language: "en-US"
  gender: FEMALE
  sample_rate_hz: 24000
  styles: "assistant"
  styles: "chat"
  styles: "customerservice"
  styles: "newscast"
  styles: "angry"
  styles: "cheerful"
  styles: "sad"
  styles: "excited"
  styles: "friendly"
  styles: "terrified"
  styles: "shouting"
  styles: "unfriendly"
  styles: "whispering"
  styles: "hopeful"
}

2023-10-26 16:48:16,616 (139817866266432) INFO  Adding x-nuance-tts-neural header
2023-10-26 16:48:16,616 (139817866266432) INFO  Sending Synthesis request
2023-10-26 16:48:16,897 (139817866266432) INFO  Received audio: 62842 bytes
2023-10-26 16:48:16,897 (139817866266432) INFO  Received audio: 30870 bytes
2023-10-26 16:48:16,898 (139817866266432) INFO  Received audio: 66 bytes
2023-10-26 16:48:16,898 (139817866266432) INFO  Received status response: SUCCESS
2023-10-26 16:48:16,899 (139817866266432) INFO  Wrote audio to ./audio/flow.py_i1_s1.wav
2023-10-26 16:48:16,899 (139817866266432) INFO  Done running file [flow.py]
2023-10-26 16:48:16,900 (139817866266432) INFO  Done

Warning:

The file created by the client, flow.py_i1_s1.wav, is overwritten every time you run the client. If you want to keep the file, rename it, for example, to jenny-simple.wav.

Change text and voice

Optionally change the voice and the input text in the synthesis request, and rerun the client. (To learn what other voices are available, see Get voices below.) For example:

# Synthesis request

request.voice.name = "en-US-ChristopherNeural"

request.input.text.text = "Your coffee will be ready in 5 minutes."

Include metadata

You may include metadata that will appear in event logs. Uncomment the following lines in the sample flow.py file and add your own values for user_id and one or more client_data key-value pairs:

request.user_id = "MyApplicationUser"

request.client_data["company"] = "My Company"
request.client_data["user"] = "My User Name"

The information is shown in the results:

2023-10-26 16:52:59,572 (140255182530368) DEBUG [voice {
  name: "en-US-JennyNeural"
}
, voice {
  name: "en-US-JennyNeural"
}
input {
  text {
    text: "This is a test, a very simple test."
  }
}
client_data {
  key: "company"
  value: "My Company"
}
client_data {
  key: "user"
  value: "My User Name"
}
user_id: "MyApplicationUser"
]

Synthesize SSML input

You may provide SSML input instead of plain text.

Edit flow.py to disable the request.input.text.text line and enable request.input.ssml.text.

Optionally remove the enclosing <speak> </speak> element in the SSML as Neural TTSaaS will add it automatically.

from nuance.tts.v1.synthesizer_pb2 import *

list_of_requests = []

# Synthesis request
request = SynthesisRequest()

request.voice.name = "en-US-JennyNeural"

pcm = PCM(sample_rate_hz=16000)
request.audio_params.audio_format.pcm.CopyFrom(pcm)

#request.input.text.text = "This is a test, a very simple test."

request.input.ssml.text = '''<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
<voice name="en-US-JennyNeural">Hello, it's Jenny.</voice>
<voice name="en-US-AriaNeural">Hi, it's Aria.</voice>
</speak>'''

# Add request to list
list_of_requests.append(request)

Run the client as before.
- Linux
- Windows
```
./run-client.sh
```
```
run-client.bat
```

The client sends a SynthesisRequest to turn the SSML text into speech. It creates a file named flow.py_i1_s1.wav containing the speech: Jenny saying “Hello, it’s Jenny,” followed by Aria saying “Hi, it’s Aria.”

These are the results. (Some lines are omitted for brevity.)

2022-12-13 09:45:07,272 (140618171987776) INFO  Obtaining auth token
2022-12-13 09:45:07,642 (140618171987776) DEBUG Creating secure gRPC channel
2022-12-13 09:45:07,649 (140618171987776) INFO  Running file [flow.py]
2022-12-13 09:45:07,649 (140618171987776) DEBUG [voice {
  name: "en-US-JennyNeural"
}
, voice {
  name: "en-US-JennyNeural"
}
audio_params {
  audio_format {
    pcm {
      sample_rate_hz: 16000
    }
  }
}
input {
  ssml {
    text: "<speak version=\"1.0\" xmlns=\"http://www.w3.org/2001/10/synthesis\" xml:lang=\"en-US\">\n<voice name=\"en-US-JennyNeural\">Hello, it\'s Jenny.</voice>\n<voice name=\"en-US-AriaNeural\">Hi, it\'s Aria.</voice>\n</speak>"
  }
}
]
2022-12-13 09:45:07,649 (140618171987776) INFO  Sending GetVoices request
2022-12-13 09:45:07,649 (140618171987776) INFO  Adding x-nuance-tts-neural header
2022-12-13 09:45:08,049 (140618171987776) INFO  voices {
  name: "en-US-JennyNeural"
  ...
}

2022-12-13 09:45:08,050 (140618171987776) INFO  Adding x-nuance-tts-neural header
2022-12-13 09:45:08,050 (140618171987776) INFO  Sending Synthesis request
2022-12-13 09:45:08,373 (140618171987776) INFO  Received audio: 34358 bytes
2022-12-13 09:45:08,400 (140618171987776) INFO  Received audio: 25642 bytes
2022-12-13 09:45:08,467 (140618171987776) INFO  Received audio: 34358 bytes
2022-12-13 09:45:08,468 (140618171987776) INFO  Received audio: 24842 bytes
2022-12-13 09:45:08,469 (140618171987776) INFO  Received status response: SUCCESS
2022-12-13 09:45:08,470 (140618171987776) INFO  Wrote audio to ./audio/flow.py_i1_s1.wav
2022-12-13 09:45:08,470 (140618171987776) INFO  Done running file [flow.py]
2022-12-13 09:45:08,471 (140618171987776) INFO  Done

For more SSML examples, including how to add lexicons and prerecorded audio, see Reference topics: Input to synthesize and SSML elements.

Get voices

When you ask Neural TTSaaS to synthesize text, you must specify a named voice. To learn which voices are available, send a GetVoicesRequest, entering your requirements in the flow.py input file.

Make sure your run script, run-client.sh or run-client.bat, contains your Mix client ID and secret. (See Authorize for details.)

Edit the input file, flow.py, to request American English female voices. This combination of options returns voices that are both American English and female. Optionally turn off synthesis for this request.

from nuance.tts.v1.synthesizer_pb2 import *

list_of_requests = []

# GetVoices request
request = GetVoicesRequest()
#request.voice.name = "en-US-JennyNeural"
request.voice.language = "en-US"           # Request American English voices
request.voice.gender = EnumGender.FEMALE   # Request female voices

# Add request to list
list_of_requests.append(request)           # Make sure voice request is enabled

# Synthesis request
... 
# Add request to list
#list_of_requests.append(request)          # Disable synthesis request

Run the client using the script or batch file:
- Linux
- Windows
```
./run-client.sh
```
```
run-client.bat
```

The results include all female American English voices available. Neural TTSaaS returns the following information for each voice:

All voices include the voice name, model (usually “neural”), language code, gender, and audio sampling rate.
Voices that support expression styles return a list of styles that you may include in SSML input. See Voice style.
The Jenny multilingual voice returns the languages other than English (“foreign_languages”) that this voice supports. See Multilingual voice.

These are the American English female voices in the results:

2022-12-13 09:50:12,489 (140290220357440) INFO  Obtaining auth token 
2022-12-13 09:50:12,769 (140290220357440) DEBUG Creating secure gRPC channel
2022-12-13 09:50:12,775 (140290220357440) INFO  Running file [flow.py]
2022-12-13 09:50:12,775 (140290220357440) DEBUG [voice {
  language: "en-US"
  gender: FEMALE
}
]
2022-12-13 09:50:12,776 (140290220357440) INFO  Sending GetVoices request
2022-12-13 09:50:12,776 (140290220357440) INFO  Adding x-nuance-tts-neural header
2022-12-13 09:50:13,223 (140290220357440) INFO  voices {
  name: "en-US-JennyNeural"
  model: "neural"
  language: "en-US"
  gender: FEMALE
  sample_rate_hz: 24000
  styles: "assistant"
  styles: "chat"
  styles: "customerservice"
  styles: "newscast"
  styles: "angry"
  styles: "cheerful"
  styles: "sad"
  styles: "excited"
  styles: "friendly"
  styles: "terrified"
  styles: "shouting"
  styles: "unfriendly"
  styles: "whispering"
  styles: "hopeful"
}
voices {
  name: "en-US-JennyMultilingualNeural"
  model: "neural"
  language: "en-US"
  gender: FEMALE
  sample_rate_hz: 24000
  foreign_languages: "de-DE"
  foreign_languages: "en-AU"
  foreign_languages: "en-CA"
  foreign_languages: "en-GB"
  foreign_languages: "es-ES"
  foreign_languages: "es-MX"
  foreign_languages: "fr-CA"
  foreign_languages: "fr-FR"
  foreign_languages: "it-IT"
  foreign_languages: "ja-JP"
  foreign_languages: "ko-KR"
  foreign_languages: "pt-BR"
  foreign_languages: "zh-CN"
}
voices {
  name: "en-US-AmberNeural"
  model: "neural"
  language: "en-US"
  gender: FEMALE
  sample_rate_hz: 24000
}
voices {
  name: "en-US-AnaNeural"
  model: "neural"
  language: "en-US"
  gender: FEMALE
  sample_rate_hz: 24000
}
voices {
  name: "en-US-AriaNeural"
  model: "neural"
  language: "en-US"
  gender: FEMALE
  sample_rate_hz: 24000
  styles: "chat"
  styles: "customerservice"
  styles: "narration-professional"
  styles: "newscast-casual"
  styles: "newscast-formal"
  styles: "cheerful"
  styles: "empathetic"
  styles: "angry"
  styles: "sad"
  styles: "excited"
  styles: "friendly"
  styles: "terrified"
  styles: "shouting"
  styles: "unfriendly"
  styles: "whispering"
  styles: "hopeful"
}
... Voices omitted here ...

2022-12-13 09:50:13,223 (140290220357440) INFO  Done running file [flow.py]
2022-12-13 09:50:13,227 (140290220357440) INFO  Done

Get more voices

You can experiment with this request by commenting and uncommenting the request.voice lines in your flow.py file. For example, uncomment only the language line to see all American English voices, or change the language to es-ES to see Spanish voices.

# GetVoices request
request = GetVoicesRequest() 
#request.voice.name = "en-US-JennyNeural"
request.voice.language = "en-US"         # Or try "es-ES", "en-GB", or "zh-CN"
#request.voice.gender = EnumGender.FEMALE

Or, to see all available voices, comment out all request.voice lines, leaving only the main GetVoicesRequest.

# GetVoices request
request = GetVoicesRequest()             # Keep only this line to see all voices
#request.voice.name = "en-US-JennyNeural"
#request.voice.language = "en-US"
#request.voice.gender = EnumGender.FEMALE

Redirect results to file

If you request a large number of voices, you may wish to save the output to a file. For example, this requests all voices and saves them to a text file.

Linux
Windows

$ ./run-client.sh &> all-voices.txt
$ ls *.txt
-rw-r--r-- 1 xxx xxx 60185 Apr 17 14:57 all-voices.txt
$ cat all-voices.txt

>run-client.bat > all-voices.txt 2>&1

>dir *.txt
2023-04-17  11:15 AM            63,498 all-voices.txt

Multiple requests

You can send multiple requests for synthesis (and/or get voices) in the same session. For efficient communication with Neural TTSaaS, all requests use the same channel and stub. This scenario sends three synthesis requests.

Use the flow-multi.py input file, which contains three synthesis requests, with a pause between each one.

from nuance.tts.v1.synthesizer_pb2 import *

list_of_requests = []

# Synthesis request
request = SynthesisRequest()         # First request
request.voice.name = "en-US-JennyNeural"
pcm = PCM(sample_rate_hz=22050)
request.audio_params.audio_format.pcm.CopyFrom(pcm)
request.input.text.text = "This is a test, a very simple test."
list_of_requests.append(request)
list_of_requests.append(2)           # Optionally pause after request

# Synthesis request
request = SynthesisRequest()         # Second request
request.voice.name = "en-US-JennyNeural"
pcm = PCM(sample_rate_hz=22050)
request.audio_params.audio_format.pcm.CopyFrom(pcm)
request.input.text.text = "Your coffee will be ready in 5 minutes."
list_of_requests.append(request)
list_of_requests.append(2)           # Optionally pause after request

# Synthesis request
request = SynthesisRequest()         # Third request
request.voice.name = "en-US-ChristopherNeural"
pcm = PCM(sample_rate_hz=22050)
request.audio_params.audio_format.pcm.CopyFrom(pcm)
request.input.text.text = "The wind was a torrent of darkness, among the gusty trees."
list_of_requests.append(request)

Edit the script or batch file to include the --files argument pointing to flow-multi.py.

...
python3 client.py --oauthURL https://auth.crt.nuance.com/oauth2/token \
--clientID $CLIENT_ID --clientSecret $SECRET \
--secure --serverUrl tts.api.nuance.com:443 --neural \
--saveAudio --saveAudioAsWav --files flow-multi.py

Run the client using the script or batch file.
- Linux
- Windows
```
./run-client.sh
```
```
run-client.bat
```

See the results below and notice the three audio files created:

flow-multi.py_i1_s1.wav: Jenny saying: “This is a test, a very simple test.”
flow-multi.py_i1_s2.wav: Jenny saying: “Your coffee will be ready in five minutes.”
flow-multi.py_i1_s3.wav: Christopher saying: “The wind was a torrent of darkness, among the gusty trees.”

These are the results from multiple synthesis requests:

2022-12-13 15:33:11,048 (139787073779520) INFO  Obtaining auth token
2022-12-13 15:33:11,449 (139787073779520) DEBUG Creating secure gRPC channel
2022-12-13 15:33:11,454 (139787073779520) INFO  Running file [flow-multi.py]
2022-12-13 15:33:11,454 (139787073779520) DEBUG [voice {
  name: "en-US-JennyNeural"
}
audio_params {
  audio_format {
    pcm {
      sample_rate_hz: 22050
    }
  }
}
input {
  text {
    text: "This is a test, a very simple test."
  }
}
, 2, voice {
  name: "en-US-JennyNeural"
}
audio_params {
  audio_format {
    pcm {
      sample_rate_hz: 22050
    }
  }
}
input {
  text {
    text: "Your coffee will be ready in 5 minutes."
  }
}
, 2, voice {
  name: "en-US-ChristopherNeural"
}
audio_params {
  audio_format {
    pcm {
      sample_rate_hz: 22050
    }
  }
}
input {
  text {
    text: "The wind was a torrent of darkness, among the gusty trees."
  }
}
]
2022-12-13 15:33:11,455 (139787073779520) INFO  Adding x-nuance-tts-neural header
2022-12-13 15:33:11,455 (139787073779520) INFO  Sending Synthesis request
2022-12-13 15:33:11,966 (139787073779520) INFO  Received audio: 55058 bytes
2022-12-13 15:33:11,992 (139787073779520) INFO  Received audio: 55126 bytes
2022-12-13 15:33:11,994 (139787073779520) INFO  Received audio: 7716 bytes
2022-12-13 15:33:11,995 (139787073779520) INFO  Received audio: 30870 bytes
2022-12-13 15:33:11,995 (139787073779520) INFO  Received audio: 66 bytes
2022-12-13 15:33:11,996 (139787073779520) INFO  Received status response: SUCCESS
2022-12-13 15:33:11,997 (139787073779520) INFO  Wrote audio to ./audio/flow-multi.py_i1_s1.wav
2022-12-13 15:33:11,997 (139787073779520) INFO  Waiting for 2 seconds

2022-12-13 15:33:14,000 (139787073779520) INFO  Adding x-nuance-tts-neural header
2022-12-13 15:33:14,000 (139787073779520) INFO  Sending Synthesis request
2022-12-13 15:33:14,378 (139787073779520) INFO  Received audio: 55058 bytes
2022-12-13 15:33:14,404 (139787073779520) INFO  Received audio: 47958 bytes
2022-12-13 15:33:14,405 (139787073779520) INFO  Received audio: 30870 bytes
2022-12-13 15:33:14,405 (139787073779520) INFO  Received audio: 66 bytes
2022-12-13 15:33:14,406 (139787073779520) INFO  Received status response: SUCCESS
2022-12-13 15:33:14,407 (139787073779520) INFO  Wrote audio to ./audio/flow-multi.py_i1_s2.wav
2022-12-13 15:33:14,407 (139787073779520) INFO  Waiting for 2 seconds

2022-12-13 15:33:16,410 (139787073779520) INFO  Adding x-nuance-tts-neural header
2022-12-13 15:33:16,410 (139787073779520) INFO  Sending Synthesis request
2022-12-13 15:33:16,905 (139787073779520) INFO  Received audio: 55058 bytes
2022-12-13 15:33:16,933 (139787073779520) INFO  Received audio: 55126 bytes
2022-12-13 15:33:16,934 (139787073779520) INFO  Received audio: 48510 bytes
2022-12-13 15:33:16,934 (139787073779520) INFO  Received audio: 30870 bytes
2022-12-13 15:33:16,935 (139787073779520) INFO  Received audio: 66 bytes
2022-12-13 15:33:16,935 (139787073779520) INFO  Received status response: SUCCESS
2022-12-13 15:33:16,936 (139787073779520) INFO  Wrote audio to ./audio/flow-multi.py_i1_s3.wav
2022-12-13 15:33:16,936 (139787073779520) INFO  Done running file [flow-multi.py]
2022-12-13 15:33:16,939 (139787073779520) INFO  Done

Feedback

Was this page helpful?

Glad to hear it! Please tell us how we can improve.

Sorry to hear that. Please tell us how we can improve.