Sample synthesis client
TTSaaS offers a fully functional Python client application that you may download and run on Linux or Windows to synthesize speech using the Synthesizer API.
Note: You may also use this client with Microsoft neural voices, as described in Neural TTSaaS > Sample synthesis client for Neural TTSaaS.
To run this client, you need:
- Python 3.6 or later.
- The generated Python stub files from gRPC setup.
- Your client ID and secret from Prerequisites from Mix.
- The Python client files: sample-synthesis-client.zip
Download this zip file and extract its files into the same directory as the nuance directory, which contains your proto files and Python stubs.
On Linux, give run-mix-client.sh execute permission with chmod +x. For example:
unzip sample-synthesis-client.zip
chmod +x run-mix-client.sh
These are the resulting client files, in the same directory as the nuance directory:
├── flow.py
├── flow-multi.py
├── client.py
├── run-mix-client.bat
├── run-mix-client.sh
└── nuance
├── rpc (RPC message files)
└── tts
├── storage (Storage files)
└── v1
├── synthesizer_pb2_grpc.py
├── synthesizer_pb2.py
└── synthesizer.proto
You can use the client to search for available voices and/or request synthesis. Here are a few scenarios you can try.
Get help
For a quick check that the client is working, and to see the arguments it accepts, run it using the help (-h or --help) option.
See the results below and notice:
- -s or --serverUrl: The URL of the service. The sample run script specifies the Mix service, tts.api.nuance.com, on its default port, 443.
- Authorization: Include --oauthURL, --clientID, and --clientSecret. Alternatively, use the hidden --token argument. See Authorize.
- --secure: Include this argument when calling TTSaaS.
- -f or --files: This points to an input file, by default flow.py. For multiple files, specify --files flow1.py flow2.py.
The results are the same on Linux and Windows:
python3 client.py --help
usage: client.py [-options]
options:
-h, --help Show this help message and exit
--appid [appID:client-id] Client ID or group name, prefixed with appID:
-f file [file ...], --files file [file ...]
List of flow files to execute sequentially,
default=['flow.py']
-p, --parallel Run each flow in a separate thread
-i [num], --iterations [num] Number of times to run the list of files, default=1
--infinite Run all files infinitely (overrides number of
iterations)
-t [num], --timeoutSeconds [num] Timeout in seconds for every RPC call, default=30
-s [url], --serverUrl [url] NVC server URL, default=localhost:8080
--oauthURL [url] OAuth 2.0 URL
--clientRequestID [id] Client-generated request ID
--clientID [url] OAuth 2.0 Client ID
--clientSecret [url] OAuth 2.0 Client Secret
--oauthScope [url] OAuth 2.0 Scope, default=tts
--secure Connect to the server using a secure gRPC channel
--rootCerts [file] Not used
--privateKey [file] Not used
--certChain [file] Not used
--audioDir [dir] Audio output directory, default=./audio
--saveAudio Save whole audio to disk
--saveAudioChunks Save each individual audio chunk to disk
--saveAudioAsWav Save each audio file in the WAVE format
--jaeger [addr] Send UDP opentrace spans, default
addr=udp://localhost:6831
--sendUnary Receive one response (UnarySynthesize) instead of a
stream of responses (Synthesize)
--sendHTTP Send the requests using the HTTP-to-gRPC API
--maxReceiveSizeMB [megabytes] Maximum length of gRPC server response in megabytes,
default=50 MB
--neural Send the request to Neural TTS, if available.
Input files
The sample client includes two input files, flow.py and flow-multi.py. These files provide an easy way to customize the client without editing the main client.py file.
You will learn more about these input files in the following sections.
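If you later want to write your own input file, the only structural requirement (described under What’s list_of_requests? below) is a global list_of_requests array. Here is a minimal sketch, using a hypothetical file name my-flow.py that you would pass to the client with --files my-flow.py:
# my-flow.py (hypothetical): build request messages and append them to the
# global list_of_requests array that client.py reads.
from nuance.tts.v1.synthesizer_pb2 import *

list_of_requests = []

request = SynthesisRequest()
request.voice.name = "Evan"
request.voice.model = "enhanced"
request.input.text.text = "Hello from a custom flow file."

list_of_requests.append(request)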
Edit run script
Edit the sample shell script or batch file to add your Mix client ID and secret. See Authorize.
#!/bin/bash
CLIENT_ID=<Mix client ID, starting with appID:>
SECRET=<Mix client secret>
#Change colons (:) to %3A in client ID
CLIENT_ID=${CLIENT_ID//:/%3A}
python3 client.py --oauthURL https://auth.crt.nuance.com/oauth2/token \
--clientID $CLIENT_ID --clientSecret $SECRET \
--secure --serverUrl tts.api.nuance.com \
--saveAudio --saveAudioAsWav
@echo off
setlocal enabledelayedexpansion
set CLIENT_ID=<Mix client ID, starting with appID:>
set SECRET=<Mix client secret>
rem Change colons (:) to %3A in client ID
set CLIENT_ID=!CLIENT_ID::=%%3A!
python client.py --oauthURL https://auth.crt.nuance.com/oauth2/token ^
--clientID %CLIENT_ID% --clientSecret %SECRET% ^
--secure --serverUrl tts.api.nuance.com ^
--saveAudio --saveAudioAsWav
Notice the --saveAudio and --saveAudioAsWav arguments. These save the synthesized audio as a wave file in the --audioDir default location, ./audio.
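The run script lets client.py obtain the access token from the --oauthURL endpoint using your client ID and secret. If you prefer to fetch a token yourself and pass it with the hidden --token argument, the sketch below shows one way to do it; it assumes the standard OAuth 2.0 client_credentials grant and the third-party requests library, neither of which is part of the sample client.
import requests
from urllib.parse import quote

CLIENT_ID = "<Mix client ID, starting with appID:>"
SECRET = "<Mix client secret>"

# URL-encode the credentials (the same thing the run script's %3A substitution does),
# then request a token with the client_credentials grant and the tts scope.
resp = requests.post(
    "https://auth.crt.nuance.com/oauth2/token",
    auth=(quote(CLIENT_ID, safe=""), quote(SECRET, safe="")),
    data={"grant_type": "client_credentials", "scope": "tts"},
)
resp.raise_for_status()
token = resp.json()["access_token"]
print(token)  # pass this value to client.py with --token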
Synthesize text input
In this first scenario, use the default input file to synthesize a text string using SynthesisRequest and save the audio in a wave file.
- Open the input file, flow.py, and notice two sections. For this exercise, you don’t need to change anything in this file. The # GetVoices request section asks for information about the Evan voice. The # Synthesis request section uses the same voice to synthesize a text string.
from nuance.tts.v1.synthesizer_pb2 import *

list_of_requests = []

# GetVoices request
request = GetVoicesRequest()
request.voice.name = "Evan"

# Add request to list
list_of_requests.append(request)
# ---

# Synthesis request
request = SynthesisRequest()
request.voice.name = "Evan"
request.voice.model = "enhanced"
pcm = PCM(sample_rate_hz=22050)
request.audio_params.audio_format.pcm.CopyFrom(pcm)
request.audio_params.volume_percentage = 80
request.audio_params.speaking_rate_factor = 1.0
request.audio_params.audio_chunk_duration_ms = 2000
request.input.text.text = "This is a test. A very simple test."
request.event_params.send_log_events = True
request.user_id = "MyApplicationUser"

#Add request to list
list_of_requests.append(request)
# ---
- Run the client using the shell script or batch file.
./run-mix-client.sh
run-mix-client.bat
The client takes the information in the input flow.py file and creates an audio file, flow.py_i1_s1.wav, of Evan saying “This is a test. A very simple test.”
The results are the same on Linux and Windows. Some lines have been omitted for brevity.
2023-10-12 16:58:03,713 (139946639763264) INFO Obtaining auth token
2023-10-12 16:58:03,834 (139946639763264) DEBUG Creating secure gRPC channel
2023-10-12 16:58:04,026 (139946639763264) INFO Running file [flow.py]
2023-10-12 16:58:04,026 (139946639763264) DEBUG [voice {
name: "Evan"
}
, voice {
name: "Evan"
model: "Enhanced"
}
audio_params {
audio_format {
pcm {
sample_rate_hz: 22050
}
}
volume_percentage: 80
speaking_rate_factor: 1.0
audio_chunk_duration_ms: 2000
}
input {
text {
text: "This is a test. A very simple test."
}
}
event_params {
send_log_events: true
}
user_id: "MyApplicationUser"
]
2023-10-12 16:58:04,026 (139946639763264) INFO Sending GetVoices request
2023-10-12 16:58:04,350 (139946639763264) INFO voices {
name: "Evan"
model: "enhanced"
language: "en-US"
gender: MALE
sample_rate_hz: 22050
language_tlw: "enu"
version: "1.1.1"
}
2023-10-12 16:58:04,351 (139946639763264) INFO Sending Synthesis request
*** Events and received audio chunks here ***
2023-10-12 16:58:04,748 (139946639763264) INFO Received status response: SUCCESS
2023-10-12 16:58:04,748 (139946639763264) INFO Wrote audio to ./audio/flow.py_i1_s1.wav
2023-10-12 16:58:04,748 (139946639763264) INFO Done running file [flow.py]
2023-10-12 16:58:04,749 (139946639763264) INFO Done
For an example of events in the results, see Events.
Warning: The file created by the client, flow.py_i1_s1.wav, is overwritten every time you run the client. If you wish to save the file, rename it, for example, to evan-simple.wav.
Change text and voice
Optionally change the voice and the input text in the synthesis request, and rerun the client. To learn what other voices are available, see Get voices below.
To avoid the long list of events in the response, disable send_log_events. For example:
# Synthesis request
request = SynthesisRequest()
request.voice.name = "Zoe-Ml"
request.voice.model = "enhanced"
request.input.text.text = "Your coffee will be ready in 5 minutes."
#request.event_params.send_log_events = True # Comment out or change to False
Synthesize SSML input
You may provide SSML input instead of plain text.
- Edit flow.py to comment out the request.input.text.text line and add an SSML line:
#request.input.text.text = "This is a test. A very simple test."
request.input.ssml.text = "<speak>It's 24,901 miles around the earth, or 40,075 km.</speak>"
- Run the client using the shell script or batch file.
./run-mix-client.sh
run-mix-client.bat
The client sends a SynthesisRequest to turn the SSML into speech. It creates a file named flow.py_i1_s1.wav telling us the distance around the earth.
For more SSML examples, as well as examples using Nuance control codes, see Input to synthesize.
2023-10-12 17:10:27,576 (140011284363072) INFO Obtaining auth token
2023-10-12 17:10:28,234 (140011284363072) DEBUG Creating secure gRPC channel
...
input {
ssml {
text: "<speak>It\'s 24,901 miles around the earth, or 40,075 km.</speak>"
}
}
user_id: "MyApplicationUser"
]
2023-06-22 09:03:04,375 (13572) INFO Sending GetVoices request
2023-06-22 09:03:04,628 (13572) INFO voices {
name: "Evan"
model: "enhanced"
language: "en-US"
gender: MALE
sample_rate_hz: 22050
language_tlw: "enu"
version: "1.1.1"
}
2023-06-22 09:03:04,629 (13572) INFO Sending Synthesis request
2023-06-22 09:03:04,852 (13572) INFO Received audio: 21336 bytes
2023-06-22 09:03:04,878 (13572) INFO Received audio: 17856 bytes
2023-06-22 09:03:04,968 (13572) INFO Received audio: 82492 bytes
2023-06-22 09:03:05,004 (13572) INFO Received audio: 17030 bytes
2023-06-22 09:03:05,048 (13572) INFO Received audio: 45300 bytes
2023-06-22 09:03:05,130 (13572) INFO Received audio: 70044 bytes
2023-06-22 09:03:05,138 (13572) INFO Received status response: SUCCESS
2023-06-22 09:03:05,141 (13572) INFO Wrote audio to flow.py_i1_s1.wav
2023-06-22 09:03:05,141 (13572) INFO Done running file [flow.py]
2023-06-22 09:03:05,142 (13572) INFO Done
Without send_log_events in the input flow.py file, notice that only the received audio chunks are shown in the results.
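To experiment further, you could replace the request.input.ssml.text line with a richer SSML document. This is only a hedged sketch using standard SSML elements (break and prosody); how each voice renders them may vary.
# Hypothetical SSML variation for flow.py: add a pause and a slower passage.
request.input.ssml.text = (
    '<speak>Here is a pause.<break time="500ms"/>'
    '<prosody rate="slow">And this part is spoken more slowly.</prosody></speak>'
)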
Get voices
When you ask TTSaaS to synthesize text, you must specify a named voice. To learn which voices are available, send a GetVoicesRequest, entering your requirements in the flow.py input file.
- Edit flow.py to request American English female voices. This combination of options returns voices that are both American English and female. Optionally turn off synthesis for this request.
from nuance.tts.v1.synthesizer_pb2 import *

list_of_requests = []

# GetVoices request
request = GetVoicesRequest()
#request.voice.name = "Evan"
request.voice.language = "en-us"  # Request American English voices
request.voice.gender = EnumGender.FEMALE  # Request female voices

# Add request to list
list_of_requests.append(request)

# Synthesis request
...

#Add request to list
#list_of_requests.append(request)  # Disable synthesis with #
- Run the client using the shell script or batch file.
./run-mix-client.sh
run-mix-client.bat
The client returns information about all female American English voices available in the current environment.
2023-10-12 17:16:55,041 (140380487186240) INFO Obtaining auth token
2023-10-12 17:16:55,256 (140380487186240) DEBUG Creating secure gRPC channel
...
2023-10-12 17:16:55,361 (140380487186240) DEBUG [voice {
language: "en-US"
gender: FEMALE
}
]
2023-10-12 17:16:55,362 (140380487186240) INFO Sending GetVoices request
2023-10-12 17:16:55,604 (140380487186240) INFO voices {
name: "Allison"
model: "standard"
language: "en-US"
gender: FEMALE
sample_rate_hz: 22050
language_tlw: "enu"
version: "2.0.0"
}
voices {
name: "Ava-Ml"
model: "enhanced"
language: "en-US"
gender: FEMALE
sample_rate_hz: 22050
language_tlw: "enu"
version: "3.0.1"
foreign_languages: "es-MX"
}
...
voices {
name: "Zoe-Ml"
model: "enhanced"
language: "en-US"
gender: FEMALE
sample_rate_hz: 22050
language_tlw: "enu"
version: "2.0.0"
foreign_languages: "es-MX"
foreign_languages: "fr-CA"
}
2023-10-12 17:16:55,604 (140380487186240) INFO Done running file [flow.py]
2023-10-12 17:16:55,605 (140380487186240) INFO Done
Notice the information that TTSaaS returns for each voice:
- All voices include the voice name, model (standard or enhanced), language code, gender, and other parameters described in Voice.
- Multilingual voices (ending in -Ml) list supported languages other than their native language.
Get more voices
You can experiment with this request: for example, to see all available voices, remove or comment out all the request.voice lines, leaving only the main GetVoicesRequest.
# GetVoices request
request = GetVoicesRequest() # Keep only this line
#request.voice.name = "Evan"
#request.voice.language = "en-us"
The results include all voices available from the Nuance-hosted TTSaaS service.
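You can also try narrowing the list using other fields that appear in the Voice message, such as model or sample_rate_hz. Treat the following as a sketch to experiment with, not a guarantee that the service filters on every field:
# Hypothetical additional GetVoices filters
request = GetVoicesRequest()
request.voice.language = "en-us"
request.voice.model = "enhanced"        # enhanced-model voices only
request.voice.sample_rate_hz = 22050    # 22.05 kHz voices only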
Redirect results to file
If you request a large number of voices, you may wish to save the output to a file. For example, these commands request all voices and save the output to a text file.
./run-mix-client.sh &> all-voices.txt
ls *.txt
-rw-r--r-- 1 xxx xxx 60185 Apr 17 14:57 all-voices.txt
cat all-voices.txt
run-mix-client.bat > all-voices.txt 2>&1
dir *.txt
2023-06-22 09:15 AM 30,807 all-voices.txt
Run client with resources
If you have uploaded synthesis resources using the Storage API (see the Sample storage client), you can reference them in a synthesis request.
- Edit flow.py to specify one or more resources within the synthesis request, for example, a user dictionary uploaded with the Storage API.
from nuance.tts.v1.synthesizer_pb2 import *
. . .
# Synthesis request
request = SynthesisRequest()
request.voice.name = "Evan"
request.voice.model = "enhanced"
pcm = PCM(sample_rate_hz=22050)
request.audio_params.audio_format.pcm.CopyFrom(pcm)

user_dict = SynthesisResource()  # Add a user dictionary
user_dict.type = EnumResourceType.USER_DICTIONARY
user_dict.uri = "urn:nuance-mix:tag:tuning:lang/coffee_app/coffee_dict/en-us/mix.tts"
request.input.resources.extend([user_dict])

request.input.text.text = "This is a test. A very simple test."

#Add request to list
list_of_requests.append(request)
- Run the client using the shell script or batch file.
./run-mix-client.sh
run-mix-client.bat
In the results, the user dictionary is listed under resources. As this is a Python example, type: USER_DICTIONARY is not shown under resources because it’s the default value.
2023-10-12 17:20:27,762 (139961486419776) INFO Obtaining auth token
2023-10-12 17:20:27,834 (139961486419776) DEBUG Creating secure gRPC channel
...
2023-10-12 17:20:28,014 (139961486419776) DEBUG [voice {
name: "Evan"
model: "Enhanced"
}
audio_params {
audio_format {
pcm {
sample_rate_hz: 22050
}
}
volume_percentage: 80
speaking_rate_factor: 1.0
audio_chunk_duration_ms: 2000
}
input {
ssml {
text: "<speak>It\'s 24,901 miles around the earth, or 40,075 km.</speak>"
}
resources {
uri: "urn:nuance-mix:tag:tuning:lang/coffee_app/coffee_dict/en-us/mix.tts"
}
}
user_id: "MyApplicationUser"
]
2023-10-12 17:20:28,015 (139961486419776) INFO Sending Synthesis request
2023-10-12 17:20:28,727 (139961486419776) INFO Received audio: 21336 bytes
2023-10-12 17:20:28,752 (139961486419776) INFO Received audio: 17856 bytes
2023-10-12 17:20:28,916 (139961486419776) INFO Received audio: 82492 bytes
2023-10-12 17:20:28,945 (139961486419776) INFO Received audio: 17030 bytes
2023-10-12 17:20:29,001 (139961486419776) INFO Received audio: 45300 bytes
2023-10-12 17:20:29,107 (139961486419776) INFO Received audio: 70044 bytes
2023-10-12 17:20:29,114 (139961486419776) INFO Received status response: SUCCESS
2023-10-12 17:20:29,118 (139961486419776) INFO Wrote audio to ./audio/flow.py_i1_s1.wav
2023-10-12 17:20:29,118 (139961486419776) INFO Done running file [flow.py]
2023-10-12 17:20:29,119 (139961486419776) INFO Done
For examples of using all types of synthesis resources, see Synthesis resources.
Multiple requests
You can send multiple requests for synthesis (and/or get voices) in the same session. For efficient communication with the TTSaaS server, all requests use the same channel and stub. This scenario sends three synthesis requests.
- Open the input file, flow-multi.py, and notice it contains three synthesis requests, pausing for a couple of seconds after each request.
from nuance.tts.v1.synthesizer_pb2 import *

list_of_requests = []

# Synthesis request
request = SynthesisRequest()  # First request
request.voice.name = "Evan"
request.voice.model = "enhanced"
pcm = PCM(sample_rate_hz=22050)
request.audio_params.audio_format.pcm.CopyFrom(pcm)
request.input.text.text = "This is a test. A very simple test."
list_of_requests.append(request)
list_of_requests.append(2)  # Pause after request

# Synthesis request
request = SynthesisRequest()  # Second request
request.voice.name = "Evan"
request.voice.model = "enhanced"
pcm = PCM(sample_rate_hz=22050)
request.audio_params.audio_format.pcm.CopyFrom(pcm)
request.input.text.text = "Your coffee will be ready in 5 minutes."
list_of_requests.append(request)
list_of_requests.append(2)  # Pause after request

# Synthesis request
request = SynthesisRequest()  # Third request
request.voice.name = "Zoe-Ml"
request.voice.model = "enhanced"
pcm = PCM(sample_rate_hz=22050)
request.audio_params.audio_format.pcm.CopyFrom(pcm)
request.input.text.text = "The wind was a torrent of darkness, among the gusty trees."
list_of_requests.append(request)
- Edit your shell script or batch file to point to the flow-multi.py input file:
...
python3 client.py --token $MY_TOKEN --saveAudio --saveAudioAsWav \
  --files flow-multi.py
...
python client.py --token %MY_TOKEN% --saveAudio --saveAudioAsWav ^
  --files flow-multi.py
- Run the client using the shell script or batch file.
./run-mix-client.sh
run-mix-client.bat
The client makes three synthesis requests and creates three audio files:
- flow-multi.py_i1_s1.wav: Evan saying: “This is a test…”
- flow-multi.py_i1_s2.wav: Evan saying: “Your coffee will be ready…”
- flow-multi.py_i1_s3.wav: Zoe saying: “The wind was a torrent of darkness…”
2023-10-12 17:25:52,663 (140168510699328) INFO Obtaining auth token
2023-10-12 17:25:52,725 (140168510699328) DEBUG Creating secure gRPC channel
...
2023-10-12 17:25:52,981 (140168510699328) DEBUG [voice {
name: "Evan"
model: "enhanced"
}
audio_params {
audio_format {
pcm {
sample_rate_hz: 22050
}
}
}
input {
text {
text: "This is a test. A very simple test."
}
}
, 2, voice {
name: "Evan"
model: "enhanced"
}
audio_params {
audio_format {
pcm {
sample_rate_hz: 22050
}
}
}
input {
text {
text: "Your coffee will be ready in 5 minutes."
}
}
, 2, voice {
name: "Zoe-Ml"
model: "enhanced"
}
audio_params {
audio_format {
pcm {
sample_rate_hz: 22050
}
}
}
input {
text {
text: "The wind was a torrent of darkness, among the gusty trees."
}
}
]
2023-10-12 17:25:52,982 (140168510699328) INFO Sending Synthesis request
2023-10-12 17:25:53,504 (140168510699328) INFO Received audio: 57484 bytes
2023-10-12 17:25:53,635 (140168510699328) INFO Received audio: 70432 bytes
2023-10-12 17:25:53,635 (140168510699328) INFO Received status response: SUCCESS
2023-10-12 17:25:53,636 (140168510699328) INFO Wrote audio to ./audio/flow-multi.py_i1_s1.wav
2023-10-12 17:25:53,636 (140168510699328) INFO Waiting for 2 seconds
2023-10-12 17:25:55,638 (140168510699328) INFO Sending Synthesis request
2023-10-12 17:25:55,946 (140168510699328) INFO Received audio: 44756 bytes
2023-10-12 17:25:56,010 (140168510699328) INFO Received audio: 67030 bytes
2023-10-12 17:25:56,011 (140168510699328) INFO Received status response: SUCCESS
2023-10-12 17:25:56,011 (140168510699328) INFO Wrote audio to ./audio/flow-multi.py_i1_s2.wav
2023-10-12 17:25:56,011 (140168510699328) INFO Waiting for 2 seconds
2023-10-12 17:25:58,013 (140168510699328) INFO Sending Synthesis request
2023-10-12 17:25:58,278 (140168510699328) INFO Received audio: 42424 bytes
2023-10-12 17:25:58,278 (140168510699328) INFO Received audio: 1040 bytes
2023-10-12 17:25:58,278 (140168510699328) INFO Received audio: 26648 bytes
2023-10-12 17:25:58,309 (140168510699328) INFO Received audio: 20558 bytes
2023-10-12 17:25:58,309 (140168510699328) INFO Received audio: 7902 bytes
2023-10-12 17:25:58,310 (140168510699328) INFO Received audio: 10292 bytes
2023-10-12 17:25:58,318 (140168510699328) INFO Received audio: 50508 bytes
2023-10-12 17:25:58,323 (140168510699328) INFO Received status response: SUCCESS
2023-10-12 17:25:58,326 (140168510699328) INFO Wrote audio to ./audio/flow-multi.py_i1_s3.wav
2023-10-12 17:25:58,326 (140168510699328) INFO Done running file [flow-multi.py]
2023-10-12 17:25:58,327 (140168510699328) INFO Done
What’s list_of_requests?
The client expects all input files to declare a global array named list_of_requests. It sequentially processes the requests contained in that array.
You may optionally instruct the client to wait a number of seconds between requests, by appending a number value to list_of_requests. For example:
list_of_requests.append(request1)
list_of_requests.append(1.5)
list_of_requests.append(request2)
Once request1 is complete, the client pauses for 1.5 seconds before executing request2.
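Conceptually, the client’s handling of list_of_requests amounts to something like the sketch below. This is a simplified illustration, not the actual client.py code, which also handles authorization, logging, event responses, and saving audio to disk.
# Simplified sketch: walk list_of_requests over one channel/stub,
# sending each request and sleeping on numeric entries.
import time
from nuance.tts.v1 import synthesizer_pb2_grpc
from nuance.tts.v1.synthesizer_pb2 import GetVoicesRequest, SynthesisRequest

def run_flow(list_of_requests, channel):
    stub = synthesizer_pb2_grpc.SynthesizerStub(channel)
    for item in list_of_requests:
        if isinstance(item, (int, float)):      # a number means "pause this long"
            time.sleep(item)
        elif isinstance(item, GetVoicesRequest):
            print(stub.GetVoices(item))
        elif isinstance(item, SynthesisRequest):
            for response in stub.Synthesize(item):   # streamed responses
                if response.HasField("audio"):
                    print(f"Received audio: {len(response.audio)} bytes")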
Run client for unary response
By default, the synthesized voice is streamed back to the client, but you may request a unary (non-streamed, single package) response.
- Using the sample client, include the --sendUnary argument in the run script. This example uses the same input flow.py file as Synthesize text input.
...
python3 client.py --oauthURL https://auth.crt.nuance.com/oauth2/token \
  --clientID $CLIENT_ID --clientSecret $SECRET \
  --secure --serverUrl tts.api.nuance.com \
  --saveAudio --saveAudioAsWav --sendUnary
...
python client.py --oauthURL https://auth.crt.nuance.com/oauth2/token ^
  --clientID %CLIENT_ID% --clientSecret %SECRET% ^
  --secure --serverUrl tts.api.nuance.com ^
  --saveAudio --saveAudioAsWav --sendUnary
- Run the client using the shell script or batch file.
./run-mix-client.sh
run-mix-client.bat
This unary response returns a single non-streamed audio package, logged as one Received audio line:
2023-10-12 17:33:10,583 (140049666340672) INFO Obtaining auth token
2023-10-12 17:33:10,629 (140049666340672) DEBUG Creating secure gRPC channel
...
2023-10-12 17:33:10,900 (140049666340672) DEBUG [voice {
name: "Evan"
model: "Enhanced"
}
audio_params {
audio_format {
pcm {
sample_rate_hz: 22050
}
}
volume_percentage: 80
speaking_rate_factor: 1.0
audio_chunk_duration_ms: 2000
}
input {
ssml {
text: "<speak>It\'s 24,901 miles around the earth, or 40,075 km.</speak>"
}
resources {
uri: "urn:nuance-mix:tag:tuning:lang/coffee_app/coffee_dict/en-us/mix.tts"
}
}
user_id: "MyApplicationUser"
]
2023-10-12 17:33:12,363 (140049666340672) INFO Sending Unary Synthesis request
2023-10-12 17:33:12,363 (140049666340672) INFO Received audio: 254058 bytes
2023-10-12 17:33:12,363 (140049666340672) INFO Received status response: SUCCESS
2023-10-12 17:33:12,364 (140049666340672) INFO Wrote audio to ./audio/flow.py_i1_s1.wav
2023-10-12 17:33:12,364 (140049666340672) INFO Done running file [flow.py]
2023-10-12 17:33:12,365 (140049666340672) INFO Done
If you have multiple requests, each request returns a single audio package.
See also Streamed vs. unary response.
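As a rough sketch of the difference at the gRPC level, assuming the generated SynthesizerStub from synthesizer_pb2_grpc.py and a token obtained as in the run script: Synthesize yields a stream of responses, while UnarySynthesize returns one response whose audio field holds the complete audio.
# Hedged sketch: streamed vs. unary synthesis with the generated stub.
import grpc
from nuance.tts.v1 import synthesizer_pb2_grpc
from nuance.tts.v1.synthesizer_pb2 import SynthesisRequest

token = "<access token>"  # obtained as in the run script

request = SynthesisRequest()
request.voice.name = "Evan"
request.voice.model = "enhanced"
request.input.text.text = "This is a test. A very simple test."

creds = grpc.composite_channel_credentials(
    grpc.ssl_channel_credentials(),
    grpc.access_token_call_credentials(token))

with grpc.secure_channel("tts.api.nuance.com:443", creds) as channel:
    stub = synthesizer_pb2_grpc.SynthesizerStub(channel)

    # Streamed: audio arrives in chunks as the service produces it.
    streamed_audio = b""
    for response in stub.Synthesize(request):
        if response.HasField("audio"):
            streamed_audio += response.audio

    # Unary: one response containing the whole audio.
    unary_response = stub.UnarySynthesize(request)
    unary_audio = unary_response.audio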