Sample recognition client

ASRaaS offers a simple Python client application that you can download and run on Linux or Windows to recognize speech using the Recognizer gRPC API. To set up this client:

Download the zip file and extract its files into the same directory as the nuance directory containing your proto files and stubs, for example /home/asr/clients on Linux or c:\ASR\clients on Windows.

On Linux, give run-reco-client.sh execute permission with chmod +x. For example:

unzip sample-reco-client.zip
chmod +x run-reco-client.sh

These are the resulting client files, in the same directory as the nuance directory:

├── reco-client.py
├── run-reco-client.bat
├── run-reco-client.sh
└── nuance
    ├── asr
    │   ├── v1
    │   │   ├── recognizer_pb2_grpc.py
    │   │   ├── recognizer_pb2.py
    │   │   ├── resource_pb2.py
    │   │   └── result_pb2.py
    │   └── v1beta1 [ASR Training files, if present]
    └── rpc
        ├── error_details_pb2.py
        ├── status_code_pb2.py
        └── status_pb2.py

You can use this client to request speech recognition, optionally including recognition resources such as domain language models and wordsets.

Edit run script

First, edit the sample shell script or batch file to add your Mix client ID and secret. The script changes the colons in the client ID to %3A so curl can parse the value correctly.

The scope required for this client is asr; the sample request also lists the scopes for other Mix products.

The Linux shell script, run-reco-client.sh:

#!/bin/bash

CLIENT_ID=<Mix client ID, starting with appID:>
SECRET=<Mix client secret>
#Change colons (:) to %3A in client ID
CLIENT_ID=${CLIENT_ID//:/%3A}

MY_TOKEN="`curl -s -u "$CLIENT_ID:$SECRET" \
"https://auth.crt.nuance.com/oauth2/token" \
-d "grant_type=client_credentials" -d "scope=asr nlu tts dlg" \
| python -c 'import sys, json; print(json.load(sys.stdin)["access_token"])'`"

python3 reco-client.py asr.api.nuance.com:443 $MY_TOKEN $1

# $1 - The location of a mono audio file, 8 or 16-bit

The Windows batch file, run-reco-client.bat:

@echo off
setlocal enabledelayedexpansion

set CLIENT_ID=<Mix client ID, starting with appID:>
set SECRET=<Mix client secret>
rem Change colons (:) to %3A in client ID
set CLIENT_ID=!CLIENT_ID::=%%3A!

set command=curl -s ^
-u %CLIENT_ID%:%SECRET% ^
-d "grant_type=client_credentials" -d "scope=asr nlu tts dlg" ^
https://auth.crt.nuance.com/oauth2/token

for /f "delims={}" %%a in ('%command%') do (
  for /f "tokens=1 delims=:, " %%b in ("%%a") do set key=%%b
  for /f "tokens=2 delims=:, " %%b in ("%%a") do set value=%%b
  goto done
)

:done

rem Remove quotes
set MY_TOKEN=!value:"=!

python reco-client.py asr.api.nuance.com:443 %MY_TOKEN% %1

rem %1 - The location of a mono audio file, 8 or 16-bit

Using your credentials and scope, the script generates a token that authorizes the client to call the ASRaaS service. It stores the token in the variable MY_TOKEN and passes it to the client.

Alternatively, you might incorporate the token-generation code within the client.
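
For example, a minimal sketch using the requests package (an assumption; the sample client itself does not import it), mirroring the curl call in the script above:

import requests

def get_token(client_id, secret):
    # As in the run script, encode colons as %3A so the client ID
    # survives the user:password split in HTTP basic authentication
    client_id = client_id.replace(':', '%3A')
    resp = requests.post(
        'https://auth.crt.nuance.com/oauth2/token',
        auth=(client_id, secret),
        data={'grant_type': 'client_credentials', 'scope': 'asr'},
    )
    resp.raise_for_status()
    return resp.json()['access_token']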

This client accepts arguments positionally, without names:

try:
    hostaddr = sys.argv[1]
    access_token = sys.argv[2]
    audio_file = sys.argv[3]

Pass these arguments as you run the client using the shell script:

  • The location of the ASRaaS server on Mix: asr.api.nuance.com:443

  • An access token generated by the Mix OAuth server, here passed in the MY_TOKEN variable.

  • The location of a local audio file that the client chunks up to simulate real-time recognition. You provide this file as an argument when you run the script.

Run the recognition client

The client accepts an audio file and passes it to ASRaaS for recognition. As described in Client app development: Request recognition, this client simulates streaming audio by chunking up an audio file.
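
Inside the client, the streaming generator can look roughly like this (a sketch: the RecognitionRequest fields follow the Recognizer API used elsewhere in this client, while the 20 ms chunk size and the pacing sleep are illustrative assumptions):

import time
from nuance.asr.v1.recognizer_pb2 import RecognitionRequest

def client_stream(wf, init):
    # The first message carries the recognition settings
    yield RecognitionRequest(recognition_init_message=init)
    packet_duration = 0.020  # 20 ms per chunk: an illustrative choice
    packet_samples = int(wf.getframerate() * packet_duration)
    while True:
        data = wf.readframes(packet_samples)
        if not data:
            break
        # Each following message carries one chunk of audio
        yield RecognitionRequest(audio=data)
        time.sleep(packet_duration)  # pace the stream like live audio

Here wf is the open wave file that also supplies the sample rate for RecognitionParameters.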

Run the client from the shell or batch script, passing it the name and location of an audio file on your system:

./run-reco-client.sh ../audio/weather16.wav

stream ../audio/weather16.wav
100 Continue - recognition started on audio/l16;rate=16000 stream
final: There is more snow coming to the Montreal area in the next few days
final: We're expecting 10 cm overnight and the winds are blowing hard
final: Radar and satellite pictures show that we're on the western edge of the storm system as it continues to track further to the east
stream complete
200 Success

And on Windows:

run-reco-client.bat ..\audio\weather16.wav

stream ..\audio\weather16.wav
100 Continue - recognition started on audio/l16;rate=16000 stream
final: There is more snow coming to the Montreal area in the next few days
final: We are expecting 10 cm overnight and the winds are blowing hard
final: Our radar and satellite pictures show that we are on the western edge of the storm system as it continues to track further to the east
stream complete
200 Success

Optionally add resources

This client includes optional code for adding wordsets and domain language models (DLMs). These resources let ASRaaS recognize terms and vocabulary used by speakers in your environment.

Add a wordset

A wordset is a collection of unusual words and short phrases that your users might say. You can include wordsets in recognition requests either on their own or as extensions of DLMs. See Wordsets. Here you’ll add a standalone inline wordset with place names that ASRaaS might not otherwise recognize.

  1. Open reco-client.py in a text editor and add an inline wordset. You can use the wordset in the sample client or create one with your own terms.

    Inline wordsets must use compressed JSON, but you can expand the JSON to work with it more easily, then recompress it for the client (see the sketch after these steps):

    # Define an inline wordset
    places_wordset = RecognitionResource(
        inline_wordset='{"PLACES":[{"literal":"La Jolla","spoken":["la hoya"]},{"literal":"Llanfairpwllgwyngyll","spoken":["lan vire pool guin gill"]},{"literal":"Abington Pigotts"},{"literal":"Steeple Morden"},{"literal":"Hoyland Common"},{"literal":"Cogenhoe","spoken":["cook no"]},{"literal":"Fordoun","spoken":["forden"]},{"literal":"Llangollen","spoken":["lan goth lin"]},{"literal":"Auchenblae"}]}'
    )
    
  2. Uncomment the resources line in RecognitionInitMessage and remove travel_dlm, leaving only places_wordset:

    # Set recognition parameters and optionally resources
    init = RecognitionInitMessage(
        parameters = RecognitionParameters(
            language='en-us',
            audio_format=AudioFormat(pcm=PCM(sample_rate_hz=wf.getframerate())),
            result_type='FINAL',
            utterance_detection_mode='MULTIPLE'
        ),
        resources = [places_wordset] # Uncomment this line, keep only places_wordset
    )
    
  3. Save the edited client file and run the script, passing it an audio file containing some of the terms in your wordset. For example, towns_16.wav includes some unusual place names. Without the wordset, ASRaaS does not recognize these places; with it, they are recognized correctly.

    See Before and after DLM and wordset for notes on the pronunciation of these places:

    ./run-reco-client.sh ../audio/towns_16.wav
    
    stream ../audio/towns_16.wav
    100 Continue - recognition started on audio/l16;rate=16000 stream
    final: I'm going on a trip to Abington Pigotts in Cambridgeshire England
    final: I'm speaking to you from the town of Cogenhoe in Northamptonshire
    final: We stopped at the village of Steeple Morden on our way to Hoyland Common in Yorkshire
    final: We spent a week in the town of Llangollen in Wales
    final: Have you ever thought of moving to La Jolla in California
    stream complete
    200 Success

    And on Windows:

    run-reco-client.bat ..\audio\towns_16.wav
    
    stream ..\audio\towns_16.wav
    100 Continue - recognition started on audio/l16;rate=16000 stream
    final: I'm going on a trip to Abington Pigotts in Cambridgeshire England
    final: I'm speaking to you from the town of Cogenhoe in Northamptonshire
    final: We stopped at the village of Steeple Morden on our way to Hoyland Common in Yorkshire
    final: We spent a week in the town of Llangollen in Wales
    final: Have you ever thought of moving to La Jolla in California
    stream complete
    200 Success
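
As noted in step 1, inline wordsets must be compressed JSON. A short sketch of keeping the wordset in expanded form and recompressing it for the client (the expanded dictionary here is a trimmed, illustrative version of the sample wordset):

import json

# Expanded form: easier to read and edit
expanded = {
    'PLACES': [
        {'literal': 'La Jolla', 'spoken': ['la hoya']},
        {'literal': 'Abington Pigotts'}
    ]
}

# Recompress: separators=(',', ':') strips the whitespace json.dumps adds
places_wordset = RecognitionResource(
    inline_wordset=json.dumps(expanded, separators=(',', ':'))
)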

Use a DLM and wordset

For more general specialization of the data pack, include DLMs and associated wordsets in your recognition request. See Domain LMs.

A DLM is an extension of the data pack’s language model, specialized for your environment. The DLM may contain one or more entities, which are lists of specific words and phrases, and you can extend these entities with wordsets. Unlike the standalone wordsets described above, wordsets that extend DLM entities must be loaded into the recognition request along with the DLM.

In this exercise you’ll add a DLM that was created in Mix, along with a wordset that extends the DLM’s entity named PLACES.

  1. See Prerequisites from Mix: DLMs from Mix to create a DLM with entities that you can extend with wordsets. This example uses entities named NAMES and PLACES.

  2. Open reco-client.py in a text editor and replace <context tag> with the context tag of your DLM:

    # Declare a Mix DLM: replace <context tag> with your Mix tag
    travel_dlm = RecognitionResource(
        external_reference = ResourceReference(
            type='DOMAIN_LM',
            uri='urn:nuance-mix:tag:model/<context tag>/mix.asr?=language=eng-USA'
        )
    )
    
  3. Add terms to the PLACES entity in the DLM using a wordset. You can use the wordset in the sample client or create one with your own terms.

    # Define an inline wordset
    places_wordset = RecognitionResource(
        inline_wordset='{"PLACES":[{"literal":"La Jolla","spoken":["la hoya"]},{"literal":"Llanfairpwllgwyngyll","spoken":["lan vire pool guin gill"]},{"literal":"Abington Pigotts"},{"literal":"Steeple Morden"},{"literal":"Hoyland Common"},{"literal":"Cogenhoe","spoken":["cook no"]},{"literal":"Fordoun","spoken":["forden"]},{"literal":"Llangollen","spoken":["lan goth lin"]},{"literal":"Auchenblae"}]}'
    )
    
  4. Uncomment the resources line in RecognitionInitMessage.

    # Set recognition parameters and resources
    init = RecognitionInitMessage(
        parameters = RecognitionParameters(
            language='en-us',
            audio_format=AudioFormat(pcm=PCM(sample_rate_hz=wf.getframerate())),
            result_type='FINAL',
            utterance_detection_mode='MULTIPLE'
        ),
        resources = [travel_dlm, places_wordset] # Uncomment resources line
    )
    
  5. Save the edited client and run the script, passing it an audio file containing terms in your DLM and/or wordset. In this example, the results are the same as when using the standalone wordset.

Use a compiled wordset

Instead of an inline wordset, you may use a compiled wordset created with the Training API (see Sample training client). To do so, edit the definition to add your own context tag, and add the wordset to the resources list in place of the inline wordset.

A compiled wordset must extend a DLM entity, so include both DLM and wordset in your recognition request:

# Declare a Mix DLM: replace <context tag> with your Mix tag
travel_dlm = RecognitionResource(
    external_reference = ResourceReference(
        type='DOMAIN_LM',
        uri='urn:nuance-mix:tag:model/<context tag>/mix.asr?=language=eng-USA'
    )
)

# Declare a compiled wordset that extends an entity in the DLM
places_compiled_ws = RecognitionResource(
    external_reference = ResourceReference(
        type = 'COMPILED_WORDSET',
        uri = 'urn:nuance-mix:tag:wordset:lang/<context tag>/places-compiled-ws/eng-USA/mix.asr',
        mask_load_failures = True
    )
)

# Set recognition parameters and optionally resources
init = RecognitionInitMessage(
    parameters = RecognitionParameters(...),
    resources = [travel_dlm, places_compiled_ws]
)

Display all results

This client prints just a few selected fields. For examples of adding extra individual fields, see Dsp, Hypothesis, and DataPack. To display all possible information returned by ASRaaS, replace these lines:

for message in stream_in:
    if message.HasField('status'):
        if message.status.details:
            print(f'{message.status.code} {message.status.message} - {message.status.details}')
        else:
            print(f'{message.status.code} {message.status.message}')
    elif message.HasField('result'):
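        # result_type is an enum where FINAL is 0, so a falsy value means a final result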
        restype = 'partial' if message.result.result_type else 'final'
        print(f'{restype}: {message.result.hypotheses[0].formatted_text}')

With these:

for message in stream_in:
    print(message)

For an example of these longer—potentially much longer—results, see Fields chosen by client: All fields.
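
As a middle ground, you can print selected extra fields alongside the formatted text. For example, a sketch that adds the hypothesis confidence score (the field name follows the Hypothesis message):

for message in stream_in:
    if message.HasField('result') and message.result.hypotheses:
        hyp = message.result.hypotheses[0]
        # confidence is the engine's score for this hypothesis
        print(f'{hyp.formatted_text} (confidence {hyp.confidence})')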