123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206 |
- from freeswitch import *
- from xml.dom import minidom
# Module-level voice settings. Neither name is referenced in the visible
# part of this file.
# NOTE(review): presumably the text-to-speech engine and voice name used by
# callers when playing prompts -- confirm against the code that imports them.
VOICE_ENGINE = "cepstral"
VOICE = "William"
- """
- A few classes that make it easier to write speech applications
- using Python. They are roughly modelled on the equivalent classes
- written in JavaScript.
- Status: should work, but is not yet complete. Some pending items
- are mentioned in the comments.
- """
class Grammar:
    """Value holder describing one speech-recognition grammar.

    Instances only store the settings passed to the constructor; nothing
    in this class interprets them.
    """

    def __init__(self, name, path, obj_path,
                 min_score=1, confirm_score=400, halt=False):
        """
        @param name          - name used to reference the grammar later
        @param path          - path to the xml grammar file
        @param obj_path      - xml path, from the root of the result xml,
                               at which the interpretation is found,
                               eg, 'interpretation'
        @param min_score     - score threshold to accept a result
        @param confirm_score - if the score is below this threshold, ask
                               the user whether the result is correct
        @param halt          - carried over from the js version; stored
                               but currently unused
        """
        # Identification of the grammar and where to find its result.
        self.name, self.path, self.obj_path = name, path, obj_path
        # Scoring thresholds.
        self.min_score, self.confirm_score = min_score, confirm_score
        self.halt = halt
-
class SpeechDetect:
    """Holds a session plus a registry of grammars, and starts detection.

    Grammars are registered by name with addGrammar(), one of them is
    made current with setGrammar(), and detectSpeech() runs the
    "detect_speech" application on the session for the current grammar.
    """

    def __init__(self, session, module_name, ip_addr):
        """
        @param session     - session object that "detect_speech" is
                             executed on
        @param module_name - first field of the detect_speech command
        @param ip_addr     - last field of the detect_speech command
        """
        self.grammars = {}  # grammar name -> grammar object
        self.ip_addr = ip_addr
        self.module_name = module_name
        self.session = session

    def addGrammar(self, grammar):
        """Register a grammar under its own name (replacing any previous
        grammar with the same name)."""
        key = grammar.name
        self.grammars[key] = grammar

    def setGrammar(self, name):
        """Make the grammar registered as `name` the current one.

        Raises KeyError when no grammar with that name was added.
        """
        chosen = self.grammars[name]
        self.grammar = chosen

    def detectSpeech(self):
        """Execute "detect_speech" on the session for the current grammar.

        setGrammar() must have been called first, since the command is
        built from the current grammar's name and path.
        """
        # TODO: we might not always want to call detect_speech
        # with this cmd, see js version for other options
        # also see detect_speech_function() in mod_dptools.c
        fields = (self.module_name,
                  self.grammar.name,
                  self.grammar.path,
                  self.ip_addr)
        cmd = "%s %s %s %s" % fields
        console_log("debug", "calling detect_speech with: %s\n" % cmd)
        self.session.execute("detect_speech", cmd)
        console_log("debug", "finished calling detect_speech\n")
-
class SpeechObtainer:
    """Collects recognized phrases for one grammar from the asr engine.

    Typical use: construct with a SpeechDetect, call setGrammar(), call
    detectSpeech() to start recognition, then call run() to wait for and
    return the detected phrases.
    """

    # Input-type codes passed to the session input callback. run() treats
    # 0 as dtmf input and 1 as an event (see dtmf_handler).
    INPUT_TYPE_DTMF = 0
    INPUT_TYPE_EVENT = 1

    def __init__(self, speech_detect, required_phrases, wait_time, max_tries):
        """
        @param speech_detect - the speech detect object, which holds a
                               reference to underlying session and can
                               be re-used by many SpeechObtainers
        @param required_phrases - the number of required phrases from the
                                  grammar. for example if its prompting for
                                  the toppings on a sandwich and min toppings
                                  is 3, use 3. normally will be 1.
        @param wait_time - the time, in milliseconds, to wait for
                           input during each loop iteration
        @param max_tries - this number multiplied by wait time gives the
                           'total wait time' before we give up and return
                           partial or no result
        """
        self.speech_detect = speech_detect
        self.required_phrases = required_phrases
        self.wait_time = wait_time
        self.max_tries = max_tries
        # Set by setGrammar(); initialised here so the attribute always
        # exists on the instance.
        self.grammar = None
        self.detected_phrases = []
        self.failed = False

    def setGrammar(self, grammar):
        """
        Register the grammar with the speech detect object and make it
        the current one.

        @param grammar - instance of grammar class
        """
        self.grammar = grammar
        self.speech_detect.addGrammar(grammar)
        self.speech_detect.setGrammar(self.grammar.name)

    def detectSpeech(self):
        """Start speech detection through the speech detect object."""
        self.speech_detect.detectSpeech()

    def run(self):
        """
        listen for results from the asr engine for the current grammar
        and return the list of detected phrases.

        Loops, collecting input for wait_time on each iteration, until
        the session is no longer ready, max_tries iterations have run,
        required_phrases phrases have been detected, or a result failed.
        The returned list may be partial or empty.
        """
        def dtmf_handler(data, itype, funcargs):
            # Input callback: events go to handle_event(); dtmf input
            # and unknown input types are only logged.
            console_log("INFO", "\n\nDTMF itype: %s\n" % itype)
            if itype == self.INPUT_TYPE_EVENT:
                return self.handle_event(data, funcargs)
            if itype == self.INPUT_TYPE_DTMF:
                console_log("INFO", "\n\nDTMF input: %s\n" % data)
            else:
                console_log("INFO", "\n\nUnknown input type: %s\n" % itype)
            return None

        num_tries = 0
        session = self.speech_detect.session
        console_log("debug", "setting dtmf callback\n")
        session.setDTMFCallback(dtmf_handler, "")
        console_log("debug", "calling getDigits\n")

        console_log("debug", "starting run() while loop\n")
        while (session.ready() and
               num_tries < self.max_tries and
               len(self.detected_phrases) < self.required_phrases and
               not self.failed):
            console_log("debug", "top of run() while loop\n")
            session.collectDigits(self.wait_time)
            num_tries += 1
        console_log("debug", "while loop finished\n")
        return self.detected_phrases

    def handle_event(self, event, funcargs):
        """
        when the dtmf handler receives an event, it calls back
        this method. event is a dictionary with subdictionaries ..

        Example 1
        =========
        {'body': None, 'headers': {'Speech-Type': 'begin-speaking'}}

        Example 2
        =========
        {'body': '<result xmlns='http://www.ietf.org/xml/ns/mrcpv2'
                  xmlns:ex='http://www.example.com/example' score='100'
                  grammar='session:request1@form-level.store'><interpretation>
                  <input mode='speech'>waffles</input></interpretation></result>',
         'headers': {'Speech-Type': 'detected-speech'}}

        This dictionary is constructed in run_dtmf_callback() in
        freeswitch_python.cpp

        @param event - dictionary with 'headers' and 'body' entries
        @param funcargs - passed through by the callback, unused here
        @return "stop" for every recognised speech type

        Raises Exception for an unknown Speech-Type, KeyError when the
        expected dictionary entries are missing, and whatever
        minidom.parseString raises when the body is not well-formed xml.
        """
        # what kind of event?
        headers = event['headers']
        speech_type = headers['Speech-Type']
        if speech_type == "begin-speaking":
            # not sure what to do with this, try returning "stop"
            # so that it might stop playing a sound file once
            # speech has been detected
            return "stop"
        if speech_type != "detected-speech":
            raise Exception("Unknown speech event: %s" % speech_type)

        # extract the detected phrase from the result
        # BUG: this assumes only ONE interpretation in the xml
        # result. rest will get ignored
        # NOTE: have to wrap everything with str() (at least
        # calls to console_log) because otherwise it chokes on
        # unicode strings.
        # TODO: check the score
        body = event['body']
        if not body or body == "(null)":
            # freeswitch returned a completely empty result
            self.failed = True
            # do we want to return stop? what should we return?
            return "stop"
        dom = minidom.parseString(body)
        matches = dom.getElementsByTagName(self.grammar.obj_path)
        if not matches:
            # The result has no element at obj_path. Treat it like an
            # empty result instead of raising IndexError from inside
            # the input callback.
            self.failed = True
            return "stop"
        phrase_text = self.getText(matches[0])
        if phrase_text:
            self.detected_phrases.append(str(phrase_text))
        # do we want to return stop? what should we return?
        return "stop"

    def getText(self, elt):
        """ given an element, get its text. if there is more than
        one text node child, just append all the text together.
        Only direct children are considered; text inside nested
        elements is not included.
        """
        return "".join(str(child.nodeValue)
                       for child in elt.childNodes
                       if child.nodeType == child.TEXT_NODE)
|