2
0

speechtools.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
  1. from freeswitch import *
  2. from xml.dom import minidom
  3. VOICE_ENGINE = "cepstral"
  4. VOICE = "William"
  5. """
  6. A few classes that make it easier to write speech applications
  7. using Python. It is roughly modelled after the equivalent that
  8. is written in JavaScript.
  9. Status: should work, but not yet complete. some pending items
  10. are mentioned in comments
  11. """
  12. class Grammar:
  13. def __init__(self, name, path, obj_path,
  14. min_score=1, confirm_score=400, halt=False):
  15. """
  16. @param name - name of grammar to reference it later
  17. @param path - path to xml grammar file
  18. @param obj_path - xml path to find interpretation from root
  19. in result xml, eg, 'interpretation'
  20. @param min_score - score threshold to accept result
  21. @param confirm_score - if score below this threshold, ask user
  22. if they are sure this is correct
  23. @param halt - not sure what was used for in js, currently unused
  24. """
  25. self.name=name
  26. self.path=path
  27. self.obj_path=obj_path
  28. self.min_score=min_score
  29. self.confirm_score=confirm_score
  30. self.halt=halt
  31. class SpeechDetect:
  32. def __init__(self, session, module_name, ip_addr):
  33. self.session=session
  34. self.module_name=module_name
  35. self.ip_addr=ip_addr
  36. self.grammars = {}
  37. def addGrammar(self, grammar):
  38. self.grammars[grammar.name]=grammar
  39. def setGrammar(self, name):
  40. self.grammar = self.grammars[name]
  41. def detectSpeech(self):
  42. # TODO: we might not always want to call detect_speech
  43. # with this cmd, see js version for other options
  44. # also see detect_speech_function() in mod_dptools.c
  45. cmd = "%s %s %s %s" % (self.module_name,
  46. self.grammar.name,
  47. self.grammar.path,
  48. self.ip_addr)
  49. console_log("debug", "calling detect_speech with: %s\n" % cmd)
  50. self.session.execute("detect_speech", cmd)
  51. console_log("debug", "finished calling detect_speech\n")
  52. class SpeechObtainer:
  53. def __init__(self, speech_detect, required_phrases, wait_time, max_tries):
  54. """
  55. @param speech_detect - the speech detect object, which holds a
  56. reference to underlying session and can
  57. be re-used by many SpeechObtainers
  58. @param required_phrases - the number of required phrases from the
  59. grammar. for example if its prompting for
  60. the toppings on a sandwhich and min toppings
  61. is 3, use 3. normally will be 1.
  62. @param wait_time - the time, in millisconds, to wait for
  63. input during each loop iteration
  64. @param max_tries - this number multiplied by wait time gives the
  65. 'total wait time' before we give up and return
  66. partial or no result
  67. """
  68. self.speech_detect=speech_detect
  69. self.required_phrases=required_phrases
  70. self.wait_time=wait_time
  71. self.max_tries=max_tries
  72. self.detected_phrases = []
  73. self.failed = False
  74. def setGrammar(self, grammar):
  75. """
  76. @param grammar - instance of grammar class
  77. """
  78. self.grammar=grammar
  79. self.speech_detect.addGrammar(grammar)
  80. self.speech_detect.setGrammar(self.grammar.name)
  81. def detectSpeech(self):
  82. self.speech_detect.detectSpeech()
  83. def run(self):
  84. """
  85. start speech detection with the current grammar,
  86. and listen for results from asr engine. once a result
  87. has been returned, return it to caller
  88. """
  89. def dtmf_handler(input, itype, funcargs):
  90. console_log("INFO","\n\nDTMF itype: %s\n" % itype)
  91. if itype == 1: # TODO!! use names for comparison instead of number
  92. return self.handle_event(input, funcargs)
  93. elif itype== 0:
  94. console_log("INFO","\n\nDTMF input: %s\n" % input)
  95. else:
  96. console_log("INFO","\n\nUnknown input type: %s\n" % itype)
  97. return None
  98. num_tries = 0
  99. session = self.speech_detect.session
  100. console_log("debug", "setting dtmf callback\n")
  101. session.setDTMFCallback(dtmf_handler, "")
  102. console_log("debug", "calling getDigits\n")
  103. console_log("debug", "starting run() while loop\n")
  104. while (session.ready() and
  105. num_tries < self.max_tries and
  106. len(self.detected_phrases) < self.required_phrases and
  107. not self.failed):
  108. console_log("debug", "top of run() while loop\n")
  109. session.collectDigits(self.wait_time)
  110. num_tries += 1
  111. console_log("debug", "while loop finished\n")
  112. return self.detected_phrases
  113. def handle_event(self, event, funcargs):
  114. """
  115. when the dtmf handler receives an event, it calls back
  116. this method. event is a dictionary with subdictionaries ..
  117. Example 1
  118. =========
  119. {'body': None, 'headers': {'Speech-Type': 'begin-speaking'}}
  120. Example 2
  121. =========
  122. {'body': '<result xmlns='http://www.ietf.org/xml/ns/mrcpv2'
  123. xmlns:ex='http://www.example.com/example' score='100'
  124. grammar='session:request1@form-level.store'><interpretation>
  125. <input mode='speech'>waffles</input></interpretation></result>',
  126. 'headers': {'Speech-Type': 'detected-speech'}}
  127. This dictionary is constructed in run_dtmf_callback() in
  128. freeswitch_python.cpp
  129. """
  130. # what kind of event?
  131. headers = event['headers']
  132. speech_type = headers['Speech-Type']
  133. if speech_type == "begin-speaking":
  134. # not sure what to do with this, try returning "stop"
  135. # so that it might stop playing a sound file once
  136. # speech has been detected
  137. return "stop"
  138. elif speech_type == "detected-speech":
  139. # extract the detected phrase. from result
  140. # BUG: this assumes only ONE interpretation in the xml
  141. # result. rest will get igored
  142. # NOTE: have to wrap everything with str() (at least
  143. # calls to console_log because otherwise it chokes on
  144. # unicode strings.
  145. # TODO: check the score
  146. body = event['body']
  147. if not body or len(body) == 0 or body == "(null)":
  148. # freeswitch returned a completely empty result
  149. self.failed = True
  150. # do we want to return stop? what should we return?
  151. return "stop"
  152. dom = minidom.parseString(body)
  153. phrase = dom.getElementsByTagName(self.grammar.obj_path)[0]
  154. phrase_text = self.getText(phrase)
  155. if phrase_text:
  156. self.detected_phrases.append(str(phrase_text))
  157. # do we want to return stop? what should we return?
  158. return "stop"
  159. else:
  160. raise Exception("Unknown speech event: %s" % speech_type)
  161. def getText(self, elt):
  162. """ given an element, get its text. if there is more than
  163. one text node child, just append all the text together.
  164. """
  165. result = ""
  166. children = elt.childNodes
  167. for child in children:
  168. if child.nodeType == child.TEXT_NODE:
  169. result += str(child.nodeValue)
  170. return result