# From Sphinx
#!/usr/bin/env python
"""
Fix up a dictionary that was downloaded from
http://fife.speech.cs.cmu.edu/cgi-bin/tools/lmtool.2.pl
and that may contain phonemes not recognized by the
Wall Street Journal (WSJ) acoustic models being used.

Should fix errors such as:
    07:17.924 SEVERE wsj Can't find HMM for AXR

Good dict entry:
    FIRED F AY ER D

Bad dict entry:
    FIRED F AY AXR D
"""
import re
def main(file2fix, result2write):
    """Rewrite a pronunciation dictionary, replacing unsupported phonemes.

    Each line of the dictionary is expected to be a word followed by its
    whitespace-separated phonemes.  Phonemes that the acoustic model does
    not know are mapped to supported ones (AXR -> ER, AX -> AH, IX -> IH,
    DX -> D).  Matching is case-insensitive; replacements are upper case.

    Only whole phoneme tokens after the leading word are replaced, so
    words that merely contain those letters (TAX, SIX, the word AX itself)
    keep their spelling.

    Args:
        file2fix: path of the dictionary file to read.
        result2write: path of the corrected dictionary to write.  May be
            the same as ``file2fix``; the input is fully read before the
            output is opened.
    """
    mappings = [('AXR', 'ER'), ('AX', 'AH'), ('IX', 'IH'), ('DX', 'D')]
    replacements = dict(mappings)

    # Match a bad phoneme only when it is a complete whitespace-delimited
    # token, never a fragment of a longer token.
    bad_phoneme = re.compile(
        r'(?<!\S)(' + '|'.join(re.escape(bad) for bad, _ in mappings)
        + r')(?!\S)',
        re.I)
    # Leading whitespace plus the first token: the word being defined.
    leading_word = re.compile(r'\s*\S*')

    def replace_token(match):
        return replacements[match.group(1).upper()]

    with open(file2fix, 'r') as source:
        lines = source.readlines()

    fixed_lines = []
    for line in lines:
        word = leading_word.match(line).group(0)
        pronunciation = line[len(word):]
        fixed_lines.append(word + bad_phoneme.sub(replace_token,
                                                  pronunciation))

    with open(result2write, 'w') as target:
        target.write(''.join(fixed_lines))
if __name__ == "__main__":
    import sys

    # Exactly two positional arguments are required after the script
    # name: the dictionary to read and the path of the corrected copy.
    arguments = sys.argv[1:]
    if len(arguments) != 2:
        raise Exception(
            "Usage: fixdict.py /path/to/x.dict /path/to/result.dict")

    source_path, target_path = arguments
    main(source_path, target_path)