fixdict.py

From Sphinx

#!/usr/bin/env python

"""
Fix up a dictionary that was downloaded from
http://fife.speech.cs.cmu.edu/cgi-bin/tools/lmtool.2.pl
and that may contain phonemes not recognized by the
Wall Street Journal (WSJ) acoustic models being used.

Should fix errors such as:

07:17.924 SEVERE wsj                Can't find HMM for AXR


Good dict entry:
FIRED   F AY ER D

Bad dict entry:
FIRED   F AY AXR D


"""

import re

def main(file2fix, result2write):
    """Rewrite a pronunciation dictionary, replacing unsupported phonemes.

    Each line of ``file2fix`` is treated as a headword followed by
    whitespace-separated phonemes (e.g. ``FIRED   F AY AXR D``).  Only whole
    phoneme tokens that come after the headword are rewritten, so headwords
    such as ``TAX``, ``SIX`` or ``AX`` are left untouched, as are tokens that
    merely contain one of the bad phoneme names as a substring.

    Replacements applied: AXR -> ER, AX -> AH, IX -> IH, DX -> D.
    Matching is case-insensitive; the replacement is always upper case.

    Args:
        file2fix: Path of the dictionary file to read.
        result2write: Path the corrected dictionary is written to (created
            or overwritten).  May be the same path as ``file2fix``.
    """
    replacements = {'AXR': 'ER', 'AX': 'AH', 'IX': 'IH', 'DX': 'D'}

    # A bad phoneme only counts when it is a complete whitespace-delimited
    # token.  Longest names go first so 'AXR' is never read as 'AX' + 'R'.
    alternatives = '|'.join(
        re.escape(name)
        for name in sorted(replacements, key=len, reverse=True)
    )
    bad_phoneme = re.compile(r'(?<!\S)(?:' + alternatives + r')(?!\S)', re.I)

    # Group 1: optional leading whitespace plus the headword.
    # Group 2: everything after it (separator, phonemes, line ending).
    entry = re.compile(r'(\s*\S+)(.*)', re.S)

    def substitute(match):
        return replacements[match.group(0).upper()]

    # Read everything before opening the output so that fixing a file in
    # place (same input and output path) keeps working.
    with open(file2fix, 'r') as source:
        lines = source.readlines()

    fixed_lines = []
    for line in lines:
        parsed = entry.match(line)
        if parsed is None:
            # Blank / whitespace-only line: nothing to fix.
            fixed_lines.append(line)
            continue
        headword, pronunciation = parsed.groups()
        fixed_lines.append(headword + bad_phoneme.sub(substitute, pronunciation))

    with open(result2write, 'w') as target:
        target.writelines(fixed_lines)

if __name__ == "__main__":
    import sys

    # Exactly two arguments are expected after the script name:
    # the dictionary to fix and the path for the corrected result.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        raise Exception("Usage: fixdict.py /path/to/x.dict /path/to/result.dict")
    source_path, target_path = cli_args
    main(source_path, target_path)

related