Lmtool py script

From Sphinx

NOTE: This only grabs the dictionary and would have to be adapted a bit to download the language model.


#!/usr/bin/env python

import BeautifulSoup    # << download this

import httplib, mimetypes
import re
import urllib2

HOST = 'fife.speech.cs.cmu.edu'
PORT = '80'

def post_multipart(host, selector, fields, files):
    """
    Post fields and files to an http host as multipart/form-data.
    fields is a sequence of (name, value) elements for regular form fields.
    files is a sequence of (name, filename, value) elements for data to be uploaded as files
    Return the server's response page.
    """
    content_type, body = encode_multipart_formdata(fields, files)
    h = httplib.HTTP(host)
    h.putrequest('POST', selector)
    h.putheader('content-type', content_type)
    h.putheader('content-length', str(len(body)))
    h.endheaders()
    h.send(body)
    errcode, errmsg, headers = h.getreply()
    return h.file.read()

def encode_multipart_formdata(fields, files):
    """
    fields is a sequence of (name, value) elements for regular form fields.
    files is a sequence of (name, filename, value) elements for data to be uploaded as files
    Return (content_type, body) ready for httplib.HTTP instance
    """
    BOUNDARY = '---------------------------18710690068420275431780121534'
    CRLF = '\r\n'
    L = []
    for (key, value) in fields:
        L.append('--' + BOUNDARY)
        L.append('Content-Disposition: form-data; name="%s"' % key)
        L.append('')
        L.append(value)
    for (key, filename, value) in files:
        L.append('--' + BOUNDARY)
        L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename))
        L.append('Content-Type: %s' % get_content_type(filename))
        L.append('')
        L.append(value)
    L.append('--' + BOUNDARY + '--')
    L.append('')
    body = CRLF.join(L)
    content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
    return content_type, body


def get_content_type(filename):
    return mimetypes.guess_type(filename)[0] or 'application/octet-stream'


def upload_corpus(corpusfile):
    content = open(corpusfile, 'r').read()
    file2upload = ("corpus", "corpusfile.txt", content)
    fields = ("formtype", "simple")
    files2upload = [file2upload]
    result = post_multipart("%s:%s" % (HOST, PORT),
                            "/cgi-bin/tools/lmtool.2.pl",
                            [fields],
                            files2upload)
    return result


def parse_dictionary_url(upload_result):
    """
    Given html such as:
    ...
    <a href="/tools/product//1172960283_24918/3290.dic">Dictionary</a>
    ..
    parse out the relative link and construct an absolute
    url and return to caller, eg:
    fife.speech.cs.cmu.edu/tools/product//1172960283_24918/3290.dic
    """

    # sanitize by removing stuff like this:
    # <!-- BASENAME >
    # find <!- followed by any character EXCEPT a '<' or a '>', and
    # keep search until finding a >
    matcher = re.compile('<!-([^<^>]*)>', re.I)
    upload_result = matcher.sub('',upload_result)

    # parse into data struct
    soup = BeautifulSoup(upload_result)

    # find all links
    links = soup('a')
    for link in links:
        linkstr = str(link)
        if linkstr.find(".dic") != -1:
            # found link of interest
            href = link['href']
            return "http://%s%s" % (HOST, href)
    return None


def dl_content(longurl, file2write):
    """ fetch the content from a url, write to given file """
    req = urllib2.Request(longurl)
    fd = urllib2.urlopen(req)
    sinkfd = open(file2write, 'w')
    while 1:
        data = fd.read(1024)
        sinkfd.write(data)
        if not len(data):
            break

def main(corpusfile, result2write):
    upload_result = upload_corpus(corpusfile)
    dictionary_url = parse_dictionary_url(upload_result)
    dl_content(dictionary_url, result2write)

if __name__=="__main__":
    import sys
    if len(sys.argv) != 3:
        msg = "Usage: corpus2dict.py /path/to/corpusfile.txt /path/to/result.dict"
        raise Exception(msg)
    main(sys.argv[1], sys.argv[2])
related