# Managing Linguistic Data (语言数据管理)
import nltk
# Explore one TIMIT utterance: its phone sequence, word timings, the
# corpus pronunciation dictionary, and the speaker's metadata.
utterance = 'dr1-fvmh0/sa1'
phonetic = nltk.corpus.timit.phones(utterance)
print(phonetic)
print(nltk.corpus.timit.word_times(utterance))
timitdict = nltk.corpus.timit.transcription_dict()
# Concatenated pronunciations of three words from the prompt sentence.
print(timitdict['greasy'] + timitdict['wash'] + timitdict['water'])
print(phonetic[17:30])
print(nltk.corpus.timit.spkrinfo('dr1-fvmh0'))
s1 = "00000010000000001000000"
s2 = "00000001000000010000000"
s3 = "00001000000000000001000"
res = nltk.windowdiff(s1, s1, 3)
print(res)
res = nltk.windowdiff(s1, s2, 3)
print(res)
res = nltk.windowdiff(s2, s3, 3)
print(res)
"""dict.htm.html
<p class=MsoNormal>sleep
<span style='mso-spacerun:yes'></span>
[<span class=SpellE>sli:p</span>]
<span style='mso-spacerun:yes'></span>
<b><span style='font-size:11.0pt'>v.i</span></b>
<span style='mso-spacerun:yes'></span>
<i>a condition of body and mind ...<o:p></o:p></i>
</p>
"""
import re
# Validate the part-of-speech tags scraped from a Word-exported HTML
# dictionary: every tag appearing in an 11pt font span that is not in
# the approved set is reported as illegal.
legal_pos = set(['n', 'v.t.', 'v.i.', 'adj', 'det'])
# Escape the dot so "11.0pt" matches only a literal period.
pattern = re.compile(r"'font-size:11\.0pt'>([a-z.]+)<")
with open('dict.htm.html') as html_file:  # close the handle promptly
    document = html_file.read()
used_pos = set(re.findall(pattern, document))
illegal_pos = used_pos.difference(legal_pos)
print(list(illegal_pos))
import bs4, lxml
def lexical_data(html_file):
    """Yield (lexeme, phonetics, pos, definition) 4-tuples parsed from a
    Word-exported HTML dictionary file.

    A sentinel string is injected before every ``<p>`` tag so that the
    individual entries can still be recovered after BeautifulSoup strips
    all markup.
    """
    SEP = '_INTRY'
    with open(html_file) as f:  # ensure the handle is closed on all paths
        html = f.read()
    html = re.sub(r'<p', SEP + '<p', html)
    text = bs4.BeautifulSoup(html, "lxml").get_text()
    text = ' '.join(text.split())
    for entry in text.split(SEP):
        # Keep only entries that actually contain at least four fields.
        if entry.count(' ') > 2:
            yield entry.split(' ', 3)
import csv
writer = csv.writer(open("dict1.csv", "w"))
writer.writerows(lexical_data("dict.htm.html"))
"""
# dict.csv
"sleep", "sli:p", "v.i", "a condition of body and mind ..."
"walk", "wo:k", "v.intr", "progress by lifting and setting down each foot ..."
"wake", "weik", "intrans", "cease to sleep"
"""
import csv
lexicon = csv.reader(open("dict.csv"))
pairs = [(lexeme, defn) for (lexeme, _, _, defn) in lexicon]
lexemes, defns = zip(*pairs)
defn_words = set(w for defn in defns for w in defn.split())
res = sorted(defn_words.difference(lexemes))
print(res)
"""
['"a', '"cease', '"progress', '..."', 'and', 'body', 'by', 'condition', 'down',
'each', 'foot', 'lifting', 'mind', 'of', 'setting', 'sleep"', 'to']
"""
# Build an inverted index: each definition word longer than 3 characters
# maps to the lexemes whose definitions contain it.
idx = nltk.Index((defn_word, lexeme)
                 for (lexeme, defn) in pairs
                 for defn_word in nltk.word_tokenize(defn)
                 if len(defn_word) > 3)
# Write one "word: lexeme, lexeme" line per indexed word, in sorted
# order; the with-block replaces the manual open/close pair.
with open("dict.idx", "w") as idx_file:
    for word in sorted(idx):
        idx_words = ', '.join(idx[word])
        idx_file.write("%s: %s\n" % (word, idx_words))
"""dict.idx
body: sleep
cease: wake
condition: sleep
down: walk
each: walk
foot: walk
lifting: walk
mind: sleep
progress: walk
setting: walk
sleep: wake
"""
# Ordered rewrite rules that collapse spelling variation: common
# digraphs, a silent initial 'kn', 'qu' -> 'kw', any vowel run -> 'a',
# and doubled letters -> single.
mappings = [('ph', 'f'), ('ght', 't'), ('^kn', 'n'), ('qu', 'kw'),
            ('[aeiou]+', 'a'), (r'(.)\1', r'\1')]

def signature(word):
    """Return a sound-alike key for *word*, so plausible misspellings of
    the same word share a key.

    Applies the rewrite rules, then keeps the consonant runs (each run
    sorted internally), truncated to 8 characters.
    """
    for patt, repl in mappings:
        word = re.sub(patt, repl, word)
    # Bug fix: the character class must be negated ('[^aeiou]+') -- we
    # want the consonant stretches that remain after the vowel runs were
    # collapsed; '[aeiou]+' would reduce every word's signature to a
    # string of 'a's and make all words collide.
    pieces = re.findall('[^aeiou]+', word)
    return ''.join(char for piece in pieces for char in sorted(piece))[:8]
# Demonstrate the signature function, then index the whole word list by
# signature so candidate corrections can be looked up in one step.
for example in ('illefent', 'ebsekwieous', 'nuculerr'):
    print(signature(example))
signatures = nltk.Index((signature(w), w) for w in nltk.corpus.words.words())
print(signatures[signature('nuculerr')])
from nltk.metrics import edit_distance
def rank(word, wordlist):
    """Return *wordlist* ordered by edit distance from *word*, closest first."""
    by_distance = sorted((edit_distance(word, candidate), candidate)
                         for candidate in wordlist)
    return [candidate for (_, candidate) in by_distance]
def fuzzy_spell(word):
    """Return candidate corrections for *word*, best match first.

    Looks up the word's signature in the precomputed ``signatures``
    index; an unknown signature yields an empty list.
    """
    sig = signature(word)
    if sig not in signatures:
        return []
    return rank(word, signatures[sig])
# Show the ranked corrections for some classic misspellings.
for misspelled in ('illefent', 'ebsekwieous', 'nucular'):
    print(fuzzy_spell(misspelled))