# Managing Linguistic Data (语言数据管理)
import nltk
# Explore one TIMIT utterance: its phone sequence, word timings, the
# corpus pronunciation dictionary, and the speaker's metadata.
utterance = 'dr1-fvmh0/sa1'
phonetic = nltk.corpus.timit.phones(utterance)
print(phonetic)
print(nltk.corpus.timit.word_times(utterance))
timitdict = nltk.corpus.timit.transcription_dict()
# Concatenated pronunciations of three words from the prompt sentence.
print(timitdict['greasy'] + timitdict['wash'] + timitdict['water'])
print(phonetic[17:30])
print(nltk.corpus.timit.spkrinfo('dr1-fvmh0'))
s1 = "00000010000000001000000"
s2 = "00000001000000010000000"
s3 = "00001000000000000001000"
res = nltk.windowdiff(s1, s1, 3)
print(res)
res = nltk.windowdiff(s1, s2, 3)
print(res)
res = nltk.windowdiff(s2, s3, 3)
print(res)
"""dict.htm.html
<p class=MsoNormal>sleep
<span style='mso-spacerun:yes'></span>
[<span class=SpellE>sli:p</span>]
<span style='mso-spacerun:yes'></span>
<b><span style='font-size:11.0pt'>v.i</span></b>
<span style='mso-spacerun:yes'></span>
<i>a condition of body and mind ...<o:p></o:p></i>
</p>
"""
import re
# Validate the part-of-speech tags scraped from a Word-exported HTML
# dictionary: every tag appearing in an 11pt font span that is not in
# the approved set is reported as illegal.
legal_pos = set(['n', 'v.t.', 'v.i.', 'adj', 'det'])
# Escape the dot so "11.0pt" matches only a literal period.
pattern = re.compile(r"'font-size:11\.0pt'>([a-z.]+)<")
with open('dict.htm.html') as html_file:  # close the handle promptly
    document = html_file.read()
used_pos = set(re.findall(pattern, document))
illegal_pos = used_pos.difference(legal_pos)
print(list(illegal_pos))
import bs4, lxml
def lexical_data(html_file):
    """Yield (lexeme, phonetics, pos, definition) 4-tuples parsed from a
    Word-exported HTML dictionary file.

    A sentinel string is injected before every ``<p>`` tag so that the
    individual entries can still be recovered after BeautifulSoup strips
    all markup.
    """
    SEP = '_INTRY'
    with open(html_file) as f:  # ensure the handle is closed on all paths
        html = f.read()
    html = re.sub(r'<p', SEP + '<p', html)
    text = bs4.BeautifulSoup(html, "lxml").get_text()
    text = ' '.join(text.split())
    for entry in text.split(SEP):
        # Keep only entries that actually contain at least four fields.
        if entry.count(' ') > 2:
            yield entry.split(' ', 3)
import csv
writer = csv.writer(open("dict1.csv", "w"))
writer.writerows(lexical_data("dict.htm.html"))
"""
# dict.csv
"sleep", "sli:p", "v.i", "a condition of body and mind ..."
"walk", "wo:k", "v.intr", "progress by lifting and setting down each foot ..."
"wake", "weik", "intrans", "cease to sleep"
"""
import csv
lexicon = csv.reader(open("dict.csv"))
pairs = [(lexeme, defn) for (lexeme, _, _, defn) in lexicon]
lexemes, defns = zip(*pairs)
defn_words = set(w for defn in defns for w in defn.split())
res = sorted(defn_words.difference(lexemes))
print(res)
"""
['"a', '"cease', '"progress', '..."', 'and', 'body', 'by', 'condition', 'down',
'each', 'foot', 'lifting', 'mind', 'of', 'setting', 'sleep"', 'to']
"""
# Build an inverted index: each definition word longer than 3 characters
# maps to the lexemes whose definitions contain it.
idx = nltk.Index((defn_word, lexeme)
                 for (lexeme, defn) in pairs
                 for defn_word in nltk.word_tokenize(defn)
                 if len(defn_word) > 3)
# Write one "word: lexeme, lexeme" line per indexed word, in sorted
# order; the with-block replaces the manual open/close pair.
with open("dict.idx", "w") as idx_file:
    for word in sorted(idx):
        idx_words = ', '.join(idx[word])
        idx_file.write("%s: %s\n" % (word, idx_words))
"""dict.idx
body: sleep
cease: wake
condition: sleep
down: walk
each: walk
foot: walk
lifting: walk
mind: sleep
progress: walk
setting: walk
sleep: wake
"""
# Ordered rewrite rules that collapse spelling variation: common
# digraphs, a silent initial 'kn', 'qu' -> 'kw', any vowel run -> 'a',
# and doubled letters -> single.
mappings = [('ph', 'f'), ('ght', 't'), ('^kn', 'n'), ('qu', 'kw'),
            ('[aeiou]+', 'a'), (r'(.)\1', r'\1')]

def signature(word):
    """Return a sound-alike key for *word*, so plausible misspellings of
    the same word share a key.

    Applies the rewrite rules, then keeps the consonant runs (each run
    sorted internally), truncated to 8 characters.
    """
    for patt, repl in mappings:
        word = re.sub(patt, repl, word)
    # Bug fix: the character class must be negated ('[^aeiou]+') -- we
    # want the consonant stretches that remain after the vowel runs were
    # collapsed; '[aeiou]+' would reduce every word's signature to a
    # string of 'a's and make all words collide.
    pieces = re.findall('[^aeiou]+', word)
    return ''.join(char for piece in pieces for char in sorted(piece))[:8]
# Demonstrate the signature function, then index the whole word list by
# signature so candidate corrections can be looked up in one step.
for example in ('illefent', 'ebsekwieous', 'nuculerr'):
    print(signature(example))
signatures = nltk.Index((signature(w), w) for w in nltk.corpus.words.words())
print(signatures[signature('nuculerr')])
from nltk.metrics import edit_distance
def rank(word, wordlist):
    """Return *wordlist* ordered by edit distance from *word*, closest first."""
    by_distance = sorted((edit_distance(word, candidate), candidate)
                         for candidate in wordlist)
    return [candidate for (_, candidate) in by_distance]
def fuzzy_spell(word):
    """Return candidate corrections for *word*, best match first.

    Looks up the word's signature in the precomputed ``signatures``
    index; an unknown signature yields an empty list.
    """
    sig = signature(word)
    if sig not in signatures:
        return []
    return rank(word, signatures[sig])
# Show the ranked corrections for some classic misspellings.
for misspelled in ('illefent', 'ebsekwieous', 'nucular'):
    print(fuzzy_spell(misspelled))