# Processing Raw Text
import nltk, re, pprint
from urllib.request import urlopen
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
raw = urlopen(url).read()
print(type(raw))
print(len(raw))
raw = raw.decode('utf-8')
tokens = nltk.word_tokenize(raw)
print(type(tokens))
print(tokens[:10])
text = nltk.Text(tokens)
print(type(text))
print(text[1020:1040])
text.collocations()
n = raw.find("PART I")
print(n)
n = raw.rfind("End of Project Gutenberg's Crime")
print(n)
raw = raw[5303:1157681]
n = raw.find("PART I")
print(n)
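# The two find() calls above locate the span between the Project Gutenberg
# header and footer by hand. A small helper doing the same slicing, as a
# sketch (the marker strings are assumptions and vary between Gutenberg files):
def trim_gutenberg(raw, start_marker="PART I", end_marker="End of Project Gutenberg"):
    start = raw.find(start_marker)
    end = raw.rfind(end_marker)
    if start == -1 or end == -1:
        return raw                      # markers absent: return the text unchanged
    return raw[start:end]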
from urllib.request import urlopen
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = urlopen(url).read()
print(html[:5])
import bs4
import lxml
raw = bs4.BeautifulSoup(html, "lxml")
raw = raw.get_text()
tokens = nltk.word_tokenize(raw)
print(tokens)
tokens = tokens[96:399]
text = nltk.Text(tokens)
text.concordance('gene')
import feedparser
import bs4
import lxml
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
print(llog['feed']['title'])
print(len(llog.entries))
post = llog.entries[2]
print(post.title)
content = post.content[0].value
print(content[:70])
txt = bs4.BeautifulSoup(content, "lxml").get_text()
t0 = nltk.word_tokenize(txt)
print(t0)
t1 = nltk.word_tokenize(bs4.BeautifulSoup(llog.entries[2].content[0].value, "lxml").get_text())
print(t1)
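# The same title access works for every post in the feed; a quick loop over
# the entries already fetched above:
for entry in llog.entries:
    print(entry.title)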
f = open('document.txt')
raw = f.read()
print(raw)
f.close()
f = open('document.txt')
for line in f:
    print(line.strip())
f.close()
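# A with-block is the more idiomatic way to read a file: it closes the file
# automatically, even if an exception interrupts the loop. A minimal sketch
# using the same document.txt:
with open('document.txt') as f:
    for line in f:
        print(line.strip())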
path = nltk.data.find('corpora/abc/rural.txt')
raw = open(path, 'r').read()
print(raw[:20])
s = input("Enter some text: ")
print(type(s), s)
raw = open('document.txt').read()
print(type(raw))
tokens = nltk.word_tokenize(raw)
print(type(tokens))
words = [w.lower() for w in tokens]
print(type(words), words)
vocab = sorted(set(words))
print(type(vocab), vocab)
vocab.append('blog')
query = 'Who knows?'
beatles = ['john', 'paul', 'george', 'ringo']
monty = 'Monty Python'
print(monty)
circus = "Monty Python's Flying Circus"
print(circus)
circus = 'Monty Python\'s Flying Circus'
print(circus)
couplet = "Shall I compare thee to a Summer's day?"\
"Thou are more Lovely and more temperate:"
print(couplet)
couplet = ("Rough winds do shake the darling duds of May, "
"And Summer's lease hath all too short a date:")
print(couplet)
couplet = """Shall I compare thee to a Summer's day? Thou are more lovely and more temperate:"""
print(couplet)
print('very' + 'very' + 'very')
print('very'*3)
a = [1, 2, 3, 4, 5, 6, 7, 6, 5, 4, 3, 2, 1]
b = [' ' * 2 * (7 - i) + 'very' * i for i in a]
for line in b:
    print(line)
monty = 'Monty Python'
print(monty)
grail = 'Holy Grail'
print(monty + grail)
print(monty, "and the", grail)
print(monty[0])
print(monty[3])
print(monty[5])
print(monty[-1])
print(monty[5])
print(monty[-7])
sent = 'colorless green ideas sleep furiously'
for char in sent:
    print(char, end='')
from nltk.corpus import gutenberg
raw = gutenberg.raw('melville-moby_dick.txt')
fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
print(fdist.most_common())   # letters ranked by frequency (keys() is not count-ordered in NLTK 3)
print(monty[6:10])
print(monty[-12:-7])
print(monty[:5])
print(monty[6:])
phrase = 'And now for something completely different'
if 'thing' in phrase:
    print('found "thing"')
query = 'Who knows?'
beatles = ['John', 'Paul', 'George', 'Ringo']
print(query[2])
print(beatles[2])
print(query[:2])
print(beatles[:2])
print(query + " I dot't")
print(beatles + ['Brian'])
beatles[0] = "John Lennon"
del beatles[-1]
print(beatles)
# Strings, unlike lists, are immutable, so item assignment on a string fails:
# query[0] = 'F'    # raises TypeError: 'str' object does not support item assignment
path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')
import codecs
f = codecs.open(path, encoding='latin2')
for line in f:
    line = line.strip()
    print(line.encode('unicode_escape'))
print(ord('a'))
a = u'\u0061'
print(a)
nacute = u'\u0144'
print(nacute)
nacute_utf = nacute.encode('utf8')
print(repr(nacute_utf))
import unicodedata
lines = codecs.open(path, encoding='latin2').readlines()
line = lines[2]
print(line.encode('unicode_escape'))
for c in line:
    if ord(c) > 127:
        print('%r U+%04x %s' % (c.encode('utf8'), ord(c), unicodedata.name(c)))
print(line.find(u'zosta\u0142y'))
line = line.lower()
print(line.encode('unicode_escape'))
import re
m = re.search(r'\u015b\w*', line)
print(m.group())
print(nltk.word_tokenize(line))
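# Two strings can display identically yet differ at the codepoint level,
# e.g. precomposed 'ń' versus 'n' plus a combining accent. A sketch showing
# how unicodedata.normalize makes such strings comparable:
import unicodedata
composed = '\u0144'               # LATIN SMALL LETTER N WITH ACUTE
decomposed = 'n\u0301'            # 'n' followed by COMBINING ACUTE ACCENT
print(composed == decomposed)                                # False
print(unicodedata.normalize('NFC', decomposed) == composed)  # True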
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
print(wordlist)
res = [w for w in wordlist if re.search('ed$', w)]
print(res)
res = [w for w in wordlist if re.search('^..j..t..$', w)]
print(res)
res = [w for w in wordlist if re.search('^[hgi][mno][jlk][def]$', w)]
print(res)
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))
res = [w for w in chat_words if re.search('^m+i+n+e+$', w)]
print(res)
res = [w for w in chat_words if re.search('^[ha]+$', w)]
print(res)
wsj = sorted(set(nltk.corpus.treebank.words()))
res = [w for w in wsj if re.search(r'^[0-9]+\.[0-9]+$', w)]
print(res)
res = [w for w in wsj if re.search('^[A-Z]+$', w)]
print(res)
res = [w for w in wsj if re.search('^[0-9]{4}$', w)]
print(res)
res = [w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)]
print(res)
res = [w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)]
print(res)
res = [w for w in wsj if re.search('(ed|ing)$', w)]
print(res)
word = 'supercalifragilisticexpialidocious'
res = re.findall(r'[aeiou]', word)
print(res)
print(len(res))
wsj = sorted(set(nltk.corpus.treebank.words()))
fd = nltk.FreqDist(vs for word in wsj for vs in re.findall(r'[aeiou]{2,}', word))
print(fd.most_common(12))
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)
english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
cfd = nltk.ConditionalFreqDist(cvs)
cfd.tabulate()
cv_word_pairs = [(cv, w) for w in rotokas_words
for cv in re.findall(r'[ptksvr][aeiou]', w)]
cv_index = nltk.Index(cv_word_pairs)
print(cv_index['su'])
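# nltk.Index behaves like a defaultdict(list) keyed on the first member of
# each pair; the same lookup table built by hand gives identical results:
from collections import defaultdict
cv_index2 = defaultdict(list)
for cv, w in cv_word_pairs:
    cv_index2[cv].append(w)
print(cv_index2['su'] == cv_index['su'])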
def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word    # no recognized suffix: return the word unchanged
print(re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing'))
print(re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing'))
print(re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing'))
print(re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes'))
print(re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes'))
def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$'
    res = re.findall(regexp, word)
    if res:
        return res[0][0]    # the stem part of the (stem, suffix) tuple
    return word             # no recognized suffix: return the word unchanged
raw = """DENNIS: Listen, strange women lying in ponds distributing swords ... is no basis for a system of government. Supreme executive power derivers from ... a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = nltk.word_tokenize(raw)
print(tokens)
res = [stem(t) for t in tokens if len(t) > 0]
print(res)
from nltk.corpus import gutenberg, nps_chat
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(r"<a> (<.*>) <man>")
chat = nltk.Text(nps_chat.words())
chat.findall(r"<.*> <.*> <bro>")
chat.findall(r"<l.*>{3,}")
from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")
raw = """DENNIS: Listen, strange women lying in ponds distributing swords ... is no basis for a system of government. Supreme executive power derivers from ... a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = nltk.word_tokenize(raw)
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
res = [porter.stem(t) for t in tokens]
print(res)
res = [lancaster.stem(t) for t in tokens]
print(res)
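# The two stemmers often disagree; printing only the tokens where their
# outputs differ makes the contrast easy to scan (a sketch reusing the same
# tokens, porter, and lancaster defined above):
for t in tokens:
    p, l = porter.stem(t), lancaster.stem(t)
    if p != l:
        print('%-15s porter: %-12s lancaster: %s' % (t, p, l))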
class IndexedText(object):

    def __init__(self, stemmer, text):
        self._text = text
        self._stemmer = stemmer
        self._index = nltk.Index((self._stem(word), i)
                                 for (i, word) in enumerate(text))

    def concordance(self, word, width=40):
        key = self._stem(word)
        wc = int(width / 4)    # words of context on each side
        for i in self._index[key]:
            lcontext = ' '.join(self._text[i-wc:i])
            rcontext = ' '.join(self._text[i:i+wc])
            ldisplay = '%*s' % (width, lcontext[-width:])
            rdisplay = '%-*s' % (width, rcontext[:width])
            print(ldisplay, rdisplay)

    def _stem(self, word):
        return self._stemmer.stem(word).lower()
porter = nltk.PorterStemmer()
grail = nltk.corpus.webtext.words('grail.txt')
text = IndexedText(porter, grail)
text.concordance("lie")
wnl = nltk.WordNetLemmatizer()
res = [wnl.lemmatize(t) for t in tokens]
print(res)
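# lemmatize() assumes noun part-of-speech unless told otherwise, so verb
# forms can pass through unchanged; supplying pos='v' changes the result:
print(wnl.lemmatize('lying'))             # 'lying' (analyzed as a noun)
print(wnl.lemmatize('lying', pos='v'))    # 'lie'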
raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone ... though), 'I won't have any pepper in my kitchen AT ALL. Soup does very ... well without--Maybe it's always pepper that makes people hot-tempered,'..."""
print(re.split(r' ', raw))
print(re.split(r'[ \t\n]+', raw))
print(re.split(r'\W+', raw))
print(re.findall(r'\w+|\S\w*', raw))
print(re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", raw))
text = 'That U.S.A poster-print costs $12.40...'
pattern = r"""(?x) ([A-Z]\.)+ | \w+(-\w+)* | \$?\d+(\.\d+)?%? | \.\.\. | [][.,;"'?():-_`] """
nltk.regexp_tokenize(text, pattern)
l1 = len(nltk.corpus.brown.words()) / len(nltk.corpus.brown.sents())
print(l1)
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = sent_tokenizer.tokenize(text)
pprint.pprint(sents[171:181])
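# nltk.sent_tokenize wraps the same pre-trained Punkt model, so the explicit
# pickle load above can usually be replaced by a single call; this comparison
# should normally print True:
sents2 = nltk.sent_tokenize(text)
print(sents2[171:181] == sents[171:181])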
def segment(text, segs):
    words = []
    last = 0
    for i in range(len(segs)):
        if segs[i] == '1':    # a '1' marks a word boundary after position i
            words.append(text[last:i+1])
            last = i + 1
    words.append(text[last:])
    return words
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
res = segment(text, seg1)
print(res)
res = segment(text, seg2)
print(res)
def evaluate(text, segs):
    words = segment(text, segs)
    text_size = len(words)                            # words in the derivation
    lexicon_size = len(' '.join(list(set(words))))    # characters in the lexicon
    return text_size + lexicon_size                   # smaller is better
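# A tiny worked check of the objective function on a toy string (an
# illustrative assumption, not part of the original example): two derived
# words plus the seven-character lexicon 'the cat' score 2 + 7 = 9.
print(segment('thecat', '00100'))     # ['the', 'cat']
print(evaluate('thecat', '00100'))    # 9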
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
seg3 = "0000100100000011001000000110000100010000001100010000001"
res = segment(text, seg3)
print(res)
res = evaluate(text, seg3)
print(res)
res = evaluate(text, seg2)
print(res)
res = evaluate(text, seg1)
print(res)
from random import randint
def flip(segs, pos):
    return segs[:pos] + str(1 - int(segs[pos])) + segs[pos + 1:]

def flip_n(segs, n):
    for i in range(n):
        segs = flip(segs, randint(0, len(segs)-1))
    return segs
def anneal(text, segs, iterations, cooling_rate):
    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for i in range(iterations):
            # perturb roughly `temperature` boundary bits, keep the best guess
            guess = flip_n(segs, int(round(temperature)))
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        score, segs = best, best_segs
        temperature = temperature / cooling_rate
        print(evaluate(text, segs), segment(text, segs))
    print("")
    return segs
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
anneal(text, seg1, 5000, 1.2)
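# anneal() draws random flips via randint, so each run can settle on a
# different segmentation. Seeding the generator first makes a run
# reproducible (the seed value 0 is arbitrary):
import random
random.seed(0)
anneal(text, seg1, 5000, 1.2)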
silly = ['We', 'called', 'him', 'Tortoise', 'because', 'he', 'taught', 'us', '.']
res = ' '.join(silly)
print(res)
res = ';'.join(silly)
print(res)
res = ''.join(silly)
print(res)
word = 'cat'
sentence = """hello world"""
print(word)
print(sentence)
fdist = nltk.FreqDist(['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])
for word in fdist:
    print(word, '->', fdist[word], end='; ')
print("")
for word in fdist:
    print('%s->%d;' % (word, fdist[word]), end='')
print("")
template = 'Lee wants a %s right now'
menu = ['sandwich', 'spam fritter', 'pancake']
for snack in menu:
    print(template % snack)
print('%6s' % 'dog')
print('%-6s' % 'dog')
width = 6
print('%-*s' % (width, 'dog'))
count, total = 3205, 9375
print("accuracy for %d words: %2.4f%%" % (total, 100*count/total))
def tabulate(cfdist, words, categories):
    print('%-16s' % 'Category', end='')
    for word in words:                       # column headings
        print('%6s' % word, end='')
    print("")
    for category in categories:
        print('%-16s' % category, end='')    # row heading
        for word in words:
            print('%6d' % cfdist[category][word], end='')
        print("")
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
tabulate(cfd, modals, genres)
output_file = open('output.txt', 'w')
words = set(nltk.corpus.genesis.words('english-kjv.txt'))
for word in sorted(words):
    output_file.write(word + "\n")
print(len(words))
print(str(len(words)))
output_file.write(str(len(words)) + "\n")
output_file.close()
from textwrap import fill
saying = ['After', 'all', 'is', 'said', 'and', 'done', ',', 'more', 'is', 'said', 'than', 'done', '.']
for word in saying:
    print(word, '(' + str(len(word)) + '),', end=' ')
print("")
format = '%s (%d),'
pieces = [format % (word, len(word)) for word in saying]
output = ' '.join(pieces)
wrapped = fill(output)
print(wrapped)