Analyzing Sentence Structure
import nltk
groucho_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
parser = nltk.ChartParser(groucho_grammar)
trees = parser.parse(sent)
for tree in trees:
    print(tree)
"""
(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))
"""
grammar1 = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP | V NP PP
PP -> P NP
V -> "saw" | "ate" | "walked"
NP -> "John" | "Mary" | "Bob" | Det N | Det N PP
Det -> "a" | "an" | "the" | "my"
N -> "man" | "dog" | "cat" | "telescope" | "park"
P -> "in" | "on" | "by" | "with"
""")
sent = "Mary saw Bob".split()
rd_parser = nltk.RecursiveDescentParser(grammar1)
for tree in rd_parser.parse(sent):
    print(tree)
"""(S (NP Mary) (VP (V saw) (NP Bob)))"""
grammar1 = nltk.data.load('file:mygrammar.cfg')
sent = "Mary saw Bob".split()
rd_parser = nltk.RecursiveDescentParser(grammar1)
for tree in rd_parser.parse(sent):
    print(tree)
"""(S (NP Mary) (VP (V saw) (NP Bob)))"""
grammar2 = nltk.CFG.fromstring("""
S -> NP VP
NP -> Det Nom | PropN
Nom -> Adj Nom | N
VP -> V Adj | V NP | V S | V NP PP
PP -> P NP
PropN -> 'Buster' | 'Chatterer' | 'Joe'
Det -> 'the' | 'a'
N -> 'bear' | 'squirrel' | 'tree' | 'fish' | 'log'
Adj -> 'angry' | 'frightened' | 'little' | 'tall'
V -> 'chased' | 'saw' | 'said' | 'thought' | 'was' | 'put'
P -> 'on'
""")
def init_wfst(tokens, grammar):
    # Seed the chart: cell [i][i+1] holds the lexical category of token i.
    numtokens = len(tokens)
    wfst = [[None for i in range(numtokens+1)] for j in range(numtokens+1)]
    for i in range(numtokens):
        productions = grammar.productions(rhs=tokens[i])
        wfst[i][i+1] = productions[0].lhs()
    return wfst
def complete_wfst(wfst, tokens, grammar, trace=False):
    # Fill in wider spans bottom-up: if adjacent cells hold (B, C) and the
    # grammar has a rule A -> B C, record A for the combined span.
    index = dict((p.rhs(), p.lhs()) for p in grammar.productions())
    numtokens = len(tokens)
    for span in range(2, numtokens + 1):
        for start in range(numtokens + 1):
            end = start + span
            if end > numtokens: break
            for mid in range(start+1, end):
                nt1, nt2 = wfst[start][mid], wfst[mid][end]
                if nt1 and nt2 and (nt1, nt2) in index:
                    wfst[start][end] = index[(nt1, nt2)]
                    if trace:
                        print("[%s] %3s [%s] %3s [%s] ==> [%s] %3s [%s]"
                              % (start, nt1, mid, nt2, end, start, index[(nt1, nt2)], end))
    return wfst
def display(wfst, tokens):
    print('\nWFST ' + ' '.join([("%-4d" % i) for i in range(1, len(wfst))]))
    for i in range(len(wfst)-1):
        print("%d   " % i, end="")
        for j in range(1, len(wfst)):
            print("%-4s" % (wfst[i][j] or '.'), end="")
        print("")
tokens = "I shot an elephant in my pajamas".split()
wfst0 = init_wfst(tokens, groucho_grammar)
display(wfst0, tokens)
"""
WFST 1    2    3    4    5    6    7
0    NP   .    .    .    .    .    .
1    .    V    .    .    .    .    .
2    .    .    Det  .    .    .    .
3    .    .    .    N    .    .    .
4    .    .    .    .    P    .    .
5    .    .    .    .    .    Det  .
6    .    .    .    .    .    .    N
"""
wfst1 = complete_wfst(wfst0, tokens, groucho_grammar)
display(wfst1, tokens)
"""
WFST 1    2    3    4    5    6    7
0    NP   .    .    S    .    .    S
1    .    V    .    VP   .    .    VP
2    .    .    Det  NP   .    .    .
3    .    .    .    N    .    .    .
4    .    .    .    .    P    .    PP
5    .    .    .    .    .    Det  NP
6    .    .    .    .    .    .    N
"""
wfst1 = complete_wfst(wfst0, tokens, groucho_grammar, trace=True)
"""
[2] Det [3] N [4] ==> [2] NP [4]
[5] Det [6] N [7] ==> [5] NP [7]
[1] V [2] NP [4] ==> [1] VP [4]
[4] P [5] NP [7] ==> [4] PP [7]
[0] NP [1] VP [4] ==> [0] S [4]
[1] VP [4] PP [7] ==> [1] VP [7]
[0] NP [1] VP [7] ==> [0] S [7]
"""
import nltk
groucho_dep_grammar = nltk.grammar.DependencyGrammar.fromstring("""
'shot' -> 'I' | 'elephant' | 'in'
'elephant' -> 'an' | 'in'
'in' -> 'pajamas'
'pajamas' -> 'my'
""")
print(groucho_dep_grammar)
"""
Dependency grammar with 7 productions
'shot' -> 'I'
'shot' -> 'elephant'
'shot' -> 'in'
'elephant' -> 'an'
'elephant' -> 'in'
'in' -> 'pajamas'
'pajamas' -> 'my'
"""
pdp = nltk.ProjectiveDependencyParser(groucho_dep_grammar)
sent = 'I shot an elephant in my pajamas'.split()
trees = pdp.parse(sent)
for tree in trees:
    print(tree)
"""
(shot I (elephant an (in (pajamas my))))
(shot I (elephant an) (in (pajamas my)))
"""
from nltk.corpus import treebank
t = treebank.parsed_sents('wsj_0001.mrg')[0]
print(t)
"""
(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))
"""
def filter(tree):
    # Keep VP nodes that have a sentential (S) complement among their children.
    child_nodes = [child.label() for child in tree if isinstance(child, nltk.Tree)]
    return (tree.label() == 'VP') and ('S' in child_nodes)
from nltk.corpus import treebank
res = [subtree for tree in treebank.parsed_sents()
               for subtree in tree.subtrees(filter)]
print(res)
"""[Tree('VP', [Tree('VBN', ['named']), ..."""
import nltk
from collections import defaultdict
entries = nltk.corpus.ppattach.attachments('training')
# Map each noun1-prep-noun2 triple to the verbs seen with N- vs. V-attachment.
table = defaultdict(lambda: defaultdict(set))
for entry in entries:
    key = entry.noun1 + '-' + entry.prep + '-' + entry.noun2
    table[key][entry.attachment].add(entry.verb)
for key in sorted(table):
    if len(table[key]) > 1:
        print(key, 'N:', sorted(table[key]['N']), 'V:', sorted(table[key]['V']))
"""
%-below-level N: ['left'] V: ['be']
%-from-year N: ['was'] V: ['declined', 'dropped', 'fell', 'grew', 'increased', 'plunged', 'rose', 'was']
...
"""
nltk.corpus.sinica_treebank.parsed_sents()[3450].draw()
import nltk
grammar = nltk.CFG.fromstring("""
S -> NP V NP
NP -> NP Sbar
Sbar -> NP V
NP -> 'fish'
V -> 'fish'
""")
tokens = ["fish"] * 5
cp = nltk.ChartParser(grammar)
for tree in cp.parse(tokens):
    print(tree)
"""
(S (NP fish) (V fish) (NP (NP fish) (Sbar (NP fish) (V fish))))
(S (NP (NP fish) (Sbar (NP fish) (V fish))) (V fish) (NP fish))
"""
def give(t):
    # VP headed by give/gave whose object NP is followed by either a second NP
    # (double-object construction) or a PP-DTV (dative prepositional phrase).
    return (t.label() == 'VP' and len(t) > 2 and t[1].label() == 'NP'
            and (t[2].label() == 'PP-DTV' or t[2].label() == 'NP')
            and ('give' in t[0].leaves() or 'gave' in t[0].leaves()))
def sent(t):
    # Join the leaves, dropping treebank empty elements (tokens starting with *, -, or 0).
    return ' '.join(token for token in t.leaves() if token[0] not in '*-0')
def print_node(t, width):
    output = "%s %s: %s / %s: %s" % \
        (sent(t[0]), t[1].label(), sent(t[1]), t[2].label(), sent(t[2]))
    if len(output) > width:
        output = output[:width] + "..."
    print(output)
for tree in nltk.corpus.treebank.parsed_sents():
    for t in tree.subtrees(give):
        print_node(t, 72)
"""
gave NP: the chefs / NP: a standing ovation
give NP: advertisers / NP: discounts for maintaining or increasing ad sp...
give NP: it / PP-DTV: to the politicians
gave NP: them / NP: similar help
give NP: them / NP:
give NP: only French history questions / PP-DTV: to students in a Europe...
give NP: federal judges / NP: a raise
give NP: consumers / NP: the straight scoop on the U.S. waste crisis
gave NP: Mitsui / NP: access to a high-tech medical product
give NP: Mitsubishi / NP: a window on the U.S. glass industry
give NP: much thought / PP-DTV: to the rates she was receiving , nor to ...
give NP: your Foster Savings Institution / NP: the gift of hope and free...
give NP: market operators / NP: the authority to suspend trading in futu...
gave NP: quick approval / PP-DTV: to $ 3.18 billion in supplemental appr...
give NP: the Transportation Department / NP: up to 50 days to review any...
give NP: the president / NP: such power
give NP: me / NP: the heebie-jeebies
give NP: holders / NP: the right , but not the obligation , to buy a cal...
gave NP: Mr. Thomas / NP: only a `` qualified '' rating , rather than ``...
give NP: the president / NP: line-item veto power
"""
import nltk
grammar = nltk.PCFG.fromstring("""
S -> NP VP [1.0]
VP -> TV NP [0.4]
VP -> IV [0.3]
VP -> DatV NP NP [0.3]
TV -> 'saw' [1.0]
IV -> 'ate' [1.0]
DatV -> 'gave' [1.0]
NP -> 'telescopes' [0.8]
NP -> 'Jack' [0.2]
""")
print(grammar)
"""
Grammar with 9 productions (start state = S)
S -> NP VP [1.0]
VP -> TV NP [0.4]
VP -> IV [0.3]
VP -> DatV NP NP [0.3]
TV -> 'saw' [1.0]
IV -> 'ate' [1.0]
DatV -> 'gave' [1.0]
NP -> 'telescopes' [0.8]
NP -> 'Jack' [0.2]
"""
viterbi_parser = nltk.ViterbiParser(grammar)
trees = viterbi_parser.parse(['Jack', 'saw', 'telescopes'])
for tree in trees:
    print(tree)
"""
(S (NP Jack) (VP (TV saw) (NP telescopes))) (p=0.064)
"""