git: https://github.com/linyi0604/MachineLearning
分别使用词袋法和 NLTK 自然语言处理包进行文本特征提取
"""Extract text features with a bag-of-words model and with NLTK.

Two sample sentences are prepared for scikit-learn's ``CountVectorizer`` and
then analysed with NLTK: tokenization, vocabulary building, Porter stemming
and part-of-speech tagging. Every intermediate result is kept in a
module-level variable so it can be inspected interactively.

The sample values quoted in the comments are the outputs recorded by the
original author; they may differ slightly across library versions.
"""
import nltk
from sklearn.feature_extraction.text import CountVectorizer

# One-time NLTK resource downloads used by word_tokenize and pos_tag:
#   nltk.download("punkt")
#   nltk.download("averaged_perceptron_tagger")
# NOTE(review): newer NLTK releases may ask for differently named resources;
# the LookupError raised on first use names the exact one to download.

sent1 = "The cat is walking in the bedroom."
sent2 = "A dog was running across the kitchen."

# ---------------------------------------------------------------------------
# Bag of words: turn the sentences into count feature vectors.
# ---------------------------------------------------------------------------
count_vec = CountVectorizer()
sentences = [sent1, sent2]
# count_vec.fit_transform(sentences).toarray() gives the count matrix
# (one row per sentence, one column per vocabulary word):
#   [[0 1 1 0 1 1 0 0 2 1 0]
#    [1 0 0 1 0 0 1 1 1 0 1]]
#
# The meaning of each column is listed by count_vec.get_feature_names_out()
# (the older get_feature_names() was removed in scikit-learn 1.2):
#   ['across', 'bedroom', 'cat', 'dog', 'in', 'is', 'kitchen', 'running',
#    'the', 'walking', 'was']

# ---------------------------------------------------------------------------
# NLTK: linguistic analysis of the same sentences.
# ---------------------------------------------------------------------------
# Split each sentence into normalized tokens. Contractions are split as
# well, e.g. "aren't" -> "are" + "n't" and "I'm" -> "I" + "'m".
tokens1 = nltk.word_tokenize(sent1)
tokens2 = nltk.word_tokenize(sent2)
# tokens1: ['The', 'cat', 'is', 'walking', 'in', 'the', 'bedroom', '.']
# tokens2: ['A', 'dog', 'was', 'running', 'across', 'the', 'kitchen', '.']

# Build each sentence's vocabulary: unique tokens sorted in ASCII order
# (punctuation first, then upper-case, then lower-case).
vocab_1 = sorted(set(tokens1))
vocab_2 = sorted(set(tokens2))
# vocab_1: ['.', 'The', 'bedroom', 'cat', 'in', 'is', 'the', 'walking']
# vocab_2: ['.', 'A', 'across', 'dog', 'kitchen', 'running', 'the', 'was']

# Reduce every token to its stem with the Porter stemmer.
stemmer = nltk.stem.PorterStemmer()
stem_1 = [stemmer.stem(t) for t in tokens1]
stem_2 = [stemmer.stem(t) for t in tokens2]
# stem_1: ['the', 'cat', 'is', 'walk', 'in', 'the', 'bedroom', '.']
# stem_2: ['A', 'dog', 'wa', 'run', 'across', 'the', 'kitchen', '.']
# NOTE(review): the casing of very short tokens such as 'A' may depend on
# the installed NLTK version -- confirm before relying on it.

# Tag every token with its part of speech.
pos_tag_1 = nltk.tag.pos_tag(tokens1)
pos_tag_2 = nltk.tag.pos_tag(tokens2)
# pos_tag_1: [('The', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('walking', 'VBG'),
#             ('in', 'IN'), ('the', 'DT'), ('bedroom', 'NN'), ('.', '.')]
# pos_tag_2: [('A', 'DT'), ('dog', 'NN'), ('was', 'VBD'), ('running', 'VBG'),
#             ('across', 'IN'), ('the', 'DT'), ('kitchen', 'NN'), ('.', '.')]