# 理解sklearn.feature_extraction.text中的CountVectorizer和TfidfVectorizer

"""

理解sklearn中的CountVectorizer和TfidfVectorizer

"""

from collections import Counter

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Tiny two-document corpus shared by every demo in this script.
sentences = [
    "there is a dog dog",
    "here is a cat",
]

# Bag-of-words counts: fit the vocabulary and vectorise in one step.
count_vec = CountVectorizer()
a = count_vec.fit_transform(sentences)

# Dense view of the sparse document-term count matrix.
dense_counts = a.toarray()
print(dense_counts)

# Mapping from each token to its column index in the matrix above.
token_to_column = count_vec.vocabulary_
print(token_to_column)

# Expected vocabulary output:
#
#     {'dog': 1, 'there': 4, 'here': 2, 'cat': 0, 'is': 3}
#
# i.e. the coordinate (column index) assigned to every word. The
# single-character token "a" does not appear in this vocabulary.

# Visual separator between the CountVectorizer and TfidfVectorizer output.
print("=" * 10)

# TF-IDF weighting of the same corpus.
tf_vec = TfidfVectorizer()
b = tf_vec.fit_transform(sentences)

# Dense view of the TF-IDF matrix (one row per sentence).
print(b.toarray())

# Token -> column index mapping learned during fit.
print(tf_vec.vocabulary_)

# Inverse document frequency of every vocabulary term, in column order.
print(tf_vec.idf_)

# Vocabulary terms in column order. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is its replacement and returns an
# ndarray, so convert to a list to keep the printed format unchanged.
print(tf_vec.get_feature_names_out().tolist())

def mytf_idf(s, words=None):
    """Compute a smoothed, L2-normalised TF-IDF matrix by hand and print it.

    The formula used is ``tf * (ln((n + 1) / (df + 1)) + 1)`` followed by
    row-wise L2 normalisation, where ``n`` is the number of documents and
    ``df`` the number of documents containing the term.

    Documents are tokenised with ``str.split()`` only (no lower-casing or
    punctuation handling), so tokens must match the vocabulary entries
    exactly to be counted.

    Args:
        s: Sequence of document strings.
        words: Optional vocabulary (iterable of terms) defining the columns.
            When omitted, the vocabulary of the module-level ``tf_vec`` is
            used, as in the original implementation.

    Returns:
        A ``float32`` array of shape ``(len(s), len(words))``. The matrix is
        also printed, which was the only output of the original version.
    """
    if words is None:
        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement and fall back only on installations that predate it.
        get_names = (
            getattr(tf_vec, "get_feature_names_out", None)
            or tf_vec.get_feature_names
        )
        words = get_names()
    words = list(words)

    smooth = 1

    # Raw term counts. The Counter is built once per document rather than
    # once per (document, term) pair; missing terms count as 0.
    tf_matrix = np.zeros((len(s), len(words)), dtype=np.float32)
    for i, sentence in enumerate(s):
        counts = Counter(sentence.split())
        tf_matrix[i] = [counts[w] for w in words]

    # Document frequency per term, with the smoothing term added so that no
    # term ever has a document frequency of zero.
    df_matrix = np.count_nonzero(tf_matrix, axis=0).astype(np.float32) + smooth

    # The "+ 1" keeps idf >= 1; it is exactly 1 for a term present in every
    # document.
    idf_matrix = np.log((len(s) + smooth) / df_matrix) + 1

    matrix = tf_matrix * idf_matrix

    # Row-wise L2 normalisation. A document containing no vocabulary term has
    # norm 0; divide by 1 instead so that its row stays all-zero, not NaN.
    norms = np.linalg.norm(matrix, 2, axis=1).reshape(matrix.shape[0], 1)
    norms[norms == 0] = 1
    matrix = matrix / norms

    print(matrix)
    return matrix

# Separator, then run the hand-written TF-IDF on the same corpus so its
# output can be compared with the TfidfVectorizer matrix printed earlier.
print("=" * 10)
mytf_idf(sentences)

# TODO:
# * The IDF weights could be learned, e.g. through neural-network
#   backpropagation, instead of being computed directly.
# * Sometimes CountVectorizer does not need the counts at all, only whether
#   a word occurred (see its `binary` option).
# 秒客网

# 理解sklearn.feature_extraction.text中的CountVectorizer和TfidfVectorizer

# 相关文章