Week N4: Text Embeddings in NLP

Time: 2024-10-26 21:59:30
  • This post is a learning-record blog from the 365-day deep learning training camp (365天深度学习训练营)
  • Original author: K同学啊

This week's task:
Load the .txt file from Week N1 and complete word embedding with both EmbeddingBag and Embedding.

Embedding

Custom dataset class

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

class MyDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        return text, label

Define the padding (collate) function

# Custom collate function: right-pad every sequence in the batch to the length
# of the longest one, and stack the labels into a [batch, 1] float tensor
def collate_batch(batch):
    texts, labels = zip(*batch)
    max_len = max(len(text) for text in texts)
    padded_texts = [F.pad(text, (0, max_len - len(text)), value=0) for text in texts]
    padded_texts = torch.stack(padded_texts)
    labels = torch.tensor(labels, dtype=torch.float).unsqueeze(1)
    return padded_texts, labels
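
A quick sanity check of collate_batch on a hand-made batch (a minimal sketch, not part of the original steps; demo_batch is an illustrative name): two sequences of lengths 4 and 2 are right-padded with 0 to length 4, and the two scalar labels become a [2, 1] float tensor.

# Illustrative check of collate_batch (assumes the imports above)
demo_batch = [
    (torch.tensor([1, 1, 1, 1], dtype=torch.long), 4.0),
    (torch.tensor([2, 2], dtype=torch.long), 5.0),
]
demo_texts, demo_labels = collate_batch(demo_batch)
print(demo_texts)          # rows: [1, 1, 1, 1] and [2, 2, 0, 0]
print(demo_labels.shape)   # torch.Size([2, 1])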

Prepare the data and the data loader

# Prepare the toy data and the data loader
text_data = [
    torch.tensor([1, 1, 1, 1], dtype=torch.long),
    torch.tensor([2, 2, 2], dtype=torch.long),
    torch.tensor([3, 3], dtype=torch.long)
]

labels = torch.tensor([4, 5, 6], dtype=torch.float)

my_dataset = MyDataset(text_data, labels)
data_loader = DataLoader(my_dataset, batch_size=2, shuffle=True, collate_fn=collate_batch)

for batch in data_loader:
    print(batch)

Define the model

# Define the model: an Embedding layer, mean pooling over the sequence, then a linear head
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(EmbeddingModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, text):
        print('embedding input text:', text)
        print('embedding input shape:', text.shape)
        embedding = self.embedding(text)            # [batch, seq_len, embed_dim]
        embedding_mean = embedding.mean(dim=1)      # [batch, embed_dim]
        print('embedding output shape:', embedding_mean.shape)
        return self.fc(embedding_mean)
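
Before training, it helps to see the shapes nn.Embedding produces. The sketch below (toy values only, not part of the original post) mirrors the forward pass above: a [batch, seq_len] index tensor becomes [batch, seq_len, embed_dim], and mean(dim=1) pools it to [batch, embed_dim].

# Shape walk-through for nn.Embedding (illustrative values)
emb = nn.Embedding(num_embeddings=10, embedding_dim=6)
x = torch.tensor([[1, 2, 3, 0], [4, 5, 0, 0]])     # [2, 4] batch of index sequences
out = emb(x)                                        # [2, 4, 6]
pooled = out.mean(dim=1)                            # [2, 6]
print(out.shape, pooled.shape)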

Train the model

vocab_size = 10
embed_dim = 6

model = EmbeddingModel(vocab_size, embed_dim)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(1):
    for batch in data_loader:
        texts, labels = batch
        outputs = model(texts)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')
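
After training, the model can score a new index sequence. A minimal inference sketch (not from the original post; sample is a hypothetical input), using sigmoid because training used BCEWithLogitsLoss:

# Minimal inference sketch (assumes the model trained above)
model.eval()
with torch.no_grad():
    sample = torch.tensor([[1, 2, 3]], dtype=torch.long)   # hypothetical [1, seq_len] input
    prob = torch.sigmoid(model(sample))                     # logits -> probability
print(prob)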

EmbeddingBag

The preceding steps are the same as above.

Define the model

# Define the model: EmbeddingBag performs the lookup and the mean pooling in one step

class EmbeddingBagModel(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(EmbeddingBagModel, self).__init__()
        self.embedding_bag = nn.EmbeddingBag(vocab_size, embed_dim, mode='mean')
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, text, offsets):
        print('embedding_bag input text:', text)
        print('embedding_bag input shape:', text.shape)
        embedded = self.embedding_bag(text, offsets)   # [num_bags, embed_dim]
        print('embedding_bag output shape:', embedded.shape)
        return self.fc(embedded)
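
Unlike nn.Embedding, nn.EmbeddingBag takes a flat 1-D index tensor plus an offsets tensor marking where each sequence ("bag") starts, and returns one pooled vector per bag. A minimal sketch with toy values (not part of the original post):

# How EmbeddingBag consumes flat indices + offsets (illustrative values)
bag = nn.EmbeddingBag(num_embeddings=10, embedding_dim=6, mode='mean')
flat = torch.tensor([1, 1, 1, 1, 2, 2, 2, 3, 3])   # three sequences concatenated
offsets = torch.tensor([0, 4, 7])                   # each sequence's start position
print(bag(flat, offsets).shape)                     # torch.Size([3, 6])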

Train the model

vocab_size = 10
embed_dim = 6

model = EmbeddingBagModel(vocab_size, embed_dim)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(1):
    for batch in data_loader:
        # collate_batch has already padded the batch, so every "text" here has the
        # same length and the padding index 0 is included in each bag's mean
        texts, labels = zip(*batch)

        # offsets mark where each sequence starts in the flattened index tensor
        offsets = [0] + [len(text) for text in texts[:-1]]
        offsets = torch.tensor(offsets).cumsum(dim=0)
        texts = torch.cat(texts)
        labels = torch.tensor(labels).unsqueeze(1)

        outputs = model(texts, offsets)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')
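
The two approaches are closely related: with identical weights and equal-length sequences (so no padding is involved), EmbeddingBag with mode='mean' matches nn.Embedding followed by mean(dim=1). A quick check (a sketch, not part of the original post):

# Embedding + mean(dim=1) vs EmbeddingBag(mode='mean') with shared weights
emb = nn.Embedding(10, 6)
bag = nn.EmbeddingBag(10, 6, mode='mean')
bag.weight.data.copy_(emb.weight.data)               # share the same lookup table

x = torch.tensor([[1, 2, 3], [4, 5, 6]])             # [2, 3]
flat, offsets = x.reshape(-1), torch.tensor([0, 3])
print(torch.allclose(emb(x).mean(dim=1), bag(flat, offsets)))   # True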

Embedding with the task file 任务文件.txt

Embedding

import torch
from torch import nn
import torch.nn.functional as F 
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import jieba
import numpy as np

# Custom collate (padding) function
def collate_batch(batch):
    texts, labels = zip(*batch)
    max_len = max(len(text) for text in texts)
    padded_texts = [F.pad(text, (0, max_len - len(text)), value=0) for text in texts]
    padded_texts = torch.stack(padded_texts)
    labels = torch.tensor(labels, dtype=torch.float).unsqueeze(1)
    return padded_texts, labels

# Read the text content from the local txt file
with open("F:/365data/N1/任务文件.txt", 'r', encoding='utf-8') as file:
    texts1 = [line.strip() for line in file]

# Tokenize with jieba
tokenized_texts = [list(jieba.cut(text)) for text in texts1]

# Build the vocabulary
word_index = {}
index_word = {}
for i, word in enumerate(set([word for text in tokenized_texts for word in text])):
    word_index[word] = i
    index_word[i] = word

# Vocabulary size (+1 leaves room for the padding value 0)
# Note: the loop above also assigns index 0 to a real word, so padding collides
# with it; starting word indices at 1 (enumerate(..., start=1)) would avoid this
vocab_size = len(word_index) + 1

# Convert the texts to index sequences
texts = [[word_index[word] for word in text] for text in tokenized_texts]
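
# Sanity check (illustrative, not in the original post): the first few jieba
# tokens of line 1 and the indices they were mapped to via word_index
print(tokenized_texts[0][:5])
print(texts[0][:5])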

# Manually assign the labels
# (assume line 1 has label 1.0 and line 2 has label 2.0)
labels = [1.0, 2.0]

# Define the custom dataset class
class MyDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        return torch.tensor(text, dtype=torch.long), torch.tensor(label, dtype=torch.float)

# Create the dataset
my_dataset = MyDataset(texts, labels)

# Create the data loader
data_loader = DataLoader(my_dataset, batch_size=2, shuffle=True, collate_fn=collate_batch)

# Print the batches produced by the data loader
for batch in data_loader:
    texts, labels = batch
    print("texts:", texts)
    print("Labels:", labels)

# Define the model
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(EmbeddingModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, text):
        print('embedding input text:', text)
        print('embedding input shape:', text.shape)
        embedding = self.embedding(text)
        embedding_mean = embedding.mean(dim=1)
        print('embedding output shape:', embedding_mean.shape)
        return self.fc(embedding_mean)

embed_dim = 6

model = EmbeddingModel(vocab_size, embed_dim)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(1):
    for batch in data_loader:
        texts, labels = batch
        outputs = model(texts)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\admin\AppData\Local\Temp\jieba.cache
Loading model cost 0.566 seconds.
Prefix dict has been built successfully.
texts: tensor([[22, 45, 69, 23, 24, 70, 73, 34, 25, 75, 21, 52, 78, 62, 64, 21, 10, 56,
         34, 25, 75, 21,  4, 42, 47, 27, 35, 32, 54, 16, 36,  7, 83, 24, 74, 80,
          7, 81,  4, 15, 51, 17, 24, 67, 81,  4, 15, 51, 46, 56, 79, 24, 32, 54,
         44,  8, 82, 66,  4, 24, 12, 49, 31, 71,  6, 59, 56, 65, 24, 38, 41, 54,
          4, 20, 24, 58, 40, 60,  4, 34, 25, 75, 21, 68],
        [85, 86,  4, 11, 27, 84, 57, 33,  4, 14,  1, 56, 65, 24, 38,  7,  5, 41,
         54,  4, 23, 24, 58,  3, 17,  2, 77, 63, 72, 19, 55, 37, 41, 54, 56, 18,
         24, 69, 11, 49,  7, 23, 24,  8, 28, 30, 76, 48, 50, 61, 39, 54, 44, 49,
          0, 31, 71,  6, 59, 24,  9, 13, 29, 59, 30, 27, 12, 49,  4, 43, 12, 53,
         26,  4, 56,  0,  0,  0,  0,  0,  0,  0,  0,  0]])
Labels: tensor([[2.],
        [1.]])
embedding input text: tensor([[22, 45, 69, 23, 24, 70, 73, 34, 25, 75, 21, 52, 78, 62, 64, 21, 10, 56,
         34, 25, 75, 21,  4, 42, 47, 27, 35, 32, 54, 16, 36,  7, 83, 24, 74, 80,
          7, 81,  4, 15, 51, 17, 24, 67, 81,  4, 15, 51, 46, 56, 79, 24, 32, 54,
         44,  8, 82, 66,  4, 24, 12, 49, 31, 71,  6, 59, 56, 65, 24, 38, 41, 54,
          4, 20, 24, 58, 40, 60,  4, 34, 25, 75, 21, 68],
        [85, 86,  4, 11, 27, 84, 57, 33,  4, 14,  1, 56, 65, 24, 38,  7,  5, 41,
         54,  4, 23, 24, 58,  3,