Word2Vec Implementation in PyTorch (Part 2)


Required imports:

```python
import collections
import math
import random
import sys
import time
import os

import torch
import torch.nn as nn
import torch.utils.data as Data
```

1. Read and preprocess the dataset

```python
assert 'ptb.train.txt' in os.listdir("../data/ptb")
with open('../data/ptb/ptb.train.txt', 'r') as f:
    lines = f.readlines()
raw_dataset = [sentence.split() for sentence in lines]
```

1.1 Build the vocabulary index

```python
# Count token frequencies; the nested comprehension is equivalent to two
# nested for-loops over sentences and tokens.
counter = collections.Counter([token for sentence in raw_dataset for token in sentence])
# Keep only tokens that appear at least 5 times
counter = dict(filter(lambda x: x[1] >= 5, counter.items()))

# Map each token to an integer index
idx_to_token = [token for token, _ in counter.items()]
# print(idx_to_token)
# ['pierre', '<unk>', 'N', 'years', 'old', 'will', 'join', 'the', 'board', 'as',
#  'a', 'nonexecutive', 'director', 'nov.', ...]
token_to_idx = {token: idx for idx, token in enumerate(idx_to_token)}
# print(token_to_idx)
# {'pierre': 0, '<unk>': 1, 'N': 2, 'years': 3, 'old': 4, 'will': 5, 'join': 6,
#  'the': 7, 'board': 8, 'as': 9, ...}

dataset = [[token_to_idx[token] for token in sentence if token in token_to_idx]
           for sentence in raw_dataset]
num_tokens = sum([len(sentence) for sentence in dataset])
# print(num_tokens)  # before subsampling: 887100
```

1.2 Subsampling

```python
# Discard a word with probability 1 - sqrt(1e-4 / f(w)), where f(w) is the
# word's relative frequency counter[w] / num_tokens, so frequent words are
# dropped more aggressively.
def discard(idx):
    return random.uniform(0, 1) < 1 - math.sqrt(
        1e-4 / counter[idx_to_token[idx]] * num_tokens)

subsampled_dataset = [[token for token in sentence if not discard(token)]
                      for sentence in dataset]
num_tokens_2 = sum([len(sentence) for sentence in subsampled_dataset])
print('after subsampling:', num_tokens_2)  # after subsampling: 375930

# Compare how often a word appears in the dataset before and after subsampling
def compare_counts(token):
    return '# %s: before=%d, after=%d' % (token, sum(
        [st.count(token_to_idx[token]) for st in dataset]), sum(
        [st.count(token_to_idx[token]) for st in subsampled_dataset]))

# print(compare_counts('the'))   # the: before=50770, after=2089
# print(compare_counts('join'))  # join: before=45, after=45
```
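To see what the subsampling rule in `discard` does in practice, here is a small sketch of my own (assuming the `counter` and `num_tokens` built above): it computes the keep probability min(1, sqrt(1e-4 / f(w))), which is why a very frequent word such as 'the' survives only a few percent of the time while a rare word like 'join' is essentially always kept.

```python
import math

# Sketch only: probability that a token survives subsampling, assuming the
# `counter` and `num_tokens` variables from the preprocessing step above.
def keep_probability(token, counter, num_tokens, t=1e-4):
    freq = counter[token] / num_tokens      # relative frequency f(w)
    return min(1.0, math.sqrt(t / freq))    # complement of the discard probability

# Example usage (exact values depend on the corpus counts):
# keep_probability('the', counter, num_tokens)   # a few percent
# keep_probability('join', counter, num_tokens)  # close to 1.0
```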
1.3 Extract center words and context words

```python
def get_centers_and_contexts(dataset, max_window_size):
    centers, contexts = [], []
    for st in dataset:
        if len(st) < 2:  # a sentence needs at least 2 words to form a center-context pair
            continue
        centers += st
        for center_i in range(len(st)):
            window_size = random.randint(1, max_window_size)
            indices = list(range(max(0, center_i - window_size),
                                 min(len(st), center_i + 1 + window_size)))
            indices.remove(center_i)  # exclude the center word from its context words
            contexts.append([st[idx] for idx in indices])
    return centers, contexts

# tiny_dataset = [list(range(7)), list(range(7, 10))]
# print('dataset', tiny_dataset)
# for center, context in zip(*get_centers_and_contexts(tiny_dataset, 2)):
#     print('center', center, 'has contexts', context)
#
# dataset [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]
# center 0 has contexts [1]
# center 1 has contexts [0, 2]
# center 2 has contexts [0, 1, 3, 4]
# center 3 has contexts [1, 2, 4, 5]
# center 4 has contexts [3, 5]
# center 5 has contexts [3, 4, 6]
# center 6 has contexts [4, 5]

# Use a maximum context window size of 5 and extract all center words and
# their context words from the dataset
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)
```

2. Negative sampling

```python
def get_negatives(all_contexts, sampling_weights, K):
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        negatives = []
        while len(negatives) < len(contexts) * K:
            if i == len(neg_candidates):
                # Draw k word indices at random as noise-word candidates,
                # weighted by sampling_weights; choosing a fairly large k
                # amortizes the cost of random.choices.
                i, neg_candidates = 0, random.choices(
                    population, sampling_weights, k=int(1e5))
            neg, i = neg_candidates[i], i + 1
            # A noise word must not be one of the context words
            if neg not in set(contexts):
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives

# Sampling weights proportional to word frequency raised to the power 0.75
sampling_weights = [counter[w]**0.75 for w in idx_to_token]
all_negatives = get_negatives(all_contexts, sampling_weights, 5)
```

3. Load the data

```python
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, centers, contexts, negatives):
        assert len(centers) == len(contexts) == len(negatives)
        self.centers = centers
        self.contexts = contexts
        self.negatives = negatives

    def __getitem__(self, index):
        return (self.centers[index], self.contexts[index],
                self.negatives[index])

    def __len__(self):
        return len(self.centers)

# Mini-batch collate function
def batchify(data):
    '''
    Used as the collate_fn argument of DataLoader: the input is a list of
    length batch_size whose elements are results of MyDataset.__getitem__.
    '''
    max_len = max(len(c) + len(n) for _, c, n in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, context, negative in data:
        cur_len = len(context) + len(negative)
        centers += [center]
        contexts_negatives += [context + negative + [0] * (max_len - cur_len)]
        masks += [[1] * cur_len + [0] * (max_len - cur_len)]
        labels += [[1] * len(context) + [0] * (max_len - len(context))]
    return (torch.tensor(centers).view(-1, 1), torch.tensor(contexts_negatives),
            torch.tensor(masks), torch.tensor(labels))

# Use batchify as the DataLoader's collate_fn, then print the shapes of the
# variables in the first batch
batch_size = 512
num_workers = 0 if sys.platform.startswith('win32') else 4
dataset = MyDataset(all_centers, all_contexts, all_negatives)
data_iter = Data.DataLoader(dataset, batch_size, shuffle=True,
                            collate_fn=batchify, num_workers=num_workers)
for batch in data_iter:
    for name, data in zip(['centers', 'contexts_negatives', 'masks', 'labels'],
                          batch):
        print(name, 'shape:', data.shape)
    break
# centers shape: torch.Size([512, 1])
# contexts_negatives shape: torch.Size([512, 60])
# masks shape: torch.Size([512, 60])
# labels shape: torch.Size([512, 60])
```

4. The skip-gram model

```python
# The embedding weight has num_embeddings rows (the vocabulary size) and
# embedding_dim columns (the dimension of each word vector)
embed = nn.Embedding(num_embeddings=20, embedding_dim=4)  # weight: [20, 4]
# print(embed.weight)
# x = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.long)  # [2, 3]
# print(embed(x))  # shape [2, 3, 4]

# Forward pass of the skip-gram model
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    v = embed_v(center)                  # [batch, 1, embed_size]
    u = embed_u(contexts_and_negatives)  # [batch, max_len, embed_size]
    # permute(0, 2, 1) swaps the last two dimensions so that bmm produces
    # inner products of shape [batch, 1, max_len]
    pred = torch.bmm(v, u.permute(0, 2, 1))
    return pred
```

5. Train the model

5.1 Define the loss function (binary cross-entropy)

```python
class SigmoidBinaryCrossEntropyLoss(nn.Module):
    def __init__(self):
        super(SigmoidBinaryCrossEntropyLoss, self).__init__()

    def forward(self, inputs, targets, mask=None):
        '''
        :param inputs: Tensor of shape (batch_size, len)
        :param targets: Tensor of the same shape as inputs
        '''
        inputs, targets, mask = inputs.float(), targets.float(), mask.float()
        res = nn.functional.binary_cross_entropy_with_logits(
            input=inputs, target=targets, reduction='none', weight=mask)
        return res.mean(dim=1)

loss = SigmoidBinaryCrossEntropyLoss()

# pred = torch.tensor([[1.5, 0.3, -1, 2], [1.1, -0.6, 2.2, 0.4]])
# # In the label tensor, 1 marks a context word and 0 marks a noise word
# label = torch.tensor([[1, 0, 0, 0], [1, 1, 0, 0]])
# mask = torch.tensor([[1, 1, 1, 1], [1, 1, 1, 0]])  # masks out padding
# res = loss(pred, label, mask) * mask.shape[1] / mask.float().sum(dim=1)
# print(res)  # tensor([0.8740, 1.2100])
```
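As a sanity check on the masked loss, the snippet below is my own addition: it recomputes the two values from the commented example by hand, using the definition of binary cross-entropy on logits and skipping masked positions, and should reproduce roughly 0.8740 and 1.2100.

```python
import math

def manual_sigmoid(x):
    return 1 / (1 + math.exp(-x))

# Recompute the masked binary cross-entropy by hand for the example above.
pred = [[1.5, 0.3, -1, 2], [1.1, -0.6, 2.2, 0.4]]
label = [[1, 0, 0, 0], [1, 1, 0, 0]]
mask = [[1, 1, 1, 1], [1, 1, 1, 0]]

for p_row, l_row, m_row in zip(pred, label, mask):
    total, valid = 0.0, 0
    for p, l, m in zip(p_row, l_row, m_row):
        if m == 0:
            continue  # padded position: excluded from the loss
        # -[l * log(sigmoid(p)) + (1 - l) * log(1 - sigmoid(p))]
        total += -(l * math.log(manual_sigmoid(p)) +
                   (1 - l) * math.log(1 - manual_sigmoid(p)))
        valid += 1
    print(round(total / valid, 4))  # ~0.8740 for the first row, ~1.2100 for the second
```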
5.2 Initialize the model parameters

```python
embed_size = 100
net = nn.Sequential(
    nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size),
    nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size)
)
```

5.3 Define the training function

```python
def train(net, lr, num_epochs):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("train on", device)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    for epoch in range(num_epochs):
        start, l_sum, n = time.time(), 0.0, 0
        for batch in data_iter:
            center, context_negative, mask, label = [d.to(device) for d in batch]
            pred = skip_gram(center, context_negative, net[0], net[1])
            # Use the mask so that padded positions do not affect the loss
            l = (loss(pred.view(label.shape), label, mask) *
                 mask.shape[1] / mask.float().sum(dim=1)).mean()  # mean loss of the batch
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            l_sum += l.cpu().item()
            n += 1
        print('epoch %d, loss %.2f, time %.2fs'
              % (epoch + 1, l_sum / n, time.time() - start))

# Train the model
train(net, 0.01, 10)
```

6. Apply the word embedding model

```python
def get_similar_tokens(query_token, k, embed):
    W = embed.weight.data
    x = W[token_to_idx[query_token]]
    # 1e-9 is added for numerical stability
    cos = torch.matmul(W, x) / (torch.sum(W * W, dim=1) * torch.sum(x * x) + 1e-9).sqrt()
    _, topk = torch.topk(cos, k=k + 1)
    topk = topk.cpu().numpy()
    for i in topk[1:]:  # skip the query word itself
        print('cosine sim=%.3f: %s' % (cos[i], (idx_to_token[i])))

get_similar_tokens('chip', 3, net[0])
```
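After training, the input embedding `net[0]` holds the word vectors that are usually reused downstream. As a closing sketch of my own (the file name `word2vec_ptb.pt` is hypothetical, not from the original post), the vectors can be saved together with the vocabulary and reloaded later without rebuilding the model:

```python
import torch

# Save the trained input-embedding matrix together with the vocabulary.
# 'word2vec_ptb.pt' is a hypothetical file name chosen for this sketch.
torch.save({'idx_to_token': idx_to_token,
            'embedding': net[0].weight.data.cpu()}, 'word2vec_ptb.pt')

# Later: reload and look up a single word vector by token.
checkpoint = torch.load('word2vec_ptb.pt')
vocab = {tok: i for i, tok in enumerate(checkpoint['idx_to_token'])}
chip_vector = checkpoint['embedding'][vocab['chip']]
print(chip_vector.shape)  # torch.Size([100])
```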