尝试使用LSTM做情感分析。gluon有非常详细的例子,可以直接参考gluon的官方教程。这里尝试使用PyTorch复现一个。数据用的是IMDB的数据:http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 import torchimport torch.nn as nnimport torch.nn.functional as Fimport torch.utils.data.dataloader as dataloaderimport torch.optim as optimimport torch.autograd as autogradimport torchtext.vocab as torchvocabfrom torch.autograd import Variableimport tqdmimport osimport timeimport reimport pandas as pdimport stringimport gensimimport timeimport randomimport snowballstemmerimport collectionsfrom collections import Counterfrom nltk.corpus import stopwordsfrom itertools import chainfrom sklearn.metrics import accuracy_score
def readIMDB(path, seg='train'):
    """Load one split of the IMDB review corpus as ``[text, label]`` pairs.

    Every file under ``<path>/<seg>/pos`` and then ``<path>/<seg>/neg`` is
    read as UTF-8 and has its newline characters removed.

    Args:
        path: Root directory of the extracted dataset.
        seg: Name of the split sub-directory (defaults to ``'train'``).

    Returns:
        A list of ``[review, label]`` lists where ``label`` is 1 for
        reviews found under ``pos`` and 0 for reviews found under ``neg``.
    """
    data = []
    # Positive reviews are collected first, then negative ones.
    for label_name, label_id in (('pos', 1), ('neg', 0)):
        label_dir = os.path.join(path, seg, label_name)
        for file_name in os.listdir(label_dir):
            with open(os.path.join(label_dir, file_name), 'r', encoding='utf8') as rf:
                data.append([rf.read().replace('\n', ''), label_id])
    return data


train_data = readIMDB('aclImdb')
test_data = readIMDB('aclImdb', 'test')
def tokenizer(text):
    """Split ``text`` on single space characters and lower-case each piece."""
    return list(map(str.lower, text.split(' ')))


# Tokenize both splits; the labels are not needed at this stage.
train_tokenized = [tokenizer(review) for review, _ in train_data]
test_tokenized = [tokenizer(review) for review, _ in test_data]

# The vocabulary is built from the training tokens only.
vocab = set(chain.from_iterable(train_tokenized))
vocab_size = len(vocab)
因为这个数据集非常小,所以如果我们用这个数据集做word embedding有可能过拟合,而且模型没有通用性,所以我们传入一个已经学好的word embedding。
# Load the pretrained word vectors (word2vec text format, UTF-8 encoded).
# The result must be bound to `wvmodel`: that is the name the code building
# the embedding weight matrix reads from.
wvmodel = gensim.models.KeyedVectors.load_word2vec_format(
    'test_word.txt', binary=False, encoding='utf-8')
然后一样要定义一个word to index的词典:
# Vocabulary words get indices 1..len(vocab); index 0 is reserved for the
# '<unk>' placeholder. The set is enumerated once so that both mappings are
# guaranteed to be built from the same ordering.
_indexed_vocab = list(enumerate(vocab, start=1))

word_to_idx = {word: idx for idx, word in _indexed_vocab}
word_to_idx['<unk>'] = 0

idx_to_word = {idx: word for idx, word in _indexed_vocab}
idx_to_word[0] = '<unk>'
def encode_samples(tokenized_samples, vocab):
    """Map every token of every sample to its integer index.

    Args:
        tokenized_samples: Iterable of token sequences.
        vocab: Accepted for interface compatibility but not used; the lookup
            goes through the module-level ``word_to_idx`` dictionary.

    Returns:
        A list with one list of indices per sample. Tokens that are missing
        from ``word_to_idx`` are encoded as 0.
    """
    return [[word_to_idx.get(token, 0) for token in sample]
            for sample in tokenized_samples]


def pad_samples(features, maxlen=500, PAD=0):
    """Truncate or right-pad every feature list to exactly ``maxlen`` items.

    The input lists are never modified; each returned row is a new list.

    Args:
        features: Iterable of index lists.
        maxlen: Target length of every returned row.
        PAD: Value appended to rows shorter than ``maxlen``. Note that the
            default 0 is the same value ``encode_samples`` emits for
            unknown tokens.

    Returns:
        A list of lists, each of length ``maxlen``.
    """
    padded_features = []
    for feature in features:
        # Slicing copies, so padding below cannot leak into the caller's data.
        row = list(feature[:maxlen])
        row.extend([PAD] * (maxlen - len(row)))
        padded_features.append(row)
    return padded_features
def _to_feature_tensor(tokenized):
    """Encode, pad and convert one tokenized split into a tensor."""
    return torch.tensor(pad_samples(encode_samples(tokenized, vocab)))


def _to_label_tensor(dataset):
    """Collect the label of every ``(review, label)`` pair into a tensor."""
    return torch.tensor([label for _, label in dataset])


train_features = _to_feature_tensor(train_tokenized)
train_labels = _to_label_tensor(train_data)
test_features = _to_feature_tensor(test_tokenized)
test_labels = _to_label_tensor(test_data)
class SentimentNet(nn.Module):
    """LSTM classifier over frozen pretrained word embeddings.

    The LSTM outputs at the first and the last time step are concatenated
    and passed through a single linear layer that produces ``labels`` scores.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 bidirectional, weight, labels, use_gpu, **kwargs):
        """Build the embedding, LSTM encoder and linear decoder.

        Args:
            vocab_size: Stored nowhere and not used by the layers; the
                embedding size comes from ``weight``.
            embed_size: Input size of the LSTM.
            num_hiddens: Hidden size of the LSTM.
            num_layers: Number of stacked LSTM layers.
            bidirectional: Whether the LSTM runs in both directions.
            weight: Pretrained embedding matrix.
            labels: Number of output scores.
            use_gpu: Stored on the instance; not read inside this class.
            **kwargs: Forwarded to ``nn.Module.__init__``.
        """
        super().__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.use_gpu = use_gpu
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding.from_pretrained(weight)
        self.embedding.weight.requires_grad = False

        self.encoder = nn.LSTM(
            input_size=embed_size,
            hidden_size=self.num_hiddens,
            num_layers=num_layers,
            bidirectional=self.bidirectional,
            dropout=0,
        )

        # Two time steps are concatenated, each num_hiddens wide per direction.
        directions = 2 if self.bidirectional else 1
        self.decoder = nn.Linear(num_hiddens * 2 * directions, labels)

    def forward(self, inputs):
        """Return one row of ``labels`` scores per input sequence.

        ``inputs`` is indexed as (batch, sequence) here: the embedded tensor
        is permuted so the sequence axis comes first before the LSTM call.
        """
        seq_first = self.embedding(inputs).permute(1, 0, 2)
        states, _ = self.encoder(seq_first)
        first_and_last = torch.cat((states[0], states[-1]), dim=1)
        return self.decoder(first_and_last)
那这里需要注意几个点,第一,LSTM可以不initialize hidden,如果不initialize的话,那么PyTorch会默认初始为0。
另外就是LSTM这里默认接收的数据格式是[seq_len, batch_size, embed_size]。而我们传进去的数据是[batch_size, seq_len]的样子,经过embedding之后的结果是[batch_size, seq_len, embed_size],所以我们这里要将第一个维度和第二个维度做个调换。LSTM这边output的维度顺序和inputs是一致的。如果这里我们不想做维度的调换,可以将LSTM的batch_first参数设置为True;但这样output的第一个维度就是batch,要拿到每个样本的初始状态和最后状态时,还是一样要去做一个第一、第二维度的调换。这里非常的绕,我在这里卡了好久(=@__ @=)
# Embedding matrix: one row per vocabulary index (row 0 is '<unk>').
# Rows for words that have no pretrained vector stay all-zero.
# NOTE(review): `embed_size` must already be defined when this runs and has
# to equal the width of the vectors returned by `wvmodel.get_vector`.
weight = torch.zeros(vocab_size + 1, embed_size)
for word in wvmodel.index2word:
    index = word_to_idx.get(word)
    if index is None:
        # Pretrained word that does not occur in our vocabulary: skip it.
        continue
    weight[index, :] = torch.from_numpy(wvmodel.get_vector(word))
# Hyper-parameters.
num_epochs = 5
embed_size = 100
num_hiddens = 100
num_layers = 2
bidirectional = True
batch_size = 64
labels = 2
lr = 0.8

# Use the first CUDA device when one is available, otherwise fall back to
# the CPU instead of failing when the model is moved to the device.
use_gpu = torch.cuda.is_available()
device = torch.device('cuda:0' if use_gpu else 'cpu')

net = SentimentNet(vocab_size=(vocab_size + 1), embed_size=embed_size,
                   num_hiddens=num_hiddens, num_layers=num_layers,
                   bidirectional=bidirectional, weight=weight,
                   labels=labels, use_gpu=use_gpu)
net.to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr)
TensorDataset = torch.utils.data.TensorDataset
DataLoader = torch.utils.data.DataLoader

# Pair every padded feature row with its label.
train_set = TensorDataset(train_features, train_labels)
test_set = TensorDataset(test_features, test_labels)

# Only the training batches are shuffled.
train_iter = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_iter = DataLoader(test_set, batch_size=batch_size, shuffle=False)
# Train for `num_epochs` epochs; after each epoch evaluate on the test set
# and print the per-batch averages of loss and accuracy.
for epoch in range(num_epochs):
    start = time.time()
    train_loss, test_losses = 0.0, 0.0
    train_acc, test_acc = 0.0, 0.0
    n, m = 0, 0  # number of training / test batches seen

    net.train()
    for feature, label in train_iter:
        n += 1
        net.zero_grad()
        feature = feature.to(device)
        label = label.to(device)
        score = net(feature)
        loss = loss_function(score, label)
        loss.backward()
        optimizer.step()
        train_acc += accuracy_score(
            torch.argmax(score.detach().cpu(), dim=1), label.cpu())
        # .item() yields a plain float, so the running total does not keep
        # every batch's autograd graph alive.
        train_loss += loss.item()

    net.eval()
    with torch.no_grad():
        for test_feature, test_label in test_iter:
            m += 1
            test_feature = test_feature.to(device)
            test_label = test_label.to(device)
            test_score = net(test_feature)
            test_loss = loss_function(test_score, test_label)
            test_acc += accuracy_score(
                torch.argmax(test_score.cpu(), dim=1), test_label.cpu())
            test_losses += test_loss.item()

    runtime = time.time() - start
    # max(..., 1) keeps the report printable when a loader yields no batch.
    n_div, m_div = max(n, 1), max(m, 1)
    print('epoch: %d, train loss: %.4f, train acc: %.2f, test loss: %.4f, test acc: %.2f, time: %.2f' %
          (epoch, train_loss / n_div, train_acc / n_div,
           test_losses / m_div, test_acc / m_div, runtime))
2018.08.16 更新一个textCNN的玩法。
class textCNN(nn.Module):
    """Text classifier with three single-channel convolutions (heights 3, 4, 5).

    Each convolution spans the full embedding width and is followed by a
    max-pool whose window height is ``seq_len - kernel_height + 1``. The
    three pooled values are concatenated and fed to a linear layer.
    """

    def __init__(self, vocab_size, embed_size, seq_len, labels, weight, **kwargs):
        """Build the frozen embedding, the conv/pool branches and the classifier.

        Args:
            vocab_size: Not used by the layers; the embedding size comes
                from ``weight``.
            embed_size: Width of every convolution kernel.
            seq_len: Sequence length the pooling windows are sized for.
            labels: Number of output scores.
            weight: Pretrained embedding matrix.
            **kwargs: Forwarded to ``nn.Module.__init__``.
        """
        super().__init__(**kwargs)
        self.labels = labels
        self.embedding = nn.Embedding.from_pretrained(weight)
        self.embedding.weight.requires_grad = False

        kernel_heights = (3, 4, 5)
        # Registers conv1..conv3 first and pool1..pool3 afterwards.
        for idx, height in enumerate(kernel_heights, start=1):
            setattr(self, 'conv%d' % idx, nn.Conv2d(1, 1, (height, embed_size)))
        for idx, height in enumerate(kernel_heights, start=1):
            setattr(self, 'pool%d' % idx, nn.MaxPool2d((seq_len - height + 1, 1)))

        self.linear = nn.Linear(3, labels)

    def forward(self, inputs):
        """Return a ``(-1, labels)`` tensor of scores for ``inputs``."""
        batch, length = inputs.shape[0], inputs.shape[1]
        # Insert a channel axis of size 1 in front of the (length, embedding) plane.
        embedded = self.embedding(inputs).view(batch, 1, length, -1)

        branches = (
            (self.conv1, self.pool1),
            (self.conv2, self.pool2),
            (self.conv3, self.pool3),
        )
        pooled = [pool(F.relu(conv(embedded))) for conv, pool in branches]

        features = torch.cat(pooled, -1).view(batch, 1, -1)
        return self.linear(features).view(-1, self.labels)
# Construction arguments for the textCNN model; seq_len is fixed at 500.
cnn_kwargs = dict(
    vocab_size=(vocab_size + 1),
    embed_size=embed_size,
    seq_len=500,
    labels=labels,
    weight=weight,
)
net = textCNN(**cnn_kwargs)