乐趣区

关于人工智能:word2vec + LSTM 做句子分类(简单例子)

数据

共 3 万条文本,按 6:2:2 的比例划分为 train / val / test。

工具、手法

pytorch、sklearn、gensim 的 word2vec。
用 word2vec 词向量对句子进行表示,padding 到统一长度后,用 LSTM + linear 对句子的向量序列进行分类。

代码

import jieba
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import xgboost as xgb
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split


# reorganize data
def get_split_sentences(file_path, encoding="utf-8"):
    """Read a text file and tokenise every line with jieba.

    Each line is stripped of surrounding whitespace and passed to
    ``jieba.lcut``; one token list is produced per line, in file order.

    Args:
        file_path: Path of the text file to read (one sentence per line).
        encoding: Text encoding used to decode the file. Passed explicitly
            so that decoding does not depend on the platform's default
            locale encoding.

    Returns:
        A list with one list of tokens per line of the file.
    """
    with open(file_path, encoding=encoding) as f:
        return [jieba.lcut(line.strip()) for line in f]

# Tokenise the three per-class files.
label0_sentences = get_split_sentences('label0.csv')
label1_sentences = get_split_sentences('label1.csv')
label2_sentences = get_split_sentences('label2.csv')

# Full corpus in class order: label 0, then label 1, then label 2.
all_sentences = [*label0_sentences, *label1_sentences, *label2_sentences]

# Word2Vec settings: embedding width and context window.
emb_size = 128
win = 3

# min_count=1 keeps every token that occurs in the corpus in the vocabulary.
model = Word2Vec(
    sentences=all_sentences,
    vector_size=emb_size,
    window=win,
    min_count=1,
)

# Keep a handle on the trained word vectors for lookups by token.
w2vec = model.wv

# assemble sentence embeddings
def assemble_x(w2vec, sentences, max_len=None):
    """Stack per-word vectors into one zero-padded array of sentences.

    Args:
        w2vec: Object indexable by token (``w2vec[token]``) that returns the
            token's embedding vector, e.g. the ``model.wv`` of a trained
            Word2Vec model or a plain dict.
        sentences: Iterable of token lists.
        max_len: Number of time steps per sentence in the output. Shorter
            sentences are zero-padded at the end, longer ones are truncated.
            When ``None``, the length of the longest sentence is used.

    Returns:
        A float32 array of shape ``(len(sentences), max_len, dim)``.

    Raises:
        KeyError: If a token has no vector in ``w2vec``.
    """
    sentences = list(sentences)
    if max_len is None:
        max_len = max((len(sen) for sen in sentences), default=0)

    # Work out the embedding width: prefer the attribute exposed by the
    # vector store, otherwise look at the first available token's vector.
    dim = getattr(w2vec, "vector_size", None)
    if dim is None:
        first_word = next((w for sen in sentences for w in sen), None)
        if first_word is None:
            # No token anywhere to inspect; fall back to the module setting.
            dim = emb_size
        else:
            dim = np.asarray(w2vec[first_word]).shape[-1]

    # Pre-filled with zeros, so padding needs no further work and empty
    # sentences simply stay all-zero.
    out = np.zeros((len(sentences), max_len, dim), dtype=np.float32)
    for row, sen in zip(out, sentences):
        words = sen[:max_len]
        if words:
            row[:len(words)] = np.vstack([w2vec[w] for w in words])
    return out

# ready the data for training
# Feature tensor: one padded sequence of word vectors per sentence.
x = assemble_x(w2vec, all_sentences)

# Labels follow the order in which the corpus was concatenated (class 0,
# then 1, then 2). Counts come from the loaded data rather than being
# hard-coded, so x and y always have the same number of rows.
y = np.repeat(
    [0, 1, 2],
    [len(label0_sentences), len(label1_sentences), len(label2_sentences)],
)

# 60% train; the remaining 40% is halved into validation and test.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.6, shuffle=True
)
x_val, x_test, y_val, y_test = train_test_split(
    x_test, y_test, train_size=0.5, shuffle=True
)


# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Model size: LSTM hidden width and number of output classes.
mid_dim, k_class = 32, 3
# Optimisation: number of training iterations and learning rate.
n, lr = 1000, 0.1
class Net(nn.Module):
    """Single-layer LSTM followed by a linear classification head.

    Args:
        emb_size: Size of each input time-step vector.
        mid_dim: Hidden size of the LSTM.
        k_class: Number of output classes.
    """

    def __init__(self, emb_size, mid_dim, k_class):
        super().__init__()
        self.lstm = nn.LSTM(emb_size, mid_dim, batch_first=True)
        self.lin = nn.Linear(mid_dim, k_class)

    def forward(self, x):
        """Return unnormalised class scores (logits) for a batch.

        Args:
            x: Float tensor of shape ``(batch, seq_len, emb_size)``.

        Returns:
            Tensor of shape ``(batch, k_class)`` holding raw logits.
            ``F.cross_entropy`` applies log-softmax itself, so feeding it
            probabilities would normalise twice and flatten the gradients.
            ``argmax`` over logits gives the same class as over softmax
            output; apply ``F.softmax`` at the call site when probabilities
            are needed.
        """
        _, (hid, _cell) = self.lstm(x)
        # hid is (num_layers, batch, mid_dim); take the last layer's state.
        last_hidden = F.relu(hid[-1])
        return self.lin(last_hidden)

simple_net = Net(emb_size, mid_dim, k_class).to(device)

# Move the train/validation splits to the target device. Labels are cast
# to int64 because F.cross_entropy requires long class indices and numpy's
# default integer width is platform dependent.
x_train = torch.from_numpy(x_train).float().to(device)
y_train = torch.from_numpy(y_train).long().to(device)
x_val = torch.from_numpy(x_val).float().to(device)
y_val = torch.from_numpy(y_val).long().to(device)

optimizer = optim.Adam(simple_net.parameters(), lr=lr)

# Full-batch training: every iteration uses the whole training split.
for i in range(n):
    optimizer.zero_grad()
    preds = simple_net(x_train)
    loss = F.cross_entropy(preds, y_train)

    if i % 100 == 0:
        # The validation loss is only reported, never back-propagated, so
        # compute it just when logging and without building a graph.
        with torch.no_grad():
            loss_val = F.cross_entropy(simple_net(x_val), y_val)
        print(f'{i} loss train: {loss.item()}, loss val: {loss_val.item()}')

    loss.backward()
    optimizer.step()

def get_scores(preds, gt):
    """Print accuracy and macro/micro recall, F1 and precision.

    Args:
        preds: Predicted class indices, one per sample.
        gt: Ground-truth class indices, same length as ``preds``.
    """
    from sklearn import metrics

    print ('ACC: %.4f' % metrics.accuracy_score(gt,preds))
    print('macro')
    # Recall is scored against the ``gt`` argument, like every other metric
    # here, rather than against a module-level variable.
    print('Recall: %.4f' % metrics.recall_score(gt,preds,average='macro'))
    print('F1-score: %.4f' %metrics.f1_score(gt,preds,average='macro'))
    print('Precision: %.4f' %metrics.precision_score(gt,preds,average='macro'))

    print('\nmicro:')
    print('Recall: %.4f' % metrics.recall_score(gt,preds,average='micro'))
    print('F1-score: %.4f' %metrics.f1_score(gt,preds,average='micro'))
    print('Precision: %.4f' %metrics.precision_score(gt,preds,average='micro'))


# Inference on the held-out test split; no gradients are needed.
with torch.no_grad():
    x_test = torch.from_numpy(x_test).float().to(device)
    y_test = torch.from_numpy(y_test)
    simple_net.eval()
    # Predicted class = index of the highest score for each sample.
    preds_test = simple_net(x_test).cpu().argmax(-1)

get_scores(preds_test, y_test)

结果

ACC: 0.4303

macro:
Recall: 0.3333
F1-score: 0.2006
Precision: 0.1434

micro:
Recall: 0.4303
F1-score: 0.4303
Precision: 0.4303

小结

效果十分差,原因主要有:

  • padding 的 0 向量过多,导致模型得到的输入大部分都是 0 向量;
  • 并未对 lstm 做任何参数调整(懒)。
退出移动版