Data
30,000 texts, split into train/val/test at a 6:2:2 ratio.
Tools and methods
PyTorch, sklearn, and gensim's word2vec.
Each sentence is represented by its word2vec embeddings; after zero-padding to a fixed length, an LSTM + linear layer classifies the resulting sequence of word vectors.
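A toy sketch of that representation step, using a hypothetical two-sentence corpus and made-up parameters (the full pipeline on the real data is in the code section below):

import numpy as np
from gensim.models import Word2Vec

# hypothetical tiny corpus of pre-segmented sentences
sentences = [["I", "like", "tea"], ["tea", "please"]]
emb_size = 4
w2vec = Word2Vec(sentences=sentences, vector_size=emb_size, window=3, min_count=1).wv

# pad every sentence with zero vectors up to the longest length
max_len = max(len(s) for s in sentences)

def to_matrix(sen):
    v = np.vstack([w2vec[w] for w in sen])
    pad = np.zeros((max_len - len(sen), emb_size))
    return np.concatenate((v, pad))

batch = np.stack([to_matrix(s) for s in sentences])
print(batch.shape)  # (2, max_len, emb_size) -> fed to the LSTM as (batch, seq, feature)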
Code
import jieba
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from gensim.models import Word2Vec
from sklearn import metrics
from sklearn.model_selection import train_test_split

# read a file and segment each line into words with jieba
def get_split_sentences(file_path):
    res_sen = []
    with open(file_path) as f:
        for line in f:
            res_sen.append(jieba.lcut(line.strip()))
    return res_sen

label0_sentences = get_split_sentences('label0.csv')
label1_sentences = get_split_sentences('label1.csv')
label2_sentences = get_split_sentences('label2.csv')

all_sentences = []
all_sentences.extend(label0_sentences)
all_sentences.extend(label1_sentences)
all_sentences.extend(label2_sentences)

# train word2vec on the whole corpus
emb_size = 128
win = 3
model = Word2Vec(sentences=all_sentences, vector_size=emb_size, window=win, min_count=1)
w2vec = model.wv  # word -> vector lookup

# pad every sentence to the length of the longest one
max_len = max(len(sen) for sen in all_sentences)

# stack word vectors into a (max_len, emb_size) matrix per sentence, zero-padded at the end
def assemble_x(w2vec, sentences):
    sen_vs = []
    for sen in sentences:
        v = np.vstack([w2vec[w] for w in sen])
        v_len = v.shape[0]
        if v_len < max_len:
            v = np.concatenate((v, np.zeros((max_len - v_len, emb_size))))
        sen_vs.append(v)
    return np.array(sen_vs)

# ready the data for training: 60% train, 20% val, 20% test
x = assemble_x(w2vec, all_sentences)
y = np.array([0] * 13000 + [1] * 13000 + [2] * 4000)  # class counts: 13000 / 13000 / 4000
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.6, shuffle=True)
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, train_size=0.5, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mid_dim = 32   # LSTM hidden size
k_class = 3    # number of classes
n = 1000       # training iterations (full-batch)
lr = 0.1

class Net(nn.Module):
    def __init__(self, emb_size, mid_dim, k_class):
        super().__init__()
        self.lstm = nn.LSTM(emb_size, mid_dim, batch_first=True)
        self.lin = nn.Linear(mid_dim, k_class)

    def forward(self, x):
        # classify from the LSTM's last hidden state
        _, (hid, cell) = self.lstm(x)
        hid = hid.squeeze(0).relu_()
        out = self.lin(hid)
        # note: F.cross_entropy applies softmax internally, so returning
        # probabilities here means softmax is effectively applied twice in training
        return F.softmax(out, dim=-1)

simple_net = Net(emb_size, mid_dim, k_class).to(device)

x_train = torch.from_numpy(x_train).float().to(device)
y_train = torch.from_numpy(y_train).to(device)
x_val = torch.from_numpy(x_val).float().to(device)
y_val = torch.from_numpy(y_val).to(device)

optimizer = optim.Adam(simple_net.parameters(), lr=lr)
for i in range(n):
    optimizer.zero_grad()
    preds = simple_net(x_train)
    loss = F.cross_entropy(preds, y_train)
    loss_val = F.cross_entropy(simple_net(x_val), y_val)
    if i % 100 == 0:
        print(f'{i} loss train: {loss.item()}, loss val: {loss_val.item()}')
    loss.backward()
    optimizer.step()

def get_scores(preds, gt):
    print('ACC: %.4f' % metrics.accuracy_score(gt, preds))
    print('macro:')
    print('Recall: %.4f' % metrics.recall_score(gt, preds, average='macro'))
    print('F1-score: %.4f' % metrics.f1_score(gt, preds, average='macro'))
    print('Precision: %.4f' % metrics.precision_score(gt, preds, average='macro'))
    print('\nmicro:')
    print('Recall: %.4f' % metrics.recall_score(gt, preds, average='micro'))
    print('F1-score: %.4f' % metrics.f1_score(gt, preds, average='micro'))
    print('Precision: %.4f' % metrics.precision_score(gt, preds, average='micro'))

with torch.no_grad():
    simple_net.eval()
    x_test = torch.from_numpy(x_test).float().to(device)
    preds_test = simple_net(x_test).cpu().argmax(-1).numpy()
    get_scores(preds_test, y_test)
Results
ACC: 0.4303
macro:
Recall: 0.3333
F1-score: 0.2006
Precision: 0.1434
micro:
Recall: 0.4303
F1-score: 0.4303
Precision: 0.4303
Summary
The results are quite poor. The main reasons:
- Padding adds far too many zero vectors, so most of what the model sees is zeros (a sketch of one mitigation follows after this list);
- No tuning of the LSTM hyperparameters was done at all (laziness).
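A minimal sketch of one way to mitigate the padding issue, assuming the same padded (batch, max_len, emb_size) inputs as above: pass the true sentence lengths to the LSTM via torch.nn.utils.rnn.pack_padded_sequence so the recurrence skips the padded timesteps. PackedNet and the lengths tensor below are illustrative additions, not part of the original code.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence

# hypothetical variant of Net: the LSTM only processes the real tokens of each sentence
class PackedNet(nn.Module):
    def __init__(self, emb_size, mid_dim, k_class):
        super().__init__()
        self.lstm = nn.LSTM(emb_size, mid_dim, batch_first=True)
        self.lin = nn.Linear(mid_dim, k_class)

    def forward(self, x, lengths):
        # pack the padded batch so the trailing zero vectors are ignored by the recurrence
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (hid, _) = self.lstm(packed)
        # hid holds the hidden state at each sentence's true last timestep
        return self.lin(hid.squeeze(0))  # return logits; F.cross_entropy applies softmax itself

# usage sketch (lengths = pre-padding token count of each sentence in the batch):
# lengths = torch.tensor([len(sen) for sen in sentences_in_this_batch])
# logits = PackedNet(128, 32, 3)(x_batch, lengths)
# loss = F.cross_entropy(logits, y_batch)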