This article walks through three examples of text classification implemented with the PyTorch framework.
Case 1: Sentiment Analysis
In this example, we use PyTorch with a BERT model for sentiment analysis, on the IMDb movie review dataset, which contains 50,000 movie reviews labeled as positive or negative.
First, we need to install the required Python libraries and download the dataset with the following commands:
```bash
!pip install torch transformers pandas scikit-learn
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz
```
Next, we define a preprocessing function that reads the extracted aclImdb reviews and prepares them as input for the BERT model:
```python
import os
from transformers import BertTokenizer

def read_imdb_split(split_dir):
    # The aclImdb layout keeps one review per .txt file under <split>/pos and <split>/neg
    texts, labels = [], []
    for label_name, label in (('pos', 1), ('neg', 0)):
        folder = os.path.join(split_dir, label_name)
        for filename in os.listdir(folder):
            with open(os.path.join(folder, filename), encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label)
    return texts, labels

def preprocess_data():
    # Use the train/test split that ships with the dataset
    train_texts, train_labels = read_imdb_split('aclImdb/train')
    test_texts, test_labels = read_imdb_split('aclImdb/test')
    # Load the BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    # Encode the training set
    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    # Encode the test set
    test_encodings = tokenizer(test_texts, truncation=True, padding=True)
    return train_encodings, train_labels, test_encodings, test_labels
```
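To see what the encoding step actually produces, here is a tiny standalone sketch with an invented review. The `input_ids` are WordPiece token ids (101 is [CLS], 102 is [SEP]) and `attention_mask` flags real tokens versus padding:

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# A made-up review, just to inspect the tokenizer output
sample = tokenizer(["This movie was surprisingly good!"], truncation=True, padding=True)
print(sample['input_ids'])       # e.g. [[101, 2023, 3185, ...]]
print(sample['attention_mask'])  # 1 for real tokens, 0 for padding
```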
Then we define a PyTorch model class that combines BERT with a linear layer for sentiment classification:
```python
import torch.nn as nn
from transformers import BertModel

class SentimentClassifier(nn.Module):
    def __init__(self):
        super(SentimentClassifier, self).__init__()
        # Load the pretrained BERT encoder
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Add a linear classification layer (768 is BERT's hidden size)
        self.linear = nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        # Pass the input through BERT
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Take the pooled [CLS] representation
        pooled_output = outputs.pooler_output
        # Pass it through the linear layer
        logits = self.linear(pooled_output)
        return logits
```

Finally, we can train and test the model with the following code:

```python
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Preprocess the data
train_encodings, train_labels, test_encodings, test_labels = preprocess_data()

# Convert the data to PyTorch tensors
train_input_ids = torch.tensor(train_encodings['input_ids'])
train_attention_mask = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(train_labels)
test_input_ids = torch.tensor(test_encodings['input_ids'])
test_attention_mask = torch.tensor(test_encodings['attention_mask'])
test_labels = torch.tensor(test_labels)

# Build datasets and data loaders
train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Define the model and optimizer (torch.optim.AdamW; the AdamW in transformers is deprecated)
model = SentimentClassifier()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Train the model
model.train()
for epoch in range(3):
    for input_ids, attention_mask, labels in train_loader:
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask).squeeze(-1)
        loss = nn.BCEWithLogitsLoss()(logits, labels.float())
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
with torch.no_grad():
    correct, total = 0, 0
    for input_ids, attention_mask, labels in test_loader:
        logits = model(input_ids, attention_mask).squeeze(-1)
        predictions = (torch.sigmoid(logits) > 0.5).long()
        total += labels.size(0)
        correct += (predictions == labels.long()).sum().item()
    accuracy = correct / total
    print(f"Accuracy: {accuracy:.2f}")
```
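With training done, scoring a new review only takes a tokenize-and-forward pass. Below is a minimal inference sketch, assuming the trained `model` from above is still in scope; the review text is invented:

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

model.eval()
with torch.no_grad():
    # Encode a single made-up review and return PyTorch tensors directly
    enc = tokenizer(["An instant classic, I loved every minute."],
                    truncation=True, padding=True, return_tensors='pt')
    logit = model(enc['input_ids'], enc['attention_mask']).squeeze(-1)
    prob = torch.sigmoid(logit).item()
    print(f"Positive probability: {prob:.2f}")
```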
Case 2: Spam Classification
In this example, we use PyTorch with a convolutional neural network (CNN) for spam classification, using the public SpamAssassin corpus, which contains 4,827 spam emails and 6,593 ham (non-spam) emails.
First, we need to install the required Python libraries and download the dataset (both a spam archive and a ham archive, since the preprocessing below needs examples of each class):
```bash
!pip install torch pandas scikit-learn
!wget https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2
!wget https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2
!tar -xf 20030228_spam.tar.bz2
!tar -xf 20030228_easy_ham.tar.bz2
```
Next, we define a preprocessing function that reads the extracted emails and prepares them as input for the CNN model:
```python
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

def read_emails(folder):
    # Read every message in the folder, skipping the corpus's 'cmds' index file
    emails = []
    for filename in os.listdir(folder):
        if filename == 'cmds':
            continue
        with open(os.path.join(folder, filename), encoding='latin-1') as f:
            emails.append(f.read())
    return emails

def preprocess_data():
    # Read the extracted corpus directories
    spam_emails = read_emails('spam')
    ham_emails = read_emails('easy_ham')
    # Assemble a single labeled DataFrame (1 = spam, 0 = ham)
    spam_df = pd.DataFrame({'text': spam_emails, 'label': 1})
    ham_df = pd.DataFrame({'text': ham_emails, 'label': 0})
    df = pd.concat([spam_df, ham_df], ignore_index=True)
    # Split into training and test sets
    train_data, test_data = train_test_split(df, test_size=0.2)
    # Turn the text into numeric features with a bag-of-words model
    vectorizer = CountVectorizer(stop_words='english', max_features=1000)
    vectorizer.fit(train_data['text'])
    train_features = vectorizer.transform(train_data['text']).toarray()
    train_labels = list(train_data['label'])
    test_features = vectorizer.transform(test_data['text']).toarray()
    test_labels = list(test_data['label'])
    return train_features, train_labels, test_features, test_labels
```
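The bag-of-words step is easy to inspect in isolation. This toy snippet, with invented emails, shows how CountVectorizer maps text to fixed-length count vectors, which is exactly the representation the CNN below consumes:

```python
from sklearn.feature_extraction.text import CountVectorizer

# Three made-up emails, just to illustrate the feature extraction
toy_emails = ["win a free prize now", "meeting notes attached", "free free prize"]
vec = CountVectorizer(stop_words='english', max_features=1000)
features = vec.fit_transform(toy_emails).toarray()
print(vec.get_feature_names_out())  # the vocabulary learned from the toy corpus
print(features)                     # one row of word counts per email
```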
Then we define a PyTorch model class that uses a CNN for spam classification:
```python
import torch
import torch.nn as nn

class SpamClassifier(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SpamClassifier, self).__init__()
        # Convolutional layers; the input arrives as a single channel of length input_dim
        self.conv1 = nn.Conv1d(1, 128, kernel_size=5, padding=2)
        self.conv2 = nn.Conv1d(128, 64, kernel_size=5, padding=2)
        # Pooling layers: each one divides the length by 4, hence input_dim // 16 overall
        self.pool1 = nn.MaxPool1d(kernel_size=4)
        self.pool2 = nn.MaxPool1d(kernel_size=4)
        # Linear layers
        self.linear1 = nn.Linear(64 * (input_dim // 16), 64)
        self.linear2 = nn.Linear(64, output_dim)

    def forward(self, x):
        # Pass the input through the convolutional and pooling layers
        x = torch.relu(self.conv1(x))
        x = self.pool1(x)
        x = torch.relu(self.conv2(x))
        x = self.pool2(x)
        # Flatten the output and pass it through the linear layers
        x = x.view(x.size(0), -1)
        x = torch.relu(self.linear1(x))
        logits = self.linear2(x)
        return logits
```
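Since the size of the first linear layer depends on how the two pooling layers shrink the sequence, a quick forward pass with a dummy batch is a cheap sanity check before training. The 1000 here simply mirrors CountVectorizer's max_features:

```python
import torch

# A dummy batch of 2 samples, 1 channel, 1000 features
m = SpamClassifier(input_dim=1000, output_dim=1)
dummy = torch.randn(2, 1, 1000)
print(m(dummy).shape)  # expected: torch.Size([2, 1])
```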
Finally, we can train and test the model with the following code:
```python
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Preprocess the data
train_features, train_labels, test_features, test_labels = preprocess_data()

# Convert the data to PyTorch tensors
train_features = torch.tensor(train_features).float()
train_labels = torch.tensor(train_labels)
test_features = torch.tensor(test_features).float()
test_labels = torch.tensor(test_labels)

# Build datasets and data loaders; unsqueeze(1) adds the channel dimension the CNN expects
train_dataset = TensorDataset(train_features.unsqueeze(1), train_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataset = TensorDataset(test_features.unsqueeze(1), test_labels)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Define the model and optimizer
model = SpamClassifier(input_dim=train_features.shape[1], output_dim=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model
model.train()
for epoch in range(10):
    for features, labels in train_loader:
        optimizer.zero_grad()
        logits = model(features)
        loss = nn.BCEWithLogitsLoss()(logits.squeeze(-1), labels.float())
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
with torch.no_grad():
    correct, total = 0, 0
    for features, labels in test_loader:
        # squeeze(-1) keeps predictions and labels the same shape for the comparison
        logits = model(features).squeeze(-1)
        predictions = (torch.sigmoid(logits) > 0.5).long()
        total += labels.size(0)
        correct += (predictions == labels).sum().item()
    accuracy = correct / total
    print(f"Accuracy: {accuracy:.2f}")
```
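Accuracy alone can be misleading when the spam and ham classes are imbalanced. As a short sketch going one step beyond the loop above (assuming `model` and `test_loader` are still in scope), scikit-learn can report precision and recall as well:

```python
from sklearn.metrics import precision_score, recall_score

all_preds, all_true = [], []
with torch.no_grad():
    for features, labels in test_loader:
        preds = (torch.sigmoid(model(features).squeeze(-1)) > 0.5).long()
        all_preds.extend(preds.tolist())
        all_true.extend(labels.tolist())
print(f"Precision: {precision_score(all_true, all_preds):.2f}")
print(f"Recall:    {recall_score(all_true, all_preds):.2f}")
```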
Case 3: Multi-Label Classification
In this example, we use PyTorch with a BERT model for multi-label classification, on the Reuters-21578 news corpus, whose 21,578 news documents can each be tagged with several topic categories.
First, we need to install the required Python libraries and download the dataset:
```bash
!pip install torch transformers pandas scikit-learn beautifulsoup4
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/reuters21578-mld/reuters21578.tar.gz
!tar -xf reuters21578.tar.gz
```
Next, we define a preprocessing function that reads the corpus and prepares it as input for the BERT model. The corpus ships as SGML files, which are not valid XML, so we parse them leniently with BeautifulSoup:
```python
import glob
import torch
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

def preprocess_data():
    # The 10 topic categories we keep
    topics = ['acq', 'earn', 'money-fx', 'grain', 'crude', 'trade',
              'interest', 'ship', 'wheat', 'corn']
    documents = []
    # Parse every reut2-*.sgm file in the extracted corpus
    for path in glob.glob('reut2-*.sgm'):
        with open(path, encoding='latin-1') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        for reuters in soup.find_all('reuters'):
            # Each <TOPICS> element lists its topics as <D> children
            doc_topics = [d.get_text() for d in reuters.topics.find_all('d')
                          if d.get_text() in topics]
            body = reuters.find('text')
            text = body.get_text() if body is not None else ''
            if text and doc_topics:
                documents.append({'text': text, 'topics': doc_topics})
    df = pd.DataFrame(documents)
    # Split into training and test sets
    train_data, test_data = train_test_split(df, test_size=0.2)
    # Load the BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    # Encode the training set; each label becomes a multi-hot vector over the 10 topics
    train_encodings = tokenizer(list(train_data['text']), truncation=True, padding=True)
    train_labels = torch.tensor([[t in doc_topics for t in topics]
                                 for doc_topics in train_data['topics']])
    # Encode the test set
    test_encodings = tokenizer(list(test_data['text']), truncation=True, padding=True)
    test_labels = torch.tensor([[t in doc_topics for t in topics]
                                for doc_topics in test_data['topics']])
    return train_encodings, train_labels, test_encodings, test_labels
```
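The list comprehensions above turn each document's topic list into a 10-dimensional multi-hot vector, one slot per entry in `topics`. A standalone miniature with a made-up document shows the mapping:

```python
import torch

topics = ['acq', 'earn', 'money-fx', 'grain', 'crude', 'trade',
          'interest', 'ship', 'wheat', 'corn']
doc_topics = ['earn', 'grain']  # an invented document tagged with two topics
label = torch.tensor([t in doc_topics for t in topics]).float()
print(label)  # tensor([0., 1., 0., 1., 0., 0., 0., 0., 0., 0.])
```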
Then we define a PyTorch model class that adds a multi-label classification head on top of BERT:
```python
import torch.nn as nn
from transformers import BertModel

class MultiLabelClassifier(nn.Module):
    def __init__(self):
        super(MultiLabelClassifier, self).__init__()
        # Load the pretrained BERT encoder
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Add a multi-label classification head: one logit per topic
        self.classifier = nn.Linear(self.bert.config.hidden_size, 10)

    def forward(self, input_ids, attention_mask):
        # Pass the input through BERT
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        # Pass the pooled output through the classification head
        logits = self.classifier(pooled_output)
        return logits
```
Finally, we can train and test the model with the following code:
```python
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Preprocess the data
train_encodings, train_labels, test_encodings, test_labels = preprocess_data()

# Build datasets and data loaders
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                              torch.tensor(train_encodings['attention_mask']),
                              train_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataset = TensorDataset(torch.tensor(test_encodings['input_ids']),
                             torch.tensor(test_encodings['attention_mask']),
                             test_labels)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Define the model and optimizer
model = MultiLabelClassifier()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Train the model; BCEWithLogitsLoss treats each topic as an independent binary decision
model.train()
for epoch in range(3):
    for input_ids, attention_mask, labels in train_loader:
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = nn.BCEWithLogitsLoss()(logits, labels.float())
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
with torch.no_grad():
    all_labels, all_predictions = [], []
    for input_ids, attention_mask, labels in test_loader:
        logits = model(input_ids, attention_mask)
        predictions = (torch.sigmoid(logits) > 0.5).long()
        all_labels.append(labels)
        all_predictions.append(predictions)
    all_labels = torch.cat(all_labels, dim=0)
    all_predictions = torch.cat(all_predictions, dim=0)
    # Note: this is per-label accuracy, averaged over all (document, topic) pairs
    accuracy = (all_predictions == all_labels.long()).float().mean()
    print(f"Accuracy: {accuracy:.2f}")
```
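To turn raw predictions back into topic names for a single new document, threshold the sigmoid outputs and index into the topic list. A minimal sketch, assuming the trained `model` from above is in scope; the headline is invented:

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
topics = ['acq', 'earn', 'money-fx', 'grain', 'crude', 'trade',
          'interest', 'ship', 'wheat', 'corn']

model.eval()
with torch.no_grad():
    enc = tokenizer(["U.S. corn and wheat exports rose sharply this quarter."],
                    truncation=True, padding=True, return_tensors='pt')
    probs = torch.sigmoid(model(enc['input_ids'], enc['attention_mask']))[0]
    predicted = [t for t, p in zip(topics, probs) if p > 0.5]
    print(predicted)  # every topic whose probability clears the 0.5 threshold
```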
Summary
These three examples show how PyTorch can be combined with different types of neural networks to solve different machine learning problems. PyTorch provides a rich set of tools and functions that make training and evaluating these models straightforward.