PyTorch可以将文本数据集打标,然后进行训练。本文分享一段简单文本模型的训练跟预测方法。
一、 创建样本创建样本要用到Pandas工具,确保你电脑上安装了Python环境,没有安装的请查看我前面的博客。
安装pandas方法:
pip install pandas
打开PyCharm终端输入命令:pip install pandas
等待安装完成。
然后编写创建csv文件样本集代码,新建generate_data.py文件,内容为:
# Section 1: build a small labelled CSV sample set for the training demo.
import pandas as pd

# Example data: five short English sentences with a binary label
# (1.0 = positive / valid, 0.0 = negative).
data = {
    'text': [
        'This is a good movie.',
        'I did not like the book.',
        'The weather is nice.',
        'Terrible service.',
        'MyName is AI Teacher.'
    ],
    'label': [1.0, 0.0, 1.0, 0.0, 1.0]
}

# Build the DataFrame and persist it for the training script to consume.
df = pd.DataFrame(data)
df.to_csv('text_data.csv', index=False)
根据以上代码,在PyCharm工具右键点击运行,就会在当前目录生成text_data.csv文件啦:
在PyCharm打开text_data.csv文件可查看数据集:
这样就完成了样本创建。
二、 安装PyTorch打开PyCharm终端安装PyTorch:
pip install torch torchvision
安装过程:
完成PyTorch安装
三、 安装sklearn打开PyCharm终端安装sklearn:
pip install scikit-learn
使用镜像加速的话:
pip install scikit-learn -i https://mirrors.aliyun.com/pypi/simple
安装过程:
四、 导入PyTorch相关库新建train_new1.py文件,导入库:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# Section 5: dataset wrapper feeding (feature-vector, label) pairs to DataLoader.
class TextDataset(Dataset):
    """Wrap pre-vectorized texts and integer labels as a PyTorch Dataset."""

    def __init__(self, texts, labels):
        # texts: 2-D array-like of token counts (one row per sample);
        # labels: encoded integer class ids, same length as texts.
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Features are stored as long tensors to mirror the original code;
        # the training loop casts them to float before the forward pass.
        text = torch.tensor(self.texts[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return text, label
# Section 6: simple two-layer MLP text classifier.
class TextClassifier(nn.Module):
    """Feed-forward classifier: Linear -> ReLU -> Linear producing class logits."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TextClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Returns raw logits; nn.CrossEntropyLoss applies softmax internally.
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out
七、 加载数据并训练7.1加载数据
加载第一部分提到的文本样本集:
# Load the sample CSV produced in section 1.
data = pd.read_csv('text_data.csv')
texts = data['text'].values    # raw sentence strings
labels = data['label'].values  # numeric labels (1.0 / 0.0)
进行分词和向量化:
# Tokenize and vectorize the texts with bag-of-words counts.
vectorizer = CountVectorizer(max_features=5000)  # feature count is tunable
X = vectorizer.fit_transform(texts).toarray()
进行数据预处理:
# Encode labels to contiguous integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
# Split into train / validation sets (80/20, fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
代码如图:
7.2创建数据加载器
# Build Dataset instances for each split.
train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_val, y_val)
# Wrap them in DataLoaders; shuffle only the training set.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
7.3设置训练参数
input_dim = X_train.shape[1]  # input dimension = vocabulary size from CountVectorizer
hidden_dim = 128
output_dim = len(label_encoder.classes_)  # output dimension = number of classes
model = TextClassifier(input_dim, hidden_dim, output_dim)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
# Train on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
7.4编写训练函数并进行训练
def train(model, dataloader, optimizer, criterion, device):
    """Run one training epoch over dataloader; return the mean batch loss."""
    model.train()
    total_loss = 0
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        # Count vectors come out of the Dataset as long tensors;
        # the linear layer needs floats.
        inputs = inputs.float()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)
# Train for a fixed number of epochs, reporting mean loss per epoch.
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}')
八、 数据预测8.1预测函数及预测
编写预测函数:
def predict(model, dataloader, device):
    """Return predicted class indices for every batch in dataloader."""
    model.eval()
    predictions = []
    with torch.no_grad():  # inference only: skip autograd bookkeeping
        for inputs, _ in dataloader:
            inputs = inputs.to(device)
            inputs = inputs.float()  # match training-time dtype
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)  # argmax over class logits
            predictions.extend(preds.tolist())
    return predictions
使用测试集预测结果:
# Predict on the validation set and show the first few class ids.
predictions = predict(model, val_loader, device)
print(predictions[:5])
8.2完整数据训练预测代码
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import pandas as pd
class TextDataset(Dataset):
    """Wrap pre-vectorized texts and integer labels as a PyTorch Dataset."""

    def __init__(self, texts, labels):
        # texts: 2-D array-like of token counts; labels: integer class ids.
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Features stay long here; the training loop casts them to float.
        text = torch.tensor(self.texts[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return text, label
class TextClassifier(nn.Module):
    """Feed-forward classifier: Linear -> ReLU -> Linear producing class logits."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TextClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Returns raw logits; nn.CrossEntropyLoss applies softmax internally.
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out
# Load the sample CSV produced by generate_data.py.
data = pd.read_csv('text_data.csv')
texts = data['text'].values    # raw sentence strings
labels = data['label'].values  # numeric labels (1.0 / 0.0)
# Tokenize and vectorize with bag-of-words counts.
vectorizer = CountVectorizer(max_features=5000)  # feature count is tunable
X = vectorizer.fit_transform(texts).toarray()
# Encode labels to contiguous integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
# Split into train / validation sets (80/20, fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# Build Dataset instances for each split.
train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_val, y_val)
# Wrap them in DataLoaders; shuffle only the training set.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
input_dim = X_train.shape[1]  # input dimension = vocabulary size
hidden_dim = 128
output_dim = len(label_encoder.classes_)  # output dimension = number of classes
model = TextClassifier(input_dim, hidden_dim, output_dim)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
# Train on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
def train(model, dataloader, optimizer, criterion, device):
    """Run one training epoch over dataloader; return the mean batch loss."""
    model.train()
    total_loss = 0
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        # Count vectors come out of the Dataset as long tensors;
        # the linear layer needs floats.
        inputs = inputs.float()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)
# Train for a fixed number of epochs, reporting mean loss per epoch.
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}')
def predict(model, dataloader, device):
    """Return predicted class indices for every batch in dataloader."""
    model.eval()
    predictions = []
    with torch.no_grad():  # inference only: skip autograd bookkeeping
        for inputs, _ in dataloader:
            inputs = inputs.to(device)
            inputs = inputs.float()  # match training-time dtype
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)  # argmax over class logits
            predictions.extend(preds.tolist())
    return predictions
# Predict on the validation set and show the first few class ids.
predictions = predict(model, val_loader, device)
print(predictions[:5])
8.3运行结果
右键运行train_new1.py文件,展开简易训练和得出结果:
共完成十轮训练,由于数据样本较少,训练速度较快。实际生产环境数据集一次完整训练可能需要花费一周,最好采用NVIDIA RTX 4080 Super GPU深度学习服务器训练。
欢迎关注博主,获取更多AI技术学习文章。
发布于:湖北省。声明:该文观点仅代表作者本人,搜狐号系信息发布平台,搜狐仅提供信息存储空间服务。