# https://hzwu.github.io/WuCode.html
import pandas as pd
from collections import Counter
import torch
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataset import random_split
from transformers import BertModel
from torch import nn
from torch.optim import Adam
from transformers import BertTokenizer
import numpy as np
# --- Load the raw CSV data and build the label <-> index mappings. ---
trn_file = './tasks/Kaggle-BBC-News-Classification/train.csv'
tst_file = './tasks/Kaggle-BBC-News-Classification/test.csv'
df_trn = pd.read_csv(trn_file)
df_tst = pd.read_csv(tst_file)
print(df_trn.shape, df_tst.shape)

# Map each category name to an integer id (and back), in first-seen order
# (Counter preserves insertion order of first occurrence).
count = Counter(df_trn.Category)
cls2idx = {cls: idx for idx, cls in enumerate(count)}
idx2cls = {idx: cls for idx, cls in enumerate(count)}
print(idx2cls, cls2idx)

# Replace the string labels with their integer ids for training.
df_trn['Category'] = df_trn.Category.map(cls2idx)
print(df_trn.head())

# Tokenizer that turns raw article text into BERT input encodings.
tokenizer = BertTokenizer.from_pretrained('./pt-models/bert-base-cased')
class BBCNewsDataset(Dataset):
    """Dataset of BERT-tokenized news articles with optional integer labels.

    All texts are tokenized eagerly at construction time (padded/truncated
    to 512 tokens, returned as PyTorch tensors). When labels are given,
    each item is an (encoding, label tensor) pair; otherwise each item is
    just the encoding.
    """

    def __init__(self, texts, labels=None):
        # NOTE: relies on the module-level `tokenizer` defined above.
        self.texts = [
            tokenizer(t, padding='max_length', max_length=512,
                      truncation=True, return_tensors='pt')
            for t in texts
        ]
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.texts[idx]
        if self.labels is None:
            return encoding
        return encoding, torch.tensor(self.labels[idx])
# Build datasets: the training set carries labels, the test set does not.
dataset_trn = BBCNewsDataset(df_trn['Text'].values, df_trn['Category'].values)
dataset_tst = BBCNewsDataset(df_tst['Text'].values)
print('before split:', len(dataset_trn), len(dataset_tst))

# Hold out 20% of the training data for validation; fixed seed keeps the
# split reproducible across runs.
size_trn = int(0.8 * len(dataset_trn))
size_val = len(dataset_trn) - size_trn
dataset_trn, dataset_val = random_split(
    dataset_trn, [size_trn, size_val],
    generator=torch.Generator().manual_seed(42))
print(' after split:', len(dataset_trn), len(dataset_val),
      '( sum =', len(dataset_trn) + len(dataset_val), ')', len(dataset_tst))
class BertClassifier(nn.Module):
    """BERT encoder + dropout + linear classification head.

    Fix: the original applied ReLU to the classification logits before
    handing them to nn.CrossEntropyLoss. CrossEntropyLoss expects raw,
    unbounded logits; clamping them to be non-negative discards
    information (and can make argmax ties meaningless), so the ReLU has
    been removed.

    Generalization: the number of output classes is a constructor
    parameter (default 5, matching the BBC News label set) instead of a
    hard-coded constant.
    """

    def __init__(self, dropout=0.5, num_classes=5):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('./pt-models/bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        # 768 = hidden size of bert-base models.
        self.linear = nn.Linear(768, num_classes)

    def forward(self, input_id, mask):
        """Return raw classification logits of shape (batch, num_classes)."""
        # pooled_output: (batch, 768) summary vector of the [CLS] token.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask,
                                     return_dict=False)
        return self.linear(self.dropout(pooled_output))
# Mini-batch loaders; only the training loader shuffles.
batch_size = 8
loader_trn = DataLoader(dataset_trn, batch_size=batch_size, shuffle=True)
loader_val = DataLoader(dataset_val, batch_size=batch_size)
loader_tst = DataLoader(dataset_tst, batch_size=batch_size)

# Run on GPU when one is available.
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device('cuda' if USE_CUDA else 'cpu')
print(DEVICE)

# Model, loss, and optimizer (small learning rate, usual for BERT fine-tuning).
model = BertClassifier().to(DEVICE)
criterion = nn.CrossEntropyLoss().to(DEVICE)
optimizer = Adam(model.parameters(), lr=1e-5)
def train(trn_data, val_data, nb_epoch):
    """Run one training epoch over `trn_data`, then evaluate on `val_data`.

    Uses the module-level `model`, `criterion`, `optimizer`, and `DEVICE`.

    Args:
        trn_data: DataLoader yielding (tokenizer encodings, labels) batches.
        val_data: validation DataLoader with the same item structure.
        nb_epoch: zero-based epoch index (used only for logging).

    Returns:
        Validation accuracy (float in [0, 1]) after this epoch.
    """
    model.train()
    total_steps = 0
    total_loss = 0
    total_txts = 0
    for batch in trn_data:
        model.zero_grad()
        texts, labels = batch
        labels = labels.to(DEVICE)
        # Encodings come batched as (batch, 1, 512); squeeze(1) drops only
        # the singleton dim. Fix: bare .squeeze() would also drop the batch
        # dim whenever the last batch happens to contain a single example,
        # breaking the model's expected 2-D input.
        idxs = texts['input_ids'].squeeze(1).to(DEVICE)
        mask = texts['attention_mask'].squeeze(1).to(DEVICE)
        pred = model(idxs, mask)
        loss = criterion(pred, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_txts += len(labels)
        total_steps += 1
        if total_steps % 20 == 0:
            print('Training: epoch = {}, step = {}, ave loss = {:.4f}'.format(
                1 + nb_epoch, total_steps, total_loss / total_txts))
    # --- Validation pass ---
    model.eval()
    total_corrects = 0
    total_examples = 0
    with torch.no_grad():  # fix: no gradient tracking needed for evaluation
        for batch in val_data:
            texts, labels = batch
            labels = labels.to(DEVICE)
            idxs = texts['input_ids'].squeeze(1).to(DEVICE)
            mask = texts['attention_mask'].squeeze(1).to(DEVICE)
            pred = model(idxs, mask)
            total_corrects += (pred.argmax(dim=1).view(labels.size()) == labels).sum().item()
            total_examples += len(labels)
    acc = total_corrects / total_examples
    print('Validation: acc = {:.4f}'.format(acc))
    return acc
def test(tst_data):
    """Predict class indices for every batch in `tst_data` (unlabeled).

    Uses the module-level `model` and `DEVICE`.

    Returns:
        A flat list of predicted label indices, in dataset order.
    """
    model.eval()
    pred_list = []
    with torch.no_grad():  # fix: inference only — skip gradient tracking
        for batch in tst_data:
            # squeeze(1): (batch, 1, 512) -> (batch, 512). Fix: bare
            # .squeeze() would collapse the batch dim on a final batch of
            # size 1 and crash the model's forward pass.
            idxs = batch['input_ids'].squeeze(1).to(DEVICE)
            mask = batch['attention_mask'].squeeze(1).to(DEVICE)
            pred = model(idxs, mask)
            pred_list.extend(pred.argmax(dim=1).tolist())
    return pred_list
# Train for up to 50 epochs, stopping early once validation accuracy
# exceeds 97.5%.
for nb_epoch in range(50):
    acc = train(loader_trn, loader_val, nb_epoch)
    if acc > 0.975:
        break

# Predict on the test set, map indices back to category names, and write
# the Kaggle submission file (ArticleId, Category).
pred_list = test(loader_tst)
df_tst['Category'] = [idx2cls[idx] for idx in pred_list]
df_tst.drop(columns='Text', inplace=True)
sub_file = './tasks/Kaggle-BBC-News-Classification/sample_submission_2.csv'
df_tst.to_csv(sub_file, index=False)
# Fix: corrected misspelling 'sucessfully' in the status message.
print('Submission file generated successfully ...')