# NOTE: the following header is file-viewer metadata captured in the export;
# kept as a comment so the file parses as Python.
# 580 lines, 18 KiB, Python
# %%
|
|
import torch
|
|
import torch.nn as nn
|
|
import torch.nn.functional as F
|
|
import pandas as pd
|
|
from torch.utils.data import Dataset, DataLoader
|
|
from collections import Counter
|
|
import os
|
|
import sys
|
|
sys.path.append(os.path.join(os.getcwd(), '../'))
|
|
from helper import default_labelling
|
|
from sklearn.metrics import f1_score
|
|
import numpy as np
|
|
|
|
|
|
# %%
# Map the string labels produced by the labelling helpers to integer class ids.
label_map = {
    'Label.FAKE': 0,
    'Label.REAL': 1,
}
|
|
|
|
# %% [markdown]
|
|
"""
|
|
# Pipelining process
|
|
"""
|
|
|
|
# %%
# Load the training split, keeping only the columns this pipeline uses.
df = pd.read_parquet("../../data/training/995,000_rows.parquet", columns=['tokens', 'type'])

# Derive the integer class label from the raw `type` column
# (Label.FAKE -> 0, Label.REAL -> 1), then drop the raw column.
df['label'] = (
    df['type']
    .apply(default_labelling)
    .astype(str)
    .map(label_map)
    .astype(int)
)
df = df.drop(columns=['type'])
|
|
|
|
# %%
# Load the test split and apply the same label derivation as for training.
df_test = pd.read_parquet("../../data/testing/995,000_rows.parquet", columns=['tokens', 'type'])

df_test['label'] = (
    df_test['type']
    .apply(default_labelling)
    .astype(str)
    .map(label_map)
    .astype(int)
)
df_test = df_test.drop(columns=['type'])
|
|
|
|
# %%
# Load the validation split and apply the same label derivation as for training.
df_val = pd.read_parquet("../../data/validation/995,000_rows.parquet", columns=['tokens', 'type'])

df_val['label'] = (
    df_val['type']
    .apply(default_labelling)
    .astype(str)
    .map(label_map)
    .astype(int)
)
df_val = df_val.drop(columns=['type'])
|
|
|
|
# %%
|
|
# print("Loading Parquet file...")
|
|
|
|
# # Check the total number of rows (articles)
|
|
# print(f"Total rows in the raw Parquet file: {len(df)}")
|
|
|
|
# # Look at the first few rows to make sure the data looks correct
|
|
# print("\n--- First 3 Rows ---")
|
|
# print(df.head(3))
|
|
|
|
# %%
# Count token frequencies over the whole training corpus.
word_counts = Counter()
for token_list in df['tokens']:
    word_counts.update(token_list)

# Build the vocabulary from the 50,000 most frequent words.
# Index 0 is reserved for <PAD> (padding), index 1 for <UNK> (unknown words).
vocab = {"<PAD>": 0, "<UNK>": 1}
vocab.update(
    (word, idx)
    for idx, (word, _count) in enumerate(word_counts.most_common(50000), start=2)
)

print(f"Vocabulary built with {len(vocab)} words.")
|
|
|
|
# %%
# Custom PyTorch Dataset: wraps the dataframe so a DataLoader can fetch
# (token-id tensor, label tensor) pairs one article at a time.
class FakeNewsDataset(Dataset):
    def __init__(self, dataframe, vocab, max_length=256):
        """Store the dataframe, the word->id vocabulary and the fixed sequence length."""
        self.dataframe = dataframe
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        """Number of articles; PyTorch calls this to know when to stop fetching."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return one article as a fixed-length id tensor plus its label tensor."""
        row = self.dataframe.iloc[idx]
        tokens = row['tokens']
        label = row['label']

        # Map each token to its vocabulary id; unknown words fall back to <UNK> (id 1).
        article_ids = [self.vocab.get(word, 1) for word in tokens]

        # Force the sequence to exactly max_length: truncate long articles,
        # right-pad short ones with <PAD> (id 0).
        article_ids = article_ids[:self.max_length]
        article_ids += [0] * (self.max_length - len(article_ids))

        return torch.tensor(article_ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)
|
|
|
|
|
|
# %%
## Prepare the DataLoaders
# Wrap each dataframe in the Dataset class, then hand it to a DataLoader,
# which feeds the model in batches (64 articles at a time) so the machine
# never has to hold everything in RAM at once.

my_train_dataset = FakeNewsDataset(dataframe=df, vocab=vocab, max_length=256)

# shuffle=True for training: reshuffling each epoch keeps the model from
# memorising the presentation order of the data.
train_dataloader = DataLoader(
    my_train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4,       # start with 4; if the CPU stays cool, try 6
    pin_memory=True,     # speeds up host->GPU data transfer
    prefetch_factor=2,
)

# Validation and test loaders: evaluation order does not matter, so no shuffle.
val_data = FakeNewsDataset(dataframe=df_val, vocab=vocab, max_length=256)
val_dataloader = DataLoader(val_data, batch_size=64, shuffle=False)

test_data = FakeNewsDataset(dataframe=df_test, vocab=vocab, max_length=256)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=False)
|
|
|
|
# %% [markdown]
|
|
"""
|
|
Checking if the data conversion works
|
|
"""
|
|
|
|
# %%
|
|
# features, labels = next(iter(train_dataloader))
|
|
# # 2. Check the shapes (the dimensions of your tensors)
|
|
# print("--- Tensor Shapes ---")
|
|
# print(f"Features shape: {features.shape}")
|
|
# print(f"Labels shape: {labels.shape}")
|
|
|
|
# # 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)
|
|
# print("\n--- Data Types ---")
|
|
# print(f"Features dtype: {features.dtype}")
|
|
# print(f"Labels dtype: {labels.dtype}")
|
|
|
|
# # 4. Peek at the actual data for the very first article in this batch
|
|
# print("\n--- First Article Peek ---")
|
|
# print(f"Label: {labels[0].item()} (0 = Real, 1 = Fake)")
|
|
# print(f"Tokens (first 20 IDs): {features[0][:20]}")
|
|
|
|
# %%
class BaseModel(nn.Module):
    """Bag-of-embeddings baseline: average the word vectors, then a small MLP."""

    def __init__(self, vocab_size, embed_dim=32, h1=256, h2=128, out_features=2):
        super().__init__()

        # Embedding layer: turns word ids into dense numerical vectors.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

        # Fully-connected layers that classify the averaged article vector.
        self.fc1 = nn.Linear(embed_dim, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.out = nn.Linear(h2, out_features)

    def forward(self, x):
        # x: (batch_size, sequence_length) integer token ids, e.g. (64, 256).
        embedded = self.embedding(x)

        # Collapse the sequence dimension: one mean vector per article.
        pooled = embedded.mean(dim=1)

        # Two ReLU hidden layers, then raw class scores (logits) for Fake/Real.
        hidden = F.relu(self.fc2(F.relu(self.fc1(pooled))))
        return self.out(hidden)
|
|
# Instantiate the baseline model with the full training vocabulary size.
model_basic = BaseModel(vocab_size=len(vocab))
|
|
|
|
# %% [markdown]
|
|
"""
|
|
'Advanced'
|
|
"""
|
|
|
|
# %%
class advanced_model(nn.Module):
    """Bidirectional GRU classifier: embedding -> stacked BiGRU -> linear head."""

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128, num_layer=2, out_features=2):
        super().__init__()

        # 1. The Embedding Layer (same as the baseline model).
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

        # 2. The stacked bidirectional GRU.
        # batch_first=True is required because the DataLoader yields
        # (batch_size, sequence_length) batches.
        # FIX: num_layers was hard-coded to 2, silently ignoring the
        # `num_layer` constructor argument.
        # Inter-layer dropout only applies when there is more than one layer,
        # so disable it for num_layer == 1 (avoids a PyTorch warning).
        self.gru = nn.GRU(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layer,
            batch_first=True,
            bidirectional=True,
            dropout=0.3 if num_layer > 1 else 0.0,
        )

        # 3. The final output layer: maps the concatenated forward+backward
        # final hidden states (2 * hidden_dim) to the Real/Fake scores.
        # (An unused single-direction `self.out` linear layer was removed;
        # only `self.fc` was ever used in forward.)
        self.fc = nn.Linear(hidden_dim * 2, out_features)

    def forward(self, x):
        # x: (batch_size, sequence_length) integer token ids, e.g. (64, 256).

        # Word embeddings: -> (batch, seq_len, embed_dim).
        x = self.embedding(x)

        # The GRU returns (per-step outputs, final hidden states); we only
        # need the final hidden states, shaped (num_layers * 2, batch, hidden_dim).
        _, hidden = self.gru(x)

        # hidden[-2] is the top layer's final forward state,
        # hidden[-1] is the top layer's final backward state.
        out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

        return self.fc(out)
|
|
|
|
# Initialize the advanced model with the full training vocabulary size.
model_adv = advanced_model(vocab_size=len(vocab))
|
|
|
|
# %% [markdown]
|
|
"""
|
|
# Training
|
|
|
|
"""
|
|
|
|
# %%
# Use the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
|
# %%
def evaluate_performance(model, dataloader, device):
    """Run `model` over `dataloader` and return (accuracy in %, macro F1)."""
    model.eval()  # evaluation mode: disables dropout etc.

    all_predictions = []
    all_true_labels = []

    # Gradients are not needed for evaluation — saves memory and time.
    with torch.no_grad():
        for features, labels in dataloader:
            features = features.to(device)
            labels = labels.to(device)

            # Raw class scores -> predicted class index (0 or 1).
            scores = model(features)
            predictions = scores.argmax(dim=1)

            all_predictions.extend(predictions.cpu().tolist())
            all_true_labels.extend(labels.cpu().tolist())

    preds = np.array(all_predictions)
    truth = np.array(all_true_labels)

    accuracy = (preds == truth).mean() * 100

    # Macro-averaged F1 weights both classes equally, regardless of support.
    f1 = f1_score(truth, preds, average='macro')

    model.train()  # restore training mode for any subsequent training
    return accuracy, f1
|
|
|
|
|
|
# %%
def train_model(model, train_loader, val_loader, device, epochs=5, lr=0.001):
    """Train `model` with Adam + cross-entropy loss, validating after each epoch.

    Returns a history dict with per-epoch train loss, validation accuracy
    and validation F1 for later plotting.
    """
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Per-epoch metrics collected for the report.
    history = {'train_loss': [], 'val_acc': [], 'val_f1': []}

    print(f"Training {model.__class__.__name__} on {device}...")

    for epoch in range(epochs):
        model.train()
        running_loss = 0

        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)

            # Standard step: reset grads, forward, loss, backward, update.
            optimizer.zero_grad()
            loss = criterion(model(features), labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_loss = running_loss / len(train_loader)

        # Validation pass at the end of every epoch.
        val_acc, val_f1 = evaluate_performance(model, val_loader, device)

        history['train_loss'].append(avg_loss)
        history['val_acc'].append(val_acc)
        history['val_f1'].append(val_f1)

        print(f"\n Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f} \n Val Acc: {val_acc:.2f}% \n Val F1: {val_f1:.4f}")

    return history
|
|
|
|
# %%
# Train the baseline model for 7 epochs and keep its metric history.
train_995_basic = train_model(model_basic, train_dataloader, val_dataloader, device, epochs=7)
print(train_995_basic)
|
|
|
|
# %%
# Train the advanced (GRU) model for 7 epochs and keep its metric history.
train_995_adv = train_model(model_adv, train_dataloader, val_dataloader, device, epochs=7)
print(train_995_adv)
|
|
|
|
# %%
|
|
|
|
|
|
# %%
|
|
|
|
|
|
# %% [markdown]
|
|
"""
|
|
# Evaluation
|
|
"""
|
|
|
|
# %% [markdown]
|
|
"""
|
|
Basic model
|
|
"""
|
|
|
|
# %%
|
|
|
|
# # 1. The Evaluation Function
|
|
# def evaluate_performance(model, dataloader, device):
|
|
# model.eval() # Put model in evaluation mode
|
|
|
|
# all_predictions = []
|
|
# all_true_labels = []
|
|
|
|
# # Turn off gradient tracking to save memory
|
|
# with torch.no_grad():
|
|
# for features, labels in dataloader:
|
|
# features = features.to(device)
|
|
# labels = labels.to(device)
|
|
|
|
# # Get model scores
|
|
# scores = model(features)
|
|
|
|
# # Find the predicted class (0 or 1)
|
|
# _, predictions = torch.max(scores,1)
|
|
|
|
# # Save predictions and actual labels to lists
|
|
# # all_predictions.extend(predictions.cpu().tolist())
|
|
# # all_true_labels.extend(labels.cpu().tolist())
|
|
# all_predictions.extend(predictions.cpu().numpy().flatten().tolist())
|
|
# all_true_labels.extend(labels.cpu().numpy().flatten().tolist())
|
|
|
|
# all_predictions = np.array(all_predictions)
|
|
# all_true_labels = np.array(all_true_labels)
|
|
|
|
# accuracy = (all_predictions == all_true_labels).mean() * 100
|
|
|
|
# # 4. Calculate F1 Score
|
|
# # average='macro' is best for your report to show you care about both classes equally
|
|
# f1 = f1_score(all_true_labels, all_predictions, average='macro')
|
|
# model.train() # Return model to training mode just in case
|
|
# return accuracy, f1
|
|
# # # Change me based on the model
|
|
|
|
# # model = model_basic.to(device)
|
|
|
|
|
|
# # print(f"Training on: {device}")
|
|
|
|
# # # 2. Setup Loss and Optimizer
|
|
# # # CrossEntropyLoss is the standard for classification tasks
|
|
# # criterion = nn.CrossEntropyLoss()
|
|
# # # Adam is a very reliable, fast optimizer
|
|
# # optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
|
|
|
|
# # # 3. The Training Loop
|
|
# # epochs = 7# Start with a small number of passes through the whole dataset
|
|
|
|
# # for epoch in range(epochs):
|
|
# # model.train() # Tell the model it is in training mode
|
|
# # total_loss = 0
|
|
|
|
# # # Loop through our batches of 64 articles
|
|
# # for batch_idx, (features, labels) in enumerate(train_dataloader):
|
|
|
|
# # # Move data to the same device as the model (GPU/CPU)
|
|
# # features = features.to(device)
|
|
# # labels = labels.to(device)
|
|
|
|
# # # Step A: Reset the optimizer's gradients
|
|
# # optimizer.zero_grad()
|
|
|
|
# # # Step B: Forward Pass (Have the model guess Real or Fake)
|
|
# # predictions = model(features)
|
|
|
|
# # # Step C: Calculate Loss (How wrong were the guesses?)
|
|
# # loss = criterion(predictions, labels)
|
|
|
|
# # # Step D: Backward Pass (Calculate how to fix the math)
|
|
# # loss.backward()
|
|
|
|
# # # Step E: Optimize (Actually apply the fixes to the model's weights)
|
|
# # optimizer.step()
|
|
|
|
# # total_loss += loss.item()
|
|
|
|
# # # Print an update every 100 batches so we know it's working
|
|
# # if batch_idx % 100 == 0:
|
|
# # print(f"Epoch [{epoch+1}/{epochs}] | Batch {batch_idx} | Loss: {loss.item():.4f}")
|
|
|
|
# # # Print the average loss at the end of each epoch
|
|
# # avg_loss = total_loss / len(train_dataloader)
|
|
# # print(f"--- End of Epoch {epoch+1} | Average Loss: {avg_loss:.4f} ---")
|
|
|
|
|
|
# %% [markdown]
|
|
"""
|
|
Advanced model
|
|
|
|
"""
|
|
|
|
# %%
|
|
|
|
# # 1. The Evaluation Function
|
|
# def evaluate_performance(model_adv, dataloader, device):
|
|
# model_adv.eval() # Put model in evaluation mode
|
|
|
|
# all_predictions = []
|
|
# all_true_labels = []
|
|
|
|
# # Turn off gradient tracking to save memory
|
|
# with torch.no_grad():
|
|
# for features, labels in dataloader:
|
|
# features = features.to(device)
|
|
# labels = labels.to(device)
|
|
|
|
# # Get model scores
|
|
# scores = model_adv(features)
|
|
|
|
# # Find the predicted class (0 or 1)
|
|
# _, predictions = scores.max(1)
|
|
|
|
# # Save predictions and actual labels to lists
|
|
# all_predictions.extend(predictions.cpu().tolist())
|
|
# all_true_labels.extend(labels.cpu().tolist())
|
|
|
|
# # Calculate Accuracy
|
|
# correct_guesses = sum(p == t for p, t in zip(all_predictions, all_true_labels))
|
|
# accuracy = (correct_guesses / len(all_true_labels)) * 100
|
|
|
|
# # Calculate F1 Score
|
|
# f1 = f1_score(all_true_labels, all_predictions, average='macro')
|
|
|
|
# model_adv.train() # Return model to training mode just in case
|
|
# return accuracy, f1
|
|
|
|
|
|
|
|
# %%
# Re-select the compute device (harmless duplicate of the earlier assignment).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
|
|
|
# %%
# Evaluate the basic model on the validation and test splits.
# FIX: this cell referenced an undefined module-level name `model`;
# the baseline model is bound to `model_basic`.
print("Basic model ")
print(" Validation ")
val_acc995, val_f1_995 = evaluate_performance(model_basic, val_dataloader, device)
print(f"Validation Accuracy: {val_acc995:.2f}%")
print(f"Validation F1 Score: {val_f1_995:.4f}")

print("\n Testing Phase ")
test_acc995, test_f1_995 = evaluate_performance(model_basic, test_dataloader, device)
print(f"Test Accuracy: {test_acc995:.2f}%")
# FIX: stray "git" removed from the output message.
print(f"Test F1 Score: {test_f1_995:.4f}")
|
|
|
|
# %%
# Evaluate the advanced (GRU) model on the validation and test splits.
print(" GRU model ")
print(" Validation ")
adv_val_acc995, adv_val_f1_995 = evaluate_performance(model_adv, val_dataloader, device)
print(f"Validation Accuracy: {adv_val_acc995:.2f}%")
# FIX: this previously printed `val_f1_995` — the BASIC model's F1 —
# instead of the advanced model's own validation F1.
print(f"Validation F1 Score: {adv_val_f1_995:.4f}")

print("\n Testing ")
test_acc, test_f1 = evaluate_performance(model_adv, test_dataloader, device)
# FIX: referenced undefined `test_acc955` (typo for `test_acc`).
print(f"Test Accuracy: {test_acc:.2f}%")
# FIX: stray "git" removed from the output message.
print(f"Test F1 Score: {test_f1:.4f}")
|
|
|
|
# %% [markdown]
|
|
"""
|
|
# Liar data
|
|
|
|
|
|
"""
|
|
|
|
# %%
from helper import LIAR_labelling

# Load the LIAR benchmark split (tokens + raw type only).
# FIX: removed a stray, no-op f-string path literal that evaluated to
# nothing ("../../data/training/LIAR.parquet" was never assigned or used).
df_LIAR = pd.read_parquet("../../data/testing/LIAR.parquet", columns=['tokens', 'type'])

# Map LIAR's fine-grained truthfulness ratings to the binary labels used by
# the models, then drop the raw column.
df_LIAR['label'] = df_LIAR['type'].apply(LIAR_labelling).astype(str)
df_LIAR['label'] = df_LIAR['label'].map(label_map).astype(int)
df_LIAR = df_LIAR.drop(columns=['type'])
|
|
|
|
# %%
# Quick visual sanity check of the processed LIAR frame.
df_LIAR.head()
|
|
|
|
# %%
# FIX: the vocabulary must NOT be rebuilt from the LIAR evaluation data.
# The models were trained with embeddings indexed by the *training*
# vocabulary; rebuilding `vocab` here remapped every word id, making the
# learned embedding rows meaningless at evaluation time (and leaking
# evaluation data into preprocessing). We keep the training vocabulary as-is
# and let out-of-vocabulary LIAR words fall back to <UNK> (id 1).
print(f"Reusing training vocabulary with {len(vocab)} words for LIAR evaluation.")
|
|
|
|
# %%
# Wrap the LIAR frame in the same Dataset/DataLoader pipeline as the other splits.
LR_DATA = FakeNewsDataset(dataframe=df_LIAR, vocab=vocab, max_length=256)
LR_dataloader = DataLoader(LR_DATA, batch_size=32, shuffle=False)
|
|
|
|
# %%
# Sanity-check one batch from the LIAR dataloader.
features, labels = next(iter(LR_dataloader))

# 2. Check the shapes (the dimensions of the tensors).
print("--- Tensor Shapes ---")
print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")

# 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers).
print("\n--- Data Types ---")
print(f"Features dtype: {features.dtype}")
print(f"Labels dtype: {labels.dtype}")

# 4. Peek at the actual data for the very first article in this batch.
print("\n--- First Article Peek ---")
# FIX: the legend was inverted — label_map defines Label.FAKE -> 0 and
# Label.REAL -> 1, so 0 is Fake and 1 is Real.
print(f"Label: {labels[0].item()} (0 = Fake, 1 = Real)")
print(f"Tokens (first 20 IDs): {features[0][:20]}")
|
|
|
|
# %%
|
|
# # 1. Check a single sample from the Dataset directly
|
|
# single_features, single_label = LR_DATA[0]
|
|
# print(f"Single Sample - Features: {single_features.shape}, Label: {single_label.shape}")
|
|
|
|
# # 2. Check the DataLoader batch
|
|
# batch_features, batch_labels = next(iter(LR_dataloader))
|
|
# # print(f"Batch - Features: {batch_features.shape}, Labels: {batch_labels.shape}")
|
|
|
|
# %%
# Evaluate the advanced (GRU) model on the out-of-domain LIAR data.
# FIX: removed a duplicate evaluate_performance call whose result was
# discarded — it ran the full evaluation twice for nothing.
print("\n--- 2. Testing Advanced model ---")
test_acc, test_f1 = evaluate_performance(model_adv, LR_dataloader, device)
print(f"Test Accuracy: {test_acc:.2f}%")
# FIX: stray "git" removed from the output message.
print(f"Test F1 Score: {test_f1:.4f}")
|
|
|
|
# %%
# Evaluate the baseline model on the out-of-domain LIAR data.
# FIX: referenced an undefined module-level name `model`;
# the baseline model is bound to `model_basic`.
print("\n--- 2. Testing BASE-Model ---")
test_acc, test_f1 = evaluate_performance(model_basic, LR_dataloader, device)
print(f"Test Accuracy: {test_acc:.2f}%")
# FIX: stray "git" removed from the output message.
print(f"Test F1 Score: {test_f1:.4f}")
|
|
|
|
# %%
|
|
|
|
|