This repository has been archived on 2026-03-27. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
fake-news-backup/src/models/nn.ju.py
2026-03-27 13:35:43 +01:00

580 lines
18 KiB
Python

# %%
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '../'))
from helper import default_labelling
from sklearn.metrics import f1_score
import numpy as np
# %%
# Integer class IDs for the stringified label values: FAKE -> 0, REAL -> 1.
label_map = {name: idx for idx, name in enumerate(('Label.FAKE', 'Label.REAL'))}
# %% [markdown]
"""
# Pipelining process
"""
# %%
def _load_split(path):
    """Load one data split and attach integer class labels.

    Reads only the 'tokens' and 'type' columns, maps the raw 'type' value
    through `default_labelling` and then `label_map` to an int 'label'
    column, and drops the raw 'type' column.
    """
    split = pd.read_parquet(path, columns=['tokens', 'type'])
    split['label'] = split['type'].apply(default_labelling).astype(str)
    split['label'] = split['label'].map(label_map).astype(int)
    return split.drop(columns=['type'])

# The identical load/label/drop sequence was previously copy-pasted three
# times; it is now shared through one helper.
df = _load_split("../../data/training/995,000_rows.parquet")
df_test = _load_split("../../data/testing/995,000_rows.parquet")
df_val = _load_split("../../data/validation/995,000_rows.parquet")
# %%
# print("Loading Parquet file...")
# # Check the total number of rows (articles)
# print(f"Total rows in the raw Parquet file: {len(df)}")
# # Look at the first few rows to make sure the data looks correct
# print("\n--- First 3 Rows ---")
# print(df.head(3))
# %%
# Count token frequencies across the training corpus.
word_counts = Counter()
for article_tokens in df['tokens']:
    word_counts.update(article_tokens)

# Build the vocabulary from the 50,000 most frequent words.
# ID 0 is reserved for <PAD> (padding), ID 1 for <UNK> (out-of-vocabulary).
vocab = {"<PAD>": 0, "<UNK>": 1}
for rank, (token, _count) in enumerate(word_counts.most_common(50000)):
    vocab[token] = rank + 2
print(f"Vocabulary built with {len(vocab)} words.")
# %%
# Create a Custom PyTorch Datase
# a wrapper for the data that PyTorch knows how to talk to.
class FakeNewsDataset(Dataset):
    """Wraps a tokenised dataframe so a DataLoader can batch it.

    Each item is a fixed-length LongTensor of token IDs plus its class label.
    """

    def __init__(self, dataframe, vocab, max_length=256):
        self.dataframe = dataframe
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        # Number of articles; lets the DataLoader know when an epoch ends.
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Fetch one article and its label.
        row = self.dataframe.iloc[idx]
        tokens = row['tokens']
        label = row['label']
        # Map tokens to IDs, falling back to <UNK> (ID 1) for unseen words.
        ids = [self.vocab.get(token, 1) for token in tokens]
        # Force every article to exactly max_length IDs:
        # truncate long ones, right-pad short ones with <PAD> (ID 0).
        ids = ids[:self.max_length]
        ids += [0] * (self.max_length - len(ids))
        return torch.tensor(ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)
# %%
## Prepare the DataLoaders
# Wrap each dataframe in the Dataset class; the DataLoader then feeds the
# model fixed-size batches (64 articles at a time) so RAM usage stays bounded.
my_train_dataset = FakeNewsDataset(dataframe=df, vocab=vocab, max_length=256)
# shuffle=True for training so batch order differs each epoch and the model
# cannot memorise the data order.
train_dataloader = DataLoader(
    my_train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4,      # start with 4; if CPU stays cool, try 6
    pin_memory=True,    # speeds up host-to-device transfer
    prefetch_factor=2,
)
# Validation/test loaders keep the original order (shuffle=False).
val_data = FakeNewsDataset(dataframe=df_val, vocab=vocab, max_length=256)
val_dataloader = DataLoader(val_data, batch_size=64, shuffle=False)
test_data = FakeNewsDataset(dataframe=df_test, vocab=vocab, max_length=256)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=False)
# %% [markdown]
"""
Checking if the data conversion works
"""
# %%
# features, labels = next(iter(train_dataloader))
# # 2. Check the shapes (the dimensions of your tensors)
# print("--- Tensor Shapes ---")
# print(f"Features shape: {features.shape}")
# print(f"Labels shape: {labels.shape}")
# # 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)
# print("\n--- Data Types ---")
# print(f"Features dtype: {features.dtype}")
# print(f"Labels dtype: {labels.dtype}")
# # 4. Peek at the actual data for the very first article in this batch
# print("\n--- First Article Peek ---")
# print(f"Label: {labels[0].item()} (0 = Real, 1 = Fake)")
# print(f"Tokens (first 20 IDs): {features[0][:20]}")
# %%
class BaseModel(nn.Module):
    """Bag-of-embeddings baseline: mean-pooled word vectors -> 2-layer MLP."""

    def __init__(self, vocab_size, embed_dim=32, h1=256, h2=128, out_features=2):
        super().__init__()
        # Embedding layer: turns word IDs into dense vectors.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # Fully connected layers that learn the Fake-vs-Real decision.
        self.fc1 = nn.Linear(embed_dim, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.out = nn.Linear(h2, out_features)

    def forward(self, x):
        # x: integer token IDs, shape (batch_size, sequence_length).
        embedded = self.embedding(x)
        # Mean-pool over the sequence: one vector per whole article.
        pooled = embedded.mean(dim=1)
        # Two hidden layers with ReLU, then raw class scores (logits).
        hidden = F.relu(self.fc1(pooled))
        hidden = F.relu(self.fc2(hidden))
        return self.out(hidden)
model_basic =BaseModel(vocab_size=len((vocab)))
# %% [markdown]
"""
'Advanced'
"""
# %%
class advanced_model(nn.Module):
    """Bidirectional GRU classifier over word embeddings.

    The final forward and backward hidden states of the top GRU layer are
    concatenated and projected to the class logits.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128, num_layer=2, out_features=2):
        super().__init__()
        # 1. Embedding layer: word IDs -> dense vectors.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # 2. Bidirectional GRU. batch_first=True because the DataLoader
        #    yields (batch_size, sequence_length).
        #    BUGFIX: num_layers was hard-coded to 2, silently ignoring the
        #    `num_layer` parameter.
        self.gru = nn.GRU(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layer,
            batch_first=True,
            bidirectional=True,
            dropout=0.3,
        )
        # 3. Output layer: concatenated fwd+bwd states -> class logits.
        #    (An unused `self.out = nn.Linear(hidden_dim, out_features)` was
        #    removed: it was never called and only added dead parameters.)
        self.fc = nn.Linear(hidden_dim * 2, out_features)

    def forward(self, x):
        # x: (batch_size, sequence_length) integer token IDs.
        embedded = self.embedding(x)  # -> (batch, seq, embed_dim)
        # The GRU returns per-step outputs and the final hidden states;
        # only the final hidden states are needed for classification.
        _, hidden = self.gru(embedded)
        # hidden: (num_layers * 2, batch, hidden_dim). hidden[-2] is the top
        # layer's forward state, hidden[-1] its backward state.
        combined = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        return self.fc(combined)
# Initialize the GRU model with the full training vocabulary size.
model_adv = advanced_model(vocab_size=len(vocab))
# %% [markdown]
"""
# Training
"""
# %%
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# %%
def evaluate_performance(model, dataloader, device):
    """Run `model` over `dataloader`; return (accuracy in %, macro F1)."""
    model.eval()  # evaluation mode (disables dropout etc.)
    all_predictions = []
    all_true_labels = []
    # No gradients needed for inference; saves memory.
    with torch.no_grad():
        for features, labels in dataloader:
            features = features.to(device)
            labels = labels.to(device)
            # Raw class scores from the model.
            scores = model(features)
            # Highest-scoring class index (0 or 1) is the prediction.
            predictions = scores.argmax(dim=1)
            all_predictions.extend(predictions.cpu().numpy().flatten().tolist())
            all_true_labels.extend(labels.cpu().numpy().flatten().tolist())
    predicted = np.array(all_predictions)
    actual = np.array(all_true_labels)
    accuracy = (predicted == actual).mean() * 100
    # Macro-averaged F1 weights both classes equally.
    f1 = f1_score(actual, predicted, average='macro')
    model.train()  # restore training mode just in case
    return accuracy, f1
# %%
def train_model(model, train_loader, val_loader, device, epochs=5, lr=0.001):
    """Train `model` with Adam + cross-entropy, validating after each epoch.

    Returns a history dict with per-epoch 'train_loss', 'val_acc', 'val_f1'.
    """
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Per-epoch metrics, kept for later plotting/reporting.
    history = {'train_loss': [], 'val_acc': [], 'val_f1': []}
    print(f"Training {model.__class__.__name__} on {device}...")
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)
            # Standard step: zero grads, forward, loss, backward, update.
            optimizer.zero_grad()
            logits = model(features)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        # Validate once per epoch on the held-out set.
        val_acc, val_f1 = evaluate_performance(model, val_loader, device)
        history['train_loss'].append(avg_loss)
        history['val_acc'].append(val_acc)
        history['val_f1'].append(val_f1)
        print(f"\n Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f} \n Val Acc: {val_acc:.2f}% \n Val F1: {val_f1:.4f}")
    return history  # caller can plot these later
# %%
# Train both models for 7 epochs and keep their metric histories.
train_995_basic = train_model(model_basic, train_dataloader, val_dataloader, device, epochs=7)
print(train_995_basic)
# %%
train_995_adv = train_model(model_adv, train_dataloader, val_dataloader, device, epochs=7)
print(train_995_adv)
# %%
# %%
# %% [markdown]
"""
# Evaluation
"""
# %% [markdown]
"""
Basic model
"""
# %%
# # 1. The Evaluation Function
# def evaluate_performance(model, dataloader, device):
# model.eval() # Put model in evaluation mode
# all_predictions = []
# all_true_labels = []
# # Turn off gradient tracking to save memory
# with torch.no_grad():
# for features, labels in dataloader:
# features = features.to(device)
# labels = labels.to(device)
# # Get model scores
# scores = model(features)
# # Find the predicted class (0 or 1)
# _, predictions = torch.max(scores,1)
# # Save predictions and actual labels to lists
# # all_predictions.extend(predictions.cpu().tolist())
# # all_true_labels.extend(labels.cpu().tolist())
# all_predictions.extend(predictions.cpu().numpy().flatten().tolist())
# all_true_labels.extend(labels.cpu().numpy().flatten().tolist())
# all_predictions = np.array(all_predictions)
# all_true_labels = np.array(all_true_labels)
# accuracy = (all_predictions == all_true_labels).mean() * 100
# # 4. Calculate F1 Score
# # average='macro' is best for your report to show you care about both classes equally
# f1 = f1_score(all_true_labels, all_predictions, average='macro')
# model.train() # Return model to training mode just in case
# return accuracy, f1
# # # Change me based on the model
# # model = model_basic.to(device)
# # print(f"Training on: {device}")
# # # 2. Setup Loss and Optimizer
# # # CrossEntropyLoss is the standard for classification tasks
# # criterion = nn.CrossEntropyLoss()
# # # Adam is a very reliable, fast optimizer
# # optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# # # 3. The Training Loop
# # epochs = 7# Start with a small number of passes through the whole dataset
# # for epoch in range(epochs):
# # model.train() # Tell the model it is in training mode
# # total_loss = 0
# # # Loop through our batches of 64 articles
# # for batch_idx, (features, labels) in enumerate(train_dataloader):
# # # Move data to the same device as the model (GPU/CPU)
# # features = features.to(device)
# # labels = labels.to(device)
# # # Step A: Reset the optimizer's gradients
# # optimizer.zero_grad()
# # # Step B: Forward Pass (Have the model guess Real or Fake)
# # predictions = model(features)
# # # Step C: Calculate Loss (How wrong were the guesses?)
# # loss = criterion(predictions, labels)
# # # Step D: Backward Pass (Calculate how to fix the math)
# # loss.backward()
# # # Step E: Optimize (Actually apply the fixes to the model's weights)
# # optimizer.step()
# # total_loss += loss.item()
# # # Print an update every 100 batches so we know it's working
# # if batch_idx % 100 == 0:
# # print(f"Epoch [{epoch+1}/{epochs}] | Batch {batch_idx} | Loss: {loss.item():.4f}")
# # # Print the average loss at the end of each epoch
# # avg_loss = total_loss / len(train_dataloader)
# # print(f"--- End of Epoch {epoch+1} | Average Loss: {avg_loss:.4f} ---")
# %% [markdown]
"""
Advanced model
"""
# %%
# # 1. The Evaluation Function
# def evaluate_performance(model_adv, dataloader, device):
# model_adv.eval() # Put model in evaluation mode
# all_predictions = []
# all_true_labels = []
# # Turn off gradient tracking to save memory
# with torch.no_grad():
# for features, labels in dataloader:
# features = features.to(device)
# labels = labels.to(device)
# # Get model scores
# scores = model_adv(features)
# # Find the predicted class (0 or 1)
# _, predictions = scores.max(1)
# # Save predictions and actual labels to lists
# all_predictions.extend(predictions.cpu().tolist())
# all_true_labels.extend(labels.cpu().tolist())
# # Calculate Accuracy
# correct_guesses = sum(p == t for p, t in zip(all_predictions, all_true_labels))
# accuracy = (correct_guesses / len(all_true_labels)) * 100
# # Calculate F1 Score
# f1 = f1_score(all_true_labels, all_predictions, average='macro')
# model_adv.train() # Return model to training mode just in case
# return accuracy, f1
# %%
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# %%
print("Basic model ")
print(" Validation ")
# BUGFIX: this section referenced an undefined name `model` (NameError);
# the basic model is bound to `model_basic`.
val_acc995, val_f1_995 = evaluate_performance(model_basic, val_dataloader, device)
print(f"Validation Accuracy: {val_acc995:.2f}%")
print(f"Validation F1 Score: {val_f1_995:.4f}")
print("\n Testing Phase ")
test_acc995, test_f1_995 = evaluate_performance(model_basic, test_dataloader, device)
print(f"Test Accuracy: {test_acc995:.2f}%")
# BUGFIX: removed a stray "git " pasted into the message.
print(f"Test F1 Score: {test_f1_995:.4f}")
# %%
print(" GRU model ")  # BUGFIX: was misspelled "GURU"
print(" Validation ")
adv_val_acc995, val_f1995 = evaluate_performance(model_adv, val_dataloader, device)
print(f"Validation Accuracy: {adv_val_acc995:.2f}%")
# BUGFIX: previously printed `val_f1_995` — the BASIC model's validation F1 —
# instead of this model's `val_f1995`.
print(f"Validation F1 Score: {val_f1995:.4f}")
print("\n Testing ")
test_acc, test_f1 = evaluate_performance(model_adv, test_dataloader, device)
# BUGFIX: previously referenced undefined `test_acc955` (NameError) and had
# a stray "git " in the F1 message.
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: {test_f1:.4f}")
# %% [markdown]
"""
# Liar data
"""
# %%
from helper import LIAR_labelling
# Load the out-of-domain LIAR test set and label it the same way as the
# main corpus. (A stray no-op f-string literal pointing at a training-split
# path was removed here — it had no effect.)
df_LIAR = pd.read_parquet("../../data/testing/LIAR.parquet", columns=['tokens', 'type'])
df_LIAR['label'] = df_LIAR['type'].apply(LIAR_labelling).astype(str)
df_LIAR['label'] = df_LIAR['label'].map(label_map).astype(int)
df_LIAR = df_LIAR.drop(columns=['type'])
# %%
df_LIAR.head()
# %%
# Inspect LIAR token statistics WITHOUT rebuilding the vocabulary.
# BUGFIX: the original code overwrote `vocab` with a LIAR-specific vocabulary
# here. That silently remapped every token ID, so the models (trained on the
# 995k-corpus vocabulary) were evaluated on scrambled inputs. The models must
# see the SAME word->ID mapping they were trained with, so the training
# `vocab` is kept and only the LIAR counts are computed for inspection.
liar_word_counts = Counter()
for article_tokens in df_LIAR['tokens']:
    liar_word_counts.update(article_tokens)
print(f"LIAR corpus has {len(liar_word_counts)} unique tokens; "
      f"reusing the training vocabulary of {len(vocab)} words.")
# %%
# Wrap LIAR in the same Dataset/DataLoader pipeline (no shuffling for evaluation).
LR_DATA = FakeNewsDataset(dataframe=df_LIAR, vocab=vocab, max_length=256)
LR_dataloader = DataLoader(dataset=LR_DATA, batch_size=32, shuffle=False)
# %%
# Pull one batch to sanity-check shapes and dtypes of the LIAR pipeline.
features, labels = next(iter(LR_dataloader))
print("--- Tensor Shapes ---")
print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")
# Embedding layers require integer (torch.long) inputs.
print("\n--- Data Types ---")
print(f"Features dtype: {features.dtype}")
print(f"Labels dtype: {labels.dtype}")
# Peek at the first article of the batch.
print("\n--- First Article Peek ---")
# BUGFIX: the printed legend said "(0 = Real, 1 = Fake)", which contradicts
# label_map (Label.FAKE -> 0, Label.REAL -> 1).
print(f"Label: {labels[0].item()} (0 = Fake, 1 = Real)")
print(f"Tokens (first 20 IDs): {features[0][:20]}")
# %%
# # 1. Check a single sample from the Dataset directly
# single_features, single_label = LR_DATA[0]
# print(f"Single Sample - Features: {single_features.shape}, Label: {single_label.shape}")
# # 2. Check the DataLoader batch
# batch_features, batch_labels = next(iter(LR_dataloader))
# # print(f"Batch - Features: {batch_features.shape}, Labels: {batch_labels.shape}")
# %%
print("\n--- 2. Testing Advanced model ---")
# BUGFIX: a duplicate evaluate_performance call whose result was discarded
# preceded this block (wasted a full pass over LIAR); it has been removed.
# Also fixed the "Avanced" typo and the stray "git " in the F1 message.
test_acc, test_f1 = evaluate_performance(model_adv, LR_dataloader, device)
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: {test_f1:.4f}")
# %%
print("\n--- 2. Testing BASE-Model ---")
# BUGFIX: referenced undefined `model` (NameError); the baseline is bound to
# `model_basic`. Also removed the stray "git " from the F1 message.
test_acc, test_f1 = evaluate_performance(model_basic, LR_dataloader, device)
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: {test_f1:.4f}")
# %%