# %%
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '../'))
from helper import default_labelling
from sklearn.metrics import f1_score
import numpy as np

# %%
# Maps the string form of the project's Label enum to integer class ids
# (0 = fake, 1 = real) for CrossEntropyLoss.
label_map = { 'Label.FAKE': 0, 'Label.REAL': 1}

# %% [markdown]
"""
# Pipelining process
"""

# %%
def _load_split(path):
    """Load one parquet split and attach an integer 'label' column.

    Keeps only the pre-tokenised 'tokens' column and derives 'label'
    (0 = fake, 1 = real) from the raw 'type' column via default_labelling.
    The same transformation was previously copy-pasted for each split.
    """
    frame = pd.read_parquet(path, columns=['tokens', 'type'])
    frame['label'] = frame['type'].apply(default_labelling).astype(str)
    frame['label'] = frame['label'].map(label_map).astype(int)
    return frame.drop(columns=['type'])

df = _load_split("../../data/training/995,000_rows.parquet")

# %%
df_test = _load_split("../../data/testing/995,000_rows.parquet")

# %%
df_val = _load_split("../../data/validation/995,000_rows.parquet")

# %%
# print("Loading Parquet file...")
# # Check the total number of rows (articles)
# print(f"Total rows in the raw Parquet file: {len(df)}")
# # Look at the first few rows to make sure the data looks correct
# print("\n--- First 3 Rows ---")
# print(df.head(3))

# %%
# Count how many times each token occurs in the corpus.
word_counts = Counter()
for x in df['tokens']:
    word_counts.update(x)

# Keep the top 50,000 words.
# Index 0 is reserved for <pad> (padding), index 1 for <unk> (unknown words).
# BUG FIX: the original seeded the dict with {"": 0, "": 1} — the duplicate
# empty-string keys collapse into a single entry, so len(vocab) undercounted
# by one and the highest assigned word id (50001) fell outside
# nn.Embedding(num_embeddings=len(vocab)), an index-out-of-range at runtime.
vocab = {"<pad>": 0, "<unk>": 1}
for idx, (word, count) in enumerate(word_counts.most_common(50000), start=2):
    vocab[word] = idx

print(f"Vocabulary built with {len(vocab)} words.")

# %%
# Create a Custom PyTorch Dataset:
# a wrapper for the data that PyTorch knows how to talk to.
class FakeNewsDataset(Dataset):
    """Map-style Dataset: turns one (tokens, label) row into a pair of
    fixed-length tensors that a DataLoader can batch."""

    def __init__(self, dataframe, vocab, max_length=256):
        self.dataframe = dataframe
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        # PyTorch calls this internally to know when to stop fetching data.
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Grab one article and its label in a single row lookup.
        row = self.dataframe.iloc[idx]
        # Convert text tokens to integer ids; unseen words fall back to id 1.
        ids = [self.vocab.get(tok, 1) for tok in row['tokens']]
        # Force every sample to exactly max_length: cut long articles,
        # right-pad short ones with id 0.
        ids = ids[:self.max_length]
        ids += [0] * (self.max_length - len(ids))
        # Hand back PyTorch tensors (long ints, as nn.Embedding requires).
        return (torch.tensor(ids, dtype=torch.long),
                torch.tensor(row['label'], dtype=torch.long))

# %%
## Prepare the DataLoader
# Wrap the dataframe in the Dataset class.
# The DataLoader feeds the data to the model in batches (e.g. 64 articles
# at a time) so the machine never has to hold everything in RAM at once.
my_train_dataset = FakeNewsDataset(dataframe=df, vocab=vocab, max_length=256)

# shuffle=True reshuffles every epoch so the model cannot memorise the order.
train_dataloader = DataLoader(
    my_train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4,      # Start with 4; if CPU stays cool, try 6
    pin_memory=True,    # Essential for fast data transfer
    prefetch_factor=2,
)

val_data = FakeNewsDataset(dataframe=df_val, vocab=vocab, max_length=256)
val_dataloader = DataLoader(val_data, batch_size=64, shuffle=False)

test_data = FakeNewsDataset(dataframe=df_test, vocab=vocab, max_length=256)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=False)

# %% [markdown]
"""
Checking if the data conversion works
"""

# %%
# features, labels = next(iter(train_dataloader))
# # 2.
# Check the shapes (the dimensions of your tensors)
# print("--- Tensor Shapes ---")
# print(f"Features shape: {features.shape}")
# print(f"Labels shape: {labels.shape}")
# # 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)
# print("\n--- Data Types ---")
# print(f"Features dtype: {features.dtype}")
# print(f"Labels dtype: {labels.dtype}")
# # 4. Peek at the actual data for the very first article in this batch
# print("\n--- First Article Peek ---")
# # NOTE: label_map defines Label.FAKE -> 0 and Label.REAL -> 1, so the
# # legend below was corrected from "(0 = Real, 1 = Fake)".
# print(f"Label: {labels[0].item()} (0 = Fake, 1 = Real)")
# print(f"Tokens (first 20 IDs): {features[0][:20]}")

# %%
class BaseModel(nn.Module):
    """Bag-of-embeddings baseline: embed word ids, mean-pool over the
    sequence, then classify with a small MLP.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each word vector.
        h1, h2: hidden-layer widths.
        out_features: number of classes (2 = fake/real).
    """

    def __init__(self, vocab_size, embed_dim=32, h1=256, h2=128, out_features=2):
        super().__init__()
        # The Embedding Layer: turns word IDs into rich numerical vectors.
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embed_dim)
        # The Linear Layers: learn the patterns to decide Fake vs. Real.
        self.fc1 = nn.Linear(embed_dim, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.out = nn.Linear(h2, out_features)

    def forward(self, x):
        # x starts as integers: (batch_size, sequence_length), e.g. (64, 256).
        x = self.embedding(x)          # -> (batch, seq, embed_dim)
        # Average the word vectors to get one vector for the whole article.
        x = x.mean(dim=1)              # -> (batch, embed_dim)
        # Hidden layers with ReLU activation.
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # Raw class scores; CrossEntropyLoss applies softmax internally.
        return self.out(x)

model_basic = BaseModel(vocab_size=len(vocab))

# %% [markdown]
"""
'Advanced'
"""

# %%
class advanced_model(nn.Module):
    """Bidirectional GRU classifier.

    BUG FIX: nn.GRU previously hard-coded num_layers=2, silently ignoring
    the num_layer parameter (default unchanged, so existing calls behave
    identically). The unused self.out layer — never referenced in forward,
    shadowed by self.fc — was removed as dead code.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128, num_layer=2,
                 out_features=2):
        super().__init__()
        # 1. The Embedding Layer (same as the baseline).
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embed_dim)
        # 2. The GRU layer. batch_first=True because the DataLoader yields
        # (batch_size, sequence_length). dropout only takes effect between
        # stacked layers, i.e. when num_layer > 1.
        self.gru = nn.GRU(input_size=embed_dim,
                          hidden_size=hidden_dim,
                          num_layers=num_layer,
                          batch_first=True,
                          bidirectional=True,
                          dropout=0.3)
        # 3. Final output layer: the concatenated forward+backward final
        # states (2 * hidden_dim) are mapped to the Real/Fake scores.
        self.fc = nn.Linear(hidden_dim * 2, out_features)

    def forward(self, x):
        # x: (batch_size, sequence_length), e.g. (64, 256).
        x = self.embedding(x)   # -> (batch, seq, embed_dim), e.g. (64, 256, 64)
        # The GRU returns (per-step outputs, final hidden states); we only
        # need the final hidden states, so the step-wise output is ignored.
        _, hidden = self.gru(x)
        # hidden[-2] is the top layer's last forward state, hidden[-1] its
        # last backward state; concatenate into one article representation.
        out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        return self.fc(out)

# Initialize
model_adv = advanced_model(vocab_size=len(vocab))

# %% [markdown]
"""
# Training
"""

# %%
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# %%
def evaluate_performance(model, dataloader, device):
    """Run `model` over `dataloader` and return (accuracy %, macro-F1)."""
    model.eval()  # evaluation mode (disables dropout etc.)
    all_predictions = []
    all_true_labels = []
    # Turn off gradient tracking to save memory.
    with torch.no_grad():
        for features, labels in dataloader:
            features = features.to(device)
            labels = labels.to(device)
            # Get model scores and take the highest-scoring class (0 or 1).
            scores = model(features)
            predictions = scores.argmax(dim=1)
            all_predictions.extend(predictions.cpu().tolist())
            all_true_labels.extend(labels.cpu().tolist())
    all_predictions = np.array(all_predictions)
    all_true_labels = np.array(all_true_labels)
    accuracy = (all_predictions == all_true_labels).mean() * 100
    # average='macro' weighs both classes equally in the F1 score.
    f1 = f1_score(all_true_labels, all_predictions, average='macro')
    model.train()  # Return model to training mode just in case
    return accuracy, f1

# %%
def train_model(model, train_loader, val_loader, device, epochs=5, lr=0.001):
    """Train with Adam + cross-entropy, validating after every epoch.

    Returns:
        history dict with 'train_loss', 'val_acc' and 'val_f1' lists
        (one entry per epoch) for later plotting.
    """
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Dictionary to store results for the report.
    history = {'train_loss': [], 'val_acc': [], 'val_f1': []}

    print(f"Training {model.__class__.__name__} on {device}...")
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch_idx, (features, labels) in enumerate(train_loader):
            features, labels = features.to(device), labels.to(device)
            optimizer.zero_grad()              # reset gradients
            predictions = model(features)      # forward pass
            loss = criterion(predictions, labels)
            loss.backward()                    # backward pass
            optimizer.step()                   # apply the weight update
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)

        # After each epoch, evaluate on the validation set.
        val_acc, val_f1 = evaluate_performance(model, val_loader, device)

        history['train_loss'].append(avg_loss)
        history['val_acc'].append(val_acc)
        history['val_f1'].append(val_f1)

        print(f"\n Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f} \n Val Acc: {val_acc:.2f}% \n Val F1: {val_f1:.4f}")

    return history  # Return the results so we can plot them later

# %%
train_995_basic = train_model(model_basic, train_dataloader, val_dataloader,
                              device, epochs=7)
print(train_995_basic)

# %%
train_995_adv = train_model(model_adv, train_dataloader, val_dataloader,
                            device, epochs=7)
print(train_995_adv)

# %%

# %%

# %% [markdown]
"""
# Evaluation
"""

# %% [markdown]
"""
Basic model
"""

# %%
# # 1.
The Evaluation Function # def evaluate_performance(model, dataloader, device): # model.eval() # Put model in evaluation mode # all_predictions = [] # all_true_labels = [] # # Turn off gradient tracking to save memory # with torch.no_grad(): # for features, labels in dataloader: # features = features.to(device) # labels = labels.to(device) # # Get model scores # scores = model(features) # # Find the predicted class (0 or 1) # _, predictions = torch.max(scores,1) # # Save predictions and actual labels to lists # # all_predictions.extend(predictions.cpu().tolist()) # # all_true_labels.extend(labels.cpu().tolist()) # all_predictions.extend(predictions.cpu().numpy().flatten().tolist()) # all_true_labels.extend(labels.cpu().numpy().flatten().tolist()) # all_predictions = np.array(all_predictions) # all_true_labels = np.array(all_true_labels) # accuracy = (all_predictions == all_true_labels).mean() * 100 # # 4. Calculate F1 Score # # average='macro' is best for your report to show you care about both classes equally # f1 = f1_score(all_true_labels, all_predictions, average='macro') # model.train() # Return model to training mode just in case # return accuracy, f1 # # # Change me based on the model # # model = model_basic.to(device) # # print(f"Training on: {device}") # # # 2. Setup Loss and Optimizer # # # CrossEntropyLoss is the standard for classification tasks # # criterion = nn.CrossEntropyLoss() # # # Adam is a very reliable, fast optimizer # # optimizer = torch.optim.Adam(model.parameters(), lr=0.001) # # # 3. 
The Training Loop # # epochs = 7# Start with a small number of passes through the whole dataset # # for epoch in range(epochs): # # model.train() # Tell the model it is in training mode # # total_loss = 0 # # # Loop through our batches of 64 articles # # for batch_idx, (features, labels) in enumerate(train_dataloader): # # # Move data to the same device as the model (GPU/CPU) # # features = features.to(device) # # labels = labels.to(device) # # # Step A: Reset the optimizer's gradients # # optimizer.zero_grad() # # # Step B: Forward Pass (Have the model guess Real or Fake) # # predictions = model(features) # # # Step C: Calculate Loss (How wrong were the guesses?) # # loss = criterion(predictions, labels) # # # Step D: Backward Pass (Calculate how to fix the math) # # loss.backward() # # # Step E: Optimize (Actually apply the fixes to the model's weights) # # optimizer.step() # # total_loss += loss.item() # # # Print an update every 100 batches so we know it's working # # if batch_idx % 100 == 0: # # print(f"Epoch [{epoch+1}/{epochs}] | Batch {batch_idx} | Loss: {loss.item():.4f}") # # # Print the average loss at the end of each epoch # # avg_loss = total_loss / len(train_dataloader) # # print(f"--- End of Epoch {epoch+1} | Average Loss: {avg_loss:.4f} ---") # %% [markdown] """ Advanced model """ # %% # # 1. 
# (An older commented-out copy of evaluate_performance lived here; it was
# removed so the live implementation defined above is the single source of
# truth.)

# %%
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# %%
print("Basic model ")
print(" Validation ")
# BUG FIX: this previously called evaluate_performance(model, ...) but no
# top-level variable named `model` exists — the baseline is model_basic.
val_acc995, val_f1_995 = evaluate_performance(model_basic, val_dataloader, device)
print(f"Validation Accuracy: {val_acc995:.2f}%")
print(f"Validation F1 Score: {val_f1_995:.4f}")

print("\n Testing Phase ")
test_acc995, test_f1_995 = evaluate_performance(model_basic, test_dataloader, device)
print(f"Test Accuracy: {test_acc995:.2f}%")
# BUG FIX: removed the stray "git " token pasted into this f-string.
print(f"Test F1 Score: {test_f1_995:.4f}")

# %%
print(" GURU model ")
print(" Validation ")
adv_val_acc995, val_f1995 = evaluate_performance(model_adv, val_dataloader, device)
print(f"Validation Accuracy: {adv_val_acc995:.2f}%")
# BUG FIX: previously printed val_f1_995 — the *basic* model's score —
# instead of this model's own val_f1995.
print(f"Validation F1 Score: {val_f1995:.4f}")

print("\n Testing ")
test_acc, test_f1 = evaluate_performance(model_adv, test_dataloader, device)
# BUG FIX: previously referenced the undefined name test_acc955 (typo).
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: {test_f1:.4f}")

# %% [markdown]
"""
# Liar data
"""

# %%
from helper import LIAR_labelling

# (A stray no-op expression f"../../data/training/LIAR.parquet" was removed.)
df_LIAR = pd.read_parquet("../../data/testing/LIAR.parquet",
                          columns=['tokens', 'type'])
df_LIAR['label'] = df_LIAR['type'].apply(LIAR_labelling).astype(str)
df_LIAR['label'] = df_LIAR['label'].map(label_map).astype(int)
df_LIAR = df_LIAR.drop(columns=['type'])

# %%
df_LIAR.head()

# %%
# BUG FIX: the original rebuilt `vocab` from the LIAR corpus here, overwriting
# the training vocabulary. The models' embedding tables were learned against
# the *training* word->id mapping, so re-encoding LIAR with a different vocab
# would feed them meaningless ids. We now reuse the training vocab unchanged;
# LIAR words the training set never saw simply map to the unknown-word id (1).

# %%
LR_DATA = FakeNewsDataset(dataframe=df_LIAR, vocab=vocab, max_length=256)
LR_dataloader = DataLoader(LR_DATA, batch_size=32, shuffle=False)

# %%
features, labels = next(iter(LR_dataloader))
# 2. Check the shapes (the dimensions of your tensors)
print("--- Tensor Shapes ---")
print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")
# 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)
print("\n--- Data Types ---")
print(f"Features dtype: {features.dtype}")
print(f"Labels dtype: {labels.dtype}")
# 4. Peek at the actual data for the very first article in this batch
# NOTE: legend corrected to match label_map (Label.FAKE -> 0, Label.REAL -> 1).
print("\n--- First Article Peek ---")
print(f"Label: {labels[0].item()} (0 = Fake, 1 = Real)")
print(f"Tokens (first 20 IDs): {features[0][:20]}")

# %%
# # 1. Check a single sample from the Dataset directly
# single_features, single_label = LR_DATA[0]
# print(f"Single Sample - Features: {single_features.shape}, Label: {single_label.shape}")
# # 2. Check the DataLoader batch
# batch_features, batch_labels = next(iter(LR_dataloader))
# # print(f"Batch - Features: {batch_features.shape}, Labels: {batch_labels.shape}")

# %%
# (A redundant unassigned evaluate_performance(model_adv, ...) call that ran
# the whole evaluation twice was removed; typo "Avanced" and the stray "git "
# tokens in the printed strings were also fixed.)
print("\n--- 2. Testing Advanced model ---")
test_acc, test_f1 = evaluate_performance(model_adv, LR_dataloader, device)
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: {test_f1:.4f}")

# %%
print("\n--- 2. Testing BASE-Model ---")
# BUG FIX: `model` was undefined; the baseline is model_basic.
test_acc, test_f1 = evaluate_performance(model_basic, LR_dataloader, device)
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: {test_f1:.4f}")

# %%