# NOTE: the following header is file-viewer metadata captured in the export;
# kept as a comment so the file parses as Python.
# 580 lines, 18 KiB, Python
# %%
|
|
import torch
|
|
import torch.nn as nn
|
|
import torch.nn.functional as F
|
|
import pandas as pd
|
|
from torch.utils.data import Dataset, DataLoader
|
|
from collections import Counter
|
|
import os
|
|
import sys
|
|
sys.path.append(os.path.join(os.getcwd(), '../'))
|
|
from helper import default_labelling
|
|
from sklearn.metrics import f1_score
|
|
import numpy as np
|
|
|
|
|
|
# %%
# Map the string labels produced by the labelling helpers to integer class ids.
label_map = {
    'Label.FAKE': 0,
    'Label.REAL': 1,
}
|
|
|
|
# %% [markdown]
|
|
"""
|
|
# Pipelining process
|
|
"""
|
|
|
|
# %%
# Load the training split, keeping only the columns this pipeline uses.
df = pd.read_parquet("../../data/training/995,000_rows.parquet", columns=['tokens', 'type'])

# Derive the integer class label from the raw `type` column
# (Label.FAKE -> 0, Label.REAL -> 1), then drop the raw column.
df['label'] = (
    df['type']
    .apply(default_labelling)
    .astype(str)
    .map(label_map)
    .astype(int)
)
df = df.drop(columns=['type'])
|
|
|
|
# %%
# Load the test split and apply the same label derivation as for training.
df_test = pd.read_parquet("../../data/testing/995,000_rows.parquet", columns=['tokens', 'type'])

df_test['label'] = (
    df_test['type']
    .apply(default_labelling)
    .astype(str)
    .map(label_map)
    .astype(int)
)
df_test = df_test.drop(columns=['type'])
|
|
|
|
# %%
# Load the validation split and apply the same label derivation as for training.
df_val = pd.read_parquet("../../data/validation/995,000_rows.parquet", columns=['tokens', 'type'])

df_val['label'] = (
    df_val['type']
    .apply(default_labelling)
    .astype(str)
    .map(label_map)
    .astype(int)
)
df_val = df_val.drop(columns=['type'])
|
|
|
|
# %%
|
|
# print("Loading Parquet file...")
|
|
|
|
# # Check the total number of rows (articles)
|
|
# print(f"Total rows in the raw Parquet file: {len(df)}")
|
|
|
|
# # Look at the first few rows to make sure the data looks correct
|
|
# print("\n--- First 3 Rows ---")
|
|
# print(df.head(3))
|
|
|
|
# %%
# Count token frequencies over the whole training corpus.
word_counts = Counter()
for token_list in df['tokens']:
    word_counts.update(token_list)

# Build the vocabulary from the 50,000 most frequent words.
# Index 0 is reserved for <PAD> (padding), index 1 for <UNK> (unknown words).
vocab = {"<PAD>": 0, "<UNK>": 1}
vocab.update(
    (word, idx)
    for idx, (word, _count) in enumerate(word_counts.most_common(50000), start=2)
)

print(f"Vocabulary built with {len(vocab)} words.")
|
|
|
|
# %%
# Custom PyTorch Dataset: wraps the dataframe so a DataLoader can fetch
# (token-id tensor, label tensor) pairs one article at a time.
class FakeNewsDataset(Dataset):
    def __init__(self, dataframe, vocab, max_length=256):
        """Store the dataframe, the word->id vocabulary and the fixed sequence length."""
        self.dataframe = dataframe
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        """Number of articles; PyTorch calls this to know when to stop fetching."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return one article as a fixed-length id tensor plus its label tensor."""
        row = self.dataframe.iloc[idx]
        tokens = row['tokens']
        label = row['label']

        # Map each token to its vocabulary id; unknown words fall back to <UNK> (id 1).
        article_ids = [self.vocab.get(word, 1) for word in tokens]

        # Force the sequence to exactly max_length: truncate long articles,
        # right-pad short ones with <PAD> (id 0).
        article_ids = article_ids[:self.max_length]
        article_ids += [0] * (self.max_length - len(article_ids))

        return torch.tensor(article_ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)
|
|
|
|
|
|
# %%
## Prepare the DataLoaders
# Wrap each dataframe in the Dataset class, then hand it to a DataLoader,
# which feeds the model in batches (64 articles at a time) so the machine
# never has to hold everything in RAM at once.

my_train_dataset = FakeNewsDataset(dataframe=df, vocab=vocab, max_length=256)

# shuffle=True for training: reshuffling each epoch keeps the model from
# memorising the presentation order of the data.
train_dataloader = DataLoader(
    my_train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4,       # start with 4; if the CPU stays cool, try 6
    pin_memory=True,     # speeds up host->GPU data transfer
    prefetch_factor=2,
)

# Validation and test loaders: evaluation order does not matter, so no shuffle.
val_data = FakeNewsDataset(dataframe=df_val, vocab=vocab, max_length=256)
val_dataloader = DataLoader(val_data, batch_size=64, shuffle=False)

test_data = FakeNewsDataset(dataframe=df_test, vocab=vocab, max_length=256)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=False)
|
|
|
|
# %% [markdown]
|
|
"""
|
|
Checking if the data conversion works
|
|
"""
|
|
|
|
# %%
|
|
# features, labels = next(iter(train_dataloader))
|
|
# # 2. Check the shapes (the dimensions of your tensors)
|
|
# print("--- Tensor Shapes ---")
|
|
# print(f"Features shape: {features.shape}")
|
|
# print(f"Labels shape: {labels.shape}")
|
|
|
|
# # 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)
|
|
# print("\n--- Data Types ---")
|
|
# print(f"Features dtype: {features.dtype}")
|
|
# print(f"Labels dtype: {labels.dtype}")
|
|
|
|
# # 4. Peek at the actual data for the very first article in this batch
|
|
# print("\n--- First Article Peek ---")
|
|
# print(f"Label: {labels[0].item()} (0 = Real, 1 = Fake)")
|
|
# print(f"Tokens (first 20 IDs): {features[0][:20]}")
|
|
|
|
# %%
class BaseModel(nn.Module):
    """Bag-of-embeddings baseline: average the word vectors, then a small MLP."""

    def __init__(self, vocab_size, embed_dim=32, h1=256, h2=128, out_features=2):
        super().__init__()

        # Embedding layer: turns word ids into dense numerical vectors.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

        # Fully-connected layers that classify the averaged article vector.
        self.fc1 = nn.Linear(embed_dim, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.out = nn.Linear(h2, out_features)

    def forward(self, x):
        # x: (batch_size, sequence_length) integer token ids, e.g. (64, 256).
        embedded = self.embedding(x)

        # Collapse the sequence dimension: one mean vector per article.
        pooled = embedded.mean(dim=1)

        # Two ReLU hidden layers, then raw class scores (logits) for Fake/Real.
        hidden = F.relu(self.fc2(F.relu(self.fc1(pooled))))
        return self.out(hidden)
|
|
# Instantiate the baseline model with the full training vocabulary size.
model_basic = BaseModel(vocab_size=len(vocab))
|
|
|
|
# %% [markdown]
|
|
"""
|
|
'Advanced'
|
|
"""
|
|
|
|
# %%
class advanced_model(nn.Module):
    """Bidirectional GRU classifier: embedding -> stacked BiGRU -> linear head."""

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128, num_layer=2, out_features=2):
        super().__init__()

        # 1. The Embedding Layer (same as the baseline model).
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

        # 2. The stacked bidirectional GRU.
        # batch_first=True is required because the DataLoader yields
        # (batch_size, sequence_length) batches.
        # FIX: num_layers was hard-coded to 2, silently ignoring the
        # `num_layer` constructor argument.
        # Inter-layer dropout only applies when there is more than one layer,
        # so disable it for num_layer == 1 (avoids a PyTorch warning).
        self.gru = nn.GRU(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layer,
            batch_first=True,
            bidirectional=True,
            dropout=0.3 if num_layer > 1 else 0.0,
        )

        # 3. The final output layer: maps the concatenated forward+backward
        # final hidden states (2 * hidden_dim) to the Real/Fake scores.
        # (An unused single-direction `self.out` linear layer was removed;
        # only `self.fc` was ever used in forward.)
        self.fc = nn.Linear(hidden_dim * 2, out_features)

    def forward(self, x):
        # x: (batch_size, sequence_length) integer token ids, e.g. (64, 256).

        # Word embeddings: -> (batch, seq_len, embed_dim).
        x = self.embedding(x)

        # The GRU returns (per-step outputs, final hidden states); we only
        # need the final hidden states, shaped (num_layers * 2, batch, hidden_dim).
        _, hidden = self.gru(x)

        # hidden[-2] is the top layer's final forward state,
        # hidden[-1] is the top layer's final backward state.
        out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

        return self.fc(out)
|
|
|
|
# Initialize the advanced model with the full training vocabulary size.
model_adv = advanced_model(vocab_size=len(vocab))
|
|
|
|
# %% [markdown]
|
|
"""
|
|
# Training
|
|
|
|
"""
|
|
|
|
# %%
# Use the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
|
# %%
def evaluate_performance(model, dataloader, device):
    """Run `model` over `dataloader` and return (accuracy in %, macro F1)."""
    model.eval()  # evaluation mode: disables dropout etc.

    all_predictions = []
    all_true_labels = []

    # Gradients are not needed for evaluation — saves memory and time.
    with torch.no_grad():
        for features, labels in dataloader:
            features = features.to(device)
            labels = labels.to(device)

            # Raw class scores -> predicted class index (0 or 1).
            scores = model(features)
            predictions = scores.argmax(dim=1)

            all_predictions.extend(predictions.cpu().tolist())
            all_true_labels.extend(labels.cpu().tolist())

    preds = np.array(all_predictions)
    truth = np.array(all_true_labels)

    accuracy = (preds == truth).mean() * 100

    # Macro-averaged F1 weights both classes equally, regardless of support.
    f1 = f1_score(truth, preds, average='macro')

    model.train()  # restore training mode for any subsequent training
    return accuracy, f1
|
|
|
|
|
|
# %%
def train_model(model, train_loader, val_loader, device, epochs=5, lr=0.001):
    """Train `model` with Adam + cross-entropy loss, validating after each epoch.

    Returns a history dict with per-epoch train loss, validation accuracy
    and validation F1 for later plotting.
    """
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Per-epoch metrics collected for the report.
    history = {'train_loss': [], 'val_acc': [], 'val_f1': []}

    print(f"Training {model.__class__.__name__} on {device}...")

    for epoch in range(epochs):
        model.train()
        running_loss = 0

        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)

            # Standard step: reset grads, forward, loss, backward, update.
            optimizer.zero_grad()
            loss = criterion(model(features), labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_loss = running_loss / len(train_loader)

        # Validation pass at the end of every epoch.
        val_acc, val_f1 = evaluate_performance(model, val_loader, device)

        history['train_loss'].append(avg_loss)
        history['val_acc'].append(val_acc)
        history['val_f1'].append(val_f1)

        print(f"\n Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f} \n Val Acc: {val_acc:.2f}% \n Val F1: {val_f1:.4f}")

    return history
|
|
|
|
# %%
# Train the baseline model for 7 epochs and keep its metric history.
train_995_basic = train_model(model_basic, train_dataloader, val_dataloader, device, epochs=7)
print(train_995_basic)
|
|
|
|
# %%
# Train the advanced (GRU) model for 7 epochs and keep its metric history.
train_995_adv = train_model(model_adv, train_dataloader, val_dataloader, device, epochs=7)
print(train_995_adv)
|
|
|
|
# %%
|
|
|
|
|
|
# %%
|
|
|
|
|
|
# %% [markdown]
|
|
"""
|
|
# Evaluation
|
|
"""
|
|
|
|
# %% [markdown]
|
|
"""
|
|
Basic model
|
|
"""
|
|
|
|
# %%
|
|
|
|
# # 1. The Evaluation Function
|
|
# def evaluate_performance(model, dataloader, device):
|
|
# model.eval() # Put model in evaluation mode
|
|
|
|
# all_predictions = []
|
|
# all_true_labels = []
|
|
|
|
# # Turn off gradient tracking to save memory
|
|
# with torch.no_grad():
|
|
# for features, labels in dataloader:
|
|
# features = features.to(device)
|
|
# labels = labels.to(device)
|
|
|
|
# # Get model scores
|
|
# scores = model(features)
|
|
|
|
# # Find the predicted class (0 or 1)
|
|
# _, predictions = torch.max(scores,1)
|
|
|
|
# # Save predictions and actual labels to lists
|
|
# # all_predictions.extend(predictions.cpu().tolist())
|
|
# # all_true_labels.extend(labels.cpu().tolist())
|
|
# all_predictions.extend(predictions.cpu().numpy().flatten().tolist())
|
|
# all_true_labels.extend(labels.cpu().numpy().flatten().tolist())
|
|
|
|
# all_predictions = np.array(all_predictions)
|
|
# all_true_labels = np.array(all_true_labels)
|
|
|
|
# accuracy = (all_predictions == all_true_labels).mean() * 100
|
|
|
|
# # 4. Calculate F1 Score
|
|
# # average='macro' is best for your report to show you care about both classes equally
|
|
# f1 = f1_score(all_true_labels, all_predictions, average='macro')
|
|
# model.train() # Return model to training mode just in case
|
|
# return accuracy, f1
|
|
# # # Change me based on the model
|
|
|
|
# # model = model_basic.to(device)
|
|
|
|
|
|
# # print(f"Training on: {device}")
|
|
|
|
# # # 2. Setup Loss and Optimizer
|
|
# # # CrossEntropyLoss is the standard for classification tasks
|
|
# # criterion = nn.CrossEntropyLoss()
|
|
# # # Adam is a very reliable, fast optimizer
|
|
# # optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
|
|
|
|
# # # 3. The Training Loop
|
|
# # epochs = 7# Start with a small number of passes through the whole dataset
|
|
|
|
# # for epoch in range(epochs):
|
|
# # model.train() # Tell the model it is in training mode
|
|
# # total_loss = 0
|
|
|
|
# # # Loop through our batches of 64 articles
|
|
# # for batch_idx, (features, labels) in enumerate(train_dataloader):
|
|
|
|
# # # Move data to the same device as the model (GPU/CPU)
|
|
# # features = features.to(device)
|
|
# # labels = labels.to(device)
|
|
|
|
# # # Step A: Reset the optimizer's gradients
|
|
# # optimizer.zero_grad()
|
|
|
|
# # # Step B: Forward Pass (Have the model guess Real or Fake)
|
|
# # predictions = model(features)
|
|
|
|
# # # Step C: Calculate Loss (How wrong were the guesses?)
|
|
# # loss = criterion(predictions, labels)
|
|
|
|
# # # Step D: Backward Pass (Calculate how to fix the math)
|
|
# # loss.backward()
|
|
|
|
# # # Step E: Optimize (Actually apply the fixes to the model's weights)
|
|
# # optimizer.step()
|
|
|
|
# # total_loss += loss.item()
|
|
|
|
# # # Print an update every 100 batches so we know it's working
|
|
# # if batch_idx % 100 == 0:
|
|
# # print(f"Epoch [{epoch+1}/{epochs}] | Batch {batch_idx} | Loss: {loss.item():.4f}")
|
|
|
|
# # # Print the average loss at the end of each epoch
|
|
# # avg_loss = total_loss / len(train_dataloader)
|
|
# # print(f"--- End of Epoch {epoch+1} | Average Loss: {avg_loss:.4f} ---")
|
|
|
|
|
|
# %% [markdown]
|
|
"""
|
|
Advanced model
|
|
|
|
"""
|
|
|
|
# %%
|
|
|
|
# # 1. The Evaluation Function
|
|
# def evaluate_performance(model_adv, dataloader, device):
|
|
# model_adv.eval() # Put model in evaluation mode
|
|
|
|
# all_predictions = []
|
|
# all_true_labels = []
|
|
|
|
# # Turn off gradient tracking to save memory
|
|
# with torch.no_grad():
|
|
# for features, labels in dataloader:
|
|
# features = features.to(device)
|
|
# labels = labels.to(device)
|
|
|
|
# # Get model scores
|
|
# scores = model_adv(features)
|
|
|
|
# # Find the predicted class (0 or 1)
|
|
# _, predictions = scores.max(1)
|
|
|
|
# # Save predictions and actual labels to lists
|
|
# all_predictions.extend(predictions.cpu().tolist())
|
|
# all_true_labels.extend(labels.cpu().tolist())
|
|
|
|
# # Calculate Accuracy
|
|
# correct_guesses = sum(p == t for p, t in zip(all_predictions, all_true_labels))
|
|
# accuracy = (correct_guesses / len(all_true_labels)) * 100
|
|
|
|
# # Calculate F1 Score
|
|
# f1 = f1_score(all_true_labels, all_predictions, average='macro')
|
|
|
|
# model_adv.train() # Return model to training mode just in case
|
|
# return accuracy, f1
|
|
|
|
|
|
|
|
# %%
# Re-select the compute device (harmless duplicate of the earlier assignment).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
|
|
|
# %%
# Evaluate the basic model on the validation and test splits.
# FIX: this cell referenced an undefined module-level name `model`;
# the baseline model is bound to `model_basic`.
print("Basic model ")
print(" Validation ")
val_acc995, val_f1_995 = evaluate_performance(model_basic, val_dataloader, device)
print(f"Validation Accuracy: {val_acc995:.2f}%")
print(f"Validation F1 Score: {val_f1_995:.4f}")

print("\n Testing Phase ")
test_acc995, test_f1_995 = evaluate_performance(model_basic, test_dataloader, device)
print(f"Test Accuracy: {test_acc995:.2f}%")
# FIX: stray "git" removed from the output message.
print(f"Test F1 Score: {test_f1_995:.4f}")
|
|
|
|
# %%
# Evaluate the advanced (GRU) model on the validation and test splits.
print(" GRU model ")
print(" Validation ")
adv_val_acc995, adv_val_f1_995 = evaluate_performance(model_adv, val_dataloader, device)
print(f"Validation Accuracy: {adv_val_acc995:.2f}%")
# FIX: this previously printed `val_f1_995` — the BASIC model's F1 —
# instead of the advanced model's own validation F1.
print(f"Validation F1 Score: {adv_val_f1_995:.4f}")

print("\n Testing ")
test_acc, test_f1 = evaluate_performance(model_adv, test_dataloader, device)
# FIX: referenced undefined `test_acc955` (typo for `test_acc`).
print(f"Test Accuracy: {test_acc:.2f}%")
# FIX: stray "git" removed from the output message.
print(f"Test F1 Score: {test_f1:.4f}")
|
|
|
|
# %% [markdown]
|
|
"""
|
|
# Liar data
|
|
|
|
|
|
"""
|
|
|
|
# %%
from helper import LIAR_labelling

# Load the LIAR benchmark split (tokens + raw type only).
# FIX: removed a stray, no-op f-string path literal that evaluated to
# nothing ("../../data/training/LIAR.parquet" was never assigned or used).
df_LIAR = pd.read_parquet("../../data/testing/LIAR.parquet", columns=['tokens', 'type'])

# Map LIAR's fine-grained truthfulness ratings to the binary labels used by
# the models, then drop the raw column.
df_LIAR['label'] = df_LIAR['type'].apply(LIAR_labelling).astype(str)
df_LIAR['label'] = df_LIAR['label'].map(label_map).astype(int)
df_LIAR = df_LIAR.drop(columns=['type'])
|
|
|
|
# %%
# Quick visual sanity check of the processed LIAR frame.
df_LIAR.head()
|
|
|
|
# %%
# FIX: the vocabulary must NOT be rebuilt from the LIAR evaluation data.
# The models were trained with embeddings indexed by the *training*
# vocabulary; rebuilding `vocab` here remapped every word id, making the
# learned embedding rows meaningless at evaluation time (and leaking
# evaluation data into preprocessing). We keep the training vocabulary as-is
# and let out-of-vocabulary LIAR words fall back to <UNK> (id 1).
print(f"Reusing training vocabulary with {len(vocab)} words for LIAR evaluation.")
|
|
|
|
# %%
# Wrap the LIAR frame in the same Dataset/DataLoader pipeline as the other splits.
LR_DATA = FakeNewsDataset(dataframe=df_LIAR, vocab=vocab, max_length=256)
LR_dataloader = DataLoader(LR_DATA, batch_size=32, shuffle=False)
|
|
|
|
# %%
# Sanity-check one batch from the LIAR dataloader.
features, labels = next(iter(LR_dataloader))

# 2. Check the shapes (the dimensions of the tensors).
print("--- Tensor Shapes ---")
print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")

# 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers).
print("\n--- Data Types ---")
print(f"Features dtype: {features.dtype}")
print(f"Labels dtype: {labels.dtype}")

# 4. Peek at the actual data for the very first article in this batch.
print("\n--- First Article Peek ---")
# FIX: the legend was inverted — label_map defines Label.FAKE -> 0 and
# Label.REAL -> 1, so 0 is Fake and 1 is Real.
print(f"Label: {labels[0].item()} (0 = Fake, 1 = Real)")
print(f"Tokens (first 20 IDs): {features[0][:20]}")
|
|
|
|
# %%
|
|
# # 1. Check a single sample from the Dataset directly
|
|
# single_features, single_label = LR_DATA[0]
|
|
# print(f"Single Sample - Features: {single_features.shape}, Label: {single_label.shape}")
|
|
|
|
# # 2. Check the DataLoader batch
|
|
# batch_features, batch_labels = next(iter(LR_dataloader))
|
|
# # print(f"Batch - Features: {batch_features.shape}, Labels: {batch_labels.shape}")
|
|
|
|
# %%
# Evaluate the advanced (GRU) model on the out-of-domain LIAR data.
# FIX: removed a duplicate evaluate_performance call whose result was
# discarded — it ran the full evaluation twice for nothing.
print("\n--- 2. Testing Advanced model ---")
test_acc, test_f1 = evaluate_performance(model_adv, LR_dataloader, device)
print(f"Test Accuracy: {test_acc:.2f}%")
# FIX: stray "git" removed from the output message.
print(f"Test F1 Score: {test_f1:.4f}")
|
|
|
|
# %%
# Evaluate the baseline model on the out-of-domain LIAR data.
# FIX: referenced an undefined module-level name `model`;
# the baseline model is bound to `model_basic`.
print("\n--- 2. Testing BASE-Model ---")
test_acc, test_f1 = evaluate_performance(model_basic, LR_dataloader, device)
print(f"Test Accuracy: {test_acc:.2f}%")
# FIX: stray "git" removed from the output message.
print(f"Test F1 Score: {test_f1:.4f}")
|
|
|
|
# %%
|
|
|
|
|