backup since codeberg is down
This commit is contained in:
579
src/models/nn.ju.py
Normal file
579
src/models/nn.ju.py
Normal file
@@ -0,0 +1,579 @@
|
||||
# %%
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import pandas as pd
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
from collections import Counter
|
||||
import os
|
||||
import sys
|
||||
sys.path.append(os.path.join(os.getcwd(), '../'))
|
||||
from helper import default_labelling
|
||||
from sklearn.metrics import f1_score
|
||||
import numpy as np
|
||||
|
||||
|
||||
# %%
# Map the string labels produced by the labelling helpers onto integer
# class ids: fake news -> 0, real news -> 1.
label_map = {
    'Label.FAKE': 0,
    'Label.REAL': 1,
}
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
# Pipelining process
|
||||
"""
|
||||
|
||||
# %%
def _load_split(path):
    """Load one parquet split and attach an integer 'label' column.

    Reads only the 'tokens' and 'type' columns, maps the raw 'type'
    through default_labelling and then label_map, and drops the raw
    'type' column.  Extracted because the training / testing /
    validation cells repeated this logic verbatim three times.
    """
    split = pd.read_parquet(path, columns=['tokens', 'type'])
    split['label'] = split['type'].apply(default_labelling).astype(str)
    split['label'] = split['label'].map(label_map).astype(int)
    return split.drop(columns=['type'])


df = _load_split("../../data/training/995,000_rows.parquet")

# %%
df_test = _load_split("../../data/testing/995,000_rows.parquet")

# %%
df_val = _load_split("../../data/validation/995,000_rows.parquet")
|
||||
|
||||
# %%
|
||||
# print("Loading Parquet file...")
|
||||
|
||||
# # Check the total number of rows (articles)
|
||||
# print(f"Total rows in the raw Parquet file: {len(df)}")
|
||||
|
||||
# # Look at the first few rows to make sure the data looks correct
|
||||
# print("\n--- First 3 Rows ---")
|
||||
# print(df.head(3))
|
||||
|
||||
# %%
|
||||
# count how many tokens we have in the corpuse
|
||||
word_counts = Counter()
|
||||
for x in df['tokens']:
|
||||
word_counts.update(x)
|
||||
|
||||
# Keep the top 50,000 words.
|
||||
# Index 0 is for <PAD> (padding), Index 1 is for <UNK> (unknown words)
|
||||
vocab = {"<PAD>": 0, "<UNK>": 1}
|
||||
for idx, (word, count) in enumerate(word_counts.most_common(50000), start=2):
|
||||
vocab[word] = idx
|
||||
|
||||
print(f"Vocabulary built with {len(vocab)} words.")
|
||||
|
||||
# %%
# A wrapper for the data that PyTorch's DataLoader knows how to talk to.
class FakeNewsDataset(Dataset):
    """Yield fixed-length (token_ids, label) tensor pairs from a dataframe.

    Each article's tokens are converted to integer ids via ``vocab``
    (words missing from the vocabulary map to ``UNK_ID``), then truncated
    or right-padded with ``PAD_ID`` so every sample is exactly
    ``max_length`` long.
    """

    # Must stay in sync with the vocabulary built above
    # (vocab["<PAD>"] == 0 and vocab["<UNK>"] == 1).
    PAD_ID = 0
    UNK_ID = 1

    def __init__(self, dataframe, vocab, max_length=256):
        self.dataframe = dataframe
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        # Number of articles; DataLoader calls this to know when an epoch ends.
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Grab one article and its label (single .iloc lookup instead of two).
        row = self.dataframe.iloc[idx]
        tokens = row['tokens']
        label = row['label']

        # Convert text tokens to integer ids; unseen words become UNK_ID.
        article_ids = [self.vocab.get(word, self.UNK_ID) for word in tokens]

        # Truncate or pad so the sequence is exactly max_length long.
        if len(article_ids) > self.max_length:
            article_ids = article_ids[:self.max_length]
        else:
            article_ids.extend([self.PAD_ID] * (self.max_length - len(article_ids)))

        # Return PyTorch tensors (long dtype, as nn.Embedding requires).
        return (torch.tensor(article_ids, dtype=torch.long),
                torch.tensor(label, dtype=torch.long))
|
||||
|
||||
|
||||
# %%
## Prepare the DataLoaders.
# Wrap each dataframe in the Dataset class; the DataLoader then feeds the
# model batches of 64 articles at a time, which keeps RAM usage bounded.

my_train_dataset = FakeNewsDataset(dataframe=df, vocab=vocab, max_length=256)

# shuffle=True for training so batch order changes every epoch and the
# model does not simply memorise the data.
train_dataloader = DataLoader(
    my_train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4,     # Start with 4; if CPU stays cool, try 6
    pin_memory=True,   # Essential for fast data transfer
    prefetch_factor=2,
)

val_data = FakeNewsDataset(dataframe=df_val, vocab=vocab, max_length=256)
val_dataloader = DataLoader(val_data, batch_size=64, shuffle=False)

test_data = FakeNewsDataset(dataframe=df_test, vocab=vocab, max_length=256)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=False)
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
Checking if the data conversion works
|
||||
"""
|
||||
|
||||
# %%
|
||||
# features, labels = next(iter(train_dataloader))
|
||||
# # 2. Check the shapes (the dimensions of your tensors)
|
||||
# print("--- Tensor Shapes ---")
|
||||
# print(f"Features shape: {features.shape}")
|
||||
# print(f"Labels shape: {labels.shape}")
|
||||
|
||||
# # 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)
|
||||
# print("\n--- Data Types ---")
|
||||
# print(f"Features dtype: {features.dtype}")
|
||||
# print(f"Labels dtype: {labels.dtype}")
|
||||
|
||||
# # 4. Peek at the actual data for the very first article in this batch
|
||||
# print("\n--- First Article Peek ---")
|
||||
# print(f"Label: {labels[0].item()} (0 = Real, 1 = Fake)")
|
||||
# print(f"Tokens (first 20 IDs): {features[0][:20]}")
|
||||
|
||||
# %%
class BaseModel(nn.Module):
    """Bag-of-embeddings baseline classifier.

    Embeds each token id, mean-pools the embeddings over the sequence,
    and classifies the pooled vector with a two-layer MLP that outputs
    raw scores for the two classes (Fake / Real).
    """

    def __init__(self, vocab_size, embed_dim=32, h1=256, h2=128, out_features=2):
        super().__init__()
        # Embedding layer: turns word ids into dense numerical vectors.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # Fully connected layers that learn the Fake-vs-Real decision.
        self.fc1 = nn.Linear(embed_dim, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.out = nn.Linear(h2, out_features)

    def forward(self, x):
        # x: (batch_size, sequence_length) integer ids, e.g. (64, 256).
        embedded = self.embedding(x)      # -> (batch, seq_len, embed_dim)
        # Average the word vectors into one vector per article.
        pooled = embedded.mean(dim=1)     # -> (batch, embed_dim)

        # Hidden layers with ReLU activations.
        hidden = F.relu(self.fc1(pooled))
        hidden = F.relu(self.fc2(hidden))

        # Raw (unnormalised) class scores.
        return self.out(hidden)
|
||||
# Instantiate the baseline model sized to the training vocabulary.
model_basic = BaseModel(vocab_size=len(vocab))
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
'Advanced'
|
||||
"""
|
||||
|
||||
# %%
|
||||
|
||||
class advanced_model(nn.Module):
    """Bidirectional multi-layer GRU classifier.

    Embeds the token ids, runs them through a (by default 2-layer)
    bidirectional GRU, concatenates the final forward and backward
    hidden states, and projects them to the class scores.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128, num_layer=2, out_features=2):
        super().__init__()

        # 1. The Embedding Layer (same idea as the baseline model).
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

        # 2. The GRU layer.
        # batch_first=True is required because the DataLoader outputs
        # (batch_size, sequence_length).
        # BUG FIX: num_layers was hard-coded to 2, silently ignoring the
        # num_layer argument.  Inter-layer dropout is only meaningful when
        # there is more than one layer, so disable it for num_layer == 1.
        self.gru = nn.GRU(input_size=embed_dim,
                          hidden_size=hidden_dim,
                          num_layers=num_layer,
                          batch_first=True,
                          bidirectional=True,
                          dropout=0.3 if num_layer > 1 else 0.0)

        # 3. Output layers.
        # NOTE(review): self.out is never used in forward() — self.fc is the
        # real classification head.  Kept only so existing checkpoints with
        # 'out.*' keys still load; safe to delete otherwise.
        self.out = nn.Linear(hidden_dim, out_features)
        # Bidirectional GRU => classifier sees 2 * hidden_dim features.
        self.fc = nn.Linear(hidden_dim * 2, out_features)

    def forward(self, x):
        # x: (batch_size, sequence_length) integer ids, e.g. (64, 256).
        x = self.embedding(x)  # -> (batch, seq_len, embed_dim)

        # The GRU returns (per-step outputs, final hidden states); only the
        # final hidden states are needed.  hidden has shape
        # (num_layers * 2, batch, hidden_dim) for a bidirectional GRU.
        _, hidden = self.gru(x)

        # Concatenate the last layer's forward (hidden[-2]) and backward
        # (hidden[-1]) states into one (batch, 2 * hidden_dim) vector.
        out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

        return self.fc(out)
|
||||
|
||||
# Initialize the advanced (GRU) model with the training vocabulary size.
model_adv = advanced_model(vocab_size=len(vocab))
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
# Training
|
||||
|
||||
"""
|
||||
|
||||
# %%
|
||||
# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
||||
|
||||
# %%
|
||||
def evaluate_performance(model, dataloader, device):
    """Compute accuracy (%) and macro-F1 of `model` over `dataloader`.

    Runs one full pass in eval mode with gradients disabled, collects the
    argmax predictions and true labels, restores train mode, and returns
    the pair (accuracy_percent, macro_f1).
    """
    model.eval()  # evaluation mode (disables dropout etc.)

    all_predictions = []
    all_true_labels = []

    # Gradient tracking is unnecessary for inference and wastes memory.
    with torch.no_grad():
        for features, labels in dataloader:
            features = features.to(device)
            labels = labels.to(device)

            # Raw class scores -> predicted class index (0 or 1).
            scores = model(features)
            predictions = scores.argmax(dim=1)

            # .tolist() on a 1-D tensor already yields flat Python ints,
            # so the previous .numpy().flatten() round-trip was redundant.
            all_predictions.extend(predictions.cpu().tolist())
            all_true_labels.extend(labels.cpu().tolist())

    all_predictions = np.array(all_predictions)
    all_true_labels = np.array(all_true_labels)

    accuracy = (all_predictions == all_true_labels).mean() * 100

    # average='macro' weights both classes equally regardless of support.
    f1 = f1_score(all_true_labels, all_predictions, average='macro')

    model.train()  # return model to training mode just in case
    return accuracy, f1
|
||||
|
||||
|
||||
# %%
|
||||
def train_model(model, train_loader, val_loader, device, epochs=5, lr=0.001):
    """Train `model` and evaluate on `val_loader` after every epoch.

    Uses CrossEntropyLoss with the Adam optimizer and returns a history
    dict with per-epoch 'train_loss', 'val_acc' and 'val_f1' lists for
    later plotting.
    """
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Per-epoch metrics, collected for the report.
    history = {'train_loss': [], 'val_acc': [], 'val_f1': []}

    print(f"Training {model.__class__.__name__} on {device}...")

    for epoch in range(epochs):
        model.train()
        running_loss = 0

        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)

            # Standard step: zero grads, forward, loss, backward, update.
            optimizer.zero_grad()
            predictions = model(features)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_loss = running_loss / len(train_loader)

        # Evaluate on the validation set at the end of each epoch.
        val_acc, val_f1 = evaluate_performance(model, val_loader, device)

        history['train_loss'].append(avg_loss)
        history['val_acc'].append(val_acc)
        history['val_f1'].append(val_f1)

        print(f"\n Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f} \n Val Acc: {val_acc:.2f}% \n Val F1: {val_f1:.4f}")

    return history
|
||||
|
||||
# %%
# Train the bag-of-embeddings baseline and keep its metric history.
train_995_basic = train_model(model_basic, train_dataloader, val_dataloader, device, epochs=7)
print(train_995_basic)

# %%
# Train the bidirectional-GRU model on the same splits.
train_995_adv = train_model(model_adv, train_dataloader, val_dataloader, device, epochs=7)
print(train_995_adv)
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
# Evaluation
|
||||
"""
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
Basic model
|
||||
"""
|
||||
|
||||
# %%
|
||||
|
||||
# # 1. The Evaluation Function
|
||||
# def evaluate_performance(model, dataloader, device):
|
||||
# model.eval() # Put model in evaluation mode
|
||||
|
||||
# all_predictions = []
|
||||
# all_true_labels = []
|
||||
|
||||
# # Turn off gradient tracking to save memory
|
||||
# with torch.no_grad():
|
||||
# for features, labels in dataloader:
|
||||
# features = features.to(device)
|
||||
# labels = labels.to(device)
|
||||
|
||||
# # Get model scores
|
||||
# scores = model(features)
|
||||
|
||||
# # Find the predicted class (0 or 1)
|
||||
# _, predictions = torch.max(scores,1)
|
||||
|
||||
# # Save predictions and actual labels to lists
|
||||
# # all_predictions.extend(predictions.cpu().tolist())
|
||||
# # all_true_labels.extend(labels.cpu().tolist())
|
||||
# all_predictions.extend(predictions.cpu().numpy().flatten().tolist())
|
||||
# all_true_labels.extend(labels.cpu().numpy().flatten().tolist())
|
||||
|
||||
# all_predictions = np.array(all_predictions)
|
||||
# all_true_labels = np.array(all_true_labels)
|
||||
|
||||
# accuracy = (all_predictions == all_true_labels).mean() * 100
|
||||
|
||||
# # 4. Calculate F1 Score
|
||||
# # average='macro' is best for your report to show you care about both classes equally
|
||||
# f1 = f1_score(all_true_labels, all_predictions, average='macro')
|
||||
# model.train() # Return model to training mode just in case
|
||||
# return accuracy, f1
|
||||
# # # Change me based on the model
|
||||
|
||||
# # model = model_basic.to(device)
|
||||
|
||||
|
||||
# # print(f"Training on: {device}")
|
||||
|
||||
# # # 2. Setup Loss and Optimizer
|
||||
# # # CrossEntropyLoss is the standard for classification tasks
|
||||
# # criterion = nn.CrossEntropyLoss()
|
||||
# # # Adam is a very reliable, fast optimizer
|
||||
# # optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
|
||||
|
||||
# # # 3. The Training Loop
|
||||
# # epochs = 7# Start with a small number of passes through the whole dataset
|
||||
|
||||
# # for epoch in range(epochs):
|
||||
# # model.train() # Tell the model it is in training mode
|
||||
# # total_loss = 0
|
||||
|
||||
# # # Loop through our batches of 64 articles
|
||||
# # for batch_idx, (features, labels) in enumerate(train_dataloader):
|
||||
|
||||
# # # Move data to the same device as the model (GPU/CPU)
|
||||
# # features = features.to(device)
|
||||
# # labels = labels.to(device)
|
||||
|
||||
# # # Step A: Reset the optimizer's gradients
|
||||
# # optimizer.zero_grad()
|
||||
|
||||
# # # Step B: Forward Pass (Have the model guess Real or Fake)
|
||||
# # predictions = model(features)
|
||||
|
||||
# # # Step C: Calculate Loss (How wrong were the guesses?)
|
||||
# # loss = criterion(predictions, labels)
|
||||
|
||||
# # # Step D: Backward Pass (Calculate how to fix the math)
|
||||
# # loss.backward()
|
||||
|
||||
# # # Step E: Optimize (Actually apply the fixes to the model's weights)
|
||||
# # optimizer.step()
|
||||
|
||||
# # total_loss += loss.item()
|
||||
|
||||
# # # Print an update every 100 batches so we know it's working
|
||||
# # if batch_idx % 100 == 0:
|
||||
# # print(f"Epoch [{epoch+1}/{epochs}] | Batch {batch_idx} | Loss: {loss.item():.4f}")
|
||||
|
||||
# # # Print the average loss at the end of each epoch
|
||||
# # avg_loss = total_loss / len(train_dataloader)
|
||||
# # print(f"--- End of Epoch {epoch+1} | Average Loss: {avg_loss:.4f} ---")
|
||||
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
Advanced model
|
||||
|
||||
"""
|
||||
|
||||
# %%
|
||||
|
||||
# # 1. The Evaluation Function
|
||||
# def evaluate_performance(model_adv, dataloader, device):
|
||||
# model_adv.eval() # Put model in evaluation mode
|
||||
|
||||
# all_predictions = []
|
||||
# all_true_labels = []
|
||||
|
||||
# # Turn off gradient tracking to save memory
|
||||
# with torch.no_grad():
|
||||
# for features, labels in dataloader:
|
||||
# features = features.to(device)
|
||||
# labels = labels.to(device)
|
||||
|
||||
# # Get model scores
|
||||
# scores = model_adv(features)
|
||||
|
||||
# # Find the predicted class (0 or 1)
|
||||
# _, predictions = scores.max(1)
|
||||
|
||||
# # Save predictions and actual labels to lists
|
||||
# all_predictions.extend(predictions.cpu().tolist())
|
||||
# all_true_labels.extend(labels.cpu().tolist())
|
||||
|
||||
# # Calculate Accuracy
|
||||
# correct_guesses = sum(p == t for p, t in zip(all_predictions, all_true_labels))
|
||||
# accuracy = (correct_guesses / len(all_true_labels)) * 100
|
||||
|
||||
# # Calculate F1 Score
|
||||
# f1 = f1_score(all_true_labels, all_predictions, average='macro')
|
||||
|
||||
# model_adv.train() # Return model to training mode just in case
|
||||
# return accuracy, f1
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
print("Basic model ")
print(" Validation ")
# BUG FIX: this cell referenced the undefined name `model`; the basic
# model is stored in `model_basic`.
val_acc995, val_f1_995 = evaluate_performance(model_basic, val_dataloader, device)
print(f"Validation Accuracy: {val_acc995:.2f}%")
print(f"Validation F1 Score: {val_f1_995:.4f}")

print("\n Testing Phase ")
test_acc995, test_f1_995 = evaluate_performance(model_basic, test_dataloader, device)
print(f"Test Accuracy: {test_acc995:.2f}%")
# BUG FIX: removed a stray "git" that had been pasted into the output string.
print(f"Test F1 Score: {test_f1_995:.4f}")
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
print(" GRU model ")  # BUG FIX: output string was misspelled "GURU"
print(" Validation ")
adv_val_acc995, val_f1995 = evaluate_performance(model_adv, val_dataloader, device)
print(f"Validation Accuracy: {adv_val_acc995:.2f}%")
# BUG FIX: this printed val_f1_995 (the *basic* model's score) instead of
# the GRU model's own val_f1995.
print(f"Validation F1 Score: {val_f1995:.4f}")

print("\n Testing ")
test_acc, test_f1 = evaluate_performance(model_adv, test_dataloader, device)
# BUG FIX: referenced the undefined name test_acc955; the result returned
# above is in test_acc.  Also removed the stray "git" in the F1 line.
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: {test_f1:.4f}")
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
# Liar data
|
||||
|
||||
|
||||
"""
|
||||
|
||||
# %%
|
||||
from helper import LIAR_labelling

# BUG FIX: removed a stray no-op f-string expression that referenced
# "../../data/training/LIAR.parquet" without using or assigning it.
# The LIAR split actually evaluated is the one under data/testing/.
df_LIAR = pd.read_parquet("../../data/testing/LIAR.parquet", columns=['tokens', 'type'])

# Map LIAR's truth ratings onto the same binary fake/real integer labels
# used for the main corpus, then drop the raw 'type' column.
df_LIAR['label'] = df_LIAR['type'].apply(LIAR_labelling).astype(str)
df_LIAR['label'] = df_LIAR['label'].map(label_map).astype(int)
df_LIAR = df_LIAR.drop(columns=['type'])

# %%
df_LIAR.head()
|
||||
|
||||
# %%
|
||||
# BUG FIX: the original cell rebuilt `vocab` (and `word_counts`) from the
# LIAR evaluation data.  That reassigns every token id, so the ids fed to
# the models no longer correspond to the embedding rows they were trained
# with, which invalidates any evaluation on LIAR.  The training vocabulary
# must be reused instead; LIAR words missing from it correctly fall back
# to <UNK> (id 1) inside FakeNewsDataset.
print(f"Reusing the training vocabulary with {len(vocab)} words.")
|
||||
|
||||
# %%
|
||||
|
||||
# Wrap the LIAR dataframe with the shared vocabulary; no shuffling for evaluation.
LR_DATA = FakeNewsDataset(dataframe=df_LIAR, vocab=vocab, max_length=256)
LR_dataloader = DataLoader(LR_DATA, batch_size=32, shuffle=False)
|
||||
|
||||
# %%
|
||||
features, labels = next(iter(LR_dataloader))
# 2. Check the shapes (the dimensions of your tensors)
print("--- Tensor Shapes ---")
print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")

# 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)
print("\n--- Data Types ---")
print(f"Features dtype: {features.dtype}")
print(f"Labels dtype: {labels.dtype}")

# 4. Peek at the actual data for the very first article in this batch
print("\n--- First Article Peek ---")
# BUG FIX: the printed legend was inverted; label_map assigns
# 'Label.FAKE' -> 0 and 'Label.REAL' -> 1.
print(f"Label: {labels[0].item()} (0 = Fake, 1 = Real)")
print(f"Tokens (first 20 IDs): {features[0][:20]}")
|
||||
|
||||
# %%
|
||||
# # 1. Check a single sample from the Dataset directly
|
||||
# single_features, single_label = LR_DATA[0]
|
||||
# print(f"Single Sample - Features: {single_features.shape}, Label: {single_label.shape}")
|
||||
|
||||
# # 2. Check the DataLoader batch
|
||||
# batch_features, batch_labels = next(iter(LR_dataloader))
|
||||
# # print(f"Batch - Features: {batch_features.shape}, Labels: {batch_labels.shape}")
|
||||
|
||||
# %%
|
||||
# BUG FIX: removed a duplicate evaluate_performance(model_adv, ...) call
# whose result was discarded — it only wasted a full pass over LIAR.
# Also fixed the output strings: "Avanced" typo, both cells being numbered
# "2.", and a stray "git" pasted into the F1 lines.
print("\n--- 1. Testing Advanced model ---")
test_acc, test_f1 = evaluate_performance(model_adv, LR_dataloader, device)
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: {test_f1:.4f}")

# %%
print("\n--- 2. Testing BASE-Model ---")
# BUG FIX: referenced the undefined name `model`; the base model is
# stored in `model_basic`.
test_acc, test_f1 = evaluate_performance(model_basic, LR_dataloader, device)
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: {test_f1:.4f}")
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
Reference in New Issue
Block a user