backup since codeberg is down
This commit is contained in:
35
.gitignore
vendored
Normal file
35
.gitignore
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
# Document
|
||||
*.pdf
|
||||
*.bak
|
||||
*.tex.backup
|
||||
*.tex~
|
||||
*.synctex.gz
|
||||
*.out
|
||||
.bak
|
||||
build/
|
||||
_minted/
|
||||
obj/
|
||||
bin/
|
||||
|
||||
# Python
|
||||
__pycache__/
|
||||
.env
|
||||
.envrc
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
.ipynb_checkpoints/
|
||||
|
||||
# data bs
|
||||
data/**
|
||||
!data/
|
||||
!data/**/
|
||||
!data/**/.gitkeep
|
||||
|
||||
# general bs
|
||||
.DS_Store
|
||||
flake.lock
|
||||
.vscode/
|
||||
6
README.md
Normal file
6
README.md
Normal file
@@ -0,0 +1,6 @@
|
||||
- download the necessary dataset files to data/datasets as csv (not zip). Move all tsv files from the LIAR zip file directly into the datasets folder.
|
||||
- run setup.py to set up nltk and to clean and split the datasets. It takes a long time, please wait.
|
||||
- run main.py from the src directory to test the models. The function requires the model type, model file, and dataset to be passed as parameters.
|
||||
Here is an example: python main.py --model_type logistic --model_file logistic.model --data_file 995,000_rows.parquet
|
||||
The model files can be found in the models directory (not the one in src), the data files can be found in data/testing (pass LIAR.parquet to test on LIAR dataset).
|
||||
The model types and more information including how to train models can be found with python main.py --help.
|
||||
457
analysis/Split_analysis.ipynb
Normal file
457
analysis/Split_analysis.ipynb
Normal file
@@ -0,0 +1,457 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3b55d166",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# DO NOT RUN; DATA WILL BE LOST"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "9c2d25e9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd \n",
|
||||
"import os \n",
|
||||
"import sys\n",
|
||||
"sys.path.append(os.path.join(os.getcwd(), '../src'))\n",
|
||||
"from constants import TRAINING_DIR, TESTING_DIR, VALIDATION_DIR\n",
|
||||
"pd.set_option('display.max_columns', None)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cd67fc64",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Time Split "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "a917b0fa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_ty = pd.read_parquet(f\"{TESTING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
|
||||
"train_ty = pd.read_parquet(f\"{TRAINING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
|
||||
"val_ty = pd.read_parquet(f\"{VALIDATION_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "0098d6e4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"rows in train(818843, 1),\n",
|
||||
" rows in test (99499, 1), \n",
|
||||
" rows in validation(76645, 1)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(f'rows in train{train_ty.shape },\\n rows in test {test_ty.shape}, \\n rows in validation{val_ty.shape}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5985a4f3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"timeline = pd.concat([\n",
|
||||
" train_ty.value_counts().rename('train'),\n",
|
||||
" test_ty.value_counts().rename('test'),\n",
|
||||
" val_ty.value_counts().rename('val'),\n",
|
||||
"], axis=1).fillna(0).astype(int)\n",
|
||||
"\n",
|
||||
"timeline.index.name = 'type'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "b0673e19",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>train</th>\n",
|
||||
" <th>test</th>\n",
|
||||
" <th>val</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>type</th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>political</th>\n",
|
||||
" <td>194518</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>bias</th>\n",
|
||||
" <td>133232</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>fake</th>\n",
|
||||
" <td>104883</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>conspiracy</th>\n",
|
||||
" <td>97314</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>rumor</th>\n",
|
||||
" <td>56445</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>unknown</th>\n",
|
||||
" <td>43534</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>reliable</th>\n",
|
||||
" <td>42419</td>\n",
|
||||
" <td>99499</td>\n",
|
||||
" <td>76645</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>unreliable</th>\n",
|
||||
" <td>35332</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>clickbait</th>\n",
|
||||
" <td>27412</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>junksci</th>\n",
|
||||
" <td>14040</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>satire</th>\n",
|
||||
" <td>13160</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>hate</th>\n",
|
||||
" <td>8779</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" train test val\n",
|
||||
"type \n",
|
||||
"political 194518 0 0\n",
|
||||
"bias 133232 0 0\n",
|
||||
"fake 104883 0 0\n",
|
||||
"conspiracy 97314 0 0\n",
|
||||
"rumor 56445 0 0\n",
|
||||
"unknown 43534 0 0\n",
|
||||
"reliable 42419 99499 76645\n",
|
||||
"unreliable 35332 0 0\n",
|
||||
"clickbait 27412 0 0\n",
|
||||
"junksci 14040 0 0\n",
|
||||
"satire 13160 0 0\n",
|
||||
"hate 8779 0 0"
|
||||
]
|
||||
},
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"timeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6bdc7d84",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Random Split "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "cd5ca57b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_ty_R = pd.read_parquet(f\"{TESTING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
|
||||
"train_ty_R = pd.read_parquet(f\"{TRAINING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
|
||||
"val_ty_R = pd.read_parquet(f\"{VALIDATION_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "c793a37c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"rows in train(745724, 1),\n",
|
||||
" rows in test (149766, 1), \n",
|
||||
" rows in validation(99510, 1)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(f'rows in train{train_ty_R.shape },\\n rows in test {test_ty_R.shape}, \\n rows in validation{val_ty_R.shape}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "583304ff",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"timeline_R = pd.concat([\n",
|
||||
" train_ty_R.value_counts().rename('train'),\n",
|
||||
" test_ty_R.value_counts().rename('test'),\n",
|
||||
" val_ty_R.value_counts().rename('val'),\n",
|
||||
"], axis=1).fillna(0).astype(int)\n",
|
||||
"\n",
|
||||
"timeline_R.index.name = 'type'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "d8255b60",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>train</th>\n",
|
||||
" <th>test</th>\n",
|
||||
" <th>val</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>type</th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>reliable</th>\n",
|
||||
" <td>163802</td>\n",
|
||||
" <td>33010</td>\n",
|
||||
" <td>21752</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>political</th>\n",
|
||||
" <td>145779</td>\n",
|
||||
" <td>29241</td>\n",
|
||||
" <td>19498</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>bias</th>\n",
|
||||
" <td>99797</td>\n",
|
||||
" <td>20079</td>\n",
|
||||
" <td>13356</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>fake</th>\n",
|
||||
" <td>78736</td>\n",
|
||||
" <td>15602</td>\n",
|
||||
" <td>10545</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>conspiracy</th>\n",
|
||||
" <td>72837</td>\n",
|
||||
" <td>14676</td>\n",
|
||||
" <td>9801</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>unknown</th>\n",
|
||||
" <td>68468</td>\n",
|
||||
" <td>13754</td>\n",
|
||||
" <td>9098</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>rumor</th>\n",
|
||||
" <td>42254</td>\n",
|
||||
" <td>8553</td>\n",
|
||||
" <td>5638</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>unreliable</th>\n",
|
||||
" <td>26489</td>\n",
|
||||
" <td>5346</td>\n",
|
||||
" <td>3497</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>clickbait</th>\n",
|
||||
" <td>20552</td>\n",
|
||||
" <td>4161</td>\n",
|
||||
" <td>2699</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>junksci</th>\n",
|
||||
" <td>10516</td>\n",
|
||||
" <td>2066</td>\n",
|
||||
" <td>1458</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>satire</th>\n",
|
||||
" <td>9852</td>\n",
|
||||
" <td>1971</td>\n",
|
||||
" <td>1337</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>hate</th>\n",
|
||||
" <td>6641</td>\n",
|
||||
" <td>1307</td>\n",
|
||||
" <td>831</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2018-02-10 13:43:39.521661</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" train test val\n",
|
||||
"type \n",
|
||||
"reliable 163802 33010 21752\n",
|
||||
"political 145779 29241 19498\n",
|
||||
"bias 99797 20079 13356\n",
|
||||
"fake 78736 15602 10545\n",
|
||||
"conspiracy 72837 14676 9801\n",
|
||||
"unknown 68468 13754 9098\n",
|
||||
"rumor 42254 8553 5638\n",
|
||||
"unreliable 26489 5346 3497\n",
|
||||
"clickbait 20552 4161 2699\n",
|
||||
"junksci 10516 2066 1458\n",
|
||||
"satire 9852 1971 1337\n",
|
||||
"hate 6641 1307 831\n",
|
||||
"2018-02-10 13:43:39.521661 1 0 0"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"timeline_R"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "355d343a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "main_asg",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.14.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
399
analysis/analysis2.ipynb
Normal file
399
analysis/analysis2.ipynb
Normal file
File diff suppressed because one or more lines are too long
237
analysis/analyz_split_time.ipynb
Normal file
237
analysis/analyz_split_time.ipynb
Normal file
@@ -0,0 +1,237 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9c2d25e9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd \n",
|
||||
"import os \n",
|
||||
"import sys\n",
|
||||
"sys.path.append(os.path.join(os.getcwd(), '../src'))\n",
|
||||
"from constants import TRAINING_DIR, TESTING_DIR, VALIDATION_DIR\n",
|
||||
"pd.set_option('display.max_columns', None)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "a917b0fa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_ty = pd.read_parquet(f\"{TESTING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
|
||||
"train_ty = pd.read_parquet(f\"{TRAINING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
|
||||
"val_ty = pd.read_parquet(f\"{VALIDATION_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "0098d6e4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"rows in train(818843, 1),\n",
|
||||
" rows in test (99499, 1), \n",
|
||||
" rows in validation(76645, 1)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(f'rows in train{train_ty.shape },\\n rows in test {test_ty.shape}, \\n rows in validation{val_ty.shape}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "5985a4f3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"timeline = pd.concat([\n",
|
||||
" b.value_counts().rename('train'),\n",
|
||||
" a.value_counts().rename('test'),\n",
|
||||
" c.value_counts().rename('val'),\n",
|
||||
"], axis=1).fillna(0).astype(int)\n",
|
||||
"\n",
|
||||
"timeline.index.name = 'type'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "b0673e19",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>train</th>\n",
|
||||
" <th>test</th>\n",
|
||||
" <th>val</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>type</th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>political</th>\n",
|
||||
" <td>194518</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>bias</th>\n",
|
||||
" <td>133232</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>fake</th>\n",
|
||||
" <td>104883</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>conspiracy</th>\n",
|
||||
" <td>97314</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>rumor</th>\n",
|
||||
" <td>56445</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>unknown</th>\n",
|
||||
" <td>43534</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>reliable</th>\n",
|
||||
" <td>42419</td>\n",
|
||||
" <td>99499</td>\n",
|
||||
" <td>76645</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>unreliable</th>\n",
|
||||
" <td>35332</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>clickbait</th>\n",
|
||||
" <td>27412</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>junksci</th>\n",
|
||||
" <td>14040</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>satire</th>\n",
|
||||
" <td>13160</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>hate</th>\n",
|
||||
" <td>8779</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" train test val\n",
|
||||
"type \n",
|
||||
"political 194518 0 0\n",
|
||||
"bias 133232 0 0\n",
|
||||
"fake 104883 0 0\n",
|
||||
"conspiracy 97314 0 0\n",
|
||||
"rumor 56445 0 0\n",
|
||||
"unknown 43534 0 0\n",
|
||||
"reliable 42419 99499 76645\n",
|
||||
"unreliable 35332 0 0\n",
|
||||
"clickbait 27412 0 0\n",
|
||||
"junksci 14040 0 0\n",
|
||||
"satire 13160 0 0\n",
|
||||
"hate 8779 0 0"
|
||||
]
|
||||
},
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"timeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c2bcfc84",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "main_asg",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.14.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
0
data/datasets/.gitkeep
Normal file
0
data/datasets/.gitkeep
Normal file
0
data/temp/.gitkeep
Normal file
0
data/temp/.gitkeep
Normal file
0
data/testing/.gitkeep
Normal file
0
data/testing/.gitkeep
Normal file
0
data/training/.gitkeep
Normal file
0
data/training/.gitkeep
Normal file
0
data/validation/.gitkeep
Normal file
0
data/validation/.gitkeep
Normal file
34
flake.nix
Normal file
34
flake.nix
Normal file
@@ -0,0 +1,34 @@
|
||||
# Dev shell for my NixOS Jupyter notebook setup. It provides a shell with the requirements.txt packages and jupynium installed.
|
||||
{
|
||||
inputs = {
|
||||
nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
|
||||
pyproject-nix.url = "github:pyproject-nix/pyproject.nix";
|
||||
bozo_nixpkgs.url = "github:DuarteSJ/nixpkgs/4e926b09ba06301b08d0f12afd0640c079bdc4dc";
|
||||
};
|
||||
|
||||
outputs =
|
||||
{ nixpkgs, pyproject-nix, bozo_nixpkgs, ... }:
|
||||
let
|
||||
project = pyproject-nix.lib.project.loadRequirementsTxt { projectRoot = ./.; };
|
||||
|
||||
pkgs = nixpkgs.legacyPackages.x86_64-linux;
|
||||
bozo_pkgs = bozo_nixpkgs.legacyPackages.x86_64-linux;
|
||||
|
||||
python = pkgs.python3;
|
||||
pythonEnv = pkgs.python3.withPackages (pkgs:
|
||||
let base = project.renderers.withPackages { inherit python; } pkgs;
|
||||
in base ++ (with pkgs; [ notebook nbclassic jupyter-console ipython]));
|
||||
mental_retardation = bozo_pkgs.python3.withPackages (python-pkgs: with python-pkgs; [ jupynium ]);
|
||||
in
|
||||
{
|
||||
devShells.x86_64-linux.default = pkgs.mkShell {
|
||||
packages = [ pythonEnv mental_retardation ];
|
||||
shellHook = ''
|
||||
export SHELL="which fish"
|
||||
if [[ $- == *i* ]] && [ -z "$TMUX" ]; then
|
||||
tmux new-session -A -s GDS-fake-news
|
||||
fi
|
||||
'';
|
||||
};
|
||||
};
|
||||
}
|
||||
BIN
models/LIAR_baseline.model
Normal file
BIN
models/LIAR_baseline.model
Normal file
Binary file not shown.
BIN
models/baseline.model
Normal file
BIN
models/baseline.model
Normal file
Binary file not shown.
BIN
models/gradient_boosting.model
Normal file
BIN
models/gradient_boosting.model
Normal file
Binary file not shown.
BIN
models/logistic.model
Normal file
BIN
models/logistic.model
Normal file
Binary file not shown.
BIN
models/metadata_logistic.model
Normal file
BIN
models/metadata_logistic.model
Normal file
Binary file not shown.
BIN
models/not_reliable_logistic.model
Normal file
BIN
models/not_reliable_logistic.model
Normal file
Binary file not shown.
BIN
models/old/GB10K.model
Normal file
BIN
models/old/GB10K.model
Normal file
Binary file not shown.
BIN
models/old/GB1K.model
Normal file
BIN
models/old/GB1K.model
Normal file
Binary file not shown.
BIN
models/old/GB2K.model
Normal file
BIN
models/old/GB2K.model
Normal file
Binary file not shown.
BIN
models/old/GB4K.model
Normal file
BIN
models/old/GB4K.model
Normal file
Binary file not shown.
BIN
models/only_fake_logistic.model
Normal file
BIN
models/only_fake_logistic.model
Normal file
Binary file not shown.
BIN
models/svm.model
Normal file
BIN
models/svm.model
Normal file
Binary file not shown.
4
pyrightconfig.json
Normal file
4
pyrightconfig.json
Normal file
@@ -0,0 +1,4 @@
|
||||
{
|
||||
"typeCheckingMode": "strict",
|
||||
"reportMissingTypeStubs": false
|
||||
}
|
||||
117
requirements.txt
Normal file
117
requirements.txt
Normal file
@@ -0,0 +1,117 @@
|
||||
anyio==4.12.1
|
||||
argon2-cffi==25.1.0
|
||||
argon2-cffi-bindings==25.1.0
|
||||
arrow==1.4.0
|
||||
asttokens==3.0.1
|
||||
async-lru==2.1.0
|
||||
attrs==25.4.0
|
||||
babel==2.18.0
|
||||
beautifulsoup4==4.14.3
|
||||
bleach==6.3.0
|
||||
certifi==2026.1.4
|
||||
cffi==2.0.0
|
||||
charset-normalizer==3.4.4
|
||||
click==8.3.1
|
||||
comm==0.2.3
|
||||
contourpy==1.3.3
|
||||
cycler==0.12.1
|
||||
debugpy==1.8.20
|
||||
decorator==5.2.1
|
||||
defusedxml==0.7.1
|
||||
executing==2.2.1
|
||||
fastjsonschema==2.21.2
|
||||
fonttools==4.61.1
|
||||
fqdn==1.5.1
|
||||
h11==0.16.0
|
||||
httpcore==1.0.9
|
||||
httpx==0.28.1
|
||||
idna==3.11
|
||||
ipykernel==7.2.0
|
||||
ipython==9.10.0
|
||||
ipython_pygments_lexers==1.1.1
|
||||
ipywidgets==8.1.8
|
||||
isoduration==20.11.0
|
||||
jedi==0.19.2
|
||||
Jinja2==3.1.6
|
||||
joblib==1.5.3
|
||||
json5==0.13.0
|
||||
jsonpointer==3.0.0
|
||||
jsonschema==4.26.0
|
||||
jsonschema-specifications==2025.9.1
|
||||
jupyter==1.1.1
|
||||
jupyter-console==6.6.3
|
||||
jupyter-events==0.12.0
|
||||
jupyter-lsp==2.3.0
|
||||
jupyter_client==8.8.0
|
||||
jupyter_core==5.9.1
|
||||
jupyter_server==2.17.0
|
||||
jupyter_server_terminals==0.5.4
|
||||
jupyterlab==4.5.4
|
||||
jupyterlab_pygments==0.3.0
|
||||
jupyterlab_server==2.28.0
|
||||
jupyterlab_widgets==3.0.16
|
||||
kiwisolver==1.4.9
|
||||
lark==1.3.1
|
||||
MarkupSafe==3.0.3
|
||||
matplotlib==3.10.8
|
||||
matplotlib-inline==0.2.1
|
||||
mistune==3.2.0
|
||||
nbclient==0.10.4
|
||||
nbconvert==7.17.0
|
||||
nbformat==5.10.4
|
||||
nest-asyncio==1.6.0
|
||||
nltk==3.9.2
|
||||
notebook==7.5.3
|
||||
notebook_shim==0.2.4
|
||||
numpy==2.4.2
|
||||
packaging==26.0
|
||||
pandas==3.0.1
|
||||
pandas-stubs==3.0.0.260204
|
||||
pandocfilters==1.5.1
|
||||
parso==0.8.6
|
||||
pexpect==4.9.0
|
||||
pillow==12.1.1
|
||||
platformdirs==4.9.2
|
||||
prometheus_client==0.24.1
|
||||
prompt_toolkit==3.0.52
|
||||
psutil==7.2.2
|
||||
ptyprocess==0.7.0
|
||||
pure_eval==0.2.3
|
||||
pyarrow==23.0.1
|
||||
pycparser==3.0
|
||||
Pygments==2.19.2
|
||||
pyparsing==3.3.2
|
||||
python-dateutil==2.9.0.post0
|
||||
python-json-logger==4.0.0
|
||||
PyYAML==6.0.3
|
||||
pyzmq==27.1.0
|
||||
referencing==0.37.0
|
||||
regex==2026.1.15
|
||||
requests==2.32.5
|
||||
rfc3339-validator==0.1.4
|
||||
rfc3986-validator==0.1.1
|
||||
rfc3987-syntax==1.1.0
|
||||
rpds-py==0.30.0
|
||||
scikit-learn==1.8.0
|
||||
scipy==1.17.1
|
||||
Send2Trash==2.1.0
|
||||
setuptools==82.0.0
|
||||
six==1.17.0
|
||||
soupsieve==2.8.3
|
||||
stack-data==0.6.3
|
||||
terminado==0.18.1
|
||||
threadpoolctl==3.6.0
|
||||
tinycss2==1.4.0
|
||||
tornado==6.5.4
|
||||
tqdm==4.67.3
|
||||
traitlets==5.14.3
|
||||
typing_extensions==4.15.0
|
||||
tzdata==2025.3
|
||||
uri-template==1.3.0
|
||||
urllib3==2.6.3
|
||||
wcwidth==0.6.0
|
||||
webcolors==25.10.0
|
||||
webencodings==0.5.1
|
||||
websocket-client==1.9.0
|
||||
widgetsnbextension==4.0.15
|
||||
zstandard==0.25.0
|
||||
97
src/clean_data.py
Normal file
97
src/clean_data.py
Normal file
@@ -0,0 +1,97 @@
|
||||
from constants import DATASET_DIR, TEMP_DIR
|
||||
from helper import dataset_iterator
|
||||
import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
import nltk
|
||||
import re
|
||||
import shutil
|
||||
|
||||
# cleans text and returns a list of tokens.
def clean_text(
    text: str,
    remove_regex_patterns: bool = True,
    remove_stopwords: bool = True,
    remove_special_characters: bool = True,
    stemming: bool = True
) -> list[str]:
    """Normalize *text* and return a list of tokens.

    Pipeline (each step optional via its flag):
      1. lowercase + strip surrounding whitespace;
      2. replace URLs, e-mails, dates and numbers with placeholder tokens
         (<URL>, <EMAIL>, <DATE>, <NUMBER>);
      3. blank out special characters (placeholder tokens survive);
      4. tokenize, drop English stopwords, Snowball-stem everything that is
         not a placeholder token.
    """
    text = str(text).lower().strip()

    if remove_regex_patterns:
        url_pattern = r'https?://\S+|www\.\S+'
        email_pattern = r'[\w.-]+@[\w]+\.[\w]+'
        date_pattern = r'([a-z]+ \d{1,2}[a-z]?, \d{4}|\d{2,4}[-/]\d{2,4}[-/]\d{2,4})' # add more date patterns
        number_pattern = r'\d+'

        text = re.sub(url_pattern, "<URL>", text)
        text = re.sub(email_pattern, "<EMAIL>", text)
        text = re.sub(date_pattern, "<DATE>", text)
        text = re.sub(number_pattern, "<NUMBER>", text)

    if remove_special_characters:
        # NOTE(review): this is a character class, not a group -- it keeps
        # word chars plus the literal characters ' ( ? : < > )'. Looks like it
        # was meant to protect <TOKEN> placeholders; confirm intended.
        text = re.sub(r'[^\w (?:<\w+>)]', " ", text)

    tokenizer = nltk.RegexpTokenizer(r'<\w+>|\w+')
    tokens = tokenizer.tokenize(text)  # type: ignore

    if remove_stopwords:
        # FIX: was a duplicated assignment (`stopwords = stopwords = ...`).
        stopwords = nltk.corpus.stopwords.words('english')
        tokens = [token for token in tokens if token not in stopwords]  # type: ignore

    if stemming:
        stemmer = nltk.SnowballStemmer("english")
        # Placeholder tokens like <URL> are passed through unstemmed.
        tokens = [stemmer.stem(token) if not re.match(r'<\w+>', token) else token for token in tokens]  # type: ignore

    return tokens  # type: ignore
|
||||
|
||||
def clean_dataset(filename: str) -> None:
    """Tokenize the 'content' column of DATASET_DIR/filename in place.

    Streams the dataset in chunks, adds a ``tokens`` column produced by
    :func:`clean_text`, writes the result to a temp parquet file, then moves
    the temp file over the original dataset.
    """
    # FIX: the paths previously contained a literal "(unknown)" placeholder
    # (extraction artifact) and the `filename` parameter was never used.
    output_path = f"{TEMP_DIR}/{filename}"
    writer = None
    for chunk in dataset_iterator(f"{DATASET_DIR}/{filename}"):
        chunk['tokens'] = chunk['content'].apply(clean_text)

        columns_in_chunk = chunk.columns
        table = pa.Table.from_pandas(chunk[columns_in_chunk])
        if writer is None:
            # Create the writer lazily so it adopts the first chunk's schema.
            writer = pq.ParquetWriter(output_path, table.schema)
        writer.write_table(table)
    # FIX: guard against an empty dataset, where no writer was ever created
    # (previously raised AttributeError on `None.close()`).
    if writer is None:
        return
    writer.close()

    shutil.move(output_path, f"{DATASET_DIR}/{filename}")
|
||||
|
||||
def compute_vocab_reduction(filename: str) -> dict[str, float | int]:
    """Measure how much stopword removal and stemming shrink the vocabulary.

    Walks DATASET_DIR/filename chunk by chunk, cleaning each article three
    times (raw tokens, stopwords removed, stopwords removed + stemming) and
    collecting the distinct tokens of each stage.

    Returns a dict with the vocabulary size before/after each step and the
    relative reduction rates (0.0 when the preceding vocabulary is empty).
    """
    # FIX: interpolate the `filename` argument (path previously contained a
    # literal "(unknown)" placeholder and the parameter was unused).
    dataset_path = f"{DATASET_DIR}/{filename}"

    vocab_before_stopwords: set[str] = set()
    vocab_after_stopwords: set[str] = set()
    vocab_after_stemming: set[str] = set()

    for chunk in dataset_iterator(dataset_path):
        for text in chunk["content"]:
            vocab_before_stopwords.update(clean_text(text, remove_stopwords=False, stemming=False))
            vocab_after_stopwords.update(clean_text(text, remove_stopwords=True, stemming=False))
            vocab_after_stemming.update(clean_text(text, remove_stopwords=True, stemming=True))

    before_stop_size = len(vocab_before_stopwords)
    after_stop_size = len(vocab_after_stopwords)
    # Stemming operates on the stopword-filtered vocabulary.
    before_stem_size = after_stop_size
    after_stem_size = len(vocab_after_stemming)

    stopwords_reduction_rate = (
        (before_stop_size - after_stop_size) / before_stop_size if before_stop_size else 0.0
    )
    stemming_reduction_rate = (
        (before_stem_size - after_stem_size) / before_stem_size if before_stem_size else 0.0
    )

    return {
        "vocab_size_before_stopwords": before_stop_size,
        "vocab_size_after_stopwords": after_stop_size,
        "stopwords_reduction_rate": stopwords_reduction_rate,
        "vocab_size_before_stemming": before_stem_size,
        "vocab_size_after_stemming": after_stem_size,
        "stemming_reduction_rate": stemming_reduction_rate,
    }
|
||||
14
src/constants.py
Normal file
14
src/constants.py
Normal file
@@ -0,0 +1,14 @@
|
||||
import os

# Base directories are resolved relative to src/ -- scripts must be run from
# inside the src directory for these paths to point at the repo's data/models.
DATA_DIR = os.path.abspath("../data")
MODEL_DIR = os.path.abspath("../models")
DATASET_DIR = f"{DATA_DIR}/datasets"       # raw + cleaned datasets
TRAINING_DIR = f"{DATA_DIR}/training"      # training split
VALIDATION_DIR = f"{DATA_DIR}/validation"  # validation split
TESTING_DIR = f"{DATA_DIR}/testing"        # testing split
TEMP_DIR = f"{DATA_DIR}/temp"              # scratch space for streamed writes
ORIGINAL_DATASET_FILES = ["news_sample.csv", "995,000_rows.csv"]
DATASET_FILES = ["news_sample.parquet", "995,000_rows.parquet"]

CHUNK_SIZE = 10000 # how many rows to work on at time, instead of loading the entire dataset into memory.
MAX_ROWS = -1 # only work with MAX_ROWS rows so testing things out isnt crazy slow. Set to -1 for infinite.
|
||||
60
src/helper.py
Normal file
60
src/helper.py
Normal file
@@ -0,0 +1,60 @@
|
||||
from labels import Label
|
||||
from constants import CHUNK_SIZE, MAX_ROWS
|
||||
from typing import Iterator, cast
|
||||
import pyarrow.parquet as pq
|
||||
import pandas as pd
|
||||
|
||||
def default_labelling(article_type: str) -> Label:
    """Default scheme: reliable/political/clickbait are REAL, all else FAKE."""
    real_types = ("reliable", "political", "clickbait")
    return Label.REAL if article_type in real_types else Label.FAKE
|
||||
|
||||
def only_fake_labelling(article_type: str) -> Label:
    """Strict scheme: only articles typed exactly "fake" are FAKE."""
    return Label.FAKE if article_type == "fake" else Label.REAL
|
||||
|
||||
def not_reliable_labelling(article_type: str) -> Label:
    """Conservative scheme: only "reliable" articles are REAL."""
    return Label.REAL if article_type == "reliable" else Label.FAKE
|
||||
|
||||
def LIAR_labelling(article_type: str) -> Label:
    """Map LIAR truthfulness ratings to binary labels (any *-true rating is REAL)."""
    real_ratings = ("true", "half-true", "barely-true", "mostly-true")
    return Label.REAL if article_type in real_ratings else Label.FAKE
|
||||
|
||||
# Deprecated, don't use, just use pd.read_parquet instead
def dataset_iterator(dataset_file:str, columns:list[str] | None = None) -> Iterator[pd.DataFrame]:
    """Yield *dataset_file* (parquet) as pandas DataFrame chunks.

    Reads batches of CHUNK_SIZE rows, optionally restricted to *columns*.
    Stops once more than MAX_ROWS rows have been read (only when MAX_ROWS > 0).
    NOTE(review): the batch that crosses the MAX_ROWS limit is dropped rather
    than truncated, so slightly fewer than MAX_ROWS rows may be yielded.
    """
    pq_file = pq.ParquetFile(dataset_file)
    rows_read = 0
    for batch in pq_file.iter_batches(batch_size=CHUNK_SIZE, columns=columns): # type: ignore
        rows_read += len(batch) # type: ignore
        if rows_read > MAX_ROWS and MAX_ROWS > 0:
            return
        # cast to ignore type warnings.
        yield cast(pd.DataFrame, batch.to_pandas()) # type: ignore
|
||||
|
||||
def csv_to_parquet(input_path: str, output_path: str) -> None:
    """Convert a CSV file to Parquet.

    Reads the whole CSV into memory in one go; low_memory=False disables
    chunked dtype inference so mixed-type columns (e.g. a stray string in
    the id column) are handled consistently.
    """
    pd.read_csv(input_path, low_memory=False).to_parquet(output_path)
|
||||
|
||||
def get_time_boundaries(filename: str):  # type: ignore
    """Return (train_cut, val_cut): the 80th- and 90th-percentile 'scraped_at'
    timestamps, used to split the dataset chronologically 80/10/10.

    Only the timestamp column is loaded to save RAM. Unparseable dates become
    NaT (errors='coerce') and are ignored by quantile().
    """
    df_dates = pq.read_table(filename, columns=['scraped_at']).to_pandas()  # type: ignore
    scraped = pd.to_datetime(df_dates['scraped_at'], format='ISO8601', errors='coerce', utc=True)  # type: ignore
    # Series.quantile neither requires pre-sorted data nor counts NaT values,
    # so the explicit sort_values() pass of the original was unnecessary work.
    train_cut = scraped.quantile(0.80)  # type: ignore
    val_cut = scraped.quantile(0.90)  # type: ignore
    return train_cut, val_cut  # type: ignore
|
||||
|
||||
5
src/labels.py
Normal file
5
src/labels.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from enum import Enum
|
||||
|
||||
class Label(Enum):
    """Binary classification label for an article."""
    REAL = 0
    FAKE = 1
|
||||
64
src/main.py
Executable file
64
src/main.py
Executable file
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import argparse
|
||||
from models.svm import SVM_model
|
||||
from models.gradient_boosting import Gradient_boosting_model
|
||||
from models.logistic_regression import Logistic_model
|
||||
from models.baseline import Baseline_model
|
||||
from helper import default_labelling, not_reliable_labelling, only_fake_labelling, LIAR_labelling
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: build the requested model, then train or test it."""
    parser = argparse.ArgumentParser(
        prog="Fakenews detector",
        description="Train and test models",
        usage="The following is an example of training a logistic regression model on news_sample.parquet:\n"
        + "python main.py --model_type logistic --model_file logistic_news_sample.model --data_file news_sample.parquet --train",
    )
    parser.add_argument("--train", action="store_true", help="Whether model should be trained, if not set it will be tested instead")
    parser.add_argument("--validate", action="store_true", help="Whether to use validation set when testing/validating")
    parser.add_argument("--model_type", "-t", required=True, choices=["baseline", "logistic", "svm", "gradient_boosting"], help="The type of model: baseline, logistic, ...")
    parser.add_argument("--model_file", "-f", required=True, help="The model file to save to when training, or load from when testing")
    parser.add_argument("--data_file", "-d", required=True, help="The datafile used when training or testing")
    parser.add_argument("--label_translator", "-l", required=False, default="", help="The translator function used by the model, such as \"not_reliable\", that only considers 'reliable' tagged news Real, ignored if not using --train.")
    # default=[] (not ""): the value is iterated below, so it must be list-like.
    parser.add_argument("--hyperparameters", "-p", required=False, nargs="+", default=[], help="The hyperparameters used when training the model, written like c=1")

    args = parser.parse_args()

    # Pick the label translator; a LIAR data file always forces LIAR labelling
    # and the canonical LIAR file name.
    label_translator = default_labelling
    if "not_reliable" in args.label_translator.lower():
        label_translator = not_reliable_labelling
    if "only_fake" in args.label_translator.lower():
        label_translator = only_fake_labelling
    if "liar" in args.data_file.lower():
        label_translator = LIAR_labelling
        args.data_file = "LIAR.parquet"

    # Dispatch table instead of an if/elif chain; unknown types cannot reach
    # here because argparse restricts --model_type to these choices.
    model_classes = {
        "logistic": Logistic_model,
        "svm": SVM_model,
        "gradient_boosting": Gradient_boosting_model,
        "baseline": Baseline_model,
    }
    model = model_classes[args.model_type](label_translator=label_translator)

    if args.train:
        # Parse "key=value" pairs into a dict of float hyperparameters.
        hyperparameters: dict[str, float] = {}
        for parameter in args.hyperparameters:
            key, value = parameter.split("=")
            hyperparameters[key] = float(value)

        model.train(args.data_file, hyperparameters)
        model.save(args.model_file)
    else:
        model.load(args.model_file)
        if "liar" in args.data_file.lower():
            # Validation is forced off for LIAR (matches original behavior).
            model.test("LIAR.parquet", validate=False)
        else:
            model.test(args.data_file, args.validate)


if __name__ == "__main__":
    main()
|
||||
6
src/models/Untitled.ipynb
Normal file
6
src/models/Untitled.ipynb
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"cells": [],
|
||||
"metadata": {},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
6
src/models/Untitled1.ipynb
Normal file
6
src/models/Untitled1.ipynb
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"cells": [],
|
||||
"metadata": {},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
846
src/models/Untitled2.ipynb
Normal file
846
src/models/Untitled2.ipynb
Normal file
@@ -0,0 +1,846 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "3ed30f2e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ModuleNotFoundError",
|
||||
"evalue": "No module named 'torch'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorch\u001b[39;00m \n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorch\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mnn\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnn\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorch\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mnn\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mfunctional\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mF\u001b[39;00m \n",
|
||||
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'torch'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch \n",
|
||||
"import torch.nn as nn\n",
|
||||
"import torch.nn.functional as F \n",
|
||||
"import pandas as pd\n",
|
||||
"from torch.utils.data import Dataset, DataLoader\n",
|
||||
"from collections import Counter\n",
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"sys.path.append(os.path.join(os.getcwd(), '../'))\n",
|
||||
"from helper import default_labelling\n",
|
||||
"from sklearn.metrics import f1_score\n",
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "42edceb8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"label_map = {\n",
|
||||
" 'Label.FAKE': 0,\n",
|
||||
" 'Label.REAL': 1}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0aa1a427",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Pipelining process"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c7730d65",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df = pd.read_parquet(\"../../data/training/995,000_rows.parquet\", columns=['tokens','type'])\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"df['label'] = df['type'].apply(default_labelling).astype(str)\n",
|
||||
"df['label'] = df['label'].map(label_map).astype(int)\n",
|
||||
"df = df.drop(columns=['type'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c31caf06",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_test = pd.read_parquet(\"../../data/testing/995,000_rows.parquet\", columns=['tokens','type'])\n",
|
||||
"\n",
|
||||
"df_test['label'] = df_test['type'].apply(default_labelling).astype(str)\n",
|
||||
"df_test['label'] = df_test['label'].map(label_map).astype(int)\n",
|
||||
"df_test = df_test.drop(columns=['type'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5c0c93ab",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_val = pd.read_parquet(\"../../data/validation/995,000_rows.parquet\", columns=['tokens','type'])\n",
|
||||
"df_val['label'] = df_val['type'].apply(default_labelling).astype(str)\n",
|
||||
"df_val['label'] = df_val['label'].map(label_map).astype(int)\n",
|
||||
"df_val = df_val.drop(columns=['type'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "19188ef7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# print(\"Loading Parquet file...\")\n",
|
||||
"\n",
|
||||
"# # Check the total number of rows (articles)\n",
|
||||
"# print(f\"Total rows in the raw Parquet file: {len(df)}\")\n",
|
||||
"\n",
|
||||
"# # Look at the first few rows to make sure the data looks correct\n",
|
||||
"# print(\"\\n--- First 3 Rows ---\")\n",
|
||||
"# print(df.head(3))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fa455147",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# count how many tokens we have in the corpus \n",
|
||||
"word_counts = Counter()\n",
|
||||
"for x in df['tokens']:\n",
|
||||
" word_counts.update(x)\n",
|
||||
" \n",
|
||||
"# Keep the top 50,000 words. \n",
|
||||
"# Index 0 is for <PAD> (padding), Index 1 is for <UNK> (unknown words)\n",
|
||||
"vocab = {\"<PAD>\": 0, \"<UNK>\": 1}\n",
|
||||
"for idx, (word, count) in enumerate(word_counts.most_common(50000), start=2):\n",
|
||||
" vocab[word] = idx\n",
|
||||
"\n",
|
||||
"print(f\"Vocabulary built with {len(vocab)} words.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b9ba0021",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create a Custom PyTorch Datase\n",
|
||||
"\n",
|
||||
"# a wrapper for the data that PyTorch knows how to talk to.\n",
|
||||
"class FakeNewsDataset(Dataset):\n",
|
||||
" def __init__(self, dataframe, vocab, max_length=256):\n",
|
||||
" self.dataframe = dataframe\n",
|
||||
" self.vocab = vocab\n",
|
||||
" self.max_length = max_length\n",
|
||||
"\n",
|
||||
"# Tells PyTorch how many articles we have\n",
|
||||
"#PyTorch calls this internally to know when to stop fetching data.\n",
|
||||
" def __len__(self):\n",
|
||||
" return len(self.dataframe)\n",
|
||||
" \n",
|
||||
" def __getitem__(self, idx):\n",
|
||||
" # Grabs one article and its label at a time\n",
|
||||
" tokens = self.dataframe.iloc[idx]['tokens']\n",
|
||||
" label = self.dataframe.iloc[idx]['label']\n",
|
||||
"\n",
|
||||
" # Convert text tokens to Integer IDs\n",
|
||||
" article_ids = [self.vocab.get(word, 1) for word in tokens]\n",
|
||||
"\n",
|
||||
" # Truncate or Pad the article so they are all exactly 'max_length' long\n",
|
||||
" if len(article_ids) > self.max_length:\n",
|
||||
" article_ids = article_ids[:self.max_length]\n",
|
||||
" else:\n",
|
||||
" padding = [0] * (self.max_length - len(article_ids))\n",
|
||||
" article_ids.extend(padding)\n",
|
||||
" \n",
|
||||
" # Return as PyTorch tensors\n",
|
||||
" return torch.tensor(article_ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5f3f4096",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## Prepare the DataLoader \n",
|
||||
"# Wrap The dataframe in the Dataset class\n",
|
||||
"\n",
|
||||
"# The DataLoader feeds the data to the model in batches (e.g., 64 articles at a time)\n",
|
||||
"# This prevents the computer from running out of RAM!\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"my_train_dataset = FakeNewsDataset(dataframe=df, vocab=vocab, max_length=256)\n",
|
||||
"# Shuffle is true for training so the data keeps getting shuffled when trained and the model does not memorise the data\n",
|
||||
"train_dataloader = DataLoader(my_train_dataset, batch_size=64, shuffle=True,num_workers=4, # Start with 4; if CPU stays cool, try 6\n",
|
||||
"pin_memory=True, # Essential for fast data transfer\n",
|
||||
"prefetch_factor=2)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"val_data = FakeNewsDataset(dataframe=df_val, vocab=vocab, max_length=256)\n",
|
||||
"val_dataloader = DataLoader(val_data, batch_size=64, shuffle=False)\n",
|
||||
"\n",
|
||||
"test_data = FakeNewsDataset(dataframe=df_test, vocab=vocab, max_length=256)\n",
|
||||
"test_dataloader = DataLoader(test_data, batch_size=64, shuffle=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fd4f08a6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Checking if the data conversion works"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9bcbcf9b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# features, labels = next(iter(train_dataloader))\n",
|
||||
"# # 2. Check the shapes (the dimensions of your tensors)\n",
|
||||
"# print(\"--- Tensor Shapes ---\")\n",
|
||||
"# print(f\"Features shape: {features.shape}\") \n",
|
||||
"# print(f\"Labels shape: {labels.shape}\") \n",
|
||||
"\n",
|
||||
"# # 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)\n",
|
||||
"# print(\"\\n--- Data Types ---\")\n",
|
||||
"# print(f\"Features dtype: {features.dtype}\")\n",
|
||||
"# print(f\"Labels dtype: {labels.dtype}\")\n",
|
||||
"\n",
|
||||
"# # 4. Peek at the actual data for the very first article in this batch\n",
|
||||
"# print(\"\\n--- First Article Peek ---\")\n",
|
||||
"# print(f\"Label: {labels[0].item()} (0 = Real, 1 = Fake)\")\n",
|
||||
"# print(f\"Tokens (first 20 IDs): {features[0][:20]}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b70e45ac",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class BaseModel(nn.Module):\n",
|
||||
" def __init__(self, vocab_size, embed_dim=32, h1=256, h2=128, out_features=2):\n",
|
||||
" super().__init__()\n",
|
||||
" \n",
|
||||
" # The Embedding Layer: Turns word IDs into rich numerical vectors\n",
|
||||
" self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)\n",
|
||||
" \n",
|
||||
" # The Linear Layers: Learn the patterns to decide Fake vs. Real\n",
|
||||
" self.fc1 = nn.Linear(embed_dim, h1)\n",
|
||||
" self.fc2 = nn.Linear(h1, h2)\n",
|
||||
" self.out = nn.Linear(h2, out_features)\n",
|
||||
" \n",
|
||||
" def forward(self, x):\n",
|
||||
" \n",
|
||||
" # x starts as integers: shape (batch_size, sequence_length) -> e.g., (64, 256)\n",
|
||||
" # Pass through embedding\n",
|
||||
" x = self.embedding(x) \n",
|
||||
" # Average the word vectors to get one single vector for the whole article\n",
|
||||
" x = x.mean(dim=1) \n",
|
||||
" \n",
|
||||
" # Pass through hidden layers with ReLU activation\n",
|
||||
" x = F.relu(self.fc1(x))\n",
|
||||
" x = F.relu(self.fc2(x))\n",
|
||||
" \n",
|
||||
" # Output layer (gives us the raw scores for 'Real' and 'Fake')\n",
|
||||
" x = self.out(x)\n",
|
||||
" return x\n",
|
||||
"model_basic =BaseModel(vocab_size=len((vocab)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "efa6c453",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"'Advanced'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "52cb9377",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class advanced_model(nn.Module):\n",
|
||||
" def __init__(self, vocab_size, embed_dim=64, hidden_dim=128,num_layer = 2, out_features=2):\n",
|
||||
" super().__init__()\n",
|
||||
" \n",
|
||||
" # 1. The Embedding Layer (Same as before)\n",
|
||||
" self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)\n",
|
||||
" \n",
|
||||
" # # 2. The GRU Layer (Extra layer)\n",
|
||||
" # batch_first=True is required because our DataLoader outputs (batch_size, sequence_length) \n",
|
||||
" self.gru = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, num_layers=2,batch_first=True,bidirectional=True, \n",
|
||||
" dropout=0.3)\n",
|
||||
" \n",
|
||||
" # 3. The Final Output Layer\n",
|
||||
" # connect the GRU's memory (hidden_dim) directly to our Real/Fake outputs\n",
|
||||
" self.out = nn.Linear(hidden_dim, out_features)\n",
|
||||
" self.fc = nn.Linear(hidden_dim * 2, out_features)\n",
|
||||
" def forward(self, x):\n",
|
||||
" # x shape: (batch_size, sequence_length) -> e.g., (64, 256)\n",
|
||||
" \n",
|
||||
" #Get the word embeddings\n",
|
||||
" x = self.embedding(x) \n",
|
||||
" # x shape becomes: (64, 256, 32)\n",
|
||||
" \n",
|
||||
" # Pass the embeddings into the GRU\n",
|
||||
" # A GRU outputs two things: the output at every single word, AND its final memory state.\n",
|
||||
" # We use '_' to ignore the step-by-step output, and save 'hidden_state'.\n",
|
||||
" _, hidden = self.gru(x)\n",
|
||||
" \n",
|
||||
" # 4. Extract and Concatenate the final forward and backward states\n",
|
||||
" # hidden[-2] is the last forward state, hidden[-1] is the last backward state\n",
|
||||
" out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)\n",
|
||||
" \n",
|
||||
" return self.fc(out)\n",
|
||||
" \n",
|
||||
"# Initialize\n",
|
||||
"model_adv = advanced_model(vocab_size=len(vocab))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "31b581d0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Training"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a8e1f849",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ae976afb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def evaluate_performance(model, dataloader, device):\n",
|
||||
" model.eval() # Put model in evaluation mode\n",
|
||||
" \n",
|
||||
" all_predictions = []\n",
|
||||
" all_true_labels = []\n",
|
||||
" \n",
|
||||
" # Turn off gradient tracking to save memory\n",
|
||||
" with torch.no_grad():\n",
|
||||
" for features, labels in dataloader:\n",
|
||||
" features = features.to(device)\n",
|
||||
" labels = labels.to(device)\n",
|
||||
" \n",
|
||||
" # Get model scores\n",
|
||||
" scores = model(features)\n",
|
||||
" \n",
|
||||
" # Find the predicted class (0 or 1)\n",
|
||||
" _, predictions = torch.max(scores,1)\n",
|
||||
" \n",
|
||||
" # Save predictions and actual labels to lists\n",
|
||||
" # all_predictions.extend(predictions.cpu().tolist())\n",
|
||||
" # all_true_labels.extend(labels.cpu().tolist())\n",
|
||||
" all_predictions.extend(predictions.cpu().numpy().flatten().tolist())\n",
|
||||
" all_true_labels.extend(labels.cpu().numpy().flatten().tolist())\n",
|
||||
" \n",
|
||||
" all_predictions = np.array(all_predictions)\n",
|
||||
" all_true_labels = np.array(all_true_labels)\n",
|
||||
" \n",
|
||||
" accuracy = (all_predictions == all_true_labels).mean() * 100\n",
|
||||
" \n",
|
||||
" # 4. Calculate F1 Score\n",
|
||||
" # average='macro' is best for your report to show you care about both classes equally\n",
|
||||
" f1 = f1_score(all_true_labels, all_predictions, average='macro')\n",
|
||||
" model.train() # Return model to training mode just in case\n",
|
||||
" return accuracy, f1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "65e26f88",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def train_model(model, train_loader, val_loader, device, epochs=5, lr=0.001):\n",
|
||||
" model = model.to(device)\n",
|
||||
" criterion = nn.CrossEntropyLoss()\n",
|
||||
" optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n",
|
||||
" \n",
|
||||
" # Dictionary to store results for your report\n",
|
||||
" history = {'train_loss': [], 'val_acc': [], 'val_f1': []}\n",
|
||||
"\n",
|
||||
" print(f\"Training {model.__class__.__name__} on {device}...\")\n",
|
||||
"\n",
|
||||
" for epoch in range(epochs):\n",
|
||||
" model.train()\n",
|
||||
" total_loss = 0\n",
|
||||
" \n",
|
||||
" for batch_idx, (features, labels) in enumerate(train_loader):\n",
|
||||
" features, labels = features.to(device), labels.to(device)\n",
|
||||
" \n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" predictions = model(features)\n",
|
||||
" loss = criterion(predictions, labels)\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
" \n",
|
||||
" total_loss += loss.item()\n",
|
||||
" \n",
|
||||
" avg_loss = total_loss / len(train_loader)\n",
|
||||
" \n",
|
||||
" # After each epoch, evaluate on validation set\n",
|
||||
" val_acc, val_f1 = evaluate_performance(model, val_loader, device)\n",
|
||||
" \n",
|
||||
" # Save results to our history dictionary\n",
|
||||
" history['train_loss'].append(avg_loss)\n",
|
||||
" history['val_acc'].append(val_acc)\n",
|
||||
" history['val_f1'].append(val_f1)\n",
|
||||
" \n",
|
||||
" print(f\"\\n Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f} \\n Val Acc: {val_acc:.2f}% \\n Val F1: {val_f1:.4f}\")\n",
|
||||
"\n",
|
||||
" return history # Return the results so we can plot them later"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3acf0f2b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_995_basic =train_model (model_basic, train_dataloader, val_dataloader, device, epochs =7 )\n",
|
||||
"print(train_995_basic )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9c0f7f65",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_995_adv =train_model (model_adv, train_dataloader, val_dataloader, device, epochs =7 )\n",
|
||||
"print(train_995_adv )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a1e10032",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "12959462",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9fb31c02",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Evaluation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2630d40a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Basic model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "73c388e7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# # 1. The Evaluation Function\n",
|
||||
"# def evaluate_performance(model, dataloader, device):\n",
|
||||
"# model.eval() # Put model in evaluation mode\n",
|
||||
" \n",
|
||||
"# all_predictions = []\n",
|
||||
"# all_true_labels = []\n",
|
||||
" \n",
|
||||
"# # Turn off gradient tracking to save memory\n",
|
||||
"# with torch.no_grad():\n",
|
||||
"# for features, labels in dataloader:\n",
|
||||
"# features = features.to(device)\n",
|
||||
"# labels = labels.to(device)\n",
|
||||
" \n",
|
||||
"# # Get model scores\n",
|
||||
"# scores = model(features)\n",
|
||||
" \n",
|
||||
"# # Find the predicted class (0 or 1)\n",
|
||||
"# _, predictions = torch.max(scores,1)\n",
|
||||
" \n",
|
||||
"# # Save predictions and actual labels to lists\n",
|
||||
"# # all_predictions.extend(predictions.cpu().tolist())\n",
|
||||
"# # all_true_labels.extend(labels.cpu().tolist())\n",
|
||||
"# all_predictions.extend(predictions.cpu().numpy().flatten().tolist())\n",
|
||||
"# all_true_labels.extend(labels.cpu().numpy().flatten().tolist())\n",
|
||||
" \n",
|
||||
"# all_predictions = np.array(all_predictions)\n",
|
||||
"# all_true_labels = np.array(all_true_labels)\n",
|
||||
" \n",
|
||||
"# accuracy = (all_predictions == all_true_labels).mean() * 100\n",
|
||||
" \n",
|
||||
"# # 4. Calculate F1 Score\n",
|
||||
"# # average='macro' is best for your report to show you care about both classes equally\n",
|
||||
"# f1 = f1_score(all_true_labels, all_predictions, average='macro')\n",
|
||||
"# model.train() # Return model to training mode just in case\n",
|
||||
"# return accuracy, f1\n",
|
||||
"# # # Change me based on the model\n",
|
||||
"\n",
|
||||
"# # model = model_basic.to(device)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# # print(f\"Training on: {device}\")\n",
|
||||
"\n",
|
||||
"# # # 2. Setup Loss and Optimizer\n",
|
||||
"# # # CrossEntropyLoss is the standard for classification tasks\n",
|
||||
"# # criterion = nn.CrossEntropyLoss() \n",
|
||||
"# # # Adam is a very reliable, fast optimizer\n",
|
||||
"# # optimizer = torch.optim.Adam(model.parameters(), lr=0.001) \n",
|
||||
"\n",
|
||||
"# # # 3. The Training Loop\n",
|
||||
"# # epochs = 7# Start with a small number of passes through the whole dataset\n",
|
||||
"\n",
|
||||
"# # for epoch in range(epochs):\n",
|
||||
"# # model.train() # Tell the model it is in training mode\n",
|
||||
"# # total_loss = 0\n",
|
||||
" \n",
|
||||
"# # # Loop through our batches of 64 articles\n",
|
||||
"# # for batch_idx, (features, labels) in enumerate(train_dataloader):\n",
|
||||
" \n",
|
||||
"# # # Move data to the same device as the model (GPU/CPU)\n",
|
||||
"# # features = features.to(device)\n",
|
||||
"# # labels = labels.to(device)\n",
|
||||
" \n",
|
||||
"# # # Step A: Reset the optimizer's gradients\n",
|
||||
"# # optimizer.zero_grad()\n",
|
||||
" \n",
|
||||
"# # # Step B: Forward Pass (Have the model guess Real or Fake)\n",
|
||||
"# # predictions = model(features)\n",
|
||||
" \n",
|
||||
"# # # Step C: Calculate Loss (How wrong were the guesses?)\n",
|
||||
"# # loss = criterion(predictions, labels)\n",
|
||||
" \n",
|
||||
"# # # Step D: Backward Pass (Calculate how to fix the math)\n",
|
||||
"# # loss.backward()\n",
|
||||
" \n",
|
||||
"# # # Step E: Optimize (Actually apply the fixes to the model's weights)\n",
|
||||
"# # optimizer.step()\n",
|
||||
" \n",
|
||||
"# # total_loss += loss.item()\n",
|
||||
" \n",
|
||||
"# # # Print an update every 100 batches so we know it's working\n",
|
||||
"# # if batch_idx % 100 == 0:\n",
|
||||
"# # print(f\"Epoch [{epoch+1}/{epochs}] | Batch {batch_idx} | Loss: {loss.item():.4f}\")\n",
|
||||
" \n",
|
||||
"# # # Print the average loss at the end of each epoch\n",
|
||||
"# # avg_loss = total_loss / len(train_dataloader)\n",
|
||||
"# # print(f\"--- End of Epoch {epoch+1} | Average Loss: {avg_loss:.4f} ---\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "09b0ce98",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Advanced model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b2ca196d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# # 1. The Evaluation Function\n",
|
||||
"# def evaluate_performance(model_adv, dataloader, device):\n",
|
||||
"# model_adv.eval() # Put model in evaluation mode\n",
|
||||
" \n",
|
||||
"# all_predictions = []\n",
|
||||
"# all_true_labels = []\n",
|
||||
" \n",
|
||||
"# # Turn off gradient tracking to save memory\n",
|
||||
"# with torch.no_grad():\n",
|
||||
"# for features, labels in dataloader:\n",
|
||||
"# features = features.to(device)\n",
|
||||
"# labels = labels.to(device)\n",
|
||||
" \n",
|
||||
"# # Get model scores\n",
|
||||
"# scores = model_adv(features)\n",
|
||||
" \n",
|
||||
"# # Find the predicted class (0 or 1)\n",
|
||||
"# _, predictions = scores.max(1)\n",
|
||||
" \n",
|
||||
"# # Save predictions and actual labels to lists\n",
|
||||
"# all_predictions.extend(predictions.cpu().tolist())\n",
|
||||
"# all_true_labels.extend(labels.cpu().tolist())\n",
|
||||
" \n",
|
||||
"# # Calculate Accuracy\n",
|
||||
"# correct_guesses = sum(p == t for p, t in zip(all_predictions, all_true_labels))\n",
|
||||
"# accuracy = (correct_guesses / len(all_true_labels)) * 100\n",
|
||||
" \n",
|
||||
"# # Calculate F1 Score\n",
|
||||
"# f1 = f1_score(all_true_labels, all_predictions, average='macro')\n",
|
||||
" \n",
|
||||
"# model_adv.train() # Return model to training mode just in case\n",
|
||||
"# return accuracy, f1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5835388c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c6ca6771",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"Basic model \")\n",
|
||||
"print(\" Validation \")\n",
|
||||
"val_acc995, val_f1_995 = evaluate_performance(model,val_dataloader, device)\n",
|
||||
"print(f\"Validation Accuracy: {val_acc995:.2f}%\")\n",
|
||||
"print(f\"Validation F1 Score: {val_f1_995:.4f}\")\n",
|
||||
"\n",
|
||||
"print(\"\\n Testing Phase \")\n",
|
||||
"test_acc995, test_f1_995 = evaluate_performance(model, test_dataloader, device)\n",
|
||||
"print(f\"Test Accuracy: {test_acc995:.2f}%\")\n",
|
||||
"print(f\"Test F1 Score: {test_f1_995:.4f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e206d094",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\" GRU model \")\n",
|
||||
"print(\" Validation \")\n",
|
||||
"adv_val_acc995, val_f1995 = evaluate_performance(model_adv,val_dataloader, device)\n",
|
||||
"print(f\"Validation Accuracy: {adv_val_acc995:.2f}%\")\n",
|
||||
"print(f\"Validation F1 Score: {val_f1_995:.4f}\")\n",
|
||||
"\n",
|
||||
"print(\"\\n Testing \")\n",
|
||||
"test_acc, test_f1 = evaluate_performance(model_adv, test_dataloader, device)\n",
|
||||
"print(f\"Test Accuracy: {test_acc955:.2f}%\")\n",
|
||||
"print(f\"Test F1 Score: {test_f1:.4f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f6a4ae72",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Liar data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fc7b8dac",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from helper import LIAR_labelling\n",
|
||||
"\n",
|
||||
"f\"../../data/training/LIAR.parquet\"\n",
|
||||
"df_LIAR = pd.read_parquet(\"../../data/testing/LIAR.parquet\",columns=['tokens','type'])\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"df_LIAR['label'] = df_LIAR['type'].apply(LIAR_labelling).astype(str)\n",
|
||||
"df_LIAR['label'] = df_LIAR['label'].map(label_map).astype(int)\n",
|
||||
"df_LIAR = df_LIAR.drop(columns=['type'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f73f6f84",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_LIAR.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9a76196e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# count how many tokens we have in the corpus \n",
|
||||
"word_counts = Counter()\n",
|
||||
"for x in df_LIAR['tokens']:\n",
|
||||
" word_counts.update(x)\n",
|
||||
" \n",
|
||||
"# Keep the top 50,000 words. \n",
|
||||
"# Index 0 is for <PAD> (padding), Index 1 is for <UNK> (unknown words)\n",
|
||||
"vocab = {\"<PAD>\": 0, \"<UNK>\": 1}\n",
|
||||
"for idx, (word, count) in enumerate(word_counts.most_common(50000), start=2):\n",
|
||||
" vocab[word] = idx\n",
|
||||
"\n",
|
||||
"print(f\"Vocabulary built with {len(vocab)} words.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "39dbe869",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"LR_DATA = FakeNewsDataset(dataframe=df_LIAR, vocab=vocab, max_length=256)\n",
|
||||
"LR_dataloader = DataLoader(LR_DATA, batch_size=32, shuffle=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ccbc7885",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"features, labels = next(iter(LR_dataloader))\n",
|
||||
"# 2. Check the shapes (the dimensions of your tensors)\n",
|
||||
"print(\"--- Tensor Shapes ---\")\n",
|
||||
"print(f\"Features shape: {features.shape}\") \n",
|
||||
"print(f\"Labels shape: {labels.shape}\") \n",
|
||||
"\n",
|
||||
"# 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)\n",
|
||||
"print(\"\\n--- Data Types ---\")\n",
|
||||
"print(f\"Features dtype: {features.dtype}\")\n",
|
||||
"print(f\"Labels dtype: {labels.dtype}\")\n",
|
||||
"\n",
|
||||
"# 4. Peek at the actual data for the very first article in this batch\n",
|
||||
"print(\"\\n--- First Article Peek ---\")\n",
|
||||
"print(f\"Label: {labels[0].item()} (0 = Real, 1 = Fake)\")\n",
|
||||
"print(f\"Tokens (first 20 IDs): {features[0][:20]}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4698cd06",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# # 1. Check a single sample from the Dataset directly\n",
|
||||
"# single_features, single_label = LR_DATA[0]\n",
|
||||
"# print(f\"Single Sample - Features: {single_features.shape}, Label: {single_label.shape}\")\n",
|
||||
"\n",
|
||||
"# # 2. Check the DataLoader batch\n",
|
||||
"# batch_features, batch_labels = next(iter(LR_dataloader))\n",
|
||||
"# # print(f\"Batch - Features: {batch_features.shape}, Labels: {batch_labels.shape}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ed9c57c2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"evaluate_performance(model_adv,LR_dataloader,device)\n",
|
||||
"\n",
|
||||
"print(\"\\n--- 2. Testing Advanced model ---\")\n",
|
||||
"test_acc, test_f1 = evaluate_performance(model_adv, LR_dataloader, device)\n",
|
||||
"print(f\"Test Accuracy: {test_acc:.2f}%\")\n",
|
||||
"print(f\"Test F1 Score: {test_f1:.4f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "74127f71",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"\\n--- 2. Testing BASE-Model ---\")\n",
|
||||
"test_acc, test_f1 = evaluate_performance(model, LR_dataloader, device)\n",
|
||||
"print(f\"Test Accuracy: {test_acc:.2f}%\")\n",
|
||||
"print(f\"Test F1 Score: {test_f1:.4f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "33c54c0e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
51
src/models/baseline.py
Normal file
51
src/models/baseline.py
Normal file
@@ -0,0 +1,51 @@
|
||||
import pickle
|
||||
from typing import override, Callable
|
||||
from constants import TRAINING_DIR, MODEL_DIR
|
||||
from labels import Label
|
||||
from models.model import Model
|
||||
from helper import dataset_iterator, default_labelling
|
||||
import pandas as pd
|
||||
from random import random
|
||||
|
||||
class Baseline_model(Model):
    """Trivial baseline: guesses FAKE with the probability seen in training.

    The only learned parameter is the fraction of FAKE articles in the
    training set; classification samples from that Bernoulli distribution.
    """

    def __init__(self, model_filename:str="", label_translator: Callable[[str], Label] = default_labelling) -> None:
        # P(article is FAKE); overwritten by train() or load().
        self.fake_probability = 0
        super().__init__(model_filename, label_translator)

    @override
    def train(self, training_dataset:str, hyperparameters:dict[str, float]={}) -> None:
        """Estimate P(FAKE) from the 'type' column of the training parquet.

        `hyperparameters` is accepted for interface compatibility but unused.
        """
        fake_amount = 0
        real_amount = 0
        total_amount = 0

        # Stream the dataset in chunks so large parquets fit in memory.
        for chunk in dataset_iterator(f"{TRAINING_DIR}/{training_dataset}", columns=['type']):
            chunk_fake_amount = (chunk['type'].map(self.label_translator) == Label.FAKE).sum()

            fake_amount += chunk_fake_amount
            real_amount += len(chunk) - chunk_fake_amount
            total_amount += len(chunk)

        self.fake_probability = fake_amount/total_amount

    @override
    def classify(self, input:pd.Series) -> Label:
        """Randomly label the article FAKE with probability `fake_probability`."""
        if random() <= self.fake_probability:
            return Label.FAKE
        return Label.REAL

    @override
    def save(self, filename:str) -> None:
        """Pickle the learned probability and label translator to MODEL_DIR.

        BUG FIX: the path was hard-coded instead of using `filename`.
        """
        data = {}
        data["label_translator"] = self.label_translator
        data["fake_probability"] = self.fake_probability

        with open(f"{MODEL_DIR}/{filename}", 'wb') as file:
            pickle.dump(data, file)

    @override
    def load(self, filename:str) -> None:
        """Restore a model previously written by save().

        BUG FIX: the path was hard-coded instead of using `filename`.
        """
        with open(f"{MODEL_DIR}/{filename}", 'rb') as file:
            data = pickle.load(file)

        self.label_translator = data["label_translator"]
        self.fake_probability = data["fake_probability"]
||||
57
src/models/gradient_boosting.py
Normal file
57
src/models/gradient_boosting.py
Normal file
@@ -0,0 +1,57 @@
|
||||
from constants import TRAINING_DIR, MODEL_DIR
|
||||
from models.model import Model
|
||||
from labels import Label
|
||||
from helper import default_labelling
|
||||
from typing import override, Callable
|
||||
import pandas as pd
|
||||
import pickle
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.ensemble import GradientBoostingClassifier
|
||||
|
||||
def no_tokenization(text):
    """Identity 'tokenizer' for TfidfVectorizer.

    The corpus is already tokenized and space-joined upstream, so splitting
    on a single space recovers the original tokens. The parameter was
    renamed from `str`, which shadowed the builtin.
    """
    return text.split(" ")
||||
|
||||
class Gradient_boosting_model(Model):
    """TF-IDF + GradientBoostingClassifier over pre-tokenized articles."""

    def __init__(self, model_filename: str = "", label_translator: Callable[[str], Label] = default_labelling) -> None:
        super().__init__(model_filename, label_translator)

    @override
    def train(self, training_dataset: str, hyperparameters: dict[str, float] = {}) -> None:
        """Fit the pipeline on up to 250,000 rows of the training parquet.

        `hyperparameters` is accepted for interface compatibility but unused.
        """
        print("this model takes around 10 hours to train")

        # PERF FIX: read both columns in a single pass instead of loading
        # the parquet file twice.
        df = pd.read_parquet(f"{TRAINING_DIR}/{training_dataset}", columns=['tokens', 'type'])
        X = df['tokens'].apply(lambda token_list: " ".join(token_list))
        Y = df['type'].apply(lambda label: self.label_translator(label).value)

        # Cap the training size to keep the (already ~10h) fit tractable.
        X = X[:250000]
        Y = Y[:250000]

        model = Pipeline([
            ("L string", TfidfVectorizer(tokenizer=no_tokenization)),
            ("forest", GradientBoostingClassifier(random_state=0, n_estimators=4000))
        ])

        model.fit(X, Y)
        self.model = model

    @override
    def classify(self, input: pd.Series) -> Label:
        """Predict one article's label from its space-joined tokens."""
        X = " ".join(input['tokens'])
        return Label(self.model.predict([X])[0])

    @override
    def save(self, filename: str) -> None:
        """Pickle the fitted pipeline and label translator to MODEL_DIR.

        BUG FIX: the path was hard-coded instead of using `filename`.
        """
        data = {}
        data["label_translator"] = self.label_translator
        data["model"] = self.model
        with open(f"{MODEL_DIR}/{filename}", 'wb') as file:
            pickle.dump(data, file)

    @override
    def load(self, filename: str) -> None:
        """Restore a model previously written by save().

        BUG FIX: the path was hard-coded instead of using `filename`.
        """
        with open(f"{MODEL_DIR}/{filename}", 'rb') as file:
            data = pickle.load(file)
        self.label_translator = data["label_translator"]
        self.model = data["model"]
||||
133
src/models/logistic_regression.py
Normal file
133
src/models/logistic_regression.py
Normal file
@@ -0,0 +1,133 @@
|
||||
import pickle
|
||||
from typing import override, Callable
|
||||
from scipy.sparse import lil_array
|
||||
from constants import TRAINING_DIR, MODEL_DIR
|
||||
from labels import Label
|
||||
from models.model import Model
|
||||
from helper import dataset_iterator
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from helper import default_labelling
|
||||
|
||||
class Logistic_model(Model):
    """Logistic regression over bag-of-words counts of the 10,000 most
    frequent tokens, optionally extended with one-hot features for the
    1,000 most frequent domains (hyperparameters["metadata"] == 1)."""

    def __init__(self, model_filename: str = "", label_translator: Callable[[str], Label] = default_labelling) -> None:
        super().__init__(model_filename, label_translator)

    @override
    def train(self, training_dataset: str, hyperparameters: dict[str, float] = {}) -> None:
        """Two passes over the data: build id maps, then vectorise and fit."""
        token_counts:dict[str, int] = {}
        sorted_token_counts:dict[str, int] = {}
        token_id:dict[str, int] = {} # converts top 10K words to id's.

        domain_counts:dict[str, int] = {}
        sorted_domain_counts:dict[str, int] = {}
        domain_id:dict[str, int] = {} # converts top 1,000 domains to id's.

        self.consider_metadata = False
        if "metadata" in hyperparameters and hyperparameters["metadata"] == 1:
            self.consider_metadata = True

        # Pass 1: count token and domain frequencies (chunked to bound RAM).
        columns = ["tokens", "domain"]
        rows_processed = 0
        for chunk in dataset_iterator(f"{TRAINING_DIR}/{training_dataset}", columns=columns):
            rows_processed += len(chunk)
            for _, row in chunk.iterrows():
                for token in row['tokens']:
                    if token not in token_counts:
                        token_counts[token] = 0
                    token_counts[token] += 1
                if row['domain'] not in domain_counts:
                    domain_counts[row['domain']] = 0
                domain_counts[row['domain']] += 1
        for token in sorted(token_counts, key=lambda token: token_counts[token], reverse=True):
            sorted_token_counts[token] = token_counts[token]
        for domain in sorted(domain_counts, key=lambda domain: domain_counts[domain], reverse=True):
            sorted_domain_counts[domain] = domain_counts[domain]

        # Assign ids 0..9999 to the most frequent tokens...
        idx = 0
        for token in sorted_token_counts:
            token_id[token] = idx
            idx += 1
            if idx >= 10000:
                break

        # ...and ids 0..999 to the most frequent domains.
        idx = 0
        for domain in sorted_domain_counts:
            domain_id[domain] = idx
            idx += 1
            if idx >= 1000:
                break

        if self.consider_metadata: # consider things other than tokens
            X = lil_array((rows_processed, 11000), dtype="float64")
        else:
            X = lil_array((rows_processed, 10000), dtype="float64") # non-sparse array uses 74GiB ram on 995,000_rows. Sklearn LogisticRegression supports sparse arrays though. It still uses 9+ now.

        Y = np.zeros(rows_processed, dtype=int)

        # Pass 2: fill the sparse design matrix and the label vector.
        columns.append("type")
        article_num = 0
        for chunk in dataset_iterator(f"{TRAINING_DIR}/{training_dataset}", columns=columns):
            for _, row in chunk.iterrows():
                tokens = row['tokens']
                article_type = row['type']

                article_word_counts = np.zeros(10000)
                for token in tokens:
                    if token not in token_id:
                        continue # if they are not in top 10K vocab we can ignore them
                    article_word_counts[token_id[token]] += 1
                X[article_num, :10000] = article_word_counts
                if self.consider_metadata:
                    # Columns 10000..10999 are a one-hot of the domain.
                    if row['domain'] in domain_id:
                        X[article_num, 10000+domain_id[row['domain']]] = 1

                Y[article_num] = self.label_translator(article_type).value
                article_num += 1

        self.regression_model = LogisticRegression(max_iter=10000, n_jobs = -1, class_weight="balanced").fit(X, Y)
        self.token_id = token_id
        self.domain_id = domain_id

    @override
    def classify(self, input: pd.Series) -> Label:
        """Vectorise one article the same way as train() and predict."""
        if self.consider_metadata:
            x = np.zeros(11000)
        else:
            x = np.zeros(10000)

        for token in input['tokens']:
            if token not in self.token_id:
                continue
            x[self.token_id[token]] += 1

        if self.consider_metadata:
            if input['domain'] in self.domain_id:
                x[10000+self.domain_id[input['domain']]] = 1

        prediction = self.regression_model.predict([x])[0]
        return Label(prediction)

    @override
    def save(self, filename: str) -> None:
        """Pickle the fitted model and its id maps to MODEL_DIR.

        BUG FIX: the path was hard-coded instead of using `filename`.
        """
        data = {}
        data["label_translator"] = self.label_translator
        data["regression_model"] = self.regression_model
        data["token_id"] = self.token_id
        data["domain_id"] = self.domain_id
        data["consider_metadata"] = self.consider_metadata

        with open(f"{MODEL_DIR}/{filename}", 'wb') as file:
            pickle.dump(data, file)

    @override
    def load(self, filename: str) -> None:
        """Restore a model previously written by save().

        BUG FIX: the path was hard-coded instead of using `filename`.
        """
        with open(f"{MODEL_DIR}/{filename}", 'rb') as file:
            data = pickle.load(file)

        self.label_translator = data["label_translator"]
        self.regression_model = data["regression_model"]
        self.token_id = data["token_id"]
        self.domain_id = data["domain_id"]
        self.consider_metadata = data["consider_metadata"]
||||
61
src/models/model.py
Normal file
61
src/models/model.py
Normal file
@@ -0,0 +1,61 @@
|
||||
from abc import ABC, abstractmethod
|
||||
import pandas as pd
|
||||
from time import perf_counter
|
||||
|
||||
from constants import TESTING_DIR, VALIDATION_DIR
|
||||
from helper import LIAR_labelling, dataset_iterator, default_labelling
|
||||
from labels import Label
|
||||
from typing import Callable
|
||||
|
||||
class Model(ABC):
    """Abstract interface shared by all fake-news classifiers.

    Subclasses implement train/classify/save/load; the base class handles
    construction (optionally restoring a saved model) and evaluation.
    """

    def __init__(self, model_filename:str="", label_translator: Callable[[str], Label] = default_labelling) -> None:
        # Maps a raw dataset 'type' string to a binary Label.
        self.label_translator = label_translator
        # A non-empty filename means "restore a previously saved model".
        if model_filename:
            self.load(model_filename)

    @abstractmethod
    def train(self, training_dataset:str, hyperparameters:dict[str, float]) -> None:
        """Fit the model on the named dataset in TRAINING_DIR."""
        pass

    @abstractmethod
    def classify(self, input:pd.Series) -> Label:
        """Predict the label of one article (a dataframe row)."""
        pass

    @abstractmethod
    def save(self, filename:str) -> None:
        """Persist the model under MODEL_DIR."""
        pass

    @abstractmethod
    def load(self, filename:str) -> None:
        """Restore a model previously written by save()."""
        pass

    def test(self, test_dataset: str, validate:bool=True) -> tuple[float, float, float, float]:
        """Evaluate on a validation (default) or test split.

        FAKE is treated as the positive class. Prints and returns
        (accuracy, recall, precision, F1). The dead `TP = TN = FP = FN = 0`
        initialisations were removed: all four are assigned below.
        """
        # The LIAR dataset uses its own fine-grained labels, so swap in the
        # matching translator. NOTE(review): this mutates the instance for
        # all subsequent calls — confirm that is intended.
        if test_dataset == "LIAR.parquet":
            self.label_translator = LIAR_labelling

        dataset_dir = VALIDATION_DIR if validate else TESTING_DIR
        df = pd.read_parquet(f"{dataset_dir}/{test_dataset}")

        expected = df['type'].apply(self.label_translator)
        predicted = df.apply(self.classify, axis=1)

        TP = ((expected == Label.FAKE) & (predicted == Label.FAKE)).sum()
        FP = ((expected == Label.REAL) & (predicted == Label.FAKE)).sum()
        TN = ((expected == Label.REAL) & (predicted == Label.REAL)).sum()
        FN = ((expected == Label.FAKE) & (predicted == Label.REAL)).sum()

        # Every ratio is guarded against division by zero.
        accuracy = (TP + TN) / (TP + TN + FP + FN) if (TP + TN + FP + FN) > 0 else 0
        recall = (TP) / (TP + FN) if (TP + FN) > 0 else 0
        precision = (TP) / (TP + FP) if (TP + FP) > 0 else 0
        F1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        print(f"Accuracy {accuracy}")
        print(f"Recall {recall}")
        print(f"precision {precision}")
        print(f"F1-score {F1}")
        return (accuracy, recall, precision, F1)
||||
1189
src/models/nn.ipynb
Normal file
1189
src/models/nn.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
579
src/models/nn.ju.py
Normal file
579
src/models/nn.ju.py
Normal file
@@ -0,0 +1,579 @@
|
||||
# %%
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import pandas as pd
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
from collections import Counter
|
||||
import os
|
||||
import sys
|
||||
sys.path.append(os.path.join(os.getcwd(), '../'))
|
||||
from helper import default_labelling
|
||||
from sklearn.metrics import f1_score
|
||||
import numpy as np
|
||||
|
||||
|
||||
# %%
|
||||
label_map = {
|
||||
'Label.FAKE': 0,
|
||||
'Label.REAL': 1}
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
# Pipelining process
|
||||
"""
|
||||
|
||||
# %%
|
||||
df = pd.read_parquet("../../data/training/995,000_rows.parquet", columns=['tokens','type'])
|
||||
|
||||
|
||||
df['label'] = df['type'].apply(default_labelling).astype(str)
|
||||
df['label'] = df['label'].map(label_map).astype(int)
|
||||
df = df.drop(columns=['type'])
|
||||
|
||||
# %%
|
||||
df_test = pd.read_parquet("../../data/testing/995,000_rows.parquet", columns=['tokens','type'])
|
||||
|
||||
df_test['label'] = df_test['type'].apply(default_labelling).astype(str)
|
||||
df_test['label'] = df_test['label'].map(label_map).astype(int)
|
||||
df_test = df_test.drop(columns=['type'])
|
||||
|
||||
# %%
|
||||
df_val = pd.read_parquet("../../data/validation/995,000_rows.parquet", columns=['tokens','type'])
|
||||
df_val['label'] = df_val['type'].apply(default_labelling).astype(str)
|
||||
df_val['label'] = df_val['label'].map(label_map).astype(int)
|
||||
df_val = df_val.drop(columns=['type'])
|
||||
|
||||
# %%
|
||||
# print("Loading Parquet file...")
|
||||
|
||||
# # Check the total number of rows (articles)
|
||||
# print(f"Total rows in the raw Parquet file: {len(df)}")
|
||||
|
||||
# # Look at the first few rows to make sure the data looks correct
|
||||
# print("\n--- First 3 Rows ---")
|
||||
# print(df.head(3))
|
||||
|
||||
# %%
|
||||
# Token frequency census over the whole training corpus.
word_counts = Counter()
for token_list in df['tokens']:
    word_counts.update(token_list)

# Vocabulary = 2 reserved ids + the 50,000 most frequent words.
# Id 0 is <PAD> (padding), id 1 is <UNK> (out-of-vocabulary words).
vocab = {"<PAD>": 0, "<UNK>": 1}
for idx, (word, _count) in enumerate(word_counts.most_common(50000), start=2):
    vocab[word] = idx

print(f"Vocabulary built with {len(vocab)} words.")
||||
|
||||
# %%
|
||||
# Create a Custom PyTorch Datase
|
||||
|
||||
# a wrapper for the data that PyTorch knows how to talk to.
|
||||
class FakeNewsDataset(Dataset):
    """Map-style dataset turning pre-tokenized articles into tensors.

    Each item is a (token-id tensor of fixed length, label tensor) pair,
    ready for a DataLoader to batch.
    """

    def __init__(self, dataframe, vocab, max_length=256):
        self.dataframe = dataframe
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        # Number of articles; DataLoader uses this to know when to stop.
        return len(self.dataframe)

    def __getitem__(self, idx):
        # One article (tokens + label) per call.
        row = self.dataframe.iloc[idx]

        # Map each token to its vocabulary id; unknown words become <UNK> (id 1).
        ids = [self.vocab.get(token, 1) for token in row['tokens']]

        # Force every article to exactly max_length ids:
        # truncate long ones, pad short ones with <PAD> (id 0).
        ids = ids[:self.max_length]
        ids += [0] * (self.max_length - len(ids))

        return torch.tensor(ids, dtype=torch.long), torch.tensor(row['label'], dtype=torch.long)
||||
|
||||
|
||||
# %%
|
||||
## Prepare the DataLoader
|
||||
# Wrap The dataframe in the Dataset class
|
||||
|
||||
# The DataLoader feeds the data to the model in batches (e.g., 64 articles at a time)
|
||||
# This prevents the computer from running out of RAM!
|
||||
|
||||
|
||||
my_train_dataset = FakeNewsDataset(dataframe=df, vocab=vocab, max_length=256)
|
||||
# Shuffle is true for training so the data keeps getting shuffled when trained and the model does not memorise the data
|
||||
train_dataloader = DataLoader(my_train_dataset, batch_size=64, shuffle=True,num_workers=4, # Start with 4; if CPU stays cool, try 6
|
||||
pin_memory=True, # Essential for fast data transfer
|
||||
prefetch_factor=2)
|
||||
|
||||
|
||||
val_data = FakeNewsDataset(dataframe=df_val, vocab=vocab, max_length=256)
|
||||
val_dataloader = DataLoader(val_data, batch_size=64, shuffle=False)
|
||||
|
||||
test_data = FakeNewsDataset(dataframe=df_test, vocab=vocab, max_length=256)
|
||||
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=False)
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
Checking if the data conversion works
|
||||
"""
|
||||
|
||||
# %%
|
||||
# features, labels = next(iter(train_dataloader))
|
||||
# # 2. Check the shapes (the dimensions of your tensors)
|
||||
# print("--- Tensor Shapes ---")
|
||||
# print(f"Features shape: {features.shape}")
|
||||
# print(f"Labels shape: {labels.shape}")
|
||||
|
||||
# # 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)
|
||||
# print("\n--- Data Types ---")
|
||||
# print(f"Features dtype: {features.dtype}")
|
||||
# print(f"Labels dtype: {labels.dtype}")
|
||||
|
||||
# # 4. Peek at the actual data for the very first article in this batch
|
||||
# print("\n--- First Article Peek ---")
|
||||
# print(f"Label: {labels[0].item()} (0 = Real, 1 = Fake)")
|
||||
# print(f"Tokens (first 20 IDs): {features[0][:20]}")
|
||||
|
||||
# %%
|
||||
class BaseModel(nn.Module):
    """Mean-of-embeddings ("bag of vectors") classifier.

    Embeds every token id, averages the vectors across the sequence, then
    pushes the single summary vector through a small MLP to get raw
    Real/Fake class scores.
    """

    def __init__(self, vocab_size, embed_dim=32, h1=256, h2=128, out_features=2):
        super().__init__()
        # Token-id -> dense vector lookup table.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # Two hidden layers plus the output head.
        # (Attribute names kept so existing state_dicts still load.)
        self.fc1 = nn.Linear(embed_dim, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.out = nn.Linear(h2, out_features)

    def forward(self, x):
        # x: (batch, seq_len) integer token ids.
        # Embed, then average over the sequence -> (batch, embed_dim).
        pooled = self.embedding(x).mean(dim=1)

        # MLP with ReLU activations.
        hidden = F.relu(self.fc2(F.relu(self.fc1(pooled))))

        # Raw (unnormalised) class scores; CrossEntropyLoss applies softmax.
        return self.out(hidden)
||||
model_basic =BaseModel(vocab_size=len((vocab)))
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
'Advanced'
|
||||
"""
|
||||
|
||||
# %%
|
||||
|
||||
class advanced_model(nn.Module):
    """Bidirectional stacked-GRU classifier.

    Embeds token ids, runs a bidirectional GRU over the sequence, and
    classifies from the concatenated final forward/backward hidden states.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128, num_layer=2, out_features=2):
        super().__init__()

        # Token-id -> dense vector lookup table.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

        # BUG FIX: the layer count was hard-coded to 2, silently ignoring the
        # `num_layer` parameter. It is now honoured; the default of 2 keeps
        # the previous behaviour for existing callers.
        self.gru = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, num_layers=num_layer,
                          batch_first=True, bidirectional=True,
                          dropout=0.3)

        # NOTE(review): `self.out` is never used by forward(); it is kept
        # only so previously saved state_dicts still load without errors.
        self.out = nn.Linear(hidden_dim, out_features)
        # Classification head over [last forward state ; last backward state].
        self.fc = nn.Linear(hidden_dim * 2, out_features)

    def forward(self, x):
        # x: (batch, seq_len) integer token ids.
        x = self.embedding(x)  # -> (batch, seq_len, embed_dim)

        # Only the GRU's final hidden states are needed, not the per-step
        # outputs, hence the discarded first return value.
        _, hidden = self.gru(x)

        # hidden[-2] is the last layer's forward state, hidden[-1] its
        # backward state; concatenate them into one feature vector.
        out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

        return self.fc(out)
||||
|
||||
# Initilize
|
||||
model_adv = advanced_model(vocab_size=len(vocab))
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
# Training
|
||||
|
||||
"""
|
||||
|
||||
# %%
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
# %%
|
||||
def evaluate_performance(model, dataloader, device):
    """Run `model` over `dataloader` and return (accuracy %, macro-F1)."""
    model.eval()  # disable training-only behaviour (e.g. dropout)

    predicted_labels = []
    true_labels = []

    # Inference only: no gradient bookkeeping needed.
    with torch.no_grad():
        for features, labels in dataloader:
            scores = model(features.to(device))
            labels = labels.to(device)

            # Highest-scoring class (0 or 1) is the prediction.
            guesses = scores.argmax(dim=1)

            predicted_labels.extend(guesses.cpu().numpy().flatten().tolist())
            true_labels.extend(labels.cpu().numpy().flatten().tolist())

    predicted_labels = np.array(predicted_labels)
    true_labels = np.array(true_labels)

    accuracy = (predicted_labels == true_labels).mean() * 100

    # Macro averaging weighs both classes equally.
    f1 = f1_score(true_labels, predicted_labels, average='macro')

    model.train()  # restore training mode for any subsequent training
    return accuracy, f1
||||
|
||||
|
||||
# %%
|
||||
def train_model(model, train_loader, val_loader, device, epochs=5, lr=0.001):
    """Train `model` with Adam + cross-entropy, validating after each epoch.

    Returns a history dict with per-epoch train loss, validation accuracy
    and validation macro-F1, for later plotting.
    """
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Per-epoch metrics for the report.
    history = {'train_loss': [], 'val_acc': [], 'val_f1': []}

    print(f"Training {model.__class__.__name__} on {device}...")

    for epoch in range(epochs):
        model.train()
        running_loss = 0

        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)

            # Standard step: reset grads, forward, loss, backward, update.
            optimizer.zero_grad()
            loss = criterion(model(features), labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_loss = running_loss / len(train_loader)

        # Held-out check after every pass over the training data.
        val_acc, val_f1 = evaluate_performance(model, val_loader, device)

        history['train_loss'].append(avg_loss)
        history['val_acc'].append(val_acc)
        history['val_f1'].append(val_f1)

        print(f"\n Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f} \n Val Acc: {val_acc:.2f}% \n Val F1: {val_f1:.4f}")

    return history  # Return the results so we can plot them later
||||
|
||||
# %%
|
||||
train_995_basic =train_model (model_basic, train_dataloader, val_dataloader, device, epochs =7 )
|
||||
print(train_995_basic )
|
||||
|
||||
# %%
|
||||
train_995_adv =train_model (model_adv, train_dataloader, val_dataloader, device, epochs =7 )
|
||||
print(train_995_adv )
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
# Evaluation
|
||||
"""
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
Basic model
|
||||
"""
|
||||
|
||||
# %%
|
||||
|
||||
# # 1. The Evaluation Function
|
||||
# def evaluate_performance(model, dataloader, device):
|
||||
# model.eval() # Put model in evaluation mode
|
||||
|
||||
# all_predictions = []
|
||||
# all_true_labels = []
|
||||
|
||||
# # Turn off gradient tracking to save memory
|
||||
# with torch.no_grad():
|
||||
# for features, labels in dataloader:
|
||||
# features = features.to(device)
|
||||
# labels = labels.to(device)
|
||||
|
||||
# # Get model scores
|
||||
# scores = model(features)
|
||||
|
||||
# # Find the predicted class (0 or 1)
|
||||
# _, predictions = torch.max(scores,1)
|
||||
|
||||
# # Save predictions and actual labels to lists
|
||||
# # all_predictions.extend(predictions.cpu().tolist())
|
||||
# # all_true_labels.extend(labels.cpu().tolist())
|
||||
# all_predictions.extend(predictions.cpu().numpy().flatten().tolist())
|
||||
# all_true_labels.extend(labels.cpu().numpy().flatten().tolist())
|
||||
|
||||
# all_predictions = np.array(all_predictions)
|
||||
# all_true_labels = np.array(all_true_labels)
|
||||
|
||||
# accuracy = (all_predictions == all_true_labels).mean() * 100
|
||||
|
||||
# # 4. Calculate F1 Score
|
||||
# # average='macro' is best for your report to show you care about both classes equally
|
||||
# f1 = f1_score(all_true_labels, all_predictions, average='macro')
|
||||
# model.train() # Return model to training mode just in case
|
||||
# return accuracy, f1
|
||||
# # # Change me based on the model
|
||||
|
||||
# # model = model_basic.to(device)
|
||||
|
||||
|
||||
# # print(f"Training on: {device}")
|
||||
|
||||
# # # 2. Setup Loss and Optimizer
|
||||
# # # CrossEntropyLoss is the standard for classification tasks
|
||||
# # criterion = nn.CrossEntropyLoss()
|
||||
# # # Adam is a very reliable, fast optimizer
|
||||
# # optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
|
||||
|
||||
# # # 3. The Training Loop
|
||||
# # epochs = 7# Start with a small number of passes through the whole dataset
|
||||
|
||||
# # for epoch in range(epochs):
|
||||
# # model.train() # Tell the model it is in training mode
|
||||
# # total_loss = 0
|
||||
|
||||
# # # Loop through our batches of 64 articles
|
||||
# # for batch_idx, (features, labels) in enumerate(train_dataloader):
|
||||
|
||||
# # # Move data to the same device as the model (GPU/CPU)
|
||||
# # features = features.to(device)
|
||||
# # labels = labels.to(device)
|
||||
|
||||
# # # Step A: Reset the optimizer's gradients
|
||||
# # optimizer.zero_grad()
|
||||
|
||||
# # # Step B: Forward Pass (Have the model guess Real or Fake)
|
||||
# # predictions = model(features)
|
||||
|
||||
# # # Step C: Calculate Loss (How wrong were the guesses?)
|
||||
# # loss = criterion(predictions, labels)
|
||||
|
||||
# # # Step D: Backward Pass (Calculate how to fix the math)
|
||||
# # loss.backward()
|
||||
|
||||
# # # Step E: Optimize (Actually apply the fixes to the model's weights)
|
||||
# # optimizer.step()
|
||||
|
||||
# # total_loss += loss.item()
|
||||
|
||||
# # # Print an update every 100 batches so we know it's working
|
||||
# # if batch_idx % 100 == 0:
|
||||
# # print(f"Epoch [{epoch+1}/{epochs}] | Batch {batch_idx} | Loss: {loss.item():.4f}")
|
||||
|
||||
# # # Print the average loss at the end of each epoch
|
||||
# # avg_loss = total_loss / len(train_dataloader)
|
||||
# # print(f"--- End of Epoch {epoch+1} | Average Loss: {avg_loss:.4f} ---")
|
||||
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
Advanced model
|
||||
|
||||
"""
|
||||
|
||||
# %%
|
||||
|
||||
# # 1. The Evaluation Function
|
||||
# def evaluate_performance(model_adv, dataloader, device):
|
||||
# model_adv.eval() # Put model in evaluation mode
|
||||
|
||||
# all_predictions = []
|
||||
# all_true_labels = []
|
||||
|
||||
# # Turn off gradient tracking to save memory
|
||||
# with torch.no_grad():
|
||||
# for features, labels in dataloader:
|
||||
# features = features.to(device)
|
||||
# labels = labels.to(device)
|
||||
|
||||
# # Get model scores
|
||||
# scores = model_adv(features)
|
||||
|
||||
# # Find the predicted class (0 or 1)
|
||||
# _, predictions = scores.max(1)
|
||||
|
||||
# # Save predictions and actual labels to lists
|
||||
# all_predictions.extend(predictions.cpu().tolist())
|
||||
# all_true_labels.extend(labels.cpu().tolist())
|
||||
|
||||
# # Calculate Accuracy
|
||||
# correct_guesses = sum(p == t for p, t in zip(all_predictions, all_true_labels))
|
||||
# accuracy = (correct_guesses / len(all_true_labels)) * 100
|
||||
|
||||
# # Calculate F1 Score
|
||||
# f1 = f1_score(all_true_labels, all_predictions, average='macro')
|
||||
|
||||
# model_adv.train() # Return model to training mode just in case
|
||||
# return accuracy, f1
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
print("Basic model ")
print(" Validation ")
# BUG FIX: this script never defines a bare `model` — the basic network is
# trained as `model_basic` — so the old code raised a NameError here.
val_acc995, val_f1_995 = evaluate_performance(model_basic, val_dataloader, device)
print(f"Validation Accuracy: {val_acc995:.2f}%")
print(f"Validation F1 Score: {val_f1_995:.4f}")

print("\n Testing Phase ")
test_acc995, test_f1_995 = evaluate_performance(model_basic, test_dataloader, device)
print(f"Test Accuracy: {test_acc995:.2f}%")
# BUG FIX: removed the stray "git " that had leaked into the output string.
print(f"Test F1 Score: {test_f1_995:.4f}")
||||
|
||||
# %%
|
||||
|
||||
|
||||
print(" GURU model ")
print(" Validation ")
# BUG FIX: the F1 was assigned to `val_f1995` but the stale `val_f1_995`
# (belonging to the basic model) was printed; use one consistent name.
adv_val_acc995, adv_val_f1_995 = evaluate_performance(model_adv, val_dataloader, device)
print(f"Validation Accuracy: {adv_val_acc995:.2f}%")
print(f"Validation F1 Score: {adv_val_f1_995:.4f}")

print("\n Testing ")
test_acc, test_f1 = evaluate_performance(model_adv, test_dataloader, device)
# BUG FIX: `test_acc955` was a NameError (the variable is `test_acc`),
# and the stray "git " has been removed from the F1 line.
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: {test_f1:.4f}")
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
# Liar data
|
||||
|
||||
|
||||
"""
|
||||
|
||||
# %%
|
||||
from helper import LIAR_labelling

# Load the held-out LIAR test split (only the columns we need).
# BUG FIX: removed a stray, no-op string expression that pointed at the
# *training* path ("../../data/training/LIAR.parquet") but was never used.
df_LIAR = pd.read_parquet("../../data/testing/LIAR.parquet", columns=['tokens', 'type'])

# Collapse LIAR's fine-grained truthfulness ratings into this project's label
# scheme, then map the string labels onto the integer ids used in training.
df_LIAR['label'] = df_LIAR['type'].apply(LIAR_labelling).astype(str)
df_LIAR['label'] = df_LIAR['label'].map(label_map).astype(int)
df_LIAR = df_LIAR.drop(columns=['type'])
|
||||
|
||||
# %%
|
||||
df_LIAR.head()  # quick visual sanity check of the relabelled frame
|
||||
|
||||
# %%
|
||||
# Count how many times each token occurs in the corpus.
word_counts = Counter()
for x in df_LIAR['tokens']:
    word_counts.update(x)

# Keep the top 50,000 words.
# Index 0 is for <PAD> (padding), Index 1 is for <UNK> (unknown words)
# NOTE(review): this rebuilds the vocabulary from the LIAR *test* data, so the
# resulting token ids will not match the vocabulary the models were trained
# with — most LIAR tokens will hit different embedding rows. Confirm whether
# the training vocabulary should be reused here instead.
vocab = {"<PAD>": 0, "<UNK>": 1}
for idx, (word, count) in enumerate(word_counts.most_common(50000), start=2):
    vocab[word] = idx

print(f"Vocabulary built with {len(vocab)} words.")
|
||||
|
||||
# %%
|
||||
|
||||
# Wrap the LIAR frame in the same Dataset/DataLoader pipeline used for the
# 995k data; shuffle=False because this loader is only used for evaluation.
LR_DATA = FakeNewsDataset(dataframe=df_LIAR, vocab=vocab, max_length=256)
LR_dataloader = DataLoader(LR_DATA, batch_size=32, shuffle=False)
|
||||
|
||||
# %%
|
||||
# 1. Sanity-check a single batch from the LIAR dataloader before evaluating.
features, labels = next(iter(LR_dataloader))

# 2. Check the shapes (the dimensions of your tensors)
print("--- Tensor Shapes ---")
print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")

# 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)
print("\n--- Data Types ---")
print(f"Features dtype: {features.dtype}")
print(f"Labels dtype: {labels.dtype}")

# 4. Peek at the actual data for the very first article in this batch
print("\n--- First Article Peek ---")
print(f"Label: {labels[0].item()} (0 = Real, 1 = Fake)")
print(f"Tokens (first 20 IDs): {features[0][:20]}")
|
||||
|
||||
# %%
|
||||
# # 1. Check a single sample from the Dataset directly
|
||||
# single_features, single_label = LR_DATA[0]
|
||||
# print(f"Single Sample - Features: {single_features.shape}, Label: {single_label.shape}")
|
||||
|
||||
# # 2. Check the DataLoader batch
|
||||
# batch_features, batch_labels = next(iter(LR_dataloader))
|
||||
# # print(f"Batch - Features: {batch_features.shape}, Labels: {batch_labels.shape}")
|
||||
|
||||
# %%
|
||||
# --- Evaluate the adversarially-trained model on LIAR ---
# BUG FIX: removed a duplicate evaluate_performance() call whose result was
# discarded — it only doubled the evaluation time.
# Also fixed "Avanced" -> "Advanced" and removed a stray "git " in the output.
print("\n--- 2. Testing Advanced model ---")
test_acc, test_f1 = evaluate_performance(model_adv, LR_dataloader, device)
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: {test_f1:.4f}")
|
||||
|
||||
# %%
|
||||
|
||||
# --- Evaluate the baseline model on LIAR ---
print("\n--- 2. Testing BASE-Model ---")
test_acc, test_f1 = evaluate_performance(model, LR_dataloader, device)
print(f"Test Accuracy: {test_acc:.2f}%")
# BUG FIX: removed a stray "git " from the output string.
print(f"Test F1 Score: {test_f1:.4f}")
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
52
src/models/svm.py
Normal file
52
src/models/svm.py
Normal file
@@ -0,0 +1,52 @@
|
||||
from constants import TRAINING_DIR, MODEL_DIR
|
||||
from models.model import Model
|
||||
from labels import Label
|
||||
from helper import default_labelling
|
||||
from typing import override, Callable
|
||||
import pandas as pd
|
||||
import pickle
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.svm import LinearSVC
|
||||
|
||||
def no_tokenization(text: str) -> list[str]:
    """Pass-through "tokenizer" for TfidfVectorizer: the corpus is already
    tokenized, so just split the space-joined token string back apart.

    BUG FIX: the parameter used to be named ``str``, shadowing the builtin.
    """
    return text.split(" ")
|
||||
|
||||
class SVM_model(Model):
    """Linear-SVM fake-news classifier: TF-IDF features over pre-tokenized
    text fed into a LinearSVC, wrapped in the project's common Model interface."""

    def __init__(self, model_filename: str = "", label_translator: Callable[[str], Label] = default_labelling) -> None:
        super().__init__(model_filename, label_translator)

    @override
    def train(self, training_dataset: str, hyperparameters: dict[str, float] = {}) -> None:
        """Fit the TF-IDF + LinearSVC pipeline on a parquet training file.

        BUG FIX: the dataset was previously read from disk twice (once per
        column); it is now loaded in a single pass.
        """
        df = pd.read_parquet(f"{TRAINING_DIR}/{training_dataset}", columns=['tokens', 'type'])
        # The cleaning pipeline stores token lists; re-join them with spaces so
        # the vectorizer (with our pass-through tokenizer) can consume them.
        X = df['tokens'].apply(lambda token_list: " ".join(token_list))
        Y = df['type'].apply(lambda label: self.label_translator(label).value)

        model = Pipeline([
            ("L string", TfidfVectorizer(tokenizer=no_tokenization)),
            ("svm", LinearSVC(random_state=0))
        ])

        model.fit(X, Y)
        self.model = model

    @override
    def classify(self, input: pd.Series) -> Label:
        """Classify one row (expects a 'tokens' list) and return its Label."""
        X = " ".join(input['tokens'])
        return Label(self.model.predict([X])[0])

    @override
    def save(self, filename: str) -> None:
        """Pickle the fitted pipeline and label translator into MODEL_DIR.

        BUG FIX: the path previously ignored the ``filename`` parameter and
        always wrote to a literal "(unknown)" file.
        """
        data = {}
        data["label_translator"] = self.label_translator
        data["model"] = self.model
        with open(f"{MODEL_DIR}/{filename}", 'wb') as file:
            pickle.dump(data, file)

    @override
    def load(self, filename: str) -> None:
        """Restore a previously saved pipeline and label translator.

        BUG FIX: the path previously ignored ``filename`` (see save()).
        Note: pickle.load is only safe here because model files are local
        and trusted.
        """
        with open(f"{MODEL_DIR}/{filename}", 'rb') as file:
            data = pickle.load(file)
        self.label_translator = data["label_translator"]
        self.model = data["model"]
|
||||
210
src/old_notebooks/data_processing.ipynb
Normal file
210
src/old_notebooks/data_processing.ipynb
Normal file
File diff suppressed because one or more lines are too long
121
src/old_notebooks/data_processing.ju.py
Normal file
121
src/old_notebooks/data_processing.ju.py
Normal file
@@ -0,0 +1,121 @@
|
||||
# %% [markdown]
|
||||
"""
|
||||
# cleaning
|
||||
big_data.csv.zst is the main file we will be using. Every step in the pipeline adds a new column and overwrites the file. This is repeatable: when any step changes, everything can be run again regardless of the state of the file.
|
||||
"""
|
||||
|
||||
# %%
|
||||
import nltk
|
||||
import re
|
||||
import os
|
||||
import time
|
||||
import pandas as pd
|
||||
|
||||
DATA_DIR = "../data"
|
||||
|
||||
# %%
|
||||
# Download all NLTK data packages (tokenizers, stopwords, stemmers); "all" is
# large but avoids tracking individual package names.
nltk.download("all")
|
||||
|
||||
# %%
|
||||
# Small sample dataset used below to demonstrate the cleaning steps.
news_sample = pd.read_csv(f"{DATA_DIR}/news_sample.csv")
|
||||
|
||||
# %%
|
||||
# One-time migration: if the raw 995,000_rows.csv is still present, re-save it
# as a zstd-compressed csv and delete the original to save disk space.
if (os.path.exists(f"{DATA_DIR}/995,000_rows.csv")):
    big_data = pd.read_csv(f"{DATA_DIR}/995,000_rows.csv", low_memory=False)
    big_data.to_csv(f"{DATA_DIR}/big_data.csv.zst")
    os.remove(f"{DATA_DIR}/995,000_rows.csv")
    big_data = None  # release the ~1M-row frame immediately
|
||||
|
||||
# %%
|
||||
# cleans text and returns a list of tokens.
|
||||
def clean_text(
    text,
    remove_regex_patterns = True,
    remove_stopwords = True,
    remove_special_characters = True,
    stemming = True):
    """Clean a raw document and return a list of tokens.

    Steps (each optional via the flags): lowercase and strip; replace urls /
    emails / dates / numbers with placeholder tags; drop special characters;
    tokenize; remove English stopwords; Snowball-stem every token that is not
    a placeholder tag.
    """
    text = str(text).lower().strip()

    if remove_regex_patterns:
        url_pattern = r'\S+\.\S+'
        email_pattern = r'\w+@\w+\.\w+'
        date_pattern = r'[a-z]+ \d{1,2}[a-z]?, \d{4}' # add more date patterns
        number_pattern = r'\d+'

        # URLs/emails are substituted before numbers so digits inside them are
        # absorbed into <URL>/<EMAIL> rather than becoming <NUMBER> fragments.
        text = re.sub(url_pattern, "<URL>", text)
        text = re.sub(email_pattern, "<EMAIL>", text)
        text = re.sub(date_pattern, "<DATE>", text)
        text = re.sub(number_pattern, "<NUMBER>", text)

    if remove_special_characters:
        # NOTE(review): this is a plain character class keeping \w plus the
        # literal characters " (?:<>+)". It looks like an attempt to protect
        # the <TAG> placeholders, but groups do not work inside [...] —
        # confirm the intended pattern.
        text = re.sub(r'[^\w (?:<\w+>)]', " ", text)

    # Keep <TAG> placeholders as single tokens; otherwise split on word chars.
    tokenizer = nltk.RegexpTokenizer(r'<\w+>|\w+')
    tokens = tokenizer.tokenize(text)

    if remove_stopwords:
        # BUG FIX: removed a duplicated assignment (stopwords = stopwords = ...)
        # and made the lookup a set (O(1) membership instead of scanning a list
        # per token; results are identical).
        stopwords = set(nltk.corpus.stopwords.words('english'))
        tokens = [token for token in tokens if token not in stopwords]

    if stemming:
        stemmer = nltk.SnowballStemmer("english")
        tokens = [stemmer.stem(token) if not re.match(r'<\w+>', token) else token for token in tokens]

    return tokens
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
## Output
|
||||
Now we check what the function does and how the vocabulary changes.
|
||||
"""
|
||||
|
||||
# %%
|
||||
# Generates a vocabulary (set of unique words) from a pandas series.
|
||||
def generate_vocabulary(series):
    """Return the set of unique tokens across a pandas Series of token lists."""
    unique_tokens = set()
    for token_list in series:
        unique_tokens.update(token_list)
    return unique_tokens
|
||||
|
||||
# %%
|
||||
# Show the cleaning pipeline on one sample article, then measure how each
# stage shrinks the vocabulary.
print("original text:\n")
print(news_sample['content'][1])
print("\n" + "-" * 100 + "\n")

print("cleaned tokens:\n")
print(clean_text(news_sample['content'][1]))
print("\n" + "-" * 100 + "\n")

# Series.apply forwards the keyword flags to clean_text for each row.
tokenization_size = len(generate_vocabulary(news_sample['content'].apply(clean_text, remove_stopwords = False, stemming = False)))
stopwords_size = len(generate_vocabulary(news_sample['content'].apply(clean_text, remove_stopwords = True, stemming = False)))
stemming_size = len(generate_vocabulary(news_sample['content'].apply(clean_text, remove_stopwords = True, stemming = True)))

print("Unique words after tokenization:")
print(tokenization_size)
print("\nUnique words after stopword removal:")
print(stopwords_size)
print("\nUnique words after stemming:")
print(stemming_size)
print("\nStemming reduction rate:")
print(f"{round(1 - stemming_size / stopwords_size, 4) * 100}%")
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
## Big Data
|
||||
Now we clean the big dataset and save it to csv.zst file. Pandas can save and load zstd files just fine, and since it's realtime compression it doesn't really take more time while heavily reducing the file size.
|
||||
"""
|
||||
|
||||
# %%
|
||||
# Clean the full dataset in 10k-row chunks so it never has to fit in memory,
# writing to a temp file that replaces the input only once everything succeeded.
start = time.perf_counter()
first = True
for big_data in pd.read_csv(f"{DATA_DIR}/big_data.csv.zst", chunksize=10000):
    big_data['tokens'] = big_data['content'].apply(clean_text)
    if first:
        big_data.to_csv(f"{DATA_DIR}/big_data_new.csv.zst", mode='w')
        first = False
    else:
        # BUG FIX: appended chunks must not repeat the header row, otherwise
        # every 10,000th line of the output is a duplicated header.
        big_data.to_csv(f"{DATA_DIR}/big_data_new.csv.zst", mode='a', header=False)
os.rename(f"{DATA_DIR}/big_data_new.csv.zst", f"{DATA_DIR}/big_data.csv.zst")
print(f"cleaning took {round((time.perf_counter() - start) / 60, 5)} minutes")
|
||||
65
src/old_notebooks/data_processing_ja.ipynb
Normal file
65
src/old_notebooks/data_processing_ja.ipynb
Normal file
@@ -0,0 +1,65 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "95706a2e-9e23-4272-aeaa-4510254f7feb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Cleaning"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1be89b54-76dd-4c2e-bcdd-ff956bf375bf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import nltk\n",
|
||||
"from nltk.corpus import stopwords\n",
|
||||
"from nltk.tokenize import word_tokenize\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b82cf2b2-7cee-4c34-83b9-37c5c4828289",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"1. Tokenize the text"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dc8058fc-0ed9-4daf-918d-d3e82064a3a6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"nltk.download('punkt')\n",
|
||||
"text = ("
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
42
src/setup.py
Normal file
42
src/setup.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from constants import DATASET_DIR, TRAINING_DIR, VALIDATION_DIR, TESTING_DIR, ORIGINAL_DATASET_FILES
|
||||
from clean_data import clean_dataset
|
||||
from helper import csv_to_parquet
|
||||
from split import split_dataset, split_dataset_random
|
||||
import nltk
|
||||
import os
|
||||
import shutil
|
||||
import pandas as pd
|
||||
|
||||
def setup() -> None:
    """Prepare every dataset for use: download NLTK data, convert the raw csv
    datasets to parquet, clean them, and split them into training / validation
    / testing sets. The LIAR tsv files are handled separately because they
    ship pre-split.

    Raises:
        Exception: if any file named in ORIGINAL_DATASET_FILES is missing.
    """
    # make sure nltk can be used later.
    nltk.download("all")

    for dataset_file in ORIGINAL_DATASET_FILES:
        if not os.path.exists(f"{DATASET_DIR}/{dataset_file}"):
            raise Exception(f"Please add {dataset_file} to {DATASET_DIR}")

        name = os.path.splitext(dataset_file)[0]
        # Conversion is the slow step, so skip it when the parquet already exists.
        if not os.path.exists(f"{DATASET_DIR}/{name}.parquet"):
            csv_to_parquet(f"{DATASET_DIR}/{dataset_file}", f"{DATASET_DIR}/{name}.parquet")
            print(f"finished converting {dataset_file} to parquet")
        clean_dataset(f"{name}.parquet")
        print(f"cleaned {name}.parquet")
        split_dataset_random(f"{name}.parquet")
        # BUG FIX: "traning" -> "training" in the progress message.
        print(f"split {name}.parquet into training, validation and test")

    # LIAR ships pre-split: each tsv maps straight onto one of our split dirs.
    for dataset, destination in [("train.tsv", TRAINING_DIR), ("valid.tsv", VALIDATION_DIR), ("test.tsv", TESTING_DIR)]:
        if os.path.exists(f"{DATASET_DIR}/{dataset}"):
            df = pd.read_csv(f"{DATASET_DIR}/{dataset}", sep='\t', header=None)
            # The raw tsv has no header row; column 1 holds the label and
            # column 2 the statement text.
            df = df.rename(columns={
                1: "type",
                2: "content"
            })
            name = os.path.splitext(dataset)[0]
            df.to_parquet(f"{DATASET_DIR}/{name}.parquet")
            clean_dataset(f"{name}.parquet")

            shutil.move(f"{DATASET_DIR}/{name}.parquet", f"{destination}/LIAR.parquet")
|
||||
# Allow running this module directly as the one-shot setup script.
if __name__ == "__main__":
    setup()
|
||||
91
src/split.py
Normal file
91
src/split.py
Normal file
@@ -0,0 +1,91 @@
|
||||
from constants import CHUNK_SIZE, DATASET_DIR, TRAINING_DIR, VALIDATION_DIR, TESTING_DIR
|
||||
import pyarrow.parquet as pq
|
||||
import pyarrow as pa
|
||||
import os
|
||||
from helper import get_time_boundaries
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
def split_dataset_random(filename: str) -> None:
    """Randomly split a parquet dataset into training (~75%), validation
    (~10%) and testing (~15%) files, streaming it in CHUNK_SIZE batches so the
    whole file never has to be in memory at once.

    BUG FIXES:
    - the input/output paths previously hard-coded a literal "(unknown)"
      instead of using ``filename``;
    - the writers are now closed only if they were actually opened (a dataset
      whose batches all missed one split would previously crash on
      ``None.close()``).
    """
    pq_file = pq.ParquetFile(f"{DATASET_DIR}/{filename}")

    training_writer = None
    validation_writer = None
    testing_writer = None

    for batch in pq_file.iter_batches(batch_size=CHUNK_SIZE):
        table = pa.Table.from_batches([batch])

        # One uniform draw per row decides which split the row lands in.
        rng = np.random.rand(table.num_rows)

        training = table.filter(rng < 0.75)
        validation = table.filter((rng >= 0.75) & (rng < 0.85))
        testing = table.filter(rng >= 0.85)

        # Writers are created lazily so each schema comes from real data.
        if not training_writer and training.num_rows:
            training_writer = pq.ParquetWriter(f"{TRAINING_DIR}/{filename}", training.schema)
        if not validation_writer and validation.num_rows:
            validation_writer = pq.ParquetWriter(f"{VALIDATION_DIR}/{filename}", validation.schema)
        if not testing_writer and testing.num_rows:
            testing_writer = pq.ParquetWriter(f"{TESTING_DIR}/{filename}", testing.schema)

        if training.num_rows:
            training_writer.write(training)
        if validation.num_rows:
            validation_writer.write(validation)
        if testing.num_rows:
            testing_writer.write(testing)

    for writer in (training_writer, validation_writer, testing_writer):
        if writer is not None:
            writer.close()
|
||||
|
||||
def split_dataset(filename: str) -> None:
    """Chronologically split a parquet dataset into training (oldest 80%),
    validation (next 10%) and testing (newest 10%) by scrape time.

    BUG FIXES:
    - the input/output paths previously hard-coded a literal "(unknown)"
      instead of using ``filename``;
    - removed a large unreachable chunk-streaming implementation that sat
      after the ``return`` and referenced undefined names (``filepath``,
      ``train_cut``, ``val_cut``) — it could never run and would have raised
      NameError if it did.
    """
    df = pd.read_parquet(f"{DATASET_DIR}/{filename}")
    n = len(df)
    # Coerce unparsable timestamps to NaT instead of raising, then sort
    # oldest-first so the positional slices below are a chronological split.
    df['scraped_at'] = pd.to_datetime(df['scraped_at'], format='ISO8601', errors='coerce', utc=True)
    df = df.sort_values(by='scraped_at')

    df.iloc[:int(n * 0.8)].to_parquet(f"{TRAINING_DIR}/{filename}")
    df.iloc[int(n * 0.8):int(n * 0.9)].to_parquet(f"{VALIDATION_DIR}/{filename}")
    df.iloc[int(n * 0.9):].to_parquet(f"{TESTING_DIR}/{filename}")
|
||||
Reference in New Issue
Block a user