backup since codeberg is down

This commit is contained in:
2026-03-27 13:35:43 +01:00
commit 8a61a214c6
45 changed files with 5038 additions and 0 deletions

35
.gitignore vendored Normal file
View File

@@ -0,0 +1,35 @@
# Document
*.pdf
*.bak
*.tex.backup
*.tex~
*.synctex.gz
*.out
.bak
build/
_minted/
obj/
bin/
# Python
__pycache__/
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
.ipynb_checkpoints/
# data bs
data/**
!data/
!data/**/
!data/**/.gitkeep
# general bs
.DS_Store
flake.lock
.vscode/

6
README.md Normal file
View File

@@ -0,0 +1,6 @@
- download the necessary dataset files to data/datasets as csv (not zip). Move all tsv files from the LIAR zip file directly into the datasets folder.
- run setup.py to set up nltk, and clean and split the datasets. It takes a long time; please wait.
- run main.py from the src directory to test the models. The function requires the model type, model file, and dataset to be passed as parameters.
Here is an example: python main.py --model_type logistic --model_file logistic.model --data_file 995,000_rows.parquet
The model files can be found in the models directory (not the one in src), the data files can be found in data/testing (pass LIAR.parquet to test on LIAR dataset).
The model types and more information including how to train models can be found with python main.py --help.

View File

@@ -0,0 +1,457 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "3b55d166",
"metadata": {},
"source": [
"# DO NOT RUN; DATA WILL BE LOST"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9c2d25e9",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd \n",
"import os \n",
"import sys\n",
"sys.path.append(os.path.join(os.getcwd(), '../src'))\n",
"from constants import TRAINING_DIR, TESTING_DIR, VALIDATION_DIR\n",
"pd.set_option('display.max_columns', None)\n"
]
},
{
"cell_type": "markdown",
"id": "cd67fc64",
"metadata": {},
"source": [
"# Time Split "
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "a917b0fa",
"metadata": {},
"outputs": [],
"source": [
"test_ty = pd.read_parquet(f\"{TESTING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
"train_ty = pd.read_parquet(f\"{TRAINING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
"val_ty = pd.read_parquet(f\"{VALIDATION_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "0098d6e4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rows in train(818843, 1),\n",
" rows in test (99499, 1), \n",
" rows in validation(76645, 1)\n"
]
}
],
"source": [
"print(f'rows in train{train_ty.shape },\\n rows in test {test_ty.shape}, \\n rows in validation{val_ty.shape}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5985a4f3",
"metadata": {},
"outputs": [],
"source": [
"timeline = pd.concat([\n",
" train_ty.value_counts().rename('train'),\n",
" test_ty.value_counts().rename('test'),\n",
" val_ty.value_counts().rename('val'),\n",
"], axis=1).fillna(0).astype(int)\n",
"\n",
"timeline.index.name = 'type'"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "b0673e19",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>train</th>\n",
" <th>test</th>\n",
" <th>val</th>\n",
" </tr>\n",
" <tr>\n",
" <th>type</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>political</th>\n",
" <td>194518</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bias</th>\n",
" <td>133232</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fake</th>\n",
" <td>104883</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>conspiracy</th>\n",
" <td>97314</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>rumor</th>\n",
" <td>56445</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unknown</th>\n",
" <td>43534</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>reliable</th>\n",
" <td>42419</td>\n",
" <td>99499</td>\n",
" <td>76645</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unreliable</th>\n",
" <td>35332</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>clickbait</th>\n",
" <td>27412</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>junksci</th>\n",
" <td>14040</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>satire</th>\n",
" <td>13160</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>hate</th>\n",
" <td>8779</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" train test val\n",
"type \n",
"political 194518 0 0\n",
"bias 133232 0 0\n",
"fake 104883 0 0\n",
"conspiracy 97314 0 0\n",
"rumor 56445 0 0\n",
"unknown 43534 0 0\n",
"reliable 42419 99499 76645\n",
"unreliable 35332 0 0\n",
"clickbait 27412 0 0\n",
"junksci 14040 0 0\n",
"satire 13160 0 0\n",
"hate 8779 0 0"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"timeline"
]
},
{
"cell_type": "markdown",
"id": "6bdc7d84",
"metadata": {},
"source": [
"# Random Split "
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "cd5ca57b",
"metadata": {},
"outputs": [],
"source": [
"test_ty_R = pd.read_parquet(f\"{TESTING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
"train_ty_R = pd.read_parquet(f\"{TRAINING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
"val_ty_R = pd.read_parquet(f\"{VALIDATION_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c793a37c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rows in train(745724, 1),\n",
" rows in test (149766, 1), \n",
" rows in validation(99510, 1)\n"
]
}
],
"source": [
"print(f'rows in train{train_ty_R.shape },\\n rows in test {test_ty_R.shape}, \\n rows in validation{val_ty_R.shape}')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "583304ff",
"metadata": {},
"outputs": [],
"source": [
"timeline_R = pd.concat([\n",
" train_ty_R.value_counts().rename('train'),\n",
" test_ty_R.value_counts().rename('test'),\n",
" val_ty_R.value_counts().rename('val'),\n",
"], axis=1).fillna(0).astype(int)\n",
"\n",
"timeline_R.index.name = 'type'"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "d8255b60",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>train</th>\n",
" <th>test</th>\n",
" <th>val</th>\n",
" </tr>\n",
" <tr>\n",
" <th>type</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>reliable</th>\n",
" <td>163802</td>\n",
" <td>33010</td>\n",
" <td>21752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>political</th>\n",
" <td>145779</td>\n",
" <td>29241</td>\n",
" <td>19498</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bias</th>\n",
" <td>99797</td>\n",
" <td>20079</td>\n",
" <td>13356</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fake</th>\n",
" <td>78736</td>\n",
" <td>15602</td>\n",
" <td>10545</td>\n",
" </tr>\n",
" <tr>\n",
" <th>conspiracy</th>\n",
" <td>72837</td>\n",
" <td>14676</td>\n",
" <td>9801</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unknown</th>\n",
" <td>68468</td>\n",
" <td>13754</td>\n",
" <td>9098</td>\n",
" </tr>\n",
" <tr>\n",
" <th>rumor</th>\n",
" <td>42254</td>\n",
" <td>8553</td>\n",
" <td>5638</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unreliable</th>\n",
" <td>26489</td>\n",
" <td>5346</td>\n",
" <td>3497</td>\n",
" </tr>\n",
" <tr>\n",
" <th>clickbait</th>\n",
" <td>20552</td>\n",
" <td>4161</td>\n",
" <td>2699</td>\n",
" </tr>\n",
" <tr>\n",
" <th>junksci</th>\n",
" <td>10516</td>\n",
" <td>2066</td>\n",
" <td>1458</td>\n",
" </tr>\n",
" <tr>\n",
" <th>satire</th>\n",
" <td>9852</td>\n",
" <td>1971</td>\n",
" <td>1337</td>\n",
" </tr>\n",
" <tr>\n",
" <th>hate</th>\n",
" <td>6641</td>\n",
" <td>1307</td>\n",
" <td>831</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2018-02-10 13:43:39.521661</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" train test val\n",
"type \n",
"reliable 163802 33010 21752\n",
"political 145779 29241 19498\n",
"bias 99797 20079 13356\n",
"fake 78736 15602 10545\n",
"conspiracy 72837 14676 9801\n",
"unknown 68468 13754 9098\n",
"rumor 42254 8553 5638\n",
"unreliable 26489 5346 3497\n",
"clickbait 20552 4161 2699\n",
"junksci 10516 2066 1458\n",
"satire 9852 1971 1337\n",
"hate 6641 1307 831\n",
"2018-02-10 13:43:39.521661 1 0 0"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"timeline_R"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "355d343a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "main_asg",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

399
analysis/analysis2.ipynb Normal file

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,237 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "9c2d25e9",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd \n",
"import os \n",
"import sys\n",
"sys.path.append(os.path.join(os.getcwd(), '../src'))\n",
"from constants import TRAINING_DIR, TESTING_DIR, VALIDATION_DIR\n",
"pd.set_option('display.max_columns', None)\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "a917b0fa",
"metadata": {},
"outputs": [],
"source": [
"test_ty = pd.read_parquet(f\"{TESTING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
"train_ty = pd.read_parquet(f\"{TRAINING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
"val_ty = pd.read_parquet(f\"{VALIDATION_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "0098d6e4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rows in train(818843, 1),\n",
" rows in test (99499, 1), \n",
" rows in validation(76645, 1)\n"
]
}
],
"source": [
"print(f'rows in train{train_ty.shape },\\n rows in test {test_ty.shape}, \\n rows in validation{val_ty.shape}')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "5985a4f3",
"metadata": {},
"outputs": [],
"source": [
"timeline = pd.concat([\n",
" b.value_counts().rename('train'),\n",
" a.value_counts().rename('test'),\n",
" c.value_counts().rename('val'),\n",
"], axis=1).fillna(0).astype(int)\n",
"\n",
"timeline.index.name = 'type'"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "b0673e19",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>train</th>\n",
" <th>test</th>\n",
" <th>val</th>\n",
" </tr>\n",
" <tr>\n",
" <th>type</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>political</th>\n",
" <td>194518</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bias</th>\n",
" <td>133232</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fake</th>\n",
" <td>104883</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>conspiracy</th>\n",
" <td>97314</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>rumor</th>\n",
" <td>56445</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unknown</th>\n",
" <td>43534</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>reliable</th>\n",
" <td>42419</td>\n",
" <td>99499</td>\n",
" <td>76645</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unreliable</th>\n",
" <td>35332</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>clickbait</th>\n",
" <td>27412</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>junksci</th>\n",
" <td>14040</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>satire</th>\n",
" <td>13160</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>hate</th>\n",
" <td>8779</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" train test val\n",
"type \n",
"political 194518 0 0\n",
"bias 133232 0 0\n",
"fake 104883 0 0\n",
"conspiracy 97314 0 0\n",
"rumor 56445 0 0\n",
"unknown 43534 0 0\n",
"reliable 42419 99499 76645\n",
"unreliable 35332 0 0\n",
"clickbait 27412 0 0\n",
"junksci 14040 0 0\n",
"satire 13160 0 0\n",
"hate 8779 0 0"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"timeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2bcfc84",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "main_asg",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0
data/datasets/.gitkeep Normal file
View File

0
data/temp/.gitkeep Normal file
View File

0
data/testing/.gitkeep Normal file
View File

0
data/training/.gitkeep Normal file
View File

0
data/validation/.gitkeep Normal file
View File

34
flake.nix Normal file
View File

@@ -0,0 +1,34 @@
# Dev shell for the NixOS Jupyter notebook setup: provides the
# requirements.txt packages plus Jupynium (from a pinned nixpkgs fork),
# and drops interactive shells into a tmux session running fish.
{
  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
    pyproject-nix.url = "github:pyproject-nix/pyproject.nix";
    bozo_nixpkgs.url = "github:DuarteSJ/nixpkgs/4e926b09ba06301b08d0f12afd0640c079bdc4dc";
  };
  outputs =
    { nixpkgs, pyproject-nix, bozo_nixpkgs, ... }:
    let
      project = pyproject-nix.lib.project.loadRequirementsTxt { projectRoot = ./.; };
      pkgs = nixpkgs.legacyPackages.x86_64-linux;
      bozo_pkgs = bozo_nixpkgs.legacyPackages.x86_64-linux;
      python = pkgs.python3;
      # requirements.txt packages plus notebook tooling on top.
      pythonEnv = pkgs.python3.withPackages (pkgs:
        let base = project.renderers.withPackages { inherit python; } pkgs;
        in base ++ (with pkgs; [ notebook nbclassic jupyter-console ipython]));
      # Jupynium is only packaged in the pinned fork declared in inputs.
      jupyniumEnv = bozo_pkgs.python3.withPackages (python-pkgs: with python-pkgs; [ jupynium ]);
    in
    {
      devShells.x86_64-linux.default = pkgs.mkShell {
        packages = [ pythonEnv jupyniumEnv ];
        shellHook = ''
          # Bug fix: `which fish` must be command-substituted, not stored as a literal string.
          export SHELL="$(which fish)"
          if [[ $- == *i* ]] && [ -z "$TMUX" ]; then
            tmux new-session -A -s GDS-fake-news
          fi
        '';
      };
    };
}

BIN
models/LIAR_baseline.model Normal file

Binary file not shown.

BIN
models/baseline.model Normal file

Binary file not shown.

Binary file not shown.

BIN
models/logistic.model Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
models/old/GB10K.model Normal file

Binary file not shown.

BIN
models/old/GB1K.model Normal file

Binary file not shown.

BIN
models/old/GB2K.model Normal file

Binary file not shown.

BIN
models/old/GB4K.model Normal file

Binary file not shown.

Binary file not shown.

BIN
models/svm.model Normal file

Binary file not shown.

4
pyrightconfig.json Normal file
View File

@@ -0,0 +1,4 @@
{
"typeCheckingMode": "strict",
"reportMissingTypeStubs": false
}

117
requirements.txt Normal file
View File

@@ -0,0 +1,117 @@
anyio==4.12.1
argon2-cffi==25.1.0
argon2-cffi-bindings==25.1.0
arrow==1.4.0
asttokens==3.0.1
async-lru==2.1.0
attrs==25.4.0
babel==2.18.0
beautifulsoup4==4.14.3
bleach==6.3.0
certifi==2026.1.4
cffi==2.0.0
charset-normalizer==3.4.4
click==8.3.1
comm==0.2.3
contourpy==1.3.3
cycler==0.12.1
debugpy==1.8.20
decorator==5.2.1
defusedxml==0.7.1
executing==2.2.1
fastjsonschema==2.21.2
fonttools==4.61.1
fqdn==1.5.1
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
idna==3.11
ipykernel==7.2.0
ipython==9.10.0
ipython_pygments_lexers==1.1.1
ipywidgets==8.1.8
isoduration==20.11.0
jedi==0.19.2
Jinja2==3.1.6
joblib==1.5.3
json5==0.13.0
jsonpointer==3.0.0
jsonschema==4.26.0
jsonschema-specifications==2025.9.1
jupyter==1.1.1
jupyter-console==6.6.3
jupyter-events==0.12.0
jupyter-lsp==2.3.0
jupyter_client==8.8.0
jupyter_core==5.9.1
jupyter_server==2.17.0
jupyter_server_terminals==0.5.4
jupyterlab==4.5.4
jupyterlab_pygments==0.3.0
jupyterlab_server==2.28.0
jupyterlab_widgets==3.0.16
kiwisolver==1.4.9
lark==1.3.1
MarkupSafe==3.0.3
matplotlib==3.10.8
matplotlib-inline==0.2.1
mistune==3.2.0
nbclient==0.10.4
nbconvert==7.17.0
nbformat==5.10.4
nest-asyncio==1.6.0
nltk==3.9.2
notebook==7.5.3
notebook_shim==0.2.4
numpy==2.4.2
packaging==26.0
pandas==3.0.1
pandas-stubs==3.0.0.260204
pandocfilters==1.5.1
parso==0.8.6
pexpect==4.9.0
pillow==12.1.1
platformdirs==4.9.2
prometheus_client==0.24.1
prompt_toolkit==3.0.52
psutil==7.2.2
ptyprocess==0.7.0
pure_eval==0.2.3
pyarrow==23.0.1
pycparser==3.0
Pygments==2.19.2
pyparsing==3.3.2
python-dateutil==2.9.0.post0
python-json-logger==4.0.0
PyYAML==6.0.3
pyzmq==27.1.0
referencing==0.37.0
regex==2026.1.15
requests==2.32.5
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rfc3987-syntax==1.1.0
rpds-py==0.30.0
scikit-learn==1.8.0
scipy==1.17.1
Send2Trash==2.1.0
setuptools==82.0.0
six==1.17.0
soupsieve==2.8.3
stack-data==0.6.3
terminado==0.18.1
threadpoolctl==3.6.0
tinycss2==1.4.0
tornado==6.5.4
tqdm==4.67.3
traitlets==5.14.3
typing_extensions==4.15.0
tzdata==2025.3
uri-template==1.3.0
urllib3==2.6.3
wcwidth==0.6.0
webcolors==25.10.0
webencodings==0.5.1
websocket-client==1.9.0
widgetsnbextension==4.0.15
zstandard==0.25.0

97
src/clean_data.py Normal file
View File

@@ -0,0 +1,97 @@
from constants import DATASET_DIR, TEMP_DIR
from helper import dataset_iterator
import pyarrow as pa
import pyarrow.parquet as pq
import nltk
import re
import shutil
# cleans text and returns a list of tokens.
def clean_text(
    text: str,
    remove_regex_patterns: bool = True,
    remove_stopwords: bool = True,
    remove_special_characters: bool = True,
    stemming: bool = True
) -> list[str]:
    """Lowercase and tokenise raw article text.

    Each cleaning stage can be toggled off independently (used by
    compute_vocab_reduction to measure the effect of each stage):
    - remove_regex_patterns: replace URLs/emails/dates/numbers with
      placeholder tokens like "<URL>" before tokenisation.
    - remove_special_characters: strip punctuation (placeholders survive).
    - remove_stopwords: drop English stopwords.
    - stemming: Snowball-stem every token except placeholders.

    Returns the list of cleaned tokens.
    """
    text = str(text).lower().strip()
    if remove_regex_patterns:
        url_pattern = r'https?://\S+|www\.\S+'
        email_pattern = r'[\w.-]+@[\w]+\.[\w]+'
        date_pattern = r'([a-z]+ \d{1,2}[a-z]?, \d{4}|\d{2,4}[-/]\d{2,4}[-/]\d{2,4})' # add more date patterns
        number_pattern = r'\d+'
        text = re.sub(url_pattern, "<URL>", text)
        text = re.sub(email_pattern, "<EMAIL>", text)
        text = re.sub(date_pattern, "<DATE>", text)
        text = re.sub(number_pattern, "<NUMBER>", text)
    if remove_special_characters:
        text = re.sub(r'[^\w (?:<\w+>)]', " ", text)
    # Tokeniser keeps "<WORD>" placeholders as single tokens.
    tokenizer = nltk.RegexpTokenizer(r'<\w+>|\w+')
    tokens = tokenizer.tokenize(text) # type: ignore
    if remove_stopwords:
        # Bug fix: was a duplicated assignment (`stopwords = stopwords = ...`);
        # a set also gives O(1) membership tests instead of O(n) list scans.
        stopwords = set(nltk.corpus.stopwords.words('english'))
        tokens = [token for token in tokens if token not in stopwords] # type: ignore
    if stemming:
        stemmer = nltk.SnowballStemmer("english")
        # Placeholder tokens such as "<URL>" are kept verbatim; everything else is stemmed.
        tokens = [stemmer.stem(token) if not re.match(r'<\w+>', token) else token for token in tokens] # type: ignore
    return tokens # type: ignore
def clean_dataset(filename: str) -> None:
    """Add a `tokens` column (via clean_text) to `{DATASET_DIR}/{filename}`.

    The dataset is processed chunk by chunk and written to a temp parquet
    file, which then replaces the original — so a crash mid-run never leaves
    a half-written dataset in DATASET_DIR.
    """
    # Bug fix: paths previously did not use the `filename` parameter at all.
    output_path = f"{TEMP_DIR}/{filename}"
    writer = None
    for chunk in dataset_iterator(f"{DATASET_DIR}/{filename}"):
        chunk['tokens'] = chunk['content'].apply(clean_text)
        table = pa.Table.from_pandas(chunk)
        if writer is None:
            # Schema is taken from the first chunk; later chunks must match it.
            writer = pq.ParquetWriter(output_path, table.schema)
        writer.write_table(table)
    if writer is not None:
        # Bug fix: guard against an empty dataset — `writer` would be None and
        # close()/move() would fail.
        writer.close()
        shutil.move(output_path, f"{DATASET_DIR}/{filename}")
def compute_vocab_reduction(filename: str) -> dict[str, float | int]:
    """Measure how much stopword removal and stemming shrink the vocabulary
    of `{DATASET_DIR}/{filename}`.

    Cleans every article three times (neither stage, stopwords only,
    stopwords + stemming) and compares the resulting vocabulary sizes.
    Stemming is measured relative to the stopword-filtered vocabulary.

    Returns a dict of vocabulary sizes before/after each stage plus the
    reduction rates (0.0 when the source vocabulary is empty).
    """
    # Bug fix: the path previously ignored the `filename` parameter.
    dataset_path = f"{DATASET_DIR}/{filename}"
    vocab_before_stopwords: set[str] = set()
    vocab_after_stopwords: set[str] = set()
    vocab_after_stemming: set[str] = set()
    for chunk in dataset_iterator(dataset_path):
        for text in chunk["content"]:
            vocab_before_stopwords.update(clean_text(text, remove_stopwords=False, stemming=False))
            vocab_after_stopwords.update(clean_text(text, remove_stopwords=True, stemming=False))
            vocab_after_stemming.update(clean_text(text, remove_stopwords=True, stemming=True))
    before_stop_size = len(vocab_before_stopwords)
    after_stop_size = len(vocab_after_stopwords)
    # Stemming starts from the stopword-filtered vocabulary.
    before_stem_size = after_stop_size
    after_stem_size = len(vocab_after_stemming)
    stopwords_reduction_rate = (
        (before_stop_size - after_stop_size) / before_stop_size if before_stop_size else 0.0
    )
    stemming_reduction_rate = (
        (before_stem_size - after_stem_size) / before_stem_size if before_stem_size else 0.0
    )
    return {
        "vocab_size_before_stopwords": before_stop_size,
        "vocab_size_after_stopwords": after_stop_size,
        "stopwords_reduction_rate": stopwords_reduction_rate,
        "vocab_size_before_stemming": before_stem_size,
        "vocab_size_after_stemming": after_stem_size,
        "stemming_reduction_rate": stemming_reduction_rate,
    }

14
src/constants.py Normal file
View File

@@ -0,0 +1,14 @@
import os
# All paths are resolved relative to src/ (scripts are run from inside src/,
# per the README), so ../data and ../models point at the repo-level folders.
DATA_DIR = os.path.abspath("../data")
MODEL_DIR = os.path.abspath("../models")
# Pipeline stage directories under data/.
DATASET_DIR = f"{DATA_DIR}/datasets"
TRAINING_DIR = f"{DATA_DIR}/training"
VALIDATION_DIR = f"{DATA_DIR}/validation"
TESTING_DIR = f"{DATA_DIR}/testing"
TEMP_DIR = f"{DATA_DIR}/temp"
# Raw CSV inputs and the parquet files they are converted to.
ORIGINAL_DATASET_FILES = ["news_sample.csv", "995,000_rows.csv"]
DATASET_FILES = ["news_sample.parquet", "995,000_rows.parquet"]
CHUNK_SIZE = 10000 # how many rows to work on at time, instead of loading the entire dataset into memory.
MAX_ROWS = -1 # only work with MAX_ROWS rows so testing things out isnt crazy slow. Set to -1 for infinite.

60
src/helper.py Normal file
View File

@@ -0,0 +1,60 @@
from labels import Label
from constants import CHUNK_SIZE, MAX_ROWS
from typing import Iterator, cast
import pyarrow.parquet as pq
import pandas as pd
def default_labelling(article_type: str) -> Label:
    """Default mapping: reliable/political/clickbait articles are real, everything else fake."""
    real_types = ("reliable", "political", "clickbait")
    return Label.REAL if article_type in real_types else Label.FAKE
def only_fake_labelling(article_type: str) -> Label:
    """Strict mapping: only articles tagged exactly "fake" are fake; all others are real."""
    return Label.FAKE if article_type == "fake" else Label.REAL
def not_reliable_labelling(article_type: str) -> Label:
    """Strict mapping: only articles tagged exactly "reliable" are real; all others are fake."""
    return Label.REAL if article_type == "reliable" else Label.FAKE
def LIAR_labelling(article_type:str) -> Label:
if article_type in ["true", "half-true", "barely-true", "mostly-true"]:
return Label.REAL
return Label.FAKE
# Deprecated, don't use, just use pd.read_parquet instead
def dataset_iterator(dataset_file:str, columns:list[str] | None = None) -> Iterator[pd.DataFrame]:
    """Yield a parquet file as CHUNK_SIZE-row pandas DataFrames.

    Stops (without yielding the crossing batch) once more than MAX_ROWS rows
    have been read, when MAX_ROWS is positive; a non-positive MAX_ROWS means
    no limit. `columns` optionally restricts which columns are loaded.
    """
    parquet_file = pq.ParquetFile(dataset_file)
    total_rows = 0
    for batch in parquet_file.iter_batches(batch_size=CHUNK_SIZE, columns=columns): # type: ignore
        total_rows += len(batch) # type: ignore
        if 0 < MAX_ROWS < total_rows:
            return
        # cast silences type-checker warnings about pyarrow's untyped API.
        yield cast(pd.DataFrame, batch.to_pandas()) # type: ignore
def csv_to_parquet(input_path: str, output_path: str) -> None:
    """Convert a CSV dataset file to parquet in a single read.

    low_memory=False makes pandas read each column in one pass so chunked
    type inference doesn't produce mixed dtypes — some datasets have stray
    strings in otherwise-numeric columns such as `id`.
    """
    # A chunked ParquetWriter variant (with csv.field_size_limit and
    # engine="python") was tried previously; the single-read version is
    # simpler and sufficient for these dataset sizes.
    pd.read_csv(input_path, low_memory=False).to_parquet(output_path)
def get_time_boundaries(filename: str): #type: ignore
    """Return (train_cut, val_cut): the 80th- and 90th-percentile
    `scraped_at` timestamps of the dataset, used for a chronological
    80/10/10 train/validation/test split.

    Unparseable timestamps become NaT (errors='coerce') and are ignored by
    quantile.
    """
    # Load only the timestamp column to keep memory use low.
    dates = pq.read_table(filename, columns=['scraped_at']).to_pandas() #type: ignore
    dates['scraped_at'] = pd.to_datetime(dates['scraped_at'], format='ISO8601', errors='coerce', utc=True) #type: ignore
    ordered = dates['scraped_at'].sort_values() #type: ignore
    train_boundary = ordered.quantile(0.80) #type: ignore
    val_boundary = ordered.quantile(0.90) #type: ignore
    return train_boundary, val_boundary #type: ignore

5
src/labels.py Normal file
View File

@@ -0,0 +1,5 @@
from enum import Enum
class Label(Enum):
    """Binary ground-truth label for an article, produced by the labelling
    translator functions in helper.py."""
    REAL = 0  # article considered trustworthy
    FAKE = 1  # article considered fake/misleading

64
src/main.py Executable file
View File

@@ -0,0 +1,64 @@
#!/usr/bin/env python
import argparse
from models.svm import SVM_model
from models.gradient_boosting import Gradient_boosting_model
from models.logistic_regression import Logistic_model
from models.baseline import Baseline_model
from helper import default_labelling, not_reliable_labelling, only_fake_labelling, LIAR_labelling
def main() -> None:
    """CLI entry point: build the requested model, then train or test it.

    Exits via argparse (SystemExit) on missing/invalid arguments.
    """
    parser = argparse.ArgumentParser(
        prog="Fakenews detector",
        description="Train and test models",
        usage="The following is an example of training a logistic regression model on news_sample.parquet:\n"+
        "python main.py --model_type logistic --model_file logistic_news_sample.model --data_file news_sample.parquet --train",
    )
    parser.add_argument("--train", action="store_true", help="Whether model should be trained, if not set it will be tested instead")
    # Typo fix in help text ("Wheter" -> "Whether").
    parser.add_argument("--validate", action="store_true", help="Whether to use validation set when testing/validating")
    parser.add_argument("--model_type", "-t", required=True, choices=["baseline", "logistic", "svm", "gradient_boosting"], help="The type of model: baseline, logistic, ...")
    # Dropped the dead `default=""` — defaults are ignored on required arguments.
    parser.add_argument("--model_file", "-f", required=True, help="The model file to save to when training, or load from when testing")
    parser.add_argument("--data_file", "-d", required=True, help="The datafile used when training or testing")
    parser.add_argument("--label_translator", "-l", required=False, default = "", help="The translator function used by the model, such as \"not_reliable\", that only considers 'reliable' tagged news Real, ignored if not using --train.")
    parser.add_argument("--hyperparameters", "-p", required=False, nargs="+", default ="", help="The hyperparameters used when training the model, written like c=1")
    args = parser.parse_args()

    # Translator precedence: later matches override earlier ones, and the
    # LIAR dataset always forces its own translator.
    label_translator = default_labelling
    if "not_reliable" in args.label_translator.lower():
        label_translator = not_reliable_labelling
    if "only_fake" in args.label_translator.lower():
        label_translator = only_fake_labelling
    is_liar = "liar" in args.data_file.lower()
    if is_liar:
        label_translator = LIAR_labelling
        # Normalise any LIAR spelling to the canonical file name.
        args.data_file = "LIAR.parquet"

    # Dispatch table instead of an if/elif chain; unknown types fall back to
    # the baseline, matching the original behaviour (choices= already rejects
    # anything outside the four known types).
    model_classes = {
        "logistic": Logistic_model,
        "svm": SVM_model,
        "gradient_boosting": Gradient_boosting_model,
    }
    model = model_classes.get(args.model_type, Baseline_model)(label_translator=label_translator)

    if args.train:
        hyperparameters:dict[str, float] = {}
        for parameter in args.hyperparameters:
            # Each entry looks like "c=1"; values are parsed as floats.
            key, value = parameter.split("=")
            hyperparameters[key] = float(value)
        model.train(args.data_file, hyperparameters)
        model.save(args.model_file)
    else:
        model.load(args.model_file)
        if is_liar:
            # LIAR has no separate validation split; always test the full set.
            model.test("LIAR.parquet", validate=False)
        else:
            model.test(args.data_file, args.validate)
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}

846
src/models/Untitled2.ipynb Normal file
View File

@@ -0,0 +1,846 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "3ed30f2e",
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'torch'",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorch\u001b[39;00m \n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorch\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mnn\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnn\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorch\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mnn\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mfunctional\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mF\u001b[39;00m \n",
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'torch'"
]
}
],
"source": [
"import torch \n",
"import torch.nn as nn\n",
"import torch.nn.functional as F \n",
"import pandas as pd\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from collections import Counter\n",
"import os\n",
"import sys\n",
"sys.path.append(os.path.join(os.getcwd(), '../'))\n",
"from helper import default_labelling\n",
"from sklearn.metrics import f1_score\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "42edceb8",
"metadata": {},
"outputs": [],
"source": [
"label_map = {\n",
" 'Label.FAKE': 0,\n",
" 'Label.REAL': 1}"
]
},
{
"cell_type": "markdown",
"id": "0aa1a427",
"metadata": {},
"source": [
"# Pipelining process"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c7730d65",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_parquet(\"../../data/training/995,000_rows.parquet\", columns=['tokens','type'])\n",
"\n",
"\n",
"df['label'] = df['type'].apply(default_labelling).astype(str)\n",
"df['label'] = df['label'].map(label_map).astype(int)\n",
"df = df.drop(columns=['type'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c31caf06",
"metadata": {},
"outputs": [],
"source": [
"df_test = pd.read_parquet(\"../../data/testing/995,000_rows.parquet\", columns=['tokens','type'])\n",
"\n",
"df_test['label'] = df_test['type'].apply(default_labelling).astype(str)\n",
"df_test['label'] = df_test['label'].map(label_map).astype(int)\n",
"df_test = df_test.drop(columns=['type'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5c0c93ab",
"metadata": {},
"outputs": [],
"source": [
"df_val = pd.read_parquet(\"../../data/validation/995,000_rows.parquet\", columns=['tokens','type'])\n",
"df_val['label'] = df_val['type'].apply(default_labelling).astype(str)\n",
"df_val['label'] = df_val['label'].map(label_map).astype(int)\n",
"df_val = df_val.drop(columns=['type'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "19188ef7",
"metadata": {},
"outputs": [],
"source": [
"# print(\"Loading Parquet file...\")\n",
"\n",
"# # Check the total number of rows (articles)\n",
"# print(f\"Total rows in the raw Parquet file: {len(df)}\")\n",
"\n",
"# # Look at the first few rows to make sure the data looks correct\n",
"# print(\"\\n--- First 3 Rows ---\")\n",
"# print(df.head(3))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa455147",
"metadata": {},
"outputs": [],
"source": [
"# count how many tokens we have in the corpus \n",
"word_counts = Counter()\n",
"for x in df['tokens']:\n",
" word_counts.update(x)\n",
" \n",
"# Keep the top 50,000 words. \n",
"# Index 0 is for <PAD> (padding), Index 1 is for <UNK> (unknown words)\n",
"vocab = {\"<PAD>\": 0, \"<UNK>\": 1}\n",
"for idx, (word, count) in enumerate(word_counts.most_common(50000), start=2):\n",
" vocab[word] = idx\n",
"\n",
"print(f\"Vocabulary built with {len(vocab)} words.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9ba0021",
"metadata": {},
"outputs": [],
"source": [
"# Create a Custom PyTorch Datase\n",
"\n",
"# a wrapper for the data that PyTorch knows how to talk to.\n",
"class FakeNewsDataset(Dataset):\n",
" def __init__(self, dataframe, vocab, max_length=256):\n",
" self.dataframe = dataframe\n",
" self.vocab = vocab\n",
" self.max_length = max_length\n",
"\n",
"# Tells PyTorch how many articles we have\n",
"#PyTorch calls this internally to know when to stop fetching data.\n",
" def __len__(self):\n",
" return len(self.dataframe)\n",
" \n",
" def __getitem__(self, idx):\n",
" # Grabs one article and its label at a time\n",
" tokens = self.dataframe.iloc[idx]['tokens']\n",
" label = self.dataframe.iloc[idx]['label']\n",
"\n",
" # Convert text tokens to Integer IDs\n",
" article_ids = [self.vocab.get(word, 1) for word in tokens]\n",
"\n",
" # Truncate or Pad the article so they are all exactly 'max_length' long\n",
" if len(article_ids) > self.max_length:\n",
" article_ids = article_ids[:self.max_length]\n",
" else:\n",
" padding = [0] * (self.max_length - len(article_ids))\n",
" article_ids.extend(padding)\n",
" \n",
" # Return as PyTorch tensors\n",
" return torch.tensor(article_ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f3f4096",
"metadata": {},
"outputs": [],
"source": [
"## Prepare the DataLoader \n",
"# Wrap The dataframe in the Dataset class\n",
"\n",
"# The DataLoader feeds the data to the model in batches (e.g., 64 articles at a time)\n",
"# This prevents the computer from running out of RAM!\n",
"\n",
"\n",
"my_train_dataset = FakeNewsDataset(dataframe=df, vocab=vocab, max_length=256)\n",
"# Shuffle is true for training so the data keeps getting shuffled when trained and the model does not memorise the data\n",
"train_dataloader = DataLoader(my_train_dataset, batch_size=64, shuffle=True,num_workers=4, # Start with 4; if CPU stays cool, try 6\n",
"pin_memory=True, # Essential for fast data transfer\n",
"prefetch_factor=2)\n",
"\n",
"\n",
"val_data = FakeNewsDataset(dataframe=df_val, vocab=vocab, max_length=256)\n",
"val_dataloader = DataLoader(val_data, batch_size=64, shuffle=False)\n",
"\n",
"test_data = FakeNewsDataset(dataframe=df_test, vocab=vocab, max_length=256)\n",
"test_dataloader = DataLoader(test_data, batch_size=64, shuffle=False)"
]
},
{
"cell_type": "markdown",
"id": "fd4f08a6",
"metadata": {},
"source": [
"Checking if the data conversion works"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9bcbcf9b",
"metadata": {},
"outputs": [],
"source": [
"# features, labels = next(iter(train_dataloader))\n",
"# # 2. Check the shapes (the dimensions of your tensors)\n",
"# print(\"--- Tensor Shapes ---\")\n",
"# print(f\"Features shape: {features.shape}\") \n",
"# print(f\"Labels shape: {labels.shape}\") \n",
"\n",
"# # 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)\n",
"# print(\"\\n--- Data Types ---\")\n",
"# print(f\"Features dtype: {features.dtype}\")\n",
"# print(f\"Labels dtype: {labels.dtype}\")\n",
"\n",
"# # 4. Peek at the actual data for the very first article in this batch\n",
"# print(\"\\n--- First Article Peek ---\")\n",
"# print(f\"Label: {labels[0].item()} (0 = Real, 1 = Fake)\")\n",
"# print(f\"Tokens (first 20 IDs): {features[0][:20]}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b70e45ac",
"metadata": {},
"outputs": [],
"source": [
"class BaseModel(nn.Module):\n",
" def __init__(self, vocab_size, embed_dim=32, h1=256, h2=128, out_features=2):\n",
" super().__init__()\n",
" \n",
" # The Embedding Layer: Turns word IDs into rich numerical vectors\n",
" self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)\n",
" \n",
" # The Linear Layers: Learn the patterns to decide Fake vs. Real\n",
" self.fc1 = nn.Linear(embed_dim, h1)\n",
" self.fc2 = nn.Linear(h1, h2)\n",
" self.out = nn.Linear(h2, out_features)\n",
" \n",
" def forward(self, x):\n",
" \n",
" # x starts as integers: shape (batch_size, sequence_length) -> e.g., (64, 256)\n",
" # Pass through embedding\n",
" x = self.embedding(x) \n",
" # Average the word vectors to get one single vector for the whole article\n",
" x = x.mean(dim=1) \n",
" \n",
" # Pass through hidden layers with ReLU activation\n",
" x = F.relu(self.fc1(x))\n",
" x = F.relu(self.fc2(x))\n",
" \n",
" # Output layer (gives us the raw scores for 'Real' and 'Fake')\n",
" x = self.out(x)\n",
" return x\n",
"model_basic = BaseModel(vocab_size=len(vocab))"
]
},
{
"cell_type": "markdown",
"id": "efa6c453",
"metadata": {},
"source": [
"'Advanced'"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52cb9377",
"metadata": {},
"outputs": [],
"source": [
"class advanced_model(nn.Module):\n",
" def __init__(self, vocab_size, embed_dim=64, hidden_dim=128,num_layer = 2, out_features=2):\n",
" super().__init__()\n",
" \n",
" # 1. The Embedding Layer (Same as before)\n",
" self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)\n",
" \n",
" # # 2. The GRU Layer (Extra layer)\n",
" # batch_first=True is required because our DataLoader outputs (batch_size, sequence_length) \n",
" self.gru = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, num_layers=2,batch_first=True,bidirectional=True, \n",
" dropout=0.3)\n",
" \n",
" # 3. The Final Output Layer\n",
" # connect the GRU's memory (hidden_dim) directly to our Real/Fake outputs\n",
" self.out = nn.Linear(hidden_dim, out_features)\n",
" self.fc = nn.Linear(hidden_dim * 2, out_features)\n",
" def forward(self, x):\n",
" # x shape: (batch_size, sequence_length) -> e.g., (64, 256)\n",
" \n",
" #Get the word embeddings\n",
" x = self.embedding(x) \n",
" # x shape becomes: (64, 256, 32)\n",
" \n",
" # Pass the embeddings into the GRU\n",
" # A GRU outputs two things: the output at every single word, AND its final memory state.\n",
" # We use '_' to ignore the step-by-step output, and save 'hidden_state'.\n",
" _, hidden = self.gru(x)\n",
" \n",
" # 4. Extract and Concatenate the final forward and backward states\n",
" # hidden[-2] is the last forward state, hidden[-1] is the last backward state\n",
" out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)\n",
" \n",
" return self.fc(out)\n",
" \n",
"# Initialize\n",
"model_adv = advanced_model(vocab_size=len(vocab))"
]
},
{
"cell_type": "markdown",
"id": "31b581d0",
"metadata": {},
"source": [
"# Training"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8e1f849",
"metadata": {},
"outputs": [],
"source": [
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae976afb",
"metadata": {},
"outputs": [],
"source": [
"def evaluate_performance(model, dataloader, device):\n",
" model.eval() # Put model in evaluation mode\n",
" \n",
" all_predictions = []\n",
" all_true_labels = []\n",
" \n",
" # Turn off gradient tracking to save memory\n",
" with torch.no_grad():\n",
" for features, labels in dataloader:\n",
" features = features.to(device)\n",
" labels = labels.to(device)\n",
" \n",
" # Get model scores\n",
" scores = model(features)\n",
" \n",
" # Find the predicted class (0 or 1)\n",
" _, predictions = torch.max(scores,1)\n",
" \n",
" # Save predictions and actual labels to lists\n",
" # all_predictions.extend(predictions.cpu().tolist())\n",
" # all_true_labels.extend(labels.cpu().tolist())\n",
" all_predictions.extend(predictions.cpu().numpy().flatten().tolist())\n",
" all_true_labels.extend(labels.cpu().numpy().flatten().tolist())\n",
" \n",
" all_predictions = np.array(all_predictions)\n",
" all_true_labels = np.array(all_true_labels)\n",
" \n",
" accuracy = (all_predictions == all_true_labels).mean() * 100\n",
" \n",
" # 4. Calculate F1 Score\n",
" # average='macro' is best for your report to show you care about both classes equally\n",
" f1 = f1_score(all_true_labels, all_predictions, average='macro')\n",
" model.train() # Return model to training mode just in case\n",
" return accuracy, f1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "65e26f88",
"metadata": {},
"outputs": [],
"source": [
"def train_model(model, train_loader, val_loader, device, epochs=5, lr=0.001):\n",
" model = model.to(device)\n",
" criterion = nn.CrossEntropyLoss()\n",
" optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n",
" \n",
" # Dictionary to store results for your report\n",
" history = {'train_loss': [], 'val_acc': [], 'val_f1': []}\n",
"\n",
" print(f\"Training {model.__class__.__name__} on {device}...\")\n",
"\n",
" for epoch in range(epochs):\n",
" model.train()\n",
" total_loss = 0\n",
" \n",
" for batch_idx, (features, labels) in enumerate(train_loader):\n",
" features, labels = features.to(device), labels.to(device)\n",
" \n",
" optimizer.zero_grad()\n",
" predictions = model(features)\n",
" loss = criterion(predictions, labels)\n",
" loss.backward()\n",
" optimizer.step()\n",
" \n",
" total_loss += loss.item()\n",
" \n",
" avg_loss = total_loss / len(train_loader)\n",
" \n",
" # After each epoch, evaluate on validation set\n",
" val_acc, val_f1 = evaluate_performance(model, val_loader, device)\n",
" \n",
" # Save results to our history dictionary\n",
" history['train_loss'].append(avg_loss)\n",
" history['val_acc'].append(val_acc)\n",
" history['val_f1'].append(val_f1)\n",
" \n",
" print(f\"\\n Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f} \\n Val Acc: {val_acc:.2f}% \\n Val F1: {val_f1:.4f}\")\n",
"\n",
" return history # Return the results so we can plot them later"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3acf0f2b",
"metadata": {},
"outputs": [],
"source": [
"train_995_basic =train_model (model_basic, train_dataloader, val_dataloader, device, epochs =7 )\n",
"print(train_995_basic )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c0f7f65",
"metadata": {},
"outputs": [],
"source": [
"train_995_adv =train_model (model_adv, train_dataloader, val_dataloader, device, epochs =7 )\n",
"print(train_995_adv )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1e10032",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "12959462",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "9fb31c02",
"metadata": {},
"source": [
"# Evaluation"
]
},
{
"cell_type": "markdown",
"id": "2630d40a",
"metadata": {},
"source": [
"Basic model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "73c388e7",
"metadata": {},
"outputs": [],
"source": [
"# # 1. The Evaluation Function\n",
"# def evaluate_performance(model, dataloader, device):\n",
"# model.eval() # Put model in evaluation mode\n",
" \n",
"# all_predictions = []\n",
"# all_true_labels = []\n",
" \n",
"# # Turn off gradient tracking to save memory\n",
"# with torch.no_grad():\n",
"# for features, labels in dataloader:\n",
"# features = features.to(device)\n",
"# labels = labels.to(device)\n",
" \n",
"# # Get model scores\n",
"# scores = model(features)\n",
" \n",
"# # Find the predicted class (0 or 1)\n",
"# _, predictions = torch.max(scores,1)\n",
" \n",
"# # Save predictions and actual labels to lists\n",
"# # all_predictions.extend(predictions.cpu().tolist())\n",
"# # all_true_labels.extend(labels.cpu().tolist())\n",
"# all_predictions.extend(predictions.cpu().numpy().flatten().tolist())\n",
"# all_true_labels.extend(labels.cpu().numpy().flatten().tolist())\n",
" \n",
"# all_predictions = np.array(all_predictions)\n",
"# all_true_labels = np.array(all_true_labels)\n",
" \n",
"# accuracy = (all_predictions == all_true_labels).mean() * 100\n",
" \n",
"# # 4. Calculate F1 Score\n",
"# # average='macro' is best for your report to show you care about both classes equally\n",
"# f1 = f1_score(all_true_labels, all_predictions, average='macro')\n",
"# model.train() # Return model to training mode just in case\n",
"# return accuracy, f1\n",
"# # # Change me based on the model\n",
"\n",
"# # model = model_basic.to(device)\n",
"\n",
"\n",
"# # print(f\"Training on: {device}\")\n",
"\n",
"# # # 2. Setup Loss and Optimizer\n",
"# # # CrossEntropyLoss is the standard for classification tasks\n",
"# # criterion = nn.CrossEntropyLoss() \n",
"# # # Adam is a very reliable, fast optimizer\n",
"# # optimizer = torch.optim.Adam(model.parameters(), lr=0.001) \n",
"\n",
"# # # 3. The Training Loop\n",
"# # epochs = 7# Start with a small number of passes through the whole dataset\n",
"\n",
"# # for epoch in range(epochs):\n",
"# # model.train() # Tell the model it is in training mode\n",
"# # total_loss = 0\n",
" \n",
"# # # Loop through our batches of 64 articles\n",
"# # for batch_idx, (features, labels) in enumerate(train_dataloader):\n",
" \n",
"# # # Move data to the same device as the model (GPU/CPU)\n",
"# # features = features.to(device)\n",
"# # labels = labels.to(device)\n",
" \n",
"# # # Step A: Reset the optimizer's gradients\n",
"# # optimizer.zero_grad()\n",
" \n",
"# # # Step B: Forward Pass (Have the model guess Real or Fake)\n",
"# # predictions = model(features)\n",
" \n",
"# # # Step C: Calculate Loss (How wrong were the guesses?)\n",
"# # loss = criterion(predictions, labels)\n",
" \n",
"# # # Step D: Backward Pass (Calculate how to fix the math)\n",
"# # loss.backward()\n",
" \n",
"# # # Step E: Optimize (Actually apply the fixes to the model's weights)\n",
"# # optimizer.step()\n",
" \n",
"# # total_loss += loss.item()\n",
" \n",
"# # # Print an update every 100 batches so we know it's working\n",
"# # if batch_idx % 100 == 0:\n",
"# # print(f\"Epoch [{epoch+1}/{epochs}] | Batch {batch_idx} | Loss: {loss.item():.4f}\")\n",
" \n",
"# # # Print the average loss at the end of each epoch\n",
"# # avg_loss = total_loss / len(train_dataloader)\n",
"# # print(f\"--- End of Epoch {epoch+1} | Average Loss: {avg_loss:.4f} ---\")"
]
},
{
"cell_type": "markdown",
"id": "09b0ce98",
"metadata": {},
"source": [
"Advanced model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2ca196d",
"metadata": {},
"outputs": [],
"source": [
"# # 1. The Evaluation Function\n",
"# def evaluate_performance(model_adv, dataloader, device):\n",
"# model_adv.eval() # Put model in evaluation mode\n",
" \n",
"# all_predictions = []\n",
"# all_true_labels = []\n",
" \n",
"# # Turn off gradient tracking to save memory\n",
"# with torch.no_grad():\n",
"# for features, labels in dataloader:\n",
"# features = features.to(device)\n",
"# labels = labels.to(device)\n",
" \n",
"# # Get model scores\n",
"# scores = model_adv(features)\n",
" \n",
"# # Find the predicted class (0 or 1)\n",
"# _, predictions = scores.max(1)\n",
" \n",
"# # Save predictions and actual labels to lists\n",
"# all_predictions.extend(predictions.cpu().tolist())\n",
"# all_true_labels.extend(labels.cpu().tolist())\n",
" \n",
"# # Calculate Accuracy\n",
"# correct_guesses = sum(p == t for p, t in zip(all_predictions, all_true_labels))\n",
"# accuracy = (correct_guesses / len(all_true_labels)) * 100\n",
" \n",
"# # Calculate F1 Score\n",
"# f1 = f1_score(all_true_labels, all_predictions, average='macro')\n",
" \n",
"# model_adv.train() # Return model to training mode just in case\n",
"# return accuracy, f1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5835388c",
"metadata": {},
"outputs": [],
"source": [
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6ca6771",
"metadata": {},
"outputs": [],
"source": [
"print(\"Basic model \")\n",
"print(\" Validation \")\n",
"val_acc995, val_f1_995 = evaluate_performance(model_basic, val_dataloader, device)\n",
"print(f\"Validation Accuracy: {val_acc995:.2f}%\")\n",
"print(f\"Validation F1 Score: {val_f1_995:.4f}\")\n",
"\n",
"print(\"\\n Testing Phase \")\n",
"test_acc995, test_f1_995 = evaluate_performance(model_basic, test_dataloader, device)\n",
"print(f\"Test Accuracy: {test_acc995:.2f}%\")\n",
"print(f\"Test F1 Score: {test_f1_995:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e206d094",
"metadata": {},
"outputs": [],
"source": [
"print(\" GRU model \")\n",
"print(\" Validation \")\n",
"adv_val_acc995, adv_val_f1_995 = evaluate_performance(model_adv, val_dataloader, device)\n",
"print(f\"Validation Accuracy: {adv_val_acc995:.2f}%\")\n",
"print(f\"Validation F1 Score: {adv_val_f1_995:.4f}\")\n",
"\n",
"print(\"\\n Testing \")\n",
"test_acc, test_f1 = evaluate_performance(model_adv, test_dataloader, device)\n",
"print(f\"Test Accuracy: {test_acc:.2f}%\")\n",
"print(f\"Test F1 Score: {test_f1:.4f}\")"
]
},
{
"cell_type": "markdown",
"id": "f6a4ae72",
"metadata": {},
"source": [
"# Liar data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fc7b8dac",
"metadata": {},
"outputs": [],
"source": [
"from helper import LIAR_labelling\n",
"\n",
"f\"../../data/training/LIAR.parquet\"\n",
"df_LIAR = pd.read_parquet(\"../../data/testing/LIAR.parquet\",columns=['tokens','type'])\n",
"\n",
"\n",
"df_LIAR['label'] = df_LIAR['type'].apply(LIAR_labelling).astype(str)\n",
"df_LIAR['label'] = df_LIAR['label'].map(label_map).astype(int)\n",
"df_LIAR = df_LIAR.drop(columns=['type'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f73f6f84",
"metadata": {},
"outputs": [],
"source": [
"df_LIAR.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a76196e",
"metadata": {},
"outputs": [],
"source": [
"# count how many tokens we have in the corpus \n",
"word_counts = Counter()\n",
"for x in df_LIAR['tokens']:\n",
" word_counts.update(x)\n",
" \n",
"# Keep the top 50,000 words. \n",
"# Index 0 is for <PAD> (padding), Index 1 is for <UNK> (unknown words)\n",
"vocab = {\"<PAD>\": 0, \"<UNK>\": 1}\n",
"for idx, (word, count) in enumerate(word_counts.most_common(50000), start=2):\n",
" vocab[word] = idx\n",
"\n",
"print(f\"Vocabulary built with {len(vocab)} words.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "39dbe869",
"metadata": {},
"outputs": [],
"source": [
"LR_DATA = FakeNewsDataset(dataframe=df_LIAR, vocab=vocab, max_length=256)\n",
"LR_dataloader = DataLoader(LR_DATA, batch_size=32, shuffle=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ccbc7885",
"metadata": {},
"outputs": [],
"source": [
"features, labels = next(iter(LR_dataloader))\n",
"# 2. Check the shapes (the dimensions of your tensors)\n",
"print(\"--- Tensor Shapes ---\")\n",
"print(f\"Features shape: {features.shape}\") \n",
"print(f\"Labels shape: {labels.shape}\") \n",
"\n",
"# 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)\n",
"print(\"\\n--- Data Types ---\")\n",
"print(f\"Features dtype: {features.dtype}\")\n",
"print(f\"Labels dtype: {labels.dtype}\")\n",
"\n",
"# 4. Peek at the actual data for the very first article in this batch\n",
"print(\"\\n--- First Article Peek ---\")\n",
"print(f\"Label: {labels[0].item()} (0 = Fake, 1 = Real)\")\n",
"print(f\"Tokens (first 20 IDs): {features[0][:20]}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4698cd06",
"metadata": {},
"outputs": [],
"source": [
"# # 1. Check a single sample from the Dataset directly\n",
"# single_features, single_label = LR_DATA[0]\n",
"# print(f\"Single Sample - Features: {single_features.shape}, Label: {single_label.shape}\")\n",
"\n",
"# # 2. Check the DataLoader batch\n",
"# batch_features, batch_labels = next(iter(LR_dataloader))\n",
"# # print(f\"Batch - Features: {batch_features.shape}, Labels: {batch_labels.shape}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ed9c57c2",
"metadata": {},
"outputs": [],
"source": [
"evaluate_performance(model_adv, LR_dataloader, device)\n",
"\n",
"print(\"\\n--- 2. Testing Advanced model ---\")\n",
"test_acc, test_f1 = evaluate_performance(model_adv, LR_dataloader, device)\n",
"print(f\"Test Accuracy: {test_acc:.2f}%\")\n",
"print(f\"Test F1 Score: {test_f1:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "74127f71",
"metadata": {},
"outputs": [],
"source": [
"print(\"\\n--- 2. Testing BASE-Model ---\")\n",
"test_acc, test_f1 = evaluate_performance(model_basic, LR_dataloader, device)\n",
"print(f\"Test Accuracy: {test_acc:.2f}%\")\n",
"print(f\"Test F1 Score: {test_f1:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "33c54c0e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

51
src/models/baseline.py Normal file
View File

@@ -0,0 +1,51 @@
import pickle
from typing import override, Callable
from constants import TRAINING_DIR, MODEL_DIR
from labels import Label
from models.model import Model
from helper import dataset_iterator, default_labelling
import pandas as pd
from random import random
class Baseline_model(Model):
    """Naive baseline classifier.

    Learns only the empirical fraction of FAKE articles in the training
    set and, at classification time, guesses FAKE with that probability.
    """
    def __init__(self, model_filename:str="", label_translator: Callable[[str], Label] = default_labelling) -> None:
        # Probability of predicting FAKE; set by train() or load().
        # Must exist before super().__init__, which may call load().
        self.fake_probability = 0
        super().__init__(model_filename, label_translator)
    @override
    def train(self, training_dataset:str, hyperparameters:dict[str, float]={}) -> None:
        """Estimate the FAKE prior from the dataset.

        training_dataset: parquet filename under TRAINING_DIR.
        hyperparameters: accepted for interface compatibility; unused.
        """
        fake_amount = 0
        total_amount = 0
        for chunk in dataset_iterator(f"{TRAINING_DIR}/{training_dataset}", columns=['type']):
            # Count rows whose translated label is FAKE in this chunk.
            chunk_fake_amount = (chunk['type'].map(self.label_translator) == Label.FAKE).sum()
            fake_amount += chunk_fake_amount
            total_amount += len(chunk)
        self.fake_probability = fake_amount/total_amount
    @override
    def classify(self, input:pd.Series) -> Label:
        """Randomly predict FAKE in proportion to the learned prior."""
        if random() <= self.fake_probability:
            return Label.FAKE
        return Label.REAL
    @override
    def save(self, filename:str) -> None:
        """Pickle the learned state to MODEL_DIR/<filename>.

        Bug fix: previously wrote to a hard-coded "(unknown)" path,
        silently ignoring the filename argument.
        """
        data = {}
        data["label_translator"] = self.label_translator
        data["fake_probability"] = self.fake_probability
        with open(f"{MODEL_DIR}/{filename}", 'wb') as file:
            pickle.dump(data, file)
    @override
    def load(self, filename:str) -> None:
        """Restore state written by save() from MODEL_DIR/<filename>."""
        with open(f"{MODEL_DIR}/{filename}", 'rb') as file:
            data = pickle.load(file)
        self.label_translator = data["label_translator"]
        self.fake_probability = data["fake_probability"]

View File

@@ -0,0 +1,57 @@
from constants import TRAINING_DIR, MODEL_DIR
from models.model import Model
from labels import Label
from helper import default_labelling
from typing import override, Callable
import pandas as pd
import pickle
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
def no_tokenization(str):
    """Pass-through tokenizer for TfidfVectorizer.

    The dataset's articles are stored as pre-tokenized, space-joined
    strings, so splitting on single spaces recovers the token list.
    """
    tokens = str.split(" ")
    return tokens
class Gradient_boosting_model(Model):
    """TF-IDF + GradientBoostingClassifier pipeline over pre-tokenized text."""
    def __init__(self, model_filename: str = "", label_translator: Callable[[str], Label] = default_labelling) -> None:
        super().__init__(model_filename, label_translator)
    @override
    def train(self, training_dataset: str, hyperparameters: dict[str, float] = {}) -> None:
        """Fit the pipeline on (up to) the first 250,000 rows.

        training_dataset: parquet filename under TRAINING_DIR.
        hyperparameters: accepted for interface compatibility; unused.
        """
        print("this model takes around 10 hours to train")
        # Read both columns in a single pass instead of loading the
        # parquet file twice.
        df = pd.read_parquet(f"{TRAINING_DIR}/{training_dataset}", columns=['tokens', 'type'])
        X = df['tokens'].apply(lambda token_list: " ".join(token_list))
        Y = df['type'].apply(lambda label: self.label_translator(label).value)
        # Cap the training size to keep the fit time manageable.
        X = X[:250000]
        Y = Y[:250000]
        model = Pipeline([
            ("L string", TfidfVectorizer(tokenizer=no_tokenization)),
            ("forest", GradientBoostingClassifier(random_state=0, n_estimators=4000))
        ])
        model.fit(X, Y)
        self.model = model
    @override
    def classify(self, input: pd.Series) -> Label:
        """Classify one article row; expects a 'tokens' list in the row."""
        X = " ".join(input['tokens'])
        return Label(self.model.predict([X])[0])
    @override
    def save(self, filename: str) -> None:
        """Pickle the fitted pipeline to MODEL_DIR/<filename>.

        Bug fix: previously wrote to a hard-coded "(unknown)" path,
        silently ignoring the filename argument.
        """
        data = {}
        data["label_translator"] = self.label_translator
        data["model"] = self.model
        with open(f"{MODEL_DIR}/{filename}", 'wb') as file:
            pickle.dump(data, file)
    @override
    def load(self, filename: str) -> None:
        """Restore state written by save() from MODEL_DIR/<filename>."""
        with open(f"{MODEL_DIR}/{filename}", 'rb') as file:
            data = pickle.load(file)
        self.label_translator = data["label_translator"]
        self.model = data["model"]

View File

@@ -0,0 +1,133 @@
import pickle
from typing import override, Callable
from scipy.sparse import lil_array
from constants import TRAINING_DIR, MODEL_DIR
from labels import Label
from models.model import Model
from helper import dataset_iterator
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
from helper import default_labelling
class Logistic_model(Model):
def __init__(self, model_filename: str = "", label_translator: Callable[[str], Label] = default_labelling) -> None:
super().__init__(model_filename, label_translator)
@override
def train(self, training_dataset: str, hyperparameters: dict[str, float] = {}) -> None:
token_counts:dict[str, int] = {}
sorted_token_counts:dict[str, int] = {}
token_id:dict[str, int] = {} # converts top 10K words to id's.
domain_counts:dict[str, int] = {}
sorted_domain_counts:dict[str, int] = {}
domain_id:dict[str, int] = {} # converts top 500 domains to id's.
self.consider_metadata = False
if "metadata" in hyperparameters and hyperparameters["metadata"] == 1:
self.consider_metadata = True
columns = ["tokens", "domain"]
rows_processed = 0
for chunk in dataset_iterator(f"{TRAINING_DIR}/{training_dataset}", columns=columns):
rows_processed += len(chunk)
for _, row in chunk.iterrows():
for token in row['tokens']:
if token not in token_counts:
token_counts[token] = 0
token_counts[token] += 1
if row['domain'] not in domain_counts:
domain_counts[row['domain']] = 0
domain_counts[row['domain']] += 1
for token in sorted(token_counts, key=lambda token: token_counts[token], reverse=True):
sorted_token_counts[token] = token_counts[token]
for domain in sorted(domain_counts, key=lambda domain: domain_counts[domain], reverse=True):
sorted_domain_counts[domain] = domain_counts[domain]
idx = 0
for token in sorted_token_counts:
token_id[token] = idx
idx += 1
if idx >= 10000:
break
idx = 0
for domain in sorted_domain_counts:
domain_id[domain] = idx
idx += 1
if idx >= 1000:
break
if self.consider_metadata: # consider things other than tokens
X = lil_array((rows_processed, 11000), dtype="float64")
else:
X = lil_array((rows_processed, 10000), dtype="float64") # non-sparse array uses 74GiB ram on 995,000_rows. Sklearn LogisticRegression supports sparse arrays though. It still uses 9+ now.
Y = np.zeros(rows_processed, dtype=int)
columns.append("type")
article_num = 0
for chunk in dataset_iterator(f"{TRAINING_DIR}/{training_dataset}", columns=columns):
for _, row in chunk.iterrows():
tokens = row['tokens']
article_type = row['type']
article_word_counts = np.zeros(10000)
for token in tokens:
if token not in token_id:
continue # if they are not in top 10K vocab we can ignore them
article_word_counts[token_id[token]] += 1
X[article_num, :10000] = article_word_counts
if self.consider_metadata:
if row['domain'] in domain_id:
X[article_num, 10000+domain_id[row['domain']]] = 1
Y[article_num] = self.label_translator(article_type).value
article_num += 1
self.regression_model = LogisticRegression(max_iter=10000, n_jobs = -1, class_weight="balanced").fit(X, Y)
self.token_id = token_id
self.domain_id = domain_id
@override
def classify(self, input: pd.Series) -> Label:
if self.consider_metadata:
x = np.zeros(11000)
else:
x = np.zeros(10000)
for token in input['tokens']:
if token not in self.token_id:
continue
x[self.token_id[token]] += 1
if self.consider_metadata:
if input['domain'] in self.domain_id:
x[10000+self.domain_id[input['domain']]] = 1
prediction = self.regression_model.predict([x])[0]
return Label(prediction)
@override
def save(self, filename: str) -> None:
    """Persist the trained model and its vocabularies to MODEL_DIR/filename.

    Stores everything `load` needs to restore an equivalent classifier:
    the label translator, the fitted regression model, both id mappings
    and the metadata flag.
    """
    data = {
        "label_translator": self.label_translator,
        "regression_model": self.regression_model,
        "token_id": self.token_id,
        "domain_id": self.domain_id,
        "consider_metadata": self.consider_metadata,
    }
    # Bug fix: the target path previously ignored the `filename`
    # parameter, so every save clobbered the same file.
    with open(f"{MODEL_DIR}/{filename}", 'wb') as file:
        pickle.dump(data, file)
@override
def load(self, filename: str) -> None:
    """Restore a model previously written by `save` from MODEL_DIR/filename."""
    # Bug fix: the source path previously ignored the `filename`
    # parameter, so only one fixed file could ever be loaded.
    with open(f"{MODEL_DIR}/{filename}", 'rb') as file:
        data = pickle.load(file)
    self.label_translator = data["label_translator"]
    self.regression_model = data["regression_model"]
    self.token_id = data["token_id"]
    self.domain_id = data["domain_id"]
    self.consider_metadata = data["consider_metadata"]

61
src/models/model.py Normal file
View File

@@ -0,0 +1,61 @@
from abc import ABC, abstractmethod
import pandas as pd
from time import perf_counter
from constants import TESTING_DIR, VALIDATION_DIR
from helper import LIAR_labelling, dataset_iterator, default_labelling
from labels import Label
from typing import Callable
class Model(ABC):
    """Abstract base class for fake-news classifiers.

    Subclasses implement training, per-row classification and
    (de)serialisation; `test` provides shared metric evaluation with
    FAKE treated as the positive class.
    """

    def __init__(self, model_filename: str = "", label_translator: Callable[[str], Label] = default_labelling) -> None:
        self.label_translator = label_translator
        # An empty filename means "train from scratch"; otherwise restore
        # a previously saved model immediately.
        if model_filename:
            self.load(model_filename)

    @abstractmethod
    def train(self, training_dataset: str, hyperparameters: dict[str, float]) -> None:
        """Fit the model on the named dataset file from the training directory."""
        pass

    @abstractmethod
    def classify(self, input: pd.Series) -> Label:
        """Return the predicted Label for one article row."""
        pass

    @abstractmethod
    def save(self, filename: str) -> None:
        """Serialise the model state to the given file."""
        pass

    @abstractmethod
    def load(self, filename: str) -> None:
        """Restore model state from the given file."""
        pass

    def test(self, test_dataset: str, validate: bool = True) -> tuple[float, float, float, float]:
        """Evaluate on a dataset and return (accuracy, recall, precision, F1).

        When `validate` is True the validation split is used, otherwise
        the held-out test split. Metrics are printed as a side effect.
        NOTE(review): selecting "LIAR.parquet" permanently swaps this
        instance's label translator — confirm that is intended.
        """
        # The LIAR dataset uses its own labelling scheme.
        if test_dataset == "LIAR.parquet":
            self.label_translator = LIAR_labelling
        dataset_dir = VALIDATION_DIR if validate else TESTING_DIR
        df = pd.read_parquet(f"{dataset_dir}/{test_dataset}")
        expected = df['type'].apply(self.label_translator)
        predicted = df.apply(self.classify, axis=1)
        # Confusion-matrix counts; FAKE is the positive class.
        # (Cleanup: the counters were previously initialised to 0 and then
        # immediately reassigned — the dead initialisations are removed.)
        TP = ((expected == Label.FAKE) & (predicted == Label.FAKE)).sum()
        FP = ((expected == Label.REAL) & (predicted == Label.FAKE)).sum()
        TN = ((expected == Label.REAL) & (predicted == Label.REAL)).sum()
        FN = ((expected == Label.FAKE) & (predicted == Label.REAL)).sum()
        # Guard every ratio against division by zero on degenerate splits.
        total = TP + TN + FP + FN
        accuracy = (TP + TN) / total if total > 0 else 0
        recall = (TP) / (TP + FN) if (TP + FN) > 0 else 0
        precision = (TP) / (TP + FP) if (TP + FP) > 0 else 0
        F1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        print(f"Accuracy {accuracy}")
        print(f"Recall {recall}")
        print(f"precision {precision}")
        print(f"F1-score {F1}")
        return (accuracy, recall, precision, F1)

1189
src/models/nn.ipynb Normal file

File diff suppressed because it is too large Load Diff

579
src/models/nn.ju.py Normal file
View File

@@ -0,0 +1,579 @@
# %%
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '../'))
from helper import default_labelling
from sklearn.metrics import f1_score
import numpy as np
# %%
# Map the stringified Label enum members onto integer class ids used by
# the loss function (0 = fake, 1 = real).
label_map = {
    "Label.FAKE": 0,
    "Label.REAL": 1,
}
# %% [markdown]
"""
# Pipelining process
"""
# %%
def load_split(path: str) -> pd.DataFrame:
    """Load one dataset split and derive an integer `label` column.

    Reads only the `tokens` and `type` columns, maps `type` through
    `default_labelling` and `label_map`, and drops the raw `type` column.
    """
    frame = pd.read_parquet(path, columns=['tokens', 'type'])
    frame['label'] = frame['type'].apply(default_labelling).astype(str)
    frame['label'] = frame['label'].map(label_map).astype(int)
    return frame.drop(columns=['type'])


# The identical preprocessing was previously copy-pasted for all three
# splits; a single helper keeps them in sync.
df = load_split("../../data/training/995,000_rows.parquet")
df_test = load_split("../../data/testing/995,000_rows.parquet")
df_val = load_split("../../data/validation/995,000_rows.parquet")
# %%
# print("Loading Parquet file...")
# # Check the total number of rows (articles)
# print(f"Total rows in the raw Parquet file: {len(df)}")
# # Look at the first few rows to make sure the data looks correct
# print("\n--- First 3 Rows ---")
# print(df.head(3))
# %%
# Count token frequencies across the training corpus.
word_counts = Counter()
for token_list in df['tokens']:
    word_counts.update(token_list)
# Build the vocabulary from the 50,000 most frequent words.
# Index 0 is reserved for <PAD> (padding), index 1 for <UNK> (unknown words).
vocab = {"<PAD>": 0, "<UNK>": 1}
vocab.update({
    word: idx
    for idx, (word, _count) in enumerate(word_counts.most_common(50000), start=2)
})
print(f"Vocabulary built with {len(vocab)} words.")
# %%
# Create a Custom PyTorch Datase
# a wrapper for the data that PyTorch knows how to talk to.
class FakeNewsDataset(Dataset):
    """Wraps a tokenised dataframe so a PyTorch DataLoader can consume it.

    Each item is a fixed-length LongTensor of vocabulary ids plus an
    integer label. Articles are truncated or zero-padded to `max_length`.
    """

    def __init__(self, dataframe, vocab, max_length=256):
        self.dataframe = dataframe
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        # Number of articles; PyTorch uses this to bound data fetching.
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Fetch one (article, label) pair.
        row = self.dataframe.iloc[idx]
        # Map tokens to ids; id 1 (<UNK>) stands in for out-of-vocab words.
        ids = [self.vocab.get(token, 1) for token in row['tokens']]
        # Force a fixed length: cut long articles, pad short ones with 0 (<PAD>).
        ids = ids[:self.max_length]
        ids += [0] * (self.max_length - len(ids))
        return torch.tensor(ids, dtype=torch.long), torch.tensor(row['label'], dtype=torch.long)
# %%
## Prepare the DataLoader
# Wrap each split in the Dataset class; the DataLoader then feeds the model
# in batches (64 articles at a time) so the whole corpus never has to sit
# in RAM at once.
my_train_dataset = FakeNewsDataset(dataframe=df, vocab=vocab, max_length=256)
# Shuffling during training prevents the model from memorising data order.
train_dataloader = DataLoader(
    my_train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4,        # Start with 4; if CPU stays cool, try 6
    pin_memory=True,      # Essential for fast data transfer
    prefetch_factor=2,
)
val_data = FakeNewsDataset(dataframe=df_val, vocab=vocab, max_length=256)
val_dataloader = DataLoader(val_data, batch_size=64, shuffle=False)
test_data = FakeNewsDataset(dataframe=df_test, vocab=vocab, max_length=256)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=False)
# %% [markdown]
"""
Checking if the data conversion works
"""
# %%
# features, labels = next(iter(train_dataloader))
# # 2. Check the shapes (the dimensions of your tensors)
# print("--- Tensor Shapes ---")
# print(f"Features shape: {features.shape}")
# print(f"Labels shape: {labels.shape}")
# # 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)
# print("\n--- Data Types ---")
# print(f"Features dtype: {features.dtype}")
# print(f"Labels dtype: {labels.dtype}")
# # 4. Peek at the actual data for the very first article in this batch
# print("\n--- First Article Peek ---")
# print(f"Label: {labels[0].item()} (0 = Real, 1 = Fake)")
# print(f"Tokens (first 20 IDs): {features[0][:20]}")
# %%
class BaseModel(nn.Module):
    """Bag-of-embeddings classifier: embed, mean-pool, two dense layers."""

    def __init__(self, vocab_size, embed_dim=32, h1=256, h2=128, out_features=2):
        super().__init__()
        # Embedding layer: word ids -> dense vectors.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # Fully connected layers learn the Fake-vs-Real decision.
        self.fc1 = nn.Linear(embed_dim, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.out = nn.Linear(h2, out_features)

    def forward(self, x):
        # x: (batch_size, sequence_length) integer ids, e.g. (64, 256).
        embedded = self.embedding(x)
        # Average the word vectors into one vector per article.
        pooled = embedded.mean(dim=1)
        hidden = F.relu(self.fc1(pooled))
        hidden = F.relu(self.fc2(hidden))
        # Raw logits for the two classes.
        return self.out(hidden)
# Instantiate the baseline model sized to the training vocabulary.
model_basic = BaseModel(vocab_size=len(vocab))
# %% [markdown]
"""
'Advanced'
"""
# %%
class advanced_model(nn.Module):
    """Bidirectional multi-layer GRU classifier over word embeddings.

    The final forward and backward hidden states are concatenated and
    passed through a linear layer to produce the two class logits.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128, num_layer=2, out_features=2):
        super().__init__()
        # 1. Embedding layer: word ids -> dense vectors.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # 2. GRU layer. batch_first=True because the DataLoader yields
        # (batch_size, sequence_length). Bug fix: `num_layer` was accepted
        # but ignored (num_layers was hard-coded to 2); it is now honoured.
        # The default of 2 preserves the previous behaviour.
        self.gru = nn.GRU(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layer,
            batch_first=True,
            bidirectional=True,
            dropout=0.3,
        )
        # NOTE(review): `self.out` is never used in forward (the
        # bidirectional output needs hidden_dim * 2 inputs, which `self.fc`
        # handles). Kept so existing checkpoints still load — confirm
        # before removing.
        self.out = nn.Linear(hidden_dim, out_features)
        # 3. Output layer over the concatenated forward+backward states.
        self.fc = nn.Linear(hidden_dim * 2, out_features)

    def forward(self, x):
        # x: (batch_size, sequence_length) integer ids.
        embedded = self.embedding(x)
        # The GRU returns per-step outputs and the final hidden states;
        # only the hidden states are needed here.
        _, hidden = self.gru(embedded)
        # hidden[-2] is the last forward state, hidden[-1] the last
        # backward state; concatenate them into one feature vector.
        combined = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        return self.fc(combined)
# Initialize the GRU model with the training vocabulary size.
model_adv = advanced_model(vocab_size=len(vocab))
# %% [markdown]
"""
# Training
"""
# %%
# Prefer the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# %%
def evaluate_performance(model, dataloader, device):
    """Run `model` over `dataloader` and return (accuracy %, macro F1)."""
    model.eval()  # evaluation mode (disables dropout etc.)
    predictions_list = []
    labels_list = []
    # No gradients are needed for evaluation; this saves memory.
    with torch.no_grad():
        for features, labels in dataloader:
            features = features.to(device)
            labels = labels.to(device)
            scores = model(features)
            # Predicted class = argmax over the class logits.
            _, batch_preds = torch.max(scores, 1)
            predictions_list.extend(batch_preds.cpu().numpy().flatten().tolist())
            labels_list.extend(labels.cpu().numpy().flatten().tolist())
    preds = np.array(predictions_list)
    truth = np.array(labels_list)
    accuracy = (preds == truth).mean() * 100
    # Macro-averaged F1 weights both classes equally.
    f1 = f1_score(truth, preds, average='macro')
    model.train()  # Return model to training mode just in case
    return accuracy, f1
# %%
def train_model(model, train_loader, val_loader, device, epochs=5, lr=0.001):
    """Train `model` with Adam + cross-entropy, validating after each epoch.

    Returns a history dict with per-epoch training loss, validation
    accuracy and validation macro F1 for later plotting.
    """
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Per-epoch metrics collected for the report.
    history = {'train_loss': [], 'val_acc': [], 'val_f1': []}
    print(f"Training {model.__class__.__name__} on {device}...")
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        # (Cleanup: the loop previously enumerated batches but never used
        # the batch index.)
        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)
            optimizer.zero_grad()
            predictions = model(features)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        # Track generalisation on the validation split after each epoch.
        val_acc, val_f1 = evaluate_performance(model, val_loader, device)
        history['train_loss'].append(avg_loss)
        history['val_acc'].append(val_acc)
        history['val_f1'].append(val_f1)
        print(f"\n Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f} \n Val Acc: {val_acc:.2f}% \n Val F1: {val_f1:.4f}")
    return history  # Return the results so we can plot them later
# %%
# Train the baseline and keep its metric history.
train_995_basic = train_model(model_basic, train_dataloader, val_dataloader, device, epochs=7)
print(train_995_basic)
# %%
# Train the GRU model and keep its metric history.
train_995_adv = train_model(model_adv, train_dataloader, val_dataloader, device, epochs=7)
print(train_995_adv)
# %%
# %%
# %% [markdown]
"""
# Evaluation
"""
# %% [markdown]
"""
Basic model
"""
# %%
# # 1. The Evaluation Function
# def evaluate_performance(model, dataloader, device):
# model.eval() # Put model in evaluation mode
# all_predictions = []
# all_true_labels = []
# # Turn off gradient tracking to save memory
# with torch.no_grad():
# for features, labels in dataloader:
# features = features.to(device)
# labels = labels.to(device)
# # Get model scores
# scores = model(features)
# # Find the predicted class (0 or 1)
# _, predictions = torch.max(scores,1)
# # Save predictions and actual labels to lists
# # all_predictions.extend(predictions.cpu().tolist())
# # all_true_labels.extend(labels.cpu().tolist())
# all_predictions.extend(predictions.cpu().numpy().flatten().tolist())
# all_true_labels.extend(labels.cpu().numpy().flatten().tolist())
# all_predictions = np.array(all_predictions)
# all_true_labels = np.array(all_true_labels)
# accuracy = (all_predictions == all_true_labels).mean() * 100
# # 4. Calculate F1 Score
# # average='macro' is best for your report to show you care about both classes equally
# f1 = f1_score(all_true_labels, all_predictions, average='macro')
# model.train() # Return model to training mode just in case
# return accuracy, f1
# # # Change me based on the model
# # model = model_basic.to(device)
# # print(f"Training on: {device}")
# # # 2. Setup Loss and Optimizer
# # # CrossEntropyLoss is the standard for classification tasks
# # criterion = nn.CrossEntropyLoss()
# # # Adam is a very reliable, fast optimizer
# # optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# # # 3. The Training Loop
# # epochs = 7# Start with a small number of passes through the whole dataset
# # for epoch in range(epochs):
# # model.train() # Tell the model it is in training mode
# # total_loss = 0
# # # Loop through our batches of 64 articles
# # for batch_idx, (features, labels) in enumerate(train_dataloader):
# # # Move data to the same device as the model (GPU/CPU)
# # features = features.to(device)
# # labels = labels.to(device)
# # # Step A: Reset the optimizer's gradients
# # optimizer.zero_grad()
# # # Step B: Forward Pass (Have the model guess Real or Fake)
# # predictions = model(features)
# # # Step C: Calculate Loss (How wrong were the guesses?)
# # loss = criterion(predictions, labels)
# # # Step D: Backward Pass (Calculate how to fix the math)
# # loss.backward()
# # # Step E: Optimize (Actually apply the fixes to the model's weights)
# # optimizer.step()
# # total_loss += loss.item()
# # # Print an update every 100 batches so we know it's working
# # if batch_idx % 100 == 0:
# # print(f"Epoch [{epoch+1}/{epochs}] | Batch {batch_idx} | Loss: {loss.item():.4f}")
# # # Print the average loss at the end of each epoch
# # avg_loss = total_loss / len(train_dataloader)
# # print(f"--- End of Epoch {epoch+1} | Average Loss: {avg_loss:.4f} ---")
# %% [markdown]
"""
Advanced model
"""
# %%
# # 1. The Evaluation Function
# def evaluate_performance(model_adv, dataloader, device):
# model_adv.eval() # Put model in evaluation mode
# all_predictions = []
# all_true_labels = []
# # Turn off gradient tracking to save memory
# with torch.no_grad():
# for features, labels in dataloader:
# features = features.to(device)
# labels = labels.to(device)
# # Get model scores
# scores = model_adv(features)
# # Find the predicted class (0 or 1)
# _, predictions = scores.max(1)
# # Save predictions and actual labels to lists
# all_predictions.extend(predictions.cpu().tolist())
# all_true_labels.extend(labels.cpu().tolist())
# # Calculate Accuracy
# correct_guesses = sum(p == t for p, t in zip(all_predictions, all_true_labels))
# accuracy = (correct_guesses / len(all_true_labels)) * 100
# # Calculate F1 Score
# f1 = f1_score(all_true_labels, all_predictions, average='macro')
# model_adv.train() # Return model to training mode just in case
# return accuracy, f1
# %%
# Re-select the device (harmless repeat of the earlier cell).
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# %%
print("Basic model ")
print(" Validation ")
# Bug fix: this cell referenced an undefined `model`; the basic model is
# bound to `model_basic`.
val_acc995, val_f1_995 = evaluate_performance(model_basic, val_dataloader, device)
print(f"Validation Accuracy: {val_acc995:.2f}%")
print(f"Validation F1 Score: {val_f1_995:.4f}")
print("\n Testing Phase ")
test_acc995, test_f1_995 = evaluate_performance(model_basic, test_dataloader, device)
print(f"Test Accuracy: {test_acc995:.2f}%")
# Bug fix: a stray "git" was embedded in the printed message.
print(f"Test F1 Score: {test_f1_995:.4f}")
# %%
print(" GRU model ")  # Bug fix: heading previously said "GURU".
print(" Validation ")
adv_val_acc995, adv_val_f1_995 = evaluate_performance(model_adv, val_dataloader, device)
print(f"Validation Accuracy: {adv_val_acc995:.2f}%")
# Bug fix: the cell previously printed `val_f1_995` (the *basic* model's
# F1) because the freshly computed value was stored under a different,
# near-identical name (`val_f1995`).
print(f"Validation F1 Score: {adv_val_f1_995:.4f}")
print("\n Testing ")
test_acc, test_f1 = evaluate_performance(model_adv, test_dataloader, device)
# Bug fix: referenced undefined `test_acc955` (typo for `test_acc`), and a
# stray "git" was embedded in the F1 message.
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: {test_f1:.4f}")
# %% [markdown]
"""
# Liar data
"""
# %%
from helper import LIAR_labelling
f"../../data/training/LIAR.parquet"
df_LIAR = pd.read_parquet("../../data/testing/LIAR.parquet",columns=['tokens','type'])
df_LIAR['label'] = df_LIAR['type'].apply(LIAR_labelling).astype(str)
df_LIAR['label'] = df_LIAR['label'].map(label_map).astype(int)
df_LIAR = df_LIAR.drop(columns=['type'])
# %%
df_LIAR.head()
# %%
# Bug fix: this cell previously rebuilt `word_counts` and `vocab` from the
# LIAR *test* data. That silently remapped every word id, so the trained
# embedding rows no longer corresponded to the input ids and the LIAR
# evaluation below was meaningless. Models must be evaluated with the same
# vocabulary they were trained on, so the training `vocab` is kept as-is.
print(f"Reusing training vocabulary with {len(vocab)} words.")
# %%
# Wrap the LIAR split for batched evaluation (no shuffling needed).
LR_DATA = FakeNewsDataset(dataframe=df_LIAR, vocab=vocab, max_length=256)
LR_dataloader = DataLoader(LR_DATA, batch_size=32, shuffle=False)
# %%
# Sanity-check one LIAR batch: shapes, dtypes, and a peek at the first article.
batch_features, batch_labels = next(iter(LR_dataloader))
print("--- Tensor Shapes ---")
print(f"Features shape: {batch_features.shape}")
print(f"Labels shape: {batch_labels.shape}")
# Embedding layers require torch.long inputs.
print("\n--- Data Types ---")
print(f"Features dtype: {batch_features.dtype}")
print(f"Labels dtype: {batch_labels.dtype}")
print("\n--- First Article Peek ---")
print(f"Label: {batch_labels[0].item()} (0 = Real, 1 = Fake)")
print(f"Tokens (first 20 IDs): {batch_features[0][:20]}")
# %%
# # 1. Check a single sample from the Dataset directly
# single_features, single_label = LR_DATA[0]
# print(f"Single Sample - Features: {single_features.shape}, Label: {single_label.shape}")
# # 2. Check the DataLoader batch
# batch_features, batch_labels = next(iter(LR_dataloader))
# # print(f"Batch - Features: {batch_features.shape}, Labels: {batch_labels.shape}")
# %%
evaluate_performance(model_adv,LR_dataloader,device)
print("\n--- 2. Testing Avanced model ---")
test_acc, test_f1 = evaluate_performance(model_adv, LR_dataloader, device)
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: git {test_f1:.4f}")
# %%
print("\n--- 2. Testing BASE-Model ---")
# Bug fix: referenced undefined `model`; the basic model is `model_basic`.
# A stray "git" in the F1 message is also removed.
test_acc, test_f1 = evaluate_performance(model_basic, LR_dataloader, device)
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: {test_f1:.4f}")
# %%

52
src/models/svm.py Normal file
View File

@@ -0,0 +1,52 @@
from constants import TRAINING_DIR, MODEL_DIR
from models.model import Model
from labels import Label
from helper import default_labelling
from typing import override, Callable
import pandas as pd
import pickle
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
def no_tokenization(text):
    """Split an already-tokenised, space-joined string back into tokens.

    Used as the TfidfVectorizer tokenizer so sklearn does not re-tokenise.
    (Fix: the parameter was previously named `str`, shadowing the builtin.)
    """
    return text.split(" ")
class SVM_model(Model):
    """TF-IDF + LinearSVC classifier over pre-tokenised articles."""

    def __init__(self, model_filename: str = "", label_translator: Callable[[str], Label] = default_labelling) -> None:
        super().__init__(model_filename, label_translator)

    @override
    def train(self, training_dataset: str, hyperparameters: dict[str, float] = {}) -> None:
        """Fit the TF-IDF/SVM pipeline on a training parquet file.

        Tokens are re-joined with spaces and split again by the vectorizer's
        `no_tokenization` callback, preserving the original tokenisation.
        """
        # Perf fix: the parquet file was previously read twice (once per
        # column); read both columns in a single pass.
        df = pd.read_parquet(f"{TRAINING_DIR}/{training_dataset}", columns=['tokens', 'type'])
        X = df['tokens'].apply(lambda token_list: " ".join(token_list))
        Y = df['type'].apply(lambda label: self.label_translator(label).value)
        model = Pipeline([
            ("L string", TfidfVectorizer(tokenizer=no_tokenization)),
            ("svm", LinearSVC(random_state=0))
        ])
        model.fit(X, Y)
        self.model = model

    @override
    def classify(self, input: pd.Series) -> Label:
        """Predict the Label for one article row (expects a `tokens` list)."""
        X = " ".join(input['tokens'])
        return Label(self.model.predict([X])[0])

    @override
    def save(self, filename: str) -> None:
        """Pickle the pipeline and label translator to MODEL_DIR/filename."""
        data = {
            "label_translator": self.label_translator,
            "model": self.model,
        }
        # Bug fix: the path previously ignored the `filename` parameter.
        with open(f"{MODEL_DIR}/{filename}", 'wb') as file:
            pickle.dump(data, file)

    @override
    def load(self, filename: str) -> None:
        """Restore a pipeline previously written by `save`."""
        # Bug fix: the path previously ignored the `filename` parameter.
        with open(f"{MODEL_DIR}/{filename}", 'rb') as file:
            data = pickle.load(file)
        self.label_translator = data["label_translator"]
        self.model = data["model"]

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,121 @@
# %% [markdown]
"""
# cleaning
big_data.csv.zst is the main file we will be using. Every step in the pipeline adds a new column and overwrites the file. This is reversible, and when any step changes, everything can be run again regardless of the state of the file.
"""
# %%
import nltk
import re
import os
import time
import pandas as pd
DATA_DIR = "../data"
# %%
# download nltk data (stopwords, tokenizers and stemmers used by clean_text)
nltk.download("all")
# %%
# Small sample dataset used to demonstrate the cleaning pipeline below.
news_sample = pd.read_csv(f"{DATA_DIR}/news_sample.csv")
# %%
# We will not waste space on csv files, L.
# One-time migration: if the raw 995,000-row CSV is still present, re-save
# it as zstd-compressed CSV (pandas infers the codec from the .zst
# extension) and delete the original.
if (os.path.exists(f"{DATA_DIR}/995,000_rows.csv")):
    big_data = pd.read_csv(f"{DATA_DIR}/995,000_rows.csv", low_memory=False)
    big_data.to_csv(f"{DATA_DIR}/big_data.csv.zst")
    os.remove(f"{DATA_DIR}/995,000_rows.csv")
    big_data = None  # release the frame; it is re-read in chunks later
# %%
# cleans text and returns a list of tokens.
def clean_text(
        text,
        remove_regex_patterns=True,
        remove_stopwords=True,
        remove_special_characters=True,
        stemming=True):
    """Normalise raw article text into a list of cleaned tokens.

    Steps (each individually switchable): lowercase + strip; replace URLs,
    emails, dates and numbers with placeholder tags; drop punctuation;
    tokenise; remove English stopwords; Snowball-stem the remaining tokens
    (placeholder tags such as <URL> are never stemmed).
    """
    text = str(text).lower().strip()
    if remove_regex_patterns:
        url_pattern = r'\S+\.\S+'
        email_pattern = r'\w+@\w+\.\w+'
        date_pattern = r'[a-z]+ \d{1,2}[a-z]?, \d{4}'  # add more date patterns
        number_pattern = r'\d+'
        text = re.sub(url_pattern, "<URL>", text)
        text = re.sub(email_pattern, "<EMAIL>", text)
        text = re.sub(date_pattern, "<DATE>", text)
        text = re.sub(number_pattern, "<NUMBER>", text)
    if remove_special_characters:
        text = re.sub(r'[^\w (?:<\w+>)]', " ", text)
    # Keep placeholder tags like <URL> as single tokens.
    tokenizer = nltk.RegexpTokenizer(r'<\w+>|\w+')
    tokens = tokenizer.tokenize(text)
    if remove_stopwords:
        # Bug fix: the original had a duplicated assignment
        # (`stopwords = stopwords = ...`). A set also makes the membership
        # test O(1) per token instead of scanning a list.
        stopword_set = set(nltk.corpus.stopwords.words('english'))
        tokens = [token for token in tokens if token not in stopword_set]
    if stemming:
        stemmer = nltk.SnowballStemmer("english")
        tokens = [stemmer.stem(token) if not re.match(r'<\w+>', token) else token for token in tokens]
    return tokens
# %% [markdown]
"""
## Output
Now we check what the function does and how the vocabulary changes.
"""
# %%
# Generates a vocabulary (set of unique words) from a pandas series.
def generate_vocabulary(series):
    """Return the set of unique tokens across a Series of token lists."""
    vocabulary = set()
    for tokens in series:
        vocabulary.update(tokens)
    return vocabulary
# %%
# Demonstrate the cleaning pipeline on one sample article, then measure how
# each cleaning stage shrinks the vocabulary of the whole sample set.
print("original text:\n")
print(news_sample['content'][1])
print("\n" + "-" * 100 + "\n")
print("cleaned tokens:\n")
print(clean_text(news_sample['content'][1]))
print("\n" + "-" * 100 + "\n")
# Extra keyword arguments to Series.apply are forwarded to clean_text.
tokenization_size = len(generate_vocabulary(news_sample['content'].apply(clean_text, remove_stopwords = False, stemming = False)))
stopwords_size = len(generate_vocabulary(news_sample['content'].apply(clean_text, remove_stopwords = True, stemming = False)))
stemming_size = len(generate_vocabulary(news_sample['content'].apply(clean_text, remove_stopwords = True, stemming = True)))
print("Unique words after tokenization:")
print(tokenization_size)
print("\nUnique words after stopword removal:")
print(stopwords_size)
print("\nUnique words after stemming:")
print(stemming_size)
print("\nStemming reduction rate:")
# Fraction of the post-stopword vocabulary eliminated by stemming.
print(f"{round(1 - stemming_size / stopwords_size, 4) * 100}%")
# %% [markdown]
"""
## Big Data
Now we clean the big dataset and save it to csv.zst file. Pandas can save and load zstd files just fine, and since it's realtime compression it doesn't really take more time while heavily reducing the file size.
"""
# %%
# Clean the full dataset in 10,000-row chunks so it never has to fit in
# memory, writing to a temporary file that atomically replaces the original.
start = time.perf_counter()
first = True
for big_data in pd.read_csv(f"{DATA_DIR}/big_data.csv.zst", chunksize=10000):
    big_data['tokens'] = big_data['content'].apply(clean_text)
    # Bug fix: appended chunks previously re-wrote the CSV header, leaving
    # spurious header rows in the middle of the output file. Only the first
    # chunk writes the header now.
    big_data.to_csv(f"{DATA_DIR}/big_data_new.csv.zst", mode='w' if first else 'a', header=first)
    first = False
os.rename(f"{DATA_DIR}/big_data_new.csv.zst", f"{DATA_DIR}/big_data.csv.zst")
print(f"cleaning took {round((time.perf_counter() - start) / 60, 5)} minutes")

View File

@@ -0,0 +1,65 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "95706a2e-9e23-4272-aeaa-4510254f7feb",
"metadata": {},
"source": [
"# Cleaning"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1be89b54-76dd-4c2e-bcdd-ff956bf375bf",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize\n"
]
},
{
"cell_type": "markdown",
"id": "b82cf2b2-7cee-4c34-83b9-37c5c4828289",
"metadata": {},
"source": [
"1. Tokenize the text"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dc8058fc-0ed9-4daf-918d-d3e82064a3a6",
"metadata": {},
"outputs": [],
"source": [
"nltk.download('punkt')\n",
"text = ("
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

42
src/setup.py Normal file
View File

@@ -0,0 +1,42 @@
from constants import DATASET_DIR, TRAINING_DIR, VALIDATION_DIR, TESTING_DIR, ORIGINAL_DATASET_FILES
from clean_data import clean_dataset
from helper import csv_to_parquet
from split import split_dataset, split_dataset_random
import nltk
import os
import shutil
import pandas as pd
def setup() -> None:
    """Prepare NLTK data, then convert, clean and split every dataset.

    Expects the raw dataset files listed in ORIGINAL_DATASET_FILES to be
    present in DATASET_DIR; raises if one is missing. The LIAR tsv splits
    are converted separately and moved into their split directories.
    """
    # make sure nltk can be used later.
    nltk.download("all")
    for dataset_file in ORIGINAL_DATASET_FILES:
        if not os.path.exists(f"{DATASET_DIR}/{dataset_file}"):
            raise Exception(f"Please add {dataset_file} to {DATASET_DIR}")
        name = os.path.splitext(dataset_file)[0]
        # Convert/clean/split only once; an existing parquet file marks a
        # completed run. NOTE(review): indentation was ambiguous in the
        # reviewed copy — confirm clean/split were intended to be skipped
        # when the parquet already exists.
        if not os.path.exists(f"{DATASET_DIR}/{name}.parquet"):
            csv_to_parquet(f"{DATASET_DIR}/{dataset_file}", f"{DATASET_DIR}/{name}.parquet")
            print(f"finished converting {dataset_file} to parquet")
            clean_dataset(f"{name}.parquet")
            print(f"cleaned {name}.parquet")
            split_dataset_random(f"{name}.parquet")
            # Typo fix in the status message ("traning" -> "training").
            print(f"split {name}.parquet into training, validation and test")
    # LIAR: each tsv file maps directly onto one split directory.
    for dataset, destination in [("train.tsv", TRAINING_DIR), ("valid.tsv", VALIDATION_DIR), ("test.tsv", TESTING_DIR)]:
        if os.path.exists(f"{DATASET_DIR}/{dataset}"):
            df = pd.read_csv(f"{DATASET_DIR}/{dataset}", sep='\t', header=None)
            # LIAR tsv has no header row; columns 1 and 2 hold label and text.
            df = df.rename(columns={
                1: "type",
                2: "content"
            })
            name = os.path.splitext(dataset)[0]
            df.to_parquet(f"{DATASET_DIR}/{name}.parquet")
            clean_dataset(f"{name}.parquet")
            shutil.move(f"{DATASET_DIR}/{name}.parquet", f"{destination}/LIAR.parquet")


if __name__ == "__main__":
    setup()

91
src/split.py Normal file
View File

@@ -0,0 +1,91 @@
from constants import CHUNK_SIZE, DATASET_DIR, TRAINING_DIR, VALIDATION_DIR, TESTING_DIR
import pyarrow.parquet as pq
import pyarrow as pa
import os
from helper import get_time_boundaries
import pandas as pd
import numpy as np
def split_dataset_random(filename: str) -> None:
    """Randomly split a parquet dataset: ~75% train / 10% validation / 15% test.

    Streams the file in CHUNK_SIZE batches so arbitrarily large datasets
    never have to fit in memory; each batch's rows are appended to the
    matching split file.
    """
    # Bug fix: the input/output paths previously ignored the `filename`
    # parameter and pointed at a fixed literal path.
    pq_file = pq.ParquetFile(f"{DATASET_DIR}/{filename}")
    training_writer = None
    validation_writer = None
    testing_writer = None
    for batch in pq_file.iter_batches(batch_size=CHUNK_SIZE):
        table = pa.Table.from_batches([batch])
        # One uniform draw per row decides its split membership.
        rng = np.random.rand(table.num_rows)
        training = table.filter(rng < 0.75)
        validation = table.filter((rng >= 0.75) & (rng < 0.85))
        testing = table.filter(rng >= 0.85)
        # Writers are created lazily so the schema comes from actual data.
        if not training_writer and training.num_rows:
            training_writer = pq.ParquetWriter(f"{TRAINING_DIR}/{filename}", training.schema)
        if not validation_writer and validation.num_rows:
            validation_writer = pq.ParquetWriter(f"{VALIDATION_DIR}/{filename}", validation.schema)
        if not testing_writer and testing.num_rows:
            testing_writer = pq.ParquetWriter(f"{TESTING_DIR}/{filename}", testing.schema)
        if training.num_rows:
            training_writer.write(training)
        if validation.num_rows:
            validation_writer.write(validation)
        if testing.num_rows:
            testing_writer.write(testing)
    # Bug fix: closing unconditionally raised AttributeError when a split
    # never received any rows (its writer was still None).
    for writer in (training_writer, validation_writer, testing_writer):
        if writer:
            writer.close()
def split_dataset(filename: str) -> None:
    """Chronologically split a parquet dataset by the `scraped_at` column.

    Oldest 80% -> training, next 10% -> validation, newest 10% -> test.
    Loads the whole file into memory; use `split_dataset_random` for a
    streaming alternative.
    """
    # Bug fix: the input/output paths previously ignored the `filename`
    # parameter and pointed at a fixed literal path.
    df = pd.read_parquet(f"{DATASET_DIR}/{filename}")
    n = len(df)
    # Unparseable timestamps become NaT and sort to the end.
    df['scraped_at'] = pd.to_datetime(df['scraped_at'], format='ISO8601', errors='coerce', utc=True)
    df = df.sort_values(by='scraped_at')
    df.iloc[:int(n * 0.8)].to_parquet(f"{TRAINING_DIR}/{filename}")
    df.iloc[int(n * 0.8):int(n * 0.9)].to_parquet(f"{VALIDATION_DIR}/{filename}")
    df.iloc[int(n * 0.9):].to_parquet(f"{TESTING_DIR}/{filename}")
    # Cleanup: an earlier streaming implementation remained below an
    # unconditional `return`. It was unreachable and referenced undefined
    # names (`filepath`, `train_cut`, `val_cut`), so it has been removed.