backup since codeberg is down

This commit is contained in:
2026-03-27 13:35:43 +01:00
commit 8a61a214c6
45 changed files with 5038 additions and 0 deletions

35
.gitignore vendored Normal file
View File

@@ -0,0 +1,35 @@
# Document
*.pdf
*.bak
*.tex.backup
*.tex~
*.synctex.gz
*.out
.bak
build/
_minted/
obj/
bin/
# Python
__pycache__/
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
.ipynb_checkpoints/
# data bs
data/**
!data/
!data/**/
!data/**/.gitkeep
# general bs
.DS_Store
flake.lock
.vscode/

6
README.md Normal file
View File

@@ -0,0 +1,6 @@
- download the necessary dataset files to data/datasets as csv (not zip). Move all tsv files from the LIAR zip file directly into the datasets folder.
- run setup.py to set up nltk, and clean and split the datasets. It takes a long time; please wait.
- run main.py from the src directory to test the models. The function requires the model type, model file, and dataset to be passed as parameters.
Here is an example: python main.py --model_type logistic --model_file logistic.model --data_file 995,000_rows.parquet
The model files can be found in the models directory (not the one in src), the data files can be found in data/testing (pass LIAR.parquet to test on LIAR dataset).
The model types and more information including how to train models can be found with python main.py --help.

View File

@@ -0,0 +1,457 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "3b55d166",
"metadata": {},
"source": [
"# DO NOT RUN; DATA WILL BE LOST"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9c2d25e9",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd \n",
"import os \n",
"import sys\n",
"sys.path.append(os.path.join(os.getcwd(), '../src'))\n",
"from constants import TRAINING_DIR, TESTING_DIR, VALIDATION_DIR\n",
"pd.set_option('display.max_columns', None)\n"
]
},
{
"cell_type": "markdown",
"id": "cd67fc64",
"metadata": {},
"source": [
"# Time Split "
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "a917b0fa",
"metadata": {},
"outputs": [],
"source": [
"test_ty = pd.read_parquet(f\"{TESTING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
"train_ty = pd.read_parquet(f\"{TRAINING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
"val_ty = pd.read_parquet(f\"{VALIDATION_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "0098d6e4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rows in train(818843, 1),\n",
" rows in test (99499, 1), \n",
" rows in validation(76645, 1)\n"
]
}
],
"source": [
"print(f'rows in train{train_ty.shape },\\n rows in test {test_ty.shape}, \\n rows in validation{val_ty.shape}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5985a4f3",
"metadata": {},
"outputs": [],
"source": [
"timeline = pd.concat([\n",
" train_ty.value_counts().rename('train'),\n",
" test_ty.value_counts().rename('test'),\n",
" val_ty.value_counts().rename('val'),\n",
"], axis=1).fillna(0).astype(int)\n",
"\n",
"timeline.index.name = 'type'"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "b0673e19",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>train</th>\n",
" <th>test</th>\n",
" <th>val</th>\n",
" </tr>\n",
" <tr>\n",
" <th>type</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>political</th>\n",
" <td>194518</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bias</th>\n",
" <td>133232</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fake</th>\n",
" <td>104883</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>conspiracy</th>\n",
" <td>97314</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>rumor</th>\n",
" <td>56445</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unknown</th>\n",
" <td>43534</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>reliable</th>\n",
" <td>42419</td>\n",
" <td>99499</td>\n",
" <td>76645</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unreliable</th>\n",
" <td>35332</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>clickbait</th>\n",
" <td>27412</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>junksci</th>\n",
" <td>14040</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>satire</th>\n",
" <td>13160</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>hate</th>\n",
" <td>8779</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" train test val\n",
"type \n",
"political 194518 0 0\n",
"bias 133232 0 0\n",
"fake 104883 0 0\n",
"conspiracy 97314 0 0\n",
"rumor 56445 0 0\n",
"unknown 43534 0 0\n",
"reliable 42419 99499 76645\n",
"unreliable 35332 0 0\n",
"clickbait 27412 0 0\n",
"junksci 14040 0 0\n",
"satire 13160 0 0\n",
"hate 8779 0 0"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"timeline"
]
},
{
"cell_type": "markdown",
"id": "6bdc7d84",
"metadata": {},
"source": [
"# Random Split "
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "cd5ca57b",
"metadata": {},
"outputs": [],
"source": [
"test_ty_R = pd.read_parquet(f\"{TESTING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
"train_ty_R = pd.read_parquet(f\"{TRAINING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
"val_ty_R = pd.read_parquet(f\"{VALIDATION_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c793a37c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rows in train(745724, 1),\n",
" rows in test (149766, 1), \n",
" rows in validation(99510, 1)\n"
]
}
],
"source": [
"print(f'rows in train{train_ty_R.shape },\\n rows in test {test_ty_R.shape}, \\n rows in validation{val_ty_R.shape}')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "583304ff",
"metadata": {},
"outputs": [],
"source": [
"timeline_R = pd.concat([\n",
" train_ty_R.value_counts().rename('train'),\n",
" test_ty_R.value_counts().rename('test'),\n",
" val_ty_R.value_counts().rename('val'),\n",
"], axis=1).fillna(0).astype(int)\n",
"\n",
"timeline_R.index.name = 'type'"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "d8255b60",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>train</th>\n",
" <th>test</th>\n",
" <th>val</th>\n",
" </tr>\n",
" <tr>\n",
" <th>type</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>reliable</th>\n",
" <td>163802</td>\n",
" <td>33010</td>\n",
" <td>21752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>political</th>\n",
" <td>145779</td>\n",
" <td>29241</td>\n",
" <td>19498</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bias</th>\n",
" <td>99797</td>\n",
" <td>20079</td>\n",
" <td>13356</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fake</th>\n",
" <td>78736</td>\n",
" <td>15602</td>\n",
" <td>10545</td>\n",
" </tr>\n",
" <tr>\n",
" <th>conspiracy</th>\n",
" <td>72837</td>\n",
" <td>14676</td>\n",
" <td>9801</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unknown</th>\n",
" <td>68468</td>\n",
" <td>13754</td>\n",
" <td>9098</td>\n",
" </tr>\n",
" <tr>\n",
" <th>rumor</th>\n",
" <td>42254</td>\n",
" <td>8553</td>\n",
" <td>5638</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unreliable</th>\n",
" <td>26489</td>\n",
" <td>5346</td>\n",
" <td>3497</td>\n",
" </tr>\n",
" <tr>\n",
" <th>clickbait</th>\n",
" <td>20552</td>\n",
" <td>4161</td>\n",
" <td>2699</td>\n",
" </tr>\n",
" <tr>\n",
" <th>junksci</th>\n",
" <td>10516</td>\n",
" <td>2066</td>\n",
" <td>1458</td>\n",
" </tr>\n",
" <tr>\n",
" <th>satire</th>\n",
" <td>9852</td>\n",
" <td>1971</td>\n",
" <td>1337</td>\n",
" </tr>\n",
" <tr>\n",
" <th>hate</th>\n",
" <td>6641</td>\n",
" <td>1307</td>\n",
" <td>831</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2018-02-10 13:43:39.521661</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" train test val\n",
"type \n",
"reliable 163802 33010 21752\n",
"political 145779 29241 19498\n",
"bias 99797 20079 13356\n",
"fake 78736 15602 10545\n",
"conspiracy 72837 14676 9801\n",
"unknown 68468 13754 9098\n",
"rumor 42254 8553 5638\n",
"unreliable 26489 5346 3497\n",
"clickbait 20552 4161 2699\n",
"junksci 10516 2066 1458\n",
"satire 9852 1971 1337\n",
"hate 6641 1307 831\n",
"2018-02-10 13:43:39.521661 1 0 0"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"timeline_R"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "355d343a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "main_asg",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

399
analysis/analysis2.ipynb Normal file

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,237 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "9c2d25e9",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd \n",
"import os \n",
"import sys\n",
"sys.path.append(os.path.join(os.getcwd(), '../src'))\n",
"from constants import TRAINING_DIR, TESTING_DIR, VALIDATION_DIR\n",
"pd.set_option('display.max_columns', None)\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "a917b0fa",
"metadata": {},
"outputs": [],
"source": [
"test_ty = pd.read_parquet(f\"{TESTING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
"train_ty = pd.read_parquet(f\"{TRAINING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
"val_ty = pd.read_parquet(f\"{VALIDATION_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "0098d6e4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rows in train(818843, 1),\n",
" rows in test (99499, 1), \n",
" rows in validation(76645, 1)\n"
]
}
],
"source": [
"print(f'rows in train{train_ty.shape },\\n rows in test {test_ty.shape}, \\n rows in validation{val_ty.shape}')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "5985a4f3",
"metadata": {},
"outputs": [],
"source": [
"timeline = pd.concat([\n",
" b.value_counts().rename('train'),\n",
" a.value_counts().rename('test'),\n",
" c.value_counts().rename('val'),\n",
"], axis=1).fillna(0).astype(int)\n",
"\n",
"timeline.index.name = 'type'"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "b0673e19",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>train</th>\n",
" <th>test</th>\n",
" <th>val</th>\n",
" </tr>\n",
" <tr>\n",
" <th>type</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>political</th>\n",
" <td>194518</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bias</th>\n",
" <td>133232</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fake</th>\n",
" <td>104883</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>conspiracy</th>\n",
" <td>97314</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>rumor</th>\n",
" <td>56445</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unknown</th>\n",
" <td>43534</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>reliable</th>\n",
" <td>42419</td>\n",
" <td>99499</td>\n",
" <td>76645</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unreliable</th>\n",
" <td>35332</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>clickbait</th>\n",
" <td>27412</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>junksci</th>\n",
" <td>14040</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>satire</th>\n",
" <td>13160</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>hate</th>\n",
" <td>8779</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" train test val\n",
"type \n",
"political 194518 0 0\n",
"bias 133232 0 0\n",
"fake 104883 0 0\n",
"conspiracy 97314 0 0\n",
"rumor 56445 0 0\n",
"unknown 43534 0 0\n",
"reliable 42419 99499 76645\n",
"unreliable 35332 0 0\n",
"clickbait 27412 0 0\n",
"junksci 14040 0 0\n",
"satire 13160 0 0\n",
"hate 8779 0 0"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"timeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2bcfc84",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "main_asg",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0
data/datasets/.gitkeep Normal file
View File

0
data/temp/.gitkeep Normal file
View File

0
data/testing/.gitkeep Normal file
View File

0
data/training/.gitkeep Normal file
View File

0
data/validation/.gitkeep Normal file
View File

34
flake.nix Normal file
View File

@@ -0,0 +1,34 @@
# Dev shell for the NixOS Jupyter notebook setup: provides the
# requirements.txt packages plus Jupynium (from a pinned nixpkgs fork),
# and drops interactive shells into a tmux session running fish.
{
  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
    pyproject-nix.url = "github:pyproject-nix/pyproject.nix";
    bozo_nixpkgs.url = "github:DuarteSJ/nixpkgs/4e926b09ba06301b08d0f12afd0640c079bdc4dc";
  };
  outputs =
    { nixpkgs, pyproject-nix, bozo_nixpkgs, ... }:
    let
      project = pyproject-nix.lib.project.loadRequirementsTxt { projectRoot = ./.; };
      pkgs = nixpkgs.legacyPackages.x86_64-linux;
      bozo_pkgs = bozo_nixpkgs.legacyPackages.x86_64-linux;
      python = pkgs.python3;
      # requirements.txt packages plus notebook tooling on top.
      pythonEnv = pkgs.python3.withPackages (pkgs:
        let base = project.renderers.withPackages { inherit python; } pkgs;
        in base ++ (with pkgs; [ notebook nbclassic jupyter-console ipython]));
      # Jupynium is only packaged in the pinned fork declared in inputs.
      jupyniumEnv = bozo_pkgs.python3.withPackages (python-pkgs: with python-pkgs; [ jupynium ]);
    in
    {
      devShells.x86_64-linux.default = pkgs.mkShell {
        packages = [ pythonEnv jupyniumEnv ];
        shellHook = ''
          # Bug fix: `which fish` must be command-substituted, not stored as a literal string.
          export SHELL="$(which fish)"
          if [[ $- == *i* ]] && [ -z "$TMUX" ]; then
            tmux new-session -A -s GDS-fake-news
          fi
        '';
      };
    };
}

BIN
models/LIAR_baseline.model Normal file

Binary file not shown.

BIN
models/baseline.model Normal file

Binary file not shown.

Binary file not shown.

BIN
models/logistic.model Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
models/old/GB10K.model Normal file

Binary file not shown.

BIN
models/old/GB1K.model Normal file

Binary file not shown.

BIN
models/old/GB2K.model Normal file

Binary file not shown.

BIN
models/old/GB4K.model Normal file

Binary file not shown.

Binary file not shown.

BIN
models/svm.model Normal file

Binary file not shown.

4
pyrightconfig.json Normal file
View File

@@ -0,0 +1,4 @@
{
"typeCheckingMode": "strict",
"reportMissingTypeStubs": false
}

117
requirements.txt Normal file
View File

@@ -0,0 +1,117 @@
anyio==4.12.1
argon2-cffi==25.1.0
argon2-cffi-bindings==25.1.0
arrow==1.4.0
asttokens==3.0.1
async-lru==2.1.0
attrs==25.4.0
babel==2.18.0
beautifulsoup4==4.14.3
bleach==6.3.0
certifi==2026.1.4
cffi==2.0.0
charset-normalizer==3.4.4
click==8.3.1
comm==0.2.3
contourpy==1.3.3
cycler==0.12.1
debugpy==1.8.20
decorator==5.2.1
defusedxml==0.7.1
executing==2.2.1
fastjsonschema==2.21.2
fonttools==4.61.1
fqdn==1.5.1
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
idna==3.11
ipykernel==7.2.0
ipython==9.10.0
ipython_pygments_lexers==1.1.1
ipywidgets==8.1.8
isoduration==20.11.0
jedi==0.19.2
Jinja2==3.1.6
joblib==1.5.3
json5==0.13.0
jsonpointer==3.0.0
jsonschema==4.26.0
jsonschema-specifications==2025.9.1
jupyter==1.1.1
jupyter-console==6.6.3
jupyter-events==0.12.0
jupyter-lsp==2.3.0
jupyter_client==8.8.0
jupyter_core==5.9.1
jupyter_server==2.17.0
jupyter_server_terminals==0.5.4
jupyterlab==4.5.4
jupyterlab_pygments==0.3.0
jupyterlab_server==2.28.0
jupyterlab_widgets==3.0.16
kiwisolver==1.4.9
lark==1.3.1
MarkupSafe==3.0.3
matplotlib==3.10.8
matplotlib-inline==0.2.1
mistune==3.2.0
nbclient==0.10.4
nbconvert==7.17.0
nbformat==5.10.4
nest-asyncio==1.6.0
nltk==3.9.2
notebook==7.5.3
notebook_shim==0.2.4
numpy==2.4.2
packaging==26.0
pandas==3.0.1
pandas-stubs==3.0.0.260204
pandocfilters==1.5.1
parso==0.8.6
pexpect==4.9.0
pillow==12.1.1
platformdirs==4.9.2
prometheus_client==0.24.1
prompt_toolkit==3.0.52
psutil==7.2.2
ptyprocess==0.7.0
pure_eval==0.2.3
pyarrow==23.0.1
pycparser==3.0
Pygments==2.19.2
pyparsing==3.3.2
python-dateutil==2.9.0.post0
python-json-logger==4.0.0
PyYAML==6.0.3
pyzmq==27.1.0
referencing==0.37.0
regex==2026.1.15
requests==2.32.5
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rfc3987-syntax==1.1.0
rpds-py==0.30.0
scikit-learn==1.8.0
scipy==1.17.1
Send2Trash==2.1.0
setuptools==82.0.0
six==1.17.0
soupsieve==2.8.3
stack-data==0.6.3
terminado==0.18.1
threadpoolctl==3.6.0
tinycss2==1.4.0
tornado==6.5.4
tqdm==4.67.3
traitlets==5.14.3
typing_extensions==4.15.0
tzdata==2025.3
uri-template==1.3.0
urllib3==2.6.3
wcwidth==0.6.0
webcolors==25.10.0
webencodings==0.5.1
websocket-client==1.9.0
widgetsnbextension==4.0.15
zstandard==0.25.0

97
src/clean_data.py Normal file
View File

@@ -0,0 +1,97 @@
from constants import DATASET_DIR, TEMP_DIR
from helper import dataset_iterator
import pyarrow as pa
import pyarrow.parquet as pq
import nltk
import re
import shutil
# cleans text and returns a list of tokens.
def clean_text(
    text: str,
    remove_regex_patterns: bool = True,
    remove_stopwords: bool = True,
    remove_special_characters: bool = True,
    stemming: bool = True
) -> list[str]:
    """Lowercase and tokenise raw article text.

    Each cleaning stage can be toggled off independently (used by
    compute_vocab_reduction to measure the effect of each stage):
    - remove_regex_patterns: replace URLs/emails/dates/numbers with
      placeholder tokens like "<URL>" before tokenisation.
    - remove_special_characters: strip punctuation (placeholders survive).
    - remove_stopwords: drop English stopwords.
    - stemming: Snowball-stem every token except placeholders.

    Returns the list of cleaned tokens.
    """
    text = str(text).lower().strip()
    if remove_regex_patterns:
        url_pattern = r'https?://\S+|www\.\S+'
        email_pattern = r'[\w.-]+@[\w]+\.[\w]+'
        date_pattern = r'([a-z]+ \d{1,2}[a-z]?, \d{4}|\d{2,4}[-/]\d{2,4}[-/]\d{2,4})' # add more date patterns
        number_pattern = r'\d+'
        text = re.sub(url_pattern, "<URL>", text)
        text = re.sub(email_pattern, "<EMAIL>", text)
        text = re.sub(date_pattern, "<DATE>", text)
        text = re.sub(number_pattern, "<NUMBER>", text)
    if remove_special_characters:
        text = re.sub(r'[^\w (?:<\w+>)]', " ", text)
    # Tokeniser keeps "<WORD>" placeholders as single tokens.
    tokenizer = nltk.RegexpTokenizer(r'<\w+>|\w+')
    tokens = tokenizer.tokenize(text) # type: ignore
    if remove_stopwords:
        # Bug fix: was a duplicated assignment (`stopwords = stopwords = ...`);
        # a set also gives O(1) membership tests instead of O(n) list scans.
        stopwords = set(nltk.corpus.stopwords.words('english'))
        tokens = [token for token in tokens if token not in stopwords] # type: ignore
    if stemming:
        stemmer = nltk.SnowballStemmer("english")
        # Placeholder tokens such as "<URL>" are kept verbatim; everything else is stemmed.
        tokens = [stemmer.stem(token) if not re.match(r'<\w+>', token) else token for token in tokens] # type: ignore
    return tokens # type: ignore
def clean_dataset(filename: str) -> None:
    """Add a `tokens` column (via clean_text) to `{DATASET_DIR}/{filename}`.

    The dataset is processed chunk by chunk and written to a temp parquet
    file, which then replaces the original — so a crash mid-run never leaves
    a half-written dataset in DATASET_DIR.
    """
    # Bug fix: paths previously did not use the `filename` parameter at all.
    output_path = f"{TEMP_DIR}/{filename}"
    writer = None
    for chunk in dataset_iterator(f"{DATASET_DIR}/{filename}"):
        chunk['tokens'] = chunk['content'].apply(clean_text)
        table = pa.Table.from_pandas(chunk)
        if writer is None:
            # Schema is taken from the first chunk; later chunks must match it.
            writer = pq.ParquetWriter(output_path, table.schema)
        writer.write_table(table)
    if writer is not None:
        # Bug fix: guard against an empty dataset — `writer` would be None and
        # close()/move() would fail.
        writer.close()
        shutil.move(output_path, f"{DATASET_DIR}/{filename}")
def compute_vocab_reduction(filename: str) -> dict[str, float | int]:
    """Measure how much stopword removal and stemming shrink the vocabulary
    of `{DATASET_DIR}/{filename}`.

    Cleans every article three times (neither stage, stopwords only,
    stopwords + stemming) and compares the resulting vocabulary sizes.
    Stemming is measured relative to the stopword-filtered vocabulary.

    Returns a dict of vocabulary sizes before/after each stage plus the
    reduction rates (0.0 when the source vocabulary is empty).
    """
    # Bug fix: the path previously ignored the `filename` parameter.
    dataset_path = f"{DATASET_DIR}/{filename}"
    vocab_before_stopwords: set[str] = set()
    vocab_after_stopwords: set[str] = set()
    vocab_after_stemming: set[str] = set()
    for chunk in dataset_iterator(dataset_path):
        for text in chunk["content"]:
            vocab_before_stopwords.update(clean_text(text, remove_stopwords=False, stemming=False))
            vocab_after_stopwords.update(clean_text(text, remove_stopwords=True, stemming=False))
            vocab_after_stemming.update(clean_text(text, remove_stopwords=True, stemming=True))
    before_stop_size = len(vocab_before_stopwords)
    after_stop_size = len(vocab_after_stopwords)
    # Stemming starts from the stopword-filtered vocabulary.
    before_stem_size = after_stop_size
    after_stem_size = len(vocab_after_stemming)
    stopwords_reduction_rate = (
        (before_stop_size - after_stop_size) / before_stop_size if before_stop_size else 0.0
    )
    stemming_reduction_rate = (
        (before_stem_size - after_stem_size) / before_stem_size if before_stem_size else 0.0
    )
    return {
        "vocab_size_before_stopwords": before_stop_size,
        "vocab_size_after_stopwords": after_stop_size,
        "stopwords_reduction_rate": stopwords_reduction_rate,
        "vocab_size_before_stemming": before_stem_size,
        "vocab_size_after_stemming": after_stem_size,
        "stemming_reduction_rate": stemming_reduction_rate,
    }

14
src/constants.py Normal file
View File

@@ -0,0 +1,14 @@
import os
# All paths are resolved relative to src/ (scripts are run from inside src/,
# per the README), so ../data and ../models point at the repo-level folders.
DATA_DIR = os.path.abspath("../data")
MODEL_DIR = os.path.abspath("../models")
# Pipeline stage directories under data/.
DATASET_DIR = f"{DATA_DIR}/datasets"
TRAINING_DIR = f"{DATA_DIR}/training"
VALIDATION_DIR = f"{DATA_DIR}/validation"
TESTING_DIR = f"{DATA_DIR}/testing"
TEMP_DIR = f"{DATA_DIR}/temp"
# Raw CSV inputs and the parquet files they are converted to.
ORIGINAL_DATASET_FILES = ["news_sample.csv", "995,000_rows.csv"]
DATASET_FILES = ["news_sample.parquet", "995,000_rows.parquet"]
CHUNK_SIZE = 10000 # how many rows to work on at time, instead of loading the entire dataset into memory.
MAX_ROWS = -1 # only work with MAX_ROWS rows so testing things out isnt crazy slow. Set to -1 for infinite.

60
src/helper.py Normal file
View File

@@ -0,0 +1,60 @@
from labels import Label
from constants import CHUNK_SIZE, MAX_ROWS
from typing import Iterator, cast
import pyarrow.parquet as pq
import pandas as pd
def default_labelling(article_type: str) -> Label:
    """Default mapping: reliable/political/clickbait articles are real, everything else fake."""
    real_types = ("reliable", "political", "clickbait")
    return Label.REAL if article_type in real_types else Label.FAKE
def only_fake_labelling(article_type: str) -> Label:
    """Strict mapping: only articles tagged exactly "fake" are fake; all others are real."""
    return Label.FAKE if article_type == "fake" else Label.REAL
def not_reliable_labelling(article_type: str) -> Label:
    """Strict mapping: only articles tagged exactly "reliable" are real; all others are fake."""
    return Label.REAL if article_type == "reliable" else Label.FAKE
def LIAR_labelling(article_type:str) -> Label:
if article_type in ["true", "half-true", "barely-true", "mostly-true"]:
return Label.REAL
return Label.FAKE
# Deprecated, don't use, just use pd.read_parquet instead
def dataset_iterator(dataset_file:str, columns:list[str] | None = None) -> Iterator[pd.DataFrame]:
    """Yield a parquet file as CHUNK_SIZE-row pandas DataFrames.

    Stops (without yielding the crossing batch) once more than MAX_ROWS rows
    have been read, when MAX_ROWS is positive; a non-positive MAX_ROWS means
    no limit. `columns` optionally restricts which columns are loaded.
    """
    parquet_file = pq.ParquetFile(dataset_file)
    total_rows = 0
    for batch in parquet_file.iter_batches(batch_size=CHUNK_SIZE, columns=columns): # type: ignore
        total_rows += len(batch) # type: ignore
        if 0 < MAX_ROWS < total_rows:
            return
        # cast silences type-checker warnings about pyarrow's untyped API.
        yield cast(pd.DataFrame, batch.to_pandas()) # type: ignore
def csv_to_parquet(input_path: str, output_path: str) -> None:
    """Convert a CSV dataset file to parquet in a single read.

    low_memory=False makes pandas read each column in one pass so chunked
    type inference doesn't produce mixed dtypes — some datasets have stray
    strings in otherwise-numeric columns such as `id`.
    """
    # A chunked ParquetWriter variant (with csv.field_size_limit and
    # engine="python") was tried previously; the single-read version is
    # simpler and sufficient for these dataset sizes.
    pd.read_csv(input_path, low_memory=False).to_parquet(output_path)
def get_time_boundaries(filename: str): #type: ignore
    """Return (train_cut, val_cut): the 80th- and 90th-percentile
    `scraped_at` timestamps of the dataset, used for a chronological
    80/10/10 train/validation/test split.

    Unparseable timestamps become NaT (errors='coerce') and are ignored by
    quantile.
    """
    # Load only the timestamp column to keep memory use low.
    dates = pq.read_table(filename, columns=['scraped_at']).to_pandas() #type: ignore
    dates['scraped_at'] = pd.to_datetime(dates['scraped_at'], format='ISO8601', errors='coerce', utc=True) #type: ignore
    ordered = dates['scraped_at'].sort_values() #type: ignore
    train_boundary = ordered.quantile(0.80) #type: ignore
    val_boundary = ordered.quantile(0.90) #type: ignore
    return train_boundary, val_boundary #type: ignore

5
src/labels.py Normal file
View File

@@ -0,0 +1,5 @@
from enum import Enum
class Label(Enum):
    """Binary ground-truth label for an article, produced by the labelling
    translator functions in helper.py."""
    REAL = 0  # article considered trustworthy
    FAKE = 1  # article considered fake/misleading

64
src/main.py Executable file
View File

@@ -0,0 +1,64 @@
#!/usr/bin/env python
import argparse
from models.svm import SVM_model
from models.gradient_boosting import Gradient_boosting_model
from models.logistic_regression import Logistic_model
from models.baseline import Baseline_model
from helper import default_labelling, not_reliable_labelling, only_fake_labelling, LIAR_labelling
def main() -> None:
    """CLI entry point: build the requested model, then train or test it.

    Exits via argparse (SystemExit) on missing/invalid arguments.
    """
    parser = argparse.ArgumentParser(
        prog="Fakenews detector",
        description="Train and test models",
        usage="The following is an example of training a logistic regression model on news_sample.parquet:\n"+
        "python main.py --model_type logistic --model_file logistic_news_sample.model --data_file news_sample.parquet --train",
    )
    parser.add_argument("--train", action="store_true", help="Whether model should be trained, if not set it will be tested instead")
    # Typo fix in help text ("Wheter" -> "Whether").
    parser.add_argument("--validate", action="store_true", help="Whether to use validation set when testing/validating")
    parser.add_argument("--model_type", "-t", required=True, choices=["baseline", "logistic", "svm", "gradient_boosting"], help="The type of model: baseline, logistic, ...")
    # Dropped the dead `default=""` — defaults are ignored on required arguments.
    parser.add_argument("--model_file", "-f", required=True, help="The model file to save to when training, or load from when testing")
    parser.add_argument("--data_file", "-d", required=True, help="The datafile used when training or testing")
    parser.add_argument("--label_translator", "-l", required=False, default = "", help="The translator function used by the model, such as \"not_reliable\", that only considers 'reliable' tagged news Real, ignored if not using --train.")
    parser.add_argument("--hyperparameters", "-p", required=False, nargs="+", default ="", help="The hyperparameters used when training the model, written like c=1")
    args = parser.parse_args()

    # Translator precedence: later matches override earlier ones, and the
    # LIAR dataset always forces its own translator.
    label_translator = default_labelling
    if "not_reliable" in args.label_translator.lower():
        label_translator = not_reliable_labelling
    if "only_fake" in args.label_translator.lower():
        label_translator = only_fake_labelling
    is_liar = "liar" in args.data_file.lower()
    if is_liar:
        label_translator = LIAR_labelling
        # Normalise any LIAR spelling to the canonical file name.
        args.data_file = "LIAR.parquet"

    # Dispatch table instead of an if/elif chain; unknown types fall back to
    # the baseline, matching the original behaviour (choices= already rejects
    # anything outside the four known types).
    model_classes = {
        "logistic": Logistic_model,
        "svm": SVM_model,
        "gradient_boosting": Gradient_boosting_model,
    }
    model = model_classes.get(args.model_type, Baseline_model)(label_translator=label_translator)

    if args.train:
        hyperparameters:dict[str, float] = {}
        for parameter in args.hyperparameters:
            # Each entry looks like "c=1"; values are parsed as floats.
            key, value = parameter.split("=")
            hyperparameters[key] = float(value)
        model.train(args.data_file, hyperparameters)
        model.save(args.model_file)
    else:
        model.load(args.model_file)
        if is_liar:
            # LIAR has no separate validation split; always test the full set.
            model.test("LIAR.parquet", validate=False)
        else:
            model.test(args.data_file, args.validate)
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}

846
src/models/Untitled2.ipynb Normal file
View File

@@ -0,0 +1,846 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "3ed30f2e",
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'torch'",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorch\u001b[39;00m \n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorch\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mnn\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnn\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorch\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mnn\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mfunctional\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mF\u001b[39;00m \n",
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'torch'"
]
}
],
"source": [
"import torch \n",
"import torch.nn as nn\n",
"import torch.nn.functional as F \n",
"import pandas as pd\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from collections import Counter\n",
"import os\n",
"import sys\n",
"sys.path.append(os.path.join(os.getcwd(), '../'))\n",
"from helper import default_labelling\n",
"from sklearn.metrics import f1_score\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "42edceb8",
"metadata": {},
"outputs": [],
"source": [
"label_map = {\n",
" 'Label.FAKE': 0,\n",
" 'Label.REAL': 1}"
]
},
{
"cell_type": "markdown",
"id": "0aa1a427",
"metadata": {},
"source": [
"# Pipelining process"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c7730d65",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_parquet(\"../../data/training/995,000_rows.parquet\", columns=['tokens','type'])\n",
"\n",
"\n",
"df['label'] = df['type'].apply(default_labelling).astype(str)\n",
"df['label'] = df['label'].map(label_map).astype(int)\n",
"df = df.drop(columns=['type'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c31caf06",
"metadata": {},
"outputs": [],
"source": [
"df_test = pd.read_parquet(\"../../data/testing/995,000_rows.parquet\", columns=['tokens','type'])\n",
"\n",
"df_test['label'] = df_test['type'].apply(default_labelling).astype(str)\n",
"df_test['label'] = df_test['label'].map(label_map).astype(int)\n",
"df_test = df_test.drop(columns=['type'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5c0c93ab",
"metadata": {},
"outputs": [],
"source": [
"df_val = pd.read_parquet(\"../../data/validation/995,000_rows.parquet\", columns=['tokens','type'])\n",
"df_val['label'] = df_val['type'].apply(default_labelling).astype(str)\n",
"df_val['label'] = df_val['label'].map(label_map).astype(int)\n",
"df_val = df_val.drop(columns=['type'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "19188ef7",
"metadata": {},
"outputs": [],
"source": [
"# print(\"Loading Parquet file...\")\n",
"\n",
"# # Check the total number of rows (articles)\n",
"# print(f\"Total rows in the raw Parquet file: {len(df)}\")\n",
"\n",
"# # Look at the first few rows to make sure the data looks correct\n",
"# print(\"\\n--- First 3 Rows ---\")\n",
"# print(df.head(3))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa455147",
"metadata": {},
"outputs": [],
"source": [
"# count how many tokens we have in the corpus \n",
"word_counts = Counter()\n",
"for x in df['tokens']:\n",
" word_counts.update(x)\n",
" \n",
"# Keep the top 50,000 words. \n",
"# Index 0 is for <PAD> (padding), Index 1 is for <UNK> (unknown words)\n",
"vocab = {\"<PAD>\": 0, \"<UNK>\": 1}\n",
"for idx, (word, count) in enumerate(word_counts.most_common(50000), start=2):\n",
" vocab[word] = idx\n",
"\n",
"print(f\"Vocabulary built with {len(vocab)} words.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9ba0021",
"metadata": {},
"outputs": [],
"source": [
"# Create a Custom PyTorch Datase\n",
"\n",
"# a wrapper for the data that PyTorch knows how to talk to.\n",
"class FakeNewsDataset(Dataset):\n",
" def __init__(self, dataframe, vocab, max_length=256):\n",
" self.dataframe = dataframe\n",
" self.vocab = vocab\n",
" self.max_length = max_length\n",
"\n",
"# Tells PyTorch how many articles we have\n",
"#PyTorch calls this internally to know when to stop fetching data.\n",
" def __len__(self):\n",
" return len(self.dataframe)\n",
" \n",
" def __getitem__(self, idx):\n",
" # Grabs one article and its label at a time\n",
" tokens = self.dataframe.iloc[idx]['tokens']\n",
" label = self.dataframe.iloc[idx]['label']\n",
"\n",
" # Convert text tokens to Integer IDs\n",
" article_ids = [self.vocab.get(word, 1) for word in tokens]\n",
"\n",
" # Truncate or Pad the article so they are all exactly 'max_length' long\n",
" if len(article_ids) > self.max_length:\n",
" article_ids = article_ids[:self.max_length]\n",
" else:\n",
" padding = [0] * (self.max_length - len(article_ids))\n",
" article_ids.extend(padding)\n",
" \n",
" # Return as PyTorch tensors\n",
" return torch.tensor(article_ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f3f4096",
"metadata": {},
"outputs": [],
"source": [
"## Prepare the DataLoader \n",
"# Wrap The dataframe in the Dataset class\n",
"\n",
"# The DataLoader feeds the data to the model in batches (e.g., 64 articles at a time)\n",
"# This prevents the computer from running out of RAM!\n",
"\n",
"\n",
"my_train_dataset = FakeNewsDataset(dataframe=df, vocab=vocab, max_length=256)\n",
"# Shuffle is true for training so the data keeps getting shuffled when trained and the model does not memorise the data\n",
"train_dataloader = DataLoader(my_train_dataset, batch_size=64, shuffle=True,num_workers=4, # Start with 4; if CPU stays cool, try 6\n",
"pin_memory=True, # Essential for fast data transfer\n",
"prefetch_factor=2)\n",
"\n",
"\n",
"val_data = FakeNewsDataset(dataframe=df_val, vocab=vocab, max_length=256)\n",
"val_dataloader = DataLoader(val_data, batch_size=64, shuffle=False)\n",
"\n",
"test_data = FakeNewsDataset(dataframe=df_test, vocab=vocab, max_length=256)\n",
"test_dataloader = DataLoader(test_data, batch_size=64, shuffle=False)"
]
},
{
"cell_type": "markdown",
"id": "fd4f08a6",
"metadata": {},
"source": [
"Checking if the data conversion works"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9bcbcf9b",
"metadata": {},
"outputs": [],
"source": [
"# features, labels = next(iter(train_dataloader))\n",
"# # 2. Check the shapes (the dimensions of your tensors)\n",
"# print(\"--- Tensor Shapes ---\")\n",
"# print(f\"Features shape: {features.shape}\") \n",
"# print(f\"Labels shape: {labels.shape}\") \n",
"\n",
"# # 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)\n",
"# print(\"\\n--- Data Types ---\")\n",
"# print(f\"Features dtype: {features.dtype}\")\n",
"# print(f\"Labels dtype: {labels.dtype}\")\n",
"\n",
"# # 4. Peek at the actual data for the very first article in this batch\n",
"# print(\"\\n--- First Article Peek ---\")\n",
"# print(f\"Label: {labels[0].item()} (0 = Real, 1 = Fake)\")\n",
"# print(f\"Tokens (first 20 IDs): {features[0][:20]}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b70e45ac",
"metadata": {},
"outputs": [],
"source": [
"class BaseModel(nn.Module):\n",
" def __init__(self, vocab_size, embed_dim=32, h1=256, h2=128, out_features=2):\n",
" super().__init__()\n",
" \n",
" # The Embedding Layer: Turns word IDs into rich numerical vectors\n",
" self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)\n",
" \n",
" # The Linear Layers: Learn the patterns to decide Fake vs. Real\n",
" self.fc1 = nn.Linear(embed_dim, h1)\n",
" self.fc2 = nn.Linear(h1, h2)\n",
" self.out = nn.Linear(h2, out_features)\n",
" \n",
" def forward(self, x):\n",
" \n",
" # x starts as integers: shape (batch_size, sequence_length) -> e.g., (64, 256)\n",
" # Pass through embedding\n",
" x = self.embedding(x) \n",
" # Average the word vectors to get one single vector for the whole article\n",
" x = x.mean(dim=1) \n",
" \n",
" # Pass through hidden layers with ReLU activation\n",
" x = F.relu(self.fc1(x))\n",
" x = F.relu(self.fc2(x))\n",
" \n",
" # Output layer (gives us the raw scores for 'Real' and 'Fake')\n",
" x = self.out(x)\n",
" return x\n",
"model_basic = BaseModel(vocab_size=len(vocab))"
]
},
{
"cell_type": "markdown",
"id": "efa6c453",
"metadata": {},
"source": [
"'Advanced'"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52cb9377",
"metadata": {},
"outputs": [],
"source": [
"class advanced_model(nn.Module):\n",
" def __init__(self, vocab_size, embed_dim=64, hidden_dim=128,num_layer = 2, out_features=2):\n",
" super().__init__()\n",
" \n",
" # 1. The Embedding Layer (Same as before)\n",
" self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)\n",
" \n",
" # # 2. The GRU Layer (Extra layer)\n",
" # batch_first=True is required because our DataLoader outputs (batch_size, sequence_length) \n",
" self.gru = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, num_layers=2,batch_first=True,bidirectional=True, \n",
" dropout=0.3)\n",
" \n",
" # 3. The Final Output Layer\n",
" # connect the GRU's memory (hidden_dim) directly to our Real/Fake outputs\n",
" self.out = nn.Linear(hidden_dim, out_features)\n",
" self.fc = nn.Linear(hidden_dim * 2, out_features)\n",
" def forward(self, x):\n",
" # x shape: (batch_size, sequence_length) -> e.g., (64, 256)\n",
" \n",
" #Get the word embeddings\n",
" x = self.embedding(x) \n",
" # x shape becomes: (64, 256, 32)\n",
" \n",
" # Pass the embeddings into the GRU\n",
" # A GRU outputs two things: the output at every single word, AND its final memory state.\n",
" # We use '_' to ignore the step-by-step output, and save 'hidden_state'.\n",
" _, hidden = self.gru(x)\n",
" \n",
" # 4. Extract and Concatenate the final forward and backward states\n",
" # hidden[-2] is the last forward state, hidden[-1] is the last backward state\n",
" out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)\n",
" \n",
" return self.fc(out)\n",
" \n",
"# Initialize\n",
"model_adv = advanced_model(vocab_size=len(vocab))"
]
},
{
"cell_type": "markdown",
"id": "31b581d0",
"metadata": {},
"source": [
"# Training"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8e1f849",
"metadata": {},
"outputs": [],
"source": [
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae976afb",
"metadata": {},
"outputs": [],
"source": [
"def evaluate_performance(model, dataloader, device):\n",
" model.eval() # Put model in evaluation mode\n",
" \n",
" all_predictions = []\n",
" all_true_labels = []\n",
" \n",
" # Turn off gradient tracking to save memory\n",
" with torch.no_grad():\n",
" for features, labels in dataloader:\n",
" features = features.to(device)\n",
" labels = labels.to(device)\n",
" \n",
" # Get model scores\n",
" scores = model(features)\n",
" \n",
" # Find the predicted class (0 or 1)\n",
" _, predictions = torch.max(scores,1)\n",
" \n",
" # Save predictions and actual labels to lists\n",
" # all_predictions.extend(predictions.cpu().tolist())\n",
" # all_true_labels.extend(labels.cpu().tolist())\n",
" all_predictions.extend(predictions.cpu().numpy().flatten().tolist())\n",
" all_true_labels.extend(labels.cpu().numpy().flatten().tolist())\n",
" \n",
" all_predictions = np.array(all_predictions)\n",
" all_true_labels = np.array(all_true_labels)\n",
" \n",
" accuracy = (all_predictions == all_true_labels).mean() * 100\n",
" \n",
" # 4. Calculate F1 Score\n",
" # average='macro' is best for your report to show you care about both classes equally\n",
" f1 = f1_score(all_true_labels, all_predictions, average='macro')\n",
" model.train() # Return model to training mode just in case\n",
" return accuracy, f1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "65e26f88",
"metadata": {},
"outputs": [],
"source": [
"def train_model(model, train_loader, val_loader, device, epochs=5, lr=0.001):\n",
" model = model.to(device)\n",
" criterion = nn.CrossEntropyLoss()\n",
" optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n",
" \n",
" # Dictionary to store results for your report\n",
" history = {'train_loss': [], 'val_acc': [], 'val_f1': []}\n",
"\n",
" print(f\"Training {model.__class__.__name__} on {device}...\")\n",
"\n",
" for epoch in range(epochs):\n",
" model.train()\n",
" total_loss = 0\n",
" \n",
" for batch_idx, (features, labels) in enumerate(train_loader):\n",
" features, labels = features.to(device), labels.to(device)\n",
" \n",
" optimizer.zero_grad()\n",
" predictions = model(features)\n",
" loss = criterion(predictions, labels)\n",
" loss.backward()\n",
" optimizer.step()\n",
" \n",
" total_loss += loss.item()\n",
" \n",
" avg_loss = total_loss / len(train_loader)\n",
" \n",
" # After each epoch, evaluate on validation set\n",
" val_acc, val_f1 = evaluate_performance(model, val_loader, device)\n",
" \n",
" # Save results to our history dictionary\n",
" history['train_loss'].append(avg_loss)\n",
" history['val_acc'].append(val_acc)\n",
" history['val_f1'].append(val_f1)\n",
" \n",
" print(f\"\\n Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f} \\n Val Acc: {val_acc:.2f}% \\n Val F1: {val_f1:.4f}\")\n",
"\n",
" return history # Return the results so we can plot them later"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3acf0f2b",
"metadata": {},
"outputs": [],
"source": [
"train_995_basic =train_model (model_basic, train_dataloader, val_dataloader, device, epochs =7 )\n",
"print(train_995_basic )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c0f7f65",
"metadata": {},
"outputs": [],
"source": [
"train_995_adv =train_model (model_adv, train_dataloader, val_dataloader, device, epochs =7 )\n",
"print(train_995_adv )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1e10032",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "12959462",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "9fb31c02",
"metadata": {},
"source": [
"# Evaluation"
]
},
{
"cell_type": "markdown",
"id": "2630d40a",
"metadata": {},
"source": [
"Basic model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "73c388e7",
"metadata": {},
"outputs": [],
"source": [
"# # 1. The Evaluation Function\n",
"# def evaluate_performance(model, dataloader, device):\n",
"# model.eval() # Put model in evaluation mode\n",
" \n",
"# all_predictions = []\n",
"# all_true_labels = []\n",
" \n",
"# # Turn off gradient tracking to save memory\n",
"# with torch.no_grad():\n",
"# for features, labels in dataloader:\n",
"# features = features.to(device)\n",
"# labels = labels.to(device)\n",
" \n",
"# # Get model scores\n",
"# scores = model(features)\n",
" \n",
"# # Find the predicted class (0 or 1)\n",
"# _, predictions = torch.max(scores,1)\n",
" \n",
"# # Save predictions and actual labels to lists\n",
"# # all_predictions.extend(predictions.cpu().tolist())\n",
"# # all_true_labels.extend(labels.cpu().tolist())\n",
"# all_predictions.extend(predictions.cpu().numpy().flatten().tolist())\n",
"# all_true_labels.extend(labels.cpu().numpy().flatten().tolist())\n",
" \n",
"# all_predictions = np.array(all_predictions)\n",
"# all_true_labels = np.array(all_true_labels)\n",
" \n",
"# accuracy = (all_predictions == all_true_labels).mean() * 100\n",
" \n",
"# # 4. Calculate F1 Score\n",
"# # average='macro' is best for your report to show you care about both classes equally\n",
"# f1 = f1_score(all_true_labels, all_predictions, average='macro')\n",
"# model.train() # Return model to training mode just in case\n",
"# return accuracy, f1\n",
"# # # Change me based on the model\n",
"\n",
"# # model = model_basic.to(device)\n",
"\n",
"\n",
"# # print(f\"Training on: {device}\")\n",
"\n",
"# # # 2. Setup Loss and Optimizer\n",
"# # # CrossEntropyLoss is the standard for classification tasks\n",
"# # criterion = nn.CrossEntropyLoss() \n",
"# # # Adam is a very reliable, fast optimizer\n",
"# # optimizer = torch.optim.Adam(model.parameters(), lr=0.001) \n",
"\n",
"# # # 3. The Training Loop\n",
"# # epochs = 7# Start with a small number of passes through the whole dataset\n",
"\n",
"# # for epoch in range(epochs):\n",
"# # model.train() # Tell the model it is in training mode\n",
"# # total_loss = 0\n",
" \n",
"# # # Loop through our batches of 64 articles\n",
"# # for batch_idx, (features, labels) in enumerate(train_dataloader):\n",
" \n",
"# # # Move data to the same device as the model (GPU/CPU)\n",
"# # features = features.to(device)\n",
"# # labels = labels.to(device)\n",
" \n",
"# # # Step A: Reset the optimizer's gradients\n",
"# # optimizer.zero_grad()\n",
" \n",
"# # # Step B: Forward Pass (Have the model guess Real or Fake)\n",
"# # predictions = model(features)\n",
" \n",
"# # # Step C: Calculate Loss (How wrong were the guesses?)\n",
"# # loss = criterion(predictions, labels)\n",
" \n",
"# # # Step D: Backward Pass (Calculate how to fix the math)\n",
"# # loss.backward()\n",
" \n",
"# # # Step E: Optimize (Actually apply the fixes to the model's weights)\n",
"# # optimizer.step()\n",
" \n",
"# # total_loss += loss.item()\n",
" \n",
"# # # Print an update every 100 batches so we know it's working\n",
"# # if batch_idx % 100 == 0:\n",
"# # print(f\"Epoch [{epoch+1}/{epochs}] | Batch {batch_idx} | Loss: {loss.item():.4f}\")\n",
" \n",
"# # # Print the average loss at the end of each epoch\n",
"# # avg_loss = total_loss / len(train_dataloader)\n",
"# # print(f\"--- End of Epoch {epoch+1} | Average Loss: {avg_loss:.4f} ---\")"
]
},
{
"cell_type": "markdown",
"id": "09b0ce98",
"metadata": {},
"source": [
"Advanced model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2ca196d",
"metadata": {},
"outputs": [],
"source": [
"# # 1. The Evaluation Function\n",
"# def evaluate_performance(model_adv, dataloader, device):\n",
"# model_adv.eval() # Put model in evaluation mode\n",
" \n",
"# all_predictions = []\n",
"# all_true_labels = []\n",
" \n",
"# # Turn off gradient tracking to save memory\n",
"# with torch.no_grad():\n",
"# for features, labels in dataloader:\n",
"# features = features.to(device)\n",
"# labels = labels.to(device)\n",
" \n",
"# # Get model scores\n",
"# scores = model_adv(features)\n",
" \n",
"# # Find the predicted class (0 or 1)\n",
"# _, predictions = scores.max(1)\n",
" \n",
"# # Save predictions and actual labels to lists\n",
"# all_predictions.extend(predictions.cpu().tolist())\n",
"# all_true_labels.extend(labels.cpu().tolist())\n",
" \n",
"# # Calculate Accuracy\n",
"# correct_guesses = sum(p == t for p, t in zip(all_predictions, all_true_labels))\n",
"# accuracy = (correct_guesses / len(all_true_labels)) * 100\n",
" \n",
"# # Calculate F1 Score\n",
"# f1 = f1_score(all_true_labels, all_predictions, average='macro')\n",
" \n",
"# model_adv.train() # Return model to training mode just in case\n",
"# return accuracy, f1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5835388c",
"metadata": {},
"outputs": [],
"source": [
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6ca6771",
"metadata": {},
"outputs": [],
"source": [
"print(\"Basic model \")\n",
"print(\" Validation \")\n",
"val_acc995, val_f1_995 = evaluate_performance(model_basic, val_dataloader, device)\n",
"print(f\"Validation Accuracy: {val_acc995:.2f}%\")\n",
"print(f\"Validation F1 Score: {val_f1_995:.4f}\")\n",
"\n",
"print(\"\\n Testing Phase \")\n",
"test_acc995, test_f1_995 = evaluate_performance(model_basic, test_dataloader, device)\n",
"print(f\"Test Accuracy: {test_acc995:.2f}%\")\n",
"print(f\"Test F1 Score: {test_f1_995:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e206d094",
"metadata": {},
"outputs": [],
"source": [
"print(\" GRU model \")\n",
"print(\" Validation \")\n",
"adv_val_acc995, adv_val_f1_995 = evaluate_performance(model_adv, val_dataloader, device)\n",
"print(f\"Validation Accuracy: {adv_val_acc995:.2f}%\")\n",
"print(f\"Validation F1 Score: {adv_val_f1_995:.4f}\")\n",
"\n",
"print(\"\\n Testing \")\n",
"test_acc, test_f1 = evaluate_performance(model_adv, test_dataloader, device)\n",
"print(f\"Test Accuracy: {test_acc:.2f}%\")\n",
"print(f\"Test F1 Score: {test_f1:.4f}\")"
]
},
{
"cell_type": "markdown",
"id": "f6a4ae72",
"metadata": {},
"source": [
"# Liar data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fc7b8dac",
"metadata": {},
"outputs": [],
"source": [
"from helper import LIAR_labelling\n",
"\n",
"f\"../../data/training/LIAR.parquet\"\n",
"df_LIAR = pd.read_parquet(\"../../data/testing/LIAR.parquet\",columns=['tokens','type'])\n",
"\n",
"\n",
"df_LIAR['label'] = df_LIAR['type'].apply(LIAR_labelling).astype(str)\n",
"df_LIAR['label'] = df_LIAR['label'].map(label_map).astype(int)\n",
"df_LIAR = df_LIAR.drop(columns=['type'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f73f6f84",
"metadata": {},
"outputs": [],
"source": [
"df_LIAR.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a76196e",
"metadata": {},
"outputs": [],
"source": [
"# count how many tokens we have in the corpus \n",
"word_counts = Counter()\n",
"for x in df_LIAR['tokens']:\n",
" word_counts.update(x)\n",
" \n",
"# Keep the top 50,000 words. \n",
"# Index 0 is for <PAD> (padding), Index 1 is for <UNK> (unknown words)\n",
"vocab = {\"<PAD>\": 0, \"<UNK>\": 1}\n",
"for idx, (word, count) in enumerate(word_counts.most_common(50000), start=2):\n",
" vocab[word] = idx\n",
"\n",
"print(f\"Vocabulary built with {len(vocab)} words.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "39dbe869",
"metadata": {},
"outputs": [],
"source": [
"LR_DATA = FakeNewsDataset(dataframe=df_LIAR, vocab=vocab, max_length=256)\n",
"LR_dataloader = DataLoader(LR_DATA, batch_size=32, shuffle=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ccbc7885",
"metadata": {},
"outputs": [],
"source": [
"features, labels = next(iter(LR_dataloader))\n",
"# 2. Check the shapes (the dimensions of your tensors)\n",
"print(\"--- Tensor Shapes ---\")\n",
"print(f\"Features shape: {features.shape}\") \n",
"print(f\"Labels shape: {labels.shape}\") \n",
"\n",
"# 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)\n",
"print(\"\\n--- Data Types ---\")\n",
"print(f\"Features dtype: {features.dtype}\")\n",
"print(f\"Labels dtype: {labels.dtype}\")\n",
"\n",
"# 4. Peek at the actual data for the very first article in this batch\n",
"print(\"\\n--- First Article Peek ---\")\n",
"print(f\"Label: {labels[0].item()} (0 = Fake, 1 = Real)\")\n",
"print(f\"Tokens (first 20 IDs): {features[0][:20]}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4698cd06",
"metadata": {},
"outputs": [],
"source": [
"# # 1. Check a single sample from the Dataset directly\n",
"# single_features, single_label = LR_DATA[0]\n",
"# print(f\"Single Sample - Features: {single_features.shape}, Label: {single_label.shape}\")\n",
"\n",
"# # 2. Check the DataLoader batch\n",
"# batch_features, batch_labels = next(iter(LR_dataloader))\n",
"# # print(f\"Batch - Features: {batch_features.shape}, Labels: {batch_labels.shape}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ed9c57c2",
"metadata": {},
"outputs": [],
"source": [
"evaluate_performance(model_adv, LR_dataloader, device)\n",
"\n",
"print(\"\\n--- 2. Testing Advanced model ---\")\n",
"test_acc, test_f1 = evaluate_performance(model_adv, LR_dataloader, device)\n",
"print(f\"Test Accuracy: {test_acc:.2f}%\")\n",
"print(f\"Test F1 Score: {test_f1:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "74127f71",
"metadata": {},
"outputs": [],
"source": [
"print(\"\\n--- 2. Testing BASE-Model ---\")\n",
"test_acc, test_f1 = evaluate_performance(model_basic, LR_dataloader, device)\n",
"print(f\"Test Accuracy: {test_acc:.2f}%\")\n",
"print(f\"Test F1 Score: {test_f1:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "33c54c0e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

51
src/models/baseline.py Normal file
View File

@@ -0,0 +1,51 @@
import pickle
from typing import override, Callable
from constants import TRAINING_DIR, MODEL_DIR
from labels import Label
from models.model import Model
from helper import dataset_iterator, default_labelling
import pandas as pd
from random import random
class Baseline_model(Model):
    """Naive baseline classifier.

    Learns only the empirical fraction of FAKE articles in the training
    set and, at classification time, guesses FAKE with that probability.
    """
    def __init__(self, model_filename:str="", label_translator: Callable[[str], Label] = default_labelling) -> None:
        # Probability of predicting FAKE; set by train() or load().
        # Must exist before super().__init__, which may call load().
        self.fake_probability = 0
        super().__init__(model_filename, label_translator)
    @override
    def train(self, training_dataset:str, hyperparameters:dict[str, float]={}) -> None:
        """Estimate the FAKE prior from the dataset.

        training_dataset: parquet filename under TRAINING_DIR.
        hyperparameters: accepted for interface compatibility; unused.
        """
        fake_amount = 0
        total_amount = 0
        for chunk in dataset_iterator(f"{TRAINING_DIR}/{training_dataset}", columns=['type']):
            # Count rows whose translated label is FAKE in this chunk.
            chunk_fake_amount = (chunk['type'].map(self.label_translator) == Label.FAKE).sum()
            fake_amount += chunk_fake_amount
            total_amount += len(chunk)
        self.fake_probability = fake_amount/total_amount
    @override
    def classify(self, input:pd.Series) -> Label:
        """Randomly predict FAKE in proportion to the learned prior."""
        if random() <= self.fake_probability:
            return Label.FAKE
        return Label.REAL
    @override
    def save(self, filename:str) -> None:
        """Pickle the learned state to MODEL_DIR/<filename>.

        Bug fix: previously wrote to a hard-coded "(unknown)" path,
        silently ignoring the filename argument.
        """
        data = {}
        data["label_translator"] = self.label_translator
        data["fake_probability"] = self.fake_probability
        with open(f"{MODEL_DIR}/{filename}", 'wb') as file:
            pickle.dump(data, file)
    @override
    def load(self, filename:str) -> None:
        """Restore state written by save() from MODEL_DIR/<filename>."""
        with open(f"{MODEL_DIR}/{filename}", 'rb') as file:
            data = pickle.load(file)
        self.label_translator = data["label_translator"]
        self.fake_probability = data["fake_probability"]

View File

@@ -0,0 +1,57 @@
from constants import TRAINING_DIR, MODEL_DIR
from models.model import Model
from labels import Label
from helper import default_labelling
from typing import override, Callable
import pandas as pd
import pickle
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
def no_tokenization(str):
    """Pass-through tokenizer for TfidfVectorizer.

    The dataset's articles are stored as pre-tokenized, space-joined
    strings, so splitting on single spaces recovers the token list.
    """
    tokens = str.split(" ")
    return tokens
class Gradient_boosting_model(Model):
    """TF-IDF + GradientBoostingClassifier pipeline over pre-tokenized text."""
    def __init__(self, model_filename: str = "", label_translator: Callable[[str], Label] = default_labelling) -> None:
        super().__init__(model_filename, label_translator)
    @override
    def train(self, training_dataset: str, hyperparameters: dict[str, float] = {}) -> None:
        """Fit the pipeline on (up to) the first 250,000 rows.

        training_dataset: parquet filename under TRAINING_DIR.
        hyperparameters: accepted for interface compatibility; unused.
        """
        print("this model takes around 10 hours to train")
        # Read both columns in a single pass instead of loading the
        # parquet file twice.
        df = pd.read_parquet(f"{TRAINING_DIR}/{training_dataset}", columns=['tokens', 'type'])
        X = df['tokens'].apply(lambda token_list: " ".join(token_list))
        Y = df['type'].apply(lambda label: self.label_translator(label).value)
        # Cap the training size to keep the fit time manageable.
        X = X[:250000]
        Y = Y[:250000]
        model = Pipeline([
            ("L string", TfidfVectorizer(tokenizer=no_tokenization)),
            ("forest", GradientBoostingClassifier(random_state=0, n_estimators=4000))
        ])
        model.fit(X, Y)
        self.model = model
    @override
    def classify(self, input: pd.Series) -> Label:
        """Classify one article row; expects a 'tokens' list in the row."""
        X = " ".join(input['tokens'])
        return Label(self.model.predict([X])[0])
    @override
    def save(self, filename: str) -> None:
        """Pickle the fitted pipeline to MODEL_DIR/<filename>.

        Bug fix: previously wrote to a hard-coded "(unknown)" path,
        silently ignoring the filename argument.
        """
        data = {}
        data["label_translator"] = self.label_translator
        data["model"] = self.model
        with open(f"{MODEL_DIR}/{filename}", 'wb') as file:
            pickle.dump(data, file)
    @override
    def load(self, filename: str) -> None:
        """Restore state written by save() from MODEL_DIR/<filename>."""
        with open(f"{MODEL_DIR}/{filename}", 'rb') as file:
            data = pickle.load(file)
        self.label_translator = data["label_translator"]
        self.model = data["model"]

View File

@@ -0,0 +1,133 @@
import pickle
from typing import override, Callable
from scipy.sparse import lil_array
from constants import TRAINING_DIR, MODEL_DIR
from labels import Label
from models.model import Model
from helper import dataset_iterator
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
from helper import default_labelling
class Logistic_model(Model):
def __init__(self, model_filename: str = "", label_translator: Callable[[str], Label] = default_labelling) -> None:
super().__init__(model_filename, label_translator)
@override
def train(self, training_dataset: str, hyperparameters: dict[str, float] = {}) -> None:
token_counts:dict[str, int] = {}
sorted_token_counts:dict[str, int] = {}
token_id:dict[str, int] = {} # converts top 10K words to id's.
domain_counts:dict[str, int] = {}
sorted_domain_counts:dict[str, int] = {}
domain_id:dict[str, int] = {} # converts top 500 domains to id's.
self.consider_metadata = False
if "metadata" in hyperparameters and hyperparameters["metadata"] == 1:
self.consider_metadata = True
columns = ["tokens", "domain"]
rows_processed = 0
for chunk in dataset_iterator(f"{TRAINING_DIR}/{training_dataset}", columns=columns):
rows_processed += len(chunk)
for _, row in chunk.iterrows():
for token in row['tokens']:
if token not in token_counts:
token_counts[token] = 0
token_counts[token] += 1
if row['domain'] not in domain_counts:
domain_counts[row['domain']] = 0
domain_counts[row['domain']] += 1
for token in sorted(token_counts, key=lambda token: token_counts[token], reverse=True):
sorted_token_counts[token] = token_counts[token]
for domain in sorted(domain_counts, key=lambda domain: domain_counts[domain], reverse=True):
sorted_domain_counts[domain] = domain_counts[domain]
idx = 0
for token in sorted_token_counts:
token_id[token] = idx
idx += 1
if idx >= 10000:
break
idx = 0
for domain in sorted_domain_counts:
domain_id[domain] = idx
idx += 1
if idx >= 1000:
break
if self.consider_metadata: # consider things other than tokens
X = lil_array((rows_processed, 11000), dtype="float64")
else:
X = lil_array((rows_processed, 10000), dtype="float64") # non-sparse array uses 74GiB ram on 995,000_rows. Sklearn LogisticRegression supports sparse arrays though. It still uses 9+ now.
Y = np.zeros(rows_processed, dtype=int)
columns.append("type")
article_num = 0
for chunk in dataset_iterator(f"{TRAINING_DIR}/{training_dataset}", columns=columns):
for _, row in chunk.iterrows():
tokens = row['tokens']
article_type = row['type']
article_word_counts = np.zeros(10000)
for token in tokens:
if token not in token_id:
continue # if they are not in top 10K vocab we can ignore them
article_word_counts[token_id[token]] += 1
X[article_num, :10000] = article_word_counts
if self.consider_metadata:
if row['domain'] in domain_id:
X[article_num, 10000+domain_id[row['domain']]] = 1
Y[article_num] = self.label_translator(article_type).value
article_num += 1
self.regression_model = LogisticRegression(max_iter=10000, n_jobs = -1, class_weight="balanced").fit(X, Y)
self.token_id = token_id
self.domain_id = domain_id
@override
def classify(self, input: pd.Series) -> Label:
if self.consider_metadata:
x = np.zeros(11000)
else:
x = np.zeros(10000)
for token in input['tokens']:
if token not in self.token_id:
continue
x[self.token_id[token]] += 1
if self.consider_metadata:
if input['domain'] in self.domain_id:
x[10000+self.domain_id[input['domain']]] = 1
prediction = self.regression_model.predict([x])[0]
return Label(prediction)
@override
def save(self, filename: str) -> None:
    """Persist the trained model and its vocabularies to MODEL_DIR/filename.

    Stores everything `load` needs to restore an equivalent classifier:
    the label translator, the fitted regression model, both id mappings
    and the metadata flag.
    """
    data = {
        "label_translator": self.label_translator,
        "regression_model": self.regression_model,
        "token_id": self.token_id,
        "domain_id": self.domain_id,
        "consider_metadata": self.consider_metadata,
    }
    # Bug fix: the target path previously ignored the `filename`
    # parameter, so every save clobbered the same file.
    with open(f"{MODEL_DIR}/{filename}", 'wb') as file:
        pickle.dump(data, file)
@override
def load(self, filename: str) -> None:
    """Restore a model previously written by `save` from MODEL_DIR/filename."""
    # Bug fix: the source path previously ignored the `filename`
    # parameter, so only one fixed file could ever be loaded.
    with open(f"{MODEL_DIR}/{filename}", 'rb') as file:
        data = pickle.load(file)
    self.label_translator = data["label_translator"]
    self.regression_model = data["regression_model"]
    self.token_id = data["token_id"]
    self.domain_id = data["domain_id"]
    self.consider_metadata = data["consider_metadata"]

61
src/models/model.py Normal file
View File

@@ -0,0 +1,61 @@
from abc import ABC, abstractmethod
import pandas as pd
from time import perf_counter
from constants import TESTING_DIR, VALIDATION_DIR
from helper import LIAR_labelling, dataset_iterator, default_labelling
from labels import Label
from typing import Callable
class Model(ABC):
    """Abstract base class for fake-news classifiers.

    Subclasses implement training, per-row classification and
    (de)serialisation; `test` provides shared metric evaluation with
    FAKE treated as the positive class.
    """

    def __init__(self, model_filename: str = "", label_translator: Callable[[str], Label] = default_labelling) -> None:
        self.label_translator = label_translator
        # An empty filename means "train from scratch"; otherwise restore
        # a previously saved model immediately.
        if model_filename:
            self.load(model_filename)

    @abstractmethod
    def train(self, training_dataset: str, hyperparameters: dict[str, float]) -> None:
        """Fit the model on the named dataset file from the training directory."""
        pass

    @abstractmethod
    def classify(self, input: pd.Series) -> Label:
        """Return the predicted Label for one article row."""
        pass

    @abstractmethod
    def save(self, filename: str) -> None:
        """Serialise the model state to the given file."""
        pass

    @abstractmethod
    def load(self, filename: str) -> None:
        """Restore model state from the given file."""
        pass

    def test(self, test_dataset: str, validate: bool = True) -> tuple[float, float, float, float]:
        """Evaluate on a dataset and return (accuracy, recall, precision, F1).

        When `validate` is True the validation split is used, otherwise
        the held-out test split. Metrics are printed as a side effect.
        NOTE(review): selecting "LIAR.parquet" permanently swaps this
        instance's label translator — confirm that is intended.
        """
        # The LIAR dataset uses its own labelling scheme.
        if test_dataset == "LIAR.parquet":
            self.label_translator = LIAR_labelling
        dataset_dir = VALIDATION_DIR if validate else TESTING_DIR
        df = pd.read_parquet(f"{dataset_dir}/{test_dataset}")
        expected = df['type'].apply(self.label_translator)
        predicted = df.apply(self.classify, axis=1)
        # Confusion-matrix counts; FAKE is the positive class.
        # (Cleanup: the counters were previously initialised to 0 and then
        # immediately reassigned — the dead initialisations are removed.)
        TP = ((expected == Label.FAKE) & (predicted == Label.FAKE)).sum()
        FP = ((expected == Label.REAL) & (predicted == Label.FAKE)).sum()
        TN = ((expected == Label.REAL) & (predicted == Label.REAL)).sum()
        FN = ((expected == Label.FAKE) & (predicted == Label.REAL)).sum()
        # Guard every ratio against division by zero on degenerate splits.
        total = TP + TN + FP + FN
        accuracy = (TP + TN) / total if total > 0 else 0
        recall = (TP) / (TP + FN) if (TP + FN) > 0 else 0
        precision = (TP) / (TP + FP) if (TP + FP) > 0 else 0
        F1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        print(f"Accuracy {accuracy}")
        print(f"Recall {recall}")
        print(f"precision {precision}")
        print(f"F1-score {F1}")
        return (accuracy, recall, precision, F1)

1189
src/models/nn.ipynb Normal file

File diff suppressed because it is too large Load Diff

579
src/models/nn.ju.py Normal file
View File

@@ -0,0 +1,579 @@
# %%
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '../'))
from helper import default_labelling
from sklearn.metrics import f1_score
import numpy as np
# %%
# Map the stringified Label enum members onto integer class ids used by
# the loss function (0 = fake, 1 = real).
label_map = {
    "Label.FAKE": 0,
    "Label.REAL": 1,
}
# %% [markdown]
"""
# Pipelining process
"""
# %%
def load_split(path: str) -> pd.DataFrame:
    """Load one dataset split and derive an integer `label` column.

    Reads only the `tokens` and `type` columns, maps `type` through
    `default_labelling` and `label_map`, and drops the raw `type` column.
    """
    frame = pd.read_parquet(path, columns=['tokens', 'type'])
    frame['label'] = frame['type'].apply(default_labelling).astype(str)
    frame['label'] = frame['label'].map(label_map).astype(int)
    return frame.drop(columns=['type'])


# The identical preprocessing was previously copy-pasted for all three
# splits; a single helper keeps them in sync.
df = load_split("../../data/training/995,000_rows.parquet")
df_test = load_split("../../data/testing/995,000_rows.parquet")
df_val = load_split("../../data/validation/995,000_rows.parquet")
# %%
# print("Loading Parquet file...")
# # Check the total number of rows (articles)
# print(f"Total rows in the raw Parquet file: {len(df)}")
# # Look at the first few rows to make sure the data looks correct
# print("\n--- First 3 Rows ---")
# print(df.head(3))
# %%
# Count token frequencies across the training corpus.
word_counts = Counter()
for token_list in df['tokens']:
    word_counts.update(token_list)
# Build the vocabulary from the 50,000 most frequent words.
# Index 0 is reserved for <PAD> (padding), index 1 for <UNK> (unknown words).
vocab = {"<PAD>": 0, "<UNK>": 1}
vocab.update({
    word: idx
    for idx, (word, _count) in enumerate(word_counts.most_common(50000), start=2)
})
print(f"Vocabulary built with {len(vocab)} words.")
# %%
# Create a Custom PyTorch Datase
# a wrapper for the data that PyTorch knows how to talk to.
class FakeNewsDataset(Dataset):
    """Wraps a tokenised dataframe so a PyTorch DataLoader can consume it.

    Each item is a fixed-length LongTensor of vocabulary ids plus an
    integer label. Articles are truncated or zero-padded to `max_length`.
    """

    def __init__(self, dataframe, vocab, max_length=256):
        self.dataframe = dataframe
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        # Number of articles; PyTorch uses this to bound data fetching.
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Fetch one (article, label) pair.
        row = self.dataframe.iloc[idx]
        # Map tokens to ids; id 1 (<UNK>) stands in for out-of-vocab words.
        ids = [self.vocab.get(token, 1) for token in row['tokens']]
        # Force a fixed length: cut long articles, pad short ones with 0 (<PAD>).
        ids = ids[:self.max_length]
        ids += [0] * (self.max_length - len(ids))
        return torch.tensor(ids, dtype=torch.long), torch.tensor(row['label'], dtype=torch.long)
# %%
## Prepare the DataLoader
# Wrap each split in the Dataset class; the DataLoader then feeds the model
# in batches (64 articles at a time) so the whole corpus never has to sit
# in RAM at once.
my_train_dataset = FakeNewsDataset(dataframe=df, vocab=vocab, max_length=256)
# Shuffling during training prevents the model from memorising data order.
train_dataloader = DataLoader(
    my_train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4,        # Start with 4; if CPU stays cool, try 6
    pin_memory=True,      # Essential for fast data transfer
    prefetch_factor=2,
)
val_data = FakeNewsDataset(dataframe=df_val, vocab=vocab, max_length=256)
val_dataloader = DataLoader(val_data, batch_size=64, shuffle=False)
test_data = FakeNewsDataset(dataframe=df_test, vocab=vocab, max_length=256)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=False)
# %% [markdown]
"""
Checking if the data conversion works
"""
# %%
# features, labels = next(iter(train_dataloader))
# # 2. Check the shapes (the dimensions of your tensors)
# print("--- Tensor Shapes ---")
# print(f"Features shape: {features.shape}")
# print(f"Labels shape: {labels.shape}")
# # 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)
# print("\n--- Data Types ---")
# print(f"Features dtype: {features.dtype}")
# print(f"Labels dtype: {labels.dtype}")
# # 4. Peek at the actual data for the very first article in this batch
# print("\n--- First Article Peek ---")
# print(f"Label: {labels[0].item()} (0 = Real, 1 = Fake)")
# print(f"Tokens (first 20 IDs): {features[0][:20]}")
# %%
class BaseModel(nn.Module):
    """Bag-of-embeddings classifier: embed, mean-pool, two dense layers."""

    def __init__(self, vocab_size, embed_dim=32, h1=256, h2=128, out_features=2):
        super().__init__()
        # Embedding layer: word ids -> dense vectors.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # Fully connected layers learn the Fake-vs-Real decision.
        self.fc1 = nn.Linear(embed_dim, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.out = nn.Linear(h2, out_features)

    def forward(self, x):
        # x: (batch_size, sequence_length) integer ids, e.g. (64, 256).
        embedded = self.embedding(x)
        # Average the word vectors into one vector per article.
        pooled = embedded.mean(dim=1)
        hidden = F.relu(self.fc1(pooled))
        hidden = F.relu(self.fc2(hidden))
        # Raw logits for the two classes.
        return self.out(hidden)
# Instantiate the baseline model sized to the training vocabulary.
model_basic = BaseModel(vocab_size=len(vocab))
# %% [markdown]
"""
'Advanced'
"""
# %%
class advanced_model(nn.Module):
    """Bidirectional multi-layer GRU classifier over word embeddings.

    The final forward and backward hidden states are concatenated and
    passed through a linear layer to produce the two class logits.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128, num_layer=2, out_features=2):
        super().__init__()
        # 1. Embedding layer: word ids -> dense vectors.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # 2. GRU layer. batch_first=True because the DataLoader yields
        # (batch_size, sequence_length). Bug fix: `num_layer` was accepted
        # but ignored (num_layers was hard-coded to 2); it is now honoured.
        # The default of 2 preserves the previous behaviour.
        self.gru = nn.GRU(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layer,
            batch_first=True,
            bidirectional=True,
            dropout=0.3,
        )
        # NOTE(review): `self.out` is never used in forward (the
        # bidirectional output needs hidden_dim * 2 inputs, which `self.fc`
        # handles). Kept so existing checkpoints still load — confirm
        # before removing.
        self.out = nn.Linear(hidden_dim, out_features)
        # 3. Output layer over the concatenated forward+backward states.
        self.fc = nn.Linear(hidden_dim * 2, out_features)

    def forward(self, x):
        # x: (batch_size, sequence_length) integer ids.
        embedded = self.embedding(x)
        # The GRU returns per-step outputs and the final hidden states;
        # only the hidden states are needed here.
        _, hidden = self.gru(embedded)
        # hidden[-2] is the last forward state, hidden[-1] the last
        # backward state; concatenate them into one feature vector.
        combined = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        return self.fc(combined)
# Initialize the GRU model with the training vocabulary size.
model_adv = advanced_model(vocab_size=len(vocab))
# %% [markdown]
"""
# Training
"""
# %%
# Prefer the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# %%
def evaluate_performance(model, dataloader, device):
    """Run `model` over `dataloader` and return (accuracy %, macro F1)."""
    model.eval()  # evaluation mode (disables dropout etc.)
    predictions_list = []
    labels_list = []
    # No gradients are needed for evaluation; this saves memory.
    with torch.no_grad():
        for features, labels in dataloader:
            features = features.to(device)
            labels = labels.to(device)
            scores = model(features)
            # Predicted class = argmax over the class logits.
            _, batch_preds = torch.max(scores, 1)
            predictions_list.extend(batch_preds.cpu().numpy().flatten().tolist())
            labels_list.extend(labels.cpu().numpy().flatten().tolist())
    preds = np.array(predictions_list)
    truth = np.array(labels_list)
    accuracy = (preds == truth).mean() * 100
    # Macro-averaged F1 weights both classes equally.
    f1 = f1_score(truth, preds, average='macro')
    model.train()  # Return model to training mode just in case
    return accuracy, f1
# %%
def train_model(model, train_loader, val_loader, device, epochs=5, lr=0.001):
    """Train `model` with Adam + cross-entropy, validating after each epoch.

    Returns a history dict with per-epoch training loss, validation
    accuracy and validation macro F1 for later plotting.
    """
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Per-epoch metrics collected for the report.
    history = {'train_loss': [], 'val_acc': [], 'val_f1': []}
    print(f"Training {model.__class__.__name__} on {device}...")
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        # (Cleanup: the loop previously enumerated batches but never used
        # the batch index.)
        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)
            optimizer.zero_grad()
            predictions = model(features)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        # Track generalisation on the validation split after each epoch.
        val_acc, val_f1 = evaluate_performance(model, val_loader, device)
        history['train_loss'].append(avg_loss)
        history['val_acc'].append(val_acc)
        history['val_f1'].append(val_f1)
        print(f"\n Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f} \n Val Acc: {val_acc:.2f}% \n Val F1: {val_f1:.4f}")
    return history  # Return the results so we can plot them later
# %%
# Train the baseline and keep its metric history.
train_995_basic = train_model(model_basic, train_dataloader, val_dataloader, device, epochs=7)
print(train_995_basic)
# %%
# Train the GRU model and keep its metric history.
train_995_adv = train_model(model_adv, train_dataloader, val_dataloader, device, epochs=7)
print(train_995_adv)
# %%
# %%
# %% [markdown]
"""
# Evaluation
"""
# %% [markdown]
"""
Basic model
"""
# %%
# # 1. The Evaluation Function
# def evaluate_performance(model, dataloader, device):
# model.eval() # Put model in evaluation mode
# all_predictions = []
# all_true_labels = []
# # Turn off gradient tracking to save memory
# with torch.no_grad():
# for features, labels in dataloader:
# features = features.to(device)
# labels = labels.to(device)
# # Get model scores
# scores = model(features)
# # Find the predicted class (0 or 1)
# _, predictions = torch.max(scores,1)
# # Save predictions and actual labels to lists
# # all_predictions.extend(predictions.cpu().tolist())
# # all_true_labels.extend(labels.cpu().tolist())
# all_predictions.extend(predictions.cpu().numpy().flatten().tolist())
# all_true_labels.extend(labels.cpu().numpy().flatten().tolist())
# all_predictions = np.array(all_predictions)
# all_true_labels = np.array(all_true_labels)
# accuracy = (all_predictions == all_true_labels).mean() * 100
# # 4. Calculate F1 Score
# # average='macro' is best for your report to show you care about both classes equally
# f1 = f1_score(all_true_labels, all_predictions, average='macro')
# model.train() # Return model to training mode just in case
# return accuracy, f1
# # # Change me based on the model
# # model = model_basic.to(device)
# # print(f"Training on: {device}")
# # # 2. Setup Loss and Optimizer
# # # CrossEntropyLoss is the standard for classification tasks
# # criterion = nn.CrossEntropyLoss()
# # # Adam is a very reliable, fast optimizer
# # optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# # # 3. The Training Loop
# # epochs = 7# Start with a small number of passes through the whole dataset
# # for epoch in range(epochs):
# # model.train() # Tell the model it is in training mode
# # total_loss = 0
# # # Loop through our batches of 64 articles
# # for batch_idx, (features, labels) in enumerate(train_dataloader):
# # # Move data to the same device as the model (GPU/CPU)
# # features = features.to(device)
# # labels = labels.to(device)
# # # Step A: Reset the optimizer's gradients
# # optimizer.zero_grad()
# # # Step B: Forward Pass (Have the model guess Real or Fake)
# # predictions = model(features)
# # # Step C: Calculate Loss (How wrong were the guesses?)
# # loss = criterion(predictions, labels)
# # # Step D: Backward Pass (Calculate how to fix the math)
# # loss.backward()
# # # Step E: Optimize (Actually apply the fixes to the model's weights)
# # optimizer.step()
# # total_loss += loss.item()
# # # Print an update every 100 batches so we know it's working
# # if batch_idx % 100 == 0:
# # print(f"Epoch [{epoch+1}/{epochs}] | Batch {batch_idx} | Loss: {loss.item():.4f}")
# # # Print the average loss at the end of each epoch
# # avg_loss = total_loss / len(train_dataloader)
# # print(f"--- End of Epoch {epoch+1} | Average Loss: {avg_loss:.4f} ---")
# %% [markdown]
"""
Advanced model
"""
# %%
# # 1. The Evaluation Function
# def evaluate_performance(model_adv, dataloader, device):
# model_adv.eval() # Put model in evaluation mode
# all_predictions = []
# all_true_labels = []
# # Turn off gradient tracking to save memory
# with torch.no_grad():
# for features, labels in dataloader:
# features = features.to(device)
# labels = labels.to(device)
# # Get model scores
# scores = model_adv(features)
# # Find the predicted class (0 or 1)
# _, predictions = scores.max(1)
# # Save predictions and actual labels to lists
# all_predictions.extend(predictions.cpu().tolist())
# all_true_labels.extend(labels.cpu().tolist())
# # Calculate Accuracy
# correct_guesses = sum(p == t for p, t in zip(all_predictions, all_true_labels))
# accuracy = (correct_guesses / len(all_true_labels)) * 100
# # Calculate F1 Score
# f1 = f1_score(all_true_labels, all_predictions, average='macro')
# model_adv.train() # Return model to training mode just in case
# return accuracy, f1
# %%
# Re-select the device (harmless repeat of the earlier cell).
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# %%
print("Basic model ")
print(" Validation ")
# Bug fix: this cell referenced an undefined `model`; the basic model is
# bound to `model_basic`.
val_acc995, val_f1_995 = evaluate_performance(model_basic, val_dataloader, device)
print(f"Validation Accuracy: {val_acc995:.2f}%")
print(f"Validation F1 Score: {val_f1_995:.4f}")
print("\n Testing Phase ")
test_acc995, test_f1_995 = evaluate_performance(model_basic, test_dataloader, device)
print(f"Test Accuracy: {test_acc995:.2f}%")
# Bug fix: a stray "git" was embedded in the printed message.
print(f"Test F1 Score: {test_f1_995:.4f}")
# %%
print(" GRU model ")  # Bug fix: heading previously said "GURU".
print(" Validation ")
adv_val_acc995, adv_val_f1_995 = evaluate_performance(model_adv, val_dataloader, device)
print(f"Validation Accuracy: {adv_val_acc995:.2f}%")
# Bug fix: the cell previously printed `val_f1_995` (the *basic* model's
# F1) because the freshly computed value was stored under a different,
# near-identical name (`val_f1995`).
print(f"Validation F1 Score: {adv_val_f1_995:.4f}")
print("\n Testing ")
test_acc, test_f1 = evaluate_performance(model_adv, test_dataloader, device)
# Bug fix: referenced undefined `test_acc955` (typo for `test_acc`), and a
# stray "git" was embedded in the F1 message.
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: {test_f1:.4f}")
# %% [markdown]
"""
# Liar data
"""
# %%
from helper import LIAR_labelling
f"../../data/training/LIAR.parquet"
df_LIAR = pd.read_parquet("../../data/testing/LIAR.parquet",columns=['tokens','type'])
df_LIAR['label'] = df_LIAR['type'].apply(LIAR_labelling).astype(str)
df_LIAR['label'] = df_LIAR['label'].map(label_map).astype(int)
df_LIAR = df_LIAR.drop(columns=['type'])
# %%
df_LIAR.head()
# %%
# Bug fix: this cell previously rebuilt `word_counts` and `vocab` from the
# LIAR *test* data. That silently remapped every word id, so the trained
# embedding rows no longer corresponded to the input ids and the LIAR
# evaluation below was meaningless. Models must be evaluated with the same
# vocabulary they were trained on, so the training `vocab` is kept as-is.
print(f"Reusing training vocabulary with {len(vocab)} words.")
# %%
# Wrap the LIAR split for batched evaluation (no shuffling needed).
LR_DATA = FakeNewsDataset(dataframe=df_LIAR, vocab=vocab, max_length=256)
LR_dataloader = DataLoader(LR_DATA, batch_size=32, shuffle=False)
# %%
# Sanity-check one LIAR batch: shapes, dtypes, and a peek at the first article.
batch_features, batch_labels = next(iter(LR_dataloader))
print("--- Tensor Shapes ---")
print(f"Features shape: {batch_features.shape}")
print(f"Labels shape: {batch_labels.shape}")
# Embedding layers require torch.long inputs.
print("\n--- Data Types ---")
print(f"Features dtype: {batch_features.dtype}")
print(f"Labels dtype: {batch_labels.dtype}")
print("\n--- First Article Peek ---")
print(f"Label: {batch_labels[0].item()} (0 = Real, 1 = Fake)")
print(f"Tokens (first 20 IDs): {batch_features[0][:20]}")
# %%
# # 1. Check a single sample from the Dataset directly
# single_features, single_label = LR_DATA[0]
# print(f"Single Sample - Features: {single_features.shape}, Label: {single_label.shape}")
# # 2. Check the DataLoader batch
# batch_features, batch_labels = next(iter(LR_dataloader))
# # print(f"Batch - Features: {batch_features.shape}, Labels: {batch_labels.shape}")
# %%
evaluate_performance(model_adv,LR_dataloader,device)
print("\n--- 2. Testing Avanced model ---")
test_acc, test_f1 = evaluate_performance(model_adv, LR_dataloader, device)
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: git {test_f1:.4f}")
# %%
print("\n--- 2. Testing BASE-Model ---")
# Bug fix: referenced undefined `model`; the basic model is `model_basic`.
# A stray "git" in the F1 message is also removed.
test_acc, test_f1 = evaluate_performance(model_basic, LR_dataloader, device)
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: {test_f1:.4f}")
# %%

52
src/models/svm.py Normal file
View File

@@ -0,0 +1,52 @@
from constants import TRAINING_DIR, MODEL_DIR
from models.model import Model
from labels import Label
from helper import default_labelling
from typing import override, Callable
import pandas as pd
import pickle
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
def no_tokenization(text):
    """Split an already-tokenised, space-joined string back into tokens.

    Used as the TfidfVectorizer tokenizer so sklearn does not re-tokenise.
    (Fix: the parameter was previously named `str`, shadowing the builtin.)
    """
    return text.split(" ")
class SVM_model(Model):
    """TF-IDF + LinearSVC classifier over pre-tokenised articles."""

    def __init__(self, model_filename: str = "", label_translator: Callable[[str], Label] = default_labelling) -> None:
        super().__init__(model_filename, label_translator)

    @override
    def train(self, training_dataset: str, hyperparameters: dict[str, float] = {}) -> None:
        """Fit the TF-IDF/SVM pipeline on a training parquet file.

        Tokens are re-joined with spaces and split again by the vectorizer's
        `no_tokenization` callback, preserving the original tokenisation.
        """
        # Perf fix: the parquet file was previously read twice (once per
        # column); read both columns in a single pass.
        df = pd.read_parquet(f"{TRAINING_DIR}/{training_dataset}", columns=['tokens', 'type'])
        X = df['tokens'].apply(lambda token_list: " ".join(token_list))
        Y = df['type'].apply(lambda label: self.label_translator(label).value)
        model = Pipeline([
            ("L string", TfidfVectorizer(tokenizer=no_tokenization)),
            ("svm", LinearSVC(random_state=0))
        ])
        model.fit(X, Y)
        self.model = model

    @override
    def classify(self, input: pd.Series) -> Label:
        """Predict the Label for one article row (expects a `tokens` list)."""
        X = " ".join(input['tokens'])
        return Label(self.model.predict([X])[0])

    @override
    def save(self, filename: str) -> None:
        """Pickle the pipeline and label translator to MODEL_DIR/filename."""
        data = {
            "label_translator": self.label_translator,
            "model": self.model,
        }
        # Bug fix: the path previously ignored the `filename` parameter.
        with open(f"{MODEL_DIR}/{filename}", 'wb') as file:
            pickle.dump(data, file)

    @override
    def load(self, filename: str) -> None:
        """Restore a pipeline previously written by `save`."""
        # Bug fix: the path previously ignored the `filename` parameter.
        with open(f"{MODEL_DIR}/{filename}", 'rb') as file:
            data = pickle.load(file)
        self.label_translator = data["label_translator"]
        self.model = data["model"]

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,121 @@
# %% [markdown]
"""
# cleaning
big_data.csv.zst is the main file we will be using. Every step in the pipeline adds a new column and overwrites the file. This is reversible, and when any step changes, everything can be run again regardless of the state of the file.
"""
# %%
import nltk
import re
import os
import time
import pandas as pd
DATA_DIR = "../data"
# %%
# download nltk data (stopwords, tokenizers and stemmers used by clean_text)
nltk.download("all")
# %%
# Small sample dataset used to demonstrate the cleaning pipeline below.
news_sample = pd.read_csv(f"{DATA_DIR}/news_sample.csv")
# %%
# We will not waste space on csv files, L.
# One-time migration: if the raw 995,000-row CSV is still present, re-save
# it as zstd-compressed CSV (pandas infers the codec from the .zst
# extension) and delete the original.
if (os.path.exists(f"{DATA_DIR}/995,000_rows.csv")):
    big_data = pd.read_csv(f"{DATA_DIR}/995,000_rows.csv", low_memory=False)
    big_data.to_csv(f"{DATA_DIR}/big_data.csv.zst")
    os.remove(f"{DATA_DIR}/995,000_rows.csv")
    big_data = None  # release the frame; it is re-read in chunks later
# %%
# cleans text and returns a list of tokens.
def clean_text(
        text,
        remove_regex_patterns=True,
        remove_stopwords=True,
        remove_special_characters=True,
        stemming=True):
    """Normalise raw article text into a list of cleaned tokens.

    Steps (each individually switchable): lowercase + strip; replace URLs,
    emails, dates and numbers with placeholder tags; drop punctuation;
    tokenise; remove English stopwords; Snowball-stem the remaining tokens
    (placeholder tags such as <URL> are never stemmed).
    """
    text = str(text).lower().strip()
    if remove_regex_patterns:
        url_pattern = r'\S+\.\S+'
        email_pattern = r'\w+@\w+\.\w+'
        date_pattern = r'[a-z]+ \d{1,2}[a-z]?, \d{4}'  # add more date patterns
        number_pattern = r'\d+'
        text = re.sub(url_pattern, "<URL>", text)
        text = re.sub(email_pattern, "<EMAIL>", text)
        text = re.sub(date_pattern, "<DATE>", text)
        text = re.sub(number_pattern, "<NUMBER>", text)
    if remove_special_characters:
        text = re.sub(r'[^\w (?:<\w+>)]', " ", text)
    # Keep placeholder tags like <URL> as single tokens.
    tokenizer = nltk.RegexpTokenizer(r'<\w+>|\w+')
    tokens = tokenizer.tokenize(text)
    if remove_stopwords:
        # Bug fix: the original had a duplicated assignment
        # (`stopwords = stopwords = ...`). A set also makes the membership
        # test O(1) per token instead of scanning a list.
        stopword_set = set(nltk.corpus.stopwords.words('english'))
        tokens = [token for token in tokens if token not in stopword_set]
    if stemming:
        stemmer = nltk.SnowballStemmer("english")
        tokens = [stemmer.stem(token) if not re.match(r'<\w+>', token) else token for token in tokens]
    return tokens
# %% [markdown]
"""
## Output
Now we check what the function does and how the vocabulary changes.
"""
# %%
# Generates a vocabulary (set of unique words) from a pandas series.
def generate_vocabulary(series):
    """Return the set of unique tokens across a Series of token lists."""
    vocabulary = set()
    for tokens in series:
        vocabulary.update(tokens)
    return vocabulary
# %%
# Demonstrate the cleaning pipeline on one sample article, then measure how
# each cleaning stage shrinks the vocabulary of the whole sample set.
print("original text:\n")
print(news_sample['content'][1])
print("\n" + "-" * 100 + "\n")
print("cleaned tokens:\n")
print(clean_text(news_sample['content'][1]))
print("\n" + "-" * 100 + "\n")
# Extra keyword arguments to Series.apply are forwarded to clean_text.
tokenization_size = len(generate_vocabulary(news_sample['content'].apply(clean_text, remove_stopwords = False, stemming = False)))
stopwords_size = len(generate_vocabulary(news_sample['content'].apply(clean_text, remove_stopwords = True, stemming = False)))
stemming_size = len(generate_vocabulary(news_sample['content'].apply(clean_text, remove_stopwords = True, stemming = True)))
print("Unique words after tokenization:")
print(tokenization_size)
print("\nUnique words after stopword removal:")
print(stopwords_size)
print("\nUnique words after stemming:")
print(stemming_size)
print("\nStemming reduction rate:")
# Fraction of the post-stopword vocabulary eliminated by stemming.
print(f"{round(1 - stemming_size / stopwords_size, 4) * 100}%")
# %% [markdown]
"""
## Big Data
Now we clean the big dataset and save it to csv.zst file. Pandas can save and load zstd files just fine, and since it's realtime compression it doesn't really take more time while heavily reducing the file size.
"""
# %%
# Clean the full dataset in 10,000-row chunks so it never has to fit in
# memory, writing to a temporary file that atomically replaces the original.
start = time.perf_counter()
first = True
for big_data in pd.read_csv(f"{DATA_DIR}/big_data.csv.zst", chunksize=10000):
    big_data['tokens'] = big_data['content'].apply(clean_text)
    # Bug fix: appended chunks previously re-wrote the CSV header, leaving
    # spurious header rows in the middle of the output file. Only the first
    # chunk writes the header now.
    big_data.to_csv(f"{DATA_DIR}/big_data_new.csv.zst", mode='w' if first else 'a', header=first)
    first = False
os.rename(f"{DATA_DIR}/big_data_new.csv.zst", f"{DATA_DIR}/big_data.csv.zst")
print(f"cleaning took {round((time.perf_counter() - start) / 60, 5)} minutes")

View File

@@ -0,0 +1,65 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "95706a2e-9e23-4272-aeaa-4510254f7feb",
"metadata": {},
"source": [
"# Cleaning"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1be89b54-76dd-4c2e-bcdd-ff956bf375bf",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize\n"
]
},
{
"cell_type": "markdown",
"id": "b82cf2b2-7cee-4c34-83b9-37c5c4828289",
"metadata": {},
"source": [
"1. Tokenize the text"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dc8058fc-0ed9-4daf-918d-d3e82064a3a6",
"metadata": {},
"outputs": [],
"source": [
"nltk.download('punkt')\n",
"text = ("
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

42
src/setup.py Normal file
View File

@@ -0,0 +1,42 @@
from constants import DATASET_DIR, TRAINING_DIR, VALIDATION_DIR, TESTING_DIR, ORIGINAL_DATASET_FILES
from clean_data import clean_dataset
from helper import csv_to_parquet
from split import split_dataset, split_dataset_random
import nltk
import os
import shutil
import pandas as pd
def setup() -> None:
    """Prepare NLTK data, then convert, clean and split every dataset.

    Expects the raw dataset files listed in ORIGINAL_DATASET_FILES to be
    present in DATASET_DIR; raises if one is missing. The LIAR tsv splits
    are converted separately and moved into their split directories.
    """
    # make sure nltk can be used later.
    nltk.download("all")
    for dataset_file in ORIGINAL_DATASET_FILES:
        if not os.path.exists(f"{DATASET_DIR}/{dataset_file}"):
            raise Exception(f"Please add {dataset_file} to {DATASET_DIR}")
        name = os.path.splitext(dataset_file)[0]
        # Convert/clean/split only once; an existing parquet file marks a
        # completed run. NOTE(review): indentation was ambiguous in the
        # reviewed copy — confirm clean/split were intended to be skipped
        # when the parquet already exists.
        if not os.path.exists(f"{DATASET_DIR}/{name}.parquet"):
            csv_to_parquet(f"{DATASET_DIR}/{dataset_file}", f"{DATASET_DIR}/{name}.parquet")
            print(f"finished converting {dataset_file} to parquet")
            clean_dataset(f"{name}.parquet")
            print(f"cleaned {name}.parquet")
            split_dataset_random(f"{name}.parquet")
            # Typo fix in the status message ("traning" -> "training").
            print(f"split {name}.parquet into training, validation and test")
    # LIAR: each tsv file maps directly onto one split directory.
    for dataset, destination in [("train.tsv", TRAINING_DIR), ("valid.tsv", VALIDATION_DIR), ("test.tsv", TESTING_DIR)]:
        if os.path.exists(f"{DATASET_DIR}/{dataset}"):
            df = pd.read_csv(f"{DATASET_DIR}/{dataset}", sep='\t', header=None)
            # LIAR tsv has no header row; columns 1 and 2 hold label and text.
            df = df.rename(columns={
                1: "type",
                2: "content"
            })
            name = os.path.splitext(dataset)[0]
            df.to_parquet(f"{DATASET_DIR}/{name}.parquet")
            clean_dataset(f"{name}.parquet")
            shutil.move(f"{DATASET_DIR}/{name}.parquet", f"{destination}/LIAR.parquet")


if __name__ == "__main__":
    setup()

91
src/split.py Normal file
View File

@@ -0,0 +1,91 @@
from constants import CHUNK_SIZE, DATASET_DIR, TRAINING_DIR, VALIDATION_DIR, TESTING_DIR
import pyarrow.parquet as pq
import pyarrow as pa
import os
from helper import get_time_boundaries
import pandas as pd
import numpy as np
def split_dataset_random(filename: str) -> None:
    """Randomly split a parquet dataset: ~75% train / 10% validation / 15% test.

    Streams the file in CHUNK_SIZE batches so arbitrarily large datasets
    never have to fit in memory; each batch's rows are appended to the
    matching split file.
    """
    # Bug fix: the input/output paths previously ignored the `filename`
    # parameter and pointed at a fixed literal path.
    pq_file = pq.ParquetFile(f"{DATASET_DIR}/{filename}")
    training_writer = None
    validation_writer = None
    testing_writer = None
    for batch in pq_file.iter_batches(batch_size=CHUNK_SIZE):
        table = pa.Table.from_batches([batch])
        # One uniform draw per row decides its split membership.
        rng = np.random.rand(table.num_rows)
        training = table.filter(rng < 0.75)
        validation = table.filter((rng >= 0.75) & (rng < 0.85))
        testing = table.filter(rng >= 0.85)
        # Writers are created lazily so the schema comes from actual data.
        if not training_writer and training.num_rows:
            training_writer = pq.ParquetWriter(f"{TRAINING_DIR}/{filename}", training.schema)
        if not validation_writer and validation.num_rows:
            validation_writer = pq.ParquetWriter(f"{VALIDATION_DIR}/{filename}", validation.schema)
        if not testing_writer and testing.num_rows:
            testing_writer = pq.ParquetWriter(f"{TESTING_DIR}/{filename}", testing.schema)
        if training.num_rows:
            training_writer.write(training)
        if validation.num_rows:
            validation_writer.write(validation)
        if testing.num_rows:
            testing_writer.write(testing)
    # Bug fix: closing unconditionally raised AttributeError when a split
    # never received any rows (its writer was still None).
    for writer in (training_writer, validation_writer, testing_writer):
        if writer:
            writer.close()
def split_dataset(filename: str) -> None:
    """Chronologically split a parquet dataset by the `scraped_at` column.

    Oldest 80% -> training, next 10% -> validation, newest 10% -> test.
    Loads the whole file into memory; use `split_dataset_random` for a
    streaming alternative.
    """
    # Bug fix: the input/output paths previously ignored the `filename`
    # parameter and pointed at a fixed literal path.
    df = pd.read_parquet(f"{DATASET_DIR}/{filename}")
    n = len(df)
    # Unparseable timestamps become NaT and sort to the end.
    df['scraped_at'] = pd.to_datetime(df['scraped_at'], format='ISO8601', errors='coerce', utc=True)
    df = df.sort_values(by='scraped_at')
    df.iloc[:int(n * 0.8)].to_parquet(f"{TRAINING_DIR}/{filename}")
    df.iloc[int(n * 0.8):int(n * 0.9)].to_parquet(f"{VALIDATION_DIR}/{filename}")
    df.iloc[int(n * 0.9):].to_parquet(f"{TESTING_DIR}/{filename}")
    # Cleanup: an earlier streaming implementation remained below an
    # unconditional `return`. It was unreachable and referenced undefined
    # names (`filepath`, `train_cut`, `val_cut`), so it has been removed.