backup since codeberg is down
This commit is contained in:
35
.gitignore
vendored
Normal file
35
.gitignore
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
# Document
|
||||
*.pdf
|
||||
*.bak
|
||||
*.tex.backup
|
||||
*.tex~
|
||||
*.synctex.gz
|
||||
*.out
|
||||
.bak
|
||||
build/
|
||||
_minted/
|
||||
obj/
|
||||
bin/
|
||||
|
||||
# Python
|
||||
__pycache__/
|
||||
.env
|
||||
.envrc
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
.ipynb_checkpoints/
|
||||
|
||||
# data bs
|
||||
data/**
|
||||
!data/
|
||||
!data/**/
|
||||
!data/**/.gitkeep
|
||||
|
||||
# general bs
|
||||
.DS_Store
|
||||
flake.lock
|
||||
.vscode/
|
||||
6
README.md
Normal file
6
README.md
Normal file
@@ -0,0 +1,6 @@
|
||||
- download the necessary dataset files to data/datasets as csv (not zip). Move all tsv files from the LIAR zip file directly into the datasets folder.
|
||||
- run setup.py to set up nltk and to clean and split the datasets. It takes a long time, please wait.
|
||||
- run main.py from the src directory to test the models. The function requires the model type, model file, and dataset to be passed as parameters.
|
||||
Here is an example: python main.py --model_type logistic --model_file logistic.model --data_file 995,000_rows.parquet
|
||||
The model files can be found in the models directory (not the one in src), the data files can be found in data/testing (pass LIAR.parquet to test on LIAR dataset).
|
||||
The model types and more information including how to train models can be found with python main.py --help.
|
||||
457
analysis/Split_analysis.ipynb
Normal file
457
analysis/Split_analysis.ipynb
Normal file
@@ -0,0 +1,457 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3b55d166",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# DO NOT RUN; DATA WILL BE LOST"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "9c2d25e9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd \n",
|
||||
"import os \n",
|
||||
"import sys\n",
|
||||
"sys.path.append(os.path.join(os.getcwd(), '../src'))\n",
|
||||
"from constants import TRAINING_DIR, TESTING_DIR, VALIDATION_DIR\n",
|
||||
"pd.set_option('display.max_columns', None)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cd67fc64",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Time Split "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "a917b0fa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_ty = pd.read_parquet(f\"{TESTING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
|
||||
"train_ty = pd.read_parquet(f\"{TRAINING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
|
||||
"val_ty = pd.read_parquet(f\"{VALIDATION_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "0098d6e4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"rows in train(818843, 1),\n",
|
||||
" rows in test (99499, 1), \n",
|
||||
" rows in validation(76645, 1)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(f'rows in train{train_ty.shape },\\n rows in test {test_ty.shape}, \\n rows in validation{val_ty.shape}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5985a4f3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"timeline = pd.concat([\n",
|
||||
" train_ty.value_counts().rename('train'),\n",
|
||||
" test_ty.value_counts().rename('test'),\n",
|
||||
" val_ty.value_counts().rename('val'),\n",
|
||||
"], axis=1).fillna(0).astype(int)\n",
|
||||
"\n",
|
||||
"timeline.index.name = 'type'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "b0673e19",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>train</th>\n",
|
||||
" <th>test</th>\n",
|
||||
" <th>val</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>type</th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>political</th>\n",
|
||||
" <td>194518</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>bias</th>\n",
|
||||
" <td>133232</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>fake</th>\n",
|
||||
" <td>104883</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>conspiracy</th>\n",
|
||||
" <td>97314</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>rumor</th>\n",
|
||||
" <td>56445</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>unknown</th>\n",
|
||||
" <td>43534</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>reliable</th>\n",
|
||||
" <td>42419</td>\n",
|
||||
" <td>99499</td>\n",
|
||||
" <td>76645</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>unreliable</th>\n",
|
||||
" <td>35332</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>clickbait</th>\n",
|
||||
" <td>27412</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>junksci</th>\n",
|
||||
" <td>14040</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>satire</th>\n",
|
||||
" <td>13160</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>hate</th>\n",
|
||||
" <td>8779</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" train test val\n",
|
||||
"type \n",
|
||||
"political 194518 0 0\n",
|
||||
"bias 133232 0 0\n",
|
||||
"fake 104883 0 0\n",
|
||||
"conspiracy 97314 0 0\n",
|
||||
"rumor 56445 0 0\n",
|
||||
"unknown 43534 0 0\n",
|
||||
"reliable 42419 99499 76645\n",
|
||||
"unreliable 35332 0 0\n",
|
||||
"clickbait 27412 0 0\n",
|
||||
"junksci 14040 0 0\n",
|
||||
"satire 13160 0 0\n",
|
||||
"hate 8779 0 0"
|
||||
]
|
||||
},
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"timeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6bdc7d84",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Random Split "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "cd5ca57b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_ty_R = pd.read_parquet(f\"{TESTING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
|
||||
"train_ty_R = pd.read_parquet(f\"{TRAINING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
|
||||
"val_ty_R = pd.read_parquet(f\"{VALIDATION_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "c793a37c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"rows in train(745724, 1),\n",
|
||||
" rows in test (149766, 1), \n",
|
||||
" rows in validation(99510, 1)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(f'rows in train{train_ty_R.shape },\\n rows in test {test_ty_R.shape}, \\n rows in validation{val_ty_R.shape}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "583304ff",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"timeline_R = pd.concat([\n",
|
||||
" train_ty_R.value_counts().rename('train'),\n",
|
||||
" test_ty_R.value_counts().rename('test'),\n",
|
||||
" val_ty_R.value_counts().rename('val'),\n",
|
||||
"], axis=1).fillna(0).astype(int)\n",
|
||||
"\n",
|
||||
"timeline_R.index.name = 'type'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "d8255b60",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>train</th>\n",
|
||||
" <th>test</th>\n",
|
||||
" <th>val</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>type</th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>reliable</th>\n",
|
||||
" <td>163802</td>\n",
|
||||
" <td>33010</td>\n",
|
||||
" <td>21752</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>political</th>\n",
|
||||
" <td>145779</td>\n",
|
||||
" <td>29241</td>\n",
|
||||
" <td>19498</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>bias</th>\n",
|
||||
" <td>99797</td>\n",
|
||||
" <td>20079</td>\n",
|
||||
" <td>13356</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>fake</th>\n",
|
||||
" <td>78736</td>\n",
|
||||
" <td>15602</td>\n",
|
||||
" <td>10545</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>conspiracy</th>\n",
|
||||
" <td>72837</td>\n",
|
||||
" <td>14676</td>\n",
|
||||
" <td>9801</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>unknown</th>\n",
|
||||
" <td>68468</td>\n",
|
||||
" <td>13754</td>\n",
|
||||
" <td>9098</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>rumor</th>\n",
|
||||
" <td>42254</td>\n",
|
||||
" <td>8553</td>\n",
|
||||
" <td>5638</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>unreliable</th>\n",
|
||||
" <td>26489</td>\n",
|
||||
" <td>5346</td>\n",
|
||||
" <td>3497</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>clickbait</th>\n",
|
||||
" <td>20552</td>\n",
|
||||
" <td>4161</td>\n",
|
||||
" <td>2699</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>junksci</th>\n",
|
||||
" <td>10516</td>\n",
|
||||
" <td>2066</td>\n",
|
||||
" <td>1458</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>satire</th>\n",
|
||||
" <td>9852</td>\n",
|
||||
" <td>1971</td>\n",
|
||||
" <td>1337</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>hate</th>\n",
|
||||
" <td>6641</td>\n",
|
||||
" <td>1307</td>\n",
|
||||
" <td>831</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2018-02-10 13:43:39.521661</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" train test val\n",
|
||||
"type \n",
|
||||
"reliable 163802 33010 21752\n",
|
||||
"political 145779 29241 19498\n",
|
||||
"bias 99797 20079 13356\n",
|
||||
"fake 78736 15602 10545\n",
|
||||
"conspiracy 72837 14676 9801\n",
|
||||
"unknown 68468 13754 9098\n",
|
||||
"rumor 42254 8553 5638\n",
|
||||
"unreliable 26489 5346 3497\n",
|
||||
"clickbait 20552 4161 2699\n",
|
||||
"junksci 10516 2066 1458\n",
|
||||
"satire 9852 1971 1337\n",
|
||||
"hate 6641 1307 831\n",
|
||||
"2018-02-10 13:43:39.521661 1 0 0"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"timeline_R"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "355d343a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "main_asg",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.14.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
399
analysis/analysis2.ipynb
Normal file
399
analysis/analysis2.ipynb
Normal file
File diff suppressed because one or more lines are too long
237
analysis/analyz_split_time.ipynb
Normal file
237
analysis/analyz_split_time.ipynb
Normal file
@@ -0,0 +1,237 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9c2d25e9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd \n",
|
||||
"import os \n",
|
||||
"import sys\n",
|
||||
"sys.path.append(os.path.join(os.getcwd(), '../src'))\n",
|
||||
"from constants import TRAINING_DIR, TESTING_DIR, VALIDATION_DIR\n",
|
||||
"pd.set_option('display.max_columns', None)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "a917b0fa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_ty = pd.read_parquet(f\"{TESTING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
|
||||
"train_ty = pd.read_parquet(f\"{TRAINING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n",
|
||||
"val_ty = pd.read_parquet(f\"{VALIDATION_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown') \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "0098d6e4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"rows in train(818843, 1),\n",
|
||||
" rows in test (99499, 1), \n",
|
||||
" rows in validation(76645, 1)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(f'rows in train{train_ty.shape },\\n rows in test {test_ty.shape}, \\n rows in validation{val_ty.shape}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "5985a4f3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"timeline = pd.concat([\n",
|
||||
" b.value_counts().rename('train'),\n",
|
||||
" a.value_counts().rename('test'),\n",
|
||||
" c.value_counts().rename('val'),\n",
|
||||
"], axis=1).fillna(0).astype(int)\n",
|
||||
"\n",
|
||||
"timeline.index.name = 'type'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "b0673e19",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>train</th>\n",
|
||||
" <th>test</th>\n",
|
||||
" <th>val</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>type</th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>political</th>\n",
|
||||
" <td>194518</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>bias</th>\n",
|
||||
" <td>133232</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>fake</th>\n",
|
||||
" <td>104883</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>conspiracy</th>\n",
|
||||
" <td>97314</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>rumor</th>\n",
|
||||
" <td>56445</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>unknown</th>\n",
|
||||
" <td>43534</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>reliable</th>\n",
|
||||
" <td>42419</td>\n",
|
||||
" <td>99499</td>\n",
|
||||
" <td>76645</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>unreliable</th>\n",
|
||||
" <td>35332</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>clickbait</th>\n",
|
||||
" <td>27412</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>junksci</th>\n",
|
||||
" <td>14040</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>satire</th>\n",
|
||||
" <td>13160</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>hate</th>\n",
|
||||
" <td>8779</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" train test val\n",
|
||||
"type \n",
|
||||
"political 194518 0 0\n",
|
||||
"bias 133232 0 0\n",
|
||||
"fake 104883 0 0\n",
|
||||
"conspiracy 97314 0 0\n",
|
||||
"rumor 56445 0 0\n",
|
||||
"unknown 43534 0 0\n",
|
||||
"reliable 42419 99499 76645\n",
|
||||
"unreliable 35332 0 0\n",
|
||||
"clickbait 27412 0 0\n",
|
||||
"junksci 14040 0 0\n",
|
||||
"satire 13160 0 0\n",
|
||||
"hate 8779 0 0"
|
||||
]
|
||||
},
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"timeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c2bcfc84",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "main_asg",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.14.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
0
data/datasets/.gitkeep
Normal file
0
data/datasets/.gitkeep
Normal file
0
data/temp/.gitkeep
Normal file
0
data/temp/.gitkeep
Normal file
0
data/testing/.gitkeep
Normal file
0
data/testing/.gitkeep
Normal file
0
data/training/.gitkeep
Normal file
0
data/training/.gitkeep
Normal file
0
data/validation/.gitkeep
Normal file
0
data/validation/.gitkeep
Normal file
34
flake.nix
Normal file
34
flake.nix
Normal file
@@ -0,0 +1,34 @@
|
||||
# Dev shell for my NixOS Jupyter notebook setup. It provides a shell with the requirements.txt packages and jupynium installed.
|
||||
{
|
||||
inputs = {
|
||||
nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
|
||||
pyproject-nix.url = "github:pyproject-nix/pyproject.nix";
|
||||
bozo_nixpkgs.url = "github:DuarteSJ/nixpkgs/4e926b09ba06301b08d0f12afd0640c079bdc4dc";
|
||||
};
|
||||
|
||||
outputs =
|
||||
{ nixpkgs, pyproject-nix, bozo_nixpkgs, ... }:
|
||||
let
|
||||
project = pyproject-nix.lib.project.loadRequirementsTxt { projectRoot = ./.; };
|
||||
|
||||
pkgs = nixpkgs.legacyPackages.x86_64-linux;
|
||||
bozo_pkgs = bozo_nixpkgs.legacyPackages.x86_64-linux;
|
||||
|
||||
python = pkgs.python3;
|
||||
pythonEnv = pkgs.python3.withPackages (pkgs:
|
||||
let base = project.renderers.withPackages { inherit python; } pkgs;
|
||||
in base ++ (with pkgs; [ notebook nbclassic jupyter-console ipython]));
|
||||
mental_retardation = bozo_pkgs.python3.withPackages (python-pkgs: with python-pkgs; [ jupynium ]);
|
||||
in
|
||||
{
|
||||
devShells.x86_64-linux.default = pkgs.mkShell {
|
||||
packages = [ pythonEnv mental_retardation ];
|
||||
shellHook = ''
|
||||
export SHELL="which fish"
|
||||
if [[ $- == *i* ]] && [ -z "$TMUX" ]; then
|
||||
tmux new-session -A -s GDS-fake-news
|
||||
fi
|
||||
'';
|
||||
};
|
||||
};
|
||||
}
|
||||
BIN
models/LIAR_baseline.model
Normal file
BIN
models/LIAR_baseline.model
Normal file
Binary file not shown.
BIN
models/baseline.model
Normal file
BIN
models/baseline.model
Normal file
Binary file not shown.
BIN
models/gradient_boosting.model
Normal file
BIN
models/gradient_boosting.model
Normal file
Binary file not shown.
BIN
models/logistic.model
Normal file
BIN
models/logistic.model
Normal file
Binary file not shown.
BIN
models/metadata_logistic.model
Normal file
BIN
models/metadata_logistic.model
Normal file
Binary file not shown.
BIN
models/not_reliable_logistic.model
Normal file
BIN
models/not_reliable_logistic.model
Normal file
Binary file not shown.
BIN
models/old/GB10K.model
Normal file
BIN
models/old/GB10K.model
Normal file
Binary file not shown.
BIN
models/old/GB1K.model
Normal file
BIN
models/old/GB1K.model
Normal file
Binary file not shown.
BIN
models/old/GB2K.model
Normal file
BIN
models/old/GB2K.model
Normal file
Binary file not shown.
BIN
models/old/GB4K.model
Normal file
BIN
models/old/GB4K.model
Normal file
Binary file not shown.
BIN
models/only_fake_logistic.model
Normal file
BIN
models/only_fake_logistic.model
Normal file
Binary file not shown.
BIN
models/svm.model
Normal file
BIN
models/svm.model
Normal file
Binary file not shown.
4
pyrightconfig.json
Normal file
4
pyrightconfig.json
Normal file
@@ -0,0 +1,4 @@
|
||||
{
|
||||
"typeCheckingMode": "strict",
|
||||
"reportMissingTypeStubs": false
|
||||
}
|
||||
117
requirements.txt
Normal file
117
requirements.txt
Normal file
@@ -0,0 +1,117 @@
|
||||
anyio==4.12.1
|
||||
argon2-cffi==25.1.0
|
||||
argon2-cffi-bindings==25.1.0
|
||||
arrow==1.4.0
|
||||
asttokens==3.0.1
|
||||
async-lru==2.1.0
|
||||
attrs==25.4.0
|
||||
babel==2.18.0
|
||||
beautifulsoup4==4.14.3
|
||||
bleach==6.3.0
|
||||
certifi==2026.1.4
|
||||
cffi==2.0.0
|
||||
charset-normalizer==3.4.4
|
||||
click==8.3.1
|
||||
comm==0.2.3
|
||||
contourpy==1.3.3
|
||||
cycler==0.12.1
|
||||
debugpy==1.8.20
|
||||
decorator==5.2.1
|
||||
defusedxml==0.7.1
|
||||
executing==2.2.1
|
||||
fastjsonschema==2.21.2
|
||||
fonttools==4.61.1
|
||||
fqdn==1.5.1
|
||||
h11==0.16.0
|
||||
httpcore==1.0.9
|
||||
httpx==0.28.1
|
||||
idna==3.11
|
||||
ipykernel==7.2.0
|
||||
ipython==9.10.0
|
||||
ipython_pygments_lexers==1.1.1
|
||||
ipywidgets==8.1.8
|
||||
isoduration==20.11.0
|
||||
jedi==0.19.2
|
||||
Jinja2==3.1.6
|
||||
joblib==1.5.3
|
||||
json5==0.13.0
|
||||
jsonpointer==3.0.0
|
||||
jsonschema==4.26.0
|
||||
jsonschema-specifications==2025.9.1
|
||||
jupyter==1.1.1
|
||||
jupyter-console==6.6.3
|
||||
jupyter-events==0.12.0
|
||||
jupyter-lsp==2.3.0
|
||||
jupyter_client==8.8.0
|
||||
jupyter_core==5.9.1
|
||||
jupyter_server==2.17.0
|
||||
jupyter_server_terminals==0.5.4
|
||||
jupyterlab==4.5.4
|
||||
jupyterlab_pygments==0.3.0
|
||||
jupyterlab_server==2.28.0
|
||||
jupyterlab_widgets==3.0.16
|
||||
kiwisolver==1.4.9
|
||||
lark==1.3.1
|
||||
MarkupSafe==3.0.3
|
||||
matplotlib==3.10.8
|
||||
matplotlib-inline==0.2.1
|
||||
mistune==3.2.0
|
||||
nbclient==0.10.4
|
||||
nbconvert==7.17.0
|
||||
nbformat==5.10.4
|
||||
nest-asyncio==1.6.0
|
||||
nltk==3.9.2
|
||||
notebook==7.5.3
|
||||
notebook_shim==0.2.4
|
||||
numpy==2.4.2
|
||||
packaging==26.0
|
||||
pandas==3.0.1
|
||||
pandas-stubs==3.0.0.260204
|
||||
pandocfilters==1.5.1
|
||||
parso==0.8.6
|
||||
pexpect==4.9.0
|
||||
pillow==12.1.1
|
||||
platformdirs==4.9.2
|
||||
prometheus_client==0.24.1
|
||||
prompt_toolkit==3.0.52
|
||||
psutil==7.2.2
|
||||
ptyprocess==0.7.0
|
||||
pure_eval==0.2.3
|
||||
pyarrow==23.0.1
|
||||
pycparser==3.0
|
||||
Pygments==2.19.2
|
||||
pyparsing==3.3.2
|
||||
python-dateutil==2.9.0.post0
|
||||
python-json-logger==4.0.0
|
||||
PyYAML==6.0.3
|
||||
pyzmq==27.1.0
|
||||
referencing==0.37.0
|
||||
regex==2026.1.15
|
||||
requests==2.32.5
|
||||
rfc3339-validator==0.1.4
|
||||
rfc3986-validator==0.1.1
|
||||
rfc3987-syntax==1.1.0
|
||||
rpds-py==0.30.0
|
||||
scikit-learn==1.8.0
|
||||
scipy==1.17.1
|
||||
Send2Trash==2.1.0
|
||||
setuptools==82.0.0
|
||||
six==1.17.0
|
||||
soupsieve==2.8.3
|
||||
stack-data==0.6.3
|
||||
terminado==0.18.1
|
||||
threadpoolctl==3.6.0
|
||||
tinycss2==1.4.0
|
||||
tornado==6.5.4
|
||||
tqdm==4.67.3
|
||||
traitlets==5.14.3
|
||||
typing_extensions==4.15.0
|
||||
tzdata==2025.3
|
||||
uri-template==1.3.0
|
||||
urllib3==2.6.3
|
||||
wcwidth==0.6.0
|
||||
webcolors==25.10.0
|
||||
webencodings==0.5.1
|
||||
websocket-client==1.9.0
|
||||
widgetsnbextension==4.0.15
|
||||
zstandard==0.25.0
|
||||
97
src/clean_data.py
Normal file
97
src/clean_data.py
Normal file
@@ -0,0 +1,97 @@
|
||||
from constants import DATASET_DIR, TEMP_DIR
|
||||
from helper import dataset_iterator
|
||||
import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
import nltk
|
||||
import re
|
||||
import shutil
|
||||
|
||||
# cleans text and returns a list of tokens.
def clean_text(
    text: str,
    remove_regex_patterns: bool = True,
    remove_stopwords: bool = True,
    remove_special_characters: bool = True,
    stemming: bool = True
) -> list[str]:
    """Normalize *text* and return a list of tokens.

    Pipeline (each step optional via its flag):
      1. lowercase + strip surrounding whitespace;
      2. replace URLs, e-mails, dates and numbers with placeholder tokens
         (<URL>, <EMAIL>, <DATE>, <NUMBER>);
      3. blank out special characters (placeholder tokens survive);
      4. tokenize, drop English stopwords, Snowball-stem everything that is
         not a placeholder token.
    """
    text = str(text).lower().strip()

    if remove_regex_patterns:
        url_pattern = r'https?://\S+|www\.\S+'
        email_pattern = r'[\w.-]+@[\w]+\.[\w]+'
        date_pattern = r'([a-z]+ \d{1,2}[a-z]?, \d{4}|\d{2,4}[-/]\d{2,4}[-/]\d{2,4})' # add more date patterns
        number_pattern = r'\d+'

        text = re.sub(url_pattern, "<URL>", text)
        text = re.sub(email_pattern, "<EMAIL>", text)
        text = re.sub(date_pattern, "<DATE>", text)
        text = re.sub(number_pattern, "<NUMBER>", text)

    if remove_special_characters:
        # NOTE(review): this is a character class, not a group -- it keeps
        # word chars plus the literal characters ' ( ? : < > )'. Looks like it
        # was meant to protect <TOKEN> placeholders; confirm intended.
        text = re.sub(r'[^\w (?:<\w+>)]', " ", text)

    tokenizer = nltk.RegexpTokenizer(r'<\w+>|\w+')
    tokens = tokenizer.tokenize(text)  # type: ignore

    if remove_stopwords:
        # FIX: was a duplicated assignment (`stopwords = stopwords = ...`).
        stopwords = nltk.corpus.stopwords.words('english')
        tokens = [token for token in tokens if token not in stopwords]  # type: ignore

    if stemming:
        stemmer = nltk.SnowballStemmer("english")
        # Placeholder tokens like <URL> are passed through unstemmed.
        tokens = [stemmer.stem(token) if not re.match(r'<\w+>', token) else token for token in tokens]  # type: ignore

    return tokens  # type: ignore
|
||||
|
||||
def clean_dataset(filename: str) -> None:
    """Tokenize the 'content' column of DATASET_DIR/filename in place.

    Streams the dataset in chunks, adds a ``tokens`` column produced by
    :func:`clean_text`, writes the result to a temp parquet file, then moves
    the temp file over the original dataset.
    """
    # FIX: the paths previously contained a literal "(unknown)" placeholder
    # (extraction artifact) and the `filename` parameter was never used.
    output_path = f"{TEMP_DIR}/{filename}"
    writer = None
    for chunk in dataset_iterator(f"{DATASET_DIR}/{filename}"):
        chunk['tokens'] = chunk['content'].apply(clean_text)

        columns_in_chunk = chunk.columns
        table = pa.Table.from_pandas(chunk[columns_in_chunk])
        if writer is None:
            # Create the writer lazily so it adopts the first chunk's schema.
            writer = pq.ParquetWriter(output_path, table.schema)
        writer.write_table(table)
    # FIX: guard against an empty dataset, where no writer was ever created
    # (previously raised AttributeError on `None.close()`).
    if writer is None:
        return
    writer.close()

    shutil.move(output_path, f"{DATASET_DIR}/{filename}")
|
||||
|
||||
def compute_vocab_reduction(filename: str) -> dict[str, float | int]:
    """Measure how much stopword removal and stemming shrink the vocabulary.

    Walks DATASET_DIR/filename chunk by chunk, cleaning each article three
    times (raw tokens, stopwords removed, stopwords removed + stemming) and
    collecting the distinct tokens of each stage.

    Returns a dict with the vocabulary size before/after each step and the
    relative reduction rates (0.0 when the preceding vocabulary is empty).
    """
    # FIX: interpolate the `filename` argument (path previously contained a
    # literal "(unknown)" placeholder and the parameter was unused).
    dataset_path = f"{DATASET_DIR}/{filename}"

    vocab_before_stopwords: set[str] = set()
    vocab_after_stopwords: set[str] = set()
    vocab_after_stemming: set[str] = set()

    for chunk in dataset_iterator(dataset_path):
        for text in chunk["content"]:
            vocab_before_stopwords.update(clean_text(text, remove_stopwords=False, stemming=False))
            vocab_after_stopwords.update(clean_text(text, remove_stopwords=True, stemming=False))
            vocab_after_stemming.update(clean_text(text, remove_stopwords=True, stemming=True))

    before_stop_size = len(vocab_before_stopwords)
    after_stop_size = len(vocab_after_stopwords)
    # Stemming operates on the stopword-filtered vocabulary.
    before_stem_size = after_stop_size
    after_stem_size = len(vocab_after_stemming)

    stopwords_reduction_rate = (
        (before_stop_size - after_stop_size) / before_stop_size if before_stop_size else 0.0
    )
    stemming_reduction_rate = (
        (before_stem_size - after_stem_size) / before_stem_size if before_stem_size else 0.0
    )

    return {
        "vocab_size_before_stopwords": before_stop_size,
        "vocab_size_after_stopwords": after_stop_size,
        "stopwords_reduction_rate": stopwords_reduction_rate,
        "vocab_size_before_stemming": before_stem_size,
        "vocab_size_after_stemming": after_stem_size,
        "stemming_reduction_rate": stemming_reduction_rate,
    }
|
||||
14
src/constants.py
Normal file
14
src/constants.py
Normal file
@@ -0,0 +1,14 @@
|
||||
import os

# Base directories are resolved relative to src/ -- scripts must be run from
# inside the src directory for these paths to point at the repo's data/models.
DATA_DIR = os.path.abspath("../data")
MODEL_DIR = os.path.abspath("../models")
DATASET_DIR = f"{DATA_DIR}/datasets"       # raw + cleaned datasets
TRAINING_DIR = f"{DATA_DIR}/training"      # training split
VALIDATION_DIR = f"{DATA_DIR}/validation"  # validation split
TESTING_DIR = f"{DATA_DIR}/testing"        # testing split
TEMP_DIR = f"{DATA_DIR}/temp"              # scratch space for streamed writes
ORIGINAL_DATASET_FILES = ["news_sample.csv", "995,000_rows.csv"]
DATASET_FILES = ["news_sample.parquet", "995,000_rows.parquet"]

CHUNK_SIZE = 10000 # how many rows to work on at time, instead of loading the entire dataset into memory.
MAX_ROWS = -1 # only work with MAX_ROWS rows so testing things out isnt crazy slow. Set to -1 for infinite.
|
||||
60
src/helper.py
Normal file
60
src/helper.py
Normal file
@@ -0,0 +1,60 @@
|
||||
from labels import Label
|
||||
from constants import CHUNK_SIZE, MAX_ROWS
|
||||
from typing import Iterator, cast
|
||||
import pyarrow.parquet as pq
|
||||
import pandas as pd
|
||||
|
||||
def default_labelling(article_type: str) -> Label:
    """Default scheme: reliable/political/clickbait are REAL, all else FAKE."""
    real_types = ("reliable", "political", "clickbait")
    return Label.REAL if article_type in real_types else Label.FAKE
|
||||
|
||||
def only_fake_labelling(article_type: str) -> Label:
    """Strict scheme: only articles typed exactly "fake" are FAKE."""
    return Label.FAKE if article_type == "fake" else Label.REAL
|
||||
|
||||
def not_reliable_labelling(article_type: str) -> Label:
    """Conservative scheme: only "reliable" articles are REAL."""
    return Label.REAL if article_type == "reliable" else Label.FAKE
|
||||
|
||||
def LIAR_labelling(article_type: str) -> Label:
    """Map LIAR truthfulness ratings to binary labels (any *-true rating is REAL)."""
    real_ratings = ("true", "half-true", "barely-true", "mostly-true")
    return Label.REAL if article_type in real_ratings else Label.FAKE
|
||||
|
||||
# Deprecated, don't use, just use pd.read_parquet instead
def dataset_iterator(dataset_file:str, columns:list[str] | None = None) -> Iterator[pd.DataFrame]:
    """Yield *dataset_file* (parquet) as pandas DataFrame chunks.

    Reads batches of CHUNK_SIZE rows, optionally restricted to *columns*.
    Stops once more than MAX_ROWS rows have been read (only when MAX_ROWS > 0).
    NOTE(review): the batch that crosses the MAX_ROWS limit is dropped rather
    than truncated, so slightly fewer than MAX_ROWS rows may be yielded.
    """
    pq_file = pq.ParquetFile(dataset_file)
    rows_read = 0
    for batch in pq_file.iter_batches(batch_size=CHUNK_SIZE, columns=columns): # type: ignore
        rows_read += len(batch) # type: ignore
        if rows_read > MAX_ROWS and MAX_ROWS > 0:
            return
        # cast to ignore type warnings.
        yield cast(pd.DataFrame, batch.to_pandas()) # type: ignore
|
||||
|
||||
def csv_to_parquet(input_path: str, output_path: str) -> None:
    """Convert a CSV file to Parquet.

    Reads the whole CSV into memory in one go; low_memory=False disables
    chunked dtype inference so mixed-type columns (e.g. a stray string in
    the id column) are handled consistently.
    """
    pd.read_csv(input_path, low_memory=False).to_parquet(output_path)
|
||||
|
||||
def get_time_boundaries(filename: str):  # type: ignore
    """Return (train_cut, val_cut): the 80th- and 90th-percentile 'scraped_at'
    timestamps, used to split the dataset chronologically 80/10/10.

    Only the timestamp column is loaded to save RAM. Unparseable dates become
    NaT (errors='coerce') and are ignored by quantile().
    """
    df_dates = pq.read_table(filename, columns=['scraped_at']).to_pandas()  # type: ignore
    scraped = pd.to_datetime(df_dates['scraped_at'], format='ISO8601', errors='coerce', utc=True)  # type: ignore
    # Series.quantile neither requires pre-sorted data nor counts NaT values,
    # so the explicit sort_values() pass of the original was unnecessary work.
    train_cut = scraped.quantile(0.80)  # type: ignore
    val_cut = scraped.quantile(0.90)  # type: ignore
    return train_cut, val_cut  # type: ignore
|
||||
|
||||
5
src/labels.py
Normal file
5
src/labels.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from enum import Enum
|
||||
|
||||
class Label(Enum):
    """Binary classification label for an article."""
    REAL = 0
    FAKE = 1
|
||||
64
src/main.py
Executable file
64
src/main.py
Executable file
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import argparse
|
||||
from models.svm import SVM_model
|
||||
from models.gradient_boosting import Gradient_boosting_model
|
||||
from models.logistic_regression import Logistic_model
|
||||
from models.baseline import Baseline_model
|
||||
from helper import default_labelling, not_reliable_labelling, only_fake_labelling, LIAR_labelling
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: build the requested model, then train or test it."""
    parser = argparse.ArgumentParser(
        prog="Fakenews detector",
        description="Train and test models",
        usage="The following is an example of training a logistic regression model on news_sample.parquet:\n"
        + "python main.py --model_type logistic --model_file logistic_news_sample.model --data_file news_sample.parquet --train",
    )
    parser.add_argument("--train", action="store_true", help="Whether model should be trained, if not set it will be tested instead")
    parser.add_argument("--validate", action="store_true", help="Whether to use validation set when testing/validating")
    parser.add_argument("--model_type", "-t", required=True, choices=["baseline", "logistic", "svm", "gradient_boosting"], help="The type of model: baseline, logistic, ...")
    parser.add_argument("--model_file", "-f", required=True, help="The model file to save to when training, or load from when testing")
    parser.add_argument("--data_file", "-d", required=True, help="The datafile used when training or testing")
    parser.add_argument("--label_translator", "-l", required=False, default="", help="The translator function used by the model, such as \"not_reliable\", that only considers 'reliable' tagged news Real, ignored if not using --train.")
    # default=[] (not ""): the value is iterated below, so it must be list-like.
    parser.add_argument("--hyperparameters", "-p", required=False, nargs="+", default=[], help="The hyperparameters used when training the model, written like c=1")

    args = parser.parse_args()

    # Pick the label translator; a LIAR data file always forces LIAR labelling
    # and the canonical LIAR file name.
    label_translator = default_labelling
    if "not_reliable" in args.label_translator.lower():
        label_translator = not_reliable_labelling
    if "only_fake" in args.label_translator.lower():
        label_translator = only_fake_labelling
    if "liar" in args.data_file.lower():
        label_translator = LIAR_labelling
        args.data_file = "LIAR.parquet"

    # Dispatch table instead of an if/elif chain; unknown types cannot reach
    # here because argparse restricts --model_type to these choices.
    model_classes = {
        "logistic": Logistic_model,
        "svm": SVM_model,
        "gradient_boosting": Gradient_boosting_model,
        "baseline": Baseline_model,
    }
    model = model_classes[args.model_type](label_translator=label_translator)

    if args.train:
        # Parse "key=value" pairs into a dict of float hyperparameters.
        hyperparameters: dict[str, float] = {}
        for parameter in args.hyperparameters:
            key, value = parameter.split("=")
            hyperparameters[key] = float(value)

        model.train(args.data_file, hyperparameters)
        model.save(args.model_file)
    else:
        model.load(args.model_file)
        if "liar" in args.data_file.lower():
            # Validation is forced off for LIAR (matches original behavior).
            model.test("LIAR.parquet", validate=False)
        else:
            model.test(args.data_file, args.validate)


if __name__ == "__main__":
    main()
|
||||
6
src/models/Untitled.ipynb
Normal file
6
src/models/Untitled.ipynb
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"cells": [],
|
||||
"metadata": {},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
6
src/models/Untitled1.ipynb
Normal file
6
src/models/Untitled1.ipynb
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"cells": [],
|
||||
"metadata": {},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
846
src/models/Untitled2.ipynb
Normal file
846
src/models/Untitled2.ipynb
Normal file
@@ -0,0 +1,846 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "3ed30f2e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ModuleNotFoundError",
|
||||
"evalue": "No module named 'torch'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorch\u001b[39;00m \n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorch\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mnn\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnn\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorch\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mnn\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mfunctional\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mF\u001b[39;00m \n",
|
||||
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'torch'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch \n",
|
||||
"import torch.nn as nn\n",
|
||||
"import torch.nn.functional as F \n",
|
||||
"import pandas as pd\n",
|
||||
"from torch.utils.data import Dataset, DataLoader\n",
|
||||
"from collections import Counter\n",
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"sys.path.append(os.path.join(os.getcwd(), '../'))\n",
|
||||
"from helper import default_labelling\n",
|
||||
"from sklearn.metrics import f1_score\n",
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "42edceb8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"label_map = {\n",
|
||||
" 'Label.FAKE': 0,\n",
|
||||
" 'Label.REAL': 1}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0aa1a427",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Pipelining process"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c7730d65",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df = pd.read_parquet(\"../../data/training/995,000_rows.parquet\", columns=['tokens','type'])\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"df['label'] = df['type'].apply(default_labelling).astype(str)\n",
|
||||
"df['label'] = df['label'].map(label_map).astype(int)\n",
|
||||
"df = df.drop(columns=['type'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c31caf06",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_test = pd.read_parquet(\"../../data/testing/995,000_rows.parquet\", columns=['tokens','type'])\n",
|
||||
"\n",
|
||||
"df_test['label'] = df_test['type'].apply(default_labelling).astype(str)\n",
|
||||
"df_test['label'] = df_test['label'].map(label_map).astype(int)\n",
|
||||
"df_test = df_test.drop(columns=['type'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5c0c93ab",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_val = pd.read_parquet(\"../../data/validation/995,000_rows.parquet\", columns=['tokens','type'])\n",
|
||||
"df_val['label'] = df_val['type'].apply(default_labelling).astype(str)\n",
|
||||
"df_val['label'] = df_val['label'].map(label_map).astype(int)\n",
|
||||
"df_val = df_val.drop(columns=['type'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "19188ef7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# print(\"Loading Parquet file...\")\n",
|
||||
"\n",
|
||||
"# # Check the total number of rows (articles)\n",
|
||||
"# print(f\"Total rows in the raw Parquet file: {len(df)}\")\n",
|
||||
"\n",
|
||||
"# # Look at the first few rows to make sure the data looks correct\n",
|
||||
"# print(\"\\n--- First 3 Rows ---\")\n",
|
||||
"# print(df.head(3))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fa455147",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# count how many tokens we have in the corpus \n",
|
||||
"word_counts = Counter()\n",
|
||||
"for x in df['tokens']:\n",
|
||||
" word_counts.update(x)\n",
|
||||
" \n",
|
||||
"# Keep the top 50,000 words. \n",
|
||||
"# Index 0 is for <PAD> (padding), Index 1 is for <UNK> (unknown words)\n",
|
||||
"vocab = {\"<PAD>\": 0, \"<UNK>\": 1}\n",
|
||||
"for idx, (word, count) in enumerate(word_counts.most_common(50000), start=2):\n",
|
||||
" vocab[word] = idx\n",
|
||||
"\n",
|
||||
"print(f\"Vocabulary built with {len(vocab)} words.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b9ba0021",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create a Custom PyTorch Datase\n",
|
||||
"\n",
|
||||
"# a wrapper for the data that PyTorch knows how to talk to.\n",
|
||||
"class FakeNewsDataset(Dataset):\n",
|
||||
" def __init__(self, dataframe, vocab, max_length=256):\n",
|
||||
" self.dataframe = dataframe\n",
|
||||
" self.vocab = vocab\n",
|
||||
" self.max_length = max_length\n",
|
||||
"\n",
|
||||
"# Tells PyTorch how many articles we have\n",
|
||||
"#PyTorch calls this internally to know when to stop fetching data.\n",
|
||||
" def __len__(self):\n",
|
||||
" return len(self.dataframe)\n",
|
||||
" \n",
|
||||
" def __getitem__(self, idx):\n",
|
||||
" # Grabs one article and its label at a time\n",
|
||||
" tokens = self.dataframe.iloc[idx]['tokens']\n",
|
||||
" label = self.dataframe.iloc[idx]['label']\n",
|
||||
"\n",
|
||||
" # Convert text tokens to Integer IDs\n",
|
||||
" article_ids = [self.vocab.get(word, 1) for word in tokens]\n",
|
||||
"\n",
|
||||
" # Truncate or Pad the article so they are all exactly 'max_length' long\n",
|
||||
" if len(article_ids) > self.max_length:\n",
|
||||
" article_ids = article_ids[:self.max_length]\n",
|
||||
" else:\n",
|
||||
" padding = [0] * (self.max_length - len(article_ids))\n",
|
||||
" article_ids.extend(padding)\n",
|
||||
" \n",
|
||||
" # Return as PyTorch tensors\n",
|
||||
" return torch.tensor(article_ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5f3f4096",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## Prepare the DataLoader \n",
|
||||
"# Wrap The dataframe in the Dataset class\n",
|
||||
"\n",
|
||||
"# The DataLoader feeds the data to the model in batches (e.g., 64 articles at a time)\n",
|
||||
"# This prevents the computer from running out of RAM!\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"my_train_dataset = FakeNewsDataset(dataframe=df, vocab=vocab, max_length=256)\n",
|
||||
"# Shuffle is true for training so the data keeps getting shuffled when trained and the model does not memorise the data\n",
|
||||
"train_dataloader = DataLoader(my_train_dataset, batch_size=64, shuffle=True,num_workers=4, # Start with 4; if CPU stays cool, try 6\n",
|
||||
"pin_memory=True, # Essential for fast data transfer\n",
|
||||
"prefetch_factor=2)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"val_data = FakeNewsDataset(dataframe=df_val, vocab=vocab, max_length=256)\n",
|
||||
"val_dataloader = DataLoader(val_data, batch_size=64, shuffle=False)\n",
|
||||
"\n",
|
||||
"test_data = FakeNewsDataset(dataframe=df_test, vocab=vocab, max_length=256)\n",
|
||||
"test_dataloader = DataLoader(test_data, batch_size=64, shuffle=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fd4f08a6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Checking if the data conversion works"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9bcbcf9b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# features, labels = next(iter(train_dataloader))\n",
|
||||
"# # 2. Check the shapes (the dimensions of your tensors)\n",
|
||||
"# print(\"--- Tensor Shapes ---\")\n",
|
||||
"# print(f\"Features shape: {features.shape}\") \n",
|
||||
"# print(f\"Labels shape: {labels.shape}\") \n",
|
||||
"\n",
|
||||
"# # 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)\n",
|
||||
"# print(\"\\n--- Data Types ---\")\n",
|
||||
"# print(f\"Features dtype: {features.dtype}\")\n",
|
||||
"# print(f\"Labels dtype: {labels.dtype}\")\n",
|
||||
"\n",
|
||||
"# # 4. Peek at the actual data for the very first article in this batch\n",
|
||||
"# print(\"\\n--- First Article Peek ---\")\n",
|
||||
"# print(f\"Label: {labels[0].item()} (0 = Real, 1 = Fake)\")\n",
|
||||
"# print(f\"Tokens (first 20 IDs): {features[0][:20]}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b70e45ac",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class BaseModel(nn.Module):\n",
|
||||
" def __init__(self, vocab_size, embed_dim=32, h1=256, h2=128, out_features=2):\n",
|
||||
" super().__init__()\n",
|
||||
" \n",
|
||||
" # The Embedding Layer: Turns word IDs into rich numerical vectors\n",
|
||||
" self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)\n",
|
||||
" \n",
|
||||
" # The Linear Layers: Learn the patterns to decide Fake vs. Real\n",
|
||||
" self.fc1 = nn.Linear(embed_dim, h1)\n",
|
||||
" self.fc2 = nn.Linear(h1, h2)\n",
|
||||
" self.out = nn.Linear(h2, out_features)\n",
|
||||
" \n",
|
||||
" def forward(self, x):\n",
|
||||
" \n",
|
||||
" # x starts as integers: shape (batch_size, sequence_length) -> e.g., (64, 256)\n",
|
||||
" # Pass through embedding\n",
|
||||
" x = self.embedding(x) \n",
|
||||
" # Average the word vectors to get one single vector for the whole article\n",
|
||||
" x = x.mean(dim=1) \n",
|
||||
" \n",
|
||||
" # Pass through hidden layers with ReLU activation\n",
|
||||
" x = F.relu(self.fc1(x))\n",
|
||||
" x = F.relu(self.fc2(x))\n",
|
||||
" \n",
|
||||
" # Output layer (gives us the raw scores for 'Real' and 'Fake')\n",
|
||||
" x = self.out(x)\n",
|
||||
" return x\n",
|
||||
"model_basic =BaseModel(vocab_size=len((vocab)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "efa6c453",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"'Advanced'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "52cb9377",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class advanced_model(nn.Module):\n",
|
||||
" def __init__(self, vocab_size, embed_dim=64, hidden_dim=128,num_layer = 2, out_features=2):\n",
|
||||
" super().__init__()\n",
|
||||
" \n",
|
||||
" # 1. The Embedding Layer (Same as before)\n",
|
||||
" self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)\n",
|
||||
" \n",
|
||||
" # # 2. The GRU Layer (Extra layer)\n",
|
||||
" # batch_first=True is required because our DataLoader outputs (batch_size, sequence_length) \n",
|
||||
" self.gru = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, num_layers=2,batch_first=True,bidirectional=True, \n",
|
||||
" dropout=0.3)\n",
|
||||
" \n",
|
||||
" # 3. The Final Output Layer\n",
|
||||
" # connect the GRU's memory (hidden_dim) directly to our Real/Fake outputs\n",
|
||||
" self.out = nn.Linear(hidden_dim, out_features)\n",
|
||||
" self.fc = nn.Linear(hidden_dim * 2, out_features)\n",
|
||||
" def forward(self, x):\n",
|
||||
" # x shape: (batch_size, sequence_length) -> e.g., (64, 256)\n",
|
||||
" \n",
|
||||
" #Get the word embeddings\n",
|
||||
" x = self.embedding(x) \n",
|
||||
" # x shape becomes: (64, 256, 32)\n",
|
||||
" \n",
|
||||
" # Pass the embeddings into the GRU\n",
|
||||
" # A GRU outputs two things: the output at every single word, AND its final memory state.\n",
|
||||
" # We use '_' to ignore the step-by-step output, and save 'hidden_state'.\n",
|
||||
" _, hidden = self.gru(x)\n",
|
||||
" \n",
|
||||
" # 4. Extract and Concatenate the final forward and backward states\n",
|
||||
" # hidden[-2] is the last forward state, hidden[-1] is the last backward state\n",
|
||||
" out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)\n",
|
||||
" \n",
|
||||
" return self.fc(out)\n",
|
||||
" \n",
|
||||
"# Initialize\n",
|
||||
"model_adv = advanced_model(vocab_size=len(vocab))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "31b581d0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Training"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a8e1f849",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ae976afb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def evaluate_performance(model, dataloader, device):\n",
|
||||
" model.eval() # Put model in evaluation mode\n",
|
||||
" \n",
|
||||
" all_predictions = []\n",
|
||||
" all_true_labels = []\n",
|
||||
" \n",
|
||||
" # Turn off gradient tracking to save memory\n",
|
||||
" with torch.no_grad():\n",
|
||||
" for features, labels in dataloader:\n",
|
||||
" features = features.to(device)\n",
|
||||
" labels = labels.to(device)\n",
|
||||
" \n",
|
||||
" # Get model scores\n",
|
||||
" scores = model(features)\n",
|
||||
" \n",
|
||||
" # Find the predicted class (0 or 1)\n",
|
||||
" _, predictions = torch.max(scores,1)\n",
|
||||
" \n",
|
||||
" # Save predictions and actual labels to lists\n",
|
||||
" # all_predictions.extend(predictions.cpu().tolist())\n",
|
||||
" # all_true_labels.extend(labels.cpu().tolist())\n",
|
||||
" all_predictions.extend(predictions.cpu().numpy().flatten().tolist())\n",
|
||||
" all_true_labels.extend(labels.cpu().numpy().flatten().tolist())\n",
|
||||
" \n",
|
||||
" all_predictions = np.array(all_predictions)\n",
|
||||
" all_true_labels = np.array(all_true_labels)\n",
|
||||
" \n",
|
||||
" accuracy = (all_predictions == all_true_labels).mean() * 100\n",
|
||||
" \n",
|
||||
" # 4. Calculate F1 Score\n",
|
||||
" # average='macro' is best for your report to show you care about both classes equally\n",
|
||||
" f1 = f1_score(all_true_labels, all_predictions, average='macro')\n",
|
||||
" model.train() # Return model to training mode just in case\n",
|
||||
" return accuracy, f1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "65e26f88",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def train_model(model, train_loader, val_loader, device, epochs=5, lr=0.001):\n",
|
||||
" model = model.to(device)\n",
|
||||
" criterion = nn.CrossEntropyLoss()\n",
|
||||
" optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n",
|
||||
" \n",
|
||||
" # Dictionary to store results for your report\n",
|
||||
" history = {'train_loss': [], 'val_acc': [], 'val_f1': []}\n",
|
||||
"\n",
|
||||
" print(f\"Training {model.__class__.__name__} on {device}...\")\n",
|
||||
"\n",
|
||||
" for epoch in range(epochs):\n",
|
||||
" model.train()\n",
|
||||
" total_loss = 0\n",
|
||||
" \n",
|
||||
" for batch_idx, (features, labels) in enumerate(train_loader):\n",
|
||||
" features, labels = features.to(device), labels.to(device)\n",
|
||||
" \n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" predictions = model(features)\n",
|
||||
" loss = criterion(predictions, labels)\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
" \n",
|
||||
" total_loss += loss.item()\n",
|
||||
" \n",
|
||||
" avg_loss = total_loss / len(train_loader)\n",
|
||||
" \n",
|
||||
" # After each epoch, evaluate on validation set\n",
|
||||
" val_acc, val_f1 = evaluate_performance(model, val_loader, device)\n",
|
||||
" \n",
|
||||
" # Save results to our history dictionary\n",
|
||||
" history['train_loss'].append(avg_loss)\n",
|
||||
" history['val_acc'].append(val_acc)\n",
|
||||
" history['val_f1'].append(val_f1)\n",
|
||||
" \n",
|
||||
" print(f\"\\n Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f} \\n Val Acc: {val_acc:.2f}% \\n Val F1: {val_f1:.4f}\")\n",
|
||||
"\n",
|
||||
" return history # Return the results so we can plot them later"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3acf0f2b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_995_basic =train_model (model_basic, train_dataloader, val_dataloader, device, epochs =7 )\n",
|
||||
"print(train_995_basic )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9c0f7f65",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_995_adv =train_model (model_adv, train_dataloader, val_dataloader, device, epochs =7 )\n",
|
||||
"print(train_995_adv )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a1e10032",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "12959462",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9fb31c02",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Evaluation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2630d40a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Basic model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "73c388e7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# # 1. The Evaluation Function\n",
|
||||
"# def evaluate_performance(model, dataloader, device):\n",
|
||||
"# model.eval() # Put model in evaluation mode\n",
|
||||
" \n",
|
||||
"# all_predictions = []\n",
|
||||
"# all_true_labels = []\n",
|
||||
" \n",
|
||||
"# # Turn off gradient tracking to save memory\n",
|
||||
"# with torch.no_grad():\n",
|
||||
"# for features, labels in dataloader:\n",
|
||||
"# features = features.to(device)\n",
|
||||
"# labels = labels.to(device)\n",
|
||||
" \n",
|
||||
"# # Get model scores\n",
|
||||
"# scores = model(features)\n",
|
||||
" \n",
|
||||
"# # Find the predicted class (0 or 1)\n",
|
||||
"# _, predictions = torch.max(scores,1)\n",
|
||||
" \n",
|
||||
"# # Save predictions and actual labels to lists\n",
|
||||
"# # all_predictions.extend(predictions.cpu().tolist())\n",
|
||||
"# # all_true_labels.extend(labels.cpu().tolist())\n",
|
||||
"# all_predictions.extend(predictions.cpu().numpy().flatten().tolist())\n",
|
||||
"# all_true_labels.extend(labels.cpu().numpy().flatten().tolist())\n",
|
||||
" \n",
|
||||
"# all_predictions = np.array(all_predictions)\n",
|
||||
"# all_true_labels = np.array(all_true_labels)\n",
|
||||
" \n",
|
||||
"# accuracy = (all_predictions == all_true_labels).mean() * 100\n",
|
||||
" \n",
|
||||
"# # 4. Calculate F1 Score\n",
|
||||
"# # average='macro' is best for your report to show you care about both classes equally\n",
|
||||
"# f1 = f1_score(all_true_labels, all_predictions, average='macro')\n",
|
||||
"# model.train() # Return model to training mode just in case\n",
|
||||
"# return accuracy, f1\n",
|
||||
"# # # Change me based on the model\n",
|
||||
"\n",
|
||||
"# # model = model_basic.to(device)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# # print(f\"Training on: {device}\")\n",
|
||||
"\n",
|
||||
"# # # 2. Setup Loss and Optimizer\n",
|
||||
"# # # CrossEntropyLoss is the standard for classification tasks\n",
|
||||
"# # criterion = nn.CrossEntropyLoss() \n",
|
||||
"# # # Adam is a very reliable, fast optimizer\n",
|
||||
"# # optimizer = torch.optim.Adam(model.parameters(), lr=0.001) \n",
|
||||
"\n",
|
||||
"# # # 3. The Training Loop\n",
|
||||
"# # epochs = 7# Start with a small number of passes through the whole dataset\n",
|
||||
"\n",
|
||||
"# # for epoch in range(epochs):\n",
|
||||
"# # model.train() # Tell the model it is in training mode\n",
|
||||
"# # total_loss = 0\n",
|
||||
" \n",
|
||||
"# # # Loop through our batches of 64 articles\n",
|
||||
"# # for batch_idx, (features, labels) in enumerate(train_dataloader):\n",
|
||||
" \n",
|
||||
"# # # Move data to the same device as the model (GPU/CPU)\n",
|
||||
"# # features = features.to(device)\n",
|
||||
"# # labels = labels.to(device)\n",
|
||||
" \n",
|
||||
"# # # Step A: Reset the optimizer's gradients\n",
|
||||
"# # optimizer.zero_grad()\n",
|
||||
" \n",
|
||||
"# # # Step B: Forward Pass (Have the model guess Real or Fake)\n",
|
||||
"# # predictions = model(features)\n",
|
||||
" \n",
|
||||
"# # # Step C: Calculate Loss (How wrong were the guesses?)\n",
|
||||
"# # loss = criterion(predictions, labels)\n",
|
||||
" \n",
|
||||
"# # # Step D: Backward Pass (Calculate how to fix the math)\n",
|
||||
"# # loss.backward()\n",
|
||||
" \n",
|
||||
"# # # Step E: Optimize (Actually apply the fixes to the model's weights)\n",
|
||||
"# # optimizer.step()\n",
|
||||
" \n",
|
||||
"# # total_loss += loss.item()\n",
|
||||
" \n",
|
||||
"# # # Print an update every 100 batches so we know it's working\n",
|
||||
"# # if batch_idx % 100 == 0:\n",
|
||||
"# # print(f\"Epoch [{epoch+1}/{epochs}] | Batch {batch_idx} | Loss: {loss.item():.4f}\")\n",
|
||||
" \n",
|
||||
"# # # Print the average loss at the end of each epoch\n",
|
||||
"# # avg_loss = total_loss / len(train_dataloader)\n",
|
||||
"# # print(f\"--- End of Epoch {epoch+1} | Average Loss: {avg_loss:.4f} ---\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "09b0ce98",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Advanced model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b2ca196d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# # 1. The Evaluation Function\n",
|
||||
"# def evaluate_performance(model_adv, dataloader, device):\n",
|
||||
"# model_adv.eval() # Put model in evaluation mode\n",
|
||||
" \n",
|
||||
"# all_predictions = []\n",
|
||||
"# all_true_labels = []\n",
|
||||
" \n",
|
||||
"# # Turn off gradient tracking to save memory\n",
|
||||
"# with torch.no_grad():\n",
|
||||
"# for features, labels in dataloader:\n",
|
||||
"# features = features.to(device)\n",
|
||||
"# labels = labels.to(device)\n",
|
||||
" \n",
|
||||
"# # Get model scores\n",
|
||||
"# scores = model_adv(features)\n",
|
||||
" \n",
|
||||
"# # Find the predicted class (0 or 1)\n",
|
||||
"# _, predictions = scores.max(1)\n",
|
||||
" \n",
|
||||
"# # Save predictions and actual labels to lists\n",
|
||||
"# all_predictions.extend(predictions.cpu().tolist())\n",
|
||||
"# all_true_labels.extend(labels.cpu().tolist())\n",
|
||||
" \n",
|
||||
"# # Calculate Accuracy\n",
|
||||
"# correct_guesses = sum(p == t for p, t in zip(all_predictions, all_true_labels))\n",
|
||||
"# accuracy = (correct_guesses / len(all_true_labels)) * 100\n",
|
||||
" \n",
|
||||
"# # Calculate F1 Score\n",
|
||||
"# f1 = f1_score(all_true_labels, all_predictions, average='macro')\n",
|
||||
" \n",
|
||||
"# model_adv.train() # Return model to training mode just in case\n",
|
||||
"# return accuracy, f1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5835388c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c6ca6771",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"Basic model \")\n",
|
||||
"print(\" Validation \")\n",
|
||||
"val_acc995, val_f1_995 = evaluate_performance(model,val_dataloader, device)\n",
|
||||
"print(f\"Validation Accuracy: {val_acc995:.2f}%\")\n",
|
||||
"print(f\"Validation F1 Score: {val_f1_995:.4f}\")\n",
|
||||
"\n",
|
||||
"print(\"\\n Testing Phase \")\n",
|
||||
"test_acc995, test_f1_995 = evaluate_performance(model, test_dataloader, device)\n",
|
||||
"print(f\"Test Accuracy: {test_acc995:.2f}%\")\n",
|
||||
"print(f\"Test F1 Score: {test_f1_995:.4f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e206d094",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\" GRU model \")\n",
|
||||
"print(\" Validation \")\n",
|
||||
"adv_val_acc995, val_f1995 = evaluate_performance(model_adv,val_dataloader, device)\n",
|
||||
"print(f\"Validation Accuracy: {adv_val_acc995:.2f}%\")\n",
|
||||
"print(f\"Validation F1 Score: {val_f1_995:.4f}\")\n",
|
||||
"\n",
|
||||
"print(\"\\n Testing \")\n",
|
||||
"test_acc, test_f1 = evaluate_performance(model_adv, test_dataloader, device)\n",
|
||||
"print(f\"Test Accuracy: {test_acc955:.2f}%\")\n",
|
||||
"print(f\"Test F1 Score: {test_f1:.4f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f6a4ae72",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Liar data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fc7b8dac",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from helper import LIAR_labelling\n",
|
||||
"\n",
|
||||
"f\"../../data/training/LIAR.parquet\"\n",
|
||||
"df_LIAR = pd.read_parquet(\"../../data/testing/LIAR.parquet\",columns=['tokens','type'])\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"df_LIAR['label'] = df_LIAR['type'].apply(LIAR_labelling).astype(str)\n",
|
||||
"df_LIAR['label'] = df_LIAR['label'].map(label_map).astype(int)\n",
|
||||
"df_LIAR = df_LIAR.drop(columns=['type'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f73f6f84",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_LIAR.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9a76196e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# count how many tokens we have in the corpus \n",
|
||||
"word_counts = Counter()\n",
|
||||
"for x in df_LIAR['tokens']:\n",
|
||||
" word_counts.update(x)\n",
|
||||
" \n",
|
||||
"# Keep the top 50,000 words. \n",
|
||||
"# Index 0 is for <PAD> (padding), Index 1 is for <UNK> (unknown words)\n",
|
||||
"vocab = {\"<PAD>\": 0, \"<UNK>\": 1}\n",
|
||||
"for idx, (word, count) in enumerate(word_counts.most_common(50000), start=2):\n",
|
||||
" vocab[word] = idx\n",
|
||||
"\n",
|
||||
"print(f\"Vocabulary built with {len(vocab)} words.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "39dbe869",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"LR_DATA = FakeNewsDataset(dataframe=df_LIAR, vocab=vocab, max_length=256)\n",
|
||||
"LR_dataloader = DataLoader(LR_DATA, batch_size=32, shuffle=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ccbc7885",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"features, labels = next(iter(LR_dataloader))\n",
|
||||
"# 2. Check the shapes (the dimensions of your tensors)\n",
|
||||
"print(\"--- Tensor Shapes ---\")\n",
|
||||
"print(f\"Features shape: {features.shape}\") \n",
|
||||
"print(f\"Labels shape: {labels.shape}\") \n",
|
||||
"\n",
|
||||
"# 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)\n",
|
||||
"print(\"\\n--- Data Types ---\")\n",
|
||||
"print(f\"Features dtype: {features.dtype}\")\n",
|
||||
"print(f\"Labels dtype: {labels.dtype}\")\n",
|
||||
"\n",
|
||||
"# 4. Peek at the actual data for the very first article in this batch\n",
|
||||
"print(\"\\n--- First Article Peek ---\")\n",
|
||||
"print(f\"Label: {labels[0].item()} (0 = Real, 1 = Fake)\")\n",
|
||||
"print(f\"Tokens (first 20 IDs): {features[0][:20]}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4698cd06",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# # 1. Check a single sample from the Dataset directly\n",
|
||||
"# single_features, single_label = LR_DATA[0]\n",
|
||||
"# print(f\"Single Sample - Features: {single_features.shape}, Label: {single_label.shape}\")\n",
|
||||
"\n",
|
||||
"# # 2. Check the DataLoader batch\n",
|
||||
"# batch_features, batch_labels = next(iter(LR_dataloader))\n",
|
||||
"# # print(f\"Batch - Features: {batch_features.shape}, Labels: {batch_labels.shape}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ed9c57c2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"evaluate_performance(model_adv,LR_dataloader,device)\n",
|
||||
"\n",
|
||||
"print(\"\\n--- 2. Testing Advanced model ---\")\n",
|
||||
"test_acc, test_f1 = evaluate_performance(model_adv, LR_dataloader, device)\n",
|
||||
"print(f\"Test Accuracy: {test_acc:.2f}%\")\n",
|
||||
"print(f\"Test F1 Score: {test_f1:.4f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "74127f71",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"\\n--- 2. Testing BASE-Model ---\")\n",
|
||||
"test_acc, test_f1 = evaluate_performance(model, LR_dataloader, device)\n",
|
||||
"print(f\"Test Accuracy: {test_acc:.2f}%\")\n",
|
||||
"print(f\"Test F1 Score: {test_f1:.4f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "33c54c0e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
51
src/models/baseline.py
Normal file
51
src/models/baseline.py
Normal file
@@ -0,0 +1,51 @@
|
||||
import pickle
|
||||
from typing import override, Callable
|
||||
from constants import TRAINING_DIR, MODEL_DIR
|
||||
from labels import Label
|
||||
from models.model import Model
|
||||
from helper import dataset_iterator, default_labelling
|
||||
import pandas as pd
|
||||
from random import random
|
||||
|
||||
class Baseline_model(Model):
    """Trivial baseline: guesses FAKE with the probability seen in training.

    The only learned parameter is the fraction of FAKE articles in the
    training set; classification samples from that Bernoulli distribution.
    """

    def __init__(self, model_filename:str="", label_translator: Callable[[str], Label] = default_labelling) -> None:
        # P(article is FAKE); overwritten by train() or load().
        self.fake_probability = 0
        super().__init__(model_filename, label_translator)

    @override
    def train(self, training_dataset:str, hyperparameters:dict[str, float]={}) -> None:
        """Estimate P(FAKE) from the 'type' column of the training parquet.

        `hyperparameters` is accepted for interface compatibility but unused.
        """
        fake_amount = 0
        real_amount = 0
        total_amount = 0

        # Stream the dataset in chunks so large parquets fit in memory.
        for chunk in dataset_iterator(f"{TRAINING_DIR}/{training_dataset}", columns=['type']):
            chunk_fake_amount = (chunk['type'].map(self.label_translator) == Label.FAKE).sum()

            fake_amount += chunk_fake_amount
            real_amount += len(chunk) - chunk_fake_amount
            total_amount += len(chunk)

        self.fake_probability = fake_amount/total_amount

    @override
    def classify(self, input:pd.Series) -> Label:
        """Randomly label the article FAKE with probability `fake_probability`."""
        if random() <= self.fake_probability:
            return Label.FAKE
        return Label.REAL

    @override
    def save(self, filename:str) -> None:
        """Pickle the learned probability and label translator to MODEL_DIR.

        BUG FIX: the path was hard-coded instead of using `filename`.
        """
        data = {}
        data["label_translator"] = self.label_translator
        data["fake_probability"] = self.fake_probability

        with open(f"{MODEL_DIR}/{filename}", 'wb') as file:
            pickle.dump(data, file)

    @override
    def load(self, filename:str) -> None:
        """Restore a model previously written by save().

        BUG FIX: the path was hard-coded instead of using `filename`.
        """
        with open(f"{MODEL_DIR}/{filename}", 'rb') as file:
            data = pickle.load(file)

        self.label_translator = data["label_translator"]
        self.fake_probability = data["fake_probability"]
||||
57
src/models/gradient_boosting.py
Normal file
57
src/models/gradient_boosting.py
Normal file
@@ -0,0 +1,57 @@
|
||||
from constants import TRAINING_DIR, MODEL_DIR
|
||||
from models.model import Model
|
||||
from labels import Label
|
||||
from helper import default_labelling
|
||||
from typing import override, Callable
|
||||
import pandas as pd
|
||||
import pickle
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.ensemble import GradientBoostingClassifier
|
||||
|
||||
def no_tokenization(text):
    """Identity 'tokenizer' for TfidfVectorizer.

    The corpus is already tokenized and space-joined upstream, so splitting
    on a single space recovers the original tokens. The parameter was
    renamed from `str`, which shadowed the builtin.
    """
    return text.split(" ")
||||
|
||||
class Gradient_boosting_model(Model):
    """TF-IDF + GradientBoostingClassifier over pre-tokenized articles."""

    def __init__(self, model_filename: str = "", label_translator: Callable[[str], Label] = default_labelling) -> None:
        super().__init__(model_filename, label_translator)

    @override
    def train(self, training_dataset: str, hyperparameters: dict[str, float] = {}) -> None:
        """Fit the pipeline on up to 250,000 rows of the training parquet.

        `hyperparameters` is accepted for interface compatibility but unused.
        """
        print("this model takes around 10 hours to train")

        # PERF FIX: read both columns in a single pass instead of loading
        # the parquet file twice.
        df = pd.read_parquet(f"{TRAINING_DIR}/{training_dataset}", columns=['tokens', 'type'])
        X = df['tokens'].apply(lambda token_list: " ".join(token_list))
        Y = df['type'].apply(lambda label: self.label_translator(label).value)

        # Cap the training size to keep the (already ~10h) fit tractable.
        X = X[:250000]
        Y = Y[:250000]

        model = Pipeline([
            ("L string", TfidfVectorizer(tokenizer=no_tokenization)),
            ("forest", GradientBoostingClassifier(random_state=0, n_estimators=4000))
        ])

        model.fit(X, Y)
        self.model = model

    @override
    def classify(self, input: pd.Series) -> Label:
        """Predict one article's label from its space-joined tokens."""
        X = " ".join(input['tokens'])
        return Label(self.model.predict([X])[0])

    @override
    def save(self, filename: str) -> None:
        """Pickle the fitted pipeline and label translator to MODEL_DIR.

        BUG FIX: the path was hard-coded instead of using `filename`.
        """
        data = {}
        data["label_translator"] = self.label_translator
        data["model"] = self.model
        with open(f"{MODEL_DIR}/{filename}", 'wb') as file:
            pickle.dump(data, file)

    @override
    def load(self, filename: str) -> None:
        """Restore a model previously written by save().

        BUG FIX: the path was hard-coded instead of using `filename`.
        """
        with open(f"{MODEL_DIR}/{filename}", 'rb') as file:
            data = pickle.load(file)
        self.label_translator = data["label_translator"]
        self.model = data["model"]
||||
133
src/models/logistic_regression.py
Normal file
133
src/models/logistic_regression.py
Normal file
@@ -0,0 +1,133 @@
|
||||
import pickle
|
||||
from typing import override, Callable
|
||||
from scipy.sparse import lil_array
|
||||
from constants import TRAINING_DIR, MODEL_DIR
|
||||
from labels import Label
|
||||
from models.model import Model
|
||||
from helper import dataset_iterator
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from helper import default_labelling
|
||||
|
||||
class Logistic_model(Model):
    """Logistic regression over bag-of-words counts of the 10,000 most
    frequent tokens, optionally extended with one-hot features for the
    1,000 most frequent domains (hyperparameters["metadata"] == 1)."""

    def __init__(self, model_filename: str = "", label_translator: Callable[[str], Label] = default_labelling) -> None:
        super().__init__(model_filename, label_translator)

    @override
    def train(self, training_dataset: str, hyperparameters: dict[str, float] = {}) -> None:
        """Two passes over the data: build id maps, then vectorise and fit."""
        token_counts:dict[str, int] = {}
        sorted_token_counts:dict[str, int] = {}
        token_id:dict[str, int] = {} # converts top 10K words to id's.

        domain_counts:dict[str, int] = {}
        sorted_domain_counts:dict[str, int] = {}
        domain_id:dict[str, int] = {} # converts top 1,000 domains to id's.

        self.consider_metadata = False
        if "metadata" in hyperparameters and hyperparameters["metadata"] == 1:
            self.consider_metadata = True

        # Pass 1: count token and domain frequencies (chunked to bound RAM).
        columns = ["tokens", "domain"]
        rows_processed = 0
        for chunk in dataset_iterator(f"{TRAINING_DIR}/{training_dataset}", columns=columns):
            rows_processed += len(chunk)
            for _, row in chunk.iterrows():
                for token in row['tokens']:
                    if token not in token_counts:
                        token_counts[token] = 0
                    token_counts[token] += 1
                if row['domain'] not in domain_counts:
                    domain_counts[row['domain']] = 0
                domain_counts[row['domain']] += 1
        for token in sorted(token_counts, key=lambda token: token_counts[token], reverse=True):
            sorted_token_counts[token] = token_counts[token]
        for domain in sorted(domain_counts, key=lambda domain: domain_counts[domain], reverse=True):
            sorted_domain_counts[domain] = domain_counts[domain]

        # Assign ids 0..9999 to the most frequent tokens...
        idx = 0
        for token in sorted_token_counts:
            token_id[token] = idx
            idx += 1
            if idx >= 10000:
                break

        # ...and ids 0..999 to the most frequent domains.
        idx = 0
        for domain in sorted_domain_counts:
            domain_id[domain] = idx
            idx += 1
            if idx >= 1000:
                break

        if self.consider_metadata: # consider things other than tokens
            X = lil_array((rows_processed, 11000), dtype="float64")
        else:
            X = lil_array((rows_processed, 10000), dtype="float64") # non-sparse array uses 74GiB ram on 995,000_rows. Sklearn LogisticRegression supports sparse arrays though. It still uses 9+ now.

        Y = np.zeros(rows_processed, dtype=int)

        # Pass 2: fill the sparse design matrix and the label vector.
        columns.append("type")
        article_num = 0
        for chunk in dataset_iterator(f"{TRAINING_DIR}/{training_dataset}", columns=columns):
            for _, row in chunk.iterrows():
                tokens = row['tokens']
                article_type = row['type']

                article_word_counts = np.zeros(10000)
                for token in tokens:
                    if token not in token_id:
                        continue # if they are not in top 10K vocab we can ignore them
                    article_word_counts[token_id[token]] += 1
                X[article_num, :10000] = article_word_counts
                if self.consider_metadata:
                    # Columns 10000..10999 are a one-hot of the domain.
                    if row['domain'] in domain_id:
                        X[article_num, 10000+domain_id[row['domain']]] = 1

                Y[article_num] = self.label_translator(article_type).value
                article_num += 1

        self.regression_model = LogisticRegression(max_iter=10000, n_jobs = -1, class_weight="balanced").fit(X, Y)
        self.token_id = token_id
        self.domain_id = domain_id

    @override
    def classify(self, input: pd.Series) -> Label:
        """Vectorise one article the same way as train() and predict."""
        if self.consider_metadata:
            x = np.zeros(11000)
        else:
            x = np.zeros(10000)

        for token in input['tokens']:
            if token not in self.token_id:
                continue
            x[self.token_id[token]] += 1

        if self.consider_metadata:
            if input['domain'] in self.domain_id:
                x[10000+self.domain_id[input['domain']]] = 1

        prediction = self.regression_model.predict([x])[0]
        return Label(prediction)

    @override
    def save(self, filename: str) -> None:
        """Pickle the fitted model and its id maps to MODEL_DIR.

        BUG FIX: the path was hard-coded instead of using `filename`.
        """
        data = {}
        data["label_translator"] = self.label_translator
        data["regression_model"] = self.regression_model
        data["token_id"] = self.token_id
        data["domain_id"] = self.domain_id
        data["consider_metadata"] = self.consider_metadata

        with open(f"{MODEL_DIR}/{filename}", 'wb') as file:
            pickle.dump(data, file)

    @override
    def load(self, filename: str) -> None:
        """Restore a model previously written by save().

        BUG FIX: the path was hard-coded instead of using `filename`.
        """
        with open(f"{MODEL_DIR}/{filename}", 'rb') as file:
            data = pickle.load(file)

        self.label_translator = data["label_translator"]
        self.regression_model = data["regression_model"]
        self.token_id = data["token_id"]
        self.domain_id = data["domain_id"]
        self.consider_metadata = data["consider_metadata"]
||||
61
src/models/model.py
Normal file
61
src/models/model.py
Normal file
@@ -0,0 +1,61 @@
|
||||
from abc import ABC, abstractmethod
|
||||
import pandas as pd
|
||||
from time import perf_counter
|
||||
|
||||
from constants import TESTING_DIR, VALIDATION_DIR
|
||||
from helper import LIAR_labelling, dataset_iterator, default_labelling
|
||||
from labels import Label
|
||||
from typing import Callable
|
||||
|
||||
class Model(ABC):
    """Abstract interface shared by all fake-news classifiers.

    Subclasses implement train/classify/save/load; the base class handles
    construction (optionally restoring a saved model) and evaluation.
    """

    def __init__(self, model_filename:str="", label_translator: Callable[[str], Label] = default_labelling) -> None:
        # Maps a raw dataset 'type' string to a binary Label.
        self.label_translator = label_translator
        # A non-empty filename means "restore a previously saved model".
        if model_filename:
            self.load(model_filename)

    @abstractmethod
    def train(self, training_dataset:str, hyperparameters:dict[str, float]) -> None:
        """Fit the model on the named dataset in TRAINING_DIR."""
        pass

    @abstractmethod
    def classify(self, input:pd.Series) -> Label:
        """Predict the label of one article (a dataframe row)."""
        pass

    @abstractmethod
    def save(self, filename:str) -> None:
        """Persist the model under MODEL_DIR."""
        pass

    @abstractmethod
    def load(self, filename:str) -> None:
        """Restore a model previously written by save()."""
        pass

    def test(self, test_dataset: str, validate:bool=True) -> tuple[float, float, float, float]:
        """Evaluate on a validation (default) or test split.

        FAKE is treated as the positive class. Prints and returns
        (accuracy, recall, precision, F1). The dead `TP = TN = FP = FN = 0`
        initialisations were removed: all four are assigned below.
        """
        # The LIAR dataset uses its own fine-grained labels, so swap in the
        # matching translator. NOTE(review): this mutates the instance for
        # all subsequent calls — confirm that is intended.
        if test_dataset == "LIAR.parquet":
            self.label_translator = LIAR_labelling

        dataset_dir = VALIDATION_DIR if validate else TESTING_DIR
        df = pd.read_parquet(f"{dataset_dir}/{test_dataset}")

        expected = df['type'].apply(self.label_translator)
        predicted = df.apply(self.classify, axis=1)

        TP = ((expected == Label.FAKE) & (predicted == Label.FAKE)).sum()
        FP = ((expected == Label.REAL) & (predicted == Label.FAKE)).sum()
        TN = ((expected == Label.REAL) & (predicted == Label.REAL)).sum()
        FN = ((expected == Label.FAKE) & (predicted == Label.REAL)).sum()

        # Every ratio is guarded against division by zero.
        accuracy = (TP + TN) / (TP + TN + FP + FN) if (TP + TN + FP + FN) > 0 else 0
        recall = (TP) / (TP + FN) if (TP + FN) > 0 else 0
        precision = (TP) / (TP + FP) if (TP + FP) > 0 else 0
        F1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        print(f"Accuracy {accuracy}")
        print(f"Recall {recall}")
        print(f"precision {precision}")
        print(f"F1-score {F1}")
        return (accuracy, recall, precision, F1)
||||
1189
src/models/nn.ipynb
Normal file
1189
src/models/nn.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
579
src/models/nn.ju.py
Normal file
579
src/models/nn.ju.py
Normal file
@@ -0,0 +1,579 @@
|
||||
# %%
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import pandas as pd
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
from collections import Counter
|
||||
import os
|
||||
import sys
|
||||
sys.path.append(os.path.join(os.getcwd(), '../'))
|
||||
from helper import default_labelling
|
||||
from sklearn.metrics import f1_score
|
||||
import numpy as np
|
||||
|
||||
|
||||
# %%
|
||||
label_map = {
|
||||
'Label.FAKE': 0,
|
||||
'Label.REAL': 1}
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
# Pipelining process
|
||||
"""
|
||||
|
||||
# %%
|
||||
df = pd.read_parquet("../../data/training/995,000_rows.parquet", columns=['tokens','type'])
|
||||
|
||||
|
||||
df['label'] = df['type'].apply(default_labelling).astype(str)
|
||||
df['label'] = df['label'].map(label_map).astype(int)
|
||||
df = df.drop(columns=['type'])
|
||||
|
||||
# %%
|
||||
df_test = pd.read_parquet("../../data/testing/995,000_rows.parquet", columns=['tokens','type'])
|
||||
|
||||
df_test['label'] = df_test['type'].apply(default_labelling).astype(str)
|
||||
df_test['label'] = df_test['label'].map(label_map).astype(int)
|
||||
df_test = df_test.drop(columns=['type'])
|
||||
|
||||
# %%
|
||||
df_val = pd.read_parquet("../../data/validation/995,000_rows.parquet", columns=['tokens','type'])
|
||||
df_val['label'] = df_val['type'].apply(default_labelling).astype(str)
|
||||
df_val['label'] = df_val['label'].map(label_map).astype(int)
|
||||
df_val = df_val.drop(columns=['type'])
|
||||
|
||||
# %%
|
||||
# print("Loading Parquet file...")
|
||||
|
||||
# # Check the total number of rows (articles)
|
||||
# print(f"Total rows in the raw Parquet file: {len(df)}")
|
||||
|
||||
# # Look at the first few rows to make sure the data looks correct
|
||||
# print("\n--- First 3 Rows ---")
|
||||
# print(df.head(3))
|
||||
|
||||
# %%
|
||||
# Token frequency census over the whole training corpus.
word_counts = Counter()
for token_list in df['tokens']:
    word_counts.update(token_list)

# Vocabulary = 2 reserved ids + the 50,000 most frequent words.
# Id 0 is <PAD> (padding), id 1 is <UNK> (out-of-vocabulary words).
vocab = {"<PAD>": 0, "<UNK>": 1}
for idx, (word, _count) in enumerate(word_counts.most_common(50000), start=2):
    vocab[word] = idx

print(f"Vocabulary built with {len(vocab)} words.")
||||
|
||||
# %%
|
||||
# Create a Custom PyTorch Datase
|
||||
|
||||
# a wrapper for the data that PyTorch knows how to talk to.
|
||||
class FakeNewsDataset(Dataset):
    """Map-style dataset turning pre-tokenized articles into tensors.

    Each item is a (token-id tensor of fixed length, label tensor) pair,
    ready for a DataLoader to batch.
    """

    def __init__(self, dataframe, vocab, max_length=256):
        self.dataframe = dataframe
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        # Number of articles; DataLoader uses this to know when to stop.
        return len(self.dataframe)

    def __getitem__(self, idx):
        # One article (tokens + label) per call.
        row = self.dataframe.iloc[idx]

        # Map each token to its vocabulary id; unknown words become <UNK> (id 1).
        ids = [self.vocab.get(token, 1) for token in row['tokens']]

        # Force every article to exactly max_length ids:
        # truncate long ones, pad short ones with <PAD> (id 0).
        ids = ids[:self.max_length]
        ids += [0] * (self.max_length - len(ids))

        return torch.tensor(ids, dtype=torch.long), torch.tensor(row['label'], dtype=torch.long)
||||
|
||||
|
||||
# %%
|
||||
## Prepare the DataLoader
|
||||
# Wrap The dataframe in the Dataset class
|
||||
|
||||
# The DataLoader feeds the data to the model in batches (e.g., 64 articles at a time)
|
||||
# This prevents the computer from running out of RAM!
|
||||
|
||||
|
||||
my_train_dataset = FakeNewsDataset(dataframe=df, vocab=vocab, max_length=256)
|
||||
# Shuffle is true for training so the data keeps getting shuffled when trained and the model does not memorise the data
|
||||
train_dataloader = DataLoader(my_train_dataset, batch_size=64, shuffle=True,num_workers=4, # Start with 4; if CPU stays cool, try 6
|
||||
pin_memory=True, # Essential for fast data transfer
|
||||
prefetch_factor=2)
|
||||
|
||||
|
||||
val_data = FakeNewsDataset(dataframe=df_val, vocab=vocab, max_length=256)
|
||||
val_dataloader = DataLoader(val_data, batch_size=64, shuffle=False)
|
||||
|
||||
test_data = FakeNewsDataset(dataframe=df_test, vocab=vocab, max_length=256)
|
||||
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=False)
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
Checking if the data conversion works
|
||||
"""
|
||||
|
||||
# %%
|
||||
# features, labels = next(iter(train_dataloader))
|
||||
# # 2. Check the shapes (the dimensions of your tensors)
|
||||
# print("--- Tensor Shapes ---")
|
||||
# print(f"Features shape: {features.shape}")
|
||||
# print(f"Labels shape: {labels.shape}")
|
||||
|
||||
# # 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)
|
||||
# print("\n--- Data Types ---")
|
||||
# print(f"Features dtype: {features.dtype}")
|
||||
# print(f"Labels dtype: {labels.dtype}")
|
||||
|
||||
# # 4. Peek at the actual data for the very first article in this batch
|
||||
# print("\n--- First Article Peek ---")
|
||||
# print(f"Label: {labels[0].item()} (0 = Real, 1 = Fake)")
|
||||
# print(f"Tokens (first 20 IDs): {features[0][:20]}")
|
||||
|
||||
# %%
|
||||
class BaseModel(nn.Module):
    """Mean-of-embeddings ("bag of vectors") classifier.

    Embeds every token id, averages the vectors across the sequence, then
    pushes the single summary vector through a small MLP to get raw
    Real/Fake class scores.
    """

    def __init__(self, vocab_size, embed_dim=32, h1=256, h2=128, out_features=2):
        super().__init__()
        # Token-id -> dense vector lookup table.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # Two hidden layers plus the output head.
        # (Attribute names kept so existing state_dicts still load.)
        self.fc1 = nn.Linear(embed_dim, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.out = nn.Linear(h2, out_features)

    def forward(self, x):
        # x: (batch, seq_len) integer token ids.
        # Embed, then average over the sequence -> (batch, embed_dim).
        pooled = self.embedding(x).mean(dim=1)

        # MLP with ReLU activations.
        hidden = F.relu(self.fc2(F.relu(self.fc1(pooled))))

        # Raw (unnormalised) class scores; CrossEntropyLoss applies softmax.
        return self.out(hidden)
||||
model_basic =BaseModel(vocab_size=len((vocab)))
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
'Advanced'
|
||||
"""
|
||||
|
||||
# %%
|
||||
|
||||
class advanced_model(nn.Module):
    """Bidirectional stacked-GRU classifier.

    Embeds token ids, runs a bidirectional GRU over the sequence, and
    classifies from the concatenated final forward/backward hidden states.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128, num_layer=2, out_features=2):
        super().__init__()

        # Token-id -> dense vector lookup table.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

        # BUG FIX: the layer count was hard-coded to 2, silently ignoring the
        # `num_layer` parameter. It is now honoured; the default of 2 keeps
        # the previous behaviour for existing callers.
        self.gru = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, num_layers=num_layer,
                          batch_first=True, bidirectional=True,
                          dropout=0.3)

        # NOTE(review): `self.out` is never used by forward(); it is kept
        # only so previously saved state_dicts still load without errors.
        self.out = nn.Linear(hidden_dim, out_features)
        # Classification head over [last forward state ; last backward state].
        self.fc = nn.Linear(hidden_dim * 2, out_features)

    def forward(self, x):
        # x: (batch, seq_len) integer token ids.
        x = self.embedding(x)  # -> (batch, seq_len, embed_dim)

        # Only the GRU's final hidden states are needed, not the per-step
        # outputs, hence the discarded first return value.
        _, hidden = self.gru(x)

        # hidden[-2] is the last layer's forward state, hidden[-1] its
        # backward state; concatenate them into one feature vector.
        out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

        return self.fc(out)
||||
|
||||
# Initilize
|
||||
model_adv = advanced_model(vocab_size=len(vocab))
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
# Training
|
||||
|
||||
"""
|
||||
|
||||
# %%
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
# %%
|
||||
def evaluate_performance(model, dataloader, device):
    """Run `model` over `dataloader` and return (accuracy %, macro-F1)."""
    model.eval()  # disable training-only behaviour (e.g. dropout)

    predicted_labels = []
    true_labels = []

    # Inference only: no gradient bookkeeping needed.
    with torch.no_grad():
        for features, labels in dataloader:
            scores = model(features.to(device))
            labels = labels.to(device)

            # Highest-scoring class (0 or 1) is the prediction.
            guesses = scores.argmax(dim=1)

            predicted_labels.extend(guesses.cpu().numpy().flatten().tolist())
            true_labels.extend(labels.cpu().numpy().flatten().tolist())

    predicted_labels = np.array(predicted_labels)
    true_labels = np.array(true_labels)

    accuracy = (predicted_labels == true_labels).mean() * 100

    # Macro averaging weighs both classes equally.
    f1 = f1_score(true_labels, predicted_labels, average='macro')

    model.train()  # restore training mode for any subsequent training
    return accuracy, f1
||||
|
||||
|
||||
# %%
|
||||
def train_model(model, train_loader, val_loader, device, epochs=5, lr=0.001):
    """Train `model` with Adam + cross-entropy, validating after each epoch.

    Returns a history dict with per-epoch train loss, validation accuracy
    and validation macro-F1, for later plotting.
    """
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Per-epoch metrics for the report.
    history = {'train_loss': [], 'val_acc': [], 'val_f1': []}

    print(f"Training {model.__class__.__name__} on {device}...")

    for epoch in range(epochs):
        model.train()
        running_loss = 0

        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)

            # Standard step: reset grads, forward, loss, backward, update.
            optimizer.zero_grad()
            loss = criterion(model(features), labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_loss = running_loss / len(train_loader)

        # Held-out check after every pass over the training data.
        val_acc, val_f1 = evaluate_performance(model, val_loader, device)

        history['train_loss'].append(avg_loss)
        history['val_acc'].append(val_acc)
        history['val_f1'].append(val_f1)

        print(f"\n Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f} \n Val Acc: {val_acc:.2f}% \n Val F1: {val_f1:.4f}")

    return history  # Return the results so we can plot them later
||||
|
||||
# %%
|
||||
train_995_basic =train_model (model_basic, train_dataloader, val_dataloader, device, epochs =7 )
|
||||
print(train_995_basic )
|
||||
|
||||
# %%
|
||||
train_995_adv =train_model (model_adv, train_dataloader, val_dataloader, device, epochs =7 )
|
||||
print(train_995_adv )
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
# Evaluation
|
||||
"""
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
Basic model
|
||||
"""
|
||||
|
||||
# %%
|
||||
|
||||
# # 1. The Evaluation Function
|
||||
# def evaluate_performance(model, dataloader, device):
|
||||
# model.eval() # Put model in evaluation mode
|
||||
|
||||
# all_predictions = []
|
||||
# all_true_labels = []
|
||||
|
||||
# # Turn off gradient tracking to save memory
|
||||
# with torch.no_grad():
|
||||
# for features, labels in dataloader:
|
||||
# features = features.to(device)
|
||||
# labels = labels.to(device)
|
||||
|
||||
# # Get model scores
|
||||
# scores = model(features)
|
||||
|
||||
# # Find the predicted class (0 or 1)
|
||||
# _, predictions = torch.max(scores,1)
|
||||
|
||||
# # Save predictions and actual labels to lists
|
||||
# # all_predictions.extend(predictions.cpu().tolist())
|
||||
# # all_true_labels.extend(labels.cpu().tolist())
|
||||
# all_predictions.extend(predictions.cpu().numpy().flatten().tolist())
|
||||
# all_true_labels.extend(labels.cpu().numpy().flatten().tolist())
|
||||
|
||||
# all_predictions = np.array(all_predictions)
|
||||
# all_true_labels = np.array(all_true_labels)
|
||||
|
||||
# accuracy = (all_predictions == all_true_labels).mean() * 100
|
||||
|
||||
# # 4. Calculate F1 Score
|
||||
# # average='macro' is best for your report to show you care about both classes equally
|
||||
# f1 = f1_score(all_true_labels, all_predictions, average='macro')
|
||||
# model.train() # Return model to training mode just in case
|
||||
# return accuracy, f1
|
||||
# # # Change me based on the model
|
||||
|
||||
# # model = model_basic.to(device)
|
||||
|
||||
|
||||
# # print(f"Training on: {device}")
|
||||
|
||||
# # # 2. Setup Loss and Optimizer
|
||||
# # # CrossEntropyLoss is the standard for classification tasks
|
||||
# # criterion = nn.CrossEntropyLoss()
|
||||
# # # Adam is a very reliable, fast optimizer
|
||||
# # optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
|
||||
|
||||
# # # 3. The Training Loop
|
||||
# # epochs = 7# Start with a small number of passes through the whole dataset
|
||||
|
||||
# # for epoch in range(epochs):
|
||||
# # model.train() # Tell the model it is in training mode
|
||||
# # total_loss = 0
|
||||
|
||||
# # # Loop through our batches of 64 articles
|
||||
# # for batch_idx, (features, labels) in enumerate(train_dataloader):
|
||||
|
||||
# # # Move data to the same device as the model (GPU/CPU)
|
||||
# # features = features.to(device)
|
||||
# # labels = labels.to(device)
|
||||
|
||||
# # # Step A: Reset the optimizer's gradients
|
||||
# # optimizer.zero_grad()
|
||||
|
||||
# # # Step B: Forward Pass (Have the model guess Real or Fake)
|
||||
# # predictions = model(features)
|
||||
|
||||
# # # Step C: Calculate Loss (How wrong were the guesses?)
|
||||
# # loss = criterion(predictions, labels)
|
||||
|
||||
# # # Step D: Backward Pass (Calculate how to fix the math)
|
||||
# # loss.backward()
|
||||
|
||||
# # # Step E: Optimize (Actually apply the fixes to the model's weights)
|
||||
# # optimizer.step()
|
||||
|
||||
# # total_loss += loss.item()
|
||||
|
||||
# # # Print an update every 100 batches so we know it's working
|
||||
# # if batch_idx % 100 == 0:
|
||||
# # print(f"Epoch [{epoch+1}/{epochs}] | Batch {batch_idx} | Loss: {loss.item():.4f}")
|
||||
|
||||
# # # Print the average loss at the end of each epoch
|
||||
# # avg_loss = total_loss / len(train_dataloader)
|
||||
# # print(f"--- End of Epoch {epoch+1} | Average Loss: {avg_loss:.4f} ---")
|
||||
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
Advanced model
|
||||
|
||||
"""
|
||||
|
||||
# %%
|
||||
|
||||
# # 1. The Evaluation Function
|
||||
# def evaluate_performance(model_adv, dataloader, device):
|
||||
# model_adv.eval() # Put model in evaluation mode
|
||||
|
||||
# all_predictions = []
|
||||
# all_true_labels = []
|
||||
|
||||
# # Turn off gradient tracking to save memory
|
||||
# with torch.no_grad():
|
||||
# for features, labels in dataloader:
|
||||
# features = features.to(device)
|
||||
# labels = labels.to(device)
|
||||
|
||||
# # Get model scores
|
||||
# scores = model_adv(features)
|
||||
|
||||
# # Find the predicted class (0 or 1)
|
||||
# _, predictions = scores.max(1)
|
||||
|
||||
# # Save predictions and actual labels to lists
|
||||
# all_predictions.extend(predictions.cpu().tolist())
|
||||
# all_true_labels.extend(labels.cpu().tolist())
|
||||
|
||||
# # Calculate Accuracy
|
||||
# correct_guesses = sum(p == t for p, t in zip(all_predictions, all_true_labels))
|
||||
# accuracy = (correct_guesses / len(all_true_labels)) * 100
|
||||
|
||||
# # Calculate F1 Score
|
||||
# f1 = f1_score(all_true_labels, all_predictions, average='macro')
|
||||
|
||||
# model_adv.train() # Return model to training mode just in case
|
||||
# return accuracy, f1
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
print("Basic model ")
print(" Validation ")
# BUG FIX: this script never defines a bare `model` — the basic network is
# trained as `model_basic` — so the old code raised a NameError here.
val_acc995, val_f1_995 = evaluate_performance(model_basic, val_dataloader, device)
print(f"Validation Accuracy: {val_acc995:.2f}%")
print(f"Validation F1 Score: {val_f1_995:.4f}")

print("\n Testing Phase ")
test_acc995, test_f1_995 = evaluate_performance(model_basic, test_dataloader, device)
print(f"Test Accuracy: {test_acc995:.2f}%")
# BUG FIX: removed the stray "git " that had leaked into the output string.
print(f"Test F1 Score: {test_f1_995:.4f}")
||||
|
||||
# %%
|
||||
|
||||
|
||||
print(" GURU model ")
print(" Validation ")
# BUG FIX: the F1 was assigned to `val_f1995` but the stale `val_f1_995`
# (belonging to the basic model) was printed; use one consistent name.
adv_val_acc995, adv_val_f1_995 = evaluate_performance(model_adv, val_dataloader, device)
print(f"Validation Accuracy: {adv_val_acc995:.2f}%")
print(f"Validation F1 Score: {adv_val_f1_995:.4f}")

print("\n Testing ")
test_acc, test_f1 = evaluate_performance(model_adv, test_dataloader, device)
# BUG FIX: `test_acc955` was a NameError (the variable is `test_acc`),
# and the stray "git " has been removed from the F1 line.
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: {test_f1:.4f}")
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
# Liar data
|
||||
|
||||
|
||||
"""
|
||||
|
||||
# %%
|
||||
from helper import LIAR_labelling

# Load the held-out LIAR test split (only the columns we need).
# BUG FIX: removed a stray, no-op string expression that pointed at the
# *training* path ("../../data/training/LIAR.parquet") but was never used.
df_LIAR = pd.read_parquet("../../data/testing/LIAR.parquet", columns=['tokens', 'type'])

# Collapse LIAR's fine-grained truthfulness ratings into this project's label
# scheme, then map the string labels onto the integer ids used in training.
df_LIAR['label'] = df_LIAR['type'].apply(LIAR_labelling).astype(str)
df_LIAR['label'] = df_LIAR['label'].map(label_map).astype(int)
df_LIAR = df_LIAR.drop(columns=['type'])
|
||||
|
||||
# %%
|
||||
df_LIAR.head()  # quick visual sanity check of the relabelled frame
|
||||
|
||||
# %%
|
||||
# Count how many times each token occurs in the corpus.
word_counts = Counter()
for x in df_LIAR['tokens']:
    word_counts.update(x)

# Keep the top 50,000 words.
# Index 0 is for <PAD> (padding), Index 1 is for <UNK> (unknown words)
# NOTE(review): this rebuilds the vocabulary from the LIAR *test* data, so the
# resulting token ids will not match the vocabulary the models were trained
# with — most LIAR tokens will hit different embedding rows. Confirm whether
# the training vocabulary should be reused here instead.
vocab = {"<PAD>": 0, "<UNK>": 1}
for idx, (word, count) in enumerate(word_counts.most_common(50000), start=2):
    vocab[word] = idx

print(f"Vocabulary built with {len(vocab)} words.")
|
||||
|
||||
# %%
|
||||
|
||||
# Wrap the LIAR frame in the same Dataset/DataLoader pipeline used for the
# 995k data; shuffle=False because this loader is only used for evaluation.
LR_DATA = FakeNewsDataset(dataframe=df_LIAR, vocab=vocab, max_length=256)
LR_dataloader = DataLoader(LR_DATA, batch_size=32, shuffle=False)
|
||||
|
||||
# %%
|
||||
# 1. Sanity-check a single batch from the LIAR dataloader before evaluating.
features, labels = next(iter(LR_dataloader))

# 2. Check the shapes (the dimensions of your tensors)
print("--- Tensor Shapes ---")
print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")

# 3. Check the data types (PyTorch Embedding layers require 'torch.long' integers)
print("\n--- Data Types ---")
print(f"Features dtype: {features.dtype}")
print(f"Labels dtype: {labels.dtype}")

# 4. Peek at the actual data for the very first article in this batch
print("\n--- First Article Peek ---")
print(f"Label: {labels[0].item()} (0 = Real, 1 = Fake)")
print(f"Tokens (first 20 IDs): {features[0][:20]}")
|
||||
|
||||
# %%
|
||||
# # 1. Check a single sample from the Dataset directly
|
||||
# single_features, single_label = LR_DATA[0]
|
||||
# print(f"Single Sample - Features: {single_features.shape}, Label: {single_label.shape}")
|
||||
|
||||
# # 2. Check the DataLoader batch
|
||||
# batch_features, batch_labels = next(iter(LR_dataloader))
|
||||
# # print(f"Batch - Features: {batch_features.shape}, Labels: {batch_labels.shape}")
|
||||
|
||||
# %%
|
||||
# --- Evaluate the adversarially-trained model on LIAR ---
# BUG FIX: removed a duplicate evaluate_performance() call whose result was
# discarded — it only doubled the evaluation time.
# Also fixed "Avanced" -> "Advanced" and removed a stray "git " in the output.
print("\n--- 2. Testing Advanced model ---")
test_acc, test_f1 = evaluate_performance(model_adv, LR_dataloader, device)
print(f"Test Accuracy: {test_acc:.2f}%")
print(f"Test F1 Score: {test_f1:.4f}")
|
||||
|
||||
# %%
|
||||
|
||||
# --- Evaluate the baseline model on LIAR ---
print("\n--- 2. Testing BASE-Model ---")
test_acc, test_f1 = evaluate_performance(model, LR_dataloader, device)
print(f"Test Accuracy: {test_acc:.2f}%")
# BUG FIX: removed a stray "git " from the output string.
print(f"Test F1 Score: {test_f1:.4f}")
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
52
src/models/svm.py
Normal file
52
src/models/svm.py
Normal file
@@ -0,0 +1,52 @@
|
||||
from constants import TRAINING_DIR, MODEL_DIR
|
||||
from models.model import Model
|
||||
from labels import Label
|
||||
from helper import default_labelling
|
||||
from typing import override, Callable
|
||||
import pandas as pd
|
||||
import pickle
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.svm import LinearSVC
|
||||
|
||||
def no_tokenization(text: str) -> list[str]:
    """Pass-through "tokenizer" for TfidfVectorizer: the corpus is already
    tokenized, so just split the space-joined token string back apart.

    BUG FIX: the parameter used to be named ``str``, shadowing the builtin.
    """
    return text.split(" ")
|
||||
|
||||
class SVM_model(Model):
    """Linear-SVM fake-news classifier: TF-IDF features over pre-tokenized
    text fed into a LinearSVC, wrapped in the project's common Model interface."""

    def __init__(self, model_filename: str = "", label_translator: Callable[[str], Label] = default_labelling) -> None:
        super().__init__(model_filename, label_translator)

    @override
    def train(self, training_dataset: str, hyperparameters: dict[str, float] = {}) -> None:
        """Fit the TF-IDF + LinearSVC pipeline on a parquet training file.

        BUG FIX: the dataset was previously read from disk twice (once per
        column); it is now loaded in a single pass.
        """
        df = pd.read_parquet(f"{TRAINING_DIR}/{training_dataset}", columns=['tokens', 'type'])
        # The cleaning pipeline stores token lists; re-join them with spaces so
        # the vectorizer (with our pass-through tokenizer) can consume them.
        X = df['tokens'].apply(lambda token_list: " ".join(token_list))
        Y = df['type'].apply(lambda label: self.label_translator(label).value)

        model = Pipeline([
            ("L string", TfidfVectorizer(tokenizer=no_tokenization)),
            ("svm", LinearSVC(random_state=0))
        ])

        model.fit(X, Y)
        self.model = model

    @override
    def classify(self, input: pd.Series) -> Label:
        """Classify one row (expects a 'tokens' list) and return its Label."""
        X = " ".join(input['tokens'])
        return Label(self.model.predict([X])[0])

    @override
    def save(self, filename: str) -> None:
        """Pickle the fitted pipeline and label translator into MODEL_DIR.

        BUG FIX: the path previously ignored the ``filename`` parameter and
        always wrote to a literal "(unknown)" file.
        """
        data = {}
        data["label_translator"] = self.label_translator
        data["model"] = self.model
        with open(f"{MODEL_DIR}/{filename}", 'wb') as file:
            pickle.dump(data, file)

    @override
    def load(self, filename: str) -> None:
        """Restore a previously saved pipeline and label translator.

        BUG FIX: the path previously ignored ``filename`` (see save()).
        Note: pickle.load is only safe here because model files are local
        and trusted.
        """
        with open(f"{MODEL_DIR}/{filename}", 'rb') as file:
            data = pickle.load(file)
        self.label_translator = data["label_translator"]
        self.model = data["model"]
|
||||
210
src/old_notebooks/data_processing.ipynb
Normal file
210
src/old_notebooks/data_processing.ipynb
Normal file
File diff suppressed because one or more lines are too long
121
src/old_notebooks/data_processing.ju.py
Normal file
121
src/old_notebooks/data_processing.ju.py
Normal file
@@ -0,0 +1,121 @@
|
||||
# %% [markdown]
|
||||
"""
|
||||
# cleaning
|
||||
big_data.csv.zst is the main file we will be using. Every step in the pipeline adds a new column and overwrites the file. This is repeatable: when any step changes, everything can be run again regardless of the state of the file.
|
||||
"""
|
||||
|
||||
# %%
|
||||
import nltk
|
||||
import re
|
||||
import os
|
||||
import time
|
||||
import pandas as pd
|
||||
|
||||
DATA_DIR = "../data"
|
||||
|
||||
# %%
|
||||
# Download all NLTK data packages (tokenizers, stopwords, stemmers); "all" is
# large but avoids tracking individual package names.
nltk.download("all")
|
||||
|
||||
# %%
|
||||
# Small sample dataset used below to demonstrate the cleaning steps.
news_sample = pd.read_csv(f"{DATA_DIR}/news_sample.csv")
|
||||
|
||||
# %%
|
||||
# One-time migration: if the raw 995,000_rows.csv is still present, re-save it
# as a zstd-compressed csv and delete the original to save disk space.
if (os.path.exists(f"{DATA_DIR}/995,000_rows.csv")):
    big_data = pd.read_csv(f"{DATA_DIR}/995,000_rows.csv", low_memory=False)
    big_data.to_csv(f"{DATA_DIR}/big_data.csv.zst")
    os.remove(f"{DATA_DIR}/995,000_rows.csv")
    big_data = None  # release the ~1M-row frame immediately
|
||||
|
||||
# %%
|
||||
# cleans text and returns a list of tokens.
|
||||
def clean_text(
    text,
    remove_regex_patterns = True,
    remove_stopwords = True,
    remove_special_characters = True,
    stemming = True):
    """Clean a raw document and return a list of tokens.

    Steps (each optional via the flags): lowercase and strip; replace urls /
    emails / dates / numbers with placeholder tags; drop special characters;
    tokenize; remove English stopwords; Snowball-stem every token that is not
    a placeholder tag.
    """
    text = str(text).lower().strip()

    if remove_regex_patterns:
        url_pattern = r'\S+\.\S+'
        email_pattern = r'\w+@\w+\.\w+'
        date_pattern = r'[a-z]+ \d{1,2}[a-z]?, \d{4}' # add more date patterns
        number_pattern = r'\d+'

        # URLs/emails are substituted before numbers so digits inside them are
        # absorbed into <URL>/<EMAIL> rather than becoming <NUMBER> fragments.
        text = re.sub(url_pattern, "<URL>", text)
        text = re.sub(email_pattern, "<EMAIL>", text)
        text = re.sub(date_pattern, "<DATE>", text)
        text = re.sub(number_pattern, "<NUMBER>", text)

    if remove_special_characters:
        # NOTE(review): this is a plain character class keeping \w plus the
        # literal characters " (?:<>+)". It looks like an attempt to protect
        # the <TAG> placeholders, but groups do not work inside [...] —
        # confirm the intended pattern.
        text = re.sub(r'[^\w (?:<\w+>)]', " ", text)

    # Keep <TAG> placeholders as single tokens; otherwise split on word chars.
    tokenizer = nltk.RegexpTokenizer(r'<\w+>|\w+')
    tokens = tokenizer.tokenize(text)

    if remove_stopwords:
        # BUG FIX: removed a duplicated assignment (stopwords = stopwords = ...)
        # and made the lookup a set (O(1) membership instead of scanning a list
        # per token; results are identical).
        stopwords = set(nltk.corpus.stopwords.words('english'))
        tokens = [token for token in tokens if token not in stopwords]

    if stemming:
        stemmer = nltk.SnowballStemmer("english")
        tokens = [stemmer.stem(token) if not re.match(r'<\w+>', token) else token for token in tokens]

    return tokens
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
## Output
|
||||
Now we check what the function does and how the vocabulary changes.
|
||||
"""
|
||||
|
||||
# %%
|
||||
# Generates a vocabulary (set of unique words) from a pandas series.
|
||||
def generate_vocabulary(series):
    """Return the set of unique tokens across a pandas Series of token lists."""
    unique_tokens = set()
    for token_list in series:
        unique_tokens.update(token_list)
    return unique_tokens
|
||||
|
||||
# %%
|
||||
# Show the cleaning pipeline on one sample article, then measure how each
# stage shrinks the vocabulary.
print("original text:\n")
print(news_sample['content'][1])
print("\n" + "-" * 100 + "\n")

print("cleaned tokens:\n")
print(clean_text(news_sample['content'][1]))
print("\n" + "-" * 100 + "\n")

# Series.apply forwards the keyword flags to clean_text for each row.
tokenization_size = len(generate_vocabulary(news_sample['content'].apply(clean_text, remove_stopwords = False, stemming = False)))
stopwords_size = len(generate_vocabulary(news_sample['content'].apply(clean_text, remove_stopwords = True, stemming = False)))
stemming_size = len(generate_vocabulary(news_sample['content'].apply(clean_text, remove_stopwords = True, stemming = True)))

print("Unique words after tokenization:")
print(tokenization_size)
print("\nUnique words after stopword removal:")
print(stopwords_size)
print("\nUnique words after stemming:")
print(stemming_size)
print("\nStemming reduction rate:")
print(f"{round(1 - stemming_size / stopwords_size, 4) * 100}%")
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
## Big Data
|
||||
Now we clean the big dataset and save it to csv.zst file. Pandas can save and load zstd files just fine, and since it's realtime compression it doesn't really take more time while heavily reducing the file size.
|
||||
"""
|
||||
|
||||
# %%
|
||||
# Clean the full dataset in 10k-row chunks so it never has to fit in memory,
# writing to a temp file that replaces the input only once everything succeeded.
start = time.perf_counter()
first = True
for big_data in pd.read_csv(f"{DATA_DIR}/big_data.csv.zst", chunksize=10000):
    big_data['tokens'] = big_data['content'].apply(clean_text)
    if first:
        big_data.to_csv(f"{DATA_DIR}/big_data_new.csv.zst", mode='w')
        first = False
    else:
        # BUG FIX: appended chunks must not repeat the header row, otherwise
        # every 10,000th line of the output is a duplicated header.
        big_data.to_csv(f"{DATA_DIR}/big_data_new.csv.zst", mode='a', header=False)
os.rename(f"{DATA_DIR}/big_data_new.csv.zst", f"{DATA_DIR}/big_data.csv.zst")
print(f"cleaning took {round((time.perf_counter() - start) / 60, 5)} minutes")
|
||||
65
src/old_notebooks/data_processing_ja.ipynb
Normal file
65
src/old_notebooks/data_processing_ja.ipynb
Normal file
@@ -0,0 +1,65 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "95706a2e-9e23-4272-aeaa-4510254f7feb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Cleaning"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1be89b54-76dd-4c2e-bcdd-ff956bf375bf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import nltk\n",
|
||||
"from nltk.corpus import stopwords\n",
|
||||
"from nltk.tokenize import word_tokenize\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b82cf2b2-7cee-4c34-83b9-37c5c4828289",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"1. Tokenize the text"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dc8058fc-0ed9-4daf-918d-d3e82064a3a6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"nltk.download('punkt')\n",
|
||||
"text = ("
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
42
src/setup.py
Normal file
42
src/setup.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from constants import DATASET_DIR, TRAINING_DIR, VALIDATION_DIR, TESTING_DIR, ORIGINAL_DATASET_FILES
|
||||
from clean_data import clean_dataset
|
||||
from helper import csv_to_parquet
|
||||
from split import split_dataset, split_dataset_random
|
||||
import nltk
|
||||
import os
|
||||
import shutil
|
||||
import pandas as pd
|
||||
|
||||
def setup() -> None:
    """Prepare every dataset for use: download NLTK data, convert the raw csv
    datasets to parquet, clean them, and split them into training / validation
    / testing sets. The LIAR tsv files are handled separately because they
    ship pre-split.

    Raises:
        Exception: if any file named in ORIGINAL_DATASET_FILES is missing.
    """
    # make sure nltk can be used later.
    nltk.download("all")

    for dataset_file in ORIGINAL_DATASET_FILES:
        if not os.path.exists(f"{DATASET_DIR}/{dataset_file}"):
            raise Exception(f"Please add {dataset_file} to {DATASET_DIR}")

        name = os.path.splitext(dataset_file)[0]
        # Conversion is the slow step, so skip it when the parquet already exists.
        if not os.path.exists(f"{DATASET_DIR}/{name}.parquet"):
            csv_to_parquet(f"{DATASET_DIR}/{dataset_file}", f"{DATASET_DIR}/{name}.parquet")
            print(f"finished converting {dataset_file} to parquet")
        clean_dataset(f"{name}.parquet")
        print(f"cleaned {name}.parquet")
        split_dataset_random(f"{name}.parquet")
        # BUG FIX: "traning" -> "training" in the progress message.
        print(f"split {name}.parquet into training, validation and test")

    # LIAR ships pre-split: each tsv maps straight onto one of our split dirs.
    for dataset, destination in [("train.tsv", TRAINING_DIR), ("valid.tsv", VALIDATION_DIR), ("test.tsv", TESTING_DIR)]:
        if os.path.exists(f"{DATASET_DIR}/{dataset}"):
            df = pd.read_csv(f"{DATASET_DIR}/{dataset}", sep='\t', header=None)
            # The raw tsv has no header row; column 1 holds the label and
            # column 2 the statement text.
            df = df.rename(columns={
                1: "type",
                2: "content"
            })
            name = os.path.splitext(dataset)[0]
            df.to_parquet(f"{DATASET_DIR}/{name}.parquet")
            clean_dataset(f"{name}.parquet")

            shutil.move(f"{DATASET_DIR}/{name}.parquet", f"{destination}/LIAR.parquet")
|
||||
# Allow running this module directly as the one-shot setup script.
if __name__ == "__main__":
    setup()
|
||||
91
src/split.py
Normal file
91
src/split.py
Normal file
@@ -0,0 +1,91 @@
|
||||
from constants import CHUNK_SIZE, DATASET_DIR, TRAINING_DIR, VALIDATION_DIR, TESTING_DIR
|
||||
import pyarrow.parquet as pq
|
||||
import pyarrow as pa
|
||||
import os
|
||||
from helper import get_time_boundaries
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
def split_dataset_random(filename: str) -> None:
    """Randomly split a parquet dataset into training (~75%), validation
    (~10%) and testing (~15%) files, streaming it in CHUNK_SIZE batches so the
    whole file never has to be in memory at once.

    BUG FIXES:
    - the input/output paths previously hard-coded a literal "(unknown)"
      instead of using ``filename``;
    - the writers are now closed only if they were actually opened (a dataset
      whose batches all missed one split would previously crash on
      ``None.close()``).
    """
    pq_file = pq.ParquetFile(f"{DATASET_DIR}/{filename}")

    training_writer = None
    validation_writer = None
    testing_writer = None

    for batch in pq_file.iter_batches(batch_size=CHUNK_SIZE):
        table = pa.Table.from_batches([batch])

        # One uniform draw per row decides which split the row lands in.
        rng = np.random.rand(table.num_rows)

        training = table.filter(rng < 0.75)
        validation = table.filter((rng >= 0.75) & (rng < 0.85))
        testing = table.filter(rng >= 0.85)

        # Writers are created lazily so each schema comes from real data.
        if not training_writer and training.num_rows:
            training_writer = pq.ParquetWriter(f"{TRAINING_DIR}/{filename}", training.schema)
        if not validation_writer and validation.num_rows:
            validation_writer = pq.ParquetWriter(f"{VALIDATION_DIR}/{filename}", validation.schema)
        if not testing_writer and testing.num_rows:
            testing_writer = pq.ParquetWriter(f"{TESTING_DIR}/{filename}", testing.schema)

        if training.num_rows:
            training_writer.write(training)
        if validation.num_rows:
            validation_writer.write(validation)
        if testing.num_rows:
            testing_writer.write(testing)

    for writer in (training_writer, validation_writer, testing_writer):
        if writer is not None:
            writer.close()
|
||||
|
||||
def split_dataset(filename: str) -> None:
    """Chronologically split a parquet dataset into training (oldest 80%),
    validation (next 10%) and testing (newest 10%) by scrape time.

    BUG FIXES:
    - the input/output paths previously hard-coded a literal "(unknown)"
      instead of using ``filename``;
    - removed a large unreachable chunk-streaming implementation that sat
      after the ``return`` and referenced undefined names (``filepath``,
      ``train_cut``, ``val_cut``) — it could never run and would have raised
      NameError if it did.
    """
    df = pd.read_parquet(f"{DATASET_DIR}/{filename}")
    n = len(df)
    # Coerce unparsable timestamps to NaT instead of raising, then sort
    # oldest-first so the positional slices below are a chronological split.
    df['scraped_at'] = pd.to_datetime(df['scraped_at'], format='ISO8601', errors='coerce', utc=True)
    df = df.sort_values(by='scraped_at')

    df.iloc[:int(n * 0.8)].to_parquet(f"{TRAINING_DIR}/{filename}")
    df.iloc[int(n * 0.8):int(n * 0.9)].to_parquet(f"{VALIDATION_DIR}/{filename}")
    df.iloc[int(n * 0.9):].to_parquet(f"{TESTING_DIR}/{filename}")
|
||||
Reference in New Issue
Block a user