backup since codeberg is down

This commit is contained in:
2026-03-27 13:35:43 +01:00
commit 8a61a214c6
45 changed files with 5038 additions and 0 deletions

View File

@@ -0,0 +1,457 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "3b55d166",
"metadata": {},
"source": [
"# DO NOT RUN; DATA WILL BE LOST"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9c2d25e9",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"\n",
"import pandas as pd\n",
"\n",
"# Append '../src' (relative to the current working directory) to the import\n",
"# path so that `constants` (presumably located there) can be imported.\n",
"sys.path.append(os.path.join(os.getcwd(), '../src'))\n",
"from constants import TRAINING_DIR, TESTING_DIR, VALIDATION_DIR\n",
"\n",
"# Do not truncate columns when a DataFrame is displayed.\n",
"pd.set_option('display.max_columns', None)\n"
]
},
{
"cell_type": "markdown",
"id": "cd67fc64",
"metadata": {},
"source": [
"# Time Split "
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "a917b0fa",
"metadata": {},
"outputs": [],
"source": [
"# Load only the 'type' column of each split; missing labels become 'unknown'.\n",
"# NOTE(review): the 'Random Split' section reads these same three paths, so which\n",
"# split is loaded here depends on what is currently on disk - confirm before re-running.\n",
"train_ty, test_ty, val_ty = (\n",
"    pd.read_parquet(f\"{split_dir}/995,000_rows.parquet\", columns=['type']).fillna('unknown')\n",
"    for split_dir in (TRAINING_DIR, TESTING_DIR, VALIDATION_DIR)\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "0098d6e4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rows in train(818843, 1),\n",
" rows in test (99499, 1), \n",
" rows in validation(76645, 1)\n"
]
}
],
"source": [
"# Report the (rows, columns) shape of each split, one split per output line.\n",
"print(\n",
"    f'rows in train{train_ty.shape},\\n'\n",
"    f' rows in test {test_ty.shape}, \\n'\n",
"    f' rows in validation{val_ty.shape}'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5985a4f3",
"metadata": {},
"outputs": [],
"source": [
"# Count articles per label in each split and align the counts side by side.\n",
"# The 'type' column is selected explicitly so value_counts() returns a Series\n",
"# with a flat label index; calling it on the one-column DataFrame instead\n",
"# yields a single-level MultiIndex rather than a plain Index.\n",
"timeline = pd.concat([\n",
"    train_ty['type'].value_counts().rename('train'),\n",
"    test_ty['type'].value_counts().rename('test'),\n",
"    val_ty['type'].value_counts().rename('val'),\n",
"], axis=1).fillna(0).astype(int)  # a label absent from a split counts as 0\n",
"\n",
"timeline.index.name = 'type'"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "b0673e19",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>train</th>\n",
" <th>test</th>\n",
" <th>val</th>\n",
" </tr>\n",
" <tr>\n",
" <th>type</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>political</th>\n",
" <td>194518</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bias</th>\n",
" <td>133232</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fake</th>\n",
" <td>104883</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>conspiracy</th>\n",
" <td>97314</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>rumor</th>\n",
" <td>56445</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unknown</th>\n",
" <td>43534</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>reliable</th>\n",
" <td>42419</td>\n",
" <td>99499</td>\n",
" <td>76645</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unreliable</th>\n",
" <td>35332</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>clickbait</th>\n",
" <td>27412</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>junksci</th>\n",
" <td>14040</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>satire</th>\n",
" <td>13160</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>hate</th>\n",
" <td>8779</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" train test val\n",
"type \n",
"political 194518 0 0\n",
"bias 133232 0 0\n",
"fake 104883 0 0\n",
"conspiracy 97314 0 0\n",
"rumor 56445 0 0\n",
"unknown 43534 0 0\n",
"reliable 42419 99499 76645\n",
"unreliable 35332 0 0\n",
"clickbait 27412 0 0\n",
"junksci 14040 0 0\n",
"satire 13160 0 0\n",
"hate 8779 0 0"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"timeline"
]
},
{
"cell_type": "markdown",
"id": "6bdc7d84",
"metadata": {},
"source": [
"# Random Split "
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "cd5ca57b",
"metadata": {},
"outputs": [],
"source": [
"# Load only the 'type' column of each split; missing labels become 'unknown'.\n",
"# NOTE(review): the 'Time Split' section reads these same three paths, so which\n",
"# split is loaded here depends on what is currently on disk - confirm before re-running.\n",
"train_ty_R, test_ty_R, val_ty_R = (\n",
"    pd.read_parquet(f\"{split_dir}/995,000_rows.parquet\", columns=['type']).fillna('unknown')\n",
"    for split_dir in (TRAINING_DIR, TESTING_DIR, VALIDATION_DIR)\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c793a37c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rows in train(745724, 1),\n",
" rows in test (149766, 1), \n",
" rows in validation(99510, 1)\n"
]
}
],
"source": [
"# Report the (rows, columns) shape of each split, one split per output line.\n",
"print(\n",
"    f'rows in train{train_ty_R.shape},\\n'\n",
"    f' rows in test {test_ty_R.shape}, \\n'\n",
"    f' rows in validation{val_ty_R.shape}'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "583304ff",
"metadata": {},
"outputs": [],
"source": [
"# Count articles per label in each split and align the counts side by side.\n",
"# The 'type' column is selected explicitly so value_counts() returns a Series\n",
"# with a flat label index; calling it on the one-column DataFrame instead\n",
"# yields a single-level MultiIndex rather than a plain Index.\n",
"timeline_R = pd.concat([\n",
"    train_ty_R['type'].value_counts().rename('train'),\n",
"    test_ty_R['type'].value_counts().rename('test'),\n",
"    val_ty_R['type'].value_counts().rename('val'),\n",
"], axis=1).fillna(0).astype(int)  # a label absent from a split counts as 0\n",
"\n",
"timeline_R.index.name = 'type'"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "d8255b60",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>train</th>\n",
" <th>test</th>\n",
" <th>val</th>\n",
" </tr>\n",
" <tr>\n",
" <th>type</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>reliable</th>\n",
" <td>163802</td>\n",
" <td>33010</td>\n",
" <td>21752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>political</th>\n",
" <td>145779</td>\n",
" <td>29241</td>\n",
" <td>19498</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bias</th>\n",
" <td>99797</td>\n",
" <td>20079</td>\n",
" <td>13356</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fake</th>\n",
" <td>78736</td>\n",
" <td>15602</td>\n",
" <td>10545</td>\n",
" </tr>\n",
" <tr>\n",
" <th>conspiracy</th>\n",
" <td>72837</td>\n",
" <td>14676</td>\n",
" <td>9801</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unknown</th>\n",
" <td>68468</td>\n",
" <td>13754</td>\n",
" <td>9098</td>\n",
" </tr>\n",
" <tr>\n",
" <th>rumor</th>\n",
" <td>42254</td>\n",
" <td>8553</td>\n",
" <td>5638</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unreliable</th>\n",
" <td>26489</td>\n",
" <td>5346</td>\n",
" <td>3497</td>\n",
" </tr>\n",
" <tr>\n",
" <th>clickbait</th>\n",
" <td>20552</td>\n",
" <td>4161</td>\n",
" <td>2699</td>\n",
" </tr>\n",
" <tr>\n",
" <th>junksci</th>\n",
" <td>10516</td>\n",
" <td>2066</td>\n",
" <td>1458</td>\n",
" </tr>\n",
" <tr>\n",
" <th>satire</th>\n",
" <td>9852</td>\n",
" <td>1971</td>\n",
" <td>1337</td>\n",
" </tr>\n",
" <tr>\n",
" <th>hate</th>\n",
" <td>6641</td>\n",
" <td>1307</td>\n",
" <td>831</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2018-02-10 13:43:39.521661</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" train test val\n",
"type \n",
"reliable 163802 33010 21752\n",
"political 145779 29241 19498\n",
"bias 99797 20079 13356\n",
"fake 78736 15602 10545\n",
"conspiracy 72837 14676 9801\n",
"unknown 68468 13754 9098\n",
"rumor 42254 8553 5638\n",
"unreliable 26489 5346 3497\n",
"clickbait 20552 4161 2699\n",
"junksci 10516 2066 1458\n",
"satire 9852 1971 1337\n",
"hate 6641 1307 831\n",
"2018-02-10 13:43:39.521661 1 0 0"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"timeline_R"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "355d343a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "main_asg",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}