458 lines
12 KiB
Plaintext
458 lines
12 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "3b55d166",
|
|
"metadata": {},
|
|
"source": [
|
"# DO NOT RUN; DATA WILL BE LOST"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "9c2d25e9",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd \n",
|
|
"import os \n",
|
|
"import sys\n",
|
|
"sys.path.append(os.path.join(os.getcwd(), '../src'))\n",
|
|
"from constants import TRAINING_DIR, TESTING_DIR, VALIDATION_DIR\n",
|
|
"pd.set_option('display.max_columns', None)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "cd67fc64",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Time Split "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"id": "a917b0fa",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Read only the `type` column of each split's parquet file; missing labels become 'unknown'.\n",
"# NOTE(review): the Random Split cell further down reads these same three paths, so the\n",
"# split that gets loaded here depends on what is currently on disk - confirm before re-running.\n",
"test_ty, train_ty, val_ty = (\n",
"    pd.read_parquet(f\"{split_dir}/995,000_rows.parquet\", columns=['type']).fillna('unknown')\n",
"    for split_dir in (TESTING_DIR, TRAINING_DIR, VALIDATION_DIR)\n",
")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"id": "0098d6e4",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"rows in train(818843, 1),\n",
|
|
" rows in test (99499, 1), \n",
|
|
" rows in validation(76645, 1)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Report the shape of each split's label frame (train, test, validation).\n",
"print(\n",
"    f'rows in train{train_ty.shape },\\n'\n",
"    f' rows in test {test_ty.shape}, \\n'\n",
"    f' rows in validation{val_ty.shape}'\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "5985a4f3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Count rows per `type` label in each split and line the three counts up side by side.\n",
"timeline = (\n",
"    pd.concat(\n",
"        [\n",
"            frame.value_counts().rename(column)\n",
"            for column, frame in (('train', train_ty), ('test', test_ty), ('val', val_ty))\n",
"        ],\n",
"        axis=1,\n",
"    )\n",
"    .fillna(0)  # a label missing from a split has no count after the concat; show it as 0\n",
"    .astype(int)\n",
")\n",
"timeline.index.name = 'type'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"id": "b0673e19",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>train</th>\n",
|
|
" <th>test</th>\n",
|
|
" <th>val</th>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>type</th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>political</th>\n",
|
|
" <td>194518</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>bias</th>\n",
|
|
" <td>133232</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>fake</th>\n",
|
|
" <td>104883</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>conspiracy</th>\n",
|
|
" <td>97314</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>rumor</th>\n",
|
|
" <td>56445</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unknown</th>\n",
|
|
" <td>43534</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>reliable</th>\n",
|
|
" <td>42419</td>\n",
|
|
" <td>99499</td>\n",
|
|
" <td>76645</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unreliable</th>\n",
|
|
" <td>35332</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>clickbait</th>\n",
|
|
" <td>27412</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>junksci</th>\n",
|
|
" <td>14040</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>satire</th>\n",
|
|
" <td>13160</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>hate</th>\n",
|
|
" <td>8779</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" train test val\n",
|
|
"type \n",
|
|
"political 194518 0 0\n",
|
|
"bias 133232 0 0\n",
|
|
"fake 104883 0 0\n",
|
|
"conspiracy 97314 0 0\n",
|
|
"rumor 56445 0 0\n",
|
|
"unknown 43534 0 0\n",
|
|
"reliable 42419 99499 76645\n",
|
|
"unreliable 35332 0 0\n",
|
|
"clickbait 27412 0 0\n",
|
|
"junksci 14040 0 0\n",
|
|
"satire 13160 0 0\n",
|
|
"hate 8779 0 0"
|
|
]
|
|
},
|
|
"execution_count": 22,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"timeline"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "6bdc7d84",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Random Split "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "cd5ca57b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Read only the `type` column of each split's parquet file; missing labels become 'unknown'.\n",
"# NOTE(review): these are the same three paths the Time Split cell reads, so the split that\n",
"# gets loaded here depends on what is currently on disk - confirm before re-running.\n",
"test_ty_R, train_ty_R, val_ty_R = (\n",
"    pd.read_parquet(f\"{split_dir}/995,000_rows.parquet\", columns=['type']).fillna('unknown')\n",
"    for split_dir in (TESTING_DIR, TRAINING_DIR, VALIDATION_DIR)\n",
")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "c793a37c",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"rows in train(745724, 1),\n",
|
|
" rows in test (149766, 1), \n",
|
|
" rows in validation(99510, 1)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Report the shape of each random-split label frame (train, test, validation).\n",
"print(\n",
"    f'rows in train{train_ty_R.shape },\\n'\n",
"    f' rows in test {test_ty_R.shape}, \\n'\n",
"    f' rows in validation{val_ty_R.shape}'\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "583304ff",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Count rows per `type` label in each random-split frame and line the counts up side by side.\n",
"# NOTE(review): the saved output of this table includes a `type` value that looks like a\n",
"# timestamp (one row, train only) - presumably a malformed source record; verify upstream.\n",
"timeline_R = (\n",
"    pd.concat(\n",
"        [\n",
"            frame.value_counts().rename(column)\n",
"            for column, frame in (('train', train_ty_R), ('test', test_ty_R), ('val', val_ty_R))\n",
"        ],\n",
"        axis=1,\n",
"    )\n",
"    .fillna(0)  # a label missing from a split has no count after the concat; show it as 0\n",
"    .astype(int)\n",
")\n",
"timeline_R.index.name = 'type'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "d8255b60",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>train</th>\n",
|
|
" <th>test</th>\n",
|
|
" <th>val</th>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>type</th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>reliable</th>\n",
|
|
" <td>163802</td>\n",
|
|
" <td>33010</td>\n",
|
|
" <td>21752</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>political</th>\n",
|
|
" <td>145779</td>\n",
|
|
" <td>29241</td>\n",
|
|
" <td>19498</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>bias</th>\n",
|
|
" <td>99797</td>\n",
|
|
" <td>20079</td>\n",
|
|
" <td>13356</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>fake</th>\n",
|
|
" <td>78736</td>\n",
|
|
" <td>15602</td>\n",
|
|
" <td>10545</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>conspiracy</th>\n",
|
|
" <td>72837</td>\n",
|
|
" <td>14676</td>\n",
|
|
" <td>9801</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unknown</th>\n",
|
|
" <td>68468</td>\n",
|
|
" <td>13754</td>\n",
|
|
" <td>9098</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>rumor</th>\n",
|
|
" <td>42254</td>\n",
|
|
" <td>8553</td>\n",
|
|
" <td>5638</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unreliable</th>\n",
|
|
" <td>26489</td>\n",
|
|
" <td>5346</td>\n",
|
|
" <td>3497</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>clickbait</th>\n",
|
|
" <td>20552</td>\n",
|
|
" <td>4161</td>\n",
|
|
" <td>2699</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>junksci</th>\n",
|
|
" <td>10516</td>\n",
|
|
" <td>2066</td>\n",
|
|
" <td>1458</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>satire</th>\n",
|
|
" <td>9852</td>\n",
|
|
" <td>1971</td>\n",
|
|
" <td>1337</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>hate</th>\n",
|
|
" <td>6641</td>\n",
|
|
" <td>1307</td>\n",
|
|
" <td>831</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2018-02-10 13:43:39.521661</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" train test val\n",
|
|
"type \n",
|
|
"reliable 163802 33010 21752\n",
|
|
"political 145779 29241 19498\n",
|
|
"bias 99797 20079 13356\n",
|
|
"fake 78736 15602 10545\n",
|
|
"conspiracy 72837 14676 9801\n",
|
|
"unknown 68468 13754 9098\n",
|
|
"rumor 42254 8553 5638\n",
|
|
"unreliable 26489 5346 3497\n",
|
|
"clickbait 20552 4161 2699\n",
|
|
"junksci 10516 2066 1458\n",
|
|
"satire 9852 1971 1337\n",
|
|
"hate 6641 1307 831\n",
|
|
"2018-02-10 13:43:39.521661 1 0 0"
|
|
]
|
|
},
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"timeline_R"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "355d343a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "main_asg",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.14.2"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|