238 lines
6.1 KiB
Plaintext
238 lines
6.1 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "9c2d25e9",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd \n",
|
|
"import os \n",
|
|
"import sys\n",
|
|
"sys.path.append(os.path.join(os.getcwd(), '../src'))\n",
|
|
"from constants import TRAINING_DIR, TESTING_DIR, VALIDATION_DIR\n",
|
|
"pd.set_option('display.max_columns', None)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"id": "a917b0fa",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Pull just the `type` label column out of each split's parquet file and\n",
"# replace missing values with the string 'unknown'. Splits are read in the\n",
"# order test, train, validation.\n",
"test_ty, train_ty, val_ty = (\n",
"    pd.read_parquet(f\"{split_dir}/995,000_rows.parquet\", columns=['type']).fillna('unknown')\n",
"    for split_dir in (TESTING_DIR, TRAINING_DIR, VALIDATION_DIR)\n",
")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"id": "0098d6e4",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"rows in train(818843, 1),\n",
|
|
" rows in test (99499, 1), \n",
|
|
" rows in validation(76645, 1)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Summarise the (rows, columns) shape of every split in one three-line message.\n",
"shape_report = (\n",
"    f'rows in train{train_ty.shape},\\n'\n",
"    f' rows in test {test_ty.shape}, \\n'\n",
"    f' rows in validation{val_ty.shape}'\n",
")\n",
"print(shape_report)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"id": "5985a4f3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Count how many rows carry each `type` label in every split and line the\n",
"# three counts up side by side (one column per split).\n",
"#\n",
"# The counts are taken from the train_ty / test_ty / val_ty frames loaded\n",
"# earlier in this notebook, so the cell runs on a fresh kernel. Selecting the\n",
"# 'type' column yields a Series, whose value_counts() is indexed directly by\n",
"# the label.\n",
"timeline = pd.concat([\n",
"    train_ty['type'].value_counts().rename('train'),\n",
"    test_ty['type'].value_counts().rename('test'),\n",
"    val_ty['type'].value_counts().rename('val'),\n",
"], axis=1).fillna(0).astype(int)  # labels absent from a split align to NaN -> 0, then back to int\n",
"\n",
"timeline.index.name = 'type'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"id": "b0673e19",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>train</th>\n",
|
|
" <th>test</th>\n",
|
|
" <th>val</th>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>type</th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>political</th>\n",
|
|
" <td>194518</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>bias</th>\n",
|
|
" <td>133232</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>fake</th>\n",
|
|
" <td>104883</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>conspiracy</th>\n",
|
|
" <td>97314</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>rumor</th>\n",
|
|
" <td>56445</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unknown</th>\n",
|
|
" <td>43534</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>reliable</th>\n",
|
|
" <td>42419</td>\n",
|
|
" <td>99499</td>\n",
|
|
" <td>76645</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unreliable</th>\n",
|
|
" <td>35332</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>clickbait</th>\n",
|
|
" <td>27412</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>junksci</th>\n",
|
|
" <td>14040</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>satire</th>\n",
|
|
" <td>13160</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>hate</th>\n",
|
|
" <td>8779</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" train test val\n",
|
|
"type \n",
|
|
"political 194518 0 0\n",
|
|
"bias 133232 0 0\n",
|
|
"fake 104883 0 0\n",
|
|
"conspiracy 97314 0 0\n",
|
|
"rumor 56445 0 0\n",
|
|
"unknown 43534 0 0\n",
|
|
"reliable 42419 99499 76645\n",
|
|
"unreliable 35332 0 0\n",
|
|
"clickbait 27412 0 0\n",
|
|
"junksci 14040 0 0\n",
|
|
"satire 13160 0 0\n",
|
|
"hate 8779 0 0"
|
|
]
|
|
},
|
|
"execution_count": 22,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"timeline"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "c2bcfc84",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "main_asg",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.14.2"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|