This repository has been archived on 2026-03-27. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
fake-news-backup/analysis/analyz_split_time.ipynb
2026-03-27 13:35:43 +01:00

238 lines
6.1 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "9c2d25e9",
"metadata": {},
"outputs": [],
"source": [
"# Standard library first, then third-party, then project-local modules.\n",
"import os\n",
"import sys\n",
"\n",
"import pandas as pd\n",
"\n",
"# Put ../src (relative to the current working directory) on the import\n",
"# path so that the `constants` module below can be found.\n",
"src_dir = os.path.join(os.getcwd(), '../src')\n",
"sys.path.append(src_dir)\n",
"from constants import TESTING_DIR, TRAINING_DIR, VALIDATION_DIR\n",
"\n",
"# Do not truncate columns when a DataFrame is displayed.\n",
"pd.set_option('display.max_columns', None)\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "a917b0fa",
"metadata": {},
"outputs": [],
"source": [
"def _read_type_column(split_dir):\n",
"    \"\"\"Read only the `type` column of the parquet file in `split_dir`.\n",
"\n",
"    Missing values are replaced with the string 'unknown'.\n",
"    \"\"\"\n",
"    return pd.read_parquet(f\"{split_dir}/995,000_rows.parquet\", columns=['type']).fillna('unknown')\n",
"\n",
"\n",
"# One single-column frame per split.\n",
"test_ty = _read_type_column(TESTING_DIR)\n",
"train_ty = _read_type_column(TRAINING_DIR)\n",
"val_ty = _read_type_column(VALIDATION_DIR)\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "0098d6e4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rows in train(818843, 1),\n",
" rows in test (99499, 1), \n",
" rows in validation(76645, 1)\n"
]
}
],
"source": [
"# Report the (rows, columns) shape of each split's frame.\n",
"shape_report = f'rows in train{train_ty.shape },\\n rows in test {test_ty.shape}, \\n rows in validation{val_ty.shape}'\n",
"print(shape_report)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "5985a4f3",
"metadata": {},
"outputs": [],
"source": [
"# Count how often each `type` label occurs in every split and line the\n",
"# counts up side by side: one row per label, one column per split.\n",
"# The counts are taken from the `type` column of the frames loaded above\n",
"# (train_ty / test_ty / val_ty) so the cell runs on a fresh kernel.\n",
"# A label missing from a split comes out of the concat as NaN, hence\n",
"# fillna(0) before casting the counts back to int.\n",
"timeline = pd.concat([\n",
"    train_ty['type'].value_counts().rename('train'),\n",
"    test_ty['type'].value_counts().rename('test'),\n",
"    val_ty['type'].value_counts().rename('val'),\n",
"], axis=1).fillna(0).astype(int)\n",
"\n",
"timeline.index.name = 'type'"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "b0673e19",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>train</th>\n",
" <th>test</th>\n",
" <th>val</th>\n",
" </tr>\n",
" <tr>\n",
" <th>type</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>political</th>\n",
" <td>194518</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bias</th>\n",
" <td>133232</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fake</th>\n",
" <td>104883</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>conspiracy</th>\n",
" <td>97314</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>rumor</th>\n",
" <td>56445</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unknown</th>\n",
" <td>43534</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>reliable</th>\n",
" <td>42419</td>\n",
" <td>99499</td>\n",
" <td>76645</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unreliable</th>\n",
" <td>35332</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>clickbait</th>\n",
" <td>27412</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>junksci</th>\n",
" <td>14040</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>satire</th>\n",
" <td>13160</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>hate</th>\n",
" <td>8779</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" train test val\n",
"type \n",
"political 194518 0 0\n",
"bias 133232 0 0\n",
"fake 104883 0 0\n",
"conspiracy 97314 0 0\n",
"rumor 56445 0 0\n",
"unknown 43534 0 0\n",
"reliable 42419 99499 76645\n",
"unreliable 35332 0 0\n",
"clickbait 27412 0 0\n",
"junksci 14040 0 0\n",
"satire 13160 0 0\n",
"hate 8779 0 0"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"timeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2bcfc84",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "main_asg",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}