{ "cells": [ { "cell_type": "markdown", "id": "3b55d166", "metadata": {}, "source": [ "# DO NOT RUN; DATA WILL BE LOST" ] }, { "cell_type": "code", "execution_count": 3, "id": "9c2d25e9", "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys\n", "\n", "import pandas as pd\n", "\n", "# Make ../src (resolved against the kernel's working directory) importable so `constants` can be found.\n", "sys.path.append(os.path.join(os.getcwd(), '../src'))\n", "from constants import TRAINING_DIR, TESTING_DIR, VALIDATION_DIR\n", "\n", "pd.set_option('display.max_columns', None)\n" ] }, { "cell_type": "markdown", "id": "cd67fc64", "metadata": {}, "source": [ "# Time Split " ] }, { "cell_type": "code", "execution_count": 20, "id": "a917b0fa", "metadata": {}, "outputs": [], "source": [ "def load_types(split_dir):\n", "    \"\"\"Read only the `type` column of the split's parquet file; missing labels become 'unknown'.\"\"\"\n", "    return pd.read_parquet(f\"{split_dir}/995,000_rows.parquet\", columns=['type']).fillna('unknown')\n", "\n", "\n", "test_ty = load_types(TESTING_DIR)\n", "train_ty = load_types(TRAINING_DIR)\n", "val_ty = load_types(VALIDATION_DIR)\n" ] }, { "cell_type": "code", "execution_count": 19, "id": "0098d6e4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "rows in train(818843, 1),\n", " rows in test (99499, 1), \n", " rows in validation(76645, 1)\n" ] } ], "source": [ "print(f'rows in train{train_ty.shape },\\n rows in test {test_ty.shape}, \\n rows in validation{val_ty.shape}')" ] }, { "cell_type": "code", "execution_count": null, "id": "5985a4f3", "metadata": {}, "outputs": [], "source": [ "# Label frequencies per split, aligned on label; a label absent from a split counts as 0.\n", "# Counting on the `type` column (not the one-column frame) yields a flat label index;\n", "# DataFrame.value_counts would return a one-level MultiIndex instead.\n", "timeline = pd.concat([\n", "    train_ty['type'].value_counts().rename('train'),\n", "    test_ty['type'].value_counts().rename('test'),\n", "    val_ty['type'].value_counts().rename('val'),\n", "], axis=1).fillna(0).astype(int)\n", "\n", "timeline.index.name = 'type'" ] }, { "cell_type": "code", "execution_count": 22, "id": "b0673e19", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
traintestval
type
political19451800
bias13323200
fake10488300
conspiracy9731400
rumor5644500
unknown4353400
reliable424199949976645
unreliable3533200
clickbait2741200
junksci1404000
satire1316000
hate877900
\n", "
" ], "text/plain": [ " train test val\n", "type \n", "political 194518 0 0\n", "bias 133232 0 0\n", "fake 104883 0 0\n", "conspiracy 97314 0 0\n", "rumor 56445 0 0\n", "unknown 43534 0 0\n", "reliable 42419 99499 76645\n", "unreliable 35332 0 0\n", "clickbait 27412 0 0\n", "junksci 14040 0 0\n", "satire 13160 0 0\n", "hate 8779 0 0" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "timeline" ] }, { "cell_type": "markdown", "id": "6bdc7d84", "metadata": {}, "source": [ "# Random Split " ] }, { "cell_type": "code", "execution_count": 4, "id": "cd5ca57b", "metadata": {}, "outputs": [], "source": [ "# Load only the `type` column of each split; missing labels become 'unknown'.\n", "# NOTE(review): these are the same three paths the Time Split section reads, yet the stored\n", "# outputs differ -- presumably the files on disk were regenerated between runs; confirm before re-running.\n", "test_ty_R = pd.read_parquet(f\"{TESTING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown')\n", "train_ty_R = pd.read_parquet(f\"{TRAINING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown')\n", "val_ty_R = pd.read_parquet(f\"{VALIDATION_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown')\n" ] }, { "cell_type": "code", "execution_count": 7, "id": "c793a37c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "rows in train(745724, 1),\n", " rows in test (149766, 1), \n", " rows in validation(99510, 1)\n" ] } ], "source": [ "print(f'rows in train{train_ty_R.shape },\\n rows in test {test_ty_R.shape}, \\n rows in validation{val_ty_R.shape}')" ] }, { "cell_type": "code", "execution_count": 11, "id": "583304ff", "metadata": {}, "outputs": [], "source": [ "# Label frequencies per split, aligned on label; a label absent from a split counts as 0.\n", "# Counting on the `type` column (not the one-column frame) yields a flat label index;\n", "# DataFrame.value_counts would return a one-level MultiIndex instead.\n", "timeline_R = pd.concat([\n", "    train_ty_R['type'].value_counts().rename('train'),\n", "    test_ty_R['type'].value_counts().rename('test'),\n", "    val_ty_R['type'].value_counts().rename('val'),\n", "], axis=1).fillna(0).astype(int)\n", "\n", "timeline_R.index.name = 'type'" ] }, { "cell_type": "code", "execution_count": 12, "id": "d8255b60", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
traintestval
type
reliable1638023301021752
political1457792924119498
bias997972007913356
fake787361560210545
conspiracy72837146769801
unknown68468137549098
rumor4225485535638
unreliable2648953463497
clickbait2055241612699
junksci1051620661458
satire985219711337
hate66411307831
2018-02-10 13:43:39.521661100
\n", "
" ], "text/plain": [ " train test val\n", "type \n", "reliable 163802 33010 21752\n", "political 145779 29241 19498\n", "bias 99797 20079 13356\n", "fake 78736 15602 10545\n", "conspiracy 72837 14676 9801\n", "unknown 68468 13754 9098\n", "rumor 42254 8553 5638\n", "unreliable 26489 5346 3497\n", "clickbait 20552 4161 2699\n", "junksci 10516 2066 1458\n", "satire 9852 1971 1337\n", "hate 6641 1307 831\n", "2018-02-10 13:43:39.521661 1 0 0" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# NOTE(review): the stored output lists a label that looks like a timestamp\n", "# ('2018-02-10 13:43:39.521661', 1 row in train) -- presumably a malformed source row; confirm upstream.\n", "timeline_R" ] } ], "metadata": { "kernelspec": { "display_name": "main_asg", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.14.2" } }, "nbformat": 4, "nbformat_minor": 5 }