{ "cells": [
  { "cell_type": "code", "execution_count": null, "id": "9c2d25e9", "metadata": {}, "outputs": [], "source": [
    "# Standard library\n",
    "import os\n",
    "import sys\n",
    "\n",
    "# Third-party\n",
    "import pandas as pd\n",
    "\n",
    "# Make the project's src directory importable. The path is resolved relative to the\n",
    "# kernel's current working directory.\n",
    "sys.path.append(os.path.join(os.getcwd(), '../src'))\n",
    "from constants import TRAINING_DIR, TESTING_DIR, VALIDATION_DIR\n",
    "\n",
    "# Show every column when a DataFrame is displayed.\n",
    "pd.set_option('display.max_columns', None)\n" ] },
  { "cell_type": "code", "execution_count": 20, "id": "a917b0fa", "metadata": {}, "outputs": [], "source": [
    "# Read only the type column of each split; missing values are replaced with 'unknown'.\n",
    "test_ty = pd.read_parquet(f\"{TESTING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown')\n",
    "train_ty = pd.read_parquet(f\"{TRAINING_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown')\n",
    "val_ty = pd.read_parquet(f\"{VALIDATION_DIR}/995,000_rows.parquet\", columns=['type']).fillna('unknown')\n" ] },
  { "cell_type": "code", "execution_count": 19, "id": "0098d6e4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "rows in train(818843, 1),\n", " rows in test (99499, 1), \n", " rows in validation(76645, 1)\n" ] } ], "source": [
    "print(f'rows in train{train_ty.shape },\\n rows in test {test_ty.shape}, \\n rows in validation{val_ty.shape}')" ] },
  { "cell_type": "code", "execution_count": 21, "id": "5985a4f3", "metadata": {}, "outputs": [], "source": [
    "# Count rows per type label in each split and line the counts up side by side.\n",
    "# The counts come from the frames loaded above, so this cell runs on a fresh kernel.\n",
    "# A label that does not occur in a split is NaN after the concat and becomes 0 here.\n",
    "timeline = pd.concat([\n",
    "    train_ty['type'].value_counts().rename('train'),\n",
    "    test_ty['type'].value_counts().rename('test'),\n",
    "    val_ty['type'].value_counts().rename('val'),\n",
    "], axis=1).fillna(0).astype(int)\n",
    "\n",
    "timeline.index.name = 'type'" ] },
  { "cell_type": "code", "execution_count": 22, "id": "b0673e19", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
traintestval
type
political19451800
bias13323200
fake10488300
conspiracy9731400
rumor5644500
unknown4353400
reliable424199949976645
unreliable3533200
clickbait2741200
junksci1404000
satire1316000
hate877900
\n", "
" ], "text/plain": [ " train test val\n", "type \n", "political 194518 0 0\n", "bias 133232 0 0\n", "fake 104883 0 0\n", "conspiracy 97314 0 0\n", "rumor 56445 0 0\n", "unknown 43534 0 0\n", "reliable 42419 99499 76645\n", "unreliable 35332 0 0\n", "clickbait 27412 0 0\n", "junksci 14040 0 0\n", "satire 13160 0 0\n", "hate 8779 0 0" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "timeline" ] }, { "cell_type": "code", "execution_count": null, "id": "c2bcfc84", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "main_asg", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.14.2" } }, "nbformat": 4, "nbformat_minor": 5 }