{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c2d25e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Add ../src (relative to the kernel's current working directory) to the\n",
    "# import path; `constants` is presumably located there -- TODO confirm.\n",
    "sys.path.append(os.path.join(os.getcwd(), '../src'))\n",
    "from constants import TRAINING_DIR, TESTING_DIR, VALIDATION_DIR\n",
    "\n",
    "# Show every column when a DataFrame is displayed (no truncation).\n",
    "pd.set_option('display.max_columns', None)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "a917b0fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# File name shared by the training, testing and validation directories.\n",
    "SPLIT_FILE = '995,000_rows.parquet'\n",
    "\n",
    "\n",
    "def load_types(split_dir):\n",
    "    \"\"\"Read only the `type` column of one split, replacing missing values with 'unknown'.\"\"\"\n",
    "    return pd.read_parquet(f\"{split_dir}/{SPLIT_FILE}\", columns=['type']).fillna('unknown')\n",
    "\n",
    "\n",
    "train_ty = load_types(TRAINING_DIR)\n",
    "test_ty = load_types(TESTING_DIR)\n",
    "val_ty = load_types(VALIDATION_DIR)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "0098d6e4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "rows in train(818843, 1),\n",
      " rows in test (99499, 1), \n",
      " rows in validation(76645, 1)\n"
     ]
    }
   ],
   "source": [
    "print(f'rows in train{train_ty.shape },\\n rows in test {test_ty.shape}, \\n rows in validation{val_ty.shape}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "5985a4f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count how often each `type` value occurs in every split, one column per split.\n",
    "# The counts are taken from the frames loaded above rather than from ad-hoc\n",
    "# kernel variables, so the cell also works after Restart & Run All.\n",
    "# NOTE(review): the saved table output below shows all test and validation rows\n",
    "# labelled 'reliable' (99499 and 76645) -- confirm the split is intended.\n",
    "timeline = pd.concat([\n",
    "    train_ty['type'].value_counts().rename('train'),\n",
    "    test_ty['type'].value_counts().rename('test'),\n",
    "    val_ty['type'].value_counts().rename('val'),\n",
    "], axis=1).fillna(0).astype(int)  # a type missing from a split is NaN after concat -> 0\n",
    "\n",
    "timeline.index.name = 'type'"
   ]
  },
  { "cell_type": "code", "execution_count": 22, "id": "b0673e19", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | train | \n", "test | \n", "val | \n", "
|---|---|---|---|
| type | \n", "\n", " | \n", " | \n", " |
| political | \n", "194518 | \n", "0 | \n", "0 | \n", "
| bias | \n", "133232 | \n", "0 | \n", "0 | \n", "
| fake | \n", "104883 | \n", "0 | \n", "0 | \n", "
| conspiracy | \n", "97314 | \n", "0 | \n", "0 | \n", "
| rumor | \n", "56445 | \n", "0 | \n", "0 | \n", "
| unknown | \n", "43534 | \n", "0 | \n", "0 | \n", "
| reliable | \n", "42419 | \n", "99499 | \n", "76645 | \n", "
| unreliable | \n", "35332 | \n", "0 | \n", "0 | \n", "
| clickbait | \n", "27412 | \n", "0 | \n", "0 | \n", "
| junksci | \n", "14040 | \n", "0 | \n", "0 | \n", "
| satire | \n", "13160 | \n", "0 | \n", "0 | \n", "
| hate | \n", "8779 | \n", "0 | \n", "0 | \n", "