{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "aa962731", "metadata": {}, "outputs": [], "source": [ "import pandas as pd \n", "import numpy as np\n", "# NOTE(review): the `plt` alias on the next line is rebound by `import matplotlib.pyplot as plt` further down in this cell.\n", "import matplotlib as plt\n", "import os \n", "import sys\n", "sys.path.append(os.path.join(os.getcwd(), '../src'))\n", "from constants import DATASET_DIR, TEMP_DIR\n", "from helper import default_labelling \n", "import matplotlib.pyplot as plt\n", "pd.set_option('display.max_columns', None)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "ff743a62", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at', 'inserted_at', 'updated_at', 'title', 'authors', 'keywords', 'meta_keywords', 'meta_description', 'tags', 'summary', 'source', 'tokens']\n" ] } ], "source": [ " # schema = pq.read_schema(f\"{DATASET_DIR}/995,000_rows.parquet\")\n", " # print(schema.names)" ] }, { "cell_type": "code", "execution_count": 21, "id": "b9dbee99", "metadata": {}, "outputs": [], "source": [ "# Read only the 'type' and 'domain' columns and replace missing values in both with 'unknown'.\n", "cols = (\n", "    pd.read_parquet(f\"{DATASET_DIR}/995,000_rows.parquet\", columns=['type', 'domain'])\n", "    .fillna({'type': 'unknown', 'domain': 'unknown'})\n", ")\n", "# Pass every article type through default_labelling and store the result as strings.\n", "cols['labels'] = cols['type'].apply(default_labelling).astype(str)\n", "# Expose the three columns as standalone Series.\n", "labels, types, domain = cols['labels'], cols['type'], cols['domain']" ] }, { "cell_type": "code", "execution_count": 22, "id": "2e967f82", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | type | \n", "domain | \n", "labels | \n", "
|---|---|---|---|
| 0 | \n", "political | \n", "nationalreview.com | \n", "Label.REAL | \n", "
| 1 | \n", "fake | \n", "beforeitsnews.com | \n", "Label.FAKE | \n", "
| 2 | \n", "satire | \n", "dailycurrant.com | \n", "Label.FAKE | \n", "
| 3 | \n", "reliable | \n", "nytimes.com | \n", "Label.REAL | \n", "
| 4 | \n", "conspiracy | \n", "infiniteunknown.net | \n", "Label.FAKE | \n", "