# scripts/extract_wiki_html.py
# Parse saved Wikipedia film-article HTML folders into a whitelisted metadata TSV.
# (requirements.txt additionally gains: python-slugify~=8.0.4)
import os
import re
import csv
import pandas as pd
from bs4 import BeautifulSoup

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_DIR = os.path.join(BASE_DIR, "../data/processed/wikipedia_html")
OUTPUT_TSV = os.path.join(BASE_DIR, "../data/processed/spreadsheet/wikipedia_metadata4.tsv")

# Only these keys survive into the output TSV; any other infobox label or
# section heading parsed out of the article is dropped at the end of parse_html.
WHITELIST = {
    "slug",
    "title",
    "poster_filename",
    "Directed by",
    "Produced by",
    "Written by",
    "Starring",
    "Release date",
    "Running time",
    "Country",
    "Language",
    "Budget",
    "Box office",
    "Plot",
}


def clean(el):
    """Flatten a bs4 element to one-line text; <br> separators become ' | '."""
    if not el:
        return ""
    for br in el.find_all("br"):
        br.replace_with(" | ")
    return re.sub(r"\s+", " ", el.get_text(" ", strip=True)).strip()


def parse_html(path, slug):
    """Parse one saved Wikipedia article into a dict of whitelisted fields.

    Extracts the page title, the infobox poster filename and label/data rows,
    and per-section body text; lead paragraphs (before any heading) are folded
    into the "Plot" field.
    """
    with open(path, encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")
    row = {"slug": slug}

    h1 = soup.select_one("h1.firstHeading")
    row["title"] = h1.get_text(strip=True) if h1 else ""

    # infobox: poster image + label/data pairs
    infobox = soup.select_one("table.infobox")
    if infobox:
        img = infobox.select_one("img")
        if img and img.get("src"):
            row["poster_filename"] = os.path.basename(img["src"])
        else:
            row["poster_filename"] = ""
        for tr in infobox.select("tr"):
            th = tr.select_one(".infobox-label")
            td = tr.select_one(".infobox-data")
            if th and td:
                row[clean(th)] = clean(td)

    # sections
    content = soup.select_one(".mw-parser-output")
    if not content:
        return {k: v for k, v in row.items() if k in WHITELIST}
    skip = {"references", "external links", "see also"}
    current = None
    lead = []
    for el in content.children:
        # NavigableString children have no .name attribute; resolve it once,
        # defensively, for the whole iteration.
        name = getattr(el, "name", None)
        if name == "div" and "mw-heading" in el.get("class", []):
            h = el.find(["h2", "h3", "h4", "h5", "h6"])  # assuming no more than first 6 headers need to be looked at
            if h:
                title = clean(h)
                if title.lower() in skip:
                    current = None
                else:
                    current = title
                    if current:
                        row[current] = ""
            continue
        if not current:
            if name == "p":
                text = clean(el)
                if text:
                    lead.append(text)
            continue
        # BUG FIX: the original accessed el.name directly here, which raises
        # AttributeError for NavigableString children once a section is open.
        if name in ["p", "ul", "ol", "table"]:
            text = clean(el)
            if text:
                # BUG FIX: join successive elements with the same " | "
                # separator used elsewhere instead of mashing them together.
                row[current] += (" | " if row[current] else "") + text
    if lead:
        if row.get("Plot"):
            row["Plot"] = " | ".join(lead) + " | " + row["Plot"]
        else:
            row["Plot"] = " | ".join(lead)
    return {k: v for k, v in row.items() if k in WHITELIST}


def main():
    """Walk INPUT_DIR, parse each article folder, and write OUTPUT_TSV."""
    rows = []
    for folder in os.listdir(INPUT_DIR):
        path = os.path.join(INPUT_DIR, folder)
        if not os.path.isdir(path):
            # a stray file in INPUT_DIR would crash os.listdir(path) below
            continue
        html = next((f for f in os.listdir(path) if f.endswith(".html")), None)
        if not html:
            continue
        try:
            rows.append(parse_html(os.path.join(path, html), folder))
        except Exception as e:
            print("error:", html, e)
    df = pd.DataFrame(rows).fillna("")
    if df.empty:
        print("The folder was empty / None parsed")
        return
    # put slug/poster first; guard against either column being entirely absent
    front = [c for c in ("slug", "poster_filename") if c in df.columns]
    cols = front + [c for c in df.columns if c not in ("slug", "poster_filename")]
    df = df[cols]
    os.makedirs(os.path.dirname(OUTPUT_TSV), exist_ok=True)
    # QUOTE_NONE keeps the TSV literal; escapechar handles embedded tabs/newlines
    df.to_csv(OUTPUT_TSV, sep="\t", index=False, quoting=csv.QUOTE_NONE,
              escapechar="\\")
    print(f"Wrote {len(df)} rows -> {OUTPUT_TSV}")


if __name__ == "__main__":
    main()
# scripts/extract_wiki_zim.py (changed hunk)
def sanitize_slug(slug):
    """Return a filesystem-safe slug (<= 200 chars); "_unknown" when the
    input slugifies to an empty string."""
    return slugify(slug, separator="_", max_length=200) or "_unknown"


# scripts/rank_cols.py
# Rank the columns of a TSV by how often they are non-empty.
import os
import csv
import sys
from collections import defaultdict
from tqdm import tqdm

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TSV_PATH = os.path.join(BASE_DIR, "../data/processed/spreadsheet/wikipedia_metadata3.tsv")
OUTPUT_PATH = os.path.join(BASE_DIR, "../data/processed/spreadsheet/rank_cols_output.txt")

csv.field_size_limit(min(sys.maxsize, 2**31 - 1))  # try to increase max buffer so it doesn't fail
# https://stackoverflow.com/questions/53538888/counting-csv-column-occurrences-on-the-fly-in-python


def main():
    """Count non-empty cells per column of TSV_PATH, print a fill-rate ranking,
    and mirror the report to OUTPUT_PATH."""
    lines = []

    def log(msg=""):
        # mirror every report line to stdout and to the output buffer
        print(msg)
        lines.append(str(msg))

    log(f"Reading: {TSV_PATH}")

    file_size = os.path.getsize(TSV_PATH)
    col_filled = defaultdict(int)
    row_count = 0

    with open(TSV_PATH, encoding="utf-8", buffering=4 * 1024 * 1024) as f:
        reader = csv.reader(f, delimiter="\t")
        # BUG FIX: bare next(reader) raises StopIteration on an empty file
        headers = next(reader, None)
        if headers is None:
            log("File is empty — no header row.")
            with open(OUTPUT_PATH, "w", encoding="utf-8") as out:
                out.write("\n".join(lines))
            return
        num_cols = len(headers)

        with tqdm(total=file_size, unit="B", unit_scale=True, unit_divisor=1024, desc="Processing") as pbar:
            for row in reader:
                row_count += 1
                # BUG FIX: a data row longer than the header row used to raise
                # IndexError on headers[i]; clip to the known columns.
                for i, val in enumerate(row[:num_cols]):
                    if val and val.strip():
                        col_filled[headers[i]] += 1
                pbar.update(sum(map(len, row)) + num_cols)  # approximate bytes consumed (progress bar)

    log(f"\nTotal rows: {row_count:,}")
    log(f"Total columns: {num_cols}\n")

    if row_count == 0:
        # BUG FIX: avoid ZeroDivisionError on a header-only file
        log("No data rows found.")
    else:
        ranked = sorted(
            headers,
            key=lambda c: col_filled.get(c, 0) / row_count,
            reverse=True,
        )

        log(f"{'#':<5} {'Column':<40} {'Filled':>10} {'Total':>10} {'Fill %':>8}")
        log("-" * 75)
        for i, col in enumerate(ranked, 1):
            filled = col_filled.get(col, 0)
            pct = filled / row_count * 100
            log(f"{i:<5} {col:<40} {filled:>10,} {row_count:>10,} {pct:>7.1f}%")

    with open(OUTPUT_PATH, "w", encoding="utf-8") as out:
        out.write("\n".join(lines))

    print(f"\nOutput written to: {OUTPUT_PATH}")


if __name__ == "__main__":
    main()