- Html -> TSV
This commit is contained in:
@@ -6,3 +6,4 @@ dtale~=3.19.1
|
|||||||
requests~=2.32.5
|
requests~=2.32.5
|
||||||
beautifulsoup4~=4.14.3
|
beautifulsoup4~=4.14.3
|
||||||
libzim~=3.8.0
|
libzim~=3.8.0
|
||||||
|
python-slugify~=8.0.4
|
||||||
115
scripts/extract_wiki_html.py
Normal file
115
scripts/extract_wiki_html.py
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
import csv
|
||||||
|
import pandas as pd
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# Resolve all paths relative to this script's directory so the tool works
# regardless of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Folder of per-title subdirectories, each holding one scraped Wikipedia HTML page.
INPUT_DIR = os.path.join(BASE_DIR, "../data/processed/wikipedia_html")
# Destination TSV for the extracted metadata.
OUTPUT_TSV = os.path.join(BASE_DIR, "../data/processed/spreadsheet/wikipedia_metadata4.tsv")

# Only these row keys survive into the output TSV; everything else scraped
# from the infobox / section headings is dropped by parse_html's final filter.
# NOTE(review): keys are case-sensitive and must match Wikipedia's infobox
# label text exactly (e.g. "Directed by", not "directed by").
WHITELIST = {
    "slug",
    "title",
    "poster_filename",
    "Directed by",
    "Produced by",
    "Written by",
    "Starring",
    "Release date",
    "Running time",
    "Country",
    "Language",
    "Budget",
    "Box office",
    "Plot"
}
|
||||||
|
|
||||||
|
def clean(el):
    """Flatten a BeautifulSoup node into a single whitespace-normalised line.

    ``<br>`` tags are rewritten as ``" | "`` separators before text
    extraction, then every run of whitespace collapses to one space.
    Falsy input (``None`` or an empty node) yields the empty string.
    """
    if not el:
        return ""
    # Make explicit line breaks visible as a field separator in the flat text.
    for line_break in el.find_all("br"):
        line_break.replace_with(" | ")
    flat = el.get_text(" ", strip=True)
    return re.sub(r"\s+", " ", flat).strip()
|
||||||
|
|
||||||
|
def parse_html(path, slug):
    """Parse one scraped Wikipedia film page into a flat metadata dict.

    Extracts the page title, the infobox label/value pairs, the poster image
    filename, and the text of each top-level section.  Lead paragraphs that
    appear before the first heading are prepended to the "Plot" value.
    Only keys present in WHITELIST are returned.

    path: filesystem path to the saved HTML file.
    slug: identifier for the title (the containing folder name).
    """
    with open(path, encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")
    row = {"slug": slug}
    # Page title from the standard MediaWiki heading element.
    h1 = soup.select_one("h1.firstHeading")
    if h1:
        row["title"] = h1.get_text(strip=True)
    else:
        row["title"] = ""
    # infobox
    infobox = soup.select_one("table.infobox")
    if infobox:
        # First <img> in the infobox is assumed to be the poster —
        # TODO(review): confirm this holds for all saved pages.
        img = infobox.select_one("img")
        if img and img.get("src"):
            row["poster_filename"] = os.path.basename(img["src"])
        else:
            row["poster_filename"] = ""
        # Label/value pairs, e.g. "Directed by" -> "...".
        for tr in infobox.select("tr"):
            th = tr.select_one(".infobox-label")
            td = tr.select_one(".infobox-data")
            if th and td:
                row[clean(th)] = clean(td)
    # sections
    content = soup.select_one(".mw-parser-output")
    if not content:
        # No article body at all — return whatever metadata we have so far.
        return {k: v for k, v in row.items() if k in WHITELIST}
    # Sections we never want to capture (compared lowercase).
    skip = {"references", "external links", "see also"}
    current = None   # heading of the section being accumulated, or None
    lead = []        # paragraphs appearing before the first heading
    for el in content.children:
        # Modern MediaWiki wraps headings in <div class="mw-heading">.
        if getattr(el, "name", None) == "div" and "mw-heading" in el.get("class", []):
            h = el.find(["h2", "h3", "h4", "h5", "h6"])  # assuming no more than first 6 headers need to be looked at
            if h:
                title = clean(h)
                if title.lower() in skip:
                    current = None
                else:
                    current = title
                    if current:
                        # Start a fresh accumulator for this section.
                        row[current] = ""
            continue
        if not current:
            # Before the first heading: collect lead paragraphs only.
            if getattr(el, "name", None) == "p":
                text = clean(el)
                if text:
                    lead.append(text)
            continue
        # Inside a section: accumulate block-level content as flat text.
        if el.name in ["p", "ul", "ol", "table"]:
            text = clean(el)
            if text:
                row[current] += text
    # Fold the lead paragraphs into "Plot" (prepended if Plot already exists).
    if lead:
        if row.get("Plot"):
            row["Plot"] = " | ".join(lead) + " | " + row["Plot"]
        else:
            row["Plot"] = " | ".join(lead)
    return {k: v for k, v in row.items() if k in WHITELIST}
|
||||||
|
|
||||||
|
def main():
    """Walk INPUT_DIR, parse each per-title HTML dump, and write one TSV.

    Each immediate subdirectory of INPUT_DIR is expected to hold a single
    ``*.html`` file; the directory name is used as the row's slug.  Parse
    failures are reported and skipped so one bad page cannot abort the run.
    """
    rows = []
    for folder in os.listdir(INPUT_DIR):
        path = os.path.join(INPUT_DIR, folder)
        # Bugfix: a stray regular file in INPUT_DIR used to crash
        # os.listdir(path) with NotADirectoryError — only descend into dirs.
        if not os.path.isdir(path):
            continue
        html = next((f for f in os.listdir(path) if f.endswith(".html")), None)
        if not html:
            continue
        try:
            rows.append(parse_html(os.path.join(path, html), folder))
        except Exception as e:
            # Best-effort batch job: report the failing page and keep going.
            print("error:", html, e)
    df = pd.DataFrame(rows).fillna("")
    if df.empty:
        print("The folder was empty / None parsed")
        return
    # Pin the identifying columns first; keep the rest in discovery order.
    cols = ["slug", "poster_filename"] + [c for c in df.columns if c not in ("slug", "poster_filename")]
    df = df[cols]
    os.makedirs(os.path.dirname(OUTPUT_TSV), exist_ok=True)
    # QUOTE_NONE + backslash escaping keeps the TSV free of quote characters.
    df.to_csv(OUTPUT_TSV, sep="\t", index=False, quoting=csv.QUOTE_NONE, escapechar="\\")
    print(f"Wrote {len(df)} rows -> {OUTPUT_TSV}")


if __name__ == "__main__":
    main()
|
||||||
@@ -5,6 +5,7 @@ import os
|
|||||||
from libzim.reader import Archive
|
from libzim.reader import Archive
|
||||||
from libzim.search import Query, Searcher
|
from libzim.search import Query, Searcher
|
||||||
import csv
|
import csv
|
||||||
|
from slugify import slugify
|
||||||
|
|
||||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||||
INPUT_TSV = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/imdb_datasets/title.basics.tsv"))
|
INPUT_TSV = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/imdb_datasets/title.basics.tsv"))
|
||||||
@@ -16,10 +17,9 @@ zim = Archive(ZIM_PATH)
|
|||||||
searcher = Searcher(zim)
|
searcher = Searcher(zim)
|
||||||
print("The Zim file is now opened")
|
print("The Zim file is now opened")
|
||||||
|
|
||||||
|
|
||||||
def sanitize_slug(slug):
    """Return a filesystem-safe, length-capped slug for *slug*.

    NOTE(review): reconstructed from the diff — this is the post-change
    version that delegates to python-slugify instead of hand-rolled regexes.
    """
    # Neutralise path separators first so slugify never sees them as structure.
    slug = slug.replace("/", "_").replace("\\", "_")
    # slugify strips anything unsafe; fall back to a placeholder when the
    # result is empty (e.g. the title was entirely punctuation).
    return slugify(slug, separator="_", max_length=200) or "_unknown"
|
|
||||||
|
|
||||||
#Fetch the html AND the images and put them in a folder
|
#Fetch the html AND the images and put them in a folder
|
||||||
def fetch_wikipedia_html_with_images(query, save_dir):
|
def fetch_wikipedia_html_with_images(query, save_dir):
|
||||||
|
|||||||
63
scripts/rank_cols.py
Normal file
63
scripts/rank_cols.py
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
import os
|
||||||
|
import csv
|
||||||
|
import sys
|
||||||
|
from collections import defaultdict
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
# Resolve paths relative to this script so it works from any CWD.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Input spreadsheet whose column fill-rates we rank.
TSV_PATH = os.path.join(BASE_DIR, "../data/processed/spreadsheet/wikipedia_metadata3.tsv")
# Plain-text report destination.
OUTPUT_PATH = os.path.join(BASE_DIR, "../data/processed/spreadsheet/rank_cols_output.txt")

# Raise the csv field-size cap so huge cells don't abort the read; capped at
# 2**31 - 1 because field_size_limit rejects values above a C long on some
# platforms (notably Windows).
csv.field_size_limit(min(sys.maxsize, 2**31 - 1))  # try to increase max buffer so it doesn't fail
# https://stackoverflow.com/questions/53538888/counting-csv-column-occurrences-on-the-fly-in-python
|
||||||
|
|
||||||
|
def main():
    """Rank the TSV's columns by fill rate and write a report to OUTPUT_PATH.

    Streams TSV_PATH once, counting non-blank cells per column, then prints
    (and saves) the columns sorted by descending fill percentage.
    """
    lines = []

    def log(msg=""):
        # Echo to stdout and buffer the same line for the report file.
        print(msg)
        lines.append(str(msg))

    log(f"Reading: {TSV_PATH}")

    file_size = os.path.getsize(TSV_PATH)
    col_filled = defaultdict(int)
    row_count = 0

    with open(TSV_PATH, encoding="utf-8", buffering=4 * 1024 * 1024) as f:
        reader = csv.reader(f, delimiter="\t")
        # Bugfix: a completely empty file used to raise StopIteration here.
        headers = next(reader, None)
        if headers is None:
            print("Input file is empty - nothing to rank")
            return
        num_cols = len(headers)

        with tqdm(total=file_size, unit="B", unit_scale=True, unit_divisor=1024, desc="Processing") as pbar:
            for row in reader:
                row_count += 1
                for i, val in enumerate(row):
                    if val and val.strip():
                        col_filled[headers[i]] += 1
                # Approximate progress: character count plus per-field
                # overhead (undercounts multi-byte UTF-8; fine for a bar).
                pbar.update(sum(map(len, row)) + num_cols)  # progress bar

    log(f"\nTotal rows: {row_count:,}")
    log(f"Total columns: {num_cols}\n")

    # Bugfix: guard the fill-rate division when the TSV has a header row
    # but zero data rows (previously ZeroDivisionError).
    if row_count == 0:
        log("No data rows - nothing to rank")
    else:
        ranked = sorted(
            headers,
            key=lambda c: col_filled.get(c, 0) / row_count,
            reverse=True,
        )

        log(f"{'#':<5} {'Column':<40} {'Filled':>10} {'Total':>10} {'Fill %':>8}")
        log("-" * 75)
        for i, col in enumerate(ranked, 1):
            filled = col_filled.get(col, 0)
            pct = filled / row_count * 100
            log(f"{i:<5} {col:<40} {filled:>10,} {row_count:>10,} {pct:>7.1f}%")

    with open(OUTPUT_PATH, "w", encoding="utf-8") as out:
        out.write("\n".join(lines))

    print(f"\nOutput written to: {OUTPUT_PATH}")


if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user