- Html -> TSV
This commit is contained in:
115
scripts/extract_wiki_html.py
Normal file
115
scripts/extract_wiki_html.py
Normal file
@@ -0,0 +1,115 @@
|
||||
import os
|
||||
import re
|
||||
import csv
|
||||
import pandas as pd
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Paths are resolved relative to this script's location so the tool
# works regardless of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_DIR = os.path.join(BASE_DIR, "../data/processed/wikipedia_html")
OUTPUT_TSV = os.path.join(BASE_DIR, "../data/processed/spreadsheet/wikipedia_metadata4.tsv")

# Columns allowed into the output TSV. parse_html filters its result
# against this set, so infobox labels and section headings not listed
# here are silently dropped.
WHITELIST = {
    "slug",
    "title",
    "poster_filename",
    "Directed by",
    "Produced by",
    "Written by",
    "Starring",
    "Release date",
    "Running time",
    "Country",
    "Language",
    "Budget",
    "Box office",
    "Plot"
}
|
||||
|
||||
def clean(el):
    """Flatten an element's text into one single-spaced string.

    Explicit ``<br>`` tags are turned into a visible ``" | "`` delimiter
    before text extraction. A falsy element yields the empty string.
    """
    if not el:
        return ""
    # Make line breaks survive get_text() as an explicit delimiter.
    for line_break in el.find_all("br"):
        line_break.replace_with(" | ")
    raw_text = el.get_text(" ", strip=True)
    return re.sub(r"\s+", " ", raw_text).strip()
|
||||
|
||||
def parse_html(path, slug):
    """Parse one saved Wikipedia article into a flat metadata dict.

    Extracts the page title, poster image filename, infobox label/data
    pairs (e.g. "Directed by"), and per-section body text. Paragraphs
    appearing before the first heading (the lead) are folded into the
    "Plot" entry. Only keys present in WHITELIST are returned.

    Args:
        path: Filesystem path to the article's HTML file.
        slug: Identifier for the article (its folder name).

    Returns:
        dict mapping whitelisted column names to cleaned text.
    """
    with open(path, encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    row = {"slug": slug}

    h1 = soup.select_one("h1.firstHeading")
    row["title"] = h1.get_text(strip=True) if h1 else ""

    # Infobox: poster image plus label/data rows.
    infobox = soup.select_one("table.infobox")
    if infobox:
        img = infobox.select_one("img")
        if img and img.get("src"):
            row["poster_filename"] = os.path.basename(img["src"])
        else:
            row["poster_filename"] = ""
        for tr in infobox.select("tr"):
            th = tr.select_one(".infobox-label")
            td = tr.select_one(".infobox-data")
            if th and td:
                row[clean(th)] = clean(td)

    # Body sections, keyed by their headings.
    content = soup.select_one(".mw-parser-output")
    if not content:
        return {k: v for k, v in row.items() if k in WHITELIST}

    skip = {"references", "external links", "see also"}
    current = None
    lead = []
    for el in content.children:
        # BUG FIX: .children yields NavigableString text nodes, which have
        # no .name attribute; the original accessed el.name directly in the
        # final branch and could raise AttributeError. Resolve it once,
        # defensively, and reuse it everywhere.
        name = getattr(el, "name", None)
        if name == "div" and "mw-heading" in el.get("class", []):
            # Assuming no deeper than h6 headings need to be looked at.
            h = el.find(["h2", "h3", "h4", "h5", "h6"])
            if h:
                title = clean(h)
                current = None if title.lower() in skip else title
                if current:
                    row[current] = ""
            continue
        if not current:
            # Before the first heading: collect lead paragraphs for "Plot".
            if name == "p":
                text = clean(el)
                if text:
                    lead.append(text)
            continue
        if name in ("p", "ul", "ol", "table"):
            text = clean(el)
            if text:
                # BUG FIX: insert a space between successive elements so
                # the last word of one paragraph doesn't fuse with the
                # first word of the next.
                row[current] += (" " if row[current] else "") + text

    if lead:
        if row.get("Plot"):
            row["Plot"] = " | ".join(lead) + " | " + row["Plot"]
        else:
            row["Plot"] = " | ".join(lead)

    return {k: v for k, v in row.items() if k in WHITELIST}
|
||||
|
||||
def main():
    """Walk INPUT_DIR, parse each article folder, and write OUTPUT_TSV."""
    rows = []
    # Sort for a deterministic, reproducible row order (os.listdir order
    # is filesystem-dependent).
    for folder in sorted(os.listdir(INPUT_DIR)):
        path = os.path.join(INPUT_DIR, folder)
        # BUG FIX: skip stray files (e.g. .DS_Store) — os.listdir on a
        # non-directory would raise NotADirectoryError and abort the run.
        if not os.path.isdir(path):
            continue
        html = next((f for f in os.listdir(path) if f.endswith(".html")), None)
        if not html:
            continue
        try:
            rows.append(parse_html(os.path.join(path, html), folder))
        except Exception as e:
            # Best-effort: report the failing file and keep going.
            print("error:", html, e)

    df = pd.DataFrame(rows).fillna("")
    if df.empty:
        print("The folder was empty / None parsed")
        return

    # Put the identifying columns first, keep the rest in discovered order.
    # BUG FIX: only front-load columns that actually exist — if no article
    # had an infobox, "poster_filename" is absent and df[cols] would raise
    # a KeyError.
    lead_cols = [c for c in ("slug", "poster_filename") if c in df.columns]
    cols = lead_cols + [c for c in df.columns if c not in ("slug", "poster_filename")]
    df = df[cols]

    os.makedirs(os.path.dirname(OUTPUT_TSV), exist_ok=True)
    # QUOTE_NONE + escapechar keeps the TSV free of quote wrapping.
    df.to_csv(OUTPUT_TSV, sep="\t", index=False, quoting=csv.QUOTE_NONE, escapechar="\\")
    print(f"Wrote {len(df)} rows -> {OUTPUT_TSV}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user