From 0ac1234afaf1c1fde69768e70db55cabc42adc0b Mon Sep 17 00:00:00 2001 From: prabhaavp Date: Tue, 10 Mar 2026 13:10:25 -0400 Subject: [PATCH] - Fix directories --- data/raw/wikipedia/.gitkeep | 0 scripts/extract_wiki_zim.py | 10 ++++-- scripts/scrape.py | 2 +- scripts/scrape_wiki.py | 69 +++++++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 3 deletions(-) create mode 100644 data/raw/wikipedia/.gitkeep create mode 100644 scripts/scrape_wiki.py diff --git a/data/raw/wikipedia/.gitkeep b/data/raw/wikipedia/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/scripts/extract_wiki_zim.py b/scripts/extract_wiki_zim.py index 27a2c81e3..df15ec220 100644 --- a/scripts/extract_wiki_zim.py +++ b/scripts/extract_wiki_zim.py @@ -6,7 +6,7 @@ import csv BASE_DIR = os.path.dirname(os.path.abspath(__file__)) INPUT_TSV = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/imdb_datasets/title.basics.test.tsv")) -OUTPUT_DIR = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/wikipedia/wikipedia_html")) +OUTPUT_DIR = os.path.abspath(os.path.join(BASE_DIR, "../data/processed/wikipedia_html")) ZIM_PATH = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/wikipedia/wikipedia_en_all_maxi_2025-08.zim")) os.makedirs(OUTPUT_DIR, exist_ok=True) @@ -53,16 +53,22 @@ with open(INPUT_TSV, encoding="utf-8") as f: tconst = row["tconst"] title = row["primaryTitle"] year = row["startYear"] + titleType = row["titleType"] + if year == "\\N" or titleType != "movie": + print("Skipping from TSV: ", title) + continue # folder for each movie movie_dir = os.path.join(OUTPUT_DIR, tconst) os.makedirs(movie_dir, exist_ok=True) outfile = os.path.join(movie_dir, f"{tconst}.html") if os.path.exists(outfile): continue - query = f"{title} {year}" if year != "\\N" else title #if year not empty + query = f"{title} ({year} film)" if year != "\\N" else title #if year not empty print(f"fetching Wikipedia HTML + images for {tconst}: {query}") html_with_images = 
fetch_wikipedia_html_with_images(query, movie_dir) if html_with_images: + if "Directed by" not in html_with_images: + continue with open(outfile, "w", encoding="utf-8") as out: out.write(html_with_images) else: diff --git a/scripts/scrape.py b/scripts/scrape.py index fb9f976ac..9559bf152 100644 --- a/scripts/scrape.py +++ b/scripts/scrape.py @@ -19,7 +19,7 @@ response = requests.get(url, headers=headers, params=params) print("Status:", response.status_code) print("Content-Type:", response.headers.get("content-type")) -print("First 200 chars:\n", response.text[:200]) +print("Response text:\n", response.text) data = response.json() diff --git a/scripts/scrape_wiki.py b/scripts/scrape_wiki.py new file mode 100644 index 000000000..8c7c1e7bd --- /dev/null +++ b/scripts/scrape_wiki.py @@ -0,0 +1,69 @@ +import csv +import os +import requests +from time import sleep + +HEADERS = {"User-Agent": "cse881"} +SEARCH_URL = "https://en.wikipedia.org/w/api.php" +BASE_URL = "https://en.wikipedia.org/api/rest_v1" +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) + +INPUT_TSV = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/imdb_datasets/title.basics.test.tsv")) +OUTPUT_DIR = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/wikipedia/wikipedia_html")) + +os.makedirs(OUTPUT_DIR, exist_ok=True) + +def fetch_wikipedia_html(query): + params = { + "action": "query", + "list": "search", + "srsearch": query, + "format": "json" + } + + resp = requests.get(SEARCH_URL, params=params, headers=HEADERS).json() + results = resp.get("query", {}).get("search", []) + + if not results: + return None + + best_title = results[0]["title"] + wiki_title = best_title.replace(" ", "_") + html_url = f"{BASE_URL}/page/html/{wiki_title}" + r = requests.get(html_url, headers=HEADERS) + + if r.status_code != 200: + return None + return r.text + + +with open(INPUT_TSV, encoding="utf-8") as f: + print("Opened file:", INPUT_TSV) + print("First 500 chars:") + print(f.read(500)) + f.seek(0) + + reader = 
csv.DictReader(f, delimiter="\t") + for row in reader: + tconst = row["tconst"] + title = row["primaryTitle"] + year = row["startYear"] + outfile = os.path.join(OUTPUT_DIR, f"{tconst}.html") + print(outfile) + + if os.path.exists(outfile): + print(f"Skipping {tconst}: {title}") + continue #if exists, skip + + query = f"{title} {year}" if year != "\\N" else title + print(f"Fetching Wikipedia for {tconst}: {query}") + html = fetch_wikipedia_html(query) + if html: + with open(outfile, "w", encoding="utf-8") as out: + out.write(html) + else: + print(f"No Wikipedia page found") + sleep(0.5) +print("Completed") + +#https://en.wikipedia.org/w/index.php?api=wmf-restbase&title=Special%3ARestSandbox#/Page%20content/get_page_summary__title_