Revisions to Zim parsing, Netflix parsing, and updates to HTML scraping to include synopsis

2026-03-19 01:56:14 -04:00
parent 0a70920ba9
commit 492160c3a3
13 changed files with 252 additions and 63 deletions
--- a/scripts/scrape.py
+++ b/scripts/scrape.py
@@ -1,11 +1,8 @@
 from bs4 import BeautifulSoup
 import os

-# script_dir = os.path.dirname(os.path.abspath(__file__))
-# file_path = os.path.join(script_dir, "..", "data", "tt0074888.html")

 def extract_movie_info(file_path):
-
    with open(file_path, "r", encoding="utf-8") as f:
        html = f.read()

@@ -66,17 +63,25 @@ def extract_movie_info(file_path):
    # -----------------------------
    plot = ""

-    plot_header = soup.find(id="Plot")
+    plot_header = soup.find(id="Plot") or soup.find(id="Synopsis")

    if plot_header:
        current = plot_header.parent
-
        for sibling in current.find_next_siblings():
            if sibling.name == "div" and "mw-heading2" in sibling.get("class", []):
                break
            if sibling.name == "p":
                plot += sibling.get_text(" ", strip=True) + " "

+    if not plot and content:
+        for el in content.find_all(["p", "div"], recursive=False):
+            if el.name == "div" and el.find(["h2"]):
+                break
+            if el.name == "p":
+                text = el.get_text(" ", strip=True)
+                if text:
+                    plot += text + " "
+
    plot = plot.strip()

    return title, directed_by, cast, genre, plot, year, poster_filename