Revisions to Zim parsing and Netflix parsing, and updates to HTML scraping to include the synopsis

This commit is contained in:
prabhaavp
2026-03-19 01:56:14 -04:00
parent 0a70920ba9
commit 492160c3a3
13 changed files with 252 additions and 63 deletions

View File

@@ -1,11 +1,8 @@
from bs4 import BeautifulSoup
import os
# script_dir = os.path.dirname(os.path.abspath(__file__))
# file_path = os.path.join(script_dir, "..", "data", "tt0074888.html")
def extract_movie_info(file_path):
with open(file_path, "r", encoding="utf-8") as f:
html = f.read()
@@ -66,17 +63,25 @@ def extract_movie_info(file_path):
# -----------------------------
plot = ""
plot_header = soup.find(id="Plot")
plot_header = soup.find(id="Plot") or soup.find(id="Synopsis")
if plot_header:
current = plot_header.parent
for sibling in current.find_next_siblings():
if sibling.name == "div" and "mw-heading2" in sibling.get("class", []):
break
if sibling.name == "p":
plot += sibling.get_text(" ", strip=True) + " "
if not plot and content:
for el in content.find_all(["p", "div"], recursive=False):
if el.name == "div" and el.find(["h2"]):
break
if el.name == "p":
text = el.get_text(" ", strip=True)
if text:
plot += text + " "
plot = plot.strip()
return title, directed_by, cast, genre, plot, year, poster_filename