97 lines
2.8 KiB
Python
97 lines
2.8 KiB
Python
from bs4 import BeautifulSoup
|
|
import os
|
|
|
|
|
|
def extract_movie_info(file_path):
    """Extract movie details from a saved HTML article.

    The selectors used here (``mw-content-text``, ``infobox``,
    ``mw-heading2``) suggest the input is a MediaWiki/Wikipedia page;
    NOTE(review): confirm against the caller.

    Args:
        file_path: Path to an HTML file, read as UTF-8.

    Returns:
        A 7-tuple ``(title, directed_by, cast, genre, plot, year,
        poster_filename)``:

        * ``title`` - text of the first ``<h1>``, or ``None``.
        * ``directed_by`` - text of the infobox "Directed by" cell, or
          ``None``.
        * ``cast`` - up to five strings from the infobox "Starring" cell
          (empty list when absent).
        * ``genre`` - full text of the first ``<p>`` inside
          ``div#mw-content-text``, or ``None``.
        * ``plot`` - paragraphs of the "Plot"/"Synopsis" section joined by
          spaces; falls back to the leading top-level paragraphs of the
          content div; ``""`` when nothing is found.
        * ``year`` - raw text of the infobox cell whose header contains
          "Release date" (the whole cell text, not just a year), or
          ``None``.
        * ``poster_filename`` - basename of the first infobox image's
          ``src``, or ``None``.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        html = f.read()

    soup = BeautifulSoup(html, "lxml")

    # -----------------------------
    # Title
    # -----------------------------
    title_tag = soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else None

    # -----------------------------
    # Genre (first paragraph of the article body)
    # -----------------------------
    genre = None
    content = soup.find("div", id="mw-content-text")
    if content:
        first_paragraph = content.find("p")
        if first_paragraph:
            genre = first_paragraph.get_text(" ", strip=True)

    # -----------------------------
    # Infobox: poster, Directed by, Release date, Starring
    # -----------------------------
    directed_by = None
    cast = []
    poster_filename = None
    year = None

    infobox = soup.find("table", class_="infobox")
    if infobox:
        img = infobox.select_one("img")
        if img and img.get("src"):
            poster_filename = os.path.basename(img["src"])

        for row in infobox.find_all("tr"):
            header = row.find("th")
            data = row.find("td")
            # Only label/value rows (both <th> and <td>) carry fields.
            if not header or not data:
                continue

            header_text = header.get_text(" ", strip=True)
            if header_text == "Directed by":
                directed_by = data.get_text(" ", strip=True)
            elif "Release date" in header_text:
                # Kept as the full cell text; callers receive it as `year`.
                year = data.get_text(" ", strip=True)
            elif header_text == "Starring":
                # Keep at most the first five text fragments of the cell.
                cast = list(data.stripped_strings)[:5]

    # -----------------------------
    # Plot section
    # -----------------------------
    plot_parts = []

    plot_header = soup.find(id="Plot") or soup.find(id="Synopsis")
    if plot_header:
        # The id may sit on an element inside either a
        # <div class="mw-heading2"> wrapper or a bare <h2>; in both cases
        # the section body consists of the siblings of that parent.
        for sibling in plot_header.parent.find_next_siblings():
            is_wrapped_heading = (
                sibling.name == "div"
                and "mw-heading2" in sibling.get("class", [])
            )
            # Stop at the next top-level section heading in either markup
            # style, otherwise later sections leak into the plot.
            if is_wrapped_heading or sibling.name == "h2":
                break
            if sibling.name == "p":
                plot_parts.append(sibling.get_text(" ", strip=True))

    if not plot_parts and content:
        # Fallback: take the direct-child paragraphs of the content div that
        # appear before the first direct-child div containing an <h2>.
        for el in content.find_all(["p", "div"], recursive=False):
            if el.name == "div" and el.find(["h2"]):
                break
            if el.name == "p":
                text = el.get_text(" ", strip=True)
                if text:
                    plot_parts.append(text)

    plot = " ".join(plot_parts).strip()

    return title, directed_by, cast, genre, plot, year, poster_filename
|
|
|
|
# # -----------------------------
|
|
# # Print results
|
|
# # -----------------------------
|
|
# title, directed_by, cast, genre, plot, year, poster_filename = extract_movie_info(file_path)
|
|
# print("Title:", title)
|
|
# print("Directed by:", directed_by)
|
|
# print("Cast:", cast)
|
|
# print("Genre:", genre)
|
|
# print("\nPlot:\n", plot) |