# datamining_881/scripts/scrape.py

from bs4 import BeautifulSoup
import os


def _parse_infobox(infobox):
    """Read director, cast, release date and poster file name from an infobox.

    Returns a ``(directed_by, cast, year, poster_filename)`` tuple. A field
    whose row (or image) is missing keeps its empty default (``None`` or,
    for the cast, an empty list).
    """
    directed_by = None
    cast = []
    year = None
    poster_filename = None

    # The first <img> inside the infobox is used as the poster; only the
    # last path component of its ``src`` is kept.
    img = infobox.select_one("img")
    if img is not None and img.get("src"):
        poster_filename = os.path.basename(img["src"])

    for row in infobox.find_all("tr"):
        header = row.find("th")
        data = row.find("td")
        # Rows without both a label cell and a value cell carry no field.
        if header is None or data is None:
            continue

        header_text = header.get_text(" ", strip=True)
        if header_text == "Directed by":
            directed_by = data.get_text(" ", strip=True)
        elif "Release date" in header_text:
            # Substring match so plural/annotated labels are accepted too.
            year = data.get_text(" ", strip=True)
        elif header_text == "Starring":
            # Keep at most the first five text fragments of the cell.
            cast = list(data.stripped_strings)[:5]

    return directed_by, cast, year, poster_filename


def _section_text(heading_anchor):
    """Join the paragraphs that follow a section heading, up to the next one.

    ``heading_anchor`` is the element carrying the section id; its parent is
    the heading container whose following siblings form the section body.
    """
    parts = []
    for sibling in heading_anchor.parent.find_next_siblings():
        # Stop at the next top-level section. Newer markup wraps headings in
        # <div class="mw-heading2">; older markup uses a bare <h2>, which
        # previously was not recognised and let the scan run to the end of
        # the page.
        if sibling.name == "h2":
            break
        if sibling.name == "div" and "mw-heading2" in sibling.get("class", []):
            break
        if sibling.name == "p":
            text = sibling.get_text(" ", strip=True)
            if text:
                parts.append(text)
    return " ".join(parts)


def _lead_text(content):
    """Join the paragraphs that precede the first section heading."""
    # NOTE(review): MediaWiki pages normally nest the article body in a
    # <div class="mw-parser-output"> inside #mw-content-text; when that
    # wrapper exists its children are scanned, otherwise the direct children
    # of ``content`` are used as before.
    container = content.find("div", class_="mw-parser-output") or content

    parts = []
    for el in container.find_all(["p", "div", "h2"], recursive=False):
        # The lead ends at the first heading, bare or wrapped in a <div>.
        if el.name == "h2":
            break
        if el.name == "div" and el.find("h2") is not None:
            break
        if el.name == "p":
            text = el.get_text(" ", strip=True)
            if text:
                parts.append(text)
    return " ".join(parts)


def extract_movie_info(file_path):
    """Extract movie metadata from a saved MediaWiki-style HTML page.

    Args:
        file_path: Path of a UTF-8 encoded HTML file.

    Returns:
        A 7-tuple ``(title, directed_by, cast, genre, plot, year,
        poster_filename)``:

        * ``title`` - text of the first ``<h1>``, or ``None``.
        * ``directed_by`` - text of the infobox "Directed by" row, or
          ``None``.
        * ``cast`` - up to five text fragments from the infobox "Starring"
          row (empty list when absent).
        * ``genre`` - text of the first non-empty paragraph inside
          ``#mw-content-text`` (``""`` if every paragraph is empty,
          ``None`` if there is none).
        * ``plot`` - paragraphs of the "Plot" (or "Synopsis") section; when
          that yields nothing, the paragraphs before the first heading.
        * ``year`` - raw text of the infobox "Release date" row (it is not
          reduced to a four-digit year), or ``None``.
        * ``poster_filename`` - base name of the first infobox image's
          ``src``, or ``None``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "lxml")

    # -----------------------------
    # Title
    # -----------------------------
    title_tag = soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag is not None else None

    # -----------------------------
    # Genre (first non-empty paragraph)
    # -----------------------------
    content = soup.find("div", id="mw-content-text")

    genre = None
    if content is not None:
        # Pages may start with an empty placeholder <p>; skip those so the
        # result is the first paragraph that actually has text.
        for paragraph in content.find_all("p"):
            genre = paragraph.get_text(" ", strip=True)
            if genre:
                break

    # -----------------------------
    # Infobox: director, cast, release date, poster
    # -----------------------------
    directed_by, cast, year, poster_filename = None, [], None, None
    infobox = soup.find("table", class_="infobox")
    if infobox is not None:
        directed_by, cast, year, poster_filename = _parse_infobox(infobox)

    # -----------------------------
    # Plot section, falling back to the lead paragraphs
    # -----------------------------
    plot = ""
    plot_header = soup.find(id="Plot") or soup.find(id="Synopsis")
    if plot_header is not None:
        plot = _section_text(plot_header)

    # ``plot`` is empty (never whitespace-only) when the section had no
    # text, so the fallback reliably runs in that case.
    if not plot and content is not None:
        plot = _lead_text(content)

    return title, directed_by, cast, genre, plot, year, poster_filename

# # -----------------------------
# # Print results
# # -----------------------------
# title, directed_by, cast, genre, plot, year, poster_filename = extract_movie_info(file_path)
# print("Title:", title)
# print("Directed by:", directed_by)
# print("Cast:", cast)
# print("Genre:", genre)
# print("\nPlot:\n", plot)