BeautifulSoup extract code

2026-03-12 12:11:37 -04:00
parent 8fa2cdba3c
commit a1beba6730
6 changed files with 437 additions and 22 deletions
--- a/scripts/dataset_create.py
+++ b/scripts/dataset_create.py
@@ -0,0 +1,24 @@
+import pandas as pd
+import os
+from scrape import extract_movie_info
+
+# Resolve all paths relative to this script so the working directory does not matter.
+script_dir = os.path.dirname(os.path.abspath(__file__))
+file_path = os.path.join(script_dir, "..", "sample_data.xlsx")
+movie_html = os.path.join(script_dir, "..", "data", "tt0074888.html")
+output_path = os.path.join(script_dir, "..", "updated_data.xlsx")
+
+# Load the existing sheet and show its columns.
+movie_data = pd.read_excel(file_path)
+print(movie_data.columns)
+
+# Scrape one saved page and append it as a new row (keys presumably match the sheet's columns - TODO confirm).
+title, directed_by, cast, genre, plot = extract_movie_info(movie_html)
+movie_data.loc[len(movie_data)] = {
+    "Movie": title,
+    "Director": directed_by,
+    "Cast": ", ".join(cast),
+    "Genre": genre,
+    "Plot": plot,
+}
+movie_data.to_excel(output_path, index=False)
--- a/scripts/scrape.py
+++ b/scripts/scrape.py
@@ -1,32 +1,89 @@
-import requests
+from bs4 import BeautifulSoup
+import os

-url = "https://en.wikipedia.org/w/api.php"
+script_dir = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
+file_path = os.path.join(script_dir, "..", "data", "tt0074888.html")  # sample page passed to extract_movie_info below

-headers = {
-    "User-Agent": "CSE881-MovieProject/1.0 (ishaa@msu.edu)"
-}
+def extract_movie_info(file_path):
+    """Parse a saved Wikipedia-style (MediaWiki) film page and pull out its key fields.
+
+    Args:
+        file_path: Path to the UTF-8 encoded HTML file to parse.
+
+    Returns:
+        Tuple ``(title, directed_by, cast, genre, plot)``. ``title``,
+        ``directed_by`` and ``genre`` are ``None`` when not found; ``cast``
+        is a list of at most five strings; ``plot`` is ``""`` when no plot
+        paragraphs are found. ``genre`` is simply the text of the first
+        ``<p>`` inside ``div#mw-content-text``.
+    """

-params = {
-    "action": "query",
-    "format": "json",
-    "titles": "Godfather",
-    "prop": "extracts",
-    "explaintext": True,
-    "redirects": 1
-}
+    with open(file_path, "r", encoding="utf-8") as f:
+        html = f.read()

-response = requests.get(url, headers=headers, params=params)
+    soup = BeautifulSoup(html, "lxml")

-print("Status:", response.status_code)
-print("Content-Type:", response.headers.get("content-type"))
-print("First 200 chars:\n", response.text[:1000])
+    # Title: text of the first <h1>.
+    title_tag = soup.find("h1")
+    title = title_tag.get_text(strip=True) if title_tag else None

-data = response.json()
+    # "Genre": the whole first paragraph of the article body is used as-is.
+    genre = None

+    content = soup.find("div", id="mw-content-text")
+    if content:
+        first_paragraph = content.find("p")
+        if first_paragraph:
+            genre = first_paragraph.get_text(" ", strip=True)

+    # Infobox rows of interest: "Directed by" and "Starring".
+    infobox = soup.find("table", class_="infobox")

-pages = data["query"]["pages"]
-page = next(iter(pages.values()))
+    directed_by = None
+    cast = []

-print("\nTitle:", page["title"])
-print("\nPreview:\n", page["extract"])
+    if infobox:
+        for row in infobox.find_all("tr"):
+            header = row.find("th")
+            data = row.find("td")
+            if not header or not data:
+                continue
+
+            header_text = header.get_text(" ", strip=True)
+            if header_text == "Directed by":
+                directed_by = data.get_text(" ", strip=True)
+            elif header_text == "Starring":
+                # One string per text node (names split by <br> or links);
+                # keep the first five.
+                cast = list(data.stripped_strings)[:5]
+
+    # Plot: paragraphs between the "Plot" heading and the next h2 section.
+    plot_parts = []
+    plot_header = soup.find(id="Plot")
+    if plot_header:
+        # Walk the siblings that follow the element containing id="Plot".
+        for sibling in plot_header.parent.find_next_siblings():
+            # Stop at the next h2 section: either the
+            # <div class="mw-heading2"> wrapper the original code looked for,
+            # or a bare <h2>, as on pages where id="Plot" sits inside the <h2>.
+            classes = sibling.get("class", [])
+            if sibling.name == "h2":
+                break
+            if sibling.name == "div" and "mw-heading2" in classes:
+                break
+            if sibling.name == "p":
+                plot_parts.append(sibling.get_text(" ", strip=True))
+
+    # Joining once gives the same text as repeated `+=` followed by strip().
+    plot = " ".join(plot_parts).strip()
+    return title, directed_by, cast, genre, plot
+
+# Print results only when run as a script, so that importing this module
+# (scripts/dataset_create.py does `from scrape import extract_movie_info`) has no side effects.
+if __name__ == "__main__":
+    title, directed_by, cast, genre, plot = extract_movie_info(file_path)
+    print("Title:", title)
+    print("Directed by:", directed_by)
+    print("Cast:", cast)
+    print("Genre:", genre)
+    print("\nPlot:\n", plot)