beautifulsoup extract code

This commit is contained in:
IshaAtteri
2026-03-12 12:11:37 -04:00
parent 8fa2cdba3c
commit a1beba6730
6 changed files with 437 additions and 22 deletions

24
scripts/dataset_create.py Normal file
View File

@@ -0,0 +1,24 @@
import pandas as pd
import os
from scrape import extract_movie_info
# Resolve every path relative to the parent of this script's directory so
# the script behaves the same regardless of the current working directory.
script_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.join(script_dir, "..")

# Load the existing spreadsheet and show which columns it provides.
file_path = os.path.join(parent_dir, "sample_data.xlsx")
movie_data = pd.read_excel(file_path)
print(movie_data.columns)

# Parse one saved article page into its individual fields.
movie_html = os.path.join(parent_dir, "data", "tt0074888.html")
title, directed_by, cast, genre, plot = extract_movie_info(movie_html)

# Build the new record; the cast list is flattened to one comma-separated
# string so it fits in a single spreadsheet cell.
row_fields = ("Movie", "Director", "Cast", "Genre", "Plot")
row_values = (title, directed_by, ", ".join(cast), genre, plot)
new_row = dict(zip(row_fields, row_values))

# Append the record after the current last row and write the result to a
# separate workbook, leaving the input file untouched.
movie_data.loc[len(movie_data)] = new_row
output_path = os.path.join(parent_dir, "updated_data.xlsx")
movie_data.to_excel(output_path, index=False)

View File

@@ -1,32 +1,89 @@
import requests
from bs4 import BeautifulSoup
import os
# Endpoint URL and identifying User-Agent header for Wikipedia API requests.
url = "https://en.wikipedia.org/w/api.php"
headers = {"User-Agent": "CSE881-MovieProject/1.0 (ishaa@msu.edu)"}

# Default input page: <this file's directory>/../data/tt0074888.html
script_dir = os.path.dirname(os.path.abspath(__file__))
file_path = os.path.join(script_dir, os.pardir, "data", "tt0074888.html")
def extract_movie_info(file_path):
    """Extract basic movie facts from a locally saved Wikipedia article.

    The page is parsed purely from the HTML on disk; no network request is
    made.

    Args:
        file_path: Path to a UTF-8 encoded HTML file containing the article.

    Returns:
        A 5-tuple ``(title, directed_by, cast, genre, plot)`` where

        * ``title`` is the text of the first ``<h1>``, or ``None``.
        * ``directed_by`` is the text of the infobox "Directed by" cell,
          or ``None``.
        * ``cast`` is a list of at most five strings taken from the infobox
          "Starring" cell (empty when the cell is missing).
        * ``genre`` is the text of the first non-empty paragraph inside
          ``div#mw-content-text``, or ``None``.
        * ``plot`` is the text of the paragraphs that follow the element
          with ``id="Plot"``, joined by single spaces (``""`` when the page
          has no such element).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, "lxml")

    # -----------------------------
    # Title
    # -----------------------------
    title_tag = soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag is not None else None

    # -----------------------------
    # Genre (first line)
    # -----------------------------
    genre = None
    content = soup.find("div", id="mw-content-text")
    if content is not None:
        # Skip leading paragraphs that carry no text so an empty placeholder
        # <p> does not end up as the result.
        for paragraph in content.find_all("p"):
            paragraph_text = paragraph.get_text(" ", strip=True)
            if paragraph_text:
                genre = paragraph_text
                break

    # -----------------------------
    # Infobox: Directed by + Starring
    # -----------------------------
    directed_by = None
    cast = []
    infobox = soup.find("table", class_="infobox")
    if infobox is not None:
        for row in infobox.find_all("tr"):
            header = row.find("th")
            cell = row.find("td")
            if header is None or cell is None:
                continue
            header_text = header.get_text(" ", strip=True)
            if header_text == "Directed by":
                directed_by = cell.get_text(" ", strip=True)
            elif header_text == "Starring":
                # Every text fragment in the cell (split by <br>, links or
                # list items) counts as one entry; keep only the first five.
                cast = list(cell.stripped_strings)[:5]

    # -----------------------------
    # Plot section
    # -----------------------------
    plot_parts = []
    plot_header = soup.find(id="Plot")
    if plot_header is not None:
        # The section body consists of the siblings of the heading's parent.
        # NOTE(review): assumes the id sits one level below the heading
        # container (div.mw-heading2, or a bare <h2> in older page dumps) —
        # confirm against the saved pages.
        for sibling in plot_header.parent.find_next_siblings():
            is_wrapped_heading = (
                sibling.name == "div"
                and "mw-heading2" in sibling.get("class", [])
            )
            # Stop when the next h2-level section begins.
            if is_wrapped_heading or sibling.name == "h2":
                break
            if sibling.name == "p":
                plot_parts.append(sibling.get_text(" ", strip=True))
    plot = " ".join(plot_parts).strip()

    return title, directed_by, cast, genre, plot
# -----------------------------
# Print results
# -----------------------------
# Only run the demo when this file is executed directly; importing the module
# (e.g. ``from scrape import extract_movie_info``) must not parse a page or
# print anything as a side effect.
if __name__ == "__main__":
    title, directed_by, cast, genre, plot = extract_movie_info(file_path)
    print("Title:", title)
    print("Directed by:", directed_by)
    print("Cast:", cast)
    print("Genre:", genre)
    print("\nPlot:\n", plot)