Add the code to extract ZIM data into a spreadsheet.
This commit is contained in:
@@ -3,22 +3,44 @@ import os
|
||||
from scrape import extract_movie_info
|
||||
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
file_path = os.path.join(script_dir, "..", "sample_data.xlsx")
|
||||
movie_data = pd.read_excel(file_path)
|
||||
print(movie_data.columns)
|
||||
# file_path = os.path.join(script_dir, "..", "sample_data.xlsx")
|
||||
# movie_data = pd.read_excel(file_path)
|
||||
# print(movie_data.columns)
|
||||
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
movie_html = os.path.join(script_dir, "..", "data", "tt0074888.html")
|
||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
INPUT_DIR = r'C:\Users\Prabhaav\Projects\PyCharm\datamining_881\data\processed\wikipedia_html'
|
||||
SPREADSHEET_DIR = os.path.join(BASE_DIR, "../data/processed/spreadsheets/")
|
||||
|
||||
title, directed_by, cast, genre, plot = extract_movie_info(movie_html)
|
||||
new_row = {
|
||||
"Movie": title,
|
||||
"Director": directed_by,
|
||||
"Cast": ", ".join(cast),
|
||||
"Genre": genre,
|
||||
"Plot": plot
|
||||
}
|
||||
movie_data = pd.DataFrame(columns=['Title', 'Director', 'Cast', 'Genre', 'Plot', 'Release Date', 'Slug', 'Poster Filename'])
|
||||
|
||||
movie_data.loc[len(movie_data)] = new_row
|
||||
output_path = os.path.join(script_dir, "..", "updated_data.xlsx")
|
||||
# Walk every entry under INPUT_DIR, parse the first ``.html`` file found in
# each sub-folder with extract_movie_info, and append the extracted fields as
# one row of ``movie_data``.  The spreadsheet is written once at the end, or
# early if the run is interrupted with Ctrl+C.
for folder in os.listdir(INPUT_DIR):
    path = os.path.join(INPUT_DIR, folder)
    # Plain files sitting next to the movie folders cannot be listed; skip
    # them instead of crashing on os.listdir(path).
    if not os.path.isdir(path):
        continue

    # Look the HTML file up *before* joining: os.path.join(path, None) raises
    # TypeError, so a folder without any .html file must be filtered out here.
    html_name = next((f for f in os.listdir(path) if f.endswith(".html")), None)
    if html_name is None:
        continue
    script_dir = os.path.join(path, html_name)

    try:
        print(script_dir)
        title, directed_by, cast, genre, plot, year, poster_filename = extract_movie_info(script_dir)
        new_row = {
            "Title": title,
            "Director": directed_by,
            "Cast": ", ".join(cast),
            "Genre": genre,
            "Plot": plot,
            "Release Date": year,
            "Slug": script_dir,
            "Poster Filename": poster_filename,
        }
        movie_data.loc[len(movie_data)] = new_row
    except Exception as e:
        # Best effort: one unparsable page must not abort the whole batch.
        print("error:", e)
    except KeyboardInterrupt:
        # Ctrl+C: keep the rows collected so far, then stop.
        output_path = os.path.join(SPREADSHEET_DIR, "updated_data.xlsx")
        print(output_path)
        movie_data.to_excel(output_path, index=False)
        quit()

output_path = os.path.join(SPREADSHEET_DIR, "updated_data.xlsx")
print(output_path)
movie_data.to_excel(output_path, index=False)
|
||||
@@ -1,8 +1,8 @@
|
||||
from bs4 import BeautifulSoup
|
||||
import os
|
||||
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
file_path = os.path.join(script_dir, "..", "data", "tt0074888.html")
|
||||
# script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
# file_path = os.path.join(script_dir, "..", "data", "tt0074888.html")
|
||||
|
||||
def extract_movie_info(file_path):
|
||||
|
||||
@@ -35,9 +35,14 @@ def extract_movie_info(file_path):
|
||||
|
||||
directed_by = None
|
||||
cast = []
|
||||
poster_filename = None
|
||||
year = None
|
||||
|
||||
if infobox:
|
||||
rows = infobox.find_all("tr")
|
||||
img = infobox.select_one("img")
|
||||
if img and img.get("src"):
|
||||
poster_filename = os.path.basename(img["src"])
|
||||
|
||||
for row in rows:
|
||||
header = row.find("th")
|
||||
@@ -50,7 +55,8 @@ def extract_movie_info(file_path):
|
||||
|
||||
if header_text == "Directed by":
|
||||
directed_by = data.get_text(" ", strip=True)
|
||||
|
||||
elif "Release date" in header_text:
|
||||
year = data.get_text(" ", strip=True)
|
||||
elif header_text == "Starring":
|
||||
cast_items = list(data.stripped_strings)
|
||||
cast = cast_items[:5]
|
||||
@@ -73,14 +79,14 @@ def extract_movie_info(file_path):
|
||||
|
||||
plot = plot.strip()
|
||||
|
||||
return title, directed_by, cast, genre, plot
|
||||
return title, directed_by, cast, genre, plot, year, poster_filename
|
||||
|
||||
# -----------------------------
|
||||
# Print results
|
||||
# -----------------------------
|
||||
title, directed_by, cast, genre, plot = extract_movie_info(file_path)
|
||||
print("Title:", title)
|
||||
print("Directed by:", directed_by)
|
||||
print("Cast:", cast)
|
||||
print("Genre:", genre)
|
||||
print("\nPlot:\n", plot)
|
||||
# # -----------------------------
|
||||
# # Print results
|
||||
# # -----------------------------
|
||||
# title, directed_by, cast, genre, plot = extract_movie_info(file_path)
|
||||
# print("Title:", title)
|
||||
# print("Directed by:", directed_by)
|
||||
# print("Cast:", cast)
|
||||
# print("Genre:", genre)
|
||||
# print("\nPlot:\n", plot)
|
||||
Reference in New Issue
Block a user