Add the code to extract ZIM content into a spreadsheet.

This commit is contained in:
prabhaavp
2026-03-12 14:19:40 -04:00
parent a435592f75
commit 2638de1191
10 changed files with 598 additions and 28 deletions

View File

@@ -3,22 +3,44 @@ import os
from scrape import extract_movie_info
# Directory containing this script; SPREADSHEET_DIR below is resolved from it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
script_dir = BASE_DIR  # older name for the same directory

# Folder whose entries are scanned for one .html file each.
# NOTE(review): machine-specific absolute path — consider deriving it from
# BASE_DIR the same way SPREADSHEET_DIR is, once the layout is confirmed.
INPUT_DIR = r'C:\Users\Prabhaav\Projects\PyCharm\datamining_881\data\processed\wikipedia_html'

# Folder that receives the generated spreadsheet.
SPREADSHEET_DIR = os.path.join(BASE_DIR, "../data/processed/spreadsheets/")

# Empty result table; one row per successfully parsed movie page is appended.
movie_data = pd.DataFrame(
    columns=[
        'Title',
        'Director',
        'Cast',
        'Genre',
        'Plot',
        'Release Date',
        'Slug',
        'Poster Filename',
    ]
)
def _first_html_file(folder_path):
    """Return the path of the first ``.html`` entry in *folder_path*.

    Returns None when *folder_path* is not a directory or contains no
    ``.html`` file, so the caller can skip the entry instead of crashing.
    """
    if not os.path.isdir(folder_path):
        return None
    html_name = next(
        (name for name in os.listdir(folder_path) if name.endswith(".html")),
        None,
    )
    if html_name is None:
        return None
    return os.path.join(folder_path, html_name)


def _save_spreadsheet(frame, path):
    """Print *path* and write *frame* to it as an Excel file without the index."""
    # Create the target folder first so a fresh checkout does not fail on write.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    print(path)
    frame.to_excel(path, index=False)


output_path = os.path.join(SPREADSHEET_DIR, "updated_data.xlsx")

try:
    for folder in os.listdir(INPUT_DIR):
        html_path = _first_html_file(os.path.join(INPUT_DIR, folder))
        if html_path is None:
            continue
        print(html_path)
        try:
            (
                title,
                directed_by,
                cast,
                genre,
                plot,
                year,
                poster_filename,
            ) = extract_movie_info(html_path)
            movie_data.loc[len(movie_data)] = {
                "Title": title,
                "Director": directed_by,
                "Cast": ", ".join(cast),
                "Genre": genre,
                "Plot": plot,
                "Release Date": year,
                # NOTE(review): this stores the full HTML path, not a short
                # identifier — confirm that is what "Slug" should hold.
                "Slug": html_path,
                "Poster Filename": poster_filename,
            }
        except Exception as e:
            # Best effort: report the page that failed to parse and keep going.
            print("error:", e)
except KeyboardInterrupt:
    # Ctrl-C: keep the rows collected so far, then stop the script.
    _save_spreadsheet(movie_data, output_path)
    raise SystemExit

_save_spreadsheet(movie_data, output_path)