Add the code to extract ZIM content into a spreadsheet.

2026-03-12 14:19:40 -04:00
parent a435592f75
commit 2638de1191
10 changed files with 598 additions and 28 deletions
--- a/scripts/dataset_create.py
+++ b/scripts/dataset_create.py
@@ -3,22 +3,44 @@ import os
 from scrape import extract_movie_info

 script_dir = os.path.dirname(os.path.abspath(__file__))
-file_path = os.path.join(script_dir, "..", "sample_data.xlsx")
-movie_data = pd.read_excel(file_path)
-print(movie_data.columns)
+# file_path = os.path.join(script_dir, "..", "sample_data.xlsx")
+# movie_data = pd.read_excel(file_path)
+# print(movie_data.columns)

-script_dir = os.path.dirname(os.path.abspath(__file__))
-movie_html = os.path.join(script_dir, "..", "data", "tt0074888.html")
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+INPUT_DIR = r'C:\Users\Prabhaav\Projects\PyCharm\datamining_881\data\processed\wikipedia_html'
+SPREADSHEET_DIR = os.path.join(BASE_DIR, "../data/processed/spreadsheets/")

-title, directed_by, cast, genre, plot = extract_movie_info(movie_html)
-new_row = {
-    "Movie": title,
-    "Director": directed_by,
-    "Cast": ", ".join(cast),
-    "Genre": genre,
-    "Plot": plot
-}
+movie_data = pd.DataFrame(columns=['Title', 'Director', 'Cast', 'Genre', 'Plot', 'Release Date', 'Slug', 'Poster Filename'])

-movie_data.loc[len(movie_data)] = new_row
-output_path = os.path.join(script_dir, "..", "updated_data.xlsx")
+for folder in os.listdir(INPUT_DIR):
+    path = os.path.join(INPUT_DIR, folder)
+    script_dir = os.path.join(path, next((f for f in os.listdir(path) if f.endswith(".html")), None))
+    if not script_dir:
+        continue
+    try:
+        print(script_dir)
+        title, directed_by, cast, genre, plot, year, poster_filename = extract_movie_info(script_dir)
+        new_row = {
+            "Title": title,
+            "Director": directed_by,
+            "Cast": ", ".join(cast),
+            "Genre": genre,
+            "Plot": plot,
+            "Release Date": year,
+            "Slug": script_dir,
+            "Poster Filename": poster_filename
+        }
+        movie_data.loc[len(movie_data)] = new_row
+
+    except Exception as e:
+        print("error:", e)
+    except KeyboardInterrupt:
+        output_path = os.path.join(SPREADSHEET_DIR, "updated_data.xlsx")
+        print(output_path)
+        movie_data.to_excel(output_path, index=False)
+        quit()
+
+output_path = os.path.join(SPREADSHEET_DIR, "updated_data.xlsx")
+print(output_path)
 movie_data.to_excel(output_path, index=False)
--- a/scripts/scrape.py
+++ b/scripts/scrape.py
@@ -1,8 +1,8 @@
 from bs4 import BeautifulSoup
 import os

-script_dir = os.path.dirname(os.path.abspath(__file__))
-file_path = os.path.join(script_dir, "..", "data", "tt0074888.html")
+# script_dir = os.path.dirname(os.path.abspath(__file__))
+# file_path = os.path.join(script_dir, "..", "data", "tt0074888.html")

 def extract_movie_info(file_path):

@@ -35,9 +35,14 @@ def extract_movie_info(file_path):

    directed_by = None
    cast = []
+    poster_filename = None
+    year = None

    if infobox:
        rows = infobox.find_all("tr")
+        img = infobox.select_one("img")
+        if img and img.get("src"):
+            poster_filename = os.path.basename(img["src"])

        for row in rows:
            header = row.find("th")
@@ -50,7 +55,8 @@ def extract_movie_info(file_path):

            if header_text == "Directed by":
                directed_by = data.get_text(" ", strip=True)
-
+            elif "Release date" in header_text:
+                year = data.get_text(" ", strip=True)
            elif header_text == "Starring":
                cast_items = list(data.stripped_strings)
                cast = cast_items[:5]
@@ -73,14 +79,14 @@ def extract_movie_info(file_path):

    plot = plot.strip()

-    return title, directed_by, cast, genre, plot
+    return title, directed_by, cast, genre, plot, year, poster_filename

-# -----------------------------
-# Print results
-# -----------------------------
-title, directed_by, cast, genre, plot = extract_movie_info(file_path)
-print("Title:", title)
-print("Directed by:", directed_by)
-print("Cast:", cast)
-print("Genre:", genre)
-print("\nPlot:\n", plot)
+# # -----------------------------
+# # Print results
+# # -----------------------------
+# title, directed_by, cast, genre, plot = extract_movie_info(file_path)
+# print("Title:", title)
+# print("Directed by:", directed_by)
+# print("Cast:", cast)
+# print("Genre:", genre)
+# print("\nPlot:\n", plot)