Revisions to ZIM parsing and Netflix parsing, and updates to HTML scraping to include the synopsis

2026-03-19 01:56:14 -04:00
parent 0a70920ba9
commit 492160c3a3
13 changed files with 252 additions and 63 deletions
--- a/scripts/dataset_create.py
+++ b/scripts/dataset_create.py
@@ -2,45 +2,42 @@ import pandas as pd
 import os
 from scrape import extract_movie_info

-script_dir = os.path.dirname(os.path.abspath(__file__))
-# file_path = os.path.join(script_dir, "..", "sample_data.xlsx")
-# movie_data = pd.read_excel(file_path)
-# print(movie_data.columns)
-
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-INPUT_DIR = r'C:\Users\Prabhaav\Projects\PyCharm\datamining_881\data\processed\wikipedia_html'
+INPUT_DIR = os.path.join(BASE_DIR, "../data/processed/wikipedia_html_test/")
 SPREADSHEET_DIR = os.path.join(BASE_DIR, "../data/processed/spreadsheets/")

-movie_data = pd.DataFrame(columns=['Title', 'Director', 'Cast', 'Genre', 'Plot', 'Release Date', 'Slug', 'Poster Filename'])
+rows = []  # one dict per successfully parsed movie; converted to a DataFrame when saving

 for folder in os.listdir(INPUT_DIR):
    path = os.path.join(INPUT_DIR, folder)
-    script_dir = os.path.join(path, next((f for f in os.listdir(path) if f.endswith(".html")), None))
+    script_dir = next((f for f in os.listdir(path) if f.endswith(".html")), None)  # first .html filename in the folder, or None
    if not script_dir:
        continue
+    full_path = os.path.join(path, script_dir)
+    slug = os.path.splitext(script_dir)[0]  # HTML filename without its extension
    try:
-        print(script_dir)
-        title, directed_by, cast, genre, plot, year, poster_filename = extract_movie_info(script_dir)
-        new_row = {
+        print(full_path)
+        title, directed_by, cast, genre, plot, year, poster_filename = extract_movie_info(full_path)
+        rows.append({
            "Title": title,
            "Director": directed_by,
            "Cast": ", ".join(cast),
            "Genre": genre,
            "Plot": plot,
            "Release Date": year,
-            "Slug": script_dir,
+            "Slug": slug,
            "Poster Filename": poster_filename
-        }
-        movie_data.loc[len(movie_data)] = new_row
+        })

-    except Exception as e:
-        print("error:", e)
    except KeyboardInterrupt:
-        output_path = os.path.join(SPREADSHEET_DIR, "updated_data.xlsx")
-        print(output_path)
+        movie_data = pd.DataFrame(rows)  # on Ctrl+C, save the rows collected so far
+        output_path = os.path.join(SPREADSHEET_DIR, "updated_data_test.xlsx")  # same file as the normal end-of-run save
        movie_data.to_excel(output_path, index=False)
        quit()
+    except Exception as e:
+        print("error:", e)

-output_path = os.path.join(SPREADSHEET_DIR, "updated_data.xlsx")
+movie_data = pd.DataFrame(rows)
+output_path = os.path.join(SPREADSHEET_DIR, "updated_data_test.xlsx")
 print(output_path)
 movie_data.to_excel(output_path, index=False)