Revisions to Zim parsing and Netflix parsing, and updates to HTML scraping to include the synopsis
This commit is contained in:
91
scripts/fuse_with_netflix.py
Normal file
91
scripts/fuse_with_netflix.py
Normal file
@@ -0,0 +1,91 @@
|
||||
"""Fuse curated movie metadata with the Netflix Prize ratings dataset.

Matches Netflix movie titles (movie_titles.csv) against the curated
spreadsheet via fuzzy title matching, then extracts the ratings for the
matched movies from the combined_data_*.txt files and writes them to a
single CSV keyed by the spreadsheet's "Slug" (tt) identifier.
"""

import os

import pandas as pd
from rapidfuzz import fuzz

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
NETFLIX_DIR = os.path.join(BASE_DIR, "../data/raw/netflix/")
MOVIE_EXCEL = os.path.join(BASE_DIR, "../data/processed/spreadsheets/updated_data_test.xlsx")
MOVIE_TITLES = os.path.join(NETFLIX_DIR, "movie_titles.csv")
COMBINED_FILES = [os.path.join(NETFLIX_DIR, f"combined_data_{i}.txt") for i in range(1, 5)]
OUTPUT = os.path.join(BASE_DIR, "../data/processed/spreadsheets/fused_gtruth_test.csv")

# Minimum fuzz.ratio score (0-100) for a fuzzy title match to be accepted.
TITLE_THRESHOLD = 85

main_data = pd.read_excel(MOVIE_EXCEL)
main_data["title_lower"] = main_data["Title"].str.lower().str.strip()
main_data["director_lower"] = main_data["Director"].fillna("").str.lower().str.strip()

# Parse movie_titles.csv: each line is "<netflix_id>,<year>,<title>".
# Titles may themselves contain commas, hence maxsplit=2.
records = []
with open(MOVIE_TITLES, encoding="latin-1") as f:
    for line in f:
        parts = line.strip().split(",", 2)
        if len(parts) == 3:
            records.append({
                "netflix_id": int(parts[0]),
                "year": parts[1],
                "title": parts[2].strip(),
            })

titles_df = pd.DataFrame(records)
titles_df["title_lower"] = titles_df["title"].str.lower().str.strip()

# Hoist the spreadsheet titles out of the matching loop once: iterating a
# plain Python list is far cheaper than calling main_data.iterrows() for
# every Netflix title (the original inner loop was the hot spot).
# https://github.com/rapidfuzz/RapidFuzz docs
main_titles = main_data["title_lower"].tolist()

netflix_id_to_tt = {}  # netflix_id -> tt id (spreadsheet "Slug" column)

for _, nrow in titles_df.iterrows():
    best_score = 0
    best_idx = None

    # Find the best fuzzy title match in the curated spreadsheet.
    for idx, candidate in enumerate(main_titles):
        score = fuzz.ratio(nrow["title_lower"], candidate)
        if score > best_score:
            best_score = score
            best_idx = idx

    # Only accept confident title matches.
    if best_score < TITLE_THRESHOLD or best_idx is None:
        continue

    best_meta = main_data.iloc[best_idx]

    # NOTE(review): after the guard above, best_score >= TITLE_THRESHOLD
    # always holds, so every surviving match is confirmed. The original
    # code ran a director / release-year cross-check here, but it could
    # only ever set the already-True `confirmed` flag to True again — a
    # no-op — so it (and a leftover debug print of the score) has been
    # removed. If the intent was to accept weaker title matches (>= 70)
    # when the release year agrees, the guard above must be relaxed first.
    netflix_id_to_tt[int(nrow["netflix_id"])] = best_meta["Slug"]

print(f"Matched {len(netflix_id_to_tt)} Netflix movies to tt Ids")

# Extract ratings for the matched movies from the combined data files.
valid_netflix_ids = set(netflix_id_to_tt.keys())
rows = []
current_movie_id = None  # set by "<netflix_id>:" section-header lines

for filepath in COMBINED_FILES:
    print(f"Reading {os.path.basename(filepath)}...")
    with open(filepath, encoding="latin-1") as f:
        for line in f:
            line = line.strip()
            if line.endswith(":"):
                # New movie section: "<netflix_id>:"
                current_movie_id = int(line[:-1])
            elif current_movie_id in valid_netflix_ids:
                # Rating line: "<customer_id>,<rating>,<yyyy-mm-dd>"
                parts = line.split(",")
                if len(parts) == 3:
                    customer_id, rating, date = parts
                    rows.append({
                        "customer_id": int(customer_id),
                        "tt_id": netflix_id_to_tt[current_movie_id],
                        "rating": int(rating),
                        "date": date,
                    })

print(f"Found {len(rows):,} ratings")
print(f"Found {len(valid_netflix_ids):,} movies ground truth")

df = pd.DataFrame(rows)
df.to_csv(OUTPUT, index=False)
print(f"Written to {OUTPUT}")
|
||||
Reference in New Issue
Block a user