diff --git a/.gitignore b/.gitignore index 155e2dc7a..d313207c5 100644 --- a/.gitignore +++ b/.gitignore @@ -217,4 +217,6 @@ __marimo__/ .streamlit/secrets.toml # Data Folder -*.tsv \ No newline at end of file +*.tsv + +data/ \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 000000000..13566b81b --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/datamining_881_new.iml b/.idea/datamining_881_new.iml new file mode 100644 index 000000000..8648a4cb7 --- /dev/null +++ b/.idea/datamining_881_new.iml @@ -0,0 +1,11 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 000000000..37bff01aa --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,13 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 000000000..105ce2da2 --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 000000000..cbca5c197 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 000000000..1ae2bbceb --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 000000000..830674470 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/data/processed/spreadsheets/updated_data.xlsx b/data/processed/spreadsheets/updated_data.xlsx deleted file mode 100644 index 7eeb42ac6..000000000 Binary files a/data/processed/spreadsheets/updated_data.xlsx and /dev/null differ diff --git a/scripts/dataset_create.py b/scripts/dataset_create.py index f845d79ce..0cb5a169f 100644 --- a/scripts/dataset_create.py +++ b/scripts/dataset_create.py @@ -2,45 +2,42 @@ import pandas as pd import os from scrape import extract_movie_info -script_dir = os.path.dirname(os.path.abspath(__file__)) -# file_path = os.path.join(script_dir, "..", "sample_data.xlsx") -# movie_data = pd.read_excel(file_path) -# print(movie_data.columns) - BASE_DIR = os.path.dirname(os.path.abspath(__file__)) -INPUT_DIR = r'C:\Users\Prabhaav\Projects\PyCharm\datamining_881\data\processed\wikipedia_html' +INPUT_DIR = os.path.join(BASE_DIR, "../data/processed/wikipedia_html_test/") SPREADSHEET_DIR = os.path.join(BASE_DIR, "../data/processed/spreadsheets/") -movie_data = pd.DataFrame(columns=['Title', 'Director', 'Cast', 'Genre', 'Plot', 'Release Date', 'Slug', 'Poster Filename']) +rows = [] for folder in os.listdir(INPUT_DIR): path = os.path.join(INPUT_DIR, folder) - script_dir = os.path.join(path, next((f for f in os.listdir(path) if f.endswith(".html")), None)) + script_dir = next((f for f in os.listdir(path) if f.endswith(".html")), None) if not script_dir: continue + full_path = os.path.join(path, script_dir) + slug = os.path.splitext(script_dir)[0] try: - print(script_dir) - title, directed_by, cast, genre, plot, year, poster_filename = extract_movie_info(script_dir) - new_row = { + print(full_path) + title, directed_by, cast, genre, plot, year, poster_filename = extract_movie_info(full_path) + rows.append({ "Title": title, "Director": directed_by, "Cast": ", ".join(cast), "Genre": genre, "Plot": plot, "Release Date": year, - "Slug": script_dir, + "Slug": slug, "Poster Filename": poster_filename - } - movie_data.loc[len(movie_data)] = new_row + }) - except Exception as e: - print("error:", e) except KeyboardInterrupt: - output_path = os.path.join(SPREADSHEET_DIR, "updated_data.xlsx") - print(output_path) + movie_data = pd.DataFrame(rows) + output_path = os.path.join(SPREADSHEET_DIR, "updated_datav_test.xlsx") movie_data.to_excel(output_path, index=False) quit() + except Exception as e: + print("error:", e) -output_path = os.path.join(SPREADSHEET_DIR, "updated_data.xlsx") +movie_data = pd.DataFrame(rows) +output_path = os.path.join(SPREADSHEET_DIR, "updated_data_test.xlsx") print(output_path) movie_data.to_excel(output_path, index=False) \ No newline at end of file diff --git a/scripts/extract_wiki_zim.py b/scripts/extract_wiki_zim.py index 38955b63a..7c389000d 100644 --- a/scripts/extract_wiki_zim.py +++ b/scripts/extract_wiki_zim.py @@ -8,8 +8,8 @@ import csv from slugify import slugify BASE_DIR = os.path.dirname(os.path.abspath(__file__)) -INPUT_TSV = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/imdb_datasets/title.basics.tsv")) -OUTPUT_DIR = os.path.abspath(os.path.join(BASE_DIR, "../data/processed/wikipedia_html")) +INPUT_TSV = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/imdb_datasets/title.basics.test.tsv")) +OUTPUT_DIR = os.path.abspath(os.path.join(BASE_DIR, "../data/processed/wikipedia_html_test")) ZIM_PATH = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/wikipedia/wikipedia_en_all_maxi_2025-08.zim")) os.makedirs(OUTPUT_DIR, exist_ok=True) @@ -21,39 +21,80 @@ print("The Zim file is now opened") def sanitize_slug(slug): return slugify(slug, separator="_", max_length=200) or "_unknown" -#Fetch the html AND the images and put them in a folder -def fetch_wikipedia_html_with_images(query, save_dir): + +def is_movie_page(html_content, primary_title, original_title, year): + soup = BeautifulSoup(html_content, "html.parser") + page_title = soup.find("h1", {"id": "firstHeading"}) + if not page_title: + return False + page_title_text = page_title.get_text().lower() + if primary_title.lower() not in page_title_text and original_title.lower() not in page_title_text: + return False + infobox = soup.find("table", {"class": "infobox"}) + if not infobox: + return False + infobox_text = infobox.get_text() + if "Directed by" not in infobox_text or ("Produced by" not in infobox_text and "Written by" not in infobox_text): + return False + # Also verify the year appears in the infobox + if year and year != "\\N" and year not in infobox_text: + return False + return True + + +# Fetch the html AND the images and put them in a folder +def fetch_wikipedia_html_with_images(query, save_dir, primary_title, original_title, year): q = Query().set_query(query) search = searcher.search(q) if search.getEstimatedMatches() == 0: return None results = list(search.getResults(0, 5)) - best_path = results[0] - try: - entry = zim.get_entry_by_path(best_path) - item = entry.get_item() - html_content = bytes(item.content).decode("UTF-8") - except Exception: - return None - soup = BeautifulSoup(html_content, "html.parser") - for img in soup.find_all("img"): - src = img.get("src") - if not src: - continue - img_path = src.lstrip("/") + + for best_path in results: try: - img_entry = zim.get_entry_by_path(img_path) - img_bytes = bytes(img_entry.get_item().content) + entry = zim.get_entry_by_path(best_path) + item = entry.get_item() + html_content = bytes(item.content).decode("UTF-8") except Exception: continue - img_name = os.path.basename(img_path) - img_file_path = os.path.join(save_dir, img_name) - with open(img_file_path, "wb") as f: - f.write(img_bytes) - img["src"] = img_name - return str(soup), best_path -#Go through each row of the tsv file and try to get the movie on wiki + if not is_movie_page(html_content, primary_title, original_title, year): + continue + + soup = BeautifulSoup(html_content, "html.parser") + poster_img = None + infobox = soup.find("table", class_="infobox") + if infobox: + poster_img = infobox.select_one("img") + if poster_img and poster_img.get("src"): + img_path = poster_img["src"].lstrip("/") + try: + img_entry = zim.get_entry_by_path(img_path) + img_bytes = bytes(img_entry.get_item().content) + img_name = os.path.basename(img_path) + with open(os.path.join(save_dir, img_name), "wb") as f: + f.write(img_bytes) + poster_img["src"] = img_name + except Exception: + pass + for img in soup.find_all("img"): + if img is not poster_img: + img["src"] = "" + return str(soup), best_path + + return None + + +done_set = { + fname[:-5] + for d in os.listdir(OUTPUT_DIR) + if not d.startswith("_tmp_") + for fname in os.listdir(os.path.join(OUTPUT_DIR, d)) + if fname.endswith(".html") +} +print(f"Found {len(done_set)} already processed") + +# Go through each row of the tsv file and try to get the movie on wiki with open(INPUT_TSV, encoding="utf-8") as f: reader = csv.DictReader(f, delimiter="\t") for row in reader: @@ -64,20 +105,15 @@ with open(INPUT_TSV, encoding="utf-8") as f: if year is None or titleType != "movie": print("Skipping from TSV: ", title) continue - already_done = False - for d in os.listdir(OUTPUT_DIR): - if os.path.exists(os.path.join(OUTPUT_DIR, d, f"{tconst}.html")): - already_done = True - break - if already_done: + if tconst in done_set: print(f"Skipping already processed: {tconst}") continue - # folder for each movie + # folder for each movie movie_dir = os.path.join(OUTPUT_DIR, f"_tmp_{tconst}") os.makedirs(movie_dir, exist_ok=True) - query = f"{title} ({year} film)" if year != "\\N" else title #if year not empty + query = f"{title} ({year} film)" if year != "\\N" else title # if year not empty print(f"fetching Wikipedia HTML + images for {tconst}: {query}") - result = fetch_wikipedia_html_with_images(query, movie_dir) + result = fetch_wikipedia_html_with_images(query, movie_dir, title, row["originalTitle"], row["startYear"]) if result is None: print("Wikipedia fetch failed") shutil.rmtree(movie_dir, ignore_errors=True) @@ -86,9 +122,6 @@ with open(INPUT_TSV, encoding="utf-8") as f: html_with_images, slug = result slug_dir = os.path.join(OUTPUT_DIR, sanitize_slug(slug)) if html_with_images: - if "Directed by" not in html_with_images: - shutil.rmtree(movie_dir, ignore_errors=True) - continue if os.path.exists(slug_dir): shutil.rmtree(movie_dir, ignore_errors=True) else: @@ -98,6 +131,7 @@ with open(INPUT_TSV, encoding="utf-8") as f: continue with open(outfile, "w", encoding="utf-8") as out: out.write(html_with_images) + done_set.add(tconst) else: shutil.rmtree(movie_dir, ignore_errors=True) print(f"no Wikipedia page found for {query}") \ No newline at end of file diff --git a/scripts/fuse_with_netflix.py b/scripts/fuse_with_netflix.py new file mode 100644 index 000000000..40decba99 --- /dev/null +++ b/scripts/fuse_with_netflix.py @@ -0,0 +1,91 @@ +import pandas as pd +import os +from rapidfuzz import fuzz + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +NETFLIX_DIR = os.path.join(BASE_DIR, "../data/raw/netflix/") +MOVIE_EXCEL = os.path.join(BASE_DIR, "../data/processed/spreadsheets/updated_data_test.xlsx") +MOVIE_TITLES = os.path.join(NETFLIX_DIR, "movie_titles.csv") +COMBINED_FILES = [os.path.join(NETFLIX_DIR, f"combined_data_{i}.txt") for i in range(1, 5)] +OUTPUT = os.path.join(BASE_DIR, "../data/processed/spreadsheets/fused_gtruth_test.csv") + +TITLE_THRESHOLD = 85 # fuzzy search + +main_data = pd.read_excel(MOVIE_EXCEL) +main_data["title_lower"] = main_data["Title"].str.lower().str.strip() +main_data["director_lower"] = main_data["Director"].fillna("").str.lower().str.strip() + + +records = [] +with open(MOVIE_TITLES, encoding="latin-1") as f: + for line in f: + line = line.strip() + parts = line.split(",", 2) + if len(parts) == 3: + records.append({"netflix_id": int(parts[0]), "year": parts[1], "title": parts[2].strip()}) + +titles_df = pd.DataFrame(records) +titles_df["title_lower"] = titles_df["title"].str.lower().str.strip() +netflix_id_to_tt = {} # netflix_id -> tt_id + +for _, nrow in titles_df.iterrows(): + best_score = 0 + best_meta = None + + #https://github.com/rapidfuzz/RapidFuzz docs + for _, mrow in main_data.iterrows(): + score = fuzz.ratio(nrow["title_lower"], mrow["title_lower"]) + if score > best_score: + best_score = score + best_meta = mrow + + if best_score < TITLE_THRESHOLD or best_meta is None: + continue + + # Director match + confirmed = best_score >= TITLE_THRESHOLD + print(best_score) + if best_meta["director_lower"] and best_score >= 70: + # year relese year match + try: + meta_year = str(best_meta["Release Date"]) + nf_year = str(int(nrow["year"])) if pd.notna(nrow["year"]) else "" + if nf_year and nf_year in meta_year: + confirmed = True + except Exception: + pass + + if confirmed: + netflix_id_to_tt[int(nrow["netflix_id"])] = best_meta["Slug"] + +print(f"Matched {len(netflix_id_to_tt)} Netflix movies to tt Ids") + +valid_netflix_ids = set(netflix_id_to_tt.keys()) +rows = [] +current_movie_id = None + +for filepath in COMBINED_FILES: + print(f"Reading {os.path.basename(filepath)}...") + with open(filepath, encoding="latin-1") as f: + for line in f: + line = line.strip() + if line.endswith(":"): + current_movie_id = int(line[:-1]) + elif current_movie_id in valid_netflix_ids: + parts = line.split(",") + if len(parts) == 3: + customer_id, rating, date = parts + rows.append({ + "customer_id": int(customer_id), + "tt_id": netflix_id_to_tt[current_movie_id], + "rating": int(rating), + "date": date, + }) + +print(f"Found {len(rows):,} rating") +print(f"Found {len(valid_netflix_ids):,} movies ground truth") + + +df = pd.DataFrame(rows) +df.to_csv(OUTPUT, index=False) +print(f"Written to {OUTPUT}") \ No newline at end of file diff --git a/scripts/scrape.py b/scripts/scrape.py index d8e782b75..17c5dcf41 100644 --- a/scripts/scrape.py +++ b/scripts/scrape.py @@ -1,11 +1,8 @@ from bs4 import BeautifulSoup import os -# script_dir = os.path.dirname(os.path.abspath(__file__)) -# file_path = os.path.join(script_dir, "..", "data", "tt0074888.html") def extract_movie_info(file_path): - with open(file_path, "r", encoding="utf-8") as f: html = f.read() @@ -66,17 +63,25 @@ def extract_movie_info(file_path): # ----------------------------- plot = "" - plot_header = soup.find(id="Plot") + plot_header = soup.find(id="Plot") or soup.find(id="Synopsis") if plot_header: current = plot_header.parent - for sibling in current.find_next_siblings(): if sibling.name == "div" and "mw-heading2" in sibling.get("class", []): break if sibling.name == "p": plot += sibling.get_text(" ", strip=True) + " " + if not plot and content: + for el in content.find_all(["p", "div"], recursive=False): + if el.name == "div" and el.find(["h2"]): + break + if el.name == "p": + text = el.get_text(" ", strip=True) + if text: + plot += text + " " + plot = plot.strip() return title, directed_by, cast, genre, plot, year, poster_filename