Merge branch 'main' of https://github.com/IshaAtteri/datamining_881
changes to embeddings for plot
This commit is contained in:
29
scripts/convert_to_JSON.js
Normal file
29
scripts/convert_to_JSON.js
Normal file
@@ -0,0 +1,29 @@
|
||||
// convert excel dataset to JSON for easier parsing & easier deployment
|
||||
const XLSX = require("xlsx");
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
|
||||
// Location of the Excel workbook, resolved relative to this script's directory.
// NOTE: the original author flagged that this path should be changed to
// spreadsheets\updated_datav2.xlsx.
const filePath = path.join(__dirname, "../preprocessed_data.xlsx");

// Load the workbook and select its first worksheet.
const workbook = XLSX.readFile(filePath);
const [sheetName] = workbook.SheetNames;
const sheet = workbook.Sheets[sheetName];

// Convert the worksheet's rows into plain JavaScript objects.
const rawData = XLSX.utils.sheet_to_json(sheet);
|
||||
|
||||
/**
 * Reshape every spreadsheet row into the record layout that gets serialized.
 *
 * Each text column falls back to an empty string when the cell is missing or
 * empty, so every record carries the same set of keys. `poster` becomes a
 * path under `/posters/` when a poster filename is present, otherwise `null`.
 */
const cleanedData = rawData.map((movie) => {
  const posterFile = movie["Poster Filename"];

  return {
    // Fall back to "" like the other text fields: an undefined title would be
    // dropped entirely by JSON.stringify, leaving a record with no `title` key.
    title: movie.Title || "",
    director: movie.Director || "",
    cast: movie.Cast || "",
    genre: movie.Genre || "",
    plot: movie.Plot || "",
    releaseDate: movie["Release Date"] || "",
    slug: movie.Slug || "",
    poster: posterFile ? `/posters/${posterFile}` : null,
  };
});
|
||||
|
||||
// Serialize the cleaned records with 2-space indentation and write them to
// movies.json, one directory above this script.
const outputPath = path.join(__dirname, "../movies.json");
const serialized = JSON.stringify(cleanedData, null, 2);
fs.writeFileSync(outputPath, serialized);
|
||||
|
||||
// to run: node scripts/convert_to_JSON.js
|
||||
33
scripts/move_images_to_folder.py
Normal file
33
scripts/move_images_to_folder.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import os
|
||||
import shutil
|
||||
from tqdm import tqdm
|
||||
from collections import defaultdict
|
||||
|
||||
# All locations are resolved from this script's own folder, so the script
# behaves the same regardless of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))


def _from_script_dir(relative_path):
    """Return the absolute, normalized form of a path given relative to BASE_DIR."""
    return os.path.abspath(os.path.join(BASE_DIR, relative_path))


# Directory that is scanned for images (one level of sub-folders).
SOURCE_DIR = _from_script_dir("../data/processed/wikipedia_html")
# Single flat directory that receives the copied images.
DEST_DIR = _from_script_dir("../data/processed/wikipedia_images")

# File-name suffixes treated as images; matched case-insensitively by the scan.
IMAGE_EXTS = (
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".webp",
    ".svg",
)

# Create the destination (and any missing parents); no error if it already exists.
os.makedirs(DEST_DIR, exist_ok=True)
|
||||
|
||||
# Collect the full path of every image that sits directly inside a
# sub-folder of SOURCE_DIR. Entries of SOURCE_DIR that are not directories
# are ignored, and sub-folders are not searched recursively.
all_images = []
for entry_name in os.listdir(SOURCE_DIR):
    candidate_dir = os.path.join(SOURCE_DIR, entry_name)
    if os.path.isdir(candidate_dir):
        all_images.extend(
            os.path.join(candidate_dir, name)
            for name in os.listdir(candidate_dir)
            # Lower-case first so ".JPG" and ".jpg" are both accepted.
            if name.lower().endswith(IMAGE_EXTS)
        )
|
||||
|
||||
# Copy every collected image into the flat destination folder, showing a
# progress bar, and tally how many files were copied versus skipped.
skipped = 0
copied = 0

for src_path in tqdm(all_images, desc="Copying images", unit="file"):
    fname = os.path.basename(src_path)
    dest_path = os.path.join(DEST_DIR, fname)
    # Files are keyed by base name only, so anything already present under
    # that name (from an earlier run, or a same-named file from a different
    # source folder) is left untouched rather than overwritten.
    if os.path.exists(dest_path):
        skipped += 1
        continue
    # copy2 copies the file contents and attempts to preserve its metadata.
    shutil.copy2(src_path, dest_path)
    copied += 1

# Report the tallies; previously they were counted but never shown.
print(
    "Copied {} image(s); skipped {} already present in {}".format(
        copied, skipped, DEST_DIR
    )
)
|
||||
Reference in New Issue
Block a user