diff --git a/preprocessed_data.xlsx b/preprocessed_data.xlsx new file mode 100644 index 000000000..2425cf35f Binary files /dev/null and b/preprocessed_data.xlsx differ diff --git a/scripts/preprocessing.py b/scripts/preprocessing.py index 131be10fe..37d775f25 100644 --- a/scripts/preprocessing.py +++ b/scripts/preprocessing.py @@ -5,33 +5,58 @@ from nltk.corpus import stopwords import nltk from nltk.stem import PorterStemmer, WordNetLemmatizer from sentence_transformers import SentenceTransformer +import pkg_resources +from symspellpy.symspellpy import SymSpell, Verbosity nltk.download('wordnet') nltk.download('punkt_tab') nltk.download('stopwords') stop_words = set(stopwords.words('english')) +sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) stemmer = PorterStemmer() lemmatizer = WordNetLemmatizer() -model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') +# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') -df = pd.read_excel('C:\\Users\\ishaa\\OneDrive\\Documents\\MSU\\Spring 2026\\Data mining\\Project\\sample_data.xlsx', engine='openpyxl') +# df = pd.read_excel('C:\\Users\\ishaa\\OneDrive\\Documents\\MSU\\Spring 2026\\Data mining\\Project\\sample_data.xlsx', engine='openpyxl') -def clean_text(text): +def clean_plot(text): text = text.lower() text = text.translate(str.maketrans('', '', string.punctuation)) # Remove punctuation - text = re.sub(r'\W', ' ', text) # Remove special characters + text = re.sub(r'\W', ' ', text) + suggestions = sym_spell.lookup_compound(text, max_edit_distance=2) + if suggestions: + text = suggestions[0].term text = ([word for word in word_tokenize(text) if word not in stop_words]) text = [stemmer.stem(word) for word in text] text = ' '.join(lemmatizer.lemmatize(word) for word in text) return text + +def get_genre(row): + movie = row['Title'] + print(movie) + text = row['Genre'] + text = text.split(".")[0] + text = text.replace(movie, "") + text = text.lower() + match = re.search(r'is a ((?:\S+\s+){4}\S+)', text) + if match: + words = match.group(1).split() + text = ' '.join(words[1:]) + text = text.translate(str.maketrans('', '', string.punctuation)) # Remove punctuation + text = re.sub(r'\W', ' ', text) # Remove special characters + text = ([word for word in word_tokenize(text) if word not in stop_words]) + text = ' '.join(text) + + return text + # print(df.columns) -df['preprocessed'] = df['Plot'].apply(clean_text) -sample_plot = df['preprocessed'][0] -print(sample_plot) +# df['preprocessed'] = df['Plot'].apply(clean_text) +# sample_plot = df['preprocessed'][0] +# print(sample_plot) -embeddings = model.encode(sample_plot) -print(embeddings) \ No newline at end of file +# embeddings = model.encode(sample_plot) +# print(embeddings) \ No newline at end of file diff --git a/scripts/scrape.py b/scripts/scrape.py index a5fbadf8f..ac4fa883e 100644 --- a/scripts/scrape.py +++ b/scripts/scrape.py @@ -73,7 +73,7 @@ def extract_movie_info(file_path): plot = plot.strip() - return title, directed_by, cast, genre, plot + return title, directed_by, cast, genre, plot #image url # ----------------------------- # Print results diff --git a/scripts/vec_representation.py b/scripts/vec_representation.py new file mode 100644 index 000000000..67bde3f8d --- /dev/null +++ b/scripts/vec_representation.py @@ -0,0 +1,22 @@ +import pandas as pd +from sentence_transformers import SentenceTransformer +from preprocessing import clean_plot, get_genre +from sklearn.metrics.pairwise import cosine_similarity + +df = pd.read_excel('C:\\Users\\ishaa\\OneDrive\\Documents\\MSU\\Spring 2026\\Data mining\\Project\\updated_data.xlsx', engine='openpyxl') + +print(len(df)) + +df = df.dropna(subset=['Genre', 'Plot']) + +print(len(df)) + +# df = df[:2] + +df['Processed_Plot'] = df['Plot'].apply(clean_plot) + +df['Genre'] = df[['Genre', 'Title']].apply(get_genre, axis=1) + +df.to_excel('C:\\Users\\ishaa\\OneDrive\\Documents\\MSU\\Spring 2026\\Data mining\\Project\\preprocessed_data.xlsx', index=False) + +print(df.columns) \ No newline at end of file diff --git a/updated_data.xlsx b/updated_data.xlsx index fe854a26d..515a20507 100644 Binary files a/updated_data.xlsx and b/updated_data.xlsx differ