"""Legacy experiment: normalise the 'Plot' column and embed one sample row.

Running this module downloads the NLTK resources it uses, loads a
sentence-transformers model, reads the sample workbook, and prints the
cleaned first plot followed by its embedding.
"""
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer

for _resource in ('wordnet', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)

stop_words = set(stopwords.words('english'))

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

df = pd.read_excel(
    'C:\\Users\\ishaa\\OneDrive\\Documents\\MSU\\Spring 2026\\Data mining\\Project\\sample_data.xlsx',
    engine='openpyxl',
)


def clean_text(text):
    """Return *text* lower-cased, stripped of punctuation and stop words.

    Each surviving token is stemmed and then lemmatized; the tokens are
    joined back together with single spaces.
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(
        str.maketrans('', '', string.punctuation)
    )
    # Whatever non-word characters remain become spaces.
    spaced = re.sub(r'\W', ' ', without_punctuation)

    normalised = []
    for token in word_tokenize(spaced):
        if token in stop_words:
            continue
        normalised.append(lemmatizer.lemmatize(stemmer.stem(token)))
    return ' '.join(normalised)


df['preprocessed'] = df['Plot'].apply(clean_text)
sample_plot = df['preprocessed'][0]
print(sample_plot)

embeddings = model.encode(sample_plot)
print(embeddings)
"""Text-cleaning helpers for the movie 'Plot' and 'Genre' columns.

Importing this module downloads the NLTK resources it uses and loads the
English frequency dictionaries bundled with symspellpy.
"""
import re
import string
import warnings

import nltk
import pandas as pd
import pkg_resources
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from symspellpy.symspellpy import SymSpell, Verbosity

nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# A freshly constructed SymSpell has an empty vocabulary, so lookup_compound()
# has nothing to correct against until the dictionaries are loaded.
_unigrams_loaded = sym_spell.load_dictionary(
    pkg_resources.resource_filename(
        'symspellpy', 'frequency_dictionary_en_82_765.txt'
    ),
    term_index=0,
    count_index=1,
)
_bigrams_loaded = sym_spell.load_bigram_dictionary(
    pkg_resources.resource_filename(
        'symspellpy', 'frequency_bigramdictionary_en_243_342.txt'
    ),
    term_index=0,
    count_index=2,
)
if not (_unigrams_loaded and _bigrams_loaded):
    # Keep the module importable, but make the degraded mode visible.
    warnings.warn(
        'symspellpy dictionaries could not be loaded; '
        'clean_plot() will not correct spelling'
    )

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Built once: deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_plot(text):
    """Normalise a plot description into a space-separated token string.

    Steps, in order: lower-case, delete ASCII punctuation, replace any other
    non-word character with a space, apply SymSpell compound spelling
    correction (edit distance up to 2), drop English stop words, then stem
    and lemmatize each remaining token.

    Args:
        text: The raw plot text; must be a ``str``.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    text = text.lower().translate(_PUNCTUATION_TABLE)
    text = re.sub(r'\W', ' ', text)

    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    if suggestions:
        text = suggestions[0].term

    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens)


def get_genre(row):
    """Extract a short genre phrase from the first sentence of ``row['Genre']``.

    The title is removed from the text, the first sentence is lower-cased,
    and, when it contains ``is a`` followed by at least five words, only the
    second to fifth of those words are kept (the first is presumably the
    release year -- confirm against the data). Punctuation and English stop
    words are then removed.

    Args:
        row: A mapping (e.g. a DataFrame row) with ``'Title'`` and
            ``'Genre'`` entries.

    Returns:
        The remaining words joined by single spaces.
    """
    movie = row['Title']
    print(movie)

    # Titles read from a spreadsheet may be numbers ("1917") or missing;
    # str.replace() needs a real string.
    title = '' if pd.isna(movie) else str(movie)

    text = row['Genre']
    if title:
        # Strip the title before cutting at the first period, otherwise a
        # title such as "Dr. No" ends the sentence prematurely.
        text = text.replace(title, '')
    text = text.split('.')[0].lower()

    match = re.search(r'is a ((?:\S+\s+){4}\S+)', text)
    if match:
        text = ' '.join(match.group(1).split()[1:])

    text = text.translate(_PUNCTUATION_TABLE)
    text = re.sub(r'\W', ' ', text)
    return ' '.join(
        word for word in word_tokenize(text) if word not in stop_words
    )
"""Clean the movie workbook and write the preprocessed copy.

Run as a script to read the source workbook, drop rows without a genre or
plot, add a ``Processed_Plot`` column, rewrite ``Genre`` to a short phrase,
and save the result.
"""
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from preprocessing import clean_plot, get_genre

INPUT_PATH = 'C:\\Users\\ishaa\\OneDrive\\Documents\\MSU\\Spring 2026\\Data mining\\Project\\updated_data.xlsx'
OUTPUT_PATH = 'C:\\Users\\ishaa\\OneDrive\\Documents\\MSU\\Spring 2026\\Data mining\\Project\\preprocessed_data.xlsx'


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH):
    """Run the preprocessing pipeline and return the resulting DataFrame.

    Args:
        input_path: Excel workbook to read; needs 'Title', 'Genre' and
            'Plot' columns.
        output_path: Where the processed workbook is written.

    Returns:
        The processed DataFrame that was written to *output_path*.
    """
    df = pd.read_excel(input_path, engine='openpyxl')
    print(len(df))

    # Rows lacking either text column cannot be cleaned.
    df = df.dropna(subset=['Genre', 'Plot'])
    print(len(df))

    df['Processed_Plot'] = df['Plot'].apply(clean_plot)
    df['Genre'] = df[['Genre', 'Title']].apply(get_genre, axis=1)

    df.to_excel(output_path, index=False)
    print(df.columns)
    return df


if __name__ == '__main__':
    # Guarded so that importing this module does not trigger file I/O.
    df = main()