diff --git a/sample_data.xlsx b/sample_data.xlsx
new file mode 100644
index 000000000..4d3b3f5a0
Binary files /dev/null and b/sample_data.xlsx differ
diff --git a/scripts/preprocessing.py b/scripts/preprocessing.py
new file mode 100644
index 000000000..131be10fe
--- /dev/null
+++ b/scripts/preprocessing.py
@@ -0,0 +1,40 @@
+import re
+import string
+
+import nltk
+import pandas as pd
+from nltk.corpus import stopwords
+from nltk.stem import PorterStemmer, WordNetLemmatizer
+from nltk.tokenize import word_tokenize
+from sentence_transformers import SentenceTransformer
+
+nltk.download('wordnet')
+nltk.download('punkt_tab')
+nltk.download('stopwords')
+
+stop_words = set(stopwords.words('english'))
+stemmer = PorterStemmer()
+lemmatizer = WordNetLemmatizer()
+
+model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+
+# sample_data.xlsx is added at the repo root in this change; a relative path
+# keeps the script from being tied to one machine.
+df = pd.read_excel('sample_data.xlsx', engine='openpyxl')
+
+def clean_text(text):
+    text = text.lower()
+    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
+    text = re.sub(r'\W', ' ', text)  # Replace remaining non-word characters with spaces
+    words = [word for word in word_tokenize(text) if word not in stop_words]  # Tokenize and drop stopwords
+    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatize first, while tokens are still dictionary words
+    words = [stemmer.stem(word) for word in words]  # Then stem the lemmas
+    return ' '.join(words)
+
+# print(df.columns)
+df['preprocessed'] = df['Plot'].apply(clean_text)
+sample_plot = df['preprocessed'][0]
+print(sample_plot)
+
+embeddings = model.encode(sample_plot)
+print(embeddings)
diff --git a/scripts/scrape.py b/scripts/scrape.py
index fb9f976ac..ac0a44926 100644
--- a/scripts/scrape.py
+++ b/scripts/scrape.py
@@ -9,8 +9,8 @@ headers = {
 params = {
     "action": "query",
     "format": "json",
+    "titles": "Godfather",
     "prop": "extracts",
-    "titles": "Interstellar",
     "explaintext": True,
     "redirects": 1
 }
@@ -19,12 +19,12 @@ response = requests.get(url, headers=headers, params=params)
 
 print("Status:", response.status_code)
 print("Content-Type:", response.headers.get("content-type"))
-print("First 200 chars:\n", response.text[:200])
+print("First 1000 chars:\n", response.text[:1000])
 
 data = response.json()
 pages = data["query"]["pages"]
 page = next(iter(pages.values()))
 
 print("\nTitle:", page["title"])
-print("\nPreview:\n", page["extract"][:500])
+print("\nPreview:\n", page["extract"])
 
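
Note (not part of the diff): preprocessing.py embeds only the first plot. SentenceTransformer.encode also accepts a list of strings and returns one vector per string (384 dimensions for all-MiniLM-L6-v2), so the whole column can be embedded in one batched call. A minimal sketch, reusing the df, model, and 'preprocessed' column defined above:

    # Encode every preprocessed plot at once; yields an (n_rows, 384) numpy array.
    all_embeddings = model.encode(df['preprocessed'].tolist(), show_progress_bar=True)
    print(all_embeddings.shape)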
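
Note (not part of the diff): scrape.py fetches a single hard-coded title. To pull several plots with the same url, headers, and params, one request per title is the safe pattern, since the TextExtracts API generally serves full-page (non-intro) extracts for only one page per request. The title list here is illustrative:

    # Reuse scrape.py's url/headers/params; swap in one title per request.
    for title in ["Godfather", "Interstellar"]:
        params["titles"] = title
        resp = requests.get(url, headers=headers, params=params)
        page = next(iter(resp.json()["query"]["pages"].values()))
        print(page["title"], len(page.get("extract", "")))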