# Helpers added by the "preprocessing changes" patch to scripts/preprocessing.py.


def pre_director(text):
    """Normalize a director field to lowercase with outer whitespace removed.

    Args:
        text: Raw director value. Anything that is not a non-empty string
            (``None``, ``""``, a float NaN from a missing cell, ...) is
            treated as missing.

    Returns:
        The lowercased, stripped string, or ``""`` when the value is missing.
    """
    # A missing cell may arrive as a float NaN, which is truthy and has no
    # .lower(); checking the type avoids an AttributeError on such rows.
    if not isinstance(text, str) or not text:
        return ""
    return text.lower().strip()


def clean_cast(text, top_k=5):
    """Split a comma-separated cast string into a list of normalized names.

    Names are lowercased, stripped of surrounding whitespace, and empty
    entries (e.g. from ``"a,,b"`` or a trailing comma) are dropped.

    Args:
        text: Raw cast value. Anything that is not a non-empty string is
            treated as missing.
        top_k: Maximum number of names to keep, counted from the start of
            the list. ``None`` keeps every name; negative values are
            treated as 0.

    Returns:
        A list of at most ``top_k`` cleaned names, in their original order;
        ``[]`` when the value is missing.
    """
    # Same missing-value guard as pre_director: NaN is truthy but not a str.
    if not isinstance(text, str) or not text:
        return []
    names = (part.strip() for part in text.lower().split(","))
    cast_list = [name for name in names if name]
    if top_k is None:
        return cast_list
    # Clamp so a negative top_k does not slice names off the end instead.
    return cast_list[:max(top_k, 0)]