Refactored into a class

2026-07-04 19:23:03 +02:00 · 2024-05-12 14:17:51 +02:00 · 2024-05-12 14:17:51 +02:00 · 3f22570c4e
commit 3f22570c4e
parent 6302bf37c2
1 changed files with 42 additions and 77 deletions
--- a/movie_recommendations/main.py
+++ b/movie_recommendations/main.py
@ -1,30 +1,8 @@
 import pandas as pd
 import numpy as np
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import linear_kernel
 from ast import literal_eval
-
-
-# Function that takes in movie title as input and outputs most similar movies based on description
-def get_recommendations(title, cosine_sim, df):
-    indices = pd.Series(df.index, index=df['title']).drop_duplicates()
-    # Get the index of the movie that matches the title
-    idx = indices[title]
-
-    # Get the pairwsie similarity scores of all movies with that movie
-    sim_scores = list(enumerate(cosine_sim[idx]))
-
-    # Sort the movies based on the similarity scores
-    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
-
-    # Get the scores of the 10 most similar movies
-    sim_scores = sim_scores[1:11]
-
-    # Get the movie indices
-    movie_indices = [i[0] for i in sim_scores]
-
-    # Return the top 10 most similar movies
-    return df['title'].iloc[movie_indices]
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics.pairwise import cosine_similarity


 def get_director(x):
@ -34,25 +12,19 @@ def get_director(x):
    return np.nan


-# Returns the list top 3 elements or entire list; whichever is more.
 def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
-        # Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names
-
-    # Return empty list in case of missing/malformed data
    return []


-# Function to convert all strings to lower case and strip names of spaces
 def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
-        # Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
@ -63,61 +35,54 @@ def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])


-def main():
-    df1 = pd.read_csv('datasets/tmdb_5000_credits.csv')
-    df2 = pd.read_csv('datasets/tmdb_5000_movies.csv')
+class MovieRecommender:
+    def __init__(self):
+        self.df = None
+        self.cosine_sim = None

-    df1.columns = ['id', 'tittle', 'cast', 'crew']
-    df2 = df2.merge(df1, on='id')
-    df2['overview'] = df2['overview'].fillna('')
+    def get_recommendations(self, title):
+        indices = pd.Series(self.df.index, index=self.df['title']).drop_duplicates()
+        idx = indices[title]
+        sim_scores = list(enumerate(self.cosine_sim[idx]))
+        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
+        sim_scores = sim_scores[1:11]
+        movie_indices = [i[0] for i in sim_scores]
+        return self.df['title'].iloc[movie_indices]

-    # Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
-    tfidf = TfidfVectorizer(stop_words='english')
+    def fit(self, credits_file, movies_file):
+        df1 = pd.read_csv(credits_file)
+        df2 = pd.read_csv(movies_file)
+        df1.columns = ['id', 'tittle', 'cast', 'crew']
+        df2 = df2.merge(df1, on='id')
+        df2['overview'] = df2['overview'].fillna('')
+        self.df = df2

-    # Construct the required TF-IDF matrix by fitting and transforming the data
-    tfidf_matrix = tfidf.fit_transform(df2['overview'])     # matrix with word usage
+        features = ['cast', 'crew', 'keywords', 'genres']
+        for feature in features:
+            df2[feature] = df2[feature].apply(literal_eval)

-    # Compute the cosine similarity matrix
-    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
+        df2['director'] = df2['crew'].apply(get_director)

-    # print(get_recommendations('The Avengers', cosine_sim, df_movies))
+        features = ['cast', 'keywords', 'genres']
+        for feature in features:
+            df2[feature] = df2[feature].apply(get_list)

-    features = ['cast', 'crew', 'keywords', 'genres']
-    for feature in features:
-        df2[feature] = df2[feature].apply(literal_eval)
+        features = ['cast', 'keywords', 'director', 'genres']
+        for feature in features:
+            df2[feature] = df2[feature].apply(clean_data)

-    # Define new director, cast, genres and keywords features that are in a suitable form.
-    df2['director'] = df2['crew'].apply(get_director)
+        df2['soup'] = df2.apply(create_soup, axis=1)

-    features = ['cast', 'keywords', 'genres']
-    for feature in features:
-        df2[feature] = df2[feature].apply(get_list)
+        count = CountVectorizer(stop_words='english')
+        count_matrix = count.fit_transform(df2['soup'])
+        self.cosine_sim = cosine_similarity(count_matrix, count_matrix)

-    # Apply clean_data function to your features.
-    features = ['cast', 'keywords', 'director', 'genres']
-
-    for feature in features:
-        df2[feature] = df2[feature].apply(clean_data)
-
-    df2['soup'] = df2.apply(create_soup, axis=1)
-
-    # Import CountVectorizer and create the count matrix
-    from sklearn.feature_extraction.text import CountVectorizer
-
-    count = CountVectorizer(stop_words='english')
-    count_matrix = count.fit_transform(df2['soup'])
-
-    # Compute the Cosine Similarity matrix based on the count_matrix
-    from sklearn.metrics.pairwise import cosine_similarity
-
-    cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
-
-    # Reset index of our main DataFrame and construct reverse mapping as before
-    df2 = df2.reset_index()
-    indices = pd.Series(df2.index, index=df2['title'])
-
-    print(get_recommendations('The Dark Knight Rises', cosine_sim2, df2))
+        self.df = df2.reset_index()


-if __name__ == '__main__':
-    main()
+# Example usage:
+if __name__ == "__main__":
+    recommender = MovieRecommender()
+    recommender.fit('datasets/tmdb_5000_credits.csv', 'datasets/tmdb_5000_movies.csv')
+    recommendations = recommender.get_recommendations('The Dark Knight Rises')
+    print(recommendations)