Podstawowa wersja AI do rekomendacji

Aktualnie tylko na podstawie jednego filmu, wszystko wwalone do maina
2026-07-04 14:43:08 +02:00 · 2024-05-11 15:58:04 +02:00 · 2024-05-11 15:58:04 +02:00 · 6302bf37c2
commit 6302bf37c2
parent 770559d5ea
3 changed files with 9731 additions and 0 deletions
--- a/movie_recommendations/datasets/tmdb_5000_credits.csv
+++ b/movie_recommendations/datasets/tmdb_5000_credits.csv
--- a/movie_recommendations/datasets/tmdb_5000_movies.csv
+++ b/movie_recommendations/datasets/tmdb_5000_movies.csv
--- a/movie_recommendations/main.py
+++ b/movie_recommendations/main.py
@ -0,0 +1,123 @@
+from ast import literal_eval
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.metrics.pairwise import linear_kernel
+
+
+def get_recommendations(title, cosine_sim, df, top_n=10):
+    """Return the titles of the movies most similar to ``title``.
+
+    Args:
+        title: Exact value to look up in ``df['title']``. If several rows
+            share the title, the first one is used.
+        cosine_sim: Similarity matrix whose row ``i`` holds the scores of
+            the movie at position ``i`` of ``df`` against every movie.
+        df: DataFrame with a ``'title'`` column.
+        top_n: Maximum number of recommendations to return.
+
+    Returns:
+        A Series with up to ``top_n`` titles ordered from most to least
+        similar. The queried movie itself is never part of the result.
+
+    Raises:
+        KeyError: If no row of ``df`` has the given title.
+        ValueError: If ``top_n`` is negative.
+    """
+    if top_n < 0:
+        raise ValueError(f"top_n must be non-negative, got {top_n}")
+
+    # Look the title up by row *position*: the similarity matrix is
+    # positional, so index labels must not be used to address it.
+    matches = np.flatnonzero((df['title'] == title).to_numpy())
+    if matches.size == 0:
+        raise KeyError(title)
+    idx = int(matches[0])
+
+    # Pairwise similarity scores of every movie with the queried one.
+    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()
+
+    # Stable descending sort keeps the original row order among ties.
+    ranked = np.argsort(-scores, kind='stable')
+
+    # Drop the queried movie explicitly instead of assuming it sorts first;
+    # another movie with an identical feature vector may tie with it.
+    ranked = ranked[ranked != idx][:top_n]
+
+    return df['title'].iloc[ranked]
+
+
+def get_director(x):
+    """Return the name of the first crew member whose job is 'Director'.
+
+    Args:
+        x: Iterable of crew entries, each expected to be a dict with
+            ``'job'`` and ``'name'`` keys.
+
+    Returns:
+        The director's name, or ``np.nan`` when ``x`` is not iterable,
+        contains no director, or the matching entry is malformed.
+    """
+    # Missing cells (e.g. NaN) are not iterable; treat them as "no director"
+    # so the function is as forgiving as get_list.
+    try:
+        members = iter(x)
+    except TypeError:
+        return np.nan
+
+    for member in members:
+        if (
+            isinstance(member, dict)
+            and member.get('job') == 'Director'
+            and 'name' in member
+        ):
+            return member['name']
+    return np.nan
+
+
+def get_list(x, limit=3):
+    """Return the ``'name'`` values of the first ``limit`` entries of ``x``.
+
+    Args:
+        x: List of dicts carrying a ``'name'`` key.
+        limit: Maximum number of names to return; expected to be
+            non-negative. The whole list is returned when it is shorter.
+
+    Returns:
+        A list with at most ``limit`` names. Entries that are not dicts or
+        lack a ``'name'`` key are skipped, and any non-list input yields
+        an empty list.
+    """
+    if not isinstance(x, list):
+        # Missing/malformed data
+        return []
+
+    names = [item['name'] for item in x if isinstance(item, dict) and 'name' in item]
+    # Slicing already copes with lists shorter than the limit.
+    return names[:limit]
+
+
+def clean_data(x):
+    """Lower-case ``x`` and remove every space from it.
+
+    A string is normalised directly, a list is normalised element by
+    element, and anything else (e.g. a missing director) becomes ``''``.
+    """
+    def _normalize(text):
+        return str.lower(text.replace(" ", ""))
+
+    if isinstance(x, str):
+        return _normalize(x)
+    if not isinstance(x, list):
+        # Neither a name nor a list of names: nothing to clean.
+        return ''
+    return list(map(_normalize, x))
+
+
+def create_soup(x):
+    """Join keywords, cast, director and genres of ``x`` into one string.
+
+    ``x`` is indexed by the keys ``'keywords'``, ``'cast'``, ``'director'``
+    and ``'genres'``; the three list-valued fields are space-joined and all
+    four parts are then separated by single spaces, in that order.
+    """
+    parts = [
+        ' '.join(x['keywords']),
+        ' '.join(x['cast']),
+        x['director'],
+        ' '.join(x['genres']),
+    ]
+    return ' '.join(parts)
+
+
+def _load_movies():
+    """Read both TMDB CSV files and merge them into one DataFrame on 'id'."""
+    # Resolve the datasets relative to this file so the script does not
+    # depend on the current working directory.
+    datasets = Path(__file__).resolve().parent / 'datasets'
+    credits_df = pd.read_csv(datasets / 'tmdb_5000_credits.csv')
+    movies_df = pd.read_csv(datasets / 'tmdb_5000_movies.csv')
+
+    # NOTE(review): 'tittle' is presumably misspelled on purpose so that it
+    # does not collide with the movies file's own 'title' column during the
+    # merge - confirm before renaming.
+    credits_df.columns = ['id', 'tittle', 'cast', 'crew']
+    movies_df = movies_df.merge(credits_df, on='id')
+    movies_df['overview'] = movies_df['overview'].fillna('')
+
+    # Positional 0..n-1 index, so row positions match similarity-matrix rows.
+    return movies_df.reset_index(drop=True)
+
+
+def _overview_similarity(df):
+    """Return the pairwise similarity of movies based on their overviews."""
+    # Remove all english stop words such as 'the', 'a'.
+    tfidf = TfidfVectorizer(stop_words='english')
+    tfidf_matrix = tfidf.fit_transform(df['overview'])
+    return linear_kernel(tfidf_matrix, tfidf_matrix)
+
+
+def _metadata_soup(df):
+    """Build one text 'soup' per movie from cast, keywords, genres, director."""
+    meta = pd.DataFrame(index=df.index)
+    # The CSV stores these columns as stringified Python literals.
+    for feature in ('cast', 'keywords', 'genres'):
+        parsed = df[feature].apply(literal_eval)
+        meta[feature] = parsed.apply(get_list).apply(clean_data)
+    directors = df['crew'].apply(literal_eval).apply(get_director)
+    meta['director'] = directors.apply(clean_data)
+    return meta.apply(create_soup, axis=1)
+
+
+def _metadata_similarity(df):
+    """Return the pairwise cosine similarity of movies based on metadata."""
+    count = CountVectorizer(stop_words='english')
+    count_matrix = count.fit_transform(_metadata_soup(df))
+    return cosine_similarity(count_matrix, count_matrix)
+
+
+def main(title='The Dark Knight Rises', *, use_overview=False):
+    """Print the movies most similar to ``title``.
+
+    Args:
+        title: Title of the movie to base the recommendations on.
+        use_overview: When true, compare movies by their overview text;
+            otherwise (default) by cast, director, keywords and genres.
+    """
+    movies = _load_movies()
+
+    # Build only the similarity matrix that is actually used; each one is a
+    # dense movies x movies array.
+    if use_overview:
+        similarity = _overview_similarity(movies)
+    else:
+        similarity = _metadata_similarity(movies)
+
+    print(get_recommendations(title, similarity, movies))
+
+
+if __name__ == '__main__':
+    main()