Podstawowa wersja AI do rekomendacji

Aktualnie tylko na podstawie jednego filmu, wszystko wwalone do maina
This commit is contained in:
gzub04 2024-05-11 15:58:04 +02:00
parent 770559d5ea
commit 6302bf37c2
3 changed files with 9731 additions and 0 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,123 @@
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
# Function that takes in movie title as input and outputs most similar movies based on description
def get_recommendations(title, cosine_sim, df):
indices = pd.Series(df.index, index=df['title']).drop_duplicates()
# Get the index of the movie that matches the title
idx = indices[title]
# Get the pairwsie similarity scores of all movies with that movie
sim_scores = list(enumerate(cosine_sim[idx]))
# Sort the movies based on the similarity scores
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
# Get the scores of the 10 most similar movies
sim_scores = sim_scores[1:11]
# Get the movie indices
movie_indices = [i[0] for i in sim_scores]
# Return the top 10 most similar movies
return df['title'].iloc[movie_indices]
def get_director(x):
for i in x:
if i['job'] == 'Director':
return i['name']
return np.nan
# Returns the list top 3 elements or entire list; whichever is more.
def get_list(x):
if isinstance(x, list):
names = [i['name'] for i in x]
# Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
if len(names) > 3:
names = names[:3]
return names
# Return empty list in case of missing/malformed data
return []
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
if isinstance(x, list):
return [str.lower(i.replace(" ", "")) for i in x]
else:
# Check if director exists. If not, return empty string
if isinstance(x, str):
return str.lower(x.replace(" ", ""))
else:
return ''
def create_soup(x):
return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
def main():
df1 = pd.read_csv('datasets/tmdb_5000_credits.csv')
df2 = pd.read_csv('datasets/tmdb_5000_movies.csv')
df1.columns = ['id', 'tittle', 'cast', 'crew']
df2 = df2.merge(df1, on='id')
df2['overview'] = df2['overview'].fillna('')
# Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')
# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df2['overview']) # matrix with word usage
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# print(get_recommendations('The Avengers', cosine_sim, df_movies))
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
df2[feature] = df2[feature].apply(literal_eval)
# Define new director, cast, genres and keywords features that are in a suitable form.
df2['director'] = df2['crew'].apply(get_director)
features = ['cast', 'keywords', 'genres']
for feature in features:
df2[feature] = df2[feature].apply(get_list)
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']
for feature in features:
df2[feature] = df2[feature].apply(clean_data)
df2['soup'] = df2.apply(create_soup, axis=1)
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df2['soup'])
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
# Reset index of our main DataFrame and construct reverse mapping as before
df2 = df2.reset_index()
indices = pd.Series(df2.index, index=df2['title'])
print(get_recommendations('The Dark Knight Rises', cosine_sim2, df2))
if __name__ == '__main__':
main()