mirror of
https://github.com/kuhyx/WUT_Computer_Science.git
synced 2026-07-04 19:23:03 +02:00
Refactored into a class
This commit is contained in:
parent
6302bf37c2
commit
3f22570c4e
@ -1,30 +1,8 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.metrics.pairwise import linear_kernel
|
||||
from ast import literal_eval
|
||||
|
||||
|
||||
# Function that takes in movie title as input and outputs most similar movies based on description
|
||||
def get_recommendations(title, cosine_sim, df):
    """Return the titles of the movies most similar to ``title``.

    Args:
        title: Movie title to look up in ``df['title']``.
        cosine_sim: Square pairwise similarity matrix; row ``i`` is indexed
            by the position of the ``i``-th row of ``df``.
        df: DataFrame with a ``title`` column.

    Returns:
        A pandas Series with up to 10 entries of ``df['title']``, ordered
        from most to least similar. The queried row itself is never included.

    Raises:
        KeyError: If ``title`` does not occur in ``df['title']``.
    """
    # Look the title up by row *position* rather than by index label:
    # ``cosine_sim`` is positional, and titles are not guaranteed to be
    # unique, so a label-based Series lookup can return several rows.
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        raise KeyError(f"Title not found: {title!r}")
    # With duplicated titles, use the first matching row.
    idx = int(matches[0])

    # Pairwise similarity scores of every other movie with that movie.
    # The queried row is excluded explicitly instead of assuming it sorts
    # first, which is not true when another row ties its score.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort by similarity, highest first (stable, so ties keep row order).
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the 10 most similar movies and map positions back to titles.
    movie_indices = [position for position, _ in sim_scores[:10]]
    return df['title'].iloc[movie_indices]
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
|
||||
def get_director(x):
|
||||
@ -34,25 +12,19 @@ def get_director(x):
|
||||
return np.nan
|
||||
|
||||
|
||||
# Returns the first 3 elements of the list, or the entire list if it has fewer than 3.
|
||||
def get_list(x, limit=3):
    """Return the first ``limit`` ``name`` values from a list of records.

    Args:
        x: Expected to be a list of dicts that each carry a ``'name'`` key.
            Any other value is treated as missing data.
        limit: Maximum number of names to return (default 3).

    Returns:
        A list with at most ``limit`` names, in their original order, or an
        empty list when ``x`` is not a list.
    """
    if not isinstance(x, list):
        # Missing or malformed cell: nothing to extract.
        return []

    # Skip entries that are not dicts or have no 'name' key, so one bad
    # record does not abort processing of the whole column.
    names = [
        item['name']
        for item in x
        if isinstance(item, dict) and 'name' in item
    ]
    # Slicing already handles lists shorter than ``limit``.
    return names[:limit]
|
||||
|
||||
|
||||
# Function to convert all strings to lower case and strip names of spaces
|
||||
def clean_data(x):
|
||||
if isinstance(x, list):
|
||||
return [str.lower(i.replace(" ", "")) for i in x]
|
||||
else:
|
||||
# Check if director exists. If not, return empty string
|
||||
if isinstance(x, str):
|
||||
return str.lower(x.replace(" ", ""))
|
||||
else:
|
||||
@ -63,61 +35,54 @@ def create_soup(x):
|
||||
return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
|
||||
|
||||
|
||||
def main():
|
||||
df1 = pd.read_csv('datasets/tmdb_5000_credits.csv')
|
||||
df2 = pd.read_csv('datasets/tmdb_5000_movies.csv')
|
||||
class MovieRecommender:
|
||||
def __init__(self):
|
||||
self.df = None
|
||||
self.cosine_sim = None
|
||||
|
||||
df1.columns = ['id', 'tittle', 'cast', 'crew']
|
||||
df2 = df2.merge(df1, on='id')
|
||||
df2['overview'] = df2['overview'].fillna('')
|
||||
def get_recommendations(self, title):
    """Return the titles of the movies most similar to ``title``.

    Reads ``self.df`` (a DataFrame with a ``title`` column) and
    ``self.cosine_sim`` (a square similarity matrix indexed by row
    position of ``self.df``).

    Args:
        title: Movie title to look up in ``self.df['title']``.

    Returns:
        A pandas Series with up to 10 entries of ``self.df['title']``,
        ordered from most to least similar. The queried row itself is
        never included.

    Raises:
        RuntimeError: If ``self.df`` or ``self.cosine_sim`` is still None.
        KeyError: If ``title`` does not occur in ``self.df['title']``.
    """
    if self.df is None or self.cosine_sim is None:
        raise RuntimeError("Recommender has no data; call fit() first")

    # Look the title up by row *position*: the similarity matrix is
    # positional, and titles are not guaranteed to be unique, so a
    # label-based Series lookup can return several rows.
    matches = np.flatnonzero((self.df['title'] == title).to_numpy())
    if matches.size == 0:
        raise KeyError(f"Title not found: {title!r}")
    # With duplicated titles, use the first matching row.
    idx = int(matches[0])

    # Exclude the queried row explicitly instead of assuming it sorts
    # first, which is not true when another row ties its score.
    sim_scores = [
        (position, score)
        for position, score in enumerate(self.cosine_sim[idx])
        if position != idx
    ]
    # Highest similarity first (stable, so ties keep row order).
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in sim_scores[:10]]
    return self.df['title'].iloc[movie_indices]
|
||||
|
||||
# Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
|
||||
tfidf = TfidfVectorizer(stop_words='english')
|
||||
def fit(self, credits_file, movies_file):
|
||||
df1 = pd.read_csv(credits_file)
|
||||
df2 = pd.read_csv(movies_file)
|
||||
df1.columns = ['id', 'tittle', 'cast', 'crew']
|
||||
df2 = df2.merge(df1, on='id')
|
||||
df2['overview'] = df2['overview'].fillna('')
|
||||
self.df = df2
|
||||
|
||||
# Construct the required TF-IDF matrix by fitting and transforming the data
|
||||
tfidf_matrix = tfidf.fit_transform(df2['overview']) # matrix with word usage
|
||||
features = ['cast', 'crew', 'keywords', 'genres']
|
||||
for feature in features:
|
||||
df2[feature] = df2[feature].apply(literal_eval)
|
||||
|
||||
# Compute the cosine similarity matrix
|
||||
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
|
||||
df2['director'] = df2['crew'].apply(get_director)
|
||||
|
||||
# print(get_recommendations('The Avengers', cosine_sim, df_movies))
|
||||
features = ['cast', 'keywords', 'genres']
|
||||
for feature in features:
|
||||
df2[feature] = df2[feature].apply(get_list)
|
||||
|
||||
features = ['cast', 'crew', 'keywords', 'genres']
|
||||
for feature in features:
|
||||
df2[feature] = df2[feature].apply(literal_eval)
|
||||
features = ['cast', 'keywords', 'director', 'genres']
|
||||
for feature in features:
|
||||
df2[feature] = df2[feature].apply(clean_data)
|
||||
|
||||
# Define new director, cast, genres and keywords features that are in a suitable form.
|
||||
df2['director'] = df2['crew'].apply(get_director)
|
||||
df2['soup'] = df2.apply(create_soup, axis=1)
|
||||
|
||||
features = ['cast', 'keywords', 'genres']
|
||||
for feature in features:
|
||||
df2[feature] = df2[feature].apply(get_list)
|
||||
count = CountVectorizer(stop_words='english')
|
||||
count_matrix = count.fit_transform(df2['soup'])
|
||||
self.cosine_sim = cosine_similarity(count_matrix, count_matrix)
|
||||
|
||||
# Apply clean_data function to your features.
|
||||
features = ['cast', 'keywords', 'director', 'genres']
|
||||
|
||||
for feature in features:
|
||||
df2[feature] = df2[feature].apply(clean_data)
|
||||
|
||||
df2['soup'] = df2.apply(create_soup, axis=1)
|
||||
|
||||
# Import CountVectorizer and create the count matrix
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
|
||||
count = CountVectorizer(stop_words='english')
|
||||
count_matrix = count.fit_transform(df2['soup'])
|
||||
|
||||
# Compute the Cosine Similarity matrix based on the count_matrix
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
|
||||
|
||||
# Reset index of our main DataFrame and construct reverse mapping as before
|
||||
df2 = df2.reset_index()
|
||||
indices = pd.Series(df2.index, index=df2['title'])
|
||||
|
||||
print(get_recommendations('The Dark Knight Rises', cosine_sim2, df2))
|
||||
self.df = df2.reset_index()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
# Example usage:
|
||||
if __name__ == "__main__":
|
||||
recommender = MovieRecommender()
|
||||
recommender.fit('datasets/tmdb_5000_credits.csv', 'datasets/tmdb_5000_movies.csv')
|
||||
recommendations = recommender.get_recommendations('The Dark Knight Rises')
|
||||
print(recommendations)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user