mirror of
https://github.com/kuhyx/WUT_Computer_Science.git
synced 2026-07-04 12:43:04 +02:00
git-subtree-dir: Programming/ERSMS-project git-subtree-mainline:7861d69ae9git-subtree-split:d060e8285a
171 lines
5.3 KiB
Python
171 lines
5.3 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
from ast import literal_eval
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
import hashlib
|
|
import json
|
|
from configparser import ConfigParser
|
|
import psycopg2
|
|
from flask import Flask, request, jsonify
|
|
from flask_caching import Cache
|
|
|
|
|
|
# Flask application object; the recommendation route below is registered on it.
app = Flask(__name__)

# Cache extension configured for the 'SimpleCache' backend. It is attached to
# ``app`` later through ``cache.init_app(app)`` in the script section.
cache = Cache(config={'CACHE_TYPE': 'SimpleCache'})

# Module-level placeholders. ``conn`` is replaced by the psycopg2 connection
# when the file is run as a script; ``db_connector`` is not referenced in the
# visible part of this file.
db_connector = conn = None
|
|
|
|
|
|
def get_director(x):
    """
    Find the director in a list of crew entries.

    :param x: iterable of crew entries, each indexable by 'job' and 'name'
    :return: 'name' of the first entry whose 'job' equals 'Director',
        or ``np.nan`` when no entry matches
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)
|
|
|
|
|
|
def get_list(x):
    """
    Extract at most the first three 'name' values from a list of entries.

    :param x: list of entries indexable by 'name'; any non-list value is
        treated as "no data"
    :return: list with up to three names, or an empty list when ``x`` is not a list
    """
    if not isinstance(x, list):
        return []
    # Read every name first, then truncate, so the cut-off applies to the
    # fully collected list.
    return [entry['name'] for entry in x][:3]
|
|
|
|
|
|
def clean_data(x):
    """
    Normalise text by removing spaces and lower-casing it.

    :param x: a string, a list of strings, or anything else
    :return: normalised list for a list input, normalised string for a string
        input, and an empty string for every other type
    """
    if isinstance(x, list):
        return [str.lower(item.replace(" ", "")) for item in x]
    if isinstance(x, str):
        return str.lower(x.replace(" ", ""))
    # Anything that is neither a list nor a string collapses to ''.
    return ''
|
|
|
|
|
|
def create_soup(x):
    """
    Build the space-separated feature string ("soup") for one movie.

    :param x: mapping with 'keywords', 'cast' and 'genres' (iterables of
        strings) and 'director' (a string)
    :return: keywords, cast, director and genres joined by single spaces,
        in that order
    """
    sections = (
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    )
    return ' '.join(sections)
|
|
|
|
|
|
class MovieRecommender:
    """
    Content-based movie recommender built on cosine similarity.

    ``fit`` turns the keywords, cast, director and genres of every movie into
    a bag-of-words "soup" and stores the pairwise cosine similarity of those
    soups. ``get_recommendations`` scores candidate movies against one or more
    seed movies.
    """

    def __init__(self):
        # Merged, preprocessed movie table. Row position i corresponds to
        # row/column i of ``cosine_sim``.
        self.df = None
        # Square matrix of pairwise cosine similarities between movie soups.
        self.cosine_sim = None

    def fit(self, credits_file, movies_file):
        """
        Fit the recommender to the given data.

        :param credits_file: CSV file with the credits
        :param movies_file: CSV file with the movies
        :return: None
        """
        credits_df = pd.read_csv(credits_file)
        movies_df = pd.read_csv(movies_file)
        # Positional rename: the credits file must have exactly four columns.
        # NOTE(review): 'tittle' is presumably misspelled on purpose so it does
        # not collide with a 'title' column of the movies file in the merge -
        # confirm before "fixing" it.
        credits_df.columns = ['id', 'tittle', 'cast', 'crew']
        movies_df = movies_df.merge(credits_df, on='id')
        movies_df['overview'] = movies_df['overview'].fillna('')

        # These columns hold Python-literal strings; parse them into objects.
        for feature in ('cast', 'crew', 'keywords', 'genres'):
            movies_df[feature] = movies_df[feature].apply(literal_eval)

        movies_df['director'] = movies_df['crew'].apply(get_director)

        # Keep only (up to three) names per list-valued feature.
        for feature in ('cast', 'keywords', 'genres'):
            movies_df[feature] = movies_df[feature].apply(get_list)

        # Lower-case and strip spaces so multi-word names become single tokens.
        for feature in ('cast', 'keywords', 'director', 'genres'):
            movies_df[feature] = movies_df[feature].apply(clean_data)

        movies_df['soup'] = movies_df.apply(create_soup, axis=1)

        count_matrix = CountVectorizer(stop_words='english').fit_transform(movies_df['soup'])
        similarity = cosine_similarity(count_matrix, count_matrix)

        # Publish both attributes together, only after everything succeeded,
        # so a failed fit never leaves a table without a matching matrix.
        self.df = movies_df.reset_index()
        self.cosine_sim = similarity

    def _get_recommendations_one_input(self, movie_id, top_n=100):
        """
        Create recommendations based on a single movie.

        :param movie_id: id of the movie to build recommendations for
        :param top_n: maximum number of recommendations to return (default 100)
        :return: list ``[movie_ids, similarity_scores]`` where both items are
            np.array, ordered from most to least similar
        :raises KeyError: if ``movie_id`` is not present in the fitted data
        """
        id_column = self.df['id']
        # Map movie id -> row position. Duplicated ids keep their first row so
        # the lookup below always yields a single position.
        positions = pd.Series(np.arange(len(id_column)), index=id_column)
        positions = positions[~positions.index.duplicated(keep='first')]
        idx = positions[movie_id]

        scores = np.asarray(self.cosine_sim[idx])
        # Stable sort on the negated scores: descending similarity, ties kept
        # in row order.
        order = np.argsort(-scores, kind='stable')
        # Remove the seed movie by position. It is not guaranteed to sort
        # first (ties with identical soups, or an all-zero similarity row),
        # so slicing off element 0 could drop the wrong movie.
        order = order[order != idx][:top_n]
        return [id_column.iloc[order].values, scores[order]]

    def get_recommendations(self, movie_ids: list) -> dict:
        """
        Create recommendations based on the ids of the given movies.

        Each candidate's score is the sum of its similarities to the seed
        movies divided by the number of seeds, rounded to 4 decimal places.
        Seed movies themselves are never recommended.

        :param movie_ids: ids of the movies the recommendations are based on
        :return: dict ``{movie_id: similarity_score}``; empty when
            ``movie_ids`` is empty
        :raises KeyError: if any id is not present in the fitted data
        """
        seed_ids = set(movie_ids)  # built once: O(1) membership in the loop
        seed_count = len(movie_ids)
        totals = {}
        for movie_id in movie_ids:
            recommended_ids, sim_scores = self._get_recommendations_one_input(movie_id)
            for recommended_id, sim_score in zip(recommended_ids, sim_scores):
                candidate = int(recommended_id)
                if candidate in seed_ids:
                    continue
                totals[candidate] = totals.get(candidate, 0.0) + float(sim_score) / seed_count
        # Round once at the end so the reported values really have at most
        # four decimals and rounding error does not accumulate per term.
        return {candidate: round(total, 4) for candidate, total in totals.items()}
|
|
|
|
|
|
# Shared recommender instance used by the route handler below. It is built and
# fitted when this module is loaded, reading both CSV files from paths relative
# to the current working directory.
recommender = MovieRecommender()
recommender.fit(
    'datasets/tmdb_5000_credits.csv',
    'datasets/tmdb_5000_movies.csv',
)
|
|
|
|
|
|
def make_cache_key():
    """
    Build the cache key for the current request from its JSON body.

    A list body is sorted first, so the same ids sent in a different order
    share one cache entry. Lists whose elements cannot be ordered against
    each other are hashed in the order received instead of raising.

    :return: hexadecimal MD5 digest of the JSON-serialised body (used only
        as a cache key, not for security)
    """
    data = request.get_json()
    if isinstance(data, list):
        try:
            data = sorted(data)
        except TypeError:
            # Mixed or unorderable elements (e.g. [1, "a"] or nested objects):
            # keep the received order so key generation does not crash the
            # request before the view function gets to run.
            pass
    return hashlib.md5(json.dumps(data).encode('utf-8')).hexdigest()
|
|
|
|
|
|
@app.route("/api/v3/AI_recommendations", methods=["POST"])
@cache.cached(timeout=300, key_prefix=make_cache_key)
def AI_recommendations():
    """
    Return movie recommendations for the movie ids sent in the request body.

    The body must be a JSON list of movie ids. Responses are cached for 300
    seconds under the key produced by ``make_cache_key``.

    :return: JSON object mapping recommended movie id to similarity score;
        a JSON error with status 400 when the body is not a list, or 404 when
        an id is not known to the recommender
    """
    ids = request.get_json()
    if not isinstance(ids, list):
        return jsonify({"error": "Request body must be a JSON list of movie ids"}), 400

    try:
        recommendations = recommender.get_recommendations(ids)
    except KeyError as error:
        # The recommender raises KeyError for ids missing from its data set.
        return jsonify({"error": f"Unknown movie id: {error}"}), 404
    return jsonify(recommendations)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: read the database settings, wait until PostgreSQL
    # accepts a connection, then serve the Flask app until it stops.
    import time  # needed only here, for the pause between connection attempts

    config = ConfigParser()
    config.read("init_scripts/constants.ini")

    # Resolve the settings once, outside the retry loop, so a missing
    # section/key or a non-numeric port fails immediately instead of being
    # retried forever.
    postgres = config["postgres"]
    connection_settings = {
        "host": postgres["host"],
        "database": postgres["database"],
        "user": postgres["user"],
        "password": postgres["password"],
        "port": int(postgres["port"]),
    }

    while True:
        try:
            conn = psycopg2.connect(**connection_settings)
        except psycopg2.OperationalError as error:
            # Only connection failures are retried; report why and pause so
            # the loop does not spin at full speed while the database starts.
            print("Trying to connect with database")
            print(f"Last connection error: {error}")
            time.sleep(1)
        else:
            break

    try:
        cache.init_app(app)
        # NOTE(review): debug=True turns on Flask's debug mode (interactive
        # debugger and reloader); confirm this is never used outside local
        # development.
        app.run(host="localhost", port=8081, debug=True)
    finally:
        # Release the connection even if the server exits with an error.
        conn.close()
|