# Reconstructed from a whitespace-collapsed git patch (commit 6bdb869e,
# "feat: add midterm code so far") that created midterm/main.py.
# The same patch created midterm/requirements.txt listing:
#     pandas, numpy, seaborn, matplotlib
# NOTE(review): this module also needs scikit-learn and scipy, which that
# requirements.txt does not list; seaborn and matplotlib are not used here.
"""
Code for preprocessing data and creating a model that predicts and
recommends anime similar to a given anime (currently one picked at random).
"""
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix


def get_data(limit_data=-1, data_folder_path="database"):
    """
    Read the rating list and the anime catalogue from the csv database.

    Args:
        limit_data: Maximum number of rows read from ``animelist.csv``.
            Any value below 0 (the default) reads the whole file.
        data_folder_path: Folder containing ``animelist.csv`` and
            ``anime.csv``.

    Returns:
        A ``(rating_data, anime_contact_data)`` tuple, where
        ``anime_contact_data`` holds only the ``anime_id`` and ``Name``
        columns of the catalogue.
    """
    folder = Path(data_folder_path)
    # User can limit number of data taken into consideration,
    # model seems to work with limit_data value as low as 500,000.
    # nrows=None tells read_csv to load every row.
    row_limit = limit_data if limit_data > -1 else None
    rating_data = pd.read_csv(folder / "animelist.csv", nrows=row_limit)
    anime_data = pd.read_csv(folder / "anime.csv")

    # The catalogue calls the key MAL_ID; rename it so it can be joined
    # with the rating list on anime_id.
    anime_data = anime_data.rename(columns={"MAL_ID": "anime_id"})
    anime_contact_data = anime_data[["anime_id", "Name"]]
    return rating_data, anime_contact_data


def merge_rating_anime_data(rating_data, anime_contact_data, debug=False):
    """
    Attach the anime name to every rating row.

    Args:
        rating_data: Frame with at least ``user_id``, ``anime_id``,
            ``rating``, ``watching_status`` and ``watched_episodes``.
        anime_contact_data: Frame with ``anime_id`` and ``Name``.
        debug: When true, print the head and the shape of the result.

    Returns:
        A new frame with the columns ``user_id``, ``Name``, ``anime_id``,
        ``rating``, ``watching_status`` and ``watched_episodes``. Ratings
        whose ``anime_id`` is missing from the catalogue keep a NaN name
        (left join).
    """
    merged = rating_data.merge(anime_contact_data, on="anime_id", how="left")
    merged = merged[
        ["user_id", "Name", "anime_id", "rating",
         "watching_status", "watched_episodes"]
    ]
    if debug:
        print(merged.head())
        print(merged.shape)
    return merged


def split_data_below_thresholds(rating_data, data_name, threshold, debug=False):
    """
    Keep only rows whose ``data_name`` value occurs at least ``threshold``
    times in ``rating_data``.

    Args:
        rating_data: Frame to filter.
        data_name: Column whose value frequencies are checked.
        threshold: Minimum number of occurrences needed to keep a value.
        debug: When true, print the shape of the filtered frame.

    Returns:
        A filtered copy of ``rating_data``; the input is not modified.
    """
    occurrences = rating_data[data_name].value_counts()
    frequent_values = occurrences[occurrences >= threshold].index
    filtered = rating_data[rating_data[data_name].isin(frequent_values)].copy()
    if debug:
        print(filtered.shape)
    return filtered


def combine_name_and_ratings(rating_data, debug=False):
    """
    Add the number of ratings of each anime to every rating row.

    Rows without a ``Name`` are dropped first. Because both merged frames
    carry a ``rating`` column, the result holds ``rating_x`` (the original
    rating) and ``rating_y`` (the number of ratings for that name).

    Args:
        rating_data: Frame with at least ``Name`` and ``rating``.
        debug: When true, print the head of the per-name count table.

    Returns:
        The merged frame described above.
    """
    named_ratings = rating_data.dropna(axis=0, subset=["Name"])
    rating_count = (
        named_ratings.groupby(by=["Name"])["rating"]
        .count()
        .reset_index()[["Name", "rating"]]
    )
    if debug:
        print(rating_count.head())
    return named_ratings.merge(rating_count, on="Name", how="left")


def get_length_of_data(rating_data, data_name):
    """
    Count the distinct values of the ``<data_name>_id`` column.

    Side effect: a column named ``data_name`` is added to (or overwritten
    in) ``rating_data``, holding a 0-based integer code for every id in
    order of first appearance.

    Args:
        rating_data: Frame containing the ``<data_name>_id`` column.
        data_name: Column prefix, e.g. ``"user"`` or ``"anime"``.

    Returns:
        The number of distinct ids.
    """
    id_column = data_name + "_id"
    # Encoding categorical data
    unique_ids = rating_data[id_column].unique().tolist()
    id_to_code = {value: code for code, value in enumerate(unique_ids)}
    rating_data[data_name] = rating_data[id_column].map(id_to_code)
    return len(id_to_code)


def get_top_ranked(rating_data, data_name, join_table=None, top_data_taken=20):
    """
    Keep the rows that belong to the most frequently rated ids.

    The ids of ``<data_name>_id`` are ranked by their number of ratings in
    ``rating_data``; the rows of ``join_table`` belonging to the
    ``top_data_taken`` highest-ranked ids are returned.

    Args:
        rating_data: Frame with ``<data_name>_id`` and ``rating`` columns.
        data_name: Column prefix, e.g. ``"user"`` or ``"anime"``.
        join_table: Frame to filter; defaults to ``rating_data``.
        top_data_taken: How many ids to keep.

    Returns:
        The matching rows of ``join_table`` with the per-id rating count
        appended as ``rating_r``.
    """
    if join_table is None:
        join_table = rating_data
    id_column = data_name + "_id"
    ratings_per_id = rating_data.groupby(id_column)["rating"].count()
    top_ids = (
        ratings_per_id.dropna()
        .sort_values(ascending=False)
        .iloc[:top_data_taken]
    )
    # NOTE(review): when join_table already carries a "rating_r" column
    # (as in the second call made by get_data_info) the result appears to
    # end up with two columns of that name - confirm this is acceptable.
    return join_table.join(top_ids, rsuffix="_r", how="inner", on=id_column)


def get_data_info(rating_data, debug=False):
    """
    Compute, and in debug mode print, some statistics about the data.

    Side effect: ``user`` and ``anime`` code columns are added to
    ``rating_data`` by ``get_length_of_data``.

    Args:
        rating_data: Frame with ``user_id``, ``anime_id`` and ``rating``.
        debug: When true, print the user/anime cross table of the most
            rated users and anime, the number of users and anime, and the
            smallest and largest ``rating`` value.
    """
    users_number = get_length_of_data(rating_data, "user")
    animes_number = get_length_of_data(rating_data, "anime")

    top_rated = get_top_ranked(rating_data, "user")
    top_rated = get_top_ranked(rating_data, "anime", top_rated)

    # The string "sum" avoids the pandas 2.x deprecation warning that is
    # raised when the numpy callable np.sum is passed as aggfunc.
    pivot = pd.crosstab(top_rated.user_id, top_rated.anime_id,
                        top_rated.rating, aggfunc="sum")
    pivot = pivot.fillna(0)

    smallest_rating = rating_data["rating"].min()
    highest_rating = rating_data["rating"].max()
    if debug:
        print(pivot)
        print(f"Num of users: {users_number}, Num of animes: {animes_number}")
        print(
            f"Min total rating: {smallest_rating}, "
            f"Max total rating: {highest_rating}")


def preprocessing(rating_data, anime_contact_data, debug=False,
                  min_user_ratings=500, min_anime_ratings=200):
    """
    Preprocess the data to make the model more accurate and/or faster.

    Args:
        rating_data: Raw rating list as returned by ``get_data``.
        anime_contact_data: ``anime_id``/``Name`` table from ``get_data``.
        debug: When true, print the intermediate results of every step.
        min_user_ratings: Users with fewer rows than this are dropped.
        min_anime_ratings: Anime with fewer rows than this are dropped.

    Returns:
        A pivot table indexed by anime ``Name`` with one column per
        ``user_id``; missing combinations are filled with 0.
    """
    rating_data = merge_rating_anime_data(
        rating_data, anime_contact_data, debug=debug)
    rating_data = split_data_below_thresholds(
        rating_data, "user_id", min_user_ratings, debug=debug)
    rating_data = split_data_below_thresholds(
        rating_data, "anime_id", min_anime_ratings, debug=debug)
    rating_data = combine_name_and_ratings(rating_data, debug=debug)

    # NOTE(review): this discards each user's own score (rating_x) and
    # keeps the per-title rating count (rating_y) as "rating" - confirm
    # that this is intended.
    rating_data = rating_data.drop(columns="rating_x")
    rating_data = rating_data.rename(columns={"rating_y": "rating"})
    if debug:
        print(rating_data)
        # Only prints statistics, so it is skipped outside debug mode.
        # It adds "user"/"anime" columns, which the pivot below ignores.
        get_data_info(rating_data, debug=True)

    pivot_table = rating_data.pivot_table(
        index="Name", columns="user_id", values="rating"
    ).fillna(0)
    if debug:
        print(pivot_table)
    return pivot_table


def predict(prediction_model, piviot_table, n_recommendations=5):
    """
    Pick a random anime and print the anime the model considers similar.

    Args:
        prediction_model: Fitted model offering
            ``kneighbors(query, n_neighbors=...)`` that returns a
            ``(distances, indices)`` pair of arrays.
        piviot_table: Pivot table the model was fitted on; its index holds
            the anime names.
        n_recommendations: Number of similar anime to print.

    Raises:
        ValueError: If ``piviot_table`` has no rows.
    """
    anime_count = piviot_table.shape[0]
    if anime_count == 0:
        raise ValueError("piviot_table is empty; nothing to recommend")
    random_anime = np.random.choice(anime_count)

    query = piviot_table.iloc[random_anime, :].values.reshape(1, -1)
    # One extra neighbour is requested because the first hit is skipped
    # below; never ask for more neighbours than there are rows.
    distance, suggestions = prediction_model.kneighbors(
        query, n_neighbors=min(n_recommendations + 1, anime_count))
    distances = distance.flatten()
    neighbours = suggestions.flatten()

    print(f"Recommendations for {piviot_table.index[random_anime]}:\n")
    # Neighbour 0 is skipped - presumably it is the queried anime itself.
    for i in range(1, len(distances)):
        print(
            f"{i}: {piviot_table.index[neighbours[i]]}, "
            f"with distance of {distances[i]}:"
        )


def create_model(piviot_table):
    """
    Create a nearest-neighbour model for anime prediction.

    Args:
        piviot_table: Pivot table as returned by ``preprocessing``.

    Returns:
        A ``NearestNeighbors`` model (cosine metric, brute-force search)
        fitted on the sparse form of ``piviot_table``.
    """
    # Imported here so the preprocessing helpers stay importable where
    # scikit-learn is not installed.
    # pylint: disable-next=import-outside-toplevel
    from sklearn.neighbors import NearestNeighbors

    piviot_table_matrix = csr_matrix(piviot_table.values)
    model = NearestNeighbors(metric="cosine", algorithm="brute")
    model.fit(piviot_table_matrix)
    return model


def main():
    """Load the data, fit the model and print one set of recommendations."""
    rating_data, anime_contact_data = get_data(524288)
    pivot_table = preprocessing(rating_data, anime_contact_data)
    model = create_model(pivot_table)
    predict(model, pivot_table)


if __name__ == "__main__":
    main()