From 6bdb869e13bf24bff4b9e5b3707d195cc0fc634e Mon Sep 17 00:00:00 2001
From: Krzysztof Rudnicki <krzysztofrudnicki0@gmail.com>
Date: Sun, 28 May 2023 23:05:07 +0200
Subject: [PATCH] feat: add midterm code so far

---
 midterm/main.py          | 196 +++++++++++++++++++++++++++++++++++++++
 midterm/requirements.txt |   4 +
 2 files changed, 200 insertions(+)
 create mode 100644 midterm/main.py
 create mode 100644 midterm/requirements.txt

diff --git a/midterm/main.py b/midterm/main.py
new file mode 100644
index 00000000..767ba90f
--- /dev/null
+++ b/midterm/main.py
@@ -0,0 +1,196 @@
+"""
+Code for preprocessing data and creating model that predicts and
+recommends anime based on another anime entered by user
+"""
+import pandas as pd
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+from scipy.sparse import csr_matrix
+
+
+def get_data(limit_data=-1, data_folder_path="database"):
+    """
+    Load the ratings table and the anime name lookup from the csv database.
+
+    A limit_data above -1 caps the number of rating rows that are read.
+    Returns (ratings, lookup); lookup holds the anime_id and Name columns.
+    """
+    # nrows=None is the read_csv default and reads the whole file; the
+    # model seems to work with limit_data values as low as 500,000
+    row_cap = limit_data if limit_data > -1 else None
+    ratings = pd.read_csv(data_folder_path + "/animelist.csv", nrows=row_cap)
+    catalogue = pd.read_csv(data_folder_path + "/anime.csv")
+
+    # MAL_ID is renamed to anime_id, the key name the ratings table is
+    # merged on later
+    catalogue = catalogue.rename(columns={"MAL_ID": "anime_id"})
+    return ratings, catalogue[["anime_id", "Name"]]
+
+
+def merge_rating_anime_data(rating_data, anime_contact_data, debug=False):
+    """
+    Attach anime names to the ratings and keep only the columns used later.
+
+    The ratings are left-joined with anime_contact_data on anime_id. With
+    debug set, the first rows and the shape of the result are printed.
+    """
+    kept_columns = [
+        "user_id", "Name", "anime_id", "rating",
+        "watching_status", "watched_episodes",
+    ]
+    # a left join keeps every rating row, even without a matching anime
+    merged = rating_data.merge(anime_contact_data, on="anime_id", how="left")
+    merged = merged[kept_columns]
+    if debug:
+        print(merged.head())
+        print(merged.shape)
+    return merged
+
+
+def split_data_below_thresholds(rating_data, data_name, threshold, debug=False):
+    """
+    Keep rows whose data_name value occurs at least threshold times.
+
+    Returns a filtered copy; with debug set, its shape is printed.
+    """
+    occurrences = rating_data[data_name].value_counts()
+    frequent_values = occurrences[occurrences >= threshold].index
+    kept = rating_data[rating_data[data_name].isin(frequent_values)].copy()
+    if debug:
+        print(kept.shape)
+    return kept
+
+
+def combine_name_and_ratings(rating_data, debug=False):
+    """
+    Add a per-anime rating count to every named row of rating_data.
+
+    Rows without a Name are dropped. The count column is also named
+    "rating", so after the merge pandas' default suffixes leave the
+    original values in "rating_x" and the counts in "rating_y".
+    """
+    named_rows = rating_data.dropna(axis=0, subset=["Name"])
+
+    # number of non-null ratings per anime name
+    per_name = named_rows.groupby(by=["Name"])["rating"].count()
+    counts = per_name.reset_index()[["Name", "rating"]]
+    if debug:
+        print(counts.head())
+
+    # left join: every named row receives the count of its anime
+    return named_rows.merge(counts, on="Name", how="left")
+
+
+def get_length_of_data(rating_data, data_name):
+    """
+    Count distinct "<data_name>_id" values; as a side effect, store each
+    id's first-appearance position in a new data_name column.
+    """
+    id_column = data_name + "_id"
+    distinct_ids = rating_data[id_column].unique().tolist()
+    # Encoding categorical data
+    id_to_code = dict(zip(distinct_ids, range(len(distinct_ids))))
+    rating_data[data_name] = rating_data[id_column].map(id_to_code)
+    return len(id_to_code)
+
+
+def get_top_ranked(rating_data, data_name, join_table=None, top_data_taken=20):
+    """
+    Keep the join_table rows whose "<data_name>_id" is among the
+    top_data_taken ids with the most ratings in rating_data.
+
+    join_table defaults to rating_data. The per-id count is joined in as
+    an extra column, suffixed "_r" when its name is already taken.
+    """
+    key = data_name + "_id"
+    target = rating_data if join_table is None else join_table
+    rating_counts = rating_data.groupby(key)["rating"].count().dropna()
+    busiest = rating_counts.sort_values(ascending=False)[:top_data_taken]
+    return target.join(busiest, rsuffix="_r", how="inner", on=key)
+
+
+def get_data_info(rating_data, debug=False):
+    """
+    Gather some information about the data and print it in debug mode.
+    Returns nothing; get_length_of_data adds columns to rating_data.
+    """
+    users_number = get_length_of_data(rating_data, "user")
+    animes_number = get_length_of_data(rating_data, "anime")
+
+    # rows of the users with most ratings, then of the most-rated anime
+    by_user = get_top_ranked(rating_data, "user")
+    top_rated = get_top_ranked(rating_data, "anime", by_user)
+
+    pivot = pd.crosstab(top_rated.user_id, top_rated.anime_id,
+                        top_rated.rating, aggfunc=np.sum)
+    pivot.fillna(0, inplace=True)
+    smallest_rating = min(rating_data["rating"])
+    highest_rating = max(rating_data["rating"])
+    if debug:
+        print(pivot)
+        print(f"Num of users: {users_number}, Num of animes: {animes_number}")
+        print(
+            f"Min total rating: {smallest_rating}, Max total rating: {highest_rating}")
+
+
+def preprocessing(rating_data, anime_contact_data, debug=False):
+    """
+    Build the anime-by-user pivot table the model is fitted on; debug is
+    forwarded to every helper so that their diagnostics are printed too.
+    """
+    rating_data = merge_rating_anime_data(rating_data, anime_contact_data, debug)
+    # drop users with fewer than 500 rows, then anime with fewer than 200
+    rating_data = split_data_below_thresholds(rating_data, "user_id", 500, debug)
+    rating_data = split_data_below_thresholds(rating_data, "anime_id", 200, debug)
+    rating_data = combine_name_and_ratings(rating_data, debug)
+    # NOTE(review): rating_y (per-anime count) replaces the user scores - intended?
+    rating_data = rating_data.drop(columns="rating_x")
+    rating_data = rating_data.rename(columns={"rating_y": "rating"})
+    if debug:
+        print(rating_data)
+        get_data_info(rating_data, debug)
+    piviot_table = rating_data.pivot_table(
+        index="Name", columns="user_id", values="rating").fillna(0)
+    if debug:
+        print(piviot_table)
+    return piviot_table
+
+
+def predict(prediction_model, piviot_table):
+    """
+    Pick a random anime from piviot_table and print the anime closest to it.
+
+    prediction_model must provide kneighbors(query, n_neighbors=...) and
+    piviot_table must be indexed by anime name. Nothing is returned.
+    """
+    random_anime = np.random.choice(piviot_table.shape[0])
+
+    query = piviot_table.iloc[random_anime, :].values.reshape(1, -1)
+    distance, suggestions = prediction_model.kneighbors(query, n_neighbors=6)
+    distances, neighbours = distance.flatten(), suggestions.flatten()
+
+    # Real f-strings: the old f"...{0}...".format(...) calls rendered "{0}"
+    # as the literal 0, so the names were never actually printed.
+    print(f"Recommendations for {piviot_table.index[random_anime]}:\n")
+    # Entry 0 only served as the header slot in the original loop; the
+    # numbered suggestions start at 1.
+    for rank in range(1, len(distances)):
+        name = piviot_table.index[neighbours[rank]]
+        print(f"{rank}: {name}, with distance of {distances[rank]}:")
+
+
+def create_model(piviot_table):
+    """
+    Fit a brute-force, cosine-distance nearest-neighbour model on the table.
+    """
+    sparse_rows = csr_matrix(piviot_table.values)
+    # fit() returns the estimator itself, so the fitted model is returned
+    neighbours = NearestNeighbors(metric="cosine", algorithm="brute")
+    return neighbours.fit(sparse_rows)
+
+
+if __name__ == "__main__":
+    # cap the ratings at 2**19 rows, then preprocess, fit and recommend
+    RATINGS, ANIME_NAMES = get_data(524288)
+    TABLE = preprocessing(RATINGS, ANIME_NAMES)
+    predict(create_model(TABLE), TABLE)
diff --git a/midterm/requirements.txt b/midterm/requirements.txt
new file mode 100644
index 00000000..b0c17db6
--- /dev/null
+++ b/midterm/requirements.txt
@@ -0,0 +1,4 @@
+pandas
+numpy
+seaborn
+matplotlib
\ No newline at end of file