2023-06-08 14:41:24 +02:00
|
|
|
"""
|
|
|
|
|
Code for preprocessing data and creating model that predicts and
|
|
|
|
|
recommends anime based on another anime entered by the user
|
|
|
|
|
"""
|
2023-06-08 19:32:30 +02:00
|
|
|
import math
|
2023-06-08 16:20:20 +02:00
|
|
|
import argparse
|
2023-06-11 18:35:51 +02:00
|
|
|
import shutil
|
2023-06-11 16:42:37 +02:00
|
|
|
import os
|
|
|
|
|
import datetime
|
2023-06-08 14:41:24 +02:00
|
|
|
import pandas as pd
|
|
|
|
|
import numpy as np
|
|
|
|
|
from sklearn.neighbors import NearestNeighbors
|
2023-06-08 19:32:30 +02:00
|
|
|
from sklearn.neighbors import VALID_METRICS_SPARSE
|
2023-06-08 14:41:24 +02:00
|
|
|
from scipy.sparse import csr_matrix
|
|
|
|
|
|
|
|
|
|
|
2023-06-08 19:49:56 +02:00
|
|
|
def get_data_cpu(limit_data=-1, data_folder_path="database"):
    """
    Load the rating list and the anime catalogue from the csv database.

    ``animelist.csv`` and ``anime.csv`` are read from ``data_folder_path``.
    A ``limit_data`` greater than -1 caps how many rating rows are read;
    any other value reads the whole rating file.

    Returns the tuple ``(rating_data, anime_data)`` of DataFrames.
    """
    ratings_path = data_folder_path + "/animelist.csv"
    catalogue_path = data_folder_path + "/anime.csv"
    # User can limit the amount of data taken into consideration; the model
    # seems to work with a limit_data value as low as 500,000.
    # nrows=None is read_csv's "no limit" default.
    row_cap = limit_data if limit_data > -1 else None
    rating_data = pd.read_csv(ratings_path, nrows=row_cap)
    anime_data = pd.read_csv(catalogue_path)
    return rating_data, anime_data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_data(limit_data=-1, data_folder_path="database", gpu=False):
    """
    Read the database and return the pieces the recommender works with.

    Returns ``(rating_data, anime_contact_data, rows_number)`` where
    ``anime_contact_data`` holds only the ``anime_id`` and ``Name`` columns
    of the catalogue and ``rows_number`` is the number of rating rows read.
    ``gpu`` is accepted but not used by this function.
    """
    ratings, catalogue = get_data_cpu(limit_data, data_folder_path)
    # The catalogue names its key MAL_ID; expose it as anime_id.
    renamed_catalogue = catalogue.rename(columns={"MAL_ID": "anime_id"})
    id_and_name = renamed_catalogue[["anime_id", "Name"]]
    return ratings, id_and_name, ratings.shape[0]
|
2023-06-08 14:41:24 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def merge_rating_anime_data(rating_data, anime_contact_data, debug=False):
    """
    Attach anime names to the ratings and keep only the columns used later.

    The ratings are left-joined with ``anime_contact_data`` on ``anime_id``,
    so ratings without a matching anime keep a missing ``Name``.  With
    ``debug`` set, the first rows and the shape of the result are printed.
    """
    kept_columns = [
        "user_id", "Name", "anime_id", "rating",
        "watching_status", "watched_episodes",
    ]
    with_names = rating_data.merge(
        anime_contact_data, how="left", left_on="anime_id", right_on="anime_id"
    )
    trimmed = with_names[kept_columns]
    if debug:
        print(trimmed.head())
        print(trimmed.shape)
    return trimmed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def split_data_below_thresholds(rating_data, data_name, threshold=-1, debug=False):
    """
    Drop rows whose ``data_name`` value occurs fewer than ``threshold`` times.

    A ``threshold`` of -1 disables the filter and the input is returned
    untouched; otherwise a filtered copy is returned.  With ``debug`` set,
    the shape of the returned data is printed.
    """
    filtering_enabled = threshold != -1
    if filtering_enabled:
        occurrences = rating_data[data_name].value_counts()
        frequent_values = occurrences[occurrences >= threshold].index
        keep_mask = rating_data[data_name].isin(frequent_values)
        rating_data = rating_data[keep_mask].copy()
    if debug:
        print(rating_data.shape)
    return rating_data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def combine_name_and_ratings(rating_data, debug=False):
    """
    Add, for every rating row, the number of ratings its anime received.

    Rows without a ``Name`` are dropped first.  The per-name rating count is
    then merged back on ``Name``; because both sides carry a ``rating``
    column, the result holds ``rating_x`` (the original rating) and
    ``rating_y`` (the count).  With ``debug`` set, the first rows of the
    count table are printed.
    """
    named_ratings = rating_data.dropna(axis=0, subset=["Name"])
    per_name = named_ratings.groupby(by=["Name"])["rating"].count()
    rating_counts = per_name.reset_index()[["Name", "rating"]]
    if debug:
        print(rating_counts.head())
    return named_ratings.merge(
        rating_counts, how="left", left_on="Name", right_on="Name"
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_length_of_data(rating_data, data_name):
    """
    Count the distinct ``<data_name>_id`` values and encode them in place.

    Each distinct id is given a consecutive integer code in order of first
    appearance, and the codes are written to a new ``data_name`` column of
    ``rating_data`` (the passed frame is modified).  Returns the number of
    distinct ids.
    """
    id_column = data_name + "_id"
    # Encoding categorical data: id -> position of its first appearance.
    code_of = {}
    for raw_id in rating_data[id_column].unique().tolist():
        code_of[raw_id] = len(code_of)
    rating_data[data_name] = rating_data[id_column].map(code_of)
    return len(code_of)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_top_ranked(rating_data, data_name, join_table=None, top_data_taken=20):
    """
    Keep only the rows belonging to the most-rated users or anime.

    Ratings are counted per ``<data_name>_id`` in ``rating_data`` and the
    ``top_data_taken`` ids with the highest counts are selected.  The rows
    of ``join_table`` (``rating_data`` itself when omitted) whose id is among
    them are returned, with the count added as an extra column.  Because the
    count series is also named ``rating``, it is suffixed to ``rating_r``.
    """
    if join_table is None:
        join_table = rating_data
    id_column = data_name + "_id"
    rating_counts = rating_data.groupby(id_column)["rating"].count()
    # head() is always positional.  Plain [:n] slicing on a Series is
    # label-based when the index has a float dtype (e.g. an id column that
    # contained NaN), which would select by id value instead of by rank.
    top_counts = rating_counts.sort_values(ascending=False).head(top_data_taken)
    return join_table.join(top_counts, rsuffix="_r", how="inner", on=id_column)
|
|
|
|
|
|
|
|
|
|
|
2023-06-08 19:49:56 +02:00
|
|
|
def get_data_info(rating_data, debug=False, gpu=False):
    """
    Print some summary information about the rating data.

    The distinct users and anime are always counted, which also adds the
    encoded ``user`` and ``anime`` columns to ``rating_data``.  When
    ``debug`` is set, a user x anime table of summed ratings for the most
    active users and most rated anime is printed, followed by the counts and
    the rating range.  ``gpu`` is accepted but not used.  Returns ``None``.
    """
    users_number = get_length_of_data(rating_data, "user")
    animes_number = get_length_of_data(rating_data, "anime")
    if not debug:
        # Everything below only feeds the debug prints; skip the work.
        return

    top_rated = get_top_ranked(rating_data, "user")
    top_rated = get_top_ranked(rating_data, "anime", top_rated)
    # The string "sum" is the spelling pandas asks for; passing np.sum
    # triggers a deprecation warning on recent versions.
    pivot = pd.crosstab(top_rated.user_id, top_rated.anime_id,
                        top_rated.rating, aggfunc="sum").fillna(0)

    # Series.min()/max() run vectorised and skip NaN, unlike the builtins.
    smallest_rating = rating_data["rating"].min()
    highest_rating = rating_data["rating"].max()

    print(pivot)
    print(f"Num of users: {users_number}, Num of animes: {animes_number}")
    print(
        f"Min total rating: {smallest_rating}, Max total rating: {highest_rating}")
|
|
|
|
|
|
|
|
|
|
|
2023-06-08 16:20:20 +02:00
|
|
|
def preprocessing(rating_data, anime_contact_data,
                  debug=False, user_threshold=500, anime_threshold=200, auto=False):
    """
    Turn the raw ratings into the anime x user table the model is fitted on.

    Names are merged in, users and anime with fewer ratings than their
    threshold are removed, and the result is pivoted so that rows are anime
    names, columns are user ids and cells are ratings (0 where a user has
    not rated an anime).  Intermediate tables are printed only when
    ``debug`` is set and ``auto`` is not.
    """
    verbose = debug and not auto

    merged = merge_rating_anime_data(rating_data, anime_contact_data)
    for column, minimum in (("user_id", user_threshold),
                            ("anime_id", anime_threshold)):
        merged = split_data_below_thresholds(merged, column, minimum)

    counted = combine_name_and_ratings(merged)
    # Discard the per-anime count and restore the original rating column name.
    ratings = counted.drop(columns="rating_y").rename(
        columns={"rating_x": "rating"})
    if verbose:
        print(ratings)
        get_data_info(ratings, True)

    pivot_table = ratings.pivot_table(
        index="Name", columns="user_id", values="rating"
    ).fillna(0)
    if verbose:
        print(pivot_table)
    return pivot_table
|
|
|
|
|
|
|
|
|
|
|
2023-06-11 21:58:41 +02:00
|
|
|
def predict(prediction_model, pivot_table, seed=42, anime="RANDOM", recommendation_number=6, auto=False, debug=False):
    """
    Query the fitted model for anime similar to the chosen one.

    With ``anime="RANDOM"`` a row of ``pivot_table`` is picked at random
    (reproducibly, via ``seed``); otherwise the row labelled ``anime`` is
    used and a ``KeyError`` is raised when it is not in the table.  The
    closest neighbour and some distance statistics are printed.  ``auto`` is
    accepted but not used.

    Returns a 4-tuple:
      * position of the chosen anime in ``pivot_table.index``,
      * positions of up to ``recommendation_number`` neighbours,
      * their distances,
      * the string ``"<closest>_<average>_<average - closest>"``.
    The first neighbour returned by the model is skipped in both arrays.

    Raises ``ValueError`` when the model returns fewer than two neighbours.
    """
    # Seeds numpy's global generator so the random pick is repeatable.
    np.random.seed(seed)
    if anime == "RANDOM":
        chosen_anime = np.random.choice(pivot_table.shape[0])
        query = pivot_table.iloc[chosen_anime, :].values.reshape(1, -1)
        chosen_anime_name = pivot_table.index[chosen_anime]
    else:
        query = pivot_table.loc[anime].values.reshape(1, -1)
        # The position is part of the return value; without it a named
        # anime made the return statement fail with UnboundLocalError.
        chosen_anime = pivot_table.index.get_loc(anime)
        chosen_anime_name = anime

    distance, suggestions = prediction_model.kneighbors(query)
    if debug:
        print("prediction model, distance: ", distance)
    distances = distance.flatten()
    neighbor_positions = suggestions.flatten()
    if len(neighbor_positions) < 2:
        raise ValueError(
            "predict needs a model that returns at least 2 neighbors, got "
            f"{len(neighbor_positions)}")

    print(f"Recommendations for {chosen_anime_name}:\n")
    print(
        f"1: {pivot_table.index[neighbor_positions[1]]},\n"
        f"with distance of {distances[1]}:"
    )

    average_distance = np.mean(distances)
    closest_anime_distance = distances[1]
    average_minus_closest_distance = average_distance - closest_anime_distance
    print(
        f"Average distance: {average_distance}, average_minus_closest_distance: {average_minus_closest_distance}")

    last = recommendation_number + 1
    distance_summary = f"{closest_anime_distance}_{average_distance}_{average_minus_closest_distance}"
    return chosen_anime, neighbor_positions[1:last], distances[1:last], distance_summary
|
2023-06-08 14:41:24 +02:00
|
|
|
|
|
|
|
|
|
2023-06-08 18:40:27 +02:00
|
|
|
def calculate_neighbors(rows_number, neighbors=5):
    """
    Resolve the ``neighbors`` setting into a neighbour count.

    ``neighbors`` may be:
      * a number, returned unchanged;
      * a numeric string such as ``"10"`` (what the command line delivers),
        returned as an ``int``;
      * one of the strategy names ``"default"`` (5), ``"sqrt"``, ``"half"``,
        ``"log"`` (natural logarithm) or ``"n-1"``, each computed from
        ``rows_number`` and rounded down.

    Raises ``KeyError`` for any other string.
    """
    if not isinstance(neighbors, str):
        return neighbors
    try:
        return int(neighbors)
    except ValueError:
        # Not a number, so it has to be one of the strategy names below.
        pass
    # Callables keep evaluation lazy: only the requested strategy runs, so
    # e.g. math.log is never applied to rows_number unless "log" is asked for.
    strategies = {
        "default": lambda: 5,
        "sqrt": lambda: math.floor(math.sqrt(rows_number)),
        "half": lambda: math.floor(rows_number / 2),
        "log": lambda: math.floor(math.log(rows_number)),
        "n-1": lambda: rows_number - 1,
    }
    return strategies[neighbors]()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_model(pivot_table, rows_number, metric="cosine", algorithm="brute", neighbors=5):
    """
    Create the nearest-neighbour model used for anime prediction.

    The model is fitted on a sparse copy of ``pivot_table``.  ``neighbors``
    is resolved by ``calculate_neighbors`` against the number of rows of
    ``pivot_table``; ``rows_number`` is accepted but not used.  ``metric``
    is only passed on when ``algorithm`` is ``"brute"``; other algorithms
    use the estimator's default metric.

    Returns the fitted model, or the string ``"Error!"`` when fitting
    fails (callers compare against that sentinel).
    """
    neighbors_number = calculate_neighbors(pivot_table.shape[0], neighbors)
    pivot_table_matrix = csr_matrix(pivot_table.values)

    model_options = {"n_neighbors": neighbors_number, "algorithm": algorithm}
    if algorithm == "brute":
        model_options["metric"] = metric
    model = NearestNeighbors(**model_options)

    try:
        model.fit(pivot_table_matrix)
    except Exception as error:
        # Exception (not a bare except) so Ctrl-C and SystemExit still
        # propagate; the actual reason is reported instead of being hidden.
        print("Error in create_model, probably wrong metric for data\n"
              f"Metric: {metric}, algorithm: {algorithm}")
        print(f"Reason: {error}")
        return "Error!"
    return model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_NEIGHBOR_STRATEGIES = ("default", "sqrt", "half", "log", "n-1")


def _str_to_bool(value):
    """Convert a command-line token such as "true", "0" or "no" to a bool."""
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(
        f"expected a boolean value, got {value!r}")


def _neighbors_argument(value):
    """Convert --neighbors into a positive int or a known strategy name."""
    if value in _NEIGHBOR_STRATEGIES:
        return value
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"expected a positive integer or one of {_NEIGHBOR_STRATEGIES}, "
            f"got {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(
            f"number of neighbors must be at least 1, got {number}")
    return number


def handle_arguments():
    """
    Handles all arguments that can be used to change algorithm behaviour
    or program display.

    Returns the tuple ``(seed, debug, data_limit, database, metric,
    algorithm, anime, neighbors, user_threshold, anime_threshold,
    recommendation_amount, auto)``.  ``recommendation_amount`` is returned
    one higher than the value given on the command line.

    ``--debug`` and ``--auto`` accept an explicit boolean
    (``--debug False``) or can be given bare (``--debug``) to switch the
    option on.  ``--neighbors`` is returned as an ``int``, or as one of the
    strategy names "default", "sqrt", "half", "log", "n-1".
    """
    parser = argparse.ArgumentParser(description='Example script with pyargs')
    parser.add_argument('--data_limit', '-dl',
                        help="Specify data limit, "
                             "Recommended at least 500k, set to -1 for no limit",
                        required=False, type=int, default=-1)
    parser.add_argument('--seed', '-s',
                        help='Specify seed',
                        type=int, required=False, default=42)
    # type=bool would treat every non-empty string (including "False") as
    # True, so the boolean options use an explicit converter instead.
    parser.add_argument('--debug', '-d',
                        help='Use debug (more information) prints',
                        type=_str_to_bool, nargs='?', const=True,
                        required=False, default=False)
    parser.add_argument('--database', '-db',
                        help='Specify database path',
                        required=False, default="database")

    allowed_metric = ["cosine", "mahalanobis", "euclidean"]
    parser.add_argument('--metric', '-m',
                        help='Specify metric for NearestNeighbor learner',
                        required=False, default="cosine", choices=allowed_metric)
    allowed_algorithms = ['auto', 'ball_tree', 'kd_tree', 'brute']
    parser.add_argument('--algorithm', '-a',
                        help='Specify algorithm for Nearest Neighbor learner',
                        required=False, default="brute", choices=allowed_algorithms)
    parser.add_argument('--anime', '-an',
                        help='Specify anime to choose',
                        required=False, default="RANDOM")
    # Without a converter "-n 10" would arrive as the string "10".
    parser.add_argument('--neighbors', '-n',
                        help='Specify number of nearest neighbors',
                        type=_neighbors_argument, required=False, default=5)
    parser.add_argument('--user_threshold', '-ut',
                        help="Specify minimal number of votes required for user to be "
                             "included in the data, set to -1 for no threshold",
                        required=False, type=int, default=500)
    parser.add_argument('--anime_threshold', '-at',
                        help="Specify minimal number of votes required for anime "
                             "to be included in the data, set to -1 for no threshold",
                        required=False, type=int, default=200)
    parser.add_argument('--recommendation_amount', '-ra',
                        help='Specify how much anime should be recommended',
                        required=False, type=int, default=5)
    parser.add_argument('--auto', '-au',
                        help="Enable auto mode, no debug, no user parameters, "
                             "automatic testing and saving results",
                        type=_str_to_bool, nargs='?', const=True,
                        required=False, default=False)

    # Parse the command-line arguments
    args = parser.parse_args()
    # NOTE(review): presumably this makes room for the query anime itself,
    # which comes back as its own nearest neighbour - confirm.
    args.recommendation_amount = args.recommendation_amount + 1

    return (args.seed, args.debug, args.data_limit, args.database,
            args.metric, args.algorithm, args.anime, args.neighbors,
            args.user_threshold, args.anime_threshold,
            args.recommendation_amount, args.auto)
|
2023-06-08 14:41:24 +02:00
|
|
|
|
2023-06-11 21:58:41 +02:00
|
|
|
|
|
|
|
|
def auto_mode(data_limit=-1, seed=42, anime="RANDOM"):
    """
    Run the recommender over a grid of algorithms, metrics and neighbour
    settings, writing one result file per combination.

    Data is read from the default database folder and preprocessed with the
    default thresholds.  An existing ``test_results`` directory is deleted
    before the runs start.  For the ``'auto'`` algorithm a single run with an
    empty metric name is made per neighbour setting; for ``'brute'`` every
    sparse-capable metric except ``'precomputed'`` is tried.
    """
    print("Started auto mode")
    # No reason to access and waste computational power every time we run
    # the simulation: load and preprocess once, reuse for every combination.
    ratings, contact_data, row_count = get_data(limit_data=data_limit)
    base_pivot_table = preprocessing(ratings, contact_data)

    if os.path.exists('test_results'):
        shutil.rmtree('test_results')

    for algorithm in ('auto', 'brute'):
        if algorithm == 'auto':
            metrics = []
        else:
            metrics = sorted(VALID_METRICS_SPARSE[algorithm])
        print("testing for algorithm: ", algorithm, metrics)
        if not metrics:
            metrics = [""]

        for metric in metrics:
            if metric == 'precomputed':
                continue
            print("testing for algorithm, metric: ", algorithm, metric)
            for neighbor_amount in (5, "sqrt", "half", "log", "n-1"):
                print("testing for algorithm, metric, neighbor_amount: ",
                      algorithm, metric, neighbor_amount)
                preprocess_model_predict(
                    ratings, contact_data, row_count, base_pivot_table,
                    seed=seed, anime=anime, neighbors=neighbor_amount,
                    algorithm=algorithm, metric=metric)
|
2023-06-11 21:58:41 +02:00
|
|
|
|
2023-06-11 16:42:37 +02:00
|
|
|
|
2023-06-11 21:58:41 +02:00
|
|
|
def write_test_results(title, result=""):
    """
    Append ``result`` to a timestamped text file inside ``test_results``.

    The directory is created in the current working directory when missing.
    The file is named ``<title>_<YYYYmmddHHMMSS>.txt``, so calls with the
    same title within the same second append to the same file.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs('test_results', exist_ok=True)

    # Generate timestamped filename, e.g. 20230611235959
    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    filename = f"{title}_{timestamp}.txt"

    # Anime titles are not limited to ASCII; fix the encoding so writing
    # does not depend on the platform's default.
    with open(os.path.join('test_results', filename), 'a', encoding='utf-8') as file:
        file.write(result)
|
2023-06-08 19:32:30 +02:00
|
|
|
|
2023-06-11 21:58:41 +02:00
|
|
|
|
|
|
|
|
def calculate_precision(predictions, threshold=8):
    """
    Return the share of recommended anime whose average rating reaches
    ``threshold``.

    Each element of ``predictions`` is one anime's ratings; only entries
    greater than 0 count towards its average.  An anime scores 1 when that
    average is at least ``threshold`` and 0 otherwise, and the mean of those
    scores is returned.
    """
    hit_flags = []
    for anime_ratings in predictions:
        given_ratings = anime_ratings[anime_ratings > 0]
        hit_flags.append(1 if given_ratings.mean() >= threshold else 0)
    return np.mean(hit_flags)
|
|
|
|
|
|
|
|
|
|
|
2023-06-08 19:49:56 +02:00
|
|
|
def preprocess_model_predict(rating_data, anime_contact_data, rows_number, pivot_table, data_limit=-1, db="database", debug=False, user_threshold=500, anime_threshold=200, metric="cosine", algorithm="brute", neighbors=5, seed=42, anime="RANDOM", recommendation_amount=5):
    """
    Fit a model on ``pivot_table``, get recommendations and save a report.

    The report lists the chosen anime, each recommendation with its
    distance, the precision from ``calculate_precision`` and the distance
    summary from ``predict``.  It is written through ``write_test_results``
    under a title built from the row count, seed, metric, algorithm,
    thresholds and neighbour setting.  When the model cannot be created an
    empty report is written.

    ``rating_data``, ``anime_contact_data``, ``data_limit`` and ``db`` are
    accepted but not used; ``user_threshold`` and ``anime_threshold`` only
    appear in the report title.
    """
    model = create_model(pivot_table, rows_number,
                         metric, algorithm, neighbors)
    result = ""
    # create_model signals failure by returning the string "Error!".
    if not isinstance(model, str):
        # debug is forwarded so the flag also reaches predict's extra prints.
        chosen_anime, suggestions, distance, distance_data = predict(
            model, pivot_table, seed, anime, recommendation_amount,
            debug=debug)

        chosen_anime_name = pivot_table.index[chosen_anime]
        precision = calculate_precision(
            [pivot_table.iloc[s] for s in suggestions])

        report_lines = [f"{chosen_anime_name}:\n"]
        report_lines.extend(
            f"{pivot_table.index[position]}; Distance: {gap}\n"
            for position, gap in zip(suggestions, distance)
        )
        report_lines.append(f"Precision: {precision*100}%\n")
        report_lines.append(
            "Smallest distance, average distance, Average - Smallest distance: "
            + distance_data)
        result = "".join(report_lines)

    write_test_results(
        f"dl={rows_number}&s={seed}&m={metric}&a={algorithm}&ut={user_threshold}&at={anime_threshold}&n={neighbors}", result)
|
2023-06-08 18:40:27 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: parse the command line, then either run a single
    # prediction with the given settings or the automatic test sweep.
    (SEED, DEBUG, DATA_LIMIT, DB, METRIC, ALGORITHM, ANIME, NEIGHBORS,
     USER_THRESHOLD, ANIME_THRESHOLD, RECOMMENDATION_AMOUNT, AUTO) = handle_arguments()
    if not AUTO:
        print("Entered not auto mode")
        starting_rating_data, starting_anime_contact_data, starting_rows_number = get_data(
            limit_data=DATA_LIMIT, data_folder_path=DB)
        # Keywords are required here: passed positionally, the thresholds
        # would land in preprocessing's debug and user_threshold parameters.
        pivot_table = preprocessing(
            starting_rating_data, starting_anime_contact_data,
            debug=DEBUG, user_threshold=USER_THRESHOLD,
            anime_threshold=ANIME_THRESHOLD)
        preprocess_model_predict(
            starting_rating_data, starting_anime_contact_data, starting_rows_number,
            pivot_table, data_limit=DATA_LIMIT, db=DB, debug=DEBUG,
            user_threshold=USER_THRESHOLD, anime_threshold=ANIME_THRESHOLD,
            metric=METRIC, algorithm=ALGORITHM, neighbors=NEIGHBORS, seed=SEED,
            anime=ANIME, recommendation_amount=RECOMMENDATION_AMOUNT)
    if AUTO:
        auto_mode(DATA_LIMIT, SEED, ANIME)
|