diff --git a/.gitignore b/.gitignore index d6ffe6a6..5bc6c4f6 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,306 @@ anime_with_synopsis.csv anime.csv animelist.csv rating_complete.csv -watching_status.csv \ No newline at end of file +watching_status.csv + +## Core latex/pdflatex auxiliary files: +*.aux +*.lof +*.log +*.lot +*.fls +*.out +*.toc +*.fmt +*.fot +*.cb +*.cb2 +.*.lb + +## Intermediate documents: +*.dvi +*.xdv +*-converted-to.* +# these rules might exclude image files for figures etc. +# *.ps +# *.eps +# *.pdf + +## Generated if empty string is given at "Please type another file name for output:" +.pdf + +## Bibliography auxiliary files (bibtex/biblatex/biber): +*.bbl +*.bcf +*.blg +*-blx.aux +*-blx.bib +*.run.xml + +## Build tool auxiliary files: +*.fdb_latexmk +*.synctex +*.synctex(busy) +*.synctex.gz +*.synctex.gz(busy) +*.pdfsync + +## Build tool directories for auxiliary files +# latexrun +latex.out/ + +## Auxiliary and intermediate files from other packages: +# algorithms +*.alg +*.loa + +# achemso +acs-*.bib + +# amsthm +*.thm + +# beamer +*.nav +*.pre +*.snm +*.vrb + +# changes +*.soc + +# comment +*.cut + +# cprotect +*.cpt + +# elsarticle (documentclass of Elsevier journals) +*.spl + +# endnotes +*.ent + +# fixme +*.lox + +# feynmf/feynmp +*.mf +*.mp +*.t[1-9] +*.t[1-9][0-9] +*.tfm + +#(r)(e)ledmac/(r)(e)ledpar +*.end +*.?end +*.[1-9] +*.[1-9][0-9] +*.[1-9][0-9][0-9] +*.[1-9]R +*.[1-9][0-9]R +*.[1-9][0-9][0-9]R +*.eledsec[1-9] +*.eledsec[1-9]R +*.eledsec[1-9][0-9] +*.eledsec[1-9][0-9]R +*.eledsec[1-9][0-9][0-9] +*.eledsec[1-9][0-9][0-9]R + +# glossaries +*.acn +*.acr +*.glg +*.glo +*.gls +*.glsdefs +*.lzo +*.lzs +*.slg +*.slo +*.sls + +# uncomment this for glossaries-extra (will ignore makeindex's style files!) 
+# *.ist + +# gnuplot +*.gnuplot +*.table + +# gnuplottex +*-gnuplottex-* + +# gregoriotex +*.gaux +*.glog +*.gtex + +# htlatex +*.4ct +*.4tc +*.idv +*.lg +*.trc +*.xref + +# hyperref +*.brf + +# knitr +*-concordance.tex +# TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files +# *.tikz +*-tikzDictionary + +# listings +*.lol + +# luatexja-ruby +*.ltjruby + +# makeidx +*.idx +*.ilg +*.ind + +# minitoc +*.maf +*.mlf +*.mlt +*.mtc[0-9]* +*.slf[0-9]* +*.slt[0-9]* +*.stc[0-9]* + +# minted +_minted* +*.pyg + +# morewrites +*.mw + +# newpax +*.newpax + +# nomencl +*.nlg +*.nlo +*.nls + +# pax +*.pax + +# pdfpcnotes +*.pdfpc + +# sagetex +*.sagetex.sage +*.sagetex.py +*.sagetex.scmd + +# scrwfile +*.wrt + +# svg +svg-inkscape/ + +# sympy +*.sout +*.sympy +sympy-plots-for-*.tex/ + +# pdfcomment +*.upa +*.upb + +# pythontex +*.pytxcode +pythontex-files-*/ + +# tcolorbox +*.listing + +# thmtools +*.loe + +# TikZ & PGF +*.dpth +*.md5 +*.auxlock + +# titletoc +*.ptc + +# todonotes +*.tdo + +# vhistory +*.hst +*.ver + +# easy-todo +*.lod + +# xcolor +*.xcp + +# xmpincl +*.xmpi + +# xindy +*.xdy + +# xypic precompiled matrices and outlines +*.xyc +*.xyd + +# endfloat +*.ttt +*.fff + +# Latexian +TSWLatexianTemp* + +## Editors: +# WinEdt +*.bak +*.sav + +# Texpad +.texpadtmp + +# LyX +*.lyx~ + +# Kile +*.backup + +# gummi +.*.swp + +# KBibTeX +*~[0-9]* + +# TeXnicCenter +*.tps + +# auto folder when using emacs and auctex +./auto/* +*.el + +# expex forward references with \gathertags +*-tags.tex + +# standalone packages +*.sta + +# Makeindex log files +*.lpz + +# xwatermark package +*.xwm + +# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib +# option is specified. Footnotes are the stored in a file with suffix Notes.bib. +# Uncomment the next line to have this generated file ignored. 
# NOTE: SOURCE is a whitespace-collapsed git patch. The two comment lines
# below are the non-Python patch text that shares a physical line with the
# start of final/code/main.py; they are kept verbatim so nothing is lost.
# +#*Notes.bib \ No newline at end of file
# diff --git a/final/code/main.py b/final/code/main.py new file mode 100644 index 00000000..cc9b662a --- /dev/null +++ b/final/code/main.py @@ -0,0 +1,244 @@
"""
Preprocess the anime rating data and build a nearest-neighbour model that
recommends anime similar to a title entered by the user.
"""
import argparse
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

# Columns kept after the rating table is merged with the anime names.
_RATING_COLUMNS = [
    "user_id", "Name", "anime_id", "rating",
    "watching_status", "watched_episodes",
]

# Spellings accepted by the --debug flag.
_TRUE_STRINGS = frozenset({"1", "true", "t", "yes", "y", "on"})
_FALSE_STRINGS = frozenset({"0", "false", "f", "no", "n", "off"})


def get_data(limit_data=-1, data_folder_path="database"):
    """
    Read the rating table and the anime name table from the CSV database.

    Args:
        limit_data: Maximum number of rows read from ``animelist.csv``.
            Any negative value reads the whole file.
        data_folder_path: Directory holding ``animelist.csv`` and
            ``anime.csv``.

    Returns:
        ``(rating_data, anime_contact_data)`` where ``anime_contact_data``
        holds only the ``anime_id`` and ``Name`` columns.
    """
    data_folder = Path(data_folder_path)
    # nrows=None makes pandas read the complete file.
    row_limit = limit_data if limit_data > -1 else None
    rating_data = pd.read_csv(data_folder / "animelist.csv", nrows=row_limit)
    anime_data = pd.read_csv(data_folder / "anime.csv")

    # anime.csv calls the identifier MAL_ID; the rating table calls it anime_id.
    anime_data = anime_data.rename(columns={"MAL_ID": "anime_id"})
    return rating_data, anime_data[["anime_id", "Name"]]


def merge_rating_anime_data(rating_data, anime_contact_data, debug=False):
    """
    Attach the anime name to every rating row.

    Args:
        rating_data: Rating rows; must contain ``anime_id``.
        anime_contact_data: Table with ``anime_id`` and ``Name``.
        debug: When true, print the head and the shape of the result.

    Returns:
        The left-merged table restricted to the six rating columns.
    """
    merged = rating_data.merge(anime_contact_data, on="anime_id", how="left")
    merged = merged[_RATING_COLUMNS]
    if debug:
        print(merged.head())
        print(merged.shape)
    return merged


def split_data_below_thresholds(rating_data, data_name, threshold=-1, debug=False):
    """
    Drop rows whose ``data_name`` value occurs fewer than ``threshold`` times.

    Args:
        rating_data: Table to filter.
        data_name: Column whose value frequencies are compared.
        threshold: Minimum number of occurrences; ``-1`` disables filtering
            and returns the input object unchanged.
        debug: When true, print the shape of the result.

    Returns:
        The filtered table (a copy when filtering took place).
    """
    if threshold != -1:
        counts = rating_data[data_name].value_counts()
        frequent_values = counts[counts >= threshold].index
        rating_data = rating_data[
            rating_data[data_name].isin(frequent_values)
        ].copy()
    if debug:
        print(rating_data.shape)
    return rating_data


def combine_name_and_ratings(rating_data, debug=False):
    """
    Add the number of ratings each title received to every rating row.

    Rows without a ``Name`` are dropped first. Because both sides of the
    merge carry a ``rating`` column, the result holds ``rating_x`` (the
    original value) and ``rating_y`` (the per-title count).

    Args:
        rating_data: Table with ``Name`` and ``rating`` columns.
        debug: When true, print the head of the per-title count table.

    Returns:
        The merged table.
    """
    named_ratings = rating_data.dropna(axis=0, subset=["Name"])
    rating_counts = (
        named_ratings.groupby(by=["Name"])["rating"]
        .count()
        .reset_index()[["Name", "rating"]]
    )
    if debug:
        print(rating_counts.head())
    return named_ratings.merge(rating_counts, on="Name", how="left")


def get_length_of_data(rating_data, data_name):
    """
    Count the distinct values of the ``<data_name>_id`` column.

    The previous implementation also wrote an encoded ``<data_name>`` column
    into the caller's DataFrame; nothing in this module read that column, so
    the input is now left untouched.

    Args:
        rating_data: Table containing the ``<data_name>_id`` column.
        data_name: Column prefix, e.g. ``"user"`` or ``"anime"``.

    Returns:
        Number of distinct identifiers (a missing value counts as one).
    """
    return rating_data[data_name + "_id"].nunique(dropna=False)


def get_top_ranked(rating_data, data_name, join_table=None, top_data_taken=20):
    """
    Keep the rows that belong to the most frequently rated identifiers.

    Args:
        rating_data: Table the rating counts are computed from.
        data_name: Column prefix; grouping uses ``<data_name>_id``.
        join_table: Table to filter; defaults to ``rating_data``.
        top_data_taken: How many identifiers to keep.

    Returns:
        ``join_table`` inner-joined with the per-identifier rating count.
        The count column is named ``rating`` when that name is free,
        otherwise ``rating_r``, otherwise ``rating_r_<data_name>``.
    """
    if join_table is None:
        join_table = rating_data
    key = data_name + "_id"
    top_counts = (
        rating_data.groupby(key)["rating"]
        .count()
        .dropna()
        .sort_values(ascending=False)
        .iloc[:top_data_taken]
    )
    # Chaining two calls used to yield two "rating_r" columns, which pandas
    # rejects; pick the first free name instead of relying on a fixed suffix.
    count_column = "rating"
    if count_column in join_table.columns:
        count_column = "rating_r"
    if count_column in join_table.columns:
        count_column = f"rating_r_{data_name}"
    return join_table.join(
        top_counts.rename(count_column), how="inner", on=key)


def get_data_info(rating_data, debug=False):
    """
    Print summary information about the data when ``debug`` is true.

    Printed: a user-by-anime table of summed ratings for the most active
    users and most rated titles, the number of distinct users and titles,
    and the smallest and largest value of the ``rating`` column.

    Args:
        rating_data: Table with ``user_id``, ``anime_id`` and ``rating``.
        debug: Nothing is computed or printed unless this is true.
    """
    if not debug:
        return
    users_number = get_length_of_data(rating_data, "user")
    animes_number = get_length_of_data(rating_data, "anime")

    top_rated = get_top_ranked(rating_data, "user")
    top_rated = get_top_ranked(rating_data, "anime", top_rated)

    pivot = pd.crosstab(
        top_rated.user_id, top_rated.anime_id,
        top_rated.rating, aggfunc="sum",
    ).fillna(0)
    smallest_rating = rating_data["rating"].min()
    highest_rating = rating_data["rating"].max()

    print(pivot)
    print(f"Num of users: {users_number}, Num of animes: {animes_number}")
    print(
        f"Min total rating: {smallest_rating}, Max total rating: {highest_rating}")


def preprocessing(rating_data, anime_contact_data, debug=False, user_threshold=500, anime_threshold=200):
    """
    Turn the raw tables into the title-by-user pivot table used by the model.

    Args:
        rating_data: Raw rating rows.
        anime_contact_data: Table with ``anime_id`` and ``Name``.
        debug: When true, every step prints its intermediate result.
        user_threshold: Minimum number of rows a user needs to be kept.
        anime_threshold: Minimum number of rows a title needs to be kept.

    Returns:
        Pivot table indexed by ``Name`` with one column per ``user_id``;
        missing cells are filled with 0.
    """
    rating_data = merge_rating_anime_data(
        rating_data, anime_contact_data, debug)
    rating_data = split_data_below_thresholds(
        rating_data, "user_id", user_threshold, debug)
    rating_data = split_data_below_thresholds(
        rating_data, "anime_id", anime_threshold, debug)
    rating_data = combine_name_and_ratings(rating_data, debug)

    # NOTE(review): this discards each user's own rating (rating_x) and keeps
    # the per-title rating count (rating_y), so the pivot table below encodes
    # "user rated this title" scaled by popularity rather than the score the
    # user gave. Behaviour is kept as-is; confirm whether that is intended.
    rating_data = rating_data.drop(columns="rating_x")
    rating_data = rating_data.rename(columns={"rating_y": "rating"})
    if debug:
        print(rating_data)
        get_data_info(rating_data, debug)

    pivot_table = rating_data.pivot_table(
        index="Name", columns="user_id", values="rating"
    ).fillna(0)
    if debug:
        print(pivot_table)
    return pivot_table


def predict(prediction_model, pivot_table, seed=42, anime="RANDOM", recommendation_number=6, debug=False):
    """
    Print and return the titles closest to the chosen anime.

    Args:
        prediction_model: Object exposing ``kneighbors(query, n_neighbors=...)``
            that returns ``(distances, indices)``.
        pivot_table: Table the model was fitted on; its index holds the titles.
        seed: Seed used when a random title is picked.
        anime: Title to query, or ``"RANDOM"`` to pick one at random.
        recommendation_number: Number of neighbours requested, including the
            first neighbour, which is skipped as the query title itself.
        debug: When true, print the pivot table before predicting.

    Returns:
        List of ``(title, distance)`` tuples in the order they are printed.

    Raises:
        KeyError: ``anime`` is not a title in ``pivot_table``.
    """
    if debug:
        print(pivot_table)

    if anime == "RANDOM":
        # A private RandomState draws the same value as seeding the global
        # generator did, without changing global NumPy state.
        chosen_position = np.random.RandomState(seed).choice(
            pivot_table.shape[0])
        chosen_anime_name = pivot_table.index[chosen_position]
        query = pivot_table.iloc[chosen_position, :].values.reshape(1, -1)
    else:
        if anime not in pivot_table.index:
            raise KeyError(
                f"Anime {anime!r} is not present in the preprocessed data")
        chosen_anime_name = anime
        query = pivot_table.loc[anime].values.reshape(1, -1)

    # Never ask for more neighbours than there are titles.
    neighbour_count = min(recommendation_number, pivot_table.shape[0])
    distances, suggestions = prediction_model.kneighbors(
        query, n_neighbors=neighbour_count)

    print(f"Recommendations for {chosen_anime_name}:\n")
    recommendations = []
    ranked = zip(suggestions.flatten()[1:], distances.flatten()[1:])
    for rank, (position, distance) in enumerate(ranked, start=1):
        title = pivot_table.index[position]
        print(f"{rank}: {title}, with distance of {distance}:")
        recommendations.append((title, float(distance)))
    return recommendations


def create_model(pivot_table, metric="cosine", algorithm="brute", neighbors=5):
    """
    Fit a nearest-neighbour model on the pivot table.

    Args:
        pivot_table: Title-by-user table; its values are fitted as a sparse
            CSR matrix.
        metric: Distance metric passed to ``NearestNeighbors``.
        algorithm: Search algorithm passed to ``NearestNeighbors``.
        neighbors: Default number of neighbours of the model.

    Returns:
        The fitted ``NearestNeighbors`` instance.
    """
    # Imported here so the preprocessing helpers stay importable without
    # scikit-learn. This also replaces the unused bare ``import sklearn``.
    from sklearn.neighbors import NearestNeighbors

    pivot_table_matrix = csr_matrix(pivot_table.values)
    model = NearestNeighbors(n_neighbors=neighbors,
                             metric=metric, algorithm=algorithm)
    model.fit(pivot_table_matrix)
    return model


def _str_to_bool(text):
    """Convert a command-line word such as ``true`` or ``0`` to a bool."""
    lowered = text.strip().lower()
    if lowered in _TRUE_STRINGS:
        return True
    if lowered in _FALSE_STRINGS:
        return False
    raise argparse.ArgumentTypeError(
        f"expected a boolean value, got {text!r}")


def handle_arguments(argv=None):
    """
    Parse the command-line arguments.

    Args:
        argv: Argument list to parse; ``None`` parses ``sys.argv[1:]``.

    Returns:
        Tuple ``(seed, debug, data_limit, database, metric, algorithm,
        anime, neighbors, user_threshold, anime_threshold,
        recommendation_amount)``. ``recommendation_amount`` is one larger
        than requested because the first neighbour returned is skipped.
    """
    parser = argparse.ArgumentParser(description='Example script with pyargs')
    parser.add_argument('--data_limit', '-dl',
                        help='Specify data limit, Recommended at least 500k, set to -1 for no limit', required=False, type=int, default=-1)
    parser.add_argument('--seed', '-s', help='Specify seed',
                        type=int, required=False, default=42)
    # type=bool treated every non-empty string, including "False", as True.
    # A bare "--debug" with no value is accepted as well.
    parser.add_argument('--debug', '-d', help='Use debug (more information) prints',
                        type=_str_to_bool, nargs='?', const=True,
                        required=False, default=False)
    parser.add_argument('--database', '-db', help='Specify database path',
                        required=False, default="database")

    allowed_metric = ["cosine", "mahalanobis", "euclidean"]
    parser.add_argument('--metric', '-m', help='Specify metric for NearestNeighbor learner',
                        required=False, default="cosine", choices=allowed_metric)
    allowed_algorithms = ['auto', 'ball_tree', 'kd_tree', 'brute']
    parser.add_argument('--algorithm', '-a', help='Specify algorithm for Nearest Neighbor learner',
                        required=False, default="brute", choices=allowed_algorithms)
    parser.add_argument('--anime', '-an', help='Specify anime to choose',
                        required=False, default="RANDOM")
    # Without type=int a user-supplied value reached the model as a string.
    parser.add_argument('--neighbors', '-n', help='Specify number of nearest neighbors',
                        required=False, type=int, default=5)
    parser.add_argument('--user_threshold', '-ut', help='Specify minimal number of votes required for user to be included in the data, set to -1 for no threshold',
                        required=False, type=int, default=500)
    parser.add_argument('--anime_threshold', '-at', help='Specify minimal number of votes required for anime to be included in the data, set to -1 for no threshold',
                        required=False, type=int, default=200)
    parser.add_argument('--recommendation_amount', '-ra', help='Specify how much anime should be recommended',
                        required=False, type=int, default=5)

    args = parser.parse_args(argv)
    # One extra neighbour: the closest match is skipped as the query itself.
    args.recommendation_amount = args.recommendation_amount + 1
    return args.seed, args.debug, args.data_limit, args.database, args.metric, args.algorithm, args.anime, args.neighbors, args.user_threshold, args.anime_threshold, args.recommendation_amount


def main(argv=None):
    """Run the pipeline: parse arguments, load, preprocess, fit, predict."""
    (seed, debug, data_limit, database, metric, algorithm, anime, neighbors,
     user_threshold, anime_threshold, recommendation_amount) = handle_arguments(argv)

    rating_data, anime_contact_data = get_data(data_limit, database)
    pivot_table = preprocessing(
        rating_data, anime_contact_data, debug, user_threshold, anime_threshold)
    model = create_model(pivot_table, metric, algorithm, neighbors)
    predict(model, pivot_table, seed, anime, recommendation_amount, debug)


if __name__ == "__main__":
    main()

# Non-Python patch text that shares a physical line with the end of
# final/code/main.py, kept verbatim.
# NOTE(review): the requirements list below omits scikit-learn and scipy,
# both of which main.py imports.
# diff --git a/final/code/requirements.txt b/final/code/requirements.txt new file mode 100644 index 00000000..b0c17db6 --- /dev/null +++ b/final/code/requirements.txt @@ -0,0 +1,4 @@ +pandas +numpy +seaborn +matplotlib \ No newline at end of file
# diff --git a/final/report/KLISZKO_RUDNICKI_MIDTERM_EARIN.pdf
b/final/report/KLISZKO_RUDNICKI_MIDTERM_EARIN.pdf new file mode 100644 index 00000000..17f7465b Binary files /dev/null and b/final/report/KLISZKO_RUDNICKI_MIDTERM_EARIN.pdf differ diff --git a/final/report/execution_time.png b/final/report/execution_time.png new file mode 100644 index 00000000..9604f3c0 Binary files /dev/null and b/final/report/execution_time.png differ diff --git a/final/report/report.tex b/final/report/report.tex new file mode 100644 index 00000000..237ec86f --- /dev/null +++ b/final/report/report.tex @@ -0,0 +1,97 @@ +\documentclass[12pt]{article} +\usepackage{listings} +\usepackage{hyperref} +\usepackage{graphicx} +\title{EARIN project Midterm report} +\author{Krzysztof Rudnicki \\ Jakub Kliszko} +\begin{document} +\maketitle +\section{Progress} +We have implemented reading data from csv files, preprocessing them with optional showing of some of the information about the data and used model/learner for implementing neighbour searches \\ +Program is very flexible and allows for a lot of modification from command line arguments \\ +Full list here: +\begin{lstlisting}[language=bash] +options: +-h, --help show this help message and exit +--data_limit DATA_LIMIT, -dl DATA_LIMIT + Specify data limit, Recommended at least 500k, + set to -1 for no limit +--seed SEED, -s SEED Specify seed +--debug DEBUG, -d DEBUG + Use debug (more information) prints +--database DATABASE, -db DATABASE + Specify database path +--metric {cosine,mahalanobis,euclidean} +-m {cosine,mahalanobis,euclidean} + Specify metric for NearestNeighbor learner +--algorithm {auto,ball_tree,kd_tree,brute} +-a {auto,ball_tree,kd_tree,brute} + Specify algorithm for Nearest Neighbor learner +--anime ANIME, -an ANIME + Specify anime to choose +--neighbors NEIGHBORS, -n NEIGHBORS + Specify number of nearest neighbors +--user_threshold USER_THRESHOLD, -ut USER_THRESHOLD + Specify minimal number of votes + required for user to be included in + the data, set to -1 for no threshold 
+--anime_threshold ANIME_THRESHOLD, -at ANIME_THRESHOLD + Specify minimal number of votes + required for anime to be included + in the data, set to -1 for no threshold +\end{lstlisting} +\section{Results} +Currently recommendations are displayed in the following way: +\begin{lstlisting}[language=bash] +Recommendations for Kill la Kill: + +1: Shingeki no Kyojin, with distance of 0.11106648055176693: +2: Steins;Gate, with distance of 0.12104265014640536: +3: Toradora!, with distance of 0.12112848901274798: +4: Sword Art Online, with distance of 0.13046005032340824: +5: No Game No Life, with distance of 0.1306815843129835: +6: One Punch Man, with distance of 0.14848484728234945: +7: Angel Beats!, with distance of 0.15175709939974935: +8: Hataraku Maou-sama!, with distance of 0.15244674042590045: +9: Psycho-Pass, with distance of 0.15288022814590008: + \end{lstlisting} + Where we are given name of the anime for which we create recommendation and list of animes recommended with distance to original anime (lower is better) + \subsection{Data size and execution time} + \begin{figure} + \caption{Chart showing how size of data taken impacts execution time } + \includegraphics[width=\textwidth]{execution_time.png} + \end{figure} +This data was taken using default parameters except for increasing data size, each of three runs uses different seed + + +\paragraph{Seed} We added seed in predict function for choosing random anime, using the same seed always returns same recommendations and choosing random anime is the only random part of our code \\ +User can specify their own seed by using -s or --seed flag by entering in command line: +\begin{lstlisting} +python main.py -s 42 +\end{lstlisting} +\section{Challenges} +\subsection{Failed attempts} +Biggest challenge was realizing how overcomplicated and unnecessarily difficult to implement is the first code we based our work on: \href{https://www.kaggle.com/code/chaitanya99/recommendation-system-cf-anime}{Kaggle code with tensorflow} \\ +This 
solution runs for almost 10 minutes on Kaggle and implementing it to run on our local devices was a real chore that took us a good day and a half to implement \\ +This implementation is based around very powerful Tensor Processing Unit from Google and while it is possible to change it to run on local graphics card it requires downloading both CUDA and cuDNN to a downgraded version supported by TensorFlow (11.8) and downgrading graphics card drivers \\ +Running it with CPU results in the model training for over 3 hours +\subsection{Corrections} +Surprisingly, even though we based our preliminary report around different example code we managed to not make any corrections to the preliminary report \\ +All of the functionality that we want to implement is available in sklearn and scipy +\subsection{Results and findings} +We can see that the rating is skewed towards higher values, users tend to give ratings of 7, 8 or 9 which inflates average rating to be well above 5 +\begin{figure} + \caption{User rating count} + \includegraphics[width=\textwidth]{user_rating.png} +\end{figure} +\section{Finishing project} +\subsection{Embedding more data in user and anime} +Currently we are only embedding pure rating values of users, we do not take into consideration popularity, ``controversy'', studio which created the anime, length of anime (number of episodes and length of episodes), and when it was aired \\ +\subsection{Evaluating our model accuracy} +We need to introduce some way to evaluate accuracy of our model, we will try to introduce at least some of the measures mentioned in the preliminary report: precision, recall, F1 score and MAP +\subsection{More results representation} +We still need to introduce more representation for our model results. 
Mainly, how well it predicts similarity based on different parameter values (different modes, arguments and so on) \\ +We already can modify those values easily from the code itself and through command-line arguments, we just need to run the model with those values and collect results + + +\end{document} \ No newline at end of file diff --git a/final/report/user_rating.png b/final/report/user_rating.png new file mode 100644 index 00000000..da5493c4 Binary files /dev/null and b/final/report/user_rating.png differ