feat: created "final" folder, added tex gitignore

This commit is contained in:
Krzysztof Rudnicki 2023-06-08 14:41:24 +02:00
parent 40856073d0
commit 5b817e9e92
7 changed files with 648 additions and 1 deletions

304
.gitignore vendored
View File

@ -3,4 +3,306 @@ anime_with_synopsis.csv
anime.csv
animelist.csv
rating_complete.csv
watching_status.csv
watching_status.csv
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.log
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb
## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf
## Generated if empty string is given at "Please type another file name for output:"
.pdf
## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml
## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync
## Build tool directories for auxiliary files
# latexrun
latex.out/
## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa
# achemso
acs-*.bib
# amsthm
*.thm
# beamer
*.nav
*.pre
*.snm
*.vrb
# changes
*.soc
# comment
*.cut
# cprotect
*.cpt
# elsarticle (documentclass of Elsevier journals)
*.spl
# endnotes
*.ent
# fixme
*.lox
# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm
#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R
# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs
*.slg
*.slo
*.sls
# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist
# gnuplot
*.gnuplot
*.table
# gnuplottex
*-gnuplottex-*
# gregoriotex
*.gaux
*.glog
*.gtex
# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref
# hyperref
*.brf
# knitr
*-concordance.tex
# TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files
# *.tikz
*-tikzDictionary
# listings
*.lol
# luatexja-ruby
*.ltjruby
# makeidx
*.idx
*.ilg
*.ind
# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*
# minted
_minted*
*.pyg
# morewrites
*.mw
# newpax
*.newpax
# nomencl
*.nlg
*.nlo
*.nls
# pax
*.pax
# pdfpcnotes
*.pdfpc
# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd
# scrwfile
*.wrt
# svg
svg-inkscape/
# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/
# pdfcomment
*.upa
*.upb
# pythontex
*.pytxcode
pythontex-files-*/
# tcolorbox
*.listing
# thmtools
*.loe
# TikZ & PGF
*.dpth
*.md5
*.auxlock
# titletoc
*.ptc
# todonotes
*.tdo
# vhistory
*.hst
*.ver
# easy-todo
*.lod
# xcolor
*.xcp
# xmpincl
*.xmpi
# xindy
*.xdy
# xypic precompiled matrices and outlines
*.xyc
*.xyd
# endfloat
*.ttt
*.fff
# Latexian
TSWLatexianTemp*
## Editors:
# WinEdt
*.bak
*.sav
# Texpad
.texpadtmp
# LyX
*.lyx~
# Kile
*.backup
# gummi
.*.swp
# KBibTeX
*~[0-9]*
# TeXnicCenter
*.tps
# auto folder when using emacs and auctex
./auto/*
*.el
# expex forward references with \gathertags
*-tags.tex
# standalone packages
*.sta
# Makeindex log files
*.lpz
# xwatermark package
*.xwm
# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
# option is specified. Footnotes are then stored in a file with suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
#*Notes.bib

244
final/code/main.py Normal file
View File

@ -0,0 +1,244 @@
"""
Code for preprocessing data and creating a model that predicts and
recommends anime based on another anime entered by the user
"""
import pandas as pd
import numpy as np
import argparse
import sklearn
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
def get_data(limit_data=-1, data_folder_path="database"):
    """
    Load the rating list and the anime id/name lookup from the csv database.

    limit_data caps how many rating rows are read from animelist.csv;
    a value of -1 (or lower) reads the whole file.
    data_folder_path is the folder holding animelist.csv and anime.csv.
    Returns a (rating_data, anime_contact_data) tuple of DataFrames, the
    second one reduced to the "anime_id" and "Name" columns.
    """
    # The model seems to work with a limit_data value as low as 500,000.
    # nrows=None is what pandas uses by default, i.e. "read every row".
    row_cap = limit_data if limit_data > -1 else None
    rating_data = pd.read_csv(
        data_folder_path + "/animelist.csv", nrows=row_cap)

    # MAL_ID is renamed so it matches the anime_id column of the ratings.
    anime_catalogue = pd.read_csv(data_folder_path + "/anime.csv").rename(
        columns={"MAL_ID": "anime_id"})
    return rating_data, anime_catalogue[["anime_id", "Name"]]
def merge_rating_anime_data(rating_data, anime_contact_data, debug=False):
    """
    Attach anime names to the ratings and keep only the columns used later.

    The ratings are left-joined with anime_contact_data on "anime_id", so a
    rating whose anime is missing from the lookup is kept with an empty Name.
    With debug enabled the first rows and the shape of the result are printed.
    Returns the merged DataFrame.
    """
    kept_columns = [
        "user_id", "Name", "anime_id", "rating",
        "watching_status", "watched_episodes",
    ]
    named_ratings = rating_data.merge(
        anime_contact_data, on="anime_id", how="left")
    named_ratings = named_ratings[kept_columns]
    if debug:
        print(named_ratings.head())
        print(named_ratings.shape)
    return named_ratings
def split_data_below_thresholds(rating_data, data_name, threshold=-1, debug=False):
    """
    Keep only the rows whose data_name value occurs at least threshold times.

    A threshold of -1 switches the filter off and the input is returned as is.
    With debug enabled the shape of the returned data is printed.
    """
    filtering_enabled = threshold != -1
    if filtering_enabled:
        occurrences = rating_data[data_name].value_counts()
        frequent_values = occurrences[occurrences >= threshold].index
        keep_mask = rating_data[data_name].isin(frequent_values)
        # copy() detaches the result from the original frame
        rating_data = rating_data.loc[keep_mask].copy()
    if debug:
        print(rating_data.shape)
    return rating_data
def combine_name_and_ratings(rating_data, debug=False):
    """
    Add the number of reviews of each anime to every rating row.

    Rows without a Name are dropped first. The per-anime count of "rating"
    values is then merged back on "Name", so the result carries the original
    rating as "rating_x" and the review count as "rating_y".
    With debug enabled the first rows of the count table are printed.
    """
    named_rows = rating_data.dropna(axis=0, subset=["Name"])
    per_title = named_rows.groupby(by=["Name"])["rating"].count()
    review_counts = per_title.reset_index()[["Name", "rating"]]
    if debug:
        print(review_counts.head())
    return named_rows.merge(review_counts, on="Name", how="left")
def get_length_of_data(rating_data, data_name):
    """
    Count the distinct values found in the "<data_name>_id" column.

    Side effect: rating_data gets a data_name column in which every id is
    replaced by a consecutive integer, numbered in order of first appearance.
    Returns the number of distinct ids.
    """
    id_column = data_name + "_id"
    distinct_ids = rating_data[id_column].unique().tolist()
    # Encoding categorical data: id -> position of its first appearance
    code_of = dict(zip(distinct_ids, range(len(distinct_ids))))
    rating_data[data_name] = rating_data[id_column].map(code_of)
    return len(code_of)
def get_top_ranked(rating_data, data_name, join_table=None, top_data_taken=20):
    """
    Keep the rows that belong to the ids with the most ratings.

    The ratings in rating_data are counted per "<data_name>_id" and the
    top_data_taken ids with the highest count are selected. join_table
    (rating_data itself when omitted) is inner-joined with that selection,
    which adds the count as a "rating_r" column.
    """
    key = data_name + "_id"
    target = rating_data if join_table is None else join_table
    rating_counts = rating_data.groupby(key)["rating"].count()
    busiest = rating_counts.dropna().sort_values(ascending=False)[
        :top_data_taken]
    return target.join(busiest, rsuffix="_r", how="inner", on=key)
def get_data_info(rating_data, debug=False):
    """
    Print summary information about the rating data when debug is enabled.

    The summary consists of a user/anime pivot of the summed ratings of the
    most active users and most rated anime, the number of distinct users and
    anime, and the smallest and highest value of the "rating" column.
    Nothing is printed when debug is False. Returns None.
    """
    # Always executed: get_length_of_data also writes the encoded "user" and
    # "anime" columns into rating_data, so that side effect is kept even when
    # nothing is printed.
    users_number = get_length_of_data(rating_data, "user")
    animes_number = get_length_of_data(rating_data, "anime")
    if not debug:
        # Everything below only feeds the debug prints; skip the costly work.
        return

    top_rated = get_top_ranked(rating_data, "user")
    top_rated = get_top_ranked(rating_data, "anime", top_rated)
    # The string "sum" is the form pandas recommends over passing np.sum.
    pivot = pd.crosstab(top_rated.user_id, top_rated.anime_id,
                        top_rated.rating, aggfunc="sum").fillna(0)
    # Vectorised min/max instead of iterating the Series in Python.
    smallest_rating = rating_data["rating"].min()
    highest_rating = rating_data["rating"].max()

    print(pivot)
    print(f"Num of users: {users_number}, Num of animes: {animes_number}")
    print(
        f"Min total rating: {smallest_rating}, Max total rating: {highest_rating}")
def preprocessing(rating_data, anime_contact_data, debug=False, user_threshold=500, anime_threshold=200):
    """
    Preprocesses data for making model more accurate and/or faster

    Steps: attach anime names to the ratings, drop users with fewer than
    user_threshold rows and anime with fewer than anime_threshold rows
    (-1 disables a threshold), add the per-anime review count and build an
    anime x user pivot table with missing cells filled with 0.
    debug is forwarded to every helper so their diagnostic prints are shown.
    Returns the pivot table (index "Name", columns "user_id").
    """
    rating_data = merge_rating_anime_data(
        rating_data, anime_contact_data, debug=debug)
    rating_data = split_data_below_thresholds(
        rating_data, "user_id", user_threshold, debug=debug)
    rating_data = split_data_below_thresholds(
        rating_data, "anime_id", anime_threshold, debug=debug)
    rating_data = combine_name_and_ratings(rating_data, debug=debug)

    # NOTE(review): this drops the users' own scores (rating_x) and keeps the
    # per-anime review count (rating_y) as "rating", so the pivot holds review
    # counts rather than user scores - confirm this is intended.
    rating_data = rating_data.drop(columns="rating_x").rename(
        columns={"rating_y": "rating"})
    if debug:
        print(rating_data)

    get_data_info(rating_data, debug=debug)

    pivot_table = rating_data.pivot_table(
        index="Name", columns="user_id", values="rating"
    ).fillna(0)
    if debug:
        print(pivot_table)
    return pivot_table
def predict(prediction_model, pivot_table, seed=42, anime="RANDOM", recommendation_number=6):
    """
    Print the anime that prediction_model considers closest to a chosen anime.

    prediction_model must offer kneighbors(query, n_neighbors=...) returning
    (distances, indices) that refer to the rows of pivot_table.
    anime is a name from pivot_table.index, or "RANDOM" to pick a row at
    random (reproducible through seed).
    recommendation_number is the number of neighbours requested; the first
    neighbour is not printed, so recommendation_number - 1 lines are shown.
    The request is capped at the number of rows of pivot_table.
    Raises KeyError when anime is not in pivot_table and ValueError when a
    random anime is requested from an empty pivot_table. Returns None.
    """
    np.random.seed(seed)
    print(pivot_table)
    anime_count = pivot_table.shape[0]
    if anime == "RANDOM":
        if anime_count == 0:
            raise ValueError(
                "pivot_table is empty, there is no anime to choose from")
        chosen_anime = np.random.choice(anime_count)
        query = pivot_table.iloc[chosen_anime, :].values.reshape(1, -1)
        chosen_anime_name = pivot_table.index[chosen_anime]
    else:
        if anime not in pivot_table.index:
            raise KeyError(
                f"Anime {anime!r} is not present in the preprocessed data")
        query = pivot_table.loc[anime].values.reshape(1, -1)
        chosen_anime_name = anime

    # Asking for more neighbours than there are rows makes the model fail.
    neighbor_count = min(recommendation_number, anime_count)
    distance, suggestions = prediction_model.kneighbors(
        query, n_neighbors=neighbor_count)
    distances = distance.flatten()
    indices = suggestions.flatten()

    if len(distances) > 0:
        print(f"Recommendations for {chosen_anime_name}:\n")
    # Position 0 is skipped: only the neighbours after it are listed.
    for i, (row, dist) in enumerate(zip(indices[1:], distances[1:]), start=1):
        print(
            f"{i}: {pivot_table.index[row]}, with distance of {dist}:"
        )
def create_model(pivot_table, metric="cosine", algorithm="brute", neighbors=5):
    """
    Creates a nearest neighbor model used for anime prediction

    The values of pivot_table are converted to a sparse CSR matrix and a
    NearestNeighbors learner configured with metric, algorithm and neighbors
    is fitted on it. Returns the fitted learner.
    """
    sparse_ratings = csr_matrix(pivot_table.values)
    # fit() hands back the fitted estimator, so it can be returned directly.
    return NearestNeighbors(
        n_neighbors=neighbors, metric=metric, algorithm=algorithm
    ).fit(sparse_ratings)
def _str_to_bool(value):
    """
    Convert a command-line token such as "True", "no" or "0" into a bool.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    if token in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        f"expected a boolean value, got {value!r}")


def handle_arguments(argv=None):
    """
    Parse the command-line options of the script.

    argv is the list of arguments to parse; None (the default) parses the
    arguments of the running process.
    Returns a tuple (seed, debug, data_limit, database, metric, algorithm,
    anime, neighbors, user_threshold, anime_threshold, recommendation_amount).
    recommendation_amount is returned increased by one because predict does
    not print the first neighbour it gets back.
    """
    parser = argparse.ArgumentParser(description='Example script with pyargs')
    parser.add_argument(
        '--data_limit', '-dl',
        help='Specify data limit, Recommended at least 500k, set to -1 for no limit',
        required=False, type=int, default=-1)
    parser.add_argument(
        '--seed', '-s', help='Specify seed',
        type=int, required=False, default=42)
    # type=bool would turn every non-empty string, including "False", into
    # True. The value is optional: a bare --debug switches debugging on.
    parser.add_argument(
        '--debug', '-d', help='Use debug (more information) prints',
        type=_str_to_bool, nargs='?', const=True, required=False, default=False)
    parser.add_argument(
        '--database', '-db', help='Specify database path',
        required=False, default="database")
    allowed_metric = ["cosine", "mahalanobis", "euclidean"]
    parser.add_argument(
        '--metric', '-m', help='Specify metric for NearestNeighbor learner',
        required=False, default="cosine", choices=allowed_metric)
    allowed_algorithms = ['auto', 'ball_tree', 'kd_tree', 'brute']
    parser.add_argument(
        '--algorithm', '-a', help='Specify algorithm for Nearest Neighbor learner',
        required=False, default="brute", choices=allowed_algorithms)
    parser.add_argument(
        '--anime', '-an', help='Specify anime to choose',
        required=False, default="RANDOM")
    # Without type=int a value given on the command line stays a string.
    parser.add_argument(
        '--neighbors', '-n', help='Specify number of nearest neighbors',
        required=False, type=int, default=5)
    parser.add_argument(
        '--user_threshold', '-ut',
        help='Specify minimal number of votes required for user to be included in the data, set to -1 for no threshold',
        required=False, type=int, default=500)
    parser.add_argument(
        '--anime_threshold', '-at',
        help='Specify minimal number of votes required for anime to be included in the data, set to -1 for no threshold',
        required=False, type=int, default=200)
    parser.add_argument(
        '--recommendation_amount', '-ra',
        help='Specify how much anime should be recommended',
        required=False, type=int, default=5)

    # Parse the command-line arguments
    args = parser.parse_args(argv)
    args.recommendation_amount = args.recommendation_amount + 1

    # Access the values of the arguments
    return (
        args.seed, args.debug, args.data_limit, args.database, args.metric,
        args.algorithm, args.anime, args.neighbors, args.user_threshold,
        args.anime_threshold, args.recommendation_amount,
    )
if __name__ == "__main__":
    # The options arrive as one tuple, in the order handle_arguments returns.
    (
        seed, debug, data_limit, db, metric, algorithm, anime,
        neighbors, user_threshold, anime_threshold, recommendation_amount,
    ) = handle_arguments()

    # Read the csv files, build the anime x user table, fit the learner
    # and print the recommendations.
    RATING_DATA, ANIME_CONTACT_DATA = get_data(
        limit_data=data_limit, data_folder_path=db)
    PIVOT_TABLE = preprocessing(
        RATING_DATA,
        ANIME_CONTACT_DATA,
        debug=debug,
        user_threshold=user_threshold,
        anime_threshold=anime_threshold,
    )
    MODEL = create_model(
        PIVOT_TABLE, metric=metric, algorithm=algorithm, neighbors=neighbors)
    predict(
        MODEL,
        PIVOT_TABLE,
        seed=seed,
        anime=anime,
        recommendation_number=recommendation_amount,
    )

View File

@ -0,0 +1,4 @@
pandas
numpy
seaborn
matplotlib

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 38 KiB

97
final/report/report.tex Normal file
View File

@ -0,0 +1,97 @@
\documentclass[12pt]{article}
\usepackage{listings}
\usepackage{hyperref}
\usepackage{graphicx}
\title{EARIN project Midterm report}
\author{Krzysztof Rudnicki \\ Jakub Kliszko}
\begin{document}
\maketitle
\section{Progress}
We have implemented reading data from csv files, preprocessing it (optionally printing some information about the data) and a nearest-neighbour model/learner that performs the similarity search \\
Program is very flexible and allows for a lot of modification from command line arguments \\
Full list here:
\begin{lstlisting}[language=bash]
options:
-h, --help show this help message and exit
--data_limit DATA_LIMIT, -dl DATA_LIMIT
Specify data limit, Recommended at least 500k,
set to -1 for no limit
--seed SEED, -s SEED Specify seed
--debug DEBUG, -d DEBUG
Use debug (more information) prints
--database DATABASE, -db DATABASE
Specify database path
--metric {cosine,mahalanobis,euclidean}
-m {cosine,mahalanobis,euclidean}
Specify metric for NearestNeighbor learner
--algorithm {auto,ball_tree,kd_tree,brute}
-a {auto,ball_tree,kd_tree,brute}
Specify algorithm for Nearest Neighbor learner
--anime ANIME, -an ANIME
Specify anime to choose
--neighbors NEIGHBORS, -n NEIGHBORS
Specify number of nearest neighbors
--user_threshold USER_THRESHOLD, -ut USER_THRESHOLD
Specify minimal number of votes
required for user to be included in
the data, set to -1 for no threshold
--anime_threshold ANIME_THRESHOLD, -at ANIME_THRESHOLD
Specify minimal number of votes
required for anime to be included
in the data, set to -1 for no threshold
\end{lstlisting}
\section{Results}
Currently recommendations are displayed in the following way:
\begin{lstlisting}[language=bash]
Recommendations for Kill la Kill:
1: Shingeki no Kyojin, with distance of 0.11106648055176693:
2: Steins;Gate, with distance of 0.12104265014640536:
3: Toradora!, with distance of 0.12112848901274798:
4: Sword Art Online, with distance of 0.13046005032340824:
5: No Game No Life, with distance of 0.1306815843129835:
6: One Punch Man, with distance of 0.14848484728234945:
7: Angel Beats!, with distance of 0.15175709939974935:
8: Hataraku Maou-sama!, with distance of 0.15244674042590045:
9: Psycho-Pass, with distance of 0.15288022814590008:
\end{lstlisting}
The first line gives the name of the anime for which the recommendations are created; it is followed by the list of recommended anime, each with its distance to the original anime (lower is better)
\subsection{Data size and execution time}
\begin{figure}
\caption{Chart showing how size of data taken impacts execution time }
\includegraphics[width=\textwidth]{execution_time.png}
\end{figure}
This data was collected using default parameters except for the increasing data size; each of the three runs uses a different seed
\paragraph{Seed} We added a seed to the predict function for choosing a random anime; using the same seed always returns the same recommendations, and choosing the random anime is the only random part of our code \\
Users can specify their own seed with the -s or --seed flag on the command line:
\begin{lstlisting}
python main.py -s 42
\end{lstlisting}
\section{Challenges}
\subsection{Failed attempts}
The biggest challenge was realizing how overcomplicated and unnecessarily difficult to implement the first code we based our work on was: \href{https://www.kaggle.com/code/chaitanya99/recommendation-system-cf-anime}{Kaggle code with tensorflow} \\
This solution runs for almost 10 minutes on kaggle, and getting it to run on our local devices was a real chore that took us a good day and a half \\
This implementation is based around very powerful Tensor Processing Unit from google and while it is possible to change it to run on local graphics card it requires downloading both cuda and cudnn to a downgraded version supported by tensorflow (11.8) and downgrading graphics card drivers \\
Running it with CPU results in the model training for over 3 hours
\subsection{Corrections}
Surprisingly, even though we based our preliminary report around different example code, we did not have to make any corrections to the preliminary report \\
All of the functionality that we want to implement is available in sklearn and scipy
\subsection{Results and findings}
We can see that the rating is skewed towards higher values, users tend to give ratings of 7, 8 or 9 which inflates average rating to be well above 5
\begin{figure}
\caption{User rating count}
\includegraphics[width=\textwidth]{user_rating.png}
\end{figure}
\section{Finishing project}
\subsection{Embedding more data in user and anime}
Currently we are only embedding pure rating values of users; we do not take into consideration popularity, ``controversy'', the studio which created the anime, the length of the anime (number of episodes and length of episodes), or when it was aired
\subsection{Evaluating our model accuracy}
We need to introduce some way to evaluate accuracy of our model, we will try to introduce at least some of the measures mentioned in preliminary report: precision, recall, F1 score and MAP
\subsection{More results representation}
We still need to introduce more representation for our model results. Mainly how well it predicts similarity based on different parameter values (different modes, arguments and so on) \\
We can already modify those values easily, both in the code itself and through command-line arguments; we just need to run the program with those values and collect the results
\end{document}

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB