diff --git a/final/code/main.py b/final/code/main.py index fead2b80..170e69e4 100644 --- a/final/code/main.py +++ b/final/code/main.py @@ -299,6 +299,7 @@ def handle_arguments(): def auto_mode(data_limit=-1, seed=42, anime="RANDOM"): print("Started auto mode") algorithm_spread = ['auto', 'brute'] + metric_spread = ['manhattan', 'euclidean', 'cosine'] neighbor_spread = [5, "sqrt", "half", "log", "n-1"] # No reason to access and waste computational power every time we run the simulation starting_rating_data, starting_anime_contact_data, starting_rows_number = get_data( @@ -310,7 +311,7 @@ def auto_mode(data_limit=-1, seed=42, anime="RANDOM"): for algorithm in algorithm_spread: possibleMetrics = [] if algorithm != 'auto': - possibleMetrics = sorted(VALID_METRICS_SPARSE[algorithm]) + possibleMetrics = metric_spread print("testing for algorithm: ", algorithm, possibleMetrics) if possibleMetrics == []: possibleMetrics = [""] diff --git a/final/report/report.pdf b/final/report/report.pdf index ea269b1b..7a7018ac 100644 Binary files a/final/report/report.pdf and b/final/report/report.pdf differ diff --git a/final/report/report.tex b/final/report/report.tex index ddcd03ef..ad2ce89a 100644 --- a/final/report/report.tex +++ b/final/report/report.tex @@ -20,23 +20,33 @@ We decided to use collaborative filtering to develop our model, It makes persona We represent anime data-set as embedding vector \\ We use K-nearest neighbors model and decided to test it out with different metrics, neighbors and algorithms \\ \subsubsection{Algorithms} -We decided to test our model with 3 algorithms: +We decided to test our model with 2 algorithms: \begin{enumerate} - \item Ball Tree - \item KD Tree \item Brute + \item Auto \end{enumerate} +Ball Tree and KD Tree do not work on sparse input (as is the case with our input) so we decided to omit them \subsubsection{Neighbor number} We decided to test our model with 5 different neighbor amount: \begin{enumerate} - \item 5 - \item square root of 
available data - \item half of available data - \item logarithm of available data - \item n-1 neighbors + \item 5 - Popular starting point for small-medium datasets + \item square root of available data - Usually helps to balance between underfitting and overfitting + \item half of available data - Usually useful for checking the overall trend rather than specific nuances + \item logarithm of available data - Used for very large datasets + \item n-1 neighbors - Usually leads to overgeneralization as we use all instances except one for prediction \end{enumerate} +\subsubsection{Metrics} +For the brute algorithm we tested it with all possible metrics: +\begin{enumerate} + \item Cityblock + \item Cosine + \item Euclidean + \item l1 + \item l2 + \item Manhattan +\end{enumerate} \section{Intermediate results} \subsection{Results}