diff --git a/midterm/code/main.py b/midterm/code/main.py index dc2d75b4..b937cda7 100644 --- a/midterm/code/main.py +++ b/midterm/code/main.py @@ -50,17 +50,18 @@ def merge_rating_anime_data(rating_data, anime_contact_data, debug=False): return rating_data -def split_data_below_thresholds(rating_data, data_name, threshold, debug=False): +def split_data_below_thresholds(rating_data, data_name, threshold=-1, debug=False): """ Removes data with data_name which is below given threshold """ - count = rating_data[data_name].value_counts() - rating_data = rating_data[ - rating_data[data_name].isin(count[count >= threshold].index) - ].copy() - rating_shape_cut = rating_data.shape - if debug: - print(rating_shape_cut) + if threshold != -1: + count = rating_data[data_name].value_counts() + rating_data = rating_data[ + rating_data[data_name].isin(count[count >= threshold].index) + ].copy() + rating_shape_cut = rating_data.shape + if debug: + print(rating_shape_cut) return rating_data @@ -136,13 +137,15 @@ def get_data_info(rating_data, debug=False): f"Min total rating: {smallest_rating}, Max total rating: {highest_rating}") -def preprocessing(rating_data, anime_contact_data, debug=False): +def preprocessing(rating_data, anime_contact_data, debug=False, user_threshold=500, anime_threshold=200): """ Preprocesses data for making model more accurate and/or faster """ rating_data = merge_rating_anime_data(rating_data, anime_contact_data) - rating_data = split_data_below_thresholds(rating_data, "user_id", 500) - rating_data = split_data_below_thresholds(rating_data, "anime_id", 200) + rating_data = split_data_below_thresholds( + rating_data, "user_id", user_threshold) + rating_data = split_data_below_thresholds( + rating_data, "anime_id", anime_threshold) rating_data = combine_name_and_ratings(rating_data) rating_data = rating_data.drop(columns="rating_x") @@ -184,12 +187,13 @@ def predict(prediction_model, pivot_table, seed=42, anime="RANDOM"): ) -def create_model(pivot_table, 
metric="cosine", algorithm="brute"): +def create_model(pivot_table, metric="cosine", algorithm="brute", neighbors=5): """ Creates model based on neaarest neighbor for anime prediction """ pivot_table_matrix = csr_matrix(pivot_table.values) - model = NearestNeighbors(metric=metric, algorithm=algorithm) + model = NearestNeighbors(n_neighbors=neighbors, + metric=metric, algorithm=algorithm) model.fit(pivot_table_matrix) return model @@ -197,7 +201,7 @@ def create_model(pivot_table, metric="cosine", algorithm="brute"): def handle_arguments(): parser = argparse.ArgumentParser(description='Example script with pyargs') parser.add_argument('--data_limit', '-dl', - help='Specify data limit, Recommended at least 50k', required=False, type=int, default=-1) + help='Specify data limit, Recommended at least 500k, set to -1 for no limit', required=False, type=int, default=-1) parser.add_argument('--seed', '-s', help='Specify seed', type=int, required=False, default=42) parser.add_argument('--debug', '-d', help='Use debug (more information) prints', @@ -213,17 +217,24 @@ def handle_arguments(): required=False, default="brute", choices=allowed_algorithms) parser.add_argument('--anime', '-an', help='Specify anime to choose', required=False, default="RANDOM") + parser.add_argument('--neighbors', '-n', help='Specify number of nearest neighbors', + required=False, type=int, default=5) + parser.add_argument('--user_threshold', '-ut', help='Specify minimal number of votes required for user to be included in the data, set to -1 for no threshold', + required=False, type=int, default=500) + parser.add_argument('--anime_threshold', '-at', help='Specify minimal number of votes required for anime to be included in the data, set to -1 for no threshold', + required=False, type=int, default=200) # Parse the command-line arguments args = parser.parse_args() # Access the values of the arguments - return args.seed, args.debug, args.data_limit, args.database, args.metric, args.algorithm, args.anime + return args.seed, args.debug, 
args.data_limit, args.database, args.metric, args.algorithm, args.anime, args.neighbors, args.user_threshold, args.anime_threshold if __name__ == "__main__": - seed, debug, data_limit, db, metric, algorithm, anime = handle_arguments() + seed, debug, data_limit, db, metric, algorithm, anime, neighbors, user_threshold, anime_threshold = handle_arguments() RATING_DATA, ANIME_CONTACT_DATA = get_data(data_limit, db) - PIVOT_TABLE = preprocessing(RATING_DATA, ANIME_CONTACT_DATA, debug) - MODEL = create_model(PIVOT_TABLE, metric, algorithm) + PIVOT_TABLE = preprocessing( + RATING_DATA, ANIME_CONTACT_DATA, debug, user_threshold, anime_threshold) + MODEL = create_model(PIVOT_TABLE, metric, algorithm, neighbors) predict(MODEL, PIVOT_TABLE, seed, anime) diff --git a/midterm/report/report.aux b/midterm/report/report.aux index de7868f8..463b0deb 100644 --- a/midterm/report/report.aux +++ b/midterm/report/report.aux @@ -14,14 +14,15 @@ \providecommand*\HyPL@Entry[1]{} \HyPL@Entry{0<>} \@writefile{toc}{\contentsline {section}{\numberline {1}Progress}{1}{section.1}\protected@file@percent } -\@writefile{toc}{\contentsline {section}{\numberline {2}Results}{1}{section.2}\protected@file@percent } -\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Presentation}{1}{subsection.2.1}\protected@file@percent } -\@writefile{toc}{\contentsline {subsubsection}{\numberline {2.1.1}Plots}{1}{subsubsection.2.1.1}\protected@file@percent } -\@writefile{toc}{\contentsline {subsubsection}{\numberline {2.1.2}Tables}{1}{subsubsection.2.1.2}\protected@file@percent } -\@writefile{toc}{\contentsline {paragraph}{Seed}{1}{section*.1}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {2}Results}{2}{section.2}\protected@file@percent } +\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Presentation}{2}{subsection.2.1}\protected@file@percent } +\@writefile{toc}{\contentsline {subsubsection}{\numberline {2.1.1}Plots}{2}{subsubsection.2.1.1}\protected@file@percent 
} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {2.1.2}Tables}{2}{subsubsection.2.1.2}\protected@file@percent } +\@writefile{toc}{\contentsline {paragraph}{Seed}{2}{section*.1}\protected@file@percent } \@writefile{toc}{\contentsline {section}{\numberline {3}Challenges}{2}{section.3}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Failed attempts}{2}{subsection.3.1}\protected@file@percent } -\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Corrections}{2}{subsection.3.2}\protected@file@percent } -\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Results and findings}{2}{subsection.3.3}\protected@file@percent } -\@writefile{toc}{\contentsline {section}{\numberline {4}Finishing project}{2}{section.4}\protected@file@percent } -\gdef \@abspage@last{2} +\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Corrections}{3}{subsection.3.2}\protected@file@percent } +\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Results and findings}{3}{subsection.3.3}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {4}Finishing project}{3}{section.4}\protected@file@percent } +\@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Embedding more data in user and anime}{3}{subsection.4.1}\protected@file@percent } +\gdef \@abspage@last{3} diff --git a/midterm/report/report.fdb_latexmk b/midterm/report/report.fdb_latexmk index c6e09ab0..1d2ea9bf 100644 --- a/midterm/report/report.fdb_latexmk +++ b/midterm/report/report.fdb_latexmk @@ -1,11 +1,11 @@ # Fdb version 4 -["xdvipdfmx"] 1685391330 "report.xdv" "report.pdf" "report" 1685391330 0 - "report.xdv" 1685391330 20600 f078bc2b2ced2fc2e41ec88d8ace67b6 "xelatex" +["xdvipdfmx"] 1685394955 "report.xdv" "report.pdf" "report" 1685394955 0 + "report.xdv" 1685394955 44220 7ffc5e08598d79076d223ac2a2fe2f18 "xelatex" (generated) "report.pdf" (rewritten before read) -["xelatex"] 1685391330 
"/home/kuchy/earin/earin_project/midterm/report/report.tex" "report.xdv" "report" 1685391330 0 - "/home/kuchy/earin/earin_project/midterm/report/report.tex" 1685391329 2040 69a2ecee5e6200f6673defc4b47f00a6 "" +["xelatex"] 1685394955 "/home/kuchy/earin/earin_project/midterm/report/report.tex" "report.xdv" "report" 1685394955 0 + "/home/kuchy/earin/earin_project/midterm/report/report.tex" 1685394955 3734 295cc759a2472e638d3fb6d28689ee4f "" "/usr/share/texmf-dist/fonts/map/fontname/texfonts.map" 1680514707 3524 cb3e574dea2d1052e39280babc910dc8 "" "/usr/share/texmf-dist/fonts/tfm/adobe/zapfding/pzdr.tfm" 1680514707 1528 f853c4d1b4e0550255e02831fdc8496f "" "/usr/share/texmf-dist/fonts/tfm/public/cm/cmmi12.tfm" 1680514707 1524 4414a8315f39513458b80dfc63bff03a "" @@ -45,15 +45,16 @@ "/usr/share/texmf-dist/tex/latex/letltxmacro/letltxmacro.sty" 1680514707 5766 13a9e8766c47f30327caf893ece86ac8 "" "/usr/share/texmf-dist/tex/latex/listings/listings.cfg" 1680514707 1829 d8258b7d94f5f955e70c623e525f9f45 "" "/usr/share/texmf-dist/tex/latex/listings/listings.sty" 1680514707 80947 75a96bb4c9f40ae31d54a01d924df2ff "" + "/usr/share/texmf-dist/tex/latex/listings/lstlang1.sty" 1680514707 205154 31132370016e8c97e49bc3862419679b "" "/usr/share/texmf-dist/tex/latex/listings/lstmisc.sty" 1680514707 77021 d05e9115c67855816136d82929db8892 "" "/usr/share/texmf-dist/tex/latex/refcount/refcount.sty" 1680514707 9878 9e94e8fa600d95f9c7731bb21dfb67a4 "" "/usr/share/texmf-dist/tex/latex/rerunfilecheck/rerunfilecheck.sty" 1680514707 9714 ba3194bd52c8499b3f1e3eb91d409670 "" "/usr/share/texmf-dist/tex/latex/url/url.sty" 1680514707 12796 8edb7d69a20b857904dd0ea757c14ec9 "" "/usr/share/texmf-dist/web2c/texmf.cnf" 1680514707 39911 2da6c67557ec033436fe5418a70a8a61 "" "/var/lib/texmf/web2c/xetex/xelatex.fmt" 1681763698 11046108 24dffefc9850ac1939834d68c95379b0 "" - "report.aux" 1685391330 1770 8e01609f5240f79889437deb07594cdd "xelatex" - "report.out" 1685391330 1102 2cbb39a98792aded098f7ea274b20901 
"xelatex" - "report.tex" 1685391329 2040 69a2ecee5e6200f6673defc4b47f00a6 "" + "report.aux" 1685394955 1914 4bfc226f6855c388cee78af5581432ff "xelatex" + "report.out" 1685394955 1363 3cd7533541c949103c3ce99ee358800f "xelatex" + "report.tex" 1685394955 3734 295cc759a2472e638d3fb6d28689ee4f "" (generated) "report.aux" "report.log" diff --git a/midterm/report/report.fls b/midterm/report/report.fls index 4afaf887..976fa21a 100644 --- a/midterm/report/report.fls +++ b/midterm/report/report.fls @@ -405,6 +405,14 @@ OUTPUT report.out INPUT /usr/share/texmf-dist/fonts/tfm/public/cm/cmr12.tfm INPUT /usr/share/texmf-dist/fonts/tfm/public/cm/cmmi12.tfm INPUT /usr/share/texmf-dist/fonts/tfm/public/cm/cmsy10.tfm +INPUT /usr/share/texmf-dist/tex/latex/listings/lstlang1.sty +INPUT /usr/share/texmf-dist/tex/latex/listings/lstlang1.sty +INPUT /usr/share/texmf-dist/tex/latex/listings/lstlang1.sty +INPUT /usr/share/texmf-dist/tex/latex/listings/lstlang1.sty +INPUT /usr/share/texmf-dist/tex/latex/listings/lstlang1.sty +INPUT /usr/share/texmf-dist/tex/latex/listings/lstlang1.sty +INPUT /usr/share/texmf-dist/tex/latex/listings/lstlang1.sty +INPUT /usr/share/texmf-dist/tex/latex/listings/lstlang1.sty OUTPUT report.xdv INPUT report.aux INPUT ./report.out diff --git a/midterm/report/report.log b/midterm/report/report.log index 8abf917d..934cc03b 100644 --- a/midterm/report/report.log +++ b/midterm/report/report.log @@ -1,4 +1,4 @@ -This is XeTeX, Version 3.141592653-2.6-0.999995 (TeX Live 2023/Arch Linux) (preloaded format=xelatex 2023.4.17) 29 MAY 2023 22:15 +This is XeTeX, Version 3.141592653-2.6-0.999995 (TeX Live 2023/Arch Linux) (preloaded format=xelatex 2023.4.17) 29 MAY 2023 23:15 entering extended mode restricted \write18 enabled. file:line:error style messages enabled. @@ -201,19 +201,73 @@ LaTeX Font Info: External font `cmex10' loaded for size (Font) <14.4> on input line 8. LaTeX Font Info: External font `cmex10' loaded for size (Font) <7> on input line 8. 
- [1 + (/usr/share/texmf-dist/tex/latex/listings/lstlang1.sty +File: lstlang1.sty 2023/02/27 1.9 listings language file +) (/usr/share/texmf-dist/tex/latex/listings/lstlang1.sty +File: lstlang1.sty 2023/02/27 1.9 listings language file +) +Overfull \hbox (99.60498pt too wide) in paragraph at lines 16--17 +[][][][][][][][][][][][][][][][][][][][] + [] -] [2] (./report.aux) + +Overfull \hbox (20.40417pt too wide) in paragraph at lines 20--21 +[][][][][][][][][][][][][][][] + [] + + +Overfull \hbox (70.80469pt too wide) in paragraph at lines 25--26 +[][][][][][][][][][][][][] + [] + + +Overfull \hbox (99.60498pt too wide) in paragraph at lines 28--29 +[][][][][][][][][][][][][][][] + [] + +[1 + +] +Overfull \hbox (20.40417pt too wide) in paragraph at lines 32--33 +[][][][][][][][][][][][][] + [] + + +Overfull \hbox (6.00403pt too wide) in paragraph at lines 35--36 +[][][][][][][][][][][][][][][][][] + [] + + +Overfull \hbox (13.2041pt too wide) in paragraph at lines 36--37 +[][][][][][][][][][][][][][][][][][][][] + [] + + +Overfull \hbox (6.00403pt too wide) in paragraph at lines 39--40 +[][][][][][][][][][][][][][][] + [] + + +Overfull \hbox (49.20447pt too wide) in paragraph at lines 40--41 +[][][][][][][][][][][][][][][][][][][][][][] + [] + +[2] +Underfull \hbox (badness 10000) in paragraph at lines 64--65 + + [] + +[3] (./report.aux) Package rerunfilecheck Info: File `report.out' has not changed. -(rerunfilecheck) Checksum: 2CBB39A98792ADED098F7EA274B20901;1102. +(rerunfilecheck) Checksum: 3CD7533541C949103C3CE99EE358800F;1363. 
) Here is how much of TeX's memory you used: - 8689 strings out of 476683 - 138341 string characters out of 5809790 - 1852018 words of memory out of 5000000 - 28875 multiletter control sequences out of 15000+600000 + 8873 strings out of 476683 + 140477 string characters out of 5809790 + 1887018 words of memory out of 5000000 + 29053 multiletter control sequences out of 15000+600000 513616 words of font info for 43 fonts, out of 8000000 for 9000 1348 hyphenation exceptions out of 8191 - 72i,6n,76p,353b,855s stack positions out of 5000i,500n,10000p,200000b,80000s + 72i,6n,76p,461b,1742s stack positions out of 5000i,500n,10000p,200000b,80000s -Output written on report.xdv (2 pages, 20600 bytes). +Output written on report.xdv (3 pages, 44220 bytes). diff --git a/midterm/report/report.out b/midterm/report/report.out index 3a8e551f..50668588 100644 --- a/midterm/report/report.out +++ b/midterm/report/report.out @@ -8,3 +8,4 @@ \BOOKMARK [2][-]{subsection.3.2}{\376\377\000C\000o\000r\000r\000e\000c\000t\000i\000o\000n\000s}{section.3}% 8 \BOOKMARK [2][-]{subsection.3.3}{\376\377\000R\000e\000s\000u\000l\000t\000s\000\040\000a\000n\000d\000\040\000f\000i\000n\000d\000i\000n\000g\000s}{section.3}% 9 \BOOKMARK [1][-]{section.4}{\376\377\000F\000i\000n\000i\000s\000h\000i\000n\000g\000\040\000p\000r\000o\000j\000e\000c\000t}{}% 10 +\BOOKMARK [2][-]{subsection.4.1}{\376\377\000E\000m\000b\000e\000d\000d\000i\000n\000g\000\040\000m\000o\000r\000e\000\040\000d\000a\000t\000a\000\040\000i\000n\000\040\000u\000s\000e\000r\000\040\000a\000n\000d\000\040\000a\000n\000i\000m\000e}{section.4}% 11 diff --git a/midterm/report/report.pdf b/midterm/report/report.pdf index 5d87df25..1414c850 100644 Binary files a/midterm/report/report.pdf and b/midterm/report/report.pdf differ diff --git a/midterm/report/report.synctex.gz b/midterm/report/report.synctex.gz index d8b3ecba..859ec784 100644 Binary files a/midterm/report/report.synctex.gz and b/midterm/report/report.synctex.gz differ diff 
--git a/midterm/report/report.tex b/midterm/report/report.tex index e82e33d0..bd51dad8 100644 --- a/midterm/report/report.tex +++ b/midterm/report/report.tex @@ -7,7 +7,38 @@ \maketitle \section{Progress} We have implemented reading data from csv files, preprocessing them with optional showing of some of the information about the data and used model/learner for implementing neighbour searches \\ -Right now the model predicts random anime choosen from the table +Program is very flexible and allows for a lot of modification from command line arguments \\ +Full list here: +\begin{lstlisting}[language=bash] +options: +-h, --help show this help message and exit +--data_limit DATA_LIMIT, -dl DATA_LIMIT + Specify data limit, Recommended at least 500k, + set to -1 for no limit +--seed SEED, -s SEED Specify seed +--debug DEBUG, -d DEBUG + Use debug (more information) prints +--database DATABASE, -db DATABASE + Specify database path +--metric {cosine,mahalanobis,euclidean} +-m {cosine,mahalanobis,euclidean} + Specify metric for NearestNeighbor learner +--algorithm {auto,ball_tree,kd_tree,brute} +-a {auto,ball_tree,kd_tree,brute} + Specify algorithm for Nearest Neighbor learner +--anime ANIME, -an ANIME + Specify anime to choose +--neighbors NEIGHBORS, -n NEIGHBORS + Specify number of nearest neighbors +--user_threshold USER_THRESHOLD, -ut USER_THRESHOLD + Specify minimal number of votes + required for user to be included in + the data, set to -1 for no threshold +--anime_threshold ANIME_THRESHOLD, -at ANIME_THRESHOLD + Specify minimal number of votes + required for anime to be included + in the data, set to -1 for no threshold +\end{lstlisting} \section{Results} \subsection{Presentation} \subsubsection{Plots} @@ -29,5 +60,7 @@ All of functionality that we want to implement is available in sklearn and scipy \subsection{Results and findings} \section{Finishing project} +\subsection{Embedding more data in user and anime} +Currently we are only embedding pure rating values of 
users; we do not take into consideration popularity, ``controversy'', the studio which created the anime, the length of the anime (number of episodes and length of episodes), or when it was aired \\ \end{document} \ No newline at end of file diff --git a/midterm/report/report.xdv b/midterm/report/report.xdv index 93e0ac13..7e1693a1 100644 Binary files a/midterm/report/report.xdv and b/midterm/report/report.xdv differ