diff --git a/final/code/main.py b/final/code/main.py index c8a21e30..b691d349 100644 --- a/final/code/main.py +++ b/final/code/main.py @@ -157,8 +157,8 @@ def preprocessing(rating_data, anime_contact_data, rating_data, "anime_id", anime_threshold) rating_data = combine_name_and_ratings(rating_data) - rating_data = rating_data.drop(columns="rating_x") - rating_data = rating_data.rename(columns={"rating_y": "rating"}) + rating_data = rating_data.drop(columns="rating_y") + rating_data = rating_data.rename(columns={"rating_x": "rating"}) if debug and not auto: print(rating_data) get_data_info(rating_data, True) @@ -171,7 +171,7 @@ def preprocessing(rating_data, anime_contact_data, return pivot_table -def predict(prediction_model, pivot_table, seed=42, anime="RANDOM", recommendation_number=6, auto=False, debug = False): +def predict(prediction_model, pivot_table, seed=42, anime="RANDOM", recommendation_number=6, auto=False, debug=False): """ This will choose a random anime name and our prediction_model will predict similar anime. """ @@ -187,7 +187,7 @@ def predict(prediction_model, pivot_table, seed=42, anime="RANDOM", recommendati query) if debug: print("prediction model, distance: ", distance) - for i in range(0, 4): + for i in range(recommendation_number): if i == 0 and not auto and not debug: print(f"Recommendations for {chosen_anime_name}:\n") elif not auto and not debug: @@ -199,8 +199,11 @@ def predict(prediction_model, pivot_table, seed=42, anime="RANDOM", recommendati closest_anime_name = pivot_table.index[suggestions.flatten()[1]] closest_anime_distance = distance.flatten()[1] average_minus_closest_distance = closest_anime_distance - average_distance - print(f"Average distance: {average_distance}, average_minus_closest_distance: {average_minus_closest_distance}") - return f"{chosen_anime_name}_{closest_anime_name}_{closest_anime_distance}_{average_distance}_{average_minus_closest_distance}" + print( + f"Average distance: {average_distance}, average_minus_closest_distance: {average_minus_closest_distance}") + + return chosen_anime, suggestions.flatten()[1:recommendation_number+1], distance.flatten()[1:recommendation_number+1] + # return f"{chosen_anime_name}_{closest_anime_name}_{closest_anime_distance}_{average_distance}_{average_minus_closest_distance}" def calculate_neighbors(rows_number, neighbors=5): @@ -224,9 +227,10 @@ def create_model(pivot_table, rows_number, metric="cosine", algorithm="brute", n pivot_table_matrix = csr_matrix(pivot_table.values) if algorithm == "brute": model = NearestNeighbors(n_neighbors=neighbors_number, - metric=metric, algorithm=algorithm) + metric=metric, algorithm=algorithm) else: - model = NearestNeighbors(n_neighbors=neighbors_number, algorithm=algorithm) + model = NearestNeighbors( + n_neighbors=neighbors_number, algorithm=algorithm) try: model.fit(pivot_table_matrix) except: @@ -291,12 +295,14 @@ def handle_arguments(): # Access the values of the arguments return args.seed, args.debug, args.data_limit, args.database, args.metric, args.algorithm, args.anime, args.neighbors, args.user_threshold, args.anime_threshold, args.recommendation_amount, args.auto -def auto_mode(data_limit = -1, seed = 42, anime="RANDOM"): + +def auto_mode(data_limit=-1, seed=42, anime="RANDOM"): print("Started auto mode") algorithm_spread = ['auto', 'ball_tree', 'kd_tree', 'brute'] neighbor_spread = [5, "sqrt", "half", "log", "n-1"] # No reason to access and waste computational power every time we run the simulation - starting_rating_data, starting_anime_contact_data, starting_rows_number = get_data(limit_data=data_limit) + starting_rating_data, starting_anime_contact_data, starting_rows_number = get_data( + limit_data=data_limit) original_pivot_table = preprocessing( starting_rating_data, starting_anime_contact_data) if os.path.exists('test_results'): @@ -311,40 +317,67 @@ def auto_mode(data_limit = -1, seed = 42, anime="RANDOM"): for metric in possibleMetrics: print("testing for algorithm, metric: ", algorithm, metric) for neighbor_amount in neighbor_spread: - print("testing for algorithm, metric, neighbor_amount: ", algorithm, metric, neighbor_amount) + print("testing for algorithm, metric, neighbor_amount: ", + algorithm, metric, neighbor_amount) preprocess_model_predict(starting_rating_data, starting_anime_contact_data, - starting_rows_number, original_pivot_table, seed=seed, anime=anime, neighbors=neighbor_amount, algorithm=algorithm, metric=metric) + starting_rows_number, original_pivot_table, seed=seed, anime=anime, neighbors=neighbor_amount, algorithm=algorithm, metric=metric) -def write_test_results(title, result = ""): + +def write_test_results(title, result=""): # Create directory if it doesn't already exist - if not os.path.exists('test_results'): os.makedirs('test_results') # Generate timestamped filename - timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S') # e.g., 20230611235959 + timestamp = datetime.datetime.now().strftime( + '%Y%m%d%H%M%S') # e.g., 20230611235959 filename = f"{title}_{timestamp}.txt" - + # Create and write to the file with open(os.path.join('test_results', filename), 'a') as file: file.write(result) + +def calculate_precision(predictions, threshold=8): + ratings = [anime[anime > 0].mean() for anime in predictions] + precision = [1 if r >= threshold else 0 for r in ratings] + return np.mean(precision) + + def preprocess_model_predict(rating_data, anime_contact_data, rows_number, pivot_table, data_limit=-1, db="database", debug=False, user_threshold=500, anime_threshold=200, metric="cosine", algorithm="brute", neighbors=5, seed=42, anime="RANDOM", recommendation_amount=5): MODEL = create_model(pivot_table, rows_number, metric, algorithm, neighbors) result = "" if MODEL != "Error!": - result = predict(MODEL, pivot_table, seed, anime, recommendation_amount) - write_test_results(f"dl:{rows_number}_s:{seed}_m:{metric}_a:{algorithm}_ut:{user_threshold}_at:{anime_threshold}_n:{neighbors}", result) + chosen_anime, suggestions, distance = predict(MODEL, pivot_table, seed, + anime, recommendation_amount) + + chosen_anime_name = pivot_table.index[chosen_anime] + # average_distance = np.mean(distance) + # closest_anime_name = pivot_table.index[suggestions[1]] + # closest_anime_distance = distance[1] + # average_minus_closest_distance = closest_anime_distance - average_distance + precision = calculate_precision( + [pivot_table.iloc[s] for s in suggestions]) + + result = f"{chosen_anime_name}:\n" + for i in range(len(suggestions)): + result += f"{pivot_table.index[suggestions[i]]}; Distance: {distance[i]}\n" + result += f"Precision: {precision*100}%" + # result = f"{chosen_anime_name}_{closest_anime_name}_{closest_anime_distance}_{average_distance}_{average_minus_closest_distance}" + write_test_results( + f"dl={rows_number}&s={seed}&m={metric}&a={algorithm}&ut={user_threshold}&at={anime_threshold}&n={neighbors}", result) if __name__ == "__main__": SEED, DEBUG, DATA_LIMIT, DB, METRIC, ALGORITHM, ANIME, NEIGHBORS, USER_THRESHOLD, ANIME_THRESHOLD, RECOMMENDATION_AMOUNT, AUTO = handle_arguments() if not AUTO: starting_rating_data, starting_anime_contact_data, starting_rows_number = get_data() + pivot_table = preprocessing( + starting_rating_data, starting_anime_contact_data, USER_THRESHOLD, ANIME_THRESHOLD) preprocess_model_predict(starting_rating_data, starting_anime_contact_data, starting_rows_number, - DATA_LIMIT, DB, DEBUG, USER_THRESHOLD, ANIME_THRESHOLD, - METRIC, ALGORITHM, NEIGHBORS, SEED, ANIME, RECOMMENDATION_AMOUNT) + pivot_table, data_limit=DATA_LIMIT, db=DB, debug=DEBUG, user_threshold=USER_THRESHOLD, anime_threshold=ANIME_THRESHOLD, + metric=METRIC, algorithm=ALGORITHM, neighbors=NEIGHBORS, seed=SEED, anime=ANIME, recommendation_amount=RECOMMENDATION_AMOUNT) if AUTO: auto_mode(DATA_LIMIT, SEED, ANIME)