feat: define auto parameters spread, describe the process

This commit is contained in:
Krzysztof Rudnicki 2023-06-08 18:40:27 +02:00
parent 3b96a6f0f4
commit e82ae526cb
2 changed files with 80 additions and 9 deletions

View File

@ -25,7 +25,8 @@ def get_data(limit_data=-1, data_folder_path="database"):
# used to fetch anime_id(MAL_ID)
anime_data = anime_data.rename(columns={"MAL_ID": "anime_id"})
anime_contact_data = anime_data[["anime_id", "Name"]]
return rating_data, anime_contact_data
rows_number = rating_data.shape[0]
return rating_data, anime_contact_data, rows_number
def merge_rating_anime_data(rating_data, anime_contact_data, debug=False):
@ -187,12 +188,26 @@ def predict(prediction_model, pivot_table, seed=42, anime="RANDOM", recommendati
)
def create_model(pivot_table, metric="cosine", algorithm="brute", neighbors=5):
def calculate_neighbors(rows_number, neighbors=5):
    """
    Resolves the number of neighbors to use for the nearest neighbor model

    rows_number: number of rows (instances) in the dataset
    neighbors: either a number, returned unchanged, or the name of a
        strategy: "default" (5), "sqrt", "half", "log" or "n-1"

    Returns an integer of at least 1 for named strategies.
    Raises KeyError for an unknown strategy name and ValueError when a
    strategy is requested for a dataset with fewer than one row.
    """
    # Imported locally so the function does not depend on how (or whether)
    # sqrt/log were imported at the top of the file.
    import math

    # Explicit numbers are passed through untouched.
    if not isinstance(neighbors, str):
        return neighbors

    # Strategies are evaluated lazily: only the requested one is computed,
    # so e.g. log() is never called when it is not needed.
    strategies = {
        "default": lambda n: 5,
        "sqrt": lambda n: int(math.sqrt(n)),
        "half": lambda n: n // 2,
        "log": lambda n: int(math.log(n)),
        "n-1": lambda n: n - 1,
    }
    strategy = strategies[neighbors]

    if rows_number < 1:
        raise ValueError(
            "rows_number must be at least 1 to derive the number of neighbors")

    # The neighbor count has to be a positive whole number.
    return max(1, strategy(rows_number))
# Builds and fits the nearest neighbor model.
#   pivot_table - its .values are converted to a CSR sparse matrix for fitting
#   rows_number - passed to calculate_neighbors together with `neighbors`
#   metric, algorithm - forwarded to NearestNeighbors unchanged
#   neighbors - resolved to the actual neighbor count by calculate_neighbors
# Returns the fitted NearestNeighbors model.
def create_model(pivot_table, rows_number, metric="cosine", algorithm="brute", neighbors=5):
"""
Creates model based on nearest neighbor for anime prediction
"""
neighbors_number = calculate_neighbors(rows_number, neighbors)
pivot_table_matrix = csr_matrix(pivot_table.values)
# NOTE(review): the next two lines look like the pre-change and post-change
# versions of the same statement shown together by the commit view - confirm
# that only the `neighbors_number` variant is meant to remain.
model = NearestNeighbors(n_neighbors=neighbors,
model = NearestNeighbors(n_neighbors=neighbors_number,
metric=metric, algorithm=algorithm)
model.fit(pivot_table_matrix)
return model
@ -254,11 +269,28 @@ def handle_arguments():
return args.seed, args.debug, args.data_limit, args.database, args.metric, args.algorithm, args.anime, args.neighbors, args.user_threshold, args.anime_threshold, args.recommendation_amount, args.auto
if __name__ == "__main__":
SEED, DEBUG, DATA_LIMT, DB, METRIC, ALGORITHM, ANIME, NEIGHBORS, USER_THRESHOLD, ANIME_THRESHOLD, RECOMMENDATION_AMOUNT, AUTO = handle_arguments()
def auto_mode():
    """
    Defines the spread of values explored for every tunable parameter

    Returns a dict that maps a parameter name to the list of values to try.
    """
    # The spreads must be real assignments: a bare `name: [...]` inside a
    # function is only an annotation and never creates the variable.
    # NOTE(review): -1 is get_data's default limit and presumably means
    # "use all of the data" - confirm.
    data_spread = [27306186, 54612373, -1]
    metric_spread = ["cosine", "mahalanobis", "euclidean"]
    algorithm_spread = ['auto', 'ball_tree', 'kd_tree', 'brute']
    neighbor_spread = [5, "sqrt", "half", "log", "n-1"]
    user_threshold_spread = [0, 500, 1000]
    anime_threshold_spread = [0, 200, 500]

    return {
        "data_limit": data_spread,
        "metric": metric_spread,
        "algorithm": algorithm_spread,
        "neighbors": neighbor_spread,
        "user_threshold": user_threshold_spread,
        "anime_threshold": anime_threshold_spread,
    }
RATING_DATA, ANIME_CONTACT_DATA = get_data(DATA_LIMT, DB)
# Runs one full pass of the pipeline with the given settings:
# get_data -> preprocessing -> create_model -> predict.
#   data_limit, db - passed to get_data
#   debug, user_threshold, anime_threshold - passed to preprocessing
#   metric, algorithm, neighbors - passed to create_model
#   seed, anime, recommendation_amount - passed to predict
# Nothing is returned; the result of predict is not captured.
def preprocess_model_predict(data_limit, db, debug, user_threshold, anime_threshold, metric, algorithm, neighbors, seed, anime, recommendation_amount):
# get_data also reports the number of rating rows, needed by create_model.
RATING_DATA, ANIME_CONTACT_DATA, ROWS_NUMBER = get_data(data_limit, db)
PIVOT_TABLE = preprocessing(
# NOTE(review): the next three lines (upper-case arguments) look like the
# pre-change statements shown by the commit view next to their lower-case
# replacements - confirm that only the lower-case variants are meant to remain.
RATING_DATA, ANIME_CONTACT_DATA, DEBUG, USER_THRESHOLD, ANIME_THRESHOLD)
MODEL = create_model(PIVOT_TABLE, METRIC, ALGORITHM, NEIGHBORS)
predict(MODEL, PIVOT_TABLE, SEED, ANIME, RECOMMENDATION_AMOUNT)
RATING_DATA, ANIME_CONTACT_DATA, debug, user_threshold, anime_threshold)
MODEL = create_model(PIVOT_TABLE, ROWS_NUMBER,
metric, algorithm, neighbors)
predict(MODEL, PIVOT_TABLE, seed, anime, recommendation_amount)
if __name__ == "__main__":
    # Command line arguments, in the order returned by handle_arguments.
    (SEED, DEBUG, DATA_LIMIT, DB, METRIC, ALGORITHM, ANIME, NEIGHBORS,
     USER_THRESHOLD, ANIME_THRESHOLD, RECOMMENDATION_AMOUNT,
     AUTO) = handle_arguments()

    # Auto mode and the single configured run are mutually exclusive.
    if AUTO:
        auto_mode()
    else:
        preprocess_model_predict(
            DATA_LIMIT, DB, DEBUG, USER_THRESHOLD, ANIME_THRESHOLD,
            METRIC, ALGORITHM, NEIGHBORS, SEED, ANIME, RECOMMENDATION_AMOUNT)

View File

@ -0,0 +1,39 @@
Parameters:
- datalimit (usable between 500k and max) [max = 109,224,747 ]
- seed (very important: make sure it stays the same through all testing [maybe just 42?])
- metric (either cosine, mahalanobis or euclidean as in preliminary report)
- NN algorithm (either auto, ball_tree, kd_tree, brute)
- neighbors - number of nearest neighbors
- User threshold - minimal numbers of votes for user to be included in data
- Anime threshold - same for anime
These are 6 parameters that influence program behaviour, plus 1 parameter for the seed.
We will probably run simulations for 3 variants of each parameter (excluding seed), keeping the rest at their default values,
so in total 6 * 3 = 18 simulations.
Default values:
Datalimit: all of data
Seed: 42
Metric: cosine
NN algorithm: brute
Neighbors: 5
User threshold: 500
Anime threshold: 200
Neighbors number count:
k = 3-5: default starting points for small-medium dataset
k = sqrt(n): rule of thumb, n is number of instances in dataset (balanced between underfitting and overfitting)
k = n / 2: look at half of dataset for each prediction
k = log(n): for very large datasets
k = n - 1: Use all data except one, will probably overgeneralize the model
Values spread:
Datalimit: [27306186, 54612373, 109224747] (max on the right, then halved and halved)
Metric: ["cosine", "mahalanobis", "euclidean"]
NN algorithm: ['auto', 'ball_tree', 'kd_tree', 'brute']
neighbors: [5, sqrt(n), n / 2, log(n), n - 1]
User threshold: [0, 500, 1000]
Anime threshold: [0, 200, 500]