Online streaming platforms like Netflix have vast movie catalogs. A recommendation system that suggests relevant movies to users based on their historical interactions can improve customer satisfaction and, in turn, revenue. The techniques we will learn here are not limited to movies; they apply to any item for which you want to build a recommendation system. For this case study, you can find the dataset here.
In this project we will be building various recommendation systems based on the ratings dataset:

- rank-based, using average ratings
- similarity-based collaborative filtering (user-user and item-item)
- matrix factorization based collaborative filtering (SVD)

The ratings dataset contains the following attributes: userId, movieId, rating, and timestamp.
# uncomment if you are using google colab
#from google.colab import drive
#drive.mount('/content/drive')
# installing the surprise library, only needed the first time
!pip install surprise
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import accuracy
# Reader is used to parse a file containing ratings; the expected structure is: user ; item ; rating
from surprise.reader import Reader
# class for loading datasets
from surprise.dataset import Dataset
# for tuning model hyper-parameters
from surprise.model_selection import GridSearchCV
# for splitting the rating data in train and test dataset
from surprise.model_selection import train_test_split
# for implementing similarity based recommendation system
from surprise.prediction_algorithms.knns import KNNBasic
# for implementing matrix factorization based recommendation system
from surprise.prediction_algorithms.matrix_factorization import SVD
from collections import defaultdict
# for implementing cross validation
from surprise.model_selection import KFold
rating = pd.read_csv('ratings.csv')
Let's check the info of the data
rating.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   userId     100004 non-null  int64
 1   movieId    100004 non-null  int64
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
#Dropping timestamp column
rating = rating.drop(['timestamp'], axis=1)
#printing the top 5 rows of the dataset
rating.head()
| | userId | movieId | rating |
|---|---|---|---|
| 0 | 1 | 31 | 2.5 |
| 1 | 1 | 1029 | 3.0 |
| 2 | 1 | 1061 | 3.0 |
| 3 | 1 | 1129 | 2.0 |
| 4 | 1 | 1172 | 4.0 |
plt.figure(figsize = (12, 4))
sns.countplot(x = "rating", data=rating)
plt.tick_params(labelsize = 10)
plt.title("Distribution of Ratings ", fontsize = 10)
plt.xlabel("Ratings", fontsize = 10)
plt.ylabel("Number of Ratings", fontsize = 10)
plt.show()
This plot shows the distribution of ratings. Ratings of 3 and above are the most common in the dataset, with 4 being the most frequent rating.
#Finding number of unique users
len(rating['userId'].unique())
671
There are 671 total unique users.
#Finding number of unique movies
len(rating['movieId'].unique())
9066
There are 9066 total unique movies.
rating.groupby(['userId', 'movieId']).count()
| userId | movieId | rating |
|---|---|---|
| 1 | 31 | 1 |
| | 1029 | 1 |
| | 1061 | 1 |
| | 1129 | 1 |
| | 1172 | 1 |
| ... | ... | ... |
| 671 | 6268 | 1 |
| | 6269 | 1 |
| | 6365 | 1 |
| | 6385 | 1 |
| | 6565 | 1 |
100004 rows × 1 columns
rating.groupby(['userId', 'movieId']).count()['rating'].sum()
100004
No, there are no movies that have been interacted with more than once by the same user: every (userId, movieId) group has a rating count of 1, and these counts sum to 100004, the total number of ratings in the dataset.
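The same conclusion can be reached more directly with pandas; a minimal sketch, reusing the rating dataframe from above:

# counting duplicated (userId, movieId) pairs; 0 means no user rated the same movie twice
rating.duplicated(subset=['userId', 'movieId']).sum()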
rating['movieId'].mode()
0    356
dtype: int64
The movie that is most interacted with is the movie with movieId 356.
#Plotting the distribution of ratings for the 341 interactions with movieId 356
plt.figure(figsize=(7,7))
rating[rating['movieId'] == 356]['rating'].value_counts().plot(kind='bar')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()
This plot shows the distribution of ratings across all interactions with movieId 356.
rating['userId'].mode()
0    547
dtype: int64
User 547 has interacted with the most movies in the dataset.
#Finding user-movie interactions distribution
count_interactions = rating.groupby('userId').count()['movieId']
count_interactions
userId
1       20
2       76
3       51
4      204
5      100
      ...
667     68
668     20
669     37
670     31
671    115
Name: movieId, Length: 671, dtype: int64
#Plotting user-movie interactions distribution
plt.figure(figsize=(15,7))
sns.histplot(count_interactions)
plt.xlabel('Number of Interactions by Users')
plt.show()
This plot depicts the distribution of the number of interactions per user. Most users have roughly 0-50 interactions, and the distribution is strongly right-skewed: the number of movies a user has rated is usually below 100 and almost always well below 500.
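To put rough numbers on this, a small sketch reusing the count_interactions series computed above:

# summary statistics for ratings per user; the median and upper quartile
# should confirm that most users have well under 100 interactions
count_interactions.describe()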
Rank-based recommendation systems provide recommendations based on the most popular items. This kind of recommendation system is useful when we have a cold start problem. Cold start refers to the issue of a new user entering the system: the machine cannot recommend movies to the new user because the user has no historical interactions in the dataset. In those cases, we can use a rank-based recommendation system to recommend movies to the new user.
To build the rank-based recommendation system, we take the average of all the ratings provided to each movie and then rank movies by their average rating.
#Calculating average ratings
average_rating = rating.groupby('movieId').mean().rating
#Calculating the count of ratings
count_rating = rating.groupby('movieId').count().rating
#Making a dataframe with the count and average of ratings
final_rating = pd.DataFrame({'avg_rating':average_rating, 'rating_count':count_rating})
final_rating.head()
| movieId | avg_rating | rating_count |
|---|---|---|
| 1 | 3.872470 | 247 |
| 2 | 3.401869 | 107 |
| 3 | 3.161017 | 59 |
| 4 | 2.384615 | 13 |
| 5 | 3.267857 | 56 |
Now, let's create a function to find the top n movies for a recommendation based on the average ratings of movies. We can also add a threshold for a minimum number of interactions for a movie to be considered for recommendation.
def top_n_movies(data, n, min_interaction=100):
#Finding movies with more than min_interaction ratings
recommendations = data[data['rating_count'] > min_interaction]
#Sorting values w.r.t average rating
recommendations = recommendations.sort_values(by='avg_rating', ascending=False)
return recommendations.index[:n]
We can use this function with different n's and minimum interactions to get movies to recommend
list(top_n_movies(final_rating,5,50))
[858, 318, 913, 1221, 50]
list(top_n_movies(final_rating,5,100))
[858, 318, 1221, 50, 527]
list(top_n_movies(final_rating,5,200))
[318, 50, 527, 608, 296]
Now that we have seen how to apply the Rank-Based Recommendation System, let's create Collaborative Filtering Based Recommendation Systems.
(Figure omitted: an example user-item interactions matrix. Given that matrix, the question posed was: out of users B and C, which user is more likely to interact with the movie The Terminal?)
In this type of recommendation system, we do not need any information about the users or items; the user-item interaction data alone is enough to build a collaborative recommendation system. Below we build a similarity-based recommendation system using cosine similarity, with KNN used to find the users that are the nearest neighbors of a given user.
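To make the idea concrete before turning to the library, here is a minimal numpy sketch of the cosine similarity computed between a pair of users (the two rating vectors are made up for illustration, not taken from the dataset):

import numpy as np

# hypothetical ratings of two users on the same five movies
u = np.array([4.0, 3.5, 5.0, 2.0, 1.0])
v = np.array([5.0, 3.0, 4.5, 2.5, 1.5])

# cosine similarity = (u . v) / (||u|| * ||v||); values near 1 mean similar taste
cosine_sim = np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
print(round(cosine_sim, 4))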
We will be using a new library, surprise, to build the remaining models; the necessary classes and functions were imported from this library at the top of the notebook. Below we load the rating dataframe, which is a pandas dataframe, into a different format called surprise.dataset.DatasetAutoFolds, which is required by this library. To do this we use the Reader and Dataset classes.
# instantiating Reader with the expected rating scale
reader = Reader(rating_scale=(0, 5))
# loading the rating dataset
data = Dataset.load_from_df(rating[['userId', 'movieId', 'rating']], reader)
# splitting the data into train and test dataset
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
sim_options = {'name': 'cosine',
'user_based': True
}
#defining Nearest neighbour algorithm
algo_knn_user = KNNBasic(sim_options = sim_options, verbose=False, random_state=1)
# Train the algorithm on the trainset or fitting the model on train dataset
algo_knn_user.fit(trainset)
#predict ratings for the testset
predictions = algo_knn_user.test(testset)
# Then compute RMSE
accuracy.rmse(predictions)
RMSE: 0.9925
0.9924509041520163
The RMSE is 0.9925.
Let's predict the rating for the user with userId=4 and for movieId=10:
algo_knn_user.predict(4, 10, r_ui=4, verbose=True)
user: 4 item: 10 r_ui = 4.00 est = 3.62 {'actual_k': 40, 'was_impossible': False}
Prediction(uid=4, iid=10, r_ui=4, est=3.6244912065910952, details={'actual_k': 40, 'was_impossible': False})
The predicted rating for a user with userId=4 and for movieId=10 is 3.62.
Let's predict the rating for the same userId=4, but for a movie this user has not interacted with before, i.e. movieId=3:
algo_knn_user.predict(4, 3, verbose=True)
user: 4 item: 3 r_ui = None est = 3.20 {'actual_k': 40, 'was_impossible': False}
Prediction(uid=4, iid=3, r_ui=None, est=3.202703552548654, details={'actual_k': 40, 'was_impossible': False})
The predicted rating for a user with userId=4 and for movieId=3 is 3.20.
Below we will be tuning hyperparameters for the KNNBasic algorithm. Let's first understand its main hyperparameters:

- k (int): the maximum number of neighbors to take into account for aggregation; the default is 40.
- min_k (int): the minimum number of neighbors required for aggregation; if there are not enough neighbors, the prediction falls back to the global mean of all ratings. The default is 1.
- sim_options (dict): options for the similarity measure, including its name ('cosine', 'msd', 'pearson', 'pearson_baseline'), whether it is user_based or item-based, and min_support, the minimum number of common items (or users) needed to compute a similarity.

For more details please refer to the official documentation https://surprise.readthedocs.io/en/stable/knn_inspired.html
# setting up parameter grid to tune the hyperparameters
param_grid = {'k': [10, 20, 30], 'min_k': [3, 6, 9],
'sim_options': {'name': ["cosine",'pearson',"pearson_baseline"],
'user_based': [True], "min_support":[2,4]}}
# performing 3-fold cross validation to tune the hyperparameters
grid_obj = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)
# fitting the data
grid_obj.fit(data)
# best RMSE score
print(grid_obj.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(grid_obj.best_params['rmse'])
0.9912631376020545 {'k': 30, 'min_k': 3, 'sim_options': {'name': 'cosine', 'user_based': True, 'min_support': 2}}
Once the grid search is complete, we can get the optimal values for each of those hyperparameters as shown above
Below we analyze the evaluation metrics (RMSE and MAE) for every split, to see the impact of each hyperparameter value:
results_df = pd.DataFrame.from_dict(grid_obj.cv_results)
results_df.head()
| | split0_test_rmse | split1_test_rmse | split2_test_rmse | mean_test_rmse | std_test_rmse | rank_test_rmse | split0_test_mae | split1_test_mae | split2_test_mae | mean_test_mae | std_test_mae | rank_test_mae | mean_fit_time | std_fit_time | mean_test_time | std_test_time | params | param_k | param_min_k | param_sim_options |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.999601 | 1.007935 | 1.004073 | 1.003869 | 0.003406 | 42 | 0.773227 | 0.773754 | 0.776122 | 0.774368 | 0.001259 | 21 | 0.355385 | 0.013596 | 2.113928 | 0.012148 | {'k': 10, 'min_k': 3, 'sim_options': {'name': ... | 10 | 3 | {'name': 'cosine', 'user_based': True, 'min_su... |
1 | 0.998780 | 1.006857 | 1.001463 | 1.002366 | 0.003359 | 39 | 0.772787 | 0.773139 | 0.774894 | 0.773607 | 0.000922 | 16 | 0.309440 | 0.026484 | 2.092452 | 0.010118 | {'k': 10, 'min_k': 3, 'sim_options': {'name': ... | 10 | 3 | {'name': 'cosine', 'user_based': True, 'min_su... |
2 | 1.002915 | 1.013767 | 1.006427 | 1.007703 | 0.004521 | 49 | 0.781391 | 0.785870 | 0.783849 | 0.783703 | 0.001832 | 48 | 0.511408 | 0.014796 | 2.106024 | 0.007105 | {'k': 10, 'min_k': 3, 'sim_options': {'name': ... | 10 | 3 | {'name': 'pearson', 'user_based': True, 'min_s... |
3 | 1.002333 | 1.011460 | 1.002914 | 1.005569 | 0.004172 | 44 | 0.781702 | 0.783972 | 0.781196 | 0.782290 | 0.001207 | 47 | 0.416535 | 0.012565 | 2.098976 | 0.012688 | {'k': 10, 'min_k': 3, 'sim_options': {'name': ... | 10 | 3 | {'name': 'pearson', 'user_based': True, 'min_s... |
4 | 0.993738 | 1.004707 | 0.996401 | 0.998282 | 0.004671 | 25 | 0.774667 | 0.779212 | 0.775501 | 0.776460 | 0.001976 | 32 | 0.479957 | 0.013051 | 2.077184 | 0.017852 | {'k': 10, 'min_k': 3, 'sim_options': {'name': ... | 10 | 3 | {'name': 'pearson_baseline', 'user_based': Tru... |
Now we will build the final model using the tuned hyperparameter values obtained from grid search cross validation.
sim_options = {'name': 'cosine',
'user_based': True, "min_support":2}
# using the optimal similarity measure for user-user based collaborative filtering
# creating an instance of KNNBasic with optimal hyperparameter values
similarity_algo_optimized_user = KNNBasic(sim_options=sim_options, k=30, min_k=3, random_state=1,verbose=False)
# training the algorithm on the trainset
similarity_algo_optimized_user.fit(trainset)
# predicting ratings for the testset
predictions = similarity_algo_optimized_user.test(testset)
# computing RMSE on testset
accuracy.rmse(predictions)
RMSE: 0.9871
0.9871266024277001
The RMSE is 0.9871.
Let's predict the rating for the user with userId=4 and for movieId=10 with the optimized model:
similarity_algo_optimized_user.predict(4,10, r_ui=4, verbose=True)
user: 4 item: 10 r_ui = 4.00 est = 3.58 {'actual_k': 30, 'was_impossible': False}
Prediction(uid=4, iid=10, r_ui=4, est=3.583535324429299, details={'actual_k': 30, 'was_impossible': False})
The predicted rating for a user with userId=4 and for movieId=10 is 3.58.
Below we predict the rating for the same userId=4, but for a movie this user has not interacted with before, i.e. movieId=3, using the optimized model:
similarity_algo_optimized_user.predict(4,3, verbose=True)
user: 4 item: 3 r_ui = None est = 3.17 {'actual_k': 30, 'was_impossible': False}
Prediction(uid=4, iid=3, r_ui=None, est=3.170232402310352, details={'actual_k': 30, 'was_impossible': False})
The predicted rating for a user with userId=4 and for movieId=3 is 3.17.
We can also find the users most similar to a given user, i.e. its nearest neighbors, with this KNNBasic algorithm. Below we find the 5 users most similar to userId=4, based on the cosine similarity used by the optimized model:
similarity_algo_optimized_user.get_neighbors(4, k=5)
[357, 220, 590, 491, 647]
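One caveat worth noting: surprise's get_neighbors works with the library's internal (inner) ids rather than the raw ids from the csv file. A minimal sketch of converting between the two, reusing the trainset from above:

# map the raw userId 4 to its inner id, query the neighbors, then map back to raw ids
inner_uid = trainset.to_inner_uid(4)
neighbor_inner_ids = similarity_algo_optimized_user.get_neighbors(inner_uid, k=5)
print([trainset.to_raw_uid(iuid) for iuid in neighbor_inner_ids])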
Below we will be implementing a function with the following input parameters:

- data: the ratings dataframe
- user_id: the user for whom we want recommendations
- top_n: the number of movies to recommend
- algo: the trained recommendation algorithm
def get_recommendations(data, user_id, top_n, algo):
# creating an empty list to store the recommended movie ids
recommendations = []
# creating a user-item interactions matrix
user_item_interactions_matrix = data.pivot(index='userId', columns='movieId', values='rating')
# extracting those movie ids which the user_id has not interacted yet
non_interacted_movies = user_item_interactions_matrix.loc[user_id][user_item_interactions_matrix.loc[user_id].isnull()].index.tolist()
# looping through each of the movie id which user_id has not interacted yet
for item_id in non_interacted_movies:
# predicting the ratings for those non interacted movie ids by this user
est = algo.predict(user_id, item_id).est
# appending the predicted ratings
recommendations.append((item_id, est))
# sorting the predicted ratings in descending order
recommendations.sort(key=lambda x: x[1], reverse=True)
return recommendations[:top_n] # returning the top n movies with the highest predicted ratings for this user
recommendations = get_recommendations(rating,4,5,similarity_algo_optimized_user)
recommendations
[(309, 5), (3038, 4.999999999999999), (98491, 4.899347045407786), (6273, 4.839859025263867), (116, 4.753206589295344)]
#defining the similarity measure
sim_options = {'name': 'pearson',
'user_based': False}
#defining Nearest neighbour algorithm
algo_knn_item = KNNBasic(sim_options = sim_options,verbose=False)
# Train the algorithm on the trainset or fitting the model on train dataset
algo_knn_item.fit(trainset)
#predict ratings for the testset
predictions = algo_knn_item.test(testset)
# Then compute RMSE
accuracy.rmse(predictions)
RMSE: 0.9964
0.9964454065946875
The RMSE is 0.9964.
Let's predict the rating for the user with userId=4 and for movieId=10:
algo_knn_item.predict(4,10, r_ui=4, verbose=True)
user: 4 item: 10 r_ui = 4.00 est = 4.42 {'actual_k': 40, 'was_impossible': False}
Prediction(uid=4, iid=10, r_ui=4, est=4.420788161822849, details={'actual_k': 40, 'was_impossible': False})
The predicted rating for a user with userId=4 and for movieId=10 is 4.42.
Let's predict the rating for the same userId=4, but for a movie this user has not interacted with before, i.e. movieId=3:
algo_knn_item.predict(4,3, verbose=True)
user: 4 item: 3 r_ui = None est = 4.06 {'actual_k': 40, 'was_impossible': False}
Prediction(uid=4, iid=3, r_ui=None, est=4.064635736744944, details={'actual_k': 40, 'was_impossible': False})
The predicted rating for a user with userId=4 and for movieId=3 is 4.06.
# setting up parameter grid to tune the hyperparameters
param_grid = {'k': [10, 20, 30], 'min_k': [3, 6, 9],
'sim_options': {'name': ["cosine",'pearson',"pearson_baseline"],
'user_based': [False], "min_support":[2,4]}
}
# performing 3-fold cross validation to tune the hyperparameters
grid_obj = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)
# fitting the data
grid_obj.fit(data)
# best RMSE score
print(grid_obj.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(grid_obj.best_params['rmse'])
0.9517757100027374 {'k': 30, 'min_k': 6, 'sim_options': {'name': 'pearson_baseline', 'user_based': False, 'min_support': 2}}
Once the grid search is complete, we can get the optimal values for each of those hyperparameters as shown above:
Below we analyze the evaluation metrics (RMSE and MAE) for every split, to see the impact of each hyperparameter value:
results_df = pd.DataFrame.from_dict(grid_obj.cv_results)
results_df.head()
| | split0_test_rmse | split1_test_rmse | split2_test_rmse | mean_test_rmse | std_test_rmse | rank_test_rmse | split0_test_mae | split1_test_mae | split2_test_mae | mean_test_mae | std_test_mae | rank_test_mae | mean_fit_time | std_fit_time | mean_test_time | std_test_time | params | param_k | param_min_k | param_sim_options |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.008200 | 1.018035 | 1.005463 | 1.010566 | 0.005398 | 49 | 0.780272 | 0.784731 | 0.776986 | 0.780663 | 0.003174 | 45 | 19.228845 | 0.144610 | 11.522912 | 0.115562 | {'k': 10, 'min_k': 3, 'sim_options': {'name': ... | 10 | 3 | {'name': 'cosine', 'user_based': False, 'min_s... |
1 | 1.002260 | 1.010876 | 0.996875 | 1.003337 | 0.005766 | 42 | 0.768556 | 0.773272 | 0.764388 | 0.768739 | 0.003629 | 38 | 13.712201 | 2.123800 | 11.927781 | 0.527770 | {'k': 10, 'min_k': 3, 'sim_options': {'name': ... | 10 | 3 | {'name': 'cosine', 'user_based': False, 'min_s... |
2 | 1.025800 | 1.035418 | 1.025767 | 1.028995 | 0.004542 | 54 | 0.799728 | 0.806096 | 0.798486 | 0.801437 | 0.003333 | 52 | 21.718961 | 2.769808 | 11.538013 | 0.114138 | {'k': 10, 'min_k': 3, 'sim_options': {'name': ... | 10 | 3 | {'name': 'pearson', 'user_based': False, 'min_... |
3 | 1.011432 | 1.014203 | 1.005328 | 1.010321 | 0.003708 | 48 | 0.781175 | 0.784392 | 0.776458 | 0.780675 | 0.003258 | 46 | 12.749396 | 0.179462 | 11.176713 | 0.269833 | {'k': 10, 'min_k': 3, 'sim_options': {'name': ... | 10 | 3 | {'name': 'pearson', 'user_based': False, 'min_... |
4 | 0.968087 | 0.975972 | 0.958479 | 0.967512 | 0.007153 | 10 | 0.732377 | 0.737830 | 0.723254 | 0.731154 | 0.006013 | 7 | 10.298205 | 0.670899 | 10.387762 | 0.071025 | {'k': 10, 'min_k': 3, 'sim_options': {'name': ... | 10 | 3 | {'name': 'pearson_baseline', 'user_based': Fal... |
sim_options = {'name': 'pearson_baseline',
'user_based': False, 'min_support': 2}
# creating an instance of KNNBasic with optimal hyperparameter values
similarity_algo_optimized_item = KNNBasic(sim_options=sim_options, k=30, min_k=6,verbose=False)
# training the algorithm on the trainset
similarity_algo_optimized_item.fit(trainset)
# predicting ratings for the testset
predictions = similarity_algo_optimized_item.test(testset)
# computing RMSE on testset
accuracy.rmse(predictions)
RMSE: 0.9495
0.9494691122446014
The RMSE is 0.9495.
Let's predict the rating for the user with userId=4 and for movieId=10 with the optimized model:
similarity_algo_optimized_item.predict(4,10, r_ui=4, verbose=True)
user: 4 item: 10 r_ui = 4.00 est = 4.18 {'actual_k': 30, 'was_impossible': False}
Prediction(uid=4, iid=10, r_ui=4, est=4.176345863496977, details={'actual_k': 30, 'was_impossible': False})
The predicted rating for a user with userId=4 and for movieId=10 is 4.18.
Let's predict the rating for the same userId=4, but for a movie this user has not interacted with before, i.e. movieId=3, using the optimized model:
similarity_algo_optimized_item.predict(4, 3, verbose=True)
user: 4 item: 3 r_ui = None est = 4.36 {'actual_k': 30, 'was_impossible': False}
Prediction(uid=4, iid=3, r_ui=None, est=4.3587850293101775, details={'actual_k': 30, 'was_impossible': False})
The predicted rating for a user with userId=4 and for movieId=3 is 4.36.
We can also find the items most similar to a given item, i.e. its nearest neighbors, with this KNNBasic algorithm. Below we find the 5 items most similar to the item with id 4, based on the pearson_baseline similarity used by the optimized item-item model:
similarity_algo_optimized_item.get_neighbors(4, k=5)
[1347, 311, 1445, 778, 108]
recommendations = get_recommendations(rating, 4, 5, similarity_algo_optimized_item)
recommendations
[(190, 5), (449, 5), (1046, 5), (1365, 5), (1398, 5)]
Model-based collaborative filtering is a personalized recommendation system: the recommendations are based on the past behavior of the user, and no additional information about users or items is needed. We use latent features to find recommendations for each user.
Latent features are features that are not present in the empirical data but can be inferred from it. For example, looking closely at a set of movies, attributes such as Action, Romance, Suspense and Comedy emerge as latent features of those movies, and we can compute analogous latent features for users. (The illustrative movie and user tables referenced here are omitted.)
SVD is used to compute the latent features from the user-item matrix that we learned about earlier. However, classical SVD does not work when there are missing values in the user-item matrix.
First we need to convert the movie-rating dataset into a user-item matrix. We have already done this above while computing cosine similarities.
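For reference, a minimal pandas sketch of that conversion, using the rating dataframe (cells for movies a user has not rated are NaN):

# pivoting the long (userId, movieId, rating) table into a user-item matrix
user_item_matrix = rating.pivot(index='userId', columns='movieId', values='rating')
user_item_matrix.shape  # (number of users, number of movies)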
SVD decomposes this user-item matrix $A$, with $m$ users and $n$ items, into three separate matrices, $A = U \Sigma V^{T}$, where:

- $U$ is an $m \times k$ matrix whose rows describe the users in terms of $k$ latent features
- $\Sigma$ is a $k \times k$ diagonal matrix whose entries (the singular values) give the weight of each latent feature
- $V^{T}$ is a $k \times n$ matrix whose columns describe the items in terms of the $k$ latent features
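As an illustration only (classical SVD on a tiny, fully observed, made-up matrix; the surprise SVD used below instead learns the factors by gradient descent, since real rating matrices are sparse), a numpy sketch:

import numpy as np

# a tiny, fully observed user-item matrix: 4 users x 3 movies
A = np.array([[5.0, 3.0, 1.0],
              [4.0, 3.0, 1.0],
              [1.0, 1.0, 5.0],
              [1.0, 2.0, 4.0]])

U, S, Vt = np.linalg.svd(A, full_matrices=False)

# keep k = 2 latent features and rebuild a low-rank approximation of A
k = 2
A_approx = U[:, :k] @ np.diag(S[:k]) @ Vt[:k, :]
print(np.round(A_approx, 2))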
# using SVD matrix factorization
algo_svd = SVD()
# training the algorithm on the trainset
algo_svd.fit(trainset)
# predicting ratings for the testset
predictions = algo_svd.test(testset)
# computing RMSE on the testset
accuracy.rmse(predictions)
RMSE: 0.9023
0.9023308579642529
The RMSE is 0.9023.
Let's predict the rating for the user with userId=4 and for movieId=10:
algo_svd.predict(4, 10, r_ui=4, verbose=True)
user: 4 item: 10 r_ui = 4.00 est = 4.09 {'was_impossible': False}
Prediction(uid=4, iid=10, r_ui=4, est=4.087940020019515, details={'was_impossible': False})
The predicted rating for a user with userId=4 and for movieId=10 is 4.09.
Let's predict the rating for the same userId=4, but for a movie this user has not interacted with before, i.e. movieId=3:
algo_svd.predict(4, 3, verbose=True)
user: 4 item: 3 r_ui = None est = 3.79 {'was_impossible': False}
Prediction(uid=4, iid=3, r_ui=None, est=3.78670657499283, details={'was_impossible': False})
The predicted rating for a user with userId=4 and for movieId=3 is 3.79.
In SVD, the rating is predicted as:

$$\hat{r}_{ui} = \mu + b_{u} + b_{i} + q_{i}^{T} p_{u}$$

where $\mu$ is the global mean rating, $b_{u}$ and $b_{i}$ are the user and item biases, and $p_{u}$ and $q_{i}$ are the user and item latent factor vectors.
If user $u$ is unknown, then the bias $b_{u}$ and the factors $p_{u}$ are assumed to be zero. The same applies for item $i$ with $b_{i}$ and $q_{i}$.
To estimate all the unknowns, we minimize the following regularized squared error:

$$\sum_{r_{ui} \in R_{train}} \left(r_{ui} - \hat{r}_{ui}\right)^{2} + \lambda \left(b_{i}^{2} + b_{u}^{2} + \lVert q_{i} \rVert^{2} + \lVert p_{u} \rVert^{2}\right)$$
The minimization is performed by a very straightforward stochastic gradient descent. For each rating $r_{ui}$ in the training set, with prediction error $e_{ui} = r_{ui} - \hat{r}_{ui}$, the parameters are updated as:

$$b_{u} \leftarrow b_{u} + \gamma (e_{ui} - \lambda b_{u})$$
$$b_{i} \leftarrow b_{i} + \gamma (e_{ui} - \lambda b_{i})$$
$$p_{u} \leftarrow p_{u} + \gamma (e_{ui} \cdot q_{i} - \lambda p_{u})$$
$$q_{i} \leftarrow q_{i} + \gamma (e_{ui} \cdot p_{u} - \lambda q_{i})$$
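A minimal sketch of one epoch of these updates (illustrative only, not surprise's internals; the bias arrays bu, bi and factor matrices P, Q are assumed to be numpy arrays, and ratings_list holds (user_index, item_index, rating) triples):

def sgd_epoch(ratings_list, mu, bu, bi, P, Q, lr=0.005, reg=0.02):
    # one pass over the training triples, updating biases and factors in place
    for u, i, r in ratings_list:
        err = r - (mu + bu[u] + bi[i] + Q[i] @ P[u])  # e_ui
        bu[u] += lr * (err - reg * bu[u])
        bi[i] += lr * (err - reg * bi[i])
        pu_old = P[u].copy()  # keep p_u before its update, since q_i needs it
        P[u] += lr * (err * Q[i] - reg * P[u])
        Q[i] += lr * (err * pu_old - reg * Q[i])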
There are many hyperparameters to tune in this algorithm; you can find a full list of hyperparameters here. Below we will be tuning only three of them:

- n_epochs: the number of iterations of stochastic gradient descent
- lr_all: the learning rate for all parameters
- reg_all: the regularization term for all parameters
# set the parameter space to tune
param_grid = {'n_epochs': [10, 20, 30], 'lr_all': [0.001, 0.005, 0.01],
'reg_all': [0.2, 0.4, 0.6]}
# performing 3-fold gridsearch cross validation
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)
# fitting data
gs.fit(data)
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
0.8938749155641837 {'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.2}
Once the grid search is complete, we can get the optimal values for each of those hyperparameters as shown above
Below we analyze the evaluation metrics (RMSE and MAE) for every split, to see the impact of each hyperparameter value:
results_df = pd.DataFrame.from_dict(gs.cv_results)
results_df.head()
| | split0_test_rmse | split1_test_rmse | split2_test_rmse | mean_test_rmse | std_test_rmse | rank_test_rmse | split0_test_mae | split1_test_mae | split2_test_mae | mean_test_mae | std_test_mae | rank_test_mae | mean_fit_time | std_fit_time | mean_test_time | std_test_time | params | param_n_epochs | param_lr_all | param_reg_all |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.946593 | 0.943374 | 0.939202 | 0.943057 | 0.003026 | 25 | 0.739959 | 0.737531 | 0.736375 | 0.737955 | 0.001494 | 25 | 3.379065 | 0.007287 | 0.390260 | 0.010506 | {'n_epochs': 10, 'lr_all': 0.001, 'reg_all': 0.2} | 10 | 0.001 | 0.2 |
1 | 0.951260 | 0.947063 | 0.943786 | 0.947370 | 0.003059 | 26 | 0.745191 | 0.742214 | 0.741280 | 0.742895 | 0.001668 | 26 | 3.358689 | 0.022910 | 0.367266 | 0.018548 | {'n_epochs': 10, 'lr_all': 0.001, 'reg_all': 0.4} | 10 | 0.001 | 0.4 |
2 | 0.955922 | 0.952691 | 0.949034 | 0.952549 | 0.002814 | 27 | 0.750363 | 0.748347 | 0.747016 | 0.748576 | 0.001376 | 27 | 3.424530 | 0.093007 | 0.361025 | 0.010906 | {'n_epochs': 10, 'lr_all': 0.001, 'reg_all': 0.6} | 10 | 0.001 | 0.6 |
3 | 0.910209 | 0.906131 | 0.903036 | 0.906459 | 0.002937 | 10 | 0.702858 | 0.700852 | 0.700916 | 0.701542 | 0.000931 | 9 | 3.546953 | 0.013166 | 0.371510 | 0.009060 | {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.2} | 10 | 0.005 | 0.2 |
4 | 0.917776 | 0.913339 | 0.910198 | 0.913771 | 0.003109 | 15 | 0.710949 | 0.709001 | 0.708324 | 0.709425 | 0.001113 | 15 | 3.365576 | 0.032710 | 0.357900 | 0.017146 | {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4} | 10 | 0.005 | 0.4 |
Now we will build the final model using the tuned hyperparameter values obtained from grid search cross validation.
# building the optimized SVD model using optimal hyperparameter search
svd_algo_optimized = SVD(n_epochs=30, lr_all=0.01, reg_all=0.2)
# training the algorithm on the trainset
svd_algo_optimized.fit(trainset)
# predicting ratings for the testset
predictions = svd_algo_optimized.test(testset)
# computing RMSE
accuracy.rmse(predictions)
RMSE: 0.8955
0.8954630064689425
Let us now predict the rating for the user with userId=4 and for movieId=10 with the optimized model:
svd_algo_optimized.predict(4, 10, r_ui=4, verbose=True)
user: 4 item: 10 r_ui = 4.00 est = 3.99 {'was_impossible': False}
Prediction(uid=4, iid=10, r_ui=4, est=3.987473187515993, details={'was_impossible': False})
The predicted rating for a user with userId=4 and for movieId=10 is 3.99.
Let's also predict the rating for movieId=3, which this user has not interacted with before:
svd_algo_optimized.predict(4, 3, verbose=True)
user: 4 item: 3 r_ui = None est = 3.61 {'was_impossible': False}
Prediction(uid=4, iid=3, r_ui=None, est=3.61423730206565, details={'was_impossible': False})
Finally, below we get the top 5 recommendations for userId=4 using the optimized SVD model:
get_recommendations(rating, 4, 5, svd_algo_optimized)
[(1192, 5), (926, 4.950016862333721), (1948, 4.946539035975806), (3310, 4.945746618737897), (116, 4.930743682667831)]
Below we compare the predicted ratings with the actual ratings for movies a user has already watched. This will help us understand how good our predictions are compared to the actual ratings provided by users.
def predict_already_interacted_ratings(data, user_id, algo):
# creating an empty list to store the recommended movie ids
recommendations = []
# creating a user-item interactions matrix
user_item_interactions_matrix = data.pivot(index='userId', columns='movieId', values='rating')
# extracting those movie ids which the user_id has interacted already
interacted_movies = user_item_interactions_matrix.loc[user_id][user_item_interactions_matrix.loc[user_id].notnull()].index.tolist()
# looping through each of the movie id which user_id has interacted already
for item_id in interacted_movies:
# extracting actual ratings
actual_rating = user_item_interactions_matrix.loc[user_id, item_id]
# predicting the rating for each movie this user has already rated
predicted_rating = algo.predict(user_id, item_id).est
# appending the predicted ratings
recommendations.append((item_id, actual_rating, predicted_rating))
# sorting the predicted ratings in descending order
recommendations.sort(key=lambda x: x[1], reverse=True)
return pd.DataFrame(recommendations, columns=['movieId', 'actual_rating', 'predicted_rating']) # returning actual vs predicted ratings for this user
Here we compare the predicted ratings from the similarity-based recommendation system against the actual ratings for userId=7:
predicted_ratings_for_interacted_movies = predict_already_interacted_ratings(rating, 7, similarity_algo_optimized_item)
df = predicted_ratings_for_interacted_movies.melt(id_vars='movieId', value_vars=['actual_rating', 'predicted_rating'])
sns.displot(data=df, x='value', hue='variable', kde=True);
Below we compare the predicted ratings from the matrix factorization based recommendation system against the actual ratings for userId=7:
predicted_ratings_for_interacted_movies = predict_already_interacted_ratings(rating, 7, svd_algo_optimized)
df = predicted_ratings_for_interacted_movies.melt(id_vars='movieId', value_vars=['actual_rating', 'predicted_rating'])
sns.displot(data=df, x='value', hue='variable', kde=True);
# instantiating Reader scale with expected rating scale
reader = Reader(rating_scale=(0, 5))
# loading the rating dataset
data = Dataset.load_from_df(rating[['userId', 'movieId', 'rating']], reader)
# splitting the data into train and test dataset
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
RMSE is not the only metric we can use here. We can also examine two fundamental measures, precision and recall, evaluated at a cutoff k, i.e. over each user's top-k recommendations. Below we compute precision@k and recall@k for various values of k.
To know more about precision recall in Recommendation systems refer to these links :
https://surprise.readthedocs.io/en/stable/FAQ.html
https://medium.com/@m_n_malaeb/recall-and-precision-at-k-for-recommender-systems-618483226c54
#this function is adapted from the FAQ section of the surprise documentation
def precision_recall_at_k(predictions, k=10, threshold=3.5):
"""Return precision and recall at k metrics for each user"""
# First map the predictions to each user.
user_est_true = defaultdict(list)
for uid, _, true_r, est, _ in predictions:
user_est_true[uid].append((est, true_r))
precisions = dict()
recalls = dict()
for uid, user_ratings in user_est_true.items():
# Sort user ratings by estimated value
user_ratings.sort(key=lambda x: x[0], reverse=True)
# Number of relevant items
n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
# Number of recommended items in top k
n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
# Number of relevant and recommended items in top k
n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
for (est, true_r) in user_ratings[:k])
# Precision@K: Proportion of recommended items that are relevant
# When n_rec_k is 0, Precision is undefined. We here set it to 0.
precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
# Recall@K: Proportion of relevant items that are recommended
# When n_rel is 0, Recall is undefined. We here set it to 0.
recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
return precisions, recalls
#A basic cross-validation iterator.
kf = KFold(n_splits=5)
# Make list of k values
K = [5, 10]
# Make list of models
models = [algo_knn_user, similarity_algo_optimized_user, algo_knn_item, similarity_algo_optimized_item, algo_svd, svd_algo_optimized]
for k in K:
for model in models:
print('> k={}, model={}'.format(k,model.__class__.__name__))
p = []
r = []
for trainset, testset in kf.split(data):
model.fit(trainset)
predictions = model.test(testset, verbose=False)
precisions, recalls = precision_recall_at_k(predictions, k=k, threshold=3.5)
# Precision and recall can then be averaged over all users
p.append(sum(prec for prec in precisions.values()) / len(precisions))
r.append(sum(rec for rec in recalls.values()) / len(recalls))
print('-----> Precision: ', round(sum(p) / len(p), 3))
print('-----> Recall: ', round(sum(r) / len(r), 3))
> k=5, model=KNNBasic -----> Precision: 0.768 -----> Recall: 0.413
> k=5, model=KNNBasic -----> Precision: 0.772 -----> Recall: 0.418
> k=5, model=KNNBasic -----> Precision: 0.647 -----> Recall: 0.338
> k=5, model=KNNBasic -----> Precision: 0.73 -----> Recall: 0.39
> k=5, model=SVD -----> Precision: 0.754 -----> Recall: 0.382
> k=5, model=SVD -----> Precision: 0.746 -----> Recall: 0.386
> k=10, model=KNNBasic -----> Precision: 0.748 -----> Recall: 0.547
> k=10, model=KNNBasic -----> Precision: 0.752 -----> Recall: 0.556
> k=10, model=KNNBasic -----> Precision: 0.633 -----> Recall: 0.472
> k=10, model=KNNBasic -----> Precision: 0.708 -----> Recall: 0.542
> k=10, model=SVD -----> Precision: 0.736 -----> Recall: 0.52
> k=10, model=SVD -----> Precision: 0.732 -----> Recall: 0.526
7.1) The baseline user-user model algo_knn_user had a slightly lower RMSE than the baseline item-item model algo_knn_item: 0.9925 versus 0.9964.
The predictions for userId=4 and movieId=10 were 3.62 and 4.42 respectively, so for this particular prediction the user-user model was closer to the actual rating of 4.
Also, the item-item model seems to over-predict while the user-user model under-predicts.
7.2) The baseline models have a higher RMSE than the tuned models. For userId=4 and movieId=10, however, the baseline user-user model's prediction (3.62) was closer to the actual rating than the tuned model's (3.58). The item-item models behaved the opposite way: the tuned model's prediction was closer to the actual rating than the baseline model's.
7.3) The difference between the matrix factorization model and the similarity-based collaborative filtering models is that matrix factorization represents each user and movie in terms of latent features (such as genre) inferred from past ratings, whereas the similarity-based models make predictions from related users or related items directly.
The respective RMSE values for the algorithms:

| Model | RMSE |
|---|---|
| algo_knn_user | 0.9925 |
| similarity_algo_optimized_user | 0.9871 |
| algo_knn_item | 0.9964 |
| similarity_algo_optimized_item | 0.9495 |
| algo_svd | 0.9023 |
| svd_algo_optimized | 0.8955 |
The precision and recall for all models is shown in Question 6. The precision is the highest for the baseline user-user model with k = 5. The precision is the lowest with the baseline item-item model with k=10. The recall is lowest with the baseline item-item model with k=5. The recall is highest with the baseline user-user model with k=10.
7.4) On RMSE, the tuned and SVD models did improve performance: each tuned model beat its baseline, and the optimized SVD model achieved the lowest RMSE overall (0.8955). On precision and recall@k the picture is more mixed, with the user-user KNN models slightly ahead of SVD. So while the baselines occasionally predicted individual ratings (e.g. for userId=4 and movieId=10) more closely, by RMSE the optimized SVD model performs best on the dataset as a whole.
In this case study, we saw three different ways of building recommendation systems: rank-based (using average ratings), similarity-based collaborative filtering, and matrix factorization (SVD).
We also discussed the advantages and disadvantages of these recommendation systems and when to use each kind. Once we build these recommendation systems, we can use A/B testing to measure their effectiveness.
Here is an article explaining how Amazon uses A/B testing to measure the effectiveness of its recommendation systems.