#!/usr/bin/env python
# coding: utf-8

# 
# Kody źródłowe do książki: Python. Uczenie maszynowe w przykładach
# 
# Rozdział 2.: Tworzenie systemu rekomendacji filmów na bazie naiwnego klasyfikatora Bayesa
# 
# Autor: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)
# 

# # Budowanie systemu rekomendacyjnego na bazie klasyfikatora Bayesa

import numpy as np
import pandas as pd


# Read the raw ratings file. The separator '::' is longer than one character,
# which is why the pure-Python parser engine is requested explicitly.
data_path = 'ml-1m/ratings.dat'
df = pd.read_csv(data_path, sep='::', header=None, engine='python')
df.columns = ['user_id', 'movie_id', 'rating', 'timestamp']
print(df)


# Count the distinct users and movies that appear in the ratings.
n_users, n_movies = (df[column].nunique() for column in ('user_id', 'movie_id'))
print(f"Liczba użytkowników: {n_users}")
print(f"Liczba filmów: {n_movies}")


def load_user_rating_data(df, n_users, n_movies):
    """
    Build a dense user-by-movie rating matrix from the raw ratings dataframe.

    Rows are addressed by ``user_id - 1``, so user ids are expected to be the
    integers 1..n_users. Movie ids may be arbitrary: every distinct id gets
    the next free column index, in order of first appearance. Cells that
    never receive a rating keep the value 0; when a (user, movie) pair
    occurs more than once, the last rating wins.

    @param df: raw dataframe with 'user_id', 'movie_id' and 'rating' columns
    @param n_users: number of users (rows of the matrix)
    @param n_movies: number of rated movies (columns of the matrix)
    @return: rating matrix as a numpy array of shape [n_users, n_movies];
             movie_id_mapping, {movie_id: column index in the rating matrix}
    """
    ratings = np.zeros((n_users, n_movies), dtype=np.intc)
    movie_id_mapping = {}
    records = zip(df['user_id'], df['movie_id'], df['rating'])
    for raw_user, movie, score in records:
        # User ids are 1-based in the input; matrix rows are 0-based.
        row = int(raw_user) - 1
        # Reuse the movie's column if already known, otherwise claim the next one.
        col = movie_id_mapping.setdefault(movie, len(movie_id_mapping))
        ratings[row, col] = score
    return ratings, movie_id_mapping

# Pivot the long ratings table into a dense user x movie matrix.
data, movie_id_mapping = load_user_rating_data(df, n_users, n_movies)


# Distribution of the values stored in the matrix.
values, counts = np.unique(data, return_counts=True)
for value, count in zip(values, counts):
    print(f'Liczba ocen {value}: {count}')


# Number of ratings per movie.
print(df['movie_id'].value_counts())


# Use one movie as the prediction target: its ratings become the labels and
# every other movie column is a feature.
target_movie_id = 2858
target_column = movie_id_mapping[target_movie_id]
Y_raw = data[:, target_column]
X_raw = np.delete(data, target_column, axis=1)

# Keep only the users whose target-movie rating is greater than 0.
rated_target = Y_raw > 0
X = X_raw[rated_target]
Y = Y_raw[rated_target]

print('Kształt X:', X.shape)
print('Kształt Y:', Y.shape)


# Binarize the labels: ratings above the threshold become 1, the rest 0.
recommend = 3
Y = (Y > recommend).astype(Y.dtype)

n_pos = (Y == 1).sum()
n_neg = (Y == 0).sum()
print(f'Liczba pozytywnych próbek: {n_pos}, negatywnych {n_neg}.')


# Hold out 20% of the samples for testing; the seed is fixed at 42.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)
print(Y_train.shape[0], Y_test.shape[0])


# Multinomial naive Bayes with smoothing factor 1.0 and class priors
# learned from the training data.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=1.0, fit_prior=True).fit(X_train, Y_train)


# Per-class probabilities for the first ten test samples.
prediction_prob = clf.predict_proba(X_test)
print(prediction_prob[:10])

# Predicted class labels for the first ten test samples.
prediction = clf.predict(X_test)
print(prediction[0:10])

# Score of the fitted model on the held-out test set, shown as a percentage.
accuracy = clf.score(X_test, Y_test)
print(f'Dokładność modelu: {accuracy*100:.1f}%')


# # Ocena jakości klasyfikacji

# Confusion matrix with rows/columns ordered as [negative, positive].
from sklearn.metrics import confusion_matrix
print(confusion_matrix(Y_test, prediction, labels=[0, 1]))


from sklearn.metrics import precision_score, recall_score, f1_score

# These metrics were previously bare expressions (a notebook-export
# leftover), so a script run computed them and threw the results away.
# Print them so the values are actually reported.
print('Precyzja:', precision_score(Y_test, prediction, pos_label=1))
print('Czułość:', recall_score(Y_test, prediction, pos_label=1))
print('Miara F1 (klasa 1):', f1_score(Y_test, prediction, pos_label=1))
# F1 computed with the negative class treated as the class of interest.
print('Miara F1 (klasa 0):', f1_score(Y_test, prediction, pos_label=0))


# Per-class precision, recall and F1 in a single text report.
from sklearn.metrics import classification_report
report = classification_report(Y_test, prediction)
print(report)


# Column 1 of the predict_proba output.
pos_prob = prediction_prob[:, 1]

thresholds = np.arange(0.0, 1.1, 0.05)

# reaches[s, t] is True when sample s scores at least threshold t, i.e. the
# sample would be predicted positive at that threshold.
is_positive = Y_test == 1
reaches = pos_prob[:, np.newaxis] >= thresholds[np.newaxis, :]

# Per threshold: predicted-positive samples whose true label is 1 ...
true_pos = reaches[is_positive].sum(axis=0).tolist()
# ... and predicted-positive samples whose true label is not 1.
false_pos = reaches[~is_positive].sum(axis=0).tolist()

# Turn the raw counts into rates relative to the class sizes in the test set.
n_pos_test = (Y_test == 1).sum()
n_neg_test = (Y_test == 0).sum()
true_pos_rate = [tp / n_pos_test for tp in true_pos]
false_pos_rate = [fp / n_neg_test for fp in false_pos]


# Plot the manually computed ROC curve against the diagonal reference line.
import matplotlib.pyplot as plt
plt.figure()
lw = 2
# Both lines carry a label: plt.legend() below only lists labelled artists,
# and without any label it draws an empty legend and emits a warning.
plt.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=lw,
         label='Krzywa ROC')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--',
         label='Klasyfikator losowy')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Odsetek przypadków fałszywie pozytywnych')
plt.ylabel('Odsetek przypadków prawdziwie pozytywnych')
plt.title('Krzywa ROC')
plt.legend(loc="lower right")
plt.show()


# Area under the ROC curve, computed by scikit-learn from the same scores.
from sklearn.metrics import roc_auc_score
print(roc_auc_score(Y_test, pos_prob))


# # Strojenie modeli poprzez weryfikację krzyżową

from sklearn.model_selection import StratifiedKFold
k = 5
# No random_state here: it only has an effect together with shuffle=True, and
# scikit-learn >= 0.24 raises ValueError when it is set while shuffle is
# False. Leaving it out keeps the same unshuffled folds that older versions
# produced (they silently ignored the seed).
k_fold = StratifiedKFold(n_splits=k)

# Hyper-parameter grid: smoothing factor (alpha) x whether to learn class priors.
smoothing_factor_option = [1, 2, 3, 4, 5, 6]
fit_prior_option = [True, False]
# auc_record[alpha][fit_prior] accumulates the AUC summed over all k folds.
auc_record = {}

for train_indices, test_indices in k_fold.split(X, Y):
    X_train_k, X_test_k = X[train_indices], X[test_indices]
    Y_train_k, Y_test_k = Y[train_indices], Y[test_indices]
    for alpha in smoothing_factor_option:
        alpha_record = auc_record.setdefault(alpha, {})
        for fit_prior in fit_prior_option:
            clf = MultinomialNB(alpha=alpha, fit_prior=fit_prior)
            clf.fit(X_train_k, Y_train_k)
            prediction_prob = clf.predict_proba(X_test_k)
            pos_prob = prediction_prob[:, 1]
            auc = roc_auc_score(Y_test_k, pos_prob)
            alpha_record[fit_prior] = auc + alpha_record.get(fit_prior, 0.0)


# Report the mean AUC over the k folds for every parameter combination.
print('Wygłazanie  A priori  Pole')
for smoothing, smoothing_record in auc_record.items():
    for fit_prior, auc in smoothing_record.items():
        print(f'    {smoothing}        {fit_prior}    {auc/k:.5f}')


# Refit on the original training split with alpha=2.0 and uniform class
# priors -- presumably the best combination from the cross-validation run;
# confirm against the printed table.
clf = MultinomialNB(alpha=2.0, fit_prior=False).fit(X_train, Y_train)

# AUC of this model on the held-out test set.
pos_prob = clf.predict_proba(X_test)[:, 1]
best_auc = roc_auc_score(Y_test, pos_prob)
print('Pole pod krzywą w najlepszym modelu:', best_auc)

