import pandas as pd
from sklearn.linear_model import LogisticRegression

import re
# Characters treated as "special" when building the message features.
znaki = "$.,#!:'"


def znaki_specjalne(s, maska):
    """Count how many characters of ``s`` belong to the set ``maska``.

    Args:
        s: Text to scan.
        maska: String whose individual characters form the set to count.
            Every character is matched literally (regex metacharacters
            are escaped).

    Returns:
        Number of characters in ``s`` that occur in ``maska``; 0 when
        ``maska`` is empty.
    """
    # An empty mask would produce the pattern "[]", which the regex engine
    # rejects as an unterminated character set; nothing can match anyway.
    if not maska:
        return 0
    wzorzec = '[' + re.escape(maska) + ']'
    return len(re.findall(wzorzec, s))

import csv

# Load the labelled corpus: tab-separated, no header row, label column first.
# QUOTE_NONE makes the parser treat '"' as ordinary text. With the default
# quote handling, a message containing a double quote can swallow the lines
# that follow it and silently merge several rows into one.
# NOTE(review): assumes the file is raw tab-separated text with no CSV quoting
# (as in the UCI SMS Spam Collection) - confirm for this copy.
data = pd.read_csv(
    'Inne/SMSSpamCollection',
    header=None,
    names=["etykieta", "wiadomosc"],
    delimiter="\t",
    quoting=csv.QUOTE_NONE,
)
# Feature 1: message length in characters.
data['dlugosc'] = data['wiadomosc'].apply(len)
# Feature 2: number of special characters (see `znaki`).
# NOTE(review): uint8 assumes fewer than 256 special characters per message.
data["specjalne"] = data['wiadomosc'].apply(lambda x: znaki_specjalne(x, znaki)).astype('uint8')
# Rename the 'ham' label to 'nie-spam'; other labels are left untouched.
data['etykieta'] = data['etykieta'].replace('ham', 'nie-spam')
data.info()
# Save the extended table (labels, messages and both features) to disk.
data.to_csv('Inne/SMSSpamCollection-rozszerzony.csv', index=False, sep="\t")

from sklearn.model_selection import train_test_split

# Feature matrix: the two engineered columns; target vector: the labels.
X = data.loc[:, ['dlugosc', 'specjalne']]
y = data['etykieta']
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Train the logistic-regression classifier (fit() returns the estimator itself).
# Alternative solver that was tried here: solver='newton-cg'.
model = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
print(model)

# Evaluate the logistic-regression model on the held-out test split.
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score
y_pred = model.predict(X_test)
print("Algorytm LogisticRegression")
# Pass the class order explicitly. Without `labels`, confusion_matrix orders
# rows/columns by the sorted labels it happens to find, so the captions below
# would only match by coincidence (and the shape could differ from 2x2).
klasy = ['nie-spam', 'spam']
df = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred, labels=klasy),
    index=klasy,    # rows: true class
    columns=klasy,  # columns: predicted class
)
print("Accuracy score:", metrics.accuracy_score(y_test, y_pred))
print(df)
print("Raport klasyfikacji:")
print(classification_report(y_test, y_pred))

print("Zmiana algorytmu na Naive Bayes")
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes classifier on the same split and features.
model2 = MultinomialNB()
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)
# Pass the class order explicitly. Without `labels`, confusion_matrix orders
# rows/columns by the sorted labels it happens to find, so the captions below
# would only match by coincidence (and the shape could differ from 2x2).
klasy = ['nie-spam', 'spam']
df = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred, labels=klasy),
    index=klasy,    # rows: true class
    columns=klasy,  # columns: predicted class
)
print("Accuracy score:", metrics.accuracy_score(y_test, y_pred))
print(df)
print("Raport klasyfikacji dla Naive Bayes:")
print(classification_report(y_test, y_pred))

print("Zmiana algorytmu na SVM (Support Vector Machines)")
from sklearn.svm import SVC
# Train a support-vector classifier on the same split and features.
model3 = SVC(gamma='auto')
model3.fit(X_train, y_train)
y_pred = model3.predict(X_test)
# Pass the class order explicitly. Without `labels`, confusion_matrix orders
# rows/columns by the sorted labels it happens to find, so the captions below
# would only match by coincidence (and the shape could differ from 2x2).
klasy = ['nie-spam', 'spam']
df = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred, labels=klasy),
    index=klasy,    # rows: true class
    columns=klasy,  # columns: predicted class
)
print("Accuracy score:", metrics.accuracy_score(y_test, y_pred))
print(df)
print("Raport klasyfikacji dla SVM:")
print(classification_report(y_test, y_pred))

print("Testowanie modelu na nowych danych")
# Read the unlabelled messages (tab-delimited, no header row, one named column).
test_data = pd.read_csv(
    'Inne/nowe-dane-SMS.csv',
    header=None,
    names=["wiadomosc"],
    delimiter="\t",
)
# Derive the same two features the classifiers were trained on.
test_data = test_data.assign(
    dlugosc=lambda ramka: ramka['wiadomosc'].apply(len),
    specjalne=lambda ramka: ramka['wiadomosc']
    .apply(lambda tekst: znaki_specjalne(tekst, znaki))
    .astype('uint8'),
)
# Predict with the logistic-regression model (`model`).
y2_pred = model.predict(test_data[['dlugosc', 'specjalne']])
wyniki = pd.DataFrame({
    'etykieta': y2_pred,
    'wiadomosc': test_data['wiadomosc'],
})
# Save the predictions to disk so they can be inspected.
wyniki.to_csv('Inne/SMS-wyniki.csv', sep="\t", index=False)
print(wyniki)

