master yeoda
a long long time ago in a galaxy far far away
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
Classifying whether a login attempt is fraudulent.
1 = Yes, 0 = No
1 = Yes, 0 = No
1 = Yes, 0 = No
# Load the labelled login attempts from disk and preview the first rows.
login = pd.read_csv("login.csv")
login.head()
| | IP_location_known | number_of_attempts | browser_security | fraudulent | time_of_day_24 |
---|---|---|---|---|---|
0 | 1 | 3 | 1 | 0 | 4 |
1 | 1 | 2 | 1 | 0 | 6 |
2 | 1 | 1 | 1 | 0 | 19 |
3 | 0 | 3 | 0 | 1 | 4 |
4 | 0 | 2 | 0 | 1 | 4 |
# Dimensions of the data set as (rows, columns).
login.shape
(1000, 5)
# Class balance of the target: how many rows carry each `fraudulent` value.
login["fraudulent"].value_counts()
0 770 1 230 Name: fraudulent, dtype: int64
Data types
# Per-column dtype and non-null count summary.
login.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 IP_location_known 1000 non-null int64 1 number_of_attempts 1000 non-null int64 2 browser_security 1000 non-null int64 3 fraudulent 1000 non-null int64 4 time_of_day_24 1000 non-null int64 dtypes: int64(5) memory usage: 39.2 KB
Convert relevant columns to categorical (equivalent to factor in R).
# Columns holding coded labels rather than quantities; store them as pandas
# categoricals (the counterpart of an R factor).
categorical_columns = ["IP_location_known", "browser_security", "fraudulent"]
for column in categorical_columns:
    login[column] = login[column].astype("category")

# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): the split is not stratified on `fraudulent` — confirm that is intended.
train_df, valid_df = train_test_split(
    login,
    test_size=0.3,
    random_state=666,
)
Train the tree
# Predictors fed to the tree; `fraudulent` is the target.
feature_columns = [
    "IP_location_known",
    "number_of_attempts",
    "time_of_day_24",
    "browser_security",
]
X_train = train_df[feature_columns]
y_train = train_df["fraudulent"]

# Depth-limited tree (at most 3 levels) with a fixed seed for reproducibility.
clf = DecisionTreeClassifier(random_state=666, max_depth=3)
clf.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=3, random_state=666)
# Draw the fitted decision tree.
# feature_names is passed as a plain list instead of a pandas Index: a list is
# the documented form and is accepted by every scikit-learn release, whereas
# some releases reject an Index during parameter validation.
# class_names is derived from the fitted classifier so the labels always match
# the classes the tree was actually trained on (in `clf.classes_` order).
plt.figure(figsize=(12, 8))
plot_tree(
    clf,
    feature_names=list(X_train.columns),
    class_names=[str(label) for label in clf.classes_],
    filled=True,
)
plt.show()
# Evaluate the tree on the data it was fitted on.
y_train_pred = clf.predict(X_train)

# Rows are the true classes, columns the predicted classes.
conf_matrix_train = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
train_report = classification_report(y_true=y_train, y_pred=y_train_pred)

print(conf_matrix_train)
print(train_report)
[[493 51] [ 22 134]] precision recall f1-score support 0 0.96 0.91 0.93 544 1 0.72 0.86 0.79 156 accuracy 0.90 700 macro avg 0.84 0.88 0.86 700 weighted avg 0.91 0.90 0.90 700
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# Heat-map view of the training confusion matrix, labelled with the classes
# known to the fitted tree. `plot()` returns the display object itself.
confusion_matrix_train_display = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix_train,
    display_labels=clf.classes_,
).plot()
# Turn off grid lines so they do not cut through the matrix cells.
plt.grid(False)
# Evaluate the tree on the held-out validation rows, using the same predictors
# it was trained on.
feature_columns = [
    "IP_location_known",
    "number_of_attempts",
    "time_of_day_24",
    "browser_security",
]
X_valid = valid_df[feature_columns]
y_valid = valid_df["fraudulent"]
y_valid_pred = clf.predict(X_valid)

# Rows are the true classes, columns the predicted classes.
conf_matrix_valid = confusion_matrix(y_true=y_valid, y_pred=y_valid_pred)
valid_report = classification_report(y_true=y_valid, y_pred=y_valid_pred)

print(conf_matrix_valid)
print(valid_report)
[[197 29] [ 8 66]] precision recall f1-score support 0 0.96 0.87 0.91 226 1 0.69 0.89 0.78 74 accuracy 0.88 300 macro avg 0.83 0.88 0.85 300 weighted avg 0.90 0.88 0.88 300
# Heat-map view of the validation confusion matrix, labelled with the classes
# known to the fitted tree. `plot()` returns the display object itself.
confusion_matrix_valid_display = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix_valid,
    display_labels=clf.classes_,
).plot()
# Turn off grid lines so they do not cut through the matrix cells.
plt.grid(False)
# Predicted class probabilities for the validation rows: one row per login,
# one column per class (column order follows clf.classes_ per the scikit-learn docs).
y_valid_prob = clf.predict_proba(X_valid)
# Preview the first five rows.
y_valid_prob[:5]
array([[1. , 0. ], [1. , 0. ], [0.36538462, 0.63461538], [0. , 1. ], [0.79816514, 0.20183486]])
# ROC analysis on the validation set. The categorical target is passed as its
# integer codes and scored against the probability in column 1 of predict_proba.
fpr, tpr, _ = roc_curve(y_valid.cat.codes, y_valid_prob[:, 1])
roc_auc = auc(fpr, tpr)
print(f"AUC: {roc_auc}")

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f"ROC curve (area = {roc_auc:.2f})")
# Dashed diagonal: the reference line of a classifier with no skill.
roc_ax.plot([0, 1], [0, 1], "k--")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve")
roc_ax.legend(loc="lower right")
plt.show()
AUC: 0.9338973929681894
# Load the login attempts to be scored and preview the first rows.
new_login = pd.read_csv("new_login.csv")
new_login.head()
| | IP_location_known | number_of_attempts | browser_security | time_of_day_24 |
---|---|---|---|---|
0 | 0 | 2 | 1 | 17 |
1 | 1 | 3 | 0 | 3 |
2 | 0 | 1 | 0 | 5 |
Convert relevant columns to categorical.
# Score the new login attempts with the fitted tree.

# Cast the categorical predictors the same way as the training data. The
# target is excluded by name rather than by slicing off the last list entry,
# so this stays correct if the order of `categorical_columns` ever changes.
predictor_categoricals = [col for col in categorical_columns if col != "fraudulent"]
new_login[predictor_categoricals] = new_login[predictor_categoricals].astype("category")

# Build the feature frame once so predict and predict_proba are guaranteed to
# see the same columns in the same order.
feature_columns = [
    "IP_location_known",
    "number_of_attempts",
    "time_of_day_24",
    "browser_security",
]
new_features = new_login[feature_columns]
new_login_pred = clf.predict(new_features)
new_login_prob = clf.predict_proba(new_features)

# Attach the predicted label and the probability taken from column 1 of
# predict_proba (the second entry of clf.classes_).
new_login["Prediction"] = new_login_pred
new_login["Probability"] = new_login_prob[:, 1]
new_login
| | IP_location_known | number_of_attempts | browser_security | time_of_day_24 | Prediction | Probability |
---|---|---|---|---|---|---|
0 | 0 | 2 | 1 | 17 | 0 | 0.000000 |
1 | 1 | 3 | 0 | 3 | 0 | 0.201835 |
2 | 0 | 1 | 0 | 5 | 1 | 0.741935 |