Imbalanced classification: credit card fraud detection

Demonstration of how to handle highly imbalanced classification problems.

Introduction

This example looks at the Kaggle Credit Card Fraud Detection dataset to demonstrate how to train a classification model on data with highly imbalanced classes. You can download the data by clicking “Download” at the link, or if you’re set up with a Kaggle API key at "~/.kaggle/kaggle.json", you can run the following:

reticulate::py_install("kaggle", pip = TRUE)
system("kaggle datasets download -d mlg-ulb/creditcardfraud")
zip::unzip("creditcardfraud.zip", files = "creditcard.csv")
First, read in the CSV data
library(tensorflow)
library(keras)
set.seed(1234)
df <- readr::read_csv("creditcard.csv")
tibble::glimpse(df)
Rows: 284,807
Columns: 31
$ Time <dbl> 0, 0, 1, 1, 2, 2, 4, 7, 7, 9, 10, 10, 10, 11, 12, 12, 12, 1…
$ V1 <dbl> -1.3598071, 1.1918571, -1.3583541, -0.9662717, -1.1582331, …
$ V2 <dbl> -0.07278117, 0.26615071, -1.34016307, -0.18522601, 0.877736…
$ V3 <dbl> 2.53634674, 0.16648011, 1.77320934, 1.79299334, 1.54871785,…
$ V4 <dbl> 1.37815522, 0.44815408, 0.37977959, -0.86329128, 0.40303393…
$ V5 <dbl> -0.33832077, 0.06001765, -0.50319813, -0.01030888, -0.40719…
$ V6 <dbl> 0.46238778, -0.08236081, 1.80049938, 1.24720317, 0.09592146…
$ V7 <dbl> 0.239598554, -0.078802983, 0.791460956, 0.237608940, 0.5929…
$ V8 <dbl> 0.098697901, 0.085101655, 0.247675787, 0.377435875, -0.2705…
$ V9 <dbl> 0.3637870, -0.2554251, -1.5146543, -1.3870241, 0.8177393, -…
$ V10 <dbl> 0.09079417, -0.16697441, 0.20764287, -0.05495192, 0.7530744…
$ V11 <dbl> -0.55159953, 1.61272666, 0.62450146, -0.22648726, -0.822842…
$ V12 <dbl> -0.61780086, 1.06523531, 0.06608369, 0.17822823, 0.53819555…
$ V13 <dbl> -0.99138985, 0.48909502, 0.71729273, 0.50775687, 1.34585159…
$ V14 <dbl> -0.31116935, -0.14377230, -0.16594592, -0.28792375, -1.1196…
$ V15 <dbl> 1.468176972, 0.635558093, 2.345864949, -0.631418118, 0.1751…
$ V16 <dbl> -0.47040053, 0.46391704, -2.89008319, -1.05964725, -0.45144…
$ V17 <dbl> 0.207971242, -0.114804663, 1.109969379, -0.684092786, -0.23…
$ V18 <dbl> 0.02579058, -0.18336127, -0.12135931, 1.96577500, -0.038194…
$ V19 <dbl> 0.40399296, -0.14578304, -2.26185710, -1.23262197, 0.803486…
$ V20 <dbl> 0.25141210, -0.06908314, 0.52497973, -0.20803778, 0.4085423…
$ V21 <dbl> -0.018306778, -0.225775248, 0.247998153, -0.108300452, -0.0…
$ V22 <dbl> 0.277837576, -0.638671953, 0.771679402, 0.005273597, 0.7982…
$ V23 <dbl> -0.110473910, 0.101288021, 0.909412262, -0.190320519, -0.13…
$ V24 <dbl> 0.06692807, -0.33984648, -0.68928096, -1.17557533, 0.141266…
$ V25 <dbl> 0.12853936, 0.16717040, -0.32764183, 0.64737603, -0.2060095…
$ V26 <dbl> -0.18911484, 0.12589453, -0.13909657, -0.22192884, 0.502292…
$ V27 <dbl> 0.133558377, -0.008983099, -0.055352794, 0.062722849, 0.219…
$ V28 <dbl> -0.021053053, 0.014724169, -0.059751841, 0.061457629, 0.215…
$ Amount <dbl> 149.62, 2.69, 378.66, 123.50, 69.99, 3.67, 4.99, 40.80, 93.…
$ Class <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
Prepare a validation set
val_idxs <- nrow(df) %>% sample.int(., ceiling(. * 0.2))
val_df <- df[val_idxs, ]
train_df <- df[-val_idxs, ]
sprintf("Number of training samples: %s", nrow(train_df))
[1] "Number of training samples: 227845"
sprintf("Number of validation samples: %s", nrow(val_df))
[1] "Number of validation samples: 56962"
Analyze class imbalance in the targets
table(train_df$Class)
0 1
227450 395
train_df$Class %>% {
  cat(sprintf(
    "Number of positive samples in training data: %s (%.2f%% of total)\n",
    sum(.), 100 * mean(.)))
}
Number of positive samples in training data: 395 (0.17% of total)
weight_for_0 <- 1 / sum(train_df$Class == 0)
weight_for_1 <- 1 / sum(train_df$Class == 1)
Normalize the data using training set statistics
feature_names <- colnames(train_df) %>% setdiff("Class")

means <- lapply(train_df[feature_names], mean)
stds <- lapply(train_df[feature_names], sd)

for (name in feature_names) {
  train_df[[name]] %<>% { (. - means[[name]]) / stds[[name]] }
  val_df[[name]] %<>% { (. - means[[name]]) / stds[[name]] }
}
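To confirm the scaling worked as intended, a quick check (not in the original output) that each training feature now has mean near 0 and standard deviation near 1:

range(sapply(train_df[feature_names], mean))  # all values should be ~0
range(sapply(train_df[feature_names], sd))    # all values should be ~1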
Build a binary classification model
model <- keras_model_sequential(input_shape = c(length(feature_names))) %>%
  layer_dense(256, activation = "relu") %>%
  layer_dense(256, activation = "relu") %>%
  layer_dropout(0.3) %>%
  layer_dense(256, activation = "relu") %>%
  layer_dropout(0.3) %>%
  layer_dense(1, activation = "sigmoid")
model
Model: "sequential"
____________________________________________________________________________
Layer (type) Output Shape Param #
============================================================================
dense_3 (Dense) (None, 256) 7936
dense_2 (Dense) (None, 256) 65792
dropout_1 (Dropout) (None, 256) 0
dense_1 (Dense) (None, 256) 65792
dropout (Dropout) (None, 256) 0
dense (Dense) (None, 1) 257
============================================================================
Total params: 139,777
Trainable params: 139,777
Non-trainable params: 0
____________________________________________________________________________
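The parameter counts can be verified by hand; a dense layer has one weight per input-unit pair plus one bias per unit:

length(feature_names) * 256 + 256  # first layer: 30 features -> 256 units = 7936
256 * 256 + 256                    # each hidden 256 -> 256 layer = 65792
256 * 1 + 1                        # final sigmoid unit = 257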
Train the model with class_weight argument
metrics <- list(
  metric_false_negatives(name = "fn"),
  metric_false_positives(name = "fp"),
  metric_true_negatives(name = "tn"),
  metric_true_positives(name = "tp"),
  metric_precision(name = "precision"),
  metric_recall(name = "recall")
)

model %>% compile(
  optimizer = optimizer_adam(1e-2),
  loss = "binary_crossentropy",
  metrics = metrics
)

class_weight <- list("0" = weight_for_0,
                     "1" = weight_for_1)
callbacks <- list(
  callback_model_checkpoint("fraud_model_at_epoch_{epoch}.h5"))

train_features <- as.matrix(train_df[feature_names])
train_targets <- as.matrix(train_df$Class)
validation_data <- list(
  as.matrix(val_df[feature_names]),
  as.matrix(val_df$Class))

model %>%
  fit(train_features, train_targets,
      validation_data = validation_data,
      class_weight = class_weight,
      batch_size = 2048, epochs = 30,
      callbacks = callbacks,
      verbose = 2)
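The checkpoint callback above writes one .h5 file per epoch, so any epoch’s weights can be restored later. A sketch (the epoch number is illustrative; here epoch 30 is the last one written):

# Restore the weights saved after a given epoch:
model <- load_model_hdf5("fraud_model_at_epoch_30.h5")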
val_pred <- model %>%
  predict(as.matrix(val_df[feature_names])) %>%
  { ifelse(. > .5, 1, 0) }

pred_correct <- val_df$Class == val_pred
cat(sprintf("Validation accuracy: %.2f", mean(pred_correct)))
Validation accuracy: 0.99
fraudulent <- val_df$Class == 1

n_fraudulent_detected <- sum(fraudulent & pred_correct)
n_fraudulent_missed <- sum(fraudulent & !pred_correct)
n_legitimate_flagged <- sum(!fraudulent & !pred_correct)
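A small reporting sketch (not in the original) that turns these counts into the headline numbers below, plus the implied precision and recall:

cat(sprintf("Fraudulent transactions detected: %d\n", n_fraudulent_detected))
cat(sprintf("Fraudulent transactions missed: %d\n", n_fraudulent_missed))
cat(sprintf("Legitimate transactions flagged: %d\n", n_legitimate_flagged))
# recall = TP / (TP + FN); precision = TP / (TP + FP)
cat(sprintf("Recall: %.3f\n",
            n_fraudulent_detected / (n_fraudulent_detected + n_fraudulent_missed)))
cat(sprintf("Precision: %.3f\n",
            n_fraudulent_detected / (n_fraudulent_detected + n_legitimate_flagged)))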
Conclusions
At the end of training, out of 56,962 validation transactions, we are:
- Correctly identifying 85 of them as fraudulent
- Missing 12 fraudulent transactions
- At the cost of incorrectly flagging 839 legitimate transactions
In the real world, one would put an even higher weight on class 1, to reflect that false negatives are more costly than false positives.
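A minimal sketch of what that could look like, assuming a hypothetical 100:1 cost ratio (the figure is illustrative, not from this example):

cost_ratio <- 100  # assumed cost of a missed fraud relative to a false alarm
class_weight <- list("0" = weight_for_0,
                     "1" = weight_for_1 * cost_ratio)
# then re-compile and re-fit the model with this class_weight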
Next time your credit card gets declined in an online purchase – this is why.