In [None]:
!pip install river

In [42]:
import collections
from river import datasets
from river import linear_model
from river import metrics
from river import evaluate
from river import preprocessing
from river import optim
from river import imblearn

In [43]:
# Load the credit card stream and tally how many samples carry each label.
X_y = datasets.CreditCard()

counts = collections.Counter(y for _, y in X_y)

# The grand total does not change inside the loop, so compute it once instead
# of re-summing the counter for every class.
total = sum(counts.values())

# Print each class with its absolute count and its share of the stream.
for c, count in counts.items():
    print(f'{c}: {count} ({count / total:.5%})')

0: 284315 (99.82725%)
1: 492 (0.17275%)


In [44]:
# Echo the dataset object so the notebook displays its description.
X_y

Credit card frauds.

The datasets contains transactions made by credit cards in September 2013 by european
cardholders. This dataset presents transactions that occurred in two days, where we have 492
frauds out of 284,807 transactions. The dataset is highly unbalanced, the positive class
(frauds) account for 0.172% of all transactions.

It contains only numerical input variables which are the result of a PCA transformation.
Unfortunately, due to confidentiality issues, we cannot provide the original features and more
background information about the data. Features V1, V2, ... V28 are the principal components
obtained with PCA, the only features which have not been transformed with PCA are 'Time' and
'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first
transaction in the dataset. The feature 'Amount' is the transaction Amount, this feature can be
used for example-dependant cost-senstive learning. Feature 'Class' is the response variable and
it tak

## Baseline model

In [45]:
# Baseline pipeline: a StandardScaler piped into a LogisticRegression, both
# with default settings and no imbalance handling.
model = preprocessing.StandardScaler() | linear_model.LogisticRegression()

# Track F1, ROC AUC, precision and recall together in one composite metric.
metric = metrics.Metrics(
    metrics=(
        metrics.F1(),
        metrics.ROCAUC(),
        metrics.Precision(),
        metrics.Recall(),
    )
)

In [46]:
# Echo the pipeline so the notebook renders its representation.
model

In [47]:
# Run river's progressive validation of `model` over the `X_y` stream while
# updating `metric`; the returned value is displayed as the cell output.
evaluate.progressive_val_score(X_y, model, metric)

F1: 79.51%, ROCAUC: 89.11%, Precision: 88.53%, Recall: 72.15%

## Resampling Methods

In [48]:
# Undersampling: the classifier is wrapped in a RandomUnderSampler that is
# asked for the class mix given in `desired_dist` (80% label 0, 20% label 1).
model = (
    preprocessing.StandardScaler() |
    imblearn.RandomUnderSampler(
        classifier=linear_model.LogisticRegression(),
        desired_dist={0: .8, 1: .2},
        seed=42,  # fixed so the random resampling, and thus the scores, are repeatable
    )
)

# Fresh composite metric so scores from earlier experiments do not carry over.
metric = metrics.Metrics(metrics=(metrics.F1(),metrics.ROCAUC(),metrics.Precision(),metrics.Recall()))

In [49]:
# Echo the pipeline so the notebook renders its representation.
model

In [50]:
# Run river's progressive validation of `model` over the `X_y` stream while
# updating `metric`; the returned value is displayed as the cell output.
evaluate.progressive_val_score(X_y, model, metric)

F1: 38.55%, ROCAUC: 95.08%, Precision: 24.97%, Recall: 84.55%

In [51]:
# Oversampling: the classifier is wrapped in a RandomOverSampler that is
# asked for the class mix given in `desired_dist` (80% label 0, 20% label 1).
model = (
    preprocessing.StandardScaler() |
    imblearn.RandomOverSampler(
        classifier=linear_model.LogisticRegression(),
        desired_dist={0: .8, 1: .2},
        seed=42,  # fixed so the random resampling, and thus the scores, are repeatable
    )
)

# Fresh composite metric so scores from earlier experiments do not carry over.
metric = metrics.Metrics(metrics=(metrics.F1(),metrics.ROCAUC(),metrics.Precision(),metrics.Recall()))

In [52]:
# Echo the pipeline so the notebook renders its representation.
model

In [53]:
# Run river's progressive validation of `model` over the `X_y` stream while
# updating `metric`; the returned value is displayed as the cell output.
evaluate.progressive_val_score(X_y, model, metric)

F1: 47.27%, ROCAUC: 91.81%, Precision: 33.36%, Recall: 81.10%

In [64]:
# Combined under- and oversampling: RandomSampler is given both the requested
# class mix (`desired_dist`) and a `sampling_rate` of 0.1.
model = (
    preprocessing.StandardScaler() |
    imblearn.RandomSampler(
        classifier=linear_model.LogisticRegression(),
        desired_dist={0: .8, 1: .2},
        sampling_rate=.1,
        seed=42,  # fixed so the random resampling, and thus the scores, are repeatable
    )
)

# Fresh composite metric so scores from earlier experiments do not carry over.
metric = metrics.Metrics(metrics=(metrics.F1(),metrics.ROCAUC(),metrics.Precision(),metrics.Recall()))

In [65]:
# Run river's progressive validation of `model` over the `X_y` stream while
# updating `metric`; the returned value is displayed as the cell output.
evaluate.progressive_val_score(X_y, model, metric)

F1: 36.32%, ROCAUC: 94.23%, Precision: 23.15%, Recall: 84.15%

## Loss weighting and imbalance-aware losses

In [56]:
# Loss weighting: log loss configured with weight_pos=5, i.e. the positive
# class is weighted 5x relative to the negative class.
model = preprocessing.StandardScaler() | linear_model.LogisticRegression(
    loss=optim.losses.Log(weight_pos=5),
)

# Fresh composite metric so scores from earlier experiments do not carry over.
metric = metrics.Metrics(
    metrics=(
        metrics.F1(),
        metrics.ROCAUC(),
        metrics.Precision(),
        metrics.Recall(),
    )
)

In [57]:
# Run river's progressive validation of `model` over the `X_y` stream while
# updating `metric`; the returned value is displayed as the cell output.
evaluate.progressive_val_score(X_y, model, metric)

F1: 77.33%, ROCAUC: 91.43%, Precision: 74.76%, Recall: 80.08%

In [58]:
# Focal loss for imbalanced classification: the logistic regression is trained
# with BinaryFocalLoss instead of its default loss.
# NOTE(review): the positional (2, 1) are presumably gamma and beta; confirm
# against the river version in use before switching to keyword arguments.
model = preprocessing.StandardScaler() | linear_model.LogisticRegression(
    loss=optim.losses.BinaryFocalLoss(2, 1),
)

# Fresh composite metric so scores from earlier experiments do not carry over.
metric = metrics.Metrics(
    metrics=(
        metrics.F1(),
        metrics.ROCAUC(),
        metrics.Precision(),
        metrics.Recall(),
    )
)

In [59]:
# Run river's progressive validation of `model` over the `X_y` stream while
# updating `metric`; the returned value is displayed as the cell output.
evaluate.progressive_val_score(X_y, model, metric)

F1: 79.12%, ROCAUC: 91.31%, Precision: 86.68%, Recall: 72.76%

## Mixing it all together

In [62]:
# Everything combined: RandomSampler (requested class mix plus a sampling rate
# of 0.1) wrapped around a logistic regression trained with BinaryFocalLoss.
model = (
    preprocessing.StandardScaler() |
    imblearn.RandomSampler(
        classifier=linear_model.LogisticRegression(loss=optim.losses.BinaryFocalLoss(2, 1)),
        desired_dist={0: .8, 1: .2},
        sampling_rate=.1,
        seed=42,  # fixed so the random resampling, and thus the scores, are repeatable
    )
)

# Fresh composite metric so scores from earlier experiments do not carry over.
metric = metrics.Metrics(metrics=(metrics.F1(),metrics.ROCAUC(),metrics.Precision(),metrics.Recall()))

In [63]:
# Run river's progressive validation of `model` over the `X_y` stream while
# updating `metric`; the returned value is displayed as the cell output.
evaluate.progressive_val_score(X_y, model, metric)

F1: 32.50%, ROCAUC: 95.97%, Precision: 20.13%, Recall: 84.35%