import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
from IPython.display import HTML
import IPython
from matplotlib.animation import FuncAnimation
from time import time
from tqdm.notebook import tqdm
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss
import tempfile
import shutil
%matplotlib inline
set_matplotlib_formats('svg')
!python --version

Python 3.9.2


from sklearn.exceptions import ConvergenceWarning
import warnings

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning)


data = sns.load_dataset('penguins')
data = data.dropna()
data.dtypes

species               object
island                object
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
dtype: object


sns.pairplot(data=data, hue='species')

<seaborn.axisgrid.PairGrid at 0x10e6bda30>


peng = lambda x: 'Chinstrap' if x == 'Chinstrap' else 'Other'
data['Penguin'] = data['species'].apply(peng)


rs = np.random.RandomState(34)
test = rs.choice(data.index, len(data) // 4)
train = data.index[~data.index.isin(test)]
data.loc[train, 'subset'] = 'Train'
data.loc[test, 'subset'] = 'Test'
data_train = data[data['subset'] == 'Train']
data_test = data[data['subset'] == 'Test']
data.groupby('subset').count()[['species']]


blue_colors = sns.color_palette("Paired", n_colors=2)
sns.scatterplot(data=data, x='bill_depth_mm', y='bill_length_mm',
                hue='Penguin', palette=blue_colors, style='subset',
                style_order=['Train', 'Test'])

<AxesSubplot:xlabel='bill_depth_mm', ylabel='bill_length_mm'>


clf = SGDClassifier(max_iter=50, tol=None, loss='log', alpha=0.0,
                    learning_rate='constant', eta0=0.001, random_state=33,
                    verbose=0, warm_start=True)

X = data_train[['bill_depth_mm', 'bill_length_mm']].values
y, labels = pd.factorize(data_train['Penguin'])
Y = np.expand_dims(y, axis=1)

X_test = data_test[['bill_depth_mm', 'bill_length_mm']].values
y_test, _ = pd.factorize(data_test['Penguin'])
Y_test = np.expand_dims(y_test, axis=1)

clf.fit(X, y)
classes = clf.classes_
clf.coef_, clf.intercept_, clf.n_iter_, clf.classes_

(array([[-0.11750903,  0.05547049]]), array([-0.24530203]), 50, array([0, 1]))


clf.fit(X, y)
clf.n_iter_, clf.coef_, clf.t_

(50, array([[-0.11041536,  0.05812061]]), 12751.0)


def combine_coefs(clf):
    coef = np.append(clf.intercept_, clf.coef_)
    coef = np.expand_dims(coef, axis=0).T
    return coef
combine_coefs(clf)

array([[-0.48528539],
       [-0.11041536],
       [ 0.05812061]])


decision_function = lambda X, coef: X @ coef
predict_proba_lr = lambda X, coef: 1. / (1. + np.exp(-decision_function(X, coef)))
predict_proba_lr([[1, 2, 3]], combine_coefs(clf))

array([[0.37010783]])


def add_bias(X):
    n, p = X.shape
    bias = np.ones((n, 1))
    X = np.hstack([bias, X])
    return X
X.shape, add_bias(X).shape

((255, 2), (255, 3))


def get_losses(coef):
    pred = predict_proba_lr(add_bias(X), coef)
    loss = log_loss(y, pred)
    
    pred_test = predict_proba_lr(add_bias(X_test), coef)
    loss_test = log_loss(y_test, pred_test)
    return [loss, loss_test]
get_losses(combine_coefs(clf))

[0.7374884266002778, 0.7569569316061243]


def get_acc(pred, Y):
    class_pred = np.array(pred > 0.5).astype(int)
    acc = (class_pred == Y).sum() / len(Y)
    return acc

def get_accs(coef):
    pred = predict_proba_lr(add_bias(X), coef)
    pred_test = predict_proba_lr(add_bias(X_test), coef)
    return get_acc(pred, Y), get_acc(pred_test, Y_test)
get_accs(combine_coefs(clf))

(0.6196078431372549, 0.6538461538461539)


max_iter = 50
iterations = 30000
n_fits = iterations // max_iter
clf = SGDClassifier(max_iter=max_iter, tol=None, loss='log', alpha=0.0005,
                    learning_rate='constant', eta0=0.001, random_state=33,
                    verbose=0, warm_start=True)

history = np.array([])
pbar = tqdm(range(n_fits))
curr_iter = 0
for i in pbar:
    clf.fit(X, y)
    
    coef = combine_coefs(clf)
    loss = get_losses(coef)
    acc = get_accs(coef)
    curr_iter = curr_iter + clf.n_iter_
    history = np.append(history, {
        'coef': combine_coefs(clf),
        'loss': get_losses(coef),
        'acc': get_accs(coef),
        'iteration': curr_iter
    })
    pbar.set_description(f'loss={loss[0]:.3f}')


coefs = [hist['coef'] for hist in history]
losses = [hist['loss'] for hist in history]
accs = [hist['acc'] for hist in history]
iters = [hist['iteration'] for hist in history]


df1 = pd.DataFrame(accs, columns=['Train', 'Test'])
df1 = df1.melt(value_name='Acc')

df2 = pd.DataFrame(losses, columns=['Train', 'Test'])
df2['iteration'] = iters
df2 = df2.melt(id_vars='iteration', value_name='Loss', var_name='Subset')

df = df1.join(df2)
df


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 3.5))
sns.lineplot(data=df, x='iteration', y='Acc', hue='Subset', ax=ax1)
sns.lineplot(data=df, x='iteration', y='Loss', ax=ax2)
plt.suptitle('Logistic Regression penguin classification accuracy/loss')
plt.savefig('images/logistic-metrics.svg')


def apply_over_grid(X, clf, *args):
    xaxis = np.linspace(X[:, 0].min(), X[:, 0].max(), 100)
    yaxis = np.linspace(X[:, 1].min(), X[:, 1].max(), 100)
    xx, yy = np.meshgrid(xaxis, yaxis)
    zz = np.dstack([xx, yy])
    ax1, ax2, ax3 = zz.shape
    X = zz.reshape((ax1 * ax2, ax3))
    zz = clf(X, *args)
    zz = zz.reshape((ax1, ax2))
    return xx, yy, zz

xx, yy, zz = apply_over_grid(X, lambda X: X.sum(axis=1))
X.shape, xx.shape, yy.shape, zz.shape

((255, 2), (100, 100), (100, 100), (100, 100))


def decision_region(coef, ax):
    predictor = lambda X, coef: predict_proba_lr(add_bias(X), coef)
    vals = data[['bill_depth_mm', 'bill_length_mm']].values
    xx, yy, zz = apply_over_grid(vals, predictor, coef)
    plt.contourf(xx, yy, zz, alpha=0.7,
                           levels=[0.495, 0.505], colors=['red'])
    plt.contourf(xx, yy, zz, alpha=0.4, cmap='Blues')
    sns.scatterplot(data=data_train,
                    x='bill_depth_mm', y='bill_length_mm',
                    hue='Penguin', palette=blue_colors, ax=ax)
    sns.scatterplot(data=data_test,
                    x='bill_depth_mm', y='bill_length_mm',
                    hue='Penguin', palette=blue_colors, ax=ax,
                    legend=False,
                    style='subset', style_order=['Train', 'Test'])
    
fig = plt.figure()
ax = fig.gca()
decision_region(coefs[-1], ax)
plt.colorbar()
f"loss={losses[-1]}, acc={accs[-1]}"

'loss=[0.3044399298663325, 0.29442757356266686], acc=(0.8823529411764706, 0.8974358974358975)'


folder = tempfile.mkdtemp()
print(f'Saving images to {folder}')

Saving images to /var/folders/34/923qqr696yz_ybx0dl171fn80000gn/T/tmpld_47z24


import time

frames = 150
iter_right_lim = 15000
n_fits_show = iter_right_lim // max_iter
intervals = n_fits_show // frames

pbar = tqdm(total=frames)

iters = np.array(iters)
idxes = np.argwhere(iters <= iter_right_lim)
idxes = np.squeeze(idxes)
indexes = idxes[:-2]
indexes = indexes[np.arange(0, len(indexes), step=intervals)]
idxes = np.append(idxes[-1], indexes)
def create_frame(frame, ax, coefs):
    ax.cla()
    i = idxes[frame]
    coef = coefs[i]
    loss = losses[i][0]
    loss_test = losses[i][1]
    acc = accs[i][0]
    acc_test = accs[i][1]
    iteration = iters[i]
    
    decision_region(coef, ax)

    plt.title(f'Logistic Regression fit iteration {iteration:05}/{iter_right_lim}\n'+
              f'Acc: train (⚫︎)={acc:.3f}, test (×)={acc_test:.3f}, '+
              f'loss: train={loss:.3f}')
    pbar.update()
    plt.savefig(f'{folder}/frame_{frame:03}.png')
    
fig = plt.figure()
ax = fig.gca()
animation = FuncAnimation(fig, create_frame, frames=frames,
                          fargs=(ax, coefs), interval=100)
                # Interval at 1000/100 = 10 frames per second
animated = animation.to_jshtml()
pbar.close()


!convert -background white -alpha remove -dispose Previous +antialias -layers OptimizePlus $folder/*.png ./images/logistic-fit.gif


shutil.rmtree(folder)

	variable	Acc	iteration	Subset	Loss
0	Train	0.619608	50	Train	0.742084
1	Train	0.619608	100	Train	0.737484
2	Train	0.623529	150	Train	0.732833
3	Train	0.623529	200	Train	0.728130
4	Train	0.623529	250	Train	0.723376
...	...	...	...	...	...
1195	Test	0.897436	29800	Test	0.294351
1196	Test	0.897436	29850	Test	0.294370
1197	Test	0.897436	29900	Test	0.294389
1198	Test	0.897436	29950	Test	0.294408
1199	Test	0.897436	30000	Test	0.294428

Logistic Regression¶

Author¶