First, the exercise:
Now let us load our standard libraries.
import numpy as np
import pandas as pd
Let us load the credit card dataset and extract a small dataframe of numerical features to test on.
# Load the UCI credit-card default dataset from disk.
big_df = pd.read_csv("UCI_Credit_Card.csv")
big_df.head()
# Compare row counts with and without NaN rows to check for missing values.
len(big_df)
len(big_df.dropna())
# Drop the row-identifier column; it carries no predictive information.
df = big_df.drop(labels = ['ID'], axis = 1)
# Separate the binary target from the feature columns.
labels = df['default.payment.next.month']
df.drop('default.payment.next.month', axis = 1, inplace = True)
# Contiguous train/test split: first 25000 rows train, remainder test.
# NOTE(review): assumes the CSV rows are not meaningfully ordered -- confirm,
# otherwise a shuffled split would be safer.
num_samples = 25000
train_x, train_y = df[0:num_samples], labels[0:num_samples]
test_x, test_y = df[num_samples:], labels[num_samples:]
test_x.head()
train_y.head()
Now let us write our transformation function.
class bin_transformer(object):
    """Binarize numeric columns by thresholding at fitted quantiles.

    Fitting computes ``num_quantiles - 1`` evenly spaced interior quantile
    levels of each column of ``df`` (e.g. ``num_quantiles=4`` gives the
    0.25/0.5/0.75 quantiles). ``transform`` then turns every
    (column, quantile) pair into a boolean feature ``column >= threshold``.
    """

    def __init__(self, df, num_quantiles = 2):
        # DataFrame indexed by quantile level; one row per interior quantile,
        # one column per input column.
        self.quantiles = df.quantile(np.linspace(1./num_quantiles, 1.-1./num_quantiles,num_quantiles-1))

    def transform(self, df):
        """Binarize ``df`` at the fitted thresholds.

        Returns
        -------
        (new, fns) : new is a DataFrame of boolean columns named
            ``<column><quantile level>``; fns maps each new column name to
            ``(source column name, callable)`` where the callable reproduces
            that boolean column from a raw DataFrame.
        """
        new = pd.DataFrame()
        fns = {}
        for col_name in df.axes[1]:
            for ix, q in self.quantiles.iterrows():
                quart = q[col_name]
                new[col_name+str(ix)] = (df[col_name] >= quart)
                # BUG FIX: bind col_name/quart as default arguments. The
                # original lambda captured the loop variables by reference
                # (late binding), so every stored function used the LAST
                # column and threshold of the loops.
                fns[col_name+str(ix)] = (col_name,
                                         lambda x, c=col_name, t=quart: x[c] >= t)
        return new, fns
# Fit the binning transformer on the full feature frame: num_quantiles=5
# yields 4 interior quantile levels (0.2, 0.4, 0.6, 0.8) per column.
transformer = bin_transformer(df,5)
# Binarize train and test with the SAME fitted thresholds.
train_x_t, tr_fns = transformer.transform(train_x)
test_x_t, test_fns = transformer.transform(test_x)
train_x_t.head()
tr_fns
Now let us build some simple loss functions for 1d labels.
def bdd_cross_entropy(pred, label):
    """Label-weighted negative log-likelihood (positive-class term only),
    averaged over samples; a tiny additive constant guards against log(0)."""
    eps = 10 ** (-20)
    return -np.mean(label * np.log(pred + eps))
def MSE(pred, label):
    """Mean squared error between predictions and labels."""
    diff = pred - label
    return np.mean(diff * diff)
def acc(pred, label):
    """Fraction of samples where thresholding pred at 0.5 agrees with label == 1."""
    predicted_positive = pred >= 0.5
    actually_positive = label == 1
    return np.mean(predicted_positive == actually_positive)
Now let us define the find split function.
def find_split(x, y, loss, verbose = False):
    """Pick the boolean column of ``x`` whose two-way split of ``y``
    minimizes the size-weighted average of ``loss`` over the two sides.

    Parameters
    ----------
    x : DataFrame of boolean columns (candidate splits), aligned with y.
    y : Series/array of labels.
    loss : callable(prediction, labels) -> float, where prediction is the
        mean label of the subset (a constant predictor per side).
    verbose : if truthy, print each column's improvement over the baseline.

    Returns
    -------
    (min_ax, min_loss) : the best column name (or None if no split beats
        predicting the global mean) and the loss achieved.
    """
    min_ax = None
    base_loss = loss(np.mean(y), y)  # loss of predicting the global mean
    min_loss = base_loss
    N = len(x)
    for col_name in x.columns:
        mask = x[col_name]
        num_pos = np.sum(mask)
        num_neg = N - num_pos
        # BUG FIX: skip degenerate splits (one side empty). The original
        # code took np.mean of an empty selection, producing NaN and a
        # RuntimeWarning; NaN losses silently never won the comparison.
        if num_pos == 0 or num_neg == 0:
            if verbose:
                print("Column {0} split is degenerate; skipped".format(col_name))
            continue
        pos_y = np.mean(y[mask])
        neg_y = np.mean(y[~mask])
        l = (num_pos*loss(pos_y, y[mask]) + num_neg*loss(neg_y, y[~mask]))/N
        if verbose:
            print("Column {0} split has improved loss {1}".format(col_name, base_loss-l))
        if l < min_loss:
            min_loss = l
            min_ax = col_name
    return min_ax, min_loss
        
# Search for the best single split under each criterion.
# NOTE(review): find_split MINIMIZES its criterion, but acc is a score
# (higher is better) -- minimizing accuracy is probably not intended; confirm.
find_split(train_x_t, train_y, MSE, verbose = True)
find_split(train_x_t, train_y, bdd_cross_entropy, verbose = 0)
find_split(train_x_t, train_y, acc, verbose = 0)
# Default rate on each side of the PAY_0 >= 0.8-quantile split
# ('PAY_00.8' = column PAY_0 binarized at its 0.8 quantile level).
np.mean(train_y[train_x_t['PAY_00.8']])
np.mean(train_y[~train_x_t['PAY_00.8']])
# Same comparison for the AGE >= 0.2-quantile split.
np.mean(train_y[train_x_t['AGE0.2']])
np.mean(train_y[~train_x_t['AGE0.2']])