# Decision trees in the information plane: compute I(T;X) and I(Y;T) for every tree
# "layer" (depth level) and compare them to deterministic information-bottleneck curves.
from IPython import get_ipython
get_ipython().run_line_magic('reset', '-sf')  # clear the workspace (soft, no confirmation)

#%%
import pandas as pd
import numpy as np
np.random.seed(seed=42)
import math
import sklearn.datasets
import sklearn.tree
import collections
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import tensorflow
import keras
from keras.datasets import mnist
import graphviz
from npeet import entropy_estimators as ee
# from scipy.stats import entropy
import matplotlib.colors as col
import matplotlib.pyplot as plt
import tikzplotlib as tikz  # tikz.save('plot.tex')
import seaborn as sns
sns.set_theme()


def generate_2D_data(k):
    """Deterministic 2D toy problem: an n x n grid labelled +1 above the diagonal,
    -1 below it, and alternating on the diagonal itself."""
    n = 2 ** k
    N = n ** 2
    print('nb. of samples = ' + str(N))
    X = []
    Y = []
    for i in np.arange(1, n + 1):
        for j in np.arange(1, n + 1):
            X.append([i, j])
            if i > j:
                Y.append(1)
            elif i < j:
                Y.append(-1)
            else:
                if i % 2 == 0:
                    Y.append(1)
                else:
                    Y.append(-1)
    X = np.array(X)
    Y = np.array(Y)
    return [X, Y]


def compute_nodes_info(X, Y, clf):
    """For every node of the fitted tree, compute its probability mass ('proba') and
    its contributions to the layer entropy H(T) ('h1') and to the conditional
    entropy H(T|Y) ('h2')."""
    N = len(X)            # nb. of samples
    labels = list(set(Y))
    C = len(labels)       # nb. of classes

    # Computing p(y)
    labels_nb = np.zeros(C)
    for i in Y:
        for c in range(C):
            if i == labels[c]:
                labels_nb[c] += 1

    # depths = get_node_depths(clf.tree_)
    n_nodes = clf.tree_.node_count

    # collecting samples from each node
    # from: https://stackoverflow.com/questions/45398737/is-there-any-way-to-get-samples-under-each-leaf-of-a-decision-tree
    samples = collections.defaultdict(list)
    dec_paths = clf.decision_path(X)
    for d, dec in enumerate(dec_paths):
        for i in range(n_nodes):
            if dec.toarray()[0][i] == 1:
                samples[i].append(d)

    df_nodes = pd.DataFrame(index=range(n_nodes), columns=['proba', 'h1', 'h2'])
    for n in range(n_nodes):
        n_samples_id = samples[n]            # id of the samples in node n
        proba_node = len(n_samples_id) / N   # probability to belong to node n

        # Computing the node's contribution to the layer's entropy H(T)
        df_nodes.loc[n, 'proba'] = proba_node
        if proba_node > 0:
            df_nodes.loc[n, 'h1'] = - proba_node * math.log2(proba_node)
        else:
            df_nodes.loc[n, 'h1'] = 0

        # Build a partition of the samples in the node by separating them by class
        class_partition = {c: [] for c in range(C)}
        for s in n_samples_id:
            for c in range(C):
                if Y[s] == labels[c]:
                    class_partition[c].append(s)
                    break

        # Computing the node's contribution to the layer's conditional entropy H(T|Y)
        h2 = 0
        for c in range(C):
            proba_node_given_y = len(class_partition[c]) / labels_nb[c]
            # print(proba_node_given_y)
            if proba_node_given_y > 0:
                h2 += - proba_node_given_y * labels_nb[c] / N * math.log2(proba_node_given_y)
        df_nodes.loc[n, 'h2'] = h2

    return df_nodes
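
# Illustrative sketch (added for clarity, not part of the experiments): how the
# per-node terms above are meant to be aggregated. Summing 'h1' over the nodes of one
# layer gives that layer's entropy H(T) = I(T;X) (the tree is a deterministic encoder
# of X), and summing 'h1' - 'h2' gives I(Y;T) = H(T) - H(T|Y). The helper below is a
# minimal usage sketch on the toy grid; the depth-2 tree is an arbitrary demo choice.
def demo_node_terms():
    X_demo, Y_demo = generate_2D_data(2)
    clf_demo = sklearn.tree.DecisionTreeClassifier(criterion="entropy", max_depth=2,
                                                   random_state=42)
    clf_demo.fit(X_demo, Y_demo)
    df_demo = compute_nodes_info(X_demo, Y_demo, clf_demo)
    # The leaves partition the samples, so their probabilities must sum to 1.
    leaf_ids = sorted(set(clf_demo.apply(X_demo)))
    print('sum of leaf probabilities =', df_demo.loc[leaf_ids, 'proba'].sum())
    return df_demo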

def get_layers(X, Y, clf):
    """Group the tree nodes by depth. A leaf above the maximal depth is propagated to
    every deeper layer, so each layer is a full partition of the samples."""
    ### from https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html
    n_nodes = clf.tree_.node_count
    children_left = clf.tree_.children_left
    children_right = clf.tree_.children_right
    feature = clf.tree_.feature
    threshold = clf.tree_.threshold

    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, 0)]  # start with the root node id (0) and its depth (0)
    while len(stack) > 0:
        # `pop` ensures each node is only visited once
        node_id, depth = stack.pop()
        node_depth[node_id] = depth

        # If the left and right child of a node is not the same we have a split node
        is_split_node = children_left[node_id] != children_right[node_id]
        # If a split node, append left and right children and depth to `stack`
        # so we can loop through them
        if is_split_node:
            stack.append((children_left[node_id], depth + 1))
            stack.append((children_right[node_id], depth + 1))
        else:
            is_leaves[node_id] = True
    ###

    K = max(node_depth)  # maximal depth of the tree
    layers = {x: [] for x in range(K + 1)}
    for n in range(n_nodes):
        d = node_depth[n]
        layers[d].append(n)
        if is_leaves[n]:
            for l in np.arange(d + 1, K + 1):
                layers[l].append(n)
    return layers


def compute_layers_info(df_nodes, layers, X, Y, clf):
    """Aggregate the per-node terms into per-layer quantities: I(T;X), H(T|Y), I(Y;T)
    and the normalised coordinates Del_G = I(Y;T)/H(Y), Del_C = I(T;X)/H(Y)."""
    K = len(layers)
    df_layers = pd.DataFrame(index=range(K),
                             columns=['I_TX', 'H(T|Y)', 'I_YT', 'Del_G', 'Del_C', 'H_Y'])
    h_y = ee.entropyd(Y)
    df_layers['H_Y'] = h_y
    for l in layers:
        I_TX = 0
        I_YT = 0
        H_TY = 0
        for n in layers[l]:
            I_TX += df_nodes.loc[n, 'h1']
            H_TY += df_nodes.loc[n, 'h2']
            I_YT += df_nodes.loc[n, 'h1'] - df_nodes.loc[n, 'h2']
        df_layers.loc[l, 'I_TX'] = I_TX
        df_layers.loc[l, 'H(T|Y)'] = H_TY
        df_layers.loc[l, 'I_YT'] = I_YT
        df_layers.loc[l, 'Del_G'] = I_YT / h_y
        df_layers.loc[l, 'Del_C'] = I_TX / h_y

    # adding the last layer (class prediction)
    last_layer = clf.predict(X)
    df_layers.loc[K, 'I_TX'] = ee.midd(last_layer, X)
    df_layers.loc[K, 'I_YT'] = ee.midd(Y, last_layer)
    df_layers.loc[K, 'Del_G'] = df_layers.loc[K, 'I_YT'] / h_y
    df_layers.loc[K, 'Del_C'] = df_layers.loc[K, 'I_TX'] / h_y
    return df_layers


def get_deterministic_IB_curve(H_input, H_output):
    """Upper boundary of the information plane for deterministic encoders,
    evaluated on a 100-point complexity scale from 0 to H_input."""
    complexity_scale = np.linspace(start=0, stop=H_input, num=100)
    IB_curve = []
    for i in complexity_scale:
        if i <= H_output:
            IB_curve.append(i)
        else:
            IB_curve.append(H_output)
    return np.array(IB_curve)
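
# For a deterministic encoder T = f(X) we have I(T;X) = H(T), so the achievable region
# is bounded above by I(Y;T) = min(I(T;X), H(Y)): the curve rises with slope 1 up to
# H(Y) and is flat afterwards. Minimal usage sketch (the 3-bit / 1-bit values below are
# arbitrary and only for illustration):
#   curve = get_deterministic_IB_curve(H_input=3.0, H_output=1.0)
#   plt.plot(np.linspace(0, 3.0, 100), curve)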

def plot_info_plane(df_layers, data_name, pruned=False, full_set=False):
    [X, Y, X_train, Y_train, X_test, Y_test, features, labels] = get_data(data_name)

    # get prediction rep
    n_layers = len(df_layers['I_YT']) - 1
    # pred_layer = [I_TX, H(T|Y), I_YT, Del_G, Del_C, H_Y] of the prediction layer
    pred_layer = np.ravel(df_layers.iloc[-1:].to_numpy())
    # get other representations
    inside_rep = df_layers.iloc[:-1]

    # Full set IB curve
    h_x = 8  # math.log2(X.shape[0])
    h_y = ee.entropyd(Y)
    IB_curve = get_deterministic_IB_curve(h_x, h_y)

    # Train set IB curve
    h_y_train = df_layers.loc[0, 'H_Y']
    IB_curve_train = get_deterministic_IB_curve(h_x, h_y_train)

    # Get the I_Y of IB_test for comparison
    h_y_test = ee.entropyd(Y_test)
    print("IB_full I_Y at : " + str(h_y))
    print("IB_train I_Y at : " + str(h_y_train))
    print("IB_test I_Y at : " + str(h_y_test))

    complexity_scale = np.linspace(start=0, stop=h_x, num=100)

    # plot the information plane
    sns.set_theme()
    fig, ax = plt.subplots()
    ax.plot(complexity_scale, IB_curve, color="b", linewidth=2.5,
            label='$IB_{full}$', zorder=2)
    ax.plot(complexity_scale, IB_curve_train, color="black", linewidth=2,
            dashes=(4, 6), label='$IB_{train}$', zorder=2)
    ax.scatter(inside_rep['I_TX'], inside_rep['I_YT'],
               c=range(len(inside_rep['I_TX'])), cmap='viridis',
               s=100, alpha=1, zorder=3)
    ax.scatter(pred_layer[0], pred_layer[2], color='darkorange', marker='*',
               s=250, alpha=1, zorder=3)
    ax.set_xlabel('$I(T;X)$', fontsize=20)  # Add an x-label to the axes.
    ax.set_ylabel('$I(Y;T)$', fontsize=20)  # Add a y-label to the axes.
    # ax.tick_params(axis='both', which='major', labelsize=18)
    ax.legend(fontsize=20, loc=4)
    plt.tight_layout()
    # tikz.save('Figures/IP/' + data_name + '_IP.tex')
    if pruned == True:
        plt.savefig('Figures/IP/' + data_name + '_IP_pruned.png')
    elif full_set == True:
        plt.savefig('Figures/IP/' + data_name + '_IP_fullset.png')
    else:
        plt.savefig('Figures/IP/' + data_name + '_IP.png')


def plot_info_plane_train_old(df_layers, data_name, badfit=False):
    [X, Y, X_train, Y_train, X_test, Y_test, features, labels] = get_data(data_name)

    n_layers = len(df_layers['I_YT']) - 1
    pred_layer = np.ravel(df_layers.iloc[-1:].to_numpy())

    # Full set IB curve
    h_x = 8
    h_y = df_layers.loc[0, 'H_Y']
    IB_curve = get_deterministic_IB_curve(h_x, h_y)
    complexity_scale = np.linspace(start=0, stop=h_x, num=100)

    # plot all layers except prediction layer
    g = sns.relplot(data=df_layers.iloc[:-1], x="I_TX", y="I_YT",
                    hue=np.arange(0, n_layers), palette="viridis", s=130, legend=False)
    # plot prediction layer
    g.ax.scatter(pred_layer[0], pred_layer[2], color='darkorange', marker='*', s=180)
    # plot IB curve
    g.ax.plot(complexity_scale, IB_curve, color="b")
    g.set_axis_labels("I(T;X)", "I(Y;T)", labelpad=10, size=15)
    g.fig.set_size_inches(6.5, 4)
    # tikz.save('Figures/' + data_name + '_IP.tex')
    if badfit == True:
        plt.savefig('Figures/IP/' + data_name + '_IP_badfit.png')
    else:
        plt.savefig('Figures/IP/' + data_name + '_IP_goodfit.png')
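
# Note: plot_info_plane_train_old() is kept for reference only; none of the experiment
# drivers below call it. The experiments use plot_info_plane_train() (next) for the
# data-fit experiments and plot_info_plane() for the generalization experiments.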

def plot_info_plane_train(df_layers, data_name, badfit=False):
    [X, Y, X_train, Y_train, X_test, Y_test, features, labels] = get_data(data_name)

    # get prediction rep
    n_layers = len(df_layers['I_YT']) - 1
    pred_layer = np.ravel(df_layers.iloc[-1:].to_numpy())
    # get other representations
    inside_rep = df_layers.iloc[:-1]

    # Full set IB curve
    h_x = 8  # math.log2(X.shape[0])
    h_y = df_layers.loc[0, 'H_Y']
    IB_curve = get_deterministic_IB_curve(h_x, h_y)
    complexity_scale = np.linspace(start=0, stop=h_x, num=100)

    # plot the information plane
    sns.set_theme()
    fig, ax = plt.subplots()
    ax.plot(complexity_scale, IB_curve, color="black", linewidth=2, dashes=(4, 6),
            label='$IB_{train}$', zorder=2)
    ax.scatter(inside_rep['I_TX'], inside_rep['I_YT'],
               c=range(len(inside_rep['I_TX'])), cmap='viridis',
               s=100, alpha=1, zorder=3)
    ax.scatter(pred_layer[0], pred_layer[2], color='darkorange', marker='*',
               s=250, alpha=1, zorder=3)
    ax.set_xlabel('$I(T;X)$', fontsize=20)  # Add an x-label to the axes.
    ax.set_ylabel('$I(Y;T)$', fontsize=20)  # Add a y-label to the axes.
    # ax.tick_params(axis='both', which='major', labelsize=18)
    ax.legend(fontsize=20, loc=4)
    plt.tight_layout()
    # tikz.save('Figures/' + data_name + '_IP.tex')
    if badfit == True:
        plt.savefig('Figures/IP/' + data_name + '_IP_badfit.png')
    else:
        plt.savefig('Figures/IP/' + data_name + '_IP_goodfit.png')


def get_data(name):
    if name == 'MNIST':
        (X_train, Y_train), (X_test, Y_test) = mnist.load_data()
        X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1] * X_train.shape[1]))
        X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1] * X_test.shape[1]))
        X = np.vstack((X_train, X_test))
        Y = np.hstack((Y_train, Y_test))
        # X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=2/3, random_state=45)
        data_features = None  # np.arange(0, X.shape[1])
        data_labels = [str(lab) for lab in range(10)]
        return [X, Y, X_train, Y_train, X_test, Y_test, data_features, data_labels]

    elif name == 'MNIST_trainIB_below':
        (X_train, Y_train), (X_test, Y_test) = mnist.load_data()
        X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1] * X_train.shape[1]))
        X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1] * X_test.shape[1]))
        X = np.vstack((X_train, X_test))
        Y = np.hstack((Y_train, Y_test))
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.999, random_state=45)
        data_features = None  # np.arange(0, X.shape[1])
        data_labels = [str(lab) for lab in range(10)]
        return [X, Y, X_train, Y_train, X_test, Y_test, data_features, data_labels]

    elif name == 'MNIST_small':
        n_train = 3000
        n_test = 1000
        (X_train, Y_train), (X_test, Y_test) = mnist.load_data()
        X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1] * X_train.shape[1]))
        X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1] * X_test.shape[1]))
        X_train = X_train[:n_train]
        Y_train = Y_train[:n_train]
        X_test = X_test[:n_test]
        Y_test = Y_test[:n_test]
        X = np.vstack((X_train, X_test))
        Y = np.hstack((Y_train, Y_test))
        data_features = None  # np.arange(0, X.shape[1])
        data_labels = [str(lab) for lab in range(10)]
        return [X, Y, X_train, Y_train, X_test, Y_test, data_features, data_labels]

    elif name == '2D_noisy':
        np.random.seed(seed=42)
        N = 150
        X = np.zeros((N * N, 2))
        X[:, 0] = np.repeat(np.arange(0, 1, 1 / N), N)
        X[:, 1] = np.concatenate([np.arange(0, 1, 1 / N)] * N)
        X = X - X.mean(axis=0)
        Z = X + np.random.normal(0, 0.06, size=X.shape)
        Y = np.zeros((N * N, 2))
        Y[np.square(Z - 0.5).sum(1) > 0.2, 0] = 1
        Y[:, 1] = 1 - Y.sum(1)
        Y = Y[:, 0]
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/7, random_state=42)
        data_features = None
        data_labels = ['Black', 'Copper']
        return [X, Y, X_train, Y_train, X_test, Y_test, data_features, data_labels]

    elif name == '2D_noisy_trainIB_below':
        np.random.seed(seed=42)
        N = 150
        X = np.zeros((N * N, 2))
        X[:, 0] = np.repeat(np.arange(0, 1, 1 / N), N)
        X[:, 1] = np.concatenate([np.arange(0, 1, 1 / N)] * N)
        X = X - X.mean(axis=0)
        Z = X + np.random.normal(0, 0.06, size=X.shape)
        Y = np.zeros((N * N, 2))
        Y[np.square(Z - 0.5).sum(1) > 0.2, 0] = 1
        Y[:, 1] = 1 - Y.sum(1)
        Y = Y[:, 0]
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.99, random_state=5)
        data_features = None
        data_labels = ['Black', 'Copper']
        return [X, Y, X_train, Y_train, X_test, Y_test, data_features, data_labels]

    elif name == '2D_noisy_trainIB_above':
        np.random.seed(seed=42)
        N = 150
        X = np.zeros((N * N, 2))
        X[:, 0] = np.repeat(np.arange(0, 1, 1 / N), N)
        X[:, 1] = np.concatenate([np.arange(0, 1, 1 / N)] * N)
        X = X - X.mean(axis=0)
        Z = X + np.random.normal(0, 0.06, size=X.shape)
        Y = np.zeros((N * N, 2))
        Y[np.square(Z - 0.5).sum(1) > 0.2, 0] = 1
        Y[:, 1] = 1 - Y.sum(1)
        Y = Y[:, 0]
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.99, random_state=1)
        data_features = None
        data_labels = ['Black', 'Copper']
        return [X, Y, X_train, Y_train, X_test, Y_test, data_features, data_labels]
    elif name == 'iris':
        iris = sklearn.datasets.load_iris()
        X = iris.data
        Y = iris.target
        data_features = iris.feature_names
        data_labels = iris.target_names
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/7, random_state=42)
        return [X, Y, X_train, Y_train, X_test, Y_test, data_features, data_labels]

    elif name == 'tennis':
        df = pd.read_csv('playTennis.csv', header=0, quotechar='"', sep=',',
                         na_values=['na', '-', '.', ''])
        df = df.dropna()
        df = df.astype('category')
        # Swap the contents of 'Class' and 'wind' (use .to_numpy() so pandas does not
        # align on column names and silently turn the swap into a no-op), then swap the
        # column names so that 'Class' ends up as the last column.
        df[['Class', 'wind']] = df[['wind', 'Class']].to_numpy()
        df = df.rename(columns={'wind': 'Class', 'Class': 'wind'})
        df = df.astype('category')
        data_features = df.columns.tolist()[:-1]
        data_labels = list(df['Class'].cat.categories)
        # outlook = [overcast:0, rain:1, sunny:2]
        df['outlook'] = df['outlook'].cat.rename_categories([0, 1, 2])
        # temperature = [cool:0, hot:1, mild:2]
        df['temperature'] = df['temperature'].cat.rename_categories([0, 1, 2])
        # humidity = [high:0, normal:1]
        df['humidity'] = df['humidity'].cat.rename_categories([0, 1])
        # wind = [strong:0, weak:1]
        df['wind'] = df['wind'].cat.rename_categories([0, 1])
        # Class = [no:0, yes:1]
        df['Class'] = df['Class'].cat.rename_categories([0, 1])
        Y = df[df.columns[-1]].to_numpy()
        X = df[df.columns[:-1]].to_numpy()
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/3, random_state=42)
        return [X, Y, X_train, Y_train, X_test, Y_test, data_features, data_labels]

    elif name == '2D':
        # NOTE: this branch returns only [X, Y] (no train/test split), so it cannot be
        # unpacked like the other branches.
        return generate_2D_data(5)


def get_best_alpha_old(clf, data_name, max_depth=None):
    # source: https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py
    [X, Y, X_train, Y_train, X_test, Y_test, features, labels] = get_data(data_name)

    # Get all effective alphas
    path = clf.cost_complexity_pruning_path(X_train, Y_train)
    ccp_alphas = path.ccp_alphas

    # find the best alpha with the test set (limit if too many values)
    if data_name == 'MNIST' and len(ccp_alphas) > 10:
        ccp_alphas = np.linspace(ccp_alphas[0], ccp_alphas[-1] / 50, 20)
    elif data_name == 'MNIST_small' and len(ccp_alphas) > 15:
        ccp_alphas = np.linspace(ccp_alphas[0], ccp_alphas[-1] / 6, 100)

    # train trees with different alpha values
    clfs = []
    k = 0
    for ccp_alpha in ccp_alphas:
        print("alpha value nb : " + str(k) + '/' + str(len(ccp_alphas)))
        k += 1
        clf = sklearn.tree.DecisionTreeClassifier(random_state=42, max_depth=max_depth,
                                                  criterion="entropy", ccp_alpha=ccp_alpha)
        clf.fit(X_train, Y_train)
        clfs.append(clf)

    train_scores = [clf.score(X_train, Y_train) for clf in clfs]
    test_scores = [clf.score(X_test, Y_test) for clf in clfs]

    fig, ax = plt.subplots()
    ax.set_xlabel("alpha")
    ax.set_ylabel("accuracy")
    ax.set_title("Accuracy vs alpha for training and testing sets")
    ax.plot(ccp_alphas, train_scores, marker='o', label="train", drawstyle="steps-post")
    ax.plot(ccp_alphas, test_scores, marker='o', label="test", drawstyle="steps-post")
    ax.legend()
    plt.show()

    best_alpha = ccp_alphas[np.argmax(test_scores)]
    return best_alpha
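
# Note on the two alpha-selection helpers: get_best_alpha_old() picks the ccp_alpha
# that maximises accuracy on the held-out test set (np.argmax(test_scores)), which
# leaks test information into model selection; get_best_alpha() below instead runs a
# 5-fold GridSearchCV on the training split and only reports test accuracy afterwards.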

def get_best_alpha(clf, data_name, max_depth=None):
    [X, Y, X_train, Y_train, X_test, Y_test, features, labels] = get_data(data_name)

    # Get all effective alphas
    path = clf.cost_complexity_pruning_path(X_train, Y_train)
    ccp_alphas = path.ccp_alphas

    # find the best alpha by cross-validation (limit the grid if too many values)
    if data_name == 'MNIST':
        ccp_alphas = np.linspace(ccp_alphas[0], ccp_alphas[-1] / 50, 20)
    elif data_name == 'MNIST_small':
        ccp_alphas = np.linspace(ccp_alphas[0], ccp_alphas[-1] / 6, 100)
    elif data_name == '2D_noisy':
        ccp_alphas = np.linspace(ccp_alphas[0], ccp_alphas[-1] / 20, 20)

    parameters = {'ccp_alpha': ccp_alphas}
    model = sklearn.tree.DecisionTreeClassifier(random_state=42, max_depth=max_depth,
                                                criterion="entropy")
    best_clf = GridSearchCV(model, parameters, n_jobs=4, verbose=4)  # default 5 k-folds
    best_clf.fit(X_train, Y_train)
    tree_model = best_clf.best_estimator_
    print(best_clf.best_score_, best_clf.best_params_)

    # accuracy on training, test and full set
    pred_train = best_clf.predict(X_train)
    pred_test = best_clf.predict(X_test)
    pred_full = best_clf.predict(X)
    print("accuracy on training set = " + str(accuracy_score(Y_train, pred_train)))
    print("accuracy on test set = " + str(accuracy_score(Y_test, pred_test)))
    print("accuracy on whole set = " + str(accuracy_score(Y, pred_full)))

    return tree_model


def fit_tree(data_name, max_depth=None, alpha=0, pruned=False, full_set=False, plot_tree=False):
    [X, Y, X_train, Y_train, X_test, Y_test, features, labels] = get_data(data_name)

    clf = sklearn.tree.DecisionTreeClassifier(random_state=42, max_depth=max_depth,
                                              criterion="entropy", ccp_alpha=alpha)
    # Check if we train the model on the full set or on the training set
    if full_set == True:
        clf.fit(X, Y)
    else:
        clf.fit(X_train, Y_train)

    # accuracy on training, test and full set
    pred_train = clf.predict(X_train)
    pred_test = clf.predict(X_test)
    pred_full = clf.predict(X)
    print("accuracy on training set = " + str(accuracy_score(Y_train, pred_train)))
    print("accuracy on test set = " + str(accuracy_score(Y_test, pred_test)))
    print("accuracy on whole set = " + str(accuracy_score(Y, pred_full)))

    # if toy dataset, plot and save the sample space and the learned partition
    if full_set == True:
        if data_name == '2D_noisy':
            N = 150
            sns.set_style("dark")
            # plot sample space
            plt.figure(figsize=(8, 8))
            plt.imshow(Y.reshape((N, N)), cmap='copper')
            plt.tick_params(labelbottom=False, labelleft=False)
            plt.savefig('Figures/partitions/' + data_name + '_space.png')
            # plot partition
            plt.figure(figsize=(8, 8))
            plt.imshow(pred_full.reshape((N, N)), cmap='copper')
            plt.tick_params(labelbottom=False, labelleft=False)
            if max_depth == None:
                plt.savefig('Figures/partitions/' + data_name + '_space_goodfit.png')
            elif max_depth == 5:
                plt.savefig('Figures/partitions/' + data_name + '_space_badfit.png')
            plt.show()

    # possible to plot the tree representation also
    if plot_tree == True:
        # save tree figure
        dot_data = sklearn.tree.export_graphviz(clf, out_file=None, filled=True,
                                                rounded=True, special_characters=True,
                                                feature_names=features, class_names=labels,
                                                leaves_parallel=True, proportion=False,
                                                rotate=True)
        graph = graphviz.Source(dot_data)
        graph.format = 'png'
        if full_set == True:
            if max_depth == None:
                graph.render('Figures/trees/' + data_name + '_tree_goodfit')
            elif max_depth == 5:
                graph.render('Figures/trees/' + data_name + '_tree_badfit')
                # graph.view('Figures/trees/' + data_name + '_tree_fullfit')
        else:
            if pruned == False:
                # graph.render('Figures/trees/' + data_name + '_tree_trainfit')
                graph.format = 'png'
                graph.render('Figures/trees/' + data_name + '_tree_trainfit')
                # graph.view('Figures/trees/' + data_name + '_tree_trainfit')
            elif pruned == True:
                # graph.render('Figures/trees/' + data_name + '_tree_trainfit_pruned')
                graph.format = 'png'
                graph.render('Figures/trees/' + data_name + '_tree_trainfit_pruned')
                # graph.view('Figures/trees/' + data_name + '_tree_trainfit_pruned')

    return clf
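
# Usage sketch (these are exactly the calls made by the experiment drivers below):
#   model = fit_tree('2D_noisy')                           # fit on the training split
#   model = fit_tree('2D_noisy', full_set=True)            # fit on the full set
#   model = fit_tree('MNIST', max_depth=5, full_set=True)  # deliberately imperfect fit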

def run_generalization_exp(data_name, experiment=0):

    # EXP I (a) : IB_train < IB_full
    if experiment == 1.1:
        # Get the modified datasets
        if data_name == '2D_noisy':
            data_name = '2D_noisy_trainIB_below'
        elif data_name == 'MNIST':
            data_name = 'MNIST_trainIB_below'
        else:
            print('can not run for this dataset')
            return 0
        # Fit a decision tree to the training set
        model = fit_tree(data_name)
        # Compute layer information content and plot it on the information plane
        [X, Y, X_train, Y_train, X_test, Y_test, features, labels] = get_data(data_name)
        df_nodes = compute_nodes_info(X_train, Y_train, model)
        layers = get_layers(X_train, Y_train, model)
        df_layers = compute_layers_info(df_nodes, layers, X_train, Y_train, model)
        plot_info_plane(df_layers, data_name)
        return df_layers

    # EXP I (b) : IB_train > IB_full
    elif experiment == 1.2:
        # Get the modified datasets
        if data_name == '2D_noisy':
            data_name = '2D_noisy_trainIB_above'
        elif data_name == 'MNIST':
            # NOTE: get_data() defines no 'MNIST_trainIB_above' branch, so this case
            # cannot currently be run on MNIST.
            data_name = 'MNIST_trainIB_above'
        else:
            print('can not run for this dataset')
            return 0
        # Fit a decision tree to the training set
        model = fit_tree(data_name)
        # Compute layer information content and plot it on the information plane
        [X, Y, X_train, Y_train, X_test, Y_test, features, labels] = get_data(data_name)
        df_nodes = compute_nodes_info(X_train, Y_train, model)
        layers = get_layers(X_train, Y_train, model)
        df_layers = compute_layers_info(df_nodes, layers, X_train, Y_train, model)
        plot_info_plane(df_layers, data_name)
        return df_layers

    # EXP II (a) : IB_train = IB_full with training set
    elif experiment == 2.1:
        # Fit a decision tree to the training set
        model = fit_tree(data_name)
        # Compute layer information content and plot it on the information plane
        [X, Y, X_train, Y_train, X_test, Y_test, features, labels] = get_data(data_name)
        df_nodes = compute_nodes_info(X_train, Y_train, model)
        layers = get_layers(X_train, Y_train, model)
        df_layers = compute_layers_info(df_nodes, layers, X_train, Y_train, model)
        plot_info_plane(df_layers, data_name)
        return df_layers

    # EXP II (b) : IB_train = IB_full with full set
    elif experiment == 2.2:
        # Fit a decision tree to the full set
        model = fit_tree(data_name, full_set=True)
        # Compute layer information content and plot it on the information plane
        [X, Y, X_train, Y_train, X_test, Y_test, features, labels] = get_data(data_name)
        df_nodes = compute_nodes_info(X, Y, model)
        layers = get_layers(X, Y, model)
        df_layers = compute_layers_info(df_nodes, layers, X, Y, model)
        plot_info_plane(df_layers, data_name, full_set=True)
        return df_layers

    # EXP III : IB_train = IB_full pruned
    elif experiment == 3:
        # Fit a decision tree to the training set
        model = fit_tree(data_name)
        # Compute layer information content and plot it on the information plane
        [X, Y, X_train, Y_train, X_test, Y_test, features, labels] = get_data(data_name)
        df_nodes = compute_nodes_info(X_train, Y_train, model)
        layers = get_layers(X_train, Y_train, model)
        df_layers = compute_layers_info(df_nodes, layers, X_train, Y_train, model)
        plot_info_plane(df_layers, data_name)

        # test multiple values of alpha
        pruned_model = get_best_alpha(model, data_name)
        # print('\n Alpha value equals to : ' + str(best_alpha))
        # if alpha is different from zero, fit a new pruned model
        # if best_alpha != 0:
        #     pruned_model = fit_tree(data_name, alpha=best_alpha, pruned=True)

        # Compute layer information content and plot it on the information plane
        df_nodes_pruned = compute_nodes_info(X_train, Y_train, pruned_model)
        layers_pruned = get_layers(X_train, Y_train, pruned_model)
        df_layers_pruned = compute_layers_info(df_nodes_pruned, layers_pruned,
                                               X_train, Y_train, pruned_model)
        plot_info_plane(df_layers_pruned, data_name, pruned=True)
        return [df_layers, df_layers_pruned]
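
# Usage sketch (see the experiment cells at the bottom of the file): experiment codes
# are 1.1 (IB_train below IB_full), 1.2 (IB_train above IB_full), 2.1 / 2.2
# (IB_train ~ IB_full, fit on the training set / full set) and 3 (pruning), e.g.
#   df_layers = run_generalization_exp('2D_noisy', 2.1)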

def run_datafit_exp(data_name, experiment=0):

    # EXP I : perfect fit
    if experiment == 1:
        # Fit a decision tree to the full set
        model = fit_tree(data_name, full_set=True)
        # Compute layer information content and plot it on the information plane
        [X, Y, X_train, Y_train, X_test, Y_test, features, labels] = get_data(data_name)
        df_nodes = compute_nodes_info(X, Y, model)
        layers = get_layers(X, Y, model)
        df_layers = compute_layers_info(df_nodes, layers, X, Y, model)
        plot_info_plane_train(df_layers, data_name, badfit=False)
        return df_layers

    # EXP II : imperfect fit
    elif experiment == 2:
        # Fit a depth-limited decision tree to the full set
        model = fit_tree(data_name, max_depth=5, full_set=True)
        # Compute layer information content and plot it on the information plane
        [X, Y, X_train, Y_train, X_test, Y_test, features, labels] = get_data(data_name)
        df_nodes = compute_nodes_info(X, Y, model)
        layers = get_layers(X, Y, model)
        df_layers = compute_layers_info(df_nodes, layers, X, Y, model)
        plot_info_plane_train(df_layers, data_name, badfit=True)
        return df_layers


#%% Exp gen 1.1 NOISY
##############################################
# Toy generalization experiments
##############################################
data = '2D_noisy'
exp = 1.1
gen_noisy_11 = run_generalization_exp(data, exp)
"""
accuracy on training set = 1.0
accuracy on test set = 0.9577553310886644
accuracy on whole set = 0.9581777777777778
IB_full I_Y at : 0.6277840849515826
IB_train I_Y at : 0.5164490482843502
IB_test I_Y at : 0.6288039502126452
"""

#%% Exp gen 1.2 NOISY
data = '2D_noisy'
exp = 1.2
gen_noisy_12 = run_generalization_exp(data, exp)
"""
accuracy on training set = 1.0
accuracy on test set = 0.94334455667789
accuracy on whole set = 0.9439111111111111
IB_full I_Y at : 0.6277840849515826
IB_train I_Y at : 0.7037900402465797
IB_test I_Y at : 0.6269561300295622
"""

#%% Exp gen 2.1 NOISY
data = '2D_noisy'
exp = 2.1
gen_noisy_21 = run_generalization_exp(data, exp)
"""
accuracy on training set = 1.0
accuracy on test set = 0.9489891135303266
accuracy on whole set = 0.9927111111111111
IB_full I_Y at : 0.6277840849515826
IB_train I_Y at : 0.6286227149906515
IB_test I_Y at : 0.6227260846252679
"""

#%% Exp gen 2.2 NOISY
data = '2D_noisy'
exp = 2.2
gen_noisy_22 = run_generalization_exp(data, exp)
"""
accuracy on training set = 1.0
accuracy on test set = 1.0
accuracy on whole set = 1.0
IB_full I_Y at : 0.6277840849515826
IB_train I_Y at : 0.6277840849515826
IB_test I_Y at : 0.6227260846252679
"""
#%% Exp gen 3 NOISY
data = '2D_noisy'
exp = 3
gen_noisy_3 = run_generalization_exp(data, exp)
"""
### non pruned :
accuracy on training set = 1.0
accuracy on test set = 0.9489891135303266
accuracy on whole set = 0.9927111111111111
IB_full I_Y at : 0.6277840849515826
IB_train I_Y at : 0.6286227149906515
IB_test I_Y at : 0.6227260846252679
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 17 tasks | elapsed: 5.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 5.8s finished
0.9626134301270417 {'ccp_alpha': 0.0010767609949358314}
accuracy on training set = 0.9640134819808142
accuracy on test set = 0.9595645412130638
accuracy on whole set = 0.9633777777777778
IB_full I_Y at : 0.6277840849515826
IB_train I_Y at : 0.6286227149906515
IB_test I_Y at : 0.6227260846252679
"""

#%% Exp gen 1.1 MNIST
##############################################
# MNIST generalization experiments
##############################################
data = 'MNIST'
exp = 1.1
gen_mnist_11 = run_generalization_exp(data, exp)
"""
accuracy on training set = 1.0
accuracy on test set = 0.4004862004862005
accuracy on whole set = 0.40108571428571427
IB_full I_Y at : 3.3198370254034137
IB_train I_Y at : 3.1383843022755262
IB_test I_Y at : 3.319855546026464
"""

#%% Exp gen 1.2 MNIST
# NOTE: requires a 'MNIST_trainIB_above' branch in get_data(), which is not defined above.
data = 'MNIST'
exp = 1.2
gen_mnist_12 = run_generalization_exp(data, exp)

#%% Exp gen 2.1 MNIST
data = 'MNIST'
exp = 2.1
gen_mnist_21 = run_generalization_exp(data, exp)
"""
accuracy on training set = 1.0
accuracy on test set = 0.8855
accuracy on whole set = 0.9836428571428572
IB_full I_Y at : 3.3198370254034137
IB_train I_Y at : 3.3198709267551885
IB_test I_Y at : 3.3194225261208263
"""

#%% Exp gen 2.2 MNIST
data = 'MNIST'
exp = 2.2
gen_mnist_22 = run_generalization_exp(data, exp)
"""
accuracy on training set = 1.0
accuracy on test set = 1.0
accuracy on whole set = 1.0
IB_full I_Y at : 3.3198370254034137
IB_train I_Y at : 3.3198370254034137
IB_test I_Y at : 3.3194225261208263
"""

#%% Exp gen 3 MNIST
data = 'MNIST'
exp = 3
gen_mnist_3, gen_mnist_3_pruned = run_generalization_exp(data, exp)
"""
accuracy on training set = 1.0
accuracy on test set = 0.8855
accuracy on whole set = 0.9836428571428572
IB_full I_Y at : 3.3198370254034137
IB_train I_Y at : 3.3198709267551885
IB_test I_Y at : 3.3194225261208263
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 17 tasks | elapsed: 3.0min
[Parallel(n_jobs=4)]: Done 90 tasks | elapsed: 13.1min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 14.2min finished
0.8728666666666666 {'ccp_alpha': 0.00033035310424870437}
accuracy on training set = 0.9258166666666666
accuracy on test set = 0.8857
accuracy on whole set = 0.9200857142857143
IB_full I_Y at : 3.3198370254034137
IB_train I_Y at : 3.3198709267551885
IB_test I_Y at : 3.3194225261208263
"""

#%% Exp fit 1 NOISY
##############################################
# Toy fit to the data experiments
##############################################
data = '2D_noisy'
exp = 1
fit_noisy_1 = run_datafit_exp(data, exp)
"""
accuracy on training set = 1.0
accuracy on test set = 1.0
accuracy on whole set = 1.0
"""

#%% Exp fit 2 NOISY
data = '2D_noisy'
exp = 2
fit_noisy_2 = run_datafit_exp(data, exp)
"""
accuracy on training set = 0.9683817524209544
accuracy on test set = 0.9748075577326802
accuracy on whole set = 0.9693
"""

#%% Exp fit 1 MNIST
##############################################
# MNIST fit to the data experiments
##############################################
data = 'MNIST'
exp = 1
fit_mnist_1 = run_datafit_exp(data, exp)
"""
accuracy on training set = 1.0
accuracy on test set = 1.0
accuracy on whole set = 1.0
"""

#%% Exp fit 2 MNIST
data = 'MNIST'
exp = 2
fit_mnist_2 = run_datafit_exp(data, exp)
"""
accuracy on training set = 0.6828166666666666
accuracy on test set = 0.6877
accuracy on whole set = 0.6835142857142857
"""
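
#%% Minimal end-to-end sketch (illustrative only, not one of the experiments above):
# compute the information-plane coordinates of every tree layer on the deterministic
# 2D grid from generate_2D_data(). The grid size (k=4) and the depth cap of 3 are
# arbitrary choices for the demo.
X_sketch, Y_sketch = generate_2D_data(4)
clf_sketch = sklearn.tree.DecisionTreeClassifier(criterion="entropy", max_depth=3,
                                                 random_state=42)
clf_sketch.fit(X_sketch, Y_sketch)
df_nodes_sketch = compute_nodes_info(X_sketch, Y_sketch, clf_sketch)
layers_sketch = get_layers(X_sketch, Y_sketch, clf_sketch)
df_layers_sketch = compute_layers_info(df_nodes_sketch, layers_sketch,
                                       X_sketch, Y_sketch, clf_sketch)
print(df_layers_sketch[['I_TX', 'I_YT', 'Del_G', 'Del_C']])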