%load_ext pycodestyle_magic


%pycodestyle_on


# Filtragem das mensagens de avisos.
import warnings
warnings.filterwarnings('ignore') 

# Manipulação de dados.
import numpy as np
import pandas as pd

# Criação de gráficos.
import matplotlib.pyplot as plt
import seaborn as sns

# Pré-processamento dos dados.
from imblearn.over_sampling import SMOTE
from scipy import stats
from sklearn.preprocessing import StandardScaler

# Algoritmos de Machine Learning.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Treinamento dos modelos.
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV, StratifiedKFold, train_test_split

# Métricas para avaliação dos modelos.
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.metrics import make_scorer, precision_score, recall_score, roc_auc_score

print('Bibliotecas carregadas com sucesso!')

3:34: W291 trailing whitespace
6:1: E402 module level import not at top of file
7:1: E402 module level import not at top of file
10:1: E402 module level import not at top of file
11:1: E402 module level import not at top of file
14:1: E402 module level import not at top of file
15:1: E402 module level import not at top of file
16:1: E402 module level import not at top of file
19:1: E402 module level import not at top of file
20:1: E402 module level import not at top of file
21:1: E402 module level import not at top of file
22:1: E402 module level import not at top of file
23:1: E402 module level import not at top of file
26:1: E402 module level import not at top of file
26:80: E501 line too long (119 > 79 characters)
29:1: E402 module level import not at top of file
30:1: E402 module level import not at top of file
30:80: E501 line too long (93 > 79 characters)
31:1: E402 module level import not at top of file
31:80: E501 line too long (85 > 79 characters)


# Versão da linguagem Python.
from platform import python_version
print('Versão da linguagem Python:', python_version())


# Versão dos pacotes.
%reload_ext watermark
%watermark --iversions

3:13: E225 missing whitespace around operator


# Notebook configuration.

# Plot defaults: figure size and line width.
from matplotlib import rcParams
rcParams.update({'figure.figsize': (15, 10),
                 'lines.linewidth': 3})

# Chart style.
plt.style.use('ggplot')

# DataFrame display: no cap on the number of columns shown.
pd.options.display.max_columns = None


# Carregando o conjunto de dados.
df = pd.read_excel('data/default of credit card clients.xls')


# Cópia do dataset.
df1 = df.copy()


# Dimensão do dataframe.
df1.shape

(30000, 25)


# Visualizando o dataframe.
df1.head()


# Informações do dataframe.
df1.info()


# Valores duplicados.
df1.duplicated().sum()

0


# Contagem de registros.
df1['PAY_1'].value_counts()

0                13402
-1                5047
1                 3261
Not available     3021
-2                2476
2                 2378
3                  292
4                   63
5                   23
8                   17
6                   11
7                    9
Name: PAY_1, dtype: int64


# Máscara booleana.
pay1_booleano = df1['PAY_1'] != 'Not available'
pay1_booleano[0:5]

0    True
1    True
2    True
3    True
4    True
Name: PAY_1, dtype: bool


# Filtrando os registros.
df1 = df1.loc[pay1_booleano, :].copy()


# Convertendo a variável.
df1['PAY_1'] = df1['PAY_1'].astype('int64')


# Contagem de registros.
df1['PAY_1'].value_counts()

 0    13402
-1     5047
 1     3261
-2     2476
 2     2378
 3      292
 4       63
 5       23
 8       17
 6       11
 7        9
Name: PAY_1, dtype: int64


# Converting the variables.

# Taiwanese dollar quote, used as the multiplier for the financial columns.
dol_tw = 0.20

# Financial variables: the credit limit followed by the six BILL_AMT and
# the six PAY_AMT columns.
fin_vars = (
    ['LIMIT_BAL']
    + [f'BILL_AMT{k}' for k in range(1, 7)]
    + [f'PAY_AMT{k}' for k in range(1, 7)]
)


# Variables before the conversion.
df1[fin_vars].head()


# Applying the conversion: every financial column is scaled by dol_tw.
df1[fin_vars] = df1[fin_vars].mul(dol_tw)


# Variables after the conversion.
df1[fin_vars].head()


# Renaming the label column to TARGET.
df1 = df1.rename(columns={'default payment next month': 'TARGET'})


# Dataframe columns.
df1.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_1',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'TARGET'],
      dtype='object')


# Customer identification variable.
# NOTE(review): the name `id` shadows Python's built-in id(); consider
# renaming it once every downstream use has been checked.
id = ['ID']


# Numeric variables: a copy of fin_vars with 'AGE' inserted at position 1,
# so fin_vars itself is left unchanged.
nums = fin_vars.copy()
nums.insert(1, 'AGE')
nums

['LIMIT_BAL',
 'AGE',
 'BILL_AMT1',
 'BILL_AMT2',
 'BILL_AMT3',
 'BILL_AMT4',
 'BILL_AMT5',
 'BILL_AMT6',
 'PAY_AMT1',
 'PAY_AMT2',
 'PAY_AMT3',
 'PAY_AMT4',
 'PAY_AMT5',
 'PAY_AMT6']


# Payment-status variables (PAY_1 .. PAY_6).
pay_sts = [f'PAY_{k}' for k in range(1, 7)]


# Categorical variables: three profile columns plus the payment statuses.
cats = ['SEX', 'EDUCATION', 'MARRIAGE'] + pay_sts


# Target variable.
target = ['TARGET']


# Bill-amount variables (BILL_AMT1 .. BILL_AMT6).
bills = [f'BILL_AMT{k}' for k in range(1, 7)]


# Payment-amount variables (PAY_AMT1 .. PAY_AMT6).
pays = [f'PAY_AMT{k}' for k in range(1, 7)]


# Registros exclusivos.
df1['ID'].nunique()

26704


# Contagem de registros.
id_counts = df1['ID'].value_counts()
id_counts.value_counts()

1    26429
2      275
Name: ID, dtype: int64


# ID's duplicados.
duple_id = id_counts == 2

# Índíces dos ID's duplicados.
duple_idx = id_counts.index[duple_id]
duple_idx

Index(['d5aeb496-64e5', '443324fb-5cfc', 'f20d8a3d-d047', '693a0664-bde6',
       '8567249b-827e', '590a776e-5049', '2189fc56-f82a', '0913d642-c5d4',
       'af1e3f79-f628', '297edb0f-3bb1',
       ...
       '4dc45e9a-27bd', 'c9826d63-f7d3', 'fc73f07e-eb96', '5f483bdb-3aaf',
       '93b2c5f7-acea', '47d9ee33-0df0', '26bde6da-f148', 'f63d8fbe-d79e',
       'dda76366-a407', 'c3ddce11-35e2'],
      dtype='object', length=275)


# ID's duplicados.
df1.loc[df1['ID'].isin(duple_idx[0:5]), :]


# Boolean dataframe.
# True = the cell equals 0, False = any other value.
df_booleano = df1.eq(0)
df_booleano.head()


# Identifying the records: a row is flagged when every column after the
# first one is zero.
# True = invalid record, False = valid record.
all_but_first = df_booleano.iloc[:, 1:]
df_zero = all_but_first.all(axis=1)

# Total number of invalid records.
df_zero.sum()

315


# Filtrando os registros.
df2 = df1.loc[~df_zero, :].copy()


# Registros únicos.
df2['ID'].nunique()

26664


# Dimensão do dataframe.
df2.shape

(26664, 25)


# Estatísticas das variáveis.
df2[nums].describe().T


# Histograms of the numeric variables.

# Sample of 1000 rows drawn with a fixed seed.
df_sample = df2.sample(1000, random_state=42)

# Resizing the plotting area.
plt.figure(figsize=(20, 10))

# Variables to plot.
features = nums

# One histogram per variable on a 5 x 3 grid. The loop iterates over
# `features` itself, so the range and the indexed list cannot drift apart.
for i, feature in enumerate(features):
    plt.subplot(5, 3, i + 1)
    sns.histplot(x=df_sample[feature], color='royalblue')
    plt.xlabel(feature)

# Spacing is adjusted once, after every subplot exists, instead of being
# recomputed on each iteration.
plt.tight_layout()


# Histogramas das variáveis.

# Filtrando os dados.
pay_zero_mask = df_sample[pays] == 0

# Plotagem.
df_sample[pays][~pay_zero_mask].apply(np.log10).hist(layout=(2, 3),
                                                     ec='w',
                                                     alpha=.7,
                                                     color='royalblue');

10:72: E703 statement ends with a semicolon


# Count plots of the categorical variables on a 3 x 3 grid.
for i, cat in enumerate(cats):
    plt.subplot(3, 3, i + 1)
    sns.countplot(x=df_sample[cat], color='royalblue', ec='w')

# Spacing is adjusted once, after every subplot exists, instead of being
# recomputed on each iteration.
plt.tight_layout()


# Handling the observed inconsistencies.
#
# The result is assigned back to the column instead of calling
# `df2[col].replace(..., inplace=True)`: that form mutates an intermediate
# Series and is not guaranteed to reach df2 (it has no effect under pandas
# Copy-on-Write).

# EDUCATION variable: values 0, 5 and 6 are recoded as 4.
df2['EDUCATION'] = df2['EDUCATION'].replace(to_replace=[0, 5, 6], value=4)

# MARRIAGE variable: value 0 is recoded as 3.
df2['MARRIAGE'] = df2['MARRIAGE'].replace(to_replace=0, value=3)


# Comparando as variáveis.
df2.loc[df2['PAY_2'] == 2, ['PAY_2', 'PAY_3']].head()


# Handling the observed inconsistencies: PAY_2 .. PAY_6 are dropped.
df2 = df2.drop(columns=[f'PAY_{k}' for k in range(2, 7)])


# Redefining the list of categorical variables.
cats2 = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_1']


# Categorical variables against the target.

# Variables to plot.
features = cats2

# One count plot per variable on a 2 x 2 grid, split by TARGET.
for i, feature in enumerate(features):
    plt.subplot(2, 2, i + 1)
    sns.countplot(data=df2, x=feature,
                  hue='TARGET', alpha=.7, palette=['red', 'green'])

# Spacing is adjusted once, after every subplot exists, instead of being
# recomputed on each iteration.
plt.tight_layout()


# Valor mínimo da variável.
df2['LIMIT_BAL'].min()

2000.0


# Valor máximo da variável.
df2['LIMIT_BAL'].max()

160000.0


# Bin edges and labels: one integer label per interval between edges.
bins = [0, 30000, 60000, 90000, 200000]
labels = list(range(1, len(bins)))


# Creating the variable: LIMIT_BAL is binned, then the categorical result
# is cast to int64.
limit_groups = pd.cut(df2['LIMIT_BAL'], bins=bins, labels=labels)
df2['GROUP_LIMIT'] = limit_groups.astype('int64')


# Viewing random samples.
df2[['LIMIT_BAL', 'GROUP_LIMIT']].sample(5)


# Contagem de registros.
df2['GROUP_LIMIT'].value_counts()

1    14510
2     8074
3     2982
4     1098
Name: GROUP_LIMIT, dtype: int64


# Grouping the data: number of distinct IDs per TARGET value.
df_group = df2.groupby('TARGET')['ID'].nunique().reset_index()
df_group

2:70: W291 trailing whitespace


# Grouping the data: distinct IDs per (TARGET, GROUP_LIMIT) pair.
ids_by_target_and_limit = df2.groupby(['TARGET', 'GROUP_LIMIT'])['ID']
df_group2 = ids_by_target_and_limit.nunique().reset_index()
df_group2

2:80: E501 line too long (87 > 79 characters)


# Concatenando os dados.
df_group3 = df_group2.merge(df_group, on='TARGET')
df_group3


# Agrupando os dados.
df_group4 = df2.groupby('GROUP_LIMIT').agg({'ID': 'nunique'}).reset_index() 
df_group4

2:76: W291 trailing whitespace


# Joining the per-(TARGET, GROUP_LIMIT) counts with the per-GROUP_LIMIT
# totals; the two ID columns come out as ID_x and ID_y.
df_group5 = pd.merge(df_group2, df_group4, on='GROUP_LIMIT')
df_group5


# Percentage column: ID_x as a share of ID_y.
share = df_group5['ID_x'].div(df_group5['ID_y'])
df_group5['Percentual(%)'] = share.mul(100)

# Renaming the columns.
df_group5.columns = [
    'Status do Cliente',
    'Categoria do Cliente',
    'Total Por Categoria',
    'Total Geral',
    'Percentual(%)',
]

# Viewing the dataframe.
df_group5


# Gráfico para a pergunta 1.

# Chart.
chart = sns.barplot(x='Categoria do Cliente', y='Percentual(%)',
                    data=df_group5, hue='Status do Cliente',
                    alpha=.7, palette=['green', 'red'])
# Título.
chart.text(x=0.5, y=95,
           s='Taxa de Inadimplência por Categoria de Clientes',
           fontsize=20, weight='bold', alpha=.75)

# Estilo e labels.
sns.set(font_scale=1.5)
sns.set_palette('prism')
chart.set_xlabel('\nCategorias', fontsize=14)
chart.set_ylabel('Percentual(%)', fontsize=14)

# Legenda.
plt.legend(loc='upper left', borderpad=1.0,
           labelspacing=1.0, fontsize=10, title='Status:');

20:59: E703 statement ends with a semicolon


# Default rate per PAY_1 status. The aggregation is named by string
# ('mean') rather than passing np.mean, which pandas has deprecated as an
# `agg` argument.
data = df2.groupby('PAY_1').agg({'TARGET': 'mean'})


# Overall default rate, broadcast to every row and also kept in tx_target.
data['Taxa Geral de Inadimplência'] = tx_target = df2['TARGET'].mean()


# Renaming the per-status column.
data.rename(columns={'TARGET': 'Taxa de Inadimplência por Status'},
            inplace=True)
data


# Chart for question 2.

# Line chart with one line per column of `data`.
chart = sns.lineplot(data=data, alpha=.7,
                     palette=['green', 'red'], linewidth=2.5)

# Title, drawn as text at data coordinates (0.40, 0.85).
title_style = dict(fontsize=20, weight='bold', alpha=.75)
chart.text(x=0.40, y=0.85,
           s='Taxa de Inadimplência por Status', **title_style)

# Style and axis labels.
sns.set(font_scale=1.5)
chart.set_xlabel('\nStatus', fontsize=14)
chart.set_ylabel('Percentual(%)', fontsize=14)

# Legend (the trailing semicolon suppresses the notebook's cell output).
legend_style = dict(loc='upper left', facecolor='w', borderpad=1.0,
                    labelspacing=1.0, fontsize=10, title='Status:')
plt.legend(**legend_style);


# Grouping the data: distinct IDs per TARGET value.
df_group6 = df2.groupby('TARGET').agg({'ID': 'nunique'}).reset_index()
df_group6


# Grouping the data: distinct IDs per (TARGET, EDUCATION) pair.
df_group7 = (df2.groupby(['TARGET', 'EDUCATION'])
             .agg({'ID': 'nunique'})
             .reset_index())
df_group7


# Joining with the per-TARGET totals. The merge uses df_group6, computed
# just above, instead of the equivalent `df_group` left over from a much
# earlier cell, so this section no longer depends on that cell having run.
df_group8 = df_group7.merge(df_group6, on='TARGET')
df_group8


# Grouping the data: distinct IDs per EDUCATION value.
df_group9 = df2.groupby('EDUCATION').agg({'ID': 'nunique'}).reset_index()
df_group9


# Joining with the per-EDUCATION totals (ID columns become ID_x / ID_y).
df_group10 = df_group7.merge(df_group9, on='EDUCATION')
df_group10


# Percentage column: ID_x as a share of ID_y.
df_group10['Percentual(%)'] = df_group10['ID_x'] / df_group10['ID_y'] * 100

# Renaming the columns.
df_group10.columns = ['Status do Cliente',
                      'Grau Educacional',
                      'Total Por Categoria',
                      'Total Geral',
                      'Percentual(%)']

# Viewing the dataframe.
df_group10


# Chart for question 3.

# Grouped bar chart: percentage per education level, split by status.
chart = sns.barplot(data=df_group10,
                    x='Grau Educacional', y='Percentual(%)',
                    hue='Status do Cliente',
                    alpha=.7, palette=['green', 'red'])

# Title, drawn as text at data coordinates (0.5, 100).
title_style = dict(fontsize=20, weight='bold', alpha=.75)
chart.text(x=0.5, y=100,
           s='Taxa de Inadimplência por Grau Educacional', **title_style)

# Style and axis labels.
sns.set(font_scale=1.5)
sns.set_palette('prism')
chart.set_xlabel('\nGrau Educacional', fontsize=14)
chart.set_ylabel('Percentual(%)', fontsize=14)

# Legend (the trailing semicolon suppresses the notebook's cell output).
legend_style = dict(loc='upper left', borderpad=1.0,
                    labelspacing=1.0, fontsize=10, title='Status:')
plt.legend(**legend_style);


def _plot_boxplots(frame, columns, n_rows, n_cols):
    """Draw one vertical boxplot per column on an n_rows x n_cols grid.

    Args:
        frame: DataFrame holding the columns to plot.
        columns: column names, plotted in order starting at grid cell 1.
        n_rows: number of subplot rows.
        n_cols: number of subplot columns.
    """
    for position, column in enumerate(columns, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.boxplot(y=frame[column], color='royalblue', orient='v')
    # Spacing is adjusted once, after every subplot exists.
    plt.tight_layout()


# Boxplots of the bill-amount variables.
features = bills
_plot_boxplots(df2, features, 2, 3)


# Boxplots of the payment-amount variables.
features = pays
_plot_boxplots(df2, features, 2, 3)


# Boxplots of LIMIT_BAL and AGE, side by side on a single row.
sub_vars = ['LIMIT_BAL', 'AGE']
features = sub_vars
_plot_boxplots(df2, features, 1, len(features))


# Variables submitted to the outlier treatment.
vars_out = pays + bills + sub_vars
vars_out


# Boolean row mask, one entry per row of df2. It starts all True (every
# record kept) and is narrowed column by column below.
registros = np.ones(len(df2), dtype=bool)


# Handling outliers: a row survives only if its absolute z-score is below 3
# in every one of the listed columns.
for col in vars_out:

    # Absolute z-score of the column.
    zscore = abs(stats.zscore(df2[col]))

    # Combine with the rows still kept from the previous columns.
    registros = (zscore < 3) & registros


# Filtering the records. The explicit .copy() makes df3 an independent
# frame, so columns assigned to df3 later are written to df3 itself and not
# to a slice of df2.
df3 = df2[registros].copy()


# Dimensão do dataframe.
df3.shape


# Visualizando o dataframe.
df3.head()


# Bin edges and labels for the age groups.
bins_age = [18, 25, 60, 100]
labels_age = [1, 2, 3]


# Creating the variable: AGE is binned, then the categorical result is
# cast to int64.
age_groups = pd.cut(df3['AGE'], bins=bins_age, labels=labels_age)
df3['GROUP_AGE'] = age_groups.astype('int64')


# Record count per age group.
df3['GROUP_AGE'].value_counts()


# Correlation matrix. Only numeric columns are used: df3 still carries the
# string 'ID' column, and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 (older versions silently dropped such columns).
corr_df = df3.select_dtypes(include='number').corr()

# Heatmap (the trailing semicolon suppresses the notebook's cell output).
sns.heatmap(corr_df,
            cmap='Blues',
            annot=False,
            fmt='.2f');


# Variables to remove.
del_vars = (
    ['LIMIT_BAL', 'AGE', 'SEX', 'MARRIAGE']
    + [f'BILL_AMT{k}' for k in range(2, 7)]
    + [f'PAY_AMT{k}' for k in range(4, 7)]
    + ['ID']
)


# Kept variables: every df3 column not listed in del_vars, in df3's order.
keep_mask = ~df3.columns.isin(del_vars)
best_vars = df3.columns[keep_mask].tolist()
best_vars


# Variable selection.
df4 = df3.loc[:, best_vars]


# Viewing the dataframe.
df4.head()


# Splitting the data: X holds every column except TARGET, y holds TARGET.
X = df4.drop(columns='TARGET')
y = df4['TARGET']


# Train/test split: 30% held out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)


# Size of each set.
print('Exemplos de Treino: {}'.format(len(X_train)))
print('Exemplos de Teste: {}'.format(len(X_test)))


# Instantiating the over-sampler. It is seeded like the split above so the
# resampled training set is the same on every run.
smt = SMOTE(random_state=42)


# Applying the balancing to the training data only.
X_train_smt, y_train_smt = smt.fit_resample(X_train, y_train)


# Shape of the resampled training features.
X_train_smt.shape


# Record count per class after resampling.
y_train_smt.value_counts()


# Instantiating the scaler.
scaler = StandardScaler()


# Standardizing the variables: the scaler is fitted on the resampled
# training data and then applied unchanged to the test data.
X_train_sc = scaler.fit_transform(X_train_smt)
X_test_sc = scaler.transform(X_test)


# Creating the function.
def classifiersTraining(features, tTarget, printMeans=True,
                        scoring='accuracy', num_folds=5):
    """Cross-validate five baseline classifiers and collect their scores.

    The models (logistic regression, LDA, k-nearest neighbours, decision
    tree and random forest, all with default hyperparameters) are each
    scored with stratified k-fold cross-validation.

    Args:
        features: feature matrix forwarded to ``cross_val_score``.
        tTarget: target labels forwarded to ``cross_val_score``.
        printMeans: when true, print ``name: mean score`` for each model.
        scoring: scoring specification forwarded to ``cross_val_score``.
        num_folds: number of stratified folds. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        tuple: ``(results, means)``. ``results`` is a DataFrame with one
        column per model and one row per fold; ``means`` is a DataFrame
        indexed by model name with a single ``'mean'`` column.
    """
    # Models under test, in reporting order.
    models = [
        ('LR', LogisticRegression()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
        ('CART', DecisionTreeClassifier()),
        ('RF', RandomForestClassifier()),
    ]

    # The same splitter configuration is used for every model.
    skf = StratifiedKFold(n_splits=num_folds)

    # Per-fold scores keyed by model name (insertion order is kept).
    fold_scores = {}
    for name, model in models:
        cv_results = cross_val_score(model, features, tTarget,
                                     cv=skf, scoring=scoring)
        fold_scores[name] = cv_results

        # Print the mean score obtained by the model.
        if printMeans:
            print('%s: %f' % (name, cv_results.mean()))

    # One column per model, one row per fold.
    results = pd.DataFrame(fold_scores)

    # The means frame is built in one step; DataFrame.append, used
    # previously to grow it row by row, was removed in pandas 2.0.
    means = pd.DataFrame(
        {'mean': [scores.mean() for scores in fold_scores.values()]},
        index=list(fold_scores))

    return (results, means)


# Avaliando os modelos.
results = classifiersTraining(features = X_train_sc, tTarget = y_train_smt)


# Instanciando o modelo.
classifierRF = RandomForestClassifier()


# Creating the function.
def classifiermodel(model, X_train, y_train):
    """Fit ``model`` and print its cross-validated training metrics.

    The model is first fitted on the full training data. Out-of-fold
    predictions are then obtained with 5-fold stratified cross-validation
    and compared with ``y_train`` to report accuracy, precision, recall,
    F1 and the confusion matrix (label order ``[1, 0]``).

    Args:
        model: estimator exposing ``fit``; also passed to
            ``cross_val_predict``.
        X_train: training features.
        y_train: training labels.

    Returns:
        None. The metrics are printed.
    """
    # Training on the whole training set.
    model = model.fit(X_train, y_train)

    # Out-of-fold predictions from stratified 5-fold cross-validation.
    folds = StratifiedKFold(n_splits=5)
    cv_pred = cross_val_predict(model, X_train, y_train, cv=folds)

    # All metrics are computed before anything is printed.
    report = [
        ('Acurácia do modelo:', accuracy_score(y_train, cv_pred)),
        ('Precision do Modelo:', precision_score(y_train, cv_pred)),
        ('Recall do modelo:', recall_score(y_train, cv_pred)),
        ('F1 score:', f1_score(y_train, cv_pred)),
    ]
    conf = confusion_matrix(y_train, cv_pred, labels=[1, 0])

    print('Modelo:', model)
    for label, score in report:
        print(label, score * 100, '%')
    print('Matriz de Confusão:\n', conf)


# Aplicando a função.
classifiermodel(classifierRF, X_train_sc, y_train_smt)


# Previsões com dados de teste.
y_pred = classifierRF.predict(X_test_sc)


# Avaliação do modelo.
print(classification_report(y_test, y_pred))


# Matriz de confusão
conf_test = confusion_matrix(y_test, y_pred, labels = [1, 0])
print('Matriz de Confusão:\n', (conf_test))


# Instantiating the model.
classifierRF_2 = RandomForestClassifier()


# Parameter grid: the second entry repeats the search with bootstrap
# disabled. 'auto' is no longer listed for max_features: scikit-learn 1.3
# removed it, and for classifiers it was an alias of 'sqrt', so it only
# duplicated grid points.
param_grid = [{'n_estimators': [100, 300, 500],
               'max_features': ['sqrt', 'log2']},
              {'bootstrap': [False],
               'n_estimators': [100, 300, 500],
               'max_features': ['sqrt', 'log2']}]


# Performance metrics evaluated for every candidate.
dic_scores = {'accuracy': make_scorer(accuracy_score),
              'recall': make_scorer(recall_score),
              'precision': make_scorer(precision_score),
              'f1': make_scorer(f1_score)}


# Instantiating the search: 5-fold CV, refitting the best model by F1.
grid_search = GridSearchCV(classifierRF_2,
                           param_grid,
                           scoring=dic_scores,
                           refit='f1',
                           cv=5)


# Busca pelos melhores hiperparâmetros.
grid_search.fit(X_train_sc, y_train_smt)


# Resultado da otimização.
pd.DataFrame(grid_search.cv_results_)[['params', 
                                       'mean_test_accuracy', 
                                       'mean_test_recall', 
                                       'mean_test_precision', 
                                       'mean_test_f1']]


# Melhor combinação.
grid_search.best_params_


# Modelo otimizado.
classifierRF_2 = grid_search.best_estimator_


# Previsões com dados de teste.
y_pred_2 = classifierRF_2.predict(X_test_sc)


# Avaliação do modelo.
print(classification_report(y_test, y_pred_2))


# Matriz de confusão.
conf_test_2 = confusion_matrix(y_test, y_pred_2, labels = [1, 0])
print('Matriz de Confusão:\n', (conf_test_2))


# ROC curve and AUC.

# Data preparation: probability of class 1 on the scaled test set, then
# the ROC points and the AUC, for each of the two models.
y_pred_proba_RF = classifierRF.predict_proba(X_test_sc)[:, 1]
fpr1, tpr1, _ = metrics.roc_curve(y_test, y_pred_proba_RF)
auc1 = metrics.roc_auc_score(y_test, y_pred_proba_RF)

y_pred_proba_RF2 = classifierRF_2.predict_proba(X_test_sc)[:, 1]
fpr2, tpr2, _ = metrics.roc_curve(y_test, y_pred_proba_RF2)
auc2 = metrics.roc_auc_score(y_test, y_pred_proba_RF2)

# Plotting: the dashed diagonal first, then one curve per model.
plt.plot([0, 1], [0, 1], 'k--')
curves = (('Modelo 1 = ', fpr1, tpr1, auc1),
          ('Modelo 2 = ', fpr2, tpr2, auc2))
for prefix, curve_fpr, curve_tpr, curve_auc in curves:
    plt.plot(curve_fpr, curve_tpr, label=prefix + str(round(curve_auc, 2)))
plt.legend(loc=4, title='Resultado', facecolor='white')
plt.xlabel('Taxa de Falsos Positivos', fontsize=14)
plt.ylabel('Taxa de Verdadeiros Positivos', fontsize=14)
plt.title('Curva ROC', size=20);


# Counseling-program parameters.

# Cost per counseling.
cost_per_counseling = 1200

# Expected effectiveness rate.
effectiveness = 0.70

# Mean BILL_AMT1 over the (unscaled) test set.
mean_bill = np.mean(X_test['BILL_AMT1'])


# Predicted class probabilities. classifierRF was fitted on standardized
# features (X_train_sc), so it is given the standardized test set here;
# feeding it the raw X_test values would score data on a different scale
# from the one the model was trained on.
predict_proba = classifierRF.predict_proba(X_test_sc)
predict_proba


# Histograma das previsões.

# Chart.
chart = sns.histplot(x = predict_proba[:,1],
                     bins = 30,
                     color = 'royalblue')

# Estilo e labels.
sns.set(font_scale = 1.5)
sns.set_palette('prism')
chart.set_xlabel('\nProbabilidades', fontsize = 14)
chart.set_ylabel('Número de Contas', fontsize = 14);


# Threshold range: 101 evenly spaced values from 0 to 1.
thresholds = np.linspace(0, 1, num=101)
thresholds


# Uninitialised result arrays, one slot per threshold, for:
#   n_pos_pred                 - accounts predicted as defaulting
#   cost_of_all_counselings    - total counseling cost
#   n_true_pos                 - true positives (predicted as defaulting
#                                and actually defaulted)
#   savings_of_all_counselings - gross savings from counseling
(n_pos_pred,
 cost_of_all_counselings,
 n_true_pos,
 savings_of_all_counselings) = (np.empty_like(thresholds) for _ in range(4))


# Filling the result arrays, one entry per threshold.

# These two do not depend on the threshold, so they are computed once.
proba_pos = predict_proba[:, 1]
is_default = y_test.astype(bool)

for counter, threshold in enumerate(thresholds):
    # Accounts whose class-1 probability exceeds the threshold.
    pos_pred = proba_pos > threshold
    n_pos_pred[counter] = pos_pred.sum()
    cost_of_all_counselings[counter] = (
        n_pos_pred[counter] * cost_per_counseling)

    # Flagged accounts whose true label is 1.
    true_pos = pos_pred & is_default
    n_true_pos[counter] = true_pos.sum()
    savings_of_all_counselings[counter] = (
        n_true_pos[counter] * mean_bill * effectiveness)


# Net savings: gross savings minus counseling cost, per threshold.
net_savings = savings_of_all_counselings - cost_of_all_counselings


# Net savings generated by each threshold value.

# Chart. x and y are passed by keyword: seaborn >= 0.12 no longer accepts
# them as positional arguments.
chart = sns.lineplot(x=thresholds,
                     y=net_savings,
                     color='royalblue',
                     alpha=.7,
                     linewidth=2.5)

# Style and axis labels (the trailing semicolon suppresses the notebook's
# cell output).
sns.set(font_scale=1.5)
chart.set_xlabel('\nThreshold (Limite)', fontsize=14)
chart.set_ylabel('Economia Gerada', fontsize=14);


# Threshold with the highest net savings.
max_savings_ix = net_savings.argmax()
thresholds[max_savings_ix]


# Net savings obtained at that threshold.
net_savings[max_savings_ix]


# Cost of defaults without the counseling program: number of test records
# labelled 1 times the mean bill.
cost_of_defaults = y_test.sum() * mean_bill
cost_of_defaults


# Cost reduction with the counseling program, as a fraction of the cost of
# defaults.
net_savings[max_savings_ix] / cost_of_defaults


# Share of accounts predicted as defaulting at each threshold (flag rate).

# Chart. x and y are passed by keyword: seaborn >= 0.12 no longer accepts
# them as positional arguments.
chart = sns.lineplot(x=thresholds,
                     y=n_pos_pred / len(y_test),
                     alpha=.7,
                     linewidth=2.5)

# Style and axis labels (the trailing semicolon suppresses the notebook's
# cell output).
sns.set(font_scale=1.5)
chart.set_xlabel('\nThreshold (Limite)', fontsize=14)
chart.set_ylabel('Flag Rate', fontsize=14);

	LIMIT_BAL	BILL_AMT1	BILL_AMT2	BILL_AMT3	BILL_AMT4	BILL_AMT5	BILL_AMT6	PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6
0	20000	3913	3102	689	0	0	0	0	689	0	0	0	0
1	120000	2682	1725	2682	3272	3455	3261	0	1000	1000	1000	0	2000
2	90000	29239	14027	13559	14331	14948	15549	1518	1500	1000	1000	1000	5000
3	50000	46990	48233	49291	28314	28959	29547	2000	2019	1200	1100	1069	1000
4	50000	8617	5670	35835	20940	19146	19131	2000	36681	10000	9000	689	679

	LIMIT_BAL	BILL_AMT1	BILL_AMT2	BILL_AMT3	BILL_AMT4	BILL_AMT5	BILL_AMT6	PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6
0	4000.0	782.6	620.4	137.8	0.0	0.0	0.0	0.0	137.8	0.0	0.0	0.0	0.0
1	24000.0	536.4	345.0	536.4	654.4	691.0	652.2	0.0	200.0	200.0	200.0	0.0	400.0
2	18000.0	5847.8	2805.4	2711.8	2866.2	2989.6	3109.8	303.6	300.0	200.0	200.0	200.0	1000.0
3	10000.0	9398.0	9646.6	9858.2	5662.8	5791.8	5909.4	400.0	403.8	240.0	220.0	213.8	200.0
4	10000.0	1723.4	1134.0	7167.0	4188.0	3829.2	3826.2	400.0	7336.2	2000.0	1800.0	137.8	135.8

	count	mean	std	min	25%	50%	75%	max
LIMIT_BAL	26664.0	33583.810981	25967.890616	2000.0	10000.00	28000.0	48000.00	160000.0
AGE	26664.0	35.505213	9.227442	21.0	28.00	34.0	41.00	79.0
BILL_AMT1	26664.0	10281.146145	14726.737421	-33116.0	716.00	4472.2	13529.95	149362.8
BILL_AMT2	26664.0	9860.000300	14186.909907	-13955.4	599.95	4230.0	12879.10	134312.6
BILL_AMT3	26664.0	9405.268009	13741.071905	-31452.8	525.45	4015.9	12072.00	171017.2
BILL_AMT4	26664.0	8667.778908	12855.050148	-34000.0	468.35	3807.4	10945.50	141372.8
BILL_AMT5	26664.0	8067.627340	12141.188817	-16266.8	349.00	3613.2	10058.10	164708.0
BILL_AMT6	26664.0	7777.974467	11886.508331	-67920.6	251.20	3401.0	9850.75	139988.8
PAY_AMT1	26664.0	1140.817154	3339.879726	0.0	200.00	422.9	1005.40	174710.4
PAY_AMT2	26664.0	1176.221970	4242.861970	0.0	160.40	401.4	1000.00	245416.4
PAY_AMT3	26664.0	1051.902993	3453.087912	0.0	78.00	364.4	911.25	177808.6
PAY_AMT4	26664.0	977.409743	3191.269874	0.0	58.95	300.0	810.10	124200.0
PAY_AMT5	26664.0	968.745995	3062.344359	0.0	48.55	300.0	816.55	85305.8
PAY_AMT6	26664.0	1051.568609	3527.093637	0.0	22.20	300.0	803.00	105733.2

	LIMIT_BAL	GROUP_LIMIT
3981	4000.0	1
18586	16000.0	1
27903	18000.0	1
9450	100000.0	4
11483	20000.0	1

	TARGET	GROUP_LIMIT	ID
0	0	1	10487
1	0	2	6731
2	0	3	2561
3	0	4	971
4	1	1	4023
5	1	2	1343
6	1	3	421
7	1	4	127

Análise de Crédito Bancário - Bank Credit Analytics

1. Entendimento do Negócio

1.1 Dicionário de Dados

1.2 Estratégia da Solução

2. Entendimento dos Dados

2.1 Bibliotecas Necessárias

2.2 Tratamento Inicial dos Dados

2.3 Análise Exploratória

2.3.1 Variável de Identificação dos Clientes

2.3.2 Variáveis Numéricas

2.3.3 Variáveis Categóricas

2.3.4 Variável Target

2.3.5 Perguntas de Negócio

2.3.6 Checando Outliers

3. Engenharia de Atributos

3.1 Extração de Variáveis (Feature Extraction)

3.2 Seleção de Variáveis (Feature Selection)

4. Pré-Processamento dos Dados

4.1 Divisão Treino/Teste

4.2 Balanceamento de Classes

4.3 Padronização dos Dados

5. Modelagem Preditiva

5.1 Seleção de Algoritmos

5.2 RandomForest - Modelo 1

5.3 RandomForest - Modelo 2

6. Programa de Aconselhamento de Clientes

7. Conclusões Finais

	ID	LIMIT_BAL	SEX	EDUCATION	MARRIAGE	AGE	PAY_1	PAY_2	PAY_3	PAY_4	PAY_5	PAY_6	BILL_AMT1	BILL_AMT2	BILL_AMT3	BILL_AMT4	BILL_AMT5	BILL_AMT6	PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6	default payment next month
0	798fc410-45c1	20000	2	2	1	24	2	2	-1	-1	-2	-2	3913	3102	689	0	0	0	0	689	0	0	0	0	1
1	8a8c8f3b-8eb4	120000	2	2	2	26	-1	2	0	0	0	2	2682	1725	2682	3272	3455	3261	0	1000	1000	1000	0	2000	1
2	85698822-43f5	90000	2	2	2	34	0	0	0	0	0	0	29239	14027	13559	14331	14948	15549	1518	1500	1000	1000	1000	5000	0
3	0737c11b-be42	50000	2	2	1	37	0	0	0	0	0	0	46990	48233	49291	28314	28959	29547	2000	2019	1200	1100	1069	1000	0
4	3b7f77cc-dbc0	50000	1	2	1	57	-1	0	-1	0	0	0	8617	5670	35835	20940	19146	19131	2000	36681	10000	9000	689	679	0

	ID	LIMIT_BAL	SEX	EDUCATION	MARRIAGE	AGE	PAY_1	PAY_2	PAY_3	PAY_4	PAY_5	PAY_6	BILL_AMT1	BILL_AMT2	BILL_AMT3	BILL_AMT4	BILL_AMT5	BILL_AMT6	PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6	TARGET
541	d5aeb496-64e5	30000.0	2	1	2	28	1	-2	-2	-2	-2	-1	0.0	0.0	0.0	0.0	0.0	63.6	0.0	0.0	0.0	0.0	63.6	0.0	0
641	d5aeb496-64e5	0.0	0	0	0	0	0	0	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
10491	8567249b-827e	100000.0	1	1	2	38	0	0	0	0	0	0	95493.6	97616.6	99767.8	82364.0	84534.2	78919.8	4200.0	4560.0	3040.0	3800.0	3200.0	2000.0	0
10591	8567249b-827e	0.0	0	0	0	0	0	0	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
19575	443324fb-5cfc	42000.0	2	1	1	31	0	0	0	0	0	0	31508.8	32087.4	25849.4	23915.8	24063.6	21424.4	1501.0	1414.4	801.8	1600.0	2000.0	2400.0	0
19675	443324fb-5cfc	0.0	0	0	0	0	0	0	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
21274	693a0664-bde6	10000.0	1	3	2	51	1	-2	-2	-2	-2	-2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1
21374	693a0664-bde6	0.0	0	0	0	0	0	0	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
27419	f20d8a3d-d047	10000.0	1	1	1	45	-1	-1	-1	-1	-1	-1	78.0	78.0	78.0	78.0	78.0	78.0	78.0	78.0	78.0	78.0	78.0	78.0	1
27519	f20d8a3d-d047	0.0	0	0	0	0	0	0	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0

	ID	LIMIT_BAL	SEX	EDUCATION	MARRIAGE	AGE	PAY_1	PAY_2	PAY_3	PAY_4	PAY_5	PAY_6	BILL_AMT1	BILL_AMT2	BILL_AMT3	BILL_AMT4	BILL_AMT5	BILL_AMT6	PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6	TARGET
0	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	True	True	True	True	False	True	True	True	True	False
1	False	False	False	False	False	False	False	False	True	True	True	False	False	False	False	False	False	False	True	False	False	False	True	False	False
2	False	False	False	False	False	False	True	True	True	True	True	True	False	False	False	False	False	False	False	False	False	False	False	False	True
3	False	False	False	False	False	False	True	True	True	True	True	True	False	False	False	False	False	False	False	False	False	False	False	False	True
4	False	False	False	False	False	False	False	True	False	True	True	True	False	False	False	False	False	False	False	False	False	False	False	False	True

	TARGET	GROUP_LIMIT	ID_x	ID_y
0	0	1	10487	20750
1	0	2	6731	20750
2	0	3	2561	20750
3	0	4	971	20750
4	1	1	4023	5914
5	1	2	1343	5914
6	1	3	421	5914
7	1	4	127	5914

	TARGET	GROUP_LIMIT	ID_x	ID_y
0	0	1	10487	14510
1	1	1	4023	14510
2	0	2	6731	8074
3	1	2	1343	8074
4	0	3	2561	2982
5	1	3	421	2982
6	0	4	971	1098
7	1	4	127	1098

	Status do Cliente	Categoria do Cliente	Total Por Categoria	Total Geral	Percentual(%)
0	0	1	10487	14510	72.274294
1	1	1	4023	14510	27.725706
2	0	2	6731	8074	83.366361
3	1	2	1343	8074	16.633639
4	0	3	2561	2982	85.881958
5	1	3	421	2982	14.118042
6	0	4	971	1098	88.433515
7	1	4	127	1098	11.566485

	Taxa de Inadimplência por Status	Taxa Geral de Inadimplência
PAY_1
-2	0.131664	0.221797
-1	0.170002	0.221797
0	0.128295	0.221797
1	0.336400	0.221797
2	0.694701	0.221797
3	0.773973	0.221797
4	0.682540	0.221797
5	0.434783	0.221797
6	0.545455	0.221797
7	0.777778	0.221797
8	0.588235	0.221797