# Filtragem das mensagens de avisos.
options(warn = -1)
options(dplyr.summarise.inform = FALSE)

# Carregando as bibliotecas. 
library(pacman)

pacman::p_load(caret,
               cowplot,
               dplyr,
               ggcorrplot,
               ggplot2,
               klaR,
               plotROC,
               plyr,
               ROSE,
               stringr)


# Configurações do notebook.

# Plotagens.
options(repr.plot.width = 10, 
        repr.plot.height = 6, 
        scipen = 999)

# Estilo dos gráficos.
theme_set(theme_minimal())


# Carregando o conjunto de dados.
df <- read.csv('data/Telco-Customer-Churn.csv')


# Cópia do dataset.
df1 <- df


# Dimensão do dataframe.
dim(df1)


# Visualizando o dataframe.
head(df1)


# Informações do dataframe.
str(df1)

'data.frame':	7043 obs. of  21 variables:
 $ customerID      : Factor w/ 7043 levels "0002-ORFBO","0003-MKNFE",..: 5376 3963 2565 5536 6512 6552 1003 4771 5605 4535 ...
 $ gender          : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 1 2 1 1 2 ...
 $ SeniorCitizen   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Partner         : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 1 ...
 $ Dependents      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 2 ...
 $ tenure          : int  1 34 2 45 2 8 22 10 28 62 ...
 $ PhoneService    : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 2 2 ...
 $ MultipleLines   : Factor w/ 3 levels "No","No phone service",..: 2 1 1 2 1 3 3 2 3 1 ...
 $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 1 2 2 2 1 2 1 ...
 $ OnlineSecurity  : Factor w/ 3 levels "No","No internet service",..: 1 3 3 3 1 1 1 3 1 3 ...
 $ OnlineBackup    : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 1 1 3 1 1 3 ...
 $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 3 1 1 3 1 ...
 $ TechSupport     : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 1 1 1 3 1 ...
 $ StreamingTV     : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 3 1 3 1 ...
 $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 1 1 3 1 ...
 $ Contract        : Factor w/ 3 levels "Month-to-month",..: 1 2 1 2 1 1 1 1 1 2 ...
 $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 1 2 1 ...
 $ PaymentMethod   : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 4 1 3 3 2 4 3 1 ...
 $ MonthlyCharges  : num  29.9 57 53.9 42.3 70.7 ...
 $ TotalCharges    : num  29.9 1889.5 108.2 1840.8 151.7 ...
 $ Churn           : Factor w/ 2 levels "No","Yes": 1 1 2 1 2 2 1 1 2 1 ...


# Convertendo a variável.
df1$SeniorCitizen <- as.factor(df1$SeniorCitizen)


# Verificando registros ausentes.
colSums(is.na(df1))


# Removendo registros ausentes.
df1 <- df1[complete.cases(df1), ]


# Verificando registros duplicados.
table(duplicated(df1))

FALSE 
 7032


# Verificando registros únicos.
sapply(df1, function(x) {length(unique(x))})


# Substituindo os valores "No phone service".
df1$MultipleLines <- as.factor(mapvalues(df1$MultipleLines, from = c('No phone service'), to = c('No')))


# Substituindo os valores "No internet service".

# Índices das variáveis.
cols_recode <- c(10:15)

# Tratando os registros das variáveis.
for(i in 1: ncol(df1[, cols_recode])){
    df1[, cols_recode][,i] <- as.factor(mapvalues(df1[, cols_recode][,i], from = c('No internet service'), to = c('No')))}


# Churn por características dos clientes.
plot_grid(ggplot(df1, aes(x = gender, fill = Churn)) + geom_bar(position = 'fill') + labs(y = '%'),
          ggplot(df1, aes(x = SeniorCitizen, fill = Churn)) + geom_bar(position = 'fill') + labs(y = '%'),
          ggplot(df1, aes(x = Partner, fill = Churn)) + geom_bar(position = 'fill') + labs(y = '%'),
          ggplot(df1, aes(x = Dependents, fill = Churn)) + geom_bar(position = 'fill') + labs(y = '%') +
          scale_x_discrete(labels = function(x) str_wrap(x, width = 10)), align = 'h')


# Churn por informações das contas dos clientes.
plot_grid(ggplot(df1, aes(x = Contract, fill = Churn)) + geom_bar(position = 'fill') + labs(y = '%'), 
          ggplot(df1, aes(x = PaperlessBilling, fill = Churn)) + geom_bar(position = 'fill') + labs(y = '%'),
          ggplot(df1, aes(x = PaymentMethod, fill = Churn)) + geom_bar(position = 'fill') + labs(y = '%') +
          scale_x_discrete(labels = function(x) str_wrap(x, width = 10)), align = 'h')


# Churn por serviços da empresa.
plot_grid(ggplot(df1, aes(x = PhoneService, fill = Churn)) + geom_bar(position = 'fill') + labs(y = '%'), 
          ggplot(df1, aes(x = MultipleLines, fill = Churn)) + geom_bar(position = 'fill') + labs(y = '%'),
          ggplot(df1, aes(x = InternetService, fill = Churn)) + geom_bar(position = 'fill') + labs(y = '%'),
          ggplot(df1, aes(x = OnlineSecurity, fill = Churn)) + geom_bar(position = 'fill') + labs(y = '%'),
          ggplot(df1, aes(x = OnlineBackup, fill = Churn)) + geom_bar(position = 'fill') + labs(y = '%'),
          ggplot(df1, aes(x = DeviceProtection, fill = Churn)) + geom_bar(position = 'fill') + labs(y = '%'),
          ggplot(df1, aes(x = TechSupport, fill = Churn)) + geom_bar(position = 'fill') + labs(y = '%'),
          ggplot(df1, aes(x = StreamingTV, fill = Churn)) + geom_bar(position = 'fill') + labs(y = '%'),
          ggplot(df1, aes(x = StreamingMovies, fill = Churn)) + geom_bar(position = 'fill') + labs(y = '%') + 
          scale_x_discrete(labels = function(x) str_wrap(x, width = 10)), align = 'h')


# Churn total de clientes.
df1 %>%
group_by(Churn) %>% 
dplyr::summarise(Number = n()) %>%
mutate(Porcentagem = prop.table(Number) * 100) %>% 
ggplot(aes(Churn, Porcentagem)) + 
geom_col(aes(fill = Churn), show.legend = FALSE) +
labs(title = 'Churn de Clientes', y = NULL) +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label = sprintf('%.2f%%', Porcentagem)), vjust = -0.5, size = 4)


# Churn por tempo de contrato e pagamentos realizados.
plot_grid(ggplot(df1, aes(x = tenure, fill = Churn)) + geom_histogram(bins = 30), 
          ggplot(df1, aes(x = MonthlyCharges, fill = Churn)) + geom_histogram(bins = 30), 
          ggplot(df1, aes(x = TotalCharges, fill = Churn)) + geom_histogram(bins = 30))


# Pagamento mensal por provedor de internet.
ggplot(df1, aes(x = MonthlyCharges, fill = InternetService)) + geom_histogram(bins = 30) + labs(x = 'Pagamento Mensal')


# Criando a função.
group_tenure <- function(tenure){
    if (tenure >= 0 & tenure <= 12){
        return(1)
    }else if(tenure > 12 & tenure <= 24){
        return(2)
    }else if (tenure > 24 & tenure <= 48){
        return(3)
    }else if (tenure > 48 & tenure <= 60){
        return(4)
    }else if (tenure > 60){
        return(5)
    }}


# Criando nova variável.
df1$tenure_group <- sapply(df1$tenure, group_tenure)

# Convertendo a variável.
df1$tenure_group <- as.factor(df1$tenure_group)


# Level's da variável.
levels(df1$tenure_group)


# Contagem de registros.
table(df1$tenure_group)

   1    2    3    4    5 
2175 1024 1594  832 1407


# Churn por grupos de clientes.
df1 %>%
ggplot(aes(x = tenure_group, fill = Churn)) + 
geom_bar(position = 'fill') + 
labs(x = 'Grupos de Clientes',
     y = '%',
     title = 'Churn por Grupos de Clientes') + 
theme(plot.title = element_text(hjust = 0.5))


# Análise de correlação.
corr <- round(cor(df1[, c('tenure', 'MonthlyCharges', 'TotalCharges')]), 1)

# Plotagem.
ggcorrplot(corr, 
           hc.order = TRUE, 
           type = 'lower',
           outline.col = 'white',
           ggtheme = ggplot2::theme_gray, 
           lab = TRUE)
colors = c('#6D9EC1', 'white', '#E46726')


# Seleção de variáveis.
df2 <- subset(df1, select = -c(customerID, gender, tenure, PhoneService, MultipleLines, TotalCharges))


# Visualizando o dataframe.
head(df2)


# Divindo os dados.
set.seed(42)
ind <- createDataPartition(df2$Churn, p = .70, list = F)

# Conjuntos de treino e teste.
train <- df2[ind, ]
test  <- df2[-ind, ]


# Dimensão do conjunto de treino.
dim(train)


# Proporção da variável target.
prop.table(table(train$Churn))

       No       Yes 
0.7341592 0.2658408


# Balanceando as classes.
train <- ovun.sample(Churn ~ ., data = train, method = 'over')$data


# Dimensão do conjunto de treino.
dim(train)


# Proporção da variável target.
prop.table(table(train$Churn))

       No       Yes 
0.4988959 0.5011041


# Variáveis multiclasse.

# One Hot Enconding.
dummies_model <- dummyVars(Churn ~ InternetService + Contract + PaymentMethod, data = train)

# Aplicando a transformação. 
trainData <- data.frame(predict(dummies_model, newdata = train))

# Adicionando as demais variáveis.
trainData <- cbind(trainData, train)

# Removendo as variáveis transformadas.
trainData <- subset(trainData, select = -c(InternetService, Contract, PaymentMethod))


# Variáveis binárias.

# Índices das variáveis.
cols_recode <- c(12:20)

# Tratando os registros das variáveis.
for(i in 1: ncol(trainData[, cols_recode])){
    trainData[, cols_recode][,i] <- as.factor(mapvalues(trainData[, cols_recode][,i], from = c('No', 'Yes'), to = c('0', '1')))}


# Variáveis multiclasse.

# One Hot Enconding.
testData <- data.frame(predict(dummies_model, test))

# Adicionando as demais variáveis.
testData <- cbind(testData, test)

# Removendo as variáveis transformadas.
testData <- subset(testData, select = -c(InternetService, Contract, PaymentMethod))


# Variáveis binárias.

# Índices das variáveis.
cols_recode <- c(12:20)

# Tratando os registros das variáveis.
for(i in 1: ncol(testData[, cols_recode])){
    testData[, cols_recode][,i] <- as.factor(mapvalues(testData[, cols_recode][,i], from = c('No', 'Yes'), to = c('0', '1')))}


# Conjunto de treino.
range_model <- preProcess(trainData, method = 'range')

# Aplicando a transformação. 
trainData <- data.frame(predict(range_model, newdata = trainData))


# Conjunto de teste.
testData <- predict(range_model, newdata = testData)


# Conjunto de treino.
str(trainData)

'data.frame':	7246 obs. of  23 variables:
 $ InternetService.DSL                    : num  1 1 1 0 1 1 0 0 0 0 ...
 $ InternetService.Fiber.optic            : num  0 0 0 1 0 0 0 1 1 0 ...
 $ InternetService.No                     : num  0 0 0 0 0 0 1 0 0 1 ...
 $ Contract.Month.to.month                : num  1 0 0 1 0 1 0 0 1 0 ...
 $ Contract.One.year                      : num  0 1 1 0 1 0 0 1 0 1 ...
 $ Contract.Two.year                      : num  0 0 0 0 0 0 1 0 0 0 ...
 $ PaymentMethod.Bank.transfer..automatic.: num  0 0 1 0 1 0 0 0 0 0 ...
 $ PaymentMethod.Credit.card..automatic.  : num  0 0 0 1 0 0 1 1 0 0 ...
 $ PaymentMethod.Electronic.check         : num  1 0 0 0 0 0 0 0 1 0 ...
 $ PaymentMethod.Mailed.check             : num  0 1 0 0 0 1 0 0 0 1 ...
 $ SeniorCitizen                          : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ Partner                                : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 1 2 1 1 ...
 $ Dependents                             : Factor w/ 2 levels "0","1": 1 1 1 2 2 2 1 1 1 1 ...
 $ OnlineSecurity                         : Factor w/ 2 levels "0","1": 1 2 2 1 2 2 1 1 2 1 ...
 $ OnlineBackup                           : Factor w/ 2 levels "0","1": 2 1 1 2 2 1 1 1 1 1 ...
 $ DeviceProtection                       : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 2 2 1 ...
 $ TechSupport                            : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 2 1 ...
 $ StreamingTV                            : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 2 2 1 ...
 $ StreamingMovies                        : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 2 1 ...
 $ PaperlessBilling                       : Factor w/ 2 levels "0","1": 2 1 1 2 1 2 1 1 2 1 ...
 $ MonthlyCharges                         : num  0.116 0.385 0.24 0.706 0.377 ...
 $ Churn                                  : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ tenure_group                           : Factor w/ 5 levels "1","2","3","4",..: 1 3 3 2 5 2 2 4 3 4 ...


# Conjunto de teste.
str(testData)

'data.frame':	2108 obs. of  23 variables:
 $ InternetService.DSL                    : num  0 1 0 0 0 1 0 1 0 0 ...
 $ InternetService.Fiber.optic            : num  1 0 1 1 1 0 0 0 1 1 ...
 $ InternetService.No                     : num  0 0 0 0 0 0 1 0 0 0 ...
 $ Contract.Month.to.month                : num  1 1 0 1 1 0 1 1 1 1 ...
 $ Contract.One.year                      : num  0 0 0 0 0 1 0 0 0 0 ...
 $ Contract.Two.year                      : num  0 0 1 0 0 0 0 0 0 0 ...
 $ PaymentMethod.Bank.transfer..automatic.: num  0 0 0 0 0 0 1 1 0 0 ...
 $ PaymentMethod.Credit.card..automatic.  : num  0 0 1 0 1 0 0 0 0 0 ...
 $ PaymentMethod.Electronic.check         : num  1 0 0 1 0 0 0 0 1 1 ...
 $ PaymentMethod.Mailed.check             : num  0 1 0 0 0 1 0 0 0 0 ...
 $ SeniorCitizen                          : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 2 1 1 ...
 $ Partner                                : Factor w/ 2 levels "0","1": 1 1 2 1 2 2 1 1 1 1 ...
 $ Dependents                             : Factor w/ 2 levels "0","1": 1 1 2 1 1 2 1 1 1 1 ...
 $ OnlineSecurity                         : Factor w/ 2 levels "0","1": 1 2 2 1 1 2 1 1 1 1 ...
 $ OnlineBackup                           : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1 1 1 2 ...
 $ DeviceProtection                       : Factor w/ 2 levels "0","1": 2 1 2 2 2 2 1 1 1 2 ...
 $ TechSupport                            : Factor w/ 2 levels "0","1": 1 1 2 1 1 2 1 1 1 1 ...
 $ StreamingTV                            : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 1 1 1 2 ...
 $ StreamingMovies                        : Factor w/ 2 levels "0","1": 2 1 2 2 2 1 1 1 1 2 ...
 $ PaperlessBilling                       : Factor w/ 2 levels "0","1": 2 1 1 2 2 1 1 1 2 2 ...
 $ MonthlyCharges                         : num  0.811 0.115 0.946 0.715 0.769 ...
 $ Churn                                  : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 2 ...
 $ tenure_group                           : Factor w/ 5 levels "1","2","3","4",..: 1 1 5 2 1 3 1 1 1 3 ...


# Configurações da validação cruzada.
control <- trainControl(method = 'cv', number = 5)


# Treinando os modelos.

# Logistic Regression.
modelGLM <- train(Churn ~ ., data = trainData, method = 'glm', trControl = control)

# Naive Bayes.
modelNB <- train(Churn ~ ., data = trainData, method = 'nb', trControl = control)

# Decision Tree. 
modelDT <- train(Churn ~ ., data = trainData, method = 'ctree', trControl = control) 

# Random Forest.
modelRF <- train(Churn ~ ., data = trainData, method = 'rf', trControl = control)


# Obtendo os resultados.
results <- resamples(list(GLM = modelGLM, NB = modelNB, DT = modelDT, RF = modelRF))
summary(results)
bwplot(results)
dotplot(results)

Call:
summary.resamples(object = results)

Models: GLM, NB, DT, RF 
Number of resamples: 5 

Accuracy 
         Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
GLM 0.7413793 0.7494824 0.7681159 0.7631827 0.7763975 0.7805383    0
NB  0.7280883 0.7425811 0.7460317 0.7462024 0.7508627 0.7634483    0
DT  0.7556936 0.7653554 0.7681159 0.7686990 0.7758621 0.7784679    0
RF  0.8689655 0.8743961 0.8785369 0.8768987 0.8799172 0.8826777    0

Kappa 
         Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
GLM 0.4826149 0.4988810 0.5361875 0.5262759 0.5527190 0.5609769    0
NB  0.4560127 0.4850584 0.4918958 0.4922731 0.5016477 0.5267507    0
DT  0.5112901 0.5306069 0.5361132 0.5373030 0.5516508 0.5568541    0
RF  0.7378353 0.7487136 0.7570034 0.7537160 0.7597510 0.7652766    0


# Previsões do modelo.

# Previsões com dados de treino.
rf_train <- predict(modelRF, trainData)

# Previsões com dados de teste.
rf_test <- predict(modelRF, testData)


# Desempenho com dados de treino.
confusionMatrix(rf_train, trainData$Churn, positive = 'Yes', mode = 'everything')

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  3472   10
       Yes  143 3621
                                               
               Accuracy : 0.9789               
                 95% CI : (0.9753, 0.9821)     
    No Information Rate : 0.5011               
    P-Value [Acc > NIR] : < 0.00000000000000022
                                               
                  Kappa : 0.9578               
                                               
 Mcnemar's Test P-Value : < 0.00000000000000022
                                               
            Sensitivity : 0.9972               
            Specificity : 0.9604               
         Pos Pred Value : 0.9620               
         Neg Pred Value : 0.9971               
              Precision : 0.9620               
                 Recall : 0.9972               
                     F1 : 0.9793               
             Prevalence : 0.5011               
         Detection Rate : 0.4997               
   Detection Prevalence : 0.5195               
      Balanced Accuracy : 0.9788               
                                               
       'Positive' Class : Yes


# Desempenho com dados de teste.
confusionMatrix(rf_test, test$Churn, positive = 'Yes', mode = 'everything')

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  1277  212
       Yes  271  348
                                          
               Accuracy : 0.7709          
                 95% CI : (0.7523, 0.7887)
    No Information Rate : 0.7343          
    P-Value [Acc > NIR] : 0.00006471      
                                          
                  Kappa : 0.4318          
                                          
 Mcnemar's Test P-Value : 0.008313        
                                          
            Sensitivity : 0.6214          
            Specificity : 0.8249          
         Pos Pred Value : 0.5622          
         Neg Pred Value : 0.8576          
              Precision : 0.5622          
                 Recall : 0.6214          
                     F1 : 0.5903          
             Prevalence : 0.2657          
         Detection Rate : 0.1651          
   Detection Prevalence : 0.2936          
      Balanced Accuracy : 0.7232          
                                          
       'Positive' Class : Yes


# Previsões do modelo.

# Previsões com dados de treino.
glm_train <- predict(modelGLM, trainData)

# Previsões com dados de teste.
glm_test <- predict(modelGLM, testData)


# Desempenho com dados de treino.
confusionMatrix(glm_train, trainData$Churn, positive = 'Yes', mode = 'everything')

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  2613  702
       Yes 1002 2929
                                               
               Accuracy : 0.7648               
                 95% CI : (0.7549, 0.7746)     
    No Information Rate : 0.5011               
    P-Value [Acc > NIR] : < 0.00000000000000022
                                               
                  Kappa : 0.5296               
                                               
 Mcnemar's Test P-Value : 0.0000000000004379   
                                               
            Sensitivity : 0.8067               
            Specificity : 0.7228               
         Pos Pred Value : 0.7451               
         Neg Pred Value : 0.7882               
              Precision : 0.7451               
                 Recall : 0.8067               
                     F1 : 0.7747               
             Prevalence : 0.5011               
         Detection Rate : 0.4042               
   Detection Prevalence : 0.5425               
      Balanced Accuracy : 0.7647               
                                               
       'Positive' Class : Yes


# Desempenho com dados de teste.
confusionMatrix(glm_test, test$Churn, positive = 'Yes', mode = 'everything')

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  1138  103
       Yes  410  457
                                              
               Accuracy : 0.7566              
                 95% CI : (0.7377, 0.7748)    
    No Information Rate : 0.7343              
    P-Value [Acc > NIR] : 0.01044             
                                              
                  Kappa : 0.4691              
                                              
 Mcnemar's Test P-Value : < 0.0000000000000002
                                              
            Sensitivity : 0.8161              
            Specificity : 0.7351              
         Pos Pred Value : 0.5271              
         Neg Pred Value : 0.9170              
              Precision : 0.5271              
                 Recall : 0.8161              
                     F1 : 0.6405              
             Prevalence : 0.2657              
         Detection Rate : 0.2168              
   Detection Prevalence : 0.4113              
      Balanced Accuracy : 0.7756              
                                              
       'Positive' Class : Yes


# Variáveis mais importantes.
varImp <- varImp(modelGLM)
plot(varImp, main = 'Melhores Variáveis GLM')


# Previsões do modelo.

# Previsões com dados de treino.
dt_train <- predict(modelDT, trainData)

# Previsões com dados de teste.
dt_test <- predict(modelDT, testData)


# Desempenho com dados de treino.
confusionMatrix(dt_train, trainData$Churn, positive = 'Yes', mode = 'everything')

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  2869  535
       Yes  746 3096
                                               
               Accuracy : 0.8232               
                 95% CI : (0.8142, 0.8319)     
    No Information Rate : 0.5011               
    P-Value [Acc > NIR] : < 0.00000000000000022
                                               
                  Kappa : 0.6464               
                                               
 Mcnemar's Test P-Value : 0.000000004427       
                                               
            Sensitivity : 0.8527               
            Specificity : 0.7936               
         Pos Pred Value : 0.8058               
         Neg Pred Value : 0.8428               
              Precision : 0.8058               
                 Recall : 0.8527               
                     F1 : 0.8286               
             Prevalence : 0.5011               
         Detection Rate : 0.4273               
   Detection Prevalence : 0.5302               
      Balanced Accuracy : 0.8231               
                                               
       'Positive' Class : Yes


# Desempenho com dados de teste.
confusionMatrix(dt_test, test$Churn, positive = 'Yes', mode = 'everything')

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  1165  159
       Yes  383  401
                                             
               Accuracy : 0.7429             
                 95% CI : (0.7237, 0.7614)   
    No Information Rate : 0.7343             
    P-Value [Acc > NIR] : 0.1943             
                                             
                  Kappa : 0.4156             
                                             
 Mcnemar's Test P-Value : <0.0000000000000002
                                             
            Sensitivity : 0.7161             
            Specificity : 0.7526             
         Pos Pred Value : 0.5115             
         Neg Pred Value : 0.8799             
              Precision : 0.5115             
                 Recall : 0.7161             
                     F1 : 0.5967             
             Prevalence : 0.2657             
         Detection Rate : 0.1902             
   Detection Prevalence : 0.3719             
      Balanced Accuracy : 0.7343             
                                             
       'Positive' Class : Yes


# Previsões em probabilidades.
dt_pb <- predict(modelDT, testData, type = 'prob')[,2]
glm_pb <- predict(modelGLM, testData, type = 'prob')[,2]
rf_pb <- predict(modelRF, testData, type = 'prob')[,2]


# Concatenando os dados.
roc.data <- cbind(testData, dt_pb, glm_pb, rf_pb)
head(roc.data[, c('Churn', 'dt_pb', 'glm_pb', 'rf_pb')])


# Preparando os dados.
roc.long <- melt_roc(roc.data, d = 'Churn', m = c('dt_pb', 'glm_pb', 'rf_pb'))


# ROC AUC.
rocplot <- ggplot(roc.long, aes(d = ifelse(D == 'Yes', 1, 0), m = M, color = name)) +
geom_roc(n.cuts = 0) +
style_roc(xlab = 'Falsos Positivos',
          ylab = 'Verdadeiros Positivos') +
labs(title = 'Curva ROC', color = 'Legenda') +
theme(plot.title = element_text(hjust = 0.5))

# Plotagem.
rocplot +
geom_abline(size = 0.5, color = 'grey30') +
annotate('text', x = .75, y = .35, label = paste('AUC DT =', round(calc_auc(rocplot)$AUC[1],3))) +
annotate('text', x = .75, y = .28, label = paste('AUC GLM =', round(calc_auc(rocplot)$AUC[2],3))) +
annotate('text', x = .75, y = .21, label = paste('AUC RF =', round(calc_auc(rocplot)$AUC[3],3)))

	Churn	dt_pb	glm_pb	rf_pb
6	Yes	0.98076923	0.91767440	0.978
8	No	0.61764706	0.55198729	0.912
16	No	0.03478261	0.06004441	0.086
20	No	0.66666667	0.77068851	0.422
32	No	0.75789474	0.90399770	0.822
33	No	0.07692308	0.06820508	0.126

Rotatividade de Clientes - Customer Churn Analytics¶

1. Entendimento do Negócio¶

1.1 Dicionário de Dados¶

1.2 Estratégia da Solução¶

2. Entendimento dos Dados¶

2.1 Bibliotecas Necessárias¶

2.2 Tratamento Inicial dos Dados¶

2.3 Análise Exploratória¶

2.3.1 Variáveis Categóricas¶

2.3.2 Variáveis Numéricas¶

3. Engenharia de Atributos¶

3.1 Extração de Variáveis (Feature Extraction)¶

3.2 Seleção de Variáveis (Feature Selection)¶

4. Pré-Processamento dos Dados¶

4.1 Divisão Treino/Teste¶

4.2 Balanceamento de Classes¶

4.3 Codificação de Variáveis¶

4.3.1 Conjunto de Treino¶

4.3.2 Conjunto de Teste¶

4.4 Normalização de Variáveis¶

5. Modelagem Preditiva¶

5.1 Random Forest¶

5.2 Logistic Regression¶

5.3 Decision Tree¶

6. Comparando os Modelos¶

7. Conclusões Finais¶

customerID	gender	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	...	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn
7590-VHVEG	Female	Yes	No	1	No	No phone service	DSL	No	...	No	No	No	No	Month-to-month	Yes	Electronic check	29.85	29.85	No
5575-GNVDE	Male	No	No	34	Yes	No	DSL	Yes	...	Yes	No	No	No	One year	No	Mailed check	56.95	1889.50	No
3668-QPYBK	Male	No	No	2	Yes	No	DSL	Yes	...	No	No	No	No	Month-to-month	Yes	Mailed check	53.85	108.15	Yes
7795-CFOCW	Male	No	No	45	No	No phone service	DSL	Yes	...	Yes	Yes	No	No	One year	No	Bank transfer (automatic)	42.30	1840.75	No
9237-HQITU	Female	No	No	2	Yes	No	Fiber optic	No	...	No	No	No	No	Month-to-month	Yes	Electronic check	70.70	151.65	Yes
9305-CDSKC	Female	No	No	8	Yes	Yes	Fiber optic	No	...	Yes	No	Yes	Yes	Month-to-month	Yes	Electronic check	99.65	820.50	Yes