# Silence diagnostics for the whole session: warn = -1 makes R ignore every
# warning, and dplyr.summarise.inform = FALSE turns off the regrouping
# messages printed by dplyr::summarise().
# NOTE(review): with warnings globally hidden, problems raised later in this
# script will not be reported - consider scoping the suppression.
options(warn = -1)
options(dplyr.summarise.inform = FALSE)

# Carregando as bibliotecas. 
library(pacman)

pacman::p_load(clustertend,
               corrplot,
               data.table,
               dplyr,
               factoextra,
               ggplot2,
               grid,
               lubridate,
               NbClust,
               readr,
               scales,
               tidyr,
               tidytext,
               wesanderson)


# Configurações do notebook.

# Plotagens.
options(repr.plot.width = 10, 
        repr.plot.height = 6, 
        scipen = 999)

# Estilo dos gráficos.
theme_set(theme_classic())

# Resize the notebook plotting area.
#   width  - value stored in the `repr.plot.width` option.
#   heigth - value stored in the `repr.plot.height` option.
# Returns, invisibly, the previous values of both options (as options() does).
fig <- function(width, heigth) {
  plot_size <- list(repr.plot.width = width, repr.plot.height = heigth)
  options(plot_size)
}


# Carregando o conjunto de dados.
df <- read_csv('dataset.csv')

-- Column specification --------------------------------------------------------
cols(
  id_transacao = col_character(),
  horario_pedido = col_datetime(format = ""),
  localidade = col_double(),
  nome_item = col_character(),
  quantidade_item = col_double(),
  latitude = col_double(),
  longitude = col_double()
)


# Cópia do dataset.
df1 <- df


# Dimensão do dataframe.
dim(df1)


# Visualizando o dataframe.
head(df1)


# Informações do dataframe.
str(df1)

spec_tbl_df [260,645 x 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ id_transacao   : chr [1:260645] "0x7901ee" "0x7901ee" "0x7901ee" "0x12b47f" ...
 $ horario_pedido : POSIXct[1:260645], format: "2019-01-16 18:33:00" "2019-01-16 18:33:00" ...
 $ localidade     : num [1:260645] 7 7 7 3 3 6 6 2 2 2 ...
 $ nome_item      : chr [1:260645] "bebida" "pizza" "sobremesa" "salada" ...
 $ quantidade_item: num [1:260645] 2 2 2 1 1 2 2 2 2 2 ...
 $ latitude       : num [1:260645] 41.8 41.8 41.8 41.9 41.9 ...
 $ longitude      : num [1:260645] -88 -88 -88 -87.6 -87.6 ...
 - attr(*, "spec")=
  .. cols(
  ..   id_transacao = col_character(),
  ..   horario_pedido = col_datetime(format = ""),
  ..   localidade = col_double(),
  ..   nome_item = col_character(),
  ..   quantidade_item = col_double(),
  ..   latitude = col_double(),
  ..   longitude = col_double()
  .. )


# Verificando registros ausentes.
colSums(is.na(df1))


# Verificando registros duplicados.
table(duplicated(df1))

 FALSE 
260645


# Verificando registros únicos.
sapply(df1, function(x) {length(unique(x))})


# Excluindo as variáveis.
df1 <- subset(df1, select = -c(latitude, longitude))


# Variáveis do dataframe.
colnames(df1)


# Estatísticas das variáveis.
summary(df1[c('horario_pedido',
              'localidade', 
              'quantidade_item')])

 horario_pedido                  localidade    quantidade_item
 Min.   :2019-01-01 00:00:00   Min.   :1.000   Min.   :1.000  
 1st Qu.:2019-04-02 13:12:00   1st Qu.:3.000   1st Qu.:1.000  
 Median :2019-07-01 11:49:00   Median :5.000   Median :2.000  
 Mean   :2019-07-01 21:07:00   Mean   :5.135   Mean   :2.447  
 3rd Qu.:2019-09-30 18:07:00   3rd Qu.:7.000   3rd Qu.:4.000  
 Max.   :2019-12-30 23:59:00   Max.   :9.000   Max.   :5.000


# Contagem de registros.
table(df1$nome_item)

   bebida     pizza    salada sobremesa 
    46156     76122     38367    100000


# Box plots of the quantity ordered, one box per item type.
df1 %>%
  ggplot(aes(x = nome_item, y = quantidade_item)) +
  geom_boxplot(aes(fill = nome_item), show.legend = FALSE) +
  scale_fill_brewer(palette = 'Spectral') +
  ggtitle('Boxplot dos Itens') + theme(plot.title = element_text(hjust = 0.5)) +
  # Axis titles follow the aesthetic mapping: items on x, quantities on y
  # (the two titles used to be swapped).
  labs(x = 'Itens', y = 'Quantidade')


# Density histograms of the quantity ordered, one panel per item type.
df1 %>%
  select(nome_item, quantidade_item) %>%
  ggplot(aes(x = quantidade_item, fill = nome_item)) +
  # after_stat(density) replaces the deprecated `..density..` notation.
  geom_histogram(aes(y = after_stat(density)), bins = 5, show.legend = FALSE) +
  scale_fill_brewer(palette = 'Spectral') +
  facet_grid(~nome_item) +
  ggtitle('Histogramas dos Itens') + theme(plot.title = element_text(hjust = 0.5)) +
  # The y axis shows densities, so it is labelled as such (the former label
  # was a misspelled 'Frêquencia').
  labs(x = 'Quantidade', y = 'Densidade')


# Question 1: frequency polygon of the number of rows per calendar month,
# with the y axis limited to 19900-23000.
df1 %>%
  mutate(mes = month(horario_pedido)) %>%
  ggplot(aes(x = mes)) +
  geom_freqpoly(binwidth = 1) +
  scale_x_continuous(breaks = 1:12) +
  ylim(19900, 23000) +
  labs(title = 'Pedidos Realizados por Mês', x = 'Mês', y = 'Quantidade') +
  theme(plot.title = element_text(hjust = 0.5))


# Question 2: frequency polygon of the number of rows per day of the month.
df1 %>%
  # Only the day of the month is plotted, so no other date part is derived.
  mutate(dia = day(horario_pedido)) %>%
  ggplot(aes(dia)) +
  geom_freqpoly(binwidth = 1) +
  ylim(c(7500, 10000)) +
  # Days of the month run from 1 to 31; the breaks used to stop at 30.
  scale_x_continuous(breaks = 1:31) +
  ggtitle('Pedidos Realizados por Dia do Mês') + theme(plot.title = element_text(hjust = 0.5)) +
  labs(x = 'Dia', y = 'Quantidade')


# Bar chart of the number of rows per weekday for one month of 2019.
#   month - month number to plot; also printed in the panel title.
#   color - fill colour of the bars.
# Returns a ggplot object; every bar is annotated with its share of the
# month's rows (rounded to two decimals). Reads the global data frame `df1`.
p_dia_semana <- function(month, color) {
  df1 %>%
    mutate(ano = year(horario_pedido),
           # Namespaced because the `month` argument shadows the function name.
           mes = lubridate::month(horario_pedido),
           # wday(week_start = 1) numbers Monday..Sunday as 1..7 whatever the
           # session locale is. weekdays() returns locale-dependent names, so
           # matching them against Portuguese levels gives NA in other locales.
           dia_semana = factor(wday(horario_pedido, week_start = 1),
                               levels = 1:7,
                               labels = c('seg', 'ter', 'qua', 'qui', 'sex', 'sáb', 'dom'))) %>%
    filter(ano == 2019, mes == month) %>%
    group_by(dia_semana) %>%
    dplyr::summarise(n = n()) %>%
    ggplot(aes(x = dia_semana, y = n)) +
    geom_col(size = 1, fill = color) +
    ggtitle(sprintf('Mês %d', month)) +
    xlab('') +
    geom_text(aes(label = round(n/sum(n), 2)),
              vjust = 1.6,
              color = 'black',
              size = 5)
}


# Pergunta 3.

# Redefinindo a área de plotagem.
fig(15,15)

# Draw the twelve monthly weekday charts on a 6 x 2 grid under a shared title.
gridExtra::grid.arrange(
  grobs = lapply(1:12, function(m) p_dia_semana(m, 'lightblue')),
  ncol = 2, nrow = 6,
  top = textGrob('Pedidos por Dia da Semana - 2019',
                 gp = gpar(fontsize = 20))
)


# Question 4: frequency polygon of the number of rows per hour of the day.
df1 %>%
  transmute(hora = hour(horario_pedido)) %>%
  ggplot(aes(x = hora)) +
  geom_freqpoly(binwidth = 1) +
  scale_x_continuous(breaks = 0:23) +
  labs(title = 'Pedidos por Horário', x = 'Hora', y = 'Quantidade') +
  theme(plot.title = element_text(hjust = 0.5))


# Question 5: total quantity ordered per item, faceted by month (rows) and
# hour of the day (columns).

# Resize the plotting area.
fig(13,8)

df1 %>%
  # Only hour and month are needed for the grouping below.
  mutate(hora = hour(horario_pedido),
         mes = month(horario_pedido)) %>%
  group_by(mes, hora, nome_item) %>%
  summarise(tot = sum(quantidade_item)) %>%
  ggplot(aes(x = tot, y = reorder(nome_item, tot), fill = nome_item)) +
  geom_col() +
  scale_fill_brewer(palette = 'Spectral') +
  # Show the x axis in thousands.
  scale_x_continuous(labels = label_number(scale = 1/1000)) +
  # 'none' removes the fill legend; `F` is a reassignable alias and logical
  # values in guides() are deprecated in recent ggplot2 releases.
  guides(fill = 'none') +
  facet_grid(mes~hora) +
  ggtitle('Número de Pedidos por Horário') + theme(plot.title = element_text(hjust = 0.5)) +
  labs(x = 'Quantidade (milhar)', y = NULL)


# Question 6: number of rows per locality as horizontal bars, each annotated
# with its share of all rows (rounded to two decimals).
df1 %>%
  count(localidade) %>%
  ggplot(aes(x = reorder(localidade, n), y = n)) +
  geom_col(fill = 'lightblue') +
  geom_text(aes(label = round(n/sum(n), 2)),
            hjust = 1.5,
            vjust = 0.5,
            size = 3.5,
            color = 'black') +
  coord_flip() +
  labs(title = 'Total de Pedidos por Localidade',
       x = "Localidade",
       y = "Número de Pedidos") +
  theme(plot.title = element_text(hjust = 0.5))


# Question 7: total quantity ordered per item within each locality.
df1 %>%
  mutate(localidade = as.factor(localidade)) %>%
  # One total per (locality, item). The data used to be grouped by hour and
  # month as well, so each bar was a stack of partial sums and
  # reorder_within() ranked items by the mean partial sum, not the bar total.
  group_by(localidade, nome_item) %>%
  summarise(tot = sum(quantidade_item)) %>%
  ggplot(aes(x = tot, y = reorder_within(nome_item, tot, localidade), fill = nome_item)) +
  geom_col() +
  scale_y_reordered() +
  scale_fill_brewer(palette = 'Spectral') +
  facet_wrap(~localidade, scales = 'free_y') +
  # 'none' removes the fill legend; `F` is a reassignable alias and logical
  # values in guides() are deprecated in recent ggplot2 releases.
  guides(fill = 'none') +
  ggtitle('Itens Pedidos por Localidade') + theme(plot.title = element_text(hjust = 0.5)) +
  labs(x = NULL, y = NULL)


# Pivot to one row per transaction with one quantity column per item.
# No group_by() is involved: the reshape does not need it, and a df2 left
# grouped by id_transacao makes later select() calls re-add that column.
df2 <- df1 %>%
  pivot_wider(names_from = nome_item,
              values_from = quantidade_item,
              # Items absent from a transaction count as zero units.
              values_fill = 0,
              # Item columns in alphabetical order, as spread() produced them.
              names_sort = TRUE)


# Visualizando o dataframe.
head(df2)


# Feature selection: locality plus the four item quantities.
# ungroup() comes first: if df2 is still grouped by id_transacao, select()
# would silently re-add that grouping column to the result.
df3 <- df2 %>%
  ungroup() %>%
  select(localidade, bebida, pizza, salada, sobremesa)

Adding missing grouping variables: `id_transacao`


# Informações do dataframe.
str(df3)

tibble [100,000 x 5] (S3: tbl_df/tbl/data.frame)
 $ localidade: num [1:100000] 7 3 6 2 8 6 7 6 2 7 ...
 $ bebida    : num [1:100000] 2 0 0 2 0 0 3 1 1 2 ...
 $ pizza     : num [1:100000] 2 0 2 2 0 1 3 4 2 2 ...
 $ salada    : num [1:100000] 0 1 0 0 3 0 1 0 0 1 ...
 $ sobremesa : num [1:100000] 2 1 2 2 3 1 4 4 2 3 ...


# Min-max normalisation: rescale a numeric vector to the [0, 1] interval.
#   x     - numeric vector.
#   na.rm - if TRUE, NA values are ignored when computing the range and stay
#           NA in the result. With the default FALSE, any NA in x makes the
#           whole result NA.
# Returns a numeric vector with the same length as x. A constant vector maps
# to zeros instead of NaN (0 / 0); an empty vector returns numeric(0).
dadosnorm <- function(x, na.rm = FALSE) {
  if (length(x) == 0) {
    return(numeric(0))
  }
  limites <- range(x, na.rm = na.rm)
  amplitude <- limites[2] - limites[1]
  # Zero range: every non-missing value equals the minimum.
  if (!is.na(amplitude) && amplitude == 0) {
    return(as.numeric(x - limites[1]))
  }
  (x - limites[1]) / amplitude
}


# Aplicando a normalização.
dfnorm <- as.data.frame(lapply(df3, dadosnorm))


# Dataset original.
head(df3)


# Dataset normalizado.
head(dfnorm)


# Amostra dos dados.
set.seed(42)
sample_df <- dfnorm %>% as_tibble() %>% 
sample_frac(0.02)


# Visualizando o dataframe.
head(sample_df)


# Teste de Hopkins.
hopkins(sample_df, n = nrow(sample_df)-1)


# Elbow method: plot the total within-cluster sum of squares of k-means
# against the number of clusters.
#   dataset - numeric data accepted by kmeans().
#   k_max   - largest number of clusters to try (default 15, the value that
#             used to be hard-coded). It must not exceed the number of
#             distinct rows, otherwise kmeans() stops with an error.
# Draws the plot and returns the vector of sums of squares invisibly.
elbow <- function(dataset, k_max = 15) {
  stopifnot(is.numeric(k_max), length(k_max) == 1, k_max >= 1)
  # tot.withinss is the sum of the per-cluster withinss values.
  wss <- vapply(seq_len(k_max),
                function(k) kmeans(dataset, centers = k, nstart = 100)$tot.withinss,
                numeric(1))
  plot(seq_len(k_max), wss, type = 'b', main = 'Método de Elbow', xlab = 'Número de Clusters', pch = 8)
  invisible(wss)
}


# Método de Elbow.
elbow(sample_df)


# Fit the final k-means model (k = 3) on the full normalised data.
# nstart = 25 keeps the best of 25 random initialisations instead of relying
# on a single start, and iter.max = 100 gives the algorithm more room to
# converge than the default of 10 (a non-convergence warning would not be
# shown, because this script sets options(warn = -1)).
set.seed(42)
modelo <- kmeans(dfnorm, centers = 3, nstart = 25, iter.max = 100)


# Tamanho dos clusters.
modelo$size


# Cluster visualisation on the sampled data.
# eclust() runs its own k-means (k = 3) on sample_df, without drawing
# (graph = FALSE); fviz_cluster() then plots that result as points.
# NOTE(review): this is a separate fit from `modelo`, so the groups shown here
# are not necessarily the ones assigned to df2 below - confirm this is intended.
cluster_viz <- eclust(sample_df, 'kmeans', k = 3, graph = FALSE)
fviz_cluster(cluster_viz, geom = 'point')


# Associando o resultado ao dataframe.
df2$cluster <- modelo$cluster
head(df2)


# Total quantity ordered (all four items together) per cluster, as bars
# annotated with each cluster's share of the grand total.
df2 %>%
  as.data.frame() %>%
  transmute(cluster = factor(cluster, levels = c(1, 2, 3),
                             labels = c('Cluster 1', 'Cluster 2', 'Cluster 3')),
            qtd = bebida + pizza + salada + sobremesa) %>%
  group_by(cluster) %>%
  dplyr::summarise(qtd = sum(qtd)) %>%
  ggplot(aes(x = cluster, y = qtd)) +
  geom_col(aes(fill = cluster)) +
  geom_text(aes(label = round(qtd/sum(qtd), 2)),
            vjust = 1.6,
            size = 3.5,
            color = 'white') +
  scale_fill_discrete(name = 'Legendas') +
  labs(title = 'Pedidos Realizados por Clusters', x = 'Clusters', y = 'Quantidade') +
  theme(plot.title = element_text(hjust = 0.5))


# Criando funções.

# Total quantity ordered per item within one cluster.
#   x             - cluster label to keep: 'Cluster 1', 'Cluster 2' or 'Cluster 3'.
#   cor_escolhida - fill colour of the bars.
# Returns a ggplot bar chart (y axis fixed to 0-150000) whose bars are
# annotated with each item's share of the cluster total. Reads the global `df2`.
qnt_pedidos <- function(x, cor_escolhida) {
  rotulos <- c('Cluster 1', 'Cluster 2', 'Cluster 3')

  totais <- df2 %>%
    as.data.frame() %>%
    filter(factor(cluster, levels = c(1, 2, 3), labels = rotulos) == x) %>%
    select(bebida, pizza, salada, sobremesa) %>%
    as_tibble() %>%
    gather(item, qtd, 1:4) %>%
    group_by(item) %>%
    dplyr::summarise(qtd = sum(qtd))

  ggplot(totais, aes(x = item, y = qtd)) +
    geom_col(fill = cor_escolhida) +
    ylim(0, 150000) +
    ggtitle('Itens Mais Pedidos') +
    geom_text(aes(label = round(qtd/sum(qtd), 2)),
              vjust = -1,
              size = 3.5,
              color = 'black')
}


# Distribution of the quantity ordered per item within one cluster.
#   x - cluster label to keep: 'Cluster 1', 'Cluster 2' or 'Cluster 3'.
#   y - fill colour of the histogram bars.
# Returns a ggplot with one density histogram per item; zero quantities
# (item not in the transaction) are excluded. Reads the global `df2`.
frq_pedidos <- function(x, y) {
  df2 %>%
    as.data.frame() %>%
    mutate(Cluster = factor(cluster, levels = c(1, 2, 3),
                            labels = c('Cluster 1', 'Cluster 2', 'Cluster 3'))) %>%
    filter(Cluster == x) %>%
    select(bebida, pizza, salada, sobremesa) %>%
    as_tibble() %>%
    gather(item, qtd, 1:4) %>%
    filter(qtd > 0) %>%
    ggplot(aes(x = qtd)) +
    # after_stat(density) replaces the deprecated `..density..` notation.
    geom_histogram(aes(y = after_stat(density)), binwidth = 1, show.legend = FALSE, fill = y) +
    facet_grid(~item) +
    ggtitle('Distribuição de Frequência') +
    xlab('Quantidade')
}

# Hourly profile per item within one cluster.
#   x - cluster label to keep: 'Cluster 1', 'Cluster 2' or 'Cluster 3'.
# Returns a ggplot with one frequency polygon per item over the hour of the
# day, counting only rows with a positive quantity. Reads the global `df2`.
hr_pedidos <- function(x) {
  rotulos <- c('Cluster 1', 'Cluster 2', 'Cluster 3')

  itens_por_hora <- df2 %>%
    as.data.frame() %>%
    select(horario_pedido, bebida, pizza, salada, sobremesa, cluster) %>%
    gather(item, qtd, 2:5) %>%
    filter(factor(cluster, levels = c(1, 2, 3), labels = rotulos) == x,
           qtd > 0) %>%
    transmute(item, qtd, horario = hour(horario_pedido)) %>%
    as_tibble()

  ggplot(itens_por_hora, aes(x = horario, color = item)) +
    geom_freqpoly(binwidth = 1, size = 1) +
    xlim(0, 23) +
    labs(title = 'Pedidos por Horário', x = 'Hora do dia')
}


# One three-row panel per cluster (item totals, quantity distributions and
# hourly profile), each drawn in the colour assigned to that cluster.
invisible(Map(
  function(nome, cor) {
    gridExtra::grid.arrange(qnt_pedidos(nome, cor),
                            frq_pedidos(nome, cor),
                            hr_pedidos(nome),
                            nrow = 3,
                            top = textGrob(nome, gp = gpar(fontsize = 15)))
  },
  c('Cluster 1', 'Cluster 2', 'Cluster 3'),
  c('coral2', 'palegreen3', 'steelblue')
))

localidade	bebida	pizza	salada	sobremesa
0.750	0.4	0.4	0.0	0.25
0.250	0.0	0.0	0.2	0.00
0.625	0.0	0.4	0.0	0.25
0.125	0.4	0.4	0.0	0.25
0.875	0.0	0.0	0.6	0.50
0.625	0.0	0.2	0.0	0.00

localidade	bebida	pizza	salada	sobremesa
0.000	0.0	0.0	0.6	0.50
0.125	0.8	0.8	0.0	0.75
0.125	0.0	0.4	0.0	0.25
0.375	0.8	0.8	0.0	0.75
0.625	0.0	0.2	0.2	0.25
0.000	0.2	0.4	0.0	0.25

Segmentação de Clientes - Customer Segmentation

1. Entendimento do Negócio

1.1 Dicionário de Dados

1.2 Estratégia da Solução

2. Entendimento dos Dados

2.1 Bibliotecas Necessárias

2.2 Tratamento Inicial dos Dados

2.3 Análise Exploratória

2.3.1 Perguntas de Negócio

3. Engenharia de Atributos

3.1 Extração de Variáveis (Feature Extraction)

3.2 Seleção de Variáveis (Feature Selection)

4. Pré-Processamento dos Dados

5. Modelagem Preditiva

5.1 Avaliação da Tendência dos Clusters

5.2 Definindo o Número de Clusters

5.3 Criação do Modelo Preditivo

6. Análise de Clusters

6.1 Análise Cluster 1

6.2 Análise Cluster 2

6.3 Análise Cluster 3

7. Conclusões Finais

id_transacao	horario_pedido	localidade	nome_item	quantidade_item	latitude	longitude
0x7901ee	2019-01-16 18:33:00	7	bebida	2	41.79413	-88.01014
0x7901ee	2019-01-16 18:33:00	7	pizza	2	41.79413	-88.01014
0x7901ee	2019-01-16 18:33:00	7	sobremesa	2	41.79413	-88.01014
0x12b47f	2019-09-04 12:36:00	3	salada	1	41.88449	-87.62706
0x12b47f	2019-09-04 12:36:00	3	sobremesa	1	41.88449	-87.62706
0x6d6979	2019-03-18 00:27:00	6	pizza	2	41.78458	-87.60756