# Filtragem das mensagens de avisos.
import warnings
warnings.filterwarnings('ignore') 

# Manipulação de dados.
import datetime
import math
import numpy as np
import pandas as pd

# Criação de gráficos.
import matplotlib.pyplot as plt
import seaborn as sns

# Pré-processamento dos dados.
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler

# Algoritmos de Machine Learning.
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression

# Seleção de variáveis.
from sklearn.feature_selection import f_regression, SelectKBest

# Métricas para avaliação dos modelos.
from pycorrcat.pycorrcat import corr_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error

print('Bibliotecas carregadas com sucesso!')

Bibliotecas carregadas com sucesso!


# Report the version of the Python interpreter running this notebook.
from platform import python_version
print('Versão da linguagem Python:', python_version())

Versão da linguagem Python: 3.7.13


# Package versions, printed through the `watermark` IPython extension.
# These two lines are IPython magics and only run inside IPython/Jupyter.
%reload_ext watermark
%watermark --iversions

numpy     : 1.21.6
pandas    : 1.3.5
IPython   : 5.5.0
seaborn   : 0.11.2
matplotlib: 3.2.2


# Notebook configuration.

# Plot defaults: figure size and line width.
from matplotlib import rcParams
rcParams.update({'figure.figsize': (15, 10),
                 'lines.linewidth': 3})

# Chart style.
plt.style.use('ggplot')

# Dataframe display: never truncate the column list.
pd.set_option('display.max_columns', None)


# Load the two CSV files.
df_train = pd.read_csv('data/train.csv')
df_store = pd.read_csv('data/store.csv')

# Left-join the store table onto the training rows using the 'Store' key.
df = df_train.merge(df_store, on = 'Store', how = 'left')


# Work on an independent copy of the merged dataset. A plain assignment would
# only alias `df`, so the in-place column edits made on `df1` below would also
# modify `df`.
df1 = df.copy()


# Preview the first rows of the dataframe.
df1.head()


# Dataframe dimensions: number of rows and number of columns.
n_records, n_variables = df1.shape
print('Número de registros: {}'.format(n_records))
print('Número de variáveis: {}'.format(n_variables))

Número de registros: 1017209
Número de variáveis: 18


# Column names, non-null counts, dtypes and memory usage of the dataframe.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1017209 entries, 0 to 1017208
Data columns (total 18 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Store                      1017209 non-null  int64  
 1   DayOfWeek                  1017209 non-null  int64  
 2   Date                       1017209 non-null  object 
 3   Sales                      1017209 non-null  int64  
 4   Customers                  1017209 non-null  int64  
 5   Open                       1017209 non-null  int64  
 6   Promo                      1017209 non-null  int64  
 7   StateHoliday               1017209 non-null  object 
 8   SchoolHoliday              1017209 non-null  int64  
 9   StoreType                  1017209 non-null  object 
 10  Assortment                 1017209 non-null  object 
 11  CompetitionDistance        1014567 non-null  float64
 12  CompetitionOpenSinceMonth  693861 non-null   float64
 13  CompetitionOpenSinceYear   693861 non-null   float64
 14  Promo2                     1017209 non-null  int64  
 15  Promo2SinceWeek            509178 non-null   float64
 16  Promo2SinceYear            509178 non-null   float64
 17  PromoInterval              509178 non-null   object 
dtypes: float64(5), int64(8), object(5)
memory usage: 147.5+ MB


# Number of missing values per column.
df1.isnull().sum()

Store                             0
DayOfWeek                         0
Date                              0
Sales                             0
Customers                         0
Open                              0
Promo                             0
StateHoliday                      0
SchoolHoliday                     0
StoreType                         0
Assortment                        0
CompetitionDistance            2642
CompetitionOpenSinceMonth    323348
CompetitionOpenSinceYear     323348
Promo2                            0
Promo2SinceWeek              508031
Promo2SinceYear              508031
PromoInterval                508031
dtype: int64


# Largest observed value of 'CompetitionDistance'.
df1['CompetitionDistance'].max()

75860.0


# Handle missing values: replace a missing 'CompetitionDistance' with
# 200000.0, a constant far above the observed maximum (75860.0).
# NOTE(review): presumably this encodes "no competitor nearby" - confirm.
# `fillna` does this in one vectorised pass instead of a Python call per row.
df1['CompetitionDistance'] = df1['CompetitionDistance'].fillna(200000.0)


# Convert the 'Date' column from text to datetime values.
df1['Date'] = pd.to_datetime(df1['Date'])


# Handle missing values: every missing "since" field falls back to the
# matching component of the row's own sale date. `fillna` with an aligned
# Series replaces the former row-wise `df.apply(axis = 1)` calls, which ran a
# Python lambda once per row for each of the four columns.
df1['CompetitionOpenSinceMonth'] = df1['CompetitionOpenSinceMonth'].fillna(df1['Date'].dt.month)


# Handle missing values.

# CompetitionOpenSinceYear.
df1['CompetitionOpenSinceYear'] = df1['CompetitionOpenSinceYear'].fillna(df1['Date'].dt.year)

# Promo2SinceWeek (ISO week number of the sale date, the same value that
# `Timestamp.week` returns). Cast to float64 to match the column being filled.
df1['Promo2SinceWeek'] = df1['Promo2SinceWeek'].fillna(df1['Date'].dt.isocalendar().week.astype('float64'))

# Promo2SinceYear.
df1['Promo2SinceYear'] = df1['Promo2SinceYear'].fillna(df1['Date'].dt.year)


# Remove the 'PromoInterval' column.
df1 = df1.drop(columns = ['PromoInterval'])


# Re-check the number of missing values per column.
df1.isna().sum()

Store                        0
DayOfWeek                    0
Date                         0
Sales                        0
Customers                    0
Open                         0
Promo                        0
StateHoliday                 0
SchoolHoliday                0
StoreType                    0
Assortment                   0
CompetitionDistance          0
CompetitionOpenSinceMonth    0
CompetitionOpenSinceYear     0
Promo2                       0
Promo2SinceWeek              0
Promo2SinceYear              0
dtype: int64


# Keep only rows where the store was open and recorded sales above zero.
is_open = df1['Open'] == 1
has_sales = df1['Sales'] > 0
df1 = df1.loc[is_open & has_sales]


# Remove the columns 'Open' and 'DayOfWeek'.
df1 = df1.drop(columns = ['Open', 'DayOfWeek'])


# Remaining columns of the dataframe.
df1.columns

Index(['Store', 'Date', 'Sales', 'Customers', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear'],
      dtype='object')


# Store identifier column.
store = ['Store']


# Date column.
date = ['Date']


# Numeric variables.
nums = ['Sales', 'Customers', 'CompetitionDistance', 'Promo2SinceWeek']


# Categorical variables.
cats = ['Promo', 'StateHoliday', 'SchoolHoliday',
        'StoreType', 'Assortment',
        'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
        'Promo2', 'Promo2SinceYear']


# Number of distinct stores in the dataframe.
df1['Store'].nunique()

1115


# First and last dates present in the dataset.
first_date = df1['Date'].min()
last_date = df1['Date'].max()
print('Data início: {}'.format(first_date))
print('Data final: {}'.format(last_date))

Data início: 2013-01-01 00:00:00
Data final: 2015-07-31 00:00:00


# Summary statistics of the numeric variables, one row per variable.
df1[nums].describe().T


# Histograms of the numeric variables.

# Fixed-seed sample of 1000 rows; also reused by the count plots.
df_sample = df1.sample(1000, random_state = 42)

# Resize the plotting area.
plt.figure(figsize = (20, 10))

# Variables to plot.
features = nums

# One histogram per variable on a 2 x 2 grid.
for position, feature in enumerate(features, start = 1):
    plt.subplot(2, 2, position)
    sns.histplot(x = df_sample[feature], color = 'r', alpha = 0.7)
    plt.xlabel(feature)
    plt.tight_layout()


# Count plots of the categorical variables.

# Resize the plotting area.
plt.figure(figsize = (20, 10))

# One count plot per variable on a 3 x 3 grid, drawn from the sampled rows.
for position, category in enumerate(cats, start = 1):
    plt.subplot(3, 3, position)
    sns.countplot(x = df_sample[category], palette = 'Reds_r')
    plt.tight_layout()


# Day of the month.
df1['Day'] = df1['Date'].dt.day

# Month.
df1['Month'] = df1['Date'].dt.month

# Year.
df1['Year'] = df1['Date'].dt.year

# Day of the week.
df1['WeekDay'] = df1['Date'].dt.dayofweek

# ISO week of the year. `dt.weekofyear` is deprecated and removed in
# pandas 2.0; `dt.isocalendar().week` yields the same numbers. The cast keeps
# the plain int64 dtype that `weekofyear` used to return.
df1['WeekYear'] = df1['Date'].dt.isocalendar().week.astype('int64')

# Year and month as a 'YYYY-MM' string.
df1['YearMonth'] = df1['Date'].dt.strftime('%Y-%m')


# Preview the derived columns next to the original date.
df1[['Date', 
     'Day', 
     'Month', 
     'Year', 
     'WeekDay', 
     'WeekYear', 
     'YearMonth']].head()


# Hypothesis 1.

# Data preparation: total sales per day of the month, then per group
# (days 1-10 versus days 11 onwards).
aux1 = df1.groupby('Day', as_index = False)['Sales'].sum()
aux1['before_after'] = np.where(aux1['Day'] <= 10, 'before_10_days', 'after_10_days')
aux2 = aux1.groupby('before_after', as_index = False)['Sales'].sum()

# Plot.
p1 = sns.barplot(data = aux2,
                 x = 'before_after',
                 y = 'Sales',
                 palette = 'Reds_r')
p1.set_title('Vendas Antes/Depois do Dia 10/Mês', size = 15);


# Hypothesis 2.

# Data preparation: total sales per day of the week.
aux1 = df1.groupby('WeekDay', as_index = False)['Sales'].sum()

# Plotting area: two panels side by side.
fig, (axis1, axis2) = plt.subplots(1, 2)

# Plot 1: bar chart of the totals.
p1 = sns.barplot(data = aux1, x = 'WeekDay', y = 'Sales',
                 palette = 'Reds_r', ax = axis1)
p1.set_title('Vendas por Dia da Semana', size = 15);

# Plot 2: same totals with a fitted regression line.
p2 = sns.regplot(data = aux1, x = 'WeekDay', y = 'Sales',
                 color = 'r', ax = axis2)
p2.set_title('Vendas por Dia da Semana', size = 15);


# Hypothesis 3.

# Data preparation: total sales per month.
aux1 = df1.groupby('Month', as_index = False)['Sales'].sum()

# Plotting area: two panels side by side.
fig, (axis1, axis2) = plt.subplots(1, 2)

# Plot 1: bar chart of the totals.
p1 = sns.barplot(data = aux1, x = 'Month', y = 'Sales',
                 palette = 'Reds_r', ax = axis1)
p1.set_title('Vendas por Mês', size = 15);

# Plot 2: same totals with a fitted regression line.
p2 = sns.regplot(data = aux1, x = 'Month', y = 'Sales',
                 color = 'r', ax = axis2)
p2.set_title('Vendas por Mês', size = 15);


# Hypothesis 4.

# Data preparation: total sales per year.
aux1 = df1.groupby('Year', as_index = False)['Sales'].sum()

# Plot.
p1 = sns.barplot(data = aux1, x = 'Year', y = 'Sales', palette = 'Reds_r')
p1.set_title('Vendas por Ano', size = 15);


# Total sales for each (Promo, Promo2) combination, in ascending order.
df1.groupby(['Promo', 'Promo2'], as_index = False)['Sales'].sum().sort_values('Sales').reset_index(drop = True)


# Hypothesis 5.

# Rows with both promotions active (Promo == 1 and Promo2 == 1):
# total sales per year-month.
aux1 = df1[(df1['Promo'] == 1) & (df1['Promo2'] == 1)][['YearMonth', 'Sales']].groupby('YearMonth').sum().reset_index()
ax = aux1.plot(title = 'Comparação de Vendas por Tipo de Promoção', color = 'red', alpha = 0.7)

# Rows with only the regular promotion (Promo == 1 and Promo2 == 0).
aux2 = df1[(df1['Promo'] == 1) & (df1['Promo2'] == 0)][['YearMonth', 'Sales']].groupby('YearMonth').sum().reset_index()
aux2.plot(color = 'blue', alpha = 0.7, ax = ax)

# Legend entries follow plotting order. The first series combines both
# promotions, so it is labelled as such rather than just 'Promo2'.
ax.legend(labels = ['Promo + Promo2', 'Somente Promo']);


# Replace the single-letter holiday codes with descriptive labels; any value
# other than 'a', 'b' or 'c' becomes 'regular_day'.
holiday_labels = {'a': 'public_holiday',
                  'b': 'easter_holiday',
                  'c': 'christmas'}
df1['StateHoliday'] = df1['StateHoliday'].apply(lambda code: holiday_labels.get(code, 'regular_day'))


# Number of rows per holiday label.
df1[['StateHoliday']].value_counts()

StateHoliday  
regular_day       843428
public_holiday       694
easter_holiday       145
christmas             71
dtype: int64


# Hypothesis 6.

# Data preparation: keep only the holiday rows.
aux = df1.loc[df1['StateHoliday'] != 'regular_day']

# Plot 1: total sales per holiday type.
plt.subplot(1, 2, 1)
aux1 = aux.groupby('StateHoliday', as_index = False)['Sales'].sum()
p1 = sns.barplot(data = aux1, x = 'StateHoliday', y = 'Sales',
                 palette = 'Reds_r')
p1.set_title('Número de Vendas em Feriados', size = 15);

# Plot 2: total sales per holiday type, split by year.
plt.subplot(1, 2, 2)
aux2 = aux.groupby(['Year', 'StateHoliday'], as_index = False)['Sales'].sum()
p2 = sns.barplot(data = aux2, x = 'Year', y = 'Sales',
                 hue = 'StateHoliday', palette = 'Reds_r')
p2.set_title('Número de Vendas em Feriados por Ano', size = 15);


# Hypothesis 7.

# Plot 1: total sales with and without school holiday.
plt.subplot(2, 1, 1)
aux1 = df1.groupby('SchoolHoliday', as_index = False)['Sales'].sum()
p1 = sns.barplot(data = aux1, x = 'SchoolHoliday', y = 'Sales',
                 palette = 'Reds_r')
p1.set_title('Número de Vendas em Feriados Escolares', size = 15);

# Plot 2: the same comparison broken down by month.
plt.subplot(2, 1, 2)
aux2 = df1.groupby(['Month', 'SchoolHoliday'], as_index = False)['Sales'].sum()
p2 = sns.barplot(data = aux2, x = 'Month', y = 'Sales',
                 hue = 'SchoolHoliday', palette = 'Reds_r');


# Hypothesis 8.

# Plotting area: two panels side by side.
fig, (axis1, axis2) = plt.subplots(1, 2)

# Bar order shared by both panels.
assortment_order = ['a', 'b', 'c']

# Plot 1: sales per assortment type (barplot aggregates the raw rows itself).
p1 = sns.barplot(data = df1, x = 'Assortment', y = 'Sales',
                 order = assortment_order, palette = 'Reds_r', ax = axis1)
p1.set_title('Vendas por Tipo de Sortimento', size = 15);

# Plot 2: customers per assortment type.
p2 = sns.barplot(data = df1, x = 'Assortment', y = 'Customers',
                 order = assortment_order, palette = 'Reds_r', ax = axis2)
p2.set_title('Clientes por Tipo de Sortimento', size = 15);


# Distribution of the variable.

# Data preparation: total sales per distinct competition distance.
aux1 = df1.groupby('CompetitionDistance', as_index = False)['Sales'].sum()

# Plot.
p1 = sns.scatterplot(data = aux1, x = 'CompetitionDistance', y = 'Sales',
                     color = 'r')
p1.set_title('Distribuição da Variável CompetitionDistance', size = 15);


# Hypothesis 9.

# Distance ranges: edges every 1000 units from 0 up to 19000.
bins = list(np.arange(0, 20000, 1000))
aux1['CompetitionDistanceBinned'] = pd.cut(aux1['CompetitionDistance'], bins = bins)

# Data preparation: total sales per distance range.
aux2 = aux1[['CompetitionDistanceBinned', 'Sales']].groupby('CompetitionDistanceBinned').sum().reset_index()

# Plot.
p1 = sns.barplot(data = aux2, x = 'CompetitionDistanceBinned', y = 'Sales',
                 palette = 'Reds_r')
p1.set_title('Vendas por Distância de Concorrentes', size = 15)
plt.xticks(rotation = 45);


# New variable.

# Cast the year and month of the competitor opening to integers.
df1['CompetitionOpenSinceYear'] = df1['CompetitionOpenSinceYear'].astype('int64')
df1['CompetitionOpenSinceMonth'] = df1['CompetitionOpenSinceMonth'].astype('int64')

# Assemble the opening date (first day of the month) in one vectorised call
# instead of building a `datetime` object per row.
df1['CompetitionSince'] = pd.to_datetime(pd.DataFrame({'year': df1['CompetitionOpenSinceYear'],
                                                       'month': df1['CompetitionOpenSinceMonth'],
                                                       'day': 1}))


# New variable: elapsed time between the competitor opening and the sale date,
# in 30-day units (negative when the opening date is after the sale date).
# `.dt.days` reads the same `days` component the per-row lambda did.
df1['CompetitionTimeMonth'] = ((df1['Date'] - df1['CompetitionSince']) / 30).dt.days.astype('int64')


# Preview the transformation.
df1[['CompetitionOpenSinceMonth', 
     'CompetitionOpenSinceYear', 
     'CompetitionSince']].head()


# Preview the transformation.
df1[['CompetitionOpenSinceMonth', 
     'CompetitionOpenSinceYear', 
     'CompetitionSince', 
     'CompetitionTimeMonth']].head()


# Remove the intermediate columns.
df1 = df1.drop(['CompetitionOpenSinceMonth', 
                'CompetitionOpenSinceYear', 
                'CompetitionSince'], axis = 1)


# Hypothesis 10.

# Data preparation: total sales per elapsed competition period.
aux1 = df1.groupby('CompetitionTimeMonth', as_index = False)['Sales'].sum()

# Filter to reduce granularity: periods below 60, excluding period 0.
below_60 = aux1['CompetitionTimeMonth'] < 60
not_zero = aux1['CompetitionTimeMonth'] != 0
aux2 = aux1[below_60 & not_zero]

# Plot.
p1 = sns.barplot(data = aux2, x = 'CompetitionTimeMonth', y = 'Sales',
                 color = 'r', alpha = 0.7)
p1.set_title('Vendas por Período de Concorrência', size = 15)
plt.xticks(rotation = 90);


# New variable.

# Cast the promotion start week and year to integers.
df1['Promo2SinceWeek'] = df1['Promo2SinceWeek'].astype('int64')
df1['Promo2SinceYear'] = df1['Promo2SinceYear'].astype('int64')

# Build a 'year-week' label.
df1['PromoSince'] = df1['Promo2SinceYear'].astype(str) + '-' + df1['Promo2SinceWeek'].astype(str)


# Preview the transformation.
df1[['Promo2SinceWeek', 
     'Promo2SinceYear', 
     'PromoSince']].head()


# Turn the label into a date: the Monday of that '%W' week, minus 7 days.
# Each distinct label is parsed once and mapped back, instead of calling
# `strptime` for every row.
# NOTE(review): missing weeks were filled with ISO week numbers, but '%W'
# counts weeks from the first Monday of the year - confirm this is intended.
promo_since_by_label = {label: datetime.datetime.strptime(label + '-1', '%Y-%W-%w') - datetime.timedelta(days = 7)
                        for label in df1['PromoSince'].unique()}
df1['PromoSince'] = df1['PromoSince'].map(promo_since_by_label)


# Preview the variable.
df1[['PromoSince']].head()


# New variable: elapsed time between the promotion start and the sale date,
# in 7-day units (negative when the start is after the sale date).
df1['PromoTimeWeek'] = ((df1['Date'] - df1['PromoSince']) / 7).dt.days.astype('int64')


# Preview the variables.
df1[['Promo2',
     'Promo2SinceWeek', 
     'Promo2SinceYear', 
     'PromoSince', 
     'PromoTimeWeek']].head()


# Remove the intermediate columns.
df1 = df1.drop(['Promo2SinceYear', 
                'Promo2SinceWeek', 
                'PromoSince'], axis = 1)


# Hypothesis 11.

# Data preparation: total sales per elapsed promotion week.
aux1 = df1.groupby('PromoTimeWeek', as_index = False)['Sales'].sum()

# Plot 1: weeks with a positive elapsed time.
plt.subplot(2, 1, 1)
aux2 = aux1[aux1['PromoTimeWeek'] > 0]
p1 = sns.barplot(data = aux2, x = 'PromoTimeWeek', y = 'Sales',
                 color = 'r', alpha = 0.7)
# Hide the x-axis label and tick labels.
p1.set(xlabel = '')
p1.set_xticklabels('')
p1.set_title('Vendas no Período Estendido de Promoção');

# Plot 2: weeks with a negative elapsed time.
plt.subplot(2, 1, 2)
aux3 = aux1[aux1['PromoTimeWeek'] < 0]
p2 = sns.barplot(data = aux3, x = 'PromoTimeWeek', y = 'Sales',
                 color = 'r', alpha = 0.7)
# Hide the x-axis label and tick labels.
p2.set(xlabel = '')
p2.set_xticklabels('')
p2.set_title('Vendas no Período Normal de Promoção');


# Updated list of numeric variables.
nums2 = ['Sales', 'Customers', 'CompetitionDistance',
         'CompetitionTimeMonth', 'PromoTimeWeek']


# Correlation matrix of the numeric variables.
corr_df = df1[nums2].corr()

# Plot it as an annotated heatmap.
sns.heatmap(corr_df, annot = True, fmt = '.2f', cmap = 'Blues');


# Remove the 'Customers' column.
df1 = df1.drop(columns = ['Customers'])


# Updated list of categorical variables.
cats2 = ['Promo', 'StateHoliday', 'SchoolHoliday',
         'StoreType', 'Assortment', 'Promo2']


# Encoder object used for the conversion.
le = LabelEncoder()


# Encode each categorical column as integer codes. The encoder is refitted
# per column, so afterwards `le` only holds the classes of the last column.
for column in cats2:
    df1[column] = le.fit_transform(df1[column])


# Preview the transformation.
df1[cats2].head()


# Cramér's V correlation between the categorical variables
# (the same six columns listed in `cats2`).
correlation_matrix = corr_matrix(df1, cats2)

# Plot it as an annotated heatmap.
p1 = sns.heatmap(correlation_matrix, cmap = 'Blues', annot = True)
p1.set_title('Correlação Variáveis Categóricas', size = 15);


# Boxplots of the variables.
# The data is passed through `x=` explicitly: a positional Series is
# deprecated in seaborn 0.11 and is read as `data` in later releases.

# CompetitionDistance.
plt.subplot(3, 2, 1)
p1 = sns.boxplot(x = df1['CompetitionDistance'], 
                 color = 'tomato');

# CompetitionTimeMonth.
plt.subplot(3, 2, 2)
p2 = sns.boxplot(x = df1['CompetitionTimeMonth'], 
                 color = 'tomato');

# PromoTimeWeek.
plt.subplot(3, 2, 3)
p3 = sns.boxplot(x = df1['PromoTimeWeek'], 
                 color = 'tomato');

# Year.
plt.subplot(3, 2, 4)
p4 = sns.boxplot(x = df1['Year'], 
                 color = 'tomato');

plt.tight_layout()
plt.show()


# Scaler objects: robust scaler and min-max scaler.
rs, mms = RobustScaler(), MinMaxScaler()


# Variables before the transformation.
columns_to_scale = ['CompetitionDistance', 'CompetitionTimeMonth',
                    'PromoTimeWeek', 'Year']
df1[columns_to_scale].head()


# NOTE(review): both scalers are fitted on the whole dataframe, before the
# train/test split performed further down, so the scaling statistics include
# the test period. Consider fitting on the training rows only.

# Rescale the variables with outliers (robust scaler), one column at a time:
# CompetitionDistance, then CompetitionTimeMonth.
for column in ['CompetitionDistance', 'CompetitionTimeMonth']:
    df1[column] = rs.fit_transform(df1[[column]].values)


# Rescale the variables without outliers (min-max scaler), one column at a
# time: PromoTimeWeek, then Year.
for column in ['PromoTimeWeek', 'Year']:
    df1[column] = mms.fit_transform(df1[[column]].values)


# Variables after the transformation.
df1[['CompetitionDistance',
     'CompetitionTimeMonth',
     'PromoTimeWeek',
     'Year']].head()


# Distribution of the variable.

# Plot. `sns.distplot` is deprecated; `histplot` with a KDE overlay on a
# density scale is its documented replacement.
p = sns.histplot(df1['Sales'], 
                 kde = True, 
                 stat = 'density', 
                 color = 'r')
p.set_title('Distribuição da Variável Antes da Transformação', size = 15);


# Logarithmic transformation of the target: log(1 + Sales).
df1['Sales'] = np.log1p(df1['Sales'])


# Distribution of the variable after the transformation.

# Plot.
p1 = sns.histplot(df1['Sales'], 
                  kde = True, 
                  stat = 'density', 
                  color = 'r')
p1.set_title('Distribuição da Variável Após a Transformação', size = 15);


# Variables before the transformation.
df1[['Day', 
     'Month', 
     'WeekDay', 
     'WeekYear']].head()


# Rescaling of the periodic variables: each one is projected onto sine and
# cosine components. The NumPy functions are applied to whole columns instead
# of calling a Python lambda per element.
# NOTE(review): the period is 30 for 'Day' and 52 for 'WeekYear', yet days go
# up to 31 and ISO weeks up to 53, so e.g. day 31 gets the same encoding as
# day 1. Confirm whether 31 and 53 were intended.

# Day.
df1['DaySin'] = np.sin(df1['Day'] * (2 * np.pi/30))
df1['DayCos'] = np.cos(df1['Day'] * (2 * np.pi/30))

# Month.
df1['MonthSin'] = np.sin(df1['Month'] * (2 * np.pi/12))
df1['MonthCos'] = np.cos(df1['Month'] * (2 * np.pi/12))

# WeekDay.
df1['WeekDaySin'] = np.sin(df1['WeekDay'] * (2 * np.pi/7))
df1['WeekDayCos'] = np.cos(df1['WeekDay'] * (2 * np.pi/7))

# WeekYear.
df1['WeekYearSin'] = np.sin(df1['WeekYear'] * (2 * np.pi/52))
df1['WeekYearCos'] = np.cos(df1['WeekYear'] * (2 * np.pi/52))


# Remove the original calendar columns now that they are encoded.
cols_drop = ['Day', 
             'Month', 
             'WeekDay', 
             'WeekYear', 
             'YearMonth']
df1 = df1.drop(cols_drop, axis = 1)


# Dataframe information.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 844338 entries, 0 to 1017190
Data columns (total 21 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   Store                 844338 non-null  int64         
 1   Date                  844338 non-null  datetime64[ns]
 2   Sales                 844338 non-null  float64       
 3   Promo                 844338 non-null  int64         
 4   StateHoliday          844338 non-null  int64         
 5   SchoolHoliday         844338 non-null  int64         
 6   StoreType             844338 non-null  int64         
 7   Assortment            844338 non-null  int64         
 8   CompetitionDistance   844338 non-null  float64       
 9   Promo2                844338 non-null  int64         
 10  Year                  844338 non-null  float64       
 11  CompetitionTimeMonth  844338 non-null  float64       
 12  PromoTimeWeek         844338 non-null  float64       
 13  DaySin                844338 non-null  float64       
 14  DayCos                844338 non-null  float64       
 15  MonthSin              844338 non-null  float64       
 16  MonthCos              844338 non-null  float64       
 17  WeekDaySin            844338 non-null  float64       
 18  WeekDayCos            844338 non-null  float64       
 19  WeekYearSin           844338 non-null  float64       
 20  WeekYearCos           844338 non-null  float64       
dtypes: datetime64[ns](1), float64(13), int64(7)
memory usage: 174.0 MB


# Latest date present in the dataset.
df1[['Date']].max()

Date   2015-07-31
dtype: datetime64[ns]


# Date used to split the data: six weeks before the last date in the dataset.
# The previous expression took the last date of whichever store came first in
# the grouped result, which only matches when that store trades on the final day.
df1['Date'].max() - datetime.timedelta(days = 6 * 7)

Timestamp('2015-06-19 00:00:00')


# Training data: every row dated before 2015-06-19.
is_train = df1['Date'] < '2015-06-19'
X_train = df1[is_train]
y_train = X_train['Sales']

train_dates = X_train['Date']
print('Data início: {}'.format(train_dates.min()))
print('Data final: {}'.format(train_dates.max()))

Data início: 2013-01-01 00:00:00
Data final: 2015-06-18 00:00:00


# Test data: every row dated 2015-06-19 or later.
is_test = df1['Date'] >= '2015-06-19'
X_test = df1[is_test]
y_test = X_test['Sales']

test_dates = X_test['Date']
print('Data início: {}'.format(test_dates.min()))
print('Data final: {}'.format(test_dates.max()))

Data início: 2015-06-19 00:00:00
Data final: 2015-07-31 00:00:00


# Remove the target and the date from the feature sets.
non_feature_columns = ['Sales', 'Date']
X_train_2 = X_train.drop(columns = non_feature_columns)
X_test_2 = X_test.drop(columns = non_feature_columns)


# Feature matrix and target vector as NumPy arrays.
X = X_train_2.to_numpy()
y = X_train['Sales'].to_numpy()
print(X.shape, y.shape)

(802942, 19) (802942,)


# Univariate selector: keeps the 15 features with the highest F-statistic.
selector = SelectKBest(f_regression, k = 15)


# Fit the selector on the training data and reduce the feature matrix.
bestFeatuesSKB = selector.fit_transform(X, y)


# Names of the selected features.
selected_mask = selector.get_support()
bfSkb = X_train_2.columns[selected_mask]
bfSkb

Index(['Promo', 'SchoolHoliday', 'StoreType', 'Assortment',
       'CompetitionDistance', 'Promo2', 'Year', 'PromoTimeWeek', 'DaySin',
       'DayCos', 'MonthSin', 'MonthCos', 'WeekDaySin', 'WeekDayCos',
       'WeekYearCos'],
      dtype='object')


# Scores of the selected features, highest first.
all_scores = pd.Series(selector.scores_, index = X_train_2.columns)
scores = all_scores[selector.get_support()].sort_values(ascending = False)
scores

Promo                  155489.603874
Assortment              11534.214087
Promo2                  11123.513621
WeekDayCos               9374.377834
WeekDaySin               7360.133043
DayCos                   4679.657391
DaySin                   2273.465042
Year                     1911.438473
CompetitionDistance      1545.793879
PromoTimeWeek            1542.658416
SchoolHoliday            1125.107885
StoreType                 331.857841
WeekYearCos               105.110149
MonthSin                   86.955932
MonthCos                   85.252383
dtype: float64


# Add the store identifier to the selected features. The membership check
# keeps the list free of duplicates when this cell is re-run, or if 'Store'
# was already selected; a duplicate would yield a repeated column below.
bfSkb = list(bfSkb)
if 'Store' not in bfSkb:
    bfSkb.append('Store')


# New training set restricted to the selected features.
X_train_3 = X_train_2[bfSkb]
print(X_train_3.shape, y_train.shape)

(802942, 16) (802942,)


# New test set restricted to the selected features (the target is unchanged).
X_test_3 = X_test_2.loc[:, bfSkb]
print(X_test_3.shape, y_test.shape)

(41396, 16) (41396,)


# Definindo funções auxiliares.

# Per-element relative error of the model.
def percentage_error(actual, predicted):
    """Return the element-wise relative error between two 1-D sequences.

    Where ``actual`` is non-zero the error is
    ``(actual - predicted) / actual``. Where ``actual`` is zero that ratio is
    undefined, so ``predicted / mean(actual)`` is used instead.

    Parameters
    ----------
    actual : array-like
        Observed values.
    predicted : array-like
        Predicted values, same length as ``actual``.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``actual``.
    """
    actual = np.asarray(actual, dtype = float)
    predicted = np.asarray(predicted, dtype = float)

    res = np.empty(actual.shape)
    nonzero = actual != 0

    # Regular case, computed for all non-zero entries at once.
    res[nonzero] = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]

    # Zero entries: the mean is computed once, and only when it is needed.
    if not nonzero.all():
        res[~nonzero] = predicted[~nonzero] / np.mean(actual)

    return res

# MAPE (Mean Absolute Percentage Error).
def mean_absolute_percentage_error(y_true, y_pred): 
    """Return the mean absolute value of `percentage_error`, times 100.

    Both inputs are converted to NumPy arrays before the per-element errors
    are computed.
    """
    true_values = np.asarray(y_true)
    predictions = np.asarray(y_pred)
    absolute_errors = np.abs(percentage_error(true_values, predictions))
    return np.mean(absolute_errors) * 100

# Model metrics.
def ml_error(model_name, y, y_pred):
    """Return a one-row DataFrame with the MAE, MAPE and RMSE of a model.

    Parameters
    ----------
    model_name : label stored in the 'Modelo' column.
    y : observed values.
    y_pred : predicted values.

    Returns
    -------
    pandas.DataFrame
        Single row (index 0) with columns 'Modelo', 'MAE', 'MAPE', 'RMSE'.
    """
    metrics = {
        'Modelo': model_name,
        'MAE': mean_absolute_error(y, y_pred),
        'MAPE': mean_absolute_percentage_error(y, y_pred),
        # RMSE is the square root of the mean squared error.
        'RMSE': np.sqrt(mean_squared_error(y, y_pred)),
    }
    return pd.DataFrame(metrics, index = [0])


# Time-ordered cross-validation.
def cross_validation(x_training, kfold, model_name, model, verbose = False):
    """Evaluate a model on `kfold` six-week validation windows.

    The windows are taken backwards from the most recent date in
    `x_training`. For each k from `kfold` down to 1 the validation window goes
    from ``max_date - k * 6 weeks`` to ``max_date - (k - 1) * 6 weeks`` (both
    ends inclusive) and the model is trained on every row dated strictly
    before the start of the window.

    Parameters:
        x_training: DataFrame holding the features plus the 'Date' and 'Sales'
            columns. Both the target and the predictions are passed through
            ``np.expm1`` before the metrics are computed.
        kfold: number of validation windows.
        model_name: label written to the 'Model Name' column.
        model: estimator exposing ``fit`` and ``predict``. It is NOT modified:
            every fold fits a fresh clone, so an estimator that the caller has
            already trained on the full training set keeps that fit.
        verbose: when True, print the value of k for each fold.

    Returns:
        A one-row DataFrame with columns 'Model Name', 'MAE CV', 'MAPE CV' and
        'RMSE CV'; each metric is formatted as "mean +/- std" over the folds.
    """
    # Local import: the file's top-level imports do not bring `clone` in.
    from sklearn.base import clone

    def _mean_std(values):
        # "mean +/- std", both rounded to two decimals.
        return np.round(np.mean(values), 2).astype(str) + ' +/- ' + np.round(np.std(values), 2).astype(str)

    fold_length = datetime.timedelta(days = 6 * 7)

    # Loop-invariant: the most recent date anchors every window.
    last_date = x_training['Date'].max()

    mae_list  = []
    mape_list = []
    rmse_list = []

    for k in reversed(range(1, kfold + 1)):
        if verbose:
            print('\nValor de "K": {}'.format(k))

        # Start and end dates of the validation window.
        # NOTE(review): both ends are inclusive, so the end date of fold k is
        # also the start date of fold k - 1 and that day is validated twice.
        validation_start_date = last_date - k * fold_length
        validation_end_date = last_date - (k - 1) * fold_length

        # Split the rows by date.
        is_training = x_training['Date'] < validation_start_date
        is_validation = (x_training['Date'] >= validation_start_date) & (x_training['Date'] <= validation_end_date)
        training = x_training[is_training]
        validation = x_training[is_validation]

        # Training data.
        xtraining = training.drop(['Date', 'Sales'], axis = 1)
        ytraining = training['Sales']

        # Validation data.
        xvalidation = validation.drop(['Date', 'Sales'], axis = 1)
        yvalidation = validation['Sales']

        # Fit an unfitted copy so the caller's estimator is left untouched.
        # safe = False falls back to a deep copy for non-scikit-learn objects.
        m = clone(model, safe = False).fit(xtraining, ytraining)

        # Predictions for the validation window.
        y_pred = m.predict(xvalidation)

        # Metrics on the back-transformed values.
        m_result = ml_error(model_name, np.expm1(yvalidation), np.expm1(y_pred))

        # Keep plain scalars rather than one-element Series.
        mae_list.append(m_result['MAE'].iloc[0])
        mape_list.append(m_result['MAPE'].iloc[0])
        rmse_list.append(m_result['RMSE'].iloc[0])

    return pd.DataFrame({'Model Name': model_name,
                         'MAE CV': _mean_std(mae_list),
                         'MAPE CV': _mean_std(mape_list),
                         'RMSE CV': _mean_std(rmse_list)}, index = [0])


# Data sets used for cross-validation and for the final evaluation.

# Selected features plus the target and the date column.
best_vars = list(bfSkb)
best_vars.extend(['Sales', 'Date'])

# Build the sets.
X_train_4 = X_train[best_vars]
# X_test_4 receives new columns further down, so it is made an explicit,
# independent copy instead of a selection of X_test (avoids pandas'
# SettingWithCopy situation on those assignments).
X_test_4 = X_test[best_vars].copy()


# Average model (baseline): each row is predicted as the mean of its store.
# NOTE(review): the store means are computed from y_test itself, so the
# baseline is scored on values it has already seen — confirm this is intended.
aux1 = X_test_3.assign(Sales = y_test)

# Mean target per store, renamed so it can be joined back as the prediction.
aux2 = (
    aux1.groupby('Store')[['Sales']]
    .mean()
    .reset_index()
    .rename(columns = {'Sales': 'Predictions'})
)
aux1 = aux1.merge(aux2, how = 'left', on = 'Store')
yhat_baseline = aux1['Predictions']

# Metrics on the back-transformed (expm1) values.
baseline_result = ml_error('Average Model', np.expm1(y_test), np.expm1(yhat_baseline))
baseline_result


# Instantiate the linear regression and fit it on the training set.
lr = LinearRegression()
lr.fit(X_train_3, y_train)


# Predictions on the test set.
y_pred = lr.predict(X_test_3)


# Test-set performance, measured on the back-transformed (expm1) values.
lr_result = ml_error('Linear Regression', np.expm1(y_test), np.expm1(y_pred))
lr_result


# Cross-validation.
lr_result_cv = cross_validation(X_train_4, kfold = 5, model_name = 'Linear Regression', model = lr, verbose = False)
lr_result_cv


# Instantiate the Lasso regression (alpha = 0.01) and fit it on the training set.
lrr = Lasso(alpha = 0.01)
lrr.fit(X_train_3, y_train)


# Predictions on the test set.
y_pred_2 = lrr.predict(X_test_3)


# Test-set performance, measured on the back-transformed (expm1) values.
lrr_result = ml_error('Linear Regression - Lasso', np.expm1(y_test), np.expm1(y_pred_2))
lrr_result


# Cross-validation.
lrr_result_cv = cross_validation(X_train_4, kfold = 5, model_name = 'Linear Regression - Lasso', model = lrr, verbose = False)
lrr_result_cv


# Instantiate the random forest with default settings and fit it on the training set.
rf = RandomForestRegressor()
rf.fit(X_train_3, y_train)


# Predictions on the test set.
y_pred_3 = rf.predict(X_test_3)


# Test-set performance, measured on the back-transformed (expm1) values.
rf_result = ml_error('Random Forest Regressor', np.expm1(y_test), np.expm1(y_pred_3))
rf_result


# Cross-validation (prints the fold number as it progresses).
rf_result_cv = cross_validation(X_train_4, kfold = 5, model_name = 'Random Forest Regressor', model = rf, verbose = True)
rf_result_cv

Valor de "K": 5

Valor de "K": 4

Valor de "K": 3

Valor de "K": 2

Valor de "K": 1


# Test-set performance of every model, best (lowest) RMSE first.
modelling_results = pd.concat(
    [lr_result, lrr_result, rf_result]
)
modelling_results.sort_values(by = 'RMSE')


# Cross-validation performance of every model, best (lowest) mean RMSE first.
modelling_results_cv = pd.concat([lr_result_cv, lrr_result_cv, rf_result_cv])
# 'RMSE CV' holds text such as "1278.42 +/- 316.98"; sorting that text directly
# is lexicographic (e.g. "987.10" would come after "1278.42"), so the rows are
# ordered by the numeric mean that precedes the "+/-".
modelling_results_cv.sort_values('RMSE CV', key = lambda col: col.str.split().str[0].astype(float))


# Column with the random forest predictions, back-transformed with expm1.
X_test_4['Predictions'] = np.expm1(y_pred_3)

# Target back-transformed with expm1 as well, so both columns share a scale.
X_test_4['Sales'] = np.expm1(X_test_4['Sales'])


# First rows of the observed and predicted values.
X_test_4[['Sales', 'Predictions']].head()


# Sum of the predictions of each store.
df_pred = (
    X_test_4[['Store', 'Predictions']]
    .groupby('Store')
    .sum()
    .reset_index()
)


# Error metrics of each store.

# MAE.
df_aux1 = (
    X_test_4[['Store', 'Sales', 'Predictions']]
    .groupby('Store')
    .apply(lambda grp: mean_absolute_error(grp['Sales'], grp['Predictions']))
    .reset_index()
    .rename(columns = {0: 'MAE'})
)

# MAPE.
df_aux2 = (
    X_test_4[['Store', 'Sales', 'Predictions']]
    .groupby('Store')
    .apply(lambda grp: mean_absolute_percentage_error(grp['Sales'], grp['Predictions']))
    .reset_index()
    .rename(columns = {0: 'MAPE'})
)


# Join the totals with the two metrics, one row per store.
df_aux3 = df_aux1.merge(df_aux2, how = 'inner', on = 'Store')
df_pred = df_pred.merge(df_aux3, how = 'inner', on = 'Store')


# First rows of the result.
df_pred.head()


# Worst and best scenarios: the store total shifted down / up by its MAE.
# NOTE(review): 'Predictions' is a per-store sum while 'MAE' is a per-row mean
# error — confirm that shifting the total by a single MAE is intended.
df_pred = df_pred.assign(
    WorstScenario = df_pred['Predictions'] - df_pred['MAE'],
    BestScenario = df_pred['Predictions'] + df_pred['MAE'],
)

# Column order used in the report.
report_columns = ['Store', 'Predictions', 'WorstScenario', 'BestScenario', 'MAE', 'MAPE']
df_pred = df_pred[report_columns]

# Stores with the highest MAPE.
df_pred.sort_values(by = 'MAPE', ascending = False).head()


# MAPE of each store.

# Scatter plot with one red point per store.
p1 = sns.scatterplot(data = df_pred, x = 'Store', y = 'MAPE', color = 'r')
p1.set_title('MAPE/Unidade', size = 15);


# Table with the three scenarios summed over all stores.
df_pred2 = (
    df_pred[['Predictions', 'WorstScenario', 'BestScenario']]
    .sum(axis = 0)
    .reset_index()
    .rename(columns = {'index': 'scenario', 0: 'values'})
)
# Format the totals as currency text.
df_pred2['values'] = df_pred2['values'].map('${:,.2f}'.format)
df_pred2


# Model errors per row: the ratio predicted / observed and the difference
# observed - predicted.
X_test_4['ErrorRate'] = X_test_4['Predictions'] / X_test_4['Sales']
X_test_4['Error'] = X_test_4['Sales'] - X_test_4['Predictions']


# Model results on the test set (2 x 2 grid of plots).

# Plot 1: observed sales and predictions over time.
plt.subplot(2, 2, 1)
sns.lineplot(x = 'Date', y = 'Sales', color = 'r', data = X_test_4, label = 'SALES')
sns.lineplot(x = 'Date', y = 'Predictions', color = 'g', data = X_test_4, label = 'PREDICTIONS')

# Plot 2: error rate (predicted / observed) over time; the dashed line at 1
# marks a prediction equal to the observed value.
plt.subplot(2, 2, 2)
sns.lineplot(x = 'Date', y = 'ErrorRate', color = 'r', data = X_test_4)
plt.axhline(1, linestyle = '--', color = 'g')

# Plot 3: distribution of the errors.
# histplot with a KDE on a density scale replaces the deprecated sns.distplot.
plt.subplot(2, 2, 3)
sns.histplot(X_test_4['Error'], kde = True, stat = 'density', color = 'r')

# Plot 4: errors against predictions.
# x and y are passed by keyword; newer seaborn releases reject positional x/y.
plt.subplot(2, 2, 4)
sns.scatterplot(x = X_test_4['Predictions'], y = X_test_4['Error'], color = 'r');

	Store	DayOfWeek	Date	Sales	Customers	Open	Promo	SchoolHoliday	StoreType	Assortment	CompetitionDistance	CompetitionOpenSinceMonth	CompetitionOpenSinceYear	Promo2	Promo2SinceWeek	Promo2SinceYear	PromoInterval
0	1	5	2015-07-31	5263	555	1	1	1	c	a	1270.0	9.0	2008.0	0	NaN	NaN	NaN
1	2	5	2015-07-31	6064	625	1	1	1	a	a	570.0	11.0	2007.0	1	13.0	2010.0	Jan,Apr,Jul,Oct
2	3	5	2015-07-31	8314	821	1	1	1	a	a	14130.0	12.0	2006.0	1	14.0	2011.0	Jan,Apr,Jul,Oct
3	4	5	2015-07-31	13995	1498	1	1	1	c	c	620.0	9.0	2009.0	0	NaN	NaN	NaN
4	5	5	2015-07-31	4822	559	1	1	1	a	a	29910.0	4.0	2015.0	0	NaN	NaN	NaN

	count	mean	std	min	25%	50%	75%	max
Sales	844338.0	6955.959134	3103.815515	46.0	4859.0	6369.0	8360.0	41551.0
Customers	844338.0	762.777166	401.194153	8.0	519.0	676.0	893.0	7388.0
CompetitionDistance	844338.0	5961.827515	12592.181107	20.0	710.0	2330.0	6910.0	200000.0
Promo2SinceWeek	844338.0	23.629083	14.288315	1.0	12.0	22.0	37.0	52.0

	Promo	Promo2	Sales
0	0	1	1289362241
1	1	1	1472275754
2	0	0	1482612096
3	1	0	1628930532

	CompetitionOpenSinceMonth	CompetitionOpenSinceYear	CompetitionSince
0	9	2008	2008-09-01
1	11	2007	2007-11-01
2	12	2006	2006-12-01
3	9	2009	2009-09-01
4	4	2015	2015-04-01

	PromoSince
0	2015-07-27
1	2010-03-22
2	2011-03-28
3	2015-07-27
4	2015-07-27

Previsão de Vendas - Sales Forecast¶

1. Entendimento do Negócio¶

1.1 Dicionário de Dados¶

1.2 Estratégia da Solução¶

2. Entendimento dos Dados¶

2.1 Bibliotecas Necessárias¶

2.2 Tratamento Inicial dos Dados¶

2.3 Análise Exploratória¶

2.3.1 Variável de Identificação das Unidades.¶

2.3.2 Variável de Datas.¶

2.3.3 Variáveis Numéricas¶

2.3.4 Variáveis Categóricas¶

2.3.5 Hipóteses de Negócio¶

3. Engenharia de Atributos¶

4. Pré-Processamento dos Dados¶

4.1 Análise de Correlação¶

4.1.1 Análise de Correlação (Variáveis Numéricas)¶

4.1.2 Análise de Correlação (Variáveis Categóricas)¶

4.2 Redimensionamento das Variáveis¶

4.2.1 Redimensionando Variáveis com Escalas Distintas¶

4.2.2 Redimensionando a Variável Target¶

4.2.3 Redimensionando Variáveis Periódicas¶

4.3 Seleção das Variáveis¶

4.3.1 Divisão Treino e Teste¶

4.3.2 Seleção de Variáveis (Feature Selection)¶

5. Modelagem Preditiva¶

5.1 Definindo Funções Auxiliares¶

5.2 Average Model¶

5.3 Regressão Linear¶

5.4 Regressão Linear Lasso¶

5.5 Random Forest Regressor¶

6. Avaliação dos Modelos¶

7. Conclusões Finais¶

	Date	Day	Month	Year	WeekDay	WeekYear	YearMonth
0	2015-07-31	31	7	2015	4	31	2015-07
1	2015-07-31	31	7	2015	4	31	2015-07
2	2015-07-31	31	7	2015	4	31	2015-07
3	2015-07-31	31	7	2015	4	31	2015-07
4	2015-07-31	31	7	2015	4	31	2015-07

	Promo2SinceWeek	Promo2SinceYear	PromoSince
0	31	2015	2015-31
1	13	2010	2010-13
2	14	2011	2011-14
3	31	2015	2015-31
4	31	2015	2015-31

	CompetitionDistance	CompetitionTimeMonth	PromoTimeWeek	Year
0	-0.170968	0.918919	0.287016	1.0
1	-0.283871	1.054054	0.922551	1.0
2	1.903226	1.202703	0.801822	1.0
3	-0.275806	0.743243	0.287016	1.0
4	4.448387	-0.162162	0.287016	1.0

Modelo	MAE	MAPE	RMSE
Random Forest Regressor	700.012929	10.410141	1050.098965
Linear Regression	1873.854367	28.914407	2702.565526
Linear Regression - Lasso	1899.604403	28.803252	2766.601292

Model Name	MAE CV	MAPE CV	RMSE CV
Random Forest Regressor	853.65 +/- 213.05	11.78 +/- 2.06	1278.42 +/- 316.98
Linear Regression	2108.99 +/- 335.01	29.82 +/- 1.86	3013.54 +/- 519.62
Linear Regression - Lasso	2130.05 +/- 367.59	29.56 +/- 1.25	3066.85 +/- 548.48

	Sales	Predictions
0	5263.0	5678.656562
1	6064.0	6617.162129
2	8314.0	10228.371249
3	13995.0	12388.958453
4	4822.0	5389.180189

	Store	Predictions	MAE	MAPE
0	1	166373.651431	304.034602	7.069573
1	2	181940.886507	404.285189	7.668412
2	3	272150.177739	781.721589	10.930225
3	4	350813.541240	777.386941	7.446817
4	5	168660.563019	299.026349	7.013860

	Store	Predictions	WorstScenario	BestScenario	MAE	MAPE
291	292	107769.069507	104226.514095	111311.624919	3542.555412	59.849305
908	909	244436.375370	237332.628573	251540.122167	7103.746797	48.673154
97	98	280147.888920	278142.496518	282153.281322	2005.392402	36.456502
549	550	253841.112603	252179.271722	255502.953484	1661.840881	32.525347
875	876	214881.073593	210915.360542	218846.786644	3965.713051	31.150108

	scenario	values
0	Predictions	$291,661,573.80
1	WorstScenario	$290,876,668.31
2	BestScenario	$292,446,479.29