#install.packages("lubridate") 
#install.packages("tidyverse")
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.1     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## -- Conflicts ------------------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date()        masks base::date()
## x dplyr::filter()          masks stats::filter()
## x lubridate::intersect()   masks base::intersect()
## x dplyr::lag()             masks stats::lag()
## x lubridate::setdiff()     masks base::setdiff()
## x lubridate::union()       masks base::union()
library(forecast)
library(dplyr)

title: “Project Alcohol” author: “A. Pendem, F. Tungcharoensukjira, J. Wong, T.Shalhoub”


Action Plan:

The aim of this project is to analyze the distribution and trend of alcohol sales in Iowa and accordingly predict the number of bottle sales for the next 12 months. The predictions can help to facilitate making decisions regarding optimal quantities of orders and production in the future.

Data Summary

The table below summarises the data that has been collected for the purpose of developing the model. Since we have 12M rows of data, we will look at the first 100,000 rows of data for a preliminary analysis before proceeding. The preliminary data contains a representative range of dates, districts and products.

AlcoholData<-read.csv('C:/Users/Jaime/Documents/INSEAD MBA/P3/Data Science/Iowa_Liquor_Sales.csv', as.is = TRUE, nrows = 100000) #load data

#to load data randomly<- read.csv.sql("x.csv", sql = "select * from file order by random() limit 20000")

str(AlcoholData)
## 'data.frame':    100000 obs. of  24 variables:
##  $ Invoice.Item.Number  : chr  "S29198800001" "S29195400002" "S29050300001" "S28867700001" ...
##  $ Date                 : chr  "11/20/2015" "11/21/2015" "11/16/2015" "11/04/2015" ...
##  $ Store.Number         : int  2191 2205 3549 2513 3942 3650 2538 3942 2662 4307 ...
##  $ Store.Name           : chr  "Keokuk Spirits" "Ding's Honk And Holler" "Quicker Liquor Store" "Hy-Vee Food Store #2 / Iowa City" ...
##  $ Address              : chr  "1013 MAIN" "900 E WASHINGTON" "1414 48TH ST" "812  S 1ST AVE" ...
##  $ City                 : chr  "KEOKUK" "CLARINDA" "FORT MADISON" "IOWA CITY" ...
##  $ Zip.Code             : chr  "52632" "51632" "52627" "52240" ...
##  $ Store.Location       : chr  "1013 MAIN\nKEOKUK 52632\n(40.39978, -91.387531)" "900 E WASHINGTON\nCLARINDA 51632\n(40.739238, -95.02756)" "1414 48TH ST\nFORT MADISON 52627\n(40.624226, -91.373211)" "812 S 1ST AVE\nIOWA CITY 52240\n" ...
##  $ County.Number        : int  56 73 56 52 86 47 7 86 70 43 ...
##  $ County               : chr  "Lee" "Page" "Lee" "Johnson" ...
##  $ Category             : int  NA NA NA NA NA NA 1701100 NA 1701100 NA ...
##  $ Category.Name        : chr  "" "" "" "" ...
##  $ Vendor.Number        : int  255 255 130 65 130 65 962 65 65 130 ...
##  $ Vendor.Name          : chr  "Wilson Daniels Ltd." "Wilson Daniels Ltd." "Disaronno International LLC" "Jim Beam Brands" ...
##  $ Item.Number          : int  297 297 249 237 249 237 238 237 173 249 ...
##  $ Item.Description     : chr  "Templeton Rye w/Flask" "Templeton Rye w/Flask" "Disaronno Amaretto Cavalli Mignon 3-50ml Pack" "Knob Creek w/ Crystal Decanter" ...
##  $ Pack                 : int  6 6 20 3 20 3 6 3 12 20 ...
##  $ Bottle.Volume..ml.   : int  750 750 150 1750 150 1750 1500 1750 750 150 ...
##  $ State.Bottle.Cost    : chr  "$18.09" "$18.09" "$6.40" "$35.55" ...
##  $ State.Bottle.Retail  : chr  "$27.14" "$27.14" "$9.60" "$53.34" ...
##  $ Bottles.Sold         : int  6 12 2 3 2 1 6 2 4 2 ...
##  $ Sale..Dollars.       : chr  "$162.84" "$325.68" "$19.20" "$160.02" ...
##  $ Volume.Sold..Liters. : num  4.5 9 0.3 5.25 0.3 1.75 9 3.5 3 0.3 ...
##  $ Volume.Sold..Gallons.: num  1.19 2.38 0.08 1.39 0.08 0.46 2.38 0.92 0.79 0.08 ...
summary(AlcoholData)
##  Invoice.Item.Number     Date            Store.Number   Store.Name       
##  Length:100000       Length:100000      Min.   :2106   Length:100000     
##  Class :character    Class :character   1st Qu.:2602   Class :character  
##  Mode  :character    Mode  :character   Median :3651   Mode  :character  
##                                         Mean   :3492                     
##                                         3rd Qu.:4194                     
##                                         Max.   :9018                     
##                                                                          
##    Address              City             Zip.Code        
##  Length:100000      Length:100000      Length:100000     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  Store.Location     County.Number      County             Category      
##  Length:100000      Min.   : 1.00   Length:100000      Min.   :1011100  
##  Class :character   1st Qu.:31.00   Class :character   1st Qu.:1012210  
##  Mode  :character   Median :61.00   Mode  :character   Median :1032080  
##                     Mean   :57.05                      Mean   :1044499  
##                     3rd Qu.:77.00                      3rd Qu.:1062310  
##                     Max.   :99.00                      Max.   :1701100  
##                     NA's   :125                        NA's   :106      
##  Category.Name      Vendor.Number   Vendor.Name         Item.Number    
##  Length:100000      Min.   : 10.0   Length:100000      Min.   :   173  
##  Class :character   1st Qu.:115.0   Class :character   1st Qu.: 27102  
##  Mode  :character   Median :260.0   Mode  :character   Median : 38176  
##                     Mean   :256.3                      Mean   : 45627  
##                     3rd Qu.:380.0                      3rd Qu.: 62061  
##                     Max.   :978.0                      Max.   :994200  
##                                                                        
##  Item.Description        Pack        Bottle.Volume..ml. State.Bottle.Cost 
##  Length:100000      Min.   :  1.00   Min.   :  50.0     Length:100000     
##  Class :character   1st Qu.:  6.00   1st Qu.: 750.0     Class :character  
##  Mode  :character   Median : 12.00   Median : 750.0     Mode  :character  
##                     Mean   : 12.15   Mean   : 932.6                       
##                     3rd Qu.: 12.00   3rd Qu.:1000.0                       
##                     Max.   :336.00   Max.   :6000.0                       
##                                                                           
##  State.Bottle.Retail  Bottles.Sold      Sale..Dollars.    
##  Length:100000       Min.   :   1.000   Length:100000     
##  Class :character    1st Qu.:   3.000   Class :character  
##  Mode  :character    Median :   6.000   Mode  :character  
##                      Mean   :   9.688                     
##                      3rd Qu.:  12.000                     
##                      Max.   :2328.000                     
##                                                           
##  Volume.Sold..Liters. Volume.Sold..Gallons.
##  Min.   :   0.100     Min.   :  0.030      
##  1st Qu.:   1.750     1st Qu.:  0.460      
##  Median :   5.250     Median :  1.390      
##  Mean   :   8.919     Mean   :  2.356      
##  3rd Qu.:  10.500     3rd Qu.:  2.770      
##  Max.   :2328.000     Max.   :614.990      
## 

Cleaning the data

Since the dataset is large and complex, it needed to be cleaned so that it can be properly processed in R.

Through cleaning the data, we had to deal with the following issues: 1) The dataset contained over 12m rows of data and therefore the file was too large to be processed in Excel. We used R to conduct analysis on a sample of the data. 2) Ensuring that the data was in the appropriate format to conduct the analysis in R. 3) Adding data fields for the purpose simplifying the analysis conducted.

In order to facilitate data analysis in R we have to fix the incorrectly classfied data types by: 1) making the date format R friendly 2) making all other variables into factors, excluding dates, sales and cost numbers 3) taking out the $ value in the sales and cost variables and making them in numeric format

# Fixing incorrectly classified data types:
AlcoholData$Invoice.Item.Number <- as.factor(AlcoholData$Invoice.Item.Number)
AlcoholData$Date <- as.Date(AlcoholData$Date, "%m/%d/%Y") # Making date format R friendly
AlcoholData$Store.Number <- as.factor(AlcoholData$Store.Number)
AlcoholData$Store.Name <- as.factor(AlcoholData$Store.Name)
AlcoholData$Address <- as.factor(AlcoholData$Address)
AlcoholData$City <- as.factor(AlcoholData$City)
AlcoholData$Zip.Code <- as.factor(AlcoholData$Zip.Code)
AlcoholData$Store.Location <- as.factor(AlcoholData$Store.Location)
AlcoholData$County.Number <- as.factor(AlcoholData$County.Number)
AlcoholData$Category <- as.factor(AlcoholData$Category)
AlcoholData$Category.Name <- as.factor(AlcoholData$Category.Name)
AlcoholData$Vendor.Number<- as.factor(AlcoholData$Vendor.Number)
AlcoholData$Vendor.Name<- as.factor(AlcoholData$Vendor.Name)
AlcoholData$Item.Number<- as.factor(AlcoholData$Item.Number)
AlcoholData$Item.Description<- as.factor(AlcoholData$Item.Description)
AlcoholData$State.Bottle.Cost <- as.numeric(gsub("\\$","",AlcoholData$State.Bottle.Cost)) # Making numeric and removing $ sign
AlcoholData$State.Bottle.Retail <- as.numeric(gsub("\\$","",AlcoholData$State.Bottle.Retail)) # Making numeric and removing $ sign
AlcoholData$Sale..Dollars <- as.numeric(gsub("\\$","",AlcoholData$Sale..Dollars)) # Making numeric and removing $ sign

Adding columns to classify data

In order to understand the trends in alcohol consumption in Iowa over time, we added columns to the dataset which included adding the following variables based on the date of sale – year, month, and day of the week. These can be used to conduct an analysis on the time series data to identify any seasonaility and trends existing in the data.

AlcoholData$dayofweek <- wday(AlcoholData$Date,label=TRUE) # Creating column that highlights what day of week 
AlcoholData$month <- month(AlcoholData$Date,label=TRUE) # Creating column showing month
AlcoholData$year <- year(AlcoholData$Date) # creating column showing year

Understanding the data

Based on our hypothesis, we would expect sales of alcohol to be seasonal, with a peak in sales in certain months in which here are more holidays (e.g. Thanksgiving, Christmas, 4th of July), as well as a peak in sales towards the weekend.

AlcoholData %>%
  group_by(month) %>%
  summarise(sales = sum(`Bottles.Sold`)) %>%
  ggplot(aes(month,sales, group = 1)) + 
  geom_area(aes(alpha = .5, fill = 1)) +
  geom_bar(stat = 'identity', aes(color = 1)) +
  theme_minimal() +
  labs(x = "", y = "Sales", title = "Number of bottles sold by month")  +
  scale_y_continuous(labels = scales::comma) + 
  guides(color = FALSE, fill = FALSE, alpha = FALSE)

plot of chunk unnamed-chunk-5

#to plot which month has the highest sales

The chart above shows the sales of alcohol by month. The chart shows that most of the sales seem to happen in October, and there does not seem to be a strong seasonality component as the sales are fairly consistent across other months.

AlcoholData %>%
  group_by(dayofweek) %>%
  summarise(sales = sum(`Bottles.Sold`)) %>%
  ggplot(aes(dayofweek,sales, group = 1)) + 
  geom_area(aes(alpha = .5, fill = 1)) +
  geom_bar(stat = 'identity', aes(color = 1)) +
  theme_minimal() +
  labs(x = "", y = "Sales", title = "Number of bottles sold by day of the week")  +
  scale_y_continuous(labels = scales::comma) + 
  guides(color = FALSE, fill = FALSE, alpha = FALSE)

plot of chunk unnamed-chunk-6

#to plot which day has the highest sales

Iowa does not sell alcohol on Sundays due to legal restrictions, and based on the above chart, it seems that most of the sales happen from Monday - Thursday instead, perhaps due to people making their purchases over the weekdays for the weekends.

Narrowing Scope of analysis

Due to the large size of the dataset, we decided to narrow the scope of our subsequent analysis to a range of specific products. We used the below charts to decide on which Category of products to analyze for maximum business impact.

We plotted the number of bottles sold by category to determine the most popular type of products in terms of movement of inventory (number of bottles sold), as shown below.

barplot(las=2, table(AlcoholData$Category.Name), cex.names=0.5)

plot of chunk unnamed-chunk-7

#see which types of liquor are the most popular based on frequency of sale

We also added columns to our data which calculate profit to see what are the products generating the most profits to businesses in the state (in terms of dollar value of absolute profits).

AlcoholData$profitperbottle<- AlcoholData$State.Bottle.Retail - AlcoholData$State.Bottle.Cost # Creating column for profit per bottle
AlcoholData$profittotal<- AlcoholData$profitperbottle*AlcoholData$Bottles.Sold # Creating column for total profit

The chart below shows the total profits generated by each category of products.

profitdf = AlcoholData %>% group_by(Category.Name) %>% mutate(sumprofit = sum(profittotal))
profitdf = profitdf %>% select(Category.Name,sumprofit)
profitdf = unique(profitdf)


profitdf %>%
  ggplot(aes(Category.Name,sumprofit, group = 1)) + 
  geom_area(aes(alpha = .5, fill = 1)) +
  geom_bar(stat = 'identity', aes(color = 1)) +
  theme_minimal() +
  labs(x = "", y = "Total Profit", title = "Profit by Category")  +
  scale_y_continuous(labels = scales::comma) + 
  guides(color = FALSE, fill = FALSE, alpha = FALSE)

plot of chunk unnamed-chunk-9

The chart shows that Vodka 80 Proof was the higest in bottles sold in Iowa, while the profit generated by Canadian Whiskeys is the highest. Since the objective of our model is to predict quantities of sales, we decided to focus our analysis on the product with the highest movement of inventory, Vodka 80 Proof, which has only slightly lower sales in terms of revenue as compared to Canadian Whiskey. Also, the Canadian Whiskey dataset covers a timespan of less than 2 years up to the end of 2016, making it insufficient for developing a prediction for sales in 2018.

We proceeded with extracting out only the Vodka 80 Proof data from the entire dataset: We use the time series data of sales of Vodka 80 Proof to predict the number of bottles sold using different forecasting models.

Forecasting

filtereddata<-read.csv('C:/Users/Jaime/Documents/INSEAD MBA/P3/Data Science/fitereddata2.csv') #load data

newdf = filtereddata %>% group_by(year,month) %>% mutate(testsum = sum(Bottles.Sold))
newdf = newdf %>% select(year,month,testsum)
newdf = unique(newdf)
newdf = arrange(newdf,year,month)

newdf_ts <- ts(newdf$testsum,start=2012.01, frequency=12) # ts function defines the dataset as timeseries starting Jan 2012 and having seasonality of frequency 12 (monthly) 


#plot various decompositions into error/noise, trend and seasonality

fit <- decompose(newdf_ts, type="multiplicative") #decompose using "classical" method, multiplicative form
plot(fit)

plot of chunk unnamed-chunk-11

fit <- decompose(newdf_ts, type="additive") #decompose using "classical" method, additive form
plot(fit)

plot of chunk unnamed-chunk-11

fit <- stl(newdf_ts, t.window=12, s.window="periodic", robust=TRUE) #decompose using STL (Season and trend using Loess)
plot(fit)

plot of chunk unnamed-chunk-11

plot(newdf_ts)

plot of chunk unnamed-chunk-11

# Create exponential smoothing models: additive vs multiplicative noise (first A vs M), additive vs multiplicative trend (second A vs M) and no seasonality vs automatic detection (third N vs Z) trend and no seasonlity (AAN), multiplicative (MMN)
newdf_AAN <- ets(newdf_ts, model="AAN")
newdf_AAZ <- ets(newdf_ts, model="AAZ", damped=FALSE)
newdf_MMN <- ets(newdf_ts, model="MMN", damped=FALSE)
newdf_MMZ <- ets(newdf_ts, model="MMZ", damped=FALSE)

# Create their prediction "cones" for 12 months (1 year) into the future with quintile confidence intervals
newdf_AAN_pred <- forecast(newdf_AAN, h=12, level=c(0.2, 0.4, 0.6, 0.8))
newdf_AAZ_pred <- forecast(newdf_AAZ, h=12, level=c(0.2, 0.4, 0.6, 0.8))
newdf_MMN_pred <- forecast(newdf_MMN, h=12, level=c(0.2, 0.4, 0.6, 0.8))
newdf_MMZ_pred <- forecast(newdf_MMZ, h=12, level=c(0.2, 0.4, 0.6, 0.8))

#Create a trigonometric box-cox autoregressive trend seasonality (TBATS) model
newdf_tbats <- tbats(newdf_ts)
newdf_tbats_pred <-forecast(newdf_tbats, h=12)
plot(newdf_tbats_pred, xlab="Year", ylab="Predicted Bottles Sold")

plot of chunk unnamed-chunk-11

par(mfrow=c(1,1)) # Lets look at them one-by-one
plot(newdf_AAZ_pred, xlab="Year", ylab="Predicted Bottles Sold")

plot of chunk unnamed-chunk-11

plot(newdf_MMZ_pred, xlab="Year", ylab="Predicted Bottles Sold")

plot of chunk unnamed-chunk-11

plot(newdf_tbats_pred, xlab="Year", ylab="Predicted Bottles Sold")

plot of chunk unnamed-chunk-11

# Lets look at what our models actually are
newdf_AAZ
## ETS(A,A,N) 
## 
## Call:
##  ets(y = newdf_ts, model = "AAZ", damped = FALSE) 
## 
##   Smoothing parameters:
##     alpha = 0.0083 
##     beta  = 0.0083 
## 
##   Initial states:
##     l = 286092.6804 
##     b = 695.1576 
## 
##   sigma:  45032.57
## 
##      AIC     AICc      BIC 
## 1435.516 1436.716 1445.642
newdf_MMZ
## ETS(M,M,M) 
## 
## Call:
##  ets(y = newdf_ts, model = "MMZ", damped = FALSE) 
## 
##   Smoothing parameters:
##     alpha = 0.0065 
##     beta  = 0.0065 
##     gamma = 1e-04 
## 
##   Initial states:
##     l = 280493.1941 
##     b = 1.0036 
##     s=0.9698 1.1671 0.8847 0.9583 0.9276 1.0624
##            1.0843 0.9433 0.8253 1.0852 1.0985 0.9933
## 
##   sigma:  0.0913
## 
##      AIC     AICc      BIC 
## 1410.150 1426.255 1444.581
newdf_tbats
## TBATS(0, {2,2}, 1, {<12,4>})
## 
## Call: tbats(y = newdf_ts)
## 
## Parameters
##   Lambda: 1e-06
##   Alpha: -0.02011629
##   Beta: 0.01479715
##   Damping Parameter: 1
##   Gamma-1 Values: 0.0001625124
##   Gamma-2 Values: 0.0001420674
##   AR coefficients: -0.943186 -0.395815
##   MA coefficients: 1.146374 0.98029
## 
## Seed States:
##               [,1]
##  [1,] 12.547936342
##  [2,]  0.007365120
##  [3,]  0.031584960
##  [4,]  0.072373480
##  [5,] -0.061380399
##  [6,] -0.054251182
##  [7,] -0.006366751
##  [8,] -0.017480309
##  [9,]  0.063997022
## [10,]  0.003886602
## [11,]  0.000000000
## [12,]  0.000000000
## [13,]  0.000000000
## [14,]  0.000000000
## 
## Sigma: 0.084367
## AIC: 1414.031
# Print the mean predictions from the three seasonal models to the console screen to copy and paste from the screen to Excel 
cbind(newdf_AAZ_pred$mean, newdf_MMZ_pred$mean, newdf_tbats_pred$mean)
## Time Series:
## Start = 2016.67666666667 
## End = 2017.59333333333 
## Frequency = 12 
##          newdf_AAZ_pred$mean newdf_MMZ_pred$mean newdf_tbats_pred$mean
## 2016.677            377428.3            360165.1              346437.0
## 2016.760            380329.8            335050.5              379013.7
## 2016.843            383231.2            445388.6              421045.7
## 2016.927            386132.6            372914.6              419232.5
## 2017.010            389034.0            384887.2              396474.8
## 2017.093            391935.4            428884.5              464999.6
## 2017.177            394836.8            426923.3              429998.6
## 2017.260            397738.2            327185.3              339369.0
## 2017.343            400639.6            376831.6              389077.1
## 2017.427            403541.0            436487.0              471331.2
## 2017.510            406442.4            430927.3              447791.0
## 2017.593            409343.9            379137.3              416096.7
write.csv(newdf_MMZ_pred$mean, file = "Predicted Bottles Solds.csv") # export the selected model's predictions into a CSV file

#ARIMA model could also be fitted but does not work well in this case. Need ARIMA with covariates (e.g., Fourier)
newdf_arima <- auto.arima(newdf_ts, seasonal=TRUE)
newdf_arima
## Series: newdf_ts 
## ARIMA(3,1,0)(1,0,0)[12] 
## 
## Coefficients:
##           ar1      ar2      ar3    sar1
##       -0.7247  -0.5120  -0.5375  0.6271
## s.e.   0.1134   0.1307   0.1152  0.1105
## 
## sigma^2 estimated as 1.49e+09:  log likelihood=-660.68
## AIC=1331.36   AICc=1332.58   BIC=1341.4
newdf_arima_pred <-forecast(newdf_arima, h=12)
plot(newdf_arima_pred, ylab="Predicted Bottles Sold")

plot of chunk unnamed-chunk-11

The decomposition chart of the time series shows that there is an upward trend in the data in addition to a pattern of seasonality. We proceeded with plotting the forecasted sales by using different methods including ETS, TBATS, and ARIMA. Based on our understanding of the sales patterns for alcohol in Iowa and the demographics of the state, we select the ETS(M,M,M) model as the best fit model for predicting sales in the coming 12 months. We expect the trend of increasing consumption to continue into the future as the portion of the population within the legal age of alcohol consumption will continue to grow. The improvements in the US economy in 2018 further promote greater expenditure on alcohol products in the future. We expect the seasonality trend to apply as well given the short timespan covered by the prediction period (12 months).

Other factors to be considered

In assessing the accuracy of our models to make predictions for the future, and ion particular the long-term, it is important to consider other factors that may impact the future sales of alcohol in the state of Iowa. For example, we need to consider the effect of the following factors on sales of 80 proof Vodka in Iowa:

1) Introduction of new products can affect demand for existing product through the threat of cannibalisation.

2) Changes in consumer preferences due to changes in demographic and economic status

3) Impact of changes in government and legislation in the state of Iowa and neighboring states on sales of alcoholic products in Iowa.

4) Changes in macro-economic and political situations.