#install.packages("lubridate")
#install.packages("tidyverse")
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.1 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts ------------------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x lubridate::setdiff() masks base::setdiff()
## x lubridate::union() masks base::union()
library(forecast)
library(dplyr)
title: “Project Alcohol” author: “A. Pendem, F. Tungcharoensukjira, J. Wong, T.Shalhoub”
The aim of this project is to analyze the distribution and trend of alcohol sales in Iowa and accordingly predict the number of bottle sales for the next 12 months. The predictions can help to facilitate making decisions regarding optimal quantities of orders and production in the future.
The table below summarises the data that has been collected for the purpose of developing the model. Since we have 12M rows of data, we will look at the first 100,000 rows of data for a preliminary analysis before proceeding. The preliminary data contains a representative range of dates, districts and products.
AlcoholData<-read.csv('C:/Users/Jaime/Documents/INSEAD MBA/P3/Data Science/Iowa_Liquor_Sales.csv', as.is = TRUE, nrows = 100000) #load data
#to load data randomly<- read.csv.sql("x.csv", sql = "select * from file order by random() limit 20000")
str(AlcoholData)
## 'data.frame': 100000 obs. of 24 variables:
## $ Invoice.Item.Number : chr "S29198800001" "S29195400002" "S29050300001" "S28867700001" ...
## $ Date : chr "11/20/2015" "11/21/2015" "11/16/2015" "11/04/2015" ...
## $ Store.Number : int 2191 2205 3549 2513 3942 3650 2538 3942 2662 4307 ...
## $ Store.Name : chr "Keokuk Spirits" "Ding's Honk And Holler" "Quicker Liquor Store" "Hy-Vee Food Store #2 / Iowa City" ...
## $ Address : chr "1013 MAIN" "900 E WASHINGTON" "1414 48TH ST" "812 S 1ST AVE" ...
## $ City : chr "KEOKUK" "CLARINDA" "FORT MADISON" "IOWA CITY" ...
## $ Zip.Code : chr "52632" "51632" "52627" "52240" ...
## $ Store.Location : chr "1013 MAIN\nKEOKUK 52632\n(40.39978, -91.387531)" "900 E WASHINGTON\nCLARINDA 51632\n(40.739238, -95.02756)" "1414 48TH ST\nFORT MADISON 52627\n(40.624226, -91.373211)" "812 S 1ST AVE\nIOWA CITY 52240\n" ...
## $ County.Number : int 56 73 56 52 86 47 7 86 70 43 ...
## $ County : chr "Lee" "Page" "Lee" "Johnson" ...
## $ Category : int NA NA NA NA NA NA 1701100 NA 1701100 NA ...
## $ Category.Name : chr "" "" "" "" ...
## $ Vendor.Number : int 255 255 130 65 130 65 962 65 65 130 ...
## $ Vendor.Name : chr "Wilson Daniels Ltd." "Wilson Daniels Ltd." "Disaronno International LLC" "Jim Beam Brands" ...
## $ Item.Number : int 297 297 249 237 249 237 238 237 173 249 ...
## $ Item.Description : chr "Templeton Rye w/Flask" "Templeton Rye w/Flask" "Disaronno Amaretto Cavalli Mignon 3-50ml Pack" "Knob Creek w/ Crystal Decanter" ...
## $ Pack : int 6 6 20 3 20 3 6 3 12 20 ...
## $ Bottle.Volume..ml. : int 750 750 150 1750 150 1750 1500 1750 750 150 ...
## $ State.Bottle.Cost : chr "$18.09" "$18.09" "$6.40" "$35.55" ...
## $ State.Bottle.Retail : chr "$27.14" "$27.14" "$9.60" "$53.34" ...
## $ Bottles.Sold : int 6 12 2 3 2 1 6 2 4 2 ...
## $ Sale..Dollars. : chr "$162.84" "$325.68" "$19.20" "$160.02" ...
## $ Volume.Sold..Liters. : num 4.5 9 0.3 5.25 0.3 1.75 9 3.5 3 0.3 ...
## $ Volume.Sold..Gallons.: num 1.19 2.38 0.08 1.39 0.08 0.46 2.38 0.92 0.79 0.08 ...
summary(AlcoholData)
## Invoice.Item.Number Date Store.Number Store.Name
## Length:100000 Length:100000 Min. :2106 Length:100000
## Class :character Class :character 1st Qu.:2602 Class :character
## Mode :character Mode :character Median :3651 Mode :character
## Mean :3492
## 3rd Qu.:4194
## Max. :9018
##
## Address City Zip.Code
## Length:100000 Length:100000 Length:100000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Store.Location County.Number County Category
## Length:100000 Min. : 1.00 Length:100000 Min. :1011100
## Class :character 1st Qu.:31.00 Class :character 1st Qu.:1012210
## Mode :character Median :61.00 Mode :character Median :1032080
## Mean :57.05 Mean :1044499
## 3rd Qu.:77.00 3rd Qu.:1062310
## Max. :99.00 Max. :1701100
## NA's :125 NA's :106
## Category.Name Vendor.Number Vendor.Name Item.Number
## Length:100000 Min. : 10.0 Length:100000 Min. : 173
## Class :character 1st Qu.:115.0 Class :character 1st Qu.: 27102
## Mode :character Median :260.0 Mode :character Median : 38176
## Mean :256.3 Mean : 45627
## 3rd Qu.:380.0 3rd Qu.: 62061
## Max. :978.0 Max. :994200
##
## Item.Description Pack Bottle.Volume..ml. State.Bottle.Cost
## Length:100000 Min. : 1.00 Min. : 50.0 Length:100000
## Class :character 1st Qu.: 6.00 1st Qu.: 750.0 Class :character
## Mode :character Median : 12.00 Median : 750.0 Mode :character
## Mean : 12.15 Mean : 932.6
## 3rd Qu.: 12.00 3rd Qu.:1000.0
## Max. :336.00 Max. :6000.0
##
## State.Bottle.Retail Bottles.Sold Sale..Dollars.
## Length:100000 Min. : 1.000 Length:100000
## Class :character 1st Qu.: 3.000 Class :character
## Mode :character Median : 6.000 Mode :character
## Mean : 9.688
## 3rd Qu.: 12.000
## Max. :2328.000
##
## Volume.Sold..Liters. Volume.Sold..Gallons.
## Min. : 0.100 Min. : 0.030
## 1st Qu.: 1.750 1st Qu.: 0.460
## Median : 5.250 Median : 1.390
## Mean : 8.919 Mean : 2.356
## 3rd Qu.: 10.500 3rd Qu.: 2.770
## Max. :2328.000 Max. :614.990
##
Since the dataset is large and complex, it needed to be cleaned so that it can be properly processed in R.
Through cleaning the data, we had to deal with the following issues: 1) The dataset contained over 12m rows of data and therefore the file was too large to be processed in Excel. We used R to conduct analysis on a sample of the data. 2) Ensuring that the data was in the appropriate format to conduct the analysis in R. 3) Adding data fields for the purpose simplifying the analysis conducted.
In order to facilitate data analysis in R we have to fix the incorrectly classfied data types by: 1) making the date format R friendly 2) making all other variables into factors, excluding dates, sales and cost numbers 3) taking out the $ value in the sales and cost variables and making them in numeric format
# Fixing incorrectly classified data types:
AlcoholData$Invoice.Item.Number <- as.factor(AlcoholData$Invoice.Item.Number)
AlcoholData$Date <- as.Date(AlcoholData$Date, "%m/%d/%Y") # Making date format R friendly
AlcoholData$Store.Number <- as.factor(AlcoholData$Store.Number)
AlcoholData$Store.Name <- as.factor(AlcoholData$Store.Name)
AlcoholData$Address <- as.factor(AlcoholData$Address)
AlcoholData$City <- as.factor(AlcoholData$City)
AlcoholData$Zip.Code <- as.factor(AlcoholData$Zip.Code)
AlcoholData$Store.Location <- as.factor(AlcoholData$Store.Location)
AlcoholData$County.Number <- as.factor(AlcoholData$County.Number)
AlcoholData$Category <- as.factor(AlcoholData$Category)
AlcoholData$Category.Name <- as.factor(AlcoholData$Category.Name)
AlcoholData$Vendor.Number<- as.factor(AlcoholData$Vendor.Number)
AlcoholData$Vendor.Name<- as.factor(AlcoholData$Vendor.Name)
AlcoholData$Item.Number<- as.factor(AlcoholData$Item.Number)
AlcoholData$Item.Description<- as.factor(AlcoholData$Item.Description)
AlcoholData$State.Bottle.Cost <- as.numeric(gsub("\\$","",AlcoholData$State.Bottle.Cost)) # Making numeric and removing $ sign
AlcoholData$State.Bottle.Retail <- as.numeric(gsub("\\$","",AlcoholData$State.Bottle.Retail)) # Making numeric and removing $ sign
AlcoholData$Sale..Dollars <- as.numeric(gsub("\\$","",AlcoholData$Sale..Dollars)) # Making numeric and removing $ sign
In order to understand the trends in alcohol consumption in Iowa over time, we added columns to the dataset which included adding the following variables based on the date of sale – year, month, and day of the week. These can be used to conduct an analysis on the time series data to identify any seasonaility and trends existing in the data.
AlcoholData$dayofweek <- wday(AlcoholData$Date,label=TRUE) # Creating column that highlights what day of week
AlcoholData$month <- month(AlcoholData$Date,label=TRUE) # Creating column showing month
AlcoholData$year <- year(AlcoholData$Date) # creating column showing year
Based on our hypothesis, we would expect sales of alcohol to be seasonal, with a peak in sales in certain months in which here are more holidays (e.g. Thanksgiving, Christmas, 4th of July), as well as a peak in sales towards the weekend.
AlcoholData %>%
group_by(month) %>%
summarise(sales = sum(`Bottles.Sold`)) %>%
ggplot(aes(month,sales, group = 1)) +
geom_area(aes(alpha = .5, fill = 1)) +
geom_bar(stat = 'identity', aes(color = 1)) +
theme_minimal() +
labs(x = "", y = "Sales", title = "Number of bottles sold by month") +
scale_y_continuous(labels = scales::comma) +
guides(color = FALSE, fill = FALSE, alpha = FALSE)
#to plot which month has the highest sales
The chart above shows the sales of alcohol by month. The chart shows that most of the sales seem to happen in October, and there does not seem to be a strong seasonality component as the sales are fairly consistent across other months.
AlcoholData %>%
group_by(dayofweek) %>%
summarise(sales = sum(`Bottles.Sold`)) %>%
ggplot(aes(dayofweek,sales, group = 1)) +
geom_area(aes(alpha = .5, fill = 1)) +
geom_bar(stat = 'identity', aes(color = 1)) +
theme_minimal() +
labs(x = "", y = "Sales", title = "Number of bottles sold by day of the week") +
scale_y_continuous(labels = scales::comma) +
guides(color = FALSE, fill = FALSE, alpha = FALSE)
#to plot which day has the highest sales
Iowa does not sell alcohol on Sundays due to legal restrictions, and based on the above chart, it seems that most of the sales happen from Monday - Thursday instead, perhaps due to people making their purchases over the weekdays for the weekends.
Due to the large size of the dataset, we decided to narrow the scope of our subsequent analysis to a range of specific products. We used the below charts to decide on which Category of products to analyze for maximum business impact.
We plotted the number of bottles sold by category to determine the most popular type of products in terms of movement of inventory (number of bottles sold), as shown below.
barplot(las=2, table(AlcoholData$Category.Name), cex.names=0.5)
#see which types of liquor are the most popular based on frequency of sale
We also added columns to our data which calculate profit to see what are the products generating the most profits to businesses in the state (in terms of dollar value of absolute profits).
AlcoholData$profitperbottle<- AlcoholData$State.Bottle.Retail - AlcoholData$State.Bottle.Cost # Creating column for profit per bottle
AlcoholData$profittotal<- AlcoholData$profitperbottle*AlcoholData$Bottles.Sold # Creating column for total profit
The chart below shows the total profits generated by each category of products.
profitdf = AlcoholData %>% group_by(Category.Name) %>% mutate(sumprofit = sum(profittotal))
profitdf = profitdf %>% select(Category.Name,sumprofit)
profitdf = unique(profitdf)
profitdf %>%
ggplot(aes(Category.Name,sumprofit, group = 1)) +
geom_area(aes(alpha = .5, fill = 1)) +
geom_bar(stat = 'identity', aes(color = 1)) +
theme_minimal() +
labs(x = "", y = "Total Profit", title = "Profit by Category") +
scale_y_continuous(labels = scales::comma) +
guides(color = FALSE, fill = FALSE, alpha = FALSE)
The chart shows that Vodka 80 Proof was the higest in bottles sold in Iowa, while the profit generated by Canadian Whiskeys is the highest. Since the objective of our model is to predict quantities of sales, we decided to focus our analysis on the product with the highest movement of inventory, Vodka 80 Proof, which has only slightly lower sales in terms of revenue as compared to Canadian Whiskey. Also, the Canadian Whiskey dataset covers a timespan of less than 2 years up to the end of 2016, making it insufficient for developing a prediction for sales in 2018.
We proceeded with extracting out only the Vodka 80 Proof data from the entire dataset: We use the time series data of sales of Vodka 80 Proof to predict the number of bottles sold using different forecasting models.
filtereddata<-read.csv('C:/Users/Jaime/Documents/INSEAD MBA/P3/Data Science/fitereddata2.csv') #load data
newdf = filtereddata %>% group_by(year,month) %>% mutate(testsum = sum(Bottles.Sold))
newdf = newdf %>% select(year,month,testsum)
newdf = unique(newdf)
newdf = arrange(newdf,year,month)
newdf_ts <- ts(newdf$testsum,start=2012.01, frequency=12) # ts function defines the dataset as timeseries starting Jan 2012 and having seasonality of frequency 12 (monthly)
#plot various decompositions into error/noise, trend and seasonality
fit <- decompose(newdf_ts, type="multiplicative") #decompose using "classical" method, multiplicative form
plot(fit)
fit <- decompose(newdf_ts, type="additive") #decompose using "classical" method, additive form
plot(fit)
fit <- stl(newdf_ts, t.window=12, s.window="periodic", robust=TRUE) #decompose using STL (Season and trend using Loess)
plot(fit)
plot(newdf_ts)
# Create exponential smoothing models: additive vs multiplicative noise (first A vs M), additive vs multiplicative trend (second A vs M) and no seasonality vs automatic detection (third N vs Z) trend and no seasonlity (AAN), multiplicative (MMN)
newdf_AAN <- ets(newdf_ts, model="AAN")
newdf_AAZ <- ets(newdf_ts, model="AAZ", damped=FALSE)
newdf_MMN <- ets(newdf_ts, model="MMN", damped=FALSE)
newdf_MMZ <- ets(newdf_ts, model="MMZ", damped=FALSE)
# Create their prediction "cones" for 12 months (1 year) into the future with quintile confidence intervals
newdf_AAN_pred <- forecast(newdf_AAN, h=12, level=c(0.2, 0.4, 0.6, 0.8))
newdf_AAZ_pred <- forecast(newdf_AAZ, h=12, level=c(0.2, 0.4, 0.6, 0.8))
newdf_MMN_pred <- forecast(newdf_MMN, h=12, level=c(0.2, 0.4, 0.6, 0.8))
newdf_MMZ_pred <- forecast(newdf_MMZ, h=12, level=c(0.2, 0.4, 0.6, 0.8))
#Create a trigonometric box-cox autoregressive trend seasonality (TBATS) model
newdf_tbats <- tbats(newdf_ts)
newdf_tbats_pred <-forecast(newdf_tbats, h=12)
plot(newdf_tbats_pred, xlab="Year", ylab="Predicted Bottles Sold")
par(mfrow=c(1,1)) # Lets look at them one-by-one
plot(newdf_AAZ_pred, xlab="Year", ylab="Predicted Bottles Sold")
plot(newdf_MMZ_pred, xlab="Year", ylab="Predicted Bottles Sold")
plot(newdf_tbats_pred, xlab="Year", ylab="Predicted Bottles Sold")
# Lets look at what our models actually are
newdf_AAZ
## ETS(A,A,N)
##
## Call:
## ets(y = newdf_ts, model = "AAZ", damped = FALSE)
##
## Smoothing parameters:
## alpha = 0.0083
## beta = 0.0083
##
## Initial states:
## l = 286092.6804
## b = 695.1576
##
## sigma: 45032.57
##
## AIC AICc BIC
## 1435.516 1436.716 1445.642
newdf_MMZ
## ETS(M,M,M)
##
## Call:
## ets(y = newdf_ts, model = "MMZ", damped = FALSE)
##
## Smoothing parameters:
## alpha = 0.0065
## beta = 0.0065
## gamma = 1e-04
##
## Initial states:
## l = 280493.1941
## b = 1.0036
## s=0.9698 1.1671 0.8847 0.9583 0.9276 1.0624
## 1.0843 0.9433 0.8253 1.0852 1.0985 0.9933
##
## sigma: 0.0913
##
## AIC AICc BIC
## 1410.150 1426.255 1444.581
newdf_tbats
## TBATS(0, {2,2}, 1, {<12,4>})
##
## Call: tbats(y = newdf_ts)
##
## Parameters
## Lambda: 1e-06
## Alpha: -0.02011629
## Beta: 0.01479715
## Damping Parameter: 1
## Gamma-1 Values: 0.0001625124
## Gamma-2 Values: 0.0001420674
## AR coefficients: -0.943186 -0.395815
## MA coefficients: 1.146374 0.98029
##
## Seed States:
## [,1]
## [1,] 12.547936342
## [2,] 0.007365120
## [3,] 0.031584960
## [4,] 0.072373480
## [5,] -0.061380399
## [6,] -0.054251182
## [7,] -0.006366751
## [8,] -0.017480309
## [9,] 0.063997022
## [10,] 0.003886602
## [11,] 0.000000000
## [12,] 0.000000000
## [13,] 0.000000000
## [14,] 0.000000000
##
## Sigma: 0.084367
## AIC: 1414.031
# Print the mean predictions from the three seasonal models to the console screen to copy and paste from the screen to Excel
cbind(newdf_AAZ_pred$mean, newdf_MMZ_pred$mean, newdf_tbats_pred$mean)
## Time Series:
## Start = 2016.67666666667
## End = 2017.59333333333
## Frequency = 12
## newdf_AAZ_pred$mean newdf_MMZ_pred$mean newdf_tbats_pred$mean
## 2016.677 377428.3 360165.1 346437.0
## 2016.760 380329.8 335050.5 379013.7
## 2016.843 383231.2 445388.6 421045.7
## 2016.927 386132.6 372914.6 419232.5
## 2017.010 389034.0 384887.2 396474.8
## 2017.093 391935.4 428884.5 464999.6
## 2017.177 394836.8 426923.3 429998.6
## 2017.260 397738.2 327185.3 339369.0
## 2017.343 400639.6 376831.6 389077.1
## 2017.427 403541.0 436487.0 471331.2
## 2017.510 406442.4 430927.3 447791.0
## 2017.593 409343.9 379137.3 416096.7
write.csv(newdf_MMZ_pred$mean, file = "Predicted Bottles Solds.csv") # export the selected model's predictions into a CSV file
#ARIMA model could also be fitted but does not work well in this case. Need ARIMA with covariates (e.g., Fourier)
newdf_arima <- auto.arima(newdf_ts, seasonal=TRUE)
newdf_arima
## Series: newdf_ts
## ARIMA(3,1,0)(1,0,0)[12]
##
## Coefficients:
## ar1 ar2 ar3 sar1
## -0.7247 -0.5120 -0.5375 0.6271
## s.e. 0.1134 0.1307 0.1152 0.1105
##
## sigma^2 estimated as 1.49e+09: log likelihood=-660.68
## AIC=1331.36 AICc=1332.58 BIC=1341.4
newdf_arima_pred <-forecast(newdf_arima, h=12)
plot(newdf_arima_pred, ylab="Predicted Bottles Sold")
The decomposition chart of the time series shows that there is an upward trend in the data in addition to a pattern of seasonality. We proceeded with plotting the forecasted sales by using different methods including ETS, TBATS, and ARIMA. Based on our understanding of the sales patterns for alcohol in Iowa and the demographics of the state, we select the ETS(M,M,M) model as the best fit model for predicting sales in the coming 12 months. We expect the trend of increasing consumption to continue into the future as the portion of the population within the legal age of alcohol consumption will continue to grow. The improvements in the US economy in 2018 further promote greater expenditure on alcohol products in the future. We expect the seasonality trend to apply as well given the short timespan covered by the prediction period (12 months).
In assessing the accuracy of our models to make predictions for the future, and ion particular the long-term, it is important to consider other factors that may impact the future sales of alcohol in the state of Iowa. For example, we need to consider the effect of the following factors on sales of 80 proof Vodka in Iowa:
1) Introduction of new products can affect demand for existing product through the threat of cannibalisation.
2) Changes in consumer preferences due to changes in demographic and economic status
3) Impact of changes in government and legislation in the state of Iowa and neighboring states on sales of alcoholic products in Iowa.
4) Changes in macro-economic and political situations.