HW8 Sols

R Markdown

This is an R Markdown document. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.

The syntax on line 19 then the subsequent on line 56 tells R Markdown to run R code without the asterisks embeds the subsequent code as embedded inside the R Markdown file

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(plyr)

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

library(doBy)
library(tidyr)
library(glmnet)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loaded glmnet 3.0-2

library(ggplot2)

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:dplyr':
## 
##     combine

setwd("C:/Users/jlariv/OneDrive/Econ 404/")
mydata <- read.csv("oj.csv")
oj <- mydata
colnames(mydata)[1-17]

##  [1] "store"    "brand"    "week"     "logmove"  "feat"     "price"   
##  [7] "AGE60"    "EDUC"     "ETHNIC"   "INCOME"   "HHLARGE"  "WORKWOM" 
## [13] "HVAL150"  "SSTRDIST" "SSTRVOL"  "CPWVOL5"

#reduce decimal places
is.num=sapply(mydata, is.numeric)
mydata[is.num] = lapply(mydata[is.num], round, 4)

#CREATE LAGGED DATA with recycled code
#First why to add in lagged weeks
df1 <-oj
df1$week<-df1$week+1  # df1 now has NEXT week and not the current one.  If we merge this by weeks now, this is last week's price (e.g., "lagged price").
df2 <-oj
df2$week<-df1$week+2  

myvars <- c("price", "logmove","week", "brand","store", "feat")
df1 <- df1[myvars]
names(df1)[names(df1)=="price"] = "price_t1"
names(df1)[names(df1)=="feat"] = "feat_t1"
names(df1)[names(df1)=="logmove"] = "logmove_t1"
lagged <- merge(oj, df1, by=c("brand","store","week"))
#NOTE: The number of observations decreased.  Why?  You've just lost (at least) one week's worth of data at each store.
lagged=lagged[order(lagged$week,lagged$store),]
lagged=lagged[order(lagged$store,lagged$week),]
#Comparing this to above, Store two is nowhere to be found.  There was missing data for consecutive weeks.  As a result, it gets dropped.

#Now do the same this with two weeks of lagged data.

df2 <- df2[myvars]
names(df2)[names(df2)=="price"] = "price_t2"
names(df2)[names(df2)=="feat"] = "feat_t2"
names(df2)[names(df2)=="logmove"] = "logmove_t2"
lagged_RF <- merge(lagged, df2, by=c("brand","store","week"))

#Repeat entire process with additional lag
df3 <- oj
df2$week<-df1$week+3  
df3 <- df3[myvars]
names(df3)[names(df3)=="price"] = "price_t3"
names(df3)[names(df3)=="feat"] = "feat_t3"
names(df3)[names(df3)=="logmove"] = "logmove_t3"
lagged_RF <- merge(lagged_RF, df3, by=c("brand","store","week"))

#drop unknown features to avoid spliting on these features.
lagged_RF$SSTRDIST <- NULL
lagged_RF$SSTRVOL <- NULL
lagged_RF$CPWVOL5 <- NULL
lagged_RF$CPDIST5 <- NULL

# NOTE: This code does not scale in the number of products; for that we need to utilize loops or some prepackaged function.
lagged_RF_D <- subset(lagged_RF,brand=="dominicks")
lagged_RF_M <- subset(lagged_RF,brand=="minute.maid")
lagged_RF_T <- subset(lagged_RF,brand=="tropicana")

#Dominicks rename 
names(lagged_RF_D)[names(lagged_RF_D)=="price_t3"] = "price_t3_D"
names(lagged_RF_D)[names(lagged_RF_D)=="price_t2"] = "price_t2_D"
names(lagged_RF_D)[names(lagged_RF_D)=="price_t1"] = "price_t1_D"
names(lagged_RF_D)[names(lagged_RF_D)=="price"] = "price_D"

names(lagged_RF_D)[names(lagged_RF_D)=="feat_t3"] = "feat_t3_D"
names(lagged_RF_D)[names(lagged_RF_D)=="feat_t2"] = "feat_t2_D"
names(lagged_RF_D)[names(lagged_RF_D)=="feat_t1"] = "feat_t1_D"
names(lagged_RF_D)[names(lagged_RF_D)=="feat"] = "feat_D"

names(lagged_RF_D)[names(lagged_RF_D)=="logmove_t3"] = "logmove_t3_D"
names(lagged_RF_D)[names(lagged_RF_D)=="logmove_t2"] = "logmove_t2_D"
names(lagged_RF_D)[names(lagged_RF_D)=="logmove_t1"] = "logmove_t1_D"
names(lagged_RF_D)[names(lagged_RF_D)=="logmove"] = "logmove_D"

#MM rename 
names(lagged_RF_M)[names(lagged_RF_M)=="price_t3"] = "price_t3_M"
names(lagged_RF_M)[names(lagged_RF_M)=="price_t2"] = "price_t2_M"
names(lagged_RF_M)[names(lagged_RF_M)=="price_t1"] = "price_t1_M"
names(lagged_RF_M)[names(lagged_RF_M)=="price"] = "price_M"

names(lagged_RF_M)[names(lagged_RF_M)=="feat_t3"] = "feat_t3_M"
names(lagged_RF_M)[names(lagged_RF_M)=="feat_t2"] = "feat_t2_M"
names(lagged_RF_M)[names(lagged_RF_M)=="feat_t1"] = "feat_t1_M"
names(lagged_RF_M)[names(lagged_RF_M)=="feat"] = "feat_M"

names(lagged_RF_M)[names(lagged_RF_M)=="logmove_t3"] = "logmove_t3_M"
names(lagged_RF_M)[names(lagged_RF_M)=="logmove_t2"] = "logmove_t2_M"
names(lagged_RF_M)[names(lagged_RF_M)=="logmove_t1"] = "logmove_t1_M"
names(lagged_RF_M)[names(lagged_RF_M)=="logmove"] = "logmove_M"

#Trop rename 
names(lagged_RF_T)[names(lagged_RF_T)=="price_t3"] = "price_t3_T"
names(lagged_RF_T)[names(lagged_RF_T)=="price_t2"] = "price_t2_T"
names(lagged_RF_T)[names(lagged_RF_T)=="price_t1"] = "price_t1_T"
names(lagged_RF_T)[names(lagged_RF_T)=="price"] = "price_T"

names(lagged_RF_T)[names(lagged_RF_T)=="feat_t3"] = "feat_t3_T"
names(lagged_RF_T)[names(lagged_RF_T)=="feat_t2"] = "feat_t2_T"
names(lagged_RF_T)[names(lagged_RF_T)=="feat_t1"] = "feat_t1_T"
names(lagged_RF_T)[names(lagged_RF_T)=="feat"] = "feat_T"

names(lagged_RF_T)[names(lagged_RF_T)=="logmove_t3"] = "logmove_t3_T"
names(lagged_RF_T)[names(lagged_RF_T)=="logmove_t2"] = "logmove_t2_T"
names(lagged_RF_T)[names(lagged_RF_T)=="logmove_t1"] = "logmove_t1_T"
names(lagged_RF_T)[names(lagged_RF_T)=="logmove"] = "logmove_T"

#temp DFs for merging to other brands' sales
#Trop for D and M
myvars_T <- c("feat_t3_T", "feat_t2_T", "feat_t1_T","feat_T", "price_t3_T", "price_t2_T", "price_t1_T","price_T", "logmove_T","logmove_t1_T", "logmove_t2_T", "logmove_t3_T","store","week")
temp_T <- lagged_RF_T[myvars_T]

#Dom for T and M
myvars_D <- c("feat_t3_D", "feat_t2_D", "feat_t1_D","feat_D", "price_t3_D", "price_t2_D", "price_t1_D","price_D","logmove_D","logmove_t1_D", "logmove_t2_D", "logmove_t3_D","store","week")
temp_D <- lagged_RF_D[myvars_D]

#MM for T and D
myvars_M <- c("feat_t3_M", "feat_t2_M", "feat_t1_M","feat_M", "price_t3_M", "price_t2_M", "price_t1_M","price_M","logmove_M","logmove_t1_M", "logmove_t2_M", "logmove_t3_M","store","week")
temp_M <- lagged_RF_M[myvars_M]

lagged_RF_D <- merge(lagged_RF_D, temp_T, by=c("store","week"))
lagged_RF_D <- merge(lagged_RF_D, temp_M, by=c("store","week"))

lagged_RF_M <- merge(lagged_RF_M, temp_T, by=c("store","week"))
lagged_RF_M <- merge(lagged_RF_M, temp_D, by=c("store","week"))

lagged_RF_T <- merge(lagged_RF_T, temp_D, by=c("store","week"))
lagged_RF_T <- merge(lagged_RF_T, temp_M, by=c("store","week"))


#Train the model excluding contemporaneous features 
## DOMINICKS
DQ <- lagged_RF_D
DQ$feat_D <- NULL
DQ$price_D <- NULL
DQ$feat_M <- NULL
DQ$price_M <- NULL
DQ$feat_T <- NULL
DQ$price_T <- NULL
range(DQ$week)

## [1]  43 160

#Make sure to remove store variables since they are numeric and likely uninformative as such.  The store FEs will be picked up by sociodemographics.  Adjusting week to be a linear time trend.
DQ$week <-DQ$week-42
colnames(DQ)

##  [1] "store"        "week"         "brand"        "logmove_D"    "AGE60"       
##  [6] "EDUC"         "ETHNIC"       "INCOME"       "HHLARGE"      "WORKWOM"     
## [11] "HVAL150"      "price_t1_D"   "logmove_t1_D" "feat_t1_D"    "price_t2_D"  
## [16] "logmove_t2_D" "feat_t2_D"    "price_t3_D"   "logmove_t3_D" "feat_t3_D"   
## [21] "feat_t3_T"    "feat_t2_T"    "feat_t1_T"    "price_t3_T"   "price_t2_T"  
## [26] "price_t1_T"   "logmove_T"    "logmove_t1_T" "logmove_t2_T" "logmove_t3_T"
## [31] "feat_t3_M"    "feat_t2_M"    "feat_t1_M"    "price_t3_M"   "price_t2_M"  
## [36] "price_t1_M"   "logmove_M"    "logmove_t1_M" "logmove_t2_M" "logmove_t3_M"

DQ.rf <- randomForest(logmove_D ~ AGE60+EDUC+ETHNIC+INCOME+HHLARGE+WORKWOM+HVAL150+price_t1_D+feat_t1_D+price_t2_D+feat_t2_D+price_t3_D+feat_t3_D+feat_t3_T+feat_t2_T+feat_t1_T+price_t3_T+price_t2_T+price_t1_T+feat_t3_M, data = DQ, ntree = 100, keep.forest = TRUE)
DQ$pred_logmove_D = predict(DQ.rf)
DQ$resid_QD <- DQ$logmove_D- DQ$pred_logmove_D
mean(DQ$resid_QD)

## [1] -0.0001510409

fig_DQ<-ggplot(DQ, aes(pred_logmove_D,logmove_D))+geom_point() + geom_smooth(method='lm') + geom_abline(intercept = 0, slope =1)

fig_DQ

## `geom_smooth()` using formula 'y ~ x'

DQ$resid_QD_2 <- (DQ$logmove_D- DQ$pred_logmove_D)^2
mean(DQ$resid_QD_2)

## [1] 0.2866385

# mean is very close to zero.  Likely non-zero due to non-normal distribution of sales.

DP <- lagged_RF_D
DP$feat_D <- NULL

DP$feat_M <- NULL
DP$price_M <- NULL
DP$feat_T <- NULL
DP$price_T <- NULL
DP$week <-DP$week-42
DP.rf <- randomForest(log(price_D) ~AGE60+EDUC+ETHNIC+INCOME+HHLARGE+WORKWOM+   HVAL150+price_t1_D+feat_t1_D+price_t2_D+feat_t2_D+price_t3_D+feat_t3_D+feat_t3_T+feat_t2_T+feat_t1_T+price_t3_T, data = DP, ntree = 100, keep.forest = TRUE)
DP$pred_logprice_D = predict(DP.rf)
DP$resid_logP_D <- log(DP$price_D)- DP$pred_logprice_D
mean(DP$resid_logP_D)

## [1] -5.821289e-05

fig_DP<-ggplot(DP, aes(pred_logprice_D,log(price_D)))+geom_point() + geom_smooth(method='lm') + geom_abline(intercept = 0, slope =1)
fig_DP

## `geom_smooth()` using formula 'y ~ x'

DP$resid_logP_D_2 <- (log(DP$price_D)- DP$pred_logprice_D)^2
mean(DP$resid_logP_D_2)

## [1] 0.000150586

# mean is very close to zero.  


## MM
MQ <- lagged_RF_M
MQ$feat_D <- NULL
MQ$price_D <- NULL
MQ$feat_M <- NULL
MQ$price_M <- NULL
MQ$feat_T <- NULL
MQ$price_T <- NULL
MQ$week <-MQ$week-42
MQ.rf <- randomForest(logmove_M ~ AGE60+EDUC+ETHNIC+INCOME+HHLARGE+WORKWOM+   HVAL150+price_t1_D+feat_t1_D+price_t2_D+feat_t2_D+price_t3_D+feat_t3_D+feat_t3_T+feat_t2_T+feat_t1_T+price_t3_T+price_t2_T, data = MQ, ntree = 100, keep.forest = TRUE)
MQ$pred_logmove_M = predict(MQ.rf)
MQ$resid_MQ <- MQ$logmove_M- MQ$pred_logmove_M
mean(MQ$resid_MQ)

## [1] -0.0006827532

fig_MQ<-ggplot(MQ, aes(pred_logmove_M,logmove_M))+geom_point() + geom_smooth(method='lm') + geom_abline(intercept = 0, slope =1)
fig_MQ

## `geom_smooth()` using formula 'y ~ x'

MQ$resid_MQ_2 <- (MQ$logmove_M- MQ$pred_logmove_M)^2
mean(MQ$resid_MQ_2)

## [1] 0.2095052

# mean is very close to zero.  Likely non-zero due to non-normal distribution of sales.

MP <- lagged_RF_M
MP$feat_D <- NULL
MP$price_D <- NULL
MP$feat_M <- NULL

MP$feat_T <- NULL
MP$price_T <- NULL
MP$week <-MP$week-42
MP.rf <- randomForest(log(price_M) ~ AGE60+EDUC+ETHNIC+INCOME+HHLARGE+WORKWOM+ HVAL150+price_t1_D+feat_t1_D+price_t2_D+feat_t2_D+price_t3_D+feat_t3_D+feat_t3_T+feat_t2_T+feat_t1_T+price_t3_T+price_t2_T, data = MP, ntree = 100, keep.forest = TRUE)
MP$pred_logprice_M = predict(MP.rf)
MP$resid_logP_M <- log(MP$price_M)- MP$pred_logprice_M
mean(MP$resid_logP_M)

## [1] 0.0006496291

fig_MP<-ggplot(MP, aes(pred_logprice_M,log(price_M)))+geom_point() + geom_smooth(method='lm') + geom_abline(intercept = 0, slope =1)
fig_MP

## `geom_smooth()` using formula 'y ~ x'

MP$resid_logP_M_2 <- (log(MP$price_M)- MP$pred_logprice_M)^2
mean(MP$resid_logP_M_2)

## [1] 0.002069723

# mean is very close to zero.  

## Tropicana
TQ <- lagged_RF_T
TQ$feat_D <- NULL
TQ$price_D <- NULL
TQ$feat_M <- NULL
TQ$price_M <- NULL
TQ$feat_T <- NULL
TQ$price_T <- NULL
TQ$week <-TQ$week-42
TQ.rf <- randomForest(logmove_T ~ AGE60+EDUC+ETHNIC+INCOME+HHLARGE+WORKWOM+HVAL150+price_t1_D+feat_t1_D+price_t2_D+feat_t2_D+price_t3_D+feat_t3_D+feat_t3_T+feat_t2_T+feat_t1_T+price_t3_T+price_t2_T+price_t1_T+feat_t3_M+feat_t2_M, data = TQ, ntree = 100, keep.forest = TRUE)
TQ$pred_logmove_T = predict(TQ.rf)
TQ$resid_TQ <- TQ$logmove_T- TQ$pred_logmove_T
mean(TQ$resid_TQ)

## [1] 0.005597873

fig_TQ<-ggplot(TQ, aes(pred_logmove_T,logmove_T))+geom_point() + geom_smooth(method='lm') + geom_abline(intercept = 0, slope =1)
fig_TQ

## `geom_smooth()` using formula 'y ~ x'

TQ$resid_TQ_2 <- (TQ$logmove_T- TQ$pred_logmove_T)^2
mean(TQ$resid_TQ_2)

## [1] 0.08517018

# mean is very close to zero.  Likely non-zero due to non-normal distribution of sales.

TP <- lagged_RF_T
TP$feat_D <- NULL
TP$price_D <- NULL
TP$feat_M <- NULL
TP$price_M <- NULL
TP$feat_T <- NULL

TP$week <-TP$week-42
TP.rf <- randomForest(log(price_T) ~AGE60+EDUC+ETHNIC+INCOME+HHLARGE+WORKWOM+HVAL150+price_t1_D+feat_t1_D+price_t2_D+feat_t2_D+price_t3_D+feat_t3_D+feat_t3_T+feat_t2_T+feat_t1_T+price_t3_T+price_t2_T+price_t1_T+feat_t3_M, data = TP, ntree = 100, keep.forest = TRUE)
TP$pred_logprice_T = predict(TP.rf)
TP$resid_logP_T <- log(TP$price_T)- TP$pred_logprice_T
mean(TP$resid_logP_T)

## [1] -2.998157e-06

fig_TP<-ggplot(TP, aes(pred_logprice_T,log(price_T)))+geom_point() + geom_smooth(method='lm') + geom_abline(intercept = 0, slope =1)
fig_TP

## `geom_smooth()` using formula 'y ~ x'

TP$resid_logP_T_2 <- (log(TP$price_T)- TP$pred_logprice_T)^2
mean(TP$resid_logP_T_2)

## [1] 6.06242e-05

# mean is very close to zero.  


myvars_M <- c("feat_t3_M", "feat_t2_M", "feat_t1_M","feat_M", "price_t3_M", "price_t2_M", "price_t1_M","price_M","store","week")
temp_M <- lagged_RF_M[myvars_M]


temp_DP <- DP[c("store","week", "resid_logP_D")]
temp_MP <- MP[c("store","week", "resid_logP_M")]
temp_TP <- TP[c("store","week", "resid_logP_T")]

TQ <- merge(TQ, temp_DP, by=c("store","week"))
TQ <- merge(TQ, temp_MP, by=c("store","week"))
TQ <- merge(TQ, temp_TP, by=c("store","week"))

MQ <- merge(MQ, temp_DP, by=c("store","week"))
MQ <- merge(MQ, temp_MP, by=c("store","week"))
MQ <- merge(MQ, temp_TP, by=c("store","week"))

DQ <- merge(DQ, temp_DP, by=c("store","week"))
DQ <- merge(DQ, temp_MP, by=c("store","week"))
DQ <- merge(DQ, temp_TP, by=c("store","week"))

D_reg = glm(resid_QD ~ resid_logP_D + resid_logP_M + resid_logP_T, data=DQ)
summary(D_reg)

## 
## Call:
## glm(formula = resid_QD ~ resid_logP_D + resid_logP_M + resid_logP_T, 
##     data = DQ)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.2970  -0.2851   0.0159   0.2993   3.2073  
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -0.0003006  0.0056590  -0.053  0.95764   
## resid_logP_D -1.0505153  0.4619740  -2.274  0.02299 * 
## resid_logP_M  0.1466191  0.1245795   1.177  0.23926   
## resid_logP_T  2.2966421  0.7269799   3.159  0.00159 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2862345)
## 
##     Null deviance: 2562.5  on 8939  degrees of freedom
## Residual deviance: 2557.8  on 8936  degrees of freedom
## AIC: 14193
## 
## Number of Fisher Scoring iterations: 2

M_reg = glm(resid_MQ ~ resid_logP_D + resid_logP_M + resid_logP_T, data=MQ)
summary(M_reg)

## 
## Call:
## glm(formula = resid_MQ ~ resid_logP_D + resid_logP_M + resid_logP_T, 
##     data = MQ)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2799  -0.2499  -0.0097   0.2250   3.0124  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.001844   0.004465   0.413    0.680    
## resid_logP_D -0.034521   0.364495  -0.095    0.925    
## resid_logP_M -3.894640   0.098292 -39.623   <2e-16 ***
## resid_logP_T -0.388368   0.573583  -0.677    0.498    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1781842)
## 
##     Null deviance: 1873.0  on 8939  degrees of freedom
## Residual deviance: 1592.3  on 8936  degrees of freedom
## AIC: 9955.7
## 
## Number of Fisher Scoring iterations: 2

# These regression results look off.  If the elasticity of MM was truly -1.5, it implies P_MM should be higher!

T_reg = glm(resid_TQ ~ resid_logP_D + resid_logP_M + resid_logP_T, data=TQ)
summary(T_reg)

## 
## Call:
## glm(formula = resid_TQ ~ resid_logP_D + resid_logP_M + resid_logP_T, 
##     data = TQ)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.56585  -0.16892  -0.00859   0.16211   2.91672  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.005327   0.003076   1.732   0.0833 .  
## resid_logP_D -0.220057   0.251088  -0.876   0.3808    
## resid_logP_M  0.386805   0.067710   5.713 1.15e-08 ***
## resid_logP_T -2.261119   0.395121  -5.723 1.08e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.08455476)
## 
##     Null deviance: 761.14  on 8939  degrees of freedom
## Residual deviance: 755.58  on 8936  degrees of freedom
## AIC: 3291.6
## 
## Number of Fisher Scoring iterations: 2

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

HW8 Sols

Jacob LaRiviere

May 31, 2020

R Markdown