R Markdown

This is an R Markdown document. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.

The syntax on line 19 then the subsequent on line 56 tells R Markdown to run R code without the asterisks embeds the subsequent code as embedded inside the R Markdown file

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
library(plyr)
## Warning: package 'plyr' was built under R version 3.5.1
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.5.3
setwd("C:/Users/jlariv/OneDrive/Econ 404/")
oj <- read.csv("oj.csv") 

## create some colors for the brands
levels(oj$brand)
## [1] "dominicks"   "minute.maid" "tropicana"
#brandcol <- c("green","red","gold")
par(mfrow=c(1,2))
ggplot(oj, aes(factor(brand), price)) + geom_boxplot(aes(fill = factor(brand)))

ggplot(oj, aes(factor(brand), log(price))) + geom_boxplot(aes(fill = factor(brand)))

ggplot(oj, aes(logmove, price)) + geom_point(aes(color = factor(brand)))

ggplot(oj, aes(price, logmove)) + geom_point(aes(color = factor(brand)))

ggplot(oj, aes(feat, brand)) + geom_point(position = "jitter", aes(color = factor(brand)))

#using aggregate from the base package; 5 and 6 are the location in the column of the dataframe
aggregate(oj[, 5:6], list(oj$brand), mean)
##       Group.1      feat    price
## 1   dominicks 0.2570215 1.735809
## 2 minute.maid 0.2885273 2.241162
## 3   tropicana 0.1662348 2.870493
#using ddplyr from the plyr package
ddply(oj, .(brand), summarize,  feat=mean(feat),price=mean(price))
##         brand      feat    price
## 1   dominicks 0.2570215 1.735809
## 2 minute.maid 0.2885273 2.241162
## 3   tropicana 0.1662348 2.870493
#use plyr to look at mean and SD of price by brand-featured combination
ddply(oj, .(brand,feat), summarize,  mean_price=mean(price),sd_price=sd(price), obs=length(price))
##         brand feat mean_price  sd_price  obs
## 1   dominicks    0   1.795460 0.3823584 7169
## 2   dominicks    1   1.563375 0.3415036 2480
## 3 minute.maid    0   2.328877 0.4080230 6865
## 4 minute.maid    1   2.024867 0.3014639 2784
## 5   tropicana    0   2.966131 0.5132730 8045
## 6   tropicana    1   2.390817 0.4614941 1604
## simple regression
reg = glm(logmove ~ log(price)*feat + brand, data=oj)
summary(reg)
## 
## Call:
## glm(formula = logmove ~ log(price) * feat + brand, data = oj)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.6036  -0.4410  -0.0118   0.4183   3.2952  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      10.12520    0.01580  640.68   <2e-16 ***
## log(price)       -2.34447    0.02292 -102.29   <2e-16 ***
## feat              1.48028    0.02719   54.45   <2e-16 ***
## brandminute.maid  0.69440    0.01167   59.49   <2e-16 ***
## brandtropicana    1.29026    0.01470   87.78   <2e-16 ***
## log(price):feat  -0.87757    0.03741  -23.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.4942208)
## 
##     Null deviance: 30079  on 28946  degrees of freedom
## Residual deviance: 14303  on 28941  degrees of freedom
## AIC: 61755
## 
## Number of Fisher Scoring iterations: 2
reg1 = glm(logmove ~ log(price)*brand*feat, data=oj)
summary(reg1)
## 
## Call:
## glm(formula = logmove ~ log(price) * brand * feat, data = oj)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.8893  -0.4290  -0.0091   0.4125   3.2368  
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      10.40658    0.02335 445.668  < 2e-16 ***
## log(price)                       -2.77415    0.03883 -71.445  < 2e-16 ***
## brandminute.maid                  0.04720    0.04663   1.012    0.311    
## brandtropicana                    0.70794    0.05080  13.937  < 2e-16 ***
## feat                              1.09441    0.03810  28.721  < 2e-16 ***
## log(price):brandminute.maid       0.78293    0.06140  12.750  < 2e-16 ***
## log(price):brandtropicana         0.73579    0.05684  12.946  < 2e-16 ***
## log(price):feat                  -0.47055    0.07409  -6.351 2.17e-10 ***
## brandminute.maid:feat             1.17294    0.08196  14.312  < 2e-16 ***
## brandtropicana:feat               0.78525    0.09875   7.952 1.90e-15 ***
## log(price):brandminute.maid:feat -1.10922    0.12225  -9.074  < 2e-16 ***
## log(price):brandtropicana:feat   -0.98614    0.12411  -7.946 2.00e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.4829706)
## 
##     Null deviance: 30079  on 28946  degrees of freedom
## Residual deviance: 13975  on 28935  degrees of freedom
## AIC: 61094
## 
## Number of Fisher Scoring iterations: 2
reg1 = lm(logmove ~ log(price)*brand*feat + AGE60 + EDUC + ETHNIC + INCOME+ HHLARGE + WORKWOM + HVAL150, data=oj)
summary(reg1)
## 
## Call:
## lm(formula = logmove ~ log(price) * brand * feat + AGE60 + EDUC + 
##     ETHNIC + INCOME + HHLARGE + WORKWOM + HVAL150, data = oj)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0461 -0.3990 -0.0028  0.3892  3.0766 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      11.96596    0.31077  38.504  < 2e-16 ***
## log(price)                       -2.88199    0.03740 -77.057  < 2e-16 ***
## brandminute.maid                  0.13369    0.04487   2.980  0.00289 ** 
## brandtropicana                    0.81040    0.04888  16.580  < 2e-16 ***
## feat                              1.04612    0.03664  28.553  < 2e-16 ***
## AGE60                             1.52744    0.12570  12.151  < 2e-16 ***
## EDUC                              0.47262    0.09922   4.763 1.91e-06 ***
## ETHNIC                            0.58163    0.03579  16.251  < 2e-16 ***
## INCOME                           -0.13953    0.03110  -4.487 7.27e-06 ***
## HHLARGE                          -1.18884    0.23164  -5.132 2.88e-07 ***
## WORKWOM                          -1.33364    0.14165  -9.415  < 2e-16 ***
## HVAL150                           0.40076    0.03996  10.029  < 2e-16 ***
## log(price):brandminute.maid       0.71365    0.05904  12.087  < 2e-16 ***
## log(price):brandtropicana         0.69152    0.05463  12.657  < 2e-16 ***
## log(price):feat                  -0.39170    0.07123  -5.499 3.85e-08 ***
## brandminute.maid:feat             1.09687    0.07879  13.922  < 2e-16 ***
## brandtropicana:feat               0.72157    0.09492   7.602 3.00e-14 ***
## log(price):brandminute.maid:feat -1.04438    0.11750  -8.889  < 2e-16 ***
## log(price):brandtropicana:feat   -0.97440    0.11928  -8.169 3.23e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6679 on 28928 degrees of freedom
## Multiple R-squared:  0.571,  Adjusted R-squared:  0.5707 
## F-statistic:  2139 on 18 and 28928 DF,  p-value: < 2.2e-16
reg2 = lm(logmove ~ log(price)*brand*feat + log(price)*HHLARGE + feat*HHLARGE + log(price)*EDUC + log(price)*INCOME + AGE60 + EDUC + ETHNIC + HHLARGE + WORKWOM + HVAL150, data=oj)
summary(reg2)
## 
## Call:
## lm(formula = logmove ~ log(price) * brand * feat + log(price) * 
##     HHLARGE + feat * HHLARGE + log(price) * EDUC + log(price) * 
##     INCOME + AGE60 + EDUC + ETHNIC + HHLARGE + WORKWOM + HVAL150, 
##     data = oj)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4138 -0.3868 -0.0038  0.3744  2.9579 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      16.44927    0.58621  28.061  < 2e-16 ***
## log(price)                       -8.60794    0.62176 -13.844  < 2e-16 ***
## brandminute.maid                  0.13736    0.04379   3.137  0.00171 ** 
## brandtropicana                    0.84380    0.04781  17.649  < 2e-16 ***
## feat                              1.01658    0.05119  19.860  < 2e-16 ***
## HHLARGE                           3.90065    0.47998   8.127 4.58e-16 ***
## EDUC                             -1.56451    0.17060  -9.170  < 2e-16 ***
## INCOME                           -0.57764    0.05848  -9.878  < 2e-16 ***
## AGE60                             1.52514    0.12269  12.431  < 2e-16 ***
## ETHNIC                            0.60671    0.03495  17.358  < 2e-16 ***
## WORKWOM                          -1.25295    0.13827  -9.062  < 2e-16 ***
## HVAL150                           0.42501    0.03900  10.896  < 2e-16 ***
## log(price):brandminute.maid       0.71019    0.05763  12.324  < 2e-16 ***
## log(price):brandtropicana         0.66355    0.05338  12.430  < 2e-16 ***
## log(price):feat                  -0.37882    0.06953  -5.448 5.13e-08 ***
## brandminute.maid:feat             1.08924    0.07689  14.166  < 2e-16 ***
## brandtropicana:feat               0.70858    0.09267   7.646 2.13e-14 ***
## log(price):HHLARGE               -6.44577    0.50608 -12.737  < 2e-16 ***
## feat:HHLARGE                      0.21151    0.31232   0.677  0.49826    
## log(price):EDUC                   2.50698    0.17717  14.150  < 2e-16 ***
## log(price):INCOME                 0.55557    0.06223   8.928  < 2e-16 ***
## log(price):brandminute.maid:feat -1.03769    0.11467  -9.049  < 2e-16 ***
## log(price):brandtropicana:feat   -0.97401    0.11642  -8.366  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6518 on 28924 degrees of freedom
## Multiple R-squared:  0.5915, Adjusted R-squared:  0.5911 
## F-statistic:  1903 on 22 and 28924 DF,  p-value: < 2.2e-16
# Looks like a lot of the sociodemographic features matter!!

exp(coef(reg2)["HHLARGE"]*(summary(oj$HHLARGE)["3rd Qu."]-summary(oj$HHLARGE)["Median"]))
##  HHLARGE 
## 1.097908
exp(coef(reg2)["EDUC"]*(summary(oj$EDUC)["Max."]-summary(oj$EDUC)["Median"]))
##      EDUC 
## 0.6264138
#Let's see how many weeks we have:
unique(oj$week)
##   [1]  40  46  47  48  50  51  52  53  54  57  58  59  60  61  62  63  64
##  [18]  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81
##  [35]  82  83  84  85  86  87  88  89  90  91  92  93  94  95  97  98  99
##  [52] 100 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118
##  [69] 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
##  [86] 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
## [103] 153 154 155 156 157 158 159 160  42  43  44  49  55  56  96 101 102
## [120]  41  45
#Create a new df which reorders the old one. NOTE: We could easily just call this oj rather than df and avoid the redundancy
df=oj[order(oj$week,oj$store),]
#Investigating this you'll see some stores have missing weeks.
#The reordered data is summarized differently 
unique(df$week)
##   [1]  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56
##  [18]  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73
##  [35]  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
##  [52]  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
##  [69] 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
##  [86] 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
## [103] 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
## [120] 159 160
#First why to add in lagged weeks
df1 <-oj
df1$week<-df1$week+1  # df1 now has NEXT week and not the current one.  If we merge this by weeks now, this is last week's price (e.g., "lagged price").
myvars <- c("price", "week", "brand","store")
df1 <- df1[myvars]
lagged <- merge(oj, df1, by=c("brand","store","week"))
#NOTE: The number of observations decreased.  Why?  You've just lost (at least) one week's worth of data at each store.
lagged=lagged[order(lagged$week,lagged$store),]
lagged=lagged[order(lagged$store,lagged$week),]
#Comparing this to above, Store two is nowhere to be found.  There was missing data for consecutive weeks.  As a result, it gets dropped.
colnames(lagged)[18] <- "lagged_price"
colnames(lagged)[6] <- "price"

reglag = glm(logmove ~ log(price)*brand*feat + log(lagged_price)*brand, data=lagged)
summary(reglag)
## 
## Call:
## glm(formula = logmove ~ log(price) * brand * feat + log(lagged_price) * 
##     brand, data = lagged)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.9319  -0.4241  -0.0129   0.4096   3.2165  
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)                        10.281184   0.024544 418.884  < 2e-16
## log(price)                         -3.145972   0.044114 -71.314  < 2e-16
## brandminute.maid                    0.006265   0.049820   0.126   0.8999
## brandtropicana                      0.708711   0.054031  13.117  < 2e-16
## feat                                0.846213   0.041043  20.618  < 2e-16
## log(lagged_price)                   0.645167   0.036222  17.811  < 2e-16
## log(price):brandminute.maid         0.881891   0.069623  12.667  < 2e-16
## log(price):brandtropicana           0.863464   0.065264  13.230  < 2e-16
## log(price):feat                     0.066412   0.078979   0.841   0.4004
## brandminute.maid:feat               1.266994   0.084872  14.928  < 2e-16
## brandtropicana:feat                 0.683487   0.106706   6.405 1.52e-10
## brandminute.maid:log(lagged_price) -0.125843   0.058702  -2.144   0.0321
## brandtropicana:log(lagged_price)   -0.270310   0.054141  -4.993 5.99e-07
## log(price):brandminute.maid:feat   -1.558072   0.125874 -12.378  < 2e-16
## log(price):brandtropicana:feat     -1.196229   0.132111  -9.055  < 2e-16
##                                       
## (Intercept)                        ***
## log(price)                         ***
## brandminute.maid                      
## brandtropicana                     ***
## feat                               ***
## log(lagged_price)                  ***
## log(price):brandminute.maid        ***
## log(price):brandtropicana          ***
## log(price):feat                       
## brandminute.maid:feat              ***
## brandtropicana:feat                ***
## brandminute.maid:log(lagged_price) *  
## brandtropicana:log(lagged_price)   ***
## log(price):brandminute.maid:feat   ***
## log(price):brandtropicana:feat     ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.4713887)
## 
##     Null deviance: 29152  on 28007  degrees of freedom
## Residual deviance: 13196  on 27993  degrees of freedom
## AIC: 58436
## 
## Number of Fisher Scoring iterations: 2
reglag2 = glm(logmove ~ log(price)*brand*feat + log(lagged_price)*brand +AGE60 + EDUC + ETHNIC + INCOME+ HHLARGE + WORKWOM + HVAL150 + SSTRDIST
           + SSTRVOL + CPDIST5 + CPWVOL5, data=lagged)
summary(reglag2)
## 
## Call:
## glm(formula = logmove ~ log(price) * brand * feat + log(lagged_price) * 
##     brand + AGE60 + EDUC + ETHNIC + INCOME + HHLARGE + WORKWOM + 
##     HVAL150 + SSTRDIST + SSTRVOL + CPDIST5 + CPWVOL5, data = lagged)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.2454  -0.3887  -0.0068   0.3726   2.9652  
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)                        13.130101   0.330976  39.671  < 2e-16
## log(price)                         -3.201166   0.041780 -76.620  < 2e-16
## brandminute.maid                    0.055007   0.047247   1.164  0.24433
## brandtropicana                      0.775856   0.051225  15.146  < 2e-16
## feat                                0.823103   0.038824  21.201  < 2e-16
## log(lagged_price)                   0.613380   0.034291  17.887  < 2e-16
## AGE60                               1.956564   0.128034  15.282  < 2e-16
## EDUC                                1.019598   0.102025   9.994  < 2e-16
## ETHNIC                              0.627430   0.037386  16.783  < 2e-16
## INCOME                             -0.286580   0.033447  -8.568  < 2e-16
## HHLARGE                            -0.952566   0.231640  -4.112 3.93e-05
## WORKWOM                            -0.472176   0.147515  -3.201  0.00137
## HVAL150                             0.321369   0.042008   7.650 2.07e-14
## SSTRDIST                           -0.021672   0.001474 -14.701  < 2e-16
## SSTRVOL                            -0.056664   0.009831  -5.764 8.31e-09
## CPDIST5                             0.068128   0.006319  10.782  < 2e-16
## CPWVOL5                            -0.504341   0.025857 -19.505  < 2e-16
## log(price):brandminute.maid         0.864065   0.065879  13.116  < 2e-16
## log(price):brandtropicana           0.855851   0.061736  13.863  < 2e-16
## log(price):feat                     0.101505   0.074704   1.359  0.17423
## brandminute.maid:feat               1.232252   0.080307  15.344  < 2e-16
## brandtropicana:feat                 0.680951   0.100928   6.747 1.54e-11
## brandminute.maid:log(lagged_price) -0.142207   0.055537  -2.561  0.01046
## brandtropicana:log(lagged_price)   -0.285044   0.051213  -5.566 2.63e-08
## log(price):brandminute.maid:feat   -1.514683   0.119095 -12.718  < 2e-16
## log(price):brandtropicana:feat     -1.215347   0.124954  -9.726  < 2e-16
##                                       
## (Intercept)                        ***
## log(price)                         ***
## brandminute.maid                      
## brandtropicana                     ***
## feat                               ***
## log(lagged_price)                  ***
## AGE60                              ***
## EDUC                               ***
## ETHNIC                             ***
## INCOME                             ***
## HHLARGE                            ***
## WORKWOM                            ** 
## HVAL150                            ***
## SSTRDIST                           ***
## SSTRVOL                            ***
## CPDIST5                            ***
## CPWVOL5                            ***
## log(price):brandminute.maid        ***
## log(price):brandtropicana          ***
## log(price):feat                       
## brandminute.maid:feat              ***
## brandtropicana:feat                ***
## brandminute.maid:log(lagged_price) *  
## brandtropicana:log(lagged_price)   ***
## log(price):brandminute.maid:feat   ***
## log(price):brandtropicana:feat     ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.4216844)
## 
##     Null deviance: 29152  on 28007  degrees of freedom
## Residual deviance: 11800  on 27982  degrees of freedom
## AIC: 55326
## 
## Number of Fisher Scoring iterations: 2
# You would expect to see decreases in lagged prices decrease this weeks sales (e.g., substitutes).  That is exactly what we find.  The effect is largest for Dominicks, medium for Minute Maid and smallest for Tropicana.  Consistent with current period elasticity estimates.  


# There ought to be a way to figure this out with panel data but I couldn't figure it out.
# pot_dates<-data.frame(seq(as.Date("1993/3/1"), as.Date("1997/3/1"), "1 week"))
# colnames(pot_dates)[1] <- "faux_date"
# pot_dates$week = 1:nrow(pot_dates)
# oj_dates <- merge(oj, pot_dates, by=c("week"))


library(reshape2)
#dcast is a function in the reshape2 library that turns "long data" into "wide data""
oj_prices <-oj[,1:6]
oj_wide <- dcast(oj_prices, store + week ~ brand)
## Using price as value column: use value.var to override.
colnames(oj_wide)[3] <- "P_Dom"
colnames(oj_wide)[4] <- "P_MM"
colnames(oj_wide)[5] <- "P_Trop"
oj_cross <- merge(oj, oj_wide, by=c("week","store"))

#Merge the wide data back in then only look at the cross price elasticity matrix for tropicana.
trop_cross <- subset(oj_cross, brand=="tropicana") 

regcross = glm(logmove ~ log(P_Dom)*feat+log(P_MM)*feat+log(P_Trop)*feat, data=trop_cross)
summary(regcross)
## 
## Call:
## glm(formula = logmove ~ log(P_Dom) * feat + log(P_MM) * feat + 
##     log(P_Trop) * feat, data = trop_cross)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3191  -0.3801   0.0011   0.3661   2.6433  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      10.92239    0.04339 251.738  < 2e-16 ***
## log(P_Dom)        0.14741    0.02916   5.056 4.36e-07 ***
## feat              1.50603    0.09893  15.224  < 2e-16 ***
## log(P_MM)         0.29909    0.03741   7.995 1.44e-15 ***
## log(P_Trop)      -2.15199    0.03728 -57.721  < 2e-16 ***
## log(P_Dom):feat   0.13039    0.10214   1.277    0.202    
## feat:log(P_MM)    0.68484    0.11157   6.138 8.67e-10 ***
## feat:log(P_Trop) -1.76597    0.09519 -18.553  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.3535176)
## 
##     Null deviance: 6927.8  on 9648  degrees of freedom
## Residual deviance: 3408.3  on 9641  degrees of freedom
## AIC: 17359
## 
## Number of Fisher Scoring iterations: 2
# We see that orange juice is a substitute for other orange juice. Minute Maid has twice the cross price elasticity as Dominicks, which makes sense.


# The key this here is that the week variable is formatted as a date variable.  This provides R with some information that it is a panel dataset


 #create a date and sequence accompanying the dates within the dataframe then use lag operaters to make progress on it.

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.