MLB Time Series Analysis

Required Packages

Time Series Data

overall.data <- read.csv('MLB Stats - Overall.csv')
overall.data <- subset(overall.data,select= c(-X))
overall.data <- ts(overall.data, start=1961)

plot(overall.data,type='o', main="Statistics Over Time in MLB")

Train/Test Split

train <- subset(overall.data, subset= time(overall.data) < 2010)
test <- subset(overall.data, subset= time(overall.data) >= 2010)

#split data for testing true model
train <- ts(train, start=1961)
test <- ts(test, start=2010)

plot(train,type='o')

HR.data <- train[,"HR"]
hr.model <- lm(HR.data~time(HR.data))
loghr.model <- lm(log(HR.data)~time(HR.data))

summary(hr.model)

## 
## Call:
## lm(formula = HR.data ~ time(HR.data))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.7472  -5.7753   0.2292   6.2471  15.2617 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -1.956e+03  1.704e+02  -11.48 3.12e-15 ***
## time(HR.data)  9.989e-01  8.585e-02   11.63 1.94e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.499 on 47 degrees of freedom
## Multiple R-squared:  0.7423, Adjusted R-squared:  0.7368 
## F-statistic: 135.4 on 1 and 47 DF,  p-value: 1.939e-15

summary(loghr.model)

## 
## Call:
## lm(formula = log(HR.data) ~ time(HR.data))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.17514 -0.11868  0.09318  0.33522  1.02490 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -92.085281  12.588737  -7.315 2.73e-09 ***
## time(HR.data)   0.047897   0.006342   7.553 1.19e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6278 on 47 degrees of freedom
## Multiple R-squared:  0.5483, Adjusted R-squared:  0.5386 
## F-statistic: 57.04 on 1 and 47 DF,  p-value: 1.194e-09

hits.data <- train[,"Hits"]
hits.model <- lm(hits.data~time(hits.data))
sqrthits.model <- lm(sqrt(hits.data)~time(hits.data))

summary(hits.model)

## 
## Call:
## lm(formula = hits.data ~ time(hits.data))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.9242  -3.9746   0.3419   4.3349  14.9680 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -2.271e+03  1.446e+02  -15.71   <2e-16 ***
## time(hits.data)  1.158e+00  7.284e-02   15.90   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.211 on 47 degrees of freedom
## Multiple R-squared:  0.8433, Adjusted R-squared:  0.8399 
## F-statistic: 252.8 on 1 and 47 DF,  p-value: < 2.2e-16

summary(sqrthits.model)

## 
## Call:
## lm(formula = sqrt(hits.data) ~ time(hits.data))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.47287 -0.38823  0.00947  0.37776  1.68321 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -2.347e+02  1.452e+01  -16.17   <2e-16 ***
## time(hits.data)  1.207e-01  7.313e-03   16.51   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.724 on 47 degrees of freedom
## Multiple R-squared:  0.8529, Adjusted R-squared:  0.8498 
## F-statistic: 272.5 on 1 and 47 DF,  p-value: < 2.2e-16

runs.data <- train[,"Runs"]
runs.model <- lm(runs.data~time(runs.data))
logruns.model <- lm(log(runs.data)~time(runs.data))
sqrtruns.model <- lm(sqrt(runs.data)~time(runs.data))


summary(runs.model)

## 
## Call:
## lm(formula = runs.data ~ time(runs.data))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.8198  -5.0902   0.5065   4.8049  14.1359 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -2.253e+03  1.407e+02  -16.02   <2e-16 ***
## time(runs.data)  1.149e+00  7.086e-02   16.22   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.014 on 47 degrees of freedom
## Multiple R-squared:  0.8484, Adjusted R-squared:  0.8452 
## F-statistic:   263 on 1 and 47 DF,  p-value: < 2.2e-16

summary(logruns.model)

## 
## Call:
## lm(formula = log(runs.data) ~ time(runs.data))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.04933 -0.18499  0.03511  0.28652  0.87553 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -1.093e+02  9.890e+00  -11.05 1.16e-14 ***
## time(runs.data)  5.657e-02  4.982e-03   11.35 4.54e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4932 on 47 degrees of freedom
## Multiple R-squared:  0.7328, Adjusted R-squared:  0.7271 
## F-statistic: 128.9 on 1 and 47 DF,  p-value: 4.544e-15

summary(sqrtruns.model)

## 
## Call:
## lm(formula = sqrt(runs.data) ~ time(runs.data))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.90230 -0.55305  0.04637  0.44200  1.58732 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -2.329e+02  1.468e+01  -15.87   <2e-16 ***
## time(runs.data)  1.198e-01  7.393e-03   16.20   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7319 on 47 degrees of freedom
## Multiple R-squared:  0.8482, Adjusted R-squared:  0.8449 
## F-statistic: 262.5 on 1 and 47 DF,  p-value: < 2.2e-16

SO.data <- train[,"SO"]
so.model <- lm(SO.data~time(SO.data))
logso.model <- lm(log(SO.data)~time(SO.data))

summary(so.model)

## 
## Call:
## lm(formula = SO.data ~ time(SO.data))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5354  -2.9292   0.3088   1.8923  14.0482 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -1.842e+03  9.838e+01  -18.72   <2e-16 ***
## time(SO.data)  9.405e-01  4.956e-02   18.98   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.906 on 47 degrees of freedom
## Multiple R-squared:  0.8846, Adjusted R-squared:  0.8821 
## F-statistic: 360.1 on 1 and 47 DF,  p-value: < 2.2e-16

summary(logso.model)

## 
## Call:
## lm(formula = log(SO.data) ~ time(SO.data))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.69738 -0.25659  0.05418  0.17767  1.06303 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -1.007e+02  9.755e+00  -10.32 1.15e-13 ***
## time(SO.data)  5.221e-02  4.914e-03   10.62 4.39e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4865 on 47 degrees of freedom
## Multiple R-squared:  0.706,  Adjusted R-squared:  0.6997 
## F-statistic: 112.9 on 1 and 47 DF,  p-value: 4.391e-14

Best Fit Line

par(mfrow=c(2,2))
plot(HR.data,type='o');abline(hr.model)
plot(hits.data,type='o');abline(hits.model)
plot(runs.data,type='o');abline(runs.model)
plot(SO.data,type='o');abline(so.model)

HAC TEST

coeftest(hr.model, vcov=vcovHAC(hr.model))

## 
## t test of coefficients:
## 
##                  Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept)   -1956.01684   226.75817 -8.6260 3.013e-11 ***
## time(HR.data)     0.99888     0.11445  8.7278 2.136e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

coeftest(hits.model, vcov=vcovHAC(hits.model))

## 
## t test of coefficients:
## 
##                    Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept)     -2.2713e+03  1.6068e+02 -14.136 < 2.2e-16 ***
## time(hits.data)  1.1583e+00  8.1379e-02  14.233 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

coeftest(runs.model, vcov=vcovHAC(runs.model))

## 
## t test of coefficients:
## 
##                    Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept)     -2.2534e+03  1.6373e+02 -13.763 < 2.2e-16 ***
## time(runs.data)  1.1492e+00  8.2857e-02  13.869 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

coeftest(so.model, vcov=vcovHAC(so.model))

## 
## t test of coefficients:
## 
##                  Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept)   -1.8419e+03  1.2405e+02 -14.848 < 2.2e-16 ***
## time(SO.data)  9.4051e-01  6.2183e-02  15.125 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Non-transformed model residuals

par(mfrow=c(2,2))
plot(rstandard(hr.model),type='o')
plot(rstandard(hits.model),type='o')
plot(rstandard(runs.model),type='o')
plot(rstandard(so.model),type='o')

Transformed Model Residuals

hrres <- rstandard(hr.model)
hitsres <- rstandard(sqrthits.model)
runsres <- rstandard(sqrtruns.model)
sores <- rstandard(logso.model)

par(mfrow= c(2,2))
plot(hrres,type='o')
plot(hitsres,type='o')
plot(runsres,type='o')
plot(sores,type='o')

par(mfrow=c(2,2))
qqnorm(hrres);qqline(hrres)
qqnorm(hitsres);qqline(hitsres)
qqnorm(runsres);qqline(runsres)
qqnorm(sores);qqline(sores)

par(mfrow=c(2,2))
hist(hrres)
hist(hitsres)
hist(runsres)
hist(sores)

shapiro.test(hrres)

## 
##  Shapiro-Wilk normality test
## 
## data:  hrres
## W = 0.97858, p-value = 0.5074

shapiro.test(hitsres)

## 
##  Shapiro-Wilk normality test
## 
## data:  hitsres
## W = 0.98146, p-value = 0.6277

shapiro.test(runsres)

## 
##  Shapiro-Wilk normality test
## 
## data:  runsres
## W = 0.98754, p-value = 0.8795

shapiro.test(sores)

## 
##  Shapiro-Wilk normality test
## 
## data:  sores
## W = 0.88791, p-value = 0.0002265

Time Dependence

par(mfrow=c(2,2))
acf(hrres)
acf(hitsres)
acf(runsres)
acf(sores)

par(mfrow=c(2,2))
runs(hrres)

## $pvalue
## [1] 0.401
## 
## $observed.runs
## [1] 22
## 
## $expected.runs
## [1] 25.40816
## 
## $n1
## [1] 23
## 
## $n2
## [1] 26
## 
## $k
## [1] 0

runs(hitsres)

## $pvalue
## [1] 1
## 
## $observed.runs
## [1] 25
## 
## $expected.runs
## [1] 25.4898
## 
## $n1
## [1] 24
## 
## $n2
## [1] 25
## 
## $k
## [1] 0

runs(runsres)

## $pvalue
## [1] 0.829
## 
## $observed.runs
## [1] 24
## 
## $expected.runs
## [1] 25.2449
## 
## $n1
## [1] 22
## 
## $n2
## [1] 27
## 
## $k
## [1] 0

runs(sores)

## $pvalue
## [1] 0.0398
## 
## $observed.runs
## [1] 17
## 
## $expected.runs
## [1] 24.26531
## 
## $n1
## [1] 19
## 
## $n2
## [1] 30
## 
## $k
## [1] 0

Determine Order of Integration d

par(mfrow=c(2,2))
plot(hrres, type='o')
plot(hitsres, type='o')
plot(runsres, type='o')
plot(sores, type='o')

Stationarity and Unit Roots

adf.test(hrres)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  hrres
## Dickey-Fuller = -2.735, Lag order = 3, p-value = 0.28
## alternative hypothesis: stationary

adf.test(hitsres)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  hitsres
## Dickey-Fuller = -2.0007, Lag order = 3, p-value = 0.5739
## alternative hypothesis: stationary

adf.test(runsres)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  runsres
## Dickey-Fuller = -2.4859, Lag order = 3, p-value = 0.3797
## alternative hypothesis: stationary

adf.test(sores)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  sores
## Dickey-Fuller = -3.2888, Lag order = 3, p-value = 0.08391
## alternative hypothesis: stationary

pp.test(hrres)

## Warning in pp.test(hrres): p-value smaller than printed p-value

## 
##  Phillips-Perron Unit Root Test
## 
## data:  hrres
## Dickey-Fuller Z(alpha) = -32.301, Truncation lag parameter = 3, p-value
## = 0.01
## alternative hypothesis: stationary

pp.test(hitsres)

## Warning in pp.test(hitsres): p-value smaller than printed p-value

## 
##  Phillips-Perron Unit Root Test
## 
## data:  hitsres
## Dickey-Fuller Z(alpha) = -37.906, Truncation lag parameter = 3, p-value
## = 0.01
## alternative hypothesis: stationary

pp.test(runsres)

## Warning in pp.test(runsres): p-value smaller than printed p-value

## 
##  Phillips-Perron Unit Root Test
## 
## data:  runsres
## Dickey-Fuller Z(alpha) = -35.665, Truncation lag parameter = 3, p-value
## = 0.01
## alternative hypothesis: stationary

pp.test(sores)

## 
##  Phillips-Perron Unit Root Test
## 
## data:  sores
## Dickey-Fuller Z(alpha) = -24.699, Truncation lag parameter = 3, p-value
## = 0.01344
## alternative hypothesis: stationary

kpss.test(hrres)

## Warning in kpss.test(hrres): p-value greater than printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  hrres
## KPSS Level = 0.12324, Truncation lag parameter = 3, p-value = 0.1

kpss.test(hitsres)

## Warning in kpss.test(hitsres): p-value greater than printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  hitsres
## KPSS Level = 0.08834, Truncation lag parameter = 3, p-value = 0.1

kpss.test(runsres)

## Warning in kpss.test(runsres): p-value greater than printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  runsres
## KPSS Level = 0.070726, Truncation lag parameter = 3, p-value = 0.1

kpss.test(sores)

## Warning in kpss.test(sores): p-value greater than printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  sores
## KPSS Level = 0.12208, Truncation lag parameter = 3, p-value = 0.1

Choosing p and q

par(mfrow=c(2,2))
acf(hrres)
acf(hitsres)
acf(runsres)
acf(sores)

par(mfrow=c(2,2))
pacf(hrres)
pacf(hitsres)
pacf(runsres)
pacf(sores)

EACF Plots

print("HR")

## [1] "HR"

eacf(hrres)

## AR/MA
##   0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x o o o o o o o o o o  o  o  o 
## 1 x o o o o o o o o o o  o  o  o 
## 2 x o o o o o o o o o o  o  o  o 
## 3 o o o o o o o o o o o  o  o  o 
## 4 o o o o o o o o o o o  o  o  o 
## 5 x x o o o o o o o o o  o  o  o 
## 6 x x o o o o o o o o o  o  o  o 
## 7 x x o o o o o o o o o  o  o  o

print("Hits")

## [1] "Hits"

eacf(hitsres)

## AR/MA
##   0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x o o o o o o o x o x  x  o  o 
## 1 x o o o o o o o o o o  o  o  o 
## 2 x o o o o o o o o o o  o  o  o 
## 3 o x o o o o o o o o o  o  o  o 
## 4 o x o o o o o o o o o  o  o  o 
## 5 x o o o o o o o o o o  o  o  o 
## 6 x o o o o o o o o o o  o  o  o 
## 7 o o x o o o o o o o o  o  o  o

print("Runs")

## [1] "Runs"

eacf(runsres)

## AR/MA
##   0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 o o o o o o o o o o x  o  o  o 
## 1 o o o o o o o o o o o  o  o  o 
## 2 x o o o o o o o o o o  o  o  o 
## 3 o o o o o o o o o o o  o  o  o 
## 4 o o o o o o o o o o o  o  o  o 
## 5 x o o o o o o o o o o  o  o  o 
## 6 o o o o o o o o o o o  o  o  o 
## 7 o o x o o o o o o o o  o  o  o

print("SO")

## [1] "SO"

eacf(sores)

## AR/MA
##   0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x o o o o o o o o o o  o  o  o 
## 1 x o o o o o o o o o o  o  o  o 
## 2 x o o o o o o o o o o  o  o  o 
## 3 x o o o o o o o o o o  o  o  o 
## 4 x o o o o o o o o o o  o  o  o 
## 5 o o o o o o o o o o o  o  o  o 
## 6 x o x o o o o o o o o  o  o  o 
## 7 x o x o o o o o o o o  o  o  o

For HR: IMA(1,1) or ARI(3,1)

For Hits: IMA(1,1) or ARI(3,1)

For Runs:

ARI(1,1)

For SO:

ARI(5,1) or IMA(1,1)

Creating ARIMA Models

hr.MA <- Arima(hrres, c(0,1,1))
hr.AR3 <- Arima(hrres, c(3,1,0))
hr.MA

## Series: hrres 
## ARIMA(0,1,1) 
## 
## Coefficients:
##           ma1
##       -0.6334
## s.e.   0.1478
## 
## sigma^2 = 1.018:  log likelihood = -68.29
## AIC=140.58   AICc=140.84   BIC=144.32

hr.AR3

## Series: hrres 
## ARIMA(3,1,0) 
## 
## Coefficients:
##           ar1      ar2      ar3
##       -0.5507  -0.3258  -0.1511
## s.e.   0.1416   0.1551   0.1410
## 
## sigma^2 = 1.082:  log likelihood = -68.63
## AIC=145.27   AICc=146.2   BIC=152.75

hits.MA <- Arima(hitsres, c(0,1,1))
hits.AR <- Arima(hitsres, c(3,1,0))
hits.MA

## Series: hitsres 
## ARIMA(0,1,1) 
## 
## Coefficients:
##           ma1
##       -0.6153
## s.e.   0.1201
## 
## sigma^2 = 0.9927:  log likelihood = -67.66
## AIC=139.33   AICc=139.6   BIC=143.07

hits.AR

## Series: hitsres 
## ARIMA(3,1,0) 
## 
## Coefficients:
##           ar1      ar2      ar3
##       -0.6524  -0.3704  -0.1078
## s.e.   0.1500   0.1682   0.1483
## 
## sigma^2 = 1.026:  log likelihood = -67.41
## AIC=142.83   AICc=143.76   BIC=150.31

runs.AR <- Arima(runsres, c(1,1,0))
runs.AR

## Series: runsres 
## ARIMA(1,1,0) 
## 
## Coefficients:
##           ar1
##       -0.4009
## s.e.   0.1310
## 
## sigma^2 = 1.249:  log likelihood = -73.03
## AIC=150.05   AICc=150.32   BIC=153.8

so.AR <- Arima(sores, c(5,1,0))
so.MA <- Arima(sores, c(0,1,1))
so.AR

## Series: sores 
## ARIMA(5,1,0) 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4     ar5
##       -0.3555  -0.0444  -0.1564  -0.1196  0.1052
## s.e.   0.1500   0.1709   0.1761   0.1764  0.1566
## 
## sigma^2 = 0.8642:  log likelihood = -62.13
## AIC=136.25   AICc=138.3   BIC=147.48

so.MA

## Series: sores 
## ARIMA(0,1,1) 
## 
## Coefficients:
##           ma1
##       -0.3507
## s.e.   0.1490
## 
## sigma^2 = 0.8323:  log likelihood = -63.27
## AIC=130.53   AICc=130.8   BIC=134.27

keep hr.MA, hits.MA, runs.AR, so.MA

tsdiag(hr.MA)

tsdiag(hits.MA)

tsdiag(runs.AR)

tsdiag(so.MA)

Overfitting

hr.overfit <- Arima(hrres, c(0,1,2))
hits.overfit <- Arima(hitsres, c(0,1,2))
runs.overfit <- Arima(runsres, c(2,1,0))
so.overfit <- Arima(sores, c(0,1,2))

data.frame(
  Model= c("hr.MA","hits.MA","runs.AR","so.MA","hr.overfit","hits.overfit","runs.overfit","so.overfit"),
  AIC= c(hr.MA$aic, hits.MA$aic, runs.AR$aic, so.MA$aic, hr.overfit$aic, hits.overfit$aic, runs.overfit$aic, so.overfit$aic),
  AICC= c(hr.MA$aicc, hits.MA$aicc, runs.AR$aicc, so.MA$aicc, hr.overfit$aicc, hits.overfit$aicc, runs.overfit$aicc, so.overfit$aicc),
  BIC= c(hr.MA$bic, hits.MA$bic, runs.AR$bic, so.MA$bic, hr.overfit$bic, hits.overfit$bic, runs.overfit$bic, so.overfit$bic)
  )

##          Model      AIC     AICC      BIC
## 1        hr.MA 140.5766 140.8432 144.3190
## 2      hits.MA 139.3295 139.5962 143.0719
## 3      runs.AR 150.0535 150.3202 153.7959
## 4        so.MA 130.5302 130.7969 134.2726
## 5   hr.overfit 141.9884 142.5338 147.6020
## 6 hits.overfit 141.1190 141.6645 146.7326
## 7 runs.overfit 148.5819 149.1273 154.1955
## 8   so.overfit 132.4430 132.9884 138.0566

Forecasting

#HR
HR.MA1_xreg=Arima(HR.data,order=c(0,1,1),xreg=1:length(HR.data))
newtm=seq(from=length(HR.data)+1,to=length(HR.data)+10,length=10)
predx=predict(HR.MA1_xreg,n.ahead=10,newxreg=newtm)
pr=predx$pred
uci=pr+2*predx$se
lci=pr-2*predx$se

# To plot the predicted values as prediction intervals, code them as time series
pr=ts(pr,start=2010,freq=1)
uci=ts(uci,start=2010,freq=1)
lci=ts(lci,start=2010,freq=1)

ymin=min(c(as.vector(lci),HR.data))-.1
ymax=max(c(as.vector(uci),HR.data))+.1

par(mfrow=c(1,1))
plot(HR.data,xlim=c(1961,2020),ylim=c(ymin,ymax),main="HR.data")
lines(test[,"HR"], type='o')
lines(pr,col=2)
lines(uci,col=3)
lines(lci,col=3)

#Hits
hits.MA1_xreg=Arima(hits.data,order=c(0,1,1),xreg=1:length(hits.data))
newtm=seq(from=length(hits.data)+1,to=length(hits.data)+10,length=10)
predx=predict(hits.MA1_xreg,n.ahead=10,newxreg=newtm)
pr=predx$pred
uci=pr+2*predx$se
lci=pr-2*predx$se

# To plot the predicted values as prediction intervals, code them as time series
pr=ts(pr,start=2010,freq=1)
uci=ts(uci,start=2010,freq=1)
lci=ts(lci,start=2010,freq=1)

ymin=min(c(as.vector(lci),hits.data))-.1
ymax=max(c(as.vector(uci),hits.data))+.1

par(mfrow=c(1,1))
plot(hits.data,xlim=c(1961,2020),ylim=c(ymin,ymax),main="Hits.data")
lines(test[,"Hits"], type='o')
lines(pr,col=2)
lines(uci,col=3)
lines(lci,col=3)

#Runs
runs.AR1_xreg=Arima(runs.data,order=c(1,1,0),xreg=1:length(runs.data))
newtm=seq(from=length(runs.data)+1,to=length(runs.data)+10,length=10)
predx=predict(runs.AR1_xreg,n.ahead=10,newxreg=newtm)
pr=predx$pred
uci=pr+2*predx$se
lci=pr-2*predx$se

# To plot the predicted values as prediction intervals, code them as time series
pr=ts(pr,start=2010,freq=1)
uci=ts(uci,start=2010,freq=1)
lci=ts(lci,start=2010,freq=1)

ymin=min(c(as.vector(lci),runs.data))-.1
ymax=max(c(as.vector(uci),runs.data))+.1

par(mfrow=c(1,1))
plot(runs.data,xlim=c(1961,2020),ylim=c(ymin,ymax),main="runs.data")
lines(test[,"Runs"], type='o')
lines(pr,col=2)
lines(uci,col=3)
lines(lci,col=3)

#SO
SO.MA1_xreg=Arima(SO.data,order=c(0,1,1),xreg=1:length(SO.data))
newtm=seq(from=length(SO.data)+1,to=length(SO.data)+10,length=10)
predx=predict(SO.MA1_xreg,n.ahead=10,newxreg=newtm)
pr=predx$pred
uci=pr+2*predx$se
lci=pr-2*predx$se

# To plot the predicted values as prediction intervals, code them as time series
pr=ts(pr,start=2010,freq=1)
uci=ts(uci,start=2010,freq=1)
lci=ts(lci,start=2010,freq=1)

ymin=min(c(as.vector(lci),SO.data))-.1
ymax=max(c(as.vector(uci),SO.data))+.1

par(mfrow=c(1,1))
plot(SO.data,xlim=c(1961,2020),ylim=c(ymin,ymax),main="SO.data")
lines(test[,"SO"], type='o')
lines(pr,col=2)
lines(uci,col=3)
lines(lci,col=3)

MLB Time Series Analysis

Braden Barglind, Jonathan Oliveros

2022-06-13

Required Packages

Time Series Data

Train/Test Split

Best Fit Line

HAC TEST

Non-transformed model residuals

Transformed Model Residuals

Time Dependence

Determine Order of Integration d

Stationarity and Unit Roots

Choosing p and q

EACF Plots

Creating ARIMA Models

Overfitting

Forecasting