Directions

Captain Toad’s treasure!

A very popular tool!

1. Scatter plot

1.1 Load data

library(xlsx)

## Warning: package 'xlsx' was built under R version 4.0.4

eg8_3 <- read.xlsx("Eg 8-3 Home Market Value.xlsx",
                   sheetIndex = 1, header = TRUE,
                   startRow = 3, colIndex = 1:3)
head(eg8_3)

##   House.Age Square.Feet Market.Value
## 1        33        1812        90000
## 2        32        1914       104400
## 3        32        1842        93300
## 4        33        1812        91000
## 5        32        1836       101900
## 6        33        2028       108500

names(eg8_3)[1:3] <- c("House_Age", "Square_Feet", "Market_Value")
head(eg8_3)

##   House_Age Square_Feet Market_Value
## 1        33        1812        90000
## 2        32        1914       104400
## 3        32        1842        93300
## 4        33        1812        91000
## 5        32        1836       101900
## 6        33        2028       108500

1.2 Scatter plot

library(ggplot2)
library(ggpubr)

ggplot(eg8_3) + aes(x = Market_Value, y = Square_Feet) +
  geom_point(shape = 2, colour = "black") +
  xlab("Square Footage") + ylab("Market Value") +
  ggtitle("Size of House and Market Value") +
  geom_smooth(method = lm) +
  stat_regline_equation(label.x = 110000, label.y = 1700) +
  stat_cor(method = "pearson", label.x = 110000, label.y = 1600)

## `geom_smooth()` using formula 'y ~ x'

2. Simple Linear Regression

eg8_3_regression <- lm(eg8_3$Market_Value ~ eg8_3$Square_Feet)
summary(eg8_3_regression)

## 
## Call:
## lm(formula = eg8_3$Market_Value ~ eg8_3$Square_Feet)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8067  -4327  -1923   3097  32634 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       32673.220   8831.951   3.699  0.00065 ***
## eg8_3$Square_Feet    35.036      5.167   6.780  3.8e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7288 on 40 degrees of freedom
## Multiple R-squared:  0.5347, Adjusted R-squared:  0.5231 
## F-statistic: 45.97 on 1 and 40 DF,  p-value: 3.798e-08

confint(eg8_3_regression, level = 0.95)

##                        2.5 %      97.5 %
## (Intercept)       14823.1816 50523.25820
## eg8_3$Square_Feet    24.5927    45.48004

3. Residuals

eg8_3_stdresiduals <- rstandard(eg8_3_regression)
head(eg8_3_stdresiduals)

##          1          2          3          4          5          6 
## -0.8583995  0.6563142 -0.5460889 -0.7190292  0.6840116  0.6826257

eg8_3_com <- cbind(eg8_3, eg8_3_stdresiduals)
head(eg8_3_com)

##   House_Age Square_Feet Market_Value eg8_3_stdresiduals
## 1        33        1812        90000         -0.8583995
## 2        32        1914       104400          0.6563142
## 3        32        1842        93300         -0.5460889
## 4        33        1812        91000         -0.7190292
## 5        32        1836       101900          0.6840116
## 6        33        2028       108500          0.6826257

ggplot(eg8_3_com) + aes(x = eg8_3_com$Market_Value, y = eg8_3_com$eg8_3_stdresiduals) +
  geom_point() +
  xlab("Market Value") + ylab("Standard Residuals") +
  ggtitle("Size of House and Market Value, Residuals")

## Warning: Use of `eg8_3_com$Market_Value` is discouraged. Use `Market_Value`
## instead.

## Warning: Use of `eg8_3_com$eg8_3_stdresiduals` is discouraged. Use
## `eg8_3_stdresiduals` instead.

3.1 Normality

ggplot(eg8_3) + aes(x = Market_Value) +
  geom_histogram() +
  ylab("Count") +
  ggtitle("Distribution of Market Value")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using the Shapiro-Wilks test.

H-0: normal distribution.

H-alt: distribution is different from a normal distribution.

shapiro.test(eg8_3$Market_Value)

## 
##  Shapiro-Wilk normality test
## 
## data:  eg8_3$Market_Value
## W = 0.90155, p-value = 0.001603

3.2 Autocorrelation

May not be very applicable here. But just for illustration……

library(car)

## Loading required package: carData

durbinWatsonTest(eg8_3_regression)

##  lag Autocorrelation D-W Statistic p-value
##    1       0.1110143      1.258807   0.024
##  Alternative hypothesis: rho != 0

3.3 Automatic Diagnostics

library(gvlma)
gvlma(eg8_3_regression)

## 
## Call:
## lm(formula = eg8_3$Market_Value ~ eg8_3$Square_Feet)
## 
## Coefficients:
##       (Intercept)  eg8_3$Square_Feet  
##          32673.22              35.04  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = eg8_3_regression) 
## 
##                        Value   p-value                   Decision
## Global Stat        1.793e+02 0.000e+00 Assumptions NOT satisfied!
## Skewness           4.133e+01 1.284e-10 Assumptions NOT satisfied!
## Kurtosis           1.206e+02 0.000e+00 Assumptions NOT satisfied!
## Link Function      9.494e-04 9.754e-01    Assumptions acceptable.
## Heteroscedasticity 1.741e+01 3.006e-05 Assumptions NOT satisfied!

4. Multiple Linear Regression

4.1 Load data

eg8_12 <- read.xlsx("Eg 8-12 Colleges and Universities.xlsx",
                   sheetIndex = 1, header = TRUE,
                   startRow = 3, colIndex = 1:7)
head(eg8_12)

##     School       Type Median.SAT Acceptance.Rate Expenditures.Student
## 1  Amherst   Lib Arts       1315            0.22                26636
## 2  Barnard   Lib Arts       1220            0.53                17653
## 3    Bates   Lib Arts       1240            0.36                17554
## 4 Berkeley University       1176            0.37                23665
## 5  Bowdoin   Lib Arts       1300            0.24                25703
## 6    Brown University       1281            0.24                24201
##   Top.10..HS Graduation..
## 1         85           93
## 2         69           80
## 3         58           88
## 4         95           68
## 5         78           90
## 6         80           90

names(eg8_12)[1:7] <- c("School", "Type", "Median_SAT",
                        "Acceptance_Rate", "Expenditure_per_Student",
                        "Top_10_HS", "Graduation_pct")
head(eg8_12)

##     School       Type Median_SAT Acceptance_Rate Expenditure_per_Student
## 1  Amherst   Lib Arts       1315            0.22                   26636
## 2  Barnard   Lib Arts       1220            0.53                   17653
## 3    Bates   Lib Arts       1240            0.36                   17554
## 4 Berkeley University       1176            0.37                   23665
## 5  Bowdoin   Lib Arts       1300            0.24                   25703
## 6    Brown University       1281            0.24                   24201
##   Top_10_HS Graduation_pct
## 1        85             93
## 2        69             80
## 3        58             88
## 4        95             68
## 5        78             90
## 6        80             90

4.2 Multiple Linear Regression Model

Select only complete cases.

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## v purrr   0.3.4

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::recode() masks car::recode()
## x purrr::some()   masks car::some()

eg8_12_com <- drop_na(eg8_12, "Graduation_pct")
head(eg8_12_com)

##     School       Type Median_SAT Acceptance_Rate Expenditure_per_Student
## 1  Amherst   Lib Arts       1315            0.22                   26636
## 2  Barnard   Lib Arts       1220            0.53                   17653
## 3    Bates   Lib Arts       1240            0.36                   17554
## 4 Berkeley University       1176            0.37                   23665
## 5  Bowdoin   Lib Arts       1300            0.24                   25703
## 6    Brown University       1281            0.24                   24201
##   Top_10_HS Graduation_pct
## 1        85             93
## 2        69             80
## 3        58             88
## 4        95             68
## 5        78             90
## 6        80             90

Regression model.

eg8_12_regression <- lm(eg8_12_com$Graduation_pct ~ eg8_12_com$Median_SAT +
                          eg8_12_com$Acceptance_Rate +
                          eg8_12_com$Expenditure_per_Student +
                          eg8_12_com$Top_10_HS)
summary(eg8_12_regression)

## 
## Call:
## lm(formula = eg8_12_com$Graduation_pct ~ eg8_12_com$Median_SAT + 
##     eg8_12_com$Acceptance_Rate + eg8_12_com$Expenditure_per_Student + 
##     eg8_12_com$Top_10_HS)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.8674  -2.0462   0.6193   3.6417  11.2090 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         1.792e+01  2.456e+01   0.730 0.469402    
## eg8_12_com$Median_SAT               7.201e-02  1.798e-02   4.004 0.000236 ***
## eg8_12_com$Acceptance_Rate         -2.486e+01  8.315e+00  -2.990 0.004560 ** 
## eg8_12_com$Expenditure_per_Student -1.356e-04  6.593e-05  -2.057 0.045600 *  
## eg8_12_com$Top_10_HS               -1.628e-01  7.934e-02  -2.051 0.046214 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.308 on 44 degrees of freedom
## Multiple R-squared:  0.5344, Adjusted R-squared:  0.4921 
## F-statistic: 12.63 on 4 and 44 DF,  p-value: 6.332e-07

confint(eg8_12_regression, level = 0.95)

##                                            2.5 %        97.5 %
## (Intercept)                        -3.157088e+01  6.741279e+01
## eg8_12_com$Median_SAT               3.576208e-02  1.082505e-01
## eg8_12_com$Acceptance_Rate         -4.161739e+01 -8.101078e+00
## eg8_12_com$Expenditure_per_Student -2.685259e-04 -2.773789e-06
## eg8_12_com$Top_10_HS               -3.226729e-01 -2.856120e-03

4.3 Residuals

eg8_12_stdresiduals <- rstandard(eg8_12_regression)
head(eg8_12_stdresiduals)

##          1          2          3          4          5          6 
##  0.6449387  0.2008233  0.3063999 -1.4197111  0.1199610  0.4085649

eg8_12_com_2 <- cbind(eg8_12_com, eg8_12_stdresiduals)
head(eg8_12_com_2)

##     School       Type Median_SAT Acceptance_Rate Expenditure_per_Student
## 1  Amherst   Lib Arts       1315            0.22                   26636
## 2  Barnard   Lib Arts       1220            0.53                   17653
## 3    Bates   Lib Arts       1240            0.36                   17554
## 4 Berkeley University       1176            0.37                   23665
## 5  Bowdoin   Lib Arts       1300            0.24                   25703
## 6    Brown University       1281            0.24                   24201
##   Top_10_HS Graduation_pct eg8_12_stdresiduals
## 1        85             93           0.6449387
## 2        69             80           0.2008233
## 3        58             88           0.3063999
## 4        95             68          -1.4197111
## 5        78             90           0.1199610
## 6        80             90           0.4085649

Graduation Percentage.

ggplot(eg8_12_com_2) + aes(x = Graduation_pct, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Graduation Percentage") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Graduation Percentage")

Median SAT scores.

ggplot(eg8_12_com_2) + aes(x = Median_SAT, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Median SAT Scores") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Median SAT Scores")

Acceptance rate

ggplot(eg8_12_com_2) + aes(x = Acceptance_Rate, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Acceptance Rate") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Acceptance Rate")

Expenditure per student.

ggplot(eg8_12_com_2) + aes(x = Expenditure_per_Student, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Expenditure per Student") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Expenditure per Student")

Top 10% in High School.

ggplot(eg8_12_com_2) + aes(x = Top_10_HS, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Top 10% in High School") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Top 10% in High School")

4.4 Diagnostics

4.4.1 Normality

ggplot(eg8_12_com) + aes(x = Graduation_pct) +
  geom_histogram() +
  ylab("Count") +
  ggtitle("Distribution of Graduation Percentage")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using the Shapiro-Wilks test.

H-0: normal distribution.

H-alt: distribution is different from a normal distribution.

shapiro.test(eg8_12_com$Graduation_pct)

## 
##  Shapiro-Wilk normality test
## 
## data:  eg8_12_com$Graduation_pct
## W = 0.92977, p-value = 0.006014

4.4.2 Autocorrelation

May not be very applicable here. But just for illustration……

durbinWatsonTest(eg8_12_regression)

##  lag Autocorrelation D-W Statistic p-value
##    1     -0.01695507      2.009753     0.9
##  Alternative hypothesis: rho != 0

4.4.3 Multicollinearity

vif(eg8_12_regression)

##              eg8_12_com$Median_SAT         eg8_12_com$Acceptance_Rate 
##                           2.164223                           2.105911 
## eg8_12_com$Expenditure_per_Student               eg8_12_com$Top_10_HS 
##                           1.770573                           1.969190

4.4.4 Homoscedasticity

Use Breusch Pagan Test.

library(olsrr)

## Warning: package 'olsrr' was built under R version 4.0.5

## 
## Attaching package: 'olsrr'

## The following object is masked from 'package:datasets':
## 
##     rivers

ols_test_breusch_pagan(eg8_12_regression)

## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                         Data                          
##  -----------------------------------------------------
##  Response : eg8_12_com$Graduation_pct 
##  Variables: fitted values of eg8_12_com$Graduation_pct 
## 
##         Test Summary         
##  ----------------------------
##  DF            =    1 
##  Chi2          =    1.226334 
##  Prob > Chi2   =    0.2681211

Multiple test for each variable.

ols_test_breusch_pagan(eg8_12_regression, rhs = TRUE,
                       multiple = TRUE)

## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                                                        Data                                                         
##  -------------------------------------------------------------------------------------------------------------------
##  Response : eg8_12_com$Graduation_pct 
##  Variables: eg8_12_com$Median_SAT eg8_12_com$Acceptance_Rate eg8_12_com$Expenditure_per_Student eg8_12_com$Top_10_HS 
## 
##                    Test Summary (Unadjusted p values)                   
##  ---------------------------------------------------------------------
##   Variable                                 chi2        df        p     
##  ---------------------------------------------------------------------
##   eg8_12_com$Median_SAT                 0.830848067     1    0.3620274 
##   eg8_12_com$Acceptance_Rate            0.582793086     1    0.4452196 
##   eg8_12_com$Expenditure_per_Student    0.159781567     1    0.6893577 
##   eg8_12_com$Top_10_HS                  0.009688868     1    0.9215892 
##  ---------------------------------------------------------------------
##   simultaneous                          1.682850685     4    0.7938304 
##  ---------------------------------------------------------------------

4.4.5 Automatic Diagnostics

library(gvlma)
gvlma(eg8_12_regression)

## 
## Call:
## lm(formula = eg8_12_com$Graduation_pct ~ eg8_12_com$Median_SAT + 
##     eg8_12_com$Acceptance_Rate + eg8_12_com$Expenditure_per_Student + 
##     eg8_12_com$Top_10_HS)
## 
## Coefficients:
##                        (Intercept)               eg8_12_com$Median_SAT  
##                          1.792e+01                           7.201e-02  
##         eg8_12_com$Acceptance_Rate  eg8_12_com$Expenditure_per_Student  
##                         -2.486e+01                          -1.356e-04  
##               eg8_12_com$Top_10_HS  
##                         -1.628e-01  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = eg8_12_regression) 
## 
##                    Value p-value                Decision
## Global Stat        5.261  0.2615 Assumptions acceptable.
## Skewness           1.657  0.1980 Assumptions acceptable.
## Kurtosis           0.176  0.6748 Assumptions acceptable.
## Link Function      2.820  0.0931 Assumptions acceptable.
## Heteroscedasticity 0.608  0.4355 Assumptions acceptable.

5. Categorical Independent Variables

eg8_12_regression_2 <- lm(eg8_12_com$Graduation_pct ~ eg8_12_com$Median_SAT +
                          eg8_12_com$Acceptance_Rate +
                          eg8_12_com$Expenditure_per_Student +
                          eg8_12_com$Top_10_HS + factor(eg8_12_com$Type))
summary(eg8_12_regression_2)

## 
## Call:
## lm(formula = eg8_12_com$Graduation_pct ~ eg8_12_com$Median_SAT + 
##     eg8_12_com$Acceptance_Rate + eg8_12_com$Expenditure_per_Student + 
##     eg8_12_com$Top_10_HS + factor(eg8_12_com$Type))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.5654  -2.9483   0.3081   3.8726  11.9014 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         1.259e+01  2.608e+01   0.483  0.63167    
## eg8_12_com$Median_SAT               7.768e-02  2.014e-02   3.856  0.00038 ***
## eg8_12_com$Acceptance_Rate         -2.464e+01  8.378e+00  -2.941  0.00525 ** 
## eg8_12_com$Expenditure_per_Student -1.642e-04  7.985e-05  -2.056  0.04590 *  
## eg8_12_com$Top_10_HS               -1.866e-01  8.805e-02  -2.119  0.03993 *  
## factor(eg8_12_com$Type)University   1.436e+00  2.236e+00   0.642  0.52408    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.344 on 43 degrees of freedom
## Multiple R-squared:  0.5389, Adjusted R-squared:  0.4852 
## F-statistic: 10.05 on 5 and 43 DF,  p-value: 2.023e-06

confint(eg8_12_regression_2, level = 0.95)

##                                            2.5 %        97.5 %
## (Intercept)                        -4.000033e+01  6.518279e+01
## eg8_12_com$Median_SAT               3.705419e-02  1.183027e-01
## eg8_12_com$Acceptance_Rate         -4.153449e+01 -7.741431e+00
## eg8_12_com$Expenditure_per_Student -3.251922e-04 -3.128078e-06
## eg8_12_com$Top_10_HS               -3.641304e-01 -8.986419e-03
## factor(eg8_12_com$Type)University  -3.073331e+00  5.945974e+00

5.1 Residuals

eg8_12_stdresiduals_2 <- rstandard(eg8_12_regression_2)
eg8_12_stdresiduals_2

##           1           2           3           4           5           6 
##  0.77047462  0.28651708  0.32340149 -1.39467253  0.21607707  0.25370435 
##           7           8           9          10          11          12 
##  0.77051290 -1.49953514 -1.40265359 -0.25398293 -2.21309600  0.45743712 
##          13          14          15          16          17          18 
## -0.32623281  1.30717612 -0.21288077  1.36605372  0.93403761  0.02896642 
##          19          20          21          22          23          24 
## -0.81279483  0.06200444 -0.30071528 -0.06752947  0.77723760  0.85009900 
##          25          26          27          28          29          30 
##  0.19726011  0.96477712  0.60281233 -0.64808316 -1.19645214 -1.53169967 
##          31          32          33          34          35          36 
##  0.70144932 -0.59507413  2.35690516 -0.13719103 -0.26707632  0.35647355 
##          37          38          39          40          41          42 
## -2.11067232 -0.19457706  0.77549454  1.18181419 -1.97165257 -0.17083150 
##          43          44          45          46          47          48 
## -0.95028381 -0.01690501 -1.77855715  1.40615964  0.71600404  0.69181778 
##          49 
##  0.82234144

eg8_12_com_cat <- cbind(eg8_12_com, eg8_12_stdresiduals_2)
head(eg8_12_com_cat)

##     School       Type Median_SAT Acceptance_Rate Expenditure_per_Student
## 1  Amherst   Lib Arts       1315            0.22                   26636
## 2  Barnard   Lib Arts       1220            0.53                   17653
## 3    Bates   Lib Arts       1240            0.36                   17554
## 4 Berkeley University       1176            0.37                   23665
## 5  Bowdoin   Lib Arts       1300            0.24                   25703
## 6    Brown University       1281            0.24                   24201
##   Top_10_HS Graduation_pct eg8_12_stdresiduals_2
## 1        85             93             0.7704746
## 2        69             80             0.2865171
## 3        58             88             0.3234015
## 4        95             68            -1.3946725
## 5        78             90             0.2160771
## 6        80             90             0.2537043

Graduation percentage.

ggplot(eg8_12_com_cat) + aes(x = Graduation_pct, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Graduation Percentage") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Graduation Percentage")

Median SAT Score.

ggplot(eg8_12_com_cat) + aes(x = Median_SAT, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Median SAT Scores") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Median SAT Scores")

Acceptance rate.

ggplot(eg8_12_com_cat) + aes(x = Acceptance_Rate, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Acceptance Rate") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Acceptance Rate")

Expenditure per Student.

ggplot(eg8_12_com_cat) + aes(x = Expenditure_per_Student, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Expenditure per Student") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Expenditure per Student")

Top 10% in High School.

ggplot(eg8_12_com_cat) + aes(x = Top_10_HS, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Top 10% in High School") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Top 10% in High School")

Type of University.

ggplot(eg8_12_com_cat) + aes(x = Type, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Type of University") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Type of University")

5.2 Diagnostics

5.2.1 Normality

ggplot(eg8_12_com_cat) + aes(x = Graduation_pct) +
  geom_histogram() +
  ylab("Count") +
  ggtitle("Distribution of Graduation Percentage")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using the Shapiro-Wilks test.

H-0: normal distribution.

H-alt: distribution is different from a normal distribution.

shapiro.test(eg8_12_com_cat$Graduation_pct)

## 
##  Shapiro-Wilk normality test
## 
## data:  eg8_12_com_cat$Graduation_pct
## W = 0.92977, p-value = 0.006014

5.2.2 Autocorrelation

May not be very applicable here. But just for illustration……

durbinWatsonTest(eg8_12_regression_2)

##  lag Autocorrelation D-W Statistic p-value
##    1     -0.01388796      2.001011   0.982
##  Alternative hypothesis: rho != 0

5.2.3 Multicollinearity

vif(eg8_12_regression_2)

##              eg8_12_com$Median_SAT         eg8_12_com$Acceptance_Rate 
##                           2.679097                           2.109477 
## eg8_12_com$Expenditure_per_Student               eg8_12_com$Top_10_HS 
##                           2.562343                           2.392690 
##            factor(eg8_12_com$Type) 
##                           2.143916

5.2.4 Homoscedasticity

Use Breusch Pagan Test.

ols_test_breusch_pagan(eg8_12_regression_2)

## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                         Data                          
##  -----------------------------------------------------
##  Response : eg8_12_com$Graduation_pct 
##  Variables: fitted values of eg8_12_com$Graduation_pct 
## 
##        Test Summary         
##  ---------------------------
##  DF            =    1 
##  Chi2          =    1.520704 
##  Prob > Chi2   =    0.217513

5.2.5 Automatic Diagnostics

gvlma(eg8_12_regression_2)

## 
## Call:
## lm(formula = eg8_12_com$Graduation_pct ~ eg8_12_com$Median_SAT + 
##     eg8_12_com$Acceptance_Rate + eg8_12_com$Expenditure_per_Student + 
##     eg8_12_com$Top_10_HS + factor(eg8_12_com$Type))
## 
## Coefficients:
##                        (Intercept)               eg8_12_com$Median_SAT  
##                          1.259e+01                           7.768e-02  
##         eg8_12_com$Acceptance_Rate  eg8_12_com$Expenditure_per_Student  
##                         -2.464e+01                          -1.642e-04  
##               eg8_12_com$Top_10_HS   factor(eg8_12_com$Type)University  
##                         -1.866e-01                           1.436e+00  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = eg8_12_regression_2) 
## 
##                      Value p-value                Decision
## Global Stat        4.87073 0.30082 Assumptions acceptable.
## Skewness           1.04439 0.30680 Assumptions acceptable.
## Kurtosis           0.06851 0.79352 Assumptions acceptable.
## Link Function      2.98099 0.08425 Assumptions acceptable.
## Heteroscedasticity 0.77684 0.37811 Assumptions acceptable.

6. Non-Linear Regression

6.1 Load data

library(xlsx)

eg8_18 <- read.xlsx("Eg 8-18 Beverage Sales.xlsx",
                    sheetIndex = 1, header = TRUE,
                    startRow = 3, colIndex = 1:2)
head(eg8_18)

##   Temperature Sales
## 1          85  1810
## 2          90  4825
## 3          79   438
## 4          82   775
## 5          84  1213
## 6          96  8692

6.2 Non-Linear Model

eg8_18_regression <- lm(eg8_18$Sales ~ eg8_18$Temperature^2)
summary(eg8_18_regression)

## 
## Call:
## lm(formula = eg8_18$Sales ~ eg8_18$Temperature^2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1765.6  -598.4  -293.0   562.0  2014.8 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -32511.25    3408.72  -9.538 1.12e-08 ***
## eg8_18$Temperature    408.60      39.27  10.406 2.76e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1041 on 19 degrees of freedom
## Multiple R-squared:  0.8507, Adjusted R-squared:  0.8429 
## F-statistic: 108.3 on 1 and 19 DF,  p-value: 2.761e-09

confint(eg8_18_regression, level = 0.95)

##                          2.5 %      97.5 %
## (Intercept)        -39645.7869 -25376.7065
## eg8_18$Temperature    326.4189    490.7864

6.3. Residuals

eg8_18_stdresiduals <- rstandard(eg8_18_regression)
eg8_18_stdresiduals

##          1          2          3          4          5          6          7 
## -0.4043254  0.5579609  0.6896936 -0.2192460 -0.5920101  2.0884092 -1.0741874 
##          8          9         10         11         12         13         14 
##  1.8602444 -0.5675443  2.1650140 -1.1272465 -0.3188644 -0.9171575 -0.2908789 
##         15         16         17         18         19         20         21 
## -0.1002041 -1.7633017  0.3494740 -0.6365315  0.7144124  0.4360437 -0.3703755

eg8_18_com <- cbind(eg8_18, eg8_18_stdresiduals)
head(eg8_18_com)

##   Temperature Sales eg8_18_stdresiduals
## 1          85  1810          -0.4043254
## 2          90  4825           0.5579609
## 3          79   438           0.6896936
## 4          82   775          -0.2192460
## 5          84  1213          -0.5920101
## 6          96  8692           2.0884092

ggplot(eg8_18_com) + aes(x = Sales, eg8_18_stdresiduals) +
  geom_point() +
  xlab("Sales") + ylab("Standard Residuals") +
  ggtitle("Temperature and Sales, Residuals")

6.4 Normality

ggplot(eg8_18) + aes(x = Sales) +
  geom_histogram() +
  ylab("Count") +
  ggtitle("Distribution of Sales")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using the Shapiro-Wilks test.

H-0: normal distribution.

H-alt: distribution is different from a normal distribution.

shapiro.test(eg8_18$Sales)

## 
##  Shapiro-Wilk normality test
## 
## data:  eg8_18$Sales
## W = 0.84399, p-value = 0.003353

6.5 Autocorrelation

May not be very applicable here. But just for illustration……

durbinWatsonTest(eg8_18_regression)

##  lag Autocorrelation D-W Statistic p-value
##    1      -0.4602732      2.906229   0.018
##  Alternative hypothesis: rho != 0

6.6 Homoscedasticity

Use Breusch Pagan Test.

library(olsrr) 
ols_test_breusch_pagan(eg8_18_regression)

## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                   Data                   
##  ----------------------------------------
##  Response : eg8_18$Sales 
##  Variables: fitted values of eg8_18$Sales 
## 
##         Test Summary         
##  ----------------------------
##  DF            =    1 
##  Chi2          =    2.49159 
##  Prob > Chi2   =    0.1144561

Multiple test for each variable.

ols_test_breusch_pagan(eg8_18_regression, rhs = TRUE,
                       multiple = TRUE)

## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##     Data      
##  -------------
##  Response : eg8_18$Sales 
##  Variables: m2 
## 
##        Test Summary (Unadjusted p values)      
##  --------------------------------------------
##   Variable          chi2      df        p     
##  --------------------------------------------
##   m2               2.49159     1    0.1144561 
##  --------------------------------------------
##   simultaneous     2.49159     1    0.1144561 
##  --------------------------------------------

6.7 Automatic Diagnostics

library(gvlma)
gvlma(eg8_18_regression)

## 
## Call:
## lm(formula = eg8_18$Sales ~ eg8_18$Temperature^2)
## 
## Coefficients:
##        (Intercept)  eg8_18$Temperature  
##           -32511.2               408.6  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = eg8_18_regression) 
## 
##                       Value   p-value                   Decision
## Global Stat        15.15675 0.0043869 Assumptions NOT satisfied!
## Skewness            1.22049 0.2692645    Assumptions acceptable.
## Kurtosis            0.06201 0.8033491    Assumptions acceptable.
## Link Function      13.59504 0.0002268 Assumptions NOT satisfied!
## Heteroscedasticity  0.27922 0.5972123    Assumptions acceptable.

Regression

master yeoda

a long, long time ago in a galaxy far, far away

Directions

1. Scatter plot

1.1 Load data

1.2 Scatter plot

2. Simple Linear Regression

3. Residuals

3.1 Normality

3.2 Autocorrelation

3.3 Automatic Diagnostics

4. Multiple Linear Regression

4.1 Load data

4.2 Multiple Linear Regression Model

4.3 Residuals

4.4 Diagnostics

4.4.1 Normality

4.4.2 Autocorrelation

4.4.3 Multicollinearity

4.4.4 Homoscedasticity

4.4.5 Automatic Diagnostics

5. Categorical Independent Variables

5.1 Residuals

5.2 Diagnostics

5.2.1 Normality

5.2.2 Autocorrelation

5.2.3 Multicollinearity

5.2.4 Homoscedasticity

5.2.5 Automatic Diagnostics

6. Non-Linear Regression

6.1 Load data

6.2 Non-Linear Model

6.3. Residuals

6.4 Normality

6.5 Autocorrelation

6.6 Homoscedasticity

6.7 Automatic Diagnostics