Directions

Captain Toad’s treasure!

A very popular tool!

Data for demo

Back to the spellbook

1. Scatter plot

1.1 Load data

library(xlsx)
## Warning: package 'xlsx' was built under R version 4.0.4
eg8_3 <- read.xlsx("Eg 8-3 Home Market Value.xlsx",
                   sheetIndex = 1, header = TRUE,
                   startRow = 3, colIndex = 1:3)
head(eg8_3)
##   House.Age Square.Feet Market.Value
## 1        33        1812        90000
## 2        32        1914       104400
## 3        32        1842        93300
## 4        33        1812        91000
## 5        32        1836       101900
## 6        33        2028       108500
names(eg8_3)[1:3] <- c("House_Age", "Square_Feet", "Market_Value")
head(eg8_3)
##   House_Age Square_Feet Market_Value
## 1        33        1812        90000
## 2        32        1914       104400
## 3        32        1842        93300
## 4        33        1812        91000
## 5        32        1836       101900
## 6        33        2028       108500

1.2 Scatter plot

library(ggplot2)
library(ggpubr)

ggplot(eg8_3) + aes(x = Market_Value, y = Square_Feet) +
  geom_point(shape = 2, colour = "black") +
  xlab("Square Footage") + ylab("Market Value") +
  ggtitle("Size of House and Market Value") +
  geom_smooth(method = lm) +
  stat_regline_equation(label.x = 110000, label.y = 1700) +
  stat_cor(method = "pearson", label.x = 110000, label.y = 1600)
## `geom_smooth()` using formula 'y ~ x'

2. Simple Linear Regression

eg8_3_regression <- lm(eg8_3$Market_Value ~ eg8_3$Square_Feet)
summary(eg8_3_regression)
## 
## Call:
## lm(formula = eg8_3$Market_Value ~ eg8_3$Square_Feet)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8067  -4327  -1923   3097  32634 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       32673.220   8831.951   3.699  0.00065 ***
## eg8_3$Square_Feet    35.036      5.167   6.780  3.8e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7288 on 40 degrees of freedom
## Multiple R-squared:  0.5347, Adjusted R-squared:  0.5231 
## F-statistic: 45.97 on 1 and 40 DF,  p-value: 3.798e-08
confint(eg8_3_regression, level = 0.95)
##                        2.5 %      97.5 %
## (Intercept)       14823.1816 50523.25820
## eg8_3$Square_Feet    24.5927    45.48004

3. Residuals

eg8_3_stdresiduals <- rstandard(eg8_3_regression)
head(eg8_3_stdresiduals)
##          1          2          3          4          5          6 
## -0.8583995  0.6563142 -0.5460889 -0.7190292  0.6840116  0.6826257
eg8_3_com <- cbind(eg8_3, eg8_3_stdresiduals)
head(eg8_3_com)
##   House_Age Square_Feet Market_Value eg8_3_stdresiduals
## 1        33        1812        90000         -0.8583995
## 2        32        1914       104400          0.6563142
## 3        32        1842        93300         -0.5460889
## 4        33        1812        91000         -0.7190292
## 5        32        1836       101900          0.6840116
## 6        33        2028       108500          0.6826257
ggplot(eg8_3_com) + aes(x = eg8_3_com$Market_Value, y = eg8_3_com$eg8_3_stdresiduals) +
  geom_point() +
  xlab("Market Value") + ylab("Standard Residuals") +
  ggtitle("Size of House and Market Value, Residuals")
## Warning: Use of `eg8_3_com$Market_Value` is discouraged. Use `Market_Value`
## instead.
## Warning: Use of `eg8_3_com$eg8_3_stdresiduals` is discouraged. Use
## `eg8_3_stdresiduals` instead.

3.1 Normality

ggplot(eg8_3) + aes(x = Market_Value) +
  geom_histogram() +
  ylab("Count") +
  ggtitle("Distribution of Market Value")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using the Shapiro-Wilks test.

H-0: normal distribution.

H-alt: distribution is different from a normal distribution.

shapiro.test(eg8_3$Market_Value)
## 
##  Shapiro-Wilk normality test
## 
## data:  eg8_3$Market_Value
## W = 0.90155, p-value = 0.001603

3.2 Autocorrelation

May not be very applicable here. But just for illustration……

library(car)
## Loading required package: carData
durbinWatsonTest(eg8_3_regression) 
##  lag Autocorrelation D-W Statistic p-value
##    1       0.1110143      1.258807   0.024
##  Alternative hypothesis: rho != 0

3.3 Automatic Diagnostics

library(gvlma)
gvlma(eg8_3_regression)
## 
## Call:
## lm(formula = eg8_3$Market_Value ~ eg8_3$Square_Feet)
## 
## Coefficients:
##       (Intercept)  eg8_3$Square_Feet  
##          32673.22              35.04  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = eg8_3_regression) 
## 
##                        Value   p-value                   Decision
## Global Stat        1.793e+02 0.000e+00 Assumptions NOT satisfied!
## Skewness           4.133e+01 1.284e-10 Assumptions NOT satisfied!
## Kurtosis           1.206e+02 0.000e+00 Assumptions NOT satisfied!
## Link Function      9.494e-04 9.754e-01    Assumptions acceptable.
## Heteroscedasticity 1.741e+01 3.006e-05 Assumptions NOT satisfied!

4. Multiple Linear Regression

4.1 Load data

eg8_12 <- read.xlsx("Eg 8-12 Colleges and Universities.xlsx",
                   sheetIndex = 1, header = TRUE,
                   startRow = 3, colIndex = 1:7)
head(eg8_12)
##     School       Type Median.SAT Acceptance.Rate Expenditures.Student
## 1  Amherst   Lib Arts       1315            0.22                26636
## 2  Barnard   Lib Arts       1220            0.53                17653
## 3    Bates   Lib Arts       1240            0.36                17554
## 4 Berkeley University       1176            0.37                23665
## 5  Bowdoin   Lib Arts       1300            0.24                25703
## 6    Brown University       1281            0.24                24201
##   Top.10..HS Graduation..
## 1         85           93
## 2         69           80
## 3         58           88
## 4         95           68
## 5         78           90
## 6         80           90
names(eg8_12)[1:7] <- c("School", "Type", "Median_SAT",
                        "Acceptance_Rate", "Expenditure_per_Student",
                        "Top_10_HS", "Graduation_pct")
head(eg8_12)
##     School       Type Median_SAT Acceptance_Rate Expenditure_per_Student
## 1  Amherst   Lib Arts       1315            0.22                   26636
## 2  Barnard   Lib Arts       1220            0.53                   17653
## 3    Bates   Lib Arts       1240            0.36                   17554
## 4 Berkeley University       1176            0.37                   23665
## 5  Bowdoin   Lib Arts       1300            0.24                   25703
## 6    Brown University       1281            0.24                   24201
##   Top_10_HS Graduation_pct
## 1        85             93
## 2        69             80
## 3        58             88
## 4        95             68
## 5        78             90
## 6        80             90

4.2 Multiple Linear Regression Model

Select only complete cases.

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::recode() masks car::recode()
## x purrr::some()   masks car::some()
eg8_12_com <- drop_na(eg8_12, "Graduation_pct")
head(eg8_12_com)
##     School       Type Median_SAT Acceptance_Rate Expenditure_per_Student
## 1  Amherst   Lib Arts       1315            0.22                   26636
## 2  Barnard   Lib Arts       1220            0.53                   17653
## 3    Bates   Lib Arts       1240            0.36                   17554
## 4 Berkeley University       1176            0.37                   23665
## 5  Bowdoin   Lib Arts       1300            0.24                   25703
## 6    Brown University       1281            0.24                   24201
##   Top_10_HS Graduation_pct
## 1        85             93
## 2        69             80
## 3        58             88
## 4        95             68
## 5        78             90
## 6        80             90

Regression model.

eg8_12_regression <- lm(eg8_12_com$Graduation_pct ~ eg8_12_com$Median_SAT +
                          eg8_12_com$Acceptance_Rate +
                          eg8_12_com$Expenditure_per_Student +
                          eg8_12_com$Top_10_HS)
summary(eg8_12_regression)
## 
## Call:
## lm(formula = eg8_12_com$Graduation_pct ~ eg8_12_com$Median_SAT + 
##     eg8_12_com$Acceptance_Rate + eg8_12_com$Expenditure_per_Student + 
##     eg8_12_com$Top_10_HS)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.8674  -2.0462   0.6193   3.6417  11.2090 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         1.792e+01  2.456e+01   0.730 0.469402    
## eg8_12_com$Median_SAT               7.201e-02  1.798e-02   4.004 0.000236 ***
## eg8_12_com$Acceptance_Rate         -2.486e+01  8.315e+00  -2.990 0.004560 ** 
## eg8_12_com$Expenditure_per_Student -1.356e-04  6.593e-05  -2.057 0.045600 *  
## eg8_12_com$Top_10_HS               -1.628e-01  7.934e-02  -2.051 0.046214 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.308 on 44 degrees of freedom
## Multiple R-squared:  0.5344, Adjusted R-squared:  0.4921 
## F-statistic: 12.63 on 4 and 44 DF,  p-value: 6.332e-07
confint(eg8_12_regression, level = 0.95)
##                                            2.5 %        97.5 %
## (Intercept)                        -3.157088e+01  6.741279e+01
## eg8_12_com$Median_SAT               3.576208e-02  1.082505e-01
## eg8_12_com$Acceptance_Rate         -4.161739e+01 -8.101078e+00
## eg8_12_com$Expenditure_per_Student -2.685259e-04 -2.773789e-06
## eg8_12_com$Top_10_HS               -3.226729e-01 -2.856120e-03

4.3 Residuals

eg8_12_stdresiduals <- rstandard(eg8_12_regression)
head(eg8_12_stdresiduals)
##          1          2          3          4          5          6 
##  0.6449387  0.2008233  0.3063999 -1.4197111  0.1199610  0.4085649
eg8_12_com_2 <- cbind(eg8_12_com, eg8_12_stdresiduals)
head(eg8_12_com_2)
##     School       Type Median_SAT Acceptance_Rate Expenditure_per_Student
## 1  Amherst   Lib Arts       1315            0.22                   26636
## 2  Barnard   Lib Arts       1220            0.53                   17653
## 3    Bates   Lib Arts       1240            0.36                   17554
## 4 Berkeley University       1176            0.37                   23665
## 5  Bowdoin   Lib Arts       1300            0.24                   25703
## 6    Brown University       1281            0.24                   24201
##   Top_10_HS Graduation_pct eg8_12_stdresiduals
## 1        85             93           0.6449387
## 2        69             80           0.2008233
## 3        58             88           0.3063999
## 4        95             68          -1.4197111
## 5        78             90           0.1199610
## 6        80             90           0.4085649

Graduation Percentage.

ggplot(eg8_12_com_2) + aes(x = Graduation_pct, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Graduation Percentage") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Graduation Percentage")

Median SAT scores.

ggplot(eg8_12_com_2) + aes(x = Median_SAT, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Median SAT Scores") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Median SAT Scores")

Acceptance rate

ggplot(eg8_12_com_2) + aes(x = Acceptance_Rate, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Acceptance Rate") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Acceptance Rate")

Expenditure per student.

ggplot(eg8_12_com_2) + aes(x = Expenditure_per_Student, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Expenditure per Student") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Expenditure per Student")

Top 10% in High School.

ggplot(eg8_12_com_2) + aes(x = Top_10_HS, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Top 10% in High School") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Top 10% in High School")

4.4 Diagnostics

4.4.1 Normality

ggplot(eg8_12_com) + aes(x = Graduation_pct) +
  geom_histogram() +
  ylab("Count") +
  ggtitle("Distribution of Graduation Percentage")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using the Shapiro-Wilks test.

H-0: normal distribution.

H-alt: distribution is different from a normal distribution.

shapiro.test(eg8_12_com$Graduation_pct)
## 
##  Shapiro-Wilk normality test
## 
## data:  eg8_12_com$Graduation_pct
## W = 0.92977, p-value = 0.006014

4.4.2 Autocorrelation

May not be very applicable here. But just for illustration……

durbinWatsonTest(eg8_12_regression) 
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.01695507      2.009753     0.9
##  Alternative hypothesis: rho != 0

4.4.3 Multicollinearity

vif(eg8_12_regression) 
##              eg8_12_com$Median_SAT         eg8_12_com$Acceptance_Rate 
##                           2.164223                           2.105911 
## eg8_12_com$Expenditure_per_Student               eg8_12_com$Top_10_HS 
##                           1.770573                           1.969190

4.4.4 Homoscedasticity

Use Breusch Pagan Test.

library(olsrr) 
## Warning: package 'olsrr' was built under R version 4.0.5
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
ols_test_breusch_pagan(eg8_12_regression)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                         Data                          
##  -----------------------------------------------------
##  Response : eg8_12_com$Graduation_pct 
##  Variables: fitted values of eg8_12_com$Graduation_pct 
## 
##         Test Summary         
##  ----------------------------
##  DF            =    1 
##  Chi2          =    1.226334 
##  Prob > Chi2   =    0.2681211

Multiple test for each variable.

ols_test_breusch_pagan(eg8_12_regression, rhs = TRUE,
                       multiple = TRUE)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                                                        Data                                                         
##  -------------------------------------------------------------------------------------------------------------------
##  Response : eg8_12_com$Graduation_pct 
##  Variables: eg8_12_com$Median_SAT eg8_12_com$Acceptance_Rate eg8_12_com$Expenditure_per_Student eg8_12_com$Top_10_HS 
## 
##                    Test Summary (Unadjusted p values)                   
##  ---------------------------------------------------------------------
##   Variable                                 chi2        df        p     
##  ---------------------------------------------------------------------
##   eg8_12_com$Median_SAT                 0.830848067     1    0.3620274 
##   eg8_12_com$Acceptance_Rate            0.582793086     1    0.4452196 
##   eg8_12_com$Expenditure_per_Student    0.159781567     1    0.6893577 
##   eg8_12_com$Top_10_HS                  0.009688868     1    0.9215892 
##  ---------------------------------------------------------------------
##   simultaneous                          1.682850685     4    0.7938304 
##  ---------------------------------------------------------------------

4.4.5 Automatic Diagnostics

library(gvlma)
gvlma(eg8_12_regression)
## 
## Call:
## lm(formula = eg8_12_com$Graduation_pct ~ eg8_12_com$Median_SAT + 
##     eg8_12_com$Acceptance_Rate + eg8_12_com$Expenditure_per_Student + 
##     eg8_12_com$Top_10_HS)
## 
## Coefficients:
##                        (Intercept)               eg8_12_com$Median_SAT  
##                          1.792e+01                           7.201e-02  
##         eg8_12_com$Acceptance_Rate  eg8_12_com$Expenditure_per_Student  
##                         -2.486e+01                          -1.356e-04  
##               eg8_12_com$Top_10_HS  
##                         -1.628e-01  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = eg8_12_regression) 
## 
##                    Value p-value                Decision
## Global Stat        5.261  0.2615 Assumptions acceptable.
## Skewness           1.657  0.1980 Assumptions acceptable.
## Kurtosis           0.176  0.6748 Assumptions acceptable.
## Link Function      2.820  0.0931 Assumptions acceptable.
## Heteroscedasticity 0.608  0.4355 Assumptions acceptable.

5. Categorical Independent Variables

eg8_12_regression_2 <- lm(eg8_12_com$Graduation_pct ~ eg8_12_com$Median_SAT +
                          eg8_12_com$Acceptance_Rate +
                          eg8_12_com$Expenditure_per_Student +
                          eg8_12_com$Top_10_HS + factor(eg8_12_com$Type))
summary(eg8_12_regression_2)
## 
## Call:
## lm(formula = eg8_12_com$Graduation_pct ~ eg8_12_com$Median_SAT + 
##     eg8_12_com$Acceptance_Rate + eg8_12_com$Expenditure_per_Student + 
##     eg8_12_com$Top_10_HS + factor(eg8_12_com$Type))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.5654  -2.9483   0.3081   3.8726  11.9014 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         1.259e+01  2.608e+01   0.483  0.63167    
## eg8_12_com$Median_SAT               7.768e-02  2.014e-02   3.856  0.00038 ***
## eg8_12_com$Acceptance_Rate         -2.464e+01  8.378e+00  -2.941  0.00525 ** 
## eg8_12_com$Expenditure_per_Student -1.642e-04  7.985e-05  -2.056  0.04590 *  
## eg8_12_com$Top_10_HS               -1.866e-01  8.805e-02  -2.119  0.03993 *  
## factor(eg8_12_com$Type)University   1.436e+00  2.236e+00   0.642  0.52408    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.344 on 43 degrees of freedom
## Multiple R-squared:  0.5389, Adjusted R-squared:  0.4852 
## F-statistic: 10.05 on 5 and 43 DF,  p-value: 2.023e-06
confint(eg8_12_regression_2, level = 0.95)
##                                            2.5 %        97.5 %
## (Intercept)                        -4.000033e+01  6.518279e+01
## eg8_12_com$Median_SAT               3.705419e-02  1.183027e-01
## eg8_12_com$Acceptance_Rate         -4.153449e+01 -7.741431e+00
## eg8_12_com$Expenditure_per_Student -3.251922e-04 -3.128078e-06
## eg8_12_com$Top_10_HS               -3.641304e-01 -8.986419e-03
## factor(eg8_12_com$Type)University  -3.073331e+00  5.945974e+00

5.1 Residuals

eg8_12_stdresiduals_2 <- rstandard(eg8_12_regression_2)
eg8_12_stdresiduals_2
##           1           2           3           4           5           6 
##  0.77047462  0.28651708  0.32340149 -1.39467253  0.21607707  0.25370435 
##           7           8           9          10          11          12 
##  0.77051290 -1.49953514 -1.40265359 -0.25398293 -2.21309600  0.45743712 
##          13          14          15          16          17          18 
## -0.32623281  1.30717612 -0.21288077  1.36605372  0.93403761  0.02896642 
##          19          20          21          22          23          24 
## -0.81279483  0.06200444 -0.30071528 -0.06752947  0.77723760  0.85009900 
##          25          26          27          28          29          30 
##  0.19726011  0.96477712  0.60281233 -0.64808316 -1.19645214 -1.53169967 
##          31          32          33          34          35          36 
##  0.70144932 -0.59507413  2.35690516 -0.13719103 -0.26707632  0.35647355 
##          37          38          39          40          41          42 
## -2.11067232 -0.19457706  0.77549454  1.18181419 -1.97165257 -0.17083150 
##          43          44          45          46          47          48 
## -0.95028381 -0.01690501 -1.77855715  1.40615964  0.71600404  0.69181778 
##          49 
##  0.82234144
eg8_12_com_cat <- cbind(eg8_12_com, eg8_12_stdresiduals_2)
head(eg8_12_com_cat)
##     School       Type Median_SAT Acceptance_Rate Expenditure_per_Student
## 1  Amherst   Lib Arts       1315            0.22                   26636
## 2  Barnard   Lib Arts       1220            0.53                   17653
## 3    Bates   Lib Arts       1240            0.36                   17554
## 4 Berkeley University       1176            0.37                   23665
## 5  Bowdoin   Lib Arts       1300            0.24                   25703
## 6    Brown University       1281            0.24                   24201
##   Top_10_HS Graduation_pct eg8_12_stdresiduals_2
## 1        85             93             0.7704746
## 2        69             80             0.2865171
## 3        58             88             0.3234015
## 4        95             68            -1.3946725
## 5        78             90             0.2160771
## 6        80             90             0.2537043

Graduation percentage.

ggplot(eg8_12_com_cat) + aes(x = Graduation_pct, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Graduation Percentage") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Graduation Percentage")

Median SAT Score.

ggplot(eg8_12_com_cat) + aes(x = Median_SAT, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Median SAT Scores") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Median SAT Scores")

Acceptance rate.

ggplot(eg8_12_com_cat) + aes(x = Acceptance_Rate, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Acceptance Rate") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Acceptance Rate")

Expenditure per Student.

ggplot(eg8_12_com_cat) + aes(x = Expenditure_per_Student, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Expenditure per Student") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Expenditure per Student")

Top 10% in High School.

ggplot(eg8_12_com_cat) + aes(x = Top_10_HS, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Top 10% in High School") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Top 10% in High School")

Type of University.

ggplot(eg8_12_com_cat) + aes(x = Type, y = eg8_12_stdresiduals) +
  geom_point() + xlab("Type of University") + ylab("Standarised Residuals") +
  ggtitle("Standarised Residual Plot, Type of University")

5.2 Diagnostics

5.2.1 Normality

ggplot(eg8_12_com_cat) + aes(x = Graduation_pct) +
  geom_histogram() +
  ylab("Count") +
  ggtitle("Distribution of Graduation Percentage")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using the Shapiro-Wilks test.

H-0: normal distribution.

H-alt: distribution is different from a normal distribution.

shapiro.test(eg8_12_com_cat$Graduation_pct)
## 
##  Shapiro-Wilk normality test
## 
## data:  eg8_12_com_cat$Graduation_pct
## W = 0.92977, p-value = 0.006014

5.2.2 Autocorrelation

May not be very applicable here. But just for illustration……

durbinWatsonTest(eg8_12_regression_2) 
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.01388796      2.001011   0.982
##  Alternative hypothesis: rho != 0

5.2.3 Multicollinearity

vif(eg8_12_regression_2) 
##              eg8_12_com$Median_SAT         eg8_12_com$Acceptance_Rate 
##                           2.679097                           2.109477 
## eg8_12_com$Expenditure_per_Student               eg8_12_com$Top_10_HS 
##                           2.562343                           2.392690 
##            factor(eg8_12_com$Type) 
##                           2.143916

5.2.4 Homoscedasticity

Use Breusch Pagan Test.

ols_test_breusch_pagan(eg8_12_regression_2)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                         Data                          
##  -----------------------------------------------------
##  Response : eg8_12_com$Graduation_pct 
##  Variables: fitted values of eg8_12_com$Graduation_pct 
## 
##        Test Summary         
##  ---------------------------
##  DF            =    1 
##  Chi2          =    1.520704 
##  Prob > Chi2   =    0.217513

5.2.5 Automatic Diagnostics

gvlma(eg8_12_regression_2)
## 
## Call:
## lm(formula = eg8_12_com$Graduation_pct ~ eg8_12_com$Median_SAT + 
##     eg8_12_com$Acceptance_Rate + eg8_12_com$Expenditure_per_Student + 
##     eg8_12_com$Top_10_HS + factor(eg8_12_com$Type))
## 
## Coefficients:
##                        (Intercept)               eg8_12_com$Median_SAT  
##                          1.259e+01                           7.768e-02  
##         eg8_12_com$Acceptance_Rate  eg8_12_com$Expenditure_per_Student  
##                         -2.464e+01                          -1.642e-04  
##               eg8_12_com$Top_10_HS   factor(eg8_12_com$Type)University  
##                         -1.866e-01                           1.436e+00  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = eg8_12_regression_2) 
## 
##                      Value p-value                Decision
## Global Stat        4.87073 0.30082 Assumptions acceptable.
## Skewness           1.04439 0.30680 Assumptions acceptable.
## Kurtosis           0.06851 0.79352 Assumptions acceptable.
## Link Function      2.98099 0.08425 Assumptions acceptable.
## Heteroscedasticity 0.77684 0.37811 Assumptions acceptable.

6. Non-Linear Regression

6.1 Load data

library(xlsx)

eg8_18 <- read.xlsx("Eg 8-18 Beverage Sales.xlsx",
                    sheetIndex = 1, header = TRUE,
                    startRow = 3, colIndex = 1:2)
head(eg8_18)
##   Temperature Sales
## 1          85  1810
## 2          90  4825
## 3          79   438
## 4          82   775
## 5          84  1213
## 6          96  8692

6.2 Non-Linear Model

eg8_18_regression <- lm(eg8_18$Sales ~ eg8_18$Temperature^2)
summary(eg8_18_regression)
## 
## Call:
## lm(formula = eg8_18$Sales ~ eg8_18$Temperature^2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1765.6  -598.4  -293.0   562.0  2014.8 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -32511.25    3408.72  -9.538 1.12e-08 ***
## eg8_18$Temperature    408.60      39.27  10.406 2.76e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1041 on 19 degrees of freedom
## Multiple R-squared:  0.8507, Adjusted R-squared:  0.8429 
## F-statistic: 108.3 on 1 and 19 DF,  p-value: 2.761e-09
confint(eg8_18_regression, level = 0.95)
##                          2.5 %      97.5 %
## (Intercept)        -39645.7869 -25376.7065
## eg8_18$Temperature    326.4189    490.7864

6.3. Residuals

eg8_18_stdresiduals <- rstandard(eg8_18_regression)
eg8_18_stdresiduals
##          1          2          3          4          5          6          7 
## -0.4043254  0.5579609  0.6896936 -0.2192460 -0.5920101  2.0884092 -1.0741874 
##          8          9         10         11         12         13         14 
##  1.8602444 -0.5675443  2.1650140 -1.1272465 -0.3188644 -0.9171575 -0.2908789 
##         15         16         17         18         19         20         21 
## -0.1002041 -1.7633017  0.3494740 -0.6365315  0.7144124  0.4360437 -0.3703755
eg8_18_com <- cbind(eg8_18, eg8_18_stdresiduals)
head(eg8_18_com)
##   Temperature Sales eg8_18_stdresiduals
## 1          85  1810          -0.4043254
## 2          90  4825           0.5579609
## 3          79   438           0.6896936
## 4          82   775          -0.2192460
## 5          84  1213          -0.5920101
## 6          96  8692           2.0884092
ggplot(eg8_18_com) + aes(x = Sales, eg8_18_stdresiduals) +
  geom_point() +
  xlab("Sales") + ylab("Standard Residuals") +
  ggtitle("Temperature and Sales, Residuals")

6.4 Normality

ggplot(eg8_18) + aes(x = Sales) +
  geom_histogram() +
  ylab("Count") +
  ggtitle("Distribution of Sales")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using the Shapiro-Wilks test.

H-0: normal distribution.

H-alt: distribution is different from a normal distribution.

shapiro.test(eg8_18$Sales)
## 
##  Shapiro-Wilk normality test
## 
## data:  eg8_18$Sales
## W = 0.84399, p-value = 0.003353

6.5 Autocorrelation

May not be very applicable here. But just for illustration……

durbinWatsonTest(eg8_18_regression) 
##  lag Autocorrelation D-W Statistic p-value
##    1      -0.4602732      2.906229   0.018
##  Alternative hypothesis: rho != 0

6.6 Homoscedasticity

Use Breusch Pagan Test.

library(olsrr) 
ols_test_breusch_pagan(eg8_18_regression)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                   Data                   
##  ----------------------------------------
##  Response : eg8_18$Sales 
##  Variables: fitted values of eg8_18$Sales 
## 
##         Test Summary         
##  ----------------------------
##  DF            =    1 
##  Chi2          =    2.49159 
##  Prob > Chi2   =    0.1144561

Multiple test for each variable.

ols_test_breusch_pagan(eg8_18_regression, rhs = TRUE,
                       multiple = TRUE)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##     Data      
##  -------------
##  Response : eg8_18$Sales 
##  Variables: m2 
## 
##        Test Summary (Unadjusted p values)      
##  --------------------------------------------
##   Variable          chi2      df        p     
##  --------------------------------------------
##   m2               2.49159     1    0.1144561 
##  --------------------------------------------
##   simultaneous     2.49159     1    0.1144561 
##  --------------------------------------------

6.7 Automatic Diagnostics

library(gvlma)
gvlma(eg8_18_regression)
## 
## Call:
## lm(formula = eg8_18$Sales ~ eg8_18$Temperature^2)
## 
## Coefficients:
##        (Intercept)  eg8_18$Temperature  
##           -32511.2               408.6  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = eg8_18_regression) 
## 
##                       Value   p-value                   Decision
## Global Stat        15.15675 0.0043869 Assumptions NOT satisfied!
## Skewness            1.22049 0.2692645    Assumptions acceptable.
## Kurtosis            0.06201 0.8033491    Assumptions acceptable.
## Link Function      13.59504 0.0002268 Assumptions NOT satisfied!
## Heteroscedasticity  0.27922 0.5972123    Assumptions acceptable.