library(xlsx)
## Warning: package 'xlsx' was built under R version 4.0.4
eg8_3 <- read.xlsx("Eg 8-3 Home Market Value.xlsx",
sheetIndex = 1, header = TRUE,
startRow = 3, colIndex = 1:3)
head(eg8_3)
## House.Age Square.Feet Market.Value
## 1 33 1812 90000
## 2 32 1914 104400
## 3 32 1842 93300
## 4 33 1812 91000
## 5 32 1836 101900
## 6 33 2028 108500
names(eg8_3)[1:3] <- c("House_Age", "Square_Feet", "Market_Value")
head(eg8_3)
## House_Age Square_Feet Market_Value
## 1 33 1812 90000
## 2 32 1914 104400
## 3 32 1842 93300
## 4 33 1812 91000
## 5 32 1836 101900
## 6 33 2028 108500
library(ggplot2)
library(ggpubr)
ggplot(eg8_3) + aes(x = Market_Value, y = Square_Feet) +
geom_point(shape = 2, colour = "black") +
xlab("Square Footage") + ylab("Market Value") +
ggtitle("Size of House and Market Value") +
geom_smooth(method = lm) +
stat_regline_equation(label.x = 110000, label.y = 1700) +
stat_cor(method = "pearson", label.x = 110000, label.y = 1600)
## `geom_smooth()` using formula 'y ~ x'
eg8_3_regression <- lm(eg8_3$Market_Value ~ eg8_3$Square_Feet)
summary(eg8_3_regression)
##
## Call:
## lm(formula = eg8_3$Market_Value ~ eg8_3$Square_Feet)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8067 -4327 -1923 3097 32634
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32673.220 8831.951 3.699 0.00065 ***
## eg8_3$Square_Feet 35.036 5.167 6.780 3.8e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7288 on 40 degrees of freedom
## Multiple R-squared: 0.5347, Adjusted R-squared: 0.5231
## F-statistic: 45.97 on 1 and 40 DF, p-value: 3.798e-08
confint(eg8_3_regression, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 14823.1816 50523.25820
## eg8_3$Square_Feet 24.5927 45.48004
eg8_3_stdresiduals <- rstandard(eg8_3_regression)
head(eg8_3_stdresiduals)
## 1 2 3 4 5 6
## -0.8583995 0.6563142 -0.5460889 -0.7190292 0.6840116 0.6826257
eg8_3_com <- cbind(eg8_3, eg8_3_stdresiduals)
head(eg8_3_com)
## House_Age Square_Feet Market_Value eg8_3_stdresiduals
## 1 33 1812 90000 -0.8583995
## 2 32 1914 104400 0.6563142
## 3 32 1842 93300 -0.5460889
## 4 33 1812 91000 -0.7190292
## 5 32 1836 101900 0.6840116
## 6 33 2028 108500 0.6826257
ggplot(eg8_3_com) + aes(x = eg8_3_com$Market_Value, y = eg8_3_com$eg8_3_stdresiduals) +
geom_point() +
xlab("Market Value") + ylab("Standard Residuals") +
ggtitle("Size of House and Market Value, Residuals")
## Warning: Use of `eg8_3_com$Market_Value` is discouraged. Use `Market_Value`
## instead.
## Warning: Use of `eg8_3_com$eg8_3_stdresiduals` is discouraged. Use
## `eg8_3_stdresiduals` instead.
ggplot(eg8_3) + aes(x = Market_Value) +
geom_histogram() +
ylab("Count") +
ggtitle("Distribution of Market Value")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Using the Shapiro-Wilks test.
H-0: normal distribution.
H-alt: distribution is different from a normal distribution.
shapiro.test(eg8_3$Market_Value)
##
## Shapiro-Wilk normality test
##
## data: eg8_3$Market_Value
## W = 0.90155, p-value = 0.001603
May not be very applicable here. But just for illustration……
library(car)
## Loading required package: carData
durbinWatsonTest(eg8_3_regression)
## lag Autocorrelation D-W Statistic p-value
## 1 0.1110143 1.258807 0.024
## Alternative hypothesis: rho != 0
library(gvlma)
gvlma(eg8_3_regression)
##
## Call:
## lm(formula = eg8_3$Market_Value ~ eg8_3$Square_Feet)
##
## Coefficients:
## (Intercept) eg8_3$Square_Feet
## 32673.22 35.04
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = eg8_3_regression)
##
## Value p-value Decision
## Global Stat 1.793e+02 0.000e+00 Assumptions NOT satisfied!
## Skewness 4.133e+01 1.284e-10 Assumptions NOT satisfied!
## Kurtosis 1.206e+02 0.000e+00 Assumptions NOT satisfied!
## Link Function 9.494e-04 9.754e-01 Assumptions acceptable.
## Heteroscedasticity 1.741e+01 3.006e-05 Assumptions NOT satisfied!
eg8_12 <- read.xlsx("Eg 8-12 Colleges and Universities.xlsx",
sheetIndex = 1, header = TRUE,
startRow = 3, colIndex = 1:7)
head(eg8_12)
## School Type Median.SAT Acceptance.Rate Expenditures.Student
## 1 Amherst Lib Arts 1315 0.22 26636
## 2 Barnard Lib Arts 1220 0.53 17653
## 3 Bates Lib Arts 1240 0.36 17554
## 4 Berkeley University 1176 0.37 23665
## 5 Bowdoin Lib Arts 1300 0.24 25703
## 6 Brown University 1281 0.24 24201
## Top.10..HS Graduation..
## 1 85 93
## 2 69 80
## 3 58 88
## 4 95 68
## 5 78 90
## 6 80 90
names(eg8_12)[1:7] <- c("School", "Type", "Median_SAT",
"Acceptance_Rate", "Expenditure_per_Student",
"Top_10_HS", "Graduation_pct")
head(eg8_12)
## School Type Median_SAT Acceptance_Rate Expenditure_per_Student
## 1 Amherst Lib Arts 1315 0.22 26636
## 2 Barnard Lib Arts 1220 0.53 17653
## 3 Bates Lib Arts 1240 0.36 17554
## 4 Berkeley University 1176 0.37 23665
## 5 Bowdoin Lib Arts 1300 0.24 25703
## 6 Brown University 1281 0.24 24201
## Top_10_HS Graduation_pct
## 1 85 93
## 2 69 80
## 3 58 88
## 4 95 68
## 5 78 90
## 6 80 90
Select only complete cases.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::recode() masks car::recode()
## x purrr::some() masks car::some()
eg8_12_com <- drop_na(eg8_12, "Graduation_pct")
head(eg8_12_com)
## School Type Median_SAT Acceptance_Rate Expenditure_per_Student
## 1 Amherst Lib Arts 1315 0.22 26636
## 2 Barnard Lib Arts 1220 0.53 17653
## 3 Bates Lib Arts 1240 0.36 17554
## 4 Berkeley University 1176 0.37 23665
## 5 Bowdoin Lib Arts 1300 0.24 25703
## 6 Brown University 1281 0.24 24201
## Top_10_HS Graduation_pct
## 1 85 93
## 2 69 80
## 3 58 88
## 4 95 68
## 5 78 90
## 6 80 90
Regression model.
eg8_12_regression <- lm(eg8_12_com$Graduation_pct ~ eg8_12_com$Median_SAT +
eg8_12_com$Acceptance_Rate +
eg8_12_com$Expenditure_per_Student +
eg8_12_com$Top_10_HS)
summary(eg8_12_regression)
##
## Call:
## lm(formula = eg8_12_com$Graduation_pct ~ eg8_12_com$Median_SAT +
## eg8_12_com$Acceptance_Rate + eg8_12_com$Expenditure_per_Student +
## eg8_12_com$Top_10_HS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.8674 -2.0462 0.6193 3.6417 11.2090
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.792e+01 2.456e+01 0.730 0.469402
## eg8_12_com$Median_SAT 7.201e-02 1.798e-02 4.004 0.000236 ***
## eg8_12_com$Acceptance_Rate -2.486e+01 8.315e+00 -2.990 0.004560 **
## eg8_12_com$Expenditure_per_Student -1.356e-04 6.593e-05 -2.057 0.045600 *
## eg8_12_com$Top_10_HS -1.628e-01 7.934e-02 -2.051 0.046214 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.308 on 44 degrees of freedom
## Multiple R-squared: 0.5344, Adjusted R-squared: 0.4921
## F-statistic: 12.63 on 4 and 44 DF, p-value: 6.332e-07
confint(eg8_12_regression, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -3.157088e+01 6.741279e+01
## eg8_12_com$Median_SAT 3.576208e-02 1.082505e-01
## eg8_12_com$Acceptance_Rate -4.161739e+01 -8.101078e+00
## eg8_12_com$Expenditure_per_Student -2.685259e-04 -2.773789e-06
## eg8_12_com$Top_10_HS -3.226729e-01 -2.856120e-03
eg8_12_stdresiduals <- rstandard(eg8_12_regression)
head(eg8_12_stdresiduals)
## 1 2 3 4 5 6
## 0.6449387 0.2008233 0.3063999 -1.4197111 0.1199610 0.4085649
eg8_12_com_2 <- cbind(eg8_12_com, eg8_12_stdresiduals)
head(eg8_12_com_2)
## School Type Median_SAT Acceptance_Rate Expenditure_per_Student
## 1 Amherst Lib Arts 1315 0.22 26636
## 2 Barnard Lib Arts 1220 0.53 17653
## 3 Bates Lib Arts 1240 0.36 17554
## 4 Berkeley University 1176 0.37 23665
## 5 Bowdoin Lib Arts 1300 0.24 25703
## 6 Brown University 1281 0.24 24201
## Top_10_HS Graduation_pct eg8_12_stdresiduals
## 1 85 93 0.6449387
## 2 69 80 0.2008233
## 3 58 88 0.3063999
## 4 95 68 -1.4197111
## 5 78 90 0.1199610
## 6 80 90 0.4085649
Graduation Percentage.
ggplot(eg8_12_com_2) + aes(x = Graduation_pct, y = eg8_12_stdresiduals) +
geom_point() + xlab("Graduation Percentage") + ylab("Standarised Residuals") +
ggtitle("Standarised Residual Plot, Graduation Percentage")
Median SAT scores.
ggplot(eg8_12_com_2) + aes(x = Median_SAT, y = eg8_12_stdresiduals) +
geom_point() + xlab("Median SAT Scores") + ylab("Standarised Residuals") +
ggtitle("Standarised Residual Plot, Median SAT Scores")
Acceptance rate
ggplot(eg8_12_com_2) + aes(x = Acceptance_Rate, y = eg8_12_stdresiduals) +
geom_point() + xlab("Acceptance Rate") + ylab("Standarised Residuals") +
ggtitle("Standarised Residual Plot, Acceptance Rate")
Expenditure per student.
ggplot(eg8_12_com_2) + aes(x = Expenditure_per_Student, y = eg8_12_stdresiduals) +
geom_point() + xlab("Expenditure per Student") + ylab("Standarised Residuals") +
ggtitle("Standarised Residual Plot, Expenditure per Student")
Top 10% in High School.
ggplot(eg8_12_com_2) + aes(x = Top_10_HS, y = eg8_12_stdresiduals) +
geom_point() + xlab("Top 10% in High School") + ylab("Standarised Residuals") +
ggtitle("Standarised Residual Plot, Top 10% in High School")
ggplot(eg8_12_com) + aes(x = Graduation_pct) +
geom_histogram() +
ylab("Count") +
ggtitle("Distribution of Graduation Percentage")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Using the Shapiro-Wilks test.
H-0: normal distribution.
H-alt: distribution is different from a normal distribution.
shapiro.test(eg8_12_com$Graduation_pct)
##
## Shapiro-Wilk normality test
##
## data: eg8_12_com$Graduation_pct
## W = 0.92977, p-value = 0.006014
May not be very applicable here. But just for illustration……
durbinWatsonTest(eg8_12_regression)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.01695507 2.009753 0.9
## Alternative hypothesis: rho != 0
vif(eg8_12_regression)
## eg8_12_com$Median_SAT eg8_12_com$Acceptance_Rate
## 2.164223 2.105911
## eg8_12_com$Expenditure_per_Student eg8_12_com$Top_10_HS
## 1.770573 1.969190
Use Breusch Pagan Test.
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.0.5
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
ols_test_breusch_pagan(eg8_12_regression)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## -----------------------------------------------------
## Response : eg8_12_com$Graduation_pct
## Variables: fitted values of eg8_12_com$Graduation_pct
##
## Test Summary
## ----------------------------
## DF = 1
## Chi2 = 1.226334
## Prob > Chi2 = 0.2681211
Multiple test for each variable.
ols_test_breusch_pagan(eg8_12_regression, rhs = TRUE,
multiple = TRUE)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## -------------------------------------------------------------------------------------------------------------------
## Response : eg8_12_com$Graduation_pct
## Variables: eg8_12_com$Median_SAT eg8_12_com$Acceptance_Rate eg8_12_com$Expenditure_per_Student eg8_12_com$Top_10_HS
##
## Test Summary (Unadjusted p values)
## ---------------------------------------------------------------------
## Variable chi2 df p
## ---------------------------------------------------------------------
## eg8_12_com$Median_SAT 0.830848067 1 0.3620274
## eg8_12_com$Acceptance_Rate 0.582793086 1 0.4452196
## eg8_12_com$Expenditure_per_Student 0.159781567 1 0.6893577
## eg8_12_com$Top_10_HS 0.009688868 1 0.9215892
## ---------------------------------------------------------------------
## simultaneous 1.682850685 4 0.7938304
## ---------------------------------------------------------------------
library(gvlma)
gvlma(eg8_12_regression)
##
## Call:
## lm(formula = eg8_12_com$Graduation_pct ~ eg8_12_com$Median_SAT +
## eg8_12_com$Acceptance_Rate + eg8_12_com$Expenditure_per_Student +
## eg8_12_com$Top_10_HS)
##
## Coefficients:
## (Intercept) eg8_12_com$Median_SAT
## 1.792e+01 7.201e-02
## eg8_12_com$Acceptance_Rate eg8_12_com$Expenditure_per_Student
## -2.486e+01 -1.356e-04
## eg8_12_com$Top_10_HS
## -1.628e-01
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = eg8_12_regression)
##
## Value p-value Decision
## Global Stat 5.261 0.2615 Assumptions acceptable.
## Skewness 1.657 0.1980 Assumptions acceptable.
## Kurtosis 0.176 0.6748 Assumptions acceptable.
## Link Function 2.820 0.0931 Assumptions acceptable.
## Heteroscedasticity 0.608 0.4355 Assumptions acceptable.
eg8_12_regression_2 <- lm(eg8_12_com$Graduation_pct ~ eg8_12_com$Median_SAT +
eg8_12_com$Acceptance_Rate +
eg8_12_com$Expenditure_per_Student +
eg8_12_com$Top_10_HS + factor(eg8_12_com$Type))
summary(eg8_12_regression_2)
##
## Call:
## lm(formula = eg8_12_com$Graduation_pct ~ eg8_12_com$Median_SAT +
## eg8_12_com$Acceptance_Rate + eg8_12_com$Expenditure_per_Student +
## eg8_12_com$Top_10_HS + factor(eg8_12_com$Type))
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.5654 -2.9483 0.3081 3.8726 11.9014
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.259e+01 2.608e+01 0.483 0.63167
## eg8_12_com$Median_SAT 7.768e-02 2.014e-02 3.856 0.00038 ***
## eg8_12_com$Acceptance_Rate -2.464e+01 8.378e+00 -2.941 0.00525 **
## eg8_12_com$Expenditure_per_Student -1.642e-04 7.985e-05 -2.056 0.04590 *
## eg8_12_com$Top_10_HS -1.866e-01 8.805e-02 -2.119 0.03993 *
## factor(eg8_12_com$Type)University 1.436e+00 2.236e+00 0.642 0.52408
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.344 on 43 degrees of freedom
## Multiple R-squared: 0.5389, Adjusted R-squared: 0.4852
## F-statistic: 10.05 on 5 and 43 DF, p-value: 2.023e-06
confint(eg8_12_regression_2, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -4.000033e+01 6.518279e+01
## eg8_12_com$Median_SAT 3.705419e-02 1.183027e-01
## eg8_12_com$Acceptance_Rate -4.153449e+01 -7.741431e+00
## eg8_12_com$Expenditure_per_Student -3.251922e-04 -3.128078e-06
## eg8_12_com$Top_10_HS -3.641304e-01 -8.986419e-03
## factor(eg8_12_com$Type)University -3.073331e+00 5.945974e+00
eg8_12_stdresiduals_2 <- rstandard(eg8_12_regression_2)
eg8_12_stdresiduals_2
## 1 2 3 4 5 6
## 0.77047462 0.28651708 0.32340149 -1.39467253 0.21607707 0.25370435
## 7 8 9 10 11 12
## 0.77051290 -1.49953514 -1.40265359 -0.25398293 -2.21309600 0.45743712
## 13 14 15 16 17 18
## -0.32623281 1.30717612 -0.21288077 1.36605372 0.93403761 0.02896642
## 19 20 21 22 23 24
## -0.81279483 0.06200444 -0.30071528 -0.06752947 0.77723760 0.85009900
## 25 26 27 28 29 30
## 0.19726011 0.96477712 0.60281233 -0.64808316 -1.19645214 -1.53169967
## 31 32 33 34 35 36
## 0.70144932 -0.59507413 2.35690516 -0.13719103 -0.26707632 0.35647355
## 37 38 39 40 41 42
## -2.11067232 -0.19457706 0.77549454 1.18181419 -1.97165257 -0.17083150
## 43 44 45 46 47 48
## -0.95028381 -0.01690501 -1.77855715 1.40615964 0.71600404 0.69181778
## 49
## 0.82234144
eg8_12_com_cat <- cbind(eg8_12_com, eg8_12_stdresiduals_2)
head(eg8_12_com_cat)
## School Type Median_SAT Acceptance_Rate Expenditure_per_Student
## 1 Amherst Lib Arts 1315 0.22 26636
## 2 Barnard Lib Arts 1220 0.53 17653
## 3 Bates Lib Arts 1240 0.36 17554
## 4 Berkeley University 1176 0.37 23665
## 5 Bowdoin Lib Arts 1300 0.24 25703
## 6 Brown University 1281 0.24 24201
## Top_10_HS Graduation_pct eg8_12_stdresiduals_2
## 1 85 93 0.7704746
## 2 69 80 0.2865171
## 3 58 88 0.3234015
## 4 95 68 -1.3946725
## 5 78 90 0.2160771
## 6 80 90 0.2537043
Graduation percentage.
ggplot(eg8_12_com_cat) + aes(x = Graduation_pct, y = eg8_12_stdresiduals) +
geom_point() + xlab("Graduation Percentage") + ylab("Standarised Residuals") +
ggtitle("Standarised Residual Plot, Graduation Percentage")
Median SAT Score.
ggplot(eg8_12_com_cat) + aes(x = Median_SAT, y = eg8_12_stdresiduals) +
geom_point() + xlab("Median SAT Scores") + ylab("Standarised Residuals") +
ggtitle("Standarised Residual Plot, Median SAT Scores")
Acceptance rate.
ggplot(eg8_12_com_cat) + aes(x = Acceptance_Rate, y = eg8_12_stdresiduals) +
geom_point() + xlab("Acceptance Rate") + ylab("Standarised Residuals") +
ggtitle("Standarised Residual Plot, Acceptance Rate")
Expenditure per Student.
ggplot(eg8_12_com_cat) + aes(x = Expenditure_per_Student, y = eg8_12_stdresiduals) +
geom_point() + xlab("Expenditure per Student") + ylab("Standarised Residuals") +
ggtitle("Standarised Residual Plot, Expenditure per Student")
Top 10% in High School.
ggplot(eg8_12_com_cat) + aes(x = Top_10_HS, y = eg8_12_stdresiduals) +
geom_point() + xlab("Top 10% in High School") + ylab("Standarised Residuals") +
ggtitle("Standarised Residual Plot, Top 10% in High School")
Type of University.
ggplot(eg8_12_com_cat) + aes(x = Type, y = eg8_12_stdresiduals) +
geom_point() + xlab("Type of University") + ylab("Standarised Residuals") +
ggtitle("Standarised Residual Plot, Type of University")
ggplot(eg8_12_com_cat) + aes(x = Graduation_pct) +
geom_histogram() +
ylab("Count") +
ggtitle("Distribution of Graduation Percentage")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Using the Shapiro-Wilks test.
H-0: normal distribution.
H-alt: distribution is different from a normal distribution.
shapiro.test(eg8_12_com_cat$Graduation_pct)
##
## Shapiro-Wilk normality test
##
## data: eg8_12_com_cat$Graduation_pct
## W = 0.92977, p-value = 0.006014
May not be very applicable here. But just for illustration……
durbinWatsonTest(eg8_12_regression_2)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.01388796 2.001011 0.982
## Alternative hypothesis: rho != 0
vif(eg8_12_regression_2)
## eg8_12_com$Median_SAT eg8_12_com$Acceptance_Rate
## 2.679097 2.109477
## eg8_12_com$Expenditure_per_Student eg8_12_com$Top_10_HS
## 2.562343 2.392690
## factor(eg8_12_com$Type)
## 2.143916
Use Breusch Pagan Test.
ols_test_breusch_pagan(eg8_12_regression_2)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## -----------------------------------------------------
## Response : eg8_12_com$Graduation_pct
## Variables: fitted values of eg8_12_com$Graduation_pct
##
## Test Summary
## ---------------------------
## DF = 1
## Chi2 = 1.520704
## Prob > Chi2 = 0.217513
gvlma(eg8_12_regression_2)
##
## Call:
## lm(formula = eg8_12_com$Graduation_pct ~ eg8_12_com$Median_SAT +
## eg8_12_com$Acceptance_Rate + eg8_12_com$Expenditure_per_Student +
## eg8_12_com$Top_10_HS + factor(eg8_12_com$Type))
##
## Coefficients:
## (Intercept) eg8_12_com$Median_SAT
## 1.259e+01 7.768e-02
## eg8_12_com$Acceptance_Rate eg8_12_com$Expenditure_per_Student
## -2.464e+01 -1.642e-04
## eg8_12_com$Top_10_HS factor(eg8_12_com$Type)University
## -1.866e-01 1.436e+00
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = eg8_12_regression_2)
##
## Value p-value Decision
## Global Stat 4.87073 0.30082 Assumptions acceptable.
## Skewness 1.04439 0.30680 Assumptions acceptable.
## Kurtosis 0.06851 0.79352 Assumptions acceptable.
## Link Function 2.98099 0.08425 Assumptions acceptable.
## Heteroscedasticity 0.77684 0.37811 Assumptions acceptable.
library(xlsx)
eg8_18 <- read.xlsx("Eg 8-18 Beverage Sales.xlsx",
sheetIndex = 1, header = TRUE,
startRow = 3, colIndex = 1:2)
head(eg8_18)
## Temperature Sales
## 1 85 1810
## 2 90 4825
## 3 79 438
## 4 82 775
## 5 84 1213
## 6 96 8692
eg8_18_regression <- lm(eg8_18$Sales ~ eg8_18$Temperature^2)
summary(eg8_18_regression)
##
## Call:
## lm(formula = eg8_18$Sales ~ eg8_18$Temperature^2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1765.6 -598.4 -293.0 562.0 2014.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -32511.25 3408.72 -9.538 1.12e-08 ***
## eg8_18$Temperature 408.60 39.27 10.406 2.76e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1041 on 19 degrees of freedom
## Multiple R-squared: 0.8507, Adjusted R-squared: 0.8429
## F-statistic: 108.3 on 1 and 19 DF, p-value: 2.761e-09
confint(eg8_18_regression, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -39645.7869 -25376.7065
## eg8_18$Temperature 326.4189 490.7864
eg8_18_stdresiduals <- rstandard(eg8_18_regression)
eg8_18_stdresiduals
## 1 2 3 4 5 6 7
## -0.4043254 0.5579609 0.6896936 -0.2192460 -0.5920101 2.0884092 -1.0741874
## 8 9 10 11 12 13 14
## 1.8602444 -0.5675443 2.1650140 -1.1272465 -0.3188644 -0.9171575 -0.2908789
## 15 16 17 18 19 20 21
## -0.1002041 -1.7633017 0.3494740 -0.6365315 0.7144124 0.4360437 -0.3703755
eg8_18_com <- cbind(eg8_18, eg8_18_stdresiduals)
head(eg8_18_com)
## Temperature Sales eg8_18_stdresiduals
## 1 85 1810 -0.4043254
## 2 90 4825 0.5579609
## 3 79 438 0.6896936
## 4 82 775 -0.2192460
## 5 84 1213 -0.5920101
## 6 96 8692 2.0884092
ggplot(eg8_18_com) + aes(x = Sales, eg8_18_stdresiduals) +
geom_point() +
xlab("Sales") + ylab("Standard Residuals") +
ggtitle("Temperature and Sales, Residuals")
ggplot(eg8_18) + aes(x = Sales) +
geom_histogram() +
ylab("Count") +
ggtitle("Distribution of Sales")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Using the Shapiro-Wilks test.
H-0: normal distribution.
H-alt: distribution is different from a normal distribution.
shapiro.test(eg8_18$Sales)
##
## Shapiro-Wilk normality test
##
## data: eg8_18$Sales
## W = 0.84399, p-value = 0.003353
May not be very applicable here. But just for illustration……
durbinWatsonTest(eg8_18_regression)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.4602732 2.906229 0.018
## Alternative hypothesis: rho != 0
Use Breusch Pagan Test.
library(olsrr)
ols_test_breusch_pagan(eg8_18_regression)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ----------------------------------------
## Response : eg8_18$Sales
## Variables: fitted values of eg8_18$Sales
##
## Test Summary
## ----------------------------
## DF = 1
## Chi2 = 2.49159
## Prob > Chi2 = 0.1144561
Multiple test for each variable.
ols_test_breusch_pagan(eg8_18_regression, rhs = TRUE,
multiple = TRUE)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## -------------
## Response : eg8_18$Sales
## Variables: m2
##
## Test Summary (Unadjusted p values)
## --------------------------------------------
## Variable chi2 df p
## --------------------------------------------
## m2 2.49159 1 0.1144561
## --------------------------------------------
## simultaneous 2.49159 1 0.1144561
## --------------------------------------------
library(gvlma)
gvlma(eg8_18_regression)
##
## Call:
## lm(formula = eg8_18$Sales ~ eg8_18$Temperature^2)
##
## Coefficients:
## (Intercept) eg8_18$Temperature
## -32511.2 408.6
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = eg8_18_regression)
##
## Value p-value Decision
## Global Stat 15.15675 0.0043869 Assumptions NOT satisfied!
## Skewness 1.22049 0.2692645 Assumptions acceptable.
## Kurtosis 0.06201 0.8033491 Assumptions acceptable.
## Link Function 13.59504 0.0002268 Assumptions NOT satisfied!
## Heteroscedasticity 0.27922 0.5972123 Assumptions acceptable.