Directions

It’s a boba tea-quila sunrise! Comparing means using t-test.

Data for demo

Back to the spell book

1. One-sample t-test

1.1 One-Sample t-test 1

Use Cadsoft data.

Data for demo

# Load the Cadsoft sample (first row holds the column headers) and give the
# first column a clean name before previewing the first rows.
cadsoft <- read.csv(file = "Eg_7-2.csv")
colnames(cadsoft)[1] <- "Customer"

head(cadsoft, n = 6L)
##   Customer Time
## 1        1   20
## 2        2   12
## 3        3   15
## 4        4   11
## 5        5   22
## 6        6    6

H0: mean response time >= 25

H1: mean response time < 25

# Lower-tailed one-sample t-test: is the mean of Time below 25?
t.test(x = cadsoft$Time, alternative = "less", mu = 25)
## 
##  One Sample t-test
## 
## data:  cadsoft$Time
## t = -1.0522, df = 43, p-value = 0.1493
## alternative hypothesis: true mean is less than 25
## 95 percent confidence interval:
##     -Inf 26.8475
## sample estimates:
## mean of x 
##  21.90909

H0: mean response time <= 25

H1: mean response time > 25

# Upper-tailed one-sample t-test: is the mean of Time above 25?
t.test(x = cadsoft$Time, alternative = "greater", mu = 25)
## 
##  One Sample t-test
## 
## data:  cadsoft$Time
## t = -1.0522, df = 43, p-value = 0.8507
## alternative hypothesis: true mean is greater than 25
## 95 percent confidence interval:
##  16.97068      Inf
## sample estimates:
## mean of x 
##  21.90909

H0: mean response time = 25

H1: mean response time <> 25

# Two-sided one-sample t-test: does the mean of Time differ from 25?
t.test(x = cadsoft$Time, alternative = "two.sided", mu = 25)
## 
##  One Sample t-test
## 
## data:  cadsoft$Time
## t = -1.0522, df = 43, p-value = 0.2986
## alternative hypothesis: true mean is not equal to 25
## 95 percent confidence interval:
##  15.98474 27.83344
## sample estimates:
## mean of x 
##  21.90909

1.2 One-Sample t-test 2

Another example.

Data for demo

# Load the vacation survey (first row holds the column headers), give the
# first column a clean name, and preview the first rows.
vacation <- read.csv(file = "Eg_7-6.csv")
colnames(vacation)[1] <- "Age"
head(vacation, n = 6L)
##   Age Gender Relationship.Status Vacations.per.Year Number.of.Children
## 1  24   Male             Married                  2                  0
## 2  26 Female             Married                  4                  0
## 3  28   Male             Married                  2                  2
## 4  33   Male             Married                  4                  0
## 5  45   Male             Married                  2                  0
## 6  49   Male             Married                  1                  2

H0: Age = 35

H1: Age <> 35

# Two-sided one-sample t-test: does the mean Age differ from 35?
t.test(x = vacation$Age, alternative = "two.sided", mu = 35)
## 
##  One Sample t-test
## 
## data:  vacation$Age
## t = 2.7283, df = 33, p-value = 0.01012
## alternative hypothesis: true mean is not equal to 35
## 95 percent confidence interval:
##  35.93485 41.41809
## sample estimates:
## mean of x 
##  38.67647

Test for normality.

# Shapiro-Wilk test of the null hypothesis that Age is normally distributed.
shapiro.test(x = vacation$Age)
## 
##  Shapiro-Wilk normality test
## 
## data:  vacation$Age
## W = 0.93422, p-value = 0.04158

2. Two-Sample t-test

Use purchase orders data.

Data for demo

2.1 Prepare data

# Load the purchase-orders data (first row holds the column headers), give the
# first column a clean name, and preview the first rows.
supplier <- read.csv(file = "Eg_7-9.csv")
colnames(supplier)[1] <- "Supplier"
head(supplier, n = 6L)
##             Supplier Order.No. Item.No.   Item.Description Item.Cost Quantity
## 1   Hulkey Fasteners  Aug11001     1122 Airframe fasteners    $4.25   19,500 
## 2      Alum Sheeting  Aug11002     1243 Airframe fasteners    $4.25   10,000 
## 3 Fast-Tie Aerospace  Aug11003     5462 Shielded Cable/ft.    $1.05   23,000 
## 4 Fast-Tie Aerospace  Aug11004     5462 Shielded Cable/ft.    $1.05   21,500 
## 5      Steelpin Inc.  Aug11005     5319 Shielded Cable/ft.    $1.10   17,500 
## 6 Fast-Tie Aerospace  Aug11006     5462 Shielded Cable/ft.    $1.05   22,500 
##   Cost.per.order A.P.Terms..Months. Order.Date Arrival.Date
## 1    $82,875.00                  30   08/05/11     08/13/11
## 2    $42,500.00                  30   08/08/11     08/14/11
## 3    $24,150.00                  30   08/10/11     08/15/11
## 4    $22,575.00                  30   08/15/11     08/22/11
## 5    $19,250.00                  30   08/20/11     08/31/11
## 6    $23,625.00                  30   08/20/11     08/26/11

Convert date fields.

library(lubridate)
## Warning: package 'lubridate' was built under R version 4.0.5
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Parse the month/day/two-digit-year strings (e.g. "08/05/11") directly into
# Date values. lubridate::mdy() already returns class Date, so no extra
# as.Date() step is required; note that as.Date()'s `format` argument only
# applies to character input and would be ignored for parsed date-times.
supplier$Order.Date <- lubridate::mdy(supplier$Order.Date)
supplier$Arrival.Date <- lubridate::mdy(supplier$Arrival.Date)

# Confirm that both date columns are now of class Date.
str(supplier)
## 'data.frame':    94 obs. of  10 variables:
##  $ Supplier          : chr  "Hulkey Fasteners" "Alum Sheeting" "Fast-Tie Aerospace" "Fast-Tie Aerospace" ...
##  $ Order.No.         : chr  "Aug11001" "Aug11002" "Aug11003" "Aug11004" ...
##  $ Item.No.          : int  1122 1243 5462 5462 5319 5462 4312 7258 6321 5462 ...
##  $ Item.Description  : chr  "Airframe fasteners" "Airframe fasteners" "Shielded Cable/ft." "Shielded Cable/ft." ...
##  $ Item.Cost         : chr  " $4.25 " " $4.25 " " $1.05 " " $1.05 " ...
##  $ Quantity          : chr  " 19,500 " " 10,000 " " 23,000 " " 21,500 " ...
##  $ Cost.per.order    : chr  " $82,875.00 " " $42,500.00 " " $24,150.00 " " $22,575.00 " ...
##  $ A.P.Terms..Months.: int  30 30 30 30 30 30 30 45 30 30 ...
##  $ Order.Date        : Date, format: "2011-08-05" "2011-08-08" ...
##  $ Arrival.Date      : Date, format: "2011-08-13" "2011-08-14" ...

Compute lead time.

# Lead time = whole days elapsed between placing an order and its arrival,
# stored as a plain numeric column.
days_in_transit <- difftime(supplier$Arrival.Date, supplier$Order.Date,
                            units = "days")
supplier[["lead"]] <- as.numeric(days_in_transit)
rm(days_in_transit)
head(supplier, n = 6L)
##             Supplier Order.No. Item.No.   Item.Description Item.Cost Quantity
## 1   Hulkey Fasteners  Aug11001     1122 Airframe fasteners    $4.25   19,500 
## 2      Alum Sheeting  Aug11002     1243 Airframe fasteners    $4.25   10,000 
## 3 Fast-Tie Aerospace  Aug11003     5462 Shielded Cable/ft.    $1.05   23,000 
## 4 Fast-Tie Aerospace  Aug11004     5462 Shielded Cable/ft.    $1.05   21,500 
## 5      Steelpin Inc.  Aug11005     5319 Shielded Cable/ft.    $1.10   17,500 
## 6 Fast-Tie Aerospace  Aug11006     5462 Shielded Cable/ft.    $1.05   22,500 
##   Cost.per.order A.P.Terms..Months. Order.Date Arrival.Date lead
## 1    $82,875.00                  30 2011-08-05   2011-08-13    8
## 2    $42,500.00                  30 2011-08-08   2011-08-14    6
## 3    $24,150.00                  30 2011-08-10   2011-08-15    5
## 4    $22,575.00                  30 2011-08-15   2011-08-22    7
## 5    $19,250.00                  30 2011-08-20   2011-08-31   11
## 6    $23,625.00                  30 2011-08-20   2011-08-26    6

Filter data for Suppliers Alum Sheeting and Durrable Products (spelled "Durrable" in the data set).

# Keep only the orders from the two suppliers being compared, then tabulate
# how many orders each of them contributes.
supplier_alum_durrable <- supplier[
  supplier$Supplier %in% c("Alum Sheeting", "Durrable Products"),
]

table(supplier_alum_durrable$Supplier)
## 
##     Alum Sheeting Durrable Products 
##                 8                13

2.2 Two-sided (two-tailed) test

H0: alum - durrable = 0

H1: alum - durrable <> 0

Assume unequal variances

# Two-sided two-sample t-test of lead time by supplier. var.equal is left at
# its default (FALSE), so R runs the Welch (unequal-variance) version.
t.test(supplier_alum_durrable$lead ~ supplier_alum_durrable$Supplier)
## 
##  Welch Two Sample t-test
## 
## data:  supplier_alum_durrable$lead by supplier_alum_durrable$Supplier
## t = 3.828, df = 9.5306, p-value = 0.003636
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.8598894 3.2939567
## sample estimates:
##     mean in group Alum Sheeting mean in group Durrable Products 
##                        7.000000                        4.923077

Assume equal variances

# Two-sided two-sample t-test of lead time by supplier; var.equal = TRUE
# requests the classic pooled-variance test instead of the Welch default.
t.test(supplier_alum_durrable$lead ~ supplier_alum_durrable$Supplier,
       var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  supplier_alum_durrable$lead by supplier_alum_durrable$Supplier
## t = 4.4044, df = 19, p-value = 0.0003046
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.089955 3.063891
## sample estimates:
##     mean in group Alum Sheeting mean in group Durrable Products 
##                        7.000000                        4.923077

2.3 One-sided (one-tailed) tests

H0: Alum Sheeting >= Durrable Products.

H1: Alum Sheeting < Durrable Products.

Assuming unequal variance by default.

# One-sided Welch test. With a formula, the difference is the first group
# minus the second in the printed estimates (Alum Sheeting - Durrable
# Products), so "less" tests whether that difference is below 0.
t.test(supplier_alum_durrable$lead ~ supplier_alum_durrable$Supplier,
       alternative = "less")
## 
##  Welch Two Sample t-test
## 
## data:  supplier_alum_durrable$lead by supplier_alum_durrable$Supplier
## t = 3.828, df = 9.5306, p-value = 0.9982
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##      -Inf 3.065242
## sample estimates:
##     mean in group Alum Sheeting mean in group Durrable Products 
##                        7.000000                        4.923077

Testing for a specific mean difference, say 2.

H0: Alum Sheeting >= Durrable Products + 2

H1: Alum Sheeting < Durrable Products + 2

Assuming unequal variance by default.

# One-sided Welch test against a hypothesised difference of 2: mu = 2 moves
# the null value, so "less" tests whether (Alum Sheeting - Durrable Products)
# is below 2.
t.test(supplier_alum_durrable$lead ~ supplier_alum_durrable$Supplier, 
       alternative = "less", mu = 2)
## 
##  Welch Two Sample t-test
## 
## data:  supplier_alum_durrable$lead by supplier_alum_durrable$Supplier
## t = 0.14178, df = 9.5306, p-value = 0.5549
## alternative hypothesis: true difference in means is less than 2
## 95 percent confidence interval:
##      -Inf 3.065242
## sample estimates:
##     mean in group Alum Sheeting mean in group Durrable Products 
##                        7.000000                        4.923077

H0: Alum Sheeting <= Durrable Products.

H1: Alum Sheeting > Durrable Products.

Assuming unequal variance by default.

# One-sided Welch test: "greater" tests whether the difference
# (Alum Sheeting - Durrable Products) in mean lead time is above 0.
t.test(supplier_alum_durrable$lead ~ supplier_alum_durrable$Supplier,
       alternative = "greater")
## 
##  Welch Two Sample t-test
## 
## data:  supplier_alum_durrable$lead by supplier_alum_durrable$Supplier
## t = 3.828, df = 9.5306, p-value = 0.001818
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  1.088604      Inf
## sample estimates:
##     mean in group Alum Sheeting mean in group Durrable Products 
##                        7.000000                        4.923077

Testing for a specific mean difference, say 3.

H0: Alum Sheeting <= Durrable Products + 3

H1: Alum Sheeting > Durrable Products + 3

Assuming unequal variance by default.

# One-sided Welch test against a hypothesised difference of 3: mu = 3 moves
# the null value, so "greater" tests whether (Alum Sheeting - Durrable
# Products) is above 3.
t.test(supplier_alum_durrable$lead ~ supplier_alum_durrable$Supplier,
       alternative = "greater", mu = 3)
## 
##  Welch Two Sample t-test
## 
## data:  supplier_alum_durrable$lead by supplier_alum_durrable$Supplier
## t = -1.7013, df = 9.5306, p-value = 0.9394
## alternative hypothesis: true difference in means is greater than 3
## 95 percent confidence interval:
##  1.088604      Inf
## sample estimates:
##     mean in group Alum Sheeting mean in group Durrable Products 
##                        7.000000                        4.923077

3. Paired-samples t-test

3.1 load data

Use pile foundation data

# Load the pile-foundation measurements (first row holds the column headers)
# and preview the first rows, including the raw column names as read.
pf <- read.csv(file = "pile_foundation.csv")
head(pf, n = 6L)
##   ï..Pile_Number Pile_Length_Estimated Pile_Length_Actual
## 1              1                 10.58              18.58
## 2              2                 10.58              18.58
## 3              3                 10.58              18.58
## 4              4                 10.58              18.58
## 5              5                 10.58              28.58
## 6              6                 10.58              26.58

3.2 Rename fields

# Overwrite the first three column names so the garbled first header read
# from the file is replaced by a clean identifier.
colnames(pf)[1:3] <- c(
  "Pile_Number",
  "Pile_Length_Estimated",
  "Pile_Length_Actual"
)
head(pf, n = 6L)
##   Pile_Number Pile_Length_Estimated Pile_Length_Actual
## 1           1                 10.58              18.58
## 2           2                 10.58              18.58
## 3           3                 10.58              18.58
## 4           4                 10.58              18.58
## 5           5                 10.58              28.58
## 6           6                 10.58              26.58

3.3 Paired samples t-test

H0: Pile_Length_Estimated = Pile_Length_Actual

H1: Pile_Length_Estimated <> Pile_Length_Actual

# Paired t-test on the per-pile differences (Estimated - Actual); the default
# alternative is two-sided.
t.test(
  x = pf$Pile_Length_Estimated,
  y = pf$Pile_Length_Actual,
  paired = TRUE
)
## 
##  Paired t-test
## 
## data:  pf$Pile_Length_Estimated and pf$Pile_Length_Actual
## t = -10.912, df = 310, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -7.528856 -5.228508
## sample estimates:
## mean of the differences 
##               -6.378682

Assumptions

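Test for normality. Note that the paired t-test assumes the paired differences (Estimated - Actual) are approximately normally distributed, so a check on the differences is the most relevant one; the per-variable tests below serve only as a rough screen.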

# Shapiro-Wilk normality test on the estimated pile lengths.
shapiro.test(x = pf$Pile_Length_Estimated)
## 
##  Shapiro-Wilk normality test
## 
## data:  pf$Pile_Length_Estimated
## W = 0.92995, p-value = 6.34e-11
# Shapiro-Wilk normality test on the actual pile lengths.
shapiro.test(x = pf$Pile_Length_Actual)
## 
##  Shapiro-Wilk normality test
## 
## data:  pf$Pile_Length_Actual
## W = 0.96387, p-value = 5.459e-07