---
title: "SQA AH Statistics Exam Papers, using R"
output: html_document
date: "2023-07-19"
---
INSTRUCTIONS
Expand (or Collapse) the code chunks by clicking on the grey triangle to the right of the line number, or use the menu item: Edit > Folding > Expand All or Collapse All
Locate the code chunk you want from the Year and Question number
Run any code chunk by clicking on the green triangle 'play button' in the top right of each section of code
```{r 2016 Question 1}
# plant heights from the question
heights <- c(26, 15, 28, 23, 25, 15, 16, 20, 22, 27, 17, 30, 48)
# boxplot display; las expects an integer code 0-3 (1 = horizontal axis
# labels), so use 1 rather than relying on TRUE being coerced to 1
boxplot(x = heights,
        horizontal = TRUE,
        las = 1)
# stem-and-leaf display
stem(heights)
```
```{r 2016 Question 3}
# summary statistics quoted in the question
n <- 12
sample_mean <- 147.8   # renamed from `mean`, which shadowed base::mean()
sample_st_dev <- 2.379
# create a simulated data set with the required statistics:
# scale(1:n) standardises to mean 0, sd 1, which we then rescale
simulated_weights <- scale(1:n) * sample_st_dev + sample_mean
# one-sample t-test of H0: mu = 150 against H1: mu < 150
t.test(simulated_weights,
       mu = 150,
       alternative = "less")
```
```{r 2016 Question 4b}
# summary statistics from the quoted totals
sample_size <- 20
sample_mean <- 142 / sample_size
sample_st_dev <- sqrt((1120.16 - 142^2 / sample_size) / (sample_size - 1))
# By the CLT the sample mean is ~ N(mu, sigma^2 / n);
# take the upper tail of that distribution directly for P(X-bar > 8.1)
pnorm(8.1,
      sample_mean,
      sample_st_dev / sqrt(sample_size),
      lower.tail = FALSE)
```
```{r 2016 Question 5(a)(i)}
# recorded crop yields
yields <- c(4.00, 4.50, 4.75, 4.88, 4.99, 5.10, 5.00, 5.25, 5.63)
# a single t.test call reports both the hypothesis test result and the
# 90% confidence interval for the mean yield
t.test(yields,
       alternative = "two.sided",
       mu = 4.75,
       conf.level = 0.90)
```
```{r 2016 Question 8(a)}
# 75 of 100 recovered in the first group, 65 of 100 in the second
recoveries <- c(75, 65)
patients <- c(100, 100)
# two-sample test of proportions; Yates' continuity correction is switched
# off so the statistic matches the hand calculation
prop.test(recoveries,
          patients,
          alternative = "greater",
          correct = FALSE)
```
```{r 2016 Question 9(b)(c)}
mu <- 4
# P(X > mu + 2 * sd) where X ~ Po(mu); a Poisson has sd = sqrt(mu)
upper_cutoff <- mu + 2 * sqrt(mu)
ppois(upper_cutoff,
      lambda = mu,
      lower.tail = FALSE)
# approximate P(T < 140) where T ~ Po(152), via the normal approximation
# N(152, 152) with a 0.5 continuity correction
pnorm(139.5,
      mean = 152,
      sd = sqrt(152))
```
```{r 2016 Question 10(b)}
y_values <- c(1, 2)
y_probabilities <- c(2/5, 3/5)
# E(Y) = sum of y * P(Y = y)
expected_y <- sum(y_values * y_probabilities)
expected_y
# Var(Y) = E(Y^2) - E(Y)^2
sum(y_values^2 * y_probabilities) - expected_y^2
```
```{r 2016 Question 11(a)(b)}
cross <- c(23.5, 12.0, 21.0, 20.1, 22.0, 21.5, 22.1, 20.4, 18.3, 21.1, 21.0, 12.0)
self <- c(17.4, 20.4, 20.0, 20.1, 20.0, 18.6, 18.6, 15.3, 16.5, 18.0, 18.0, 18.0)
# paired design: analyse the within-pair differences
differences <- cross - self
t.test(differences,
       mu = 0,
       alternative = "greater")
# drop zero differences before the Wilcoxon signed-rank test
updated_differences <- differences[differences != 0]
wilcox.test(x = updated_differences,
            mu = 0, # the command uses mu rather than median
            alternative = "greater")
# note: the V in the Wilcoxon output is *not* the minimum rank sum
```
```{r 2017 Question 2}
x_values = c(20, 30, 40, 50, 60)
x_probabilities = c(1/5, 1/5, 1/5, 1/5, 1/5)
# mean of x: E(X) = sum of x * P(X = x)
sum(x_values * x_probabilities)
# variance of x: Var(X) = E(X^2) - E(X)^2
sum(x_values^2 * x_probabilities) - (sum(x_values * x_probabilities))^2
```
```{r 2017 Question 3(b)}
# summary statistics quoted in the question
pianists_n <- 16
pianists_mean <- 77
pianists_st_dev <- 10
violinists_n <- 14
violinists_mean <- 82
violinists_st_dev <- 8
# scale(1:n) standardises to mean 0 and sd 1, so rescaling produces
# simulated samples whose statistics match the quoted values exactly
simulated_pianist_marks <- scale(1:pianists_n) * pianists_st_dev + pianists_mean
simulated_violinist_marks <- scale(1:violinists_n) * violinists_st_dev + violinists_mean
# pooled two-sample t-test of H1: pianist mean < violinist mean
t.test(simulated_pianist_marks,
       simulated_violinist_marks,
       paired = FALSE,
       var.equal = TRUE, # pooled variance
       alternative = "less")
```
```{r 2017 Question 7(a)}
# summary statistics quoted in the question
trees_n <- 18
trees_mean <- 7.46
trees_st_dev <- 1.46
# simulated sample with exactly the quoted mean and standard deviation
simulated_trees_per_hectare <- scale(1:trees_n) * trees_st_dev + trees_mean
# one call gives both the hypothesis test and the 95% confidence interval
t.test(simulated_trees_per_hectare,
       alternative = "two.sided",
       mu = 5.87,
       conf.level = 0.95)
```
```{r 2017 Question 8}
# (a) W ~ Po(4) ... P(2 <= W <= 6 ) = P(W <= 6) - P(W <= 1)
ppois(q = 6, lambda = 4) - ppois(q = 1, lambda = 4)
# equivalent to using the diff(erence) command...
diff(ppois(q = c(1, 6), lambda = 4))
# (b) X ~ N(4, 4) .. P(2 < X < 6 ) -- the 4 is the variance, so sd = 2
pnorm(q = 6, mean = 4, sd = 2) - pnorm(q = 2, mean = 4, sd = 2)
# equivalent to using the diff(erence) command...
diff(pnorm(q = c(2, 6), mean = 4, sd = 2))
# (c) Y ~ U(6, 10) .. P(8 - sqrt(4/3) < Y < 8 + sqrt(4/3));
# mean 8 = (a+b)/2 and variance 4/3 = (b-a)^2/12, i.e. within one sd of the mean
punif(q = 8 + sqrt(4/3), min = 6, max = 10) - punif(q = 8 - sqrt(4/3), min = 6, max = 10)
# equivalent to using the diff(erence) command...
diff(punif(q = 8 + c(-1, 1) * sqrt(4/3), min = 6, max = 10))
```
```{r 2017 Question 9(c)}
# the package called 'BSDA' is needed for the z.test function
# NOTE(review): install.packages() at run time needs internet access;
# pre-installing BSDA is preferable in a managed setup
if(!require(BSDA)){install.packages("BSDA"); library(BSDA)}
# summary statistics quoted in the question
lead_n = 25
lead_mean = 174.5
lead_st_dev = 23.1
# create simulated data sets with correct statistics:
# scale(1:n) standardises to mean 0, sd 1, so rescaling gives a sample whose
# mean and sd match the quoted values exactly
simulated_lead_concentration = scale(1:lead_n) * lead_st_dev + lead_mean
# one-sample z-test of H0: mu = 165.6 vs H1: mu > 165.6, sigma assumed known
z.test(x = simulated_lead_concentration,
sigma.x = lead_st_dev,
mu = 165.6,
alternative = "greater")
```
```{r 2017 Question 10(c)}
# 2x2 contingency table of recaptured vs not recaptured, one row per group;
# renamed from `table`, which shadowed base::table()
observed_counts <- t(data.frame(
  recaptured = c(58, 51),
  not_recaptured = c(255, 182)
))
output <- chisq.test(x = observed_counts,
                     correct = FALSE) # suppress Yates' continuity correction
output # displays test statistic, degrees of freedom and p-value
print("Expected frequencies:")
output$expected # displays table of expected frequencies
```
```{r 2017 Question 11(a)}
in_favour <- 61
sample_size <- 100
# One-sample confidence interval via the 2-sample prop.test syntax, with a
# dummy second sample of zero successes so the generated interval agrees
# with the hand-calculated answer.
prop.test(c(in_favour, 0),
          c(sample_size, sample_size),
          alternative = "two.sided",
          conf.level = 0.99,
          correct = FALSE) # suppress Yates' continuity correction
```
```{r 2018 Question 1(a)}
mu <- 2
# P(X = 3) where X ~ Po(2): the Poisson probability mass at 3
dpois(3, lambda = mu)
```
```{r 2018 Question 3(a)}
hab_1 <- c(313, 342, 366, 350, 376, 438, 400)
hab_2 <- c(83, 94, 91, 86, 102, 113, 98)
# scatterplot, for interest
plot(x = hab_1,
     y = hab_2)
# The question asks for a test on beta (the slope); a test on rho (the
# correlation) gives an identical t statistic, df and p-value.
cor.test(x = hab_1,
         y = hab_2,
         alternative = "two.sided",
         method = "pearson")
```
```{r 2018 Question 4}
# (a) P(X > 10100) where X ~ N(10000, 250^2)
pnorm(q = 10100,
mean = 10000,
sd = 250,
lower.tail = FALSE)
# (b) find x such that P(X > x) = 0.90, where X ~ N(10000, 250^2);
# lower.tail = FALSE makes qnorm return the upper-tail quantile directly
qnorm(p = 0.90,
mean = 10000,
sd = 250,
lower.tail = FALSE)
# (c) P(T < 3000) where T ~ N(2975, 157) -- 157 is the variance, so sd = sqrt(157)
pnorm(q = 3000,
mean = 2975,
sd = sqrt(157),
lower.tail = TRUE)
```
```{r 2018 Question 5(c)}
# the package called 'BSDA' is needed for the z.test function
# NOTE(review): install.packages() at run time needs internet access;
# pre-installing BSDA is preferable in a managed setup
if(!require(BSDA)){install.packages("BSDA"); library(BSDA)}
# summary statistics quoted in the question
pebble_n = 100
pebble_mean = 119.4
pebble_st_dev = 21.6
# create simulated data set with correct statistics:
# scale(1:n) standardises to mean 0, sd 1, so rescaling gives a sample whose
# mean and sd match the quoted values exactly
simulated_pebble_sizes = scale(1:pebble_n) * pebble_st_dev + pebble_mean
# the output from a z.test gives the confidence interval (sigma assumed known)
z.test(x = simulated_pebble_sizes,
sigma.x = pebble_st_dev ,
alternative = "two.sided",
conf.level = 0.90)
```
```{r 2018 Question 6}
m <- 4
n <- 7
# all choose(11, 4) = 330 ways of choosing which 4 ranks form the first sample
all_combinations <- combn(m + n, m)
# rank sum of each combination (one column per combination)
rank_sums <- colSums(all_combinations)
# flag the combinations whose rank sum is at most 14
combinations_needed <- rank_sums <= 14
# display the combinations that meet the criterion
all_combinations[, combinations_needed]
```
```{r 2018 Question 7(a)}
number_of_failed_bulbs <- c(0, 1, 2, 3, 4, 5, 6)
observed_frequencies <- c(59, 38, 19, 3, 1, 0, 0)
# generate binomial model probabilities for B(6, 0.1)
probabilities <- dbinom(x = number_of_failed_bulbs,
                        size = 6,
                        prob = 0.1)
# conduct the goodness-of-fit hypothesis test
output <- chisq.test(x = observed_frequencies,
                     p = probabilities)
# display expected frequencies, test statistic, degrees of freedom, p-value
print("Expected frequencies:")
output$expected
output
# Inspection of the expected frequencies shows that too many are less than 5,
# so we combine categories by merging the last 5 into one.
# 'combine' merges the final n_merge entries of a numeric vector into a single
# summed entry (parameters renamed from `list`/`col`, which shadowed the base
# functions list() and col())
combine <- function(values, n_merge) {
  n_keep <- length(values) - n_merge                   # entries left unchanged
  head_part <- values[seq_len(n_keep)]                 # leading entries, unchanged
  tail_sum <- sum(values[(n_keep + 1):length(values)]) # merged tail total
  c(head_part, tail_sum)                               # splice back together
}
# combine categories before re-applying the goodness-of-fit test
categories_to_combine <- 5
new_observed_frequencies <- combine(observed_frequencies, categories_to_combine)
new_probabilities <- combine(probabilities, categories_to_combine)
# goodness-of-fit test with the combined-category data
output <- chisq.test(x = new_observed_frequencies,
                     p = new_probabilities)
print("Expected frequencies:")
output$expected
output
```
```{r 2018 Question 8(a)}
accidents <- 18
sample_size <- 100
# one-sample proportion test of H0: p = 0.119 vs H1: p > 0.119;
# the p-value is the required output, the rest can be ignored
prop.test(x = accidents,
          n = sample_size,
          p = 0.119,
          alternative = "greater",
          correct = FALSE) # suppress Yates' continuity correction
```
```{r 2018 Question 11(b)}
welfare <- c(19, 22, 39, 42, 59, 70, 79, 83, 117, 119, 140, 142, 144, 193, 204, 210, 236, 290)
employment <- c(64.7, 51.1, 47.0, 54.0, 42.6, 48.3, 36.2, 47.9, 64.7, 54.9, 58.6, 69.9, 54.9, 61.9, 69.3, 76.0, 74.1, 78.7)
# least-squares fit of employment on welfare, stored as 'model'
model <- lm(employment ~ welfare)
# display the fitted linear model
model
# fitted value and residual for the 5th observation (Czech Republic)
model$fitted.values[5]
model$residuals[5]
```
```{r 2018 Question 12}
location_A <- c(13.9, 11.0, 11.3, 12.5, 13.4, 11.5, 14.5, 13.4, 12.2, 11.3)
location_B <- c(14.4, 12.2, 17.4, 12.0, 16.1, 10.9, 19.4, 10.3)
# pooled two-sample t-test comparing the two locations
t.test(location_A,
       location_B,
       paired = FALSE,
       var.equal = TRUE, # pooled variance
       alternative = "two.sided")
```
```{r 2019 Question 2(c)}
# 2x2 contingency table of colour-blindness by group, one row per outcome;
# renamed from `table`, which shadowed base::table()
observed_counts <- t(data.frame(
  colour_blind_no = c(450, 506),
  colour_blind_yes = c(40, 4)
))
output <- chisq.test(x = observed_counts,
                     correct = FALSE) # suppress Yates' continuity correction
output # displays test statistic, degrees of freedom and p-value
print("Expected frequencies:")
output$expected # displays table of expected frequencies
```
```{r 2019 Question 6}
energies = c(2725, 2650, 2421, 2793, 2239, 3225, 2156, 2692, 2369, 2725)
# the t.test command delivers both hypothesis test result and confidence interval
# 95% confidence interval for the mean energy
t.test(energies,
alternative = "two.sided",
conf.level = 0.95)
# 99% confidence interval -- same centre, but wider than the 95% interval
t.test(energies,
alternative = "two.sided",
conf.level = 0.99)
```
```{r 2019 Question 8(b)(c)}
# summary quantities quoted in the question
Sgg <- 531.5676
Sgc <- 555.0811
Scc <- 1731.2973
# coefficient of determination: r^2 = Sgc^2 / (Sgg * Scc)
coeff_determination <- Sgc^2 / (Sgg * Scc)
coeff_determination
# r takes the sign of Sgc (positive here)
r <- sqrt(coeff_determination)
# t statistic for H0: rho = 0 with n = 37 observations
t <- r * sqrt(37 - 2) / sqrt(1 - r^2)
# two-sided p-value; abs() makes doubling the upper tail correct for
# negative correlations as well (here t > 0, so the value is unchanged)
p_value <- 2 * pt(q = abs(t),
                  df = 37 - 2,
                  lower.tail = FALSE)
p_value
```
```{r 2019 Question 9}
marks <- c(86, 80, 78, 73, 69, 65, 62, 61, 59, 58, 54, 51, 49, 47, 43, 40, 38, 37, 35, 32, 29, 29)
# differences from the hypothesised median of 65
differences <- marks - 65
# drop zero differences before the Wilcoxon signed-rank test
updated_differences <- differences[differences != 0]
wilcox.test(x = updated_differences,
            mu = 0, # the command uses mu rather than median
            alternative = "less")
```
```{r 2019 Question 10(a)}
# the package called 'BSDA' is needed for the z.test function
# NOTE(review): install.packages() at run time needs internet access;
# pre-installing BSDA is preferable in a managed setup
if(!require(BSDA)){install.packages("BSDA"); library(BSDA)}
# summary statistics quoted in the question
wingspan_n = 25
wingspan_mean = 48.3
wingspan_st_dev = 4
# create simulated data set with correct statistics:
# scale(1:n) standardises to mean 0, sd 1, so rescaling gives a sample whose
# mean and sd match the quoted values exactly
simulated_wingspans = scale(1:wingspan_n) * wingspan_st_dev + wingspan_mean
# one-sample z-test of H0: mu = 50 vs H1: mu < 50, sigma assumed known
z.test(x = simulated_wingspans,
sigma.x = wingspan_st_dev,
mu = 50,
alternative = "less")
```
```{r 2021 Paper 2 Question 1}
s_values = c(2, 4, 6, 8, 10)
# probabilities proportional to the values themselves
s_probabilities = s_values / sum(s_values)
# mean of s: E(S) = sum of s * P(S = s)
sum(s_values * s_probabilities)
# variance of s: Var(S) = E(S^2) - E(S)^2
sum(s_values^2 * s_probabilities) - (sum(s_values * s_probabilities))^2
```
```{r 2021 Paper 2 Question 3}
mites <- c(6, 8, 11, 13, 6, 14, 11, 9, 6, 7, 11, 8, 6, 14)
# differences from the hypothesised median of 7
differences <- mites - 7
# drop zero differences before the Wilcoxon signed-rank test
updated_differences <- differences[differences != 0]
wilcox.test(x = updated_differences,
            mu = 0, # the command uses mu rather than median
            alternative = "greater")
# note: the V in the Wilcoxon output is *not* the minimum rank sum
```
```{r 2021 Paper 2 Question 5}
# summary statistics quoted in the question
brand_A_n <- 11
brand_A_mean <- 54
brand_A_st_dev <- 5
brand_B_n <- 15
brand_B_mean <- 47
brand_B_st_dev <- 11
# scale(1:n) standardises to mean 0 and sd 1, so rescaling produces
# simulated samples whose statistics match the quoted values exactly
simulated_brand_A_times <- scale(1:brand_A_n) * brand_A_st_dev + brand_A_mean
simulated_brand_B_times <- scale(1:brand_B_n) * brand_B_st_dev + brand_B_mean
# pooled two-sample t-test comparing the two brands
t.test(simulated_brand_A_times,
       simulated_brand_B_times,
       paired = FALSE,
       var.equal = TRUE, # pooled variance
       alternative = "two.sided")
```
```{r 2021 Paper 2 Question 6(c)}
# the package called 'BSDA' is needed for the z.test function
# NOTE(review): install.packages() at run time needs internet access;
# pre-installing BSDA is preferable in a managed setup
if(!require(BSDA)){install.packages("BSDA"); library(BSDA)}
# summary statistics quoted in the question
time_n = 25
time_mean = 409
time_st_dev = 130
# create simulated data set with correct statistics:
# scale(1:n) standardises to mean 0, sd 1, so rescaling gives a sample whose
# mean and sd match the quoted values exactly
simulated_times = scale(1:time_n) * time_st_dev + time_mean
# the output from a z.test gives the confidence interval (sigma assumed known)
z.test(x = simulated_times,
sigma.x = time_st_dev ,
alternative = "two.sided",
conf.level = 0.95)
```
```{r 2021 Paper 2 Question 8}
# the package called 'BSDA' is needed for the z.test function
# NOTE(review): install.packages() at run time needs internet access;
# pre-installing BSDA is preferable in a managed setup
if(!require(BSDA)){install.packages("BSDA"); library(BSDA)}
# summary statistics quoted in the question
time_n = 50
time_mean = 16.1
time_st_dev = 2
# create simulated data set with correct statistics:
# scale(1:n) standardises to mean 0, sd 1, so rescaling gives a sample whose
# mean and sd match the quoted values exactly
simulated_times = scale(1:time_n) * time_st_dev + time_mean
# one-sample z-test of H0: mu = 15 vs H1: mu > 15, sigma assumed known;
# the output also gives the confidence interval
z.test(x = simulated_times,
sigma.x = time_st_dev,
mu = 15,
alternative = "greater",
conf.level = 0.95)
```
```{r 2021 Paper 2 Question 10(a)}
selling_beyond <- 13
sample_size <- 50
# One-sample confidence interval via the 2-sample prop.test syntax, with a
# dummy second sample of zero successes so the generated interval agrees
# with the hand-calculated answer.
prop.test(c(selling_beyond, 0),
          c(sample_size, sample_size),
          alternative = "two.sided",
          conf.level = 0.95,
          correct = FALSE) # suppress Yates' continuity correction
```
```{r 2021 Paper 2 Question 11(b)}
# the package called 'BSDA' is needed for the z.test function
# NOTE(review): install.packages() at run time needs internet access;
# pre-installing BSDA is preferable in a managed setup
if(!require(BSDA)){install.packages("BSDA"); library(BSDA)}
# summary statistics quoted in the question
group_B_n = 70
group_B_mean = 55.4
group_B_st_dev = 10.08
group_C_n = 60
group_C_mean = 51.8
group_C_st_dev = 10.49
# create simulated data sets with correct statistics:
# scale(1:n) standardises to mean 0, sd 1, so rescaling gives samples whose
# means and sds match the quoted values exactly
simulated_group_B_scores = scale(1:group_B_n) * group_B_st_dev + group_B_mean
simulated_group_C_scores = scale(1:group_C_n) * group_C_st_dev + group_C_mean
# two-sample z-test with both sigmas assumed known
z.test(x = simulated_group_B_scores,
y = simulated_group_C_scores,
sigma.x = group_B_st_dev,
sigma.y = group_C_st_dev,
alternative = "two.sided")
# for interest, by comparison, here is what the t-test would have been
t.test(simulated_group_B_scores,
simulated_group_C_scores,
paired = FALSE,
var.equal = TRUE, #this will pool the samples
alternative = "two.sided")
```
```{r 2022 Paper 2 Question 1}
# 2x2 contingency table of infection status by group, one row per outcome;
# renamed from `table`, which shadowed base::table()
observed_counts <- t(data.frame(
  infected_yes = c(76, 129),
  infected_no = c(399, 332)
))
output <- chisq.test(x = observed_counts,
                     correct = FALSE) # suppress Yates' continuity correction
output # displays test statistic, degrees of freedom and p-value
print("Expected frequencies:")
output$expected # displays table of expected frequencies
```
```{r 2022 Paper 2 Question 2}
# (b) P(X = 0) where X ~ Po(2.3)
dpois(0, lambda = 2.3)
# (c) P(X = 2 and Y = 2), X ~ Po(2.3), Y ~ Po(1.7): independent, so multiply
dpois(2, lambda = 2.3) * dpois(2, lambda = 1.7)
# (d) P(W > 5) where W ~ Po(4): strict inequality, so the upper tail beyond 5
ppois(5,
      lambda = 4,
      lower.tail = FALSE)
```
```{r 2022 Paper 2 Question 5(b)}
french <- c(67, 83, 71, 59, 49, 89, 42, 55, 77)
german <- c(64, 82, 71, 62, 42, 85, 39, 50, 75)
# paired data: test the mean within-student difference against zero
differences <- french - german
t.test(differences,
       mu = 0,
       alternative = "two.sided")
```
```{r 2022 Paper 2 Question 7(b)}
successes <- 14
sample_size <- 50
# One-sample confidence interval via the 2-sample prop.test syntax, with a
# dummy second sample of zero successes so the generated interval agrees
# with the hand-calculated answer.
prop.test(c(successes, 0),
          c(sample_size, sample_size),
          alternative = "two.sided",
          conf.level = 0.99,
          correct = FALSE) # suppress Yates' continuity correction
```
```{r 2022 Paper 2 Question 9(b)}
# the package called 'BSDA' is needed for the z.test function
# NOTE(review): install.packages() at run time needs internet access;
# pre-installing BSDA is preferable in a managed setup
if(!require(BSDA)){install.packages("BSDA"); library(BSDA)}
# summary statistics quoted in the question (variance 103.25)
width_n = 45
width_mean = 52.6
width_st_dev = sqrt(103.25)
# create simulated data sets with correct statistics:
# scale(1:n) standardises to mean 0, sd 1, so rescaling gives a sample whose
# mean and sd match the quoted values exactly
simulated_widths = scale(1:width_n) * width_st_dev + width_mean
# one-sample z-test of H0: mu = 50 vs H1: mu > 50, sigma assumed known
z.test(x = simulated_widths,
sigma.x = width_st_dev,
mu = 50,
alternative = "greater")
```
```{r 2022 Paper 2 Question 10(a)}
# summary quantities quoted in the question
Sxx <- 278.61
Syy <- 10.95
Sxy <- 46.29
n <- 6
# product-moment correlation coefficient
r <- Sxy / sqrt(Sxx * Syy)
r
# t statistic for H0: rho = 0
t <- r * sqrt(n - 2) / sqrt(1 - r^2)
t
# two-sided p-value; abs() makes doubling the upper tail correct for
# negative correlations as well (here t > 0, so the value is unchanged)
p_value <- 2 * pt(q = abs(t),
                  df = n - 2,
                  lower.tail = FALSE)
p_value
```
```{r 2022 Paper 2 Question 11(a)}
adults <- c(1.3, 2.2, 1.5, 3.5, 0.3, 2.7, 3.5, 2.3, 2.9, 4.0)
juveniles <- c(1.1, 4.1, 1.7, 1.3, 0.7, 1.9, 2.9, 1.1, 2.8, 0.9)
# back-to-back stem-and-leaf diagrams are not built into base R;
# the 'aplpack' package supports them, but may need technical workarounds
stem(adults)
stem(juveniles)
# the Mann-Whitney procedure is wilcox.test with two unpaired samples
wilcox.test(x = adults,
            y = juveniles,
            paired = FALSE,
            alternative = "two.sided")
# note: the W in the output is *not* the rank sum used in the AH Stats course
```
```{r 2023 Paper 2 Question 2}
steps <- c(320, 310, 321, 304, 298, 328, 296, 307, 314, 295)
# differences from the hypothesised median of 300
differences <- steps - 300
# drop zero differences before the Wilcoxon signed-rank test
updated_differences <- differences[differences != 0]
wilcox.test(x = updated_differences,
            mu = 0, # the command uses mu rather than median
            alternative = "greater")
# note: the V in the Wilcoxon output is *not* the minimum rank sum
```
```{r 2023 Paper 2 Question 4}
observed_offspring <- c(78, 90, 152)
# model probabilities in the ratio 1:1:2
probabilities <- c(1, 1, 2) / 4
# goodness-of-fit hypothesis test against those probabilities
output <- chisq.test(x = observed_offspring,
                     p = probabilities)
# display expected frequencies, test statistic, degrees of freedom, p-value
print("Expected frequencies:")
output$expected
output
```
```{r 2023 Paper 2 Question 6}
# the package called 'BSDA' is needed for the z.test function
# NOTE(review): install.packages() at run time needs internet access;
# pre-installing BSDA is preferable in a managed setup
if(!require(BSDA)){install.packages("BSDA"); library(BSDA)}
# summary statistics reconstructed from the quoted totals
length_n = 75
length_mean = 3840 / 75
length_st_dev = sqrt((198240 - 3840^2/75) / (75 - 1))
# create simulated data sets with correct statistics:
# scale(1:n) standardises to mean 0, sd 1, so rescaling gives a sample whose
# mean and sd match the computed values exactly
simulated_lengths = scale(1:length_n) * length_st_dev + length_mean
# one-sample z-test of H0: mu = 50 vs H1: mu > 50, sigma assumed known
z.test(x = simulated_lengths,
sigma.x = length_st_dev,
mu = 50,
alternative = "greater")
# for interest, a single sample t-test would have given...
t.test(x = simulated_lengths,
mu = 50,
alternative = "greater")
```
```{r 2023 Paper 2 Question 8}
# sample size and correlation coefficient quoted in the question
n <- 25
r <- 0.652
# t statistic for H0: rho = 0
t <- r * sqrt(n - 2) / sqrt(1 - r^2)
t
# two-sided p-value; abs() makes doubling the upper tail correct for
# negative correlations as well (here t > 0, so the value is unchanged)
p_value <- 2 * pt(q = abs(t),
                  df = n - 2,
                  lower.tail = FALSE)
p_value
```
```{r 2023 Paper 2 Question 9(a)}
with_tracker <- c(5.1, 10, 10.8, 7.5, 6.2, 10.2, 5.4, 4.2, 8.1, 11.1, 10.2, 5.3)
without_tracker <- c(4, 9.5, 12, 5.5, 5.9, 11, 4.8, 3.5, 6.5, 11.5, 9.4, 5.1)
# paired design: test the mean within-pair difference against zero
differences <- with_tracker - without_tracker
t.test(differences,
       mu = 0,
       alternative = "greater")
```
```{r 2023 Paper 2 Question 10}
homeless <- 23312
sample_size <- 37878
# one-sample proportion test of H0: p = 0.624; only the p-value is needed,
# all other output can be ignored
prop.test(x = homeless,
          n = sample_size,
          p = 0.624,
          alternative = "two.sided",
          correct = FALSE) # suppress Yates' continuity correction
```
```{r 2023 Paper 2 Question 11}
# P(X < 17) = 0.1 and P(X < 24) = 0.95 each give a linear equation
# mu + qnorm(p) * sigma = x; write them as the system A %*% c(mu, sigma) = B
A <- rbind(c(1, qnorm(0.1)),
           c(1, qnorm(0.95)))
B <- c(17, 24)
# solve the system of equations to give mu and sigma
solve(A, B)
```
```{r 2023 Paper 2 Question 12(a)}
in_favour <- 55
sample_size <- 100
# One-sample confidence interval via the 2-sample prop.test syntax, with a
# dummy second sample of zero successes so the generated interval agrees
# with the hand-calculated answer.
prop.test(c(in_favour, 0),
          c(sample_size, sample_size),
          alternative = "two.sided",
          conf.level = 0.99,
          correct = FALSE) # suppress Yates' continuity correction
```