---
title: "SQA AH Statistics Exam Papers, using R"
output: html_document
date: "2024-05-03"
---
INSTRUCTIONS
Expand (or Collapse) the code chunks by clicking on the grey triangle to the right of the line number, or use the menu item: Edit > Folding > Expand All or Collapse All
Locate the code chunk you want from the Year and Question number
Run any code chunk by clicking on the green triangle 'play button' in the top right of each section of code
```{r 2024 Paper 1 Question 1(c)}
# Observed frequencies: after transposing, one row per decade and one
# column per category, which is the layout chisq.test() expects
table <- t(data.frame(
  decade_1980s = c(3, 8, 1, 21, 17),
  decade_1990s = c(2, 7, 5, 20, 16),
  decade_2000s = c(2, 5, 8, 31, 4),
  decade_2010s = c(2, 3, 14, 26, 5)
))
# chi-squared test of association;
# correct = FALSE suppresses Yates' continuity correction
output <- chisq.test(x = table,
correct = FALSE)
# show the test statistic, degrees of freedom and p-value
output
print("Expected frequencies:")
# expected frequencies under the null hypothesis of no association
output$expected
```
```{r 2024 Paper 1 Question 1(d)(i)}
# counts of 18-rated films and total films sampled, for the two groups
age_rating_18 <- c(37, 5)
total_all_films <- c(150, 50)
# pooled proportion across both samples
p <- sum(age_rating_18) / sum(total_all_films)
p # displays pooled proportion
# z statistic for a two-sample test of equal proportions
sample_props <- age_rating_18 / total_all_films
std_error <- sqrt(p * (1 - p) * sum(1 / total_all_films))
z <- diff(sample_props) / std_error
z # displays test statistic
```
```{r 2024 Paper 1 Question 2(b)}
standard <- c(7500, 8000, 2000, 550, 1250, 1000, 2250, 6800, 3400, 6300, 9100, 970, 1040, 670, 400)
new <- c(410, 250, 800, 1400, 7900, 7400, 1020, 6000, 920, 1420, 2700, 4200, 5200, 4100)
# wilcox.test() is R's implementation of the Mann-Whitney test
output <- wilcox.test(x = standard,
y = new,
paired = FALSE,
alternative = "two.sided")
output # displays Mann-Whitney output
# caution: the 'W' that R reports is *not* the rank sum used in AH Statistics;
# it is the rank sum's distance above the smallest achievable rank sum,
# where the smallest achievable rank sum is 0.5 * m * (m + 1) for the
# smaller sample size m
W_output <- output$statistic
m <- min(length(standard), length(new))
n <- max(length(standard), length(new))
# convert back to a rank sum (see top of page 16 in the Statistical
# Formulae and Tables booklet), then take the smaller of the rank sum
# and its reversed-ranks counterpart, as in the AH course
Wm <- 0.5 * m * (m + 1) + W_output
Wm_reversed_ranks <- m * (m + n + 1) - Wm
W <- min(Wm, Wm_reversed_ranks)
W # this is the value of W used in the AH Statistics course
```
```{r 2024 Paper 1 Question 2(c)}
month_3 <- c(7500, 8000, 2000, 550, 1250, 1000, 2250, 6800, 3400, 6300, 9100, 970, 1040, 670, 400)
month_6 <- c(7480, 8010, 2010, 560, 1210, 980, 2180, 6790, 3370, 6260, 9060, 960, 990, 640, 430)
# paired differences; the signed-rank test requires none of these to be zero
differences <- month_3 - month_6
differences # display differences, and check there are no zeros
# rank the differences by absolute size
ranks <- rank(abs(differences))
ranks # display ranks of absolute values of differences
wilcox.test(x = differences,
mu = 0, # the command uses mu rather than median
alternative = "two.sided",
exact = FALSE) # approximate p-value
# caution: the 'V' that R reports is *not* the minimum rank sum used
# in AH Statistics, so compute the two rank sums directly
W_negative <- sum(ranks[differences < 0])
W_negative # display rank sum of negative differences
W_positive <- sum(ranks[differences > 0])
W_positive # display rank sum of positive differences
W <- min(W_negative, W_positive)
W # rank sum used in AH Statistics course
```
```{r 2024 Paper 2 Question 1}
wild_birds = c(11, 27, 45, 63, 65, 70, 77, 79, 87, 88, 90, 95, 102, 130)
# boxplot display; horizontal = TRUE lays the box sideways and
# las keeps the axis labels horizontal
boxplot(x = wild_birds,
horizontal = TRUE,
las = TRUE)
# the boxplot shows one possible outlier below the lower fence
```
```{r 2024 Paper 2 Question 3(a)}
# calculate P(X = 9) where X ~ B(12, 0.88)
# P(X = 9) where X ~ B(12, 0.88): 12 trials, success probability 0.88
prob_exactly_9 <- dbinom(x = 9, size = 12, prob = 0.88)
prob_exactly_9 # display the probability
```
```{r 2024 Paper 2 Question 3(b)}
# calculate P(X > 36) using Normal Approximation
# P(X > 36) for X ~ B(48, 0.88), via the Normal approximation
n <- 48
p <- 0.88
normal_mean <- n * p                     # mean of the approximating Normal
normal_st_dev <- sqrt(n * p * (1 - p))   # its standard deviation
# continuity correction: P(X > 36) is approximated by P(Y > 36.5)
pnorm(q = 36.5,
mean = normal_mean,
sd = normal_st_dev,
lower.tail = FALSE) # upper-tail probability, i.e. P(Y > 36.5)
# for comparison, the exact value P(X >= 37) from the binomial itself
sum(dbinom(x = 37:48, # all values of x from 37 to 48 inclusive
size = 48,
prob = 0.88))
```
```{r 2024 Paper 2 Question 4}
observed_grades <- c(185, 170, 197, 163, 155)
# discrete uniform U(5): each of the five categories is equally likely
probabilities <- rep(1 / 5, times = 5)
# goodness-of-fit hypothesis test against those probabilities
output <- chisq.test(x = observed_grades,
p = probabilities)
# display expected frequencies, test statistic, degrees of freedom, p-value
print("Expected frequencies:")
output$expected
output
```
```{r 2024 Paper 2 Question 5(b)}
attempt <- c(2, 2, 3, 4, 4, 5, 9, 12, 14)
errors <- c(11, 10, 7, 6, 5, 4, 2, 1, 1)
# scatterplot of the raw data
plot(x = attempt,
y = errors)
# scatterplot after the reciprocal transformation of errors
plot(x = attempt,
y = 1 / errors)
# least-squares line fitted to the transformed data
model <- lm((1 / errors) ~ attempt)
model # display intercept and slope
# 95% prediction interval, on the transformed scale, for attempt number 7
prediction_interval <- predict(model,
newdata = data.frame(attempt = 7),
interval = "prediction",
level = 0.95)
prediction_interval # interval for 1/errors under the transformed model
# back-transform: taking reciprocals converts the interval to the 'errors' scale
1 / prediction_interval
```
```{r 2024 Paper 2 Question 6(a)}
work_properly <- 14
sample_size <- 20
# one-sample confidence interval via the two-sample syntax: giving the
# second "sample" zero successes makes the interval for the difference
# collapse to the interval for the single proportion, so the output agrees
# with hand-calculated answers
prop.test(x = c(work_properly, 0),
n = c(sample_size, sample_size),
alternative = "two.sided",
conf.level = 0.99,
correct = FALSE) # suppress Yates' continuity correction
```
```{r 2024 Paper 2 Question 8}
area_1_n <- 9
area_1_mean <- 38.7
area_1_st_dev <- 3.42
area_2_n <- 11
area_2_mean <- 36.9
area_2_st_dev <- 3.51
# helper: build a data set of size n having exactly the given mean and
# standard deviation (scale() standardises 1:n, then we rescale and shift)
make_sample <- function(n, mean, st_dev) scale(1:n) * st_dev + mean
simulated_area_1 <- make_sample(area_1_n, area_1_mean, area_1_st_dev)
simulated_area_2 <- make_sample(area_2_n, area_2_mean, area_2_st_dev)
# two-sample pooled t test, one-sided
t.test(simulated_area_1,
simulated_area_2,
paired = FALSE, # non-paired data
var.equal = TRUE, # this will pool the samples
alternative = "greater")
```
```{r 2024 Paper 2 Question 9(b)}
# average rate: 6000 events spread over 2400 intervals gives lambda = 2.5
mean_rate <- 6000 / 2400
# P(X = 0) where X ~ Po(2.5)
dpois(x = 0,
lambda = mean_rate)
```
```{r 2024 Paper 2 Question 9(c)}
# cumulative probabilities P(X <= x) for X ~ Po(2.5), x = 0, 1, ..., 7;
# the first value to exceed 0.90 answers the question
cumulative_probs <- ppois(q = 0:7,
lambda = 2.5)
cumulative_probs
# confirm the two values either side of the 0.90 threshold:
# P(X <= 4) where X ~ Po(2.5)
ppois(q = 4,
lambda = 2.5)
# P(X <= 5) where X ~ Po(2.5)
ppois(q = 5,
lambda = 2.5)
```
```{r 2024 Paper 2 Question 10}
z_95 <- qnorm(p = 0.95) # value of z such that P(Z < z) = 0.95
# we want the smallest integer n satisfying:
#   z_95 * 2.9 / sqrt(n) < 0.7
# brute-force approach: test every candidate n from 1 to 100 and keep
# the smallest one that satisfies the inequality
candidate_n <- 1:100
# logical vector: TRUE where the inequality holds
satisfies_bound <- z_95 * 2.9 / sqrt(candidate_n) < 0.7
# smallest candidate for which the inequality was TRUE
min(candidate_n[satisfies_bound])
```
```{r 2024 Paper 2 Question 12(a)}
# calculate P(total mass > 25000) using combination of J ~ N(69, 6) and H ~ N(453, 16)
normal_mean = 48 * 69 + 48 * 453
normal_mean
normal_variance = 48 * 6 + 48 * 16
normal_variance
pnorm(q = 25000,
mean = normal_mean,
sd = sqrt(normal_variance),
lower.tail = FALSE)
```
```{r 2024 Paper 2 Question 12(b)}
# the package called 'BSDA' is needed for the z.test function
# the package 'BSDA' is needed for the z.test function; install it only if
# it is not already available. requireNamespace(..., quietly = TRUE) checks
# availability without the warning that require() emits when a package is
# missing, then library() attaches it for use.
if (!requireNamespace("BSDA", quietly = TRUE)) {
  install.packages("BSDA")
}
library(BSDA)
mass_n <- 10
mass_mean <- 527.5
mass_st_dev <- 5
# simulated data set of size 10 with exactly the stated mean and st dev
simulated_masses <- scale(1:mass_n) * mass_st_dev + mass_mean
# one-sample z test of H0: mu = 522 with sigma assumed known
z.test(x = simulated_masses,
       sigma.x = mass_st_dev,
       mu = 522,
       alternative = "two.sided")
```
```{r 2024 Paper 2 Question 13(a)}
sample_size <- 5
sample_mean <- 102
sample_st_dev <- 0.13
sigma_limits <- 2
# standard error of the sample mean
std_error <- sample_st_dev / sqrt(sample_size)
# lower and upper control limits: mean -/+ (number of sigmas) * standard error
sample_mean + c(-1, 1) * sigma_limits * std_error
```