---
title: "SQA AH Statistics Exam Papers, using R"
output: html_document
date: "2023-07-19"
---
INSTRUCTIONS
Expand (or Collapse) the code chunks by clicking on the grey triangle to the right of the line number, or use the menu item: Edit > Folding > Expand All or Collapse All
Locate the code chunk you want from the Year and Question number
Run any code chunk by clicking on the green triangle 'play button' in the top right of each section of code
```{r 2018 Question 1(a)}
# P(X = 3) for X ~ Po(2)
mu <- 2
dpois(3, lambda = mu)
```
```{r 2018 Question 3(a)}
# Paired observations from the two habitats
hab_1 <- c(313, 342, 366, 350, 376, 438, 400)
hab_2 <- c(83, 94, 91, 86, 102, 113, 98)
# Scatterplot, for interest
plot(hab_1, hab_2)
# The question asks for a test on beta (the slope); a test on rho is
# numerically equivalent: same test statistic, df and p-value.
cor.test(hab_1, hab_2, alternative = "two.sided", method = "pearson")
```
```{r 2018 Question 4}
# (a) P(X > 10100) where X ~ N(10000, 250^2): upper-tail probability
pnorm(10100, mean = 10000, sd = 250, lower.tail = FALSE)
# (b) find x such that P(X > x) = 0.90, where X ~ N(10000, 250^2)
qnorm(0.90, mean = 10000, sd = 250, lower.tail = FALSE)
# (c) P(T < 3000) where T ~ N(2975, 157); 157 is the variance, so sd = sqrt(157)
pnorm(3000, mean = 2975, sd = sqrt(157))
```
```{r 2018 Question 5(c)}
# The 'BSDA' package provides the z.test() function.
# requireNamespace() (rather than require()) is the recommended way to test
# for an installed package: it checks availability quietly without attaching,
# and library() below then attaches it (erroring loudly if install failed).
if (!requireNamespace("BSDA", quietly = TRUE)) {
  install.packages("BSDA")
}
library(BSDA)
pebble_n <- 100
pebble_mean <- 119.4
pebble_st_dev <- 21.6
# Build a simulated sample whose sample mean and sample standard deviation
# match the summary statistics exactly: scale() centres and standardises
# 1:n (mean 0, sample sd 1), which we then rescale and shift.
simulated_pebble_sizes <- scale(1:pebble_n) * pebble_st_dev + pebble_mean
# The z.test output includes the required 90% confidence interval
z.test(x = simulated_pebble_sizes,
       sigma.x = pebble_st_dev,
       alternative = "two.sided",
       conf.level = 0.90)
```
```{r 2018 Question 6}
m <- 4
n <- 7
# All choose(11, 4) = 330 ways of choosing the m ranks (one per column)
all_combinations <- combn(m + n, m)
# Rank sum of each combination
rank_sums <- colSums(all_combinations)
# Logical mask: combinations whose rank sum is at most 14
combinations_needed <- rank_sums <= 14
# Show the qualifying combinations
all_combinations[, combinations_needed]
```
```{r 2018 Question 7(a)}
number_of_failed_bulbs <- c(0, 1, 2, 3, 4, 5, 6)
observed_frequencies <- c(59, 38, 19, 3, 1, 0, 0)
# Binomial model probabilities, B(6, 0.1), over the full support 0..6
# (so the probabilities sum to exactly 1, as chisq.test requires)
probabilities <- dbinom(x = number_of_failed_bulbs,
                        size = 6,
                        prob = 0.1)
# Goodness-of-fit test on the raw categories
output <- chisq.test(x = observed_frequencies,
                     p = probabilities)
# Display expected frequencies, test statistic, degrees of freedom, p-value
print("Expected frequencies:")
output$expected
output
# Inspection of expected frequencies shows that too many are less than 5,
# so the chi-squared approximation is unreliable; we therefore merge the
# last 5 categories into one before re-testing.
# combine(): pool the final n_final entries of a vector into a single
# category, leaving the earlier entries unchanged.
# (Arguments renamed from the obvious 'list'/'col': 'list' shadowed
# base::list(); seq_len() is used so an all-merged call stays valid.)
combine <- function(values, n_final) {
  n_keep <- length(values) - n_final            # entries left unchanged
  kept <- values[seq_len(n_keep)]               # leading entries, as-is
  merged <- sum(values[(n_keep + 1):length(values)])  # pooled tail
  c(kept, merged)                               # splice back together
}
# Combine categories before re-applying the goodness-of-fit test
categories_to_combine <- 5
new_observed_frequencies <- combine(observed_frequencies, categories_to_combine)
new_probabilities <- combine(probabilities, categories_to_combine)
# Goodness-of-fit test with the combined-category data
output <- chisq.test(x = new_observed_frequencies,
                     p = new_probabilities)
# Display expected frequencies, test statistic, degrees of freedom, p-value
print("Expected frequencies:")
output$expected
output
```
```{r 2018 Question 8(a)}
accidents <- 18
sample_size <- 100
# Only the p-value in this output is needed; all other output can be ignored.
# correct = FALSE suppresses Yates' continuity correction so the result
# matches the hand-calculated z-test for a proportion.
prop.test(x = accidents,
          n = sample_size,
          p = 0.119,
          alternative = "greater",
          correct = FALSE)
```
```{r 2018 Question 11(b)}
welfare <- c(19, 22, 39, 42, 59, 70, 79, 83, 117, 119, 140, 142, 144, 193, 204, 210, 236, 290)
employment <- c(64.7, 51.1, 47.0, 54.0, 42.6, 48.3, 36.2, 47.9, 64.7, 54.9, 58.6, 69.9, 54.9, 61.9, 69.3, 76.0, 74.1, 78.7)
# Fit the simple linear regression of employment on welfare, then display it
model <- lm(employment ~ welfare)
model
# Fitted value and residual for the 5th observation (Czech Republic)
fitted(model)[5]
resid(model)[5]
```
```{r 2018 Question 12}
location_A <- c(13.9, 11.0, 11.3, 12.5, 13.4, 11.5, 14.5, 13.4, 12.2, 11.3)
location_B <- c(14.4, 12.2, 17.4, 12.0, 16.1, 10.9, 19.4, 10.3)
# Two-sample pooled t-test: var.equal = TRUE pools the sample variances
t.test(location_A, location_B,
       paired = FALSE,
       var.equal = TRUE,
       alternative = "two.sided")
```