---
title: "SQA AH Statistics Exam Papers, using R"
output: html_document
date: "2025-06-01"
---

INSTRUCTIONS

Expand (or Collapse) the code chunks by clicking on the grey triangle to the right of the line number, or use the menu item: Edit > Folding > Expand All or Collapse All

Locate the code chunk you want from the Year and Question number

Run any code chunk by clicking on the green triangle 'play button' in the top right of each section of code


```{r 2025 Paper 1 Question 1(c)}

# x values
xi = c(0:7)

# observed frequencies
Oi = c(0, 3, 6, 6, 5, 5, 2, 1)

# mean rate
sum(Oi * xi) / sum(Oi)
```

```{r 2025 Paper 1 Question 1(d)}

# observed frequencies
Oi = c(0, 3, 6, 6, 5, 8)

# expected frequency using P(X >= 5) = P(X > 4) where X ~ Po(3.5)
sum(Oi) * ppois(q = 4,
                lambda = 3.5,
                lower.tail = FALSE)
```

```{r 2025 Paper 1 Question 1(f)(ii)}

# observed frequencies
Oi = c(3, 6, 6, 5, 8)

# grouped probabilities
Pi = c(ppois(q = 1, lambda = 3.5), # for P(X <= 1)
       dpois(x = 2, lambda = 3.5), # for P(X = 2)
       dpois(x = 3, lambda = 3.5), # for P(X = 3)
       dpois(x = 4, lambda = 3.5), # for P(X = 4)
       ppois(q = 4, lambda = 3.5, lower.tail = FALSE)) # for P(X > 4)

# calculate and display expected frequencies
Ei = sum(Oi) * Pi
Ei

# calculate test statistic, X^2
sum( ((Oi - Ei) ^ 2) / Ei)

```

```{r 2025 Paper 2 Question 1}

x_values = c(1:6)
x_probabilities = c(1/60, 1/6, 1/6, 1/6, 1/6, 19/60)

# calculate and display the mean of X
mean_x = sum(x_values * x_probabilities)
mean_x

# calculate mean of X^2
mean_x2 = sum(x_values^2 * x_probabilities)

# calculate and display the variance of X
variance_x = mean_x2 - (mean_x)^2
variance_x

# variance of y
variance_y = (8^2 - 1) / 12

# variance of X - Y = V(X - Y) = V(X) + V(Y)
variance_x + variance_y

# standard deviation, SD(X - Y)
sqrt(variance_x + variance_y)

```

```{r 2025 Paper 2 Question 2(a)}

distances = c(1, 4, 4, 7, 7, 8, 9, 13, 15, 20, 21, 22, 25, 27, 46)

# boxplot display which shows one possible outlier above the upper fence
boxplot(x = distances,
        horizontal = TRUE)

# calculate Q1 and Q3
quartiles = quantile(x = distances,
                     probs = c(0.25, 0.75),
                     type = 6) # to select the method used by SQA

lower_fence = quartiles[1] - 1.5 * diff(quartiles)

upper_fence = quartiles[2] + 1.5 * diff(quartiles)

# detect if any data is below the lower fence (TRUE or FALSE)
min(distances) < lower_fence

# detect if any data is above the upper fence (TRUE or FALSE)
max(distances) > upper_fence
```

```{r 2025 Paper 2 Question 2(b)}
table = t(data.frame(
  retail = c(36, 26),
  food = c(33, 5)
))

# perform test
output = chisq.test(x = table,
                    correct = FALSE) # to prevent Yate's Continuity Correction

# displays test statistic, degrees of freedom and p-value
output

# display expected frequencies that were used
print("Expected frequencies:")
output$expected
```

```{r 2025 Paper 2 Question 3}

# calculate approximate P(X > 20) where X ~ Po(14)
pnorm(q = 20.5,
      mean = 14,
      sd = sqrt(14),
      lower.tail = FALSE)

# for interest, calculate exact P(X > 20) where X ~ Po(14)
ppois(q = 20,
      lambda = 14,
      lower.tail = FALSE)

```

```{r 2025 Paper 2 Question 4}

# calculate P(X-bar > 5.50) where X-bar ~ N(4.70, 2.80/sqrt(50))
pnorm(q = 5.50,
      mean = 4.70,
      sd = 2.80 / sqrt(50),
      lower.tail = FALSE)

```

```{r 2025 Paper 2 Question 5}

data = c(40.1, 40.3, 39.5, 40.6, 40.8, 39.9, 40.0, 41.3, 38.3)
differences = data - 40

# remove zeros from differences before performing the Wilcoxon test
updated_differences = differences[differences != 0]

# perform Wilcoxon test
wilcox.test(x = updated_differences,
            mu = 0, # the command uses mu rather than median
            alternative = "two.sided",
            exact = FALSE) # due to tied ranks, p-value will be approximate

# note that the value of V stated in the Wilcoxon test output is *not* the minimum rank sum

# calculate and display ranks of absolute values of differences
ranks = rank(abs(updated_differences))
ranks 

# calculate and display rank sum of negative differences
W_negative = sum((updated_differences < 0) * ranks)
W_negative 

# calculate and display rank sum of positive differences
W_positive = sum((updated_differences > 0) * ranks)
W_positive 

# calculate and display the rank sum used in AH Statistics course
W = min(W_negative, W_positive)
W 
```

```{r 2025 Paper 2 Question 6}

# the package called 'BSDA' is needed for the z.test function
if(!require(BSDA)){install.packages("BSDA"); library(BSDA)}

years_00_09_mean = 3.9868
years_00_09_st_dev = 1.3396
years_00_09_n = 990

years_10_18_mean = 4.2510
years_10_18_st_dev = 1.2342
years_10_18_n = 735

# create simulated data sets with correct statistics
simulated_years_00_09_values = scale(1:years_00_09_n) * years_00_09_st_dev + years_00_09_mean
simulated_years_10_18_values = scale(1:years_10_18_n) * years_10_18_st_dev + years_10_18_mean

# perform the z-test
z.test(x = simulated_years_00_09_values,
       y = simulated_years_10_18_values,
       sigma.x = years_00_09_st_dev,
       sigma.y = years_10_18_st_dev,
       alternative = "less")

# for interest, by comparison, here is what the t-test would have been
t.test(simulated_years_00_09_values,
       simulated_years_10_18_values,
       paired = FALSE,
       var.equal = TRUE, #this will pool the samples
       alternative = "less")

```

```{r 2025 Paper 2 Question 8}

# record pass rates, for Centre A then Centre B
successes = c(14, 21)
trials = c(29, 30)

# perform two-sample proportion test
prop.test(x = successes,
          n = trials,
          alternative = "less",
          correct = FALSE) # to prevent continuity correction

# you will see from the above test output the mention of X-squared, which stems from a lot of parallels between two-sample proportion tests and a chi-squared test of association

# the next lines of code perform the same test, but manually.....

# calculate and display the pooled proportion
p = sum(successes) / sum(trials) 
p 

# calculate and display the test statistic
z = -diff(successes / trials) / sqrt (p * (1-p) * sum(1 / trials))
z 

# calculate the p-Value of the test
pnorm(q = z,
      mean = 0,
      sd = 1)

```

```{r 2025 Paper 2 Question 9}

y_mean = 0.254
y_st_dev = 2.54 * 0.25

# calculate the probability of within 1 of 0, P(-1 < Y < 1) = P(Y < 1) - P(Y < -1)
diff(pnorm(q = c(-1,1),
           mean = y_mean,
           sd = y_st_dev))

# calculate expected number of planks
80 * diff(pnorm(q = c(-1,1),
           mean = y_mean,
           sd = y_st_dev))

```

```{r 2025 Paper 2 Question 10}

sigma_x = 27.9
sigma_x2 = 130.2
sample_n = 6

# calculate sample mean and sample standard deviation from summary statistics
sample_mean = sigma_x / sample_n
sample_st_dev = sqrt((sigma_x2 - (sigma_x ^ 2) / sample_n) / (sample_n - 1))

# create a simulated data set that has the correct mean and standard deviation
simulated_masses = scale(1:sample_n) * sample_st_dev + sample_mean

# perform t-test
t.test(x = simulated_masses,
       mu = 4.5,
       alternative = "two.sided")

```

```{r 2025 Paper 2 Question 12}

sample_n = 15
sample_mean = 139.5
sample_st_dev = 0.7

# create a simulated data set that has the correct mean and standard deviation
simulated_heights = scale(1:sample_n) * sample_st_dev + sample_mean

# use the t-test function (without specifying a value for mu) to deliver a 98% confidence interval
t.test(x = simulated_heights,
       conf.level = 0.98)

```



