# Statistical analysis of measurement data by condition group.
# Companion script for the MathJet tutorial: R / RStudio.

library(dplyr)
library(ggplot2)

# Load data
df <- read.csv("measurements.csv")
cat(sprintf("Loaded %d rows, %d columns\n", nrow(df), ncol(df)))

# Per-condition summary statistics
summary_stats <- df %>%
  group_by(condition) %>%
  summarise(
    n = n(),
    mean = mean(measurement_value),
    sd = sd(measurement_value),
    se = sd / sqrt(n)
  )
print(summary_stats)

# One-way ANOVA: does condition affect measurement value?
model <- aov(measurement_value ~ condition, data = df)
cat("\n--- ANOVA ---\n")
print(summary(model))

# Tukey HSD post-hoc pairwise comparisons
tukey <- TukeyHSD(model)
cat("\n--- Tukey HSD ---\n")
print(tukey)

# Effect size: Cohen's d for treatment_a vs control
control <- df$measurement_value[df$condition == "control"]
treatment_a <- df$measurement_value[df$condition == "treatment_a"]
pooled_sd <- sqrt((var(control) + var(treatment_a)) / 2)
cohens_d <- (mean(treatment_a) - mean(control)) / pooled_sd
cat(sprintf("\nCohen's d (treatment_a vs control): %.3f\n", cohens_d))

# Boxplot with jittered points, colored by condition
ggplot(df, aes(x = condition, y = measurement_value, fill = condition)) +
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.4, size = 1.5) +
  labs(title = "Measurement value by condition",
       x = "Condition", y = "Measurement value") +
  theme_minimal() +
  theme(legend.position = "none")