Think about an ongoing study in your lab (or a paper you have read in a different class), and decide on a pattern that you might expect in your experiment if a specific hypothesis were true.
To start simply, assume that the data in each of your treatment groups follow a normal distribution. Specify the sample sizes, means, and variances for each group that would be reasonable if your hypothesis were true. You may need to consult some previous literature and/or an expert in the field to come up with these numbers.
Topic: the effect of temperature on the development time of a particular insect species.
Hypothesis: higher temperatures lead to faster development times in the insect species under study.
Experimental design: two treatment groups including one group exposed to a higher temperature and another group exposed to a lower temperature.
- Higher Temperature Group:
- Sample size: 30
- Mean development time: 10 days
- Variance: 1.5
- Lower Temperature Group:
- Sample size: 30
- Mean development time: 15 days
- Variance: 2.0
Using the methods we have covered in class, write code to create a random data set that has these attributes. Organize these data into a data frame with the appropriate structure.
library(ggplot2)
library(dplyr)
library(MASS)
library(tidyverse)
library(pwr)
library(pwrss)
library(gridExtra)
# Set seed for reproducibility
#set.seed(123)
# Generate random data for the higher temperature group
hightemp <- data.frame(
development_time = rnorm(30, mean = 10, sd = sqrt(1.5))
)
#print(hightemp)
# Generate random data for the lower temperature group
lowtemp <- data.frame(
development_time = rnorm(30, mean = 15, sd = sqrt(2.0))
)
#print(lowtemp)
# Add a group column to identify the temperature group
hightemp$group <- "Higher Temperature"
#print(hightemp)
lowtemp$group <- "Lower Temperature"
#print(lowtemp)
# Combine the data frames into a single data frame
insect_data <- rbind(hightemp, lowtemp)
# Print the first few rows of the data frame
print(head(insect_data))
## development_time group
## 1 10.202028 Higher Temperature
## 2 11.471453 Higher Temperature
## 3 9.513329 Higher Temperature
## 4 9.719370 Higher Temperature
## 5 11.063066 Higher Temperature
## 6 11.176798 Higher Temperature
Now write code to analyze the data (probably as an ANOVA or regression analysis), but possibly as a logistic regression or contingency table analysis. Write code to generate a useful graph of the data.
# ANOVA
anova <- aov(insect_data$development_time ~ insect_data$group, data = insect_data)
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## insect_data$group 1 292.8 292.83 164.8 <2e-16 ***
## Residuals 58 103.1 1.78
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Create a box plot
boxplot <- ggplot(insect_data, aes(x = group, y = development_time, fill = group)) +
geom_boxplot() +
labs(title = "Development Time of Insects at Different Temperatures",
x = "Temperature Group",
y = "Development Time (days)") +
theme_minimal()
# Display the box plot
print(boxplot)
Try running your analysis multiple times to get a feeling for how variable the results are with the same parameters, but different sets of random numbers.
# Different run p-value and F value changes by rerunning the same codes:
# run #1
# pval = <2e-16 ***
# Fval= 190
# # run #2
# pval = <2e-16 ***
# Fval = 319
##### or
results <- list()
# Set the number of iterations
num_iterations <- 6
# Initialize a vector to store the ANOVA p-values
p_values <- numeric(num_iterations)
for (i in 1:num_iterations) {
# Generate a new random dataset
set.seed(i) # Set a different seed for each iteration
hightemp <- data.frame(
development_time = rnorm(30, mean = 10, sd = sqrt(1.5))
)
lowtemp <- data.frame(
development_time = rnorm(30, mean = 15, sd = sqrt(2.0))
)
hightemp$group <- "Higher Temperature"
lowtemp$group <- "Lower Temperature"
insect_data <- rbind(hightemp, lowtemp)
# ANOVA
anova <- aov(development_time ~ group, data = insect_data)
p_values[i] <- summary(anova)[[1]]$`Pr(>F)`[1]
# Store the results
results[[i]] <- list(
summary = summary(anova),
data = insect_data)
}
# Print the p-values from each iteration
print(p_values)
## [1] 9.138965e-25 4.716218e-17 5.132635e-24 1.012101e-20 1.859572e-21
## [6] 4.801209e-19
# Print the summaries of the models for each iteration
for (i in 1:6) {
cat("Iteration", i, ":\n")
print(results[[i]]$summary)
}
## Iteration 1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 1 388.1 388.1 304.9 <2e-16 ***
## Residuals 58 73.8 1.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Iteration 2 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 1 327.2 327.2 139.3 <2e-16 ***
## Residuals 58 136.3 2.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Iteration 3 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 1 425.4 425.4 284 <2e-16 ***
## Residuals 58 86.9 1.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Iteration 4 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 1 284.06 284.06 205.5 <2e-16 ***
## Residuals 58 80.16 1.38
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Iteration 5 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 1 407.3 407.3 221.3 <2e-16 ***
## Residuals 58 106.7 1.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Iteration 6 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 1 314.6 314.55 172.9 <2e-16 ***
## Residuals 58 105.5 1.82
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Generate a plot showing the density of development_time by group for each iteration
plot_list <- lapply(results, function(res) {
ggplot(res$data, aes(x = development_time, fill = group)) +
geom_density(alpha = 0.5) +
labs(title = "Development Time Density by Group",
x = "Development Time",
y = "Density") +
theme_minimal()
})
# Plot the density plots
gridExtra::grid.arrange(grobs = plot_list, ncol = 2)
Now begin adjusting the means of the different groups. Given the sample sizes you have chosen, how small can the differences between the groups be (the “effect size”) for you to still detect a significant pattern (p < 0.05)?
# Set the parameters
alpha <- 0.05
power <- 0.8
n <- 30 # Sample size for each group
sd1 <- sqrt(1.5) # Standard deviation for group 1
mean1 <- 10 # Mean for group 1
sd2 <- sqrt(2.0) # Standard deviation for group 2
mean2 <- 15 # Mean for group 2
# power analysis for Cohen's d
d_values <- seq(0.01, 1, by = 0.01) # Range of Cohen's d values to test
significant_effects <- numeric(length(d_values))
for (i in 1:length(d_values)) {
d <- d_values[i] # Cohen's d value
means_diff <- d * sqrt((sd1^2 + sd2^2) / 2) # Calculate the difference in means
t_test_result <- t.test(rnorm(n, mean1, sd1), rnorm(n, mean2, sd2), var.equal = TRUE) # t-test
p_value <- t_test_result$p.value
significant_effects[i] <- p_value
}
# Find the minimum detectable effect size (Cohen's d)
min_d <- d_values[which(significant_effects < alpha)[1]]
# Print the minimum detectable effect size
print(paste("Minimum detectable effect size (Cohen's d):", min_d))
## [1] "Minimum detectable effect size (Cohen's d): 0.01"
Alternatively, for the effect sizes you originally hypothesized, what is the minimum sample size you would need in order to detect a statistically significant effect? Again, run the model a few times with the same parameter set to get a feeling for the effect of random variation in the data.
# Set the parameters
alpha <- 0.05
power <- 0.8
sd1 <- sqrt(1.5) # Standard deviation for group 1
mean1 <- 10 # Mean for group 1
sd2 <- sqrt(2.0) # Standard deviation for group 2
mean2 <- 15 # Mean for group 1
# Calculate Statistical Power
pwrss.t.2means(mu1 = mean1, mu2 = mean2, sd1 = sd1, sd2 = sd2, kappa = 1,
n2 = 30, alpha = 0.05,
alternative = "not equal")
## Difference between Two means
## (Independent Samples t Test)
## H0: mu1 = mu2
## HA: mu1 != mu2
## ------------------------------
## Statistical power = 1
## n1 = 30
## n2 = 30
## ------------------------------
## Alternative = "not equal"
## Degrees of freedom = 58
## Non-centrality parameter = -14.639
## Type I error rate = 0.05
## Type II error rate = 0
# Calculate Minimum Required Sample Size
pwrss.t.2means(mu1 = mean1, mu2 = mean2, sd1 = sd1, sd2 = sd2, kappa = 1,
power = .80, alpha = 0.01,
alternative = "not equal")
## Difference between Two means
## (Independent Samples t Test)
## H0: mu1 = mu2
## HA: mu1 != mu2
## ------------------------------
## Statistical power = 0.8
## n1 = 4
## n2 = 4
## ------------------------------
## Alternative = "not equal"
## Degrees of freedom = 6
## Non-centrality parameter = -5.345
## Type I error rate = 0.01
## Type II error rate = 0.2
Write up your results in a markdown file, organized with headers and different code chunks to show your analysis. Be explicit in your explanation and justification for sample sizes, means, and variances.
If you have time, try repeating this exercise with one of the more sophisticated distributions, such as the gamma or negative binomial (depending on the kind of data you have). You will have to spend some time figuring out by trial and error the parameter values you will need to generate appropriate means and variances of the different groups.
# Function to generate gamma-distributed data with specific mean and variance
generate_gamma_data <- function(mean, var, size) {
scale <- var / mean # Calculate the scale parameter based on mean and variance
shape <- mean / scale # Calculate the shape parameter based on mean and scale
return(rgamma(size, shape, scale))
}
# Set seed for reproducibility
set.seed(123)
# Generate data for Group 1 with mean 10 and variance 5
hightemp <- data.frame(
development_time = rgamma(10, 5, 100))
# Generate data for Group 2 with mean 15 and variance 8
lowtemp <- data.frame(
development_time = rgamma(15, 8, 100))
# Combine the data for Group 1 and Group 2
data <- rbind(
transform(hightemp, group = "Group 1"),
transform(lowtemp, group = "Group 2")
)
# Convert group variable to a factor
data$group <- as.factor(data$group)
# Fit a logistic regression model
model <- glm(group ~ development_time, data = data, family = binomial)
# Print the summary of the model
summary(model)
##
## Call:
## glm(formula = group ~ development_time, family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.055 1.335 -1.540 0.1237
## development_time 42.121 22.010 1.914 0.0557 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 33.651 on 24 degrees of freedom
## Residual deviance: 28.928 on 23 degrees of freedom
## AIC: 32.928
##
## Number of Fisher Scoring iterations: 3
# Generate a graph
ggplot(data, aes(x = development_time, fill = group)) +
geom_density(alpha = 0.5) +
labs(title = "Development Time Density by Group",
x = "Development Time",
y = "Density") +
theme_minimal()
results <- list()
for (i in 1:6) {
set.seed(i) # Set a different seed for each iteration
# Generate data for Group 1 with mean 10 and variance 5
hightemp <- data.frame(
development_time = rgamma(10, 5, 100)
)
# Generate data for Group 2 with mean 15 and variance 8
lowtemp <- data.frame(
development_time = rgamma(15, 8, 100)
)
# Combine the data for Group 1 and Group 2
data <- rbind(
transform(hightemp, group = "Group 1"),
transform(lowtemp, group = "Group 2")
)
# Convert group variable to a factor
data$group <- as.factor(data$group)
# Fit a logistic regression model
model <- glm(group ~ development_time, data = data, family = binomial)
# Store the results
results[[i]] <- list(
summary = summary(model),
data = data
)
}
# Print the summaries of the models for each iteration
for (i in 1:6) {
cat("Iteration", i, ":\n")
print(results[[i]]$summary)
}
## Iteration 1 :
##
## Call:
## glm(formula = group ~ development_time, family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.316 1.676 -1.979 0.0478 *
## development_time 59.082 26.182 2.257 0.0240 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 33.651 on 24 degrees of freedom
## Residual deviance: 25.880 on 23 degrees of freedom
## AIC: 29.88
##
## Number of Fisher Scoring iterations: 4
##
## Iteration 2 :
##
## Call:
## glm(formula = group ~ development_time, family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.973 1.323 -2.247 0.0247 *
## development_time 59.378 24.669 2.407 0.0161 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 33.651 on 24 degrees of freedom
## Residual deviance: 21.360 on 23 degrees of freedom
## AIC: 25.36
##
## Number of Fisher Scoring iterations: 6
##
## Iteration 3 :
##
## Call:
## glm(formula = group ~ development_time, family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.20 5.32 -1.916 0.0553 .
## development_time 187.93 94.20 1.995 0.0460 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 33.651 on 24 degrees of freedom
## Residual deviance: 15.250 on 23 degrees of freedom
## AIC: 19.25
##
## Number of Fisher Scoring iterations: 7
##
## Iteration 4 :
##
## Call:
## glm(formula = group ~ development_time, family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.519 2.358 -2.340 0.0193 *
## development_time 84.814 33.622 2.523 0.0116 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 33.651 on 24 degrees of freedom
## Residual deviance: 20.024 on 23 degrees of freedom
## AIC: 24.024
##
## Number of Fisher Scoring iterations: 5
##
## Iteration 5 :
##
## Call:
## glm(formula = group ~ development_time, family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.056 1.674 -2.422 0.0154 *
## development_time 69.045 27.032 2.554 0.0106 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 33.651 on 24 degrees of freedom
## Residual deviance: 19.909 on 23 degrees of freedom
## AIC: 23.909
##
## Number of Fisher Scoring iterations: 6
##
## Iteration 6 :
##
## Call:
## glm(formula = group ~ development_time, family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.421 1.258 -1.924 0.0543 .
## development_time 48.830 21.654 2.255 0.0241 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 33.651 on 24 degrees of freedom
## Residual deviance: 25.859 on 23 degrees of freedom
## AIC: 29.859
##
## Number of Fisher Scoring iterations: 5
# Generate a plot showing the density of development_time by group for each iteration
plot_list <- lapply(results, function(res) {
ggplot(res$data, aes(x = development_time, fill = group)) +
geom_density(alpha = 0.5) +
labs(title = "Development Time Density by Group",
x = "Development Time",
y = "Density") +
theme_minimal()
})
# Plot the density plots
gridExtra::grid.arrange(grobs = plot_list, ncol = 3)
# Calculate the effect size based on the means and variances of the groups
effect_size <- abs(mean(hightemp$development_time) - mean(lowtemp$development_time)) /
sqrt(((sd(hightemp$development_time) ^ 2 + sd(lowtemp$development_time) ^ 2) / 2))
mean1 <- mean(hightemp$development_time)
sd1 <- sd(hightemp$development_time)
mean2 <- mean(lowtemp$development_time)
sd2 <- sd(lowtemp$development_time)
# min sample size
min <- pwrss.np.2groups(mu1 = mean1, mu2 = mean2, sd1 = sd1, sd2 = sd2, kappa = 1,
power = .80, alpha = 0.05,
alternative = "not equal")
## Non-parametric Difference between Two Groups (Independent samples)
## Mann-Whitney U or Wilcoxon Rank-sum Test
## (a.k.a Wilcoxon-Mann-Whitney Test)
## Method: GUENTHER
## ------------------------------
## Statistical power = 0.8
## n1 = 13
## n2 = 13
## ------------------------------
## Alternative = "not equal"
## Non-centrality parameter = -2.929
## Degrees of freedom = 22.39
## Type I error rate = 0.05
## Type II error rate = 0.2