# ANOVA

## Problem

You want to compare multiple groups using an ANOVA.

## Solution

```data <- read.table(header=T, text='
subject sex   age before after
1   F   old    9.5   7.1
2   M   old   10.3  11.0
3   M   old    7.5   5.8
4   F   old   12.4   8.8
5   M   old   10.2   8.6
6   M   old   11.0   8.0
7   M young    9.1   3.0
8   F young    7.9   5.2
9   F   old    6.6   3.4
10   M young    7.7   4.0
11   M young    9.4   5.3
12   M   old   11.6  11.3
13   M young    9.9   4.6
14   F young    8.6   6.4
15   F young   14.3  13.5
16   F   old    9.2   4.7
17   M young    9.8   5.1
18   F   old    9.9   7.3
19   F young   13.0   9.5
20   M young   10.2   5.4
21   M young    9.0   3.7
22   F young    7.9   6.2
23   M   old   10.1  10.0
24   M young    9.0   1.7
25   M young    8.6   2.9
26   M young    9.4   3.2
27   M young    9.7   4.7
28   M young    9.3   4.9
29   F young   10.7   9.8
30   M   old    9.3   9.4
')
```

### One way between ANOVA

```# One way between:
# IV: sex
# DV: before
aov.before.sex <- aov(before ~ sex, data=data)
summary(aov.before.sex)
#             Df Sum Sq Mean Sq F value Pr(>F)
# sex          1  1.529   1.529   0.573 0.4554
# Residuals   28 74.701   2.668

# Show the means
model.tables(aov.before.sex, "means")
# Tables of means
# Grand mean
# 9.703333
#
#  sex
#      F      M
#     10  9.532
# rep 11 19.000
```

### Two way between ANOVA

```# 2x2 between:
# IV: sex
# IV: age
# DV: after
# These two calls are equivalent
aov.after.sex.age <- aov(after ~ sex*age, data=data)
aov.after.sex.age <- aov(after ~ sex + age + sex:age, data=data)
summary(aov.after.sex.age)
#             Df  Sum Sq Mean Sq F value   Pr(>F)
# sex          1  16.078  16.078  4.0384 0.054962 .
# age          1  38.961  38.961  9.7862 0.004301 **
# sex:age      1  89.611  89.611 22.5085  6.6e-05 ***
# Residuals   26 103.512   3.981
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Show the means
model.tables(aov.after.sex.age, "means")
# Tables of means
# Grand mean
#
# 6.483333
#
#  sex
#          F      M
#      7.445  5.926
# rep 11.000 19.000
#
#  age
#      young    old
#      5.556  7.874
# rep 18.000 12.000
#
#  sex:age
#      age
# sex   young  old
#   F    8.433  6.260
#   rep  6.000  5.000
#   M    4.042  9.157
#   rep 12.000  7.000
```

### Tukey HSD post-hoc test

```TukeyHSD(aov.after.sex.age)
#   Tukey multiple comparisons of means
#     95% family-wise confidence level
#
# Fit: aov(formula = after ~ sex + age + sex:age, data = data)
#
# \$sex
#          diff       lwr        upr     p adj
# M-F -1.519139 -3.073025 0.03474709 0.0549625
#
# \$age
#              diff       lwr      upr     p adj
# old-young 2.317850 0.7893498 3.846349 0.0044215
#
# \$`sex:age`
#                       diff        lwr       upr     p adj
# M:young-F:young -4.3916667 -7.1285380 -1.654795 0.0008841
# F:old-F:young   -2.1733333 -5.4878491  1.141182 0.2966111
# M:old-F:young    0.7238095 -2.3214997  3.769119 0.9138789
# F:old-M:young    2.2183333 -0.6952887  5.131955 0.1832890
# M:old-M:young    5.1154762  2.5121923  7.718760 0.0000676
# M:old-F:old      2.8971429 -0.3079526  6.102238 0.0869856
```

## ANOVAs with within-subjects variables

For ANOVAs with within-subjects variables, the data must be in long format. The data supplied above is in wide format, so we have to convert it first. (See ../../Manipulating data/Converting data between wide and long format for more information.)

Also, for ANOVAs with a within-subjects variable, there must be an identifier column. In this case, it is `subject`. This identifier variable must be a factor. If it is a numeric type, the function will interpret it incorrectly and it won't work properly.

```library(reshape2)

# This is what the original (wide) data looks like
# subject sex   age before after
#       1   F   old    9.5   7.1
#       2   M   old   10.3  11.0
#       3   M   old    7.5   5.8

# Convert it to long format
data.long <- melt(data, id = c("subject","sex","age"), # keep these columns the same
measure = c("before","after"),       # Put these two columns into a new column
variable.name="time")                # Name of the new column with the labels
# subject sex   age   time value
#       1   F   old before   9.5
#       2   M   old before  10.3
#       3   M   old before   7.5
#     ...
#       1   F   old  after   7.1
#       2   M   old  after  11.0
#       3   M   old  after   5.8
#     ...

# Make sure subject column is a factor
data\$subject <- factor(data\$subject)
```

### One-way within ANOVA

First, convert the data to long format and make sure `subject` is a factor, as shown above.

```# IV (within): time
# DV:          value
aov.time <- aov(value ~ time + Error(subject/time), data=data.long)
summary(aov.time)
# Error: subject
#           Df Sum Sq Mean Sq F value Pr(>F)
# Residuals  1 2.6238  2.6238
#
# Error: subject:time
#      Df Sum Sq Mean Sq
# time  1 136.97  136.97
#
# Error: Within
#           Df Sum Sq Mean Sq F value  Pr(>F)
# time       1  21.35   21.35   3.748 0.05793 .
# Residuals 56 318.97    5.70
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# This won't work here for some reason (?)
model.tables(aov.time, "means")
# Tables of means
# Grand mean
#
# 8.093333
# Warning message:
# In any(efficiency) : coercing argument of type 'double' to logical
```

### Mixed design ANOVA

First, convert the data to long format and make sure `subject` is a factor, as shown above.

```# 2x2 mixed:
# IV between: age
# IV within:  time
# DV:         value
aov.after.age.time <- aov(value ~ age*time + Error(subject/time), data=data.long)
summary(aov.after.age.time)
# Error: subject
#     Df Sum Sq Mean Sq
# age  1 2.6238  2.6238
#
# Error: subject:time
#      Df Sum Sq Mean Sq
# time  1 136.97  136.97
#
# Error: Within
#           Df  Sum Sq Mean Sq F value  Pr(>F)
# age        1  22.260  22.260  4.2831 0.04329 *
# time       1  21.348  21.348  4.1076 0.04764 *
# age:time   1  16.064  16.064  3.0909 0.08440 .
# Residuals 54 280.648   5.197
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# This won't work here because the data is unbalanced
model.tables(aov.after.age.time, "means")
# Error in model.tables.aovlist(aov.after.age.time, "means") :
#   design is unbalanced so cannot proceed
```

### More ANOVAs with within-subjects variables

These examples don't operate on the data above, but they should illustrate how to do things. First, convert the data to long format and make sure `subject` is a factor, as shown above.

```# Two within variables
aov.ww <- aov(y ~ w1*w2 + Error(subject/(w1*w2)), data=data.long)

# One between variable and two within variables
aov.bww <- aov(y ~ b1*w1*w2 + Error(subject/(w1*w2)) + b1, data=data.long)

# Two between variables and one within variables
aov.bww <- aov(y ~ b1*b2*w1 + Error(subject/(w1)) + b1*b2, data=data.long)
```

### Note

This is the code used to generate the data above. It is here for safe keeping.

```set.seed(123)

N <- 30
subject     <- 1:N
data        <- data.frame(subject)
data\$sex    <- factor(vector(length=N), levels=c("F","M"))
data\$age    <- factor(vector(length=N), levels=c("young","old"))
data\$before <- vector("numeric", length=N)
data\$after  <- vector("numeric", length=N)

for (i in 1:N) {
if (runif(1) < .5)  data\$sex[i] <- "F"
else                data\$sex[i] <- "M"

if (runif(1) < .5)  data\$age[i] <- "young"
else                data\$age[i] <- "old"

# Set different effects based on sex and condition
if (data\$sex[i] == "F") {
if (data\$age[i] == "young")  change.mean <- -2
else                         change.mean <- -4
}
else {
if (data\$age[i] == "old")  change.mean <- -1
else                       change.mean <- -5
}

data\$before[i] <- round(rnorm(1, mean=10, sd=2), digits=1)
data\$after[i]  <- data\$before[i] + round(rnorm(1, mean=change.mean, sd=1), digits=1)

}
```