单因素方差分析与多重比较

R
qqplotr
rstatix
统计检验
Author

Rui

Published

June 7, 2023

导入数据

Code
library(tidyverse)
library(palmerpenguins)
library(ggplot2)
Code
data <- penguins %>% drop_na()

data %>% head() %>% knitr::kable()
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year
Adelie Torgersen 39.1 18.7 181 3750 male 2007
Adelie Torgersen 39.5 17.4 186 3800 female 2007
Adelie Torgersen 40.3 18.0 195 3250 female 2007
Adelie Torgersen 36.7 19.3 193 3450 female 2007
Adelie Torgersen 39.3 20.6 190 3650 male 2007
Adelie Torgersen 38.9 17.8 181 3625 female 2007

计算均值、标准差:

Code
data %>%
  group_by(species) %>%
  summarise(
    mean = mean(flipper_length_mm),
    sd = sd(flipper_length_mm)
  )
## # A tibble: 3 × 3
##   species    mean    sd
##   <fct>     <dbl> <dbl>
## 1 Adelie     190.  6.52
## 2 Chinstrap  196.  7.13
## 3 Gentoo     217.  6.59

分组描述性统计:

Code
library(psych)
describeBy(data$flipper_length_mm, data$species)  # 分组描述统计
## 
##  Descriptive statistics by group 
## group: Adelie
##    vars   n  mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 146 190.1 6.52    190  190.08 7.41 172 210    38 0.08     0.28 0.54
## ------------------------------------------------------------ 
## group: Chinstrap
##    vars  n   mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 68 195.82 7.13    196  195.75 7.41 178 212    34 -0.01    -0.13 0.86
## ------------------------------------------------------------ 
## group: Gentoo
##    vars   n   mean   sd median trimmed  mad min max range skew kurtosis  se
## X1    1 119 217.24 6.59    216  216.91 7.41 203 231    28 0.36    -0.72 0.6

以下是一些可视化数据分布的方法:

Code
boxplot(flipper_length_mm ~ species, data = data)

Code
library(ggpubr)
ggerrorplot(data, x = "species", y = "flipper_length_mm", desc_stat = "mean_sd") #画平均值标准差图

Code
library(ggdist)
ggplot(data = data, aes(x = flipper_length_mm, y = species)) +
  stat_histinterval(slab_color = "white",
                    outline_bars = TRUE,
                    fill = "royalblue",
                    alpha = 0.5) +
  theme_ggdist() +
  labs(title = "Histogram_interval plots")

Code
ggplot(data, aes(x = species, y = flipper_length_mm)) +
  stat_summary(fun.data = "median_q1q3", geom = "errorbar", width = 0.2, size = 0.5) + # 误差棒,中位数,25%和75%分位数
  stat_summary(fun.y = median, geom = "crossbar", width = 0.5, size = 0.3) + # 中位数水平线
  
  geom_jitter(aes(color = species)) + # 抖动点
  scale_fill_brewer(palette = "Set1") + # 调色
  labs(
    title = "Flipper length",
    x = "Species", y = "Flipper length (mm)",
    color = "Penguin species", shape = "Penguin species"
  ) +
  
  theme_bw()

正态性检验

分组 Q-Q 图

Code
# 核密度图
Q1 <- ggplot(data, aes(y = flipper_length_mm, fill = species)) +
  geom_density(alpha = 0.2,        # 填充颜色透明度
               size = 1,           # 线条粗细
               linetype = 1) +     # 线条类型1是实线,2是虚线
  ylim(c(160, 240)) + 
  scale_fill_brewer(palette = "Set1") + # 调色
  facet_grid(.~species) +
  labs(x = "freq", y = "flipper_length_mm") +
  theme_bw()

# Q-Q 图
library(qqplotr)
Q2 <- ggplot(data, aes(sample = flipper_length_mm, fill = species)) +
  stat_qq_band(#bandType = "ts", 
               alpha = 0.4) +
  stat_qq_point(size = 1) +
  stat_qq_line(linewidth = 1) +
  scale_fill_brewer(palette = "Set1") + # 调色
  facet_wrap(vars(species), scales = "free_x") + # x轴刻度不固定
  labs(x = "Theoretical", y = "Sample") +
  theme_bw()

# 拼图
library(patchwork) 
Q1/Q2

Shapiro 检验

Code
for (kind in unique(data$species)) {
  data %>%
    dplyr::filter(species == kind) %>%
    dplyr::select(flipper_length_mm) %>%
    pull() %>%
    shapiro.test() %>%
    print()
}
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.9934, p-value = 0.7427
## 
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.96148, p-value = 0.00176
## 
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.98891, p-value = 0.8106

除了第 2 组,其他组均通过 Shapior 正态性检验。

也可以使用 rstatix 包中的 shapiro_test 函数。

K-S 检验

Code
for (kind in unique(data$species)) {
  data %>%
    dplyr::filter(species == kind) %>%
    dplyr::select(flipper_length_mm) %>%
    pull() %>%
    ks.test(., "pnorm", mean = mean(.), sd = sqrt(.)) %>%
    print()
}
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  .
## D = 0.19656, p-value = 2.521e-05
## alternative hypothesis: two-sided
## 
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  .
## D = 0.24516, p-value = 1.226e-06
## alternative hypothesis: two-sided
## 
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  .
## D = 0.20597, p-value = 0.006244
## alternative hypothesis: two-sided

3 组均未通过 K-S 正态性检验.

不同方法

bandType 可选参数:

  • pointwise:使用正态分布构造

  • boot:基于 boostrap 构造

  • ks:基于 Kolmogorov-Smirnov 检验

  • ts:构造 tail-sensitive 的置信区间

Code
ggplot(data, aes(sample = flipper_length_mm)) +
  geom_qq_band(bandType = "ks", mapping = aes(fill = "KS"), alpha = 0.3) +
  geom_qq_band(bandType = "ts", mapping = aes(fill = "TS"), alpha = 0.3) +
  geom_qq_band(bandType = "pointwise", mapping = aes(fill = "Normal"), alpha = 0.3) +
  geom_qq_band(bandType = "boot", mapping = aes(fill = "Bootstrap"), alpha = 0.3) +
  stat_qq_line() +
  stat_qq_point() +
  facet_wrap(vars(species), scales = "free_x") +
  labs(x = "Theoretical", y = "Sample") +
  theme_bw()

残差正态性检验

Code
library(lmtest) # 调用程序包“lmtest”
lmresults <- lm(flipper_length_mm~species, data = data) # 将回归分析结果赋值给lmresults
res <- rstandard(lmresults) # 提取标准化残差赋值给res
Code
shapiro.test(res)
## 
##  Shapiro-Wilk normality test
## 
## data:  res
## W = 0.99449, p-value = 0.2756

残差基本符合正态分布。

也可以先进行 ANOVA,再对残差进行正态性检验。

方差齐性检验

Bartlett 方差齐性检验

Code
library(stats)
bartlett.test(flipper_length_mm ~ species, data = data)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  flipper_length_mm by species
## Bartlett's K-squared = 0.80702, df = 2, p-value = 0.668

p 值大于 0.05,接受各组方差相同的原假设,即方差齐性。

Levene 检验

Code
library(car)
leveneTest(flipper_length_mm ~ species, data = data) # 默认以中位数为中心
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   2  0.4428 0.6426
##       330
leveneTest(flipper_length_mm ~ species, data = data, center = "mean") # 以均值为中心
## Levene's Test for Homogeneity of Variance (center = "mean")
##        Df F value Pr(>F)
## group   2  0.5031 0.6051
##       330

无论是选择中位数还是均值作为检验中心,p 值均大于 0.05,接受各组方差相同的原假设,即方差齐性。

单因素方差分析

Code
fit <- data %>% aov(flipper_length_mm ~ species, data = .)
summary(fit)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## species       2  50526   25263   567.4 <2e-16 ***
## Residuals   330  14693      45                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  • Df(自由度):表示每个因素的自由度。例如,在这个例子中,species 因素有 2 个自由度,Residuals 因素有 330 个自由度。

  • Sum Sq(平方和):表示每个因素对总方差的贡献。例如,在这个例子中,species 因素对总方差的贡献为 50526,Residuals 因素对总方差的贡献为 14693。

  • Mean Sq(均方):表示每个因素对总方差的贡献的平均值。均方是通过将平方和除以自由度来计算得到的。例如,在这个例子中,species 因素对总方差的贡献的平均值为25263,Residuals因素对总方差的贡献的平均值为 45。

  • F value(F 值):表示因素对结果的显著性的影响。F 值越大,说明因素对结果的影响越显著。例如,在这个例子中,species 因素的 F 值为 567.4。

  • Pr(>F)(F 值概率):表示 F 值在随机分布下的概率。如果 Pr(>F) 值小于 0.05,那么就可以认为因素对结果有显著影响。例如,在这个例子中,species 因素的 Pr(>F) 值 <2e-16,小于 0.05,因此可以认为它对结果有显著影响。

Code
shapiro.test(fit$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  fit$residuals
## W = 0.99451, p-value = 0.2778

可见 ANOVA 的残差符合正态分布。

修正的单因素方差分析

Code
library(rstatix)
data %>% welch_anova_test(flipper_length_mm ~ species)  # 校正的单因素方差分析
## # A tibble: 1 × 7
##   .y.                   n statistic   DFn   DFd        p method     
## * <chr>             <int>     <dbl> <dbl> <dbl>    <dbl> <chr>      
## 1 flipper_length_mm   333      581.     2  172. 3.78e-77 Welch ANOVA

多重比较

Tukey HSD 检验

stats

Code
library(stats)
hsd1 <- TukeyHSD(fit)
hsd1
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = flipper_length_mm ~ species, data = .)
## 
## $species
##                      diff       lwr       upr p adj
## Chinstrap-Adelie  5.72079  3.414364  8.027215     0
## Gentoo-Adelie    27.13255 25.192399 29.072709     0
## Gentoo-Chinstrap 21.41176 19.023644 23.799885     0

使用 broom 包整洁检验结果:

Code
broom::tidy(hsd1)
## # A tibble: 3 × 7
##   term    contrast         null.value estimate conf.low conf.high adj.p.value
##   <chr>   <chr>                 <dbl>    <dbl>    <dbl>     <dbl>       <dbl>
## 1 species Chinstrap-Adelie          0     5.72     3.41      8.03    3.75e- 8
## 2 species Gentoo-Adelie             0    27.1     25.2      29.1     5.82e-13
## 3 species Gentoo-Chinstrap          0    21.4     19.0      23.8     5.82e-13
Code
par(mar = c(5, 5, 6, 3))
plot(hsd1)

rstatix

Code
library(rstatix)
hsd2 <- tukey_hsd(fit)
hsd2
## # A tibble: 3 × 9
##   term    group1    group2    null.value estimate conf.low conf.high    p.adj
## * <chr>   <chr>     <chr>          <dbl>    <dbl>    <dbl>     <dbl>    <dbl>
## 1 species Adelie    Chinstrap          0     5.72     3.41      8.03 3.75e- 8
## 2 species Adelie    Gentoo             0    27.1     25.2      29.1  5.82e-13
## 3 species Chinstrap Gentoo             0    21.4     19.0      23.8  5.82e-13
## # ℹ 1 more variable: p.adj.signif <chr>

multcomp 包:

使用 multcomp 包的 glht 函数:

Code
library(multcomp)
post_test1 <- glht(fit, linfct = mcp(species = "Tukey"))
summary(post_test1)
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: aov(formula = flipper_length_mm ~ species, data = .)
## 
## Linear Hypotheses:
##                         Estimate Std. Error t value Pr(>|t|)    
## Chinstrap - Adelie == 0   5.7208     0.9796    5.84  2.3e-08 ***
## Gentoo - Adelie == 0     27.1326     0.8241   32.92  < 1e-08 ***
## Gentoo - Chinstrap == 0  21.4118     1.0143   21.11  < 1e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
Code
par(mar = c(5, 10, 5, 3))
plot(post_test1)

Dunnett 检验

使用 multcomp 包的 glht 函数:

Code
post_test2 <- glht(fit, linfct = mcp(species = "Dunnett"))
summary(post_test2)
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Dunnett Contrasts
## 
## 
## Fit: aov(formula = flipper_length_mm ~ species, data = .)
## 
## Linear Hypotheses:
##                         Estimate Std. Error t value Pr(>|t|)    
## Chinstrap - Adelie == 0   5.7208     0.9796    5.84 2.51e-08 ***
## Gentoo - Adelie == 0     27.1326     0.8241   32.92  < 1e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
Code
par(mar = c(5, 10, 5, 3))
plot(post_test2)

SNK 法属于多重极差检验,其检验统计量为 q,又称 q 检验。SNK 法与 Bonferroni 法在功能上类似,SNK 法是 Tukey HSD 检验法的一种修正,相对于 Tukey HSD 检验不是很保守,因此能发现更多的差异。

可以通过 agricolae 包的 SNK.test 函数实现:

Code
library(agricolae)
SNK.test(fit, trt = "species", console = T)
## 
## Study: fit ~ "species"
## 
## Student Newman Keuls Test
## for flipper_length_mm 
## 
## Mean Square Error:  44.52349 
## 
## species,  means
## 
##           flipper_length_mm      std   r Min Max
## Adelie             190.1027 6.521825 146 172 210
## Chinstrap          195.8235 7.131894  68 178 212
## Gentoo             217.2353 6.585431 119 203 231
## 
## Groups according to probability of means differences and alpha level( 0.05 )
## 
## Means with the same letter are not significantly different.
## 
##           flipper_length_mm groups
## Gentoo             217.2353      a
## Chinstrap          195.8235      b
## Adelie             190.1027      c

Bonferroni 法修正的 t 检验

使用 stats 包中的 pairwise.t.test 函数:

Code
pairwise.t.test(data$flipper_length_mm, data$species, 
                p.adjust.method = "bonf", data = data)
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  data$flipper_length_mm and data$species 
## 
##           Adelie  Chinstrap
## Chinstrap 3.8e-08 -        
## Gentoo    < 2e-16 < 2e-16  
## 
## P value adjustment method: bonferroni

可视化

Code
# Edit from here
x <- which(names(data) == "species") # name of grouping variable
y <- which(names(data) == "flipper_length_mm") # names of variables to test
method1 <- "anova" # one of "anova" or "kruskal.test"
method2 <- "t.test" # one of "wilcox.test" or "t.test"
my_comparisons <- list(c("Chinstrap", "Adelie"), c("Gentoo", "Adelie"), c("Gentoo", "Chinstrap")) # comparisons for post-hoc tests


library(ggpubr)
for (i in y) {
  for (j in x) {
    p <- ggboxplot(data,
      x = colnames(data[j]), y = colnames(data[i]),
      color = colnames(data[j]),
      legend = "none",
      palette = "npg",
      add = "jitter"
    )
    print(
      p + 
        #stat_compare_means(aes(label = paste0(after_stat(method), ", p-value = ", after_stat(p.format))), method = method1, label.y = max(data[, i], na.rm = TRUE)) + 
        stat_compare_means(comparisons = my_comparisons, method = method2, label = "p.format") # remove if p-value of ANOVA or Kruskal-Wallis test >= alpha
    )
  }
}

Code
# 此处生成一个列表,用于ggsignif两两组间的检验
Vec1 <- c("Chinstrap", "Adelie", "Gentoo")
comb_list <- list()
for(i in 1:(length(Vec1)-1)) {
  for(j in (i+1):length(Vec1)) {
    comb <- combn(c(Vec1[i], Vec1[j]), 2)
    if(!any(comb[1, ] == comb[2, ])) {
      comb_list[length(comb_list) + 1] <- list(comb)
    }
  }
}

# 选择一个颜色运行,如果修改数据,请根据你的分组数量,设置对应数量的颜色
#Custom.color <- c("#d3838a", "#efa48a", "#cbd27e")
#Custom.color <- c("#4a9a5b", "#8ab522", "#d1d628")
#Custom.color <- c("#245892", "#877eac", "#a9d296")
Custom.color <- c("#ae3f51", "#d6b55a", "#79aec1")


# 注:如果要保持pdf文件,请删除代码前的“#”号, width,height 可设置长宽比
# 绘图说明:因为geom_jitter散点图无法同时设置宽度和偏移,因此散点图必须对齐X轴刻度,而往往我们希望让中间的图位于中间,
# 所以此处我们隐藏了X轴刻度,并对X轴标签做了偏移,使其居中箱线图
# 由于ggsignif添加统计检验的模块无法移动,因此我们绘制pdf矢量图后在adobe illustrator中调整


# 绘制箱线图
# pdf("plot.pdf",width = 4.4,height = 3)
ggplot(data, aes(x = species, y = flipper_length_mm, fill = species)) +
  geom_jitter(mapping = aes(color = species), width = 0.05, alpha = 0.5, size = 0.9)+ # 绘制散点图
  geom_boxplot(position = position_nudge(x = 0.14), width = 0.1, outlier.size = 0, outlier.alpha = 0) + # 绘制箱线图,并通过position设置偏移
  stat_halfeye(mapping = aes(fill = species), width = 0.2, .width = 0, justification = -1.2, point_colour = NA, alpha = 0.6) + # 绘制云雨图,并通过position设置偏移
  scale_fill_manual(values = Custom.color) +   # 映射云雨图和箱线图的颜色
  scale_color_manual(values = Custom.color) +  # 映射散点的颜色
  # expand_limits(x = c(0.5, 3.8)) + # 扩展画板,若显示不全,请根据你的数据范围手动调整或删除此行
  # ylim(11,35) + # 控制y轴显示范围,若显示不全,请根据你的数据范围手动调整或删除此行
  # xlab("Histological origins of cancer cells") +  # 设置X轴标题
  # ylab("Degree of cellular infiltration") +   # 设置Y轴标题
  ggtitle("Penguins") +  #设置主标题
  theme(axis.ticks.x = element_line(size = 0, color = "black"),  # 自定义主题
        panel.background = element_rect(fill = "white", color = "white"),  # 设置画板
        panel.grid.major.x = element_blank(),   # 设置网格
        panel.grid.minor.x = element_blank(), # 设置网格
        panel.grid.major.y = element_line(color = "gray", size = 0.25), # 设置网格
        panel.grid.minor.y = element_blank(), # 设置网格
        panel.border = element_rect(color = "black", fill = NA, linewidth = 1), # 设置边框
        legend.position = "none", # 隐藏图例
        axis.title.x = element_text(size = 13),  # 调整X轴标题字体大小
        axis.title.y = element_text(size = 13), # 调整Y轴标题字体大小
        axis.text.x = element_text(size = 12, hjust = 0.3), # 设置x轴刻度字体偏移,若更换数据,可能需要重新设置
        axis.text.y = element_text(size = 12), # 设置Y轴刻度字体大小
        plot.title = element_text(hjust = 0.5)
        )+
  geom_signif(comparisons = comb_list, step_increase = 0.1, map_signif_level = TRUE, vjust = 0.5, hjust= 0)