Code
library(tidyverse)
library(palmerpenguins)
library(ggplot2)
Rui
June 7, 2023
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year |
---|---|---|---|---|---|---|---|
Adelie | Torgersen | 39.1 | 18.7 | 181 | 3750 | male | 2007 |
Adelie | Torgersen | 39.5 | 17.4 | 186 | 3800 | female | 2007 |
Adelie | Torgersen | 40.3 | 18.0 | 195 | 3250 | female | 2007 |
Adelie | Torgersen | 36.7 | 19.3 | 193 | 3450 | female | 2007 |
Adelie | Torgersen | 39.3 | 20.6 | 190 | 3650 | male | 2007 |
Adelie | Torgersen | 38.9 | 17.8 | 181 | 3625 | female | 2007 |
计算均值、标准差:
分组描述性统计:
library(psych)
describeBy(data$flipper_length_mm, data$species) # 分组描述统计
##
## Descriptive statistics by group
## group: Adelie
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 146 190.1 6.52 190 190.08 7.41 172 210 38 0.08 0.28 0.54
## ------------------------------------------------------------
## group: Chinstrap
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 68 195.82 7.13 196 195.75 7.41 178 212 34 -0.01 -0.13 0.86
## ------------------------------------------------------------
## group: Gentoo
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 119 217.24 6.59 216 216.91 7.41 203 231 28 0.36 -0.72 0.6
以下是一些可视化数据分布的方法:
ggplot(data, aes(x = species, y = flipper_length_mm)) +
stat_summary(fun.data = "median_q1q3", geom = "errorbar", width = 0.2, size = 0.5) + # 误差棒,中位数,25%和75%分位数
stat_summary(fun.y = median, geom = "crossbar", width = 0.5, size = 0.3) + # 中位数水平线
geom_jitter(aes(color = species)) + # 抖动点
scale_fill_brewer(palette = "Set1") + # 调色
labs(
title = "Flipper length",
x = "Species", y = "Flipper length (mm)",
color = "Penguin species", shape = "Penguin species"
) +
theme_bw()
# 核密度图
Q1 <- ggplot(data, aes(y = flipper_length_mm, fill = species)) +
geom_density(alpha = 0.2, # 填充颜色透明度
size = 1, # 线条粗细
linetype = 1) + # 线条类型1是实线,2是虚线
ylim(c(160, 240)) +
scale_fill_brewer(palette = "Set1") + # 调色
facet_grid(.~species) +
labs(x = "freq", y = "flipper_length_mm") +
theme_bw()
# Q-Q 图
library(qqplotr)
Q2 <- ggplot(data, aes(sample = flipper_length_mm, fill = species)) +
stat_qq_band(#bandType = "ts",
alpha = 0.4) +
stat_qq_point(size = 1) +
stat_qq_line(linewidth = 1) +
scale_fill_brewer(palette = "Set1") + # 调色
facet_wrap(vars(species), scales = "free_x") + # x轴刻度不固定
labs(x = "Theoretical", y = "Sample") +
theme_bw()
# 拼图
library(patchwork)
Q1/Q2
for (kind in unique(data$species)) {
data %>%
dplyr::filter(species == kind) %>%
dplyr::select(flipper_length_mm) %>%
pull() %>%
shapiro.test() %>%
print()
}
##
## Shapiro-Wilk normality test
##
## data: .
## W = 0.9934, p-value = 0.7427
##
##
## Shapiro-Wilk normality test
##
## data: .
## W = 0.96148, p-value = 0.00176
##
##
## Shapiro-Wilk normality test
##
## data: .
## W = 0.98891, p-value = 0.8106
除了第 2 组,其他组均通过 Shapior 正态性检验。
也可以使用 rstatix
包中的 shapiro_test
函数。
for (kind in unique(data$species)) {
data %>%
dplyr::filter(species == kind) %>%
dplyr::select(flipper_length_mm) %>%
pull() %>%
ks.test(., "pnorm", mean = mean(.), sd = sqrt(.)) %>%
print()
}
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: .
## D = 0.19656, p-value = 2.521e-05
## alternative hypothesis: two-sided
##
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: .
## D = 0.24516, p-value = 1.226e-06
## alternative hypothesis: two-sided
##
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: .
## D = 0.20597, p-value = 0.006244
## alternative hypothesis: two-sided
3 组均未通过 K-S 正态性检验.
bandType
可选参数:
pointwise:使用正态分布构造
boot:基于 boostrap 构造
ks:基于 Kolmogorov-Smirnov 检验
ts:构造 tail-sensitive 的置信区间
ggplot(data, aes(sample = flipper_length_mm)) +
geom_qq_band(bandType = "ks", mapping = aes(fill = "KS"), alpha = 0.3) +
geom_qq_band(bandType = "ts", mapping = aes(fill = "TS"), alpha = 0.3) +
geom_qq_band(bandType = "pointwise", mapping = aes(fill = "Normal"), alpha = 0.3) +
geom_qq_band(bandType = "boot", mapping = aes(fill = "Bootstrap"), alpha = 0.3) +
stat_qq_line() +
stat_qq_point() +
facet_wrap(vars(species), scales = "free_x") +
labs(x = "Theoretical", y = "Sample") +
theme_bw()
残差基本符合正态分布。
也可以先进行 ANOVA,再对残差进行正态性检验。
p 值大于 0.05,接受各组方差相同的原假设,即方差齐性。
library(car)
leveneTest(flipper_length_mm ~ species, data = data) # 默认以中位数为中心
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 0.4428 0.6426
## 330
leveneTest(flipper_length_mm ~ species, data = data, center = "mean") # 以均值为中心
## Levene's Test for Homogeneity of Variance (center = "mean")
## Df F value Pr(>F)
## group 2 0.5031 0.6051
## 330
无论是选择中位数还是均值作为检验中心,p 值均大于 0.05,接受各组方差相同的原假设,即方差齐性。
Df(自由度):表示每个因素的自由度。例如,在这个例子中,species 因素有 2 个自由度,Residuals 因素有 330 个自由度。
Sum Sq(平方和):表示每个因素对总方差的贡献。例如,在这个例子中,species 因素对总方差的贡献为 50526,Residuals 因素对总方差的贡献为 14693。
Mean Sq(均方):表示每个因素对总方差的贡献的平均值。均方是通过将平方和除以自由度来计算得到的。例如,在这个例子中,species 因素对总方差的贡献的平均值为25263,Residuals因素对总方差的贡献的平均值为 45。
F value(F 值):表示因素对结果的显著性的影响。F 值越大,说明因素对结果的影响越显著。例如,在这个例子中,species 因素的 F 值为 567.4。
Pr(>F)(F 值概率):表示 F 值在随机分布下的概率。如果 Pr(>F) 值小于 0.05,那么就可以认为因素对结果有显著影响。例如,在这个例子中,species 因素的 Pr(>F) 值 <2e-16,小于 0.05,因此可以认为它对结果有显著影响。
可见 ANOVA 的残差符合正态分布。
stats
包library(stats)
hsd1 <- TukeyHSD(fit)
hsd1
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = flipper_length_mm ~ species, data = .)
##
## $species
## diff lwr upr p adj
## Chinstrap-Adelie 5.72079 3.414364 8.027215 0
## Gentoo-Adelie 27.13255 25.192399 29.072709 0
## Gentoo-Chinstrap 21.41176 19.023644 23.799885 0
使用 broom
包整洁检验结果:
broom::tidy(hsd1)
## # A tibble: 3 × 7
## term contrast null.value estimate conf.low conf.high adj.p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 species Chinstrap-Adelie 0 5.72 3.41 8.03 3.75e- 8
## 2 species Gentoo-Adelie 0 27.1 25.2 29.1 5.82e-13
## 3 species Gentoo-Chinstrap 0 21.4 19.0 23.8 5.82e-13
rstatix
包library(rstatix)
hsd2 <- tukey_hsd(fit)
hsd2
## # A tibble: 3 × 9
## term group1 group2 null.value estimate conf.low conf.high p.adj
## * <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 species Adelie Chinstrap 0 5.72 3.41 8.03 3.75e- 8
## 2 species Adelie Gentoo 0 27.1 25.2 29.1 5.82e-13
## 3 species Chinstrap Gentoo 0 21.4 19.0 23.8 5.82e-13
## # ℹ 1 more variable: p.adj.signif <chr>
multcomp
包:使用 multcomp
包的 glht
函数:
library(multcomp)
post_test1 <- glht(fit, linfct = mcp(species = "Tukey"))
summary(post_test1)
##
## Simultaneous Tests for General Linear Hypotheses
##
## Multiple Comparisons of Means: Tukey Contrasts
##
##
## Fit: aov(formula = flipper_length_mm ~ species, data = .)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## Chinstrap - Adelie == 0 5.7208 0.9796 5.84 2.3e-08 ***
## Gentoo - Adelie == 0 27.1326 0.8241 32.92 < 1e-08 ***
## Gentoo - Chinstrap == 0 21.4118 1.0143 21.11 < 1e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
使用 multcomp
包的 glht
函数:
post_test2 <- glht(fit, linfct = mcp(species = "Dunnett"))
summary(post_test2)
##
## Simultaneous Tests for General Linear Hypotheses
##
## Multiple Comparisons of Means: Dunnett Contrasts
##
##
## Fit: aov(formula = flipper_length_mm ~ species, data = .)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## Chinstrap - Adelie == 0 5.7208 0.9796 5.84 2.51e-08 ***
## Gentoo - Adelie == 0 27.1326 0.8241 32.92 < 1e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
SNK 法属于多重极差检验,其检验统计量为 q,又称 q 检验。SNK 法与 Bonferroni 法在功能上类似,SNK 法是 Tukey HSD 检验法的一种修正,相对于 Tukey HSD 检验不是很保守,因此能发现更多的差异。
可以通过 agricolae
包的 SNK.test
函数实现:
library(agricolae)
SNK.test(fit, trt = "species", console = T)
##
## Study: fit ~ "species"
##
## Student Newman Keuls Test
## for flipper_length_mm
##
## Mean Square Error: 44.52349
##
## species, means
##
## flipper_length_mm std r Min Max
## Adelie 190.1027 6.521825 146 172 210
## Chinstrap 195.8235 7.131894 68 178 212
## Gentoo 217.2353 6.585431 119 203 231
##
## Groups according to probability of means differences and alpha level( 0.05 )
##
## Means with the same letter are not significantly different.
##
## flipper_length_mm groups
## Gentoo 217.2353 a
## Chinstrap 195.8235 b
## Adelie 190.1027 c
使用 stats
包中的 pairwise.t.test
函数:
pairwise.t.test(data$flipper_length_mm, data$species,
p.adjust.method = "bonf", data = data)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: data$flipper_length_mm and data$species
##
## Adelie Chinstrap
## Chinstrap 3.8e-08 -
## Gentoo < 2e-16 < 2e-16
##
## P value adjustment method: bonferroni
# Edit from here
x <- which(names(data) == "species") # name of grouping variable
y <- which(names(data) == "flipper_length_mm") # names of variables to test
method1 <- "anova" # one of "anova" or "kruskal.test"
method2 <- "t.test" # one of "wilcox.test" or "t.test"
my_comparisons <- list(c("Chinstrap", "Adelie"), c("Gentoo", "Adelie"), c("Gentoo", "Chinstrap")) # comparisons for post-hoc tests
library(ggpubr)
for (i in y) {
for (j in x) {
p <- ggboxplot(data,
x = colnames(data[j]), y = colnames(data[i]),
color = colnames(data[j]),
legend = "none",
palette = "npg",
add = "jitter"
)
print(
p +
#stat_compare_means(aes(label = paste0(after_stat(method), ", p-value = ", after_stat(p.format))), method = method1, label.y = max(data[, i], na.rm = TRUE)) +
stat_compare_means(comparisons = my_comparisons, method = method2, label = "p.format") # remove if p-value of ANOVA or Kruskal-Wallis test >= alpha
)
}
}
# 此处生成一个列表,用于ggsignif两两组间的检验
Vec1 <- c("Chinstrap", "Adelie", "Gentoo")
comb_list <- list()
for(i in 1:(length(Vec1)-1)) {
for(j in (i+1):length(Vec1)) {
comb <- combn(c(Vec1[i], Vec1[j]), 2)
if(!any(comb[1, ] == comb[2, ])) {
comb_list[length(comb_list) + 1] <- list(comb)
}
}
}
# 选择一个颜色运行,如果修改数据,请根据你的分组数量,设置对应数量的颜色
#Custom.color <- c("#d3838a", "#efa48a", "#cbd27e")
#Custom.color <- c("#4a9a5b", "#8ab522", "#d1d628")
#Custom.color <- c("#245892", "#877eac", "#a9d296")
Custom.color <- c("#ae3f51", "#d6b55a", "#79aec1")
# 注:如果要保持pdf文件,请删除代码前的“#”号, width,height 可设置长宽比
# 绘图说明:因为geom_jitter散点图无法同时设置宽度和偏移,因此散点图必须对齐X轴刻度,而往往我们希望让中间的图位于中间,
# 所以此处我们隐藏了X轴刻度,并对X轴标签做了偏移,使其居中箱线图
# 由于ggsignif添加统计检验的模块无法移动,因此我们绘制pdf矢量图后在adobe illustrator中调整
# 绘制箱线图
# pdf("plot.pdf",width = 4.4,height = 3)
ggplot(data, aes(x = species, y = flipper_length_mm, fill = species)) +
geom_jitter(mapping = aes(color = species), width = 0.05, alpha = 0.5, size = 0.9)+ # 绘制散点图
geom_boxplot(position = position_nudge(x = 0.14), width = 0.1, outlier.size = 0, outlier.alpha = 0) + # 绘制箱线图,并通过position设置偏移
stat_halfeye(mapping = aes(fill = species), width = 0.2, .width = 0, justification = -1.2, point_colour = NA, alpha = 0.6) + # 绘制云雨图,并通过position设置偏移
scale_fill_manual(values = Custom.color) + # 映射云雨图和箱线图的颜色
scale_color_manual(values = Custom.color) + # 映射散点的颜色
# expand_limits(x = c(0.5, 3.8)) + # 扩展画板,若显示不全,请根据你的数据范围手动调整或删除此行
# ylim(11,35) + # 控制y轴显示范围,若显示不全,请根据你的数据范围手动调整或删除此行
# xlab("Histological origins of cancer cells") + # 设置X轴标题
# ylab("Degree of cellular infiltration") + # 设置Y轴标题
ggtitle("Penguins") + #设置主标题
theme(axis.ticks.x = element_line(size = 0, color = "black"), # 自定义主题
panel.background = element_rect(fill = "white", color = "white"), # 设置画板
panel.grid.major.x = element_blank(), # 设置网格
panel.grid.minor.x = element_blank(), # 设置网格
panel.grid.major.y = element_line(color = "gray", size = 0.25), # 设置网格
panel.grid.minor.y = element_blank(), # 设置网格
panel.border = element_rect(color = "black", fill = NA, linewidth = 1), # 设置边框
legend.position = "none", # 隐藏图例
axis.title.x = element_text(size = 13), # 调整X轴标题字体大小
axis.title.y = element_text(size = 13), # 调整Y轴标题字体大小
axis.text.x = element_text(size = 12, hjust = 0.3), # 设置x轴刻度字体偏移,若更换数据,可能需要重新设置
axis.text.y = element_text(size = 12), # 设置Y轴刻度字体大小
plot.title = element_text(hjust = 0.5)
)+
geom_signif(comparisons = comb_list, step_increase = 0.1, map_signif_level = TRUE, vjust = 0.5, hjust= 0)
---
title: "单因素方差分析与多重比较"
author: "Rui"
date: "2023-06-07"
categories: [R, qqplotr, rstatix, 统计检验]
image: "AdelieWPD.jpg"
format:
html:
code-fold: true
code-tools: true
---
```{r setup, include = FALSE}
# 设置默认参数
knitr::opts_chunk$set(
echo = TRUE,
message = FALSE,
warning = FALSE,
collapse = TRUE
)
```
## 导入数据
```{r}
library(tidyverse)
library(palmerpenguins)
library(ggplot2)
```
```{r}
data <- penguins %>% drop_na()
data %>% head() %>% knitr::kable()
```
计算均值、标准差:
```{r}
data %>%
group_by(species) %>%
summarise(
mean = mean(flipper_length_mm),
sd = sd(flipper_length_mm)
)
```
分组描述性统计:
```{r}
library(psych)
describeBy(data$flipper_length_mm, data$species) # 分组描述统计
```
以下是一些可视化数据分布的方法:
```{r}
boxplot(flipper_length_mm ~ species, data = data)
```
```{r}
library(ggpubr)
ggerrorplot(data, x = "species", y = "flipper_length_mm", desc_stat = "mean_sd") #画平均值标准差图
```
```{r}
library(ggdist)
ggplot(data = data, aes(x = flipper_length_mm, y = species)) +
stat_histinterval(slab_color = "white",
outline_bars = TRUE,
fill = "royalblue",
alpha = 0.5) +
theme_ggdist() +
labs(title = "Histogram_interval plots")
```
```{r}
ggplot(data, aes(x = species, y = flipper_length_mm)) +
stat_summary(fun.data = "median_q1q3", geom = "errorbar", width = 0.2, size = 0.5) + # 误差棒,中位数,25%和75%分位数
stat_summary(fun.y = median, geom = "crossbar", width = 0.5, size = 0.3) + # 中位数水平线
geom_jitter(aes(color = species)) + # 抖动点
scale_fill_brewer(palette = "Set1") + # 调色
labs(
title = "Flipper length",
x = "Species", y = "Flipper length (mm)",
color = "Penguin species", shape = "Penguin species"
) +
theme_bw()
```
## 正态性检验
### 分组 Q-Q 图
```{r}
# 核密度图
Q1 <- ggplot(data, aes(y = flipper_length_mm, fill = species)) +
geom_density(alpha = 0.2, # 填充颜色透明度
size = 1, # 线条粗细
linetype = 1) + # 线条类型1是实线,2是虚线
ylim(c(160, 240)) +
scale_fill_brewer(palette = "Set1") + # 调色
facet_grid(.~species) +
labs(x = "freq", y = "flipper_length_mm") +
theme_bw()
# Q-Q 图
library(qqplotr)
Q2 <- ggplot(data, aes(sample = flipper_length_mm, fill = species)) +
stat_qq_band(#bandType = "ts",
alpha = 0.4) +
stat_qq_point(size = 1) +
stat_qq_line(linewidth = 1) +
scale_fill_brewer(palette = "Set1") + # 调色
facet_wrap(vars(species), scales = "free_x") + # x轴刻度不固定
labs(x = "Theoretical", y = "Sample") +
theme_bw()
# 拼图
library(patchwork)
Q1/Q2
```
### Shapiro 检验
```{r}
for (kind in unique(data$species)) {
data %>%
dplyr::filter(species == kind) %>%
dplyr::select(flipper_length_mm) %>%
pull() %>%
shapiro.test() %>%
print()
}
```
除了第 2 组,其他组均通过 Shapior 正态性检验。
也可以使用 `rstatix` 包中的 `shapiro_test` 函数。
### K-S 检验
```{r}
for (kind in unique(data$species)) {
data %>%
dplyr::filter(species == kind) %>%
dplyr::select(flipper_length_mm) %>%
pull() %>%
ks.test(., "pnorm", mean = mean(.), sd = sqrt(.)) %>%
print()
}
```
3 组均未通过 K-S 正态性检验.
### 不同方法
`bandType` 可选参数:
- pointwise:使用正态分布构造
- boot:基于 boostrap 构造
- ks:基于 Kolmogorov-Smirnov 检验
- ts:构造 tail-sensitive 的置信区间
```{r}
ggplot(data, aes(sample = flipper_length_mm)) +
geom_qq_band(bandType = "ks", mapping = aes(fill = "KS"), alpha = 0.3) +
geom_qq_band(bandType = "ts", mapping = aes(fill = "TS"), alpha = 0.3) +
geom_qq_band(bandType = "pointwise", mapping = aes(fill = "Normal"), alpha = 0.3) +
geom_qq_band(bandType = "boot", mapping = aes(fill = "Bootstrap"), alpha = 0.3) +
stat_qq_line() +
stat_qq_point() +
facet_wrap(vars(species), scales = "free_x") +
labs(x = "Theoretical", y = "Sample") +
theme_bw()
```
### 残差正态性检验
```{r}
library(lmtest) # 调用程序包“lmtest”
lmresults <- lm(flipper_length_mm~species, data = data) # 将回归分析结果赋值给lmresults
res <- rstandard(lmresults) # 提取标准化残差赋值给res
```
```{r}
shapiro.test(res)
```
残差基本符合正态分布。
也可以先进行 ANOVA,再对残差进行正态性检验。
## 方差齐性检验
### Bartlett 方差齐性检验
```{r}
library(stats)
bartlett.test(flipper_length_mm ~ species, data = data)
```
p 值大于 0.05,接受各组方差相同的原假设,即方差齐性。
### Levene 检验
```{r}
library(car)
leveneTest(flipper_length_mm ~ species, data = data) # 默认以中位数为中心
leveneTest(flipper_length_mm ~ species, data = data, center = "mean") # 以均值为中心
```
无论是选择中位数还是均值作为检验中心,p 值均大于 0.05,接受各组方差相同的原假设,即方差齐性。
## 单因素方差分析
```{r}
fit <- data %>% aov(flipper_length_mm ~ species, data = .)
summary(fit)
```
- Df(自由度):表示每个因素的自由度。例如,在这个例子中,species 因素有 2 个自由度,Residuals 因素有 330 个自由度。
- Sum Sq(平方和):表示每个因素对总方差的贡献。例如,在这个例子中,species 因素对总方差的贡献为 50526,Residuals 因素对总方差的贡献为 14693。
- Mean Sq(均方):表示每个因素对总方差的贡献的平均值。均方是通过将平方和除以自由度来计算得到的。例如,在这个例子中,species 因素对总方差的贡献的平均值为25263,Residuals因素对总方差的贡献的平均值为 45。
- F value(F 值):表示因素对结果的显著性的影响。F 值越大,说明因素对结果的影响越显著。例如,在这个例子中,species 因素的 F 值为 567.4。
- Pr(>F)(F 值概率):表示 F 值在随机分布下的概率。如果 Pr(>F) 值小于 0.05,那么就可以认为因素对结果有显著影响。例如,在这个例子中,species 因素的 Pr(>F) 值 <2e-16,小于 0.05,因此可以认为它对结果有显著影响。
```{r}
shapiro.test(fit$residuals)
```
可见 ANOVA 的残差符合正态分布。
## 修正的单因素方差分析
```{r}
library(rstatix)
data %>% welch_anova_test(flipper_length_mm ~ species) # 校正的单因素方差分析
```
## 多重比较
### Tukey HSD 检验
#### `stats` 包
```{r}
library(stats)
hsd1 <- TukeyHSD(fit)
hsd1
```
使用 `broom` 包整洁检验结果:
```{r}
broom::tidy(hsd1)
```
```{r}
par(mar = c(5, 5, 6, 3))
plot(hsd1)
```
#### `rstatix` 包
```{r}
library(rstatix)
hsd2 <- tukey_hsd(fit)
hsd2
```
#### `multcomp` 包:
使用 `multcomp` 包的 `glht` 函数:
```{r}
library(multcomp)
post_test1 <- glht(fit, linfct = mcp(species = "Tukey"))
summary(post_test1)
```
```{r}
par(mar = c(5, 10, 5, 3))
plot(post_test1)
```
### Dunnett 检验
使用 `multcomp` 包的 `glht` 函数:
```{r}
post_test2 <- glht(fit, linfct = mcp(species = "Dunnett"))
summary(post_test2)
```
```{r}
par(mar = c(5, 10, 5, 3))
plot(post_test2)
```
SNK 法属于多重极差检验,其检验统计量为 q,又称 q 检验。SNK 法与 Bonferroni 法在功能上类似,SNK 法是 Tukey HSD 检验法的一种修正,相对于 Tukey HSD 检验不是很保守,因此能发现更多的差异。
可以通过 `agricolae` 包的 `SNK.test` 函数实现:
```{r}
library(agricolae)
SNK.test(fit, trt = "species", console = T)
```
### Bonferroni 法修正的 t 检验
使用 `stats` 包中的 `pairwise.t.test` 函数:
```{r}
pairwise.t.test(data$flipper_length_mm, data$species,
p.adjust.method = "bonf", data = data)
```
## 可视化
```{r}
# Edit from here
x <- which(names(data) == "species") # name of grouping variable
y <- which(names(data) == "flipper_length_mm") # names of variables to test
method1 <- "anova" # one of "anova" or "kruskal.test"
method2 <- "t.test" # one of "wilcox.test" or "t.test"
my_comparisons <- list(c("Chinstrap", "Adelie"), c("Gentoo", "Adelie"), c("Gentoo", "Chinstrap")) # comparisons for post-hoc tests
library(ggpubr)
for (i in y) {
for (j in x) {
p <- ggboxplot(data,
x = colnames(data[j]), y = colnames(data[i]),
color = colnames(data[j]),
legend = "none",
palette = "npg",
add = "jitter"
)
print(
p +
#stat_compare_means(aes(label = paste0(after_stat(method), ", p-value = ", after_stat(p.format))), method = method1, label.y = max(data[, i], na.rm = TRUE)) +
stat_compare_means(comparisons = my_comparisons, method = method2, label = "p.format") # remove if p-value of ANOVA or Kruskal-Wallis test >= alpha
)
}
}
```
```{r}
# 此处生成一个列表,用于ggsignif两两组间的检验
Vec1 <- c("Chinstrap", "Adelie", "Gentoo")
comb_list <- list()
for(i in 1:(length(Vec1)-1)) {
for(j in (i+1):length(Vec1)) {
comb <- combn(c(Vec1[i], Vec1[j]), 2)
if(!any(comb[1, ] == comb[2, ])) {
comb_list[length(comb_list) + 1] <- list(comb)
}
}
}
# 选择一个颜色运行,如果修改数据,请根据你的分组数量,设置对应数量的颜色
#Custom.color <- c("#d3838a", "#efa48a", "#cbd27e")
#Custom.color <- c("#4a9a5b", "#8ab522", "#d1d628")
#Custom.color <- c("#245892", "#877eac", "#a9d296")
Custom.color <- c("#ae3f51", "#d6b55a", "#79aec1")
# 注:如果要保持pdf文件,请删除代码前的“#”号, width,height 可设置长宽比
# 绘图说明:因为geom_jitter散点图无法同时设置宽度和偏移,因此散点图必须对齐X轴刻度,而往往我们希望让中间的图位于中间,
# 所以此处我们隐藏了X轴刻度,并对X轴标签做了偏移,使其居中箱线图
# 由于ggsignif添加统计检验的模块无法移动,因此我们绘制pdf矢量图后在adobe illustrator中调整
# 绘制箱线图
# pdf("plot.pdf",width = 4.4,height = 3)
ggplot(data, aes(x = species, y = flipper_length_mm, fill = species)) +
geom_jitter(mapping = aes(color = species), width = 0.05, alpha = 0.5, size = 0.9)+ # 绘制散点图
geom_boxplot(position = position_nudge(x = 0.14), width = 0.1, outlier.size = 0, outlier.alpha = 0) + # 绘制箱线图,并通过position设置偏移
stat_halfeye(mapping = aes(fill = species), width = 0.2, .width = 0, justification = -1.2, point_colour = NA, alpha = 0.6) + # 绘制云雨图,并通过position设置偏移
scale_fill_manual(values = Custom.color) + # 映射云雨图和箱线图的颜色
scale_color_manual(values = Custom.color) + # 映射散点的颜色
# expand_limits(x = c(0.5, 3.8)) + # 扩展画板,若显示不全,请根据你的数据范围手动调整或删除此行
# ylim(11,35) + # 控制y轴显示范围,若显示不全,请根据你的数据范围手动调整或删除此行
# xlab("Histological origins of cancer cells") + # 设置X轴标题
# ylab("Degree of cellular infiltration") + # 设置Y轴标题
ggtitle("Penguins") + #设置主标题
theme(axis.ticks.x = element_line(size = 0, color = "black"), # 自定义主题
panel.background = element_rect(fill = "white", color = "white"), # 设置画板
panel.grid.major.x = element_blank(), # 设置网格
panel.grid.minor.x = element_blank(), # 设置网格
panel.grid.major.y = element_line(color = "gray", size = 0.25), # 设置网格
panel.grid.minor.y = element_blank(), # 设置网格
panel.border = element_rect(color = "black", fill = NA, linewidth = 1), # 设置边框
legend.position = "none", # 隐藏图例
axis.title.x = element_text(size = 13), # 调整X轴标题字体大小
axis.title.y = element_text(size = 13), # 调整Y轴标题字体大小
axis.text.x = element_text(size = 12, hjust = 0.3), # 设置x轴刻度字体偏移,若更换数据,可能需要重新设置
axis.text.y = element_text(size = 12), # 设置Y轴刻度字体大小
plot.title = element_text(hjust = 0.5)
)+
geom_signif(comparisons = comb_list, step_increase = 0.1, map_signif_level = TRUE, vjust = 0.5, hjust= 0)
```