UMAP
UMAP(Uniform Manifold Approximation and Projection,均匀流形逼近与投影)是一种降维技术,其理论框架建立在黎曼几何和代数拓扑之上。在处理大数据集时,UMAP 优势明显:运行速度更快,内存占用更小。与 t-SNE 类似,它既可用于数据可视化,也可用于一般的非线性降维。
Code
library(dplyr)
# Drop the label column (column 5 of iris) and standardize the remaining
# columns with scale(); scale() returns a matrix, so convert the result
# back to a data frame for the later steps.
iris_norm <- as.data.frame(
  scale(iris[, -5], center = TRUE, scale = TRUE)
)
# Preview the first rows of the standardized data as a formatted table.
knitr::kable(head(iris_norm))
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width |
|-------------:|------------:|-------------:|------------:|
|   -0.8976739 |   1.0156020 |    -1.335752 |   -1.311052 |
|   -1.1392005 |  -0.1315388 |    -1.335752 |   -1.311052 |
|   -1.3807271 |   0.3273175 |    -1.392399 |   -1.311052 |
|   -1.5014904 |   0.0978893 |    -1.279104 |   -1.311052 |
|   -1.0184372 |   1.2450302 |    -1.335752 |   -1.311052 |
|   -0.5353840 |   1.9333146 |    -1.165809 |   -1.048667 |
Code
library(umap)
# Reduce the standardized measurements with UMAP.
# UMAP's initialization and optimization are stochastic, so without a fixed
# seed the coordinates (and the plot built from them) change on every run.
# Pinning `random_state` makes the embedding reproducible.
iris_umap <- iris_norm %>%
  umap(random_state = 42)
# Extract the embedding coordinates (one row per observation).
layout <- iris_umap$layout
Code
# Convert the embedding matrix to a data frame and attach the species labels
# from the original data. Columns are named explicitly as they are created:
# the two embedding axes become X and Y, and the label column is added by
# name, instead of binding an unnamed vector (which relies on automatic name
# repair) and then renaming every column by position.
layout <- layout %>%
  as.data.frame() %>%
  setNames(c("X", "Y")) %>%
  mutate(Species = iris$Species)
Code
library(ggplot2)
# Scatter plot of the two UMAP dimensions; species is encoded by both
# point colour and point shape.
ggplot(
  data = layout,
  mapping = aes(x = X, y = Y, colour = Species, shape = Species)
) +
  geom_point(size = 2) +
  theme_bw() +
  labs(x = "umap_1", y = "umap_2")