Rui - 基于DTW距离度量的时间序列层次聚类

导入相关包

Code

library(data.table)
library(tidyverse)
library(lubridate)
library(xts)
library(ggplot2)
library(tictoc)
library(dtwclust)
library(ape)
library(sparcl)

读取数据

Code

data <- fread("data/mts_data.csv")
data %>% head(10) %>% knitr::kable()

order_date	sales_region_code	item_code	first_cate_code	second_cate_code	sales_chan_name	item_price	ord_qty
2015-09-02	102	20323	305	412	offline	99	502
2015-09-02	102	21350	305	412	offline	267	107
2015-09-02	101	20657	303	410	offline	2996	18
2015-09-02	102	20457	305	412	offline	164	308
2015-09-03	102	22046	305	412	offline	1204	88
2015-09-03	102	20020	305	412	offline	1918	18
2015-09-03	102	20459	305	412	offline	1666	6
2015-09-03	102	20797	305	412	offline	2619	31
2015-09-03	102	21745	303	401	offline	1713	41
2015-09-03	102	20717	303	401	offline	2351	20

Code

glimpse(data)
## Rows: 476,131
## Columns: 8
## $ order_date        <IDate> 2015-09-02, 2015-09-02, 2015-09-02, 2015-09-02, 20…
## $ sales_region_code <int> 102, 102, 101, 102, 102, 102, 102, 102, 102, 102, 10…
## $ item_code         <int> 20323, 21350, 20657, 20457, 22046, 20020, 20459, 207…
## $ first_cate_code   <int> 305, 305, 303, 305, 305, 305, 305, 305, 303, 303, 30…
## $ second_cate_code  <int> 412, 412, 410, 412, 412, 412, 412, 412, 401, 401, 41…
## $ sales_chan_name   <chr> "offline", "offline", "offline", "offline", "offline…
## $ item_price        <dbl> 99, 267, 2996, 164, 1204, 1918, 1666, 2619, 1713, 23…
## $ ord_qty           <int> 502, 107, 18, 308, 88, 18, 6, 31, 41, 20, 26, 168, 4…

数据处理

Code

data <- data %>% 
  mutate(type = paste("F", first_cate_code, "S", second_cate_code, sep = "")) %>%
  group_by(order_date, type) %>%
  summarise(qty = sum(ord_qty)) %>%
  ungroup()

Code

data %>% head() %>% knitr::kable()

order_date	type	qty
2015-09-02	F303S410	18
2015-09-02	F305S412	917
2015-09-03	F303S401	282
2015-09-03	F305S412	382
2015-09-03	F306S407	458
2015-09-04	F302S408	209

创建一个连续时间：

Code

# 缺失日期补齐
data <- data %>% mutate(order_date = as.Date(order_date))
x <- data[[1, 1]] # 开始日期
y <- data[[nrow(data), 1]] # 结束日期
z <- as.numeric(y - x) # 完整时间段
DATE <- (x + days(0:z)) %>% as_tibble() %>% rename("order_date" = "value")

使用30日移动平均平滑时间序列并转换为列表：

Code

item_list <- unique(data$type)

qty_list <- list()

for (item in item_list) {
  t <- data %>% 
    filter(type == item) %>%
    right_join(DATE, by = "order_date") %>%
    mutate(qty = replace_na(qty, 0)) %>% # NA 转换为 0
    #mutate(z_qty = scale(qty, center = TRUE, scale = TRUE)) # 数据标准化
    #mutate(log_qty = log(qty+1)) # 对数化缩小数据尺度
    mutate(rollmean = rollapply(qty, width = 30, FUN = mean, align = "right", fill = NA)) # 30日移动平均
  
  qty_list[[as.character(item)]] <- t$rollmean
}

Note

注意列表中含有 NA；
也可以使用 scale() 函数进行标准化

检查列表中每条时间序列的长度：

Code

qty_list %>% purrr::map(~length(.))
## $F303S410
## [1] 1206
## 
## $F305S412
## [1] 1206
## 
## $F303S401
## [1] 1206
## 
## $F306S407
## [1] 1206
## 
## $F302S408
## [1] 1206
## 
## $F306S402
## [1] 1206
## 
## $F303S406
## [1] 1206
## 
## $F301S405
## [1] 1206
## 
## $F304S409
## [1] 1206
## 
## $F307S403
## [1] 1206
## 
## $F308S404
## [1] 1206
## 
## $F303S411
## [1] 1206

可视化每条时间序列数据，要把每条时间序列从列表中提取出来。由于每条时间序列长度相同，可以横向合并为一个数据框（每一列就是一条时间序列）。由于使用了 30 日移动平均，所以每条时间序列的前 29 个数据都为 NA，合并为数据框后数据框的前 29 行全部为 NA。为了美化作图，可以将这些 NA 移除。

Code

qty_list %>% 
  purrr::map_dfc(~.) %>%
  mutate(order_date = DATE$order_date) %>% # 别忘了添加日期
  slice(-c(1:29)) %>% # 删去前29行NA值
  pivot_longer(cols = -order_date, 
               names_to = "type", 
               values_to = "qty") %>% # 宽数据转长数据
  arrange(order_date, type) %>%
  ggplot(aes(x = order_date, y = qty, color = type)) +
  geom_line() +
  facet_wrap(~type, scales = "free_y") +
  theme_light() +
  theme(axis.ticks.y = element_blank(), 
        axis.text.y = element_blank(), 
        legend.position = "none")

定义 DTW 距离度量函数

Warning

DTW 的算法复杂度不低，当时间序列种类较多、时间序列较长时慎用；
即便时间序列长度不等，一样可以使用 DTW

DTW 接受一个时间序列列表，就像 qty_list 那样，但是不能有 NA。所以首先要去除每条时间序列的前 29 个数据（NA）。

Code

test <- qty_list %>% map(function(x) (x[-c(1:29)]))

定义 DTW：

Code

# Normalized DTW
ndtw <- function(x, y, ...) {
  dtw(x, y, ...,
  step.pattern = asymmetric,
  distance.only = TRUE)$normalizedDistance # 核心函数 dtw
}

# Register the distance with proxy
proxy::pr_DB$set_entry(
  FUN = ndtw, 
  names = c("nDTW"),
  loop = TRUE, 
  type = "metric", 
  distance = TRUE,
  description = "Normalized, asymmetric DTW"
)

关于 proxy::pr_DB()：

proxy::pr_DB() 是一个计算两个数据集之间的距离矩阵的函数。该函数使用 DB 距离（distance-based dissimilarity，又称为 Minkowski 距离），该距离度量在两个数据集之间计算对应观测之间的距离。
proxy::pr_DB() 可以用于计算两个数据集中的观测之间的欧几里得距离、曼哈顿距离或切比雪夫距离等距离度量。
需要注意的是，由于 proxy::pr_DB() 函数计算距离矩阵需要进行多次计算，因此对于较大的数据集可能会较慢。如果需要计算大型数据集的距离矩阵，可以考虑使用其他更高效的方法，例如并行计算或使用专用的软件包。

关于 proxy::pr_DB$set_entry()：

proxy::pr_DB$set_entry() 是一个用于设置距离矩阵中单个元素的函数。该函数用于在使用 proxy::pr_DB() 函数计算距离矩阵时，手动设置某些距离的值，以覆盖默认的距离计算。
需要注意的是，proxy::pr_DB$set_entry() 函数仅用于手动设置距离矩阵的单个元素。如果需要更广泛地修改距离矩阵，例如在计算距离矩阵时应用自定义距离度量（比如定义的 ndtw() 函数），则可以使用 proxy::pr_DB() 函数的 method 参数来指定自定义距离度量。
另外，由于距离矩阵是对称矩阵，因此 proxy::pr_DB$set_entry() 函数只需要设置距离矩阵的一半。对于省略的元素，proxy::pr_DB() 函数将使用对称性自动计算它们的值。

层次聚类

Code

tic("基于DTW的层次聚类 ")

hc_dtw <- tsclust(
  test, 
  type = "h", 
  k = 5L,
  preproc = zscore, # 标准化
  seed = 123,
  distance = "nDTW", # 使用nDTW作为距离度量
  centroid = shape_extraction,
  control = hierarchical_control(method = "average")
)

toc()
## 基于DTW的层次聚类 : 35.53 sec elapsed

可视化聚类树：

Code

# By default, the dendrogram is plotted in hierarchical clustering
plot(hc_dtw)

可视化时间序列聚类结果：

Code

# The series and the obtained prototypes can be plotted too
plot(hc_dtw, type = "sc")

可视化第 4 类聚类结果：

Code

# Focusing on the first cluster
plot(hc_dtw, type = "series", clus = 4L)

Code

plot(hc_dtw, type = "centroids", clus = 4L) # 聚类中心

Tip

当然，除了层次聚类，还可以选择 K-means、PAM 等聚类方法。

并行计算

Code

require("doParallel")
# Create parallel workers
workers <- makeCluster(4L)
# Preload dtwclust in each worker; not necessary but useful
invisible(clusterEvalQ(workers, library("dtwclust")))
# Register the backend; this step MUST be done
registerDoParallel(workers)
# Backend detected automatically

tic("并行计算 ")
hc_dtw <- tsclust(
  test, 
  type = "h", 
  k = 5L,
  preproc = zscore, # 标准化
  seed = 123,
  distance = "nDTW", # 使用nDTW作为距离度量
  centroid = shape_extraction,
  control = hierarchical_control(method = "average")
)
toc()
## 并行计算 : 27.05 sec elapsed

# Stop parallel workers
stopCluster(workers)
# Go back to sequential computation
registerDoSEQ()

Code

# By default, the dendrogram is plotted in hierarchical clustering
plot(hc_dtw)

Code

# The series and the obtained prototypes can be plotted too
plot(hc_dtw, type = "sc")

Code

# Focusing on the first cluster
plot(hc_dtw, type = "series", clus = 4L)

Code

plot(hc_dtw, type = "centroids", clus = 4L)

美化聚类树

可以借助 ape 包画出不一样的聚类树：

Code

plot(as.phylo(hc_dtw), type = "fan")

Code

plot(as.phylo(hc_dtw), type = "cladogram")

使用 sparcl 包绘制：

Code

# colors the leaves of a dendrogram
kind <-  cutree(hc_dtw, 5) # 选择聚为 5 类（设置 5 种颜色）
ColorDendrogram(
  hc_dtw, 
  y = kind, 
  labels = names(kind), 
  main = "", # 标题
  xlab = "", # X-axis labe
  sub = "", # Sub-x-axis label
  branchlength = 0.12 # 结合 y 轴刻度设置颜色线条的长度
)