使用 Tidyverse 清洗 CFPS 数据(三)

R
数据处理
一些尝试
tidyverse
Author

Rui

Published

September 26, 2022

数据验证与数据清洗

导入数据并重命名变量

Code
setwd("F:/RuiBlog/posts/R/使用tidyverse清洗CFPS数据")

## 导入数据
library(tidyverse)

file_path_2018 <- "./data/operated_data/data_2018.sav"
df_2018 <- haven::read_sav(file_path_2018) %>% as_tibble() %>% glimpse()
## Rows: 12,231
## Columns: 15
## $ id             <chr> "100051_100051502", "100160_120009102", "100286_1300051…
## $ fid16.x        <dbl+lbl> 100051, 100160, 100286, 100435, 100453, 100551, 100…
## $ provcd18       <dbl+lbl> 11, 12, 13, 13, 43, 13, 13, 13, 13, 37, 13, 13, 11,…
## $ urban18        <dbl+lbl> 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, …
## $ fml18          <dbl+lbl> 3, 2, 1, 4, 6, 3, 2, 4, 1, 2, 2, 4, 3, 4, 5, 1, 3, …
## $ fincome18      <dbl+lbl> 240000, 180000,  30000,  70000, 118000,  80000,   2…
## $ expense18      <dbl+lbl> 275300, 220200,  46860,  61096,  60000,  83000,  12…
## $ family_debts18 <dbl> 0, 470000, 50000, 180000, 0, 200000, 0, 20000, 500000, …
## $ family_asset18 <dbl> 3050000.0, 2440000.0, 600000.0, 831000.0, 3165000.0, 10…
## $ fid16.y        <dbl+lbl> 100051, 100160, 100286, 100435, 100453, 100551, 100…
## $ age            <dbl+lbl> 52, 27, 40, 31, 68, 30, 73, 30, 26, 33, 25, 31, 28,…
## $ age2           <dbl> 2704, 729, 1600, 961, 4624, 900, 5329, 900, 676, 1089, …
## $ gender         <dbl+lbl> 1, 5, 1, 1, 1, 1, 5, 5, 1, 1, 5, 5, 1, 5, 1, 1, 5, …
## $ marriage       <dbl+lbl> 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, …
## $ health         <dbl+lbl> -8,  6, -8, -8,  6, -8,  4,  7, -8, -8, -8,  5, -8,…

file_path_2016 <- "./data/operated_data/data_2016.sav"
df_2016 <- haven::read_sav(file_path_2016) %>% as_tibble() %>% glimpse()
## Rows: 12,501
## Columns: 15
## $ id             <chr> "100051_100051502", "100160_120009102", "100286_1300051…
## $ fid16.x        <dbl+lbl> 100051, 100160, 100286, 100376, 100531, 100569, 100…
## $ provcd16       <dbl+lbl> 11, 12, 13, 13, 13, 13, 13, 13, 37, 13, 13, 11, 13,…
## $ urban16        <dbl+lbl> 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ fml16          <dbl+lbl> 3, 1, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 4, 1, 5, 1, 2, …
## $ expense16      <dbl+lbl> 102140.0,  50691.0,  31900.0,  67000.0,  33600.0,  …
## $ fincome16      <dbl+lbl> 180000,  85000,  70700,  76000,  28500,   1800,  20…
## $ family_debts16 <dbl> 0, 0, 120000, 0, 0, 0, 0, 200000, 400000, 0, 12000, 0, …
## $ family_asset16 <dbl> 3180000.0, 75000.0, 158750.0, 4800.0, 536562.5, 1300.0,…
## $ fid16.y        <dbl+lbl> 100051, 100160, 100286, 100376, 100531, 100569, 100…
## $ age            <dbl+lbl> 50, 25, 38, 20, 31, 71, 28, 25, 31, 23, 29, 32, 35,…
## $ age2           <dbl> 2500, 625, 1444, 400, 961, 5041, 784, 625, 961, 529, 84…
## $ gender         <dbl+lbl> 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, …
## $ marriage       <dbl+lbl> 2, 1, 4, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, …
## $ health         <dbl+lbl> -8, -8,  7, -8, -8,  6,  5, -8, -8, -8, -8, -8,  6,…

## 更改变量名称
df_2018 %<>% select(-fid16.y) %>%
  rename(fid = fid16.x,
         age18 = age,
         age2_18 = age2,
         gender18 = gender,
         marriage18 = marriage,
         health18 = health)

df_2016 %<>% select(-fid16.y) %>%
  rename(fid = fid16.x,
         age16 = age,
         age2_16 = age2,
         gender16 = gender,
         marriage16 = marriage,
         health16 = health)

数据转换

0-1 变换

根据论文中的设置,需要将性别和婚姻状况转换为 0-1 变量。其中男性为 1,女性为 0;当前已婚为 1,其他情况为 0。

Note
  • 在原数据中缺失和异常是用负数表示的,在数据转换的过程中一定要注意保持这部分数据不变,以方便之后对缺失值的处理

  • 需要尤为注意的是:2016 年原数据中男性为 1,女性为 0;而在 2018 年原数据中男性为 1,女性为 5。

Code
## 数据转换
# 性别0-1
df_2018$gender18[df_2018$gender18 == 5] <- 0
# 婚姻状况0-1
df_2018$marriage18[(df_2018$marriage18 != 2) & (df_2018$marriage18 > 0)] <- 0
df_2018$marriage18[df_2018$marriage18 == 2] <- 1
df_2016$marriage16[(df_2016$marriage16 != 2) & (df_2016$marriage16 > 0)] <- 0
df_2016$marriage16[df_2016$marriage16 == 2] <- 1

经济数据处理

虽然论文中没有明确提及,但这里还是尝试对经济变量进行平减处理。以 2016 年为基期,对 2018 年经济数据做平减处理以消除通货膨胀的影响。

  • 对于 expense18 和 fincome18,使用居民消费价格指数平减。 以 2016 年为基期,2018年居民消费价格指数为103.729084。

  • 对于 family_debts18 和 family_asset18,使用固定资产投资价格指数平减。 以 2016 年为基期。2018 年固定资产投资价格指数为 111.518519。

Code
library(magrittr)
df_2018 %<>% mutate(expense18 = expense18/1.03729084,
                    fincome18 = fincome18/1.03729084,
                    family_debts18 = family_debts18/1.11518519,
                    family_asset18 = family_asset18/1.11518519)

glimpse(df_2018)
## Rows: 12,231
## Columns: 14
## $ id             <chr> "100051_100051502", "100160_120009102", "100286_1300051…
## $ fid            <dbl+lbl> 100051, 100160, 100286, 100435, 100453, 100551, 100…
## $ provcd18       <dbl+lbl> 11, 12, 13, 13, 43, 13, 13, 13, 13, 37, 13, 13, 11,…
## $ urban18        <dbl+lbl> 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, …
## $ fml18          <dbl+lbl> 3, 2, 1, 4, 6, 3, 2, 4, 1, 2, 2, 4, 3, 4, 5, 1, 3, …
## $ fincome18      <dbl> 231371.946, 173528.959, 28921.493, 67483.484, 113757.87…
## $ expense18      <dbl> 265402.903, 212283.760, 45175.372, 58899.585, 57842.986…
## $ family_debts18 <dbl> 0.000, 421454.664, 44835.603, 161408.169, 0.000, 179342…
## $ family_asset18 <dbl> 2734971.76, 2187977.41, 538027.23, 745167.72, 2838093.6…
## $ age18          <dbl+lbl> 52, 27, 40, 31, 68, 30, 73, 30, 26, 33, 25, 31, 28,…
## $ age2_18        <dbl> 2704, 729, 1600, 961, 4624, 900, 5329, 900, 676, 1089, …
## $ gender18       <dbl+lbl> 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, …
## $ marriage18     <dbl+lbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, …
## $ health18       <dbl+lbl> -8,  6, -8, -8,  6, -8,  4,  7, -8, -8, -8,  5, -8,…
glimpse(df_2016) # 2016年观测比2018年多
## Rows: 12,501
## Columns: 14
## $ id             <chr> "100051_100051502", "100160_120009102", "100286_1300051…
## $ fid            <dbl+lbl> 100051, 100160, 100286, 100376, 100531, 100569, 100…
## $ provcd16       <dbl+lbl> 11, 12, 13, 13, 13, 13, 13, 13, 37, 13, 13, 11, 13,…
## $ urban16        <dbl+lbl> 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ fml16          <dbl+lbl> 3, 1, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 4, 1, 5, 1, 2, …
## $ expense16      <dbl+lbl> 102140.0,  50691.0,  31900.0,  67000.0,  33600.0,  …
## $ fincome16      <dbl+lbl> 180000,  85000,  70700,  76000,  28500,   1800,  20…
## $ family_debts16 <dbl> 0, 0, 120000, 0, 0, 0, 0, 200000, 400000, 0, 12000, 0, …
## $ family_asset16 <dbl> 3180000.0, 75000.0, 158750.0, 4800.0, 536562.5, 1300.0,…
## $ age16          <dbl+lbl> 50, 25, 38, 20, 31, 71, 28, 25, 31, 23, 29, 32, 35,…
## $ age2_16        <dbl> 2500, 625, 1444, 400, 961, 5041, 784, 625, 961, 529, 84…
## $ gender16       <dbl+lbl> 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, …
## $ marriage16     <dbl+lbl> 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, …
## $ health16       <dbl+lbl> -8, -8,  7, -8, -8,  6,  5, -8, -8, -8, -8, -8,  6,…

数据验证

跨年份合并

由于 2016 年与 2018 年数据有较强的联系,故需要同时考虑两张表的信息。这里采用的办法是将两张表合并,这样能够同时处理两年的数据。

Code
## 2016年数据与2018年数据内连接
merged_data <- df_2016 %>% 
  inner_join(df_2018, by = "id") # 相当于取交集

缺失值定义

所有变量均有实际意义,并且从 SPSS 变量视图中可以看出:缺失值和异常值全部被赋值为负数。故将所有小于 0 的值变为 NA。

Code
merged_data[merged_data < 0] <- NA

数据类型规约

Code
## 数据类型规整
merged_data$fid.x %<>% as.character()
merged_data$fid.y %<>% as.character()

merged_data$fml16 %<>% as.integer()
merged_data$fml18 %<>% as.integer()
merged_data$age16 %<>% as.integer()
merged_data$age18 %<>% as.integer()
merged_data$age2_16 %<>% as.integer()
merged_data$age2_18 %<>% as.integer()

merged_data$provcd16 %<>% as.factor()
merged_data$provcd18 %<>% as.factor()
merged_data$urban16 %<>% as.factor()
merged_data$urban18 %<>% as.factor()
merged_data$gender16 %<>% as.factor()
merged_data$gender18 %<>% as.factor()
merged_data$marriage16 %<>% as.factor()
merged_data$marriage18 %<>% as.factor()
merged_data$health16 %<>% as.factor()
merged_data$health18 %<>% as.factor()

merged_data %<>% select(id, fid.x, fid.y, ends_with("16"), ends_with("8"), everything())

数据验证

我们需要事先定义好验证规则:

  • 规则1. 户主的性别不变

  • 规则2. 省份不变(有争议)

  • 规则3. 城市或农村不变(有争议)

  • 规则4. 年龄大于等于 18

  • 规则5. 两年之内年龄差大于等于 1,小于等于 2

  • 规则6. 家庭成员个数大于等于 1,且两年内家庭成员个数变动有限

Code
library(validate)

v <- validator(gender16 == gender18,
               provcd16 == provcd18,
               urban16 == urban18,
               age16 >= 18,
               age18 - age16 >= 1,
               age18 - age16 <= 2,
               abs(fml18-fml16) <= 5)

cf <- confront(merged_data, v)
summary(cf)
##   name items passes fails nNA error warning                  expression
## 1   V1  6739   6688     8  43 FALSE   FALSE        gender16 == gender18
## 2   V2     0      0     0   0  TRUE   FALSE        provcd16 == provcd18
## 3   V3  6739   6405   219 115 FALSE   FALSE          urban16 == urban18
## 4   V4  6739   6737     2   0 FALSE   FALSE        age16 - 18 >= -1e-08
## 5   V5  6739   6715    24   0 FALSE   FALSE age18 - age16 - 1 >= -1e-08
## 6   V6  6739   6690    49   0 FALSE   FALSE  age18 - age16 - 2 <= 1e-08
## 7   V7  6739   6711    28   0 FALSE   FALSE     abs(fml18 - fml16) <= 5

barplot(cf, main = "merged data")

summary() 的结果和柱状图可知,除了规则 2 外每一条验证规则都有部分个例不通过。违背验证规则 1、4、5、6、7 的数据无论是从逻辑上还是从经济意义上都说不通,故作删除处理。而违背验证规则 3 的数据,由于本文认为省份虚拟变量和城市虚拟变量不随时间变动,故做删除处理。

Code
head(values(cf))
##        V1   V3   V4   V5   V6   V7
## [1,] TRUE TRUE TRUE TRUE TRUE TRUE
## [2,] TRUE TRUE TRUE TRUE TRUE TRUE
## [3,] TRUE TRUE TRUE TRUE TRUE TRUE
## [4,] TRUE TRUE TRUE TRUE TRUE TRUE
## [5,] TRUE TRUE TRUE TRUE TRUE TRUE
## [6,] TRUE TRUE TRUE TRUE TRUE TRUE

temp <- apply(values(cf), MARGIN = 1, FUN = sum)
temp <- (temp == ncol(values(cf)) | is.na(temp)) # 6 条验证规则未通过
merged_data <- merged_data[temp, ]

缺失值探索

Code
# 每一列缺失值的数量
merged_data %>%
  summarise(
    across(everything(), function (x) sum(is.na(x)))
  ) %>% knitr::kable()
id fid.x fid.y provcd16 urban16 fml16 expense16 fincome16 family_debts16 family_asset16 age16 age2_16 gender16 marriage16 health16 provcd18 urban18 fml18 fincome18 expense18 family_debts18 family_asset18 age18 age2_18 gender18 marriage18 health18
0 0 0 0 102 0 0 16 20 153 0 0 0 0 692 0 35 0 0 8 55 243 0 0 43 43 863
Code
# 缺失模式探索
library(VIM)
VIM::aggr(merged_data, sortComb = TRUE, sortVar = TRUE, only.miss = TRUE)

## 
##  Variables sorted by number of missings: 
##        Variable       Count
##        health18 0.134465566
##        health16 0.107821751
##  family_asset18 0.037862262
##  family_asset16 0.023839202
##         urban16 0.015892801
##  family_debts18 0.008569648
##        gender18 0.006699907
##      marriage18 0.006699907
##         urban18 0.005453412
##  family_debts16 0.003116236
##       fincome16 0.002492988
##       expense18 0.001246494
##              id 0.000000000
##           fid.x 0.000000000
##           fid.y 0.000000000
##        provcd16 0.000000000
##           fml16 0.000000000
##       expense16 0.000000000
##           age16 0.000000000
##         age2_16 0.000000000
##        gender16 0.000000000
##      marriage16 0.000000000
##        provcd18 0.000000000
##           fml18 0.000000000
##       fincome18 0.000000000
##           age18 0.000000000
##         age2_18 0.000000000

删除连续两年核心变量缺失的观测(记录):

Code
clean_data <- merged_data %>%
  filter(!(is.na(family_asset16) | is.na(family_asset18))) %>%
  filter(!(is.na(family_debts16) & is.na(family_debts18)))

简单插补

对于 health18,health16,urban18,urban16,gender18,gender16,marriage18 和 marriage16 这些在短时间内不应该变化或者变化不大的变量,其缺失值应该用两年份对应变量的值相互插补。即 gender18 的缺失值使用对应的 gender16 的值填补,gender16 的缺失值使用对应的 gender18 的值填补……

Code
# gender18缺失值用gender16填补
clean_data %<>% 
  mutate(gender18 = if_else(is.na(gender18), gender16, gender18))

# marriage18缺失值用marriage16填补
clean_data %<>%
  mutate(marriage18 = if_else(is.na(marriage18), marriage16, marriage18))

# urban16和urban18相互插补
clean_data %<>%
  mutate(urban16 = if_else(is.na(urban16), urban18, urban16))
clean_data %<>%
  mutate(urban18 = if_else(is.na(urban18), urban16, urban18))

# health16和health18相互插补
clean_data %<>%
  mutate(health16 = if_else(is.na(health16), health18, health16))
clean_data %<>%
  mutate(health18 = if_else(is.na(health18), health16, health18))
Note

使用相互插补处理不代表没有缺失值,反而存在同为缺失值的情况

Code
library(mice)
md.pattern(clean_data)

##      id fid.x fid.y provcd16 fml16 expense16 family_asset16 age16 age2_16
## 5585  1     1     1        1     1         1              1     1       1
## 369   1     1     1        1     1         1              1     1       1
## 39    1     1     1        1     1         1              1     1       1
## 1     1     1     1        1     1         1              1     1       1
## 10    1     1     1        1     1         1              1     1       1
## 11    1     1     1        1     1         1              1     1       1
## 12    1     1     1        1     1         1              1     1       1
## 2     1     1     1        1     1         1              1     1       1
## 11    1     1     1        1     1         1              1     1       1
## 1     1     1     1        1     1         1              1     1       1
## 1     1     1     1        1     1         1              1     1       1
##       0     0     0        0     0         0              0     0       0
##      gender16 marriage16 provcd18 fml18 fincome18 family_asset18 age18 age2_18
## 5585        1          1        1     1         1              1     1       1
## 369         1          1        1     1         1              1     1       1
## 39          1          1        1     1         1              1     1       1
## 1           1          1        1     1         1              1     1       1
## 10          1          1        1     1         1              1     1       1
## 11          1          1        1     1         1              1     1       1
## 12          1          1        1     1         1              1     1       1
## 2           1          1        1     1         1              1     1       1
## 11          1          1        1     1         1              1     1       1
## 1           1          1        1     1         1              1     1       1
## 1           1          1        1     1         1              1     1       1
##             0          0        0     0         0              0     0       0
##      gender18 marriage18 expense18 fincome16 family_debts16 urban16 urban18
## 5585        1          1         1         1              1       1       1
## 369         1          1         1         1              1       1       1
## 39          1          1         1         1              1       1       1
## 1           1          1         1         1              1       1       1
## 10          1          1         1         1              1       0       0
## 11          1          1         1         1              1       0       0
## 12          1          1         1         1              0       1       1
## 2           1          1         1         1              0       1       1
## 11          1          1         1         0              1       1       1
## 1           1          1         1         0              1       1       1
## 1           1          1         0         1              1       1       1
##             0          0         1        12             14      21      21
##      family_debts18 health16 health18    
## 5585              1        1        1   0
## 369               1        0        0   2
## 39                0        1        1   1
## 1                 0        0        0   3
## 10                1        1        1   2
## 11                1        0        0   4
## 12                1        1        1   1
## 2                 1        0        0   3
## 11                1        1        1   1
## 1                 1        0        0   3
## 1                 1        1        1   1
##                  40      384      384 877

缺失值的体量太小,直接删除:

Code
clean_data %<>% drop_na()

# 每一列缺失值的数量
clean_data %>%
  summarise(
    across(everything(), function (x) sum(is.na(x)))
  ) %>% knitr::kable()
id fid.x fid.y provcd16 urban16 fml16 expense16 fincome16 family_debts16 family_asset16 age16 age2_16 gender16 marriage16 health16 provcd18 urban18 fml18 fincome18 expense18 family_debts18 family_asset18 age18 age2_18 gender18 marriage18 health18
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

保存数据

Code
haven::write_sav(clean_data, "./data/operated_data/clean_data.sav")