Rui - 使用 seaborn 探索数据

Code

import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Load the GBK-encoded survey CSV. The first CSV column ("编号") stays an
# ordinary column and the rows get the default 0..n-1 index, which is exactly
# the layout the former index_col=0 + reset_index() round trip produced.
Data = pd.read_csv('F:/RuiBlog/posts/Python/EDA作图/data/data.CSV', encoding="gbk")
Data

	编号	性别	年龄	省份	受教育程度	居住地类型	职业	个人月收入	家庭月收入	自身行为13	...	态度错觉31	态度错觉34	态度错觉三	态度错觉41	态度错觉42	态度错觉43	态度错觉44	态度错觉四	态度错觉总均值	态度错觉错估方向
0	10	女	30	5	初中	农村	4	3.0	NaN	5	...	-0.2931	-0.5315	-0.4123	0.6912	-0.3456	-0.8613	-0.5042	-0.254975	-0.5578	负
1	12	女	30	5	初中	城镇	1	1.0	NaN	5	...	-0.2931	-1.5315	-0.9123	-0.3088	0.6544	0.1387	0.4958	0.245025	-0.4578	负
2	487	女	32	5	初中	农村	2	3.0	NaN	3	...	0.2931	0.4685	0.3808	-0.3088	-1.3456	0.1387	-0.5042	-0.504975	-0.8578	负
3	805	女	52	5	初中	城市	12	3.0	NaN	4	...	-0.2931	-0.5315	-0.4123	-0.3088	-0.3456	0.1387	0.4958	-0.004975	-0.0578	负
4	663	男	22	5	中专/职高/高中	城镇	14	NaN	4.0	5	...	-0.2931	-0.5315	-0.4123	0.6912	0.6544	0.1387	0.4958	0.495025	0.2422	正
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
947	161	女	22	2	博士	城市	14	NaN	2.0	5	...	-1.2931	-0.5315	-0.9123	0.6912	-0.3456	-1.8613	0.4958	-0.254975	-0.5578	负
948	887	女	30	5	博士	城市	9	5.0	NaN	4	...	-0.2931	-1.5315	-0.9123	-1.3088	-0.3456	0.1387	-0.5042	-0.504975	-0.5578	负
949	600	女	32	4	博士	城市	6	6.0	NaN	5	...	-0.2931	-0.5315	-0.4123	-1.3088	-0.3456	-1.8613	-1.5042	-1.254975	-0.9578	负
950	538	女	36	1	博士	城镇	7	9.0	NaN	4	...	-0.2931	-0.5315	-0.4123	0.6912	-1.3456	-0.8613	-0.5042	-0.504975	-0.2578	负
951	532	女	37	1	博士	城镇	7	9.0	NaN	3	...	-0.2931	-1.5315	-0.9123	-1.3088	0.6544	-0.8613	-0.5042	-0.504975	-0.6578	负

952 rows × 102 columns

Code

# Keep only the four overall-mean columns and give them short ASCII names,
# because Chinese column names tend to come out garbled in the figures.
short_names = {
    '自身行为总均值': "A",
    '他人行为总均值': "B",
    '自身态度总均值': "C",
    '他人态度总均值': "D",
}
df1 = Data[list(short_names)].rename(columns=short_names)
df1.head()

	A	B	C	D
0	3.9	4.0	3.7	3.8
1	4.2	4.1	3.7	3.9
2	2.8	2.8	4.0	3.5
3	3.9	3.8	4.7	4.3
4	4.5	4.6	4.6	4.6

Code

df1.describe().round(2)  # summary statistics, rounded to two decimals

	A	B	C	D
count	952.00	952.00	952.00	952.00
mean	3.91	3.49	4.36	4.08
std	0.57	0.71	0.41	0.54
min	1.30	1.20	1.40	1.20
25%	3.50	3.00	4.10	3.80
50%	4.00	3.50	4.40	4.20
75%	4.40	4.00	4.60	4.50
max	5.00	5.00	5.00	5.00

箱线图

导入相关库并作图：

Code

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

Code

# Use Times New Roman and keep the minus sign rendering correctly.
plt.rcParams.update({
    'font.sans-serif': ['Times New Roman'],
    'axes.unicode_minus': False,
})

colnames = df1.columns.tolist()  # one subplot per column
plt.figure(figsize=(10, 10))  # figure width and height in inches
for position, name in enumerate(colnames, start=1):
    plt.subplot(2, 2, position)  # 2 x 2 grid of subplots
    # NOTE: this is the old template call (Series passed positionally with
    # orient="v"); the text below reports that recent seaborn releases draw
    # the boxes horizontally for it.
    sns.boxplot(df1[name], orient="v", width=0.5)
    plt.ylabel(name, fontsize=16)  # y-axis label and its font size
    plt.yticks(fontsize=12)  # y-axis tick label font size

plt.tight_layout()
plt.show()

上面一个代码块来自我之前保存的模板。今天运行突然发现不好用了，明明已经设置了垂直放置箱线图（orient="v"）但是输出的图片中箱体依然是水平放置（orient="h"）。上网一查果然 seaborn 发生了重大更新。在看了 Gallery 中的一些模板后惊讶地发现 pandas 以及更新后的 seaborn 越来越有 tidyverse 的味道了。主要体现在对数据形式的敏感程度上：在向 seaborn 传入数据前经常需要使用 pandas 中的 melt 函数将宽数据转换为长数据，而这在以前 seaborn 绘图的时候是没有那么讲究的（或许是一直都有只是我没注意？）。

下面的示例来自 seaborn 官网的 Gallery，首先要将原始的宽数据转换成长数据：

Code

# Reshape wide -> long: one row per (variable, value) pair.
df1_long = df1.melt(value_vars=df1.columns)
df1_long

	variable	value
0	A	3.9
1	A	4.2
2	A	2.8
3	A	3.9
4	A	4.5
...	...	...
3803	D	3.8
3804	D	3.8
3805	D	3.4
3806	D	4.1
3807	D	3.7

3808 rows × 2 columns

Code

sns.set_theme(style="ticks")

# Create the figure and its single axes
f, ax = plt.subplots(figsize=(7, 6))

# One vertical box per variable, drawn from the long-form data.
# The palette is tied to an explicit hue because passing `palette` without
# `hue` is deprecated in recent seaborn; the legend is switched off since it
# would only repeat the x tick labels.
sns.boxplot(data=df1_long, x="variable", y="value", hue="variable",
            width=.6, palette="vlag", legend=False, ax=ax)

# Tweak the visual presentation
ax.xaxis.grid(True)
ax.set(ylabel="")
sns.despine(trim=True, left=True)

直方图-核密度估计图-概率图（probability plot）

Code

import numpy as np  # only needed for the x grid of the fitted normal curve

# Each variable takes two panels (histogram/KDE + probability plot) and there
# are four panels per row, so round the row count up to fit every variable.
data_cols = 4
n_panels = 2 * df1.shape[1]
data_rows = (n_panels + data_cols - 1) // data_cols

plt.rcParams['font.sans-serif'] = ['Times New Roman']  # use Times New Roman
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

plt.figure(figsize=(4*data_cols, 4*data_rows))

for k, col in enumerate(df1.columns):
    values = df1[col].dropna()

    # Left panel: density histogram with a KDE curve (replaces the deprecated
    # sns.distplot) plus a normal distribution fitted to the data.
    ax = plt.subplot(data_rows, data_cols, 2*k + 1)
    sns.histplot(values, kde=True, stat="density", ax=ax)
    mu, sigma = stats.norm.fit(values)
    x_min, x_max = ax.get_xlim()
    grid = np.linspace(x_min, x_max, 200)
    ax.plot(grid, stats.norm.pdf(grid, mu, sigma), color="black")
    ax.set_xlim(x_min, x_max)  # keep the limits chosen by the histogram
    plt.xticks([])  # hide x ticks to keep the small panels clean
    plt.yticks([])  # hide y ticks to keep the small panels clean
    plt.xlabel(col, fontsize=14)  # x-axis label and its font size

    # Right panel: probability plot of the same variable against the normal.
    ax = plt.subplot(data_rows, data_cols, 2*k + 2)
    stats.probplot(df1[col], plot=plt)
    plt.xticks([])
    plt.yticks([])

plt.tight_layout()
plt.show()

热力图

Code

variables_corr = df1.corr()  # correlation matrix of the variables

plt.figure(figsize=(10, 10))
# Heatmap with square cells and annotated values (font size 14); the colorbar
# is shown, shrunk to 80% of its default size. robust=True derives the colour
# limits from robust quantiles instead of the extreme values.
# NOTE(review): center=1 puts the colormap midpoint at 1. If a fixed 0-1
# colour range was intended (the coefficients here are all positive),
# vmin=0, vmax=1 would express that directly - confirm.
heat = sns.heatmap(data=variables_corr, square=True,
                   annot=True, annot_kws={'fontsize': 14},
                   cbar_kws={'shrink': 0.8},
                   center=1, robust=True)

# Optional: change the colorbar tick font size with
#   heat.collections[0].colorbar.ax.tick_params(labelsize=14)

plt.xlabel('Variables', fontsize=18)  # x-axis label and its font size
plt.ylabel('Variables', fontsize=18)  # y-axis label and its font size
plt.xticks(fontsize=14)  # x-axis tick label font size
plt.yticks(fontsize=14)  # y-axis tick label font size
plt.title('Heatmap of Variables', fontsize=26)  # figure title and its font size

plt.show()

可以更改热力图的配色，配色方案参见：http://seaborn.pydata.org/tutorial/color_palettes.html

Code

variables_corr = df1.corr()  # correlation matrix of the variables

plt.figure(figsize=(10, 10))
# Same heatmap as before but with the "viridis" colormap: square cells,
# annotated values (font size 14) and a colorbar shrunk to 80% of its default
# size. robust=True derives the colour limits from robust quantiles instead
# of the extreme values.
heat = sns.heatmap(data=variables_corr, square=True,
                   annot=True, annot_kws={'fontsize': 14},
                   cbar_kws={'shrink': 0.8},
                   cmap="viridis", robust=True)

# Optional: change the colorbar tick font size with
#   heat.collections[0].colorbar.ax.tick_params(labelsize=14)

plt.xlabel('Variables', fontsize=18)  # x-axis label and its font size
plt.ylabel('Variables', fontsize=18)  # y-axis label and its font size
plt.xticks(fontsize=14)  # x-axis tick label font size
plt.yticks(fontsize=14)  # y-axis tick label font size
plt.title('Heatmap of Variables', fontsize=26)  # figure title and its font size

plt.show()