Python数据分析之Pandas（数据操作）

Pandas 数据操作

import pandas as pd

Series索引

# Five consecutive integers (0-4) labelled 'a' through 'e'
ser_obj = pd.Series(data=range(5), index=list('abcde'))
ser_obj.head()

a    0
b    1
c    2
d    3
e    4
dtype: int32

行索引

# Row lookup by label
ser_obj['a'] # 'a' is the first label, so this is the element at position 0 (i.e. ser_obj.iloc[0])

切片索引可以按照默认索引号（左闭右开），也可以按照实际索引值（首尾都包含）

# 切片索引（按索引号）
ser_obj[1:3] #python索引默认是左闭右开

b    1
c    2
dtype: int32

# 切片索引（按索引值）
ser_obj['b':'d']

b    1
c    2
d    3
dtype: int32

不连续索引，同样可以按照默认索引号，也可以按照实际索引值

# Non-contiguous selection, form 1 (by integer position)
# .iloc states the positional intent explicitly: passing integer keys to plain []
# on a label-indexed Series is deprecated in recent pandas, which will treat
# them as labels instead of positions.
ser_obj.iloc[[0, 2, 4]]

a    0
c    2
e    4
dtype: int32

# 不连续索引表达二（按索引值）
ser_obj[['a', 'e']]

a    0
e    4
dtype: int32

布尔索引

# 布尔索引
ser_bool = ser_obj > 2
print(ser_bool)
print()
print(ser_obj[ser_bool])
print()
print(ser_obj[ser_obj > 2])

a    False
b    False
c    False
d     True
e     True
dtype: bool
d    3
e    4
dtype: int32
d    3
e    4
dtype: int32

DataFrame索引

import numpy as np

df_obj = pd.DataFrame(np.random.randn(5,4), columns = ['a', 'b', 'c', 'd'])
df_obj.head()

	a	b	c	d
0	0.983790	1.063804	0.854634	-1.269025
1	0.161653	-0.904602	-1.840041	0.138183
2	-1.256608	-1.740634	-1.653686	-0.412524
3	0.165782	1.116089	0.065008	-1.693706
4	1.313987	0.734437	-0.625647	-1.738446

列索引

# 列索引
print(type(df_obj['a'])) # 返回Series类型
df_obj['a'] # 返回对应列值

<class 'pandas.core.series.Series'>
0    0.983790
1    0.161653
2   -1.256608
3    0.165782
4    1.313987
Name: a, dtype: float64

行索引

# 行索引
print(type(df_obj.loc[0])) # 返回Series类型
df_obj.loc[0] # 返回对应行值

<class 'pandas.core.series.Series'>
a    0.983790
b    1.063804
c    0.854634
d   -1.269025
Name: 0, dtype: float64

不连续索引

#不连续列索引
df_obj[['a','c']]  #不连续列索引

	a	c
0	0.983790	0.854634
1	0.161653	-1.840041
2	-1.256608	-1.653686
3	0.165782	0.065008
4	1.313987	-0.625647

#不连续行索引
df_obj.loc[[1, 3]] #不连续行索引

	a	b	c	d
1	0.161653	-0.904602	-1.840041	0.138183
3	0.165782	1.116089	0.065008	-1.693706

混合索引

# 混合索引 loc
print(df_obj.loc[0:2, 'a']) # 连续行加列索引(这里是从0-2)
print()
print(df_obj.loc[[0,2,4], 'a']) # 不连续行加列索引

0    0.983790
1    0.161653
2   -1.256608
Name: a, dtype: float64
0    0.983790
2   -1.256608
4    1.313987
Name: a, dtype: float64

运算与对齐

`Series` 对齐操作

# Two integer Series of different lengths; their indexes only share labels 0-2
s1 = pd.Series(data=range(10, 13), index=range(3))
s2 = pd.Series(data=range(20, 25), index=range(5))

# One print call, newline-separated, emits the same lines as printing each
# heading, Series and blank separator individually
print('s1: ', s1, '', 's2: ', s2, sep='\n')

s1: 
0    10
1    11
2    12
dtype: int32
s2: 
0    20
1    21
2    22
3    23
4    24
dtype: int32

# Series 对齐运算
print(s1 + s2) # 没有对应上的部分会显示NaN
print()
print(s1.add(s2, fill_value = -1)) # 没有对应上的部分会填充-1，然后运算
print()
s3 = s1 + s2
s3_filled = s3.fillna(-1)
print(s3_filled) ## 先运算，然后NaN填充为-1

0    30.0
1    32.0
2    34.0
3     NaN
4     NaN
dtype: float64
0    30.0
1    32.0
2    34.0
3    22.0
4    23.0
dtype: float64
0    30.0
1    32.0
2    34.0
3    -1.0
4    -1.0
dtype: float64

`DataFrame` 对齐操作

import numpy as np

# A 2x2 and a 3x3 frame of ones; df2 has an extra row (2) and an extra column ('c')
df1 = pd.DataFrame(data=np.ones((2, 2)), columns=list('ab'))
df2 = pd.DataFrame(data=np.ones((3, 3)), columns=list('abc'))

# One print call, newline-separated, emits the same lines as printing each
# heading, frame and blank separator individually
print('df1: ', df1, '', 'df2: ', df2, sep='\n')

df1: 
     a    b
0  1.0  1.0
1  1.0  1.0
df2: 
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0

# DataFrame对齐操作
df1 + df2 # 没有对应上的部分会显示NaN

	a	b	c
0	2.0	2.0	NaN
1	2.0	2.0	NaN
2	NaN	NaN	NaN

df1.add(df2, fill_value = 0) # 加法操作，没有对应上的补零

	a	b	c
0	2.0	2.0	1.0
1	2.0	2.0	1.0
2	1.0	1.0	1.0

df1 - df2 # 没有对应上的部分会显示NaN

	a	b	c
0	0.0	0.0	NaN
1	0.0	0.0	NaN
2	NaN	NaN	NaN

df1.sub(df2, fill_value = 2) # Subtraction: a value missing from one frame is replaced by 2 first, then the subtraction runs (fill, then compute)

	a	b	c
0	0.0	0.0	1.0
1	0.0	0.0	1.0
2	1.0	1.0	1.0

# Add first, then replace the NaN left at the unmatched positions with 100
# (compute, then fill -- the opposite order to add(..., fill_value=...)).
# Chaining fillna gives the same df3 without mutating it in place.
df3 = (df1 + df2).fillna(100)
df3

	a	b	c
0	2.0	2.0	100.0
1	2.0	2.0	100.0
2	100.0	100.0	100.0

函数应用

可以与NumPy中的ufunc函数结合操作

# Numpy ufunc 函数
df = pd.DataFrame(np.random.randn(5,4) - 1)
df

	0	1	2	3
0	-0.938212	-2.487779	-1.805374	-1.130723
1	-0.533441	0.196536	-1.094895	-1.819312
2	-3.233318	0.255510	-1.560183	-2.404621
3	-1.956924	-2.947539	-1.640760	-0.757321
4	0.198618	0.344484	-0.893815	-0.498036

np.abs(df) #取绝对值（还有其他诸多NumPy中的函数可以操作）

	0	1	2	3
0	0.938212	2.487779	1.805374	1.130723
1	0.533441	0.196536	1.094895	1.819312
2	3.233318	0.255510	1.560183	2.404621
3	1.956924	2.947539	1.640760	0.757321
4	0.198618	0.344484	0.893815	0.498036

使用apply应用行或列数据

# 使用apply应用行或列数据
# f = lambda x : x.max() # lambda存在意义就是对简单函数的简洁表示
def f(x):
    """Return the largest value in ``x`` by delegating to ``x.max()``."""
    largest = x.max()
    return largest

df.apply(f) # 默认按行比较（得到每列的最大值）

0    0.198618
1    0.344484
2   -0.893815
3   -0.498036
dtype: float64

df.apply(lambda x : x.max(), axis=1) # axis=1: the lambda receives one row at a time, so the result holds each row's maximum

0   -0.938212
1    0.196536
2    0.255510
3   -0.757321
4    0.344484
dtype: float64

df.apply(lambda x : x.max(), axis=0) # axis=0: the lambda receives one column at a time, so the result holds each column's maximum

0    0.198618
1    0.344484
2   -0.893815
3   -0.498036
dtype: float64

使用applymap应用到每个数据

# Use applymap to apply a function to every individual element
# NOTE(review): pandas 2.1+ deprecates DataFrame.applymap in favour of DataFrame.map -- confirm against the installed version
f2 = lambda x : '%.2f' % x # format each value as a string with two decimal places
df.applymap(f2)

	0	1	2	3
0	-0.94	-2.49	-1.81	-1.13
1	-0.53	0.20	-1.09	-1.82
2	-3.23	0.26	-1.56	-2.40
3	-1.96	-2.95	-1.64	-0.76
4	0.20	0.34	-0.89	-0.50

排序

Series索引排序 & 值排序

#索引乱序生成
s4 = pd.Series([10,13,12,25,14], index = [2,1,5,3,4])
s4

2    10
1    13
5    12
3    25
4    14
dtype: int64

# 索引排序
s4.sort_index(ascending=False) #  索引倒序排列

5    12
4    14
3    25
2    10
1    13
dtype: int64

# 值排序
s4.sort_values()

2    10
5    12
1    13
4    14
3    25
dtype: int64

DataFrame 索引排序 & 值排序

df4 = pd.DataFrame(np.random.randn(3, 4), 
                   index=[1,3,2],
                   columns=[1,4,2,3])
df4

	1	4	2	3
1	0.948112	0.076323	0.089607	0.091737
3	-1.254556	1.483504	0.468995	0.286249
2	-0.806738	-0.842388	-1.127489	-0.020803

#按索引排序
df4.sort_index(ascending=False)# 对横轴按倒序排列

	1	4	2	3
3	-1.254556	1.483504	0.468995	0.286249
2	-0.806738	-0.842388	-1.127489	-0.020803
1	0.948112	0.076323	0.089607	0.091737

#按索引排序
df4.sort_index(axis=1) #列轴按序排列

	1	2	3	4
1	0.948112	0.089607	0.091737	0.076323
3	-1.254556	0.468995	0.286249	1.483504
2	-0.806738	-1.127489	-0.020803	-0.842388

# Sort the rows by the values of one column
df4.sort_values(by=1) # `by` names the column(s) to sort on; NOTE(review): per the pandas docs, with axis=1 `by` can name row labels instead, reordering the columns -- confirm

	1	4	2	3
3	-1.254556	1.483504	0.468995	0.286249
2	-0.806738	-0.842388	-1.127489	-0.020803
1	0.948112	0.076323	0.089607	0.091737

处理缺失数据

生成数据

df_data = pd.DataFrame([np.random.randn(3), [1., np.nan, np.nan],
                       [4., np.nan, np.nan], [1., np.nan, 2.]])
df_data.head()

	0	1	2
0	1.089477	-0.486706	-0.322284
1	1.000000	NaN	NaN
2	4.000000	NaN	NaN
3	1.000000	NaN	2.000000

二值化（NaN为True，非NaN为False）

# isnull
df_data.isnull()

	0	1	2
0	False	False	False
1	False	True	True
2	False	True	True
3	False	True	False

丢掉有NaN的行或列

# dropna
print(df_data.dropna()) #默认丢掉有NaN的行
print()
print(df_data.dropna(axis=1)) #丢掉有NaN的列

          0         1         2
0  1.089477 -0.486706 -0.322284
          0
0  1.089477
1  1.000000
2  4.000000
3  1.000000

填充NaN值

# fillna
df_data.fillna(-100.) # NaN值填充为-100

	0	1	2
0	1.089477	-0.486706	-0.322284
1	1.000000	-100.000000	-100.000000
2	4.000000	-100.000000	-100.000000
3	1.000000	-100.000000	2.000000

数据统计计算和描述

常用的统计计算

df_obj = pd.DataFrame(np.random.randn(5,4), columns = ['a', 'b', 'c', 'd'])
df_obj

	a	b	c	d
0	0.145119	-2.398595	0.640806	0.696701
1	-0.877139	-0.261616	-2.211734	0.140729
2	-0.644545	0.523667	-1.460002	-0.341459
3	1.369260	1.039981	0.164075	0.380755
4	0.089507	-0.371051	1.348191	-0.828315

df_obj.sum()

a    0.082203
b   -1.467614
c   -1.518663
d    0.048410
dtype: float64

df_obj.max()

a    1.369260
b    1.039981
c    1.348191
d    0.696701
dtype: float64

df_obj.min(axis=1)

0   -2.398595
1   -2.211734
2   -1.460002
3    0.164075
4   -0.828315
dtype: float64

统计描述

df_obj.describe()

	a	b	c	d
count	5.000000	5.000000	5.000000	5.000000
mean	0.016441	-0.293523	-0.303733	0.009682
std	0.878550	1.311906	1.484695	0.602578
min	-0.877139	-2.398595	-2.211734	-0.828315
25%	-0.644545	-0.371051	-1.460002	-0.341459
50%	0.089507	-0.261616	0.164075	0.140729
75%	0.145119	0.523667	0.640806	0.380755
max	1.369260	1.039981	1.348191	0.696701