本文章向大家介绍day26-2 pandas模块,主要包括day26-2 pandas模块使用实例、应用技巧、基本知识点总结和需要注意事项,具有一定的参考价值,需要的朋友可以参考一下。
pandas
- 支持文件存取操作,支持数据库(sql)、html、json、pickle、csv(txt、excel)、sas、stata、hdf等。
import pandas as pd
import numpy as np
# 约定俗成
Series(了解)
# pd.Series([1, 2, 3, 4])
# 效果相同,只是上面的dtype是int64 (占用的字节数更大)
pd.Series(np.array([1, 2, 3, 4]))
0 1
1 2
2 3
3 4
dtype: int32
DataFrame
pd.DataFrame(np.array([[1, 2, 3, 4], [5, 6, 7, 8]]))
|
0 |
1 |
2 |
3 |
0 |
1 |
2 |
3 |
4 |
1 |
5 |
6 |
7 |
8 |
# 首先是拿到日期
dates = pd.date_range('2019-01-01', periods=7)
# 从2019-01-01开始,计算7天
dates
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
'2019-01-05', '2019-01-06', '2019-01-07'],
dtype='datetime64[ns]', freq='D')
# 再拿到商品名称
goods_list = ['tesla', 'transformer', 'chongqiwawa,', 'masaladi']
# 再获取商品价格信息
prices = np.random.rand(7, 4)
# 约定俗成df
# 里面的信息 竖标题 横标题
df = pd.DataFrame(prices, index=dates, columns=goods_list)
df
|
tesla |
transformer |
chongqiwawa, |
masaladi |
2019-01-01 |
0.754105 |
0.096779 |
0.299980 |
0.327802 |
2019-01-02 |
0.816899 |
0.286751 |
0.513483 |
0.804952 |
2019-01-03 |
0.523050 |
0.410259 |
0.772978 |
0.086772 |
2019-01-04 |
0.419899 |
0.535284 |
0.946628 |
0.387901 |
2019-01-05 |
0.171370 |
0.921370 |
0.656765 |
0.346406 |
2019-01-06 |
0.810353 |
0.945966 |
0.048220 |
0.525464 |
2019-01-07 |
0.073864 |
0.951866 |
0.609959 |
0.338945 |
# 存入excel中
df.to_excel('test.xlsx')
内置方法
dtype |
查看数据类型 |
index |
查看行序列或者索引 |
columns |
查看各列的标签 |
values |
查看数据框内的数据,也即不含表头索引的数据 |
describe |
查看数据每一列的计数、均值、标准差、极值和四分位数,只可用于数值型数据 |
transpose |
转置,也可用T来操作 |
sort_index |
排序,可按行或列index排序输出 |
sort_values |
按数据值来排序 |
# 获取每一列的数据类型
df.dtypes
tesla float64
transformer float64
chongqiwawa, float64
masaladi float64
dtype: object
# 获取索引
df.index
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
'2019-01-05', '2019-01-06', '2019-01-07'],
dtype='datetime64[ns]', freq='D')
# 获取列标题
df.columns
Index(['tesla', 'transformer', 'chongqiwawa,', 'masaladi'], dtype='object')
# 获取值
df.values
array([[0.00890968, 0.05729652, 0.59607516, 0.22337263],
[0.88853146, 0.77845106, 0.97974385, 0.14025457],
[0.46915634, 0.89172479, 0.5544319 , 0.86177713],
[0.93810727, 0.02787091, 0.68399802, 0.72312706],
[0.46445576, 0.27518564, 0.63898171, 0.23633146],
[0.12982823, 0.72375128, 0.20697944, 0.86700956],
[0.98446901, 0.66713909, 0.2430983 , 0.41013451]])
# 生成描述性统计
df.describe()
|
tesla |
transformer |
chongqiwawa, |
masaladi |
count |
7.000000 |
7.000000 |
7.000000 |
7.000000 |
mean |
0.554780 |
0.488774 |
0.557615 |
0.494572 |
std |
0.395182 |
0.359985 |
0.265876 |
0.315884 |
min |
0.008910 |
0.027871 |
0.206979 |
0.140255 |
25% |
0.297142 |
0.166241 |
0.398765 |
0.229852 |
50% |
0.469156 |
0.667139 |
0.596075 |
0.410135 |
75% |
0.913319 |
0.751101 |
0.661490 |
0.792452 |
max |
0.984469 |
0.891725 |
0.979744 |
0.867010 |
# 标题互换
df.transpose()
|
2019-01-01 00:00:00 |
2019-01-02 00:00:00 |
2019-01-03 00:00:00 |
2019-01-04 00:00:00 |
2019-01-05 00:00:00 |
2019-01-06 00:00:00 |
2019-01-07 00:00:00 |
tesla |
0.008910 |
0.888531 |
0.469156 |
0.938107 |
0.464456 |
0.129828 |
0.984469 |
transformer |
0.057297 |
0.778451 |
0.891725 |
0.027871 |
0.275186 |
0.723751 |
0.667139 |
chongqiwawa, |
0.596075 |
0.979744 |
0.554432 |
0.683998 |
0.638982 |
0.206979 |
0.243098 |
masaladi |
0.223373 |
0.140255 |
0.861777 |
0.723127 |
0.236331 |
0.867010 |
0.410135 |
# 按照index排序
df.sort_index()
|
tesla |
transformer |
chongqiwawa, |
masaladi |
2019-01-01 |
0.008910 |
0.057297 |
0.596075 |
0.223373 |
2019-01-02 |
0.888531 |
0.778451 |
0.979744 |
0.140255 |
2019-01-03 |
0.469156 |
0.891725 |
0.554432 |
0.861777 |
2019-01-04 |
0.938107 |
0.027871 |
0.683998 |
0.723127 |
2019-01-05 |
0.464456 |
0.275186 |
0.638982 |
0.236331 |
2019-01-06 |
0.129828 |
0.723751 |
0.206979 |
0.867010 |
2019-01-07 |
0.984469 |
0.667139 |
0.243098 |
0.410135 |
# 对'tesla'的值进行排序,为False则是倒序
df.sort_values(by=['tesla'], ascending=True)
|
tesla |
transformer |
chongqiwawa, |
masaladi |
2019-01-01 |
0.008910 |
0.057297 |
0.596075 |
0.223373 |
2019-01-06 |
0.129828 |
0.723751 |
0.206979 |
0.867010 |
2019-01-05 |
0.464456 |
0.275186 |
0.638982 |
0.236331 |
2019-01-03 |
0.469156 |
0.891725 |
0.554432 |
0.861777 |
2019-01-02 |
0.888531 |
0.778451 |
0.979744 |
0.140255 |
2019-01-04 |
0.938107 |
0.027871 |
0.683998 |
0.723127 |
2019-01-07 |
0.984469 |
0.667139 |
0.243098 |
0.410135 |
处理缺失值
test_data = '''
5.1,,1.4,0.2
4.9,3.0,1.4,0.2
4.7,3.2,,0.2
7.0,3.2,4.7,1.4
6.4,3.2,4.5,1.5
6.9,3.1,4.9,
,,,
'''
from io import StringIO
test_data = StringIO(test_data) # 用StringIO把这个字符串当作文件读入内存
df = pd.read_csv(test_data) # 再处理成csv格式
df
|
5.1 |
Unnamed: 1 |
1.4 |
0.2 |
0 |
4.9 |
3.0 |
1.4 |
0.2 |
1 |
4.7 |
3.2 |
NaN |
0.2 |
2 |
7.0 |
3.2 |
4.7 |
1.4 |
3 |
6.4 |
3.2 |
4.5 |
1.5 |
4 |
6.9 |
3.1 |
4.9 |
NaN |
5 |
NaN |
NaN |
NaN |
NaN |
test_data = '''
c1,c2,c3,c4
5.1,,1.4,0.2
4.9,3.0,1.4,0.2
4.7,3.2,,0.2
7.0,3.2,4.7,1.4
6.4,3.2,4.5,1.5
6.9,3.1,4.9,
,,,
'''
from io import StringIO
test_data = StringIO(test_data) # 用StringIO把这个字符串当作文件读入内存
df = pd.read_csv(test_data) # 再处理成csv格式
# df.columns = ['c1', 'c2', 'c3', 'c4'] # 可以使用这个添加列标题,但是会覆盖掉最上面一排
df
|
c1 |
c2 |
c3 |
c4 |
0 |
5.1 |
NaN |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
NaN |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
5 |
6.9 |
3.1 |
4.9 |
NaN |
6 |
NaN |
NaN |
NaN |
NaN |
# 这里又成了0是行,1是列
df.dropna(axis=0)
|
c1 |
c2 |
c3 |
c4 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
df.dropna(axis=1)
# 把每一排至少有3个正常数据的值打印出来
df.dropna(thresh=3, axis=0)
|
c1 |
c2 |
c3 |
c4 |
0 |
5.1 |
NaN |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
NaN |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
5 |
6.9 |
3.1 |
4.9 |
NaN |
# 把每一列至少有5个正常数据的值打印出来
df.dropna(thresh=5, axis=1)
|
c1 |
c2 |
c3 |
c4 |
0 |
5.1 |
NaN |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
NaN |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
5 |
6.9 |
3.1 |
4.9 |
NaN |
6 |
NaN |
NaN |
NaN |
NaN |
df.columns
Index(['c1', 'c2', 'c3', 'c4'], dtype='object')
# 把c2列中没有缺失值的值打印出来
df.dropna(subset=['c2'])
|
c1 |
c2 |
c3 |
c4 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
NaN |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
5 |
6.9 |
3.1 |
4.9 |
NaN |
# 打印列标题
df.columns
Index(['c1', 'c2', 'c3', 'c4'], dtype='object')
# 把缺失值全部赋值为0
df.fillna(value=0)
|
c1 |
c2 |
c3 |
c4 |
0 |
5.1 |
0.0 |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
0.0 |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
5 |
6.9 |
3.1 |
4.9 |
0.0 |
6 |
0.0 |
0.0 |
0.0 |
0.0 |
合并数据
df1 = pd.DataFrame(np.zeros((3, 4)))
df1
|
0 |
1 |
2 |
3 |
0 |
0.0 |
0.0 |
0.0 |
0.0 |
1 |
0.0 |
0.0 |
0.0 |
0.0 |
2 |
0.0 |
0.0 |
0.0 |
0.0 |
df2 = pd.DataFrame(np.ones((3, 4)))
df2
|
0 |
1 |
2 |
3 |
0 |
1.0 |
1.0 |
1.0 |
1.0 |
1 |
1.0 |
1.0 |
1.0 |
1.0 |
2 |
1.0 |
1.0 |
1.0 |
1.0 |
# 这里的axis又反了,所以不用记,用之前先去试一下就知道了
pd.concat((df1, df2), axis=0)
|
0 |
1 |
2 |
3 |
0 |
0.0 |
0.0 |
0.0 |
0.0 |
1 |
0.0 |
0.0 |
0.0 |
0.0 |
2 |
0.0 |
0.0 |
0.0 |
0.0 |
0 |
1.0 |
1.0 |
1.0 |
1.0 |
1 |
1.0 |
1.0 |
1.0 |
1.0 |
2 |
1.0 |
1.0 |
1.0 |
1.0 |
pd.concat((df1, df2), axis=1)
|
0 |
1 |
2 |
3 |
0 |
1 |
2 |
3 |
0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
1.0 |
1.0 |
1.0 |
1 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
1.0 |
1.0 |
1.0 |
2 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
1.0 |
1.0 |
1.0 |
取值
# 把之前存的excel取出来
df = pd.read_excel('test.xlsx', header=0, index_col=0)
df
|
tesla |
transformer |
chongqiwawa, |
masaladi |
2019-01-01 |
0.754105 |
0.096779 |
0.299980 |
0.327802 |
2019-01-02 |
0.816899 |
0.286751 |
0.513483 |
0.804952 |
2019-01-03 |
0.523050 |
0.410259 |
0.772978 |
0.086772 |
2019-01-04 |
0.419899 |
0.535284 |
0.946628 |
0.387901 |
2019-01-05 |
0.171370 |
0.921370 |
0.656765 |
0.346406 |
2019-01-06 |
0.810353 |
0.945966 |
0.048220 |
0.525464 |
2019-01-07 |
0.073864 |
0.951866 |
0.609959 |
0.338945 |
# 按照索引取值
df.loc['2019-01-01']
tesla 0.754105
transformer 0.096779
chongqiwawa, 0.299980
masaladi 0.327802
Name: 2019-01-01 00:00:00, dtype: float64
# 类似于numpy取值
df.iloc[0, 0]
0.7541054007912974
df.iloc[0, :]
tesla 0.754105
transformer 0.096779
chongqiwawa, 0.299980
masaladi 0.327802
Name: 2019-01-01 00:00:00, dtype: float64
把表格传入excel文件中
df.to_excel('test.xlsx')
把表格从excel中取出来
df = pd.read_excel('test.xlsx', header=0, index_col=0)
高级(了解)
where,apply
原文地址:https://www.cnblogs.com/lucky75/p/11011610.html