Notes on "Python for Data Analysis": File Loading and Writing

Date: 2019-09-28
 

Reading CSV files

In [1]:
import pandas as pd
import sys
In [2]:
%cat examples/ex2.csv
 
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
In [3]:
#the file has no header row
pd.read_csv('examples/ex2.csv',header=None)
Out[3]:
 
   0   1   2   3      4
0  1   2   3   4  hello
1  5   6   7   8  world
2  9  10  11  12    foo
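To see why `header=None` matters, here is a self-contained sketch (the CSV contents are copied inline via `StringIO`, since the `examples/` files are not included here). Without `header=None`, pandas would consume the first data row as column names:

```python
import pandas as pd
from io import StringIO

# Inline copy of examples/ex2.csv as listed above
csv_text = "1,2,3,4,hello\n5,6,7,8,world\n9,10,11,12,foo\n"

# With header=None, pandas assigns default integer column names 0..4
df = pd.read_csv(StringIO(csv_text), header=None)
print(df.columns.tolist())   # [0, 1, 2, 3, 4]

# Without it, the first data row would be mistaken for the header
df_wrong = pd.read_csv(StringIO(csv_text))
print(df_wrong.shape)        # (2, 5) -- one row lost to the header
```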
In [4]:
pd.read_csv('examples/ex2.csv',names=['a','b','c','d','message'])
Out[4]:
 
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
In [5]:
#set one of the columns as the index
pd.read_csv('examples/ex2.csv',names=['a','b','c','d','message'],index_col='message')
Out[5]:
 
         a   b   c   d
message
hello    1   2   3   4
world    5   6   7   8
foo      9  10  11  12
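`index_col` also accepts a list of column names, which produces a hierarchical (MultiIndex) row index. A minimal sketch with a hypothetical two-key CSV (inline, so it runs without the `examples/` directory):

```python
import pandas as pd
from io import StringIO

# Hypothetical two-key CSV to illustrate a hierarchical index
csv_text = "key1,key2,value\na,1,one\na,2,two\nb,1,three\nb,2,four\n"

# A list of column names builds a two-level MultiIndex
df = pd.read_csv(StringIO(csv_text), index_col=['key1', 'key2'])
print(df.loc[('a', 2), 'value'])   # two
```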
In [6]:
list(open('examples/ex3.txt'))
Out[6]:
['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']
In [7]:
#the extra column is automatically used as the row index; when the delimiter varies, pass a regular expression as sep
pd.read_csv('examples/ex3.txt',sep='\s+')
Out[7]:
 
            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491
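A runnable sketch of the same idea, with the whitespace-delimited text copied inline via `StringIO`: because the header row has one fewer field than the data rows, pandas uses the first column as the row index.

```python
import pandas as pd
from io import StringIO

# Inline copy of the first rows of examples/ex3.txt shown above
txt = ("            A         B         C\n"
       "aaa -0.264438 -1.026059 -0.619500\n"
       "bbb  0.927272  0.302904 -0.032399\n")

# sep accepts a regular expression matching runs of whitespace
df = pd.read_csv(StringIO(txt), sep=r'\s+')
print(df.index.tolist())    # ['aaa', 'bbb']
print(df.columns.tolist())  # ['A', 'B', 'C']
```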
 

Marking specified values as NaN when reading

In [8]:
%cat examples/ex5.csv
 
something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo
In [9]:
#a dict maps column names to the sentinel values that should become NaN
pd.read_csv('examples/ex5.csv',na_values={'something':'two','message':['NA','foo']})
Out[9]:
 
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       NaN  5   6   NaN   8   world
2     three  9  10  11.0  12     NaN
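The same per-column sentinel behavior in a self-contained form (the CSV is copied inline so the example runs without the `examples/` directory):

```python
import pandas as pd
from io import StringIO

# Inline copy of examples/ex5.csv as listed above
csv_text = ("something,a,b,c,d,message\n"
            "one,1,2,3,4,NA\n"
            "two,5,6,,8,world\n"
            "three,9,10,11,12,foo\n")

# Per-column sentinels: 'NA' and 'foo' become NaN in message, 'two' in something
sentinels = {'message': ['NA', 'foo'], 'something': ['two']}
df = pd.read_csv(StringIO(csv_text), na_values=sentinels)
print(int(df['message'].isna().sum()))    # 2
print(int(df['something'].isna().sum()))  # 1
```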
In [10]:
#read only a portion of the data
pd.read_csv('examples/ex6.csv',nrows=10)
Out[10]:
 
        one       two     three      four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q
5 1.817480 0.742273 0.419395 -2.251035 Q
6 -0.776764 0.935518 -0.332872 -1.875641 U
7 -0.913135 1.530624 -0.572657 0.477252 K
8 0.358480 -0.497572 -0.367016 0.507702 S
9 -1.740877 -1.160417 -1.637830 2.172201 G
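`nrows` pairs naturally with `skiprows`, which drops specific line numbers on read. A sketch with synthetic data standing in for `examples/ex6.csv` (the real file is not included here):

```python
import pandas as pd
from io import StringIO

# Synthetic 100-row file standing in for examples/ex6.csv
csv_text = "one,two,key\n" + "".join(f"{i},{i*2},k{i}\n" for i in range(100))

# nrows reads only the first N data rows
head = pd.read_csv(StringIO(csv_text), nrows=10)
print(len(head))   # 10

# skiprows drops the given file lines (line 0 is the header, which is kept)
skipped = pd.read_csv(StringIO(csv_text), skiprows=[1, 2])
print(skipped.iloc[0]['one'])   # 2
```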
In [11]:
#read the data in chunks
chunker = pd.read_csv('examples/ex6.csv',chunksize=1000)
for piece in chunker:
    print(piece.iloc[0])
 
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              L
Name: 0, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              T
Name: 1000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              1
Name: 2000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              H
Name: 3000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              H
Name: 4000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              1
Name: 5000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              I
Name: 6000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              1
Name: 7000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              7
Name: 8000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              B
Name: 9000, dtype: object
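The usual point of `chunksize` is to aggregate across pieces without loading the whole file. A self-contained sketch (synthetic data stands in for `examples/ex6.csv`) that accumulates value counts of a key column chunk by chunk:

```python
import pandas as pd
from io import StringIO

# Synthetic file with a 'key' column, standing in for examples/ex6.csv
csv_text = "key\n" + "a\n" * 5 + "b\n" * 3 + "a\n" * 2

# Iterate over 4-row chunks and accumulate per-key counts
chunker = pd.read_csv(StringIO(csv_text), chunksize=4)
tot = pd.Series([], dtype='int64')
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
print(tot.sort_values(ascending=False))
# a    7.0
# b    3.0
```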
 

Writing CSV files

In [12]:
data = pd.read_csv('examples/ex5.csv')
data.to_csv(sys.stdout)
 
,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo
In [13]:
data.to_csv(sys.stdout,sep='|')
 
|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo
In [14]:
#mark missing values with a sentinel string
data.to_csv(sys.stdout,na_rep='NULL')
 
,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo
In [15]:
data.to_csv(sys.stdout,index=False,header=False)
 
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo
In [16]:
#write only a subset of the columns
data.to_csv(sys.stdout,index=False,columns=['a','b','c'])
 
a,b,c
1,2,3.0
5,6,
9,10,11.0
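`to_csv` can also write to any file-like object, which is handy for capturing the output as a string; `float_format` (an assumption worth checking against your pandas version, though it is a long-standing option) controls how floats are rendered:

```python
import pandas as pd
import io

df = pd.DataFrame({'a': [1.23456, 2.5], 'b': ['x', 'y']})

# Write into an in-memory buffer instead of stdout or a file on disk
buf = io.StringIO()
df.to_csv(buf, index=False, float_format='%.2f')
print(buf.getvalue())
# a,b
# 1.23,x
# 2.50,y
```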
 

JSON files

In [17]:
%cat examples/example.json
 
[{"a": 1, "b": 2, "c": 3},
 {"a": 4, "b": 5, "c": 6},
 {"a": 7, "b": 8, "c": 9}]
In [18]:
data = pd.read_json('examples/example.json')
data
Out[18]:
 
 abc
0 1 2 3
1 4 5 6
2 7 8 9
In [19]:
data.to_json(sys.stdout)
 
{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}
In [20]:
#write one record per row
data.to_json(sys.stdout,orient='records')
 
[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]
 

HTML

In [21]:
#search for and parse tables contained in <table> tags
tables = pd.read_html('examples/fdic_failed_bank_list.html')
#there is only one table
len(tables)
Out[21]:
1
In [22]:
data = tables[0]
data.head()
Out[22]:
 
                      Bank Name             City  ST   CERT                Acquiring Institution       Closing Date       Updated Date
0 Allied Bank Mulberry AR 91 Today's Bank September 23, 2016 November 17, 2016
1 The Woodbury Banking Company Woodbury GA 11297 United Bank August 19, 2016 November 17, 2016
2 First CornerStone Bank King of Prussia PA 35312 First-Citizens Bank & Trust Company May 6, 2016 September 6, 2016
3 Trust Company Bank Memphis TN 9956 The Bank of Fayette County April 29, 2016 September 6, 2016
4 North Milwaukee State Bank Milwaukee WI 20364 First-Citizens Bank & Trust Company March 11, 2016 June 16, 2016
 

Excel

In [23]:
data = pd.read_excel('examples/ex1.xlsx','Sheet1')
data
Out[23]:
 
   Unnamed: 0  a   b   c   d message
0 0 1 2 3 4 hello
1 1 5 6 7 8 world
2 2 9 10 11 12 foo
In [24]:
data.to_excel('examples/ex2.xlsx')
 

Web API

In [25]:
import requests
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
resp = requests.get(url)
resp
Out[25]:
<Response [200]>
In [26]:
data = resp.json()#data is a list of dicts
issues = pd.DataFrame(data,columns=['title','url','state','labels'])#extract a subset of the fields
issues.head()
Out[26]:
 
                                               title                                                url state labels
0 Resample on MultiIndex level is much longer th... https://api.github.com/repos/pandas-dev/pandas... open []
1 BUG: value_counts can handle the case even wit... https://api.github.com/repos/pandas-dev/pandas... open []
2 Build error on Arch linux https://api.github.com/repos/pandas-dev/pandas... open []
3 BUG: DataFrame.to_html validates formatters ha... https://api.github.com/repos/pandas-dev/pandas... open []
4 BUG DataFream.rindex方法参数的错误 https://api.github.com/repos/pandas-dev/pandas... open [{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...
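The DataFrame-from-records step can be shown without any network access; this sketch uses hypothetical sample records shaped like the GitHub issues payload above (the URLs are illustrative placeholders, not real endpoints):

```python
import pandas as pd

# Hypothetical records shaped like the GitHub issues API response
data = [
    {'title': 'BUG: example issue', 'url': 'https://api.github.com/...',
     'state': 'open', 'labels': [], 'number': 1},
    {'title': 'DOC: another issue', 'url': 'https://api.github.com/...',
     'state': 'closed', 'labels': [{'name': 'docs'}], 'number': 2},
]

# columns= selects and orders a subset of the fields in each dict
issues = pd.DataFrame(data, columns=['title', 'url', 'state', 'labels'])
print(issues.shape)              # (2, 4)
print(issues['state'].tolist())  # ['open', 'closed']
```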

Original post: https://www.cnblogs.com/FinnChan/p/11604881.html