Python数据分析之NumPy（高级篇）

一些更高级的ndarray处理

where和一些其他的逻辑运算

np.where(cond,x,y)：满足条件（cond）输出x，不满足输出y

x_arr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
y_arr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])
print(np.where(cond, x_arr, y_arr))

[ 1.1  2.2  1.3  1.4  2.5]

arr = np.random.randn(4,4)
print(arr)
print(np.where(arr > 0, 2, -2))
print(np.where(arr > 0, 2, arr))

[[ -1.10484247e+00  -3.82422727e-01  -3.24361549e-01   1.21286234e+00]
 [  1.54499855e-01  -4.77728163e-04   1.44621074e+00  -2.64241611e-03]
 [  1.36394862e+00   6.96638259e-02  -2.75237740e-01  -3.32892881e-01]
 [ -1.37165175e+00   1.79997993e-01  -1.13509664e-01   1.88373639e+00]]
[[-2 -2 -2  2]
 [ 2 -2  2 -2]
 [ 2  2 -2 -2]
 [-2  2 -2  2]]
[[ -1.10484247e+00  -3.82422727e-01  -3.24361549e-01   2.00000000e+00]
 [  2.00000000e+00  -4.77728163e-04   2.00000000e+00  -2.64241611e-03]
 [  2.00000000e+00   2.00000000e+00  -2.75237740e-01  -3.32892881e-01]
 [ -1.37165175e+00   2.00000000e+00  -1.13509664e-01   2.00000000e+00]]

np.where可以嵌套使用

cond_1 = np.array([True, False, True, True, False])
cond_2 = np.array([False, True, False, True, False])
result = np.where(cond_1 & cond_2, 0, 
          np.where(cond_1, 1, np.where(cond_2, 2, 3)))
print(result)

[1 2 1 0 3]

arr = np.random.randn(10)
print(arr)
print((arr > 0).sum()) # 统计数组中大于0的元素个数（布尔值True按1计数）

[ 0.27350655 -1.51093462  0.26835915 -0.45991855  1.34450904 -1.86871203
  0.04308971  1.69640444 -0.02191351 -0.43875275]
5

bools = np.array([False, False, True, False])
print(bools.any()) # 有一个为True则返回True
print(bools.all()) # 有一个为False则返回False

True
False

reshape（数组变形）

numpy可以很容易地把一维数组转成二维数组，三维数组。

import numpy as np

arr = np.arange(8)
print("(4,2):\n", arr.reshape((4,2)))
print()
print("(2,2,2):\n", arr.reshape((2,2,2)))

(4,2):
 [[0 1]
 [2 3]
 [4 5]
 [6 7]]
(2,2,2):
 [[[0 1]
  [2 3]]
 [[4 5]
  [6 7]]]

-1（维度自动推算）

如果我们在某一个维度上写上-1，numpy会帮我们自动推导出正确的维度

arr = np.arange(15)
print(arr.reshape((5,-1)))
print(arr.reshape((5,-1)).shape)

[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]
 [12 13 14]]
(5, 3)

ravel（拉平数组）

# 高维数组用ravel来拉平成为一维数组
arr = np.arange(15).reshape((5, -1))
print(arr.ravel())

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]

concatenate（连接数组）

arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
print(np.concatenate([arr1, arr2], axis = 0))  # 按行连接
print(np.concatenate([arr1, arr2], axis = 1))  # 按列连接

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]
[[ 1  2  3  7  8  9]
 [ 4  5  6 10 11 12]]

连接的另一种表述：垂直stack（vstack）与水平stack（hstack）

print(np.vstack((arr1, arr2))) # 垂直堆叠
print(np.hstack((arr1, arr2))) # 水平堆叠

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]
[[ 1  2  3  7  8  9]
 [ 4  5  6 10 11 12]]

split（拆分数组）

arr = np.random.rand(5,5)
print(arr)

[[ 0.08218151  0.25291976  0.990262    0.74980044  0.92433676]
 [ 0.57215647  0.88759783  0.67939949  0.18618301  0.64810013]
 [ 0.21424794  0.5812622   0.33170632  0.40780156  0.00946797]
 [ 0.46223634  0.53574553  0.25289433  0.33226224  0.26110024]
 [ 0.81823359  0.98863697  0.13713923  0.3520669   0.38301044]]

first, second, third = np.split(arr, [1,3], axis = 0) # 按行拆分
print(first)
print()
print(second)
print()
print(third)

[[ 0.08218151  0.25291976  0.990262    0.74980044  0.92433676]]
[[ 0.57215647  0.88759783  0.67939949  0.18618301  0.64810013]
 [ 0.21424794  0.5812622   0.33170632  0.40780156  0.00946797]]
[[ 0.46223634  0.53574553  0.25289433  0.33226224  0.26110024]
 [ 0.81823359  0.98863697  0.13713923  0.3520669   0.38301044]]

first, second, third = np.split(arr, [1, 3], axis = 1) # 按列拆分
print(first)
print()
print(second)
print()
print(third)

[[ 0.08218151]
 [ 0.57215647]
 [ 0.21424794]
 [ 0.46223634]
 [ 0.81823359]]
[[ 0.25291976  0.990262  ]
 [ 0.88759783  0.67939949]
 [ 0.5812622   0.33170632]
 [ 0.53574553  0.25289433]
 [ 0.98863697  0.13713923]]
[[ 0.74980044  0.92433676]
 [ 0.18618301  0.64810013]
 [ 0.40780156  0.00946797]
 [ 0.33226224  0.26110024]
 [ 0.3520669   0.38301044]]

堆叠辅助

arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr2 = np.random.randn(3, 2)
#r_用于按行堆叠
print(np.r_[arr1, arr2])
print()
#c_用于按列堆叠
print(np.c_[np.r_[arr1, arr2], arr])
print()
#切片直接转为数组
print(np.c_[1:6, -10:-5])
print()

[[ 0.          1.        ]
 [ 2.          3.        ]
 [ 4.          5.        ]
 [ 0.04811148 -1.93674347]
 [ 1.19646481  0.17346639]
 [-1.4388562  -1.41584843]]
[[ 0.          1.          0.        ]
 [ 2.          3.          1.        ]
 [ 4.          5.          2.        ]
 [ 0.04811148 -1.93674347  3.        ]
 [ 1.19646481  0.17346639  4.        ]
 [-1.4388562  -1.41584843  5.        ]]
[[  1 -10]
 [  2  -9]
 [  3  -8]
 [  4  -7]
 [  5  -6]]

repeat（数组重复）

repeat(a,repeats, axis=None)

按元素重复

arr = np.arange(3)
print(arr.repeat(3))
print(arr.repeat([2,3,4]))
print()

[0 0 0 1 1 1 2 2 2]
[0 0 1 1 1 2 2 2 2]

指定axis来重复

arr = np.random.rand(2, 2)
print(arr)

[[ 0.468845    0.43227877]
 [ 0.13822954  0.14501615]]

print(arr.repeat(2, axis=0))
print(arr.repeat(2, axis=1))

[[ 0.468845    0.43227877]
 [ 0.468845    0.43227877]
 [ 0.13822954  0.14501615]
 [ 0.13822954  0.14501615]]
[[ 0.468845    0.468845    0.43227877  0.43227877]
 [ 0.13822954  0.13822954  0.14501615  0.14501615]]

tile(按规则重复数组)

tile通过重复给定的次数来构造数组。tile(A, reps)：初始数组是A，重复规则是reps。reps可以是整数或元组：为整数时，表示沿最后一个轴（列方向）重复的次数；为元组(m, n)时，表示沿行方向重复m次、沿列方向重复n次。

arr = np.arange(4).reshape((2, 2))
print(np.tile(arr, 2))
print(np.tile(arr, (2,3)))

[[0 1 0 1]
 [2 3 2 3]]
[[0 1 0 1 0 1]
 [2 3 2 3 2 3]
 [0 1 0 1 0 1]
 [2 3 2 3 2 3]]

numpy的文件输入输出

读取csv文件作为数组

import numpy as np
arr = np.loadtxt('array_ex.txt', delimiter=',')
print(arr)

[[ 0.580052  0.18673   1.040717  1.134411]
 [ 0.194163 -0.636917 -0.938659  0.124094]
 [-0.12641   0.268607 -0.695724  0.047428]
 [-1.484413  0.004176 -0.744203  0.005487]
 [ 2.302869  0.200131  1.670238 -1.88109 ]
 [-0.19323   1.047233  0.482803  0.960334]]

数组文件读写

arr = np.arange(10)
np.save('some_array', arr)

print(np.load('some_array.npy'))

[0 1 2 3 4 5 6 7 8 9]

多个数组可以一起打包存储到一个.npz文件中（np.savez本身不压缩；如需压缩请使用np.savez_compressed）

arr2 = np.arange(15).reshape(3,5)
np.savez('array_archive.npz', a=arr, b=arr2)

arch = np.load('array_archive.npz')
print(arch['a'])
print(arch['b'])

[0 1 2 3 4 5 6 7 8 9]
[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]]

用numpy写一个softmax

步骤：

数据预处理
计算exponential
每行求和
每一行除以计算的和

import numpy as np
# 产生（10，10）随机数
m = np.random.rand(10, 10) * 10 + 1000
print(m)

[[ 1002.4195769   1000.59428635  1004.19947044  1009.17641327
   1004.89329928  1001.02496808  1007.79619575  1005.61568017
   1009.28511386  1000.11608716]
 [ 1002.9870141   1005.59523328  1001.99337934  1008.79319814
   1004.78921679  1003.91814186  1009.38777432  1005.20436416
   1009.27099589  1008.69823987]
 [ 1006.68713949  1009.02893339  1008.2656608   1002.27620211  1009.2256124
   1004.14144532  1007.09728075  1006.21626467  1004.60860132
   1004.51547132]
 [ 1005.57757481  1001.6026775   1004.79229078  1004.28025577
   1008.68219699  1005.6379599   1008.07958879  1006.35060616
   1009.03418483  1003.50279599]
 [ 1003.22924339  1006.62272977  1008.5591972   1009.72498967
   1004.49414198  1004.21450523  1008.32652935  1000.90418303
   1009.24606203  1001.27113066]
 [ 1006.84865072  1005.24619541  1000.04356362  1003.38870582
   1008.59759772  1008.80052236  1007.92905671  1006.16987466  1002.3761379
   1001.55941284]
 [ 1006.80724007  1004.46597582  1003.25453387  1008.55713243
   1009.19618236  1002.06897172  1004.69874948  1006.51535711
   1005.23735087  1006.85265988]
 [ 1002.22993628  1000.59475018  1007.52711923  1000.36311206
   1008.22254861  1003.94553055  1004.23517969  1005.26438502
   1006.39421888  1005.22133756]
 [ 1006.92863693  1003.23688304  1007.11513614  1003.28880837
   1009.11093137  1006.35136574  1002.04684923  1001.13114541
   1008.50487627  1008.67481458]
 [ 1002.65347387  1001.90472796  1004.02149562  1009.63548587
   1009.16220671  1006.39781332  1008.1526219   1003.57220839
   1008.60930803  1004.41645034]]

直接对m进行e指数运算会产生上溢

print(np.exp(m))

[[ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]
 [ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]
 [ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]
 [ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]
 [ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]
 [ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]
 [ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]
 [ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]
 [ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]
 [ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]]
G:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: RuntimeWarning: overflow encountered in exp
  """Entry point for launching an IPython kernel.

寻找每一行的最大值

# 沿axis=1（列方向）取最大值，即得到每一行的最大值，再reshape成(10,1)以便后续广播
m_row_max = m.max(axis=1).reshape(10,1)
print(m_row_max, m_row_max.shape)

[[ 1009.28511386]
 [ 1009.38777432]
 [ 1009.2256124 ]
 [ 1009.03418483]
 [ 1009.72498967]
 [ 1008.80052236]
 [ 1009.19618236]
 [ 1008.22254861]
 [ 1009.11093137]
 [ 1009.63548587]] (10, 1)

通过广播的方式将每行数据减去对应行的最大值

# 采用广播的方式进行减法操作
m = m - m_row_max
print(m)

[[-6.86553696 -8.69082751 -5.08564343 -0.1087006  -4.39181458 -8.26014579
  -1.48891811 -3.66943369  0.         -9.16902671]
 [-6.40076022 -3.79254104 -7.39439498 -0.59457618 -4.59855753 -5.46963247
   0.         -4.18341016 -0.11677843 -0.68953445]
 [-2.5384729  -0.19667901 -0.95995159 -6.94941029  0.         -5.08416708
  -2.12833165 -3.00934773 -4.61701107 -4.71014107]
 [-3.45661002 -7.43150733 -4.24189405 -4.75392907 -0.35198784 -3.39622493
  -0.95459604 -2.68357867  0.         -5.53138884]
 [-6.49574628 -3.1022599  -1.16579247  0.         -5.23084769 -5.51048445
  -1.39846033 -8.82080664 -0.47892764 -8.45385902]
 [-1.95187164 -3.55432696 -8.75695874 -5.41181655 -0.20292464  0.
  -0.87146565 -2.63064771 -6.42438446 -7.24110952]
 [-2.3889423  -4.73020655 -5.94164849 -0.63904993  0.         -7.12721064
  -4.49743288 -2.68082526 -3.95883149 -2.34352249]
 [-5.99261232 -7.62779843 -0.69542937 -7.85943655  0.         -4.27701805
  -3.98736891 -2.95816359 -1.82832972 -3.00121104]
 [-2.18229443 -5.87404833 -1.99579523 -5.82212299  0.         -2.75956563
  -7.06408214 -7.97978595 -0.6060551  -0.43611679]
 [-6.982012   -7.73075791 -5.61399025  0.         -0.47327916 -3.23767255
  -1.48286397 -6.06327748 -1.02617783 -5.21903553]]

求预处理后的e指数

#求预处理后的e指数
m_exp = np.exp(m)
print(m_exp, m_exp.shape)

[[  1.04312218e-03   1.68120847e-04   6.18490628e-03   8.96998943e-01
    1.23782475e-02   2.58621284e-04   2.25616615e-01   2.54909015e-02
    1.00000000e+00   1.04217895e-04]
 [  1.66029460e-03   2.25382585e-02   6.14688467e-04   5.51796380e-01
    1.00663457e-02   4.21278021e-03   1.00000000e+00   1.52464260e-02
    8.89782323e-01   5.01809632e-01]
 [  7.89869284e-02   8.21454272e-01   3.82911421e-01   9.59200640e-04
    1.00000000e+00   6.19404411e-03   1.19035722e-01   4.93238409e-02
    9.88228942e-03   9.00350735e-03]
 [  3.15364890e-02   5.92294057e-04   1.43803289e-02   8.61776882e-03
    7.03288672e-01   3.34994945e-02   3.84967625e-01   6.83182276e-02
    1.00000000e+00   3.96048477e-03]
 [  1.50984802e-03   4.49475108e-02   3.11675571e-01   1.00000000e+00
    5.34898908e-03   4.04414773e-03   2.46976935e-01   1.47629228e-04
    6.19447308e-01   2.13076561e-04]
 [  1.42008035e-01   2.86006179e-02   1.57362462e-04   4.46352464e-03
    8.16339758e-01   1.00000000e+00   4.18337963e-01   7.20317916e-02
    1.62153108e-03   7.16516327e-04]
 [  9.17266523e-02   8.82464816e-03   2.62769434e-03   5.27793627e-01
    1.00000000e+00   8.02955997e-04   1.11375513e-02   6.85065952e-02
    1.90854027e-02   9.59889224e-02]
 [  2.49713221e-03   4.86731255e-04   4.98860204e-01   3.86091355e-04
    1.00000000e+00   1.38840018e-02   1.85484526e-02   5.19141655e-02
    1.60681727e-01   4.97268106e-02]
 [  1.12782462e-01   2.81146852e-03   1.35905535e-01   2.96131163e-03
    1.00000000e+00   6.33192663e-02   8.55279590e-04   3.42312686e-04
    5.45498570e-01   6.46542214e-01]
 [  9.28433319e-04   4.39111184e-04   3.64648989e-03   1.00000000e+00
    6.22956140e-01   3.92551533e-02   2.26986674e-01   2.32676246e-03
    3.58374111e-01   5.41254683e-03]] (10, 10)

将求指数后的数据按列加和（每行求和），然后将一维数据(10,)reshape成（10,1）

m_exp_row_sum = m_exp.sum(axis = 1).reshape(10,1)
print(m_exp_row_sum, m_exp_row_sum.shape)

[[ 2.1682437 ]
 [ 2.99772713]
 [ 2.47775123]
 [ 2.24916138]
 [ 2.23431102]
 [ 2.4842771 ]
 [ 1.82649405]
 [ 1.79698532]
 [ 2.51101842]
 [ 2.26032542]] (10, 1)

每行的数据除以对应行e指数求和

m_softmax = m_exp / m_exp_row_sum
print(m_softmax)

[[  4.81090841e-04   7.75378004e-05   2.85249591e-03   4.13698398e-01
    5.70888203e-03   1.19276853e-04   1.04055008e-01   1.17564744e-02
    4.61202771e-01   4.80655820e-05]
 [  5.53851145e-04   7.51844898e-03   2.05051507e-04   1.84071584e-01
    3.35799265e-03   1.40532478e-03   3.33586066e-01   5.08599528e-03
    2.96818985e-01   1.67396701e-01]
 [  3.18784741e-02   3.31532183e-01   1.54539898e-01   3.87125483e-04
    4.03591769e-01   2.49986522e-03   4.80418376e-02   1.99066962e-02
    3.98841067e-03   3.63374146e-03]
 [  1.40214434e-02   2.63339955e-04   6.39364033e-03   3.83154756e-03
    3.12689288e-01   1.48942156e-02   1.71160517e-01   3.03749780e-02
    4.44610159e-01   1.76087176e-03]
 [  6.75755530e-04   2.01169445e-02   1.39495159e-01   4.47565264e-01
    2.39402171e-03   1.81002005e-03   1.10538297e-01   6.60737144e-05
    2.77243098e-01   9.53656673e-05]
 [  5.71627193e-02   1.15126521e-02   6.33433613e-05   1.79670965e-03
    3.28602537e-01   4.02531586e-01   1.68394243e-01   2.89950713e-02
    6.52717479e-04   2.88420453e-04]
 [  5.02200663e-02   4.83146833e-03   1.43865475e-03   2.88965424e-01
    5.47496993e-01   4.39615994e-04   6.09777585e-03   3.75071549e-02
    1.04492006e-02   5.25536464e-02]
 [  1.38962304e-03   2.70859896e-04   2.77609505e-01   2.14855041e-04
    5.56487574e-01   7.72627449e-03   1.03219834e-02   2.88895880e-02
    8.94173844e-02   2.76723522e-02]
 [  4.49150276e-02   1.11965269e-03   5.41236712e-02   1.17932692e-03
    3.98244789e-01   2.52165679e-02   3.40610640e-04   1.36324243e-04
    2.17241963e-01   2.57482067e-01]
 [  4.10752058e-04   1.94269011e-04   1.61325881e-03   4.42414172e-01
    2.75604625e-01   1.73670361e-02   1.00422121e-01   1.02939269e-03
    1.58549786e-01   2.39458743e-03]]

验证一下，对输出值进行按列求和，每行结果应该均为1

print(m_softmax.sum(axis=1))

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]

参考

[numpy指南]http://docs.scipy.org/doc/numpy/reference/

[numpy ndarray详解]https://danzhuibing.github.io/py_numpy_ndarray.html

[NumPy-快速处理数据]http://old.sebug.net/paper/books/scipydoc/numpy_intro.html