pandas 入門

#! /usr/bin/python
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np

# Build a DataFrame to experiment with.

dates = pd.date_range('20150328', periods=10)   # 10 consecutive daily timestamps

df = pd.DataFrame(np.random.randn(10, 5), index=dates, columns=list('ABCDE'))

# Ways to inspect the frame

print(df.index)         # the row index labels

print(df.columns)       # the column headers

print(df.values)        # the bare values as a numpy array, without index or headers

print(df.describe())    # per-column summary statistics (count, mean, std, ...)


# Selection: plain [] indexing

print(df['A'])          # the whole column 'A'

print(df[0:5])          # a slice inside [] selects rows: positions 0, 1, 2, 3, 4

# A list inside [] is looked up as column *labels*, so df[[0, 1, 2]] raises
# KeyError here (the labels are 'A'..'E'). Use .iloc to pick columns by position.
print(df.iloc[:, [0, 1, 2]])    # the first three columns


# Selection: .loc selects by label

print(df.loc[dates[0]])                           # one row, by its index label

# Label slices include both endpoints; the dates must lie inside the index
# (2015-03-28 .. 2015-04-06), otherwise the result is an empty frame.
print(df.loc['20150330':'20150401', ['B', 'C']])  # by row labels and column names


# Selection: .iloc selects by integer position, like a numpy array / Python list

print(df.iloc[3])        # the fourth row

print(df.iloc[0:2, 2:4]) # a sub-block (row positions 0, 1 and column positions 2, 3)

print(df.iloc[1:4, :])   # rows by position

print(df.iloc[:, 1:3])   # columns by position


# Selection: the difference between .iloc[0] and .iloc[[0]]

print(type(df.iloc[0]))   # pandas.core.series.Series

print(type(df.iloc[[0]])) # pandas.core.frame.DataFrame

Create DataFrame:list 會被當成行向量(單一欄位),需以 .T 轉置

#! /usr/bin/python
# -*- coding: utf-8 -*-
import pandas as pd

# Method 1: pass the column labels as `index`, then transpose them into columns.
# A flat list becomes a single column, hence the .T to turn it into one row.
df = pd.DataFrame([1.1, 1.1, 1.1], index=list("ABC")).T
arow2 = [2.2, 2.2, 2.2]
df.loc[len(df)] = arow2  # len(df) is the next unused integer label, so this appends a row
print(df)

# Method 2: transpose first, then assign the column labels.
df1 = pd.DataFrame([1.1, 1.1, 1.1]).T
df1.columns = list("ABC")
arow2 = [2.2, 2.2, 2.2]
# Use df1's own length: len(df) is already 2 at this point and would skip label 1.
df1.loc[len(df1)] = arow2
print(df1)

#! /usr/bin/python
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np

# Demonstrates renaming labels, inserting a column, appending a row and
# reading single elements. Uses print() so the snippet runs on Python 3.
data = np.random.rand(10, 5)
df = pd.DataFrame(data, columns=list("ABCDE"))
print(df)

df.columns = list("12345")  # replace the column labels

df.index = list("ABCDEFGHIJ")  # replace the row labels

print(df)

print(df.index.values)
print(type(df.index.values))  # type is numpy.ndarray
print(df.columns.values)
print(df.columns.values[0])
print(type(df.columns.values[0]))  # type is str
print(type(df.columns.values))  # type is numpy.ndarray
print(df['1'])

# Change the row labels again
df.index = list("0123456789")
print(df)

# Insert a new column at position 0.
# Alternative that fills the column with random values:
#   df.insert(0, '股票代碼', pd.Series(np.random.randn(len(df['1'])), index=df.index))
print(type(df))
df.insert(0, '股票代碼', pd.Series([2103] * len(df), index=df.index))
print(df)

# Append one row; it needs one value per column (6 after the insert above).
mydf = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]
df.loc[len(df)] = mydf  # the label len(df) is not in the index yet, so .loc adds a row
print(df)

# Get a single element: df.iloc[row position]['column name']
print(df.iloc[0]['股票代碼'])
print(df.iloc[0]['1'])

DataFrame.ix(※ .ix は pandas 1.0 で削除されたため、現在は .loc / .iloc を使用する)

行のみ指定

import pandas as pd
import numpy as np

data = np.random.rand(10, 5)
df = pd.DataFrame(data, columns=list("ABCDE"))

print(df)

# Row selection only: keep the rows labelled 4 and later, with every column.
# `.ix` was removed in pandas 1.0; `.loc` is its label-based replacement.
df = df.loc[4:, :]
print(df)
          A         B         C         D         E
0  0.837660  0.874868  0.801268  0.973609  0.431954
1  0.863254  0.428056  0.366020  0.671288  0.186050
2  0.839916  0.116835  0.553323  0.504780  0.715694
3  0.229714  0.548386  0.699111  0.184435  0.231816
4  0.187478  0.426919  0.540911  0.909147  0.782173
5  0.638277  0.893678  0.208585  0.937772  0.285642
6  0.840695  0.956923  0.303221  0.335961  0.020592
7  0.428946  0.211034  0.869003  0.950929  0.946157
8  0.058667  0.154454  0.239021  0.333486  0.900080
9  0.666880  0.733745  0.707146  0.931958  0.102872
          A         B         C         D         E
4  0.187478  0.426919  0.540911  0.909147  0.782173
5  0.638277  0.893678  0.208585  0.937772  0.285642
6  0.840695  0.956923  0.303221  0.335961  0.020592
7  0.428946  0.211034  0.869003  0.950929  0.946157
8  0.058667  0.154454  0.239021  0.333486  0.900080
9  0.666880  0.733745  0.707146  0.931958  0.102872

列のみ指定

import pandas as pd
import numpy as np

data = np.random.rand(10, 5)
df = pd.DataFrame(data, columns=list("ABCDE"))

print(df)

# Column selection only: keep column positions 1 and 2 ('B' and 'C'), all rows.
# `.ix` was removed in pandas 1.0; `.iloc` is its position-based replacement.
df = df.iloc[:, 1:3]
print(df)
          A         B         C         D         E
0  0.076948  0.449970  0.520205  0.436706  0.739704
1  0.075599  0.577579  0.782207  0.371012  0.894063
2  0.885444  0.135454  0.105279  0.006876  0.540366
3  0.951332  0.677577  0.783089  0.380428  0.795435
4  0.840197  0.759181  0.588425  0.368400  0.714982
5  0.075328  0.926142  0.487373  0.964743  0.836157
6  0.428123  0.090402  0.144948  0.461111  0.325377
7  0.221599  0.675153  0.704074  0.880876  0.799039
8  0.587184  0.403180  0.454961  0.710530  0.699074
9  0.817209  0.615556  0.899403  0.263885  0.115144
          B         C
0  0.449970  0.520205
1  0.577579  0.782207
2  0.135454  0.105279
3  0.677577  0.783089
4  0.759181  0.588425
5  0.926142  0.487373
6  0.090402  0.144948
7  0.675153  0.704074
8  0.403180  0.454961
9  0.615556  0.899403

行・列の両方をスライスで指定

import pandas as pd
import numpy as np

data = np.random.rand(10, 5)
df = pd.DataFrame(data, columns=list("ABCDE"))

print(df)

# Slice rows and columns together: rows from position 4 onward, column
# positions 1 and 2 ('B' and 'C'). With the default integer index the row
# positions coincide with the row labels.
# `.ix` was removed in pandas 1.0; `.iloc` is its position-based replacement.
df = df.iloc[4:, 1:3]
print(df)
          A         B         C         D         E
0  0.537395  0.934079  0.655925  0.235779  0.707944
1  0.371540  0.751432  0.224493  0.575143  0.273347
2  0.615915  0.099794  0.246913  0.862826  0.875975
3  0.524642  0.645666  0.422228  0.377380  0.379291
4  0.107895  0.933713  0.409466  0.078913  0.767368
5  0.046459  0.362304  0.051669  0.392577  0.696221
6  0.700829  0.285290  0.486674  0.439223  0.682216
7  0.028071  0.642846  0.849560  0.908172  0.153547
8  0.529735  0.268067  0.217208  0.519743  0.988391
9  0.412213  0.764697  0.628845  0.807078  0.003180
          B         C
4  0.933713  0.409466
5  0.362304  0.051669
6  0.285290  0.486674
7  0.642846  0.849560
8  0.268067  0.217208
9  0.764697  0.628845

特定の行を削除する

>>> import pandas as pd
>>> import numpy as np

>>> # データフレーム df を作成
>>> df = pd.DataFrame(np.random.randn(6,4), columns=list('ABCD'))
>>> df
          A         B         C         D
0  1.361879  1.189704  2.138577 -0.476125
1  0.299178 -1.452818 -1.147374 -2.332064
2 -0.612929 -1.677195  1.735829 -0.621198
3 -0.743785 -0.223876  0.198582 -0.126726
4  1.050872  0.252367 -0.579578  0.046591
5 -1.811098  1.753245  0.339837 -0.432238

>>> # 行 5 を削除
>>> df.drop(5)
          A         B         C         D
0  1.361879  1.189704  2.138577 -0.476125
1  0.299178 -1.452818 -1.147374 -2.332064
2 -0.612929 -1.677195  1.735829 -0.621198
3 -0.743785 -0.223876  0.198582 -0.126726
4  1.050872  0.252367 -0.579578  0.046591

>>> # 行 3 と 4 を削除
>>> df.drop([3,4])
          A         B         C         D
0  1.361879  1.189704  2.138577 -0.476125
1  0.299178 -1.452818 -1.147374 -2.332064
2 -0.612929 -1.677195  1.735829 -0.621198
5 -1.811098  1.753245  0.339837 -0.432238

特定の列を削除する

>>> # 列 A を削除
>>> df.drop("A", axis=1)

          B         C         D
0  1.189704  2.138577 -0.476125
1 -1.452818 -1.147374 -2.332064
2 -1.677195  1.735829 -0.621198
3 -0.223876  0.198582 -0.126726
4  0.252367 -0.579578  0.046591
5  1.753245  0.339837 -0.432238

Python の del

>>> # 列 A を削除
>>> del df['A']
>>> df
          B         C         D
0  1.189704  2.138577 -0.476125
1 -1.452818 -1.147374 -2.332064
2 -1.677195  1.735829 -0.621198
3 -0.223876  0.198582 -0.126726
4  0.252367 -0.579578  0.046591
5  1.753245  0.339837 -0.432238

drop 刪除 列跟行

import pandas as pd
import numpy as np

# Sample frame used by the drop examples: 10 rows of random values, columns A-E.
data = np.random.rand(10, 5)
df = pd.DataFrame(data, columns=list("ABCDE"))
print(df)
          A         B         C         D         E
0  0.866928  0.553043  0.290353  0.113969  0.957001
1  0.865568  0.462533  0.473318  0.149925  0.701643
2  0.264444  0.580996  0.805309  0.117018  0.623319
3  0.540383  0.557930  0.699899  0.544156  0.975295
4  0.833700  0.753174  0.796370  0.982843  0.135175
5  0.400652  0.006030  0.133816  0.538959  0.010030
6  0.429210  0.920838  0.545176  0.940524  0.357602
7  0.794671  0.053222  0.002619  0.712328  0.972182
8  0.093454  0.759785  0.091554  0.951439  0.447590
9  0.698792  0.331673  0.626928  0.386095  0.705186

刪除第一列

In [4]: df = df.drop(df.index[0])

In [5]: df
Out[5]: 
          A         B         C         D         E
1  0.865568  0.462533  0.473318  0.149925  0.701643
2  0.264444  0.580996  0.805309  0.117018  0.623319
3  0.540383  0.557930  0.699899  0.544156  0.975295
4  0.833700  0.753174  0.796370  0.982843  0.135175
5  0.400652  0.006030  0.133816  0.538959  0.010030
6  0.429210  0.920838  0.545176  0.940524  0.357602
7  0.794671  0.053222  0.002619  0.712328  0.972182
8  0.093454  0.759785  0.091554  0.951439  0.447590
9  0.698792  0.331673  0.626928  0.386095  0.705186

刪除第二行

In [6]: df = df.drop('B', axis=1)

In [7]: df
Out[7]: 
          A         C         D         E
1  0.865568  0.473318  0.149925  0.701643
2  0.264444  0.805309  0.117018  0.623319
3  0.540383  0.699899  0.544156  0.975295
4  0.833700  0.796370  0.982843  0.135175
5  0.400652  0.133816  0.538959  0.010030
6  0.429210  0.545176  0.940524  0.357602
7  0.794671  0.002619  0.712328  0.972182
8  0.093454  0.091554  0.951439  0.447590
9  0.698792  0.626928  0.386095  0.705186