pandas 入門
#! /usr/bin/python
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
# Build a DataFrame to experiment with: 10 consecutive days x 5 random columns.
dates = pd.date_range('20150328', periods=10)  # 2015-03-28 .. 2015-04-06
df = pd.DataFrame(np.random.randn(10, 5), index=dates, columns=list('ABCDE'))

# Different ways to look at the frame
print(df.index)       # the label of every row
print(df.columns)     # the header of every column
print(df.values)      # the values without index/column headers (a numpy array)
print(df.describe())  # per-column summary (count, mean, std, ...)

# Selection: plain [] indexing
print(df['A'])  # the whole column 'A'
print(df[0:5])  # rows at positions 0, 1, 2, 3, 4
# The columns are labelled 'A'..'E', so df[[0, 1, 2]] would be a label lookup
# and raise KeyError; select the first three columns by position instead.
print(df.iloc[:, [0, 1, 2]])

# Selection: .loc selects by label and allows finer-grained selection
print(df.loc[dates[0]])  # one row, chosen by its label
# Label slices include both end points. The dates must lie inside the index
# built above, otherwise the result is an empty frame.
print(df.loc['20150330':'20150401', ['B', 'C']])  # by row labels and column names

# Selection: .iloc indexes a DataFrame by position, like a numpy array / list
print(df.iloc[3])         # the fourth row
print(df.iloc[0:2, 2:4])  # a block (row positions 0, 1 and column positions 2, 3)
print(df.iloc[1:4, :])    # select by row position
print(df.iloc[:, 1:3])    # select by column position

# Selection: the difference between .iloc[0] and .iloc[[0]]
print(type(df.iloc[0]))    # pandas.core.series.Series
print(type(df.iloc[[0]]))  # pandas.core.frame.DataFrame
Create DataFrame：list 會被當成行向量（單一 column），需用 .T 轉置成一列（row）
#! /usr/bin/python
# -*- coding: utf-8 -*-
import pandas as pd
# Method 1: supply the column labels through index=, then transpose.
# A flat list becomes a single column, so .T is needed to turn it into one row.
df = pd.DataFrame([1.1, 1.1, 1.1], index=list("ABC")).T
arow2 = [2.2, 2.2, 2.2]
df.loc[len(df)] = arow2  # the next unused integer label appends a row
print(df)

# Method 2: transpose first, then name the columns.
df1 = pd.DataFrame([1.1, 1.1, 1.1]).T
df1.columns = list("ABC")
arow2 = [2.2, 2.2, 2.2]
# Use df1's own length here: len(df) is already 2 at this point and would
# label the new row 2, skipping label 1.
df1.loc[len(df1)] = arow2
print(df1)
#! /usr/bin/python
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
# Walk-through: rename labels, insert a column, append a row, read one element.
data = np.random.rand(10, 5)
df = pd.DataFrame(data, columns=list("ABCDE"))
print(df)

df.columns = list("12345")     # replace the column labels
df.index = list("ABCDEFGHIJ")  # replace the row labels
print(df)
print(df.index.values)
print(type(df.index.values))       # type is numpy.ndarray
print(df.columns.values)
print(df.columns.values[0])
print(type(df.columns.values[0]))  # type is str
print(type(df.columns.values))     # type is numpy.ndarray
print(df['1'])

# Replace the row labels once more
df.index = list("0123456789")
print(df)

# Insert a new column at position 0. The Series carries df.index so the
# values line up with the existing rows. Random values would work the same
# way, e.g. pd.Series(np.random.randn(len(df)), index=df.index).
print(type(df))
df.insert(0, '股票代碼', pd.Series([2103] * len(df), index=df.index))
print(df)

# Append one row: one value per column (the inserted column plus the five
# original ones). The row labels are the strings '0'..'9', so the integer
# label len(df) == 10 does not exist yet and .loc creates a new row for it.
mydf = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]
df.loc[len(df)] = mydf
print(df)

# Read a single element: df.iloc[row position]['column name']
print(df.iloc[0]['股票代碼'])
print(df.iloc[0]['1'])
DataFrame.ix（pandas 1.0 で削除済み。位置で選ぶ場合は .iloc、ラベルで選ぶ場合は .loc を使う）
行のみ指定
import pandas as pd
import numpy as np
# Keep only the rows from position 4 onward; all columns are kept.
data = np.random.rand(10, 5)
df = pd.DataFrame(data, columns=list("ABCDE"))
print(df)
# .ix was removed in pandas 1.0; .iloc selects by position. With the default
# 0..9 row labels, positions and labels coincide, so this keeps rows 4-9.
df = df.iloc[4:, :]  # row selection only
print(df)
A B C D E
0 0.837660 0.874868 0.801268 0.973609 0.431954
1 0.863254 0.428056 0.366020 0.671288 0.186050
2 0.839916 0.116835 0.553323 0.504780 0.715694
3 0.229714 0.548386 0.699111 0.184435 0.231816
4 0.187478 0.426919 0.540911 0.909147 0.782173
5 0.638277 0.893678 0.208585 0.937772 0.285642
6 0.840695 0.956923 0.303221 0.335961 0.020592
7 0.428946 0.211034 0.869003 0.950929 0.946157
8 0.058667 0.154454 0.239021 0.333486 0.900080
9 0.666880 0.733745 0.707146 0.931958 0.102872
A B C D E
4 0.187478 0.426919 0.540911 0.909147 0.782173
5 0.638277 0.893678 0.208585 0.937772 0.285642
6 0.840695 0.956923 0.303221 0.335961 0.020592
7 0.428946 0.211034 0.869003 0.950929 0.946157
8 0.058667 0.154454 0.239021 0.333486 0.900080
9 0.666880 0.733745 0.707146 0.931958 0.102872
列のみ指定
import pandas as pd
import numpy as np
# Keep only the columns at positions 1 and 2 ('B' and 'C'); all rows are kept.
data = np.random.rand(10, 5)
df = pd.DataFrame(data, columns=list("ABCDE"))
print(df)
# .ix was removed in pandas 1.0. The column labels are strings, so the integer
# slice 1:3 is a positional selection and belongs to .iloc (end exclusive).
df = df.iloc[:, 1:3]  # column selection only
print(df)
A B C D E
0 0.076948 0.449970 0.520205 0.436706 0.739704
1 0.075599 0.577579 0.782207 0.371012 0.894063
2 0.885444 0.135454 0.105279 0.006876 0.540366
3 0.951332 0.677577 0.783089 0.380428 0.795435
4 0.840197 0.759181 0.588425 0.368400 0.714982
5 0.075328 0.926142 0.487373 0.964743 0.836157
6 0.428123 0.090402 0.144948 0.461111 0.325377
7 0.221599 0.675153 0.704074 0.880876 0.799039
8 0.587184 0.403180 0.454961 0.710530 0.699074
9 0.817209 0.615556 0.899403 0.263885 0.115144
B C
0 0.449970 0.520205
1 0.577579 0.782207
2 0.135454 0.105279
3 0.677577 0.783089
4 0.759181 0.588425
5 0.926142 0.487373
6 0.090402 0.144948
7 0.675153 0.704074
8 0.403180 0.454961
9 0.615556 0.899403
行・列の両方をスライスで指定
import pandas as pd
import numpy as np
# Slice rows and columns at once: rows from position 4 onward, columns 'B' and 'C'.
data = np.random.rand(10, 5)
df = pd.DataFrame(data, columns=list("ABCDE"))
print(df)
# .ix was removed in pandas 1.0; .iloc takes positional slices on both axes.
# With the default 0..9 row labels, positions and labels coincide for the rows.
df = df.iloc[4:, 1:3]
print(df)
A B C D E
0 0.537395 0.934079 0.655925 0.235779 0.707944
1 0.371540 0.751432 0.224493 0.575143 0.273347
2 0.615915 0.099794 0.246913 0.862826 0.875975
3 0.524642 0.645666 0.422228 0.377380 0.379291
4 0.107895 0.933713 0.409466 0.078913 0.767368
5 0.046459 0.362304 0.051669 0.392577 0.696221
6 0.700829 0.285290 0.486674 0.439223 0.682216
7 0.028071 0.642846 0.849560 0.908172 0.153547
8 0.529735 0.268067 0.217208 0.519743 0.988391
9 0.412213 0.764697 0.628845 0.807078 0.003180
B C
4 0.933713 0.409466
5 0.362304 0.051669
6 0.285290 0.486674
7 0.642846 0.849560
8 0.268067 0.217208
9 0.764697 0.628845
特定の行を削除する
>>> import pandas as pd
>>> import numpy as np
>>> # データフレーム df を作成
>>> df = pd.DataFrame(np.random.randn(6,4), columns=list('ABCD'))
>>> df
A B C D
0 1.361879 1.189704 2.138577 -0.476125
1 0.299178 -1.452818 -1.147374 -2.332064
2 -0.612929 -1.677195 1.735829 -0.621198
3 -0.743785 -0.223876 0.198582 -0.126726
4 1.050872 0.252367 -0.579578 0.046591
5 -1.811098 1.753245 0.339837 -0.432238
>>> # 行 5 を削除
>>> df.drop(5)
A B C D
0 1.361879 1.189704 2.138577 -0.476125
1 0.299178 -1.452818 -1.147374 -2.332064
2 -0.612929 -1.677195 1.735829 -0.621198
3 -0.743785 -0.223876 0.198582 -0.126726
4 1.050872 0.252367 -0.579578 0.046591
>>> # 行 3 と 4 を削除
>>> df.drop([3,4])
A B C D
0 1.361879 1.189704 2.138577 -0.476125
1 0.299178 -1.452818 -1.147374 -2.332064
2 -0.612929 -1.677195 1.735829 -0.621198
5 -1.811098 1.753245 0.339837 -0.432238
特定の列を削除する
>>> # 列 A を削除
>>> df.drop("A", axis=1)
B C D
0 1.189704 2.138577 -0.476125
1 -1.452818 -1.147374 -2.332064
2 -1.677195 1.735829 -0.621198
3 -0.223876 0.198582 -0.126726
4 0.252367 -0.579578 0.046591
5 1.753245 0.339837 -0.432238
Python の del
>>> # 列 A を削除
>>> del df['A']
>>> df
B C D
0 1.189704 2.138577 -0.476125
1 -1.452818 -1.147374 -2.332064
2 -1.677195 1.735829 -0.621198
3 -0.223876 0.198582 -0.126726
4 0.252367 -0.579578 0.046591
5 1.753245 0.339837 -0.432238
drop 刪除 列跟行
import pandas as pd
import numpy as np
# Sample frame for the drop examples: 10 rows x 5 random columns 'A'..'E'.
data = np.random.rand(10, 5)
df = pd.DataFrame(data, columns=list("ABCDE"))
print(df)  # print() call: the Python 2 print statement is a syntax error on Python 3
A B C D E
0 0.866928 0.553043 0.290353 0.113969 0.957001
1 0.865568 0.462533 0.473318 0.149925 0.701643
2 0.264444 0.580996 0.805309 0.117018 0.623319
3 0.540383 0.557930 0.699899 0.544156 0.975295
4 0.833700 0.753174 0.796370 0.982843 0.135175
5 0.400652 0.006030 0.133816 0.538959 0.010030
6 0.429210 0.920838 0.545176 0.940524 0.357602
7 0.794671 0.053222 0.002619 0.712328 0.972182
8 0.093454 0.759785 0.091554 0.951439 0.447590
9 0.698792 0.331673 0.626928 0.386095 0.705186
刪除第一列
In [4]: df = df.drop(df.index[0])
In [5]: df
Out[5]:
A B C D E
1 0.865568 0.462533 0.473318 0.149925 0.701643
2 0.264444 0.580996 0.805309 0.117018 0.623319
3 0.540383 0.557930 0.699899 0.544156 0.975295
4 0.833700 0.753174 0.796370 0.982843 0.135175
5 0.400652 0.006030 0.133816 0.538959 0.010030
6 0.429210 0.920838 0.545176 0.940524 0.357602
7 0.794671 0.053222 0.002619 0.712328 0.972182
8 0.093454 0.759785 0.091554 0.951439 0.447590
9 0.698792 0.331673 0.626928 0.386095 0.705186
刪除第二行
In [6]: df = df.drop('B', axis=1)
In [7]: df
Out[7]:
A C D E
1 0.865568 0.473318 0.149925 0.701643
2 0.264444 0.805309 0.117018 0.623319
3 0.540383 0.699899 0.544156 0.975295
4 0.833700 0.796370 0.982843 0.135175
5 0.400652 0.133816 0.538959 0.010030
6 0.429210 0.545176 0.940524 0.357602
7 0.794671 0.002619 0.712328 0.972182
8 0.093454 0.091554 0.951439 0.447590
9 0.698792 0.626928 0.386095 0.705186