Pandas入門教程(三)

時間 2020-09-29 標籤 pandas 入門教程三

時間格式化和時間查詢

import pandas as pd
date=pd.Timestamp('2020/09/27 13:30:00')
print(date)

2020-09-27 13:30:00

#年
print(date.year)
#月
print(date.month)
#日
print(date.day)
#時
print(date.hour)
#分
print(date.minute)
#秒
print(date.second)

#加5天
date+pd.Timedelta('5 days')

Timestamp('2020-10-02 13:30:00')

#時間轉換
res=pd.to_datetime('2020-09-10 13:20:00')
print(res.year)

#生成一列數據
se=pd.Series(['2020-11-24 00:00:00','2020-11-25 00:00:00','2020-11-26 00:00:00'])
print(se)

0    2020-11-24 00:00:00
1    2020-11-25 00:00:00
2    2020-11-26 00:00:00
dtype: object

# 轉成時間格式
print(pd.to_datetime(se))

0   2020-11-24
1   2020-11-25
2   2020-11-26
dtype: datetime64[ns]

# 生成一個等差的時間序列,從2020-09-17開始,長度爲periods,間隔爲12H
pd.Series(pd.date_range(start='2020-09-17',periods=10,freq='12H'))

0   2020-09-17 00:00:00
1   2020-09-17 12:00:00
2   2020-09-18 00:00:00
3   2020-09-18 12:00:00
4   2020-09-19 00:00:00
5   2020-09-19 12:00:00
6   2020-09-20 00:00:00
7   2020-09-20 12:00:00
8   2020-09-21 00:00:00
9   2020-09-21 12:00:00
dtype: datetime64[ns]

df=pd.read_csv('./pandas/data/flowdata.csv')
# 將Time列轉換成時間格式
df['Time']=pd.to_datetime(df['Time'])
df=df.set_index(df['Time'])
print(df)

Time   L06_347  LS06_347  LS06_348
Time                                                                 
2009-01-01 00:00:00 2009-01-01 00:00:00  0.137417  0.097500  0.016833
2009-01-01 03:00:00 2009-01-01 03:00:00  0.131250  0.088833  0.016417
2009-01-01 06:00:00 2009-01-01 06:00:00  0.113500  0.091250  0.016750
2009-01-01 09:00:00 2009-01-01 09:00:00  0.135750  0.091500  0.016250
2009-01-01 12:00:00 2009-01-01 12:00:00  0.140917  0.096167  0.017000
...                                 ...       ...       ...       ...
2013-01-01 12:00:00 2013-01-01 12:00:00  1.710000  1.710000  0.129583
2013-01-01 15:00:00 2013-01-01 15:00:00  1.420000  1.420000  0.096333
2013-01-01 18:00:00 2013-01-01 18:00:00  1.178583  1.178583  0.083083
2013-01-01 21:00:00 2013-01-01 21:00:00  0.898250  0.898250  0.077167
2013-01-02 00:00:00 2013-01-02 00:00:00  0.860000  0.860000  0.075000

[11697 rows x 4 columns]

# 打印索引
print(df.index)

DatetimeIndex(['2009-01-01 00:00:00', '2009-01-01 03:00:00',
               '2009-01-01 06:00:00', '2009-01-01 09:00:00',
               '2009-01-01 12:00:00', '2009-01-01 15:00:00',
               '2009-01-01 18:00:00', '2009-01-01 21:00:00',
               '2009-01-02 00:00:00', '2009-01-02 03:00:00',
               ...
               '2012-12-31 21:00:00', '2013-01-01 00:00:00',
               '2013-01-01 03:00:00', '2013-01-01 06:00:00',
               '2013-01-01 09:00:00', '2013-01-01 12:00:00',
               '2013-01-01 15:00:00', '2013-01-01 18:00:00',
               '2013-01-01 21:00:00', '2013-01-02 00:00:00'],
              dtype='datetime64[ns]', name='Time', length=11697, freq=None)

# 讀取時間的時候自動將時間轉換成datetime
df=pd.read_csv('./pandas/data/flowdata.csv',index_col=0,parse_dates=True)
print(df)

L06_347  LS06_347  LS06_348
Time                                             
2009-01-01 00:00:00  0.137417  0.097500  0.016833
2009-01-01 03:00:00  0.131250  0.088833  0.016417
2009-01-01 06:00:00  0.113500  0.091250  0.016750
2009-01-01 09:00:00  0.135750  0.091500  0.016250
2009-01-01 12:00:00  0.140917  0.096167  0.017000
...                       ...       ...       ...
2013-01-01 12:00:00  1.710000  1.710000  0.129583
2013-01-01 15:00:00  1.420000  1.420000  0.096333
2013-01-01 18:00:00  1.178583  1.178583  0.083083
2013-01-01 21:00:00  0.898250  0.898250  0.077167
2013-01-02 00:00:00  0.860000  0.860000  0.075000

[11697 rows x 3 columns]

# 經過時間索引篩選數據
print(df[('2009-01-01 00:00:00'):('2009-01-01 12:00:00')])

L06_347  LS06_347  LS06_348
Time                                             
2009-01-01 00:00:00  0.137417  0.097500  0.016833
2009-01-01 03:00:00  0.131250  0.088833  0.016417
2009-01-01 06:00:00  0.113500  0.091250  0.016750
2009-01-01 09:00:00  0.135750  0.091500  0.016250
2009-01-01 12:00:00  0.140917  0.096167  0.017000

# 查看數據的最後10條數據
print(df.tail(10))

L06_347  LS06_347  LS06_348
Time                                             
2012-12-31 21:00:00  0.846500  0.846500  0.170167
2013-01-01 00:00:00  1.688333  1.688333  0.207333
2013-01-01 03:00:00  2.693333  2.693333  0.201500
2013-01-01 06:00:00  2.220833  2.220833  0.166917
2013-01-01 09:00:00  2.055000  2.055000  0.175667
2013-01-01 12:00:00  1.710000  1.710000  0.129583
2013-01-01 15:00:00  1.420000  1.420000  0.096333
2013-01-01 18:00:00  1.178583  1.178583  0.083083
2013-01-01 21:00:00  0.898250  0.898250  0.077167
2013-01-02 00:00:00  0.860000  0.860000  0.075000

# 經過年，月，日篩選數據
print(df['2012'])
print(df['2012-01'])
print(df['2012-01-31'])

L06_347  LS06_347  LS06_348
Time                                             
2012-01-01 00:00:00  0.307167  0.273917  0.028000
2012-01-01 03:00:00  0.302917  0.270833  0.030583
2012-01-01 06:00:00  0.331500  0.284750  0.030917
2012-01-01 09:00:00  0.330750  0.293583  0.029750
2012-01-01 12:00:00  0.295000  0.285167  0.031750
...                       ...       ...       ...
2012-12-31 09:00:00  0.682750  0.682750  0.066583
2012-12-31 12:00:00  0.651250  0.651250  0.063833
2012-12-31 15:00:00  0.629000  0.629000  0.061833
2012-12-31 18:00:00  0.617333  0.617333  0.060583
2012-12-31 21:00:00  0.846500  0.846500  0.170167

[2928 rows x 3 columns]
                      L06_347  LS06_347  LS06_348
Time                                             
2012-01-01 00:00:00  0.307167  0.273917  0.028000
2012-01-01 03:00:00  0.302917  0.270833  0.030583
2012-01-01 06:00:00  0.331500  0.284750  0.030917
2012-01-01 09:00:00  0.330750  0.293583  0.029750
2012-01-01 12:00:00  0.295000  0.285167  0.031750
...                       ...       ...       ...
2012-01-31 09:00:00  0.191000  0.231250  0.025583
2012-01-31 12:00:00  0.183333  0.227167  0.025917
2012-01-31 15:00:00  0.163417  0.221000  0.023750
2012-01-31 18:00:00  0.157083  0.220667  0.023167
2012-01-31 21:00:00  0.160083  0.214750  0.023333

[248 rows x 3 columns]
                      L06_347  LS06_347  LS06_348
Time                                             
2012-01-31 00:00:00  0.191250  0.247417  0.025917
2012-01-31 03:00:00  0.181083  0.241583  0.025833
2012-01-31 06:00:00  0.188750  0.236750  0.026000
2012-01-31 09:00:00  0.191000  0.231250  0.025583
2012-01-31 12:00:00  0.183333  0.227167  0.025917
2012-01-31 15:00:00  0.163417  0.221000  0.023750
2012-01-31 18:00:00  0.157083  0.220667  0.023167
2012-01-31 21:00:00  0.160083  0.214750  0.023333

# 選擇一段時間的數據
print(df['2009-01-01':'2012-01-01'])

L06_347  LS06_347  LS06_348
Time                                             
2009-01-01 00:00:00  0.137417  0.097500  0.016833
2009-01-01 03:00:00  0.131250  0.088833  0.016417
2009-01-01 06:00:00  0.113500  0.091250  0.016750
2009-01-01 09:00:00  0.135750  0.091500  0.016250
2009-01-01 12:00:00  0.140917  0.096167  0.017000
...                       ...       ...       ...
2012-01-01 09:00:00  0.330750  0.293583  0.029750
2012-01-01 12:00:00  0.295000  0.285167  0.031750
2012-01-01 15:00:00  0.301417  0.287750  0.031417
2012-01-01 18:00:00  0.322083  0.304167  0.038083
2012-01-01 21:00:00  0.355417  0.346500  0.080917

[8768 rows x 3 columns]

# 篩選全部1月份的數據
print(df[df.index.month==1])

L06_347  LS06_347  LS06_348
Time                                             
2009-01-01 00:00:00  0.137417  0.097500  0.016833
2009-01-01 03:00:00  0.131250  0.088833  0.016417
2009-01-01 06:00:00  0.113500  0.091250  0.016750
2009-01-01 09:00:00  0.135750  0.091500  0.016250
2009-01-01 12:00:00  0.140917  0.096167  0.017000
...                       ...       ...       ...
2013-01-01 12:00:00  1.710000  1.710000  0.129583
2013-01-01 15:00:00  1.420000  1.420000  0.096333
2013-01-01 18:00:00  1.178583  1.178583  0.083083
2013-01-01 21:00:00  0.898250  0.898250  0.077167
2013-01-02 00:00:00  0.860000  0.860000  0.075000

[1001 rows x 3 columns]

# 篩選全部8點到12點的數據
print(df[(df.index.hour>8) & (df.index.hour<12)])

# 或者經過between_time篩選8點到12點的數據
print(df.between_time('8:00','12:00'))

L06_347  LS06_347  LS06_348
Time                                             
2009-01-01 09:00:00  0.135750  0.091500  0.016250
2009-01-02 09:00:00  0.141917  0.097083  0.016417
2009-01-03 09:00:00  0.124583  0.084417  0.015833
2009-01-04 09:00:00  0.109000  0.105167  0.018000
2009-01-05 09:00:00  0.161500  0.114583  0.021583
...                       ...       ...       ...
2012-12-28 09:00:00  0.961500  0.961500  0.092417
2012-12-29 09:00:00  0.786833  0.786833  0.077000
2012-12-30 09:00:00  0.916000  0.916000  0.101583
2012-12-31 09:00:00  0.682750  0.682750  0.066583
2013-01-01 09:00:00  2.055000  2.055000  0.175667

[1462 rows x 3 columns]
                      L06_347  LS06_347  LS06_348
Time                                             
2009-01-01 09:00:00  0.135750  0.091500  0.016250
2009-01-01 12:00:00  0.140917  0.096167  0.017000
2009-01-02 09:00:00  0.141917  0.097083  0.016417
2009-01-02 12:00:00  0.147833  0.101917  0.016417
2009-01-03 09:00:00  0.124583  0.084417  0.015833
...                       ...       ...       ...
2012-12-30 12:00:00  1.465000  1.465000  0.086833
2012-12-31 09:00:00  0.682750  0.682750  0.066583
2012-12-31 12:00:00  0.651250  0.651250  0.063833
2013-01-01 09:00:00  2.055000  2.055000  0.175667
2013-01-01 12:00:00  1.710000  1.710000  0.129583

[2924 rows x 3 columns]

resample重採樣

# 按天求平均值
df=df.resample('D').mean()
print(df)

L06_347  LS06_347  LS06_348
Time                                    
2009-01-01  0.125010  0.092281  0.016635
2009-01-02  0.124146  0.095781  0.016406
2009-01-03  0.113562  0.085542  0.016094
2009-01-04  0.140198  0.102708  0.017323
2009-01-05  0.128812  0.104490  0.018167
...              ...       ...       ...
2012-12-29  0.807604  0.807604  0.078031
2012-12-30  1.027240  1.027240  0.088000
2012-12-31  0.748365  0.748365  0.081417
2013-01-01  1.733042  1.733042  0.142198
2013-01-02  0.860000  0.860000  0.075000

[1463 rows x 3 columns]

# 求3天的平均值
print(df.resample('3D').mean())

L06_347  LS06_347  LS06_348
Time                                    
2009-01-01  0.120906  0.091201  0.016378
2009-01-04  0.121594  0.091708  0.016670
2009-01-07  0.097042  0.070740  0.014479
2009-01-10  0.115941  0.086340  0.014545
2009-01-13  0.346962  0.364549  0.034198
...              ...       ...       ...
2012-12-20  0.996337  0.996337  0.114472
2012-12-23  2.769059  2.769059  0.225542
2012-12-26  1.451583  1.451583  0.140101
2012-12-29  0.861069  0.861069  0.082483
2013-01-01  1.296521  1.296521  0.108599

[488 rows x 3 columns]

# 求1個月的平均值
print(df.resample('M').mean().head())

L06_347  LS06_347  LS06_348
Time                                    
2009-01-31  0.517864  0.536660  0.045597
2009-02-28  0.516847  0.529987  0.047238
2009-03-31  0.372536  0.382359  0.037508
2009-04-30  0.163182  0.129354  0.021356
2009-05-31  0.178588  0.160616  0.020744

# 求某個時間單位的平均值，最大值，最小值
print(df.resample('M').min().head())
print(df.resample('M').max().head())

L06_347  LS06_347  LS06_348
Time                                    
2009-01-31  0.078156  0.058438  0.013573
2009-02-28  0.182646  0.135667  0.019073
2009-03-31  0.131385  0.098875  0.016979
2009-04-30  0.078510  0.066375  0.013917
2009-05-31  0.060771  0.047969  0.013656
             L06_347  LS06_347  LS06_348
Time                                    
2009-01-31  5.933531  6.199927  0.404708
2009-02-28  4.407604  4.724583  0.231750
2009-03-31  1.337896  1.586833  0.116969
2009-04-30  0.275698  0.247312  0.037375
2009-05-31  2.184250  2.433073  0.168792

列排序

df = pd.DataFrame({'group':['a','a','a','b','b','b','c','c','c'],
                    'data':[4,3,2,1,12,3,4,5,7]})
print(df)

group  data
0     a     4
1     a     3
2     a     2
3     b     1
4     b    12
5     b     3
6     c     4
7     c     5
8     c     7

# 默認(ascending=True)從小到大排列. 從大到小(ascending=False)
df.sort_values(by=['group','data'],ascending=[False,True],inplace=True)
print(df)

group  data
6     c     4
7     c     5
8     c     7
3     b     1
5     b     3
4     b    12
2     a     2
1     a     3
0     a     4

df=pd.DataFrame({
               'k1':['one']*3+['tow']*3,
               'k2':[1,2,3,4,5,6]
                })
print(df)

k1  k2
0  one   1
1  one   2
2  one   3
3  tow   4
4  tow   5
5  tow   6

# 排序
print(df.sort_values(by='k2',ascending=False))

k1  k2
5  tow   6
4  tow   5
3  tow   4
2  one   3
1  one   2
0  one   1

# 刪除重複數據
print(df.drop_duplicates())

k1  k2
0  one   1
1  one   2
2  one   3
3  tow   4
4  tow   5
5  tow   6

# 按某列去重刪除數據
print(df.drop_duplicates(subset='k1'))

k1  k2
0  one   1
3  tow   4

df = pd.DataFrame({'food':['A1','A2','B1','B2','B3','C1','C2'],'data':[1,2,3,4,5,6,7]})
print(df)

food  data
0   A1     1
1   A2     2
2   B1     3
3   B2     4
4   B3     5
5   C1     6
6   C2     7

# 將A1,A2歸類到A, B1,B2,B3歸類B,C1,C2歸類到C
dict1 = {
    'A1':'A',
    'A2':'A',
    'B1':'B',
    'B2':'B',
    'B3':'B',
    'C1':'C',
    'C2':'C'
}
df['Upper']=df['food'].map(dict1)
print(df)

food  data Upper
0   A1     1     A
1   A2     2     A
2   B1     3     B
3   B2     4     B
4   B3     5     B
5   C1     6     C
6   C2     7     C

import numpy as np
df=pd.DataFrame({'k1':np.random.randn(5),'k2':np.random.randn(5)})
print(df)
df2=df.assign(ration=df['k1']/df['k2'])
print(df2)

k1        k2
0  1.977668 -1.136251
1  0.550649  0.010131
2  0.723699  0.304536
3 -0.247529  0.030359
4 -0.351775  0.732785
         k1        k2     ration
0  1.977668 -1.136251  -1.740520
1  0.550649  0.010131  54.355131
2  0.723699  0.304536   2.376394
3 -0.247529  0.030359  -8.153389
4 -0.351775  0.732785  -0.480052

# 刪除ration列
df2.drop('ration',axis='columns',inplace=True)
print(df2)

k1        k2
0  1.977668 -1.136251
1  0.550649  0.010131
2  0.723699  0.304536
3 -0.247529  0.030359
4 -0.351775  0.732785

數據替換

se=pd.Series([1,2,3,4,5,6,7,8])
print(se)

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
dtype: int64

se.replace(6,np.nan,inplace=True)
print(se)

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    NaN
6    7.0
7    8.0
dtype: float64

Pandas.cut計算每一個值在給定的哪一個範圍

ages = [15,18,20,21,22,34,41,52,63,79]
bins = [10,40,80]
bins_res = pd.cut(ages,bins)
print(bins_res)

[(10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (40, 80], (40, 80], (40, 80], (40, 80]]
Categories (2, interval[int64]): [(10, 40] < (40, 80]]

# 統計各個範圍的數量
print(pd.value_counts(bins_res))

(10, 40]    6
(40, 80]    4
dtype: int64

np.nan

df=pd.DataFrame([range(3),[2,np.nan,5],[np.nan,3,np.nan],range(3)])
print(df)

0    1    2
0  0.0  1.0  2.0
1  2.0  NaN  5.0
2  NaN  3.0  NaN
3  0.0  1.0  2.0

print(df.isnull())

0      1      2
0  False  False  False
1  False   True  False
2   True  False   True
3  False  False  False

# 將NaN填充成10
print(df.fillna(10))

0     1     2
0   0.0   1.0   2.0
1   2.0  10.0   5.0
2  10.0   3.0  10.0
3   0.0   1.0   2.0

# 查看某一行是否存在NaN
print(df[df.isnull().any(axis=1)])

0    1    2
1  2.0  NaN  5.0
2  NaN  3.0  NaN