共计 11022 个字符,预计需要花费 28 分钟才能阅读完成。
工夫格式化和工夫查问
import pandas as pd
date=pd.Timestamp('2020/09/27 13:30:00')
print(date)
2020-09-27 13:30:00
# 年
print(date.year)
#月
print(date.month)
#日
print(date.day)
#时
print(date.hour)
#分
print(date.minute)
#秒
print(date.second)
2020
9
27
13
30
0
# 加 5 天
date+pd.Timedelta('5 days')
Timestamp('2020-10-02 13:30:00')
# 工夫转换
res=pd.to_datetime('2020-09-10 13:20:00')
print(res.year)
2020
# 生成一列数据
se=pd.Series(['2020-11-24 00:00:00','2020-11-25 00:00:00','2020-11-26 00:00:00'])
print(se)
0 2020-11-24 00:00:00
1 2020-11-25 00:00:00
2 2020-11-26 00:00:00
dtype: object
# 转成工夫格局
print(pd.to_datetime(se))
0 2020-11-24
1 2020-11-25
2 2020-11-26
dtype: datetime64[ns]
# 生成一个等差的工夫序列, 从 2020-09-17 开始, 长度为 periods, 距离为 12H
pd.Series(pd.date_range(start='2020-09-17',periods=10,freq='12H'))
0 2020-09-17 00:00:00
1 2020-09-17 12:00:00
2 2020-09-18 00:00:00
3 2020-09-18 12:00:00
4 2020-09-19 00:00:00
5 2020-09-19 12:00:00
6 2020-09-20 00:00:00
7 2020-09-20 12:00:00
8 2020-09-21 00:00:00
9 2020-09-21 12:00:00
dtype: datetime64[ns]
df=pd.read_csv('./pandas/data/flowdata.csv')
# 将 Time 列转换成工夫格局
df['Time']=pd.to_datetime(df['Time'])
df=df.set_index(df['Time'])
print(df)
Time L06_347 LS06_347 LS06_348
Time
2009-01-01 00:00:00 2009-01-01 00:00:00 0.137417 0.097500 0.016833
2009-01-01 03:00:00 2009-01-01 03:00:00 0.131250 0.088833 0.016417
2009-01-01 06:00:00 2009-01-01 06:00:00 0.113500 0.091250 0.016750
2009-01-01 09:00:00 2009-01-01 09:00:00 0.135750 0.091500 0.016250
2009-01-01 12:00:00 2009-01-01 12:00:00 0.140917 0.096167 0.017000
... ... ... ... ...
2013-01-01 12:00:00 2013-01-01 12:00:00 1.710000 1.710000 0.129583
2013-01-01 15:00:00 2013-01-01 15:00:00 1.420000 1.420000 0.096333
2013-01-01 18:00:00 2013-01-01 18:00:00 1.178583 1.178583 0.083083
2013-01-01 21:00:00 2013-01-01 21:00:00 0.898250 0.898250 0.077167
2013-01-02 00:00:00 2013-01-02 00:00:00 0.860000 0.860000 0.075000
[11697 rows x 4 columns]
# 打印索引
print(df.index)
DatetimeIndex(['2009-01-01 00:00:00', '2009-01-01 03:00:00',
'2009-01-01 06:00:00', '2009-01-01 09:00:00',
'2009-01-01 12:00:00', '2009-01-01 15:00:00',
'2009-01-01 18:00:00', '2009-01-01 21:00:00',
'2009-01-02 00:00:00', '2009-01-02 03:00:00',
...
'2012-12-31 21:00:00', '2013-01-01 00:00:00',
'2013-01-01 03:00:00', '2013-01-01 06:00:00',
'2013-01-01 09:00:00', '2013-01-01 12:00:00',
'2013-01-01 15:00:00', '2013-01-01 18:00:00',
'2013-01-01 21:00:00', '2013-01-02 00:00:00'],
dtype='datetime64[ns]', name='Time', length=11697, freq=None)
# 读取工夫的时候主动将工夫转换成 datetime
df=pd.read_csv('./pandas/data/flowdata.csv',index_col=0,parse_dates=True)
print(df)
L06_347 LS06_347 LS06_348
Time
2009-01-01 00:00:00 0.137417 0.097500 0.016833
2009-01-01 03:00:00 0.131250 0.088833 0.016417
2009-01-01 06:00:00 0.113500 0.091250 0.016750
2009-01-01 09:00:00 0.135750 0.091500 0.016250
2009-01-01 12:00:00 0.140917 0.096167 0.017000
... ... ... ...
2013-01-01 12:00:00 1.710000 1.710000 0.129583
2013-01-01 15:00:00 1.420000 1.420000 0.096333
2013-01-01 18:00:00 1.178583 1.178583 0.083083
2013-01-01 21:00:00 0.898250 0.898250 0.077167
2013-01-02 00:00:00 0.860000 0.860000 0.075000
[11697 rows x 3 columns]
# 通过工夫索引筛选数据
print(df[('2009-01-01 00:00:00'):('2009-01-01 12:00:00')])
L06_347 LS06_347 LS06_348
Time
2009-01-01 00:00:00 0.137417 0.097500 0.016833
2009-01-01 03:00:00 0.131250 0.088833 0.016417
2009-01-01 06:00:00 0.113500 0.091250 0.016750
2009-01-01 09:00:00 0.135750 0.091500 0.016250
2009-01-01 12:00:00 0.140917 0.096167 0.017000
# 查看数据的最初 10 条数据
print(df.tail(10))
L06_347 LS06_347 LS06_348
Time
2012-12-31 21:00:00 0.846500 0.846500 0.170167
2013-01-01 00:00:00 1.688333 1.688333 0.207333
2013-01-01 03:00:00 2.693333 2.693333 0.201500
2013-01-01 06:00:00 2.220833 2.220833 0.166917
2013-01-01 09:00:00 2.055000 2.055000 0.175667
2013-01-01 12:00:00 1.710000 1.710000 0.129583
2013-01-01 15:00:00 1.420000 1.420000 0.096333
2013-01-01 18:00:00 1.178583 1.178583 0.083083
2013-01-01 21:00:00 0.898250 0.898250 0.077167
2013-01-02 00:00:00 0.860000 0.860000 0.075000
# 通过年,月,日筛选数据
print(df['2012'])
print(df['2012-01'])
print(df['2012-01-31'])
L06_347 LS06_347 LS06_348
Time
2012-01-01 00:00:00 0.307167 0.273917 0.028000
2012-01-01 03:00:00 0.302917 0.270833 0.030583
2012-01-01 06:00:00 0.331500 0.284750 0.030917
2012-01-01 09:00:00 0.330750 0.293583 0.029750
2012-01-01 12:00:00 0.295000 0.285167 0.031750
... ... ... ...
2012-12-31 09:00:00 0.682750 0.682750 0.066583
2012-12-31 12:00:00 0.651250 0.651250 0.063833
2012-12-31 15:00:00 0.629000 0.629000 0.061833
2012-12-31 18:00:00 0.617333 0.617333 0.060583
2012-12-31 21:00:00 0.846500 0.846500 0.170167
[2928 rows x 3 columns]
L06_347 LS06_347 LS06_348
Time
2012-01-01 00:00:00 0.307167 0.273917 0.028000
2012-01-01 03:00:00 0.302917 0.270833 0.030583
2012-01-01 06:00:00 0.331500 0.284750 0.030917
2012-01-01 09:00:00 0.330750 0.293583 0.029750
2012-01-01 12:00:00 0.295000 0.285167 0.031750
... ... ... ...
2012-01-31 09:00:00 0.191000 0.231250 0.025583
2012-01-31 12:00:00 0.183333 0.227167 0.025917
2012-01-31 15:00:00 0.163417 0.221000 0.023750
2012-01-31 18:00:00 0.157083 0.220667 0.023167
2012-01-31 21:00:00 0.160083 0.214750 0.023333
[248 rows x 3 columns]
L06_347 LS06_347 LS06_348
Time
2012-01-31 00:00:00 0.191250 0.247417 0.025917
2012-01-31 03:00:00 0.181083 0.241583 0.025833
2012-01-31 06:00:00 0.188750 0.236750 0.026000
2012-01-31 09:00:00 0.191000 0.231250 0.025583
2012-01-31 12:00:00 0.183333 0.227167 0.025917
2012-01-31 15:00:00 0.163417 0.221000 0.023750
2012-01-31 18:00:00 0.157083 0.220667 0.023167
2012-01-31 21:00:00 0.160083 0.214750 0.023333
# 抉择一段时间的数据
print(df['2009-01-01':'2012-01-01'])
L06_347 LS06_347 LS06_348
Time
2009-01-01 00:00:00 0.137417 0.097500 0.016833
2009-01-01 03:00:00 0.131250 0.088833 0.016417
2009-01-01 06:00:00 0.113500 0.091250 0.016750
2009-01-01 09:00:00 0.135750 0.091500 0.016250
2009-01-01 12:00:00 0.140917 0.096167 0.017000
... ... ... ...
2012-01-01 09:00:00 0.330750 0.293583 0.029750
2012-01-01 12:00:00 0.295000 0.285167 0.031750
2012-01-01 15:00:00 0.301417 0.287750 0.031417
2012-01-01 18:00:00 0.322083 0.304167 0.038083
2012-01-01 21:00:00 0.355417 0.346500 0.080917
[8768 rows x 3 columns]
# 筛选所有 1 月份的数据
print(df[df.index.month==1])
L06_347 LS06_347 LS06_348
Time
2009-01-01 00:00:00 0.137417 0.097500 0.016833
2009-01-01 03:00:00 0.131250 0.088833 0.016417
2009-01-01 06:00:00 0.113500 0.091250 0.016750
2009-01-01 09:00:00 0.135750 0.091500 0.016250
2009-01-01 12:00:00 0.140917 0.096167 0.017000
... ... ... ...
2013-01-01 12:00:00 1.710000 1.710000 0.129583
2013-01-01 15:00:00 1.420000 1.420000 0.096333
2013-01-01 18:00:00 1.178583 1.178583 0.083083
2013-01-01 21:00:00 0.898250 0.898250 0.077167
2013-01-02 00:00:00 0.860000 0.860000 0.075000
[1001 rows x 3 columns]
# 筛选所有 8 点到 12 点的数据
print(df[(df.index.hour>8) & (df.index.hour<12)])
# 或者通过 between_time 筛选 8 点到 12 点的数据
print(df.between_time('8:00','12:00'))
L06_347 LS06_347 LS06_348
Time
2009-01-01 09:00:00 0.135750 0.091500 0.016250
2009-01-02 09:00:00 0.141917 0.097083 0.016417
2009-01-03 09:00:00 0.124583 0.084417 0.015833
2009-01-04 09:00:00 0.109000 0.105167 0.018000
2009-01-05 09:00:00 0.161500 0.114583 0.021583
... ... ... ...
2012-12-28 09:00:00 0.961500 0.961500 0.092417
2012-12-29 09:00:00 0.786833 0.786833 0.077000
2012-12-30 09:00:00 0.916000 0.916000 0.101583
2012-12-31 09:00:00 0.682750 0.682750 0.066583
2013-01-01 09:00:00 2.055000 2.055000 0.175667
[1462 rows x 3 columns]
L06_347 LS06_347 LS06_348
Time
2009-01-01 09:00:00 0.135750 0.091500 0.016250
2009-01-01 12:00:00 0.140917 0.096167 0.017000
2009-01-02 09:00:00 0.141917 0.097083 0.016417
2009-01-02 12:00:00 0.147833 0.101917 0.016417
2009-01-03 09:00:00 0.124583 0.084417 0.015833
... ... ... ...
2012-12-30 12:00:00 1.465000 1.465000 0.086833
2012-12-31 09:00:00 0.682750 0.682750 0.066583
2012-12-31 12:00:00 0.651250 0.651250 0.063833
2013-01-01 09:00:00 2.055000 2.055000 0.175667
2013-01-01 12:00:00 1.710000 1.710000 0.129583
[2924 rows x 3 columns]
resample 重采样
# 按天求平均值
df=df.resample('D').mean()
print(df)
L06_347 LS06_347 LS06_348
Time
2009-01-01 0.125010 0.092281 0.016635
2009-01-02 0.124146 0.095781 0.016406
2009-01-03 0.113562 0.085542 0.016094
2009-01-04 0.140198 0.102708 0.017323
2009-01-05 0.128812 0.104490 0.018167
... ... ... ...
2012-12-29 0.807604 0.807604 0.078031
2012-12-30 1.027240 1.027240 0.088000
2012-12-31 0.748365 0.748365 0.081417
2013-01-01 1.733042 1.733042 0.142198
2013-01-02 0.860000 0.860000 0.075000
[1463 rows x 3 columns]
# 求 3 天的平均值
print(df.resample('3D').mean())
L06_347 LS06_347 LS06_348
Time
2009-01-01 0.120906 0.091201 0.016378
2009-01-04 0.121594 0.091708 0.016670
2009-01-07 0.097042 0.070740 0.014479
2009-01-10 0.115941 0.086340 0.014545
2009-01-13 0.346962 0.364549 0.034198
... ... ... ...
2012-12-20 0.996337 0.996337 0.114472
2012-12-23 2.769059 2.769059 0.225542
2012-12-26 1.451583 1.451583 0.140101
2012-12-29 0.861069 0.861069 0.082483
2013-01-01 1.296521 1.296521 0.108599
[488 rows x 3 columns]
# 求 1 个月的平均值
print(df.resample('M').mean().head())
L06_347 LS06_347 LS06_348
Time
2009-01-31 0.517864 0.536660 0.045597
2009-02-28 0.516847 0.529987 0.047238
2009-03-31 0.372536 0.382359 0.037508
2009-04-30 0.163182 0.129354 0.021356
2009-05-31 0.178588 0.160616 0.020744
# 求某个工夫单位的平均值,最大值,最小值
print(df.resample('M').min().head())
print(df.resample('M').max().head())
L06_347 LS06_347 LS06_348
Time
2009-01-31 0.078156 0.058438 0.013573
2009-02-28 0.182646 0.135667 0.019073
2009-03-31 0.131385 0.098875 0.016979
2009-04-30 0.078510 0.066375 0.013917
2009-05-31 0.060771 0.047969 0.013656
L06_347 LS06_347 LS06_348
Time
2009-01-31 5.933531 6.199927 0.404708
2009-02-28 4.407604 4.724583 0.231750
2009-03-31 1.337896 1.586833 0.116969
2009-04-30 0.275698 0.247312 0.037375
2009-05-31 2.184250 2.433073 0.168792
列排序
df = pd.DataFrame({'group':['a','a','a','b','b','b','c','c','c'],
'data':[4,3,2,1,12,3,4,5,7]})
print(df)
group data
0 a 4
1 a 3
2 a 2
3 b 1
4 b 12
5 b 3
6 c 4
7 c 5
8 c 7
# 默认 (ascending=True) 从小到大排列. 从大到小(ascending=False)
df.sort_values(by=['group','data'],ascending=[False,True],inplace=True)
print(df)
group data
6 c 4
7 c 5
8 c 7
3 b 1
5 b 3
4 b 12
2 a 2
1 a 3
0 a 4
df=pd.DataFrame({'k1':['one']*3+['tow']*3,
'k2':[1,2,3,4,5,6]
})
print(df)
k1 k2
0 one 1
1 one 2
2 one 3
3 tow 4
4 tow 5
5 tow 6
# 排序
print(df.sort_values(by='k2',ascending=False))
k1 k2
5 tow 6
4 tow 5
3 tow 4
2 one 3
1 one 2
0 one 1
# 删除反复数据
print(df.drop_duplicates())
k1 k2
0 one 1
1 one 2
2 one 3
3 tow 4
4 tow 5
5 tow 6
# 按某列去重删除数据
print(df.drop_duplicates(subset='k1'))
k1 k2
0 one 1
3 tow 4
df = pd.DataFrame({'food':['A1','A2','B1','B2','B3','C1','C2'],'data':[1,2,3,4,5,6,7]})
print(df)
food data
0 A1 1
1 A2 2
2 B1 3
3 B2 4
4 B3 5
5 C1 6
6 C2 7
# 将 A1,A2 归类到 A, B1,B2,B3 归类 B,C1,C2 归类到 C
dict1 = {
'A1':'A',
'A2':'A',
'B1':'B',
'B2':'B',
'B3':'B',
'C1':'C',
'C2':'C'
}
df['Upper']=df['food'].map(dict1)
print(df)
food data Upper
0 A1 1 A
1 A2 2 A
2 B1 3 B
3 B2 4 B
4 B3 5 B
5 C1 6 C
6 C2 7 C
import numpy as np
df=pd.DataFrame({'k1':np.random.randn(5),'k2':np.random.randn(5)})
print(df)
df2=df.assign(ration=df['k1']/df['k2'])
print(df2)
k1 k2
0 1.977668 -1.136251
1 0.550649 0.010131
2 0.723699 0.304536
3 -0.247529 0.030359
4 -0.351775 0.732785
k1 k2 ration
0 1.977668 -1.136251 -1.740520
1 0.550649 0.010131 54.355131
2 0.723699 0.304536 2.376394
3 -0.247529 0.030359 -8.153389
4 -0.351775 0.732785 -0.480052
# 删除 ration 列
df2.drop('ration',axis='columns',inplace=True)
print(df2)
k1 k2
0 1.977668 -1.136251
1 0.550649 0.010131
2 0.723699 0.304536
3 -0.247529 0.030359
4 -0.351775 0.732785
数据替换
se=pd.Series([1,2,3,4,5,6,7,8])
print(se)
0 1
1 2
2 3
3 4
4 5
5 6
6 7
7 8
dtype: int64
se.replace(6,np.nan,inplace=True)
print(se)
0 1.0
1 2.0
2 3.0
3 4.0
4 5.0
5 NaN
6 7.0
7 8.0
dtype: float64
Pandas.cut 计算每个值在给定的哪个范畴
ages = [15,18,20,21,22,34,41,52,63,79]
bins = [10,40,80]
bins_res = pd.cut(ages,bins)
print(bins_res)
[(10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (40, 80], (40, 80], (40, 80], (40, 80]]
Categories (2, interval[int64]): [(10, 40] < (40, 80]]
# 统计各个范畴的数量
print(pd.value_counts(bins_res))
(10, 40] 6
(40, 80] 4
dtype: int64
np.nan
df=pd.DataFrame([range(3),[2,np.nan,5],[np.nan,3,np.nan],range(3)])
print(df)
0 1 2
0 0.0 1.0 2.0
1 2.0 NaN 5.0
2 NaN 3.0 NaN
3 0.0 1.0 2.0
print(df.isnull())
0 1 2
0 False False False
1 False True False
2 True False True
3 False False False
# 将 NaN 填充成 10
print(df.fillna(10))
0 1 2
0 0.0 1.0 2.0
1 2.0 10.0 5.0
2 10.0 3.0 10.0
3 0.0 1.0 2.0
# 查看某一行是否存在 NaN
print(df[df.isnull().any(axis=1)])
0 1 2
1 2.0 NaN 5.0
2 NaN 3.0 NaN
正文完