工夫格式化和工夫查问
import pandas as pddate=pd.Timestamp('2020/09/27 13:30:00')print(date)
2020-09-27 13:30:00
#年print(date.year)#月print(date.month)#日print(date.day)#时print(date.hour)#分print(date.minute)#秒print(date.second)
202092713300
#加5天date+pd.Timedelta('5 days')
Timestamp('2020-10-02 13:30:00')
#工夫转换res=pd.to_datetime('2020-09-10 13:20:00')print(res.year)
2020
#生成一列数据se=pd.Series(['2020-11-24 00:00:00','2020-11-25 00:00:00','2020-11-26 00:00:00'])print(se)
0 2020-11-24 00:00:001 2020-11-25 00:00:002 2020-11-26 00:00:00dtype: object
# 转成工夫格局print(pd.to_datetime(se))
0 2020-11-241 2020-11-252 2020-11-26dtype: datetime64[ns]
# 生成一个等差的工夫序列,从2020-09-17开始,长度为periods,距离为12Hpd.Series(pd.date_range(start='2020-09-17',periods=10,freq='12H'))
0 2020-09-17 00:00:001 2020-09-17 12:00:002 2020-09-18 00:00:003 2020-09-18 12:00:004 2020-09-19 00:00:005 2020-09-19 12:00:006 2020-09-20 00:00:007 2020-09-20 12:00:008 2020-09-21 00:00:009 2020-09-21 12:00:00dtype: datetime64[ns]
df=pd.read_csv('./pandas/data/flowdata.csv')# 将Time列转换成工夫格局df['Time']=pd.to_datetime(df['Time'])df=df.set_index(df['Time'])print(df)
Time L06_347 LS06_347 LS06_348Time 2009-01-01 00:00:00 2009-01-01 00:00:00 0.137417 0.097500 0.0168332009-01-01 03:00:00 2009-01-01 03:00:00 0.131250 0.088833 0.0164172009-01-01 06:00:00 2009-01-01 06:00:00 0.113500 0.091250 0.0167502009-01-01 09:00:00 2009-01-01 09:00:00 0.135750 0.091500 0.0162502009-01-01 12:00:00 2009-01-01 12:00:00 0.140917 0.096167 0.017000... ... ... ... ...2013-01-01 12:00:00 2013-01-01 12:00:00 1.710000 1.710000 0.1295832013-01-01 15:00:00 2013-01-01 15:00:00 1.420000 1.420000 0.0963332013-01-01 18:00:00 2013-01-01 18:00:00 1.178583 1.178583 0.0830832013-01-01 21:00:00 2013-01-01 21:00:00 0.898250 0.898250 0.0771672013-01-02 00:00:00 2013-01-02 00:00:00 0.860000 0.860000 0.075000[11697 rows x 4 columns]
# 打印索引print(df.index)
DatetimeIndex(['2009-01-01 00:00:00', '2009-01-01 03:00:00', '2009-01-01 06:00:00', '2009-01-01 09:00:00', '2009-01-01 12:00:00', '2009-01-01 15:00:00', '2009-01-01 18:00:00', '2009-01-01 21:00:00', '2009-01-02 00:00:00', '2009-01-02 03:00:00', ... '2012-12-31 21:00:00', '2013-01-01 00:00:00', '2013-01-01 03:00:00', '2013-01-01 06:00:00', '2013-01-01 09:00:00', '2013-01-01 12:00:00', '2013-01-01 15:00:00', '2013-01-01 18:00:00', '2013-01-01 21:00:00', '2013-01-02 00:00:00'], dtype='datetime64[ns]', name='Time', length=11697, freq=None)
# 读取工夫的时候主动将工夫转换成datetimedf=pd.read_csv('./pandas/data/flowdata.csv',index_col=0,parse_dates=True)print(df)
L06_347 LS06_347 LS06_348Time 2009-01-01 00:00:00 0.137417 0.097500 0.0168332009-01-01 03:00:00 0.131250 0.088833 0.0164172009-01-01 06:00:00 0.113500 0.091250 0.0167502009-01-01 09:00:00 0.135750 0.091500 0.0162502009-01-01 12:00:00 0.140917 0.096167 0.017000... ... ... ...2013-01-01 12:00:00 1.710000 1.710000 0.1295832013-01-01 15:00:00 1.420000 1.420000 0.0963332013-01-01 18:00:00 1.178583 1.178583 0.0830832013-01-01 21:00:00 0.898250 0.898250 0.0771672013-01-02 00:00:00 0.860000 0.860000 0.075000[11697 rows x 3 columns]
# 通过工夫索引筛选数据print(df[('2009-01-01 00:00:00'):('2009-01-01 12:00:00')])
L06_347 LS06_347 LS06_348Time 2009-01-01 00:00:00 0.137417 0.097500 0.0168332009-01-01 03:00:00 0.131250 0.088833 0.0164172009-01-01 06:00:00 0.113500 0.091250 0.0167502009-01-01 09:00:00 0.135750 0.091500 0.0162502009-01-01 12:00:00 0.140917 0.096167 0.017000
# 查看数据的最初10条数据print(df.tail(10))
L06_347 LS06_347 LS06_348Time 2012-12-31 21:00:00 0.846500 0.846500 0.1701672013-01-01 00:00:00 1.688333 1.688333 0.2073332013-01-01 03:00:00 2.693333 2.693333 0.2015002013-01-01 06:00:00 2.220833 2.220833 0.1669172013-01-01 09:00:00 2.055000 2.055000 0.1756672013-01-01 12:00:00 1.710000 1.710000 0.1295832013-01-01 15:00:00 1.420000 1.420000 0.0963332013-01-01 18:00:00 1.178583 1.178583 0.0830832013-01-01 21:00:00 0.898250 0.898250 0.0771672013-01-02 00:00:00 0.860000 0.860000 0.075000
# 通过年,月,日筛选数据print(df['2012'])print(df['2012-01'])print(df['2012-01-31'])
L06_347 LS06_347 LS06_348Time 2012-01-01 00:00:00 0.307167 0.273917 0.0280002012-01-01 03:00:00 0.302917 0.270833 0.0305832012-01-01 06:00:00 0.331500 0.284750 0.0309172012-01-01 09:00:00 0.330750 0.293583 0.0297502012-01-01 12:00:00 0.295000 0.285167 0.031750... ... ... ...2012-12-31 09:00:00 0.682750 0.682750 0.0665832012-12-31 12:00:00 0.651250 0.651250 0.0638332012-12-31 15:00:00 0.629000 0.629000 0.0618332012-12-31 18:00:00 0.617333 0.617333 0.0605832012-12-31 21:00:00 0.846500 0.846500 0.170167[2928 rows x 3 columns] L06_347 LS06_347 LS06_348Time 2012-01-01 00:00:00 0.307167 0.273917 0.0280002012-01-01 03:00:00 0.302917 0.270833 0.0305832012-01-01 06:00:00 0.331500 0.284750 0.0309172012-01-01 09:00:00 0.330750 0.293583 0.0297502012-01-01 12:00:00 0.295000 0.285167 0.031750... ... ... ...2012-01-31 09:00:00 0.191000 0.231250 0.0255832012-01-31 12:00:00 0.183333 0.227167 0.0259172012-01-31 15:00:00 0.163417 0.221000 0.0237502012-01-31 18:00:00 0.157083 0.220667 0.0231672012-01-31 21:00:00 0.160083 0.214750 0.023333[248 rows x 3 columns] L06_347 LS06_347 LS06_348Time 2012-01-31 00:00:00 0.191250 0.247417 0.0259172012-01-31 03:00:00 0.181083 0.241583 0.0258332012-01-31 06:00:00 0.188750 0.236750 0.0260002012-01-31 09:00:00 0.191000 0.231250 0.0255832012-01-31 12:00:00 0.183333 0.227167 0.0259172012-01-31 15:00:00 0.163417 0.221000 0.0237502012-01-31 18:00:00 0.157083 0.220667 0.0231672012-01-31 21:00:00 0.160083 0.214750 0.023333
# 抉择一段时间的数据print(df['2009-01-01':'2012-01-01'])
L06_347 LS06_347 LS06_348Time 2009-01-01 00:00:00 0.137417 0.097500 0.0168332009-01-01 03:00:00 0.131250 0.088833 0.0164172009-01-01 06:00:00 0.113500 0.091250 0.0167502009-01-01 09:00:00 0.135750 0.091500 0.0162502009-01-01 12:00:00 0.140917 0.096167 0.017000... ... ... ...2012-01-01 09:00:00 0.330750 0.293583 0.0297502012-01-01 12:00:00 0.295000 0.285167 0.0317502012-01-01 15:00:00 0.301417 0.287750 0.0314172012-01-01 18:00:00 0.322083 0.304167 0.0380832012-01-01 21:00:00 0.355417 0.346500 0.080917[8768 rows x 3 columns]
# 筛选所有1月份的数据print(df[df.index.month==1])
L06_347 LS06_347 LS06_348Time 2009-01-01 00:00:00 0.137417 0.097500 0.0168332009-01-01 03:00:00 0.131250 0.088833 0.0164172009-01-01 06:00:00 0.113500 0.091250 0.0167502009-01-01 09:00:00 0.135750 0.091500 0.0162502009-01-01 12:00:00 0.140917 0.096167 0.017000... ... ... ...2013-01-01 12:00:00 1.710000 1.710000 0.1295832013-01-01 15:00:00 1.420000 1.420000 0.0963332013-01-01 18:00:00 1.178583 1.178583 0.0830832013-01-01 21:00:00 0.898250 0.898250 0.0771672013-01-02 00:00:00 0.860000 0.860000 0.075000[1001 rows x 3 columns]
# 筛选所有8点到12点的数据print(df[(df.index.hour>8) & (df.index.hour<12)])# 或者通过between_time筛选8点到12点的数据print(df.between_time('8:00','12:00'))
L06_347 LS06_347 LS06_348Time 2009-01-01 09:00:00 0.135750 0.091500 0.0162502009-01-02 09:00:00 0.141917 0.097083 0.0164172009-01-03 09:00:00 0.124583 0.084417 0.0158332009-01-04 09:00:00 0.109000 0.105167 0.0180002009-01-05 09:00:00 0.161500 0.114583 0.021583... ... ... ...2012-12-28 09:00:00 0.961500 0.961500 0.0924172012-12-29 09:00:00 0.786833 0.786833 0.0770002012-12-30 09:00:00 0.916000 0.916000 0.1015832012-12-31 09:00:00 0.682750 0.682750 0.0665832013-01-01 09:00:00 2.055000 2.055000 0.175667[1462 rows x 3 columns] L06_347 LS06_347 LS06_348Time 2009-01-01 09:00:00 0.135750 0.091500 0.0162502009-01-01 12:00:00 0.140917 0.096167 0.0170002009-01-02 09:00:00 0.141917 0.097083 0.0164172009-01-02 12:00:00 0.147833 0.101917 0.0164172009-01-03 09:00:00 0.124583 0.084417 0.015833... ... ... ...2012-12-30 12:00:00 1.465000 1.465000 0.0868332012-12-31 09:00:00 0.682750 0.682750 0.0665832012-12-31 12:00:00 0.651250 0.651250 0.0638332013-01-01 09:00:00 2.055000 2.055000 0.1756672013-01-01 12:00:00 1.710000 1.710000 0.129583[2924 rows x 3 columns]
resample重采样
# 按天求平均值df=df.resample('D').mean()print(df)
L06_347 LS06_347 LS06_348Time 2009-01-01 0.125010 0.092281 0.0166352009-01-02 0.124146 0.095781 0.0164062009-01-03 0.113562 0.085542 0.0160942009-01-04 0.140198 0.102708 0.0173232009-01-05 0.128812 0.104490 0.018167... ... ... ...2012-12-29 0.807604 0.807604 0.0780312012-12-30 1.027240 1.027240 0.0880002012-12-31 0.748365 0.748365 0.0814172013-01-01 1.733042 1.733042 0.1421982013-01-02 0.860000 0.860000 0.075000[1463 rows x 3 columns]
# 求3天的平均值print(df.resample('3D').mean())
L06_347 LS06_347 LS06_348Time 2009-01-01 0.120906 0.091201 0.0163782009-01-04 0.121594 0.091708 0.0166702009-01-07 0.097042 0.070740 0.0144792009-01-10 0.115941 0.086340 0.0145452009-01-13 0.346962 0.364549 0.034198... ... ... ...2012-12-20 0.996337 0.996337 0.1144722012-12-23 2.769059 2.769059 0.2255422012-12-26 1.451583 1.451583 0.1401012012-12-29 0.861069 0.861069 0.0824832013-01-01 1.296521 1.296521 0.108599[488 rows x 3 columns]
# 求1个月的平均值print(df.resample('M').mean().head())
L06_347 LS06_347 LS06_348Time 2009-01-31 0.517864 0.536660 0.0455972009-02-28 0.516847 0.529987 0.0472382009-03-31 0.372536 0.382359 0.0375082009-04-30 0.163182 0.129354 0.0213562009-05-31 0.178588 0.160616 0.020744
# 求某个工夫单位的平均值,最大值,最小值print(df.resample('M').min().head())print(df.resample('M').max().head())
L06_347 LS06_347 LS06_348Time 2009-01-31 0.078156 0.058438 0.0135732009-02-28 0.182646 0.135667 0.0190732009-03-31 0.131385 0.098875 0.0169792009-04-30 0.078510 0.066375 0.0139172009-05-31 0.060771 0.047969 0.013656 L06_347 LS06_347 LS06_348Time 2009-01-31 5.933531 6.199927 0.4047082009-02-28 4.407604 4.724583 0.2317502009-03-31 1.337896 1.586833 0.1169692009-04-30 0.275698 0.247312 0.0373752009-05-31 2.184250 2.433073 0.168792
列排序
df = pd.DataFrame({'group':['a','a','a','b','b','b','c','c','c'], 'data':[4,3,2,1,12,3,4,5,7]})print(df)
group data0 a 41 a 32 a 23 b 14 b 125 b 36 c 47 c 58 c 7
# 默认(ascending=True)从小到大排列. 从大到小(ascending=False)df.sort_values(by=['group','data'],ascending=[False,True],inplace=True)print(df)
group data6 c 47 c 58 c 73 b 15 b 34 b 122 a 21 a 30 a 4
df=pd.DataFrame({ 'k1':['one']*3+['tow']*3, 'k2':[1,2,3,4,5,6] })print(df)
k1 k20 one 11 one 22 one 33 tow 44 tow 55 tow 6
# 排序print(df.sort_values(by='k2',ascending=False))
k1 k25 tow 64 tow 53 tow 42 one 31 one 20 one 1
# 删除反复数据print(df.drop_duplicates())
k1 k20 one 11 one 22 one 33 tow 44 tow 55 tow 6
# 按某列去重删除数据print(df.drop_duplicates(subset='k1'))
k1 k20 one 13 tow 4
df = pd.DataFrame({'food':['A1','A2','B1','B2','B3','C1','C2'],'data':[1,2,3,4,5,6,7]})print(df)
food data0 A1 11 A2 22 B1 33 B2 44 B3 55 C1 66 C2 7
# 将A1,A2归类到A, B1,B2,B3归类B,C1,C2归类到Cdict1 = { 'A1':'A', 'A2':'A', 'B1':'B', 'B2':'B', 'B3':'B', 'C1':'C', 'C2':'C'}df['Upper']=df['food'].map(dict1)print(df)
food data Upper0 A1 1 A1 A2 2 A2 B1 3 B3 B2 4 B4 B3 5 B5 C1 6 C6 C2 7 C
import numpy as npdf=pd.DataFrame({'k1':np.random.randn(5),'k2':np.random.randn(5)})print(df)df2=df.assign(ration=df['k1']/df['k2'])print(df2)
k1 k20 1.977668 -1.1362511 0.550649 0.0101312 0.723699 0.3045363 -0.247529 0.0303594 -0.351775 0.732785 k1 k2 ration0 1.977668 -1.136251 -1.7405201 0.550649 0.010131 54.3551312 0.723699 0.304536 2.3763943 -0.247529 0.030359 -8.1533894 -0.351775 0.732785 -0.480052
# 删除ration列df2.drop('ration',axis='columns',inplace=True)print(df2)
k1 k20 1.977668 -1.1362511 0.550649 0.0101312 0.723699 0.3045363 -0.247529 0.0303594 -0.351775 0.732785
数据替换
se=pd.Series([1,2,3,4,5,6,7,8])print(se)
0 11 22 33 44 55 66 77 8dtype: int64
se.replace(6,np.nan,inplace=True)print(se)
0 1.01 2.02 3.03 4.04 5.05 NaN6 7.07 8.0dtype: float64
Pandas.cut计算每个值在给定的哪个范畴
ages = [15,18,20,21,22,34,41,52,63,79]bins = [10,40,80]bins_res = pd.cut(ages,bins)print(bins_res)
[(10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (40, 80], (40, 80], (40, 80], (40, 80]]Categories (2, interval[int64]): [(10, 40] < (40, 80]]
# 统计各个范畴的数量print(pd.value_counts(bins_res))
(10, 40] 6(40, 80] 4dtype: int64
np.nan
df=pd.DataFrame([range(3),[2,np.nan,5],[np.nan,3,np.nan],range(3)])print(df)
0 1 20 0.0 1.0 2.01 2.0 NaN 5.02 NaN 3.0 NaN3 0.0 1.0 2.0
print(df.isnull())
0 1 20 False False False1 False True False2 True False True3 False False False
# 将NaN填充成10print(df.fillna(10))
0 1 20 0.0 1.0 2.01 2.0 10.0 5.02 10.0 3.0 10.03 0.0 1.0 2.0
# 查看某一行是否存在NaNprint(df[df.isnull().any(axis=1)])
0 1 21 2.0 NaN 5.02 NaN 3.0 NaN