关于python:Pandas入门教程三

工夫格式化和工夫查问

import pandas as pddate=pd.Timestamp('2020/09/27 13:30:00')print(date)

2020-09-27 13:30:00

#年print(date.year)#月print(date.month)#日print(date.day)#时print(date.hour)#分print(date.minute)#秒print(date.second)

202092713300

#加5天date+pd.Timedelta('5 days')

Timestamp('2020-10-02 13:30:00')

#工夫转换res=pd.to_datetime('2020-09-10 13:20:00')print(res.year)

#生成一列数据se=pd.Series(['2020-11-24 00:00:00','2020-11-25 00:00:00','2020-11-26 00:00:00'])print(se)

0    2020-11-24 00:00:001    2020-11-25 00:00:002    2020-11-26 00:00:00dtype: object

# 转成工夫格局print(pd.to_datetime(se))

0   2020-11-241   2020-11-252   2020-11-26dtype: datetime64[ns]

# 生成一个等差的工夫序列,从2020-09-17开始,长度为periods,距离为12Hpd.Series(pd.date_range(start='2020-09-17',periods=10,freq='12H'))

0   2020-09-17 00:00:001   2020-09-17 12:00:002   2020-09-18 00:00:003   2020-09-18 12:00:004   2020-09-19 00:00:005   2020-09-19 12:00:006   2020-09-20 00:00:007   2020-09-20 12:00:008   2020-09-21 00:00:009   2020-09-21 12:00:00dtype: datetime64[ns]

df=pd.read_csv('./pandas/data/flowdata.csv')# 将Time列转换成工夫格局df['Time']=pd.to_datetime(df['Time'])df=df.set_index(df['Time'])print(df)

                                   Time   L06_347  LS06_347  LS06_348Time                                                                 2009-01-01 00:00:00 2009-01-01 00:00:00  0.137417  0.097500  0.0168332009-01-01 03:00:00 2009-01-01 03:00:00  0.131250  0.088833  0.0164172009-01-01 06:00:00 2009-01-01 06:00:00  0.113500  0.091250  0.0167502009-01-01 09:00:00 2009-01-01 09:00:00  0.135750  0.091500  0.0162502009-01-01 12:00:00 2009-01-01 12:00:00  0.140917  0.096167  0.017000...                                 ...       ...       ...       ...2013-01-01 12:00:00 2013-01-01 12:00:00  1.710000  1.710000  0.1295832013-01-01 15:00:00 2013-01-01 15:00:00  1.420000  1.420000  0.0963332013-01-01 18:00:00 2013-01-01 18:00:00  1.178583  1.178583  0.0830832013-01-01 21:00:00 2013-01-01 21:00:00  0.898250  0.898250  0.0771672013-01-02 00:00:00 2013-01-02 00:00:00  0.860000  0.860000  0.075000[11697 rows x 4 columns]

# 打印索引print(df.index)

DatetimeIndex(['2009-01-01 00:00:00', '2009-01-01 03:00:00',               '2009-01-01 06:00:00', '2009-01-01 09:00:00',               '2009-01-01 12:00:00', '2009-01-01 15:00:00',               '2009-01-01 18:00:00', '2009-01-01 21:00:00',               '2009-01-02 00:00:00', '2009-01-02 03:00:00',               ...               '2012-12-31 21:00:00', '2013-01-01 00:00:00',               '2013-01-01 03:00:00', '2013-01-01 06:00:00',               '2013-01-01 09:00:00', '2013-01-01 12:00:00',               '2013-01-01 15:00:00', '2013-01-01 18:00:00',               '2013-01-01 21:00:00', '2013-01-02 00:00:00'],              dtype='datetime64[ns]', name='Time', length=11697, freq=None)

# 读取工夫的时候主动将工夫转换成datetimedf=pd.read_csv('./pandas/data/flowdata.csv',index_col=0,parse_dates=True)print(df)

                      L06_347  LS06_347  LS06_348Time                                             2009-01-01 00:00:00  0.137417  0.097500  0.0168332009-01-01 03:00:00  0.131250  0.088833  0.0164172009-01-01 06:00:00  0.113500  0.091250  0.0167502009-01-01 09:00:00  0.135750  0.091500  0.0162502009-01-01 12:00:00  0.140917  0.096167  0.017000...                       ...       ...       ...2013-01-01 12:00:00  1.710000  1.710000  0.1295832013-01-01 15:00:00  1.420000  1.420000  0.0963332013-01-01 18:00:00  1.178583  1.178583  0.0830832013-01-01 21:00:00  0.898250  0.898250  0.0771672013-01-02 00:00:00  0.860000  0.860000  0.075000[11697 rows x 3 columns]

# 通过工夫索引筛选数据print(df[('2009-01-01 00:00:00'):('2009-01-01 12:00:00')])

                      L06_347  LS06_347  LS06_348Time                                             2009-01-01 00:00:00  0.137417  0.097500  0.0168332009-01-01 03:00:00  0.131250  0.088833  0.0164172009-01-01 06:00:00  0.113500  0.091250  0.0167502009-01-01 09:00:00  0.135750  0.091500  0.0162502009-01-01 12:00:00  0.140917  0.096167  0.017000

# 查看数据的最初10条数据print(df.tail(10))

                      L06_347  LS06_347  LS06_348Time                                             2012-12-31 21:00:00  0.846500  0.846500  0.1701672013-01-01 00:00:00  1.688333  1.688333  0.2073332013-01-01 03:00:00  2.693333  2.693333  0.2015002013-01-01 06:00:00  2.220833  2.220833  0.1669172013-01-01 09:00:00  2.055000  2.055000  0.1756672013-01-01 12:00:00  1.710000  1.710000  0.1295832013-01-01 15:00:00  1.420000  1.420000  0.0963332013-01-01 18:00:00  1.178583  1.178583  0.0830832013-01-01 21:00:00  0.898250  0.898250  0.0771672013-01-02 00:00:00  0.860000  0.860000  0.075000

# 通过年，月，日筛选数据print(df['2012'])print(df['2012-01'])print(df['2012-01-31'])

                      L06_347  LS06_347  LS06_348Time                                             2012-01-01 00:00:00  0.307167  0.273917  0.0280002012-01-01 03:00:00  0.302917  0.270833  0.0305832012-01-01 06:00:00  0.331500  0.284750  0.0309172012-01-01 09:00:00  0.330750  0.293583  0.0297502012-01-01 12:00:00  0.295000  0.285167  0.031750...                       ...       ...       ...2012-12-31 09:00:00  0.682750  0.682750  0.0665832012-12-31 12:00:00  0.651250  0.651250  0.0638332012-12-31 15:00:00  0.629000  0.629000  0.0618332012-12-31 18:00:00  0.617333  0.617333  0.0605832012-12-31 21:00:00  0.846500  0.846500  0.170167[2928 rows x 3 columns]                      L06_347  LS06_347  LS06_348Time                                             2012-01-01 00:00:00  0.307167  0.273917  0.0280002012-01-01 03:00:00  0.302917  0.270833  0.0305832012-01-01 06:00:00  0.331500  0.284750  0.0309172012-01-01 09:00:00  0.330750  0.293583  0.0297502012-01-01 12:00:00  0.295000  0.285167  0.031750...                       ...       ...       ...2012-01-31 09:00:00  0.191000  0.231250  0.0255832012-01-31 12:00:00  0.183333  0.227167  0.0259172012-01-31 15:00:00  0.163417  0.221000  0.0237502012-01-31 18:00:00  0.157083  0.220667  0.0231672012-01-31 21:00:00  0.160083  0.214750  0.023333[248 rows x 3 columns]                      L06_347  LS06_347  LS06_348Time                                             2012-01-31 00:00:00  0.191250  0.247417  0.0259172012-01-31 03:00:00  0.181083  0.241583  0.0258332012-01-31 06:00:00  0.188750  0.236750  0.0260002012-01-31 09:00:00  0.191000  0.231250  0.0255832012-01-31 12:00:00  0.183333  0.227167  0.0259172012-01-31 15:00:00  0.163417  0.221000  0.0237502012-01-31 18:00:00  0.157083  0.220667  0.0231672012-01-31 21:00:00  0.160083  0.214750  0.023333

# 抉择一段时间的数据print(df['2009-01-01':'2012-01-01'])

                      L06_347  LS06_347  LS06_348Time                                             2009-01-01 00:00:00  0.137417  0.097500  0.0168332009-01-01 03:00:00  0.131250  0.088833  0.0164172009-01-01 06:00:00  0.113500  0.091250  0.0167502009-01-01 09:00:00  0.135750  0.091500  0.0162502009-01-01 12:00:00  0.140917  0.096167  0.017000...                       ...       ...       ...2012-01-01 09:00:00  0.330750  0.293583  0.0297502012-01-01 12:00:00  0.295000  0.285167  0.0317502012-01-01 15:00:00  0.301417  0.287750  0.0314172012-01-01 18:00:00  0.322083  0.304167  0.0380832012-01-01 21:00:00  0.355417  0.346500  0.080917[8768 rows x 3 columns]

# 筛选所有1月份的数据print(df[df.index.month==1])

                      L06_347  LS06_347  LS06_348Time                                             2009-01-01 00:00:00  0.137417  0.097500  0.0168332009-01-01 03:00:00  0.131250  0.088833  0.0164172009-01-01 06:00:00  0.113500  0.091250  0.0167502009-01-01 09:00:00  0.135750  0.091500  0.0162502009-01-01 12:00:00  0.140917  0.096167  0.017000...                       ...       ...       ...2013-01-01 12:00:00  1.710000  1.710000  0.1295832013-01-01 15:00:00  1.420000  1.420000  0.0963332013-01-01 18:00:00  1.178583  1.178583  0.0830832013-01-01 21:00:00  0.898250  0.898250  0.0771672013-01-02 00:00:00  0.860000  0.860000  0.075000[1001 rows x 3 columns]

# 筛选所有8点到12点的数据print(df[(df.index.hour>8) & (df.index.hour<12)])# 或者通过between_time筛选8点到12点的数据print(df.between_time('8:00','12:00'))

                      L06_347  LS06_347  LS06_348Time                                             2009-01-01 09:00:00  0.135750  0.091500  0.0162502009-01-02 09:00:00  0.141917  0.097083  0.0164172009-01-03 09:00:00  0.124583  0.084417  0.0158332009-01-04 09:00:00  0.109000  0.105167  0.0180002009-01-05 09:00:00  0.161500  0.114583  0.021583...                       ...       ...       ...2012-12-28 09:00:00  0.961500  0.961500  0.0924172012-12-29 09:00:00  0.786833  0.786833  0.0770002012-12-30 09:00:00  0.916000  0.916000  0.1015832012-12-31 09:00:00  0.682750  0.682750  0.0665832013-01-01 09:00:00  2.055000  2.055000  0.175667[1462 rows x 3 columns]                      L06_347  LS06_347  LS06_348Time                                             2009-01-01 09:00:00  0.135750  0.091500  0.0162502009-01-01 12:00:00  0.140917  0.096167  0.0170002009-01-02 09:00:00  0.141917  0.097083  0.0164172009-01-02 12:00:00  0.147833  0.101917  0.0164172009-01-03 09:00:00  0.124583  0.084417  0.015833...                       ...       ...       ...2012-12-30 12:00:00  1.465000  1.465000  0.0868332012-12-31 09:00:00  0.682750  0.682750  0.0665832012-12-31 12:00:00  0.651250  0.651250  0.0638332013-01-01 09:00:00  2.055000  2.055000  0.1756672013-01-01 12:00:00  1.710000  1.710000  0.129583[2924 rows x 3 columns]

resample重采样

# 按天求平均值df=df.resample('D').mean()print(df)

             L06_347  LS06_347  LS06_348Time                                    2009-01-01  0.125010  0.092281  0.0166352009-01-02  0.124146  0.095781  0.0164062009-01-03  0.113562  0.085542  0.0160942009-01-04  0.140198  0.102708  0.0173232009-01-05  0.128812  0.104490  0.018167...              ...       ...       ...2012-12-29  0.807604  0.807604  0.0780312012-12-30  1.027240  1.027240  0.0880002012-12-31  0.748365  0.748365  0.0814172013-01-01  1.733042  1.733042  0.1421982013-01-02  0.860000  0.860000  0.075000[1463 rows x 3 columns]

# 求3天的平均值print(df.resample('3D').mean())

             L06_347  LS06_347  LS06_348Time                                    2009-01-01  0.120906  0.091201  0.0163782009-01-04  0.121594  0.091708  0.0166702009-01-07  0.097042  0.070740  0.0144792009-01-10  0.115941  0.086340  0.0145452009-01-13  0.346962  0.364549  0.034198...              ...       ...       ...2012-12-20  0.996337  0.996337  0.1144722012-12-23  2.769059  2.769059  0.2255422012-12-26  1.451583  1.451583  0.1401012012-12-29  0.861069  0.861069  0.0824832013-01-01  1.296521  1.296521  0.108599[488 rows x 3 columns]

# 求1个月的平均值print(df.resample('M').mean().head())

             L06_347  LS06_347  LS06_348Time                                    2009-01-31  0.517864  0.536660  0.0455972009-02-28  0.516847  0.529987  0.0472382009-03-31  0.372536  0.382359  0.0375082009-04-30  0.163182  0.129354  0.0213562009-05-31  0.178588  0.160616  0.020744

# 求某个工夫单位的平均值，最大值，最小值print(df.resample('M').min().head())print(df.resample('M').max().head())

             L06_347  LS06_347  LS06_348Time                                    2009-01-31  0.078156  0.058438  0.0135732009-02-28  0.182646  0.135667  0.0190732009-03-31  0.131385  0.098875  0.0169792009-04-30  0.078510  0.066375  0.0139172009-05-31  0.060771  0.047969  0.013656             L06_347  LS06_347  LS06_348Time                                    2009-01-31  5.933531  6.199927  0.4047082009-02-28  4.407604  4.724583  0.2317502009-03-31  1.337896  1.586833  0.1169692009-04-30  0.275698  0.247312  0.0373752009-05-31  2.184250  2.433073  0.168792

列排序

df = pd.DataFrame({'group':['a','a','a','b','b','b','c','c','c'],                    'data':[4,3,2,1,12,3,4,5,7]})print(df)

  group  data0     a     41     a     32     a     23     b     14     b    125     b     36     c     47     c     58     c     7

# 默认(ascending=True)从小到大排列. 从大到小(ascending=False)df.sort_values(by=['group','data'],ascending=[False,True],inplace=True)print(df)

  group  data6     c     47     c     58     c     73     b     15     b     34     b    122     a     21     a     30     a     4

df=pd.DataFrame({               'k1':['one']*3+['tow']*3,               'k2':[1,2,3,4,5,6]                })print(df)

    k1  k20  one   11  one   22  one   33  tow   44  tow   55  tow   6

# 排序print(df.sort_values(by='k2',ascending=False))

    k1  k25  tow   64  tow   53  tow   42  one   31  one   20  one   1

# 删除反复数据print(df.drop_duplicates())

    k1  k20  one   11  one   22  one   33  tow   44  tow   55  tow   6

# 按某列去重删除数据print(df.drop_duplicates(subset='k1'))

    k1  k20  one   13  tow   4

df = pd.DataFrame({'food':['A1','A2','B1','B2','B3','C1','C2'],'data':[1,2,3,4,5,6,7]})print(df)

  food  data0   A1     11   A2     22   B1     33   B2     44   B3     55   C1     66   C2     7

# 将A1,A2归类到A, B1,B2,B3归类B,C1,C2归类到Cdict1 = {    'A1':'A',    'A2':'A',    'B1':'B',    'B2':'B',    'B3':'B',    'C1':'C',    'C2':'C'}df['Upper']=df['food'].map(dict1)print(df)

  food  data Upper0   A1     1     A1   A2     2     A2   B1     3     B3   B2     4     B4   B3     5     B5   C1     6     C6   C2     7     C

import numpy as npdf=pd.DataFrame({'k1':np.random.randn(5),'k2':np.random.randn(5)})print(df)df2=df.assign(ration=df['k1']/df['k2'])print(df2)

         k1        k20  1.977668 -1.1362511  0.550649  0.0101312  0.723699  0.3045363 -0.247529  0.0303594 -0.351775  0.732785         k1        k2     ration0  1.977668 -1.136251  -1.7405201  0.550649  0.010131  54.3551312  0.723699  0.304536   2.3763943 -0.247529  0.030359  -8.1533894 -0.351775  0.732785  -0.480052

# 删除ration列df2.drop('ration',axis='columns',inplace=True)print(df2)

         k1        k20  1.977668 -1.1362511  0.550649  0.0101312  0.723699  0.3045363 -0.247529  0.0303594 -0.351775  0.732785

数据替换

se=pd.Series([1,2,3,4,5,6,7,8])print(se)

0    11    22    33    44    55    66    77    8dtype: int64

se.replace(6,np.nan,inplace=True)print(se)

0    1.01    2.02    3.03    4.04    5.05    NaN6    7.07    8.0dtype: float64

Pandas.cut计算每个值在给定的哪个范畴

ages = [15,18,20,21,22,34,41,52,63,79]bins = [10,40,80]bins_res = pd.cut(ages,bins)print(bins_res)

[(10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (40, 80], (40, 80], (40, 80], (40, 80]]Categories (2, interval[int64]): [(10, 40] < (40, 80]]

# 统计各个范畴的数量print(pd.value_counts(bins_res))

(10, 40]    6(40, 80]    4dtype: int64

np.nan

df=pd.DataFrame([range(3),[2,np.nan,5],[np.nan,3,np.nan],range(3)])print(df)

     0    1    20  0.0  1.0  2.01  2.0  NaN  5.02  NaN  3.0  NaN3  0.0  1.0  2.0

print(df.isnull())

       0      1      20  False  False  False1  False   True  False2   True  False   True3  False  False  False

# 将NaN填充成10print(df.fillna(10))

      0     1     20   0.0   1.0   2.01   2.0  10.0   5.02  10.0   3.0  10.03   0.0   1.0   2.0

# 查看某一行是否存在NaNprint(df[df.isnull().any(axis=1)])

     0    1    21  2.0  NaN  5.02  NaN  3.0  NaN