groupby分组
import pandas as pdimport numpy as npdf=pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C' : np.random.randn(8), 'D' : np.random.randn(8)})print(df)grouped=df.groupby('A')print('-'*30)print(grouped.count())print('-'*30)grouped=df.groupby(['A','B'])print(grouped.count())print('-'*30)# 通过函数分组def get_letter_type(letter): if letter.lower() in 'aeiou': return 'a' else: return 'b'grouped=df.groupby(get_letter_type,axis=1)print(grouped.count())
A B C D0 foo one 1.429387 0.6435691 bar one -0.858448 -0.2130342 foo two 0.375644 0.2145843 bar three 0.042284 -0.3304814 foo two -1.421967 0.7681765 bar two 1.293483 -0.3990036 foo one -1.101385 -0.2363417 foo three -0.852603 -1.718694------------------------------ B C DA bar 3 3 3foo 5 5 5------------------------------ C DA B bar one 1 1 three 1 1 two 1 1foo one 2 2 three 1 1 two 2 2------------------------------ a b0 1 31 1 32 1 33 1 34 1 35 1 36 1 37 1 3
se=pd.Series([1,2,3,4,5],[6,9,8,9,8])print(se)se.groupby(level=0)
6 19 28 39 48 5dtype: int64<pandas.core.groupby.generic.SeriesGroupBy object at 0x111b00040>
# 分组求和grouped=se.groupby(level=0).sum()print(grouped)
6 18 89 6dtype: int64
df2=pd.DataFrame({'X':['A','B','A','B'],'Y':[1,2,3,4]})print(df2)
X Y0 A 11 B 22 A 33 B 4
# 按X分组,并查问A列的数据grp=df2.groupby('X').get_group('A')print(grp)
X Y0 A 12 A 3
Pandas 多级索引
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]index=pd.MultiIndex.from_arrays(arrays,names=['first','second'])print(index)
MultiIndex([('bar', 'one'), ('bar', 'two'), ('baz', 'one'), ('baz', 'two'), ('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')], names=['first', 'second'])
s=pd.Series(np.random.randn(8),index=index)print(s)
first secondbar one 0.120979 two -0.440384baz one 0.515106 two -0.019882foo one 1.149595 two -0.369984qux one -0.930438 two 0.146044dtype: float64
# 分组求和grouped=s.groupby(level='first')print(grouped.sum())
firstbar -0.319405baz 0.495224foo 0.779611qux -0.784394dtype: float64
grouped=df.groupby(['A','B'])print(grouped.size())
A B bar one 1 three 1 two 1foo one 2 three 1 two 2dtype: int64
print(df)
A B C D0 foo one 1.429387 0.6435691 bar one -0.858448 -0.2130342 foo two 0.375644 0.2145843 bar three 0.042284 -0.3304814 foo two -1.421967 0.7681765 bar two 1.293483 -0.3990036 foo one -1.101385 -0.2363417 foo three -0.852603 -1.718694
print(grouped.describe().head())
C \ count mean std min 25% 50% 75% A B bar one 1.0 -0.858448 NaN -0.858448 -0.858448 -0.858448 -0.858448 three 1.0 0.042284 NaN 0.042284 0.042284 0.042284 0.042284 two 1.0 1.293483 NaN 1.293483 1.293483 1.293483 1.293483 foo one 2.0 0.164001 1.789526 -1.101385 -0.468692 0.164001 0.796694 three 1.0 -0.852603 NaN -0.852603 -0.852603 -0.852603 -0.852603 D \ max count mean std min 25% 50% A B bar one -0.858448 1.0 -0.213034 NaN -0.213034 -0.213034 -0.213034 three 0.042284 1.0 -0.330481 NaN -0.330481 -0.330481 -0.330481 two 1.293483 1.0 -0.399003 NaN -0.399003 -0.399003 -0.399003 foo one 1.429387 2.0 0.203614 0.622191 -0.236341 -0.016364 0.203614 three -0.852603 1.0 -1.718694 NaN -1.718694 -1.718694 -1.718694
75% max A B bar one -0.213034 -0.213034 three -0.330481 -0.330481 two -0.399003 -0.399003 foo one 0.423592 0.643569 three -1.718694 -1.718694
grouped=df.groupby('A')grouped['C'].agg([np.sum,np.mean,np.std])
<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}
</style>
sum | mean | std | |
---|---|---|---|
A | |||
bar | 0.477319 | 0.159106 | 1.080712 |
foo | -1.570925 | -0.314185 | 1.188767 |
</div>
字符串操作
import pandas as pdimport numpy as nps=pd.Series(['A','b','c','D',np.nan])print(s)# 转小写print(s.str.lower())# 转大写print(s.str.upper())# 每个字符的长度print(s.str.len())
0 A1 b2 c3 D4 NaNdtype: object0 a1 b2 c3 d4 NaNdtype: object0 A1 B2 C3 D4 NaNdtype: object0 1.01 1.02 1.03 1.04 NaNdtype: float64
index=pd.Index([' Index','ru ',' men'])# 去掉空格print(index.str.strip())# 去掉右边的空格print(index.str.lstrip())# 去掉左边的空格print(index.str.rstrip())
Index(['Index', 'ru', 'men'], dtype='object')Index(['Index', 'ru ', 'men'], dtype='object')Index([' Index', 'ru', ' men'], dtype='object')
df=pd.DataFrame(np.random.randn(3,2),columns=['A a','B b'],index=range(3))print(df)
A a B b0 3.005273 0.4866961 1.093889 1.0542302 -2.846352 0.302465
# 列替换print(df.columns.str.replace(' ','_'))
Index(['A_a', 'B_b'], dtype='object')
s=pd.Series(['a_b_C','c_d_e','f_g_h'])print(s)
0 a_b_C1 c_d_e2 f_g_hdtype: object
print(s.str.split('_'))
0 [a, b, C]1 [c, d, e]2 [f, g, h]dtype: object
print(s.str.split('_',expand=True,n=1))
0 10 a b_C1 c d_e2 f g_h
s = pd.Series(['A','rumen','ru','rumen','xiao','zhan'])
print(s.str.contains('ru'))
0 False1 True2 True3 True4 False5 Falsedtype: bool
s=pd.Series(['a','a|b','a|c'])print(s)
0 a1 a|b2 a|cdtype: object
print(s.str.get_dummies(sep='|'))
a b c0 1 0 01 1 1 02 1 0 1
索引
s=pd.Series(np.arange(5),np.arange(5)[::-1],dtype='int64')
s
4 03 12 21 30 4dtype: int64
print(s[s>2])
1 30 4dtype: int64
# isin查问索引在某个范畴print(s.isin([1,3,4]))
4 False3 True2 False1 True0 Truedtype: bool
# 依据索引查问数据print(s[s.isin([1,3,4])])
3 11 30 4dtype: int64
# 结构一个联结索引的数据s=pd.Series(np.arange(6),index=pd.MultiIndex.from_product([[1,2],['a','b','c']]))
print(s)
1 a 0 b 1 c 22 a 3 b 4 c 5dtype: int64
print(s.iloc[s.index.isin([(1,'b'),(2,'c')])])
1 b 12 c 5dtype: int64
# 结构一个工夫序列dates=pd.date_range('20200920',periods=8)print(dates)
DatetimeIndex(['2020-09-20', '2020-09-21', '2020-09-22', '2020-09-23', '2020-09-24', '2020-09-25', '2020-09-26', '2020-09-27'], dtype='datetime64[ns]', freq='D')
df=pd.DataFrame(np.random.randn(8,4),index=dates,columns=['A','B','C','D'])print(df)
A B C D2020-09-20 -1.218522 2.067088 0.015009 0.1587802020-09-21 -0.546837 -0.601178 -0.894882 0.1720372020-09-22 0.189848 -0.910520 0.196186 -0.0734952020-09-23 -0.566892 0.899193 -0.450925 0.6332532020-09-24 0.038838 1.577004 0.580927 0.6090502020-09-25 1.562094 0.020813 -0.618859 -0.5152122020-09-26 -1.333947 0.275765 0.139325 1.1242072020-09-27 -1.271748 1.082302 1.036805 -1.041206
# 查问A列数据print(df['A'])
2020-09-20 -1.2185222020-09-21 -0.5468372020-09-22 0.1898482020-09-23 -0.5668922020-09-24 0.0388382020-09-25 1.5620942020-09-26 -1.3339472020-09-27 -1.271748Freq: D, Name: A, dtype: float64
# 查问小于0的数字,大于0的值默认被置为NaNdf.where(df<0)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}
</style>
A | B | C | D | |
---|---|---|---|---|
2020-09-20 | -1.218522 | NaN | NaN | NaN |
2020-09-21 | -0.546837 | -0.601178 | -0.894882 | NaN |
2020-09-22 | NaN | -0.910520 | NaN | -0.073495 |
2020-09-23 | -0.566892 | NaN | -0.450925 | NaN |
2020-09-24 | NaN | NaN | NaN | NaN |
2020-09-25 | NaN | NaN | -0.618859 | -0.515212 |
2020-09-26 | -1.333947 | NaN | NaN | NaN |
2020-09-27 | -1.271748 | NaN | NaN | -1.041206 |
</div>
# 查问小于0的数字,大于0的值变成正数print(df.where(df<0,-df))
A B C D2020-09-20 -1.218522 -2.067088 -0.015009 -0.1587802020-09-21 -0.546837 -0.601178 -0.894882 -0.1720372020-09-22 -0.189848 -0.910520 -0.196186 -0.0734952020-09-23 -0.566892 -0.899193 -0.450925 -0.6332532020-09-24 -0.038838 -1.577004 -0.580927 -0.6090502020-09-25 -1.562094 -0.020813 -0.618859 -0.5152122020-09-26 -1.333947 -0.275765 -0.139325 -1.1242072020-09-27 -1.271748 -1.082302 -1.036805 -1.041206
# 查问小于0的数据,大于0的置为1000print(df.where(df<0,1000))
A B C D2020-09-20 -1.218522 1000.000000 1000.000000 1000.0000002020-09-21 -0.546837 -0.601178 -0.894882 1000.0000002020-09-22 1000.000000 -0.910520 1000.000000 -0.0734952020-09-23 -0.566892 1000.000000 -0.450925 1000.0000002020-09-24 1000.000000 1000.000000 1000.000000 1000.0000002020-09-25 1000.000000 1000.000000 -0.618859 -0.5152122020-09-26 -1.333947 1000.000000 1000.000000 1000.0000002020-09-27 -1.271748 1000.000000 1000.000000 -1.041206
# 结构一个10行3列的数据df=pd.DataFrame(np.random.randn(10,3),columns=list('abc'))print(df)
a b c0 1.761415 0.528009 -0.3472711 -0.682149 0.353312 0.3372292 1.080733 -0.272290 1.0203353 -0.979681 -1.753745 0.8363874 0.243748 2.085531 -0.9933185 -1.041006 1.518130 -0.0873836 -1.400354 -0.095196 3.0436397 -0.835144 0.926415 -1.2171028 0.326098 1.079906 0.1568849 1.836618 -1.288516 -2.492620
# 查问a>b的数据print(df.query('a>b'))
a b c0 1.761415 0.528009 -0.3472712 1.080733 -0.272290 1.0203353 -0.979681 -1.753745 0.8363879 1.836618 -1.288516 -2.492620
# 查问c>b>a的数据print(df.query('(c<b) & (b<a)'))
a b c0 1.761415 0.528009 -0.3472719 1.836618 -1.288516 -2.492620