共计 7593 个字符,预计需要花费 19 分钟才能阅读完成。
groupby 分组
import pandas as pd
import numpy as np
df=pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
print(df)
grouped=df.groupby('A')
print('-'*30)
print(grouped.count())
print('-'*30)
grouped=df.groupby(['A','B'])
print(grouped.count())
print('-'*30)
# 通过函数分组
def get_letter_type(letter):
if letter.lower() in 'aeiou':
return 'a'
else:
return 'b'
grouped=df.groupby(get_letter_type,axis=1)
print(grouped.count())
A B C D
0 foo one 1.429387 0.643569
1 bar one -0.858448 -0.213034
2 foo two 0.375644 0.214584
3 bar three 0.042284 -0.330481
4 foo two -1.421967 0.768176
5 bar two 1.293483 -0.399003
6 foo one -1.101385 -0.236341
7 foo three -0.852603 -1.718694
------------------------------
B C D
A
bar 3 3 3
foo 5 5 5
------------------------------
C D
A B
bar one 1 1
three 1 1
two 1 1
foo one 2 2
three 1 1
two 2 2
------------------------------
a b
0 1 3
1 1 3
2 1 3
3 1 3
4 1 3
5 1 3
6 1 3
7 1 3
se=pd.Series([1,2,3,4,5],[6,9,8,9,8])
print(se)
se.groupby(level=0)
6 1
9 2
8 3
9 4
8 5
dtype: int64
<pandas.core.groupby.generic.SeriesGroupBy object at 0x111b00040>
# 分组求和
grouped=se.groupby(level=0).sum()
print(grouped)
6 1
8 8
9 6
dtype: int64
df2=pd.DataFrame({'X':['A','B','A','B'],'Y':[1,2,3,4]})
print(df2)
X Y
0 A 1
1 B 2
2 A 3
3 B 4
# 按 X 分组, 并查问 A 列的数据
grp=df2.groupby('X').get_group('A')
print(grp)
X Y
0 A 1
2 A 3
Pandas 多级索引
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
index=pd.MultiIndex.from_arrays(arrays,names=['first','second'])
print(index)
MultiIndex([('bar', 'one'),
('bar', 'two'),
('baz', 'one'),
('baz', 'two'),
('foo', 'one'),
('foo', 'two'),
('qux', 'one'),
('qux', 'two')],
names=['first', 'second'])
s=pd.Series(np.random.randn(8),index=index)
print(s)
first second
bar one 0.120979
two -0.440384
baz one 0.515106
two -0.019882
foo one 1.149595
two -0.369984
qux one -0.930438
two 0.146044
dtype: float64
# 分组求和
grouped=s.groupby(level='first')
print(grouped.sum())
first
bar -0.319405
baz 0.495224
foo 0.779611
qux -0.784394
dtype: float64
grouped=df.groupby(['A','B'])
print(grouped.size())
A B
bar one 1
three 1
two 1
foo one 2
three 1
two 2
dtype: int64
print(df)
A B C D
0 foo one 1.429387 0.643569
1 bar one -0.858448 -0.213034
2 foo two 0.375644 0.214584
3 bar three 0.042284 -0.330481
4 foo two -1.421967 0.768176
5 bar two 1.293483 -0.399003
6 foo one -1.101385 -0.236341
7 foo three -0.852603 -1.718694
print(grouped.describe().head())
C \
count mean std min 25% 50% 75%
A B
bar one 1.0 -0.858448 NaN -0.858448 -0.858448 -0.858448 -0.858448
three 1.0 0.042284 NaN 0.042284 0.042284 0.042284 0.042284
two 1.0 1.293483 NaN 1.293483 1.293483 1.293483 1.293483
foo one 2.0 0.164001 1.789526 -1.101385 -0.468692 0.164001 0.796694
three 1.0 -0.852603 NaN -0.852603 -0.852603 -0.852603 -0.852603
D \
max count mean std min 25% 50%
A B
bar one -0.858448 1.0 -0.213034 NaN -0.213034 -0.213034 -0.213034
three 0.042284 1.0 -0.330481 NaN -0.330481 -0.330481 -0.330481
two 1.293483 1.0 -0.399003 NaN -0.399003 -0.399003 -0.399003
foo one 1.429387 2.0 0.203614 0.622191 -0.236341 -0.016364 0.203614
three -0.852603 1.0 -1.718694 NaN -1.718694 -1.718694 -1.718694
75% max
A B
bar one -0.213034 -0.213034
three -0.330481 -0.330481
two -0.399003 -0.399003
foo one 0.423592 0.643569
three -1.718694 -1.718694
grouped=df.groupby('A')
grouped['C'].agg([np.sum,np.mean,np.std])
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {vertical-align: middle;}
.dataframe tbody tr th {vertical-align: top;}
.dataframe thead th {text-align: right;}
</style>
sum | mean | std | |
---|---|---|---|
A | |||
bar | 0.477319 | 0.159106 | 1.080712 |
foo | -1.570925 | -0.314185 | 1.188767 |
</div>
字符串操作
import pandas as pd
import numpy as np
s=pd.Series(['A','b','c','D',np.nan])
print(s)
# 转小写
print(s.str.lower())
# 转大写
print(s.str.upper())
# 每个字符的长度
print(s.str.len())
0 A
1 b
2 c
3 D
4 NaN
dtype: object
0 a
1 b
2 c
3 d
4 NaN
dtype: object
0 A
1 B
2 C
3 D
4 NaN
dtype: object
0 1.0
1 1.0
2 1.0
3 1.0
4 NaN
dtype: float64
index=pd.Index(['Index','ru','men'])
# 去掉空格
print(index.str.strip())
# 去掉右边的空格
print(index.str.lstrip())
# 去掉左边的空格
print(index.str.rstrip())
Index(['Index', 'ru', 'men'], dtype='object')
Index(['Index', 'ru', 'men'], dtype='object')
Index(['Index', 'ru', 'men'], dtype='object')
df=pd.DataFrame(np.random.randn(3,2),columns=['A a','B b'],index=range(3))
print(df)
A a B b
0 3.005273 0.486696
1 1.093889 1.054230
2 -2.846352 0.302465
# 列替换
print(df.columns.str.replace('','_'))
Index(['A_a', 'B_b'], dtype='object')
s=pd.Series(['a_b_C','c_d_e','f_g_h'])
print(s)
0 a_b_C
1 c_d_e
2 f_g_h
dtype: object
print(s.str.split('_'))
0 [a, b, C]
1 [c, d, e]
2 [f, g, h]
dtype: object
print(s.str.split('_',expand=True,n=1))
0 1
0 a b_C
1 c d_e
2 f g_h
s = pd.Series(['A','rumen','ru','rumen','xiao','zhan'])
print(s.str.contains('ru'))
0 False
1 True
2 True
3 True
4 False
5 False
dtype: bool
s=pd.Series(['a','a|b','a|c'])
print(s)
0 a
1 a|b
2 a|c
dtype: object
print(s.str.get_dummies(sep='|'))
a b c
0 1 0 0
1 1 1 0
2 1 0 1
索引
s=pd.Series(np.arange(5),np.arange(5)[::-1],dtype='int64')
s
4 0
3 1
2 2
1 3
0 4
dtype: int64
print(s[s>2])
1 3
0 4
dtype: int64
# isin 查问索引在某个范畴
print(s.isin([1,3,4]))
4 False
3 True
2 False
1 True
0 True
dtype: bool
# 依据索引查问数据
print(s[s.isin([1,3,4])])
3 1
1 3
0 4
dtype: int64
# 结构一个联结索引的数据
s=pd.Series(np.arange(6),index=pd.MultiIndex.from_product([[1,2],['a','b','c']]))
print(s)
1 a 0
b 1
c 2
2 a 3
b 4
c 5
dtype: int64
print(s.iloc[s.index.isin([(1,'b'),(2,'c')])])
1 b 1
2 c 5
dtype: int64
# 结构一个工夫序列
dates=pd.date_range('20200920',periods=8)
print(dates)
DatetimeIndex(['2020-09-20', '2020-09-21', '2020-09-22', '2020-09-23',
'2020-09-24', '2020-09-25', '2020-09-26', '2020-09-27'],
dtype='datetime64[ns]', freq='D')
df=pd.DataFrame(np.random.randn(8,4),index=dates,columns=['A','B','C','D'])
print(df)
A B C D
2020-09-20 -1.218522 2.067088 0.015009 0.158780
2020-09-21 -0.546837 -0.601178 -0.894882 0.172037
2020-09-22 0.189848 -0.910520 0.196186 -0.073495
2020-09-23 -0.566892 0.899193 -0.450925 0.633253
2020-09-24 0.038838 1.577004 0.580927 0.609050
2020-09-25 1.562094 0.020813 -0.618859 -0.515212
2020-09-26 -1.333947 0.275765 0.139325 1.124207
2020-09-27 -1.271748 1.082302 1.036805 -1.041206
# 查问 A 列数据
print(df['A'])
2020-09-20 -1.218522
2020-09-21 -0.546837
2020-09-22 0.189848
2020-09-23 -0.566892
2020-09-24 0.038838
2020-09-25 1.562094
2020-09-26 -1.333947
2020-09-27 -1.271748
Freq: D, Name: A, dtype: float64
# 查问小于 0 的数字, 大于 0 的值默认被置为 NaN
df.where(df<0)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {vertical-align: middle;}
.dataframe tbody tr th {vertical-align: top;}
.dataframe thead th {text-align: right;}
</style>
A | B | C | D | |
---|---|---|---|---|
2020-09-20 | -1.218522 | NaN | NaN | NaN |
2020-09-21 | -0.546837 | -0.601178 | -0.894882 | NaN |
2020-09-22 | NaN | -0.910520 | NaN | -0.073495 |
2020-09-23 | -0.566892 | NaN | -0.450925 | NaN |
2020-09-24 | NaN | NaN | NaN | NaN |
2020-09-25 | NaN | NaN | -0.618859 | -0.515212 |
2020-09-26 | -1.333947 | NaN | NaN | NaN |
2020-09-27 | -1.271748 | NaN | NaN | -1.041206 |
</div>
# 查问小于 0 的数字, 大于 0 的值变成正数
print(df.where(df<0,-df))
A B C D
2020-09-20 -1.218522 -2.067088 -0.015009 -0.158780
2020-09-21 -0.546837 -0.601178 -0.894882 -0.172037
2020-09-22 -0.189848 -0.910520 -0.196186 -0.073495
2020-09-23 -0.566892 -0.899193 -0.450925 -0.633253
2020-09-24 -0.038838 -1.577004 -0.580927 -0.609050
2020-09-25 -1.562094 -0.020813 -0.618859 -0.515212
2020-09-26 -1.333947 -0.275765 -0.139325 -1.124207
2020-09-27 -1.271748 -1.082302 -1.036805 -1.041206
# 查问小于 0 的数据, 大于 0 的置为 1000
print(df.where(df<0,1000))
A B C D
2020-09-20 -1.218522 1000.000000 1000.000000 1000.000000
2020-09-21 -0.546837 -0.601178 -0.894882 1000.000000
2020-09-22 1000.000000 -0.910520 1000.000000 -0.073495
2020-09-23 -0.566892 1000.000000 -0.450925 1000.000000
2020-09-24 1000.000000 1000.000000 1000.000000 1000.000000
2020-09-25 1000.000000 1000.000000 -0.618859 -0.515212
2020-09-26 -1.333947 1000.000000 1000.000000 1000.000000
2020-09-27 -1.271748 1000.000000 1000.000000 -1.041206
# 结构一个 10 行 3 列的数据
df=pd.DataFrame(np.random.randn(10,3),columns=list('abc'))
print(df)
a b c
0 1.761415 0.528009 -0.347271
1 -0.682149 0.353312 0.337229
2 1.080733 -0.272290 1.020335
3 -0.979681 -1.753745 0.836387
4 0.243748 2.085531 -0.993318
5 -1.041006 1.518130 -0.087383
6 -1.400354 -0.095196 3.043639
7 -0.835144 0.926415 -1.217102
8 0.326098 1.079906 0.156884
9 1.836618 -1.288516 -2.492620
# 查问 a >b 的数据
print(df.query('a>b'))
a b c
0 1.761415 0.528009 -0.347271
2 1.080733 -0.272290 1.020335
3 -0.979681 -1.753745 0.836387
9 1.836618 -1.288516 -2.492620
# 查问 c >b>a 的数据
print(df.query('(c<b) & (b<a)'))
a b c
0 1.761415 0.528009 -0.347271
9 1.836618 -1.288516 -2.492620
正文完