共计 7593 个字符,预计需要花费 19 分钟才能阅读完成。
groupby 分组
import pandas as pd | |
import numpy as np | |
df=pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', | |
'foo', 'bar', 'foo', 'foo'], | |
'B' : ['one', 'one', 'two', 'three', | |
'two', 'two', 'one', 'three'], | |
'C' : np.random.randn(8), | |
'D' : np.random.randn(8)}) | |
print(df) | |
grouped=df.groupby('A') | |
print('-'*30) | |
print(grouped.count()) | |
print('-'*30) | |
grouped=df.groupby(['A','B']) | |
print(grouped.count()) | |
print('-'*30) | |
# 通过函数分组 | |
def get_letter_type(letter): | |
if letter.lower() in 'aeiou': | |
return 'a' | |
else: | |
return 'b' | |
grouped=df.groupby(get_letter_type,axis=1) | |
print(grouped.count()) | |
A B C D | |
0 foo one 1.429387 0.643569 | |
1 bar one -0.858448 -0.213034 | |
2 foo two 0.375644 0.214584 | |
3 bar three 0.042284 -0.330481 | |
4 foo two -1.421967 0.768176 | |
5 bar two 1.293483 -0.399003 | |
6 foo one -1.101385 -0.236341 | |
7 foo three -0.852603 -1.718694 | |
------------------------------ | |
B C D | |
A | |
bar 3 3 3 | |
foo 5 5 5 | |
------------------------------ | |
C D | |
A B | |
bar one 1 1 | |
three 1 1 | |
two 1 1 | |
foo one 2 2 | |
three 1 1 | |
two 2 2 | |
------------------------------ | |
a b | |
0 1 3 | |
1 1 3 | |
2 1 3 | |
3 1 3 | |
4 1 3 | |
5 1 3 | |
6 1 3 | |
7 1 3 | |
se=pd.Series([1,2,3,4,5],[6,9,8,9,8]) | |
print(se) | |
se.groupby(level=0) |
6 1 | |
9 2 | |
8 3 | |
9 4 | |
8 5 | |
dtype: int64 | |
<pandas.core.groupby.generic.SeriesGroupBy object at 0x111b00040> | |
# 分组求和 | |
grouped=se.groupby(level=0).sum() | |
print(grouped) |
6 1 | |
8 8 | |
9 6 | |
dtype: int64 | |
df2=pd.DataFrame({'X':['A','B','A','B'],'Y':[1,2,3,4]}) | |
print(df2) |
X Y | |
0 A 1 | |
1 B 2 | |
2 A 3 | |
3 B 4 | |
# 按 X 分组, 并查问 A 列的数据 | |
grp=df2.groupby('X').get_group('A') | |
print(grp) |
X Y | |
0 A 1 | |
2 A 3 | |
Pandas 多级索引
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], | |
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] | |
index=pd.MultiIndex.from_arrays(arrays,names=['first','second']) | |
print(index) |
MultiIndex([('bar', 'one'), | |
('bar', 'two'), | |
('baz', 'one'), | |
('baz', 'two'), | |
('foo', 'one'), | |
('foo', 'two'), | |
('qux', 'one'), | |
('qux', 'two')], | |
names=['first', 'second']) | |
s=pd.Series(np.random.randn(8),index=index) | |
print(s) |
first second | |
bar one 0.120979 | |
two -0.440384 | |
baz one 0.515106 | |
two -0.019882 | |
foo one 1.149595 | |
two -0.369984 | |
qux one -0.930438 | |
two 0.146044 | |
dtype: float64 | |
# 分组求和 | |
grouped=s.groupby(level='first') | |
print(grouped.sum()) |
first | |
bar -0.319405 | |
baz 0.495224 | |
foo 0.779611 | |
qux -0.784394 | |
dtype: float64 | |
grouped=df.groupby(['A','B']) | |
print(grouped.size()) |
A B | |
bar one 1 | |
three 1 | |
two 1 | |
foo one 2 | |
three 1 | |
two 2 | |
dtype: int64 | |
print(df)
A B C D | |
0 foo one 1.429387 0.643569 | |
1 bar one -0.858448 -0.213034 | |
2 foo two 0.375644 0.214584 | |
3 bar three 0.042284 -0.330481 | |
4 foo two -1.421967 0.768176 | |
5 bar two 1.293483 -0.399003 | |
6 foo one -1.101385 -0.236341 | |
7 foo three -0.852603 -1.718694 | |
print(grouped.describe().head())
C \ | |
count mean std min 25% 50% 75% | |
A B | |
bar one 1.0 -0.858448 NaN -0.858448 -0.858448 -0.858448 -0.858448 | |
three 1.0 0.042284 NaN 0.042284 0.042284 0.042284 0.042284 | |
two 1.0 1.293483 NaN 1.293483 1.293483 1.293483 1.293483 | |
foo one 2.0 0.164001 1.789526 -1.101385 -0.468692 0.164001 0.796694 | |
three 1.0 -0.852603 NaN -0.852603 -0.852603 -0.852603 -0.852603 | |
D \ | |
max count mean std min 25% 50% | |
A B | |
bar one -0.858448 1.0 -0.213034 NaN -0.213034 -0.213034 -0.213034 | |
three 0.042284 1.0 -0.330481 NaN -0.330481 -0.330481 -0.330481 | |
two 1.293483 1.0 -0.399003 NaN -0.399003 -0.399003 -0.399003 | |
foo one 1.429387 2.0 0.203614 0.622191 -0.236341 -0.016364 0.203614 | |
three -0.852603 1.0 -1.718694 NaN -1.718694 -1.718694 -1.718694 | |
75% max | |
A B | |
bar one -0.213034 -0.213034 | |
three -0.330481 -0.330481 | |
two -0.399003 -0.399003 | |
foo one 0.423592 0.643569 | |
three -1.718694 -1.718694 | |
grouped=df.groupby('A') | |
grouped['C'].agg([np.sum,np.mean,np.std]) |
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {vertical-align: middle;} | |
.dataframe tbody tr th {vertical-align: top;} | |
.dataframe thead th {text-align: right;} |
</style>
sum | mean | std | |
---|---|---|---|
A | |||
bar | 0.477319 | 0.159106 | 1.080712 |
foo | -1.570925 | -0.314185 | 1.188767 |
</div>
字符串操作
import pandas as pd | |
import numpy as np | |
s=pd.Series(['A','b','c','D',np.nan]) | |
print(s) | |
# 转小写 | |
print(s.str.lower()) | |
# 转大写 | |
print(s.str.upper()) | |
# 每个字符的长度 | |
print(s.str.len()) |
0 A | |
1 b | |
2 c | |
3 D | |
4 NaN | |
dtype: object | |
0 a | |
1 b | |
2 c | |
3 d | |
4 NaN | |
dtype: object | |
0 A | |
1 B | |
2 C | |
3 D | |
4 NaN | |
dtype: object | |
0 1.0 | |
1 1.0 | |
2 1.0 | |
3 1.0 | |
4 NaN | |
dtype: float64 | |
index=pd.Index(['Index','ru','men']) | |
# 去掉空格 | |
print(index.str.strip()) | |
# 去掉右边的空格 | |
print(index.str.lstrip()) | |
# 去掉左边的空格 | |
print(index.str.rstrip()) |
Index(['Index', 'ru', 'men'], dtype='object') | |
Index(['Index', 'ru', 'men'], dtype='object') | |
Index(['Index', 'ru', 'men'], dtype='object') | |
df=pd.DataFrame(np.random.randn(3,2),columns=['A a','B b'],index=range(3)) | |
print(df) |
A a B b | |
0 3.005273 0.486696 | |
1 1.093889 1.054230 | |
2 -2.846352 0.302465 | |
# 列替换 | |
print(df.columns.str.replace('','_')) |
Index(['A_a', 'B_b'], dtype='object') | |
s=pd.Series(['a_b_C','c_d_e','f_g_h']) | |
print(s) |
0 a_b_C | |
1 c_d_e | |
2 f_g_h | |
dtype: object | |
print(s.str.split('_'))
0 [a, b, C] | |
1 [c, d, e] | |
2 [f, g, h] | |
dtype: object | |
print(s.str.split('_',expand=True,n=1))
0 1 | |
0 a b_C | |
1 c d_e | |
2 f g_h | |
s = pd.Series(['A','rumen','ru','rumen','xiao','zhan'])
print(s.str.contains('ru'))
0 False | |
1 True | |
2 True | |
3 True | |
4 False | |
5 False | |
dtype: bool | |
s=pd.Series(['a','a|b','a|c']) | |
print(s) |
0 a | |
1 a|b | |
2 a|c | |
dtype: object | |
print(s.str.get_dummies(sep='|'))
a b c | |
0 1 0 0 | |
1 1 1 0 | |
2 1 0 1 | |
索引
s=pd.Series(np.arange(5),np.arange(5)[::-1],dtype='int64')
s
4 0 | |
3 1 | |
2 2 | |
1 3 | |
0 4 | |
dtype: int64 | |
print(s[s>2])
1 3 | |
0 4 | |
dtype: int64 | |
# isin 查问索引在某个范畴 | |
print(s.isin([1,3,4])) |
4 False | |
3 True | |
2 False | |
1 True | |
0 True | |
dtype: bool | |
# 依据索引查问数据 | |
print(s[s.isin([1,3,4])]) |
3 1 | |
1 3 | |
0 4 | |
dtype: int64 | |
# 结构一个联结索引的数据 | |
s=pd.Series(np.arange(6),index=pd.MultiIndex.from_product([[1,2],['a','b','c']])) |
print(s)
1 a 0 | |
b 1 | |
c 2 | |
2 a 3 | |
b 4 | |
c 5 | |
dtype: int64 | |
print(s.iloc[s.index.isin([(1,'b'),(2,'c')])])
1 b 1 | |
2 c 5 | |
dtype: int64 | |
# 结构一个工夫序列 | |
dates=pd.date_range('20200920',periods=8) | |
print(dates) |
DatetimeIndex(['2020-09-20', '2020-09-21', '2020-09-22', '2020-09-23', | |
'2020-09-24', '2020-09-25', '2020-09-26', '2020-09-27'], | |
dtype='datetime64[ns]', freq='D') | |
df=pd.DataFrame(np.random.randn(8,4),index=dates,columns=['A','B','C','D']) | |
print(df) |
A B C D | |
2020-09-20 -1.218522 2.067088 0.015009 0.158780 | |
2020-09-21 -0.546837 -0.601178 -0.894882 0.172037 | |
2020-09-22 0.189848 -0.910520 0.196186 -0.073495 | |
2020-09-23 -0.566892 0.899193 -0.450925 0.633253 | |
2020-09-24 0.038838 1.577004 0.580927 0.609050 | |
2020-09-25 1.562094 0.020813 -0.618859 -0.515212 | |
2020-09-26 -1.333947 0.275765 0.139325 1.124207 | |
2020-09-27 -1.271748 1.082302 1.036805 -1.041206 | |
# 查问 A 列数据 | |
print(df['A']) |
2020-09-20 -1.218522 | |
2020-09-21 -0.546837 | |
2020-09-22 0.189848 | |
2020-09-23 -0.566892 | |
2020-09-24 0.038838 | |
2020-09-25 1.562094 | |
2020-09-26 -1.333947 | |
2020-09-27 -1.271748 | |
Freq: D, Name: A, dtype: float64 | |
# 查问小于 0 的数字, 大于 0 的值默认被置为 NaN | |
df.where(df<0) |
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {vertical-align: middle;} | |
.dataframe tbody tr th {vertical-align: top;} | |
.dataframe thead th {text-align: right;} |
</style>
A | B | C | D | |
---|---|---|---|---|
2020-09-20 | -1.218522 | NaN | NaN | NaN |
2020-09-21 | -0.546837 | -0.601178 | -0.894882 | NaN |
2020-09-22 | NaN | -0.910520 | NaN | -0.073495 |
2020-09-23 | -0.566892 | NaN | -0.450925 | NaN |
2020-09-24 | NaN | NaN | NaN | NaN |
2020-09-25 | NaN | NaN | -0.618859 | -0.515212 |
2020-09-26 | -1.333947 | NaN | NaN | NaN |
2020-09-27 | -1.271748 | NaN | NaN | -1.041206 |
</div>
# 查问小于 0 的数字, 大于 0 的值变成正数 | |
print(df.where(df<0,-df)) |
A B C D | |
2020-09-20 -1.218522 -2.067088 -0.015009 -0.158780 | |
2020-09-21 -0.546837 -0.601178 -0.894882 -0.172037 | |
2020-09-22 -0.189848 -0.910520 -0.196186 -0.073495 | |
2020-09-23 -0.566892 -0.899193 -0.450925 -0.633253 | |
2020-09-24 -0.038838 -1.577004 -0.580927 -0.609050 | |
2020-09-25 -1.562094 -0.020813 -0.618859 -0.515212 | |
2020-09-26 -1.333947 -0.275765 -0.139325 -1.124207 | |
2020-09-27 -1.271748 -1.082302 -1.036805 -1.041206 | |
# 查问小于 0 的数据, 大于 0 的置为 1000 | |
print(df.where(df<0,1000)) |
A B C D | |
2020-09-20 -1.218522 1000.000000 1000.000000 1000.000000 | |
2020-09-21 -0.546837 -0.601178 -0.894882 1000.000000 | |
2020-09-22 1000.000000 -0.910520 1000.000000 -0.073495 | |
2020-09-23 -0.566892 1000.000000 -0.450925 1000.000000 | |
2020-09-24 1000.000000 1000.000000 1000.000000 1000.000000 | |
2020-09-25 1000.000000 1000.000000 -0.618859 -0.515212 | |
2020-09-26 -1.333947 1000.000000 1000.000000 1000.000000 | |
2020-09-27 -1.271748 1000.000000 1000.000000 -1.041206 | |
# 结构一个 10 行 3 列的数据 | |
df=pd.DataFrame(np.random.randn(10,3),columns=list('abc')) | |
print(df) |
a b c | |
0 1.761415 0.528009 -0.347271 | |
1 -0.682149 0.353312 0.337229 | |
2 1.080733 -0.272290 1.020335 | |
3 -0.979681 -1.753745 0.836387 | |
4 0.243748 2.085531 -0.993318 | |
5 -1.041006 1.518130 -0.087383 | |
6 -1.400354 -0.095196 3.043639 | |
7 -0.835144 0.926415 -1.217102 | |
8 0.326098 1.079906 0.156884 | |
9 1.836618 -1.288516 -2.492620 | |
# 查问 a >b 的数据 | |
print(df.query('a>b')) |
a b c | |
0 1.761415 0.528009 -0.347271 | |
2 1.080733 -0.272290 1.020335 | |
3 -0.979681 -1.753745 0.836387 | |
9 1.836618 -1.288516 -2.492620 | |
# 查问 c >b>a 的数据 | |
print(df.query('(c<b) & (b<a)')) |
a b c | |
0 1.761415 0.528009 -0.347271 | |
9 1.836618 -1.288516 -2.492620 | |
正文完