共计 4285 个字符,预计需要花费 11 分钟才能阅读完成。
import pandas as pd
# Load the game logs, then report the frame's memory usage with deep
# introspection so the contents of object columns are counted as well.
gl = pd.read_csv(filepath_or_buffer='./pandas/data/game_logs.csv')
gl.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171907 entries, 0 to 171906
Columns: 161 entries, date to acquisition_info
dtypes: float64(77), int64(6), object(78)
memory usage: 859.4 MB
# For each dtype family, print the mean of the deep memory_usage() entries
# of the matching columns, converted from bytes to megabytes.
for dtype in ('float64', 'object', 'int64'):
    usage_per_entry = gl.select_dtypes(include=[dtype]).memory_usage(deep=True)
    mean_usage_mb = usage_per_entry.mean() / 1024 ** 2
    print('[%s] memory usage %0.2f MB' % (dtype, mean_usage_mb))
[float64] memory usage 1.29 MB
[object] memory usage 9.50 MB
[int64] memory usage 1.12 MB
# Value ranges of uint8, int8, int16, int32 and int64.
import numpy as np

integer_type_names = ('uint8', 'int8', 'int16', 'int32', 'int64')
for dtype in integer_type_names:
    limits = np.iinfo(dtype)
    print(limits)
Machine parameters for uint8
---------------------------------------------------------------
min = 0
max = 255
---------------------------------------------------------------
Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------
Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------
Machine parameters for int32
---------------------------------------------------------------
min = -2147483648
max = 2147483647
---------------------------------------------------------------
Machine parameters for int64
---------------------------------------------------------------
min = -9223372036854775808
max = 9223372036854775807
---------------------------------------------------------------
# Helper for comparing memory usage before and after a dtype conversion.
def mem_usage(data):
    """Return the deep memory usage of *data* formatted as ``"<x.xx> MB"``.

    Args:
        data: A pandas DataFrame or Series (anything exposing
            ``memory_usage(deep=True)``).

    Returns:
        The size in megabytes (bytes / 1024**2) with two decimals,
        followed by `` MB``.
    """
    usage = data.memory_usage(deep=True)
    if isinstance(data, pd.DataFrame):
        # For a DataFrame memory_usage() yields one entry per column (plus
        # the index), so the entries are totalled; a Series already gives
        # a single number.
        usage = usage.sum()
    # The original spec was "{:03.2f}"; a width of 3 can never pad a
    # two-decimal float, so the output is identical without it.
    return "{:.2f} MB".format(usage / 1024 ** 2)
gl_int64 = gl.select_dtypes(include=['int64'])
# Downcast every int64 column to the smallest unsigned integer type that can
# hold its values. The source frame is gl_int64 (the original referenced an
# undefined name, gl_int). The result keeps the name gl_int32 because later
# code refers to it, even though the resulting dtypes are unsigned.
gl_int32 = gl_int64.apply(pd.to_numeric, downcast='unsigned')
print(mem_usage(gl_int64))
print(mem_usage(gl_int32))
# Downcast the float64 columns to the smallest float type pandas will pick,
# then print the footprint before and after.
gl_float64 = gl.select_dtypes(include=['float64'])
gl_float = gl_float64.apply(lambda column: pd.to_numeric(column, downcast='float'))
print("转换前:", mem_usage(gl_float64), sep="")
print("转换后", mem_usage(gl_float), sep="")
7.87 MB
1.48 MB
转换前:100.99 MB
转换后 50.49 MB
# Work on a copy of the original frame and overwrite the numeric columns
# with their downcast versions, then compare the total footprint.
opt_gl = gl.copy()
for downcast_part in (gl_int32, gl_float):
    opt_gl[downcast_part.columns] = downcast_part
print("原数据的大小:", mem_usage(gl), sep="")
print("转换后的数据大小:", mem_usage(opt_gl), sep="")
原数据的大小:859.43 MB
转换后的数据大小:802.54 MB
# Take an independent copy of the object-dtype columns and print their
# summary statistics (count / unique / top / freq).
object_columns = gl.select_dtypes(include=['object'])
gl_obj = object_columns.copy()
object_summary = gl_obj.describe()
print(object_summary)
day_of_week v_name v_league h_name h_league day_night \
count 171907 171907 171907 171907 171907 140150
unique 7 148 7 148 7 2
top Sat CHN NL CHN NL D
freq 28891 8870 88866 9024 88867 82724
completion forefeit protest park_id ... h_player_6_id \
count 116 145 180 171907 ... 140838
unique 116 3 5 245 ... 4774
top 19590602,PIT06,2,1,39 H V STL07 ... grimc101
freq 1 69 90 7022 ... 427
h_player_6_name h_player_7_id h_player_7_name h_player_8_id \
count 140838 140838 140838 140838
unique 4720 5253 5197 4760
top Charlie Grimm grimc101 Charlie Grimm lopea102
freq 427 491 491 676
h_player_8_name h_player_9_id h_player_9_name additional_info \
count 140838 140838 140838 1456
unique 4710 5193 5142 332
top Al Lopez spahw101 Warren Spahn HTBF
freq 676 339 339 1112
acquisition_info
count 140841
unique 1
top Y
freq 140841
[4 rows x 78 columns]
# Compare one object column before and after conversion to ``category``.
dow = gl_obj['day_of_week']
print(dow.head())
dow_cat = dow.astype('category')
print(dow_cat.head())
print("转换前", mem_usage(dow), sep="")
print("转换后", mem_usage(dow_cat), sep="")
# Convert object columns whose values repeat a lot to ``category`` in order
# to reduce memory usage.
num_total = len(gl_obj)
# A column qualifies when fewer than half of its entries are distinct.
# nunique(dropna=False) counts NaN as a value, matching len(col.unique()).
# The ``num_total and`` guard avoids dividing by zero on an empty frame.
low_cardinality_cols = [
    col
    for col in gl_obj.columns
    if num_total and gl_obj[col].nunique(dropna=False) / num_total < 0.5
]
# One astype call with a column->dtype mapping returns a new frame in the
# original column order; columns not in the mapping keep their dtype. This
# replaces growing an empty DataFrame one column at a time through .loc.
convert_obj = gl_obj.astype({col: 'category' for col in low_cardinality_cols})
print('数据转换前:' + mem_usage(gl_obj))
print('数据转换后:' + mem_usage(convert_obj))
opt_gl[convert_obj.columns] = convert_obj
print(mem_usage(opt_gl))
# apply() examples: load the Titanic training data.
titanic = pd.read_csv(filepath_or_buffer='./pandas/data/titanic_train.csv')
# Bare expression: selects the row at integer position 99 (a notebook would
# display it); the result is not stored.
titanic.iloc[99]
# Fetch the entry at a fixed integer position (position 99 by default).
def get_row(data, position=99):
    """Return the element of *data* at integer position *position*.

    Args:
        data: An object supporting ``.iloc`` (for example a pandas Series).
        position: Zero-based integer position to read; defaults to 99, the
            value that was previously hard-coded.

    Returns:
        ``data.iloc[position]``.
    """
    return data.iloc[position]
# Apply get_row to every column (axis=0 is the default for apply).
row = titanic.apply(get_row, axis=0)
# Bare expression: a notebook would display the result; no effect otherwise.
row
# Count the missing (NaN / None) values in a column.
def get_null_count(data):
    """Return how many entries of *data* are null according to ``pd.isnull``.

    Args:
        data: A one-dimensional pandas Series (or array-like).

    Returns:
        The number of null entries as a plain ``int``.
    """
    # Summing the boolean mask counts the True entries directly, without
    # building a filtered copy of the data just to measure its length.
    return int(pd.isnull(data).sum())
# Run get_null_count on every column (axis=0 is the default for apply) and
# print the per-column result.
null_count = titanic.apply(get_null_count, axis=0)
print(null_count)
# Map the numeric passenger class of a row to a word label.
def which_class(row):
    """Return a word label for ``row['Pclass']``.

    Args:
        row: A mapping or pandas Series with a ``'Pclass'`` entry.

    Returns:
        ``"One"``, ``"Two"`` or ``"Three"`` for classes 1, 2 and 3, and
        ``"Unknown"`` for a missing or unrecognised class.
    """
    labels = {1: "One", 2: "Two", 3: "Three"}
    # Missing values (NaN / None) are not keys of the table, so they fall
    # through to the default together with any unrecognised class value.
    return labels.get(row['Pclass'], "Unknown")
# axis=1 hands each row to which_class, producing one label per row.
class_labels = titanic.apply(which_class, axis=1)
classes = class_labels
print(classes)
# Flag rows that belong to a minor (age below 18).
def is_minor(row):
    """Return ``True`` when ``row['Age']`` is below 18, otherwise ``False``.

    Args:
        row: A mapping or pandas Series with an ``'Age'`` entry.

    Returns:
        A plain ``bool``. A NaN age compares as not less than 18, so such
        rows yield ``False``.
    """
    return bool(row['Age'] < 18)
# Build a per-row boolean mask with is_minor, then print the selected rows.
minor = titanic.apply(is_minor, axis=1)
minors_only = titanic[minor]
print(minors_only)
正文完