import pandas as pd

# Load the game-log CSV into the DataFrame used by the memory experiments below.
gl = pd.read_csv('./pandas/data/game_logs.csv')
# Memory usage of the data; 'deep' asks pandas to introspect object columns
# instead of estimating from dtype and row count.
gl.info(memory_usage='deep')
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 171907 entries, 0 to 171906
# Columns: 161 entries, date to acquisition_info
# dtypes: float64(77), int64(6), object(78)
# memory usage: 859.4 MB
# Report the mean memory footprint, in MB, of the columns of each dtype.
# The mean is taken over everything memory_usage() reports for the selection.
for dtype in ('float64', 'object', 'int64'):
    columns_of_type = gl.select_dtypes(include=[dtype])
    mean_bytes = columns_of_type.memory_usage(deep=True).mean()
    mean_mb = mean_bytes / 1024 ** 2
    print('[%s] memory usage %0.2f MB' % (dtype, mean_mb))
# [float64] memory usage 1.29 MB
# [object] memory usage 9.50 MB
# [int64] memory usage 1.12 MB
# Value ranges of the uint8, int8, int16, int32 and int64 integer types.
import numpy as np

for int_type in ('uint8', 'int8', 'int16', 'int32', 'int64'):
    limits = np.iinfo(int_type)
    print(limits)
# Machine parameters for uint8
# ---------------------------------------------------------------
# min = 0
# max = 255
# ---------------------------------------------------------------
# Machine parameters for int8
# ---------------------------------------------------------------
# min = -128
# max = 127
# ---------------------------------------------------------------
# Machine parameters for int16
# ---------------------------------------------------------------
# min = -32768
# max = 32767
# ---------------------------------------------------------------
# Machine parameters for int32
# ---------------------------------------------------------------
# min = -2147483648
# max = 2147483647
# ---------------------------------------------------------------
# Machine parameters for int64
# ---------------------------------------------------------------
# min = -9223372036854775808
# max = 9223372036854775807
# ---------------------------------------------------------------
# Memory occupied by the data after type conversion.
def mem_usage(data):
    """Return the deep memory usage of *data* formatted as an "N.NN MB" string.

    Args:
        data: A pandas DataFrame, or any other object whose
            ``memory_usage(deep=True)`` returns a single byte count
            (the script passes Series).

    Returns:
        The size in mebibytes rendered with two decimals and an " MB" suffix.
    """
    if isinstance(data, pd.DataFrame):
        # A DataFrame reports one byte count per entry; add them up.
        mem_b = data.memory_usage(deep=True).sum()
    else:
        mem_b = data.memory_usage(deep=True)
    return "{:03.2f} MB".format(mem_b / 1024 ** 2)


gl_int64 = gl.select_dtypes(include=['int64'])
# Downcast each integer column to the smallest unsigned type that holds its
# values. The source frame is `gl_int64`; the previous `gl_int` was never
# defined and raised NameError.
# NOTE: the `gl_int32` name is kept because later cells use it, but
# downcast='unsigned' does not necessarily produce int32 columns.
gl_int32 = gl_int64.apply(pd.to_numeric, downcast='unsigned')
print(mem_usage(gl_int64))
print(mem_usage(gl_int32))

# Downcast the float64 columns to a smaller float type.
gl_float64 = gl.select_dtypes(include=['float64'])
gl_float = gl_float64.apply(pd.to_numeric, downcast='float')
print("转换前:" + mem_usage(gl_float64))
print("转换后" + mem_usage(gl_float))
# 7.87 MB
# 1.48 MB
# 转换前:100.99 MB
# 转换后50.49 MB
# Build an optimized copy of the frame by swapping in the downcast integer
# and float columns, then compare total sizes.
opt_gl = gl.copy()
for downcast_part in (gl_int32, gl_float):
    opt_gl[downcast_part.columns] = downcast_part
print("原数据的大小:" + mem_usage(gl))
print("转换后的数据大小:" + mem_usage(opt_gl))
# 原数据的大小:859.43 MB
# 转换后的数据大小:802.54 MB
# Work on an independent copy of the object-dtype columns and summarize them.
gl_obj = gl.select_dtypes(include=['object']).copy()
object_summary = gl_obj.describe()
print(object_summary)
#        day_of_week  v_name v_league  h_name h_league day_night  \
# count       171907  171907   171907  171907   171907    140150
# unique           7     148        7     148        7         2
# top            Sat     CHN       NL     CHN       NL         D
# freq         28891    8870    88866    9024    88867     82724
#
#                    completion forefeit protest park_id  ...  h_player_6_id  \
# count                     116      145     180  171907  ...         140838
# unique                    116        3       5     245  ...           4774
# top     19590602,PIT06,2,1,39        H       V   STL07  ...       grimc101
# freq                        1       69      90    7022  ...            427
#
#        h_player_6_name h_player_7_id h_player_7_name h_player_8_id  \
# count           140838        140838          140838        140838
# unique            4720          5253            5197          4760
# top      Charlie Grimm      grimc101   Charlie Grimm      lopea102
# freq               427           491             491           676
#
#        h_player_8_name h_player_9_id h_player_9_name additional_info  \
# count           140838        140838          140838            1456
# unique            4710          5193            5142             332
# top           Al Lopez      spahw101    Warren Spahn            HTBF
# freq               676           339             339            1112
#
#        acquisition_info
# count            140841
# unique                1
# top                   Y
# freq             140841
#
# [4 rows x 78 columns]
# Take the day_of_week column and preview it.
dow = gl_obj['day_of_week']
print(dow.head())

# Convert the same column to the category dtype and preview it.
dow_cat = dow.astype('category')
print(dow_cat.head())

# Compare the memory footprint before and after the conversion.
print("转换前" + mem_usage(dow))
print("转换后" + mem_usage(dow_cat))
# Convert object columns made up mostly of repeated values to the category
# dtype to reduce memory. A column is converted when its distinct values are
# fewer than half of its rows; otherwise it is carried over unchanged.
convert_obj = pd.DataFrame()
for col in gl_obj.columns:
    column = gl_obj[col]
    distinct_count = len(column.unique())
    row_count = len(column)
    mostly_repeated = distinct_count / row_count < 0.5
    convert_obj.loc[:, col] = column.astype('category') if mostly_repeated else column
print('数据转换前:' + mem_usage(gl_obj))
print('数据转换后:' + mem_usage(convert_obj))
# Put the converted object columns back into the optimized frame and report
# its resulting size.
opt_gl[convert_obj.columns] = convert_obj
optimized_size = mem_usage(opt_gl)
print(optimized_size)
# apply operations
# Load the Titanic training set used by the apply() examples.
titanic = pd.read_csv('./pandas/data/titanic_train.csv')
# Bare expression: shows the row at position 99 in a notebook; it has no
# visible effect when run as a plain script.
titanic.iloc[99]
# Fetch the data at position 99.
def get_row(data):
    """Return the entry of *data* at integer position 99."""
    entry = data.iloc[99]
    return entry


# apply() is called without an axis argument here, so get_row is invoked by
# pandas once per slice of the frame.
row = titanic.apply(get_row)
row
# Count the NaN entries in every column.
def get_null_count(data):
    """Return how many entries remain after selecting *data* with its null mask."""
    missing_mask = pd.isnull(data)
    missing_entries = data[missing_mask]
    return len(missing_entries)


null_count = titanic.apply(get_null_count)
print(null_count)
# Data transformation: turn the numeric Pclass into a text label.
_PCLASS_LABELS = {1: "One", 2: "Two", 3: "Three"}


def which_class(row):
    """Return a text label for the ``Pclass`` value of *row*.

    Args:
        row: A mapping-like record (the script passes DataFrame rows)
            exposing a ``'Pclass'`` key.

    Returns:
        "Unknown" when the value is null, "One"/"Two"/"Three" for 1/2/3,
        and None for any other value.
    """
    pclass = row['Pclass']
    if pd.isnull(pclass):
        # Label spelling fixed (was "UnKown").
        return "Unknown"
    # Label for class 2 fixed (was "Tow"); unmatched values still yield None.
    return _PCLASS_LABELS.get(pclass)


classes = titanic.apply(which_class, axis=1)
print(classes)
# Find the records of passengers who are minors.
def is_minor(row):
    """Return True when the ``Age`` value of *row* is below 18, else False."""
    age = row['Age']
    # A comparison that evaluates false (as it does for NaN) yields False.
    return bool(age < 18)


minor = titanic.apply(is_minor, axis=1)
print(titanic[minor])