import pandas as pd

# Load the game-log CSV into a DataFrame.
gl = pd.read_csv('./pandas/data/game_logs.csv')

# Print the frame summary, including its memory usage. memory_usage='deep'
# asks pandas to inspect object columns instead of estimating their size.
gl.info(memory_usage='deep')
# Output:
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 171907 entries, 0 to 171906
# Columns: 161 entries, date to acquisition_info
# dtypes: float64(77), int64(6), object(78)
# memory usage: 859.4 MB
# For each dtype family, print the mean of the per-entry memory usage
# (as returned by DataFrame.memory_usage(deep=True)), converted to MB.
for dtype in ('float64', 'object', 'int64'):
    selected_dtype = gl.select_dtypes(include=[dtype])
    per_entry_bytes = selected_dtype.memory_usage(deep=True)
    memory_usage_b = per_entry_bytes.mean()
    memory_usage_mb = memory_usage_b / 1024 / 1024
    print('[%s] memory usage %0.2f MB' % (dtype, memory_usage_mb))
# Output:
# [float64] memory usage 1.29 MB
# [object] memory usage 9.50 MB
# [int64] memory usage 1.12 MB
# Value ranges of the uint8, int8, int16, int32 and int64 integer types.
import numpy as np

for dtype in ('uint8', 'int8', 'int16', 'int32', 'int64'):
    # np.iinfo reports the min/max representable value of an integer dtype.
    print(np.iinfo(dtype))
# Output:
# Machine parameters for uint8
# ---------------------------------------------------------------
# min = 0
# max = 255
# ---------------------------------------------------------------
#
# Machine parameters for int8
# ---------------------------------------------------------------
# min = -128
# max = 127
# ---------------------------------------------------------------
#
# Machine parameters for int16
# ---------------------------------------------------------------
# min = -32768
# max = 32767
# ---------------------------------------------------------------
#
# Machine parameters for int32
# ---------------------------------------------------------------
# min = -2147483648
# max = 2147483647
# ---------------------------------------------------------------
#
# Machine parameters for int64
# ---------------------------------------------------------------
# min = -9223372036854775808
# max = 9223372036854775807
# ---------------------------------------------------------------
# Memory used by the data before and after dtype conversion.
def mem_usage(data):
    """Return the deep memory usage of *data* formatted as ``"<n.nn> MB"``.

    Args:
        data: A pandas DataFrame or Series.

    Returns:
        A string with the size in megabytes, two decimal places.
    """
    if isinstance(data, pd.DataFrame):
        # A DataFrame reports one value per entry, so add them up.
        mem_b = data.memory_usage(deep=True).sum()
    else:
        # Anything else (a Series here) is expected to report a single total.
        mem_b = data.memory_usage(deep=True)
    return "{:03.2f} MB".format(mem_b / 1024 ** 2)


# Downcast the int64 columns.
# NOTE: despite its name, gl_int32 holds whatever smaller unsigned type
# pd.to_numeric(downcast='unsigned') selects for each column.
gl_int64 = gl.select_dtypes(include=['int64'])
# Fixed: the original applied the conversion to the undefined name `gl_int`.
gl_int32 = gl_int64.apply(pd.to_numeric, downcast='unsigned')
print(mem_usage(gl_int64))
print(mem_usage(gl_int32))

# Downcast the float64 columns to a smaller float type.
gl_float64 = gl.select_dtypes(include=['float64'])
gl_float = gl_float64.apply(pd.to_numeric, downcast='float')
print("转换前:" + mem_usage(gl_float64))
print("转换后" + mem_usage(gl_float))
# Output:
# 7.87 MB
# 1.48 MB
# 转换前:100.99 MB
# 转换后50.49 MB
# Work on a copy so `gl` keeps its original dtypes, then overwrite the
# integer and float columns with their downcast versions.
opt_gl = gl.copy()
for downcast_frame in (gl_int32, gl_float):
    opt_gl[downcast_frame.columns] = downcast_frame

# Compare the size of the original frame with the optimised copy.
print("原数据的大小:" + mem_usage(gl))
print("转换后的数据大小:" + mem_usage(opt_gl))
# Output:
# 原数据的大小:859.43 MB
# 转换后的数据大小:802.54 MB
# Take an independent copy of the object-dtype columns and summarise them
# (describe() reports count / unique / top / freq for object columns).
object_columns = gl.select_dtypes(include=['object'])
gl_obj = object_columns.copy()
print(gl_obj.describe())
# Output:
#        day_of_week  v_name v_league  h_name h_league day_night  \
# count       171907  171907   171907  171907   171907    140150
# unique           7     148        7     148        7         2
# top            Sat     CHN       NL     CHN       NL         D
# freq         28891    8870    88866    9024    88867     82724
#
#                    completion forefeit protest park_id  ... h_player_6_id  \
# count                     116      145     180  171907  ...        140838
# unique                    116        3       5     245  ...          4774
# top     19590602,PIT06,2,1,39        H       V   STL07  ...      grimc101
# freq                        1       69      90    7022  ...           427
#
#        h_player_6_name h_player_7_id h_player_7_name h_player_8_id  \
# count           140838        140838          140838        140838
# unique            4720          5253            5197          4760
# top      Charlie Grimm      grimc101   Charlie Grimm      lopea102
# freq               427           491             491           676
#
#        h_player_8_name h_player_9_id h_player_9_name additional_info  \
# count           140838        140838          140838            1456
# unique            4710          5193            5142             332
# top           Al Lopez      spahw101    Warren Spahn            HTBF
# freq               676           339             339            1112
#
#        acquisition_info
# count            140841
# unique                1
# top                   Y
# freq             140841
#
# [4 rows x 78 columns]
# Preview the day_of_week column as stored (object dtype).
dow = gl_obj['day_of_week']
print(dow.head())

# Preview the same column converted to the category dtype.
dow_cat = dow.astype('category')
print(dow_cat.head())

# Memory used by the column before and after the conversion.
print("转换前" + mem_usage(dow))
print("转换后" + mem_usage(dow_cat))
# Convert object columns with many repeated values to the category dtype
# to reduce memory: a column is converted when fewer than half of its
# entries are distinct, otherwise it is carried over unchanged.
convert_obj = pd.DataFrame()
for col in gl_obj.columns:
    column = gl_obj[col]
    # dropna=False counts a missing value as a distinct value.
    num_unique = column.nunique(dropna=False)
    num_total = len(column)
    is_repetitive = num_unique / num_total < 0.5
    convert_obj[col] = column.astype('category') if is_repetitive else column

print('数据转换前:' + mem_usage(gl_obj))
print('数据转换后:' + mem_usage(convert_obj))

# Write the converted object columns back into the optimised frame and
# report its overall size.
opt_gl[convert_obj.columns] = convert_obj
print(mem_usage(opt_gl))
# Using DataFrame.apply
titanic = pd.read_csv('./pandas/data/titanic_train.csv')
# Bare expression: only displays a value in an interactive session.
titanic.iloc[99]


# Fetch the data at row position 99.
def get_row(data):
    """Return the element of *data* at integer position 99."""
    position = 99
    return data.iloc[position]


# apply() hands each column to get_row, so the result collects the value
# at position 99 from every column.
row = titanic.apply(get_row)
# Bare expression: only displays a value in an interactive session.
row
# Count the NaN entries in every column.
def get_null_count(data):
    """Return how many entries of the column *data* are null."""
    # Summing the boolean null mask counts its True entries.
    return int(pd.isnull(data).sum())


null_count = titanic.apply(get_null_count)
print(null_count)
# Map the numeric passenger class to a text label.
_PCLASS_LABELS = {1: "One", 2: "Two", 3: "Three"}


def which_class(row):
    """Return the text label for the ``Pclass`` value of *row*.

    Args:
        row: A mapping-like row (a Series when called through
            ``DataFrame.apply(..., axis=1)``) with a ``'Pclass'`` entry.

    Returns:
        ``"Unknown"`` when the class is null, ``"One"`` / ``"Two"`` /
        ``"Three"`` for classes 1 / 2 / 3, and ``None`` for any other value.
    """
    pclass = row['Pclass']
    if pd.isnull(pclass):
        # Fixed label spelling (was "UnKown").
        return "Unknown"
    # Fixed label spelling for class 2 (was "Tow"). Unrecognised classes
    # still yield None, now explicitly via dict.get.
    return _PCLASS_LABELS.get(pclass)


classes = titanic.apply(which_class, axis=1)
print(classes)
# Select the rows describing minors.
def is_minor(row):
    """Return True when the ``Age`` value of *row* is below 18, else False."""
    # A NaN age compares as not-less-than, so it yields False.
    return bool(row['Age'] < 18)


# Build a per-row boolean mask and use it to filter the frame.
minor = titanic.apply(is_minor, axis=1)
print(titanic[minor])