import pandas as pd
import numpy as np
import jieba
# ---- Load the data ----
# Read the source workbook; the sheet's first column is used as the row index.
df = pd.read_excel(
    io=r'E:\python爬虫\前程无忧招聘信息.xlsx',
    index_col=0,
)
# ---- De-duplicate rows and handle missing values ----
# Keep the first row for every (公司名称, 岗位名称) pair; df is modified in place.
df.drop_duplicates(
    subset=['公司名称', '岗位名称'],
    keep='first',
    inplace=True,
)
# Bare expression: selects the rows whose '招聘人数' is missing. The result is
# discarded, so this has no effect when the file runs as a script.
df.loc[df['招聘人数'].isna()]
# Remove only the rows in which every column is empty.
df.dropna(axis=0, how='all', inplace=True)
# ---- Clean the job title (岗位名称) column ----
# Lower-case the job titles so the keyword matching below is case-insensitive.
# The vectorised .str accessor leaves missing titles as NaN instead of raising
# AttributeError the way a per-element `x.lower()` does.
df['岗位名称'] = df['岗位名称'].str.lower()
counts = df['岗位名称'].value_counts()  # title frequencies; not used further in this section

# A posting is kept when its title contains at least one of these keywords.
target_job = ['算法','开发','剖析','工程师','数据','经营','运维','it','仓库','统计']

# Boolean NumPy mask, one entry per row of df. Keywords are matched as literal
# substrings (regex=False) and missing titles count as "no match" (na=False).
index = np.logical_or.reduce(
    [
        df['岗位名称'].str.contains(keyword, regex=False, na=False).to_numpy()
        for keyword in target_job
    ]
)

# .copy() gives job_info its own data, so the column assignments made on it
# later do not hit pandas' chained-assignment (SettingWithCopy) path.
job_info = df[index].copy()
# Canonical job categories in priority order: when a title contains several
# entries, the one listed first wins.
# NOTE(review): 'andrio' can never match a title containing "android" --
# possibly a typo; confirm before changing, since it alters the output.
job_list = np.array([
    '数据分析', "数据统计", "数据专员", '数据挖掘', '算法', '大数据', '开发工程师',
    '经营', '软件工程', '前端开发', '深度学习', 'ai', '数据库', '仓库治理', '数据产品',
    '客服', 'java', '.net', 'andrio', '人工智能', 'c++', '数据管理',
    "测试", "运维", "数据工程师",
])


def Rename(x, job_list=job_list):
    """Map a raw job title onto the first canonical category it contains.

    Args:
        x: Job title. Values that are not strings (e.g. NaN or None) are
            returned untouched instead of raising TypeError.
        job_list: Iterable of category substrings, searched in order.
            Defaults to the module-level ``job_list`` array; a plain list
            or tuple works as well.

    Returns:
        The first element of ``job_list`` that occurs as a substring of
        ``x``, or ``x`` itself when nothing matches or ``x`` is not a string.
    """
    if not isinstance(x, str):
        return x
    for category in job_list:
        if category in x:
            return category
    return x
# Collapse every title to its category via Rename, then rewrite two of the
# resulting labels. The replacements run in the order listed.
job_info['岗位名称'] = job_info['岗位名称'].map(Rename)
for old_label, new_label in (
    ("数据专员", "数据分析"),
    ("数据统计", "股指期货数据分析"),
):
    job_info["岗位名称"] = job_info["岗位名称"].map(
        lambda title, old=old_label, new=new_label: title.replace(old, new)
    )
# ---- Clean the salary (岗位薪资) column ----
# Keep only salaries whose last character is the period (年 / 月) and whose
# third-from-last character is the magnitude (万 / 千), i.e. strings that end
# in a three-character unit such as "万/月".
index1 = job_info["岗位薪资"].str[-1].isin(["年", "月"])
index2 = job_info["岗位薪资"].str[-3].isin(["万", "千"])
# .copy() detaches the filtered rows so the new columns assigned below are
# written to an independent frame rather than to a slice of the old one.
job_info = job_info[index1 & index2].copy()

# Drop the three-character unit suffix, split the remainder on '-' and average
# the parts; a single figure such as "1.5" averages to itself.
job_info['均匀薪资'] = job_info['岗位薪资'].astype(str).apply(
    lambda x: np.array(x[:-3].split('-'), dtype=float).mean()
)
# ---- Unify the salary unit ----
# The unit is the trailing three characters of the salary string.
job_info['单位'] = job_info['岗位薪资'].map(lambda salary: salary[-3:])
# Bare expression: the frequency table is discarded, so this has no effect
# when the file runs as a script.
job_info['公司畛域'].value_counts(dropna=True)
def con_unit(x):
    """Convert one row's average salary to whole yuan per month.

    Args:
        x: Row-like mapping (for example a DataFrame row) providing
            ``'单位'`` -- one of "万/月", "千/月", "万/年", "千/年" -- and a
            numeric ``'均匀薪资'`` expressed in that unit.

    Returns:
        int: The salary in yuan per month, truncated toward zero.

    Raises:
        ValueError: If the unit is not one of the four supported values.
    """
    # unit -> (months covered by the figure, yuan per unit of magnitude)
    factors = {
        "万/月": (1, 10000),
        "千/月": (1, 1000),
        "万/年": (12, 10000),
        # "千/年" has the same three-character shape as the other units but
        # had no branch before, which left the result variable unbound.
        "千/年": (12, 1000),
    }
    unit = x['单位']
    if unit not in factors:
        raise ValueError(f"unsupported salary unit: {unit!r}")
    months, scale = factors[unit]
    return int(x['均匀薪资'] / months * scale)
# Run con_unit once per row (axis='columns' is the same as axis=1), then
# overwrite the unit column with the single constant '元/月'.
job_info['均匀薪资'] = job_info.apply(func=con_unit, axis='columns')
job_info['单位'] = '元/月'
# ---- Clean the work location (工作地点) column ----
# Keep only the text before the first '-' of each location string.
job_info['工作地点'] = job_info['工作地点'].map(
    lambda place: place.partition('-')[0]
)
# ---- Clean the company industry (公司畛域) column ----
# Keep only the text before the first '/' of each industry string.
job_info['公司畛域'] = job_info['公司畛域'].map(
    lambda field: field.partition('/')[0]
)
# ---- Clean the headcount (招聘人数) column ----
# Replace "若干" with "1", trim surrounding whitespace, then drop the first and
# the last character of what remains. The .str accessor passes missing values
# through as NaN, whereas a per-element `x.replace(...)` raises AttributeError
# on them.
job_info['招聘人数'] = (
    job_info['招聘人数']
    .str.replace("若干", "1", regex=False)
    .str.strip()
    .str[1:-1]
)
# ---- Clean the work experience and education requirement columns ----
# Experience: replace "无需" with "1年以下", trim whitespace, then cut the last
# two characters.
job_info['工作教训'] = job_info['工作教训'].map(
    lambda experience: experience.replace("无需", "1年以下").strip()[:-2]
)
# Education: keep the first whitespace-separated token.
job_info['学历需要'] = job_info['学历需要'].map(
    lambda education: education.split()[0]
)
# ---- Clean the company size (公司规模) column ----
# Bare expression: the frequency table of company sizes is discarded, so this
# has no effect when the file runs as a script.
job_info['公司规模'].value_counts(dropna=True)
def func(x):
    """Translate a company-size label into a compact numeric range string.

    Args:
        x: Size label such as '少于50人' or '500-1000人'.

    Returns:
        The matching range string ("<50", "50-150", ..., ">10000"), or
        ``np.nan`` when ``x`` is not one of the seven known labels
        (including when it is not a string at all).
    """
    size_labels = {
        '少于50人': "<50",
        '50-150人': "50-150",
        '150-500人': '150-500',
        '500-1000人': '500-1000',
        '1000-5000人': '1000-5000',
        '5000-10000人': '5000-10000',
        '10000人以上': ">10000",
    }
    # Guard first so unhashable or missing inputs fall through to NaN, as
    # they did with the original equality chain.
    if not isinstance(x, str):
        return np.nan
    return size_labels.get(x, np.nan)
# Rewrite every company-size label through func, element by element.
job_info['公司规模'] = job_info['公司规模'].map(func)
# ---- Clean the company benefits (公司福利) column ----
# Stringify each value and split it on whitespace, giving one list per row.
job_info['公司福利'] = job_info['公司福利'].map(
    lambda perks: str(perks).split()
)
# ---- Clean the job description (职位信息) column ----
# Keep only the part of each description that precedes the '职能类别' marker.
job_info['职位信息'] = job_info['职位信息'].apply(lambda x: x.split('职能类别')[0])

# Load the whitespace-separated stop-word list.
with open(r"E:\C++\停用词表.txt", 'r', encoding='utf8') as f:
    stopword = f.read().split()
# `stopword` stays a list for any other code that uses it; the frozenset makes
# the per-token membership test O(1) instead of a linear scan of the list.
_stopword_lookup = frozenset(stopword)

# Lower-case and trim each description, tokenise it with jieba, and drop the
# stop words. The result is one list of tokens per row.
job_info['职位信息'] = job_info['职位信息'].apply(
    lambda x: [
        token
        for token in jieba.lcut(x.lower().strip())
        if token not in _stopword_lookup
    ]
)
# Build a long table with one row per (token, industry) pair.
cons = job_info['公司畛域'].value_counts()
industries = pd.DataFrame(cons.index, columns=['行业畛域'])

# Collect one frame per industry and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the growing frame on every iteration.
_frames = []
for i in industries['行业畛域']:
    word = job_info['职位信息'][job_info['公司畛域'] == i].dropna()
    words = []
    for z in word:
        if isinstance(z, (list, tuple)):
            # Already tokenised: use the tokens as they are. Round-tripping
            # through str(list) would turn e.g. a newline token into the two
            # characters backslash + 'n' and mis-split tokens containing quotes.
            words.extend(z)
        else:
            # Fallback for values stored as the text form of a list.
            words.extend(str(z).strip('\'[]').split("\', \'"))
    if words:
        df1 = pd.DataFrame({'分词明细': words, '行业畛域': i})
        _frames.append(df1)

if _frames:
    industry = pd.concat(_frames, ignore_index=True)
else:
    # No tokens at all: keep the empty two-column frame.
    industry = pd.DataFrame(columns=['分词明细', '行业畛域'])
# Discard newline and empty-string tokens.
industry = industry[~industry['分词明细'].isin(["\n", ""])]

# Token frequencies as a one-column frame. The column is named explicitly
# because pandas >= 2.0 calls the value_counts() result 'count', which made
# the `count['分词明细']` lookup below raise KeyError.
count = industry['分词明细'].value_counts().to_frame(name='分词明细')

# Keep only tokens that occur at least 300 times overall.
lst = list(count[count['分词明细'] >= 300].index)
industry = industry[industry['分词明细'].isin(lst)]
# ---- Save the results ----
# Write both result tables to Excel: the token table first, then the cleaned
# postings.
for _frame, _target in (
    (industry, r'E:\python爬虫\数据预处理\词云.xlsx'),
    (job_info, r'E:\python爬虫\数据预处理\前程无忧(已荡涤).xlsx'),
):
    _frame.to_excel(_target)