import pandas as pd
import numpy as np
import jieba

# 数据读取 (load the data)

# Read the job-postings workbook; the first column of the sheet becomes the
# DataFrame index.
raw_workbook = r'E:\python爬虫\前程无忧招聘信息.xlsx'
df = pd.read_excel(raw_workbook, index_col=0)

# 数据去重与空值处理 (de-duplication and missing-value handling)

# Rows sharing both company name and job title count as duplicates; the
# default keep='first' retains the first occurrence of each pair.
df = df.drop_duplicates(subset=['公司名称', '岗位名称'])
# Drop only the rows in which every column is missing. (The former bare
# `df[df['招聘人数'].isnull()]` expression was removed: its result was
# discarded, so it had no effect outside an interactive session.)
df = df.dropna(how='all')

# 岗位名称字段处理 (job-title column clean-up)

# Lower-case every title so the keyword matching below is case-insensitive.
# The vectorised accessor leaves missing titles as NaN, whereas calling
# `x.lower()` on a float NaN raises AttributeError.
df['岗位名称'] = df['岗位名称'].str.lower()
counts = df['岗位名称'].value_counts()
target_job = ['算法','开发','剖析','工程师','数据','经营','运维','it','仓库','统计']
# Boolean row mask: True where the title contains at least one keyword.
# None of the keywords contains a regex metacharacter, so a literal substring
# test selects the same rows as counting regex matches did; missing titles
# are treated as "no match".
index = np.logical_or.reduce(
    [df['岗位名称'].str.contains(i, regex=False, na=False) for i in target_job]
)
# Copy the selection: columns of job_info are assigned to further down, and
# writing into a filtered selection triggers pandas' SettingWithCopyWarning.
job_info = df[index].copy()
# Canonical job categories. Order matters: Rename returns the FIRST entry that
# occurs in a title, so earlier entries win when several match.
# NOTE(review): 'andrio' is not a substring of 'android', so it never matches
# titles spelled "android" — confirm the intended spelling.
job_list = [
    '数据分析', "数据统计", "数据专员", '数据挖掘', '算法', '大数据', '开发工程师',
    '经营', '软件工程', '前端开发', '深度学习', 'ai', '数据库', '仓库治理', '数据产品',
    '客服', 'java', '.net', 'andrio', '人工智能', 'c++', '数据管理', "测试", "运维",
    "数据工程师",
]
job_list = np.array(job_list)


def Rename(x, job_list=job_list):
    """Map a raw job title to its canonical category.

    Args:
        x: The job title. Non-string values (e.g. NaN) are returned unchanged.
        job_list: Iterable of category keywords, checked in order. Defaults to
            the module-level ``job_list`` array.

    Returns:
        The first keyword of ``job_list`` that is a substring of ``x``, or
        ``x`` itself when no keyword matches.
    """
    # `keyword in x` is only meaningful for strings; pass anything else through
    # instead of raising TypeError.
    if not isinstance(x, str):
        return x
    # Stop at the first hit rather than building a full match mask per title.
    for keyword in job_list:
        if keyword in x:
            return keyword
    return x

# Collapse each title to its canonical category, then merge two categories:
# "数据专员" is rewritten to "数据分析" first, and afterwards "数据统计" is
# rewritten to "股指期货数据分析".
job_info['岗位名称'] = (
    job_info['岗位名称']
    .apply(Rename)
    .apply(lambda title: title.replace("数据专员", "数据分析"))
    .apply(lambda title: title.replace("数据统计", "股指期货数据分析"))
)

# 岗位薪资字段处理 (salary column clean-up)

# Keep only salaries whose last character is 年/月 and whose third-from-last
# character is 万/千, i.e. strings that end in a 3-character unit such as
# "万/月". Missing salaries fail both tests and are dropped.
index1 = job_info["岗位薪资"].str[-1].isin(["年","月"])
index2 = job_info["岗位薪资"].str[-3].isin(["万","千"])
# Copy the selection so the column assignments below write into an
# independent frame (avoids SettingWithCopyWarning).
job_info = job_info[index1 & index2].copy()
# Strip the 3-character unit, split the remainder on '-', and average the
# resulting numbers (a single number averages to itself).
job_info['均匀薪资'] = job_info['岗位薪资'].astype(str).apply(
    lambda x: np.mean(np.array(x[:-3].split('-'), dtype=float))
)

# 统一工资单位 (normalise the salary unit)

# The last three characters of the salary string are stored as its unit.
job_info['单位'] = job_info['岗位薪资'].map(lambda salary: salary[-3:])
def con_unit(x):
    """Convert one row's average salary to whole yuan per month.

    Args:
        x: A row-like mapping (for example a DataFrame row) providing '单位'
            (one of "万/月", "千/月", "万/年", "千/年") and the numeric
            '均匀薪资' expressed in that unit.

    Returns:
        int: The salary in yuan per month, truncated toward zero.

    Raises:
        ValueError: If '单位' is not one of the supported units.
    """
    unit = x['单位']
    amount = x['均匀薪资']
    if unit == "万/月":
        z = amount * 10000
    elif unit == "千/月":
        z = amount * 1000
    elif unit == "万/年":
        z = amount / 12 * 10000
    elif unit == "千/年":
        # A salary ending in "千/年" ends in 年 with 千 third from last, which
        # are the suffix characters the script filters on, so it needs a branch.
        z = amount / 12 * 1000
    else:
        # Previously an unknown unit fell through and hit an unbound `z`.
        raise ValueError("unsupported salary unit: {!r}".format(unit))
    return int(z)

# Run con_unit once per row and overwrite the average-salary column with its
# result.
job_info['均匀薪资'] = job_info.apply(con_unit, axis='columns')
# After the conversion every row shares the same unit label.
job_info['单位'] = '元/月'

# 工作地点字段处理 (work-location column clean-up)

# Keep only the text before the first '-' (the whole value when there is no
# '-').
job_info['工作地点'] = job_info['工作地点'].map(lambda place: place.partition('-')[0])

# 公司畛域字段处理 (company-industry column clean-up)

# Keep only the text before the first '/' (the whole value when there is no
# '/').
job_info['公司畛域'] = job_info['公司畛域'].map(lambda field: field.partition('/')[0])

# 招聘人数字段处理 (headcount column clean-up)

def _clean_headcount(raw):
    """Replace "若干" with "1", trim whitespace, drop the first and last character."""
    # presumably values look like "招3人", leaving just the number — TODO confirm
    trimmed = raw.replace("若干", "1").strip()
    return trimmed[1:-1]


job_info['招聘人数'] = job_info['招聘人数'].map(_clean_headcount)

# 工作教训与学历要求字段处理 (work-experience and education columns clean-up)

def _clean_experience(raw):
    """Replace "无需" with "1年以下", trim whitespace, drop the last two characters."""
    trimmed = raw.replace("无需", "1年以下").strip()
    return trimmed[:-2]


job_info['工作教训'] = job_info['工作教训'].map(_clean_experience)
# Keep only the first whitespace-separated token of the education field.
job_info['学历需要'] = job_info['学历需要'].map(lambda text: text.split()[0])

# 公司规模字段处理 (company-size column clean-up)

# Raw company-size label -> compact range label.
_COMPANY_SIZE_LABELS = {
    '少于50人': "<50",
    '50-150人': "50-150",
    '150-500人': '150-500',
    '500-1000人': '500-1000',
    '1000-5000人': '1000-5000',
    '5000-10000人': '5000-10000',
    '10000人以上': ">10000",
}


def func(x):
    """Translate a raw company-size label into a compact range string.

    Args:
        x: The raw label, e.g. '50-150人'. Surrounding whitespace is ignored.

    Returns:
        The matching range string (e.g. "50-150"), or ``np.nan`` when ``x`` is
        not a string or is not one of the known labels.
    """
    # Non-strings (NaN, numbers, lists) can never match a label; returning
    # early also avoids hashing an unhashable value in the lookup below.
    if not isinstance(x, str):
        return np.nan
    return _COMPANY_SIZE_LABELS.get(x.strip(), np.nan)

# Replace each raw company-size value with the result of func.
job_info['公司规模'] = job_info['公司规模'].map(func)

# 公司福利字段处理 (company-benefits column clean-up)

# Stringify each value and split it on whitespace, giving a list of tokens per
# row (a missing value is stringified first, so it also yields a list).
job_info['公司福利'] = job_info['公司福利'].map(lambda perks: str(perks).split())

# 职位信息字段处理 (job-description column clean-up)

# Keep only the text that precedes the first "职能类别" marker.
job_info['职位信息'] = job_info['职位信息'].apply(lambda x: x.split('职能类别')[0])
# Load the stop-word list; the file is split on whitespace, so each
# whitespace-separated token is one stop word.
with open(r"E:\C++\停用词表.txt", 'r', encoding='utf8') as f:
    stopword = f.read()

stopword = stopword.split()
# Set copy for O(1) membership tests; `stopword` itself stays a list.
stopword_set = set(stopword)
# Lower-case, trim, segment with jieba, then drop stop words. The former
# `"".join(x)` step was removed: joining the characters of a str with ""
# returns the same str.
job_info['职位信息'] = (
    job_info['职位信息']
    .apply(lambda x: x.lower().strip())
    .apply(jieba.lcut)
    .apply(lambda tokens: [i for i in tokens if i not in stopword_set])
)
# Build a long table with one row per (token, industry) pair, iterating the
# industries in descending order of posting count (the order value_counts
# returns them in).
cons = job_info['公司畛域'].value_counts()
industries = pd.DataFrame(cons.index, columns=['行业畛域'])
industry = pd.DataFrame(columns=['分词明细', '行业畛域'])
frames = []
for i in industries['行业畛域']:
    words = []
    # dropna() returns a new Series instead of mutating a derived selection
    # in place.
    word = job_info.loc[job_info['公司畛域'] == i, '职位信息'].dropna()
    for z in word:
        if isinstance(z, list):
            # Token lists are used directly. Round-tripping them through
            # str() escaped characters (a newline became backslash-n) and
            # stripped leading/trailing quote and bracket tokens.
            words.extend(z)
        else:
            # Fallback for values stored as the textual form of a list.
            words.extend(str(z).strip('\'[]').split("\', \'"))
    if words:
        df1 = pd.DataFrame({'分词明细': words, '行业畛域': i})
        frames.append(df1)

# DataFrame.append was removed in pandas 2.0; concatenate once at the end.
# With no frames collected, `industry` stays the empty two-column frame.
if frames:
    industry = pd.concat(frames, ignore_index=True)

# Discard tokens that are a bare newline or an empty string.
industry = industry[~industry['分词明细'].isin(["\n", ""])]
# Token frequencies as a one-column frame. Naming the column explicitly keeps
# `count['分词明细']` valid on pandas >= 2.0, where the value_counts result is
# named "count" rather than after the source column.
count = industry['分词明细'].value_counts().to_frame(name='分词明细')
# Keep only tokens that occur at least 300 times overall.
lst = list(count[count['分词明细'] >= 300].index)
industry = industry[industry['分词明细'].isin(lst)]

# 数据存储 (save the results)

# Write the per-industry token table and the cleaned postings to their
# respective workbooks.
industry.to_excel(excel_writer=r'E:\python爬虫\数据预处理\词云.xlsx')
job_info.to_excel(excel_writer=r'E:\python爬虫\数据预处理\前程无忧(已荡涤).xlsx')