python爬取拉勾网

又到了一年一度的招聘热季，大量的工作向咱们招手，明天我和大家一起看看拉勾网中各公司对于python人才的需要。

import jieba

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

from pyecharts import Geo

from wordcloud import WordCloud

import re

import matplotlib

from imageio import imread

url=”https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false”

def data(page):

return {

“first”: “true”,

“pn”: f”{page}”,

“kd”: “python”,

‘sid’: ‘4256fece2141497bb5a8e1bfa69bcee7’

}

def get_cookies():

headers={

‘origin’: ‘https://www.lagou.com’,

‘referer’: ‘https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=’,

‘authority’: ‘www.lagou.com’,

‘user-agent’: ‘Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36’,

}

response=requests.get(‘https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=’,headers=headers)

return response.cookies.get_dict()

cookies=get_cookies()

headers={‘user-agent’: ‘Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36’

,’host’:’www.lagou.com’

,’origin’: ‘https://www.lagou.com’

,’referer’: ‘https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=’}

def get_data(data):

response = requests.post(url=url, headers=headers, data=data, cookies=cookies)

# json数据

content = response.json()[‘content’][‘positionResult’][‘result’]

j = 1

companyLabelstr=”

for i in content:

city = i[‘city’]

companyFullName = i[‘companyFullName’]

companySize = i[‘companySize’]

education = i[‘education’]

positionName = i[‘positionName’]

salary = i[‘salary’]

workYear = i[‘workYear’]

companyLabelList=i[‘companyLabelList’]

if len(companyLabelList)>0:

companyLabelList=”.join(companyLabelList)

else:

companyLabelList=”

”’

companyLabelstr=companyLabelList+companyLabelstr

print(workYear,companyLabelList)

print(companyLabelstr)

”’

with open(‘python.csv’, ‘a+’, encoding=’utf-8′)as f:

f.write(f'{city},{companyFullName},{companySize},{education},{positionName},{salary},{workYear},{companyLabelList}\n’)

print(f’第{j}条数据胜利’)

j += 1

if __name__ == ‘__main__’:

for i in range(1, 11):

params = data(i)

get_data(params)

上面对爬取的文本进行剖析
XM返佣https://www.kaifx.cn/broker/x…

matplotlib.rcParams[‘font.family’]=’SimHei’

plt.rcParams[‘axes.labelsize’]=16

plt.rcParams[‘xtick.labelsize’]=14

plt.rcParams[‘ytick.labelsize’]=14

plt.rcParams[‘legend.fontsize’]=12

plt.rcParams[‘figure.figsize’]=[15,9]

data=pd.read_excel(r’C:\Users\2020\Desktop\python2.xls’,encoding=’utf-8′)

1.学历

data[‘学历’].value_counts().plot(kind=’bar’,rot=0)

2.工作教训

data[‘年限’].value_counts().plot(kind=’bar’,rot=0,color=’g’)

3.城市剖析

plt.rcParams[‘figure.figsize’]=[15,15]

data[‘城市’].value_counts().plot(kind=’pie’,autopct=’%1.2f%%’,explode=np.linspace(0,1.5,18))

4.公司待遇剖析

(1)分词操作

a=len(data[‘公司福利’])

str=”

for i in range(a):

b=data[‘公司福利’][i]

if type(b)==float:

b=”

str=str+b

jieba.add_word(‘五险一金’)

jieba.add_word(‘牛B’)

jieba.add_word(‘年底双薪’)

jieba.add_word(‘带薪年假’)

jieba.add_word(‘股票期权’)

jieba.add_word(‘定期体检’)

jieba.add_word(‘节日礼物’)

words = jieba.lcut(str)

counts = {}

for word in words:

counts[word] = counts.get(word, 0) + 1

items = list(counts.items())

items.sort(key=lambda x: x[1], reverse=True)

with open(‘词频统计’,mode=’w’,encoding=’utf-8′)as f:

for i in range(20):

word,count=items[i]

f.writelines(‘{}\t{}\n’.format(word,count))

(2)词云图展现

with open(‘词频统计’,mode=’r’,encoding=’utf-8′)as f:

text=f.read()

wc=WordCloud(font_path=r’C:\Users\2020\Desktop\simhei.ttf’,background_color=’white’,width=1000,max_words=100,height=860,margin=2).generate(text)

plt.imshow(wc)

plt.axis(‘off’)

plt.show()

5.全国工资水平剖析

data2=list(map(lambda x:(data[‘城市’][x],eval(re.split(‘k|K’,data[‘工资’][x])[0])*1000),range(len(data))))

data3=pd.DataFrame(data2,index)

data4=list(map(lambda x:(data3.groupby(0).mean()[1].index[x],data3.groupby(0).mean()[1].values[x]),range(len(data3.groupby(0)))))

geo=Geo(‘全国python工资布局’,’制作人：止疼’,title_color=’#fff’,title_pos=’left’,width=1200,height=600,background_color=’#404a59′)

attr,value=geo.cast(data4)

geo.add(”,attr,value,type=’heatmap’,is_visualmap=True,maptype=’china’,visual_range=[0,300],visual_text_color=’#fff’)

geo.render()

python爬取拉勾网

评论

发表回复取消回复

更多文章

DDN HPC 存储硬件架构设计深度分析

探秘IO500：从Lustre并行文件系统出发，开启HPC存储性能新征程

苹果iOS打包的ipa应用无法安装？一篇文章带你了解可能的原因及排查方法

图解Golang：从零开始实现简易版过期LRU缓存

python爬取拉勾网

评论

发表回复 取消回复

更多文章

DDN HPC 存储硬件架构设计深度分析

探秘IO500：从Lustre并行文件系统出发，开启HPC存储性能新征程

苹果iOS打包的ipa应用无法安装？一篇文章带你了解可能的原因及排查方法

图解Golang：从零开始实现简易版过期LRU缓存

发表回复取消回复