import urllib.request
from bs4 import BeautifulSoup
import time
import pymysql
def headers_request(url):
    # Build a Request object that carries a browser User-Agent so 51job does not reject us
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=headers)
    return request
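For comparison, the same header-carrying fetch can be written with the third-party requests library. This is only a sketch and not part of the original script; fetch_page is a hypothetical helper, and the GBK decoding mirrors what main() does further down.

    import requests

    def fetch_page(url):
        # Same browser User-Agent as headers_request()
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        # 51job pages are served as GBK, so set the encoding explicitly before reading text
        response.encoding = 'gbk'
        return response.text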

Parse the content

def parse_content(content, db):
    # Build the soup object
    soup = BeautifulSoup(content, 'lxml')
    # Find the div that contains all job listings
    odivbox = soup.find('div', id='resultList')
    # The first div.el is the header row, so skip it
    odiv_list = odivbox.find_all('div', class_='el')[1:]
    # print(len(odiv_list))
    for odiv in odiv_list:
        # Job title
        jobname = odiv.select('.t1 > span > a')[0]['title']
        # Company name
        company = odiv.select('.t2 > a')[0]['title']
        # Work location
        area = odiv.select('.t3')[0].string
        # Monthly salary
        salary = odiv.select('.t4')[0].string
        # Publish date
        publish_time = odiv.select('.t5')[0].string
        # print(salary, publish_time)
        # Collect the fields into a dict
        item = {
            'jobname': jobname,
            'company': company,
            'area': area,
            'salary': salary,
            'publish_time': publish_time
        }
        # Save to a file instead:
        # string = str(item) + '\n'
        # fp.write(string)
        # Save to MySQL
        save_to_mysql(db, item)
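The CSS selectors above assume each listing row looks roughly like the snippet below. This is an illustrative, self-contained sketch for testing the selectors offline; the markup and sample values are made up, not real 51job output.

    from bs4 import BeautifulSoup

    sample = """
    <div id="resultList">
      <div class="el title">header row</div>
      <div class="el">
        <p class="t1"><span><a title="Python Developer">Python Developer</a></span></p>
        <span class="t2"><a title="Example Co.">Example Co.</a></span>
        <span class="t3">Beijing</span>
        <span class="t4">15-25k/month</span>
        <span class="t5">09-18</span>
      </div>
    </div>
    """

    soup = BeautifulSoup(sample, 'lxml')
    # Skip the header row, exactly as parse_content() does
    row = soup.find('div', id='resultList').find_all('div', class_='el')[1:][0]
    print(row.select('.t1 > span > a')[0]['title'])   # Python Developer
    print(row.select('.t3')[0].string)                # Beijing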

def save_to_mysql(db, item):
    # Get a cursor
    cur = db.cursor()
    # Build the insert statement
    sql = """insert into work(jobname, company, area, salary, publish_time)
             values('%s', '%s', '%s', '%s', '%s')""" % (
        item['jobname'], item['company'], item['area'],
        item['salary'], item['publish_time'])
    # print(sql)
    try:
        cur.execute(sql)
        # Commit the transaction
        db.commit()
    except Exception as e:
        # print(e)
        # Roll back on error
        db.rollback()
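Interpolating values straight into the SQL string breaks as soon as a field contains a quote and leaves the insert open to SQL injection. A sketch of a safer variant using pymysql's parameter binding; save_to_mysql_param is a hypothetical name, and the columns mirror the function above.

    def save_to_mysql_param(db, item):
        sql = ("insert into work(jobname, company, area, salary, publish_time) "
               "values(%s, %s, %s, %s, %s)")
        try:
            # Let the driver quote and escape each value
            with db.cursor() as cur:
                cur.execute(sql, (item['jobname'], item['company'], item['area'],
                                  item['salary'], item['publish_time']))
            db.commit()
        except Exception:
            # Roll back on any failure so the connection stays usable
            db.rollback()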

def main():
    # fp = open('work.txt', 'w', encoding='utf8')
    # Connect to the database
    db = pymysql.connect(host="xxxx", user="xxxx", password="xxxxxx",
                         db="xx", port=xxxx, charset='utf8')
    # Ask the user for the search keyword
    keyword = input('Enter the keyword to search for: ')
    # Ask the user for the start and end page numbers
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    # Base URL to format with the keyword and page number
    url = 'https://search.51job.com/list/010000,000000,0000,00,9,99,{},2,{}.html'
    # Crawl each page in turn
    for page in range(start_page, end_page + 1):
        print('Crawling page %s......' % page)
        # Build the page URL
        url_page = url.format(keyword, page)
        # print(url_page)
        # Build the request object
        request = headers_request(url_page)
        # Send the request and read the response (51job serves GBK)
        content = urllib.request.urlopen(request).read().decode('gbk')
        # Parse the listings and save them
        parse_content(content, db)
        print('Finished crawling page %s' % page)
        time.sleep(2)
    # fp.close()
    db.close()

if __name__ == '__main__':
    main()
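The script assumes the work table already exists. A minimal schema matching the INSERT in save_to_mysql could be created with a helper like this; the column types and lengths are assumptions, not taken from the original post.

    def create_work_table(db):
        # Columns match the insert statement in save_to_mysql
        ddl = """
        create table if not exists work (
            id int primary key auto_increment,
            jobname varchar(255),
            company varchar(255),
            area varchar(64),
            salary varchar(64),
            publish_time varchar(32)
        ) default charset=utf8
        """
        with db.cursor() as cur:
            cur.execute(ddl)
        db.commit()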