关于python:爬虫爬取贵阳房价Python实现

36次阅读

共计 3095 个字符,预计需要花费 8 分钟才能阅读完成。

# ================== 导入相关库 ==================================

from bs4 import BeautifulSoup
import numpy as np
import requests
from requests.exceptions import RequestException
import pandas as pd

# ============= 读取网页 =========================================

def craw(url,page):
    """Download ``url`` and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        page: Page number; used only in the failure message.

    Returns:
        The response text as ``str``, or ``None`` when the request fails
        (connection error, timeout, or an HTTP 4xx/5xx status).
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
    try:
        response = requests.request("GET", url, headers=headers, timeout=10)
        # An error status still carries a body; raise so that an error page
        # is reported as a failed fetch instead of being handed to the parser.
        response.raise_for_status()
    except RequestException:
        print('第 {0} 读取网页失败'.format(page))
        return None
    # Set the encoding explicitly before reading ``.text`` so the body is
    # decoded as UTF-8 rather than with whatever requests guesses.
    response.encoding = 'utf-8'
    return response.text

# ========== 解析网页并保存数据到表格 ======================

def _joined_text(node, selector, sep=' '):
    """Return the text of every element matching ``selector`` under ``node``, joined by ``sep``."""
    return sep.join(tag.get_text() for tag in node.select(selector))


def pase_page(url,page):
    """Scrape one listing page and append its listings to ``贵阳房价.csv``.

    The page is fetched with ``craw``. Every ``li`` under
    ``.resblock-list-wrapper`` is treated as one listing and becomes one CSV
    row of eight text fields: name, type, sale status, district, street
    address, tags, average price and total price. Rows are appended without a
    header line.

    Args:
        url: Address of the listing page.
        page: Page number; used for the fetch and for progress messages.

    Returns:
        None. Prints a success message after processing the page, or a
        failure message when the page could not be fetched.
    """
    html = craw(url, page)
    # Test for None *before* any str() conversion: str(None) is the non-None
    # string 'None', which would make this check always pass and report a
    # failed download as a stored page.
    if html is None:
        print('解析失败')
        return

    soup = BeautifulSoup(html, 'lxml')
    rows = []
    for house in soup.select('.resblock-list-wrapper li'):
        rows.append([
            _joined_text(house, '.resblock-name a.name'),                 # project name
            _joined_text(house, '.resblock-name span.resblock-type'),     # property type
            _joined_text(house, '.resblock-name span.sale-status'),       # sale status
            # District parts are concatenated with no separator, unlike the
            # other fields.
            _joined_text(house, '.resblock-location span', sep=''),
            _joined_text(house, '.resblock-location a'),                  # street address
            _joined_text(house, '.resblock-tag span'),                    # tags
            _joined_text(house, '.resblock-price .main-price .number'),   # average price
            _joined_text(house, '.resblock-price .second'),               # total price
        ])

    if rows:
        # The column labels are never written (header=False); note that the
        # '劣势' label is attached to the tag text collected from
        # '.resblock-tag span'.
        table = pd.DataFrame(rows, columns=['名称', '类型', '销售状态','大地址','具体地址','劣势','均价','总价'])
        # One append per page rather than one per listing: fewer file opens,
        # and a page's rows are written together.
        table.to_csv('贵阳房价.csv', mode='a+', index=False, header=False)
    print('第 {0} 页存储数据胜利'.format(page))

# ================== 双线程 =====================================

import threading

# Crawl listing pages 1-100, two pages at a time: each iteration handles
# page i on one thread and page i+1 on another.
for i in range(1, 100, 2):
    url1 = "https://gy.fang.lianjia.com/loupan/pg" + str(i) + "/"
    url2 = "https://gy.fang.lianjia.com/loupan/pg" + str(i + 1) + "/"
    t1 = threading.Thread(target=pase_page, args=(url1, i))      # thread 1
    t2 = threading.Thread(target=pase_page, args=(url2, i + 1))  # thread 2
    t1.start()
    t2.start()
    # Wait for this pair before starting the next one. Without the joins the
    # loop would start all 100 threads almost at once, all appending to the
    # same CSV file.
    t1.join()
    t2.join()

正文完
 0