download:Java架构师-十项全能【完结无密】

#!/usr/bin/python

from bs4 import BeautifulSoup
import requests
def getHouseList(url):
    """Scrape one Lianjia search-result page and return rows of listing data.

    Each row is [title, detail_link, community, layout, area, total_price];
    trailing fields are simply absent when the page omits the matching div.

    Fix vs. original: the houseInfo/totalPrice loops indexed `house[i]` and
    `infos[2]` unguarded, raising IndexError whenever the page had more info
    divs than titles or fewer than three '|'-separated fields.
    """
    house = []
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER'}
    # Browser-like UA gets past the site's basic bot filtering.
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.content, 'lxml')
    # Listing title + link to its detail page.
    housename_divs = soup.find_all('div', class_='title')
    for housename_div in housename_divs:
        housename_as = housename_div.find_all('a')
        for housename_a in housename_as:
            house.append([housename_a.get_text(), housename_a['href']])
    # Community name / layout / floor area arrive as one '|'-joined string.
    huseinfo_divs = soup.find_all('div', class_='houseInfo')
    for i in range(min(len(huseinfo_divs), len(house))):
        infos = huseinfo_divs[i].get_text().split('|')
        # Take at most the first three fields; some listings omit one.
        house[i].extend(infos[:3])
    # Total asking price per listing.
    house_prices = soup.find_all('div', class_='totalPrice')
    for i in range(min(len(house_prices), len(house))):
        house[i].append(house_prices[i].get_text())
    return house

爬取房屋详细信息:所在区域、套内面积

def houseinfo(url):
    """Fetch one listing's detail page; return [district, inner_area_sum]."""
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER'}
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'lxml')
    msg = []
    # District: text of the first span.info anchor with a real (non-JS) href.
    for span in soup.find_all('span', class_='info'):
        anchor = span.find('a')
        if not anchor:
            continue
        if anchor['href'].startswith('javascript'):
            continue
        msg.append(anchor.get_text())
        break
    # Inner floor area: sum the per-room figures under #infoList, dropping the
    # two-character unit suffix; non-numeric cells are skipped.
    areas = []
    for section in soup.find_all('div', id='infoList'):
        for col in section.find_all('div', class_='col'):
            cell = col.get_text()
            try:
                areas.append(float(cell[:-2]))
            except ValueError:
                continue
    msg.append(sum(areas))
    return msg

将房源信息写入txt文件

def writeFile(houseinfo):
    """Append one pre-formatted line of listing info to the output text file.

    Fix vs. original: the handle was opened/closed manually, leaking the file
    descriptor if `write` raised; `with` guarantees closure on every path.
    """
    with open('d:/房源.txt', 'a', encoding='utf8') as f:
        f.write(houseinfo + '\n')

主函数

def main():
    """Crawl result pages 1-99, enrich every listing with its detail-page
    data (district, inner area), and append each row to the txt file."""
    for page_no in range(1, 100):
        print('-----分隔符', page_no, '-------')
        # Page 1 has no 'pg' prefix in its URL.
        if page_no == 1:
            url = 'https://sjz.lianjia.com/ershoufang/hy1f2f5sf1l3l2l4a2a3a4/'
        else:
            url = 'https://sjz.lianjia.com/ershoufang/pg' + str(page_no) + 'hy1f2f5sf1l3l2l4a2a3a4/'
        for record in getHouseList(url):
            detail_link = record[1]
            # Only absolute http(s) links point at a scrapeable detail page.
            if not detail_link.startswith('http'):
                continue
            record.extend(houseinfo(detail_link))
            print(record)
            writeFile(" ".join(str(field) for field in record))

# Script entry point. Fix vs. original: bare `name` is a NameError at module
# scope — the dunder `__name__` is required for the import guard to work.
if __name__ == '__main__':
    main()

从链家网站查询到8849条房源信息,然而页面只能显示31(每页数量)*100(总页码)=3100条房源,其余没找到。

第二版:

获取某个小区的房源信息,并写入excel。

#!/usr/bin/python

from bs4 import BeautifulSoup
import requests
import xlwt
def getHouseList(url):
    """Scrape one Lianjia search-result page and return rows of listing data.

    Each row is [title, detail_link, community, layout, area, total_price];
    trailing fields are simply absent when the page omits the matching div.
    Note: the link may be None because `.get('href')` is used — the caller
    filters those rows out.

    Fix vs. original: the houseInfo/totalPrice loops indexed `house[i]` and
    `infos[2]` unguarded, raising IndexError whenever the page had more info
    divs than titles or fewer than three '|'-separated fields.
    """
    house = []
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER'}
    # Browser-like UA gets past the site's basic bot filtering.
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.content, 'html.parser')
    # Listing title + link to its detail page.
    housename_divs = soup.find_all('div', class_='title')
    for housename_div in housename_divs:
        housename_as = housename_div.find_all('a')
        for housename_a in housename_as:
            house.append([housename_a.get_text(), housename_a.get('href')])
    # Community name / layout / floor area arrive as one '|'-joined string.
    huseinfo_divs = soup.find_all('div', class_='houseInfo')
    for i in range(min(len(huseinfo_divs), len(house))):
        infos = huseinfo_divs[i].get_text().split('|')
        # Take at most the first three fields; some listings omit one.
        house[i].extend(infos[:3])
    # Total asking price per listing.
    house_prices = soup.find_all('div', class_='totalPrice')
    for i in range(min(len(house_prices), len(house))):
        house[i].append(house_prices[i].get_text())
    return house

爬取房屋详细信息:所在区域、套内面积

def houseinfo(url):
    """Fetch one listing's detail page; return [district, inner_area_sum]."""
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER'}
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    msg = []
    # District: text of the first span.info anchor with a real (non-JS) href.
    for span in soup.find_all('span', class_='info'):
        anchor = span.find('a')
        if not anchor:
            continue
        if anchor['href'].startswith('javascript'):
            continue
        msg.append(anchor.get_text())
        break
    # Inner floor area: sum the per-room figures under #infoList, dropping the
    # two-character unit suffix; non-numeric cells are skipped.
    areas = []
    for section in soup.find_all('div', id='infoList'):
        for col in section.find_all('div', class_='col'):
            cell = col.get_text()
            try:
                areas.append(float(cell[:-2]))
            except ValueError:
                continue
    msg.append(sum(areas))
    return msg

将房源信息写入excel文件

def writeExcel(excelPath, houses):
    """Write the collected listing rows into an .xls workbook at excelPath."""
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('git')
    # Header row at row 0.
    header = ['题目', '链接地址', '户型', '面积', '朝向', '总价', '所属区域', '套内面积']
    for col, title in enumerate(header):
        sheet.write(0, col, title)
    # One spreadsheet row per listing, offset by 1 for the header.
    for row, house in enumerate(houses):
        print(house)
        for col, value in enumerate(house):
            sheet.write(row + 1, col, value)
    workbook.save(excelPath)

主函数

def main():
    """Crawl result pages 1-4 for one residential community, enrich each
    listing with detail-page data, and export everything to an Excel file."""
    data = []
    for page_no in range(1, 5):
        print('-----分隔符', page_no, '-------')
        # Page 1 has no 'pg' prefix in its URL.
        if page_no == 1:
            url = 'https://sjz.lianjia.com/ershoufang/l2rs%E5%92%8C%E5%B9%B3%E4%B8%96%E5%AE%B6/'
        else:
            url = 'https://sjz.lianjia.com/ershoufang/pg' + str(page_no) + 'l2rs%E5%92%8C%E5%B9%B3%E4%B8%96%E5%AE%B6/'
        houses = getHouseList(url)
        for house in houses:
            detail_link = house[1]
            # Skip rows whose href was missing or not an absolute http link.
            if not detail_link or not detail_link.startswith('http'):
                continue
            house.extend(houseinfo(detail_link))
        data.extend(houses)
    writeExcel('d:/house.xls', data)

if name == '__main__':