I recently had a task to scrape a logistics company's official website. At first glance it looked like mostly static pages, fairly easy to crawl, nothing like news portals or e-commerce sites with their complex layouts, heavy AJAX, and strict anti-scraping measures, and I was quietly pleased with myself. Then, digging further, I found these were no ordinary static pages.
Take this URL, for example: I needed the distribution of logistics parks across all the major cities in China, plus the details of each park. The page has an embedded map, and each city's logistics information only appears when you click its marker on the map:
https://www.glprop.com.cn/our...

At first I wondered whether the data came in via an AJAX request, but capturing traffic with Chrome DevTools turned up nothing. So I checked the page source and found all of the city information sitting inside a script block, as shown in the screenshot:

The details for each individual park, in turn, are stored in a JS variable of the form var parks = {...}.

So it was all right there in the source: fetch the page, pull the data out with a regex, and get to work.
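Stripped of the Scrapy scaffolding, the core extraction step looks roughly like this. A minimal sketch using requests, assuming (as this page does) that each variable is declared on a single line so the non-multiline regex can grab it, and that the value is valid JSON:

import re, json
import requests

# Fetch the raw HTML of the network-detail page
html = requests.get('https://www.glprop.com.cn/our-network/network-detail.html').text

# The data is embedded as JS variables, e.g. "var cities = [...];" and "var parks = {...};"
cities = json.loads(re.findall(r'var cities =(.*);', html)[0])
parks = json.loads(re.findall(r'var parks =(.*);', html)[0])

print(len(cities), 'cities found')

The same two regexes appear in the spider's final callback below.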
item:

import scrapy

# 普洛斯 (GLP) items
class PuluosiNewsItem(scrapy.Item):
    newstitle = scrapy.Field()
    newtiems = scrapy.Field()
    newslink = scrapy.Field()

class PuluosiItem(scrapy.Item):
    assetstitle = scrapy.Field()
    assetaddress = scrapy.Field()
    assetgaikuang = scrapy.Field()
    assetpeople = scrapy.Field()
    asseturl = scrapy.Field()

pipelines:

from openpyxl import Workbook
from news.items import PuluosiNewsItem, PuluosiItem

class PuluosiNewsPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        # Header row for the news sheet
        self.ws.append(['普洛斯新闻标题', '新闻发布时间', '新闻URL'])
        self.wb2 = Workbook()
        self.ws2 = self.wb2.active
        self.ws2.append(['资产标题', '资产地址', '资产概况', '其他信息', 'URL'])

    def process_item(self, item, spider):
        if isinstance(item, PuluosiNewsItem):
            # Pull each field out of the item into one spreadsheet row
            line = [item['newstitle'], item['newtiems'], item['newslink']]
            self.ws.append(line)
            self.wb.save('PuluosiNews.xlsx')  # save the xlsx file
        elif isinstance(item, PuluosiItem):
            line = [item['assetstitle'], item['assetaddress'], item['assetgaikuang'], item['assetpeople'], item['asseturl']]
            self.ws2.append(line)
            self.wb2.save('PuluosiAsset.xlsx')  # save the xlsx file
        return item
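One caveat with this pipeline: wb.save() runs on every single item, so the whole workbook gets rewritten to disk for each row. For a few hundred rows that's tolerable; a cleaner pattern is to append rows in process_item and write the files once in the pipeline's close_spider hook, which Scrapy calls when the spider finishes. A sketch of that variant, keeping the same __init__ as above:

class PuluosiNewsPipeline(object):
    # ... same __init__ as above ...

    def process_item(self, item, spider):
        if isinstance(item, PuluosiNewsItem):
            self.ws.append([item['newstitle'], item['newtiems'], item['newslink']])
        elif isinstance(item, PuluosiItem):
            self.ws2.append([item['assetstitle'], item['assetaddress'],
                             item['assetgaikuang'], item['assetpeople'], item['asseturl']])
        return item

    def close_spider(self, spider):
        # Write both workbooks to disk once, after all items have been processed
        self.wb.save('PuluosiNews.xlsx')
        self.wb2.save('PuluosiAsset.xlsx')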

spider:

# -*- coding: utf-8 -*-
import scrapy, re, json
from news.items import PuluosiNewsItem, PuluosiItem
from scrapy.linkextractors import LinkExtractor

class PuluosiSpider(scrapy.Spider):
    name = 'puluosi'
    allowed_domains = ['glprop.com.cn']
    # start_urls = ['https://www.glprop.com.cn/press-releases.html']

    def start_requests(self):
        yield scrapy.Request('https://www.glprop.com.cn/press-releases.html', self.parse1)
        yield scrapy.Request('https://www.glprop.com.cn/in-the-news.html', self.parse2)
        yield scrapy.Request('https://www.glprop.com.cn/proposed-privatization.html', self.parse3)
        yield scrapy.Request('https://www.glprop.com.cn/our-network/network-detail.html', self.parse4)

    def parse1(self, response):
        print('Spider running: puluosi')
        web = response.xpath('//tbody/tr')
        web.pop(0)  # drop the table header row
        for node in web:
            item = PuluosiNewsItem()  # fresh item per row
            item['newstitle'] = node.xpath('.//a/text()').extract()[0].strip()
            print(item['newstitle'])
            item['newtiems'] = node.xpath('.//td/text()').extract()[0].strip()
            print(item['newtiems'])
            # urljoin builds an absolute link; needed because the href values on the page are relative paths
            item['newslink'] = response.urljoin(node.xpath('.//a/@href').extract()[0])
            # print(item['newslink'])
            yield item
        # Use try/except to check whether this year's news list has a next page
        try:
            next_url_tmp = response.xpath('//div[@class="page"]/a[contains(text(),"下一页")]/@href').extract()[0]
            if next_url_tmp:
                next_url = "https://www.glprop.com.cn" + next_url_tmp
                yield scrapy.Request(next_url, callback=self.parse1)
        except Exception:
            print("Current page has no next page")
        href = response.xpath('//ul[@class="timeList"]/li/a/@href')
        for nexturl in href:
            url1 = nexturl.extract()
            if url1:
                url = "https://www.glprop.com.cn" + url1
                yield scrapy.Request(url, callback=self.parse1)

    def parse2(self, response):
        web = response.xpath('//tbody/tr')
        web.pop(0)  # drop the table header row
        for node in web:
            item = PuluosiNewsItem()
            item['newstitle'] = node.xpath('.//a/text()').extract()[0].strip()
            print(item['newstitle'])
            item['newtiems'] = node.xpath('.//td/text()').extract()[0].strip()
            print(item['newtiems'])
            # urljoin builds an absolute link; needed because the href values on the page are relative paths
            item['newslink'] = response.urljoin(node.xpath('.//a/@href').extract()[0])
            print(item['newslink'])
            yield item
        # Use try/except to check whether this year's news list has a next page
        try:
            next_url_tmp = response.xpath('//div[@class="page"]/a[contains(text(),"下一页")]/@href').extract()[0]
            if next_url_tmp:
                next_url = "https://www.glprop.com.cn" + next_url_tmp
                yield scrapy.Request(next_url, callback=self.parse2)
        except Exception:
            print("Current page has no next page")
        href = response.xpath('//ul[@class="timeList"]/li/a/@href')
        for nexturl in href:
            url1 = nexturl.extract()
            if url1:
                url = "https://www.glprop.com.cn" + url1
                yield scrapy.Request(url, callback=self.parse2)

    def parse3(self, response):
        web = response.xpath('//tbody/tr')
        web.pop()  # drop the last (non-data) row
        for node in web:
            item = PuluosiNewsItem()
            item['newstitle'] = node.xpath('.//a/text()').extract()[0].strip()
            print(item['newstitle'])
            item['newtiems'] = node.xpath('.//td/text()').extract()[0].strip()
            print(item['newtiems'])
            # urljoin builds an absolute link; needed because the href values on the page are relative paths
            item['newslink'] = response.urljoin(node.xpath('.//a/@href').extract()[0])
            print(item['newslink'])
            yield item

    def parse4(self, response):
        # Collect the detail links for every city from the map pop-up
        link = LinkExtractor(restrict_xpaths='//div[@class="net_pop1"]//div[@class="city"]')
        links = link.extract_links(response)
        for i in links:
            detailurl = i.url
            yield scrapy.Request(url=detailurl, callback=self.parse5)

    # Originally a second parse4, which shadowed the method above;
    # renamed to parse5 to match the callback it is registered under.
    def parse5(self, response):
        citycode = re.findall('var cities =(.*);', response.text)
        citycodejson = json.loads("".join(citycode))
        # Map each city's id to its name in a dict
        dictcity = {}
        for i in citycodejson:
            citycodename = i['name']
            citycodenm = i['id']
            dictcity[citycodenm] = citycodename
        detail = re.findall('var parks =(.*);', response.text)
        jsonBody = json.loads("".join(detail))
        parklist = []
        for key1 in jsonBody:
            for key2 in jsonBody[key1]:
                parklist.append(jsonBody[key1][key2])
        for node in parklist:
            item = PuluosiItem()
            assetaddress = node['city_id']
            item['assetaddress'] = dictcity[assetaddress]
            # print(item['assetaddress'])
            item['assetstitle'] = node['name']
            # print(item['assetstitle'])
            item['assetgaikuang'] = node['detail_single'].strip().replace('&nbsp;', '').replace(' ', '')
            # print(item['assetgaikuang'])
            assetpeople = node['description']
            item['assetpeople'] = re.sub(r'<.*?>', '', assetpeople.strip()).replace('&nbsp;', '')
            item['asseturl'] = 'https://www.glprop.com.cn/network-city-detail.html?city=' + item['assetaddress']
            # print(item['assetpeople'])
            yield item
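One detail not shown above: for the pipeline to receive any of these items, it has to be enabled in the project's settings.py. A minimal sketch, assuming the Scrapy project is named news as the imports suggest (the priority value 300 is arbitrary):

# settings.py (assumed project layout: news/settings.py)
ITEM_PIPELINES = {
    'news.pipelines.PuluosiNewsPipeline': 300,
}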

While I was at it, I also scraped the site's news pages.
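From the project root, the whole crawl then runs with the standard Scrapy command, using the name defined on the spider:

scrapy crawl puluosi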