puppeteer-多URL爬取

43次阅读

共计 1860 个字符,预计需要花费 5 分钟才能阅读完成。

基本使用

'use strict';
const puppeteer = require('puppeteer');

/**
 * Single-URL example: opens one page, collects size information about every
 * <img> as soon as the DOM is ready, then prints the collected list.
 *
 * Each collected entry holds the rendered size (width/height), the intrinsic
 * size (naturalWidth/naturalHeight), the image URL and an `isStandard` flag
 * that is false when the intrinsic size is at least 10x the rendered size in
 * either dimension. Images with a zero rendered width or height are skipped.
 */
(async () => {
  const browser = await puppeteer.launch();
  let imgArr = [];
  // Holds the in-flight collection started by the event listener so that the
  // browser is not closed while page.$$eval() is still running.
  let collecting = Promise.resolve();

  try {
    const page = await browser.newPage();

    // The callback passed to $$eval runs inside the page, so it can only use
    // what it receives as arguments and must return serializable data.
    const collectImages = () => page.$$eval('img', (imgs) => {
      const found = [];
      for (const img of imgs) {
        const { width, height, naturalWidth, naturalHeight, src } = img;
        const entry = {
          width,
          naturalWidth,
          height,
          naturalHeight,
          isStandard: !(width * 10 <= naturalWidth || height * 10 <= naturalHeight),
          url: src,
          level: 3,
          imageUrl: src,
          describeUrl: '',
          summary: ` 为了显示 ${width}x${height} 的图片引入了原尺寸为 ${naturalWidth}x${naturalHeight} 的图片 `,
        };
        // Skip images that are not rendered (zero width or height).
        if (entry.width && entry.height) {
          found.push(entry);
        }
      }
      return found;
    });

    // The listener must be registered before page.goto(); otherwise the
    // event has already fired by the time we start listening.
    page.on('domcontentloaded', () => {
      collecting = collectImages().then((found) => {
        imgArr = found;
      });
      // Mark the promise as handled for now; a failure is re-thrown by the
      // `await collecting` below instead of becoming an unhandled rejection.
      collecting.catch(() => {});
    });

    await page.goto('https://www.npmjs.com/package/puppeteer', { waitUntil: 'networkidle0' });
    await collecting;
  } finally {
    // Always release the browser process, even when navigation fails.
    await browser.close();
  }

  console.log('imgArr:', imgArr);
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

顺序不能变:

  • await puppeteer.launch() 启动
  • await browser.newPage() 打开 page
  • page.on 监听事件
  • await page.goto 跳转页面
  • await browser.close() 关闭

如果顺序改变(例如在 page.goto 之后才调用 page.on),监听器将无法收到对应的事件。

多个 URL 的使用方法

依次爬取 URL 数组中每个页面上的所有图片,并收集它们的真实宽高。

/* eslint-disable no-undef */
'use strict';
const puppeteer = require('puppeteer');

/**
 * Multi-URL example: visits each URL in turn with a single page, reads the
 * rendered and intrinsic size of every <img> on it, and prints the combined
 * list once all URLs have been processed.
 *
 * URLs are processed sequentially because they share one page object.
 */
(async () => {
  const browser = await puppeteer.launch();
  const arr = [];

  try {
    const page = await browser.newPage();

    const html = ['https://www.npmjs.com/package/puppeteer', 'https://www.iconfont.cn/search/index?searchType=icon&q=test'];

    for (const url of html) {
      await page.goto(url, { waitUntil: 'domcontentloaded' });
      // Rejects if no <img> shows up within 3 seconds, which aborts the
      // remaining URLs; the finally block below still closes the browser.
      await page.waitForSelector('img', { timeout: 3000 });
      // The callback runs inside the page, hence the use of `document`.
      const doms = await page.evaluate(() => [...document.querySelectorAll('img')].map((v) => ({
        naturalWidth: v.naturalWidth,
        naturalHeight: v.naturalHeight,
        width: v.width,
        height: v.height,
      })));
      arr.push(...doms);
    }
  } finally {
    // Always release the browser process, even when a navigation or wait fails.
    await browser.close();
  }

  // Print the result so the collected sizes are actually observable.
  console.log('arr:', arr);
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

此方法大致参考了 Stack Overflow 上的答案:

  • Crawling multiple URL in a loop using puppeteer
  • Looping through a set of urls in Puppeteer
  • Puppeteer – Proper way to loop through multiple URLs

正文完
 0