基本使用

'use strict';
const puppeteer = require('puppeteer');

/**
 * Opens the puppeteer package page on npmjs.com, collects size information
 * for every <img> that has a non-zero rendered width and height, and prints
 * the resulting list.
 *
 * Each entry records the rendered size (width/height), the intrinsic size
 * (naturalWidth/naturalHeight) and `isStandard`, which is false when the
 * intrinsic size is at least 10x the rendered size in either dimension.
 */
(async () => {
  const browser = await puppeteer.launch();
  let imgArr = [];

  try {
    const page = await browser.newPage();

    // The listener has to be registered before page.goto(); otherwise the
    // event has already fired by the time we start listening.
    page.on('domcontentloaded', async () => {
      try {
        imgArr = await page.$$eval('img', imgs => {
          const arr = [];
          // The callback receives the whole set of matched elements, so it
          // has to be walked one element at a time.
          for (const img of imgs) {
            const { width, height, naturalWidth, naturalHeight, src } = img;
            // Skip images without a rendered size.
            if (!width || !height) {
              continue;
            }
            arr.push({
              width,
              naturalWidth,
              height,
              naturalHeight,
              isStandard: !(width * 10 <= naturalWidth || height * 10 <= naturalHeight),
              url: src,
              level: 3,
              imageUrl: src,
              describeUrl: '',
              summary: `为了显示${width}x${height}的图片引入了原尺寸为${naturalWidth}x${naturalHeight}的图片`,
            });
          }
          return arr;
        });
      } catch (err) {
        // An event listener's promise is not awaited by anyone, so report the
        // failure here instead of leaving an unhandled rejection.
        console.error('failed to collect images: ', err);
      }
    });

    await page.goto('https://www.npmjs.com/package/puppeteer', { waitUntil: 'networkidle0' });
  } finally {
    // Always shut the browser down, even when navigation fails.
    await browser.close();
  }

  console.log('imgArr: ', imgArr);
})();

顺序不能变 :

  • await puppeteer.launch() 启动
  • await browser.newPage() 打开page
  • page.on 监听事件
  • await page.goto 跳转页面
  • await browser.close() 关闭

如果改变顺序(例如在 page.goto 之后才调用 page.on()),事件在注册监听之前就已经触发,page.on() 将监听不到该事件。

多个URL的使用方法

依次访问 URL 数组中的每个页面,爬取页面上的所有图片,并获取其真实宽高。

/* eslint-disable no-undef */
'use strict';
const puppeteer = require('puppeteer');

/**
 * Visits each URL in `html` one after another with a single page and gathers
 * the rendered and intrinsic dimensions of every <img> found on it.
 */
(async () => {
  const browser = await puppeteer.launch();
  const html = [ 'https://www.npmjs.com/package/puppeteer', 'https://www.iconfont.cn/search/index?searchType=icon&q=test' ];
  // NOTE(review): the collected list is neither printed nor returned by this
  // script; consume it after the loop as needed.
  const arr = [];

  try {
    const page = await browser.newPage();

    // Navigate sequentially: the same page object is reused for every URL.
    for (const url of html) {
      await page.goto(url, { waitUntil: 'domcontentloaded' });
      // Rejects if no <img> shows up within 3 seconds.
      await page.waitForSelector('img', { timeout: 3000 });

      // Runs in the browser context, where `document` is available.
      const doms = await page.evaluate(() => {
        return Array.from(document.querySelectorAll('img'), v => ({
          naturalWidth: v.naturalWidth,
          naturalHeight: v.naturalHeight,
          width: v.width,
          height: v.height,
        }));
      });

      arr.push(...doms);
    }
  } finally {
    // Always shut the browser down, even when a navigation or wait fails.
    await browser.close();
  }
})();

此方法大致参考了 Stack Overflow 上的以下回答:

  • Crawling multiple URL in a loop using puppeteer
  • Looping through a set of urls in Puppeteer
  • Puppeteer - Proper way to loop through multiple URLs