Puppeteer 生成图片 生成 PDF

时间:2024-07-09 14:18:50
方案背景

2022 年时,团队权限比较受限,既不能开新页面,也不能开新服务;同样是生成图片的需求,性能要求也比较高,因此当时采用了基于 html2canvas 的前端生成方案,接入了十几个核心业务模块,在 Web、H5、App、Electron PC App 下都能在 300ms~2500ms 内完成图片的生成和下载。

今年又遇到同样的需求,但业务页面里内嵌了一个由几十张图表组成的可视化大屏 iframe,html2canvas 处理 iframe 内容比较吃力;加上团队现在有权限使用服务端截屏方案,于是改用 Puppeteer。

生成 PDF的额外说明

直接用 Puppeteer 生成 PDF 走不通,因此方案改为:先用 Puppeteer 截出整页图片,再用 sharp 手动地把图片按 PDF 页面大小的等比例尺寸裁切成若干份,最后用 pdfkit 把裁切出来的每一份图片等比缩放后依次放入 PDF 的各页。

代码

代码为 POC 方案,离生产可用还是要多做一些稳定性和可用性的支持工作,因此以下代码仅作学习参考

其中流程代码参考意义不大;核心代码里的裁切图片、自动滚动、等待指定 iframe 加载,以及获取指定元素上的属性值,比较有参考意义。

// 流程代码
import { ConsoleLogger } from '@nestjs/common';
import puppeteer, { Viewport } from 'puppeteer';
import { Browser, CookieParam } from 'puppeteer';
import * as helper from './helper';
const logger = new ConsoleLogger('WebViewer');

/**
 * Output format requested for a capture (see `ShotParam.type`).
 *
 * `pdf` is the only member that is not an image format: `WebViewer.shot`
 * converts the captured screenshot into a PDF document when it is requested.
 */
export enum TargetFileType {
  png = 'png',
  jpeg = 'jpeg',
  webp = 'webp',
  pdf = 'pdf',
}

/** Arguments for a single `WebViewer.shot` call. */
export interface ShotParam {
  /** Requested output format. */
  type: TargetFileType;
  /** Address the page navigates to (`page.goto`). */
  url: string;
  /**
   * Not read by the code in this section.
   * NOTE(review): presumably the destination path of the result — confirm with callers.
   */
  targetFile?: string;
  /** Cookies installed on the page before navigation (`page.setCookie`). */
  cookies?: CookieParam[];
  /**
   * Logged when the shot starts. `shot()` then overwrites this property with a
   * viewport computed from the rendered content.
   */
  viewport?: Viewport;
  /**
   * Navigation timeout handed to `page.goto` (milliseconds, per Puppeteer).
   * When omitted, `WebViewerOptions.socketTimeout` is used instead.
   */
  timeout?: number;
}

/**
 * Renders a URL in a shared Puppeteer browser and returns the result as an
 * image or PDF buffer.
 *
 * The browser is launched lazily on the first `shot()` (or an explicit
 * `init()`) and reused by later calls until `close()` is called.
 */
export class WebViewer {
  private browser: Browser | null = null;
  private inited = false;
  /** In-flight launch, shared so that concurrent callers start only one browser. */
  private launching: Promise<void> | null = null;
  private readonly options: WebViewerOptions;

  /** Shared instance built with the default options. */
  static defaultInstance = new WebViewer();

  /**
   * @param options Timeouts for this viewer. Defaults to a two-minute
   *   `socketTimeout` and `generateTimeout: 0`.
   */
  constructor(options?: WebViewerOptions) {
    this.options = options ?? {
      socketTimeout: 2 * 60 * 1000,
      generateTimeout: 0,
    };
  }

  /**
   * Launches the browser if it is not running yet.
   *
   * Safe to call repeatedly and concurrently: a running browser is reused and
   * overlapping calls await the same launch, so no extra Chrome process is
   * started (and leaked).
   *
   * @throws Whatever `puppeteer.launch` rejects with; a later call retries.
   */
  async init(): Promise<void> {
    if (this.inited && this.browser) {
      return;
    }
    if (!this.launching) {
      this.launching = this.launch().finally(() => {
        // Clear the marker on success and on failure so a failed launch can be retried.
        this.launching = null;
      });
    }
    await this.launching;
  }

  /** Starts the browser process and records it on the instance. */
  private async launch(): Promise<void> {
    this.browser = await puppeteer.launch({
      // defaultViewport: { width: 1920, height: 1080 },
      // headless: 'shell',
      // NOTE(review): `headless: false` opens a visible browser window; presumably
      // left on for POC debugging — confirm before running on a display-less server.
      headless: false,
      pipe: true,
      args: [
        '--disable-gpu',
        '--disable-dev-shm-usage',
        '--disable-setuid-sandbox',
        '--no-first-run',
        '--no-sandbox',
        '--no-zygote',
        '--full-memory-crash-report',
        '--unlimited-storage',
      ],
    });
    this.inited = true;
  }

  /**
   * Maps the requested output type to the image format Puppeteer captures.
   * A PDF is assembled from a PNG capture, so `pdf` maps to `png`.
   */
  private static screenshotType(type: TargetFileType): 'png' | 'jpeg' | 'webp' {
    switch (type) {
      case TargetFileType.jpeg:
        return 'jpeg';
      case TargetFileType.webp:
        return 'webp';
      default:
        return 'png';
    }
  }

  /**
   * Opens `param.url`, waits for the embedded dashboard to finish rendering,
   * grows the viewport to the full content size and captures it.
   *
   * Side effect kept from the original implementation: `param.viewport` is
   * overwritten with the viewport that was actually used.
   *
   * @param param What to capture and in which format.
   * @returns The encoded image, or the PDF document when `param.type` is `pdf`.
   * @throws If navigation, any of the waits, or the capture fails. The page is
   *   closed in every case.
   */
  async shot(param: ShotParam): Promise<Buffer> {
    await this.init();
    const browser = this.browser;
    if (!browser) {
      throw new Error('WebViewer: browser is not available after init()');
    }

    logger.log(
      `Start to shot url: ${param.url}, type: ${
        param.type
      }, viewport: { heigth:${param.viewport?.height || 0}, width:${
        param.viewport?.width || 0
      }} `,
    );
    const page = await browser.newPage();
    try {
      page.on('response', (response) => {
        logger.debug(response.url());
      });
      page.on('close', () => {
        logger.debug('Current page has been closed.');
      });
      if (param.cookies) {
        await page.setCookie(...param.cookies);
      }
      await page.goto(param.url, {
        // `??` rather than `||`: an explicit 0 is Puppeteer's "no timeout" and
        // must not be replaced by the default.
        timeout: param.timeout ?? this.options.socketTimeout,
        waitUntil: 'networkidle0',
      });
      await helper.waitForFrame(page);

      // First pass: make the viewport wide enough for the table (never
      // narrower than 1920px) so that the layout can settle at that width.
      const minWidth = 1920;
      const contentWidth = await page.$eval(
        '.html-table',
        (el) => el.scrollWidth + 36,
      );
      const width = Math.max(contentWidth, minWidth);
      param.viewport = { width, height: 1080 };
      await page.setViewport(param.viewport);

      // Wait until <html data-height> holds a numeric value.
      await helper.getValueFromElementDataset(
        page,
        'html',
        'height',
        async (value: string) => !Number.isNaN(Number(value)),
      );

      // Second pass: grow the viewport to the full body height so that a plain
      // viewport screenshot contains the whole page.
      const bodyHandle = await page.$('body');
      if (!bodyHandle) {
        throw new Error('WebViewer: <body> element not found');
      }
      let bodyHeight: number;
      try {
        const box = await bodyHandle.boundingBox();
        if (!box) {
          throw new Error('WebViewer: <body> has no bounding box');
        }
        bodyHeight = box.height;
      } finally {
        await bodyHandle.dispose();
      }
      param.viewport = { width, height: Math.floor(bodyHeight) + 1 };
      await page.setViewport(param.viewport);
      await page.waitForSelector('#datart-rendered');
      await helper.sleep(300);

      // Capture in the requested image format instead of always PNG.
      let buffer = await page.screenshot({
        type: WebViewer.screenshotType(param.type),
      });
      if (param.type === TargetFileType.pdf) {
        buffer = await helper.generatePdf(buffer);
      }
      await helper.sleep(300);
      return buffer;
    } finally {
      // Always release the tab. A failure to close must not mask the real
      // error or discard a finished capture.
      await page.close().catch((err: unknown) => {
        logger.warn(`Failed to close page: ${String(err)}`);
      });
    }
  }

  /**
   * Closes the browser, if any, and resets the instance so that a later
   * `shot()` launches a fresh one instead of using the closed browser.
   */
  async close(): Promise<void> {
    const browser = this.browser;
    this.browser = null;
    this.inited = false;
    await browser?.close();
  }
}

/** Construction options for `WebViewer`. */
export interface WebViewerOptions {
  /** Default `page.goto` timeout, used when a shot does not supply its own `timeout`. */
  socketTimeout: number;
  /**
   * Not read by the code in this section.
   * NOTE(review): presumably a limit for the whole generation step — confirm.
   */
  generateTimeout?: number;
}

// 核心代码
import { Page } from 'puppeteer';
import * as sharp from 'sharp';
import * as pdfkit from 'pdfkit';
import * as getStream from 'get-stream';

/**
 * Resolves with the first frame of `page` whose name equals `frameName`.
 *
 * The frame list is checked immediately and then again whenever a frame is
 * attached or navigated (a frame may only receive its name once it navigates).
 *
 * @param page Page whose frames are inspected.
 * @param frameName Name to look for. Defaults to `'datart'`.
 * @param timeout Maximum wait in milliseconds; `0` (default) waits indefinitely.
 * @returns The matching frame.
 * @throws Rejects with an `Error` when `timeout` elapses first.
 */
function waitForFrame(page: Page, frameName = 'datart', timeout = 0) {
  type NamedFrame = ReturnType<typeof page.frames>[number];

  const findFrame = (): NamedFrame | undefined =>
    page.frames().find((frame) => frame.name() === frameName);

  return new Promise<NamedFrame>((resolve, reject) => {
    const existing = findFrame();
    if (existing) {
      resolve(existing);
      return;
    }

    let timer: ReturnType<typeof setTimeout> | undefined;

    // Detach both listeners and the timer so that nothing outlives the wait.
    const cleanup = (): void => {
      page.off('frameattached', onFrameChange);
      page.off('framenavigated', onFrameChange);
      if (timer !== undefined) {
        clearTimeout(timer);
      }
    };

    const onFrameChange = (): void => {
      const frame = findFrame();
      if (!frame) {
        return;
      }
      cleanup();
      resolve(frame);
    };

    page.on('frameattached', onFrameChange);
    page.on('framenavigated', onFrameChange);

    if (timeout > 0) {
      timer = setTimeout(() => {
        cleanup();
        reject(
          new Error(
            `waitForFrame: frame "${frameName}" did not appear within ${timeout}ms`,
          ),
        );
      }, timeout);
    }
  });
}

/**
 * Scrolls the element matched by `selector` downwards in fixed steps (inside
 * the page) until the accumulated distance reaches the element's
 * `scrollHeight`.
 *
 * @param page Page to run the scrolling in.
 * @param selector CSS selector of the scroll container.
 * @param step Pixels scrolled per tick. Defaults to 100.
 * @param intervalMs Delay between ticks in milliseconds. Defaults to 100.
 * @returns `true` once the bottom has been reached, `false` when no element
 *   matches `selector` (previously this case never resolved).
 * @throws RangeError when `step` is not a positive number, which could never
 *   reach the bottom.
 */
async function autoScroll(
  page: Page,
  selector: string,
  step = 100,
  intervalMs = 100,
): Promise<boolean> {
  if (!(step > 0)) {
    throw new RangeError(`autoScroll: step must be positive, got ${step}`);
  }

  return page.evaluate(
    (sel: string, distance: number, delay: number) =>
      new Promise<boolean>((resolve) => {
        // Total distance requested so far. It keeps growing after the
        // container has hit its bottom, which is what ends the loop.
        let scrolled = 0;
        const timer = setInterval(() => {
          const target = document.querySelector(sel);
          if (!target) {
            clearInterval(timer);
            resolve(false);
            return;
          }
          // Re-read on every tick: the container may grow while scrolling.
          const fullHeight = target.scrollHeight;
          target.scrollBy(0, distance);
          scrolled += distance;
          if (scrolled >= fullHeight) {
            clearInterval(timer);
            resolve(true);
          }
        }, delay);
      }),
    selector,
    step,
    intervalMs,
  );
}

/**
 * Polls `element.dataset[key]` of the element matched by `selector` until
 * `checkValue` accepts the value.
 *
 * The first read happens one interval after the call, as in the original
 * implementation, which gives the page time to react to a preceding change.
 * Reads are strictly sequential and never overlap.
 *
 * @param page Page to evaluate in.
 * @param selector CSS selector of the element carrying the `data-*` attribute.
 * @param key Dataset key, e.g. `'height'` for `data-height`.
 * @param checkValue Predicate deciding whether the value is final. When it is
 *   missing at runtime, the first value read is returned.
 * @param intervalMs Delay before each read in milliseconds. Defaults to 500.
 * @param timeoutMs Give up after this many milliseconds; `0` (default) polls
 *   indefinitely.
 * @returns The accepted dataset value (`undefined` if the attribute is absent
 *   and was accepted).
 * @throws Rejects when reading the element fails (previously an unhandled
 *   rejection that left the returned promise pending), or when `timeoutMs`
 *   elapses.
 */
async function getValueFromElementDataset(
  page: Page,
  selector: string,
  key: string,
  checkValue: (value: string) => Promise<boolean>,
  intervalMs = 500,
  timeoutMs = 0,
): Promise<string | undefined> {
  const deadline = timeoutMs > 0 ? Date.now() + timeoutMs : Infinity;

  for (;;) {
    await new Promise((wake) => setTimeout(wake, intervalMs));

    const value: string | undefined = await page.$eval(
      selector,
      (el: HTMLElement, datasetKey: string) => el.dataset[datasetKey],
      key,
    );

    // The attribute can be absent (undefined) at runtime; it is handed to the
    // predicate unchanged, exactly as before.
    if (!checkValue || (await checkValue(value as string))) {
      return value;
    }

    if (Date.now() >= deadline) {
      throw new Error(
        `Timed out after ${timeoutMs}ms waiting for data-${key} on "${selector}"`,
      );
    }
  }
}

/**
 * Cuts a tall image into consecutive horizontal slices, each of which fills at
 * most one page of `pageWidth` x `pageHeight` once scaled to `pageWidth`.
 *
 * Slices are taken from the source at its original resolution; only the slice
 * boundaries are computed in page units.
 *
 * @param pageWidth Target page width (page units).
 * @param pageHeight Target page height (page units).
 * @param buffer Encoded source image.
 * @returns `images`: the slices from top to bottom; `scale`: source pixels per
 *   page unit.
 * @throws RangeError when a page dimension is not positive (which previously
 *   looped forever), or Error when the image dimensions cannot be read.
 */
async function clipImage(
  pageWidth: number,
  pageHeight: number,
  buffer: Buffer,
): Promise<{ images: Buffer[]; scale: number }> {
  if (!(pageWidth > 0) || !(pageHeight > 0)) {
    throw new RangeError(
      `clipImage: page size must be positive, got ${pageWidth}x${pageHeight}`,
    );
  }

  // Read the dimensions from the header instead of re-encoding the whole
  // image (once resized, once as-is) just to look at the resulting size.
  const { width: sourceWidth, height: sourceHeight } = await sharp(
    buffer,
  ).metadata();
  if (!sourceWidth || !sourceHeight) {
    throw new Error('clipImage: unable to read image dimensions');
  }

  // Height the image has once scaled to pageWidth, rounded to whole units the
  // way a width-only resize would report it.
  const fittedHeight = Math.max(
    1,
    Math.round(sourceHeight / (sourceWidth / pageWidth)),
  );
  const scale = sourceHeight / fittedHeight;

  const images: Buffer[] = [];
  // Slices are produced one at a time to keep peak memory bounded.
  for (let startY = 0; startY < fittedHeight; startY += pageHeight) {
    const sliceHeight = Math.min(pageHeight, fittedHeight - startY);
    // floor(top) + ceil(height) lets neighbouring slices overlap by up to one
    // pixel rather than leave a gap. Clamp both so that floating-point error
    // can never ask for rows beyond the image, which would make extract() throw.
    const top = Math.min(Math.floor(startY * scale), sourceHeight - 1);
    const height = Math.min(
      Math.ceil(sliceHeight * scale),
      sourceHeight - top,
    );
    const slice = await sharp(buffer)
      .extract({ left: 0, top, width: sourceWidth, height })
      .toBuffer();
    images.push(slice);
  }

  return { images, scale };
}

/**
 * Builds a multi-page PDF from one tall image: the image is cut into
 * page-sized slices (`clipImage`) and each slice is placed at the top-left
 * corner of its own page, scaled to the page width.
 *
 * Written as a plain async function so that a failure while slicing rejects
 * the returned promise; with the previous `new Promise(async …)` wrapper the
 * error was unhandled and the promise never settled.
 *
 * @param imageBuffer Encoded source image.
 * @returns The complete PDF document.
 * @throws Whatever `clipImage` or the PDF stream rejects with.
 */
const generatePdf = async (imageBuffer: Buffer): Promise<Buffer> => {
  const doc = new pdfkit();

  // Size of the default page the document starts with.
  const pageWidth = doc.page.width;
  const pageHeight = doc.page.height;

  const { images } = await clipImage(pageWidth, pageHeight, imageBuffer);

  images.forEach((image, pageIndex) => {
    // The document already has its first page; addPage() makes the new page
    // current, so no explicit page switch is needed afterwards.
    if (pageIndex > 0) {
      doc.addPage();
    }
    // `width` alone scales the slice proportionally to the page width.
    doc.image(image, 0, 0, { width: pageWidth });
  });

  doc.end();
  return getStream.buffer(doc);
};

/** Returns a promise that resolves (with no value) after `time` milliseconds. */
const sleep = (time: number) =>
  new Promise((done) => {
    setTimeout(done, time);
  });

export {
  sleep,
  clipImage,
  autoScroll,
  generatePdf,
  waitForFrame,
  getValueFromElementDataset,
};