In this article, we'll cover how to convert PDF pages into images using Node.js. This can be useful for generating thumbnails or extracting visual content from PDF files. We'll use the pdfjs-dist library to load and render PDF pages, and canvas to create image buffers.
Prerequisites
Before getting started, you need to install the required packages:
npm install pdfjs-dist canvas
Code for Converting PDF Pages to Images and Saving Locally:
const fs = require('fs');
const path = require('path');
const pdfjs = require('pdfjs-dist/legacy/build/pdf.js');
const Canvas = require('canvas');
/**
 * Converts a PDF to images by rendering each page and saving them to a local directory.
 *
 * Pages are rendered one at a time, in page order, and written as
 * `page_<n>.jpg` (1-based page number) inside `outputDir`. The directory is
 * created, including missing parents, if it does not exist yet.
 *
 * Failures are logged with `console.error` and are NOT rethrown, so the
 * returned promise resolves even when the conversion stops part-way through.
 * Pages saved before the failure stay on disk.
 *
 * @param {Buffer|Uint8Array} pdfBuffer - The PDF file contents.
 * @param {string} outputDir - The directory where images will be saved.
 * @returns {Promise<void>} Resolves once every page has been saved, or after
 *   a failure has been logged.
 */
async function convertPdfToImages(pdfBuffer, outputDir) {
  try {
    // `recursive: true` makes this a no-op when the directory already exists,
    // so no separate existence check is needed.
    await fs.promises.mkdir(outputDir, { recursive: true });

    // Give pdf.js a plain Uint8Array copy of the bytes: recent pdf.js releases
    // refuse Node `Buffer` instances and ask for a `Uint8Array` instead.
    const loadingTask = pdfjs.getDocument({ data: new Uint8Array(pdfBuffer) });

    try {
      const pdfDocument = await loadingTask.promise;

      for (let pageNumber = 1; pageNumber <= pdfDocument.numPages; pageNumber++) {
        const page = await pdfDocument.getPage(pageNumber);

        try {
          const imageBuffer = await renderPageToImage(page);
          const imagePath = path.join(outputDir, `page_${pageNumber}.jpg`);
          await fs.promises.writeFile(imagePath, imageBuffer);
          console.log(`Saved: ${imagePath}`);
        } finally {
          // Release what pdf.js holds for this page before moving on, so
          // memory use does not grow with the number of pages.
          page.cleanup();
        }
      }
    } finally {
      // Always tear down the loading task (and the document it produced),
      // whether the conversion succeeded or failed.
      await loadingTask.destroy();
    }
  } catch (error) {
    console.error('Error converting PDF to images:', error);
  }
}
/**
 * Renders a single PDF page to an image buffer.
 *
 * The page's viewport is computed at the requested scale, a canvas of the
 * viewport's width and height is created, the page is drawn onto it, and the
 * canvas is encoded as JPEG.
 *
 * @param {PDFPageProxy} page - The PDF.js page object.
 * @param {number} [scale=2.0] - Scale factor passed to `page.getViewport`.
 *   Larger values produce a larger canvas. Defaults to 2x.
 * @returns {Promise<Buffer>} The image as a buffer (JPEG format). Rejects with
 *   a `RangeError` if `scale` is not a finite number greater than zero.
 */
async function renderPageToImage(page, scale = 2.0) {
  // Fail early with a clear message instead of creating a zero-sized or
  // NaN-sized canvas further down.
  if (!Number.isFinite(scale) || scale <= 0) {
    throw new RangeError(
      `scale must be a finite number greater than 0, got ${String(scale)}`
    );
  }

  const viewport = page.getViewport({ scale });
  const canvas = Canvas.createCanvas(viewport.width, viewport.height);
  const context = canvas.getContext('2d');

  // Draw the page onto the canvas and wait for rendering to finish.
  await page.render({ canvasContext: context, viewport }).promise;

  // Encode the canvas content as a JPEG image buffer.
  return canvas.toBuffer('image/jpeg');
}
// Example usage:
// const pdfBuffer = fs.readFileSync('sample.pdf');
// convertPdfToImages(pdfBuffer, './output_images');
Code Explanation
- Load the PDF: We use pdfjs-dist to load a PDF file from a buffer.
const loadingTask = pdfjs.getDocument({ data: pdfBuffer });
const pdfDocument = await loadingTask.promise;
- Render Each Page: For each page in the PDF, we fetch it with getPage, create a canvas sized to the page's viewport, and draw the page onto that canvas with render (getPage and render both come from pdfjs-dist).
const page = await pdfDocument.getPage(pageNumber);
const renderContext = {
canvasContext: context,
viewport: viewport,
};
await page.render(renderContext).promise;
- Save Image Locally: Once the page is rendered to the canvas, we save the image buffer in JPEG format using Node.js' fs module.
fs.writeFileSync(imagePath, imageBuffer);
Conclusion:
This approach converts each page of a PDF into a JPEG image, allowing you to process or visualize PDF content. For high-quality images, we render each page at 2x scale (the scale value passed to getViewport). This can be easily adjusted based on your needs: a larger scale gives sharper but larger images.
I hope this helps! Feel free to adapt the code as per your requirements.
Top comments (1)
Great guide, thanks for that.
I am trying to achieve the same thing, but am running into problems with empty images being generated.
Also TypeScript is complaining about using the context:
Any ideas?