I had a situation this week, I wanted to read and validate some data from a private Google Spreadsheet using Puppeteer. Initially, I found 2 problems:
- I needed to log in with a custom email/password to have the access to the spreadsheet. A captcha appears if we use a "vanilla" implementation of the puppeteer app.
- The spreadsheet was blocked, we have read-only permission. We can't click, read, modify or make any operation on the cells.
This seems pretty awful, don't you think? Well, let's solve the first topic.
This is how I could bypass the Gmail captcha login and could read the data like a charm:
The Tools
We choose to use 3 extra packages aside puppeteer:
So my package.json
looked like this.
{
"name": "spreadsheet-checker",
"version": "1.0.0",
"description": "an google spreadsheet reader",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "MIT",
"dependencies": {
"puppeteer": "^8.0.0",
"puppeteer-extra": "^3.1.18",
"puppeteer-extra-plugin-adblocker": "^2.11.11",
"puppeteer-extra-plugin-stealth": "^2.7.6"
}
}
The Script
To get access to the spreadsheet we need to login first, and then make the redirect to the spreadsheet. So the script will be like this:
const puppeteer = require("puppeteer-extra");
// Add stealth plugin and use defaults (all tricks to hide puppeteer usage)
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
// Add adblocker plugin to block all ads and trackers (saves bandwidth)
const AdblockerPlugin = require("puppeteer-extra-plugin-adblocker");
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
function sleep(ms) {
return new Promise((resolve) => setTimeout(resolve, ms));
}
(async function () {
// That's it, the rest is puppeteer usage as normal
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
let navigationPromise = page.waitForNavigation();
await page.goto("https://accounts.google.com/");
await navigationPromise;
await page.waitForSelector('input[type="email"]');
await page.type('input[type="email"]', process.env.email); // Email login
await page.click("#identifierNext");
await page.waitForSelector('input[type="password"]', { visible: true });
await page.type('input[type="password"]', process.env.password); // Password login
await page.waitForSelector("#passwordNext", { visible: true });
await page.click("#passwordNext");
navigationPromise = page.waitForNavigation();
await navigationPromise;
await page.goto(process.env.file_url); // Spreadsheet url
await page.screenshot({ path: "spreadsheet-screen.png", fullPage: true }); // We take a screenshot to have probe of the bypass
await browser.close();
})();
Now, let's solve the second topic.
The capture
So now we are in, how we can read the data? Well, the best approach with this scenario (read-only spreadsheet) we can download the data by using 2 things:
- Setting the download folder handler for the puppeteer app.
- Using
page.keyboard.down
andpage.keyboard.press
to trigger the shortcuts to save the file in the format we want it (PDF, CSV, XLSX).
The download handler
We need to bind a local folder to be the download folder for the puppeteer. To do this, we need to import the path
package and configure a downloadPath
, and then bind the page._client.send("Page.setDownloadBehavior")
with a custom configuration.
const path = require("path");
const downloadPath = path.resolve("./download");
// puppeteer-extra is a drop-in replacement for puppeteer,
// it augments the installed puppeteer with plugin functionality.
// Any number of plugins can be added through `puppeteer.use()`
const puppeteer = require("puppeteer-extra");
// Add stealth plugin and use defaults (all tricks to hide puppeteer usage)
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
// Add adblocker plugin to block all ads and trackers (saves bandwidth)
const AdblockerPlugin = require("puppeteer-extra-plugin-adblocker");
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
function sleep(ms) {
return new Promise((resolve) => setTimeout(resolve, ms));
}
(async function () {
// That's it, the rest is puppeteer usage as normal
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
let navigationPromise = page.waitForNavigation();
await page.goto("https://accounts.google.com/");
await navigationPromise;
await page.waitForSelector('input[type="email"]');
await page.type('input[type="email"]', process.env.email); // Email login
await page.click("#identifierNext");
await page.waitForSelector('input[type="password"]', { visible: true });
await page.type('input[type="password"]', process.env.password); // Password login
await page.waitForSelector("#passwordNext", { visible: true });
await page.click("#passwordNext");
navigationPromise = page.waitForNavigation();
await navigationPromise;
await page.goto(process.env.file_url); // Spreadsheet url
// Our download configuration
await page._client.send("Page.setDownloadBehavior", {
behavior: "allow",
downloadPath: downloadPath,
});
await browser.close();
})();
With this, we are ready to make the download action via shortcuts.
The shortcuts
In this case, I downloaded all the pages via HTML, using the next shortcuts.
ALT + F
to open the File
tab.
ALT + D
to open the Download
menu.
ALT + W
to select Website
option, and donwload all the content as HTML.
The script updated:
const path = require("path");
const downloadPath = path.resolve("./download");
// puppeteer-extra is a drop-in replacement for puppeteer,
// it augments the installed puppeteer with plugin functionality.
// Any number of plugins can be added through `puppeteer.use()`
const puppeteer = require("puppeteer-extra");
// Add stealth plugin and use defaults (all tricks to hide puppeteer usage)
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
// Add adblocker plugin to block all ads and trackers (saves bandwidth)
const AdblockerPlugin = require("puppeteer-extra-plugin-adblocker");
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
function sleep(ms) {
return new Promise((resolve) => setTimeout(resolve, ms));
}
(async function () {
// That's it, the rest is puppeteer usage as normal
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
let navigationPromise = page.waitForNavigation();
await page.goto("https://accounts.google.com/");
await navigationPromise;
await page.waitForSelector('input[type="email"]');
await page.type('input[type="email"]', process.env.email); // Email login
await page.click("#identifierNext");
await page.waitForSelector('input[type="password"]', { visible: true });
await page.type('input[type="password"]', process.env.password); // Password login
await page.waitForSelector("#passwordNext", { visible: true });
await page.click("#passwordNext");
navigationPromise = page.waitForNavigation();
await navigationPromise;
await page.goto(process.env.file_url); // Spreadsheet url
await page._client.send("Page.setDownloadBehavior", {
behavior: "allow",
downloadPath: downloadPath,
});
await page.keyboard.down("Alt");
await page.keyboard.press("KeyF");
await page.keyboard.press("KeyD");
await page.keyboard.press("KeyW");
await browser.close();
})();
Now we have the data downloaded. Cool! The reading process will be for another post.
Wrap up
This is a simple but useful implementation to solve this kind of problem. Hope you enjoy it.
Happy Hacking!
Top comments (2)
very nice !!!
Hello, how about just use 2captcha for solving CAPTCHA?