To use Crawlera with Puppeteer, set the ignoreHTTPSErrors option to true in the puppeteer.launch call, point Chromium at Crawlera's host and port with the --proxy-server flag, and send your Crawlera credentials in the Proxy-Authorization header via the page.setExtraHTTPHeaders method.
A sample Node.js script:
```javascript
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch({
    ignoreHTTPSErrors: true,
    args: ['--proxy-server=proxy.crawlera.com:8010']
  });
  const page = await browser.newPage();

  // Authenticate against Crawlera: the API key is the username,
  // the password is empty, and the pair is Base64-encoded.
  await page.setExtraHTTPHeaders({
    'Proxy-Authorization':
      'Basic ' + Buffer.from('<CRAWLERA_APIKEY>:').toString('base64'),
  });

  console.log('Opening page ...');
  try {
    await page.goto('https://httpbin.scrapinghub.com/redirect/6', {timeout: 180000});
  } catch (err) {
    console.log(err);
  }

  console.log('Taking a screenshot ...');
  await page.screenshot({path: 'screenshot.png'});
  await browser.close();
})();
```
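To try the script, install Puppeteer with npm install puppeteer, replace <CRAWLERA_APIKEY> with your Crawlera API key, and run it with node. A successful run saves screenshot.png in the working directory.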
Make the following changes to use puppeteer-core instead:
```javascript
const puppeteer = require('puppeteer-core');

(async () => {
  const browser = await puppeteer.launch({
    executablePath: 'path/to/Chromium',
    // ... the rest of the script stays the same
```
To speed up page rendering, it is recommended to filter out requests for static files. For instance, the following block, added before the Proxy-Authorization header is set, excludes images from the loaded page:
```javascript
await page.setRequestInterception(true);
page.on('request', request => {
  // resourceType() is a method, not a property.
  if (request.resourceType() === 'image')
    request.abort();
  else
    request.continue();
});
```
See request.resourceType() and page.setRequestInterception(value) for more details.
Alternatively, consider pairing Puppeteer with crawlera-headless-proxy.
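With crawlera-headless-proxy sitting between Puppeteer and Crawlera, the local proxy injects the credentials and handles TLS, so the script shrinks considerably. A minimal sketch, assuming the proxy was started with your API key and is listening on localhost:3128, its documented default port (both the startup command in the comment and the port are assumptions about that tool, not part of this guide):

```javascript
const puppeteer = require('puppeteer');

(async () => {
  // Assumes crawlera-headless-proxy is running locally, e.g. started as
  //   crawlera-headless-proxy -a <CRAWLERA_APIKEY>
  // and listening on its default port 3128. The local proxy adds the
  // Proxy-Authorization header itself, so no setExtraHTTPHeaders call
  // is needed here.
  const browser = await puppeteer.launch({
    ignoreHTTPSErrors: true,
    args: ['--proxy-server=localhost:3128']
  });
  const page = await browser.newPage();
  await page.goto('https://httpbin.scrapinghub.com/get', {timeout: 180000});
  await page.screenshot({path: 'screenshot.png'});
  await browser.close();
})();
```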
Requesting a page with a Crawlera session
```javascript
const puppeteer = require('puppeteer');
const request = require('request');

const proxyUser = '<CRAWLERA_APIKEY>';
const proxyHost = 'proxy.crawlera.com:8010';

(async () => {
  // Create a Crawlera session by POSTing to the /sessions endpoint;
  // the response body is the new session ID.
  function getCrawleraSession() {
    return new Promise(function (resolve) {
      const options = {
        url: `http://${proxyHost}/sessions`,
        auth: {
          user: proxyUser,
          pass: '',
          sendImmediately: true
        },
      };
      request.post(options, (err, res, body) => {
        resolve(err ? '' : body);
      });
    });
  }

  const crawleraSessionId = await getCrawleraSession();

  const browser = await puppeteer.launch({
    headless: true,
    ignoreHTTPSErrors: true,
    args: [`--proxy-server=${proxyHost}`]
  });
  const page = await browser.newPage();

  // Send the credentials and pin all requests to the session just created.
  await page.setExtraHTTPHeaders({
    'Proxy-Authorization':
      'Basic ' + Buffer.from(`${proxyUser}:`).toString('base64'),
    'X-Crawlera-Session': crawleraSessionId,
  });

  console.log('Opening page ...');
  try {
    await page.goto('https://httpbin.org/headers', {timeout: 180000});
  } catch (err) {
    console.log(err);
  }

  console.log('Taking a screenshot ...');
  await page.screenshot({path: 'screenshot.png'});
  await browser.close();
})();
```
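Once the crawl is done, the session can be released explicitly instead of waiting for it to expire. A minimal sketch of an optional clean-up step, assuming Crawlera's sessions endpoint also accepts DELETE requests for individual session IDs (check the Crawlera sessions documentation for your plan); it reuses proxyUser, proxyHost, and the request library from the script above:

```javascript
// Optional clean-up: delete the session once the crawl is finished.
// Assumes the sessions API supports DELETE /sessions/<ID>; reuses
// proxyUser and proxyHost from the script above.
function deleteCrawleraSession(sessionId) {
  return new Promise(function (resolve) {
    request.delete({
      url: `http://${proxyHost}/sessions/${sessionId}`,
      auth: {user: proxyUser, pass: '', sendImmediately: true},
    }, () => resolve());
  });
}

// e.g. await deleteCrawleraSession(crawleraSessionId); before browser.close()
```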