How to get all html data after all scripts and page loading is done? (puppeteer)

30,205

Solution 1

If you want the full HTML, the same as what you see in the browser's inspector, here it is:

    const puppeteer = require('puppeteer');

    /**
     * Opens https://example.org/ in a headless browser, waits until the network
     * is idle, and prints the serialized HTML of the first element in the
     * document (the root element) to stdout.
     *
     * The browser is closed in `finally` so that a failed navigation or
     * evaluation does not leave the browser process running.
     */
    (async function main() {
      const browser = await puppeteer.launch();
      try {
        // Reuse the tab that is already open after launch.
        const [page] = await browser.pages();

        await page.goto('https://example.org/', { waitUntil: 'networkidle0' });
        // Runs inside the page; only the resulting string is sent back to Node.js.
        const data = await page.evaluate(() => document.querySelector('*').outerHTML);

        console.log(data);
      } finally {
        await browser.close();
      }
    })().catch((err) => {
      // Single place where launch, navigation, evaluation and close errors end up.
      console.error(err);
    });

Solution 2

// Serializes the document's root element from inside the page context.
// Despite the variable name, this is the whole root element, not only <body>.
// Must run inside an async function with an already-navigated `page`.
let bodyHTML = await page.evaluate(() => document.documentElement.outerHTML);

This

Solution 3

Some notes:

  1. You do not need cheerio with puppeteer, and you do not need to reparse page.content(): you already have the full DOM with all scripts run, and you can evaluate any code in the window context, just like in a browser, using page.evaluate(), transferring serializable data between the web API context and the Node.js API context.

  2. Try to use async/await only; this will simplify your code and its flow.

  3. If you need to wait till all the scripts and other dependencies are loaded, use waitUntil: 'networkidle0' in page.goto().

  4. If you suspect that the document's scripts need some time to reach the needed state, use one of the waiting functions such as page.waitForSelector(), or fall back to page.waitFor(milliseconds).

Here is a simple script that outputs all tag names in a page.

'use strict';

const puppeteer = require('puppeteer');

/**
 * Opens https://example.org/ in a headless browser, waits until the network
 * is idle, and prints an array with the tag name of every element in the page.
 *
 * The browser is closed in `finally` so that a failed navigation or
 * evaluation does not leave the browser process running.
 */
(async function main() {
  const browser = await puppeteer.launch();
  try {
    // Reuse the tab that is already open after launch.
    const [page] = await browser.pages();

    await page.goto('https://example.org/', { waitUntil: 'networkidle0' });

    // The callback runs inside the page; the array of strings it returns is
    // serializable, so it can be transferred back to Node.js.
    const data = await page.evaluate(
      () => Array.from(document.querySelectorAll('*'), (elem) => elem.tagName)
    );

    console.log(data);
  } finally {
    await browser.close();
  }
})().catch((err) => {
  // Single place where launch, navigation, evaluation and close errors end up.
  console.error(err);
});

You can specify your task in more detail and we can try to write something more appropriate.


Script for www.bezrealitky.cz (task from a comment below):

'use strict';

const fs = require('fs');
const puppeteer = require('puppeteer');

/**
 * Loads a www.bezrealitky.cz search-results page, keeps clicking the
 * "load more" button (`#search-content button.btn-icon`) until it is no longer
 * present, and then saves the result to three files in the working directory:
 *
 *   - full.html     the whole page as returned by page.content()
 *   - articles.html outer HTML of `#search-content div.b-filter__inner`
 *   - articles.txt  innerText of every `#search-content article`, separated
 *                   by a line of 50 dashes
 *
 * The browser is closed in `finally` so that a failure part-way through does
 * not leave the browser process running.
 */
(async function main() {
  // Kept as one literal: a string in quotes must not contain a raw line break.
  const searchUrl = 'https://www.bezrealitky.cz/vyhledat?offerType=pronajem&estateType=byt&disposition=&ownership=&construction=&equipped=&balcony=&order=timeOrder_desc&boundary=%5B%5B%7B%22lat%22%3A50.171436864513%2C%22lng%22%3A14.506905276796942%7D%2C%7B%22lat%22%3A50.154133576294%2C%22lng%22%3A14.599004629591036%7D%2C%7B%22lat%22%3A50.14524430128%2C%22lng%22%3A14.58773054712799%7D%2C%7B%22lat%22%3A50.129307131988%2C%22lng%22%3A14.60087568578706%7D%2C%7B%22lat%22%3A50.122604734575%2C%22lng%22%3A14.659116306376973%7D%2C%7B%22lat%22%3A50.106512499343%2C%22lng%22%3A14.657434650206028%7D%2C%7B%22lat%22%3A50.090685542974%2C%22lng%22%3A14.705099547441932%7D%2C%7B%22lat%22%3A50.072175921973%2C%22lng%22%3A14.700004206235008%7D%2C%7B%22lat%22%3A50.056898491904%2C%22lng%22%3A14.640206899053055%7D%2C%7B%22lat%22%3A50.038528576841%2C%22lng%22%3A14.666852728301023%7D%2C%7B%22lat%22%3A50.030955909657%2C%22lng%22%3A14.656128752460972%7D%2C%7B%22lat%22%3A50.013435368522%2C%22lng%22%3A14.66854956530301%7D%2C%7B%22lat%22%3A49.99444182116%2C%22lng%22%3A14.640153080292066%7D%2C%7B%22lat%22%3A50.010839032542%2C%22lng%22%3A14.527474219359988%7D%2C%7B%22lat%22%3A49.970771602447%2C%22lng%22%3A14.46224174052395%7D%2C%7B%22lat%22%3A49.970669964027%2C%22lng%22%3A14.400648545303966%7D%2C%7B%22lat%22%3A49.941901176098%2C%22lng%22%3A14.395563234671044%7D%2C%7B%22lat%22%3A49.948384148423%2C%22lng%22%3A14.337635637038034%7D%2C%7B%22lat%22%3A49.958376114735%2C%22lng%22%3A14.324977842107955%7D%2C%7B%22lat%22%3A49.9676286223%2C%22lng%22%3A14.34491711110104%7D%2C%7B%22lat%22%3A49.971859099005%2C%22lng%22%3A14.326815050839059%7D%2C%7B%22lat%22%3A49.990608728081%2C%22lng%22%3A14.342731259186962%7D%2C%7B%22lat%22%3A50.002211140429%2C%22lng%22%3A14.29483886971002%7D%2C%7B%22lat%22%3A50.023596577558%2C%22lng%22%3A14.315872285282012%7D%2C%7B%22lat%22%3A50.058309376419%2C%22lng%22%3A14.248086830069042%7D%2C%7B%22lat%22%3A50.073179111%2C%22lng%22%3A14.290193274400963%7D%2C%7B%22lat%22%3A50.102973823639%2C%22lng%22%3A14.224439442359994%7D%2C%7B%22lat%22%3A50.130060800171%2C%22lng%22%3A14.302396419107936%7D%2C%7B%22lat%22%3A50.116019827009%2C%22lng%22%3A14.360785349547996%7D%2C%7B%22lat%22%3A50.148005694843%2C%22lng%22%3A14.365662825877052%7D%2C%7B%22lat%22%3A50.14142969454%2C%22lng%22%3A14.394903042943952%7D%2C%7B%22lat%22%3A50.171436864513%2C%22lng%22%3A14.506905276796942%7D%2C%7B%22lat%22%3A50.171436864513%2C%22lng%22%3A14.506905276796942%7D%5D%5D&hasDrawnBoundary=1&mapBounds=%5B%5B%7B%22lat%22%3A50.289447077141126%2C%22lng%22%3A14.68724263943227%7D%2C%7B%22lat%22%3A50.289447077141126%2C%22lng%22%3A14.087801111111958%7D%2C%7B%22lat%22%3A50.039169221047985%2C%22lng%22%3A14.087801111111958%7D%2C%7B%22lat%22%3A50.039169221047985%2C%22lng%22%3A14.68724263943227%7D%2C%7B%22lat%22%3A50.289447077141126%2C%22lng%22%3A14.68724263943227%7D%5D%5D&center=%7B%22lat%22%3A50.16447196305031%2C%22lng%22%3A14.387521875272125%7D&zoom=11&locationInput=praha&limit=15';

  const browser = await puppeteer.launch();
  try {
    // Reuse the tab that is already open after launch.
    const [page] = await browser.pages();
    // NOTE(review): 0 is understood to disable Puppeteer's default timeouts, so
    // the waits below can block indefinitely if the page never reaches the
    // expected state — confirm against the Puppeteer docs.
    page.setDefaultTimeout(0);

    await page.goto(searchUrl);

    await page.waitForSelector('#search-content button.btn-icon');

    // Each click is expected to append more articles; loop until the button
    // is gone from the DOM.
    while (await page.$('#search-content button.btn-icon') !== null) {
      const articlesForNow = (await page.$$('#search-content article')).length;
      console.log(`Articles for now: ${articlesForNow}. Getting more...`);

      // Trigger the click and, concurrently, wait until the article count
      // grows beyond the count observed before the click.
      await Promise.all([
        page.evaluate(
          () => { document.querySelector('#search-content button.btn-icon').click(); }
        ),
        page.waitForFunction(
          old => document.querySelectorAll('#search-content article').length > old,
          {},
          articlesForNow
        ),
      ]);
    }

    const articlesAll = (await page.$$('#search-content article')).length;
    console.log(`All articles: ${articlesAll}.`);

    fs.writeFileSync('full.html', await page.content());
    fs.writeFileSync('articles.html', await page.evaluate(
      () => document.querySelector('#search-content div.b-filter__inner').outerHTML
    ));
    fs.writeFileSync('articles.txt', await page.evaluate(
      () => [...document.querySelectorAll('#search-content article')]
              .map(({ innerText }) => innerText)
              .join(`\n${'-'.repeat(50)}\n`)
    ));
    console.log('Saved.');
  } finally {
    await browser.close();
  }
})().catch((err) => {
  // Single place where launch, navigation, scraping, file and close errors end up.
  console.error(err);
});
Share:
30,205
Admin
Author by

Admin

Updated on July 05, 2022

Comments

  • Admin
    Admin almost 2 years

    I finally figured out how to use Node.js and installed all the libraries/extensions. Puppeteer is working, but, as previously with XMLHttpRequest, it gets only the template/body of the page, without the information I need. All the scripts on the page kick in a few seconds after it has been opened in a browser (a web app?). I need to get the information inside certain tags after the whole page has loaded. I would also like to ask whether it is possible to use pure JavaScript, because I do not use jQuery-like code, so it doubles the difficulty for me...

    Here is what I have so far.

    const puppeteer = require('puppeteer');
    const $ = require('cheerio');
    const url = "really long link with latitude and attitude";

    /**
     * Opens `url` in a headless browser, takes the page HTML via page.content(),
     * and prints the text of every <strong> element found in it with cheerio.
     *
     * Written with plain async/await: the earlier `await function (...) {}`
     * form awaited the function value itself and had no effect. Errors are
     * logged instead of being swallowed, and the browser is always closed.
     *
     * Navigation is unchanged (no `waitUntil` option), so content rendered by
     * scripts after the initial load may still be missing from the HTML.
     */
    (async () => {
      const browser = await puppeteer.launch();
      try {
        const page = await browser.newPage();
        await page.goto(url);
        const html = await page.content();

        // A regular function is required here: cheerio binds `this` to the element.
        $('strong', html).each(function() {
          console.log($(this).text());
        });
      } finally {
        await browser.close();
      }
    })().catch((err) => {
      console.error(err);
    });
    

    I get only the default template elements inside the strong tags, but the page should contain a lot more data than just 10 items.