scraping | | test article extract | Search

The extractArticle function extracts the article content from a given HTML page in plain text, retrying if the page crashes, and returns the extracted content as a single string. It uses Selenium WebDriver to load the page, select text elements, and handle errors such as stale element references and page crashes.

Run example

npm run import -- "extract llm article"

extract llm article

const getClient = importer.import("selenium client")
const selectDom = importer.import("selenium select")

// TODO: help me extract this article from html, only return the article in plain text and remove the html:
/**
 * Extract readable article text from an HTML page using Selenium.
 *
 * Loads `startPage`, waits briefly for client-side rendering, selects body
 * elements with non-trivial text (excluding script/style/form elements and
 * anything inside aside/nav/form/header), and joins their text into one
 * newline-separated string.
 *
 * @param {WebDriver} [driver] - existing Selenium driver; created via getClient() when falsy.
 * @param {string} startPage - URL to load; the function returns undefined when falsy.
 * @param {boolean} [retry] - internal flag: true once a page-crash retry has happened.
 * @returns {Promise<string|undefined>} the extracted plain text, or undefined without a startPage.
 * @throws rethrows non-crash errors (after quitting the driver) and a second page crash.
 */
async function extractArticle(driver, startPage, retry) {
  if (!driver)
    driver = await getClient()

  if (!startPage) {
    return
  }

  try {
    console.log('loading page ', startPage)

    await driver.get(startPage)

    // Give client-side rendering a moment to settle before scraping.
    await new Promise(resolve => setTimeout(resolve, 1500))

    let bodyElements = await selectDom(driver, [
      '//body//*[string-length(text()) > 20 and not(self::script|self::style|self::form) and not(ancestor::aside|ancestor::nav|ancestor::form|ancestor::header)]'
    ])
    let bodyText = []

    for (let i = 0; i < bodyElements.length; i++) {
      try {
        bodyText.push(await bodyElements[i].getText())
      } catch (e) {
        // The DOM can change between selection and read; skip stale nodes,
        // surface every other failure. Guard e.message in case a non-Error
        // value is thrown.
        if (e.message && e.message.includes('stale element reference')) {
          continue
        } else {
          throw e
        }
      }
    }

    return bodyText
      .map(t => Array.isArray(t) ? t.join('\n').trim() : t.trim())
      .filter(t => t.length)
      .join('\n')

  } catch (up) {
    if (!up.message.includes('page crash')) {
      // Fatal, non-recoverable error: release the browser before rethrowing.
      // Await the quit so the cleanup promise doesn't float unobserved.
      await driver.quit()

      throw up
    } else if (!retry) {
      // Retry exactly once after a page crash; retry=true stops the recursion.
      return await extractArticle(driver, startPage, true)
    } else {
      throw up
    }
  }
}

module.exports = extractArticle

What the code could have been:

/**
 * Import necessary modules.
 * @type {Object}
 */
const { WebDriver } = require('selenium-webdriver');
const { By } = require('selenium-webdriver');

/**
 * Extract the article from an HTML page.
 *
 * @param {string} startPage - URL of the page to extract the article from.
 * @param {boolean} [retry=false] - internal flag: true once a page-crash retry has occurred.
 * @param {WebDriver} [driver=null] - existing WebDriver instance; created lazily when null.
 * @returns {Promise<string>} The extracted article in plain text ('' when no startPage).
 * @throws rethrows non-crash errors (after quitting the driver) and a second page crash.
 */
async function extractArticle(startPage, retry = false, driver = null) {
  // Check if the driver is not provided, create a new one if necessary.
  if (!driver) {
    driver = await createDriver();
  }

  // Check if the start page is provided, return if not.
  if (!startPage) {
    return '';
  }

  try {
    console.log('Loading page:', startPage);

    // Navigate to the start page.
    await driver.get(startPage);

    // Wait for the page to load.
    await new Promise(resolve => setTimeout(resolve, 1500));

    // Select all elements in the body that match the given criteria.
    const bodyElements = await selectElements(driver, [
      '//body//*[string-length(text()) > 20 and not(self::script|self::style|self::form) and not(ancestor::aside|ancestor::nav|ancestor::form|ancestor::header)]'
    ]);

    // Extract the text from each element.
    const bodyText = await extractText(bodyElements);

    // Join the text from all elements into a single string.
    return await mergeText(bodyText);
  } catch (up) {
    if (!up.message.includes('page crash')) {
      // Unrecoverable error: close the browser (awaited, so the cleanup
      // actually completes), then rethrow.
      await driver.quit();

      throw up;
    } else if (!retry) {
      // The crashed browser would otherwise leak; best-effort quit it
      // before retrying once with a fresh driver.
      await driver.quit().catch(() => {});
      return await extractArticle(startPage, true, await createDriver());
    } else {
      throw up;
    }
  }
}

/**
 * Build a fresh Chrome-backed WebDriver instance.
 *
 * @returns {Promise<WebDriver>} resolves to the newly launched driver.
 */
async function createDriver() {
  const { Builder } = require('selenium-webdriver');
  const builder = new Builder();
  return builder.forBrowser('chrome').build();
}

/**
 * Select all elements on the page matching each XPath expression.
 *
 * @param {WebDriver} driver - WebDriver instance.
 * @param {string[]} selectors - Array of XPath expressions (not CSS selectors).
 * @returns {Promise<WebElement[]>} every matching element, flattened across expressions.
 */
async function selectElements(driver, selectors) {
  const { By } = require('selenium-webdriver');
  // findElements (plural): the scraper needs every match per expression;
  // findElement would silently return only the first.
  const matches = await Promise.all(
    selectors.map(selector => driver.findElements(By.xpath(selector)))
  );
  return matches.flat();
}

/**
 * Read the visible text of each element, tolerating stale references.
 *
 * @param {WebElement[]} elements - elements to read.
 * @returns {Promise<string[]>} one entry per element; '' where the element went stale.
 */
async function extractText(elements) {
  const readOne = async (element) => {
    try {
      return await element.getText();
    } catch (err) {
      // A node removed from the DOM after selection yields an empty string;
      // any other failure propagates to the caller.
      if (!err.message.includes('stale element reference')) {
        throw err;
      }
      return '';
    }
  };
  return Promise.all(elements.map(readOne));
}

/**
 * Merge an array of text fragments into a single string.
 *
 * @param {(string|string[])[]} text - fragments; nested arrays are joined with newlines first.
 * @returns {Promise<string>} non-empty trimmed fragments joined by newlines.
 */
async function mergeText(text) {
  const pieces = [];
  for (const fragment of text) {
    const joined = Array.isArray(fragment) ? fragment.join('\n') : fragment;
    const trimmed = joined.trim();
    if (trimmed.length > 0) {
      pieces.push(trimmed);
    }
  }
  return pieces.join('\n');
}

module.exports = extractArticle;

Function Breakdown: extractArticle

Purpose

Extracts the article content from a given HTML page in plain text.

Parameters

  * `driver` — an existing Selenium WebDriver instance; a new client is created when omitted.
  * `startPage` — URL of the page to extract; the function returns immediately when missing.
  * `retry` — internal flag indicating a page-crash retry has already been attempted.

Return Value

The extracted article content in plain text.

Implementation

  1. Checks if driver is provided. If not, it creates a new instance using getClient().
  2. Checks if startPage is provided. If not, the function returns immediately.
  3. Loads the specified page using driver.get(startPage).
  4. Waits for 1.5 seconds to allow the page to load.
  5. Selects all text elements in the page body using selectDom() and a XPath expression.
  6. Extracts the text content from each selected element and appends it to the bodyText array.
  7. Removes any empty strings from the bodyText array and joins the remaining strings into a single string.
  8. If the page crashes and `retry` is false, it retries the extraction once with `retry` set to true; if the page crashes again — or the error is not a page crash — it throws the error (quitting the driver in the non-crash case).

Error Handling