The `extractArticle` function extracts the article content of a given HTML page as plain text, retrying once if the page crashes, and returns the extracted content as a single string. It uses Selenium WebDriver to load the page, select text elements, and handle errors such as stale element references and page crashes.
npm run import -- "extract llm article"
const getClient = importer.import("selenium client")
const selectDom = importer.import("selenium select")
// TODO: help me extract this article from html, only return the article in plain text and remove the html:
/**
 * Load a page with Selenium and return its readable text as one plain-text string.
 *
 * @param {object} [driver] - Selenium WebDriver session. When omitted, one is
 *   obtained from getClient().
 * @param {string} startPage - URL to load. When falsy, the function resolves to
 *   undefined without requesting or touching a driver.
 * @param {boolean} [retry] - Truthy when this call is already the second attempt
 *   after a page crash; external callers normally omit it.
 * @returns {Promise<string|undefined>} Trimmed, non-empty text fragments joined
 *   with newlines.
 * @throws Rethrows the original error. For anything other than a page crash the
 *   driver is quit first; a page crash is retried once on the same driver.
 */
async function extractArticle(driver, startPage, retry) {
  // Validate the URL before acquiring a driver so a missing page never
  // starts a browser session that nobody will close.
  if (!startPage) {
    return
  }
  if (!driver) {
    driver = await getClient()
  }

  try {
    console.log('loading page ', startPage)
    await driver.get(startPage)
    // Fixed pause to give client-side rendering a chance to finish.
    await new Promise(resolve => setTimeout(resolve, 1500))

    // Elements with more than 20 characters of direct text, excluding
    // scripts/styles/forms and anything inside aside/nav/form/header.
    const bodyElements = await selectDom(driver, [
      '//body//*[string-length(text()) > 20 and not(self::script|self::style|self::form) and not(ancestor::aside|ancestor::nav|ancestor::form|ancestor::header)]'
    ])

    const bodyText = []
    for (const element of bodyElements) {
      try {
        bodyText.push(await element.getText())
      } catch (e) {
        // Elements can detach while the page keeps mutating; skip those.
        const reason = String((e && e.message) || '')
        if (!reason.includes('stale element reference')) {
          throw e
        }
      }
    }

    return bodyText
      .map(t => Array.isArray(t) ? t.join('\n').trim() : t.trim())
      .filter(t => t.length)
      .join('\n')
  } catch (up) {
    // Guard against thrown values that carry no message.
    const reason = String((up && up.message) || '')
    if (!reason.includes('page crash')) {
      try {
        await driver.quit()
      } catch (quitError) {
        // Cleanup is best-effort; the original failure is the one to report.
        console.error('failed to quit driver', quitError)
      }
      throw up
    }
    if (retry) {
      throw up
    }
    return await extractArticle(driver, startPage, true)
  }
}
// Expose extractArticle as this module's single export.
module.exports = extractArticle
/**
* Import necessary modules.
* @type {Object}
*/
const { WebDriver } = require('selenium-webdriver');
const { By } = require('selenium-webdriver');
/**
 * Extract the article text from a web page as plain text.
 *
 * @param {string} startPage - URL of the page to extract the article from.
 *   When falsy, the function resolves to '' without creating a driver.
 * @param {boolean} [retry=false] - True when this call is already the retry
 *   that follows a page crash; external callers normally omit it.
 * @param {WebDriver|null} [driver=null] - Existing WebDriver session. When
 *   omitted, a new one is built with createDriver().
 * @returns {Promise<string>} The extracted text, one fragment per line.
 * @throws Rethrows the original error. The driver is quit first for any error
 *   other than a page crash; a page crash is retried once with a fresh driver.
 */
async function extractArticle(startPage, retry = false, driver = null) {
  // Check the URL before creating a driver so a missing page never launches
  // a browser that nothing would close.
  if (!startPage) {
    return '';
  }
  if (!driver) {
    driver = await createDriver();
  }

  try {
    console.log('Loading page:', startPage);
    await driver.get(startPage);
    // Fixed pause to give client-side rendering a chance to finish.
    await new Promise(resolve => setTimeout(resolve, 1500));

    // Elements with more than 20 characters of direct text, excluding
    // scripts/styles/forms and anything inside aside/nav/form/header.
    const bodyElements = await selectElements(driver, [
      '//body//*[string-length(text()) > 20 and not(self::script|self::style|self::form) and not(ancestor::aside|ancestor::nav|ancestor::form|ancestor::header)]'
    ]);
    const bodyText = await extractText(bodyElements);
    return await mergeText(bodyText);
  } catch (up) {
    // Guard against thrown values that carry no message.
    const reason = String((up && up.message) || '');
    const crashed = reason.includes('page crash');

    // Second crash in a row: give up and leave the session untouched, as before.
    if (crashed && retry) {
      throw up;
    }

    // In every other case this session is finished: either the error is fatal
    // or the crashed browser is about to be replaced. Close it so it does not
    // linger; cleanup is best-effort and must not mask the original failure.
    try {
      await driver.quit();
    } catch (quitError) {
      console.error('Failed to quit driver:', quitError);
    }

    if (!crashed) {
      throw up;
    }
    return await extractArticle(startPage, true, await createDriver());
  }
}
/**
 * Build a fresh Selenium session backed by Chrome.
 *
 * @returns {Promise<WebDriver>} Resolves to the newly built WebDriver.
 */
async function createDriver() {
  const webdriver = require('selenium-webdriver');
  const chromeBuilder = new webdriver.Builder().forBrowser('chrome');
  return chromeBuilder.build();
}
/**
* Select elements on the page.
*
* @param {WebDriver} driver - WebDriver instance.
* @param {string[]} selectors - Array of CSS selectors.
* @returns {Promise[]>} A promise that resolves to an array of element promises.
*/
async function selectElements(driver, selectors) {
const { By } = require('selenium-webdriver');
return Promise.all(selectors.map(selector => driver.findElement(By.xpath(selector))));
}
/**
 * Read the visible text of every element, all reads issued concurrently.
 *
 * An element whose read fails with a "stale element reference" error
 * contributes an empty string; any other failure rejects the whole call.
 *
 * @param {Object[]} elements - Elements exposing an async getText().
 * @returns {Promise<string[]>} One string per input element, in input order.
 */
async function extractText(elements) {
  const readOne = async (node) => {
    let content = '';
    try {
      content = await node.getText();
    } catch (err) {
      const isStale = err.message.includes('stale element reference');
      if (!isStale) {
        throw err;
      }
    }
    return content;
  };

  const pending = elements.map(node => readOne(node));
  return Promise.all(pending);
}
/**
* Merge an array of text into a single string.
*
* @param {string[]} text - Array of text.
* @returns {Promise} A promise that resolves to the merged text.
*/
async function mergeText(text) {
return text
.map(t => Array.isArray(t)? t.join('\n').trim() : t.trim())
.filter(t => t.length)
.join('\n');
}
// Expose extractArticle as this module's single export.
module.exports = extractArticle;
Function Breakdown: extractArticle
Extracts the article content from a given HTML page in plain text.
Parameters:
- driver: Selenium WebDriver instance. If not provided, it's created using getClient().
- startPage: URL of the page to extract the article from. If not provided, the function returns immediately.
- retry: Flag marking that this call is already the retry after a page crash. When it is not set, a page crash triggers one retry.
Returns: The extracted article content in plain text.
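Steps:
1. Checks whether driver is provided. If not, it creates a new instance using getClient().
2. Checks whether startPage is provided. If not, the function returns immediately.
3. Loads the page using driver.get(startPage).
4. Selects the text elements of the page using selectDom() and an XPath expression.
5. Reads the text of each element into the bodyText array, skipping elements that raise a stale element reference error.
6. Trims the entries of the bodyText array, removes the empty ones, and joins the remaining strings into a single string.
7. Handles errors: if the error is not a page crash, it quits the driver and throws the error. If the page crashes, it calls itself again with retry set to true.
Retry behavior on a page crash:
- If retry is false, the function tries to extract the article again.
- If retry is true, the function throws the error.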