This code reads an HTML file, parses it, serializes it to XHTML, and then parses the XHTML string as an XML document using various Node.js modules. It then uses an XPath expression to select specific nodes from the XML document and logs the selected nodes to the console.
const fs = require('mz/fs');
const xpath = require('xpath');
const parse5 = require('parse5');
const xmlser = require('xmlserializer');
const dom = require('xmldom').DOMParser;
// Reads ./test.htm, converts it to XHTML, and logs the href attribute nodes
// of every <a> element found in the document.
(async () => {
  const html = await fs.readFile('./test.htm');
  // parse5 builds a document tree from the file's text.
  const document = parse5.parse(html.toString());
  // Serializing the tree gives an XML string that xmldom can parse.
  const xhtml = xmlser.serializeToString(document);
  const doc = new dom().parseFromString(xhtml);
  // Bind the prefix "x" to the XHTML namespace so that //x:a can be resolved.
  const select = xpath.useNamespaces({"x": "http://www.w3.org/1999/xhtml"});
  const nodes = select("//x:a/@href", doc);
  console.log(nodes);
})().catch((error) => {
  // The IIFE's promise is otherwise left floating: a failed read or parse
  // would surface only as an unhandled rejection. Report it explicitly and
  // keep a non-zero exit status.
  console.error(error);
  process.exitCode = 1;
});
// Import required modules
const fs = require('fs/promises'); // Promise-based fs API, so readFile can be awaited
const { parse } = require('parse5'); // Parses an HTML string into a document tree
const { serializeToString } = require('xmlserializer'); // Serializes the parsed tree to an XHTML string
const { DOMParser } = require('xmldom'); // Parses the XHTML string into an XML DOM document
const { select } = require('xpath'); // Plain XPath selector (no namespace prefixes bound)
const { useNamespaces } = require('xpath'); // Builds a selector bound to a prefix-to-namespace map
// Namespace URI bound to the XPath prefix, and the path of the HTML file to read
const XHTML_NAMESPACE = 'http://www.w3.org/1999/xhtml';
const FILE_PATH = './test.htm';
/**
 * Reads the HTML file at FILE_PATH, converts it to XHTML, and logs the href
 * attribute nodes of every <a> element in the XHTML namespace.
 *
 * Any error raised while reading, parsing, or querying is logged with
 * console.error and is not rethrown, so the returned promise never rejects.
 *
 * @returns {Promise<void>} Settles once the nodes (or the error) have been logged.
 */
async function extractHref() {
  try {
    // Read file asynchronously
    const html = await fs.readFile(FILE_PATH);
    // Parse HTML using parse5
    const document = parse(html.toString());
    // Serialize HTML to XML string
    const xhtml = serializeToString(document);
    // Parse XML string to DOM document
    const doc = new DOMParser().parseFromString(xhtml);
    // useNamespaces returns a selector with the prefix "x" already bound.
    const selectNamespace = useNamespaces({ x: XHTML_NAMESPACE });
    // The bound selector must run the query itself. The third argument of the
    // plain select() is the "single" flag, not a namespace resolver, so passing
    // the selector there leaves "x" unbound and the query cannot be resolved.
    const nodes = selectNamespace("//x:a/@href", doc);
    console.log(nodes);
  } catch (error) {
    // Log the error; it is not rethrown, so callers see a resolved promise.
    console.error(error);
  }
}
// Call async function to extract href attributes
extractHref();
const fs = require('mz/fs');
const xpath = require('xpath');
const parse5 = require('parse5');
const xmlser = require('xmlserializer');
const dom = require('xmldom').DOMParser;
This section imports the Node.js modules the script depends on:
- `fs` (mz/fs): A promise-based file system module, used here for reading files.
- `xpath`: A module for evaluating XPath expressions.
- `parse5`: A parser for HTML documents.
- `xmlser` (xmlserializer): A module for serializing a parsed document to an XML string.
- `dom` (xmldom): The `DOMParser` class from xmldom, a module for parsing and manipulating XML documents.
(async () => {
//... code...
})();
This is an immediately invoked async function, which executes its contents as soon as it's defined.
const html = await fs.readFile('./test.htm');
This line reads the contents of the file `test.htm` in the current directory and assigns it to the `html` variable.
const document = parse5.parse(html.toString());
This line parses the HTML document using `parse5.parse()` and assigns the resulting document node to the `document` variable.
const xhtml = xmlser.serializeToString(document);
This line serializes the parsed HTML document to an XHTML string using `xmlser.serializeToString()` and assigns it to the `xhtml` variable.
const doc = new dom().parseFromString(xhtml);
This line parses the XHTML string as an XML document using `xmldom.DOMParser.parseFromString()` and assigns the resulting document node to the `doc` variable.
const select = xpath.useNamespaces({"x": "http://www.w3.org/1999/xhtml"});
const nodes = select("//x:a/@href", doc);
This code:
1. Uses `xpath.useNamespaces()` to register a namespace with prefix `x` and URI `http://www.w3.org/1999/xhtml`.
2. Evaluates the XPath expression `//x:a/@href` on the `doc` document node to select all `href` attributes of elements with tag name `a` from the XHTML namespace.
3. Assigns the selected nodes to the `nodes` variable.
console.log(nodes);
This line logs the selected nodes to the console.