The `htmlToTree` function takes an HTML string or a parsed DOM tree as input and returns the corresponding tree structure, recursively processing child nodes with the `accumulateChildNodes` function.
Alternatively, it can be summarized in two sentences:
The `htmlToTree` function uses the JSDOM library to parse the input HTML and processes its child nodes using the `accumulateChildNodes` function. It handles HTML strings, arrays, and text nodes, and it converts each element's attributes back into properties of the resulting tree node.
npm run import -- "html to tree"
var {JSDOM} = require('jsdom');
// TODO: create a pattern same as notebook markdown in core, accumulate
/**
 * Folds the child nodes of a DOM node into a plain object.
 *
 * - An element child with a `parent-attr` attribute is stored under the
 *   property named by that attribute: appended to an array when `body`
 *   declares `<name>-type="Array"`, otherwise assigned directly (a later
 *   sibling with the same name overwrites an earlier one).
 * - Any other element child is appended to `children`.
 * - Text nodes are not emitted as children. Their values are buffered and
 *   appended to the `comments` array of the next converted element that has
 *   one, after which the buffer is cleared.
 * - Nodes without attribute support (e.g. `#comment`) are skipped.
 *
 * @param {Node} body DOM node whose `childNodes` are converted.
 * @returns {Object} The accumulated properties; `{}` when `body` has no
 *   element children.
 */
function accumulateChildNodes(body) {
  let pendingText = [];
  // TODO: exclude children parent properties
  // left-over children are assigned to children: []
  return Array.from(body.childNodes).reduce((obj, n) => {
    if (n.nodeName === '#text') {
      // Previously `commentBuffer.push[n]`: a property lookup, not a call,
      // so nothing was ever buffered. Store the text value, which is also
      // how htmlToTree represents a text node.
      pendingText.push(n.nodeValue);
      return obj;
    }
    if (typeof n.getAttribute !== 'function') {
      // Non-element nodes such as `#comment` carry no attributes and would
      // otherwise throw on the `parent-attr` lookup below.
      return obj;
    }
    const parent = n.getAttribute('parent-attr');
    const newNode = htmlToTree(n);
    if (parent) {
      const parentType = body.getAttribute(`${parent}-type`);
      if (parentType === 'Array') {
        if (typeof obj[parent] === 'undefined') {
          obj[parent] = [];
        }
        obj[parent].push(newNode);
      } else {
        obj[parent] = newNode;
      }
    } else {
      // TODO: if no children left and no other -attr properties,
      // remove the child property from the output
      if (typeof obj.children === 'undefined') {
        obj.children = [];
      }
      obj.children.push(newNode);
    }
    // Only flush into a real array; `comments` may also be null or a scalar
    // when it comes from a JSON-parsed attribute.
    if (Array.isArray(newNode.comments)) {
      newNode.comments.push(...pendingText);
      pendingText = [];
    }
    return obj;
  }, {});
}
/**
 * Converts HTML into a plain tree of objects.
 *
 * Expects a string or a tree from JSDOM:
 * - string: parsed with JSDOM; the `children` collected from the document
 *   body are returned (`undefined` when nothing was collected);
 * - array: every entry is converted recursively;
 * - text node: its `nodeValue` is returned;
 * - anything else is treated as an element and becomes
 *   `{ type: nodeName, ...attributes, ...accumulated child properties }`.
 *
 * Attribute values are JSON-parsed unless the element declares
 * `<name>-type="string"`; values that fail to parse are kept as raw strings.
 * The `parent-attr` attribute and every attribute ending in `-type` are
 * bookkeeping only and are not copied to the result.
 *
 * @param {string|Array|Node} body HTML source, list of nodes, or a DOM node.
 * @returns {*} The converted tree (see the cases above).
 */
function htmlToTree(body) {
  if (typeof body === 'string') {
    const { document } = new JSDOM(body).window;
    return accumulateChildNodes(document.body).children;
  }
  if (Array.isArray(body)) {
    return body.map((item) => htmlToTree(item));
  }
  if (body.nodeName === '#text') {
    return body.nodeValue;
  }

  // convert attributed object containers back to properties
  // TODO: might have a property named -type and it will be lost
  const props = {};
  for (const name of body.getAttributeNames()) {
    if (name === 'parent-attr' || name.endsWith('-type')) {
      continue;
    }
    const raw = body.getAttribute(name);
    let value;
    try {
      const declaredString = body.getAttribute(`${name}-type`) === 'string';
      value = declaredString ? raw : JSON.parse(raw);
    } catch (e) {
      // Best effort: a value that is not valid JSON stays a plain string.
      value = raw;
    }
    props[name] = value;
  }

  const accumulated = accumulateChildNodes(body);
  return Object.assign({ type: body.nodeName }, props, accumulated);
}
module.exports = {
htmlToTree
};
const { JSDOM } = require('jsdom');
/**
 * Accumulate the child nodes of a DOM node into a plain object.
 *
 * - `#comment` nodes are ignored.
 * - Text nodes are not emitted as children; their values are buffered and
 *   appended to the `comments` array of the next converted element that has
 *   one, after which the buffer is cleared.
 * - An element child with a `parent-attr` attribute is stored under the
 *   property named by that attribute: appended to an array when `body`
 *   declares `<name>-type="Array"`, otherwise assigned directly.
 * - Any other element child is appended to `children`.
 *
 * Properties are only created when a child is stored, so the result never
 * contains empty entries and needs no clean-up pass.
 *
 * @param {Node} body The DOM node whose `childNodes` are converted.
 * @returns {object} The accumulated properties; `{}` when `body` has no
 *   element children.
 */
function accumulateChildNodes(body) {
  const commentBuffer = [];
  return Array.from(body.childNodes)
    .filter((n) => n.nodeName !== '#comment')
    .reduce((obj, n) => {
      if (n.nodeName === '#text') {
        commentBuffer.push(n.nodeValue);
        return obj;
      }
      const parent = n.getAttribute('parent-attr');
      const newNode = htmlToTree(n);
      if (parent) {
        const parentType = body.getAttribute(`${parent}-type`);
        if (parentType === 'Array') {
          if (!obj[parent]) {
            obj[parent] = [];
          }
          obj[parent].push(newNode);
        } else {
          obj[parent] = newNode;
        }
      } else {
        if (!obj.children) {
          obj.children = [];
        }
        obj.children.push(newNode);
      }
      // Only flush into a real array; `comments` may be a scalar when it
      // comes from a JSON-parsed attribute.
      if (Array.isArray(newNode.comments)) {
        newNode.comments.push(...commentBuffer);
        commentBuffer.length = 0;
      }
      return obj;
    }, {});
  // The former "remove empty properties" pass was dropped: arrays here are
  // never empty and nodes are never falsy, so its only possible effect was
  // to delete a valid node whose own `length` property happened to be 0.
}
/**
 * Convert an HTML string or DOM tree to a tree of plain objects.
 *
 * - string: parsed with JSDOM; returns the object accumulated from the
 *   document body by `accumulateChildNodes`;
 * - array: every entry is converted recursively;
 * - text node: its `nodeValue` is returned;
 * - anything else is treated as an element and becomes
 *   `{ type: nodeName, ...attributes, ...accumulated child properties }`.
 *
 * Attribute values are JSON-parsed unless the element declares
 * `<name>-type="string"`; values that are not valid JSON are kept as raw
 * strings. The `parent-attr` attribute and every attribute ending in `-type`
 * are bookkeeping only and are not copied to the result.
 *
 * @param {string|Array|Node} body The HTML string, list of nodes, or DOM node.
 * @returns {*} The converted tree (see the cases above).
 */
function htmlToTree(body) {
  if (typeof body === 'string') {
    const dom = new JSDOM(body);
    return accumulateChildNodes(dom.window.document.body);
  }
  if (Array.isArray(body)) {
    return body.map((t) => htmlToTree(t));
  }
  if (body.nodeName === '#text') {
    return body.nodeValue;
  }

  const attrs = body
    .getAttributeNames()
    .filter((attr) => attr !== 'parent-attr' && !attr.endsWith('-type'))
    .reduce((obj, attr) => {
      const value = body.getAttribute(attr);
      try {
        obj[attr] =
          body.getAttribute(`${attr}-type`) !== 'string' ? JSON.parse(value) : value;
      } catch (e) {
        // Best effort: a value that is not valid JSON stays a plain string.
        obj[attr] = value;
      }
      return obj;
    }, {});

  // Merge the accumulated child properties onto the node itself. Nesting
  // them under `children` produced `children: { children: [...] }` and hid
  // `parent-attr` properties (including `comments`) one level too deep.
  const accumulated = accumulateChildNodes(body);
  return Object.assign({ type: body.nodeName }, attrs, accumulated);
}
module.exports = { htmlToTree };
Code Breakdown
var { JSDOM } = require('jsdom');
The code starts by importing the `JSDOM` class from the `jsdom` library, which is used to parse HTML.
accumulateChildNodes
function accumulateChildNodes(body) {
//...
}
This function takes an HTML element (body
) as input and recursively processes its child nodes, accumulating the results in an object. The function uses the reduce
method to iterate over the child nodes.
return Array.from(body.childNodes)
.reduce((obj, n) => {
//...
}, {});
The code iterates over the child nodes of the input element using Array.from(body.childNodes)
. The reduce
method is used to accumulate the results in an object.
if (n.nodeName === '#text') {
commentBuffer.push[n];
return obj;
}
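If the current node is a text node, it is meant to be buffered in the `commentBuffer` array, and the node is skipped. Note that `commentBuffer.push[n]` is a property lookup rather than a function call, so as written nothing is actually added to the buffer; `commentBuffer.push(n.nodeValue)` would buffer the text.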
var parent = n.getAttribute('parent-attr');
var newNode = htmlToTree(n);
if (parent) {
var parentType = body.getAttribute(parent + '-type');
if (parentType === 'Array') {
if (typeof obj[parent] === 'undefined')
obj[parent] = [];
obj[parent].push(newNode);
} else {
obj[parent] = newNode;
}
}
If the current node has a `parent-attr` attribute, its value names the property of the result object under which the node is stored. If the containing element's `<name>-type` attribute (the `parent-attr` value with `-type` appended) is set to `'Array'`, the node is appended to an array stored under that property; otherwise, the node is assigned to that property directly.
if (typeof obj.children === 'undefined')
obj.children = [];
obj.children.push(newNode);
If the current node has no parent and is an unattached child, it is pushed onto the children
array of the current object.
if (typeof newNode.comments!== 'undefined') {
newNode.comments.push.apply(newNode.comments, commentBuffer);
commentBuffer = [];
}
If the current node has a comments
property, it is updated with the values from the commentBuffer
array.
htmlToTree
function htmlToTree(body) {
//...
}
This function takes an HTML string or a tree as input and returns the corresponding tree structure. The function uses the JSDOM
library to parse the input HTML.
if (typeof body ==='string') {
var dom = new JSDOM(body);
return accumulateChildNodes(dom.window.document.body).children;
}
If the input is a string, it is parsed using JSDOM
, and the resulting tree is processed by the accumulateChildNodes
function.
if (Array.isArray(body)) {
return body.map(t => htmlToTree(t));
}
If the input is an array, each element is recursively processed by the htmlToTree
function.
if (body.nodeName === '#text') {
return body.nodeValue;
}
If the input is a text node, its value is returned directly.
var attrs = body.getAttributeNames()
.forEach(attr => {
//...
});
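If the input is an element, its attributes are converted to properties. The snippet above is abbreviated: the actual implementation filters out `parent-attr` and `*-type` attribute names and then uses `reduce` (not `forEach`) to build the properties object, parsing each value with `JSON.parse` unless the matching `-type` attribute is `'string'`. The full implementation is complete and syntactically valid.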