The htmlToTree function takes an HTML string or tree as input and returns the corresponding tree structure by parsing and recursively processing its child nodes using the accumulateChildNodes function.
Alternatively, you can also summarize it in two sentences:
The htmlToTree function uses the JSDOM library to parse the input HTML and processes its child nodes using the accumulateChildNodes function. It accepts HTML strings, arrays, text nodes, and element nodes; for element nodes it converts the attributes to properties and recursively processes the children.
npm run import -- "html to tree"
var {JSDOM} = require('jsdom');
// TODO: create a pattern same as notebook markdown in core, accumulate
/**
 * Folds the child nodes of a DOM node into a plain object.
 *
 * Element children carrying a `parent-attr` attribute are stored under the
 * property that attribute names; when the container declares
 * `<name>-type="Array"` they are collected into an array under that property.
 * All other element children are appended to `children`. Text found between
 * elements is buffered and appended to the `comments` array of the next
 * converted child that has one.
 *
 * @param {Node} body - DOM node whose `childNodes` are converted.
 * @returns {Object} Accumulated properties; `{}` when there are no element children.
 */
function accumulateChildNodes(body) {
  let commentBuffer = [];
  // TODO: exclude children parent properties
  // left-over children are assigned to children: []
  return Array.from(body.childNodes).reduce((obj, n) => {
    if (n.nodeName === '#text') {
      // `push[n]` was a property lookup, so the text was silently dropped.
      commentBuffer.push(n.nodeValue);
      return obj;
    }
    if (n.nodeName === '#comment') {
      // Comment nodes have no attribute API; calling getAttribute would throw.
      return obj;
    }
    const parent = n.getAttribute('parent-attr');
    const newNode = htmlToTree(n);
    if (parent) {
      const parentType = body.getAttribute(`${parent}-type`);
      if (parentType === 'Array') {
        if (typeof obj[parent] === 'undefined') {
          obj[parent] = [];
        }
        obj[parent].push(newNode);
      } else {
        obj[parent] = newNode;
      }
    } else {
      // TODO: if no children left and no other -attr properties,
      // remove the child property from the output
      if (typeof obj.children === 'undefined') {
        obj.children = [];
      }
      obj.children.push(newNode);
    }
    // `comments` comes from a parsed attribute, so it may be a non-array
    // value; only append when it really is a list.
    if (Array.isArray(newNode.comments)) {
      newNode.comments.push(...commentBuffer);
      commentBuffer = [];
    }
    return obj;
  }, {});
}
/**
 * Converts HTML markup, or an already parsed DOM node, into a plain tree.
 *
 * - string: parsed with JSDOM; returns the list of converted top-level
 *   element nodes (an empty list when the markup has none).
 * - array: each entry is converted recursively.
 * - text node: its `nodeValue` is returned.
 * - element node: returns `{type, ...attributes, ...accumulated children}`.
 *
 * @param {string|Array|Node} body - HTML source, a list of inputs, or a DOM node.
 * @returns {Array|Object|string} The converted tree.
 */
function htmlToTree(body) {
  if (typeof body === 'string') {
    const dom = new JSDOM(body);
    // Markup without element children produces no `children` key; return an
    // empty list so callers can always iterate the result.
    return accumulateChildNodes(dom.window.document.body).children || [];
  }
  if (Array.isArray(body)) {
    return body.map((t) => htmlToTree(t));
  }
  if (body.nodeName === '#text') {
    return body.nodeValue;
  }
  // convert attributed object containers back to properties
  // TODO: might have a property named -type and it will be lost
  const attrs = {};
  for (const name of body.getAttributeNames()) {
    if (name === 'parent-attr' || name.endsWith('-type')) {
      continue;
    }
    const raw = body.getAttribute(name);
    if (body.getAttribute(`${name}-type`) === 'string') {
      attrs[name] = raw;
      continue;
    }
    try {
      attrs[name] = JSON.parse(raw);
    } catch (e) {
      // Deliberate fallback: values that are not valid JSON stay raw strings.
      attrs[name] = raw;
    }
  }
  const children = accumulateChildNodes(body);
  // Accumulated child properties are merged last, so they win over attributes.
  return Object.assign({ type: body.nodeName }, attrs, children);
}
// Public API: only the converter entry point is exposed.
module.exports = { htmlToTree };
const { JSDOM } = require('jsdom');
/**
 * Accumulate the child nodes of a DOM node into a plain object.
 *
 * Comment nodes are ignored. Text nodes are buffered and appended to the
 * `comments` array of the next converted child that has one. Element children
 * with a `parent-attr` attribute are stored under the property it names
 * (as an array when the container declares `<name>-type="Array"`); all other
 * element children are appended to `children`.
 *
 * Keys are only created at the moment a node is stored under them, so the
 * result never contains empty entries that would need pruning.
 *
 * @param {Node} body DOM node whose `childNodes` are converted.
 * @returns {object} The accumulated properties; `{}` when there are no element children.
 */
function accumulateChildNodes(body) {
  const pendingText = [];
  const result = {};
  for (const node of Array.from(body.childNodes)) {
    if (node.nodeName === '#comment') {
      continue;
    }
    if (node.nodeName === '#text') {
      pendingText.push(node.nodeValue);
      continue;
    }
    const parent = node.getAttribute('parent-attr');
    const newNode = htmlToTree(node);
    if (parent) {
      if (body.getAttribute(`${parent}-type`) === 'Array') {
        if (!result[parent]) {
          result[parent] = [];
        }
        result[parent].push(newNode);
      } else {
        result[parent] = newNode;
      }
    } else {
      if (!result.children) {
        result.children = [];
      }
      result.children.push(newNode);
    }
    // `comments` originates from a parsed attribute and may be any JSON
    // value; only append when it is actually a list.
    if (Array.isArray(newNode.comments)) {
      newNode.comments.push(...pendingText);
      pendingText.length = 0;
    }
  }
  return result;
}
/**
 * Convert an HTML string or DOM node to a plain tree structure.
 *
 * - string: parsed with JSDOM; returns the list of converted top-level
 *   element nodes (an empty list when the markup has none).
 * - array: each entry is converted recursively.
 * - text node: its `nodeValue` is returned.
 * - element node: returns `{type, ...attributes, ...accumulated children}`.
 *
 * @param {string|Array|Node} body The HTML string, list of inputs, or DOM node.
 * @returns {Array|object|string} The converted tree structure.
 */
function htmlToTree(body) {
  if (typeof body === 'string') {
    const dom = new JSDOM(body);
    // Return the node list itself rather than the accumulator wrapper, and
    // fall back to an empty list when no element children were found.
    return accumulateChildNodes(dom.window.document.body).children || [];
  }
  if (Array.isArray(body)) {
    return body.map((t) => htmlToTree(t));
  }
  if (body.nodeName === '#text') {
    return body.nodeValue;
  }

  const attrs = {};
  for (const attr of body.getAttributeNames()) {
    // `parent-attr` and `*-type` are conversion metadata, not node data.
    if (attr === 'parent-attr' || attr.endsWith('-type')) {
      continue;
    }
    const value = body.getAttribute(attr);
    if (body.getAttribute(`${attr}-type`) === 'string') {
      attrs[attr] = value;
      continue;
    }
    try {
      attrs[attr] = JSON.parse(value);
    } catch (e) {
      // Deliberate fallback: values that are not valid JSON stay raw strings.
      attrs[attr] = value;
    }
  }

  // The accumulator already returns `{children: [...], <parent-attr>: ...}`;
  // merge it into the node instead of nesting it under another `children`
  // key, so parent-attr properties and `comments` sit on the node itself.
  const accumulated = accumulateChildNodes(body);
  return Object.assign({ type: body.nodeName }, attrs, accumulated);
}
module.exports = { htmlToTree };
Code Breakdown
var { JSDOM } = require('jsdom');
The code starts by importing the JSDOM class from the jsdom library, which is used to parse HTML.
accumulateChildNodes
function accumulateChildNodes(body) {
//...
}
This function takes an HTML element (body) as input and recursively processes its child nodes, accumulating the results in an object. The function uses the reduce method to iterate over the child nodes.
return Array.from(body.childNodes)
.reduce((obj, n) => {
//...
}, {});
The code iterates over the child nodes of the input element using Array.from(body.childNodes). The reduce method is used to accumulate the results in an object.
if (n.nodeName === '#text') {
commentBuffer.push[n];
return obj;
}
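If the current node is a text node, it is meant to be buffered in the commentBuffer array, and the node is otherwise skipped. Note that commentBuffer.push[n] is a property lookup rather than a function call, so as written nothing is actually added to the buffer; commentBuffer.push(n.nodeValue) would be needed to store the text.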
var parent = n.getAttribute('parent-attr');
var newNode = htmlToTree(n);
if (parent) {
var parentType = body.getAttribute(parent + '-type');
if (parentType === 'Array') {
if (typeof obj[parent] === 'undefined')
obj[parent] = [];
obj[parent].push(newNode);
} else {
obj[parent] = newNode;
}
}
If the current node has a parent-attr attribute, its value names the property of the result object that receives the node. If the enclosing element's matching "<name>-type" attribute is set to 'Array', the node is appended to an array stored under that property; otherwise, the node is assigned to the property directly.
if (typeof obj.children === 'undefined')
obj.children = [];
obj.children.push(newNode);
If the current node has no parent and is an unattached child, it is pushed onto the children array of the current object.
if (typeof newNode.comments!== 'undefined') {
newNode.comments.push.apply(newNode.comments, commentBuffer);
commentBuffer = [];
}
If the current node has a comments property, it is updated with the values from the commentBuffer array.
htmlToTree
function htmlToTree(body) {
//...
}
This function takes an HTML string or a tree as input and returns the corresponding tree structure. The function uses the JSDOM library to parse the input HTML.
if (typeof body ==='string') {
var dom = new JSDOM(body);
return accumulateChildNodes(dom.window.document.body).children;
}
If the input is a string, it is parsed using JSDOM, and the resulting tree is processed by the accumulateChildNodes function.
if (Array.isArray(body)) {
return body.map(t => htmlToTree(t));
}
If the input is an array, each element is recursively processed by the htmlToTree function.
if (body.nodeName === '#text') {
return body.nodeValue;
}
If the input is a text node, its value is returned directly.
var attrs = body.getAttributeNames()
.forEach(attr => {
//...
});
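If the input is an element node, its attributes are converted to properties. The snippet above is abbreviated: the full implementation filters out the parent-attr and "-type" attributes, builds the property object with reduce rather than forEach, and then merges in the result of accumulateChildNodes.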