syntax | Cell 15 | Cell 17 | Search

The htmlToTree function takes an HTML string or tree as input and returns the corresponding tree structure by parsing and recursively processing its child nodes using the accumulateChildNodes function.

Alternatively, you can also summarize it in two sentences:

The htmlToTree function uses the JSDOM library to parse an input HTML string and processes the resulting child nodes with the accumulateChildNodes function. It accepts HTML strings, arrays of nodes, text nodes, and elements; for elements it converts the attributes into properties of the resulting tree node and recursively converts the child nodes.

Run example

npm run import -- "html to tree"

html to tree

var {JSDOM} = require('jsdom');

// TODO: create a pattern same as notebook markdown in core, accumulate
/**
 * Folds the child nodes of a DOM element into a plain object.
 *
 * Element children carrying a `parent-attr` attribute are stored under the
 * property named by that attribute: appended to an array when `body` has a
 * matching `<name>-type="Array"` attribute, assigned directly otherwise.
 * Element children without `parent-attr` are appended to `children`.
 * Text node values are buffered and appended to the `comments` array of the
 * next converted child that has one.
 *
 * @param {Element} body - element whose `childNodes` are converted
 * @returns {Object} accumulated properties; `{}` when there is nothing to add
 */
function accumulateChildNodes(body) {
    let commentBuffer = [];
    // TODO: exclude children parent properties
    //   left-over children are assigned to children: []
    return Array.from(body.childNodes)
        .reduce((obj, n) => {
            if(n.nodeName === '#text') {
                // NOTE(review): buffers the text value so the output stays
                //   plain data; the old `push[n]` was a no-op property lookup
                commentBuffer.push(n.nodeValue);
                return obj;
            }
            // nodes without attributes (e.g. #comment) cannot be converted
            if(typeof n.getAttribute !== 'function') {
                return obj;
            }
            const parent = n.getAttribute('parent-attr');
            const newNode = htmlToTree(n);
            if(parent) {
                const parentType = body.getAttribute(parent + '-type');
                if(parentType === 'Array') {
                    // Array.isArray also rejects inherited members such as
                    //   `constructor`, which would otherwise break push()
                    if(!Array.isArray(obj[parent])) {
                        obj[parent] = [];
                    }
                    obj[parent].push(newNode);
                } else {
                    obj[parent] = newNode;
                }
            } else {
                // TODO: if no children left and no other -attr properties,
                //   remove the child property from the output
                if(typeof obj.children === 'undefined') {
                    obj.children = [];
                }
                obj.children.push(newNode);
            }
            // only an array can receive the buffered text; a `comments`
            //   attribute that parsed to a string or number is left alone
            if(Array.isArray(newNode.comments)) {
                newNode.comments.push(...commentBuffer);
                commentBuffer = [];
            }
            return obj;
        }, {});
}

/**
 * Converts HTML into a plain tree.
 *
 * - string: parsed with JSDOM; the `children` collected from the document
 *   body are returned
 * - array: every entry is converted in turn
 * - `#text` node: its `nodeValue` is returned
 * - anything else is treated as an element and becomes
 *   `{ type, ...attributes, ...accumulated child properties }`
 *
 * @param {string|Array|Node} body - markup, list of nodes, or a single node
 * @returns {*} converted tree (see the cases above)
 */
function htmlToTree(body) {
    if (typeof body === 'string') {
        const { window } = new JSDOM(body);
        return accumulateChildNodes(window.document.body).children;
    }
    if (Array.isArray(body)) {
        return body.map(entry => htmlToTree(entry));
    }
    if (body.nodeName === '#text') {
        return body.nodeValue;
    }

    // convert attributed object containers back to properties
    // TODO: might have a property named -type and it will be lost
    const props = {};
    for (const name of body.getAttributeNames()) {
        // bookkeeping attributes never become properties themselves
        if (name === 'parent-attr' || name.slice(-5) === '-type') {
            continue;
        }
        const raw = body.getAttribute(name);
        try {
            // values are JSON unless explicitly tagged as plain strings
            props[name] = body.getAttribute(`${name}-type`) !== 'string'
                ? JSON.parse(raw)
                : raw;
        } catch (e) {
            // not valid JSON: keep the attribute text untouched
            props[name] = raw;
        }
    }

    const accumulated = accumulateChildNodes(body);

    return Object.assign({ type: body.nodeName }, props, accumulated);
}

module.exports = {
    htmlToTree
};

What the code could have been:

const { JSDOM } = require('jsdom');

/**
 * Accumulate the child nodes of an element into a tree-like object.
 *
 * `#comment` nodes are ignored. Text node values are buffered and appended
 * to the `comments` array of the next converted child that has one.
 * A child with a `parent-attr` attribute is stored under the property named
 * by that attribute (appended to an array when `body` carries
 * `<name>-type="Array"`, assigned directly otherwise); every other child is
 * appended to `children`.
 *
 * @param {Element} body The element whose child nodes are converted.
 * @returns {object} The accumulated properties; `{}` if nothing was added.
 */
function accumulateChildNodes(body) {
    const commentBuffer = [];
    const accumulated = {};
    const nodes = Array.from(body.childNodes)
        .filter(n => n.nodeName !== '#comment');

    for (const n of nodes) {
        if (n.nodeName === '#text') {
            commentBuffer.push(n.nodeValue);
            continue;
        }

        const parent = n.getAttribute('parent-attr');
        const newNode = htmlToTree(n);

        if (!parent) {
            if (!accumulated.children) accumulated.children = [];
            accumulated.children.push(newNode);
        } else if (body.getAttribute(`${parent}-type`) === 'Array') {
            // Array.isArray rejects inherited members (e.g. `constructor`)
            // that a plain truthiness check would mistake for an existing list
            if (!Array.isArray(accumulated[parent])) accumulated[parent] = [];
            accumulated[parent].push(newNode);
        } else {
            accumulated[parent] = newNode;
        }

        // A `comments` attribute may parse to a string or number; only an
        // array can receive the buffered text.
        if (Array.isArray(newNode.comments)) {
            newNode.comments.push(...commentBuffer);
            commentBuffer.length = 0;
        }
    }

    // Remove child properties with no children
    for (const key of Object.keys(accumulated)) {
        if (!accumulated[key] || accumulated[key].length === 0) {
            delete accumulated[key];
        }
    }

    return accumulated;
}

/**
 * Convert an HTML string or DOM tree to a tree-like structure.
 *
 * - string: parsed with JSDOM; returns the `children` accumulated from the
 *   document body (`undefined` when the body yields none)
 * - array: each entry is converted in turn
 * - `#text` node: returns its `nodeValue`
 * - any other node is treated as an element and becomes
 *   `{ type, ...attributes, ...accumulated child properties }`; accumulated
 *   child properties win over attributes of the same name
 *
 * @param {string|Array|Node} body The HTML string, node list, or node.
 * @returns {*} The converted tree structure.
 */
function htmlToTree(body) {
    if (typeof body === 'string') {
        const dom = new JSDOM(body);
        // Callers get the list of top-level nodes, not the accumulator object.
        return accumulateChildNodes(dom.window.document.body).children;
    }
    if (Array.isArray(body)) {
        return body.map(t => htmlToTree(t));
    }
    if (body.nodeName === '#text') {
        return body.nodeValue;
    }

    const attrs = Array.from(body.getAttributeNames())
        .filter(attr => attr !== 'parent-attr' && !attr.endsWith('-type'))
        .reduce((obj, attr) => {
            const value = body.getAttribute(attr);
            try {
                // Values are JSON unless explicitly tagged as plain strings.
                obj[attr] = body.getAttribute(`${attr}-type`) !== 'string'
                    ? JSON.parse(value)
                    : value;
            } catch (e) {
                // Not valid JSON: keep the raw attribute text.
                obj[attr] = value;
            }
            return obj;
        }, {});

    // Merge the accumulated properties (`children` and any `parent-attr`
    // slots) into the node itself instead of nesting them under `children`.
    return Object.assign({ type: body.nodeName }, attrs, accumulateChildNodes(body));
}

module.exports = { htmlToTree };

Code Breakdown

Importing JSDOM

var { JSDOM } = require('jsdom');

The code starts by importing the JSDOM class from the jsdom library, which is used to parse HTML strings into a DOM.

Function: accumulateChildNodes

function accumulateChildNodes(body) {
  //...
}

This function takes an HTML element (body) as input and recursively processes its child nodes, accumulating the results in an object. The function uses the reduce method to iterate over the child nodes.

Processing Child Nodes

return Array.from(body.childNodes)
 .reduce((obj, n) => {
    //...
  }, {});

The code iterates over the child nodes of the input element using Array.from(body.childNodes). The reduce method is used to accumulate the results in an object.

Handling Text Nodes

if (n.nodeName === '#text') {
  commentBuffer.push[n];
  return obj;
}

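If the current node is a text node, the intent is to buffer it in the commentBuffer array and move on to the next node. Note that, as written, `commentBuffer.push[n]` is a property lookup rather than a function call, so nothing is actually added to the buffer; a call such as `commentBuffer.push(n.nodeValue)` is needed for the buffering to take effect.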

Handling Attribute-Modified Children

var parent = n.getAttribute('parent-attr');
var newNode = htmlToTree(n);
if (parent) {
  var parentType = body.getAttribute(parent + '-type');
  if (parentType === 'Array') {
    if (typeof obj[parent] === 'undefined')
      obj[parent] = [];
    obj[parent].push(newNode);
  } else {
    obj[parent] = newNode;
  }
}

If the current node has a parent-attr attribute, its value names the property of the result object under which the converted node is stored. If the containing element (body) has an attribute named `<that value>-type` set to 'Array', the node is appended to an array stored under that property; otherwise, the node is assigned to the property directly, replacing any previous value.

Handling Unattached Children

if (typeof obj.children === 'undefined')
  obj.children = [];
obj.children.push(newNode);

If the current node has no parent-attr attribute (the else branch of the check above), it is pushed onto the children array of the result object, which is created on first use.

Handling Comments

if (typeof newNode.comments!== 'undefined') {
  newNode.comments.push.apply(newNode.comments, commentBuffer);
  commentBuffer = [];
}

If the converted node has a comments property, the values collected in commentBuffer are appended to it and the buffer is reset to an empty array. This assumes the comments property is an array.

Function: htmlToTree

function htmlToTree(body) {
  //...
}

This function takes an HTML string or a tree as input and returns the corresponding tree structure. The function uses the JSDOM library to parse the input HTML.

Parsing HTML Strings

if (typeof body ==='string') {
  var dom = new JSDOM(body);
  return accumulateChildNodes(dom.window.document.body).children;
}

If the input is a string, it is parsed using JSDOM, the document body is processed by the accumulateChildNodes function, and the children array of the result is returned.

Parsing Arrays

if (Array.isArray(body)) {
  return body.map(t => htmlToTree(t));
}

If the input is an array, each element is recursively processed by the htmlToTree function.

Processing Text Nodes

if (body.nodeName === '#text') {
  return body.nodeValue;
}

If the input is a text node, its value is returned directly.

Converting Attribute Objects to Properties

var attrs = body.getAttributeNames()
 .forEach(attr => {
    //...
  });

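For any other input (an element node), the element's attributes are converted to properties. The parent-attr attribute and every attribute whose name ends in -type are skipped. Each remaining value is parsed with JSON.parse unless its companion `<name>-type` attribute is 'string', and the raw attribute text is kept if parsing fails. The excerpt above is abbreviated: the full implementation uses filter and reduce rather than forEach, and it is complete. Finally, the result of accumulateChildNodes(body) is merged with these properties and a type property holding the node name.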