
This code snippet is part of a web crawler / data-extraction tool that loads previously scraped pages from a local cache. It parses the requested URL, matches it against the cached crawl data, and inlines the page's stylesheets and images so the copy can be rendered on its own.
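The loader expects each crawl file to be a JSON array of records. The field names below are inferred from how the code reads them (url for matching, html for captured pages, content for cached assets); they are not a published schema:

[
    { "url": "https://google.com/", "html": "<html>...</html>" },
    { "url": "https://google.com/style.css", "content": "body { ... }" },
    { "url": "https://google.com/logo.png", "content": "data:image/png;base64,..." },
    { "url": "https://google.com/favicon.ico", "content": "data:image/x-icon;base64,..." }
]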

Run example

npm run import -- "read crawl files"

read crawl files

var path = require('path')
var fs = require('fs')
var {URL} = require('url')
var uuid = require('uuid/v1')
var importer = require('../Core')
var {glob} = importer.import("glob files")
var {minimatch} = importer.import("minimatch")
var {selectDom} = importer.import("select tree")
var prefixCssRules = importer.import("scope css")
var {findCache} = importer.import("domain crawler tools")

var PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE || '';
var project = path.join(PROFILE_PATH, 'Collections/crawls');

function matchPage(match, search, hostname) {
    return search.toLowerCase() == match.toLowerCase()
        || minimatch(search, match)
        || (!match || match === 'index')
        && search.match(/https?:\/\/[^\/]*\/?$/ig)
        && search.includes(hostname)
}

function loadScraped(url) {
    if(typeof url == 'undefined') {
        url = 'https://google.com'
    }
    if(typeof url == 'string') {
        url = new URL(url);
    }
    //console.log(url)
    var host = url.hostname
    var file = url.pathname
    var hostEscaped = host.replace(/[^a-z0-9_-]/ig, '_')
    if(!file || file === '/') file = 'index'
    
    // lookup on filesystem
    var cache = findCache(hostEscaped)
    if(!cache[0]) {
        return
    }
    var crawl = JSON.parse(fs.readFileSync(cache[0]).toString());
    var entry = crawl.filter(r => matchPage(file, r.url, host))[0];
    var result = {}
    //console.log(entry)
    // parse out styles and images and package it up in to one nice page
    if(entry) {
        var doc = selectDom('*', entry.html)
        var styles = selectDom(['//link[@rel = "stylesheet"]|//style'], doc)
        var css = ''
        styles.forEach(s => {
            var src = s.getAttribute('src') || s.getAttribute('href')
            s.remove()
            if(!src) {
                css += s.innerHTML
                return
            }
            src = new URL(src, url).href
            var rules = crawl.filter(r => r.url === src)[0]
            if(rules) {
                css += rules.content
            }
        })
        
        // drop scripts and iframes so nothing in the cached copy executes
        var scripts = selectDom(['//script|//iframe'], doc)
        scripts.forEach(s => s.remove())
        
        // swap <img> sources for cached data: URIs when the crawl stored them
        var images = selectDom(['//img'], doc)
        images.forEach(i => {
            var src = i.getAttribute('src')
            src = new URL(src, url).pathname
            var images = crawl.filter(r => r.url.includes(src))[0]
            if(images && images.content.includes('data:')) {
                i.setAttribute('src', images.content)
            }
            var src = i.getAttribute('srcset')
            if(src) {
                src = src.split(' ')[0]
                src = new URL(src, url).pathname
                var images = crawl.filter(r => r.url.includes(src))[0]
                if(images && images.content.includes('data:')) {
                    i.setAttribute('src', images.content)
                    i.removeAttribute('srcset')
                }
            }
        })
        
        // rewrite anchors so navigation routes back through the viewer as /?url=...
        var links = selectDom(['//a'], doc)
        links.forEach(l => {
            var src = l.getAttribute('href')
            src = new URL(src, url).href
            l.setAttribute('href', '/?url=' + src)
        })
        
        var bodyId = selectDom('body', doc).getAttribute('id')
        
        // replace url(...) references in CSS with cached data: URIs where available
        var urlReplace = ($0, $1) => {
            if(!$1 || $1.length === 0) return $0
            var src = new URL($1, url).pathname
            var images = crawl.filter(r => r.url.includes(src))[0]
            if(images && images.content.includes('data:')) {
                return `url(${images.content})`
            }
            return $0
        }
        // TODO: load images as data URIs and lower quality
        css = prefixCssRules(css, '#' + hostEscaped, bodyId)
            .replace(/url\s*\(['"]*([^\)]*?)['"]*\)/ig, urlReplace)
            .replace(/href="([^\"]*)"/ig, ($0, $1) => {
                var src = new URL($1, url).href
                return $0.replace($1, '/?url=' + src)
            })
        
        var styleTags = selectDom(['//*[@style]'], doc)
        styleTags.forEach(i => {
            var style = i.getAttribute('style')
                .replace(/url\s*\(['"]*([^\)]*?)['"]*\)/ig, urlReplace)
            i.setAttribute('style', style)
        })

        var icon = (crawl.filter(r => r.url.includes('favicon.ico'))[0] || {}).content
        
        // inject the editor into copied page
        var body = selectDom('//body', doc)
        var classes = body.getAttribute('class')
        result[file.replace(/[^a-z0-9_-]/ig, '_')] = `
<!DOCTYPE html>
<html><head>
<link rel="icon" href="${icon}">
<link type="text/css" rel="stylesheet" href="https://pro.fontawesome.com/releases/v5.10.0/css/fontawesome.css" />
<link type="text/css" rel="stylesheet" href="https://pro.fontawesome.com/releases/v5.10.0/css/solid.css" />
<link type="text/css" rel="stylesheet" href="https://golden-layout.com/files/latest/css/goldenlayout-base.css" />
<link type="text/css" rel="stylesheet" href="https://golden-layout.com/files/latest/css/goldenlayout-dark-theme.css" />
<link type="text/css" rel="stylesheet" href="" />
<style>${css}</style>
<style>
body, html {
    margin: 0;
    padding: 0;
}

.initial-page {
    height: 100%;
    width: 100%;
}

.lm_item_container {
    overflow: auto;
}

.lm_content {
    min-height: 400px;
    overflow: auto;
    z-index: 1;
}

.lm_header .lm_tab {
    height: 18px;
    font-size: 18px;
    padding-bottom: 10px;
    padding-right: 30px;
    padding-top: 10px;
}

.lm_header .lm_tab.lm_active {
    padding-top: 10px;
    padding-bottom: 10px;
}

.lm_header .lm_tab .lm_close_tab {
    height: 20px;
    width: 20px;
    background-size: 75%;
}

.lm_controls>li {
    background-size: 15px;
    padding: 5px 5px;
}

.lm_header .lm_controls>li,
.lm_maximised .lm_controls .lm_maximise,
.lm_header .lm_tab .lm_close_tab {
    text-align: unset;
    background: none;
    color: white;
}

.lm_maximised .lm_controls .lm_maximise .fa-window-maximize,
.lm_controls .lm_maximise .fa-window-minimize {
    display: none;
}

.lm_maximised .lm_controls .lm_maximise .fa-window-minimize {
    display: inline-block;
}

.lm_header .lm_tab .fa-search,
.lm_header .lm_tab .fa-stop,
.lm_header .lm_tab .fa-stream {
    top: 4px;
    right: 80px;
}

.lm_header .lm_tab .fa-print,
.lm_header .lm_tab .fa-backspace,
.lm_header .lm_tab .fa-columns {
    top: 4px;
    right: 58px;
}

.lm_header .lm_tab .fa-binoculars,
.lm_header .lm_tab .fa-play,
.lm_header .lm_tab .fa-exchange-alt {
    top: 4px;
    right: 102px;
}

.lm_header .lm_tab {
    padding-right: 110px;
}

.file-tree {
    color: white;
    position: relative;
    z-index: 1;
}

.file-tree i.fas,
.file-tree i.fas:not(.fa-folder):not(.fa-angle-right) {
    margin-right: 5px;
    margin-left: 25px;
}


.file-tree i.fa-folder,
.file-tree i.fa-angle-right {
    margin-left: 5px;
}

.file-tree li {
    list-style: none;
    cursor: pointer;
    padding: 5px 0 0;
    overflow: hidden;
}

.file-tree label {
    cursor: pointer;
}

.file-tree ul {
    list-style: none;
    padding: 0;
    margin: 5px 0 5px 10px;
    overflow: hidden;
    text-overflow: ellipsis;
}

.file-tree ul ul {
    list-style: none;
    padding: 0 0 0 20px;
    margin: 0;
}

.file-tree input[type="checkbox"] {
    opacity: 0;
    height: 1px;
    width: 1px;
    position: absolute;
    top: auto;
    left: -100px;
}

.file-tree input[type="checkbox"]:not(:checked) ~ ul {
    display: none;
}

.file-tree input[type="checkbox"]:checked ~ ul {
    display: block;
}

.file-tree input[type="checkbox"]:not(:checked) ~ label .fa-angle-right {
    transition: transform 0.5s;
    transform: rotate(0);
}

.file-tree input[type="checkbox"]:checked ~ label .fa-angle-right {
    transition: transform 0.5s;
    transform: rotate(90deg);
}

.file-tree label:hover:before {
    background: #333;
}

.file-tree input[type="checkbox"]:checked ~ label:before {
    background: #339;
}

.file-tree label:after,
.file-tree label:before {
    z-index: -1;
    width: 100%;
    content: " ";
    display: inline-block;
    position: absolute;
    height: 20px;
    left: 0;
}

.file-tree label:after {
    z-index: 0;
}

/*
*,
:before,
:after {
    padding: 0;
    margin: 0;
    box-sizing: border-box;
    max-width: 100%;
}
*/

.code-editor {
    position: absolute;
    top: 0;
    right: 0;
    bottom: 0;
    left: 0;
}

</style>
</head><body><div id="${hostEscaped}" class="initial-page"><div class="${classes}">${body.innerHTML}</div></div>
<script>
${importer.interpret('ckeditor configuration').code}
</script>
</body></html>`
    }
    return result
}

module.exports = loadScraped

//var importer = require('../Core')
//var loadScraped = importer.import("read crawl files")

if(typeof $ != 'undefined') {
    var scraped = loadScraped('https://google.com')
    $.html(scraped)
}
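Every rewritten link points at /?url=..., so the exported loadScraped is presumably mounted behind an HTTP front end. A minimal sketch of such a wrapper, assuming the cell is importable through the Core importer as shown above (the port and fallback response are illustrative, not part of this cell):

var http = require('http')
var {URL} = require('url')
var importer = require('../Core')
var loadScraped = importer.import("read crawl files")

http.createServer((req, res) => {
    // pull the target out of /?url=... and fall back to the default page
    var target = new URL(req.url, 'http://localhost').searchParams.get('url') || 'https://google.com'
    var pages = loadScraped(target) || {}
    var html = Object.values(pages)[0]
    res.writeHead(html ? 200 : 404, {'Content-Type': 'text/html'})
    res.end(html || 'page not found in crawl cache')
}).listen(8080)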

What the code could have been:

const path = require('path');
const fs = require('fs');
const { URL } = require('url');
const { v1: uuidv1 } = require('uuid');
const glob = require('glob');
const minimatch = require('minimatch');
const { selectDom } = require('./select-tree'); // assumed local equivalent of the "select tree" Core module (XPath-capable)
const { prefixCssRules } = require('./scope-css');
const { findCache } = require('./domain-crawler-tools');
const importer = require('./Core');

const PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE || '';
const project = path.join(PROFILE_PATH, 'Collections/crawls');

const matchPage = (match, search, hostname) => {
  return search.toLowerCase() === match.toLowerCase()
    || minimatch(search, match)
    || (!match || match === 'index')
    && search.match(/https?:\/\/[^\/]*\/?$/ig)
    && search.includes(hostname);
};

const loadScraped = (url = 'https://google.com') => {
  if (typeof url !== 'string') url = url.href;

  const parsedUrl = new URL(url);
  const host = parsedUrl.hostname;
  const hostEscaped = host.replace(/[^a-z0-9_-]/ig, '_');

  let file = parsedUrl.pathname;
  if (!file || file === '/') file = 'index';

  const cache = findCache(hostEscaped);
  if (!cache[0]) return {};

  const crawl = JSON.parse(fs.readFileSync(cache[0]).toString());
  const entry = crawl.filter(r => matchPage(file, r.url, host))[0];
  if (!entry) return {};

  const { html } = entry;
  const doc = selectDom('*', html);
  const styles = selectDom(['//link[@rel = "stylesheet"]|//style'], doc);

  const css = styles.reduce((acc, style) => {
    const src = style.getAttribute('src') || style.getAttribute('href');
    style.remove();
    if (!src) return acc + style.innerHTML;

    // resolve relative hrefs against the page URL before looking them up in the crawl
    const resolved = new URL(src, url).href;
    const linkedCss = crawl.find(r => r.url === resolved);
    if (linkedCss) {
      return acc + linkedCss.content;
    }

    return acc;
  }, '');

  const scripts = selectDom(['//script|//iframe'], doc);
  scripts.forEach(s => s.remove());

  const images = selectDom(['//img'], doc);
  images.forEach(i => {
    const src = i.getAttribute('src');
    const imageSrc = new URL(src, url).pathname;
    const imagesEntry = crawl.find(r => r.url.includes(imageSrc));
    if (imagesEntry && imagesEntry.content.includes('data:')) {
      i.setAttribute('src', imagesEntry.content);
    }
    const srcset = i.getAttribute('srcset');
    if (srcset) {
      const imageSrcSet = srcset.split(' ')[0];
      const imageSrcSetUrl = new URL(imageSrcSet, url).pathname;
      const imagesEntry = crawl.find(r => r.url.includes(imageSrcSetUrl));
      if (imagesEntry && imagesEntry.content.includes('data:')) {
        i.setAttribute('src', imagesEntry.content);
        i.removeAttribute('srcset');
      }
    }
  });

  const links = selectDom(['//a'], doc);
  links.forEach(l => {
    const src = l.getAttribute('href');
    const linkSrc = new URL(src, url).href;
    l.setAttribute('href', `/?url=${linkSrc}`);
  });

  const bodyId = selectDom('body', doc).getAttribute('id');

  const urlReplace = ($0, $1) => {
    if (!$1 || $1.length === 0) return $0;
    const src = new URL($1, url).pathname;
    const imagesEntry = crawl.find(r => r.url.includes(src));
    if (imagesEntry && imagesEntry.content.includes('data:')) {
      return `url(${imagesEntry.content})`;
    }
    return $0;
  };

  const styleTags = selectDom(['//*[@style]'], doc);
  styleTags.forEach(i => {
    const style = i.getAttribute('style')
     .replace(/url\s*\(['"]*([^\)]*?)['"]*\)/ig, urlReplace);
    i.setAttribute('style', style);
  });

  const icon = (crawl.find(r => r.url.includes('favicon.ico')) || {}).content;

  const body = selectDom('//body', doc);
  const classes = body.getAttribute('class');

  // scope the collected CSS under the host wrapper and inline cached url() assets
  const scopedCss = prefixCssRules(css, '#' + hostEscaped, bodyId)
    .replace(/url\s*\(['"]*([^\)]*?)['"]*\)/ig, urlReplace)
    .replace(/href="([^\"]*)"/ig, ($0, $1) => $0.replace($1, '/?url=' + new URL($1, url).href));

  const result = {
    [file.replace(/[^a-z0-9_-]/ig, '_')]: `
<!DOCTYPE html>
<html><head>
<link rel="icon" href="${icon}">
<link type="text/css" rel="stylesheet" href="https://pro.fontawesome.com/releases/v5.10.0/css/fontawesome.css" />
<link type="text/css" rel="stylesheet" href="https://pro.fontawesome.com/releases/v5.10.0/css/solid.css" />
<link type="text/css" rel="stylesheet" href="https://golden-layout.com/files/latest/css/goldenlayout-base.css" />
<link type="text/css" rel="stylesheet" href="https://golden-layout.com/files/latest/css/goldenlayout-dark-theme.css" />
<style>${scopedCss}</style>
<!-- the editor chrome styles from the original cell would be inlined here as well -->
</head><body><div id="${hostEscaped}" class="initial-page"><div class="${classes}">${body.innerHTML}</div></div>
<script>
${importer.interpret('ckeditor configuration').code}
</script>
</body></html>`,
  };

  return result;
};

module.exports = loadScraped;

This code loads a previously scraped page from the local crawl cache and repackages it as a single self-contained HTML document.

Here's a breakdown:

1. Dependencies: Node built-ins (path, fs, url) plus Core imports for glob matching, minimatch, XPath selection ("select tree"), CSS scoping ("scope css"), and crawl-cache lookup ("domain crawler tools").

2. Configuration: PROFILE_PATH resolves the user's home directory across platforms, and project points at Collections/crawls, where crawl files are kept.

3. matchPage Function: decides whether a cached entry's URL matches the requested page, by case-insensitive equality, by minimatch pattern, or by the special "index" rule for a site's root URL (see the example after this list).

4. loadScraped Function: parses the URL, finds the crawl file with findCache, picks the matching entry, inlines cached stylesheets and images, strips scripts and iframes, rewrites links to /?url=..., scopes the CSS under the host wrapper, injects the editor script, and returns the assembled HTML keyed by the escaped page name.

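For instance, matchPage accepts a site's root URL under the "index" rule and otherwise requires an exact or pattern match. The URLs below are illustrative, and the function is copied from the cell so the snippet runs standalone:

var { minimatch } = require('minimatch')

// same logic as the cell above, repeated here for a standalone check
function matchPage(match, search, hostname) {
    return search.toLowerCase() == match.toLowerCase()
        || minimatch(search, match)
        || (!match || match === 'index')
        && search.match(/https?:\/\/[^\/]*\/?$/ig)
        && search.includes(hostname)
}

console.log(!!matchPage('index', 'https://google.com/', 'google.com'))      // true: root URL of the matching host
console.log(!!matchPage('index', 'https://google.com/about', 'google.com')) // false: not the site root
console.log(!!matchPage('/teams', '/teams', 'google.com'))                  // true: exact match on the stored URL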