data collection | multi crawl | domain cache tools

crawlRecursive(url, depth, searches):

crawlRecursive is a recursive web crawler. Starting from an initial URL (or list of URLs), it fetches each page via doBrowserRequest, extracts the links and stylesheet URLs it finds, stores the results in a per-domain cache, and repeats for the newly discovered links until the requested depth is reached. Its work falls into six steps: initialization, crawling, cache management, link extraction, recursion, and termination, where the browser is closed and the cache is written to disk.

Run example

npm run import -- "crawl domain"

crawl domain

var {URL} = require('url')
var fs = require('fs');
var path = require('path');
var importer = require('../Core');
var {doBrowserRequest} = importer.import("browser crawler tools")
var {
    cacheFilename,
    existingCache,
    storeCache,
    readCache,
    rmhash
} = importer.import("domain cache tools")

async function crawlRecursive(url, depth, searches) {
    if(!depth) depth = 3 // TODO: minutes depth using time range?
    url = (typeof url === 'string' ? [url] : url )
    // searches2 keeps track of new pages that should be added if searches is not provided
    //   this guarantees at least one page will be requested when this is called
    const searches2 = []
    for(var i = 0; i < url.length; i++) {
        var l = url[i]
        try {
            await doBrowserRequest(l, readCache.bind(null, searches || searches2),
                                   storeCache.bind(null, searches || searches2))
        } catch (e) {
            console.log(e)
        }
    }
    
    // TODO: fix this
    
    // push old cache on to bottom of current searches,
    //   so we are always getting at least the page requested
    if(!searches) searches = searches2.concat(existingCache(url[0]))
    
    var existing = searches.map(s => rmhash(s.url))
    var links = searches2
    // TODO: pattern defensive programming
        .map(s => {
            var styles = s.styles || []
            var links = s.links || []
            return styles.concat(links)
        })
        .flat()
        // do not include hash in actual link to the page
        .map(s => rmhash(s))
        // filter out all but the first occurrence
        .filter((l, i, arr) => arr.indexOf(l) === i
        // filter out all existing urls
                && !existing.includes(rmhash(l))
        // filter out data uris
                && !l.includes('data:') && !l.includes('mailto:')
                && !l.includes('javascript:') && !l.includes('ios-app:'))
    
    if(depth > 1) {
        return await crawlRecursive(links, depth - 1, searches)
    }
    
    // close the browser
    await doBrowserRequest(false)
    
    // save the database
    var filePath = cacheFilename(searches[0] ? searches[0].url : url[0])
    fs.writeFileSync(filePath, JSON.stringify(searches, null, 2))
}

async function crawlAll(url, depth, searches) {
    try {
        await crawlRecursive(url, depth, searches)
    } catch (e) {
        console.log(e)
        await doBrowserRequest(false)
    }
}

module.exports = crawlAll

//var importer = require('../Core')
//var crawlAll = importer.import("crawl domain")

if(typeof $ !== 'undefined') {
    $.async()
    crawlAll('https://google.com', 2)
        .then(r => $.sendResult('done'))
        .catch(e => $.sendError(e))
}
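
The helpers imported from "domain cache tools" are not shown above. As a rough mental model only (the real module may differ), rmhash strips the #fragment from a URL so links to the same page compare equal, and cacheFilename maps any URL on a domain to that domain's JSON cache file. A minimal sketch under those assumptions, with the names suffixed to mark them as stand-ins:

// Hypothetical stand-ins for two "domain cache tools" helpers, inferred from how
// they are used above; the real implementations may differ.
var {URL} = require('url')
var path = require('path')

// assumed behaviour of rmhash: drop the #fragment so duplicate pages compare equal
function rmhashSketch(href) {
    return href.split('#')[0]
}

// assumed behaviour of cacheFilename: one JSON cache file per domain
function cacheFilenameSketch(href, cacheDir) {
    return path.join(cacheDir || '.', new URL(href).hostname + '.json')
}

console.log(rmhashSketch('https://example.com/page#section')) // https://example.com/page
console.log(cacheFilenameSketch('https://example.com/page'))  // example.com.json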

What the code could have been:

const { URL } = require('url');
const fs = require('fs');
const path = require('path');
const importer = require('../Core');

const { doBrowserRequest } = importer.import('browser crawler tools');
const {
  cacheFilename,
  existingCache,
  storeCache,
  readCache,
  rmhash,
} = importer.import('domain cache tools');

/**
 * Crawls a URL recursively.
 * 
 * @param {string|Array} url - The URL(s) to crawl.
 * @param {number} depth - The maximum depth to crawl.
 * @param {Array} searches - The current search results.
 * @returns {Promise<void>} Resolves once the crawl completes and the cache file is written.
 */
async function crawlRecursive(url, depth, searches) {
  if (depth === undefined) depth = 3; // Set default depth
  url = (typeof url === 'string' ? [url] : url);

  const searches2 = []; // New pages fetched on this pass (readCache/storeCache push entries into it)

  for (const l of url) {
    try {
      await doBrowserRequest(l, readCache.bind(null, searches || searches2), storeCache.bind(null, searches || searches2));
    } catch (e) {
      console.error(e);
    }
  }

  if (!searches) {
    searches = searches2.concat(existingCache(url[0])); // New pages first, old cache appended below them
  }

  const existing = new Set(searches.map((s) => rmhash(s.url)));

  const stylesAndLinks = searches2
   .map((s) => {
      const styles = s.styles || [];
      const links = s.links || [];
      return styles.concat(links);
    })
   .flat()
   .map((s) => rmhash(s))
   .filter((l, i, arr) => arr.indexOf(l) === i)
   .filter((l) => !existing.has(rmhash(l)))
   .filter((l) => !l.includes('data:') && !l.includes('mailto:') && !l.includes('javascript:') && !l.includes('ios-app:'))
   .filter((l) => l.includes('http')); // Filter out invalid URLs

  if (depth > 1) {
    return await crawlRecursive(stylesAndLinks, depth - 1, searches);
  }

  await doBrowserRequest(false); // Close the browser once the final depth is reached
  const filePath = cacheFilename(searches[0] ? searches[0].url : url[0]);
  fs.writeFileSync(filePath, JSON.stringify(searches, null, 2));
}

/**
 * Crawls a URL and its links recursively.
 * 
 * @param {string|Array} url - The URL(s) to crawl.
 * @param {number} depth - The maximum depth to crawl.
 * @param {Array} searches - The current search results.
 * @returns {Promise<void>} Resolves when the crawl finishes; the browser is closed even if it fails.
 */
async function crawlAll(url, depth, searches) {
  try {
    await crawlRecursive(url, depth, searches);
  } catch (e) {
    console.error(e);
    await doBrowserRequest(false); // Make sure the browser is closed on failure
  }
}

module.exports = crawlAll;
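
Usage of the refactored module is unchanged. For example, assuming it were saved as crawl-domain.js (inside the notebook it is still imported as "crawl domain"):

// Illustrative only: the file name crawl-domain.js is a placeholder.
const crawlAll = require('./crawl-domain'); // or: importer.import('crawl domain')

crawlAll('https://example.com', 2)
  .then(() => console.log('crawl finished, cache written'))
  .catch((e) => console.error(e));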

Function Breakdown: crawlRecursive(url, depth, searches)

Parameters:

  url - a URL string, or an array of URLs, to request on this pass.
  depth - maximum recursion depth; defaults to 3 when not supplied.
  searches - cache entries accumulated by previous passes; omitted on the first call.

Functionality:

  1. Initialization: normalizes url to an array, applies the default depth, and creates searches2 to collect pages fetched during this pass.

  2. Crawling: requests every URL with doBrowserRequest, wiring readCache and storeCache to the collection so each page's results land in the cache.

  3. Cache Management: on the first call, appends the existing on-disk cache for the domain (existingCache) below the newly fetched pages, so the requested page is always kept.

  4. Link Extraction: gathers links and stylesheet URLs from the new pages, strips URL fragments with rmhash, and drops duplicates, already-cached URLs, and data:, mailto:, javascript:, and ios-app: URIs (a standalone sketch of this step follows below).

  5. Recursion: while depth is greater than 1, calls itself with the extracted links and the accumulated searches.

  6. Termination: at the final depth, closes the browser with doBrowserRequest(false) and writes searches as pretty-printed JSON to the path returned by cacheFilename.

Notable Notes:

  crawlAll wraps crawlRecursive so the browser is still closed when a crawl throws, and it is what the module exports.
  The cache file is keyed off the first cached entry's URL, falling back to the first requested URL when the cache is empty.
  Two TODOs remain in the source: deriving depth from a time range and tightening the defensive link-extraction code.
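
For reference, the link-filtering logic described in step 4 can be exercised on its own. A minimal sketch, using a stand-in for rmhash (the real helper may do more than strip the fragment):

// Standalone sketch of the link-extraction step; rmhash here is a stand-in.
var rmhash = (href) => href.split('#')[0] // assumption: the real rmhash strips the #fragment

function extractNewLinks(pages, existing) {
    return pages
        .map(p => (p.styles || []).concat(p.links || []))
        .flat()
        .map(rmhash)
        // keep the first occurrence only, skip cached URLs and non-crawlable schemes
        .filter((l, i, arr) => arr.indexOf(l) === i
                && !existing.includes(l)
                && !l.includes('data:') && !l.includes('mailto:')
                && !l.includes('javascript:') && !l.includes('ios-app:'))
}

// extractNewLinks([{links: ['https://a.com/#top', 'mailto:me@a.com']}], [])
//   -> ['https://a.com/']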