data collection | multi crawl | domain cache tools

crawlRecursive(url, depth, searches):

crawlRecursive is a recursive web crawler. Starting from an initial URL (or list of URLs), it fetches each page via doBrowserRequest, extracts the links and stylesheet URLs it finds, stores the results in a per-domain cache, and repeats for the newly discovered links until the requested depth is reached. Its work falls into six steps: initialization, crawling, cache management, link extraction, recursion, and termination, where the browser is closed and the cache is written to disk.

Run example

npm run import -- "crawl domain"

crawl domain

var {URL} = require('url')
var fs = require('fs');
var path = require('path');
var importer = require('../Core');
var {doBrowserRequest} = importer.import("browser crawler tools")
var {
    cacheFilename,
    existingCache,
    storeCache,
    readCache,
    rmhash
} = importer.import("domain cache tools")

async function crawlRecursive(url, depth, searches) {
    if(!depth) depth = 3 // TODO: minutes depth using time range?
    url = (typeof url === 'string' ? [url] : url )
    // searches2 keeps track of new pages that should be added if searches is not provided
    //   this guarantees at least one page will be requested when this is called
    const searches2 = []
    for(var i = 0; i < url.length; i++) {
        var l = url[i]
        try {
            await doBrowserRequest(l, readCache.bind(null, searches || searches2),
                                   storeCache.bind(null, searches || searches2))
        } catch (e) {
            console.log(e)
        }
    }
    
    // TODO: fix this
    
    // push old cache on to bottom of current searches,
    //   so we are always getting at least the page requested
    if(!searches) searches = searches2.concat(existingCache(url[0]))
    
    var existing = searches.map(s => rmhash(s.url))
    var links = searches2
    // TODO: pattern defensive programming
        .map(s => {
            var styles = s.styles || []
            var links = s.links || []
            return styles.concat(links)
        })
        .flat()
        // do not include hash in actual link to the page
        .map(s => rmhash(s))
        // filter out all but the first occurrence
        .filter((l, i, arr) => arr.indexOf(l) === i
        // filter out all existing urls
                && !existing.includes(rmhash(l))
        // filter out data uris
                && !l.includes('data:') && !l.includes('mailto:')
                && !l.includes('javascript:') && !l.includes('ios-app:'))
    
    if(depth > 1) {
        return await crawlRecursive(links, depth - 1, searches)
    }
    
    // close the browser
    await doBrowserRequest(false)
    
    // save the database
    var filePath = cacheFilename(searches[0] ? searches[0].url : url[0])
    fs.writeFileSync(filePath, JSON.stringify(searches, null, 2))
}

async function crawlAll(url, depth, searches) {
    try {
        await crawlRecursive(url, depth, searches)
    } catch (e) {
        console.log(e)
        await doBrowserRequest(false)
    }
}

module.exports = crawlAll

//var importer = require('../Core')
//var crawlAll = importer.import("crawl domain")

if(typeof $ !== 'undefined') {
    $.async()
    crawlAll('https://google.com', 2)
        .then(r => $.sendResult('done'))
        .catch(e => $.sendError(e))
}
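
The helpers imported from "domain cache tools" are not shown above. As a rough mental model only (the real module may differ), rmhash strips the #fragment from a URL so links to the same page compare equal, and cacheFilename maps any URL on a domain to that domain's JSON cache file. A minimal sketch under those assumptions, with the names suffixed to mark them as stand-ins:

// Hypothetical stand-ins for two "domain cache tools" helpers, inferred from how
// they are used above; the real implementations may differ.
var {URL} = require('url')
var path = require('path')

// assumed behaviour of rmhash: drop the #fragment so duplicate pages compare equal
function rmhashSketch(href) {
    return href.split('#')[0]
}

// assumed behaviour of cacheFilename: one JSON cache file per domain
function cacheFilenameSketch(href, cacheDir) {
    return path.join(cacheDir || '.', new URL(href).hostname + '.json')
}

console.log(rmhashSketch('https://example.com/page#section')) // https://example.com/page
console.log(cacheFilenameSketch('https://example.com/page'))  // example.com.json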

What the code could have been:

const { URL } = require('url');
const fs = require('fs');
const path = require('path');
const importer = require('../Core');

const { doBrowserRequest } = importer.import('browser crawler tools');
const {
  cacheFilename,
  existingCache,
  storeCache,
  readCache,
  rmhash,
} = importer.import('domain cache tools');

/**
 * Crawls a URL recursively.
 * 
 * @param {string|Array} url - The URL(s) to crawl.
 * @param {number} depth - The maximum depth to crawl.
 * @param {Array} searches - The current search results.
 * @returns {Promise<void>} Resolves once the crawl completes and the cache file is written.
 */
async function crawlRecursive(url, depth, searches) {
  if (depth === undefined) depth = 3; // Set default depth
  url = (typeof url === 'string' ? [url] : url);

  const searches2 = []; // New pages fetched on this pass (readCache/storeCache push entries into it)

  for (const l of url) {
    try {
      await doBrowserRequest(l, readCache.bind(null, searches || searches2), storeCache.bind(null, searches || searches2));
    } catch (e) {
      console.error(e);
    }
  }

  if (!searches) {
    searches = searches2.concat(existingCache(url[0])); // New pages first, old cache appended below them
  }

  const existing = new Set(searches.map((s) => rmhash(s.url)));

  const stylesAndLinks = searches2
   .map((s) => {
      const styles = s.styles || [];
      const links = s.links || [];
      return styles.concat(links);
    })
   .flat()
   .map((s) => rmhash(s))
   .filter((l, i, arr) => arr.indexOf(l) === i)
   .filter((l) => !existing.has(rmhash(l)))
   .filter((l) => !l.includes('data:') && !l.includes('mailto:') && !l.includes('javascript:') && !l.includes('ios-app:'))
   .filter((l) => l.includes('http')); // Filter out invalid URLs

  if (depth > 1) {
    return await crawlRecursive(stylesAndLinks, depth - 1, searches);
  }

  await doBrowserRequest(false); // Close the browser once the final depth is reached
  const filePath = cacheFilename(searches[0] ? searches[0].url : url[0]);
  fs.writeFileSync(filePath, JSON.stringify(searches, null, 2));
}

/**
 * Crawls a URL and its links recursively.
 * 
 * @param {string|Array} url - The URL(s) to crawl.
 * @param {number} depth - The maximum depth to crawl.
 * @param {Array} searches - The current search results.
 * @returns {Promise<void>} Resolves when the crawl finishes; the browser is closed even if it fails.
 */
async function crawlAll(url, depth, searches) {
  try {
    await crawlRecursive(url, depth, searches);
  } catch (e) {
    console.error(e);
    await doBrowserRequest(false); // Make sure the browser is closed on failure
  }
}

module.exports = crawlAll;
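
Usage of the refactored module is unchanged. For example, assuming it were saved as crawl-domain.js (inside the notebook it is still imported as "crawl domain"):

// Illustrative only: the file name crawl-domain.js is a placeholder.
const crawlAll = require('./crawl-domain'); // or: importer.import('crawl domain')

crawlAll('https://example.com', 2)
  .then(() => console.log('crawl finished, cache written'))
  .catch((e) => console.error(e));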

Function Breakdown: crawlRecursive(url, depth, searches)

Parameters:

  url - a URL string, or an array of URLs, to request on this pass.
  depth - maximum recursion depth; defaults to 3 when not supplied.
  searches - cache entries accumulated by previous passes; omitted on the first call.

Functionality:

  1. Initialization: normalizes url to an array, applies the default depth, and creates searches2 to collect pages fetched during this pass.

  2. Crawling: requests every URL with doBrowserRequest, wiring readCache and storeCache to the collection so each page's results land in the cache.

  3. Cache Management: on the first call, appends the existing on-disk cache for the domain (existingCache) below the newly fetched pages, so the requested page is always kept.

  4. Link Extraction: gathers links and stylesheet URLs from the new pages, strips URL fragments with rmhash, and drops duplicates, already-cached URLs, and data:, mailto:, javascript:, and ios-app: URIs (a standalone sketch of this step follows below).

  5. Recursion: while depth is greater than 1, calls itself with the extracted links and the accumulated searches.

  6. Termination: at the final depth, closes the browser with doBrowserRequest(false) and writes searches as pretty-printed JSON to the path returned by cacheFilename.

Notable Notes:

  crawlAll wraps crawlRecursive so the browser is still closed when a crawl throws, and it is what the module exports.
  The cache file is keyed off the first cached entry's URL, falling back to the first requested URL when the cache is empty.
  Two TODOs remain in the source: deriving depth from a time range and tightening the defensive link-extraction code.
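
For reference, the link-filtering logic described in step 4 can be exercised on its own. A minimal sketch, using a stand-in for rmhash (the real helper may do more than strip the fragment):

// Standalone sketch of the link-extraction step; rmhash here is a stand-in.
var rmhash = (href) => href.split('#')[0] // assumption: the real rmhash strips the #fragment

function extractNewLinks(pages, existing) {
    return pages
        .map(p => (p.styles || []).concat(p.links || []))
        .flat()
        .map(rmhash)
        // keep the first occurrence only, skip cached URLs and non-crawlable schemes
        .filter((l, i, arr) => arr.indexOf(l) === i
                && !existing.includes(l)
                && !l.includes('data:') && !l.includes('mailto:')
                && !l.includes('javascript:') && !l.includes('ios-app:'))
}

// extractNewLinks([{links: ['https://a.com/#top', 'mailto:me@a.com']}], [])
//   -> ['https://a.com/']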