**crawlRecursive(url, depth, searches)**:
The `crawlRecursive` function is a recursive web crawler. It starts at an initial URL (or array of URLs), requests each page in a browser, extracts the link and stylesheet URLs it finds, and stores the results in a per-domain cache, recursing into the newly discovered links until the requested depth is reached. The crawl proceeds in five steps: crawling, cache management, link extraction, recursion, and termination.
```bash
npm run import -- "crawl domain"
```
```javascript
var {URL} = require('url')
var fs = require('fs');
var path = require('path');
var importer = require('../Core');
var {doBrowserRequest} = importer.import("browser crawler tools")
var {
  cacheFilename,
  existingCache,
  storeCache,
  readCache,
  rmhash
} = importer.import("domain cache tools")

async function crawlRecursive(url, depth, searches) {
  if(!depth) depth = 3 // TODO: minutes depth using time range?
  url = (typeof url === 'string' ? [url] : url)

  // searches2 keeps track of new pages that should be added if searches is not provided
  // this guarantees at least one page will be requested when this is called
  const searches2 = []
  for(var i = 0; i < url.length; i++) {
    var l = url[i]
    try {
      await doBrowserRequest(l, readCache.bind(null, searches || searches2),
                             storeCache.bind(null, searches || searches2))
    } catch (e) {
      console.log(e)
    }
  }

  // TODO: fix this
  // push old cache on to bottom of current searches,
  // so we are always getting at least the page requested
  if(!searches) searches = searches2.concat(existingCache(url[0]))

  var existing = searches.map(s => rmhash(s.url))
  var links = searches2
    // TODO: pattern defensive programming
    .map(s => {
      var styles = s.styles || []
      var links = s.links || []
      return styles.concat(links)
    })
    .flat()
    // do not include hash in actual link to the page
    .map(s => rmhash(s))
    // keep only the first occurrence (remove duplicates)
    .filter((l, i, arr) => arr.indexOf(l) === i
      // filter out all existing urls
      && !existing.includes(rmhash(l))
      // filter out data uris and other non-crawlable schemes
      && !l.includes('data:') && !l.includes('mailto:')
      && !l.includes('javascript:') && !l.includes('ios-app:'))

  if(depth > 1) {
    return await crawlRecursive(links, depth - 1, searches)
  }

  // close the browser
  await doBrowserRequest(false)

  // save the database
  var filePath = cacheFilename(searches[0] ? searches[0].url : url[0])
  fs.writeFileSync(filePath, JSON.stringify(searches, null, 2))
}

async function crawlAll(url, depth, searches) {
  try {
    await crawlRecursive(url, depth, searches)
  } catch (e) {
    console.log(e)
    await doBrowserRequest(false)
  }
}

module.exports = crawlAll

//var importer = require('../Core')
//var crawlAll = importer.import("crawl domain")

if(typeof $ !== 'undefined') {
  $.async()
  crawlAll('https://google.com', 2)
    .then(r => $.sendResult('done'))
    .catch(e => $.sendError(e))
}
```
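Outside the notebook `$` context, the commented-out lines above suggest the module is meant to be loaded through the Core importer. A minimal usage sketch along those lines (the URL and depth are illustrative, not taken from the source):

```javascript
// Hypothetical standalone usage, assuming the Core importer resolves
// "crawl domain" to the exported crawlAll function.
var importer = require('../Core')
var crawlAll = importer.import("crawl domain")

// Crawl the start page plus one level of discovered links,
// then write the collected entries to the domain's cache file.
crawlAll('https://example.com', 2)
  .then(() => console.log('crawl finished'))
  .catch(e => console.error(e))
```

A cleaned-up version of the same module, with JSDoc comments and an extra filter that keeps only `http(s)` links, follows: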
```javascript
const { URL } = require('url');
const fs = require('fs');
const path = require('path');
const importer = require('../Core');

const { doBrowserRequest } = importer.import('browser crawler tools');
const {
  cacheFilename,
  existingCache,
  storeCache,
  readCache,
  rmhash,
} = importer.import('domain cache tools');

/**
 * Crawls a URL recursively.
 *
 * @param {string|Array} url - The URL(s) to crawl.
 * @param {number} depth - The maximum recursion depth. Defaults to 3.
 * @param {Array} searches - The accumulated crawl results.
 * @returns {Promise} A promise that resolves once the crawl has finished and
 *   the results have been written to the cache file.
 */
async function crawlRecursive(url, depth, searches) {
  if (depth === undefined) depth = 3; // default depth
  url = (typeof url === 'string' ? [url] : url);

  // searches2 collects new pages so at least one request is made per call
  const searches2 = [];

  for (const l of url) {
    try {
      await doBrowserRequest(l,
        readCache.bind(null, searches || searches2),
        storeCache.bind(null, searches || searches2));
    } catch (e) {
      console.error(e);
    }
  }

  // if no running result set was passed in, seed it with the new pages
  // followed by the existing cache for the first URL
  if (!searches) {
    searches = searches2.concat(existingCache(url[0]));
  }

  const existing = new Set(searches.map((s) => rmhash(s.url)));

  const links = searches2
    .map((s) => {
      const styles = s.styles || [];
      const pageLinks = s.links || [];
      return styles.concat(pageLinks);
    })
    .flat()
    // do not include the hash in the actual link to the page
    .map((s) => rmhash(s))
    // keep only the first occurrence of each link
    .filter((l, i, arr) => arr.indexOf(l) === i)
    // drop URLs that have already been crawled
    .filter((l) => !existing.has(l))
    // drop non-crawlable schemes
    .filter((l) => !l.includes('data:') && !l.includes('mailto:')
      && !l.includes('javascript:') && !l.includes('ios-app:'))
    // keep only http(s) URLs
    .filter((l) => l.includes('http'));

  if (depth > 1) {
    return await crawlRecursive(links, depth - 1, searches);
  }

  // close the browser
  await doBrowserRequest(false);

  // save the results to the domain's cache file
  const filePath = cacheFilename(searches[0] ? searches[0].url : url[0]);
  fs.writeFileSync(filePath, JSON.stringify(searches, null, 2));
}

/**
 * Crawls a URL and its links recursively, closing the browser on failure.
 *
 * @param {string|Array} url - The URL(s) to crawl.
 * @param {number} depth - The maximum recursion depth.
 * @param {Array} searches - Previously collected crawl results.
 * @returns {Promise} A promise that resolves when the crawl completes.
 */
async function crawlAll(url, depth, searches) {
  try {
    await crawlRecursive(url, depth, searches);
  } catch (e) {
    console.error(e);
    await doBrowserRequest(false);
  }
}

module.exports = crawlAll;
```
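For reference, the cache file written by `crawlRecursive` is a JSON array of page entries. Only the `url`, `links`, and `styles` fields are read by this module; the exact shape comes from the `domain cache tools` helpers, so the entry below is an illustrative sketch rather than the definitive format:

```javascript
// Illustrative sketch of one entry in the saved JSON cache (assumed shape).
// Only url, links, and styles are relied on by crawlRecursive; any other
// fields produced by the "domain cache tools" helpers are omitted here.
const exampleEntry = {
  url: 'https://example.com/',
  links: [
    'https://example.com/about',
    'https://example.com/contact#form' // hashes are stripped by rmhash before recursing
  ],
  styles: [
    'https://example.com/css/site.css'
  ]
};
```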
**Function Breakdown: crawlRecursive(url, depth, searches)**

**Parameters:**

- `url`: The initial URL to start crawling from. Can be a string or an array of strings.
- `depth`: The maximum recursion depth. Defaults to 3.
- `searches`: An array of previously crawled search results.

**Functionality:**

1. **Initialization:** If `depth` is not provided, it defaults to 3. If `url` is a string, it is wrapped in an array. A `searches2` array is created to store newly crawled pages.
2. **Crawling:** The function iterates over the `url` array and makes a browser request to each URL using `doBrowserRequest`. The cache is read and written via `readCache` and `storeCache` respectively.
3. **Cache Management:** If no `searches` array is provided, one is built by concatenating the new pages in `searches2` with the existing cache for the first URL.
4. **Link Extraction:** The pages in `searches2` are processed to extract link and stylesheet URLs, stripping hashes, removing duplicates and already-crawled URLs, and dropping `data:`, `mailto:`, `javascript:`, and `ios-app:` URIs (a standalone sketch of this step appears below).
5. **Recursion:** If `depth` is greater than 1, the function calls itself recursively with the extracted links, a decreased `depth`, and the updated `searches` array.
6. **Termination:** When `depth` reaches 1, the function closes the browser and writes the accumulated `searches` array to the cache file returned by `cacheFilename`.

**Notable Notes:**

- The function uses `bind` to pass `searches` (or `searches2`) as the cache argument to the `readCache` and `storeCache` callbacks given to `doBrowserRequest`.
- The `rmhash` function is used to remove the hash from URLs.
- The `existingCache` function retrieves the existing cache for a given URL.
- The `storeCache` function stores newly crawled pages in the cache.
- The `readCache` function reads pages back from the cache.
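The link-extraction step described above can be exercised in isolation. The sketch below mirrors the chain used in `crawlRecursive`, with a stand-in `rmhash` that simply strips the URL fragment (an assumption about the real helper's behavior) and a hypothetical `extractNewLinks` wrapper:

```javascript
// Stand-in for the library's rmhash helper: assumed to strip the URL fragment.
function rmhash(u) {
  return u.split('#')[0]
}

// Mirrors the extraction chain in crawlRecursive: collect styles + links,
// strip hashes, de-duplicate, drop already-crawled URLs and non-crawlable schemes.
function extractNewLinks(pages, alreadyCrawled) {
  const existing = new Set(alreadyCrawled.map(rmhash))
  return pages
    .map(p => (p.styles || []).concat(p.links || []))
    .flat()
    .map(rmhash)
    .filter((l, i, arr) => arr.indexOf(l) === i) // keep first occurrence only
    .filter(l => !existing.has(l))               // skip URLs already crawled
    .filter(l => !l.includes('data:') && !l.includes('mailto:')
              && !l.includes('javascript:') && !l.includes('ios-app:'))
}

// Example:
// extractNewLinks(
//   [{ url: 'https://example.com/', links: ['https://example.com/a#top', 'mailto:hi@example.com'] }],
//   ['https://example.com/']
// ) // => ['https://example.com/a']
```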