data collection | crawl domain | browser crawler tools | Search

This code snippet appears to be a Node.js module that handles caching web pages, importing various modules, and defining functions to cache and retrieve data based on URLs. The functions include caching file creation, searching for existing caches, checking cache validity, and storing cache data in files, with various options for cache restraint and URL sanitization.

Run example

npm run import -- "domain cache tools"

domain cache tools

var {URL} = require('url')
var fs = require('fs')
var path = require('path')
var importer = require('../Core')
var {glob} = importer.import("glob files")
var {getResponseContent} = importer.import("browser crawler tools")

//var PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE || '';
// NOTE(review): hard-coded to an external volume; the commented-out line above
// suggests this was once derived from the user's home directory — confirm intent.
var PROFILE_PATH = '/Volumes/External/Personal'
// Directory where the per-host, per-day crawl cache files are stored.
var project = path.join(PROFILE_PATH, 'Collections/crawls');

/**
 * Build the on-disk cache path for a URL: <project>/<host>-YYYY-M-D.json.
 * Month and day are NOT zero-padded (matches what existingCache parses back).
 * @param {string|URL} url - URL instance, or a string; bare strings get an
 *   http:// scheme prepended so the URL constructor can parse them.
 * @returns {string} absolute path of today's cache file for that host
 */
function cacheFilename(url) {
    if (typeof url === 'string') {
        const normalized = url.includes('://')
            ? url
            : 'http://' + url.replace(/^\/\//, '')
        url = new URL(normalized)
    }
    const now = new Date()
    const stamp = [now.getFullYear(), now.getMonth() + 1, now.getDate()].join('-')
    return path.join(project, safeurl(url.hostname) + '-' + stamp + '.json')
}

/**
 * List existing cache files for a URL's host, most recently modified first.
 * @param {string|URL} url - URL instance or string (scheme optional)
 * @returns {string[]} matching cache file paths, newest first
 */
function findCache(url) {
    if (typeof url === 'string') {
        const normalized = url.includes('://')
            ? url
            : 'http://' + url.replace(/^\/\//, '')
        url = new URL(normalized)
    }
    const pattern = '**/*' + safeurl(url.hostname) + '*'
    const matches = glob(pattern, project)
    // Sort by modification time, descending (newest cache first).
    const mtime = (file) => fs.statSync(file).mtime.getTime()
    matches.sort((a, b) => mtime(b) - mtime(a))
    return matches
}

/**
 * Load the most recent cache for a URL's host if it satisfies the freshness
 * restraint.
 *
 * restrain semantics:
 *   false  - always load the newest cache, regardless of age
 *   'week' - load only if the newest cache is less than a week old
 *   'day'  - load only if the newest cache file is today's file
 *   anything else (including undefined) - never load; return []
 *
 * @param {string|URL} url
 * @param {false|'week'|'day'|undefined} restrain
 * @returns {Array} parsed cache entries, or [] when no usable cache exists
 */
function existingCache(url, restrain) {
    var cache = findCache(url)
    var filePath = cacheFilename(url)
    // save pages from the same day in the same database using the url as the keys
    if(cache[0]) {
        // Recover the trailing YYYY-M-D stamp that cacheFilename() embedded in
        // the name; everything from the first '.' onward is stripped first.
        var segments = cache[0].replace(/\..*$/ig, '').split('-')
        var date = Date.parse(segments.slice(segments.length - 3).join('-'))
        var inAWeek = date + 1000*60*60*24*7
        if(restrain === false
          || (restrain === 'week' && inAWeek > Date.now())
          || (restrain === 'day' && cache[0] === filePath)) {
            // FIX: JSON.parse throws on a corrupt or empty cache file; the
            // original `|| []` only guarded against a falsy parse result
            // (e.g. the literal null). Fall back to [] instead of crashing.
            try {
                return JSON.parse(fs.readFileSync(cache[0])) || []
            } catch (e) {
                console.log(`Unreadable cache ${cache[0]}`, e)
                return []
            }
        }
    }
    return []
}

/**
 * Fold a network response into the in-memory cache array (mutates `cache`).
 * Entries are keyed case-insensitively by URL: an existing entry is replaced
 * in place, otherwise the new result is appended.
 * @param {Array<{url: string}>} cache - accumulated response entries
 * @param {object} response - response object exposing async headers()
 * @returns {Promise<void>}
 */
async function storeCache(cache, response) {
    const headers = await response.headers()
    const result = await getResponseContent(response, headers)
    // A response without a resolvable URL cannot be keyed — skip it.
    if (typeof result.url === 'undefined') {
        return
    }
    const target = result.url.toLowerCase()
    const index = cache.findIndex(entry => entry.url.toLowerCase() === target)
    if (index === -1) {
        console.log(`Received ${result.url}`)
        cache.push(result)
    } else {
        console.log(`Received existing ${result.url}`)
        cache[index] = result
    }
}

// source: https://stackoverflow.com/questions/53807574/how-to-block-ads-with-puppeteer-headless-chrome
// TODO: create a notebook for this and add easylist
var hosts;
/**
 * Lazily build and memoize a lookup of blocked hostnames from the bundled
 * hosts.txt (0.0.0.0-style blocklist). Subsequent calls return the cached map.
 * @returns {Object<string, true>} hostname -> true for every blocked host
 */
function adBlocker() {
    if (hosts) return hosts
    hosts = {}
    const hostFilePath = path.join(__dirname,
        '../Resources/Projects/adblocker/hosts.txt')
    const lines = fs.readFileSync(hostFilePath, 'utf8').split('\n')
    for (const line of lines) {
        const frags = line.split(' ')
        // hosts.txt maps blocked domains to 0.0.0.0; keep only those entries.
        if (frags.length > 1 && frags[0] === '0.0.0.0') {
            hosts[frags[1].trim()] = true
        }
    }
    return hosts
}

// Puppeteer request interceptor: serve matching responses from the in-memory
// cache, abort requests to ad/tracker hosts, and let everything else hit the
// network. Exactly one of request.continue()/abort()/respond() is invoked.
//
// cache   - array of entries with url/status/type/content and optional
//           location (shape assumed from usage here and in storeCache —
//           TODO confirm against getResponseContent)
// request - puppeteer Request object
function readCache(cache, request) {
    // data: URIs carry their own payload — nothing to cache or block.
    if(request.url().substr(0, 5) === 'data:') {
        return request.continue()
    }
    // Host appears in the ad-block hosts file — drop the request.
    if(adBlocker()[new URL(request.url()).host]) {
        return request.abort()
    }
    // Case-insensitive URL lookup; index is -1 (response undefined) on miss.
    var urls = cache.map(s => s.url.toLowerCase())
    var index = urls.indexOf(request.url().toLowerCase())
    var response = cache[index]
    // Loose == intentionally matches a numeric or string "200" status.
    if (response && response.status == 200) {
    // TODO: remove this restriction since we literally just downloaded it
    //    && response.expires
    //    && response.expires > Date.now()
    //    && response.content
    //if(response.type.includes('\n')) {
    //    debugger
    //}
        console.log(`Requesting cache ${response.status} ${response.type} ${request.url()}`)
        var headers = {
            'Access-Control-Allow-Origin': '*'
        }
        if(response.location) headers['Location'] = response.location
        // TODO: save in this format
        try {
            // Body: a data: URI is decoded from its base64 payload; anything
            // else (including missing content) becomes a utf8 buffer. Note
            // the ternary condition is (content && content starts with
            // 'data:') — missing content falls through to the utf8 branch.
            request.respond({
                contentType: response.type,
                headers: headers,
                status: response.status || 200,
                body: response.content
                    && response.content.substr(0, 5) === 'data:'
                    ? Buffer.from(response.content.split(',')[1], 'base64')
                    : Buffer.from(response.content || [], 'utf8'),
            })
        } catch (e) {
            // respond() can throw (e.g. request already handled); fall back
            // to letting the request continue to the network.
            console.log(e)
            request.continue()
        }
        return
    }
    // Cache miss (or non-200 cached status): go to the network.
    console.log(`Requesting ${request.url()}`)
    request.continue()
}

// TODO: move this to URL tools in Languages/html.ipynb with getAllLinks
// TODO: move this to URL tools in Languages/html.ipynb with getAllLinks
/**
 * Strip the fragment (everything from the first '#' to the end) from a URL.
 * @param {string} url
 * @returns {string} the URL without its fragment
 */
function rmhash(url) {
    const FRAGMENT = /#.*$/ig
    return url.replace(FRAGMENT, '')
}

// TODO: replace other occurrences with this function
// TODO: replace other occurrences with this function
/**
 * Sanitize a string for use in a filename: every character outside
 * [A-Za-z0-9_-] becomes '_', and the result is capped at 100 characters.
 * @param {string} url - typically a hostname
 * @returns {string} filesystem-safe token, at most 100 chars
 */
function safeurl(url) {
    const sanitized = url.replace(/[^a-z0-9_-]/ig, '_')
    return sanitized.slice(0, 100)
}

// Public interface of the domain cache tools module.
module.exports = {
    cacheFilename,
    findCache,
    existingCache,
    storeCache,
    readCache,
    rmhash,
    safeurl,
}

What the code could have been:

```javascript
const { URL } = require('url');
const fs = require('fs');
const path = require('path');
const { glob } = require('../Core/glob');
const { getResponseContent } = require('../Core/browser-crawler-tools');

const PROFILE_PATH = '/Volumes/External/Personal';
const project = path.join(PROFILE_PATH, 'Collections/crawls');
const adblockerFile = path.join(__dirname, '../Resources/Projects/adblocker/hosts.txt');

// Cache functions
function cacheFilename(url) {
    const time = new Date();
    const urlObj = new URL(url.includes('://')? url : ('http://' + url.replace(/^\/\//, '')));
    return path.join(
        project,
        safeurl(urlObj.hostname),
        `${time.getFullYear()}-${(time.getMonth() + 1).toString().padStart(2, '0')}-${time.getDate().toString().padStart(2, '0')}.json`
    );
}

function findCache(url) {
    const urlObj = new URL(url.includes('://')? url : ('http://' + url.replace(/^\/\//, '')));
    const host = safeurl(urlObj.hostname);
    const crawl = glob(`**/*${host}*/**/index.json`, project);
    crawl.sort((a, b) => fs.statSync(b).mtime.getTime() - fs.statSync(a).mtime.getTime());
    return crawl;
}

function existingCache(url, restrain = 'week') {
    const cache = findCache(url);
    const filePath = cacheFilename(url);
    if (cache[0]) {
        const segments = cache[0].replace(/\..*$/ig, '').split('-');
        const date = Date.parse(segments.slice(segments.length - 3).join('-'));
        const inAWeek = date + 1000 * 60 * 60 * 24 * 7;
        if (restrain === 'week' && inAWeek > Date.now()) {
            return JSON.parse(fs.readFileSync(cache[0]));
        } else if (restrain === 'day' && cache[0] === filePath) {
            return JSON.parse(fs.readFileSync(cache[0])) || [];
        }
    }
    return [];
}

// Store cache functions
async function storeCache(cache, response) {
    const headers = await response.headers();
    const result = await getResponseContent(response, headers);
    if (typeof result.url === 'undefined') return;
    const urls = cache.map(s => s.url.toLowerCase());
    const index = urls.indexOf(result.url.toLowerCase());
    if (index > -1) {
        console.log(`Received existing ${result.url}`);
        cache[index] = result;
    } else {
        console.log(`Received ${result.url}`);
        cache.push(result);
    }
    return cache;
}

// Ad blocker
const adBlocker = (() => {
    const hosts = {};
    const readHosts = () => {
        const hostFileContent = fs.readFileSync(adblockerFile, 'utf8').split('\n');
        hostFileContent.forEach(line => {
            const frags = line.split(' ');
            if (frags.length > 1 && frags[0] === '0.0.0.0') {
                hosts[frags[1].trim()] = true;
            }
        });
        return hosts;
    };
    return () => {
        if (Object.keys(hosts).length > 0) return hosts;
        readHosts();
        return hosts;
    };
})();

// Read cache functions
function rmhash(url) {
    return url.replace(/#.*$/ig, '');
}

function safeurl(url) {
    return url.replace(/[^a-z0-9_-]/ig, '_').substr(0, 100);
}

function readCache(cache, request) {
    if (request.url().substr(0, 5) === 'data:') return request.continue();
    if (adBlocker()[new URL(request.url()).host]) return request.abort();
    const url = new URL(request.url());
    const index = cache.find(item => item.url.toLowerCase() === url.href.toLowerCase());
    if (index && index.status === 200) {
        console.log(`Requesting cache ${index.status} ${index.type} ${request.url()}`);
        const headers = {
            'Access-Control-Allow-Origin': '*',
            'Location': index.location
        };
        try {
            request.respond({
                contentType: index.type,
                headers,
                status: index.status || 200,
                body: Buffer.from(index.content || [], 'utf8')
            });
            return;
        } catch (error) {
            console.log(error);
            request.continue();
        }
    }
    console.log(`Requesting ${request.url()}`);
    request.continue();
}

module.exports = {
    cacheFilename,
    findCache,
    existingCache,
    storeCache,
    readCache,
    rmhash,
    safeurl,
    adBlocker
};
```

Code Breakdown

Module Imports

The code begins by importing various Node.js modules:

Constants and Variables

Function Definitions

cacheFilename(url)

Takes a URL as input and returns a filename in the format hostname-YYYY-M-D.json (month and day are not zero-padded) inside the project directory. The hostname is sanitized using the safeurl() function defined near the bottom of this module.

findCache(url)

Finds cached files in the project directory that match the given URL's hostname. It uses the glob() function to search for files with the hostname as a substring, sorts the results by modification time, and returns an array of file paths.

existingCache(url, restrain)

Checks if a cache exists for the given URL. If it does, it applies the restrain option: false loads the newest cache regardless of age, 'week' loads it only if it is less than a week old, and 'day' loads it only if it is today's cache file. Any other value — including omitting the argument — skips loading. When the cache is loaded, it returns the parsed JSON data; otherwise, it returns an empty array.

storeCache(cache, response)

An asynchronous function that stores a cache in a file. It:

  1. Gets the response headers and content using the getResponseContent() function.
  2. If the response URL is undefined, it skips caching.
  3. Updates the in-memory cache array: an entry with a matching URL (compared case-insensitively) is replaced in place, otherwise the new result is appended. Note that storeCache does not itself write anything to disk.

Notes