data collection | crawl domain | browser crawler tools | Search

This code snippet appears to be a Node.js module that handles caching web pages, importing various modules, and defining functions to cache and retrieve data based on URLs. The functions include caching file creation, searching for existing caches, checking cache validity, and storing cache data in files, with various options for cache restraint and URL sanitization.

Run example

npm run import -- "domain cache tools"

domain cache tools

var {URL} = require('url')
var fs = require('fs')
var path = require('path')
var importer = require('../Core')
var {glob} = importer.import("glob files")
var {getResponseContent} = importer.import("browser crawler tools")

//var PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE || '';
// NOTE(review): hard-coded to an external volume; the commented-out line above
// suggests this was once derived from the user's home directory — confirm intent.
var PROFILE_PATH = '/Volumes/External/Personal'
// Directory where the per-host, per-day crawl cache files are stored.
var project = path.join(PROFILE_PATH, 'Collections/crawls');

/**
 * Build the on-disk cache path for a URL: <project>/<host>-YYYY-M-D.json.
 * Month and day are NOT zero-padded (matches what existingCache parses back).
 * @param {string|URL} url - URL instance, or a string; bare strings get an
 *   http:// scheme prepended so the URL constructor can parse them.
 * @returns {string} absolute path of today's cache file for that host
 */
function cacheFilename(url) {
    if (typeof url === 'string') {
        const normalized = url.includes('://')
            ? url
            : 'http://' + url.replace(/^\/\//, '')
        url = new URL(normalized)
    }
    const now = new Date()
    const stamp = [now.getFullYear(), now.getMonth() + 1, now.getDate()].join('-')
    return path.join(project, safeurl(url.hostname) + '-' + stamp + '.json')
}

/**
 * List existing cache files for a URL's host, most recently modified first.
 * @param {string|URL} url - URL instance or string (scheme optional)
 * @returns {string[]} matching cache file paths, newest first
 */
function findCache(url) {
    if (typeof url === 'string') {
        const normalized = url.includes('://')
            ? url
            : 'http://' + url.replace(/^\/\//, '')
        url = new URL(normalized)
    }
    const pattern = '**/*' + safeurl(url.hostname) + '*'
    const matches = glob(pattern, project)
    // Sort by modification time, descending (newest cache first).
    const mtime = (file) => fs.statSync(file).mtime.getTime()
    matches.sort((a, b) => mtime(b) - mtime(a))
    return matches
}

/**
 * Load the most recent cache for a URL's host if it satisfies the freshness
 * restraint.
 *
 * restrain semantics:
 *   false  - always load the newest cache, regardless of age
 *   'week' - load only if the newest cache is less than a week old
 *   'day'  - load only if the newest cache file is today's file
 *   anything else (including undefined) - never load; return []
 *
 * @param {string|URL} url
 * @param {false|'week'|'day'|undefined} restrain
 * @returns {Array} parsed cache entries, or [] when no usable cache exists
 */
function existingCache(url, restrain) {
    var cache = findCache(url)
    var filePath = cacheFilename(url)
    // save pages from the same day in the same database using the url as the keys
    if(cache[0]) {
        // Recover the trailing YYYY-M-D stamp that cacheFilename() embedded in
        // the name; everything from the first '.' onward is stripped first.
        var segments = cache[0].replace(/\..*$/ig, '').split('-')
        var date = Date.parse(segments.slice(segments.length - 3).join('-'))
        var inAWeek = date + 1000*60*60*24*7
        if(restrain === false
          || (restrain === 'week' && inAWeek > Date.now())
          || (restrain === 'day' && cache[0] === filePath)) {
            // FIX: JSON.parse throws on a corrupt or empty cache file; the
            // original `|| []` only guarded against a falsy parse result
            // (e.g. the literal null). Fall back to [] instead of crashing.
            try {
                return JSON.parse(fs.readFileSync(cache[0])) || []
            } catch (e) {
                console.log(`Unreadable cache ${cache[0]}`, e)
                return []
            }
        }
    }
    return []
}

/**
 * Fold a network response into the in-memory cache array (mutates `cache`).
 * Entries are keyed case-insensitively by URL: an existing entry is replaced
 * in place, otherwise the new result is appended.
 * @param {Array<{url: string}>} cache - accumulated response entries
 * @param {object} response - response object exposing async headers()
 * @returns {Promise<void>}
 */
async function storeCache(cache, response) {
    const headers = await response.headers()
    const result = await getResponseContent(response, headers)
    // A response without a resolvable URL cannot be keyed — skip it.
    if (typeof result.url === 'undefined') {
        return
    }
    const target = result.url.toLowerCase()
    const index = cache.findIndex(entry => entry.url.toLowerCase() === target)
    if (index === -1) {
        console.log(`Received ${result.url}`)
        cache.push(result)
    } else {
        console.log(`Received existing ${result.url}`)
        cache[index] = result
    }
}

// source: https://stackoverflow.com/questions/53807574/how-to-block-ads-with-puppeteer-headless-chrome
// TODO: create a notebook for this and add easylist
var hosts;
/**
 * Lazily build and memoize a lookup of blocked hostnames from the bundled
 * hosts.txt (0.0.0.0-style blocklist). Subsequent calls return the cached map.
 * @returns {Object<string, true>} hostname -> true for every blocked host
 */
function adBlocker() {
    if (hosts) return hosts
    hosts = {}
    const hostFilePath = path.join(__dirname,
        '../Resources/Projects/adblocker/hosts.txt')
    const lines = fs.readFileSync(hostFilePath, 'utf8').split('\n')
    for (const line of lines) {
        const frags = line.split(' ')
        // hosts.txt maps blocked domains to 0.0.0.0; keep only those entries.
        if (frags.length > 1 && frags[0] === '0.0.0.0') {
            hosts[frags[1].trim()] = true
        }
    }
    return hosts
}

// Puppeteer request interceptor: serve matching responses from the in-memory
// cache, abort requests to ad/tracker hosts, and let everything else hit the
// network. Exactly one of request.continue()/abort()/respond() is invoked.
//
// cache   - array of entries with url/status/type/content and optional
//           location (shape assumed from usage here and in storeCache —
//           TODO confirm against getResponseContent)
// request - puppeteer Request object
function readCache(cache, request) {
    // data: URIs carry their own payload — nothing to cache or block.
    if(request.url().substr(0, 5) === 'data:') {
        return request.continue()
    }
    // Host appears in the ad-block hosts file — drop the request.
    if(adBlocker()[new URL(request.url()).host]) {
        return request.abort()
    }
    // Case-insensitive URL lookup; index is -1 (response undefined) on miss.
    var urls = cache.map(s => s.url.toLowerCase())
    var index = urls.indexOf(request.url().toLowerCase())
    var response = cache[index]
    // Loose == intentionally matches a numeric or string "200" status.
    if (response && response.status == 200) {
    // TODO: remove this restriction since we literally just downloaded it
    //    && response.expires
    //    && response.expires > Date.now()
    //    && response.content
    //if(response.type.includes('\n')) {
    //    debugger
    //}
        console.log(`Requesting cache ${response.status} ${response.type} ${request.url()}`)
        var headers = {
            'Access-Control-Allow-Origin': '*'
        }
        if(response.location) headers['Location'] = response.location
        // TODO: save in this format
        try {
            // Body: a data: URI is decoded from its base64 payload; anything
            // else (including missing content) becomes a utf8 buffer. Note
            // the ternary condition is (content && content starts with
            // 'data:') — missing content falls through to the utf8 branch.
            request.respond({
                contentType: response.type,
                headers: headers,
                status: response.status || 200,
                body: response.content
                    && response.content.substr(0, 5) === 'data:'
                    ? Buffer.from(response.content.split(',')[1], 'base64')
                    : Buffer.from(response.content || [], 'utf8'),
            })
        } catch (e) {
            // respond() can throw (e.g. request already handled); fall back
            // to letting the request continue to the network.
            console.log(e)
            request.continue()
        }
        return
    }
    // Cache miss (or non-200 cached status): go to the network.
    console.log(`Requesting ${request.url()}`)
    request.continue()
}

// TODO: move this to URL tools in Languages/html.ipynb with getAllLinks
// TODO: move this to URL tools in Languages/html.ipynb with getAllLinks
/**
 * Strip the fragment (everything from the first '#' to the end) from a URL.
 * @param {string} url
 * @returns {string} the URL without its fragment
 */
function rmhash(url) {
    const FRAGMENT = /#.*$/ig
    return url.replace(FRAGMENT, '')
}

// TODO: replace other occurrences with this function
// TODO: replace other occurrences with this function
/**
 * Sanitize a string for use in a filename: every character outside
 * [A-Za-z0-9_-] becomes '_', and the result is capped at 100 characters.
 * @param {string} url - typically a hostname
 * @returns {string} filesystem-safe token, at most 100 chars
 */
function safeurl(url) {
    const sanitized = url.replace(/[^a-z0-9_-]/ig, '_')
    return sanitized.slice(0, 100)
}

// Public interface of the domain cache tools module.
module.exports = {
    cacheFilename,
    findCache,
    existingCache,
    storeCache,
    readCache,
    rmhash,
    safeurl,
}

What the code could have been:

```javascript
const { URL } = require('url');
const fs = require('fs');
const path = require('path');
const { glob } = require('../Core/glob');
const { getResponseContent } = require('../Core/browser-crawler-tools');

const PROFILE_PATH = '/Volumes/External/Personal';
const project = path.join(PROFILE_PATH, 'Collections/crawls');
const adblockerFile = path.join(__dirname, '../Resources/Projects/adblocker/hosts.txt');

// Cache functions
function cacheFilename(url) {
    const time = new Date();
    const urlObj = new URL(url.includes('://')? url : ('http://' + url.replace(/^\/\//, '')));
    return path.join(
        project,
        safeurl(urlObj.hostname),
        `${time.getFullYear()}-${(time.getMonth() + 1).toString().padStart(2, '0')}-${time.getDate().toString().padStart(2, '0')}.json`
    );
}

function findCache(url) {
    const urlObj = new URL(url.includes('://')? url : ('http://' + url.replace(/^\/\//, '')));
    const host = safeurl(urlObj.hostname);
    const crawl = glob(`**/*${host}*/**/index.json`, project);
    crawl.sort((a, b) => fs.statSync(b).mtime.getTime() - fs.statSync(a).mtime.getTime());
    return crawl;
}

function existingCache(url, restrain = 'week') {
    const cache = findCache(url);
    const filePath = cacheFilename(url);
    if (cache[0]) {
        const segments = cache[0].replace(/\..*$/ig, '').split('-');
        const date = Date.parse(segments.slice(segments.length - 3).join('-'));
        const inAWeek = date + 1000 * 60 * 60 * 24 * 7;
        if (restrain === 'week' && inAWeek > Date.now()) {
            return JSON.parse(fs.readFileSync(cache[0]));
        } else if (restrain === 'day' && cache[0] === filePath) {
            return JSON.parse(fs.readFileSync(cache[0])) || [];
        }
    }
    return [];
}

// Store cache functions
async function storeCache(cache, response) {
    const headers = await response.headers();
    const result = await getResponseContent(response, headers);
    if (typeof result.url === 'undefined') return;
    const urls = cache.map(s => s.url.toLowerCase());
    const index = urls.indexOf(result.url.toLowerCase());
    if (index > -1) {
        console.log(`Received existing ${result.url}`);
        cache[index] = result;
    } else {
        console.log(`Received ${result.url}`);
        cache.push(result);
    }
    return cache;
}

// Ad blocker
const adBlocker = (() => {
    const hosts = {};
    const readHosts = () => {
        const hostFileContent = fs.readFileSync(adblockerFile, 'utf8').split('\n');
        hostFileContent.forEach(line => {
            const frags = line.split(' ');
            if (frags.length > 1 && frags[0] === '0.0.0.0') {
                hosts[frags[1].trim()] = true;
            }
        });
        return hosts;
    };
    return () => {
        if (Object.keys(hosts).length > 0) return hosts;
        readHosts();
        return hosts;
    };
})();

// Read cache functions
function rmhash(url) {
    return url.replace(/#.*$/ig, '');
}

function safeurl(url) {
    return url.replace(/[^a-z0-9_-]/ig, '_').substr(0, 100);
}

function readCache(cache, request) {
    if (request.url().substr(0, 5) === 'data:') return request.continue();
    if (adBlocker()[new URL(request.url()).host]) return request.abort();
    const url = new URL(request.url());
    const index = cache.find(item => item.url.toLowerCase() === url.href.toLowerCase());
    if (index && index.status === 200) {
        console.log(`Requesting cache ${index.status} ${index.type} ${request.url()}`);
        const headers = {
            'Access-Control-Allow-Origin': '*',
            'Location': index.location
        };
        try {
            request.respond({
                contentType: index.type,
                headers,
                status: index.status || 200,
                body: Buffer.from(index.content || [], 'utf8')
            });
            return;
        } catch (error) {
            console.log(error);
            request.continue();
        }
    }
    console.log(`Requesting ${request.url()}`);
    request.continue();
}

module.exports = {
    cacheFilename,
    findCache,
    existingCache,
    storeCache,
    readCache,
    rmhash,
    safeurl,
    adBlocker
};
```

Code Breakdown

Module Imports

The code begins by importing various Node.js modules:

Constants and Variables

Function Definitions

cacheFilename(url)

Takes a URL as input and returns a filename in the format hostname-YYYY-M-D.json (month and day are not zero-padded) inside the project directory. The hostname is sanitized using the safeurl() function defined near the bottom of this module.

findCache(url)

Finds cached files in the project directory that match the given URL's hostname. It uses the glob() function to search for files with the hostname as a substring, sorts the results by modification time, and returns an array of file paths.

existingCache(url, restrain)

Checks if a cache exists for the given URL. If it does, it applies the restrain option: false loads the newest cache regardless of age, 'week' loads it only if it is less than a week old, and 'day' loads it only if it is today's cache file. Any other value — including omitting the argument — skips loading. When the cache is loaded, it returns the parsed JSON data; otherwise, it returns an empty array.

storeCache(cache, response)

An asynchronous function that stores a cache in a file. It:

  1. Gets the response headers and content using the getResponseContent() function.
  2. If the response URL is undefined, it skips caching.
  3. Updates the in-memory cache array: an entry with a matching URL (compared case-insensitively) is replaced in place, otherwise the new result is appended. Note that storeCache does not itself write anything to disk.

Notes