This Node.js module provides domain-level cache tools for a web crawler: it builds dated cache filenames from URLs, locates and validates existing caches on disk, stores response data keyed by URL, answers intercepted browser requests from the cache, and blocks known ad hosts. Helper functions handle URL sanitization and hash stripping.
```bash
npm run import -- "domain cache tools"
```
```javascript
var {URL} = require('url')
var fs = require('fs')
var path = require('path')
var importer = require('../Core')
var {glob} = importer.import("glob files")
var {getResponseContent} = importer.import("browser crawler tools")

//var PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE || '';
var PROFILE_PATH = '/Volumes/External/Personal'
var project = path.join(PROFILE_PATH, 'Collections/crawls');

// build a dated cache filename, e.g. hostname-YYYY-M-D.json, for the given URL
function cacheFilename(url) {
  if(typeof url === 'string') {
    url = new URL(url.includes('://') ? url : ('http://' + url.replace(/^\/\//, '')))
  }
  const time = new Date()
  var file = safeurl(url.hostname)
    + '-' + time.getFullYear()
    + '-' + (time.getMonth() + 1)
    + '-' + time.getDate() + '.json'
  return path.join(project, file)
}

// find all cache files for the URL's hostname, newest first
function findCache(url) {
  if(typeof url === 'string') {
    url = new URL(url.includes('://') ? url : ('http://' + url.replace(/^\/\//, '')))
  }
  const host = safeurl(url.hostname)
  const crawl = glob('**/*' + host + '*', project)
  crawl.sort((a, b) => {
    return fs.statSync(b).mtime.getTime() - fs.statSync(a).mtime.getTime()
  })
  return crawl
}

// load the newest cache for the URL if it satisfies the freshness restraint:
// false = no restraint, 'week' = written within 7 days, 'day' = written today
function existingCache(url, restrain) {
  var cache = findCache(url)
  var filePath = cacheFilename(url)
  // save pages from the same day in the same database using the url as the keys
  if(cache[0]) {
    var segments = cache[0].replace(/\..*$/ig, '').split('-')
    var date = Date.parse(segments.slice(segments.length - 3).join('-'))
    var inAWeek = date + 1000*60*60*24*7
    if(restrain === false
      || (restrain === 'week' && inAWeek > Date.now())
      || (restrain === 'day' && cache[0] === filePath)) {
      return JSON.parse(fs.readFileSync(cache[0])) || []
    }
  }
  return []
}

// add a response to the in-memory cache array, replacing any entry with the same URL
async function storeCache(cache, response) {
  var headers = await response.headers()
  var result = await getResponseContent(response, headers)
  if(typeof result.url === 'undefined') {
    return
  }
  var urls = cache.map(s => s.url.toLowerCase())
  var index = urls.indexOf(result.url.toLowerCase())
  if(index > -1) {
    console.log(`Received existing ${result.url}`)
    cache[index] = result
  } else {
    console.log(`Received ${result.url}`)
    cache.push(result)
  }
}

// source: https://stackoverflow.com/questions/53807574/how-to-block-ads-with-puppeteer-headless-chrome
// TODO: create a notebook for this and add easylist
var hosts;
function adBlocker() {
  if(hosts) return hosts
  hosts = {}
  // parse the 0.0.0.0-style hosts file into a lookup table
  var hostFile = fs.readFileSync(path.join(__dirname,
    '../Resources/Projects/adblocker/hosts.txt'), 'utf8').split('\n');
  for (var i = 0; i < hostFile.length; i++) {
    var frags = hostFile[i].split(' ');
    if (frags.length > 1 && frags[0] === '0.0.0.0') {
      hosts[frags[1].trim()] = true;
    }
  }
  return hosts
}

// answer an intercepted browser request from the cache, abort ad hosts,
// or let the request continue to the network
function readCache(cache, request) {
  if(request.url().substr(0, 5) === 'data:') {
    return request.continue()
  }
  if(adBlocker()[new URL(request.url()).host]) {
    return request.abort()
  }
  var urls = cache.map(s => s.url.toLowerCase())
  var index = urls.indexOf(request.url().toLowerCase())
  var response = cache[index]
  if (response && response.status == 200) {
    // TODO: remove this restriction since we literally just downloaded it
    // && response.expires
    // && response.expires > Date.now()
    // && response.content
    console.log(`Requesting cache ${response.status} ${response.type} ${request.url()}`)
    var headers = {
      'Access-Control-Allow-Origin': '*'
    }
    if(response.location) headers['Location'] = response.location
    // TODO: save in this format
    try {
      request.respond({
        contentType: response.type,
        headers: headers,
        status: response.status || 200,
        // data: URIs are stored base64-encoded; decode before responding
        body: response.content
          && response.content.substr(0, 5) === 'data:'
          ? Buffer.from(response.content.split(',')[1], 'base64')
          : Buffer.from(response.content || [], 'utf8'),
      })
    } catch (e) {
      console.log(e)
      request.continue()
    }
    return
  }
  console.log(`Requesting ${request.url()}`)
  request.continue()
}

// TODO: move this to URL tools in Languages/html.ipynb with getAllLinks
function rmhash(url) {
  return url.replace(/#.*$/ig, '')
}

// TODO: replace other occurrences with this function
function safeurl(url) {
  return url.replace(/[^a-z0-9_-]/ig, '_').substr(0, 100)
}

module.exports = {
  cacheFilename,
  findCache,
  existingCache,
  storeCache,
  readCache,
  rmhash,
  safeurl,
}
```
A modernized rewrite of the same module using `const` and template literals, with the glob pattern, freshness restraints, and cached-response handling kept consistent with the original:

```javascript
const { URL } = require('url');
const fs = require('fs');
const path = require('path');
const importer = require('../Core');
const { glob } = importer.import('glob files');
const { getResponseContent } = importer.import('browser crawler tools');

const PROFILE_PATH = '/Volumes/External/Personal';
const project = path.join(PROFILE_PATH, 'Collections/crawls');
const adblockerFile = path.join(__dirname, '../Resources/Projects/adblocker/hosts.txt');

// Cache functions
function cacheFilename(url) {
  const urlObj = new URL(url.includes('://') ? url : ('http://' + url.replace(/^\/\//, '')));
  const time = new Date();
  const stamp = `${time.getFullYear()}-${(time.getMonth() + 1).toString().padStart(2, '0')}-${time.getDate().toString().padStart(2, '0')}`;
  // one file per hostname per day, e.g. example_com-2024-03-05.json
  return path.join(project, `${safeurl(urlObj.hostname)}-${stamp}.json`);
}

function findCache(url) {
  const urlObj = new URL(url.includes('://') ? url : ('http://' + url.replace(/^\/\//, '')));
  const host = safeurl(urlObj.hostname);
  // match any cache file containing the sanitized hostname, newest first
  const crawl = glob(`**/*${host}*`, project);
  crawl.sort((a, b) => fs.statSync(b).mtime.getTime() - fs.statSync(a).mtime.getTime());
  return crawl;
}

function existingCache(url, restrain = false) {
  const cache = findCache(url);
  const filePath = cacheFilename(url);
  if (cache[0]) {
    // recover the date from the trailing YYYY-MM-DD of the newest cache file
    const segments = cache[0].replace(/\..*$/ig, '').split('-');
    const date = Date.parse(segments.slice(segments.length - 3).join('-'));
    const inAWeek = date + 1000 * 60 * 60 * 24 * 7;
    if (restrain === false
        || (restrain === 'week' && inAWeek > Date.now())
        || (restrain === 'day' && cache[0] === filePath)) {
      return JSON.parse(fs.readFileSync(cache[0])) || [];
    }
  }
  return [];
}

// Store cache functions
async function storeCache(cache, response) {
  const headers = await response.headers();
  const result = await getResponseContent(response, headers);
  if (typeof result.url === 'undefined') return;
  const urls = cache.map(s => s.url.toLowerCase());
  const index = urls.indexOf(result.url.toLowerCase());
  if (index > -1) {
    console.log(`Received existing ${result.url}`);
    cache[index] = result;
  } else {
    console.log(`Received ${result.url}`);
    cache.push(result);
  }
  return cache;
}

// Ad blocker
const adBlocker = (() => {
  const hosts = {};
  return () => {
    if (Object.keys(hosts).length > 0) return hosts;
    // parse a 0.0.0.0-style hosts file into a lookup table
    fs.readFileSync(adblockerFile, 'utf8').split('\n').forEach(line => {
      const frags = line.split(' ');
      if (frags.length > 1 && frags[0] === '0.0.0.0') {
        hosts[frags[1].trim()] = true;
      }
    });
    return hosts;
  };
})();

// Read cache functions
function rmhash(url) {
  return url.replace(/#.*$/ig, '');
}

function safeurl(url) {
  return url.replace(/[^a-z0-9_-]/ig, '_').substr(0, 100);
}

function readCache(cache, request) {
  if (request.url().substr(0, 5) === 'data:') return request.continue();
  if (adBlocker()[new URL(request.url()).host]) return request.abort();
  const response = cache.find(item => item.url.toLowerCase() === request.url().toLowerCase());
  if (response && response.status === 200) {
    console.log(`Requesting cache ${response.status} ${response.type} ${request.url()}`);
    const headers = { 'Access-Control-Allow-Origin': '*' };
    // only send Location when the cached response actually redirected
    if (response.location) headers['Location'] = response.location;
    try {
      request.respond({
        contentType: response.type,
        headers,
        status: response.status || 200,
        // data: URIs are stored base64-encoded; decode before responding
        body: response.content && response.content.substr(0, 5) === 'data:'
          ? Buffer.from(response.content.split(',')[1], 'base64')
          : Buffer.from(response.content || '', 'utf8')
      });
      return;
    } catch (error) {
      console.log(error);
      request.continue();
    }
    return;
  }
  console.log(`Requesting ${request.url()}`);
  request.continue();
}

module.exports = {
  cacheFilename,
  findCache,
  existingCache,
  storeCache,
  readCache,
  rmhash,
  safeurl,
  adBlocker
};
```
Code Breakdown
The code begins by importing various Node.js modules: `url` for URL manipulation, `fs` (File System) for interacting with the file system, `path` for working with file paths, and `importer` from a custom module (`../Core`), which resolves the `glob` and `getResponseContent` helpers. `PROFILE_PATH` is hard-coded to `/Volumes/External/Personal`; a fallback to the `HOME`, `HOMEPATH`, or `USERPROFILE` environment variables is present but commented out. `project` is the path to the crawl database directory, constructed by joining `PROFILE_PATH` with `Collections/crawls`.

`cacheFilename(url)` takes a URL (string or `URL` object) and returns a path to a file named `hostname-YYYY-MM-DD.json` inside the project directory. The hostname is sanitized with the `safeurl()` function defined at the bottom of the module.
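For example, a call made on 5 March 2024 would resolve to a path like this (hypothetical host; note the original does not zero-pad the month and day, while the revised version above does):

```javascript
cacheFilename('https://example.com/some/page')
// => '/Volumes/External/Personal/Collections/crawls/example_com-2024-3-5.json'
```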
`findCache(url)` finds cached files in the project directory that match the given URL's hostname. It uses the `glob()` function to search for files containing the sanitized hostname as a substring, sorts the results by modification time (newest first), and returns an array of file paths.
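With several crawls of the same host on disk, the newest file comes back first (hypothetical paths):

```javascript
var crawls = findCache('example.com');
// => ['/Volumes/External/Personal/Collections/crawls/example_com-2024-3-5.json',
//     '/Volumes/External/Personal/Collections/crawls/example_com-2024-2-27.json']
```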
`existingCache(url, restrain)` checks whether a usable cache exists for the given URL. The `restrain` argument controls freshness: `false` accepts any existing cache, `'week'` accepts a cache up to a week old, and `'day'` accepts only a cache whose filename matches today's `cacheFilename(url)`. If the newest cache passes the check, its contents are parsed from JSON and returned; otherwise an empty array is returned.
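Typical calls look like this:

```javascript
// accept any cache written within the last seven days
var weekOld = existingCache('https://example.com', 'week');
// accept only a cache whose filename matches today's date
var todays = existingCache('https://example.com', 'day');
// accept the newest cache regardless of age
var any = existingCache('https://example.com', false);
```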
`storeCache(cache, response)` is an asynchronous function that records a network response in the in-memory cache array. It reads the response headers, extracts the body and metadata with the `getResponseContent()` function, and then either overwrites the existing entry with the same URL or appends a new one. It only mutates the array; writing the array to the file named by `cacheFilename()` is left to the caller.

A few caveats: `getResponseContent()` is imported from the custom `../Core` module, and its implementation is not shown here. The `safeurl()` function sanitizes hostnames into filesystem-safe names by replacing any character outside `[a-z0-9_-]` with underscores and truncating to 100 characters. The module also relies on `fs.readFileSync()` and `fs.statSync()`, which are synchronous file system operations; in an asynchronous crawler it would be better to use the asynchronous versions (`fs.readFile()` and `fs.stat()`) to avoid blocking the event loop.
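Taken together, the functions are evidently designed to plug into Puppeteer-style request interception: `readCache` answers intercepted requests and `storeCache` records fresh responses. A minimal wiring sketch, assuming Puppeteer is installed; the module path and the `crawlWithCache` name are hypothetical, and since the module never writes the cache file itself, the caller persists it:

```javascript
const fs = require('fs');
const puppeteer = require('puppeteer'); // assumed peer dependency
const {
  existingCache, storeCache, readCache, cacheFilename
} = require('./domain-cache-tools'); // hypothetical path to this module

async function crawlWithCache(url) {
  const cache = existingCache(url, 'week');
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.setRequestInterception(true);
  // serve cached responses (and abort ad hosts) before touching the network
  page.on('request', request => readCache(cache, request));
  // record every fresh network response into the in-memory cache array
  page.on('response', response => storeCache(cache, response).catch(console.log));
  await page.goto(url, { waitUntil: 'networkidle2' });
  // storeCache only mutates the array; persist it to today's cache file
  fs.writeFileSync(cacheFilename(url), JSON.stringify(cache));
  await browser.close();
}
```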