This Node.js script reads bookmarks exported from Google Takeout, crawls each bookmarked site, and saves a PDF and a full-page screenshot of every page, with basic error handling and logging.
npm run import -- "collect all bookmarks"
var fs = require('fs')
var path = require('path')
var importer = require('../Core')
var getBookmarksFromTakeout = importer.import("parse bookmarks file")
var ISODateString = importer.import("convert date iso")
var crawlAll = importer.import("crawl domain")
var {doBrowserRequest} = importer.import("browser crawler tools")
var {
safeurl,
existingCache,
storeCache,
readCache,
} = importer.import("domain cache tools")
//var PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE || '';
var PROFILE_PATH = '/Volumes/External/Personal';
var project = path.join(PROFILE_PATH, 'Collections/pdfs');
var project2 = path.join(PROFILE_PATH, 'Collections/screenshots');
async function savePdf(filename, response, page) {
try {
var type = (await response.headers())['content-type'].split(';')[0]
if(!type.includes('text/html')) return
console.log(`Printing PDF ${filename}`)
await page.addStyleTag({content: '*,*:before,*:after{max-height: 100000px!important;}*:before,*:after{vertical-align:unset!important;}'})
await page.emulateMediaType('screen')
await page.pdf({ path: filename })
} catch (e) {
console.log(e)
}
console.log('Done printing PDF')
}
async function saveScreenshot(filename, response, page) {
try {
var type = (await response.headers())['content-type'].split(';')[0]
if(!type.includes('text/html')) return
console.log(`Printing screenshot ${filename}`)
await page.addStyleTag({content: '*,*:before,*:after{max-height: 100000px!important;}*:before,*:after{vertical-align:unset!important;}'})
await page.emulateMediaType('screen').catch(e => console.log(e))
await page.screenshot({ path: filename, fullPage: true })
} catch (e) {
console.log(e)
}
console.log('Done printing screen')
}
async function collectAllBookmarks() {
var folders = getBookmarksFromTakeout()
var links = folders.reduce(function flattenFolders(arr, cur) {
if(cur.folder === 'Sad Examples') return arr
arr.push.apply(arr, cur.links.concat(cur.children.reduce(flattenFolders, [])))
return arr
}, [])
var urls = links.map(l => l.url.toLowerCase())
var existing = []
var notexisting = []
console.log(urls.length)
//links = [{url: 'http://lifehacker.com/386811/shutdown-windows-with-a-text-message-thunderbird-edition'}]
//var i = urls.indexOf('http://lifehacker.com/386811/shutdown-windows-with-a-text-message-thunderbird-edition')
for(var i = 0; i < links.length; i++) {
const filename = path.join(project, safeurl(links[i].url) + '.pdf')
const filename2 = path.join(project2, safeurl(links[i].url) + '.png')
// check if there is a recent pdf and skip
if(fs.existsSync(filename)) {
existing.push(filename)
continue
}
notexisting.push(links[i])
try {
const cache = existingCache(links[i].url, false)
await crawlAll(links[i].url, 1, cache)
// save a pdf
// TODO: add page scrolling because AMP doesn't load images until you scroll to it
await doBrowserRequest(links[i].url,
readCache.bind(null, cache),
storeCache.bind(null, cache),
savePdf.bind(null, filename))
await doBrowserRequest(links[i].url,
readCache.bind(null, cache),
storeCache.bind(null, cache),
saveScreenshot.bind(null, filename2))
} catch (e) {
console.log(e)
await doBrowserRequest(false)
}
}
await doBrowserRequest(false)
console.log(existing)
console.log(notexisting)
}
module.exports = {
collectAllBookmarks,
saveScreenshot,
savePdf
}
Code Breakdown
This is a Node.js script that crawls bookmarked websites and saves a PDF and a full-page screenshot of each one. Here's a high-level overview:
The script starts by importing its dependencies with require and importer.import:
var path = require('path')
var importer = require('../Core')
var getBookmarksFromTakeout = importer.import('parse bookmarks file')
var ISODateString = importer.import('convert date iso')
var crawlAll = importer.import('crawl domain')
var {doBrowserRequest} = importer.import('browser crawler tools')
var {
safeurl,
existingCache,
storeCache,
readCache,
} = importer.import('domain cache tools')
These are custom modules from the project's Core importer, covering bookmark parsing, date formatting, domain crawling, and browser/cache helpers.
The script then sets up its output paths:
var PROFILE_PATH = '/Volumes/External/Personal';
var project = path.join(PROFILE_PATH, 'Collections/pdfs');
var project2 = path.join(PROFILE_PATH, 'Collections/screenshots');
PROFILE_PATH points at an external drive, and the two output directories (one for PDFs, one for screenshots) are derived from it with path.join. Note that path.join only builds the path strings; it does not create the directories.
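For each bookmark, the main loop later derives per-URL filenames under these directories using safeurl (condensed from the listing above; the example URL is hypothetical and only for illustration):

var url = 'http://example.com/some/article'                 // hypothetical example URL
var pdfPath = path.join(project, safeurl(url) + '.pdf')     // lands under Collections/pdfs
var pngPath = path.join(project2, safeurl(url) + '.png')    // lands under Collections/screenshots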
The code defines three functions:
savePdf function
This function saves a PDF of a webpage:
async function savePdf(filename, response, page) {
//...
}
It takes three arguments: filename, response, and page. The function reads the response's content-type header and returns early unless the document is text/html, injects a CSS override that lifts max-height limits so very long pages render completely, switches the page to the screen media type, and writes the PDF with page.pdf.
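Condensed from the full listing above, the body boils down to the following sketch (the real function also logs progress and swallows errors):

async function savePdf(filename, response, page) {
  // Only render HTML documents; skip images, JSON, etc.
  var type = (await response.headers())['content-type'].split(';')[0]
  if (!type.includes('text/html')) return
  // Lift max-height limits so long pages are rendered in full
  await page.addStyleTag({content: '*,*:before,*:after{max-height: 100000px!important;}'})
  await page.emulateMediaType('screen')
  await page.pdf({ path: filename })
}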
saveScreenshot function
This function saves a full-page screenshot of a webpage:
async function saveScreenshot(filename, response, page) {
//...
}
Like savePdf, it checks the content-type header, injects the same CSS override, emulates the screen media type, and captures a full-page image with page.screenshot.
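The only material differences from savePdf are the final call and the .catch guard on emulateMediaType, condensed from the listing above:

await page.emulateMediaType('screen').catch(e => console.log(e))
await page.screenshot({ path: filename, fullPage: true })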
collectAllBookmarks function
This function collects all bookmarks from Google Takeout and archives each one:
async function collectAllBookmarks() {
//...
}
It calls getBookmarksFromTakeout to load the bookmark folders, flattens them into a single list of links (skipping the 'Sad Examples' folder), and then, for every URL that does not already have a saved PDF, crawls the page and renders both a PDF and a screenshot through doBrowserRequest.
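Condensed from the listing above, the flattening step and the per-bookmark loop look roughly like this (the real loop also tracks which files already exist, saves a screenshot alongside the PDF, and shuts the browser down when it finishes):

// inside async function collectAllBookmarks()
var folders = getBookmarksFromTakeout()
var links = folders.reduce(function flattenFolders(arr, cur) {
  if (cur.folder === 'Sad Examples') return arr            // skip this folder entirely
  arr.push.apply(arr, cur.links.concat(cur.children.reduce(flattenFolders, [])))
  return arr
}, [])

for (var i = 0; i < links.length; i++) {
  var filename = path.join(project, safeurl(links[i].url) + '.pdf')
  if (fs.existsSync(filename)) continue                    // already archived, skip
  var cache = existingCache(links[i].url, false)
  await crawlAll(links[i].url, 1, cache)                   // crawl the page one level deep
  await doBrowserRequest(links[i].url,
    readCache.bind(null, cache),
    storeCache.bind(null, cache),
    savePdf.bind(null, filename))
}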
Both savePdf and saveScreenshot catch and log any errors rather than letting them abort the run, and collectAllBookmarks does the same around each bookmark: on failure it logs the error and calls doBrowserRequest(false), presumably to close or reset the shared browser, before moving on to the next link.
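The error-handling pattern in the main loop, condensed from the listing above:

try {
  await doBrowserRequest(links[i].url, readCache.bind(null, cache),
    storeCache.bind(null, cache), savePdf.bind(null, filename))
} catch (e) {
  console.log(e)
  await doBrowserRequest(false)   // presumably tears down / resets the browser session
}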
Overall, this script is part of a larger personal archiving project: it crawls bookmarked sites and preserves each one as both a PDF and a screenshot.