
This Node.js script collects bookmarks exported from Google Takeout and, for each bookmarked URL, crawls the domain and saves both a PDF and a full-page screenshot, using custom crawling and caching modules with error handling and logging throughout.

Run example

npm run import -- "collect all bookmarks"

collect all bookmarks

var fs = require('fs')
var path = require('path')
var importer = require('../Core')
var getBookmarksFromTakeout = importer.import("parse bookmarks file")
var ISODateString = importer.import("convert date iso")
var crawlAll = importer.import("crawl domain")
var {doBrowserRequest} = importer.import("browser crawler tools")
var {
    safeurl,
    existingCache,
    storeCache,
    readCache,
} = importer.import("domain cache tools")

//var PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE || '';
var PROFILE_PATH = '/Volumes/External/Personal';
var project = path.join(PROFILE_PATH, 'Collections/pdfs');
var project2 = path.join(PROFILE_PATH, 'Collections/screenshots');

async function savePdf(filename, response, page) {
    try {
        var type = (await response.headers())['content-type'].split(';')[0]
        if(!type.includes('text/html')) return
        console.log(`Printing PDF ${filename}`)
        await page.addStyleTag({content: '*,*:before,*:after{max-height: 100000px!important;}*:before,*:after{vertical-align:unset!important;}'})
        await page.emulateMediaType('screen')
        await page.pdf({ path: filename })
    } catch (e) {
        console.log(e)
    }
    console.log('Done printing PDF')
}

async function saveScreenshot(filename, response, page) {
    try {
        var type = (await response.headers())['content-type'].split(';')[0]
        if(!type.includes('text/html')) return
        console.log(`Printing screenshot ${filename}`)
        await page.addStyleTag({content: '*,*:before,*:after{max-height: 100000px!important;}*:before,*:after{vertical-align:unset!important;}'})
        await page.emulateMediaType('screen').catch(e => console.log(e))
        await page.screenshot({ path: filename, fullPage: true })
    } catch (e) {
        console.log(e)
    }
    console.log('Done printing screen')
}

async function collectAllBookmarks() {
    var folders = getBookmarksFromTakeout()
    var links = folders.reduce(function flattenFolders(arr, cur) {
        if(cur.folder === 'Sad Examples') return arr
        arr.push.apply(arr, cur.links.concat(cur.children.reduce(flattenFolders, [])))
        return arr
    }, [])

    var urls = links.map(l => l.url.toLowerCase())
    var existing = []
    var notexisting = []
    console.log(urls.length)
    //links = [{url: 'http://lifehacker.com/386811/shutdown-windows-with-a-text-message-thunderbird-edition'}]
    //var i = urls.indexOf('http://lifehacker.com/386811/shutdown-windows-with-a-text-message-thunderbird-edition')
    for(var i = 0; i < links.length; i++) {
        const filename = path.join(project, safeurl(links[i].url) + '.pdf')
        const filename2 = path.join(project2, safeurl(links[i].url) + '.png')
        
        // check if there is a recent pdf and skip
        if(fs.existsSync(filename)) {
            existing.push(filename)
            continue
        }
        notexisting.push(links[i])
        try {
            const cache = existingCache(links[i].url, false)
            await crawlAll(links[i].url, 1, cache)

            // save a pdf
            // TODO: add page scrolling because AMP doesn't load images until you scroll to it
            await doBrowserRequest(links[i].url, 
                                   readCache.bind(null, cache),
                                   storeCache.bind(null, cache),
                                   savePdf.bind(null, filename))
            await doBrowserRequest(links[i].url, 
                                   readCache.bind(null, cache),
                                   storeCache.bind(null, cache),
                                   saveScreenshot.bind(null, filename2))
        } catch (e) {
            console.log(e)
            await doBrowserRequest(false)
        }
    }
    await doBrowserRequest(false)
    console.log(existing)
    console.log(notexisting)
}

module.exports = {
    collectAllBookmarks,
    saveScreenshot,
    savePdf
}

What the code could have been:

const fs = require('fs');
const path = require('path');
const importer = require('../Core');
const getBookmarksFromTakeout = importer.import('parse bookmarks file');
const ISODateString = importer.import('convert date iso');
const crawlAll = importer.import('crawl domain');
const { doBrowserRequest } = importer.import('browser crawler tools');
const {
  safeurl,
  existingCache,
  storeCache,
  readCache,
} = importer.import('domain cache tools');
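
The fragment above only covers the imports. A minimal sketch of how savePdf might look in such a refactor, assuming the same Puppeteer-style response and page objects as the original (a sketch, not the project's actual code):

async function savePdf(filename, response, page) {
  // Only print HTML responses; skip images, JSON, redirects, etc.
  const type = (response.headers()['content-type'] || '').split(';')[0];
  if (!type.includes('text/html')) return;

  console.log(`Printing PDF ${filename}`);
  // Remove max-height limits so long pages are not clipped in the PDF.
  await page.addStyleTag({ content: '*,*:before,*:after{max-height:100000px!important;}*:before,*:after{vertical-align:unset!important;}' });
  await page.emulateMediaType('screen');
  await page.pdf({ path: filename });
}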

Code Breakdown

This code appears to be a Node.js script that uses various modules to scrape websites, save PDFs and screenshots, and collect bookmarks. Here's a high-level overview of the code:

Importing Modules

The code starts by importing various modules using require:

var fs = require('fs')
var path = require('path')
var importer = require('../Core')
var getBookmarksFromTakeout = importer.import('parse bookmarks file')
var ISODateString = importer.import('convert date iso')
var crawlAll = importer.import('crawl domain')
var {doBrowserRequest} = importer.import('browser crawler tools')
var {
    safeurl,
    existingCache,
    storeCache,
    readCache,
} = importer.import('domain cache tools')

These modules seem to be custom modules developed for this project, likely related to web scraping and bookmark management.

Setting up Paths and Variables

The code sets up some paths and variables:

var PROFILE_PATH = '/Volumes/External/Personal';
var project = path.join(PROFILE_PATH, 'Collections/pdfs');
var project2 = path.join(PROFILE_PATH, 'Collections/screenshots');

PROFILE_PATH points at an external drive, and path.join is used to build two output paths under it: one for PDFs and one for screenshots. (path.join only builds the path strings; it does not create the directories.)
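
Since path.join only builds strings, the script assumes both output directories already exist. If that isn't guaranteed, a small guard like the following could create them up front (an assumption, not part of the original script):

var fs = require('fs');

// Hypothetical helper: make sure both output directories exist before crawling.
[project, project2].forEach(function (dir) {
    if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
});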

Functions

The code defines three functions:

savePdf function

This function saves a PDF from a webpage:

async function savePdf(filename, response, page) {
    //...
}

It takes three arguments: filename, response, and page. The function checks the response's content-type header and returns early unless the response is an HTML document, injects a CSS override that removes max-height limits so long pages aren't clipped, emulates the screen media type, and saves the PDF with the page.pdf method.
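
In the script, savePdf is only ever invoked as a callback through doBrowserRequest, but it depends only on standard Puppeteer response and page objects, so it could also be exercised directly. A rough sketch, assuming Puppeteer is installed (the URL, output path, and require path for this module are placeholders):

const puppeteer = require('puppeteer');
const { savePdf } = require('./collect-all-bookmarks'); // hypothetical filename for this module

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    const response = await page.goto('https://example.com', { waitUntil: 'networkidle2' });
    // savePdf checks the content-type itself and silently skips non-HTML responses.
    await savePdf('/tmp/example.pdf', response, page);
    await browser.close();
})();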

saveScreenshot function

This function saves a screenshot from a webpage:

async function saveScreenshot(filename, response, page) {
    //...
}

Similar to the savePdf function, it checks the content-type header, injects the same CSS override, emulates the screen media type, and captures a full-page screenshot with the page.screenshot method.
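
The TODO in collectAllBookmarks notes that lazy-loaded (e.g. AMP) images only appear once the user scrolls, which affects both the PDF and the full-page screenshot. A common workaround is to scroll the page in steps before calling savePdf or saveScreenshot; a sketch of such a helper (not part of this project) might look like:

// Hypothetical helper: scroll to the bottom in steps so lazy-loaded images get fetched.
async function scrollToBottom(page, step = 500, delayMs = 100) {
    await page.evaluate(async (step, delayMs) => {
        let last = -1;
        while (window.scrollY !== last) {
            last = window.scrollY;
            window.scrollBy(0, step);
            await new Promise(resolve => setTimeout(resolve, delayMs));
        }
        window.scrollTo(0, 0); // jump back to the top before printing
    }, step, delayMs);
}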

collectAllBookmarks function

This function collects all bookmarks from Google Takeout:

async function collectAllBookmarks() {
    //...
}

It calls getBookmarksFromTakeout to load the bookmark folders, flattens them into a single list of links (skipping the 'Sad Examples' folder), skips any link that already has a saved PDF, and for each remaining link crawls the domain and then saves a PDF and a full-page screenshot via doBrowserRequest.
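
The flattening step relies on each folder object having the shape { folder, links, children }. For illustration, with made-up data:

// Example input shape for getBookmarksFromTakeout() (example data, not real bookmarks).
const folders = [
    {
        folder: 'Reading',
        links: [{ url: 'https://example.com/article' }],
        children: [
            { folder: 'Archive', links: [{ url: 'https://example.org/old-post' }], children: [] }
        ]
    },
    { folder: 'Sad Examples', links: [{ url: 'https://example.net/skipped' }], children: [] }
];

// After the flattenFolders reduce, links under 'Sad Examples' are dropped and the rest
// end up in one flat array:
// [{ url: 'https://example.com/article' }, { url: 'https://example.org/old-post' }]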

Error Handling

The savePdf and saveScreenshot functions catch any errors that occur during execution and log them to the console. collectAllBookmarks also wraps each link in its own try/catch, logging the error and calling doBrowserRequest(false) to reset the browser before moving on, and calls it once more after the loop finishes.

Overall, this code appears to be part of a larger project that involves web scraping, PDF and screenshot saving, and bookmark management.
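
Because the module exports collectAllBookmarks, it can also be invoked from another script or a REPL instead of through npm run import. A minimal usage sketch (the require path is hypothetical):

const { collectAllBookmarks } = require('./collect-all-bookmarks'); // hypothetical path

collectAllBookmarks()
    .then(() => console.log('Finished collecting bookmarks'))
    .catch(err => console.error(err));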