scraping | | google maps data list | Search

This JavaScript code imports several modules, sets constants for a project directory, and then defines two functions, getLocations and getLocation. getLocations retrieves the list of location page URLs from an archived happy-hour listing page and caches it, while getLocation extracts the happy-hour data from a single location's webpage. The getLocation function makes a GET request to a URL, evaluates XPath expressions that select the headings containing day names together with the times and deals that follow them, maps the results to objects with dotw, time, and deals properties, and writes those objects to a JSON file in the project directory.

Cell 0

var importer = require('../Core');
var util = require('util');
var request = util.promisify(require('request'));
var {JSDOM} = require('jsdom');
var fs = require('fs');
var path = require('path');

// Home directory of the current user, taken from the first of HOME, HOMEPATH
// or USERPROFILE that is set in the environment.
var PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE;
// Directory into which getLocation writes one JSON file per location.
var project = PROFILE_PATH + '/Collections/flagstaff-happy';


// `locations` caches the URL list after the first successful lookup.
// `getAllXPath` is declared here but the functions below use their own locals.
var locations, getAllXPath;

/**
 * Resolve the URLs of the "by-location" pages linked from the archived
 * Friday happy-hour listing.
 *
 * The listing is only downloaded once: after the first successful call the
 * module-level `locations` array is returned directly.
 *
 * @returns {Promise<string[]>} absolute web.archive.org URLs, one per link.
 */
function getLocations() {
    const alreadyLoaded = typeof locations !== 'undefined';
    if (alreadyLoaded) {
        return Promise.resolve(locations);
    }

    // Run the link query against the downloaded listing markup.
    const extractHrefs = response => {
        const queryPage = importer.import("all elements xpath from string")(response.body);
        return queryPage(['//a[contains(@href, "by-location")]/@href']);
    };

    // Keep only links that point at a concrete location and make them absolute.
    const toArchiveUrls = hrefs => {
        const absolute = [];
        for (const href of hrefs) {
            if (!href.match(/by-location\/[^#]+/ig)) {
                continue;
            }
            absolute.push(href.includes('archive.org') ? href : 'https://web.archive.org' + href);
        }
        locations = absolute;
        return locations;
    };

    return request('https://web.archive.org/web/20160322001433/http://keepflaghappy.com/happy-hours/friday/')
        .then(extractHrefs)
        .then(toArchiveUrls);
}

/**
 * Download one location page, pull the happy-hour entries out of it and save
 * them as `<name>.json` inside the `project` directory.
 *
 * Each `<h3>` whose text contains a day abbreviation (Mon..Sun) starts an
 * entry; the XPath templates collect the heading text plus the text of the
 * following `<p>` (time) and `<ul>` (deals) siblings.
 *
 * @param {string} l URL of the location page.
 * @returns {Promise<Array<{dotw: *, time: *, deals: string[]}>>} the entries
 *     that were written to disk.
 */
function getLocation(l) {
    console.log('Downloading ' + l);
    return request(l)
        .then(r => {
            const getAllXPath = importer.import("all elements xpath from string")(r.body);
            return getAllXPath([
                '//h3[contains(., "Mon") or contains(., "Tue") or contains(., "Wed") or contains(., "Thu") or contains(., "Fri") or contains(., "Sat") or contains(., "Sun")]',
                {
                    dotw: './text()',
                    time: './following-sibling::p[count(./preceding-sibling::h3)=$i+1]//text()',
                    deals: './following-sibling::ul[count(./preceding-sibling::h3)=$i+1]//text()'
                }
            ]);
        })
        .then(rows => {
            const happy = rows.map(row => {
                return {
                    dotw: row.dotw,
                    time: row.time,
                    // Flatten the text nodes into one deal per non-empty line.
                    deals: row.deals.join('\n').trim().split(/\s*\n+\s*/ig)
                };
            });
            // File name: last path segment of the URL with unsafe characters replaced.
            const name = l.trim().replace(/\/$/ig, '').split('/').pop().replace(/[^a-z0-9-_]/ig, '_');
            // writeFileSync fails with ENOENT if the collection directory is missing.
            fs.mkdirSync(project, { recursive: true });
            fs.writeFileSync(path.join(project, name + '.json'), JSON.stringify(happy, null, 4));
            return happy;
        });
}

// Cell entry point. `$` is the notebook-provided object; presumably
// `$.async()` marks the cell as finishing asynchronously (confirm against the
// notebook kernel's documentation).
$.async();
getLocations()
    .then(urls => {
        // Each task receives a completion callback and hands over its result
        // 100 ms after the download for that URL has finished.
        const tasks = urls.map(url => done => {
            return getLocation(url).then(data => setTimeout(() => done(data), 100));
        });
        return importer.runAllPromises(tasks);
    })
    .then(all => $.mime({'text/plain': JSON.stringify(all, null, 4)}))
    .catch(err => $.sendError(err));

What the code could have been:

const { JSDOM } = require('jsdom');
const fs = require('fs');
const path = require('path');
const axios = require('axios');
const { createWriteStream } = require('fs');
const { promisify } = require('util');
const { importAll } = require('../Core');
const { sendError, mime } = require('./util');

// Factory obtained by name from ../Core. Below it is called with an HTML string
// and the function it returns is then called with a list of XPath queries.
// NOTE(review): the original cell obtains this helper via `importer.import(...)`;
// confirm that ../Core really exports an `importAll` that behaves the same way.
const importAllXpath = importAll('all elements xpath from string');
/**
 * Download the archived Friday happy-hour listing and resolve the location
 * URLs found in it.
 *
 * @returns {Promise<string[]>} whatever getLocationsFromResponse resolves to.
 * @throws rethrows any download or parsing error after logging it.
 */
const getLocations = async () => {
    const listingUrl = 'https://web.archive.org/web/20160322001433/http://keepflaghappy.com/happy-hours/friday/';
    try {
        const { data: html } = await axios.get(listingUrl);
        return await getLocationsFromResponse(html);
    } catch (failure) {
        console.error('Error fetching locations:', failure);
        throw failure;
    }
};

/**
 * Extract the "by-location" links from the listing markup and turn them into
 * absolute web.archive.org URLs.
 *
 * @param {string} response HTML of the listing page.
 * @returns {Promise<string[]>} one absolute URL per matching link.
 */
const getLocationsFromResponse = async (response) => {
    const runQueries = importAllXpath(response);
    const hrefs = await runQueries(['//a[contains(@href, "by-location")]/@href']);

    const locationUrls = [];
    for (const href of hrefs) {
        // Skip links that stop at "by-location/" or only carry a fragment.
        if (!href.match(/by-location\/[^#]+/ig)) {
            continue;
        }
        const alreadyAbsolute = href.includes('archive.org');
        locationUrls.push(alreadyAbsolute ? href : 'https://web.archive.org' + href);
    }
    return locationUrls;
};

/**
 * Download one location page, extract its happy-hour entries and save them as
 * `<name>.json` under `<home>/Collections/flagstaff-happy`.
 *
 * Each `<h3>` whose text contains a day abbreviation (Mon..Sun) starts an
 * entry; the XPath templates collect the heading text plus the text of the
 * following `<p>` (time) and `<ul>` (deals) siblings.
 *
 * @param {string} location URL of the location page.
 * @returns {Promise<Array<{dotw: *, time: *, deals: string[]}>>} the entries
 *     that were written to disk.
 * @throws rethrows any download, parsing or file-system error after logging it.
 */
const getLocation = async (location) => {
    try {
        console.log('Downloading ' + location);
        const response = await axios.get(location);
        const allElementsXpath = importAllXpath(response.data);
        const data = await allElementsXpath([
            '//h3[contains(., "Mon") or contains(., "Tue") or contains(., "Wed") or contains(., "Thu") or contains(., "Fri") or contains(., "Sat") or contains(., "Sun")]',
            {
                dotw: './text()',
                time: './following-sibling::p[count(./preceding-sibling::h3)=$i+1]//text()',
                deals: './following-sibling::ul[count(./preceding-sibling::h3)=$i+1]//text()'
            }
        ]);
        const happy = data.map(entry => {
            return {
                dotw: entry.dotw,
                time: entry.time,
                // Flatten the text nodes into one deal per non-empty line.
                deals: entry.deals.join('\n').trim().split(/\s*\n+\s*/ig)
            };
        });
        // File name: last path segment of the URL with unsafe characters replaced.
        const name = location.trim().replace(/\/$/ig, '').split('/').pop().replace(/[^a-z0-9-_]/ig, '_');
        const outputDir = path.join(process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE, 'Collections', 'flagstaff-happy');
        // Awaited promise APIs keep write failures inside this try/catch; a
        // write stream would emit an unhandled 'error' event instead.
        await fs.promises.mkdir(outputDir, { recursive: true });
        await fs.promises.writeFile(path.join(outputDir, name + '.json'), JSON.stringify(happy, null, 4));
        return happy;
    } catch (error) {
        console.error('Error fetching location:', error);
        throw error;
    }
};

/**
 * Download every location one after another, pausing between requests.
 *
 * Requests are issued sequentially so the pause actually spaces them out;
 * starting them all inside `map` would fire every download at once.
 * A location that fails is logged and represented by `null` in the result so
 * one bad page does not abort the whole run.
 *
 * @param {string[]} locations URLs to download, in order.
 * @param {number} [delayMs=100] pause between two consecutive downloads.
 * @returns {Promise<Array>} one entry per location, in input order: the value
 *     resolved by getLocation, or `null` if that download failed.
 */
const runPromises = async (locations, delayMs = 100) => {
    const results = [];
    for (const [index, location] of locations.entries()) {
        if (index > 0) {
            await new Promise(resolve => setTimeout(resolve, delayMs));
        }
        try {
            results.push(await getLocation(location));
        } catch (error) {
            console.error(`Error fetching location ${location}:`, error);
            results.push(null);
        }
    }
    return results;
};

/**
 * Resolve all location URLs, download each of them and render every
 * successful result as a pretty-printed JSON string.
 *
 * @returns {Promise<{result: string[]}>} one JSON string per location whose
 *     download did not yield `null`.
 * @throws rethrows any error from the lookup or download step after logging it.
 */
const main = async () => {
    try {
        const locations = await getLocations();
        const happyHours = await runPromises(locations);

        const result = [];
        for (const happyHour of happyHours) {
            // `null` marks a location that could not be downloaded.
            if (happyHour === null) {
                continue;
            }
            result.push(JSON.stringify(happyHour, null, 4));
        }
        return { result };
    } catch (failure) {
        console.error('Error:', failure);
        throw failure;
    }
};

// Run the scrape and publish the rendered locations as one plain-text block;
// any failure is handed to sendError.
main()
    .then(({ result: rendered }) => {
        const text = rendered.join('\n');
        mime({ 'text/plain': text });
    })
    .catch(failure => sendError(failure));

Code Breakdown

Importing Modules and Setting Constants

var importer = require('../Core');
var util = require('util');
var request = util.promisify(require('request'));
var {JSDOM} = require('jsdom');
var fs = require('fs');
var path = require('path');

// Home directory of the current user: the first of HOME, HOMEPATH or
// USERPROFILE that is set in the environment.
var PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE;
// Project directory for the scraped data, located under the home directory.
var project = PROFILE_PATH + '/Collections/flagstaff-happy';

Defining Functions

// `locations` caches the URL list once getLocations has resolved it.
// `getAllXPath` is declared here, but both functions create their own local
// `getAllXPath` constant rather than assigning this variable.
var locations, getAllXPath;

// Resolves the list of location page URLs (body elided in this overview).
function getLocations() {
    //...
}

// Downloads and parses the single location page at URL `l` (body elided in this overview).
function getLocation(l) {
    //...
}

getLocations Function

/**
 * Resolve the URLs of the "by-location" pages linked from the archived
 * Friday happy-hour listing.
 *
 * The listing is only downloaded once: after the first successful call the
 * cached `locations` array is returned directly.
 *
 * @returns {Promise<string[]>} absolute web.archive.org URLs, one per link.
 */
function getLocations() {
    const alreadyLoaded = typeof locations !== 'undefined';
    if (alreadyLoaded) {
        return Promise.resolve(locations);
    }

    // Run the link query against the downloaded listing markup.
    const extractHrefs = response => {
        const queryPage = importer.import('all elements xpath from string')(response.body);
        return queryPage(['//a[contains(@href, "by-location")]/@href']);
    };

    // Keep only links that point at a concrete location and make them absolute.
    const toArchiveUrls = hrefs => {
        const absolute = [];
        for (const href of hrefs) {
            if (!href.match(/by-location\/[^#]+/ig)) {
                continue;
            }
            absolute.push(href.includes('archive.org') ? href : 'https://web.archive.org' + href);
        }
        locations = absolute;
        return locations;
    };

    return request('https://web.archive.org/web/20160322001433/http://keepflaghappy.com/happy-hours/friday/')
        .then(extractHrefs)
        .then(toArchiveUrls);
}

getLocation Function

/**
 * Download one location page and extract its happy-hour entries.
 *
 * Each `<h3>` whose text contains a day abbreviation (Mon..Sun) starts an
 * entry; the XPath templates collect the heading text plus the text of the
 * following `<p>` (time) and `<ul>` (deals) siblings. The remainder of the
 * function is elided in this excerpt.
 *
 * @param {string} l URL of the location page.
 */
function getLocation(l) {
    // Keep the space after "Downloading" so the log matches the original cell.
    console.log('Downloading ' + l);
    return request(l)
       .then(r => {
            const getAllXPath = importer.import('all elements xpath from string')(r.body);
            return getAllXPath([
                '//h3[contains(., "Mon") or contains(., "Tue") or contains(., "Wed") or contains(., "Thu") or contains(., "Fri") or contains(., "Sat") or contains(., "Sun")]',
                {
                    dotw: './text()',
                    time: './following-sibling::p[count(./preceding-sibling::h3)=$i+1]//text()',
                    deals: './following-sibling::ul[count(./preceding-sibling::h3)=$i+1]//text()'
                }
            ]);
        })
       .then(r => {
            const happy = r.map(l => {
                return {
                    dotw: l.dotw,
                    time: l.time,
                    // Flatten the text nodes into one deal per non-empty line.
                    deals: l.deals.join('\n').trim().split(/\s*\n+\s*/ig)
                };
            });
            //...
        });
}