
This code defines a function scrapeAlert that fetches a Maricopa County Attorney CivicAlerts page for a given ID, extracts its date, title, and content, saves the result as a JSON file, and exports the function for use elsewhere.

Run example

npm run import -- "Crime reports"

Crime reports

var fs = require('fs');
var path = require('path');

var PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE || '';
var project = path.join(PROFILE_PATH, 'Collections/crimes');

// Scrape a single CivicAlerts page; `client` is expected to be a web-scraping
// session provided by the surrounding environment
function scrapeAlert(ID) {
    // Skip alerts that have already been saved to disk
    if(fs.existsSync(path.join(project, 'maricopa-alert-' + ID + '.json'))) {
        return;
    }
    return client
        .url('https://www.maricopacountyattorney.org/CivicAlerts.aspx?AID=' + ID)
        .getAllXPath({
            time: '//*[@class = "single"]//*[@class = "date"]//text()',
            title: '//*[contains(@class, "single")]//h3//text()',
            content: '//*[@class = "single"]//*[@class = "content"]//text()'
        })
        .then(r => {
            // Save the extracted fields as pretty-printed JSON
            fs.writeFileSync(path.join(project, 'maricopa-alert-' + ID + '.json'), JSON.stringify(r, null, 4));
            return r;
        })
        .catch(e => console.log(e));
}
module.exports = scrapeAlert;

// When run inside the notebook environment, crawl the first 500 alert IDs
if(typeof $ !== 'undefined') {
    $.async();
    var IDs = Array.from(Array(500).keys()); // IDs 0 through 499
    multiCrawl(IDs, 'crime reports')
        .then(r => $.sendResult(r))
        .catch(e => $.sendError(e));
}

What the code could have been:

const fs = require('fs').promises; // Import fs with promise-based API
const path = require('path');

const PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE || '';
const project = path.join(PROFILE_PATH, 'Collections/crimes');

/**
 * Scrapes a Maricopa County crime alert and saves it as a JSON file.
 *
 * @param {string|number} ID - The ID of the alert to scrape.
 * @param {object} client - The client used for web scraping.
 * @returns {Promise|void} The scraped alert data, or undefined if it was already saved.
 */
async function scrapeAlert(ID, client) {
    const filePath = path.join(project, `maricopa-alert-${ID}.json`);

    // Skip alerts that have already been saved
    try {
        await fs.access(filePath);
        return; // The file exists, nothing to do
    } catch (e) {
        // The file does not exist yet; fall through and scrape it
    }

    try {
        const r = await client
           .url(`https://www.maricopacountyattorney.org/CivicAlerts.aspx?AID=${ID}`)
           .getAllXPath({
                time: '//*[@class = "single"]//*[@class = "date"]//text()',
                title: '//*[contains(@class, "single")]//h3//text()',
                content: '//*[@class = "single"]//*[@class = "content"]//text()'
            });

        // Save the scraped data to a JSON file
        await fs.writeFile(filePath, JSON.stringify(r, null, 4));
        return r;
    } catch (e) {
        console.error(e);
    }
}

/**
 * Crawl multiple alert IDs and send the results back to the notebook runner.
 *
 * @param {Array<string|number>} IDs - The IDs of the alerts to crawl.
 * @param {string} label - The label for the crawl.
 * @returns {Promise|void} The crawled data, or void if the $ helper is unavailable.
 */
async function multiCrawl(IDs, label) {
    if (typeof $ !== 'undefined') {
        $.async(); // Mark the notebook cell as asynchronous
        try {
            // `client` is assumed to be provided by the surrounding scraping environment
            const tasks = IDs.map(ID => scrapeAlert(ID, client)); // Create an array of tasks
            const results = await Promise.all(tasks); // Execute the tasks concurrently
            return $.sendResult(results);
        } catch (e) {
            return $.sendError(e);
        }
    } else {
        console.error('$ is not defined');
    }
}

module.exports = { scrapeAlert, multiCrawl };

Code Breakdown

Dependencies

  • fs (File System): used for file operations.
  • path: used for path manipulation.

Variables

  • PROFILE_PATH: the user's home directory, resolved from the HOME, HOMEPATH, or USERPROFILE environment variable (whichever is set).
  • project: the output directory 'Collections/crimes' inside the home directory, as sketched below.
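The original code assumes the Collections/crimes directory already exists before fs.writeFileSync is called; a minimal sketch of how the same variables could be used to create it on first run (the mkdirSync guard is an addition, not part of the original):

var fs = require('fs');
var path = require('path');

var PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE || '';
var project = path.join(PROFILE_PATH, 'Collections/crimes');

// Create the output directory on first run so writeFileSync does not fail
if (!fs.existsSync(project)) {
    fs.mkdirSync(project, { recursive: true });
}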

scrapeAlert Function

  • Accepts ID as an argument.
  • Checks if a file with the name maricopa-alert-<ID>.json exists in the specified project directory. If it does, the function returns immediately.
  • Fetches data from https://www.maricopacountyattorney.org/CivicAlerts.aspx?AID=<ID> using a client object that is assumed to be available in scope (likely a web-scraping client); see the usage sketch after this list.
  • Extracts the time, title, and content from the webpage using XPath expressions.
  • Saves the extracted data to a JSON file with the name maricopa-alert-<ID>.json in the project directory.
  • Returns the extracted data.
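A hypothetical usage sketch, assuming this cell is exported as a module (the './crime-reports' path is an assumption), that a pre-configured global client is available, and that the alert has not been scraped before, so a promise is returned:

var scrapeAlert = require('./crime-reports'); // hypothetical module path

scrapeAlert(42)
    .then(function (alert) {
        // alert.time, alert.title and alert.content hold the text nodes
        // extracted from the CivicAlerts page; the same data is also written
        // to maricopa-alert-42.json in the project directory
        console.log(alert.title);
    })
    .catch(function (e) { console.error(e); });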

Export

  • Exports the scrapeAlert function.

Additional Code

  • Checks whether the notebook helper $ is defined; if it is, calls $.async() to mark the cell as asynchronous.
  • Builds an array of the IDs 0 through 499 and passes it to multiCrawl with the label 'crime reports' (a sketch of what multiCrawl might look like follows this list).
  • Sends the crawl results back with $.sendResult, or reports a failure with $.sendError.
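multiCrawl itself is not defined in this cell and presumably comes from the surrounding notebook environment; a minimal sketch, assuming it simply runs scrapeAlert for each ID one at a time and collects the results (the real helper may parallelize or use the label differently):

var scrapeAlert = require('./crime-reports'); // hypothetical module path

function multiCrawl(IDs, label) {
    // label is used by the real helper to locate the notebook cell; ignored here.
    // Run scrapeAlert sequentially over every ID and collect the results.
    return IDs.reduce(function (chain, ID) {
        return chain.then(function (results) {
            return Promise.resolve(scrapeAlert(ID))
                .then(function (r) {
                    results.push(r);
                    return results;
                });
        });
    }, Promise.resolve([]));
}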