scraping | convert summaries | | Search

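This JavaScript module exports three helpers: `getNearestSunday` formats the most recent Sunday before a given date as `M/D/YYYY`; `defaultCollector` loads a page with a Selenium driver and returns the values matched by a selector as `{link}` objects; and `persistSummaries` reads or writes a weekly JSON summaries file under `Resources/Projects/reasonings`.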

Run example

npm run import -- "default link collector"

default link collector

const fs = require('fs')
const path = require('path')
const {safeurl} = importer.import("domain cache tools")

const PROJECT_PATH = path.join(__dirname, '..', 'Resources', 'Projects', 'reasonings')

/**
 * Format the most recent Sunday strictly before `date` as `M/D/YYYY`
 * (no zero padding, local time).
 *
 * When `date` itself falls on a Sunday the result is the Sunday one week
 * earlier, not `date`.
 *
 * @param {Date} [date=new Date()] - Reference date; it is not modified.
 * @returns {string} The Sunday's date, e.g. `1/7/2024`.
 */
function getNearestSunday(date = new Date()) {
  // Work on a copy: setDate() mutates in place and would otherwise rewind
  // the caller's Date object.
  const result = new Date(date.getTime())
  const day = result.getDay()
  // Mon-Sat step back `day` days; on a Sunday step back a full week.
  const daysBack = day === 0 ? 7 : day
  result.setDate(result.getDate() - daysBack)
  return `${result.getMonth() + 1}/${result.getDate()}/${result.getFullYear()}`
}

/**
 * Open `startPage` in a Selenium driver and collect whatever `selector`
 * matches, wrapping each match as `{link: match}`.
 *
 * @param {object} [driver] - Selenium driver; when falsy one is obtained from
 *   the "selenium client" import.
 * @param {string} startPage - URL passed to `driver.get`.
 * @param {string} [selector='//a[@href]/@href'] - Selector handed to the
 *   "selenium select" helper.
 * @returns {Promise<Array<{link: *}>>} One `{link}` object per match.
 * @throws Rethrows any navigation/selection error after quitting the driver.
 */
async function defaultCollector(driver, startPage, selector = '//a[@href]/@href') {
  const selectDom = importer.import("selenium select")
  const getClient = importer.import("selenium client")
  if(!driver) {
    // await is a no-op for a synchronous factory and required for an async one
    driver = await getClient()
  }

  try {
    await driver.get(startPage)
    // fixed one second pause before querying the page
    await new Promise(resolve => setTimeout(resolve, 1000))
    const links = await selectDom(driver, selector)
    return links.map(l => ({link: l})) // to match reddit post lister
  } catch (e) {
    // Wait for the session to close so no promise is left floating; a failure
    // while quitting must not mask the error that brought us here.
    try {
      await driver.quit()
    } catch (quitError) {
      // intentionally ignored in favour of the original error
    }
    throw e
  }
}

// record previously generated summaries so script will eventually complete
/**
 * Read or write this week's summaries file,
 * `<PROJECT_PATH>/<safeurl(sunday)>-<safeurl(funny)>.json`.
 *
 * @param {string} [funny='summary'] - Label used in the file name.
 * @param {object} [summaries] - When truthy, written to the file as
 *   4-space-indented JSON; when falsy, the file is read instead.
 * @returns {object} The written summaries, the parsed file contents, or `{}`
 *   when nothing was given and no file exists yet.
 */
function persistSummaries(funny, summaries) {
  if(!funny) {
    funny = 'summary'
  }
  let weeklySummary = path.join(PROJECT_PATH, safeurl(getNearestSunday()) + '-' + safeurl(funny) + '.json')
  if(!summaries && fs.existsSync(weeklySummary)) {
    return JSON.parse(fs.readFileSync(weeklySummary, 'utf8'))
  } else if (!summaries) {
    return {}
  } else {
    fs.writeFileSync(weeklySummary, JSON.stringify(summaries, null, 4))
    // return what was stored so every branch yields the current summaries
    return summaries
  }
}

module.exports = {
  defaultCollector,
  getNearestSunday,
  persistSummaries,
}

What the code could have been:

// Import required modules and tools
const fs = require('fs');
const path = require('path');
const { safeurl } = require('domain-cache-tools');
// `const { X } from '...'` is not valid JavaScript; use require() like the
// lines above so the file parses as CommonJS.
const { Client } = require('selenium-client');
const { Select } = require('selenium-select');

// Define constants
const PROJECT_PATH = path.join(__dirname, '..', 'Resources', 'Projects','reasonings');

// Function to get the most recent Sunday before a date
/**
 * Returns the most recent Sunday strictly before `date` as an unpadded
 * 'M/D/YYYY' string (local time). If `date` is itself a Sunday, the Sunday
 * one week earlier is returned.
 * @param {Date} [date=new Date()] - The reference date; it is not modified.
 * @returns {string} The Sunday's date, e.g. '1/7/2024'.
 */
function getNearestSunday(date = new Date()) {
  // Copy first: setDate() mutates in place and would rewind the caller's Date.
  const result = new Date(date.getTime());
  const day = result.getDay();
  // Mon-Sat step back `day` days; on a Sunday step back a full week.
  const daysBack = day === 0 ? 7 : day;
  result.setDate(result.getDate() - daysBack);
  return `${result.getMonth() + 1}/${result.getDate()}/${result.getFullYear()}`;
}

// Function to collect links from a webpage using Selenium
/**
 * Loads `startPage` in a Selenium driver and returns the selector matches,
 * each wrapped as `{ link: match }`.
 * @param {object} [driver] - The Selenium client instance; created via
 *   `Client()` when falsy.
 * @param {string} startPage - The URL passed to `driver.get`.
 * @param {string} [selector='//a[@href]/@href'] - The selector used for collecting links.
 * @returns {Promise<Array<{link: *}>>} A promise that resolves with one `{ link }` object per match.
 * @throws Rethrows any navigation/selection error after quitting the driver.
 */
async function defaultCollector(driver, startPage, selector = '//a[@href]/@href') {
  if (!driver) {
    driver = await Client();
  }
  try {
    await driver.get(startPage);
    // fixed one second pause before querying the page
    await new Promise((resolve) => setTimeout(resolve, 1000));
    const selectDom = Select(driver);
    const links = await selectDom(driver, selector);
    return links.map((l) => ({ link: l }));
  } catch (e) {
    // Wait for the session to close so no promise is left floating; a failure
    // while quitting must not mask the error that brought us here.
    try {
      await driver.quit();
    } catch (quitError) {
      // intentionally ignored in favour of the original error
    }
    throw e;
  }
}

// Function to load or store the weekly summaries file
/**
 * Loads or stores the summaries for the current week. The file lives at
 * `<PROJECT_PATH>/<safeurl(getNearestSunday())>-<safeurl(funny)>.json`.
 * @param {string} [funny='summary'] - Label that becomes part of the file name.
 * @param {Object} [summaries] - When truthy, written to the file as 4-space-indented JSON.
 * @returns {Object} The summaries just written, the parsed file contents, or
 *   an empty object when nothing was given and no file exists.
 */
function persistSummaries(funny, summaries) {
  const label = funny ? funny : 'summary';
  const fileName = safeurl(getNearestSunday()) + '-' + safeurl(label) + '.json';
  const target = path.join(PROJECT_PATH, fileName);

  // Write mode: a truthy `summaries` always overwrites the weekly file.
  if (summaries) {
    fs.writeFileSync(target, JSON.stringify(summaries, null, 4));
    return summaries;
  }

  // Read mode: fall back to an empty object until a file has been written.
  if (fs.existsSync(target)) {
    return JSON.parse(fs.readFileSync(target));
  }
  return {};
}

// Export the functions
module.exports = {
  defaultCollector,
  getNearestSunday,
  persistSummaries,
};

Code Breakdown

Requires and Imports

const fs = require('fs'); // Import the File System module
const path = require('path'); // Import the Path module
const { safeurl } = importer.import('domain cache tools'); // Import a function from a module

Constants

const PROJECT_PATH = path.join(__dirname, '..', 'Resources', 'Projects','reasonings'); // Project path

Functions

getNearestSunday(date = new Date())

/**
 * Format the most recent Sunday strictly before `date` as `M/D/YYYY`
 * (no zero padding, local time). A `date` that is itself a Sunday yields the
 * Sunday one week earlier.
 *
 * @param {Date} [date=new Date()] - Reference date; it is not modified.
 * @returns {string} The Sunday's date, e.g. `1/7/2024`.
 */
function getNearestSunday(date = new Date()) {
  // Copy first: setDate() mutates in place and would rewind the caller's Date.
  const result = new Date(date.getTime());
  const day = result.getDay();
  // Mon-Sat step back `day` days; on a Sunday step back a full week.
  const daysBack = day === 0 ? 7 : day;
  result.setDate(result.getDate() - daysBack);
  return `${result.getMonth() + 1}/${result.getDate()}/${result.getFullYear()}`;
}

defaultCollector(driver, startPage, selector = '//a[@href]/@href')

/**
 * Open `startPage` in a Selenium driver and collect whatever `selector`
 * matches, wrapping each match as `{ link: match }`.
 *
 * @param {object} [driver] - Selenium driver; when falsy one is obtained from
 *   the 'selenium client' import.
 * @param {string} startPage - URL passed to `driver.get`.
 * @param {string} [selector='//a[@href]/@href'] - Selector handed to the
 *   'selenium select' helper.
 * @returns {Promise<Array<{link: *}>>} One `{ link }` object per match.
 * @throws Rethrows any navigation/selection error after quitting the driver.
 */
async function defaultCollector(driver, startPage, selector = '//a[@href]/@href') {
  const selectDom = importer.import('selenium select');
  const getClient = importer.import('selenium client');

  if (!driver) {
    // await is a no-op for a synchronous factory and required for an async one
    driver = await getClient();
  }

  try {
    await driver.get(startPage);
    // fixed one second pause before querying the page
    await new Promise(resolve => setTimeout(resolve, 1000));
    const links = await selectDom(driver, selector);
    return links.map(l => ({ link: l }));
  } catch (e) {
    // Wait for the session to close so no promise is left floating; a failure
    // while quitting must not mask the error that brought us here.
    try {
      await driver.quit();
    } catch (quitError) {
      // intentionally ignored in favour of the original error
    }
    throw e;
  }
}

persistSummaries(funny, summaries)

/**
 * Read or write this week's summaries file,
 * `<PROJECT_PATH>/<safeurl(sunday)>-<safeurl(funny)>.json`.
 *
 * @param {string} [funny='summary'] - Label used in the file name.
 * @param {object} [summaries] - When truthy, written to the file as
 *   4-space-indented JSON; when falsy, the file is read instead.
 * @returns {object} The written summaries, the parsed file contents, or `{}`
 *   when nothing was given and no file exists yet.
 */
function persistSummaries(funny, summaries) {
  if (!funny) {
    funny ='summary';
  }

  let weeklySummary = path.join(PROJECT_PATH, safeurl(getNearestSunday()) + '-' + safeurl(funny) + '.json');

  if (!summaries && fs.existsSync(weeklySummary)) {
    return JSON.parse(fs.readFileSync(weeklySummary, 'utf8'));
  } else if (!summaries) {
    return {};
  } else {
    fs.writeFileSync(weeklySummary, JSON.stringify(summaries, null, 4));
    // return what was stored so every branch yields the current summaries
    return summaries;
  }
}

Exports

module.exports = {
  defaultCollector,
  getNearestSunday,
  persistSummaries,
};