The redditList function is an asynchronous function that retrieves a list of Reddit posts from a specified subreddit, handling parameters such as the start page and time span. It iteratively retrieves pages of posts using the redditLinks function, stops once a page contains posts older than the specified time span, pauses for one second between page requests, and uses a safety counter (at most 20 pages) to prevent infinite loops.
npm run import -- "reddit month of links"
const redditLinks = importer.import("reddit scraper")
const getClient = importer.import("selenium client")
/**
 * Collects posts from a Reddit listing, following each page's `next` link
 * until a page contains a post older than the requested time span, there is
 * no next page, or 20 pages have been fetched.
 *
 * @param {object} [driver] - Selenium driver; obtained from getClient() when omitted.
 * @param {string} [startPage] - Full URL, or a subreddit name that is appended to
 *   'https://www.reddit.com/r/'. A falsy value selects the default multi-subreddit URL.
 * @param {string} [timeSpan='week'] - 'month' uses a cutoff of 4.2 weeks ago;
 *   any other value uses one week ago.
 * @returns {Promise<Array>} All posts from every visited page, including the
 *   older posts on the page that stopped the loop.
 * @throws Rethrows any error raised while scraping, after trying to quit the driver.
 */
async function redditList(driver, startPage, timeSpan = 'week') {
    const WEEK_MS = 1000 * 60 * 60 * 24 * 7

    // Work on a local cursor instead of reassigning the caller-visible parameter.
    let page = startPage
        || 'https://www.reddit.com/r/CollapseSupport+climatechange+collapse+economicCollapse/'
    if(!page.includes('://')) {
        page = 'https://www.reddit.com/r/' + page
    }
    if(!driver) {
        driver = await getClient()
    }

    try {
        // The cutoff never changes during a run, so pick it once up front.
        const spanMs = timeSpan === 'month' ? WEEK_MS * 4.2 : WEEK_MS
        const cutoff = new Date(Date.now() - spanMs)

        let posts = []
        let safety = 20
        let keepGoing
        do {
            console.log(page)
            const batch = await redditLinks(driver, page)
            posts = posts.concat(batch)
            page = batch.next
            safety--
            // Continue only while there is a next page, nothing on this page
            // predates the cutoff, and the page budget is not exhausted.
            const reachedCutoff = batch.some(post => post.time < cutoff)
            keepGoing = Boolean(page) && !reachedCutoff && safety > 0
            if(keepGoing) {
                // Pause between page requests.
                await new Promise(resolve => setTimeout(resolve, 1000))
            }
        } while(keepGoing)
        return posts
    } catch (e) {
        // Await the shutdown so a failing quit() can neither surface as an
        // unhandled rejection nor replace the original scraping error.
        try {
            await driver.quit()
        } catch (quitError) {
            console.error('Failed to quit driver:', quitError)
        }
        throw e
    }
}
module.exports = redditList
// Import required modules and clients
const redditLinks = require('./reddit scraper');
const getClient = require('./selenium client');
/**
 * Collects Reddit posts page by page, following each page's `next` link until
 * a page contains a post older than the requested time span, there is no next
 * page, or 20 pages have been fetched.
 *
 * @param {object} [driver] - Selenium driver instance; obtained from getClient() when omitted.
 * @param {string} [startPage] - Starting page URL, or a subreddit name that is
 *   appended to 'https://www.reddit.com/r/'. Defaults to
 *   r/CollapseSupport+climatechange+collapse+economicCollapse/ (also used for null or '').
 * @param {string} [timeSpan='week'] - 'month' uses a cutoff of 4.2 weeks ago;
 *   any other value uses one week ago.
 * @returns {Promise<Array>} All posts from every visited page, including the
 *   older posts on the page that stopped the loop.
 * @throws Rethrows any error raised while scraping, after trying to quit the driver.
 */
async function redditList(driver, startPage = 'https://www.reddit.com/r/CollapseSupport+climatechange+collapse+economicCollapse/', timeSpan = 'week') {
  // A default parameter only replaces `undefined`; null and '' must fall back
  // to the same listing instead of crashing on `.includes` below.
  let page = startPage || 'https://www.reddit.com/r/CollapseSupport+climatechange+collapse+economicCollapse/';

  // A bare subreddit name is expanded to a full URL.
  if (!page.includes('://')) {
    page = `https://www.reddit.com/r/${page}`;
  }

  if (!driver) {
    driver = await getClient();
  }

  try {
    // The cutoff is fixed for the whole run, so select it once.
    const weekMs = 1000 * 60 * 60 * 24 * 7;
    const cutoff = new Date(Date.now() - (timeSpan === 'month' ? weekMs * 4.2 : weekMs));

    const finalResult = [];
    let safety = 20;
    let hasMore;

    do {
      console.log(page);
      const result = await redditLinks(driver, page);
      finalResult.push(...result);
      page = result.next;

      // Number of posts on this page that are older than the cutoff.
      const beforeTimeSpan = result.filter((r) => r.time < cutoff).length;

      // Spend the page budget before deciding to wait, so the final allowed
      // page does not sleep for a second just to exit the loop.
      safety--;
      hasMore = Boolean(page) && beforeTimeSpan === 0 && safety > 0;

      if (hasMore) {
        // Pause between page requests.
        await new Promise((resolve) => setTimeout(resolve, 1000));
      }
    } while (hasMore);

    return finalResult;
  } catch (e) {
    // Await the shutdown so a failing quit() can neither surface as an
    // unhandled rejection nor replace the original scraping error.
    try {
      await driver.quit();
    } catch (quitError) {
      console.error('Failed to quit driver:', quitError);
    }
    throw e;
  }
}
module.exports = redditList;
Code Breakdown
The code defines an asynchronous function redditList that retrieves a list of Reddit posts from a specified subreddit.

Parameters
- driver: an instance of a Selenium client
- startPage: the URL of the subreddit to retrieve posts from (default: 'https://www.reddit.com/r/CollapseSupport+climatechange+collapse+economicCollapse/')
- timeSpan: the time span for which to retrieve posts (default: 'week')

Initialization
- If startPage is not provided, it defaults to 'https://www.reddit.com/r/CollapseSupport+climatechange+collapse+economicCollapse/'. If startPage does not contain a protocol (e.g., 'http://' or 'https://'), it is prepended with 'https://www.reddit.com/r/'.
- If driver is not provided, it is obtained from the getClient function.

Main Loop
- Each iteration retrieves a page of posts by calling the redditLinks function, which takes the driver and startPage as arguments.
- The retrieved posts are appended to the finalResult array.
- The function counts the posts that are older than the specified timeSpan. If there are no such posts, it waits for 1 second before continuing.
- The loop ends when there is no next page, a post older than the timeSpan is found, or the safety counter reaches 0.

Error Handling
- If an error occurs, the driver is quit, and the error is re-thrown.

Notes
- The function uses a do...while loop construct to ensure that the loop body is executed at least once.
- The safety counter is used to prevent the loop from running indefinitely in case there are no posts that match the specified timeSpan.
- The beforeTimeSpan variable is used to check if there are any posts that are older than the specified timeSpan.
- The Promise constructor is used to introduce a delay of 1 second between iterations of the loop.