The `redditLinks` function scrapes Reddit posts from a specified subreddit or URL using Selenium WebDriver, extracting the title, timestamp, link, and comment count for each post. It returns an array of objects containing the scraped data, along with the URL of the next page of posts.
```bash
npm run import -- "reddit scraper"
```
```javascript
const selectDom = importer.import("selenium select")
const getClient = importer.import("selenium client")
const { URL } = require('url')

async function redditLinks(driver, startPage) {
  // Treat a bare name as a subreddit and build the full URL
  if (!startPage.includes('://')) {
    startPage = 'https://www.reddit.com/r/' + startPage
  }
  let startUrl = new URL(startPage)

  // Create a Selenium client on demand when no driver is supplied
  if (!driver) {
    driver = await getClient()
  }

  try {
    await driver.get(startPage)
    // Give the listing a moment to render
    await new Promise(resolve => setTimeout(resolve, 1000))

    // Select post containers, skipping the listing wrapper and promoted posts
    let links = await selectDom(driver, [
      '//div[contains(@role, "main")]//div[contains(@class, "link") and not(contains(@class, "linklisting")) and not(contains(@class, "promoted"))]'
    ])

    let results = []
    for (let i = 0; i < links.length; i++) {
      // Extract each post's fields relative to its container element
      let result = await selectDom(driver, {
        title: './/a[contains(@class, "title")]/text()',
        link: './/a[contains(@class, "title")]/@href',
        time: './/time/@datetime',
        comment: './/a[contains(@class, "comments")]/text()',
      }, links[i])
      results.push(result)
    }

    // Pagination link to the next page of posts
    let next = await selectDom(driver, '//a[contains(@rel, "next")]/@href')

    let objectArray = results.map(r => ({
      title: r.title,
      time: new Date(r.time),
      // Resolve relative and root-relative hrefs against the start URL
      link: new URL(r.link, startUrl).href,
      comment: r.comment
    }))
    objectArray.next = next
    return objectArray
  } catch (e) {
    await driver.quit()
    throw e
  }
}

module.exports = redditLinks
```
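A minimal usage sketch, assuming the importer environment and a working Selenium client; the subreddit name is illustrative:

```javascript
const redditLinks = importer.import("reddit scraper")

async function demo() {
  // Pass null so redditLinks creates its own Selenium client
  const posts = await redditLinks(null, 'programming')
  console.log(posts.length, 'posts; next page:', posts.next)
  console.log(posts[0]) // { title, time, link, comment }
}
```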
A standalone variant that uses selenium-webdriver directly, taking the subreddit name first and building a Chrome driver when none is supplied:

```javascript
const { Builder, By, until } = require('selenium-webdriver');
const { URL } = require('url');

async function redditLinks(subreddit, driver = null) {
  const url = new URL(`https://www.reddit.com/r/${subreddit}`);
  // Only quit the driver on exit if this function created it
  const ownsDriver = !driver;
  driver ??= await new Builder().forBrowser('chrome').build();
  try {
    await driver.get(url.href);
    await driver.wait(until.elementLocated(By.css('body')), 1000);

    // Post containers, skipping the listing wrapper and promoted posts
    const links = await driver.findElements(By.xpath('//div[contains(@role, "main")]//div[contains(@class, "link") and not(contains(@class, "linklisting")) and not(contains(@class, "promoted"))]'));

    const results = await Promise.all(links.map(async (link) => {
      // findElement must target element nodes; read text and attributes afterwards
      const anchor = await link.findElement(By.xpath('.//a[contains(@class, "title")]'));
      const title = await anchor.getText();
      const linkHref = await anchor.getAttribute('href');
      const time = await link.findElement(By.xpath('.//time')).getAttribute('datetime');
      const comment = await link.findElement(By.xpath('.//a[contains(@class, "comments")]')).getText();
      // getAttribute('href') already resolves relative URLs, but normalize defensively
      return { title, link: new URL(linkHref, url).href, time, comment };
    }));

    // The next-page link may be absent on the last page
    const nextEls = await driver.findElements(By.xpath('//a[contains(@rel, "next")]'));
    const next = nextEls.length ? await nextEls[0].getAttribute('href') : null;

    const objectArray = results.map((r) => ({ ...r, time: new Date(r.time) }));
    objectArray.next = next;
    if (ownsDriver) {
      await driver.quit();
    }
    return objectArray;
  } catch (error) {
    if (ownsDriver) {
      await driver.quit();
    }
    throw error;
  }
}

module.exports = redditLinks;
```
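Under the same assumptions (a local ChromeDriver on the PATH), the standalone variant can be called with just a subreddit name; the module path below is hypothetical:

```javascript
const redditLinks = require('./reddit-scraper'); // hypothetical module path

async function main() {
  // No driver passed, so the function builds and quits its own Chrome instance
  const posts = await redditLinks('javascript');
  for (const post of posts) {
    console.log(post.time.toISOString(), post.title, post.comment);
  }
}

main().catch(console.error);
```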
Parameters:

- `driver`: an instance of Selenium WebDriver, or `null`/`undefined` to initialize a new instance.
- `startPage`: the name of the Reddit subreddit to scrape, or a full URL to the subreddit.

Each object in the returned array has:

- `title`: the title of the post.
- `time`: the timestamp of the post, as a `Date` object.
- `link`: the link to the post.
- `comment`: the number of comments on the post.
- `next`: the URL of the next page of posts (attached as a property of the returned array itself).

If `startPage` does not contain a scheme (`://`), it is assumed to be a subreddit name and is prepended with `https://www.reddit.com/r/`.
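An illustrative example of the returned value's shape; the titles, links, and counts below are made up:

```javascript
// Hypothetical output; field values are fabricated for illustration
[
  {
    title: 'Show /r/programming: my side project',
    time: new Date('2023-05-01T12:34:56.000Z'),
    link: 'https://www.reddit.com/r/programming/comments/abc123/show_rprogramming_my_side_project/',
    comment: '42 comments'
  },
  // ...more posts
]
// plus a `next` property on the array itself, e.g.:
// objectArray.next === 'https://www.reddit.com/r/programming/?count=25&after=t3_abc123'
```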
The function uses Selenium WebDriver to navigate to the specified subreddit, wait one second for the page to render, and then extract the post links.
For each post, it extracts the title, link, timestamp, and comment count, and returns an array of objects containing this data.
The `next` property of the returned array is the URL of the next page of posts.
If an error occurs during the scraping process, the function closes the WebDriver instance and re-throws the error.
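A sketch of following pagination with the `next` property, assuming the first implementation's signature and reusing one driver across pages:

```javascript
const getClient = importer.import("selenium client")
const redditLinks = importer.import("reddit scraper")

async function scrapePages(subreddit, maxPages = 3) {
  const driver = await getClient()
  const all = []
  let page = subreddit
  // Follow the next-page URL until it runs out or maxPages is reached;
  // on error, redditLinks quits the driver itself before re-throwing
  for (let i = 0; i < maxPages && page; i++) {
    const posts = await redditLinks(driver, page)
    all.push(...posts)
    page = posts.next
  }
  await driver.quit()
  return all
}
```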