
The redditLinks function uses Selenium WebDriver to scrape posts from a specified subreddit or Reddit URL, extracting the title, timestamp, link, and comment count of each post. It returns an array of result objects whose next property holds the URL of the next page of posts.

Run example

npm run import -- "reddit scraper"

reddit scraper

const selectDom = importer.import("selenium select")
const getClient = importer.import("selenium client")
const {URL} = require('url')

async function redditLinks(driver, startPage) {
  if(!startPage.includes('://')) {
    startPage = 'https://www.reddit.com/r/' + startPage
  }

  let startUrl = new URL(startPage)

  if(!driver) {
    driver = await getClient()
  }

  try {
    await driver.get(startPage)

    // give the listing a moment to render before querying the DOM
    await new Promise(resolve => setTimeout(resolve, 1000))

    // select post containers in the main listing, skipping promoted posts
    let links = await selectDom(driver, [
      '//div[contains(@role, "main")]//div[contains(@class, "link") and not(contains(@class, "linklisting")) and not(contains(@class, "promoted"))]'])

    // TODO: get some special links, get comment count, titles, finally next page
    let results = []
    for(let i = 0; i < links.length; i++) {
      let result = await selectDom(driver, {
        title: './/a[contains(@class, "title")]/text()',
        link: './/a[contains(@class, "title")]/@href',
        time: './/time/@datetime',
        comment: './/a[contains(@class, "comments")]/text()',
      }, links[i])
      results.push(result)
    }

    let next = await selectDom(driver, '//a[contains(@rel, "next")]/@href')

    let objectArray = results.map(r => ({
      title: r.title,
      time: new Date(r.time),
      // absolute links pass through; relative hrefs resolve against the page URL
      link: r.link.includes('://')
        ? r.link
        : new URL(r.link, startUrl).href,
      comment: r.comment
    }))
    objectArray.next = next
    return objectArray
  } catch (e) {
    // tear down the browser session before propagating the error
    await driver.quit()

    throw e
  }
}

module.exports = redditLinks
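
A minimal usage sketch, assuming the notebook importer exposes this cell as "reddit scraper" (the subreddit name is only an example):

const redditLinks = importer.import("reddit scraper")

async function demo() {
  // passing a falsy driver lets redditLinks create its own Selenium client
  const posts = await redditLinks(null, 'javascript')
  for (const p of posts) {
    console.log(p.time, p.title, '->', p.link)
  }
  console.log('next page:', posts.next)
}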

What the code could have been:

const { Builder, By, until } = require('selenium-webdriver');
const { URL } = require('url');

async function redditLinks(subreddit, driver = null) {
  const url = new URL(`https://www.reddit.com/r/${subreddit}`);
  driver ??= await new Builder().forBrowser('chrome').build();
  try {
    await driver.get(url.href);

    await driver.wait(until.elementLocated(By.css('body')), 1000);

    const links = await driver.findElements(By.xpath('//div[contains(@role, "main")]//div[contains(@class, "link") and not(contains(@class, "linklisting")) and not(contains(@class, "promoted"))]'));

    const results = await Promise.all(links.map(async (link) => {
      // findElement must target elements; read text()/@attr values afterwards
      const titleEl = await link.findElement(By.xpath('.//a[contains(@class, "title")]'));
      const title = await titleEl.getText();
      const linkHref = await titleEl.getAttribute('href');
      const time = await link.findElement(By.xpath('.//time')).getAttribute('datetime');
      const comment = await link.findElement(By.xpath('.//a[contains(@class, "comments")]')).getText();
      // getAttribute('href') already resolves relative URLs, but guard anyway
      return { title, link: linkHref.startsWith('/') ? `${url.origin}${linkHref}` : linkHref, time, comment };
    }));

    const next = await driver.findElement(By.xpath('//a[contains(@rel, "next")]')).getAttribute('href');

    const objectArray = results.map((r) => ({...r, time: new Date(r.time) }));
    objectArray.push({ next });

    await driver.quit();

    return objectArray;
  } catch (error) {
    await driver.quit();
    throw error;
  }
}

module.exports = redditLinks;
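
Compared with the original, this variant extracts fields from all posts concurrently via Promise.all rather than a sequential loop, appends the next URL as a final array element instead of attaching it as a property, and quits the driver even on success, so it cannot reuse a caller-supplied session to follow the next link.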

Function: redditLinks

Parameters

driver: an optional Selenium WebDriver instance; when omitted, a new client is created via the "selenium client" module.

startPage: a subreddit name (e.g. "javascript") or the full URL of a Reddit listing page.

Returns

An array of objects, each with title, time (a Date), link (an absolute URL), and comment (the comment-count text). The array also carries a next property with the URL of the next page of posts.

Behavior

If startPage does not contain a scheme (://), it is assumed to be a subreddit name and is prepended with https://www.reddit.com/r/.
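
Both call forms below reach the same listing; the first relies on that scheme check (the subreddit name is only an example):

// inside an async function, with driver obtained from the "selenium client" module
await redditLinks(driver, 'javascript')
await redditLinks(driver, 'https://www.reddit.com/r/javascript')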

The function uses Selenium WebDriver to navigate to the page, waits one second for the listing to render, and then selects the post containers, skipping promoted posts.

For each post, it extracts the title, link, timestamp, and comment count, and returns an array of objects containing this data.
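
For illustration, one element of the returned array has this shape (all values are placeholders, not real data):

const examplePost = {
  title: 'Ask r/javascript: which bundler do you use?',
  time: new Date('2019-06-01T12:00:00.000Z'),
  link: 'https://www.reddit.com/r/javascript/comments/abc123/which_bundler/',
  comment: '42 comments'
}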

The next property is the URL of the next page of posts.
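
Because next is itself a full URL, it can be fed straight back into redditLinks. A pagination sketch (maxPages is a hypothetical safety cap, not part of the original API):

async function scrapePages(driver, start, maxPages = 3) {
  let page = await redditLinks(driver, start)
  const all = [...page]
  // follow the next link until it runs out or the cap is reached
  for (let i = 1; i < maxPages && page.next; i++) {
    page = await redditLinks(driver, page.next)
    all.push(...page)
  }
  return all
}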

Error Handling

If an error occurs during the scraping process, the function closes the WebDriver instance and re-throws the error.
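
Since the failing session is quit inside redditLinks, a caller that wants to retry needs a fresh client. A sketch, assuming the same "selenium client" module used above:

const getClient = importer.import("selenium client")

async function scrapeWithRetry(subreddit) {
  let driver = await getClient()
  try {
    return await redditLinks(driver, subreddit)
  } catch (e) {
    // the old session was quit inside redditLinks; start a new one
    driver = await getClient()
    return await redditLinks(driver, subreddit)
  }
}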