scraping | test article summarizer | convert summaries | Search

The summerizeAll function extracts and summarizes all links from a provided startPage or links array by selecting a link scraping tool, scraping links, extracting article content, summarizing articles, and persisting summaries. The function uses various modules and functions, including getClient, extractArticle, summerizeArticle, defaultCollector, and persistSummaries, and is exported as a module for use elsewhere.

Run example

npm run import -- "summarize all articles"

summarize all articles

const getClient = importer.import("selenium client")
const extractArticle = importer.import("extract llm article")
const summerizeArticle = importer.import("summarize llm article")
const {
  defaultCollector, persistSummaries
} = importer.import("default link collector")

// select link scraping tool
/**
 * Pick the function that will collect links from a start page.
 *
 * - No selector and a start page containing 'reddit.com': the
 *   "reddit month of links" module is imported and returned.
 * - No selector otherwise: `defaultCollector` is returned.
 * - String selector: `defaultCollector` pre-bound with (startPage, selector).
 * - Anything else (e.g. a caller-supplied function): returned unchanged.
 *
 * @param {Function|string} [selector] - custom collector or a selector string.
 * @param {string} [startPage] - page the links are scraped from; may be
 *   missing when the caller already has its list of links.
 * @returns {*} the collector to invoke.
 */
function selectScaper(selector, startPage) {
  if (!selector) {
    // startPage can be undefined here, so guard before calling .includes()
    // instead of throwing a TypeError.
    const isReddit = startPage != null && startPage.includes('reddit.com')
    return isReddit
      ? importer.import("reddit month of links")
      : defaultCollector
  }

  if (typeof selector === 'string') {
    // NOTE(review): bound arguments are prepended, so the collector ends up
    // being called as (startPage, selector, ...callArgs) - confirm this is
    // the argument order defaultCollector expects.
    return defaultCollector.bind(null, startPage, selector)
  }

  return selector
}

// extract persist, extract persist
/**
 * Summarize every article in `links`, or every link scraped from
 * `startPage` when no links are supplied, persisting after each article.
 *
 * @param {Array<{link: string}>} [links] - entries whose `link` is visited.
 * @param {Function|string} [selector] - forwarded to selectScaper.
 * @param {string} [startPage] - page to scrape when `links` is not given.
 * @param {*} [funny] - passed through to summerizeArticle and persistSummaries.
 * @returns {Promise<Object|undefined>} the summaries keyed by link, or
 *   undefined when neither `startPage` nor `links` was provided.
 */
async function summerizeAll(links, selector, startPage, funny) {
  if (!startPage && !links) {
    console.error('No start page or links to summerize.')
    return
  }

  const driver = await getClient()

  // Called without a second argument, persistSummaries returns the existing
  // summaries; they are used below as a link -> summary lookup.
  const summaries = persistSummaries(funny)

  let targets = links
  if (!targets) {
    // Only choose a scraper when one is actually needed: the choice inspects
    // startPage, which is legitimately absent when links were passed in.
    const selectorFunction = selectScaper(selector, startPage)
    targets = await selectorFunction(driver, startPage)
  }

  console.log(targets)

  for (const entry of targets) {
    const url = entry.link
    if (typeof summaries[url] !== 'undefined') {
      continue // already loaded
    }

    const article = await extractArticle(driver, url)
    summaries[url] = await summerizeArticle(article, funny)

    // Persist after every article so progress survives a later failure.
    persistSummaries(funny, summaries)
  }

  // NOTE(review): on failure the error propagates and the driver is left
  // running, matching the original, whose quit-on-error call was disabled.
  // Confirm whether that is still wanted.
  await driver.quit()

  return summaries
}

module.exports = summerizeAll

What the code could have been:

const Client = require('selenium-client');
const { ExtractLLMArticle, SummarizeLLMArticle } = require('./extract-llm-article');
const { DefaultLinkCollector, persistSummaries } = require('./default-link-collector');
const RedditLinkCollector = require('./reddit-link-collector');

/**
 * Chooses which link collector to use for a scraping run.
 */
class LinkScraper {
  /**
   * Resolve `selector` into a collector.
   *
   * - No selector and a start page containing 'reddit.com': a new
   *   RedditLinkCollector.
   * - No selector otherwise: a new DefaultLinkCollector.
   * - String selector: a factory that builds a DefaultLinkCollector from
   *   (startPage, selector).
   * - Anything else: returned unchanged.
   *
   * @param {Function|string} [selector] - custom collector or selector string.
   * @param {string} [startPage] - page the links are scraped from; may be
   *   missing when the caller already has its links.
   * @returns {*} the collector, or a factory for one in the string case.
   */
  static selectScraper(selector, startPage) {
    if (!selector) {
      // Guard before .includes(): startPage may be undefined.
      const isReddit = startPage != null && startPage.includes('reddit.com');
      return isReddit ? new RedditLinkCollector() : new DefaultLinkCollector();
    }

    if (typeof selector === 'string') {
      // Capture the string in its own binding. Reassigning `selector` to the
      // arrow function made the closure pass the function itself along.
      const cssSelector = selector;
      return () => new DefaultLinkCollector(startPage, cssSelector);
    }

    return selector;
  }
}

/**
 * Summarize every article in `links`, or every link collected from
 * `startPage` when no links are supplied, persisting after each article.
 *
 * @param {Array<{link: string}>} [links] - entries whose `link` is visited.
 * @param {Function|string} [selector] - forwarded to LinkScraper.selectScraper.
 * @param {string} [startPage] - page to collect links from when `links` is
 *   not given.
 * @param {*} [funny] - passed to SummarizeLLMArticle and persistSummaries.
 * @returns {Promise<Object>} the summaries keyed by link.
 * @throws {Error} when neither `startPage` nor `links` is provided.
 */
async function summarizeAll(links, selector, startPage, funny) {
  if (!startPage && !links) {
    throw new Error('No start page or links to summarize.');
  }

  const client = new Client();
  const extractor = new ExtractLLMArticle();
  const summarizer = new SummarizeLLMArticle(funny);
  const summaries = persistSummaries(funny);

  let targets = links;
  if (!targets) {
    // Resolve the collector only when it is needed: selection inspects
    // startPage, which is legitimately absent when links were passed in.
    const collector = LinkScraper.selectScraper(selector, startPage);
    targets = await collector(client, startPage);
  }

  console.log(targets);

  for (const { link: url } of targets) {
    if (summaries[url]) {
      continue; // already loaded
    }

    // NOTE(review): `extractor.client(...)` and calling `summarizer` directly
    // are kept as written; confirm these match the real class APIs.
    const article = await extractor.client(client, url);
    const summary = await summarizer(article);

    summaries[url] = summary;
    persistSummaries(funny, summaries);
  }

  // On failure the error propagates and the client is deliberately not quit.
  await client.quit();

  return summaries;
}

module.exports = summarizeAll;

Code Breakdown

Importing Modules

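The code imports several modules using the importer.import function: getClient ("selenium client"), extractArticle ("extract llm article"), summerizeArticle ("summarize llm article"), and defaultCollector and persistSummaries (both from "default link collector").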

Selecting a Link Scraping Tool

The selectScaper function determines which link scraping tool to use based on the provided selector and startPage parameters. It returns a function that can be used to scrape links.

Summarizing All Links

The summerizeAll function extracts and summarizes all links from a provided startPage or links array.

Function Parameters

Function Flow

  1. It checks whether both startPage and links are falsy; if so, it logs an error message and returns early.
  2. It selects a link scraping tool using the selectScaper function.
  3. It creates a Selenium client driver using the getClient function.
  4. It initializes an object to store summaries using the persistSummaries function.
  5. If startPage is provided but links is falsy, it scrapes links from the startPage using the selected tool.
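  6. It loops through each link and, unless a summary for that link already exists, extracts the article, summarizes it, stores the summary under the link, and persists the updated summaries.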
  7. It quits the Selenium driver and returns the final summaries object.

Exporting the Function

The summerizeAll function is exported as a module.