scrape linkedin threads

This code automates scraping LinkedIn message threads, including each thread's messages and participant information, and stores the results as local JSON files. It uses Selenium for web interaction (the LinkedIn helper functions are loaded through `runSeleniumCell`), and the `fs`, `path`, and `glob` modules to locate, read, and write the JSON files.

Run example

What the code could have been:

const fs = require('fs');
const path = require('path');
const glob = require('glob');
const { runSeleniumCell } = require('../Core');
const logger = require('./logger'); // assuming a custom logger module
// Base directory taken from the environment by getProjectPath(); it is
// undefined when none of HOME / HOMEPATH / USERPROFILE is set.
const PROJECT_PATH = getProjectPath();
// Directory ("Conversations" under the base directory) where threads.json and
// the per-thread JSON files are read and written.
const project = path.join(PROJECT_PATH, 'Conversations');

// LinkedIn functions
// Obtained once, at module load, from getLinkedInFunctions().
// NOTE(review): the destructuring assumes the helpers are returned
// synchronously as an object with these four properties — confirm against
// runSeleniumCell.
const { loginLinkedIn, readLinkedInProfileInfo, listLinkedInThreads, readLinkedInThread } =
  getLinkedInFunctions();

/**
 * Resolve the base directory from the process environment.
 *
 * HOME is preferred, then HOMEPATH; USERPROFILE is the final fallback and is
 * returned as-is, even when it is empty or unset.
 *
 * @returns {string|undefined} The first non-empty of HOME / HOMEPATH,
 *   otherwise whatever USERPROFILE holds.
 */
function getProjectPath() {
  const { HOME, HOMEPATH, USERPROFILE } = process.env;
  if (HOME) {
    return HOME;
  }
  if (HOMEPATH) {
    return HOMEPATH;
  }
  return USERPROFILE;
}

/**
 * Load the LinkedIn helper functions by handing their cell names to
 * runSeleniumCell.
 *
 * @returns {*} Whatever runSeleniumCell returns for the four cell names.
 * @throws Re-throws any error raised by runSeleniumCell after logging it.
 */
function getLinkedInFunctions() {
  const cellNames = [
    'log in linkedin',
    'scrape LinkedIn profile',
    'list LinkedIn threads',
    'messages LinkedIn thread',
  ];

  let linkedInFunctions;
  try {
    linkedInFunctions = runSeleniumCell(cellNames);
  } catch (loadError) {
    // Log for diagnostics, then let the caller see the original error.
    logger.error(loadError);
    throw loadError;
  }
  return linkedInFunctions;
}

// TODO: Implement rate limiting to avoid LinkedIn API restrictions
/**
 * Scrape a single LinkedIn thread and save the result as a JSON file in
 * `project`.
 *
 * If a file for this thread was saved earlier (matched by its thread-ID
 * prefix), its parsed contents are passed to readLinkedInThread along with
 * the thread path.
 *
 * @param {string} thread - Thread path; its last non-empty segment is used as
 *   the thread ID.
 * @returns {Promise<object|null>} The value returned by readLinkedInThread,
 *   or null when any step failed (the error is logged, not thrown).
 */
async function readThread(thread) {
  try {
    const threadId = thread.replace(/^\/|\/$/ig, '').split('/').pop();
    const [previousFile] = glob.sync('**/' + threadId + '-*.json', { cwd: project });

    // glob reports matches relative to `cwd`, so resolve against `project`
    // before reading. A thread that has never been scraped has no file yet.
    // NOTE(review): an empty array is assumed to be an acceptable "nothing
    // saved yet" value for readLinkedInThread — confirm against that helper.
    const messages = previousFile
      ? JSON.parse(fs.readFileSync(path.join(project, previousFile), 'utf8'))
      : [];
    const result = await readLinkedInThread(thread, messages);

    // File name: "<thread id>-<participant names, non-letters replaced by _>.json"
    const resultId = result.thread.replace(/^\/|\/$/ig, '').split('/').pop();
    const participantNames = result.participants
      .map(p => p.name)
      .join('')
      .replace(/[^a-z]/ig, '_');
    const filename = path.join(project, resultId + '-' + participantNames + '.json');

    fs.writeFileSync(filename, JSON.stringify(result, null, 4));
    return result;
  } catch (error) {
    // Best effort: one failing thread is reported as null, not as a rejection.
    logger.error(error);
    return null;
  }
}

/**
 * Fetch the current list of LinkedIn threads and save it as `threads.json`
 * in `project`.
 *
 * @returns {Promise<*>} The value returned by listLinkedInThreads.
 * @throws Re-throws any error from the LinkedIn helpers or the file write
 *   after logging it.
 */
async function listThreads() {
  try {
    // The profile info itself is not used here; the call is kept because it
    // ran before the listing in the original flow.
    // NOTE(review): presumably needed for its side effects — confirm.
    await readLinkedInProfileInfo();
    const threads = await listLinkedInThreads();

    fs.writeFileSync(
      path.join(project, 'threads.json'),
      JSON.stringify(threads, null, 4));
    return threads;
  } catch (error) {
    logger.error(error);
    throw error;
  }
}

/**
 * Scrape up to 30 not-yet-saved LinkedIn threads listed in `threads.json`.
 *
 * A thread counts as already scraped when a `<thread id>-*.json` file exists
 * under `project`. Progress is printed to the console. When everything in the
 * list has been scraped (or the list is empty), the thread list is refreshed
 * through listThreads().
 *
 * @returns {Promise<void>}
 * @throws Re-throws (after logging) errors from reading `threads.json`,
 *   globbing, or refreshing the list. Failures of individual threads are
 *   logged and skipped.
 */
async function scrapeLinkedInThreads() {
  try {
    // Read with fs rather than require(): require() caches the parsed JSON,
    // so a threads.json rewritten by listThreads() would look unchanged on
    // later calls in the same process.
    const threads = JSON.parse(
      fs.readFileSync(path.join(project, 'threads.json'), 'utf8'));

    const freshThreads = threads.filter(thread => {
      const threadId = thread.replace(/^\/|\/$/ig, '').split('/').pop();
      const [file] = glob.sync('**/@(' + threadId + ')-*.json', { cwd: project });
      return !file || !fs.existsSync(path.join(project, file));
    });

    const scrapedCount = threads.length - freshThreads.length;
    // An empty list counts as fully scraped; 0 / 0 would give NaN and the
    // refresh below would never run.
    const percent = threads.length === 0
      ? 100
      : Math.round(scrapedCount / threads.length * 100);
    const batch = freshThreads.slice(0, 30);

    console.log(scrapedCount +'/'+ threads.length +':'+ percent + '%');
    console.log(batch);

    if (percent === 100) {
      await listThreads();
    }

    // Read one thread at a time instead of starting the whole batch at once.
    // NOTE(review): the readers come from runSeleniumCell and presumably share
    // one browser session, so concurrent reads would interleave — confirm.
    for (const thread of batch) {
      try {
        await readThread(thread);
      } catch (error) {
        // One failing thread must not abort the rest of the batch.
        logger.error(error);
      }
    }
  } catch (error) {
    logger.error(error);
    throw error;
  }
}

// The module's only export is the scrapeLinkedInThreads entry point.
module.exports = scrapeLinkedInThreads;

This code is designed to scrape LinkedIn data, specifically focusing on threads and their participants. Here's a breakdown: