linkedin messages | Read messages LinkedIn thread | Cell 5 | Search

This code automates the scraping of LinkedIn thread data, including messages and participant information, storing the results in local JSON files. It utilizes Selenium for web interaction and relies on external modules for file handling and data processing.

Run example

npm run import -- "scrape linkedin threads"

scrape linkedin threads

var fs = require('fs');
var importer = require('../Core');
var glob = require('glob');
var path = require('path');
var runSeleniumCell = importer.import("selenium cell");
// Slots for the LinkedIn cell functions. They stay undefined until
// scrapeLinkedInThreads() assigns them from the runSeleniumCell() result.
var loginLinkedIn, readLinkedInProfileInfo, listLinkedInThreads,
    readLinkedInThread;

// Home directory: the first of HOME, HOMEPATH, USERPROFILE that is set and non-empty.
var PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE;
// Folder that receives threads.json and the per-thread JSON files.
var project = PROFILE_PATH + '/Conversations';

/**
 * Scrape a single LinkedIn thread and save the result as JSON.
 *
 * Looks for an earlier save of the same thread under `project` and hands its
 * parsed contents to readLinkedInThread. When there is no earlier save, or it
 * cannot be read or parsed, an empty array is passed instead.
 *
 * @param {string} thread - Thread path or URL; its last path segment is the thread id.
 * @returns {Promise<Object>} Resolves with the object produced by
 *     readLinkedInThread, after it has been written to
 *     `<project>/<thread id>-<participant names>.json`.
 */
function readThread(thread) {
    var threadId = thread.replace(/^\/|\/$/ig, '').split('/').pop();
    var file = glob.sync('**/' + threadId + '-*.json', {cwd: project})[0];
    var messages = [];
    if (typeof file !== 'undefined') {
        try {
            // glob reports matches relative to `cwd`, so resolve against
            // `project` before reading. Passing the bare relative name to
            // require() makes Node look for a package of that name, which
            // fails and silently discards the earlier save.
            messages = JSON.parse(
                fs.readFileSync(path.join(project, file), 'utf8'));
        }
        catch (e) {
            // Unreadable or corrupt save: start again from an empty history.
            messages = [];
        }
    }
    return readLinkedInThread(thread, messages)
        .then(t => {
            console.log(t);
            // File name: thread id, a dash, then the participant names with
            // every non-letter replaced by an underscore.
            var filename = project + '/'
                + t.thread.replace(/^\/|\/$/ig, '').split('/').pop()
                + '-' + t.participants
                    .map(p => p.name).join('')
                    .replace(/[^a-z]/ig, '_') + '.json';
            fs.writeFileSync(filename, JSON.stringify(t, null, 4));
            return t;
        });
}

/**
 * Read the profile info, list every message thread and store the list in
 * `<project>/threads.json`.
 *
 * @returns {Promise<*>} Resolves with the thread list. Any failure along the
 *     way is logged with console.log and the promise resolves with undefined.
 */
function listThreads() {
    var collected = [];

    // Write the list to disk and pass it on unchanged.
    var saveThreadList = function (list) {
        var target = project + '/threads.json';
        var serialized = JSON.stringify(list, null, 4);
        fs.writeFileSync(target, serialized);
        return list;
    };

    var reportFailure = function (err) {
        console.log(err);
    };

    return readLinkedInProfileInfo()
        .then(function () {
            return listLinkedInThreads(collected);
        })
        .then(saveThreadList)
        .catch(reportFailure);
}

/**
 * Log in to LinkedIn and scrape up to 30 threads that have no saved JSON yet.
 *
 * Steps:
 *   1. Import the Selenium cells and store their functions in the
 *      module-level variables, then log in.
 *   2. Read `<project>/threads.json` (an empty list when missing or invalid)
 *      and work out which threads have no `<id>-*.json` file under `project`.
 *   3. When every known thread is already saved, which includes the case of
 *      an empty list, refresh the list through listThreads().
 *   4. Scrape the first 30 unsaved threads through importer.runAllPromises.
 *
 * The unsaved set is computed before the list is refreshed, so threads
 * discovered by step 3 are picked up on the next run.
 *
 * @returns {Promise<*>} Whatever importer.runAllPromises resolves with.
 */
function scrapeLinkedInThreads() {
    var threads, fresh;

    return runSeleniumCell([
        'log in linkedin',
        'scrape LinkedIn profile',
        'list LinkedIn threads',
        'messages LinkedIn thread',
    ])
        // The destructuring assignment fills the module-level variables and
        // evaluates to `r`, so loginLinkedIn can be called on it directly.
        .then(r => ({
            loginLinkedIn, readLinkedInProfileInfo, listLinkedInThreads,
            readLinkedInThread
        } = r).loginLinkedIn())
        // work out which threads still need scraping
        .then(() => {
            try {
                threads = JSON.parse(
                    fs.readFileSync(project + '/threads.json', 'utf8'));
            }
            catch (e) {
                threads = [];
            }
            if (!Array.isArray(threads)) {
                threads = [];
            }
            var threadCount = threads.length;
            var ids = threads.map(t => t.replace(/^\/|\/$/ig, '').split('/').pop());
            // With no ids the pattern would degenerate to '@()', so skip glob.
            var files = ids.length > 0
                ? glob.sync('**/@(' + ids.join('|') + ')-*.json', {
                    cwd: project
                })
                : [];
            fresh = threads.filter(t => {
                var threadId = t.replace(/^\/|\/$/ig, '').split('/').pop();
                // Match on the "<id>-" file name prefix. A plain substring
                // test would treat thread 12 as saved when only
                // 123-*.json exists.
                var prefix = threadId + '-';
                var file = files.filter(
                    f => path.basename(f).indexOf(prefix) === 0)[0];
                return !(typeof file !== 'undefined'
                    && fs.existsSync(path.join(project, file)));
            });
            var done = threadCount - fresh.length;
            // An empty list counts as fully scraped. 0 / 0 would be NaN and
            // the list would then never be fetched for the first time.
            const percent = threadCount === 0
                ? 100
                : Math.round(done / threadCount * 100);
            console.log(done + ' / ' + threadCount + ' : '
                + percent
                + '%');
            console.log(fresh.slice(0, 30));
            return percent === 100 ? listThreads() : [];
        })
        // scrape each thread
        .then(() => importer.runAllPromises(fresh.slice(0, 30)
            .map(t => ((resolve) => readThread(t).then((r) => resolve(r))))))
}
module.exports = scrapeLinkedInThreads;

What the code could have been:

const fs = require('fs');
const path = require('path');
const glob = require('glob');
const { runSeleniumCell } = require('../Core');
const logger = require('./logger'); // assuming a custom logger module
// Home directory of the current user, as returned by getProjectPath().
const PROJECT_PATH = getProjectPath();
// Folder that holds threads.json and the per-thread JSON files.
const project = path.join(PROJECT_PATH, 'Conversations');

// LinkedIn functions
// NOTE(review): runSeleniumCell is consumed with .then() elsewhere in this
// file. If it returns a promise here as well, destructuring its result
// synchronously leaves all four names undefined. Confirm, and await the
// result if so.
const { loginLinkedIn, readLinkedInProfileInfo, listLinkedInThreads, readLinkedInThread } =
  getLinkedInFunctions();

/**
 * Pick the user's home directory from the environment.
 *
 * @returns {string|undefined} HOME when set and non-empty, otherwise HOMEPATH
 *     when set and non-empty, otherwise whatever USERPROFILE holds.
 */
function getProjectPath() {
  const { HOME, HOMEPATH, USERPROFILE } = process.env;
  if (HOME) {
    return HOME;
  }
  if (HOMEPATH) {
    return HOMEPATH;
  }
  return USERPROFILE;
}

/**
 * Ask runSeleniumCell for the four LinkedIn cells and return its result.
 *
 * Only an error thrown synchronously by runSeleniumCell is logged here; it is
 * then rethrown unchanged.
 *
 * @returns {*} Whatever runSeleniumCell returns for the requested cells.
 */
function getLinkedInFunctions() {
  const cellNames = [
    'log in linkedin',
    'scrape LinkedIn profile',
    'list LinkedIn threads',
    'messages LinkedIn thread',
  ];

  let cells;
  try {
    cells = runSeleniumCell(cellNames);
  } catch (failure) {
    logger.error(failure);
    throw failure;
  }
  return cells;
}

// TODO: Implement rate limiting to avoid LinkedIn API restrictions
/**
 * Scrape a single LinkedIn thread and save the result as JSON.
 *
 * An earlier save of the same thread, if one exists under `project`, is parsed
 * and handed to readLinkedInThread. With no earlier save, or one that cannot
 * be read, an empty array is passed so the thread is still scraped.
 *
 * @param {string} thread - Thread path or URL; its last path segment is the thread id.
 * @returns {Promise<Object|null>} The object produced by readLinkedInThread
 *     after it has been written to disk, or null when scraping or saving
 *     failed (the error is logged).
 */
async function readThread(thread) {
  try {
    const threadId = thread.replace(/^\/|\/$/ig, '').split('/').pop();
    const file = glob.sync('**/' + threadId + '-*.json', {cwd: project})[0];

    let messages = [];
    if (file !== undefined) {
      try {
        // glob reports matches relative to `cwd`; resolve against `project`.
        // require() would treat the bare name as a package and throw, and it
        // would also throw for a thread with no save yet, so nothing would
        // ever be scraped.
        messages = JSON.parse(fs.readFileSync(path.join(project, file), 'utf8'));
      } catch (loadError) {
        // A broken save must not block a new scrape; log it and start empty.
        logger.error(loadError);
        messages = [];
      }
    }

    const result = await readLinkedInThread(thread, messages);

    // File name: thread id, a dash, then the participant names with every
    // non-letter replaced by an underscore.
    const filename = project + '/'
      + result.thread.replace(/^\/|\/$/ig, '').split('/').pop()
      + '-' + result.participants.map(p => p.name).join('')
       .replace(/[^a-z]/ig, '_') + '.json';

    fs.writeFileSync(filename, JSON.stringify(result, null, 4));
    return result;
  } catch (error) {
    logger.error(error);
    return null;
  }
}

/**
 * Refresh the stored list of LinkedIn message threads.
 *
 * Reads the profile info, lists the threads and writes the list to
 * `<project>/threads.json`.
 *
 * @returns {Promise<*>} The value listLinkedInThreads resolved with.
 * @throws Rethrows any error from the cells or from writing the file, after
 *     logging it.
 */
async function listThreads() {
  try {
    // Called for its effect only; the resolved value is not used here.
    await readLinkedInProfileInfo();
    // Seed the listing with an empty array, matching the promise-chain
    // version of this function.
    const threads = await listLinkedInThreads([]);

    fs.writeFileSync(
      project + '/threads.json',
      JSON.stringify(threads, null, 4));
    return threads;
  } catch (error) {
    logger.error(error);
    throw error;
  }
}

/**
 * Log in to LinkedIn and scrape up to 30 threads that have no saved JSON yet.
 *
 * Steps:
 *   1. Log in.
 *   2. Read `<project>/threads.json` (an empty list when missing or invalid)
 *      and keep the threads that have no `<id>-*.json` file under `project`.
 *   3. When every known thread is already saved, which includes the case of
 *      an empty list, refresh the list through listThreads().
 *   4. Scrape the first 30 unsaved threads one after another.
 *
 * The unsaved set is computed before the list is refreshed, so threads
 * discovered by step 3 are picked up on the next run.
 *
 * @returns {Promise<Array>} One entry per scraped thread, in order: the value
 *     readThread resolved with, or undefined when it threw.
 * @throws Rethrows any error outside the per-thread scraping, after logging it.
 */
async function scrapeLinkedInThreads() {
  try {
    // The promise-chain version of this function logs in before touching any
    // thread; do the same here.
    await loginLinkedIn();

    // Read with fs instead of require(): a missing file must not abort the
    // run, and require() would keep serving a cached copy after
    // listThreads() rewrites the file.
    let threads;
    try {
      threads = JSON.parse(fs.readFileSync(path.join(project, 'threads.json'), 'utf8'));
    } catch (readError) {
      threads = [];
    }
    if (!Array.isArray(threads)) {
      threads = [];
    }

    const freshThreads = threads.filter(thread => {
      const threadId = thread.replace(/^\/|\/$/ig, '').split('/').pop();
      const file = glob.sync('**/@(' + threadId + ')-*.json', { cwd: project })[0];
      return !file || !fs.existsSync(path.join(project, file));
    });

    const doneCount = threads.length - freshThreads.length;
    // An empty list counts as fully scraped. 0 / 0 would be NaN and the list
    // would then never be fetched for the first time.
    const percent = threads.length === 0
      ? 100
      : Math.round(doneCount / threads.length * 100);
    console.log(doneCount +'/'+ threads.length +':'+ percent + '%');
    console.log(freshThreads.slice(0, 30));

    if (percent === 100) {
      await listThreads();
    }

    // One thread at a time.
    // NOTE(review): presumably all cells drive a single browser session, so
    // parallel scrapes would interfere with each other. Confirm.
    const results = [];
    for (const thread of freshThreads.slice(0, 30)) {
      try {
        results.push(await readThread(thread));
      } catch (error) {
        // Keep going with the remaining threads.
        logger.error(error);
        results.push(undefined);
      }
    }
    return results;
  } catch (error) {
    logger.error(error);
    throw error;
  }
}

module.exports = scrapeLinkedInThreads;

This code is designed to scrape LinkedIn data, specifically focusing on threads and their participants. Here's a breakdown:

Core Functionality:

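  1. Data Storage: Results are written as JSON under the `Conversations` folder in the user's home directory. `threads.json` holds the list of threads, and each thread is saved as `<thread id>-<participant names>.json`.

  2. Thread Processing: `readThread` tries to load any earlier save of the thread, calls `readLinkedInThread`, and writes the result to disk.

  3. Thread Listing: `listThreads` reads the profile info, lists all message threads, and saves the list to `threads.json`.

  4. Selenium Integration: `runSeleniumCell` imports the login, profile, thread-listing, and thread-reading cells. `scrapeLinkedInThreads` then logs in and scrapes up to 30 not-yet-saved threads per run.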

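Code Structure: Two helpers, `readThread` and `listThreads`, plus the exported entry point `scrapeLinkedInThreads`.

Missing Information: The implementations of the imported cells (`loginLinkedIn`, `readLinkedInProfileInfo`, `listLinkedInThreads`, `readLinkedInThread`) and of `importer.runAllPromises` are not shown here.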

Let me know if you have any other questions.