This code automates the scraping of LinkedIn thread data, including messages and participant information, storing the results in local JSON files. It utilizes Selenium for web interaction and relies on external modules for file handling and data processing.
npm run import -- "scrape linkedin threads"
var fs = require('fs');
var importer = require('../Core');
var glob = require('glob');
var path = require('path');
var runSeleniumCell = importer.import("selenium cell");
// Handles to the imported LinkedIn cells. They are declared empty here and
// only receive values when scrapeLinkedInThreads() destructures the result of
// runSeleniumCell into them; readThread() and listThreads() read them later.
var loginLinkedIn, readLinkedInProfileInfo, listLinkedInThreads,
readLinkedInThread;
// Base directory: the first non-empty value of HOME, HOMEPATH, USERPROFILE.
var PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE;
// Directory that holds threads.json and the per-thread JSON files.
var project = PROFILE_PATH + '/Conversations';
/**
 * Scrape one LinkedIn conversation and save it under the project directory.
 *
 * Whatever was saved for this thread by an earlier run is loaded and handed to
 * readLinkedInThread as its second argument; an empty array is used when no
 * usable file is on disk.
 *
 * @param {string} thread - Thread URL or path; its last path segment is the thread id.
 * @returns {Promise<Object>} Resolves to the value produced by readLinkedInThread,
 *     after it has been written to `<project>/<threadId>-<participant names>.json`.
 */
function readThread(thread) {
    var threadId = thread.replace(/^\/|\/$/ig, '').split('/').pop();
    var file = glob.sync('**/' + threadId + '-*.json', {cwd: project})[0];
    var messages = [];
    if (typeof file !== 'undefined') {
        try {
            // glob reports matches relative to `cwd`, so the hit has to be
            // re-anchored on the project directory before it can be opened.
            // Reading through fs (instead of require) also avoids the module
            // cache handing back stale data on later calls.
            messages = JSON.parse(
                fs.readFileSync(path.join(project, file), 'utf8'));
        }
        catch (e) {
            // Best effort: an unreadable or corrupt file only means the
            // thread is scraped from scratch.
            console.log(e);
            messages = [];
        }
    }
    return readLinkedInThread(thread, messages)
        .then(t => {
            console.log(t);
            // File name: thread id plus the participant names with every
            // non-letter replaced by an underscore.
            var filename = project + '/'
                + t.thread.replace(/^\/|\/$/ig, '').split('/').pop()
                + '-' + t.participants
                    .map(p => p.name).join('')
                    .replace(/[^a-z]/ig, '_') + '.json';
            fs.writeFileSync(filename, JSON.stringify(t, null, 4));
            return t;
        });
}
/**
 * Read the profile info, collect the list of message threads and store that
 * list as `threads.json` in the project directory.
 *
 * @returns {Promise<Array|undefined>} Resolves to the thread list, or to
 *     undefined when any step failed (the error is printed, not re-thrown).
 */
function listThreads() {
    // Write the list to disk and hand it on unchanged.
    var saveThreadList = function (list) {
        var target = project + '/threads.json';
        fs.writeFileSync(target, JSON.stringify(list, null, 4));
        return list;
    };
    // Print the failure; the chain then settles with console.log's undefined.
    var reportFailure = function (err) {
        return console.log(err);
    };
    var collected = [];
    return readLinkedInProfileInfo()
        .then(function () {
            return listLinkedInThreads(collected);
        })
        .then(saveThreadList)
        .catch(reportFailure);
}
/**
 * Log in to LinkedIn and scrape up to 30 threads that have no saved JSON file yet.
 *
 * The known threads come from `<project>/threads.json`. When every known
 * thread already has a file - or no thread is known at all - the thread list
 * itself is refreshed through listThreads() so a later run has new work.
 *
 * @returns {Promise<*>} Whatever importer.runAllPromises yields for the
 *     readThread tasks that were started.
 */
function scrapeLinkedInThreads() {
    var fresh = [];
    var threadIdOf = t => t.replace(/^\/|\/$/ig, '').split('/').pop();
    return runSeleniumCell([
        'log in linkedin',
        'scrape LinkedIn profile',
        'list LinkedIn threads',
        'messages LinkedIn thread',
    ])
        // publish the imported cells through the module-level bindings that
        // readThread/listThreads use, then log in
        .then(r => ({
            loginLinkedIn, readLinkedInProfileInfo, listLinkedInThreads,
            readLinkedInThread
        } = r).loginLinkedIn())
        // work out which threads still need scraping
        .then(() => {
            var threads;
            try {
                threads = JSON.parse(fs.readFileSync(project + '/threads.json'));
            }
            catch (e) {
                // no (readable) list yet: start from an empty one
                threads = [];
            }
            var threadCount = threads.length;
            var ids = threads.map(threadIdOf);
            var files = threadCount === 0 ? [] : glob.sync('**/@(' + ids.join('|') + ')-*.json', {
                cwd: project
            });
            fresh = threads.filter(t => {
                // Match on "<id>-" at the start of the file name; a plain
                // substring test would let id 123 match 1234-....json.
                var prefix = threadIdOf(t) + '-';
                var file = files.find(f => path.basename(f).indexOf(prefix) === 0);
                return !(typeof file !== 'undefined'
                    && fs.existsSync(path.join(project, file)));
            });
            var done = threadCount - fresh.length;
            // An empty list counts as fully processed; 0 / 0 would be NaN and
            // the list could then never be fetched for the first time.
            var percent = threadCount === 0
                ? 100
                : Math.round(done / threadCount * 100);
            console.log(done + ' / ' + threadCount + ' : '
                + percent
                + '%');
            console.log(fresh.slice(0, 30));
            return percent === 100 ? listThreads() : [];
        })
        // scrape each thread
        .then(() => importer.runAllPromises(fresh.slice(0, 30)
            .map(t => ((resolve) => readThread(t).then((r) => resolve(r))))));
}
module.exports = scrapeLinkedInThreads;
const fs = require('fs');
const path = require('path');
const glob = require('glob');
const { runSeleniumCell } = require('../Core');
const logger = require('./logger'); // assuming a custom logger module
// Base directory of the current user and the folder all scraped data goes to.
const PROJECT_PATH = getProjectPath();
const project = path.join(PROJECT_PATH, 'Conversations');
// LinkedIn functions
// runSeleniumCell is consumed as a thenable elsewhere in this file, so its
// result cannot be destructured directly - every name would be undefined.
// Promise.resolve accepts both a promise and a plain object of cells.
const linkedInFunctions = Promise.resolve(getLinkedInFunctions());
// Log a failed import once; callers still see the rejection on their own call.
linkedInFunctions.catch(error => logger.error(error));
// Build a callable that waits for the cells and forwards its arguments to the
// named one, so each binding below stays an ordinary function for callers.
const callLinkedIn = name => (...args) =>
    linkedInFunctions.then(cells => cells[name](...args));
const loginLinkedIn = callLinkedIn('loginLinkedIn');
const readLinkedInProfileInfo = callLinkedIn('readLinkedInProfileInfo');
const listLinkedInThreads = callLinkedIn('listLinkedInThreads');
const readLinkedInThread = callLinkedIn('readLinkedInThread');
/**
 * Pick the base directory from the environment.
 *
 * @returns {string|undefined} The first non-empty value of HOME, HOMEPATH,
 *     USERPROFILE (checked in that order); otherwise whatever USERPROFILE holds.
 */
function getProjectPath() {
    const { HOME, HOMEPATH, USERPROFILE } = process.env;
    if (HOME) {
        return HOME;
    }
    if (HOMEPATH) {
        return HOMEPATH;
    }
    return USERPROFILE;
}
/**
 * Ask runSeleniumCell for the four LinkedIn cells this module works with.
 *
 * @returns {*} Whatever runSeleniumCell returns for the requested cells.
 * @throws Re-throws any error runSeleniumCell throws synchronously, after
 *     passing it to logger.error.
 */
function getLinkedInFunctions() {
    const cellNames = [
        'log in linkedin',
        'scrape LinkedIn profile',
        'list LinkedIn threads',
        'messages LinkedIn thread',
    ];
    let cells;
    try {
        cells = runSeleniumCell(cellNames);
    } catch (err) {
        logger.error(err);
        throw err;
    }
    return cells;
}
// TODO: Implement rate limiting to avoid LinkedIn API restrictions
/**
 * Scrape one LinkedIn conversation and save it under the project directory.
 *
 * Data saved for this thread by an earlier run is loaded and passed to
 * readLinkedInThread as its second argument; an empty array is passed when
 * the thread has no usable file yet, so first-time threads are scraped too.
 *
 * @param {string} thread - Thread URL or path; its last path segment is the thread id.
 * @returns {Promise<Object|null>} The value produced by readLinkedInThread once
 *     it has been written to `<project>/<threadId>-<participant names>.json`,
 *     or null when scraping or saving failed (the error is logged).
 */
async function readThread(thread) {
    try {
        const threadId = thread.replace(/^\/|\/$/ig, '').split('/').pop();
        const [file] = glob.sync('**/' + threadId + '-*.json', {cwd: project});
        let messages = [];
        if (file !== undefined) {
            try {
                // glob reports matches relative to `cwd`; re-anchor the hit on
                // the project directory and read it fresh each time.
                messages = JSON.parse(
                    fs.readFileSync(path.join(project, file), 'utf8'));
            } catch (error) {
                // A corrupt file only costs the cached data, not the scrape.
                logger.error(error);
            }
        }
        const result = await readLinkedInThread(thread, messages);
        // File name: thread id plus the participant names with every
        // non-letter replaced by an underscore.
        const filename = project + '/'
            + result.thread.replace(/^\/|\/$/ig, '').split('/').pop()
            + '-' + result.participants.map(p => p.name).join('')
                .replace(/[^a-z]/ig, '_') + '.json';
        fs.writeFileSync(filename, JSON.stringify(result, null, 4));
        return result;
    } catch (error) {
        logger.error(error);
        return null;
    }
}
/**
 * Read the profile info, collect the list of message threads and store that
 * list as `threads.json` in the project directory.
 *
 * @returns {Promise<Array>} The thread list returned by listLinkedInThreads.
 * @throws Re-throws any failure after passing it to logger.error.
 */
async function listThreads() {
    try {
        // Called for its effect only; the returned profile data is not used here.
        await readLinkedInProfileInfo();
        // Hand over an empty starting list, as the promise-based listThreads
        // in this file does.
        const threads = await listLinkedInThreads([]);
        fs.writeFileSync(
            project + '/threads.json',
            JSON.stringify(threads, null, 4));
        return threads;
    } catch (error) {
        logger.error(error);
        throw error;
    }
}
/**
 * Log in to LinkedIn and scrape up to 30 threads that have no saved JSON file yet.
 *
 * The known threads come from `<project>/threads.json`. When every known
 * thread already has a file - or no thread is known at all - the thread list
 * itself is refreshed through listThreads() so a later run has new work.
 *
 * @returns {Promise<Array>} One entry per thread that was attempted, in order:
 *     the value readThread produced, or null when that call threw.
 * @throws Re-throws login, listing or file-system failures after logging them.
 */
async function scrapeLinkedInThreads() {
    try {
        // The promise-based scrapeLinkedInThreads in this file logs in before
        // touching any thread; do the same here.
        await loginLinkedIn();

        let threads;
        try {
            // Read from disk on every run; require() would cache the first
            // result and throw on a first run without threads.json.
            threads = JSON.parse(
                fs.readFileSync(path.join(project, 'threads.json'), 'utf8'));
        } catch (error) {
            threads = [];
        }

        const threadIdOf = thread => thread.replace(/^\/|\/$/ig, '').split('/').pop();
        const ids = threads.map(threadIdOf);
        // One directory scan for all ids instead of one scan per thread.
        const files = ids.length === 0
            ? []
            : glob.sync('**/@(' + ids.join('|') + ')-*.json', { cwd: project });
        const freshThreads = threads.filter(thread => {
            const prefix = threadIdOf(thread) + '-';
            const file = files.find(f => path.basename(f).startsWith(prefix));
            return !file || !fs.existsSync(path.join(project, file));
        });

        const done = threads.length - freshThreads.length;
        // An empty list counts as fully processed; 0 / 0 would be NaN and the
        // list could then never be fetched for the first time.
        const percent = threads.length === 0
            ? 100
            : Math.round(done / threads.length * 100);
        console.log(done +'/'+ threads.length +':'+ percent + '%');
        console.log(freshThreads.slice(0, 30));
        if (percent === 100) {
            await listThreads();
        }

        // One thread at a time: the reads presumably share one Selenium
        // session, which cannot follow several conversations at once.
        const results = [];
        for (const thread of freshThreads.slice(0, 30)) {
            try {
                results.push(await readThread(thread));
            } catch (error) {
                logger.error(error);
                results.push(null);
            }
        }
        return results;
    } catch (error) {
        logger.error(error);
        throw error;
    }
}
module.exports = scrapeLinkedInThreads;
This code is designed to scrape LinkedIn data, specifically focusing on threads and their participants. Here's a breakdown:
Core Functionality:
- Data Storage: Uses a `project` directory (usually in the user's home directory) to store scraped data in JSON files.
- Thread Processing: The `readThread` function takes a thread URL, retrieves existing messages from a local JSON file (if available), and then calls `readLinkedInThread` to fetch new messages from LinkedIn.
- Thread Listing: The `listThreads` function reads the LinkedIn profile info, retrieves a list of threads, and saves this list to a `threads.json` file.
- Selenium Integration: Uses `runSeleniumCell` to execute a series of Selenium commands to automate interactions with LinkedIn: `log in linkedin`, `scrape LinkedIn profile`, `list LinkedIn threads`, and `messages LinkedIn thread`.
Code Structure:
- Imports the necessary modules (`fs`, `importer`, `glob`, `path`).
- Uses a Selenium cell (`selenium cell`) for web automation.

Missing Information:
- The implementations of the `readLinkedInProfileInfo`, `listLinkedInThreads`, and `readLinkedInThread` functions.
- The details of `runSeleniumCell` are not shown.

Let me know if you have any other questions.