This code reads Facebook message threads using web automation: it visits each thread participant to scrape their profile data, collects the messages shown in the thread, and saves the result as a JSON file.
npm run import -- "Get messages from facebook"
var importer = require('../Core');
var chrono = require('chrono-node');
var glob = require('glob');
var fs = require('fs');
var readFacebookProfileInfo = importer.import("scrape facebook profile",
"{client}")
// Home directory of the current user: the first of HOME, HOMEPATH or USERPROFILE that is set.
var PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE;
// Directory in which saved thread JSON files are searched for and written.
var project = PROFILE_PATH + '/Conversations';
/**
 * Opens one entry of the thread's "People" list and scrapes the profile it leads to.
 *
 * Waits one second, clicks the i-th list item found under the "People" heading,
 * waits another second and then calls readFacebookProfileInfo().
 *
 * @param {number} i - Position of the list item, used directly as the XPath index.
 * @returns {*} The client chain; it resolves with whatever readFacebookProfileInfo()
 *     resolves with. Errors are logged with console.log instead of being propagated.
 */
function switchToParticipantThread(i) {
    const participantEntry =
        `//*[contains(@class, "scrollable")]//h4[contains(., "People")]/parent::*//li[${i}]`;
    const scrapeProfile = () => readFacebookProfileInfo();
    const logError = (err) => console.log(err);

    const opened = client
        .pause(1000)
        .click(participantEntry)
        .pause(1000);

    return opened.then(scrapeProfile).catch(logError);
}
/**
 * Collects profile data for the people listed in the currently open thread.
 *
 * Counts the list items under the "People" heading in the browser, then queues one
 * task per participant for importer.runAllPromises(). Each task scrapes a profile,
 * reloads `thread`, waits three seconds and reports the profile it obtained.
 *
 * Positions 1 up to, but not including, min(3, count) are visited, so a count of 1
 * queues no task at all. When the count is 0, readFacebookProfileInfo() is called
 * directly instead of clicking a list entry.
 *
 * @param {string} thread - URL of the thread to return to after each profile scrape.
 * @returns {*} The client chain; it resolves with the result of
 *     importer.runAllPromises(). Errors are logged with console.log.
 */
function getThreadParticipants(thread) {
    const logError = (err) => console.log(err);

    // Wraps a scrape starter into the `resolve => ...` task shape handed to runAllPromises.
    const makeTask = (startScrape) => (resolve) => {
        let profile = {};
        startScrape()
            .then((scraped) => (profile = scraped))
            .url(thread)
            .pause(3000)
            .catch(logError)
            .then(() => resolve(profile));
    };

    const queueScrapes = (found) => {
        const tasks = [];
        const limit = Math.min(3, found.value);
        for (let position = 1; position < limit; position += 1) {
            tasks.push(makeTask(() => switchToParticipantThread(position)));
        }
        if (found.value === 0) {
            tasks.push(makeTask(() => readFacebookProfileInfo()));
        }
        return importer.runAllPromises(tasks).catch(logError);
    };

    return client
        .execute(() => {
            // Runs inside the browser page: number of <li> items in the "People" section.
            return document.evaluate(
                'count(//*[contains(@class, "scrollable")]//h4[contains(., "People")]/parent::*//li)',
                document, null,
                XPathResult.NUMBER_TYPE, null).numberValue;
        })
        .then(queueScrapes)
        .catch(logError);
}
/**
 * Reads the messages shown in the currently open Facebook thread.
 *
 * Calls the custom client command getAllUntil() on the message pane (flagged
 * "scroll up, not down"; presumably it keeps loading older messages - confirm
 * against that command), then extracts every date header (h4) together with the
 * message blocks that follow it and flattens them into one list.
 *
 * @param {Array} messages - Not used by this function; kept so callers keep working.
 * @param {number} [i=0] - Not used by this function; kept so callers keep working.
 * @returns {*} The client chain; it resolves with an array of
 *     `{time, from, message}` objects. `time` is a Date, or null when the header
 *     text could not be parsed. Errors are logged with console.log, in which case
 *     the chain resolves with undefined.
 */
function readFacebookMessages(messages, i = 0) {
    const MESSAGE_PANE =
        '//*[contains(@role, "main")]//*[contains(@class, "scrollable")][contains(.,"Messages")]';
    let browserOffset;

    // Turns the scraped message value into a single string.
    const toText = (value) => {
        if (typeof value === 'string') {
            return value;
        }
        return Array.isArray(value) ? value.join('\n') : '';
    };

    // TODO: add check for needing to go to LinkedIn
    // TODO: add check for needing to login
    return client
        // TODO: move this to utility?
        .execute(() => (new Date()).getTimezoneOffset())
        .then(o => browserOffset = o.value)
        .getAllUntil(
            MESSAGE_PANE,
            MESSAGE_PANE + '//h4//text()',
            [],
            (a, b) => a + '' === b + '',
            count => count < 10,
            true /* scroll up, not down */
        )
        .getAllXPath([
            MESSAGE_PANE + '//h4',
            {
                time: './/text()',
                messages: [
                    './following-sibling::div[count(./preceding-sibling::h4)=$i+1]',
                    {
                        from: './/h5//text()',
                        message: './/*[not(name()="H5")]/text()'
                    }
                ]
            }
        ])
        .then(groups => groups.reduce((acc, group) => {
            // chrono.parse() is a plain function and returns [] when it finds no date,
            // so guard before touching the first result instead of aborting the read.
            const parsed = chrono.parse(group.time);
            let time = null;
            if (parsed && parsed.length > 0) {
                // getTimezoneOffset() is minutes *behind* UTC, while chrono's
                // `timezoneOffset` component is minutes *ahead of* UTC: flip the
                // sign and keep the unit (the earlier `* 60` was wrong on both).
                parsed[0].start.assign('timezoneOffset', -browserOffset);
                time = parsed[0].start.date();
            }
            // NOTE: Facebook only displays the time for messages sent today, so a
            // browser/Node timezone mismatch can still put such a message on the
            // wrong calendar day; that adjustment is not implemented.
            (group.messages || []).forEach((entry, index) => {
                const text = toText(entry.message);
                // The first block of a group may only repeat the sender's name.
                if (index === 0 && text === entry.from) {
                    return;
                }
                acc.push({
                    time: time,
                    from: entry.from,
                    message: text
                });
            });
            return acc;
        }, []))
        .catch(e => console.log(e));
}
/**
 * Reads one Facebook thread and saves it as a JSON file in the project directory.
 *
 * Loads any previously saved file for the thread, navigates to `thread` unless the
 * browser URL already contains it, gathers the participants and the messages, and
 * writes `<threadId>-<participant names>.json`.
 *
 * @param {string} thread - URL (or URL path) of the thread; its last path segment
 *     is used as the thread id.
 * @returns {*} The client chain; it resolves with `{thread, participants, messages}`.
 *     Errors are logged with console.log, in which case it resolves with undefined.
 */
function readFacebookThread(thread) {
    var threadId = thread.replace(/^\/|\/$/ig, '').split('/').pop()
        .replace(/[^a-z0-9]/ig, '_');
    // glob returns matches relative to `cwd`, so the match has to be prefixed with
    // the project directory before it can be read.
    var match = glob.sync('**/' + threadId + '-*.json', {cwd: project})[0];
    var messages = [];
    if (match) {
        try {
            messages = JSON.parse(fs.readFileSync(project + '/' + match));
        }
        catch (e) {
            // Best effort: an unreadable or corrupt file counts as "nothing saved yet".
            messages = [];
        }
    }
    var participants;
    return client.getUrl().then(url => url.indexOf(thread) === -1
        ? client.url(thread)
        : client)
        // get participants from topcard
        .pause(1000)
        .then(() => getThreadParticipants(thread))
        // Drop entries for profiles that could not be scraped; they would otherwise
        // break the file name below and abort the save.
        .then(p => (participants = (p || []).filter(Boolean)))
        // TODO: save to contacts
        .then(() => readFacebookMessages(messages || []))
        .then(read => ({
            thread: thread,
            participants: participants,
            messages: read
        }))
        .then(t => {
            var names = t.participants
                .map(p => p.name).join('')
                .replace(/[^a-z0-9]/ig, '_');
            var filename = project + '/' + threadId + '-' + names + '.json';
            fs.writeFileSync(filename, JSON.stringify(t, null, 4));
            return t;
        })
        .catch(e => console.log(e));
}
module.exports = readFacebookThread;
const { promisify } = require('util');
const { resolve } = require('path');
// util.promisify() accepts a single callback-style function; handing it the whole
// `fs` module object throws ERR_INVALID_ARG_TYPE as soon as this file is loaded.
// The built-in promise API provides readFile/writeFile that return promises.
const fs = require('fs').promises;
// NOTE(review): assumes the installed glob package exports a callback-style
// function (as the `glob.sync` usage elsewhere in this file suggests) - confirm.
const glob = promisify(require('glob'));
const { readFileSync } = require('fs');
const chrono = require('chrono-node');
const { URL } = require('url');
// Define constants
// Home directory of the current user: the first of HOME, HOMEPATH or USERPROFILE that is set.
const PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE;
// Directory in which saved thread JSON files are searched for and written.
const project = resolve(PROFILE_PATH, 'Conversations');
// Define functions
/**
 * Opens one entry of the thread's "People" list and scrapes the profile it leads to.
 *
 * @param {object} client - Browser automation client providing pause() and click().
 * @param {number} i - 1-based position of the entry in the "People" list.
 * @returns {Promise<*>} Whatever readFacebookProfileInfo(client) resolves with, or
 *     undefined when any step fails (the error is logged with console.log).
 */
async function switchToParticipantThread(client, i) {
    try {
        await client.pause(1000);
        // XPath positions start at 1 and getThreadParticipants() already passes a
        // 1-based position, so `i` is used as-is; adding 1 skipped the first
        // participant and pointed past the end of the list for the last one.
        await client.click(`//*[contains(@class, "scrollable")]//h4[contains(., "People")]/parent::*//li[${i}]`);
        await client.pause(1000);
        return await readFacebookProfileInfo(client);
    } catch (e) {
        console.log(e);
        return undefined;
    }
}
/**
 * Collects profile data for the people listed in the currently open thread.
 *
 * Counts the list items under the "People" heading in the browser. With no items,
 * readFacebookProfileInfo(client) is called directly; otherwise up to three entries
 * are opened one after another through switchToParticipantThread().
 *
 * After every scrape the thread is reloaded and given three seconds to settle, as
 * the promise-chain implementation in this file does (presumably because scraping
 * leaves the thread page - confirm).
 *
 * NOTE(review): this loop visits positions 1..min(3, count) inclusive, whereas the
 * promise-chain implementation stops one position earlier; confirm which is intended.
 *
 * @param {object} client - Browser automation client providing execute(), url() and pause().
 * @param {string} thread - URL of the thread to return to after each scrape.
 * @returns {Promise<Array>} The profiles that were scraped successfully. On an
 *     error the error is logged and the profiles gathered so far are returned.
 */
async function getThreadParticipants(client, thread) {
    const participants = [];

    // Reloads the thread page so the next list entry can be located again.
    const returnToThread = async () => {
        await client.url(thread);
        await client.pause(3000);
    };

    try {
        const counted = await client.execute(() => {
            // Runs inside the browser page: number of <li> items in the "People" section.
            const xpathResult = document.evaluate(
                'count(//*[contains(@class, "scrollable")]//h4[contains(., "People")]/parent::*//li)',
                document, null,
                XPathResult.NUMBER_TYPE, null);
            return xpathResult.numberValue;
        });
        // The original compared the `{value}` wrapper itself with 0, which is never
        // true. Accept both a wrapped and a bare number.
        const participantsCount = counted !== null && typeof counted === 'object'
            ? counted.value
            : counted;

        if (participantsCount === 0) {
            const profile = await readFacebookProfileInfo(client);
            if (profile) {
                participants.push(profile);
            }
            await returnToThread();
        } else {
            const last = Math.min(3, participantsCount);
            for (let i = 1; i <= last; i++) {
                const profile = await switchToParticipantThread(client, i);
                // A failed scrape yields undefined; skip it rather than storing a hole.
                if (profile) {
                    participants.push(profile);
                }
                await returnToThread();
            }
        }
    } catch (e) {
        console.log(e);
    }
    return participants;
}
/**
 * Reads the messages shown in the currently open Facebook thread.
 *
 * Calls the custom client command getAllUntil() on the message pane (flagged
 * "scroll up, not down"; presumably it keeps loading older messages - confirm
 * against that command), then extracts every date header (h4) together with the
 * message blocks that follow it and flattens them into one list.
 *
 * @param {object} client - Browser automation client providing execute(),
 *     getAllUntil() and getAllXPath().
 * @param {Array} messages - Not used by this function; kept so callers keep working.
 * @param {number} [i=0] - Not used by this function; kept so callers keep working.
 * @returns {Promise<Array<{time: (Date|null), from: *, message: string}>|undefined>}
 *     The flattened messages; `time` is null when the header text could not be
 *     parsed. Resolves with undefined when an error was logged.
 */
async function readFacebookMessages(client, messages, i = 0) {
    const messagePane =
        '//*[contains(@role, "main")]//*[contains(@class, "scrollable")][contains(.,"Messages")]';

    // Turns the scraped message value into a single string.
    const toText = (value) => {
        if (typeof value === 'string') {
            return value;
        }
        return Array.isArray(value) ? value.join('\n') : '';
    };

    try {
        const reported = await client.execute(() => (new Date()).getTimezoneOffset());
        const browserOffset = reported !== null && typeof reported === 'object'
            ? reported.value
            : reported;

        await client.getAllUntil(
            messagePane,
            `${messagePane}//h4//text()`,
            [],
            (a, b) => a + '' === b + '',
            (count) => count < 10,
            true /* scroll up, not down */
        );

        // getAllXPath is a client command: it must be called on `client` and awaited,
        // not chained onto the value getAllUntil() resolved with.
        const groups = await client.getAllXPath([
            `${messagePane}//h4`,
            {
                time: './/text()',
                messages: [
                    './following-sibling::div[count(./preceding-sibling::h4)=$i+1]',
                    {
                        from: './/h5//text()',
                        message: './/*[not(name()="H5")]/text()'
                    }
                ]
            }
        ]);

        return groups.reduce((acc, group) => {
            // chrono.parse() returns [] when it finds no date; guard before use so
            // one odd header does not abort the whole read.
            const parsed = chrono.parse(group.time);
            let time = null;
            if (parsed && parsed.length > 0) {
                // getTimezoneOffset() is minutes *behind* UTC, while chrono's
                // `timezoneOffset` component is minutes *ahead of* UTC: flip the
                // sign and keep the unit (the earlier `* 60` was wrong on both).
                parsed[0].start.assign('timezoneOffset', -browserOffset);
                time = parsed[0].start.date();
            }
            (group.messages || []).forEach((entry, index) => {
                const text = toText(entry.message);
                // The first block of a group may only repeat the sender's name.
                if (index === 0 && text === entry.from) {
                    return;
                }
                acc.push({
                    time,
                    from: entry.from,
                    message: text
                });
            });
            return acc;
        }, []);
    } catch (e) {
        console.log(e);
        return undefined;
    }
}
/**
 * Reads one Facebook thread and saves it as a JSON file in the project directory.
 *
 * Loads any previously saved file for the thread, navigates to `thread` unless the
 * browser URL already contains it, gathers the participants and the messages, and
 * writes `<threadId>-<participant names>.json`.
 *
 * @param {object} client - Browser automation client providing getUrl(), url() and pause().
 * @param {string} thread - URL (or URL path) of the thread; its last path segment
 *     is used as the thread id.
 * @returns {Promise<{thread: string, participants: Array, messages: *}|undefined>}
 *     The saved data, or undefined when an error was logged.
 */
async function readFacebookThread(client, thread) {
    try {
        const threadId = thread.replace(/^\/|\/$/ig, '').split('/').pop().replace(/[^a-z0-9]/ig, '_');

        // glob returns matches relative to `cwd`, so the match has to be prefixed
        // with the project directory before it can be read.
        const matches = await glob(`**/${threadId}-*.json`, { cwd: project });
        let messages = [];
        if (matches.length > 0) {
            try {
                messages = JSON.parse(await fs.readFile(`${project}/${matches[0]}`, 'utf8'));
            } catch (e) {
                // Best effort: an unreadable or corrupt file counts as "nothing saved yet".
                console.log(e);
                messages = [];
            }
        }

        // Make sure the browser actually shows the requested thread before scraping,
        // as the promise-chain implementation in this file does.
        const currentUrl = await client.getUrl();
        if (currentUrl.indexOf(thread) === -1) {
            await client.url(thread);
        }
        await client.pause(1000);

        // Tolerate a failed participant lookup (undefined) and failed single scrapes
        // (undefined entries); either would otherwise break the file name below.
        const found = await getThreadParticipants(client, thread);
        const participants = (found || []).filter(Boolean);
        const messagesParsed = await readFacebookMessages(client, messages);

        const data = {
            thread,
            participants,
            messages: messagesParsed
        };
        const names = participants.map(p => p.name).join('').replace(/[^a-z0-9]/ig, '_');
        const filename = `${project}/${threadId}-${names}.json`;
        await fs.writeFile(filename, JSON.stringify(data, null, 4));
        return data;
    } catch (e) {
        console.log(e);
        return undefined;
    }
}
module.exports = readFacebookThread;
This code snippet focuses on extracting information about participants in Facebook message threads.
Here's a breakdown:
Setup:
- Imports the required modules: importer (likely for interacting with other parts of the application), chrono (for date/time parsing), glob (for file pattern matching), fs (for file system operations), and readFacebookProfileInfo (a function to scrape profile data from Facebook).
- Defines PROFILE_PATH and project for storing data.

switchToParticipantThread(i) Function:
- Clicks the i-th entry of the thread's "People" list.
- Calls readFacebookProfileInfo() to extract profile data for the selected participant.

getThreadParticipants(thread) Function:
- Uses document.evaluate to count list items representing participants.
- Calls switchToParticipantThread() to extract profile information for each.
- Uses importer.runAllPromises() to handle the asynchronous nature of these operations.

Key Points:
- The profile scraping itself is delegated to readFacebookProfileInfo().

Let me know if you have any other questions.