facebook messaging | List Facebook threads | Cell 5 | Search

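This code reads a Facebook message thread using web automation: it opens the participants listed for the thread and scrapes their profile data, collects the thread's messages, and saves the thread, its participants and its messages to a JSON file in the user's Conversations folder.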

Run example

npm run import -- "Get messages from facebook"

Get messages from facebook

var importer = require('../Core');
var chrono = require('chrono-node');
var glob = require('glob');
var fs = require('fs');
var readFacebookProfileInfo = importer.import("scrape facebook profile",
"{client}")

// Home directory of the current user: HOME when it is set, otherwise
// HOMEPATH, otherwise USERPROFILE.
var PROFILE_PATH = process.env.HOME
    || process.env.HOMEPATH
    || process.env.USERPROFILE;
// Folder that scraped conversations are read from and written to.
var project = `${PROFILE_PATH}/Conversations`;

/**
 * Clicks the i-th entry of the thread's "People" list and scrapes the
 * profile with readFacebookProfileInfo().
 *
 * @param {number} i - Position of the entry, used directly as an XPath
 *     index (XPath positions start at 1).
 * @returns the client chain, resolving to whatever readFacebookProfileInfo()
 *     yields, or to undefined when any step fails (the error is logged).
 */
function switchToParticipantThread(i) {
    var participantItem = '//*[contains(@class, "scrollable")]'
        + '//h4[contains(., "People")]/parent::*'
        + '//li[' + i + ']';

    return client
        .pause(1000)
        .click(participantItem)
        .pause(1000)
        .then(function () {
            return readFacebookProfileInfo();
        })
        .catch(function (err) {
            // Best effort: report the failure and resolve with undefined.
            console.log(err);
        });
}

/**
 * Collects profile data for the participants of the thread that is open in
 * the browser.
 *
 * Counts the entries of the "People" list in the page. For a non-empty list
 * it visits entries 1 .. min(3, count) - 1 through
 * switchToParticipantThread(); for an empty list it calls
 * readFacebookProfileInfo() on the current page instead. After every profile
 * read the browser is sent back to `thread`.
 *
 * @param {string} thread - URL of the thread to return to after each visit.
 * @returns the client chain, resolving to the result of
 *     importer.runAllPromises() for the collected tasks, or to undefined
 *     when a step fails (the error is logged).
 */
function getThreadParticipants(thread) {
    // Builds one resolver-style task for importer.runAllPromises(): read a
    // profile, go back to the thread, then hand over whatever was read.
    function profileTask(readProfile) {
        return function (done) {
            var scraped = {};
            readProfile()
                .then(function (info) {
                    scraped = info;
                    return info;
                })
                .url(thread)
                .pause(3000)
                .catch(function (err) {
                    console.log(err);
                })
                .then(function () {
                    done(scraped);
                });
        };
    }

    return client
        // This callback runs inside the browser, so it must not reference
        // anything from the surrounding Node scope.
        .execute(function () {
            var counted = document.evaluate(
                'count(//*[contains(@class, "scrollable")]//h4[contains(., "People")]/parent::*//li)',
                document, null,
                XPathResult.NUMBER_TYPE, null);
            return counted.numberValue;
        })
        .then(function (count) {
            var tasks = [];
            var upperBound = Math.min(3, count.value);
            for (var position = 1; position < upperBound; position += 1) {
                tasks.push(profileTask(switchToParticipantThread.bind(null, position)));
            }
            if (count.value === 0) {
                tasks.push(profileTask(function () {
                    return readFacebookProfileInfo();
                }));
            }
            return importer.runAllPromises(tasks).catch(function (err) {
                console.log(err);
            });
        })
        .catch(function (err) {
            console.log(err);
        });
}

/**
 * Scrapes the messages of the Facebook thread currently open in `client`.
 *
 * Runs getAllUntil() over the message pane (flagged to scroll up, presumably
 * to load older history), then reads every time header together with the
 * message groups that follow it and flattens them into one list.
 *
 * @param {Array} messages - Previously saved messages; not used by the
 *     current implementation.
 * @param {number} [i=0] - Not used by the current implementation.
 * @returns the client chain, resolving to an array of
 *     `{time, from, message}` objects, or to undefined when scraping fails
 *     (the error is logged). `time` is a Date, or null when the header text
 *     could not be parsed.
 */
function readFacebookMessages(messages, i = 0) {
    var browserOffset;
    // TODO: add check for needing to go to Facebook
    // TODO: add check for needing to login
    return client
        // TODO: move this to utility?
        .execute(() => (new Date()).getTimezoneOffset())
        .then(o => browserOffset = o.value)
        .getAllUntil(
            '//*[contains(@role, "main")]//*[contains(@class, "scrollable")][contains(.,"Messages")]',
            '//*[contains(@role, "main")]//*[contains(@class, "scrollable")][contains(.,"Messages")]//h4//text()',
            [],
            (a, b) => a + '' === b + '',
            i => i < 10,
            true /* scroll up, not down */
        )
        .getAllXPath([
            '//*[contains(@role, "main")]//*[contains(@class, "scrollable")][contains(.,"Messages")]//h4',
            {
                time: './/text()',
                messages: [
                    './following-sibling::div[count(./preceding-sibling::h4)=$i+1]',
                    {
                        from: './/h5//text()',
                        message: './/*[not(name()="H5")]/text()'
                    }
                ]
            }
        ])
        .then(groups => {
            return groups.reduce((acc, group) => {
                // chrono.parse() is a plain function returning an array of
                // results; the array is empty when no date is recognised.
                var results = chrono.parse(group.time);
                var groupTime = null;
                if (results && results.length > 0) {
                    if (typeof browserOffset === 'number') {
                        // Date#getTimezoneOffset() is minutes *behind* UTC,
                        // chrono's timezoneOffset is minutes *ahead* of UTC:
                        // same unit, opposite sign.
                        results[0].start.assign('timezoneOffset', -browserOffset);
                    }
                    groupTime = results[0].start.date();
                } else {
                    // Keep the messages instead of failing the whole scrape
                    // because of one unreadable header.
                    console.log('Could not parse message time: ' + group.time);
                }
                // Known limitation: Facebook shows only the time of day for
                // messages sent today, so when the browser and Node run in
                // different timezones the resulting date can be off by one
                // day around midnight.

                group.messages.forEach((entry, index) => {
                    const newMessage = {
                        time: groupTime,
                        from: entry.from,
                        message: typeof entry.message === 'string'
                            ? entry.message
                            : entry.message.join('\n')
                    };
                    // The first entry of a group can be just the sender's
                    // name repeated; skip it.
                    if (index === 0 && newMessage.message === newMessage.from) {
                        return;
                    }
                    acc.push(newMessage);
                });
                return acc;
            }, []);
        })
        .catch(e => console.log(e))
}

/**
 * Scrapes one Facebook thread (participants and messages) and saves it as
 * `<project>/<threadId>-<participant names>.json`.
 *
 * @param {string} thread - URL (or URL path) of the thread. Its last path
 *     segment, with non-alphanumeric characters replaced by "_", is used as
 *     the thread id in the file name.
 * @returns the client chain, resolving to `{thread, participants, messages}`,
 *     or to undefined when a step fails (the error is logged).
 */
function readFacebookThread(thread) {
    var threadId = thread.replace(/^\/|\/$/ig, '').split('/').pop()
        .replace(/[^a-z0-9]/ig, '_');
    var file = glob.sync('**/' + threadId + '-*.json', {cwd: project})[0];
    var messages = [];
    if (file) {
        try {
            // glob returns paths relative to its `cwd` option, so the match
            // has to be resolved against `project`, not the process cwd.
            messages = JSON.parse(fs.readFileSync(project + '/' + file));
        }
        catch (e) {
            // Unreadable or corrupt file: start from an empty list.
            messages = [];
        }
    }
    var participants;
    // Only navigate when the browser is not already showing this thread.
    return client.getUrl().then(url => url.indexOf(thread) === -1
        ? client.url(thread)
        : client)
        // get participants from topcard
        .pause(1000)
        .then(() => getThreadParticipants(thread))
        .then(p => (participants = p))

        // TODO: save to contacts
        .then(() => readFacebookMessages(messages || []))
        .then(scraped => ({
            thread: thread,
            participants: participants || [],
            messages: scraped
        }))
        .then(t => {
            // A participant whose profile could not be scraped may be
            // undefined; it must not prevent the thread from being saved.
            var names = t.participants
                .map(p => (p && p.name) || '').join('')
                .replace(/[^a-z0-9]/ig, '_');
            var filename = project + '/' + threadId + '-' + names + '.json';
            fs.writeFileSync(filename, JSON.stringify(t, null, 4));
            return t;
        })
        .catch(e => console.log(e))
}
module.exports = readFacebookThread;

What the code could have been:

const { promisify } = require('util');
const { resolve } = require('path');
// util.promisify() wraps a single callback-style function and throws when
// given the fs module object; fs.promises is the built-in promise-based API.
const fs = require('fs').promises;
const glob = promisify(require('glob'));
const { readFileSync } = require('fs');
const chrono = require('chrono-node');
const { URL } = require('url');
// The functions below call readFacebookProfileInfo(), so it has to be loaded
// the same way the original listing loads it.
// NOTE(review): the functions below pass `client` as an argument - confirm
// the imported helper accepts it.
const importer = require('../Core');
const readFacebookProfileInfo = importer.import("scrape facebook profile",
"{client}")

// Define constants
// Home directory of the current user (HOME, else HOMEPATH, else USERPROFILE).
const PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE;
// Folder that scraped conversations are read from and written to.
const project = resolve(PROFILE_PATH, 'Conversations');

// Define functions
/**
 * Clicks the i-th entry of the thread's "People" list and scrapes the
 * profile with readFacebookProfileInfo().
 *
 * @param {object} client - Browser automation client providing pause() and
 *     click().
 * @param {number} i - 1-based position of the entry; XPath positions start
 *     at 1, so it is used as the index unchanged.
 * @returns {Promise<*>} Whatever readFacebookProfileInfo() yields, or
 *     undefined when any step fails (the error is logged).
 */
async function switchToParticipantThread(client, i) {
    try {
        await client.pause(1000);
        // The caller counts entries from 1, which is already a valid XPath
        // position; adding 1 would skip the first entry and overrun the list.
        await client.click(`//*[contains(@class, "scrollable")]//h4[contains(., "People")]/parent::*//li[${i}]`);
        await client.pause(1000);
        return await readFacebookProfileInfo(client);
    } catch (e) {
        console.log(e);
    }
}

/**
 * Collects profile data for the participants of the thread that is open in
 * the browser.
 *
 * Counts the entries of the "People" list in the page. For a non-empty list
 * it visits up to the first three entries through
 * switchToParticipantThread(); for an empty list it calls
 * readFacebookProfileInfo() on the current page. After every profile read
 * the browser is sent back to `thread`, because reading a profile may leave
 * the thread page.
 *
 * @param {object} client - Browser automation client providing execute(),
 *     url() and pause().
 * @param {string} thread - URL of the thread to return to after each visit.
 * @returns {Promise<Array|undefined>} The scraped profiles, or undefined
 *     when counting or scraping fails (the error is logged).
 */
async function getThreadParticipants(client, thread) {
    // Navigating back is best effort: a failure is logged and does not
    // discard the profiles that were already read.
    const returnToThread = async () => {
        try {
            await client.url(thread);
            await client.pause(3000);
        } catch (e) {
            console.log(e);
        }
    };

    try {
        // This callback runs inside the browser, so it must not reference
        // anything from the surrounding Node scope.
        const countResult = await client.execute(() => {
            const xpathResult = document.evaluate(
                'count(//*[contains(@class, "scrollable")]//h4[contains(., "People")]/parent::*//li)',
                document, null,
                XPathResult.NUMBER_TYPE, null);
            return xpathResult.numberValue;
        });
        // Accept both a `{value}` wrapper and a bare number, so the count is
        // compared as a number either way.
        const participantsCount = typeof countResult === 'number'
            ? countResult
            : countResult.value;
        const participants = [];

        if (participantsCount === 0) {
            participants.push(await readFacebookProfileInfo(client));
            await returnToThread();
        } else {
            for (let i = 1; i <= Math.min(3, participantsCount); i++) {
                participants.push(await switchToParticipantThread(client, i));
                await returnToThread();
            }
        }

        return participants;
    } catch (e) {
        console.log(e);
    }
}

/**
 * Scrapes the messages of the Facebook thread currently open in `client`.
 *
 * Runs getAllUntil() over the message pane (flagged to scroll up, presumably
 * to load older history), then reads every time header together with the
 * message groups that follow it and flattens them into one list.
 *
 * @param {object} client - Browser automation client providing execute(),
 *     getAllUntil() and getAllXPath().
 * @param {Array} messages - Previously saved messages; not used by the
 *     current implementation.
 * @param {number} [i=0] - Not used by the current implementation.
 * @returns {Promise<Array<{time: ?Date, from: *, message: string}>|undefined>}
 *     The scraped messages, or undefined when scraping fails (the error is
 *     logged). `time` is null when a header could not be parsed.
 */
async function readFacebookMessages(client, messages, i = 0) {
    try {
        const offsetResult = await client.execute(() => (new Date()).getTimezoneOffset());
        // Accept both a `{value}` wrapper and a bare number.
        const browserOffset = typeof offsetResult === 'number'
            ? offsetResult
            : offsetResult.value;

        await client.getAllUntil(
            '//*[contains(@role, "main")]//*[contains(@class, "scrollable")][contains(.,"Messages")]',
            '//*[contains(@role, "main")]//*[contains(@class, "scrollable")][contains(.,"Messages")]//h4//text()',
            [],
            (a, b) => a + '' === b + '',
            i => i < 10,
            true /* scroll up, not down */
        );

        // getAllXPath() is a command of the client, not of the value that
        // getAllUntil() resolves to, and its result must be awaited.
        const groups = await client.getAllXPath([
            '//*[contains(@role, "main")]//*[contains(@class, "scrollable")][contains(.,"Messages")]//h4',
            {
                time: './/text()',
                messages: [
                    './following-sibling::div[count(./preceding-sibling::h4)=$i+1]',
                    {
                        from: './/h5//text()',
                        message: './/*[not(name()="H5")]/text()'
                    }
                ]
            }
        ]);

        return groups.reduce((acc, group) => {
            // chrono.parse() returns an empty array when no date is found.
            const parsed = chrono.parse(group.time);
            let groupTime = null;
            if (parsed && parsed.length > 0) {
                if (typeof browserOffset === 'number') {
                    // Date#getTimezoneOffset() is minutes *behind* UTC,
                    // chrono's timezoneOffset is minutes *ahead* of UTC:
                    // same unit, opposite sign.
                    parsed[0].start.assign('timezoneOffset', -browserOffset);
                }
                groupTime = parsed[0].start.date();
            } else {
                // Keep the messages instead of failing the whole scrape
                // because of one unreadable header.
                console.log('Could not parse message time: ' + group.time);
            }

            group.messages.forEach((entry, index) => {
                const newMessage = {
                    time: groupTime,
                    from: entry.from,
                    message: typeof entry.message === 'string'
                        ? entry.message
                        : entry.message.join('\n')
                };

                // The first entry of a group can be just the sender's name
                // repeated; skip it.
                if (index === 0 && newMessage.message === newMessage.from) {
                    return;
                }

                acc.push(newMessage);
            });

            return acc;
        }, []);
    } catch (e) {
        console.log(e);
    }
}

/**
 * Scrapes one Facebook thread (participants and messages) and saves it as
 * `<project>/<threadId>-<participant names>.json`.
 *
 * @param {object} client - Browser automation client providing getUrl(),
 *     url() and pause().
 * @param {string} thread - URL (or URL path) of the thread. Its last path
 *     segment, with non-alphanumeric characters replaced by "_", is used as
 *     the thread id in the file name.
 * @returns {Promise<{thread: string, participants: Array, messages: *}|undefined>}
 *     The saved data, or undefined when a step fails (the error is logged).
 */
async function readFacebookThread(client, thread) {
    try {
        const threadId = thread.replace(/^\/|\/$/ig, '').split('/').pop().replace(/[^a-z0-9]/ig, '_');
        const matches = await glob(`**/${threadId}-*.json`, { cwd: project });
        let messages = [];
        if (matches.length > 0) {
            try {
                // glob returns paths relative to its `cwd` option, so the
                // match has to be resolved against `project`.
                messages = JSON.parse(await fs.readFile(`${project}/${matches[0]}`));
            } catch (e) {
                // Unreadable or corrupt file: start from an empty list.
                messages = [];
            }
        }

        // Make sure the browser is showing the thread before scraping it.
        const currentUrl = await client.getUrl();
        if (!currentUrl.includes(thread)) {
            await client.url(thread);
        }
        await client.pause(1000);

        // getThreadParticipants() resolves to undefined when it fails.
        const participants = (await getThreadParticipants(client, thread)) || [];
        const messagesParsed = await readFacebookMessages(client, messages);

        const data = {
            thread,
            participants,
            messages: messagesParsed
        };

        // A participant whose profile could not be scraped may be undefined;
        // it must not prevent the thread from being saved.
        const names = participants
            .map(p => (p && p.name) || '')
            .join('')
            .replace(/[^a-z0-9]/ig, '_');
        const filename = `${project}/${threadId}-${names}.json`;
        await fs.writeFile(filename, JSON.stringify(data, null, 4));

        return data;
    } catch (e) {
        console.log(e);
    }
}

module.exports = readFacebookThread;

This code snippet reads a Facebook message thread: it extracts information about the thread's participants, collects the thread's messages, and saves both to a JSON file.

Here's a breakdown:

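  1. Setup: loads the project importer, chrono-node, glob and fs, imports the "scrape facebook profile" helper, and points `project` at the Conversations folder in the user's home directory.

  2. switchToParticipantThread(i) Function: clicks the i-th entry of the thread's "People" list, waits a second, and calls the profile scraper.

  3. getThreadParticipants(thread) Function: counts the entries in the "People" list, reads a profile for each entry it visits, and navigates back to the thread after every visit; when the list is empty it calls the profile scraper on the current page instead.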

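Key Points: readFacebookMessages parses each time header with chrono-node and returns a flat list of `{time, from, message}` objects; readFacebookThread, the exported entry point, combines participants and messages and writes them to `<threadId>-<participant names>.json`; every step logs its errors instead of throwing.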

Let me know if you have any other questions.