Facebook data | Scrape Facebook profile | Scrape Facebook event | Search

This code automates the process of scraping Facebook posts, fetching their URLs and extracting details like descriptions and participant profiles using a WebDriver browser client.

Run example

npm run import -- "Like all facebook posts"

Like all facebook posts

var glob = require('glob');
var importer = require('../Core');
var fs = require('fs');

var PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE;
var project = PROFILE_PATH + '/Conversations';

// Collects post permalink URLs from the currently loaded Facebook timeline.
// `profile` is accepted for interface compatibility but is not used here;
// `posts` holds already-known post URLs so the scroll-and-collect helper can
// stop early. Relies on the notebook-scoped globals `client` (webdriver
// session) and `getAllUntil` (scroll-until-no-new-results helper).
// Resolves to an array of absolute post URLs, or undefined after logging
// an error.
function listFacebookPosts(profile, posts = []) {
    const timestampHref = '//*[contains(@class, "timestampContent")]/parent::*/parent::*/@href';
    return client
        .then(() => getAllUntil(
            '//body',
            timestampHref,
            posts,
            (a, b) => a === b,
            (i) => i < 20
        ))
        // relative permalinks need the facebook origin prepended
        .then(links => links.map(link => link.indexOf('facebook.com') === -1
            ? 'https://www.facebook.com' + link
            : link))
        .catch(e => console.log(e))
};

// Navigates to a single Facebook post (unless the browser is already on it)
// and scrapes its description, participant profile links, photo links, and
// comments via XPath. Relies on the notebook-scoped globals `client`
// (webdriver session) and `getAllXPath` (XPath extraction helper).
// post - absolute URL of the post to scrape.
// Resolves to an array of post objects, or undefined after logging an error.
function scrapeFacebookPost(post) {
    console.log(post);
    return client
        // only re-navigate when the browser is not already on this post
        // (fixed: was a loose `==` comparison, inconsistent with likeFacebookPost)
        .getUrl().then(url => url.indexOf(post) === -1
            ? client.url(post)
            : client)
        // videos take longer to settle before the DOM is queryable
        .pause(post.indexOf('video') > -1 ? 4000 : 2000)
        .then(() => getAllXPath([
            // scope to the photo lightbox when one is open, otherwise the page body
            '//body[not(.//*[contains(@class, "fbPhotoSnowliftPopup")])]|//body[.//*[contains(@class, "fbPhotoSnowliftPopup")]]//*[contains(@class, "fbPhotoSnowliftPopup")]',
            {
                posts: [
                    './/*[contains(@class, "userContentWrapper")]|.//*[contains(@class, "fbUserPost")]|.//*[contains(@class, "fbUserStory")]',
                    {
                        description:
                            './/following-sibling::div//*[contains(@class, "fbPhotosPhotoCaption")]//text()|.//*[contains(@class, "userContent")]//text()|.//h5//text()|.//a[contains(@class, "profileLink")]//text()',
                        participants: [
                            './/a[contains(@class, "profileLink")]/@href|.//a[contains(@href, "facebook") and .//img]/@href',
                            './following-sibling::div//a/@href',
                            './/*[contains(@class, "commentable_item")]//a[contains(@class, "UFICommentActorName")]/@href'
                        ],
                        comments: [
                            './/h6[contains(., "Comments")]//following-sibling::div/div/div[contains(@class, "UFIComment")]',
                            {
                                time:
                                    './/*[contains(@class, "uiLinkSubtle")]//text()',
                                content:
                                    './/*[contains(@class, "UFICommentBody")]//text()',
                                from:
                                    './/a[contains(@class, "UFICommentActorName")]/text()|.//a[contains(@class, "UFICommentActorName")]/@href'
                            }
                        ]
                    }
                ]
            }
        ]))
        .then(r => r[0].posts.map(p => {
            // normalize extracted fields; XPath may yield a string, an array of
            // text nodes, or nothing at all, so guard each access (previously a
            // missing field made the whole post resolve to undefined via catch)
            var description = p.description || [];
            var participants = p.participants || [];
            var comments = p.comments || [];
            return Object.assign(p, {
                post: post,
                description: typeof description === 'string' ? description : description.join(' '),
                // keep profile links only: drop photo links, placeholder anchors
                // and reaction links that match the same anchor extraction
                participants: participants.filter(i => i
                        .indexOf('photo') === -1
                    && i !== '#'
                    && i.indexOf('ufi/reaction') === -1),
                // photo links come through the same anchor extraction
                photos: participants.filter(i => i
                        .indexOf('photo') !== -1
                    && i !== '#'
                    && i.indexOf('ajax/sharer') === -1),
                comments: comments.map(c => Object.assign(c, {
                    time: typeof c.time === 'string' ? c.time : (c.time || []).join(' '),
                    content: typeof c.content === 'string' ? c.content : (c.content || []).join(' '),
                    from: typeof c.from === 'string' ? c.from : (c.from || []).join(' ')
                }))
            });
        }))
        .catch(e => console.log(e))
};

// Clicks up to four not-yet-active "Like" links on the given post, targeting
// either the page body or the photo lightbox when one is open. Navigates to
// `post` first if the browser is elsewhere. Click failures are swallowed so
// one failed Like never aborts the batch; relies on notebook-scoped globals
// `client` and `importer`.
function likeFacebookPost(post) {
    return client
        .getUrl().then(url => url.indexOf(post) === -1
            ? client.url(post)
            : client)
        // videos take longer to settle before the DOM is queryable
        .pause(post.indexOf('video') > -1 ? 4000 : 2000)
        .elements(
            '//body[not(.//*[contains(@class, "fbPhotoSnowliftPopup")])]//a[contains(., "Like")][contains(@class, "UFILikeLink")][not(contains(@class, "UFILinkBright"))]|//body[.//*[contains(@class, "fbPhotoSnowliftPopup")]]//*[contains(@class, "fbPhotoSnowliftPopup")]//a[contains(., "Like")][contains(@class, "UFILikeLink")][not(contains(@class, "UFILinkBright"))]')
        .then(els => {
            return importer.runAllPromises(els.value.slice(0, 4)
                // resolve exactly once per element whether the click succeeds
                // or fails (previously resolve could be called twice on error)
                .map(el => (resolve) => client.elementIdClick(el.ELEMENT)
                    .pause(1500)
                    .then(() => resolve(), () => resolve())))
        })
        .catch(e => console.log(e));
}

// Module-level cache of previously scraped posts, reloaded on each call.
var posts;
// Scrapes (and optionally Likes) every post on a Facebook profile timeline.
// profile - URL of the profile whose timeline should be processed.
// like    - truthy to Like each post before scraping; automatically disabled
//           once Facebook shows a "Temporarily Blocked" warning.
// Results are written to <project>/<profileId>-posts-<year>.json and resolved.
function likeAllPosts(profile, like = null) {
    var profileId = profile.replace(/^\/|\/$/ig, '').split('/').pop()
        .replace(/[^a-z0-9]/ig, '_');
    var file = glob.sync('**/' + profileId + '-*.json', {cwd: project})[0];
    try {
        posts = JSON.parse(fs.readFileSync(file)) || [];
    }
    catch (e) {
        // no cache file yet (or unreadable JSON): start from scratch
        posts = []
    }
    return client
        .getUrl()
        .then(url => url !== profile
            ? client.url(profile)
            : client)
        // pass cached URLs as the `posts` argument so already-known posts are
        // recognized (previously they were passed as `profile` by mistake,
        // leaving the dedupe list empty on every run)
        .then(() => listFacebookPosts(profile, posts.map(p => p.post)))
        // TODO: remove slice to download all posts from first part
        .then(r => importer.runAllPromises(r.map(c => (resolve) => {
            return (like ? likeFacebookPost(c) : client)
                .then(() => client.isExisting('//h3[contains(.,"Temporarily Blocked")]'))
                .then(is => {
                    // stop liking once Facebook rate-limits the account
                    if (is) like = null;
                    return scrapeFacebookPost(c);
                })
                .then(r => resolve(r))
                .catch(e => console.log(e))
        })))
        .then(t => {
            var filename = project + '/' + profileId
                + '-posts-' + (new Date()).getFullYear() + '.json';
            fs.writeFileSync(filename, JSON.stringify(t, null, 4));
            return t;
        })
        .catch(e => console.log(e))
};
module.exports = likeAllPosts;

What the code could have been:

const glob = require('glob');
const fs = require('fs');
const importer = require('../Core');
const Client = require('selenium-webdriver');

const PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE;
const project = PROFILE_PATH + '/Conversations';
const client = new Client.Builder().forBrowser('chrome').build();

// Collects post permalink URLs from the loaded timeline, stopping once the
// already-known URLs in `posts` are reached, and normalizes relative
// permalinks to absolute facebook.com URLs. Returns [] after logging errors.
async function listFacebookPosts(profile, posts = []) {
    try {
        const hrefs = await client.getAllUntil(
            '//body',
            '//*[contains(@class, "timestampContent")]/parent::*/parent::*/@href',
            posts,
            (a, b) => a === b,
            (i) => i < 20
        );
        // relative permalinks need the facebook origin prepended
        return hrefs.map(href => href.includes('facebook.com')
            ? href
            : `https://www.facebook.com${href}`);
    } catch (err) {
        console.error(err);
        return [];
    }
};

// Navigates to a single Facebook post (unless already on it) and scrapes its
// description, participant profile links, photo links, and comments via
// XPath. Returns an array of post objects, or [] after logging an error.
async function scrapeFacebookPost(post) {
    try {
        await client.getUrl().then(url => {
            if (url.indexOf(post) === -1) {
                client.url(post);
            }
        });
        // videos take longer to settle before the DOM is queryable
        await client.pause(post.includes('video') ? 4000 : 2000);
        const r = await client.getAllXPath([
            // scope to the photo lightbox when one is open, otherwise the page body
            '//body[not(.//*[contains(@class, "fbPhotoSnowliftPopup")])]|//body[.//*[contains(@class, "fbPhotoSnowliftPopup")]]//*[contains(@class, "fbPhotoSnowliftPopup")]',
            {
                posts: [
                    './/*[contains(@class, "userContentWrapper")]|.//*[contains(@class, "fbUserPost")]|.//*[contains(@class, "fbUserStory")]',
                    {
                        description:
                            './/following-sibling::div//*[contains(@class, "fbPhotosPhotoCaption")]//text()|.//*[contains(@class, "userContent")]//text()|.//h5//text()|.//a[contains(@class, "profileLink")]//text()',
                        participants: [
                            // fixed: the `and` operator needs a space before `.//img`
                            // for the XPath predicate to tokenize correctly
                            './/a[contains(@class, "profileLink")]/@href|.//a[contains(@href, "facebook") and .//img]/@href',
                            './following-sibling::div//a/@href',
                            './/*[contains(@class, "commentable_item")]//a[contains(@class, "UFICommentActorName")]/@href'
                        ],
                        comments: [
                            './/h6[contains(., "Comments")]//following-sibling::div/div/div[contains(@class, "UFIComment")]',
                            {
                                time:
                                    './/*[contains(@class, "uiLinkSubtle")]//text()',
                                content:
                                    './/*[contains(@class, "UFICommentBody")]//text()',
                                from:
                                    './/a[contains(@class, "UFICommentActorName")]/text()|.//a[contains(@class, "UFICommentActorName")]/@href'
                            }
                        ]
                    }
                ]
            }
        ]);
        // normalize extracted fields; XPath may yield a string or an array of
        // text nodes, so join arrays into a single string
        const posts = r[0].posts.map(p => {
            return {
                ...p,
                post: post,
                description: typeof p.description === 'string' ? p.description : p.description.join(' '),
                // profile links only: drop photo links, placeholder anchors and reaction links
                participants: p.participants.filter(i => i.indexOf('photo') === -1 && i !== '#' && i.indexOf('ufi/reaction') === -1),
                // photo links come through the same anchor extraction
                photos: p.participants.filter(i => i.indexOf('photo') !== -1 && i !== '#' && i.indexOf('ajax/sharer') === -1),
                comments: p.comments.map(c => ({
                    ...c,
                    time: typeof c.time === 'string' ? c.time : c.time.join(' '),
                    content: typeof c.content === 'string' ? c.content : c.content.join(' '),
                    from: typeof c.from === 'string' ? c.from : c.from.join(' ')
                }))
            };
        });
        return posts;
    } catch (e) {
        console.error(e);
        return [];
    }
};

// Clicks up to four not-yet-active "Like" links on the given post, targeting
// either the page body or the photo lightbox when one is open. Errors are
// logged and swallowed so a failed Like never aborts the caller's loop.
async function likeFacebookPost(post) {
    try {
        await client.getUrl().then(url => {
            if (url.indexOf(post) === -1) {
                client.url(post);
            }
        });
        // videos take longer to settle before the DOM is queryable
        await client.pause(post.includes('video') ? 4000 : 2000);
        const els = await client.elements(
            '//body[not(.//*[contains(@class, "fbPhotoSnowliftPopup")])]//a[contains(., "Like")][contains(@class, "UFILikeLink")][not(contains(@class, "UFILinkBright"))]|//body[.//*[contains(@class, "fbPhotoSnowliftPopup")]]//*[contains(@class, "fbPhotoSnowliftPopup")]//a[contains(., "Like")][contains(@class, "UFILikeLink")][not(contains(@class, "UFILinkBright"))]'
        );
        // runAllPromises takes resolve-style callbacks (see ../Core usage
        // above); the previous version passed plain thunks that never
        // resolved, so resolve exactly once whether the click succeeds or not
        await importer.runAllPromises(els.value.slice(0, 4)
            .map(el => (resolve) => client.elementIdClick(el.ELEMENT)
                .pause(1500)
                .then(() => resolve(), () => resolve())));
    } catch (e) {
        console.error(e);
    }
};

// Scrapes (and optionally Likes) every post on a Facebook profile timeline,
// then writes the results to <project>/<profileId>-posts-<year>.json.
// profile - URL of the profile whose timeline should be processed.
// like    - truthy to Like each post after scraping it.
// NOTE(review): unlike the original, this version has no "Temporarily
// Blocked" detection to stop liking when rate-limited — consider adding it.
async function likeAllPosts(profile, like = null) {
    const profileId = profile.replace(/^\/|\/$/ig, '').split('/').pop().replace(/[^a-z0-9]/ig, '_');
    const file = glob.sync('**/' + profileId + '-*.json', { cwd: project })[0];
    // Read the cache in its own try so a missing or corrupt cache file
    // (e.g. the very first run) falls back to [] instead of aborting the
    // whole scrape, matching the original behavior.
    let posts;
    try {
        posts = JSON.parse(fs.readFileSync(file)) || [];
    } catch (e) {
        posts = [];
    }
    try {
        const facebookUrls = await listFacebookPosts(profile, posts.map(p => p.post));
        const results = [];
        for (const url of facebookUrls) {
            const r = await scrapeFacebookPost(url);
            if (like) {
                await likeFacebookPost(url);
            }
            results.push(...r);
        }
        const filename = project + '/' + profileId + '-posts-' + (new Date()).getFullYear() + '.json';
        fs.writeFileSync(filename, JSON.stringify(results, null, 4));
        return results;
    } catch (e) {
        console.error(e);
        return [];
    } finally {
        client.quit();
    }
};

module.exports = likeAllPosts;

This code snippet is designed to scrape Facebook posts and extract information from them. Here's a breakdown:

Dependencies:

- `glob` — locates the previously saved post-cache JSON file for a profile under the project directory.
- `../Core` (`importer`) — provides `runAllPromises` for running the per-post work sequentially.
- `fs` — reads the cached posts and writes the scraped results as JSON.

Variables:

- `PROFILE_PATH` — the current user's home directory, resolved from `HOME`/`HOMEPATH`/`USERPROFILE`.
- `project` — the `~/Conversations` directory where scraped post data is stored.

Functions:

  1. listFacebookPosts(profile, posts = []): scrolls the timeline collecting post permalink URLs (stopping at already-known posts) and prefixes relative links with `https://www.facebook.com`.

  2. scrapeFacebookPost(post): navigates to a post URL and extracts its description, participant profile links, photos, and comments via XPath queries.

Purpose:

This code snippet is part of a larger system for scraping Facebook posts and extracting relevant data. It automates the process of fetching post URLs and then scraping detailed information from each post.

Let me know if you have any other questions.