facebook data | Like all facebook posts | Scrape facebook events | Search

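This code automates the scraping of a Facebook event's details, including its description and its discussion posts, using a chainable browser-automation client (the `getUrl`/`url`/`pause`/`isExisting`/`click` calls appear to match a WebdriverIO-style API rather than Puppeteer's).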

Run example

npm run import -- "Scrape facebook event"

Scrape facebook event

var importer = require('../Core');

/**
 * Scrape a Facebook event page: first the "About" summary, then the
 * "Discussion" posts with their participants, photos and comments.
 *
 * Two names used here are not defined in this block and must already be in
 * scope when it runs: `client` (a chainable browser client exposing getUrl,
 * url, pause, isExisting and click) and `getAllXPath` (called with a nested
 * XPath template; its result is treated as an array of rows).
 *
 * @param {string} event - URL of the event page; it is also compared against
 *     the current URL to decide whether navigation is needed.
 * @returns {Promise<Array<Object>|undefined>} The client's chained thenable.
 *     It resolves to one object per scrape root, each carrying `description`
 *     (array of strings) and `posts`. Errors are logged with console.log and
 *     the chain then resolves to undefined instead of rejecting.
 */
function scrapeFacebookEvent(event) {
    // Scrape root: the whole <body>, unless a photo popup is open, in which
    // case only the popup element(s) are scanned.
    var rootXPath = '//body[not(.//*[contains(@class, "fbPhotoSnowliftPopup")])]|//body[.//*[contains(@class, "fbPhotoSnowliftPopup")]]//*[contains(@class, "fbPhotoSnowliftPopup")]';
    var aboutTab = '//a[contains(., "About")]';
    var discussionTab = '//a[contains(., "Discussion")]';
    // Filled by the "About" scrape and consumed after the "Discussion" scrape.
    var description;

    return client
        .getUrl()
        // Only navigate when the browser is not already on the event page.
        .then(url => url.indexOf(event) === -1
            ? client.url(event)
            : client)
        .pause(3000)
        .isExisting(aboutTab)
        .then(exists => exists ? client.click(aboutTab) : client)
        .pause(1000)
        .then(() => getAllXPath([
            rootXPath,
            {
                description: [
                    './/*[contains(@id, "reaction_units")]/div/div|.//*[contains(@id, "event_summary")]//li',
                    {
                        value: ['.//span//text()']
                    }
                ]
            }
        ]))
        .then(rows => {
            description = rows;
        })
        .isExisting(discussionTab)
        .then(exists => exists ? client.click(discussionTab) : client)
        .pause(1000)
        .then(() => getAllXPath([
            rootXPath,
            {
                posts: [
                    './/*[contains(@class, "fbPhotoSnowliftAuthorInfo")]|.//*[contains(@class, "fbUserPost")]',
                    {
                        description:
                            './/following-sibling::div//*[contains(@class, "fbPhotosPhotoCaption")]//text()|.//*[contains(@class, "userContent")]//text()|.//h5//text()|.//a[contains(@class, "profileLink")]//text()',
                        participants: [
                            './/a[contains(@class, "profileLink")]/@href|.//a[contains(@href, "facebook") and .//img]/@href',
                            './following-sibling::div//a/@href',
                            './/*[contains(@class, "commentable_item")]//a[contains(@class, "UFICommentActorName")]/@href'
                        ],
                        comments: [
                            './/h6[contains(., "Comments")]//following-sibling::div/div/div[contains(@class, "UFIComment")]',
                            {
                                time:
                                    './/*[contains(@class, "uiLinkSubtle")]//text()',
                                content:
                                    './/*[contains(@class, "UFICommentBody")]//text()',
                                from:
                                    './/a[contains(@class, "UFICommentActorName")]/text()|.//a[contains(@class, "UFICommentActorName")]/@href'
                            }
                        ]
                    }
                ]
            }
        ]))
        .then(results => results.map((result, index) => {
            // The two scrapes run at different moments, so the earlier one may
            // hold fewer rows than this one. Fall back to an empty summary
            // rather than throwing and losing every scraped post.
            var summary = (description && description[index]) || {};
            var summaryRows = summary.description || [];
            return Object.assign(result, {
                description: summaryRows.map(row => row.value.join(' ')),
                posts: result.posts.map(post => Object.assign(post, {
                    event: event,
                    description: post.description.join(' '),
                    // Profile links: drop placeholders, photo links and
                    // reaction-dialog links.
                    participants: post.participants.filter(href => href !== '#'
                        && href.indexOf('photo') === -1
                        && href.indexOf('ufi/reaction') === -1),
                    // Photo links: anything mentioning "photo" except the
                    // share dialog.
                    photos: post.participants.filter(href => href.indexOf('photo') !== -1
                        && href.indexOf('ajax/sharer') === -1),
                    comments: post.comments.map(comment => Object.assign(comment, {
                        time: comment.time.join(' '),
                        content: comment.content.join(' '),
                        from: comment.from.join(' ')
                    }))
                }))
            });
        }))
        // Best-effort: log the failure and let the chain resolve to undefined.
        .catch(e => console.log(e));
}
module.exports = scrapeFacebookEvent;

What the code could have been:

const { Client } = require('../Core');

/**
 * Scrape a Facebook event: the "About" summary first, then the "Discussion"
 * posts with their participants, photos and comments.
 *
 * Uses `Client`, `extractDescription` and `extractDiscussionData`, which are
 * defined outside this function.
 *
 * @param {string} event - URL of the event page; also compared against the
 *     current URL to decide whether navigation is needed.
 * @returns {Promise<Array<Object>>} One object per discussion row, each with
 *     `description` (array of strings) and `posts` (text fields joined into
 *     strings, links split into `participants` and `photos`).
 * @throws Rethrows any error raised while scraping, after logging it.
 */
async function scrapeFacebookEvent(event) {
    const aboutTab = '//a[contains(., "About")]';
    const discussionTab = '//a[contains(., "Discussion")]';

    try {
        // NOTE(review): assumes `Client` can be constructed without arguments
        // and attaches to an existing browser session — confirm against ../Core.
        const client = new Client();

        // Only navigate when the browser is not already on the event page.
        const currentUrl = await client.getUrl();
        if (!currentUrl.includes(event)) {
            await client.url(event);
        }
        await client.pause(3000);

        if (await client.isExisting(aboutTab)) {
            await client.click(aboutTab);
        }
        await client.pause(1000);
        const description = await extractDescription(client);

        if (await client.isExisting(discussionTab)) {
            await client.click(discussionTab);
        }
        await client.pause(1000);
        const discussionData = await extractDiscussionData(client, description);

        return discussionData.map((data, index) => {
            // The description scrape ran earlier and may hold fewer rows than
            // the discussion scrape; fall back to an empty summary instead of
            // throwing away every scraped post.
            const summary = (description && description[index]) || {};
            const summaryRows = summary.description || [];
            return {
                ...data,
                description: summaryRows.map((row) => row.value.join(' ')),
                posts: data.posts.map((post) => ({
                    ...post,
                    event,
                    description: post.description.join(' '),
                    // Profile links: drop placeholders, photo links and
                    // reaction-dialog links.
                    participants: post.participants.filter((href) => href !== '#'
                        && href.indexOf('photo') === -1
                        && href.indexOf('ufi/reaction') === -1),
                    // Photo links: anything mentioning "photo" except the
                    // share dialog.
                    photos: post.participants.filter((href) => href.indexOf('photo') !== -1
                        && href.indexOf('ajax/sharer') === -1),
                    comments: post.comments.map((comment) => ({
                        ...comment,
                        time: comment.time.join(' '),
                        content: comment.content.join(' '),
                        from: comment.from.join(' '),
                    })),
                })),
            };
        });
    } catch (error) {
        // Log for visibility, then let the caller decide how to recover.
        console.log(error);
        throw error;
    }
}

/**
 * Query the event summary rows.
 *
 * Passes a two-level XPath template to `getAllXPath`: the outer level picks
 * the scrape root (the page body, or the photo popup when one is present),
 * the inner level picks each summary entry and the text of its spans.
 *
 * @param {Client} client - Accepted for symmetry with the other extractors;
 *     this function does not read it.
 * @returns {Promise<Array<Object>>} Whatever `getAllXPath` resolves to for
 *     the description template.
 */
async function extractDescription(client) {
    const root = '//body[not(.//*[contains(@class, "fbPhotoSnowliftPopup")])]|//body[.//*[contains(@class, "fbPhotoSnowliftPopup")]]//*[contains(@class, "fbPhotoSnowliftPopup")]';
    const summaryEntries = './/*[contains(@id, "reaction_units")]/div/div|.//*[contains(@id, "event_summary")]//li';
    const template = [
        root,
        {
            description: [
                summaryEntries,
                { value: ['.//span//text()'] },
            ],
        },
    ];
    const rows = await getAllXPath(template);
    return rows;
}

/**
 * Query the discussion posts of the event.
 *
 * Passes a nested XPath template to `getAllXPath`: scrape root, then each
 * post, then the post's text, its participant links and its comments.
 *
 * @param {Client} client - Accepted for symmetry with the other extractors;
 *     this function does not read it.
 * @param {Array<Object>} description - The previously scraped description;
 *     not read here, kept so existing callers keep working.
 * @returns {Promise<Array<Object>>} Whatever `getAllXPath` resolves to for
 *     the discussion template.
 */
async function extractDiscussionData(client, description) {
    const root = '//body[not(.//*[contains(@class, "fbPhotoSnowliftPopup")])]|//body[.//*[contains(@class, "fbPhotoSnowliftPopup")]]//*[contains(@class, "fbPhotoSnowliftPopup")]';
    const discussionData = await getAllXPath([root, {
        posts: [
            './/*[contains(@class, "fbPhotoSnowliftAuthorInfo")]|.//*[contains(@class, "fbUserPost")]',
            {
                description:
                    './/following-sibling::div//*[contains(@class, "fbPhotosPhotoCaption")]//text()|.//*[contains(@class, "userContent")]//text()|.//h5//text()|.//a[contains(@class, "profileLink")]//text()',
                participants: [
                    // The space after `and` matters: `.` is a legal name
                    // character, so `and.//img` can be read as one name
                    // instead of the `and` operator followed by a path.
                    './/a[contains(@class, "profileLink")]/@href|.//a[contains(@href, "facebook") and .//img]/@href',
                    './following-sibling::div//a/@href',
                    './/*[contains(@class, "commentable_item")]//a[contains(@class, "UFICommentActorName")]/@href'
                ],
                comments: [
                    './/h6[contains(., "Comments")]//following-sibling::div/div/div[contains(@class, "UFIComment")]',
                    {
                        time:
                            './/*[contains(@class, "uiLinkSubtle")]//text()',
                        content:
                            './/*[contains(@class, "UFICommentBody")]//text()',
                        from:
                            './/a[contains(@class, "UFICommentActorName")]/text()|.//a[contains(@class, "UFICommentActorName")]/@href'
                    }
                ]
            }
        ]
    }]);
    return discussionData;
}

/**
 * Placeholder for the XPath template evaluator.
 *
 * No evaluation is performed yet: every call rejects with an
 * Error('Not implemented'), regardless of the argument.
 *
 * @param {Array<string|Object>} xPath - The XPath template that a real
 *     implementation would evaluate; currently ignored.
 * @returns {Promise<Array<Object>>} Never resolves in this stub; the returned
 *     promise always rejects.
 */
async function getAllXPath(xPath) {
    // TODO: implement the XPath template evaluation.
    const notImplemented = new Error('Not implemented');
    throw notImplemented;
}

module.exports = scrapeFacebookEvent;

This code snippet is designed to scrape information about a Facebook event. Here's a breakdown:

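Dependencies:

- `../Core` — the project's importer module, required at the top of the file.
- `client` and `getAllXPath` — used by the function but not defined in this file, so they must already be in scope when it runs.

Function:

- `scrapeFacebookEvent(event)` opens the event page if the browser is not already on it, clicks "About" and collects the description, clicks "Discussion" and collects the posts and their comments, then joins the scraped text fragments into strings and separates participant links from photo links.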

Purpose:

This code snippet is part of a larger system for scraping Facebook event data. It automates the process of fetching an event page, extracting its description, and gathering information about discussions happening within the event.

Let me know if you have any other questions.