This code automates the scraping of Facebook event details, including the event description and discussion posts, using an automated browser client (a chainable WebdriverIO-style `client` API).
npm run import -- "Scrape facebook event"
var importer = require('../Core');
/**
 * Scrape a Facebook event page: the About-tab description plus the
 * Discussion-tab posts, participants, and comments.
 *
 * NOTE(review): relies on `client` (a chainable, WebdriverIO-style browser
 * session) and `getAllXPath` being in scope at call time — presumably
 * injected by the `importer` framework required above; neither is defined
 * in this half of the file. TODO confirm.
 *
 * @param {string} event - URL (or URL fragment) of the Facebook event.
 * @returns {Promise} Resolves with an array of result objects (one per
 *   matched XPath root), each carrying `description` and `posts`.
 *   Errors are logged and swallowed — the promise then resolves undefined.
 */
function scrapeFacebookEvent(event) {
// Captured in the closure so the About-tab result survives until the
// final mapping step, after the Discussion-tab extraction completes.
var description;
return client
.getUrl()
// Navigate only if the browser is not already on the event page.
.then(url => url.indexOf(event) === -1
? client.url(event)
: client)
.pause(3000)
// Open the "About" tab when present to expose the event summary.
// NOTE(review): the `[]` fallback differs from the `client` fallback used
// for the Discussion tab below — presumably both work with this chainable
// API, but verify.
.isExisting('//a[contains(., "About")]')
.then(is => is ? client.click('//a[contains(., "About")]') : [])
.pause(1000)
// Extract the event summary. The root XPath selects the whole <body>,
// or — when a photo lightbox ("fbPhotoSnowliftPopup") is open — only
// the lightbox element.
.then(() => getAllXPath([
'//body[not(.//*[contains(@class, "fbPhotoSnowliftPopup")])]|//body[.//*[contains(@class, "fbPhotoSnowliftPopup")]]//*[contains(@class, "fbPhotoSnowliftPopup")]',
{
description: [
'.//*[contains(@id, "reaction_units")]/div/div|.//*[contains(@id, "event_summary")]//li',
{
// Collect the text nodes of each summary row.
value: ['.//span//text()']
}
]
}
]))
.then(desc => {
description = desc;
})
// Switch to the "Discussion" tab when present.
.isExisting('//a[contains(., "Discussion")]')
.then(is => is ? client.click('//a[contains(., "Discussion")]') : client)
.pause(1000)
.then(() => {
// Same lightbox-aware root as above; pull each post with its author
// text, participant/profile links, and nested UFI comments.
return getAllXPath([
'//body[not(.//*[contains(@class, "fbPhotoSnowliftPopup")])]|//body[.//*[contains(@class, "fbPhotoSnowliftPopup")]]//*[contains(@class, "fbPhotoSnowliftPopup")]',
{
posts: [
'.//*[contains(@class, "fbPhotoSnowliftAuthorInfo")]|.//*[contains(@class, "fbUserPost")]',
{
// Caption / post body / heading / author-name text nodes.
description:
'.//following-sibling::div//*[contains(@class, "fbPhotosPhotoCaption")]//text()|.//*[contains(@class, "userContent")]//text()|.//h5//text()|.//a[contains(@class, "profileLink")]//text()',
// Raw hrefs; photo vs. profile links are separated later.
participants: [
'.//a[contains(@class, "profileLink")]/@href|.//a[contains(@href, "facebook") and .//img]/@href',
'./following-sibling::div//a/@href',
'.//*[contains(@class, "commentable_item")]//a[contains(@class, "UFICommentActorName")]/@href'
],
comments: [
'.//h6[contains(., "Comments")]//following-sibling::div/div/div[contains(@class, "UFIComment")]',
{
time:
'.//*[contains(@class, "uiLinkSubtle")]//text()',
content:
'.//*[contains(@class, "UFICommentBody")]//text()',
from:
'.//a[contains(@class, "UFICommentActorName")]/text()|.//a[contains(@class, "UFICommentActorName")]/@href'
}
]
}
]
}
])
})
// Join the scattered text-node arrays into strings and split the raw
// href list into non-photo participants vs. photo links.
.then(results => results.map((r, i) => Object.assign(r, {
description: description[i].description.map(d => d.value.join(' ')),
posts: r.posts.map(p => {
return Object.assign(p, {
event: event,
description: p.description.join(' '),
// Profile links only: drop photo links, bare "#" anchors, and
// reaction-list links.
participants: p.participants.filter(i => i
.indexOf('photo') === -1
&& i !== '#'
&& i.indexOf('ufi/reaction') === -1),
// Photo links only: drop bare anchors and share-dialog links.
photos: p.participants.filter(i => i
.indexOf('photo') !== -1
&& i !== '#'
&& i.indexOf('ajax/sharer') === -1),
comments: p.comments.map(c => Object.assign(c, {
time: c.time.join(' '),
content: c.content.join(' '),
from: c.from.join(' ')
}))
});
})
})))
// Best-effort: log and swallow any failure (resolves undefined).
.catch(e => console.log(e))
};
// First export; overwritten by the second assignment at the bottom of
// this file, which exports the async implementation below.
module.exports = scrapeFacebookEvent;
const { Client } = require('../Core');
/**
 * Scrape a Facebook event (async/await implementation).
 *
 * Navigates the browser client to the event page, reads the About-tab
 * summary, then the Discussion-tab posts, and returns the combined,
 * flattened results.
 *
 * @param {string} event - The URL of the Facebook event to scrape.
 * @returns {Promise<Array<Object>>} One object per matched XPath root,
 *   each with `description` (string[]) and `posts` (each post carrying
 *   `event`, `description`, `participants`, `photos`, `comments`).
 * @throws Re-throws any navigation/extraction error after logging it.
 */
async function scrapeFacebookEvent(event) {
  try {
    // Create a new browser client instance.
    const client = new Client();
    // Navigate to the event page unless we are already on it.
    const url = await client.getUrl();
    if (!url.includes(event)) {
      await client.url(event);
    }
    // Give the page time to load.
    await client.pause(3000);
    // Open the About tab when present.
    if (await client.isExisting('//a[contains(., "About")]')) {
      await client.click('//a[contains(., "About")]');
    }
    await client.pause(1000);
    // Extract the event summary.
    const description = await extractDescription(client);
    // Switch to the Discussion tab when present.
    if (await client.isExisting('//a[contains(., "Discussion")]')) {
      await client.click('//a[contains(., "Discussion")]');
    }
    await client.pause(1000);
    // Extract the discussion posts.
    const discussionData = await extractDiscussionData(client, description);
    // Join the scattered text-node arrays into strings and split the raw
    // href list into non-photo participants vs. photo links.
    return discussionData.map((data, index) => ({
      ...data,
      description: description[index].description.map((desc) => desc.value.join(' ')),
      posts: data.posts.map((post) => ({
        ...post,
        event,
        description: post.description.join(' '),
        // Profile links only: drop photo links, bare "#" anchors, and
        // reaction-list links. (The "ufi/reaction" exclusion matches the
        // promise-chain implementation above; it had been lost in this
        // rewrite.)
        participants: post.participants.filter((href) =>
          href !== '#'
          && !href.includes('photo')
          && !href.includes('ufi/reaction')),
        // Photo links only: drop bare anchors and share-dialog links
        // ("ajax/sharer", also restored from the first implementation).
        photos: post.participants.filter((href) =>
          href !== '#'
          && href.includes('photo')
          && !href.includes('ajax/sharer')),
        comments: post.comments.map((comment) => ({
          ...comment,
          time: comment.time.join(' '),
          content: comment.content.join(' '),
          from: comment.from.join(' ')
        }))
      }))
    }));
  } catch (error) {
    // Log for visibility, then propagate so callers can handle it.
    console.log(error);
    throw error;
  }
}
/**
 * Extract the About-tab event summary from the current page.
 *
 * The root XPath matches the whole document <body>, or — when a photo
 * lightbox ("fbPhotoSnowliftPopup") is open — only the lightbox element.
 *
 * @param {Client} client - The browser client instance (not referenced
 *   directly; the XPath query runs against the page it last loaded).
 * @returns {Promise<Array<Object>>} One entry per matched root, each with
 *   a `description` array of `{ value: string[] }` items.
 */
async function extractDescription(client) {
  const rootSelector =
    '//body[not(.//*[contains(@class, "fbPhotoSnowliftPopup")])]|//body[.//*[contains(@class, "fbPhotoSnowliftPopup")]]//*[contains(@class, "fbPhotoSnowliftPopup")]';
  // Summary rows come from either the reaction-units panel or the
  // event-summary list; collect each row's text nodes as `value`.
  const fields = {
    description: [
      './/*[contains(@id, "reaction_units")]/div/div|.//*[contains(@id, "event_summary")]//li',
      {
        value: ['.//span//text()']
      }
    ]
  };
  return getAllXPath([rootSelector, fields]);
}
/**
 * Extract discussion posts, participants, and comments from the
 * Discussion tab of the event page.
 *
 * The root XPath matches the document <body>, or — when a photo lightbox
 * ("fbPhotoSnowliftPopup") is open — only the lightbox element.
 *
 * @param {Client} client - The browser client instance (not referenced
 *   directly; the XPath query runs against the page it last loaded).
 * @param {Array<Object>} description - The previously extracted event
 *   description (unused here; kept for interface stability).
 * @returns {Promise<Array<Object>>} One entry per matched root, each with
 *   a `posts` array of { description, participants, comments } records.
 */
async function extractDiscussionData(client, description) {
  const xPath = '//body[not(.//*[contains(@class, "fbPhotoSnowliftPopup")])]|//body[.//*[contains(@class, "fbPhotoSnowliftPopup")]]//*[contains(@class, "fbPhotoSnowliftPopup")]';
  const discussionData = await getAllXPath([xPath, {
    posts: [
      './/*[contains(@class, "fbPhotoSnowliftAuthorInfo")]|.//*[contains(@class, "fbUserPost")]',
      {
        // Caption / post body / heading / author-name text nodes.
        description:
          './/following-sibling::div//*[contains(@class, "fbPhotosPhotoCaption")]//text()|.//*[contains(@class, "userContent")]//text()|.//h5//text()|.//a[contains(@class, "profileLink")]//text()',
        participants: [
          // Fixed: "and .//img" needs the space — "and.//img" does not
          // lex as the `and` operator and broke this expression (the
          // promise-chain implementation earlier in this file has the
          // correct form).
          './/a[contains(@class, "profileLink")]/@href|.//a[contains(@href, "facebook") and .//img]/@href',
          './following-sibling::div//a/@href',
          './/*[contains(@class, "commentable_item")]//a[contains(@class, "UFICommentActorName")]/@href'
        ],
        comments: [
          './/h6[contains(., "Comments")]//following-sibling::div/div/div[contains(@class, "UFIComment")]',
          {
            time:
              './/*[contains(@class, "uiLinkSubtle")]//text()',
            content:
              './/*[contains(@class, "UFICommentBody")]//text()',
            from:
              './/a[contains(@class, "UFICommentActorName")]/text()|.//a[contains(@class, "UFICommentActorName")]/@href'
          }
        ]
      }
    ]
  }]);
  return discussionData;
}
/**
 * Get all XPath elements.
 *
 * Placeholder: the real implementation (evaluating the nested XPath spec
 * against the live page) has not been ported yet, so every call rejects.
 *
 * @param {Array<string|Object>} xPath - The XPath spec to match.
 * @returns {Promise<Array<Object>>} Never resolves; always rejects.
 * @throws {Error} Always — not implemented.
 */
async function getAllXPath(xPath) {
  // TODO: port the XPath extraction logic used by the chained
  // implementation at the top of this file.
  throw new Error('Not implemented');
}
// Overwrites the earlier export, so consumers receive the async
// implementation defined above.
module.exports = scrapeFacebookEvent;
This code snippet is designed to scrape information about a Facebook event. Here's a breakdown:
Dependencies:
- `importer` / `Client` (from `../Core`): custom modules (not shown here) that provide the browser-automation client and supporting utilities.

Function:
- `scrapeFacebookEvent(event)`: takes the event URL (`event`) as input and uses the browser client (`client`) to navigate to the event page, open the About and Discussion tabs, and extract the description, posts, participant links, and comments.
Purpose:
This code snippet is part of a larger system for scraping Facebook event data. It automates the process of fetching an event page, extracting its description, and gathering information about discussions happening within the event.
Let me know if you have any other questions.