This code automates the scraping of Facebook event data from HTML files, extracts event URLs, and stores the scraped information in a JSON file.
Usage: `npm run import -- "Scrape facebook events"`
var glob = require('glob');
var fs = require('fs');
var path = require('path');
var importer = require('../Core');
var runSeleniumCell = importer.import("selenium cell");
var PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE;
var project = PROFILE_PATH + '/Conversations';
var filename = path.join(project, 'events.json');
/**
 * Extracts Facebook event URLs from a saved HTML file.
 *
 * @param {string} file - Path to the HTML file to scan.
 * @returns {string[]} href values that look like Facebook event pages
 *   (contain `events/<digits>`), excluding `.js`/`.css` asset links.
 *   The href is captured up to the first quote or `?`, so query strings
 *   are stripped.
 */
function getEvents(file) {
  var html = fs.readFileSync(file).toString();
  // Use the standard String.prototype.matchAll instead of the custom
  // importer.regexToArray helper: capture group 1 of every href match.
  return Array.from(html.matchAll(/href="([^"?]*)/ig), m => m[1])
    .filter(h => h.indexOf('.js') === -1)
    .filter(h => h.indexOf('.css') === -1)
    .filter(h => h.match(/events\/[0-9]+/ig));
}
// Selenium cell functions, populated once runSeleniumCell resolves.
var scrapeFacebookEvent;
var loginFacebook;

/**
 * Scrapes Facebook event data from saved HTML exports.
 *
 * Finds every `*Past.htm` / `*Events.htm` / `*Cullinan.htm` file under the
 * project directory, logs in to Facebook via Selenium, extracts the unique
 * event URLs from the HTML files, scrapes each event, and writes the
 * results to `events.json`.
 *
 * @returns {Promise<Array>} Resolves with the scraped event results
 *   (individual scrape failures are captured as resolved values rather
 *   than rejecting the whole run).
 */
function scrapeFacebookEvents() {
  var eventFiles = glob.sync(
    '**/*@(Past|Events|Cullinan).htm',
    {cwd: project});
  return runSeleniumCell([
    'log in facebook',
    'facebook event'
  ])
    .then(r => {
      console.log(r);
      loginFacebook = r[0];
      scrapeFacebookEvent = r[1];
      return loginFacebook();
    })
    .then(() => {
      // Process every matched file. The previous code hard-coded
      // eventFiles[0..2], which crashed (path.join(undefined)) when fewer
      // than three files matched and ignored any additional matches.
      var events = eventFiles
        .reduce((all, f) => all.concat(getEvents(path.join(project, f))), [])
        .filter((elem, pos, arr) => arr.indexOf(elem) === pos);
      console.log(events);
      // Resolve on both success and failure so one bad event does not
      // abort the whole batch.
      return importer.runAllPromises(events
        .map(e => resolve => scrapeFacebookEvent(e)
          .then(r => resolve(r))
          .catch(r => resolve(r))));
    })
    .then(r => {
      fs.writeFileSync(filename, JSON.stringify(r, null, 4));
      return r;
    });
}
module.exports = scrapeFacebookEvents;
const glob = require('glob');
const fs = require('fs');
const path = require('path');
const importer = require('../Core');
const runSeleniumCell = importer.import('selenium cell');
const PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE;
const PROJECT_PATH = path.join(PROFILE_PATH, 'Conversations');
class FacebookEventScraper {
constructor() {
this.filename = path.join(PROJECT_PATH, 'events.json');
this.loginFacebook = null;
this.scrapeFacebookEvent = null;
}
async scrapeFacebookEvents() {
try {
const eventFiles = await this.getFileNamesMatchingPattern();
const loginResponse = await runSeleniumCell([
'log in facebook',
'facebook event'
]);
this.loginFacebook = loginResponse[0];
this.scrapeFacebookEvent = loginResponse[1];
await this.loginFacebook();
const events = await Promise.all(
eventFiles.map(async (file) => {
const filePath = path.join(PROJECT_PATH, file);
return this.getEventsFromHtmlFile(filePath);
})
);
const uniqueEvents = [...new Set(events.flat())];
const facebookEvents = await Promise.all(
uniqueEvents.map(async (event) => {
return await this.scrapeFacebookEvent(event);
})
);
await this.writeEventsToJSONFile(facebookEvents);
return facebookEvents;
} catch (error) {
console.error('Error occurred while scraping Facebook events:', error);
throw error;
}
}
async getFileNamesMatchingPattern() {
return glob.sync(
'**/*@(Past|Events|Cullinan).htm',
{ cwd: PROJECT_PATH }
);
}
async getEventsFromHtmlFile(filePath) {
const html = await this.readHtmlFile(filePath);
return this.regexToArray(/href="([^"?]*)/ig, html, 1)
.filter((h) => h.indexOf('.js') === -1)
.filter((h) => h.indexOf('.css') === -1)
.filter((h) => h.match(/events\/[0-9]+/ig));
}
async readHtmlFile(filePath) {
return fs.readFileSync(filePath).toString();
}
regexToArray(regex, str, index) {
return importer.regexToArray(regex, str, index);
}
async writeEventsToJSONFile(events) {
fs.writeFileSync(this.filename, JSON.stringify(events, null, 4));
}
}
module.exports = FacebookEventScraper.prototype.scrapeFacebookEvents;
This code snippet is designed to scrape Facebook event data from a set of HTML files and store the results in a JSON file. Here's a breakdown:
Dependencies:

- `glob`: finds files matching a pattern.
- `fs`: Node.js built-in module for file system operations (reading and writing files).
- `path`: Node.js built-in module for working with file paths.
- `importer`: a custom module providing utility functions and Selenium integration.

Variables:

- `PROFILE_PATH`: the user's home directory.
- `project`: the path to a "Conversations" directory within the user's home directory.
- `filename`: the path to the output JSON file.

Functions:

- `getEvents(file)`: reads an HTML file and extracts the Facebook event URLs it contains.
- `scrapeFacebookEvents()`: locates the saved HTML exports (`Past.htm`, `Events.htm`, `Cullinan.htm`) within the `project` directory, uses Selenium (via `runSeleniumCell`) to log in to Facebook, scrapes each event with the `scrapeFacebookEvent` function, and writes the results to the `filename` JSON file.

Purpose:
This code automates the process of scraping Facebook event data from a set of HTML files, storing the results in a structured JSON format. It relies on Selenium for browser automation and custom modules for file handling and utility functions.
Let me know if you have any other questions.