facebook data | Scrape facebook event | Cell 5

This code automates the scraping of Facebook event data from HTML files, extracts event URLs, and stores the scraped information in a JSON file.

Run example

npm run import -- "Scrape facebook events"

Scrape facebook events

var glob = require('glob');
var fs = require('fs');
var path = require('path');
var importer = require('../Core');
var runSeleniumCell = importer.import("selenium cell");

// Resolve the user's home directory across platforms and locate the
// project folder that holds the saved HTML files and the JSON output.
var PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE;
var project = PROFILE_PATH + '/Conversations';
var filename = path.join(project, 'events.json');

// Extract Facebook event URLs from a saved HTML file, skipping script
// and stylesheet references and keeping only hrefs that contain a
// numeric event id (e.g. events/123456789).
function getEvents(file) {
    var html = fs.readFileSync(file).toString();
    return importer.regexToArray(/href="([^"?]*)/ig, html, 1)
        .filter(h => h.indexOf('.js') === -1)
        .filter(h => h.indexOf('.css') === -1)
        .filter(h => h.match(/events\/[0-9]+/ig));
}

// Filled in once the Selenium cells have been loaded.
var scrapeFacebookEvent, loginFacebook;
function scrapeFacebookEvents() {
    // Find the saved event pages under the project directory
    // (assumes at least three matching files exist).
    var eventFiles = glob.sync(
        '**/*@(Past|Events|Cullinan).htm',
        {cwd: project});
    // Load two Selenium cells: one logs in, the other scrapes a single event page.
    return runSeleniumCell([
            'log in facebook',
            'facebook event'
        ])
        .then(r => {
            console.log(r);
            loginFacebook = r[0];
            scrapeFacebookEvent = r[1];
            return loginFacebook();
        })
        .then(() => {
            // Merge event URLs from the first three matched files and de-duplicate.
            var events = [
                ...getEvents(path.join(project, eventFiles[0])),
                ...getEvents(path.join(project, eventFiles[1])),
                ...getEvents(path.join(project, eventFiles[2]))
            ]
                .filter((elem, pos, arr) => arr.indexOf(elem) === pos);
            console.log(events);
            // Scrape events one at a time, resolving even on failure so a
            // single bad event does not abort the whole run.
            return importer.runAllPromises(events
                .map(e => resolve => scrapeFacebookEvent(e)
                    .then(r => resolve(r))
                    .catch(r => resolve(r))))
        })
        .then(r => {
            // Persist everything that was scraped (including any errors) as JSON.
            fs.writeFileSync(filename, JSON.stringify(r, null, 4));
            return r;
        })
}
module.exports = scrapeFacebookEvents;
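
The two helpers taken from ../Core, importer.regexToArray and importer.runAllPromises, are not shown in this cell. Below is a minimal sketch of what they might look like, assuming regexToArray(regex, str, index) collects one capture group per match and runAllPromises runs an array of resolve-style callbacks in sequence; these are assumptions for illustration, not the actual Core implementations.

// Hypothetical stand-in: collect capture group `index` from every match.
// Assumes `regex` carries the global flag so exec() advances through `str`.
function regexToArray(regex, str, index) {
    var results = [];
    var match;
    while ((match = regex.exec(str)) !== null) {
        results.push(match[index]);
    }
    return results;
}

// Hypothetical stand-in: run resolve-style callbacks one at a time and
// resolve with the array of individual results.
function runAllPromises(callbacks) {
    return callbacks.reduce((chain, cb) => chain
        .then(acc => new Promise(cb).then(r => acc.concat([r]))),
        Promise.resolve([]));
}

// Example: extract an event URL the way getEvents does.
var sample = '<a href="https://www.facebook.com/events/123456789/?ref=1">Event</a>';
console.log(regexToArray(/href="([^"?]*)/ig, sample, 1));
// => [ 'https://www.facebook.com/events/123456789/' ]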

What the code could have been:

const glob = require('glob');
const fs = require('fs');
const path = require('path');
const importer = require('../Core');
const runSeleniumCell = importer.import('selenium cell');

const PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE;
const PROJECT_PATH = path.join(PROFILE_PATH, 'Conversations');

class FacebookEventScraper {
  constructor() {
    this.filename = path.join(PROJECT_PATH, 'events.json');
    this.loginFacebook = null;
    this.scrapeFacebookEvent = null;
  }

  async scrapeFacebookEvents() {
    try {
      const eventFiles = await this.getFileNamesMatchingPattern();
      const loginResponse = await runSeleniumCell([
        'log in facebook',
        'facebook event'
      ]);
      this.loginFacebook = loginResponse[0];
      this.scrapeFacebookEvent = loginResponse[1];

      await this.loginFacebook();

      const events = await Promise.all(
        eventFiles.map(async (file) => {
          const filePath = path.join(PROJECT_PATH, file);
          return this.getEventsFromHtmlFile(filePath);
        })
      );

      const uniqueEvents = [...new Set(events.flat())];
      // Mirror the original behavior: a failed scrape resolves with its
      // error instead of rejecting the whole batch.
      const facebookEvents = await Promise.all(
        uniqueEvents.map((event) =>
          this.scrapeFacebookEvent(event).catch((error) => error))
      );

      await this.writeEventsToJSONFile(facebookEvents);
      return facebookEvents;
    } catch (error) {
      console.error('Error occurred while scraping Facebook events:', error);
      throw error;
    }
  }

  async getFileNamesMatchingPattern() {
    return glob.sync(
      '**/*@(Past|Events|Cullinan).htm',
      { cwd: PROJECT_PATH }
    );
  }

  async getEventsFromHtmlFile(filePath) {
    const html = await this.readHtmlFile(filePath);
    return this.regexToArray(/href="([^"?]*)/ig, html, 1)
      .filter((h) => h.indexOf('.js') === -1)
      .filter((h) => h.indexOf('.css') === -1)
      .filter((h) => h.match(/events\/[0-9]+/ig));
  }

  async readHtmlFile(filePath) {
    return (await fs.promises.readFile(filePath)).toString();
  }

  regexToArray(regex, str, index) {
    return importer.regexToArray(regex, str, index);
  }

  async writeEventsToJSONFile(events) {
    await fs.promises.writeFile(this.filename, JSON.stringify(events, null, 4));
  }
}

const scraper = new FacebookEventScraper();
module.exports = scraper.scrapeFacebookEvents.bind(scraper);
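
A hypothetical usage of the refactored export (the require path is illustrative):

const scrapeFacebookEvents = require('./scrape-facebook-events');

scrapeFacebookEvents()
  .then(events => console.log(`Scraped ${events.length} events`))
  .catch(err => console.error('Scraping failed:', err));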

This code snippet is designed to scrape Facebook event data from a set of HTML files and store the results in a JSON file. Here's a breakdown:

Dependencies:

- glob: matches the saved HTML event files under the project directory.
- fs: reads the HTML files and writes the resulting JSON.
- path: joins file paths in a platform-safe way.
- ../Core (importer): supplies regexToArray, runAllPromises, and the Selenium cell runner.

Variables:

- PROFILE_PATH: the user's home directory, resolved across platforms.
- project: the Conversations directory containing the saved HTML files.
- filename: the output path, events.json, inside the project directory.

Functions:

- getEvents(file): extracts Facebook event URLs from one saved HTML file, skipping script and stylesheet references.
- scrapeFacebookEvents(): logs in via Selenium, scrapes every unique event URL, and writes the results to events.json.

Purpose:

This code automates the process of scraping Facebook event data from a set of HTML files and stores the results in structured JSON. It relies on Selenium for browser automation and on the local Core module for regex extraction and promise utilities.
