collect facebook profiles

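This code automates collecting Facebook friend profiles with Selenium: it logs in, reads the most recently saved friends page from the `Conversations` folder, extracts the friend profile URLs from it, and likes the posts of one randomly chosen friend who does not yet have a posts file for the current year. It uses jsdom to parse the saved HTML and custom modules (loaded through the project's importer) to interact with Facebook and manage the data.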

Run example

const { JSDOM } = require('jsdom');
const fs = require('fs');
const path = require('path');
const glob = require('glob');
const assert = require('assert');
const importer = require('../Core');
const runSeleniumCell = importer.import("selenium cell");

// Helpers that are only available at run time: the first three come from
// runSeleniumCell(), getAllXPath is bound to the parsed friends page.
let loginFacebook;
let likeAllPosts;
let getAllXPath;
let scrapeFacebookFriends;

const PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE;
const project = PROFILE_PATH + '/Conversations';

// Path of the friends file read by the latest run (used in the assertion message).
let FRIENDS_FILE;

/**
 * Collects the friend profile links from the document getAllXPath is bound to.
 *
 * The query string is stripped BEFORE de-duplicating, so two links that only
 * differ in their tracking parameters count as one profile.
 *
 * @returns {Promise<string[]>} unique profile URLs, in order of first appearance
 */
function parseFacebookFriends() {
    return getAllXPath([
        '//a[contains(@href, "friends_tab")]/@href'
    ]).then(hrefs => {
        const profiles = hrefs.map(href => href.replace(/\?.*$/, ''));
        return [...new Set(profiles)];
    });
}

/**
 * Logs in to Facebook, reads the newest saved friends page under `project`,
 * and likes the posts of one randomly chosen friend that has no
 * `<id>-posts-<year>.json` file yet.
 *
 * @returns {Promise<*>} whatever importer.runAllPromises resolves with
 * @throws {AssertionError} when no friends file exists or it contains no friend links
 */
async function collectFacebookProfiles() {
    const cells = await runSeleniumCell([
        'log in facebook',
        'like all facebook posts',
        'scrape facebook profile',
        'scrape facebook friends',
    ]);
    ({ loginFacebook, likeAllPosts, scrapeFacebookFriends } = cells);
    // Called on the returned object so `this` is the same as in a method call.
    await cells.loginFacebook();

    // TODO: abstract this data collection from JSDOM
    // TODO: only scrape once per day? use last file instead?
    // await scrapeFacebookFriends();

    // Pick the most recently modified friends file.
    const friendFiles = glob.sync('**/*friend*', {cwd: project, nodir: true});
    assert(friendFiles.length > 0, 'no saved friends file found in ' + project);
    const newest = friendFiles
        .map(name => ({ name, mtime: fs.statSync(path.join(project, name)).mtime.getTime() }))
        .sort((a, b) => a.mtime - b.mtime)
        .pop();
    FRIENDS_FILE = path.join(project, newest.name);
    const doc = fs.readFileSync(FRIENDS_FILE).toString();

    // Bind the XPath helper to the saved page: `execute` runs the supplied
    // function locally against the jsdom document (presumably standing in
    // for the Selenium client the helper normally receives).
    const dom = new JSDOM(doc);
    getAllXPath = importer.import("all elements xpath", {
        client: {
            execute: (func, ...args) => Promise.resolve({
                value: func.apply(dom.window.document, args)
            }),
            addCommand: () => { }
        },
        document: dom.window.document,
        XPathResult: {ORDERED_NODE_ITERATOR_TYPE: 5}
    });

    const friends = await parseFacebookFriends();
    assert(friends.length > 0, FRIENDS_FILE + ' should have friends links in it');

    // use glob.sync to make sure every friend is hit at least once in a rotation
    const existingPosts = new Set(glob.sync('**/*-posts-*.json', {cwd: project}));
    // TODO: way to tell which part of the URL is unique?
    const friendCount = friends.length;
    const year = (new Date()).getFullYear();
    const fresh = friends.filter(profile => {
        const profileId = profile.replace(/^\/|\/$/ig, '').split('/').pop()
            .replace(/[^a-z0-9]/ig, '_');
        // TODO: check for file.stat instead of year?
        return !existingPosts.has(profileId + '-posts-' + year + '.json');
    });
    const done = friendCount - fresh.length;
    const percent = Math.round(done / friendCount * 100);
    console.log(done + ' / ' + friendCount + ' : ' + percent + '%');
    // return percent === 100 ? scrapeFacebookFriends() : [];

    // Math.floor keeps the index inside [0, fresh.length); Math.round could
    // yield fresh.length and silently select nobody.
    const rand = Math.floor(fresh.length * Math.random());
    const chosen = fresh.slice(rand, rand + 1);
    console.log(chosen);
    return importer.runAllPromises(chosen
        .map(p => (resolve) => likeAllPosts(p, null).then(r => resolve(r))));
}

module.exports = collectFacebookProfiles;

What the code could have been:

// Import required modules
const { JSDOM } = require('jsdom');
const fs = require('fs');
const path = require('path');
const glob = require('glob');
const assert = require('assert');
const importer = require('../Core');
const runSeleniumCell = importer.import('selenium cell');

// Base folder for saved conversations: the user's home directory (taken from
// the first environment variable that is set) plus 'Conversations'.
const PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE;
const project = path.join(PROFILE_PATH, 'Conversations');

// XPath helper shared by getAllFriends (which assigns it) and
// parseFacebookFriends (which calls it). Declared here so the assignment does
// not create an implicit global or throw a ReferenceError in strict mode.
let getAllXPath;

/**
 * Extracts the unique friend profile URLs via the module-level getAllXPath
 * helper (bound to a parsed friends page by getAllFriends).
 *
 * The file itself is not read here; it is only checked for existence so a
 * missing friends file yields an empty list.
 *
 * @param {string} file - path of the friends file the helper was bound to
 * @returns {Promise<string[]>} profile URLs without query strings, unique,
 *     in order of first appearance; [] when `file` does not exist
 */
async function parseFacebookFriends(file) {
    if (!fs.existsSync(file)) {
        return [];
    }
    const hrefs = await getAllXPath([
        '//a[contains(@href, "friends_tab")]/@href'
    ]);
    // Strip the query string first, then de-duplicate, so links that differ
    // only in their tracking parameters collapse into one profile.
    const profiles = hrefs.map(href => href.replace(/\?.*$/, ''));
    return [...new Set(profiles)];
}

/**
 * Logs in to Facebook, determines which friends have no
 * `<id>-posts-<year>.json` file yet, and likes the posts of each of them.
 *
 * @returns {Promise<*>} whatever importer.runAllPromises resolves with
 * @throws {AssertionError} when the friends file yields no friend links
 */
async function collectFacebookProfiles() {
    // likeAllPosts is only available from the selenium cells, so load them
    // and log in before doing anything else.
    const cells = await runSeleniumCell([
        'log in facebook',
        'like all facebook posts',
        'scrape facebook profile',
        'scrape facebook friends',
    ]);
    const { likeAllPosts } = cells;
    await cells.loginFacebook();

    // getAllFriends() parses the saved friends page and binds getAllXPath to
    // it, which parseFacebookFriends() depends on.
    await getAllFriends();

    // Resolve the same friends file getAllFriends() used (empty friends list
    // selects the "pick one file" branch of getNewestFriendFile).
    const friendFiles = glob.sync('**/*friend*', { cwd: project, nodir: true });
    const friendsFile = path.join(project, getNewestFriendFile([], friendFiles));
    const friends = await parseFacebookFriends(friendsFile);
    assert(friends.length > 0, friendsFile + ' should have friends links in it');

    // Keep only friends without a posts file for the current year.
    const existingPosts = new Set(glob.sync('**/*-posts-*.json', { cwd: project }));
    const year = (new Date()).getFullYear();
    const freshFriends = friends.filter(profile => {
        const profileId = profile.replace(/^\/|\/$/ig, '').split('/').pop()
            .replace(/[^a-z0-9]/ig, '_');
        return !existingPosts.has(profileId + '-posts-' + year + '.json');
    });

    // Scrape and like posts
    const tasks = freshFriends
        .map(profile => (resolve) => likeAllPosts(profile, null).then(r => resolve(r)));
    return importer.runAllPromises(tasks);
}

/**
 * Parses the friends file selected by getNewestFriendFile with jsdom, binds
 * getAllXPath to that document, and counts the friend links in it.
 *
 * Side effect: assigns getAllXPath (no local declaration on purpose, so that
 * parseFacebookFriends, which reads the same name, sees the new helper).
 *
 * @returns {Promise<number>} number of unique friend links found
 * @throws {AssertionError} when no friends file exists under `project`
 */
async function getAllFriends() {
    const friendFiles = glob.sync('**/*friend*', { cwd: project, nodir: true });
    const newest = getNewestFriendFile([], friendFiles);
    // Fail with a readable message instead of path.join's TypeError on undefined.
    assert(newest, 'no saved friends file found in ' + project);
    const file = path.join(project, newest);

    // readFileSync is synchronous; no await needed.
    const dom = new JSDOM(fs.readFileSync(file).toString());
    getAllXPath = importer.import('all elements xpath', {
        client: {
            // Run the supplied function locally against the jsdom document.
            execute: (func, ...args) => Promise.resolve({
                value: func.apply(dom.window.document, args)
            }),
            addCommand: () => {
            }
        },
        document: dom.window.document,
        XPathResult: {ORDERED_NODE_ITERATOR_TYPE: 5}
    });
    const friends = await parseFacebookFriends(file);
    return friends.length;
}

/**
 * Two modes, selected by `friends`:
 *
 * - `friends` empty: returns the entry of `files` (paths relative to
 *   `project`) with the latest modification time, or undefined when `files`
 *   is empty. `files` is not modified.
 * - `friends` non-empty: returns, for each friend URL, the expected
 *   `<project>/<id>-posts-<year>.json` path, keeping only those not present
 *   in `files`.
 *   NOTE(review): these paths are joined with `project`, so they only match
 *   `files` entries written the same way - confirm against callers.
 *
 * @param {string[]} [friends=[]] - friend profile URLs
 * @param {string[]} files - candidate file paths
 * @returns {string|undefined|string[]} see the two modes above
 */
function getNewestFriendFile(friends = [], files) {
    if (friends.length === 0) {
        let newest;
        let newestTime = -Infinity;
        for (const file of files) {
            const mtime = fs.statSync(path.join(project, file)).mtime.getTime();
            // ">=" so that among equal timestamps the later entry wins.
            if (mtime >= newestTime) {
                newest = file;
                newestTime = mtime;
            }
        }
        return newest;
    }
    const year = (new Date()).getFullYear();
    return friends
        .map(profile => profile.replace(/^\/|\/$/ig, '').split('/').pop())
        .map(id => path.join(project, id + '-posts-' + year + '.json'))
        .filter(file => !files.includes(file));
}

// Export the collector as this module's single entry point; nothing runs
// until the importing code calls it.
module.exports = collectFacebookProfiles;

This code is designed to scrape Facebook friend data and process it. Here's a breakdown: