facebook connections | | Cell 1 | Search

This code automates the process of scraping Facebook friend data using Selenium, storing it in a file, and providing a method to access the extracted friend URLs. It utilizes jsdom for parsing the scraped HTML and custom modules for interacting with Facebook and managing data.

Run example

npm run import -- "collect facebook profiles"

collect facebook profiles

var {JSDOM} = require('jsdom');
var fs = require('fs');
var path = require('path');
var glob = require('glob');
var assert = require('assert');
var importer = require('../Core');
var runSeleniumCell = importer.import("selenium cell");
// Cell functions resolved at run time: collectFacebookProfiles() assigns these
// three from the object that runSeleniumCell() resolves with.
var loginFacebook;
var likeAllPosts;
var scrapeFacebookFriends;
// XPath query helper; collectFacebookProfiles() binds it to a JSDOM document
// before parseFacebookFriends() uses it.
var getAllXPath;

// Home directory: the first non-empty value among HOME, HOMEPATH and USERPROFILE.
var PROFILE_PATH = process.env.HOME
    || process.env.HOMEPATH
    || process.env.USERPROFILE;
// Folder searched for the saved friends page and the per-friend "-posts-" files.
var project = `${PROFILE_PATH}/Conversations`;

/**
 * Collect the friend profile links from the document getAllXPath is bound to.
 *
 * Reads the href of every anchor whose href contains "friends_tab", strips the
 * query string from each link and removes duplicates.
 *
 * @returns {Promise<string[]>} unique profile URLs without query strings, in
 *     first-seen order
 */
function parseFacebookFriends() {
    return getAllXPath([
        '//a[contains(@href, "friends_tab")]/@href'
    ])
    .then(hrefs => {
        // Strip the query string BEFORE de-duplicating: two links to the same
        // profile that differ only in their query parameters must collapse
        // into a single entry.
        const profiles = hrefs.map(href => href.replace(/\?.*$/ig, ''));
        // A Set keeps first-seen order and avoids the quadratic indexOf scan.
        return [...new Set(profiles)];
    });
}

// Absolute path of the friends file read by the latest collectFacebookProfiles() run.
var FRIENDS_FILE;

/**
 * Log in to Facebook, read the friend list from the most recently modified
 * saved friends file and like the posts of one randomly chosen friend that has
 * no "<id>-posts-<year>.json" file for the current year yet.
 *
 * Side effects: assigns the module-level loginFacebook, likeAllPosts,
 * scrapeFacebookFriends, getAllXPath and FRIENDS_FILE bindings and logs
 * progress to the console.
 *
 * @returns {Promise<*>} whatever importer.runAllPromises resolves with for the
 *     selected profile (nothing is selected when every friend is already covered)
 * @throws rejects with an AssertionError when no friends file exists or the
 *     file contains no friend links
 */
function collectFacebookProfiles() {
    var fresh;
    return runSeleniumCell([
        'log in facebook',
        'like all facebook posts',
        'scrape facebook profile',
        'scrape facebook friends',
    ])
        .then(cells => {
            // Publish the cell functions to the module-level bindings, then log in.
            ({loginFacebook, likeAllPosts, scrapeFacebookFriends} = cells);
            return cells.loginFacebook();
        })
        // TODO: abstract this data collection from JSDOM
        // TODO: only scrape once per day? use last file instead?
        //.then(() => scrapeFacebookFriends())
        .then(() => {
            var candidates = glob.sync('**/*friend*', {cwd: project, nodir: true});
            // Without this guard path.join(project, undefined) would fail with
            // an unhelpful TypeError.
            assert(candidates.length > 0,
                'no saved friends file matching **/*friend* found in ' + project);
            var modifiedAt = file => fs.statSync(path.join(project, file)).mtime.getTime();
            // Oldest first, so pop() yields the most recently modified file.
            candidates.sort((a, b) => modifiedAt(a) - modifiedAt(b));
            FRIENDS_FILE = path.join(project, candidates.pop());
            return fs.readFileSync(FRIENDS_FILE).toString();
        })
        .then(doc => {
            // call script to get all Facebook friends
            var dom = new JSDOM(doc);
            // Minimal stand-in for the browser client: "execute" runs the
            // function locally with the JSDOM document as `this`.
            getAllXPath = importer.import("all elements xpath", {
                client: {
                    execute: (func, ...args) => Promise.resolve({
                        value: func.apply(dom.window.document, args)
                    }),
                    addCommand: () => {
                    }
                },
                document: dom.window.document,
                XPathResult: {ORDERED_NODE_ITERATOR_TYPE: 5}
            });
            return parseFacebookFriends();
        })
        .then(friends => {
            assert(friends.length > 0, FRIENDS_FILE + ' should have friends links in it');
            // use glob.sync to make sure every friend is hit at least once in a rotation
            // NOTE(review): glob returns paths relative to `project`, so post
            // files inside sub-folders never equal the bare file name compared
            // below; confirm all post files live directly in `project`.
            var existingPosts = glob.sync('**/*-posts-*.json', {cwd: project});
            // TODO: way to tell which part of the URL is unique?
            var friendCount = friends.length;
            var year = (new Date()).getFullYear();
            fresh = friends.filter(profile => {
                // Last path segment of the profile URL, made file-name safe.
                const profileId = profile.replace(/^\/|\/$/ig, '').split('/').pop()
                    .replace(/[^a-z0-9]/ig, '_');
                // TODO: check for file.stat instead of year?
                return existingPosts.indexOf(profileId + '-posts-' + year + '.json') === -1;
            });
            const covered = friendCount - fresh.length;
            const percent = Math.round(covered / friendCount * 100);
            console.log(covered + ' / ' + friendCount + ' : '
                + percent
                + '%');
            //return percent === 100 ? scrapeFacebookFriends() : [];
        })
        .then(() => {
            // Math.floor keeps the index inside [0, length - 1]. Math.round
            // could return `length`, which selected nothing.
            const rand = Math.floor(fresh.length * Math.random());
            const chosen = fresh.slice(rand, rand + 1);
            console.log(chosen);
            return importer.runAllPromises(chosen
                .map(p => (resolve) => likeAllPosts(p, null).then(r => resolve(r))));
        });
}
// Expose collectFacebookProfiles as this module's export; it is not invoked here.
module.exports = collectFacebookProfiles;

What the code could have been:

// Import required modules
const { JSDOM } = require('jsdom');
const fs = require('fs');
const path = require('path');
const glob = require('glob');
const assert = require('assert');
const importer = require('../Core');
const runSeleniumCell = importer.import('selenium cell');

// Set project path based on environment variables: the first non-empty value
// among HOME, HOMEPATH and USERPROFILE is used as the home directory.
const PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE;
const project = path.join(PROFILE_PATH, 'Conversations');

// XPath query helper. getAllFriends() assigns it and parseFacebookFriends()
// calls it, so it must be declared here rather than leak as an implicit global.
let getAllXPath;

/**
 * Parse the unique friend profile URLs from the document getAllXPath is bound to.
 *
 * Reads the href of every anchor whose href contains "friends_tab", strips the
 * query string from each link and removes duplicates.
 *
 * @param {string} file - path of the friends file; when it does not exist the
 *     result is an empty list
 * @returns {Promise<string[]>} unique profile URLs without query strings, in
 *     first-seen order
 */
function parseFacebookFriends(file) {
    return getAllXPath([
        '//a[contains(@href, "friends_tab")]/@href'
    ])
    .then(hrefs => {
        if (!fs.existsSync(file)) {
            return [];
        }
        // The array returned by map() is not a promise, so the result is
        // returned directly instead of chaining another .then() onto it.
        // Query strings are stripped before de-duplicating so links that differ
        // only in their parameters collapse into one profile.
        const profiles = hrefs.map(href => href.replace(/\?.*$/ig, ''));
        return [...new Set(profiles)];
    });
}

/**
 * Log in to Facebook, read the friend list from the saved friends file and like
 * the posts of every friend that has no "<id>-posts-<year>.json" file for the
 * current year yet.
 *
 * @returns {Promise<*>} whatever importer.runAllPromises resolves with for the
 *     selected profiles
 * @throws rejects with an AssertionError when the friends file yields no links
 */
async function collectFacebookProfiles() {
    // likeAllPosts is provided by the selenium cells; without this call the
    // name is never defined. The cell list mirrors the promise-based version.
    const cells = await runSeleniumCell([
        'log in facebook',
        'like all facebook posts',
        'scrape facebook profile',
        'scrape facebook friends',
    ]);
    await cells.loginFacebook();

    // getAllFriends() binds the XPath helper that parseFacebookFriends() needs;
    // the count it returns is not required here.
    await getAllFriends();
    const friendFiles = glob.sync('**/*friend*', { cwd: project, nodir: true });
    // An empty friends list selects the "pick a friends file" branch.
    const friendsFile = path.join(project, getNewestFriendFile([], friendFiles));
    const friends = await parseFacebookFriends(friendsFile);
    assert(friends.length > 0, friendsFile + ' should have friends links in it');

    // Keep only the friends without a posts file for the current year.
    const existingPosts = glob.sync('**/*-posts-*.json', { cwd: project });
    const year = (new Date()).getFullYear();
    const fresh = friends.filter(profile => {
        // Last path segment of the profile URL, made file-name safe.
        const profileId = profile.replace(/^\/|\/$/ig, '').split('/').pop()
            .replace(/[^a-z0-9]/ig, '_');
        return existingPosts.indexOf(profileId + '-posts-' + year + '.json') === -1;
    });

    // Scrape and like posts
    const tasks = fresh
        .map(profile => (resolve) => cells.likeAllPosts(profile, null).then(r => resolve(r)));
    return importer.runAllPromises(tasks);
}

/**
 * Bind the XPath helper to the saved friends file and count the friends in it.
 *
 * Side effect: assigns getAllXPath so that later parseFacebookFriends() calls
 * query the parsed file instead of a live browser.
 *
 * @returns {Promise<number>} number of friend links found in the file
 * @throws rejects with an AssertionError when no friends file is available
 */
async function getAllFriends() {
    // Call script to get all Facebook friends
    const candidates = glob.sync('**/*friend*', { cwd: project, nodir: true });
    const newest = getNewestFriendFile([], candidates);
    // Without this guard path.join(project, undefined) would fail with an
    // unhelpful TypeError.
    assert(typeof newest === 'string',
        'no saved friends file matching **/*friend* found in ' + project);
    const file = path.join(project, newest);
    // readFileSync is synchronous, so there is nothing to await here.
    const dom = new JSDOM(fs.readFileSync(file).toString());
    // Minimal stand-in for the browser client: "execute" runs the function
    // locally with the JSDOM document as `this`.
    getAllXPath = importer.import('all elements xpath', {
        client: {
            execute: (func, ...args) => Promise.resolve({
                value: func.apply(dom.window.document, args)
            }),
            addCommand: () => {
            }
        },
        document: dom.window.document,
        XPathResult: {ORDERED_NODE_ITERATOR_TYPE: 5}
    });
    const friends = await parseFacebookFriends(file);
    return friends.length;
}

/**
 * Pick the most recently modified file, or list the missing posts files.
 *
 * With an empty `friends` list, returns the entry of `files` with the latest
 * modification time. With a non-empty list, returns the posts-file paths for
 * those profiles that are not present in `files`.
 *
 * @param {string[]} [friends=[]] - profile URLs; leave empty to select a file
 * @param {string[]} files - file paths relative to the project directory
 * @returns {string|undefined|string[]} the newest entry of `files` (undefined
 *     when `files` is empty), or the array of missing posts-file paths
 */
function getNewestFriendFile(friends = [], files) {
    if (friends.length === 0) {
        // Scan for the latest mtime instead of sorting: an ascending sort
        // followed by [0] returned the OLDEST file and reordered the caller's
        // array in place.
        let newest;
        let newestTime = -Infinity;
        for (const file of files) {
            const modified = fs.statSync(path.join(project, file)).mtime.getTime();
            if (modified >= newestTime) {
                newest = file;
                newestTime = modified;
            }
        }
        return newest;
    }
    const year = (new Date()).getFullYear();
    // NOTE(review): these are absolute paths under `project`; they only match
    // entries of `files` if the caller passes absolute paths too. Confirm
    // against callers.
    return friends
        .map(profile => profile.replace(/^\/|\/$/ig, '').split('/').pop())
        .map(id => path.join(project, id + '-posts-' + year + '.json'))
        .filter(file => files.indexOf(file) === -1);
}

// Export the scraper entry point; the function is not invoked here.
module.exports = collectFacebookProfiles;

This code is designed to scrape Facebook friend data and process it. Here's a breakdown:

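  1. Setup: loads `jsdom`, `fs`, `path`, `glob`, `assert` and the project's `importer`, and resolves the `Conversations` folder inside the user's home directory.

  2. parseFacebookFriends Function: runs an XPath query for every link whose `href` contains "friends_tab", removes duplicate links and strips the query string from each one.

  3. collectFacebookProfiles Function: logs in to Facebook, reads the most recently modified friends file from the `Conversations` folder, parses it with JSDOM, filters out friends that already have a `<id>-posts-<year>.json` file for the current year, and likes the posts of one randomly chosen remaining friend.

Overall, this code rotates through the saved friend list so that, over repeated runs, every friend's profile is visited at least once per year.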