This code automates the process of scraping Facebook friend data using Selenium, storing it in a file, and providing a method to access the extracted friend URLs. It utilizes jsdom
for parsing the scraped HTML and custom modules for interacting with Facebook and managing data.
npm run import -- "collect facebook profiles"
var {JSDOM} = require('jsdom');
var fs = require('fs');
var path = require('path');
var glob = require('glob');
var assert = require('assert');
var importer = require('../Core');
var runSeleniumCell = importer.import("selenium cell");
// Helpers that are filled in later: loginFacebook, likeAllPosts and
// scrapeFacebookFriends are assigned from the runSeleniumCell result, and
// getAllXPath is assigned from importer.import inside collectFacebookProfiles.
var loginFacebook,
likeAllPosts,
getAllXPath,
scrapeFacebookFriends;
// Home directory: the first of HOME, HOMEPATH or USERPROFILE that is set.
var PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE;
// Directory that is searched for saved friends pages and per-profile posts files.
var project = PROFILE_PATH + '/Conversations';
/**
 * Collect the friend profile links from the document that `getAllXPath` is
 * currently bound to.
 *
 * Links are de-duplicated (first occurrence wins) and then have everything
 * from the first `?` onwards removed.
 *
 * @returns {Promise<string[]>} the cleaned friend links
 */
function parseFacebookFriends() {
    const friendLinkQueries = ['//a[contains(@href, "friends_tab")]/@href'];
    const isFirstOccurrence = (href, index, all) => all.indexOf(href) === index;
    const withoutQueryString = href => href.replace(/\?.*$/ig, '');

    return getAllXPath(friendLinkQueries)
        .then(hrefs => hrefs.filter(isFirstOccurrence).map(withoutQueryString));
}
// Path of the friends file most recently read by collectFacebookProfiles.
var FRIENDS_FILE;
/**
 * Log in to Facebook, read the most recently modified saved friends page
 * under `project`, and run `likeAllPosts` for one randomly chosen friend that
 * does not have a posts file for the current year yet.
 *
 * Side effects: assigns the module-level `loginFacebook`, `likeAllPosts`,
 * `scrapeFacebookFriends`, `getAllXPath` and `FRIENDS_FILE`, and logs the
 * progress and the chosen profile to the console.
 *
 * @returns {Promise} resolves with the result of `importer.runAllPromises`
 * @throws {AssertionError} (as a rejection) when no friends file exists or the
 *     newest one contains no friend links
 */
function collectFacebookProfiles() {
    let fresh;
    return runSeleniumCell([
        'log in facebook',
        'like all facebook posts',
        'scrape facebook profile',
        'scrape facebook friends',
    ])
        .then(r => {
            // publish the loaded helpers on the module-level variables
            ({loginFacebook, likeAllPosts, scrapeFacebookFriends} = r);
            return r.loginFacebook();
        })
        // TODO: abstract this data collection from JSDOM
        // TODO: only scrape once per day? use last file instead?
        //.then(() => scrapeFacebookFriends())
        .then(() => {
            const friends = glob.sync('**/*friend*', {cwd: project, nodir: true});
            assert(friends.length > 0, 'no saved friends file found in ' + project);
            // oldest first, so pop() yields the most recently modified file
            friends.sort((a, b) =>
                new Date(fs.statSync(path.join(project, a)).mtime).getTime()
                - new Date(fs.statSync(path.join(project, b)).mtime).getTime());
            FRIENDS_FILE = path.join(project, friends.pop());
            return fs.readFileSync(FRIENDS_FILE).toString();
        })
        .then(doc => {
            // call script to get all Facebook friends
            const dom = new JSDOM(doc);
            // Bind the xpath helper to the parsed file instead of a browser:
            // execute() runs the function with the jsdom document as `this`
            // and wraps the result in a resolved {value}.
            getAllXPath = importer.import("all elements xpath", {
                client: {
                    execute: (func, ...args) => Promise.resolve({
                        value: func.apply(dom.window.document, args)
                    }),
                    addCommand: () => {
                    }
                },
                document: dom.window.document,
                XPathResult: {ORDERED_NODE_ITERATOR_TYPE: 5}
            });
            return parseFacebookFriends();
        })
        .then(friends => {
            assert(friends.length > 0, FRIENDS_FILE + ' should have friends links in it');
            // use glob.sync to make sure every friend is hit at least once in a rotation
            const existingPosts = glob.sync('**/*-posts-*.json', {cwd: project});
            // TODO: way to tell which part of the URL is unique?
            const friendCount = friends.length;
            fresh = friends.filter(profile => {
                // last path segment of the link, with non-alphanumerics replaced by "_"
                const profileId = profile.replace(/^\/|\/$/ig, '').split('/').pop()
                    .replace(/[^a-z0-9]/ig, '_');
                // TODO: check for file.stat instead of year?
                return existingPosts.indexOf(profileId
                    + '-posts-' + (new Date()).getFullYear() + '.json') === -1;
            });
            const percent = Math.round((friendCount - fresh.length) / friendCount * 100);
            console.log((friendCount - fresh.length) + ' / ' + friendCount + ' : '
                + percent
                + '%');
            //return percent === 100 ? scrapeFacebookFriends() : [];
        })
        .then(() => {
            // Math.floor keeps the index inside [0, fresh.length); Math.round could
            // return fresh.length, which selects nothing, and skews the distribution.
            const rand = Math.floor(fresh.length * Math.random());
            const chosen = fresh.slice(rand, rand + 1);
            console.log(chosen);
            return importer.runAllPromises(chosen
                .map(p => (resolve) => likeAllPosts(p, null).then(r => resolve(r))));
        });
}
module.exports = collectFacebookProfiles;
// Import required modules
const { JSDOM } = require('jsdom');
const fs = require('fs');
const path = require('path');
const glob = require('glob');
const assert = require('assert');
const importer = require('../Core');
const runSeleniumCell = importer.import('selenium cell');
// Set project path based on environment variables
const PROFILE_PATH = process.env.HOME || process.env.HOMEPATH || process.env.USERPROFILE;
const project = path.join(PROFILE_PATH, 'Conversations');
/**
 * Collect the friend profile links from the document that `getAllXPath` is
 * currently bound to.
 *
 * Links are de-duplicated (first occurrence wins) and then have everything
 * from the first `?` onwards removed.
 *
 * @param {string} file - path of the friends file; when it does not exist an
 *     empty list is returned
 * @returns {Promise<string[]>} the cleaned friend links, or `[]`
 */
function parseFacebookFriends(file) {
    return getAllXPath([
        '//a[contains(@href, "friends_tab")]/@href'
    ])
        .then(friends => {
            const links = friends
                .filter((elem, pos, arr) => arr.indexOf(elem) === pos)
                .map(f => f.replace(/\?.*$/ig, ''));
            // `links` is a plain array, not a promise, so the existence check
            // is an ordinary expression rather than a chained .then().
            return fs.existsSync(file) ? links : [];
        });
}
/**
 * Log in to Facebook and run `likeAllPosts` for every friend in the saved
 * friends file that has no posts file for the current year yet.
 *
 * @returns {Promise} resolves with the result of `importer.runAllPromises`
 * @throws {AssertionError} when the friends file contains no friend links
 */
async function collectFacebookProfiles() {
    // likeAllPosts is provided by the selenium cells; load them before use.
    const cells = await runSeleniumCell([
        'log in facebook',
        'like all facebook posts',
        'scrape facebook profile',
        'scrape facebook friends',
    ]);
    const { likeAllPosts } = cells;
    await cells.loginFacebook();

    // getAllFriends binds getAllXPath to the saved friends page, which
    // parseFacebookFriends relies on.
    await getAllFriends();

    // The friends file is looked up among the saved friends pages; passing an
    // empty friends list makes getNewestFriendFile return a single file name.
    const friendFiles = glob.sync('**/*friend*', { cwd: project, nodir: true });
    const friendsFile = path.join(project, getNewestFriendFile([], friendFiles));
    const friends = await parseFacebookFriends(friendsFile);
    assert(friends.length > 0, friendsFile + ' should have friends links in it');

    // Keep only the friends without a posts file for this year
    const existingPosts = glob.sync('**/*-posts-*.json', { cwd: project });
    const postsSuffix = '-posts-' + (new Date()).getFullYear() + '.json';
    const freshFriends = friends.filter(profile => {
        // last path segment of the link, with non-alphanumerics replaced by "_"
        const profileId = profile.replace(/^\/|\/$/ig, '').split('/').pop()
            .replace(/[^a-z0-9]/ig, '_');
        return !existingPosts.includes(profileId + postsSuffix);
    });

    // Scrape and like posts
    const tasks = freshFriends
        .map(profile => (resolve) => likeAllPosts(profile, null).then(r => resolve(r)));
    return importer.runAllPromises(tasks);
}
/**
 * Parse the saved friends file chosen by `getNewestFriendFile` with jsdom,
 * bind `getAllXPath` to that document, and count the friend links in it.
 *
 * Side effect: reassigns the shared `getAllXPath` that parseFacebookFriends calls.
 *
 * @returns {Promise<number>} number of friend links found
 * @throws {AssertionError} when no friends file exists under `project`
 */
async function getAllFriends() {
    const candidates = glob.sync('**/*friend*', { cwd: project, nodir: true });
    const newest = getNewestFriendFile([], candidates);
    // fail with a readable message instead of path.join(undefined)
    assert(newest, 'no saved friends file found in ' + project);
    const file = path.join(project, newest);

    // readFileSync is synchronous, so there is nothing to await here
    const dom = new JSDOM(fs.readFileSync(file).toString());

    // execute() runs the function with the jsdom document as `this` and wraps
    // the result in a resolved {value}.
    getAllXPath = importer.import('all elements xpath', {
        client: {
            execute: (func, ...args) => Promise.resolve({
                value: func.apply(dom.window.document, args)
            }),
            addCommand: () => {
            }
        },
        document: dom.window.document,
        XPathResult: {ORDERED_NODE_ITERATOR_TYPE: 5}
    });

    const friends = await parseFacebookFriends(file);
    return friends.length;
}
/**
 * With an empty `friends` list: return the entry of `files` with the most
 * recent modification time (each entry is resolved against `project`), or
 * `undefined` when `files` is empty. `files` is not modified.
 *
 * With a non-empty `friends` list: return, for each friend link, the path of
 * its posts file for the current year under `project`, omitting paths that
 * already appear in `files`.
 * NOTE(review): these candidate paths are joined with `project`, so they only
 * match `files` entries written the same way — confirm what callers pass.
 *
 * @param {string[]} [friends=[]] - friend profile links
 * @param {string[]} files - file names to search or to compare against
 * @returns {string|undefined|string[]} see above
 */
function getNewestFriendFile(friends = [], files) {
    if (friends.length === 0) {
        let newest;
        let newestTime = -Infinity;
        for (const file of files) {
            const modified = fs.statSync(path.join(project, file)).mtime.getTime();
            // keep the latest mtime; on ties the later entry wins
            if (modified >= newestTime) {
                newest = file;
                newestTime = modified;
            }
        }
        return newest;
    }

    const postsSuffix = '-posts-' + (new Date()).getFullYear() + '.json';
    return friends
        .map(friend => friend.replace(/^\/|\/$/ig, '').split('/').pop())
        .map(profile => path.join(project, profile + postsSuffix))
        .filter(file => files.indexOf(file) === -1);
}
// Run the scraper
module.exports = collectFacebookProfiles;
This code is designed to scrape Facebook friend data and process it. Here's a breakdown:
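Setup:
- Imports `jsdom` for DOM manipulation, `fs` for file system operations, `path` for path manipulation, `glob` for finding files, `assert` for assertions, and custom modules from `../Core`.
- Defines the user's home directory (`PROFILE_PATH`) and the project's data directory (`project`).

`parseFacebookFriends` Function:
- Uses `getAllXPath` (imported from `../Core`) to extract Facebook friend URLs from a given HTML document.

`collectFacebookProfiles` Function:
- Uses `runSeleniumCell` (imported from `../Core`) to:
  - load the Facebook helper functions and log in to Facebook.
- Finds the most recently modified saved friends file using `glob` and `fs`.
- Parses that file with `jsdom`.
- Configures `getAllXPath` with a custom client that uses the `jsdom` instance.

Overall, this code:
- Reads a saved Facebook friends page and extracts the friend profile URLs from it with `getAllXPath`.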