This code imports data from IMDb TSV files (principals, titles, and names) and stores it in a structured SQLite database.
npm run import -- "Find cross section of actors and movies"
// TODO: use papa parse to load tsv files from imdb
var fs = require('fs');
var papa = require('papaparse');
var sqlite3 = require('better-sqlite3');
var importer = require('../Core');
var createTables = importer.import("create movie database tables");
var principals = '/Users/briancullinan/Downloads/title.principals.tsv';
var titles = '/Users/briancullinan/Downloads/title.basics.tsv';
var names = '/Users/briancullinan/Downloads/name.basics.tsv';
var movies = './movies.sqlite.db';
var db = new sqlite3(movies, {});
function importSQL(file, cb) {
if(typeof file === 'string') {
file = fs.createReadStream(file);
}
var prev = 0;
var count = 0;
return new Promise(resolve => papa.parse(file, {
worker: true,
delimiter: '\t',
fastMode: true,
skipEmptyLines: true,
chunk: function(results) {
count++;
if(count === 1 || count > prev + 1000) {
prev = count;
console.log(`${count} - ${results.data.length} - ${results.data[1]}`);
}
return results.data.map(r => cb(r));
},
complete: resolve
}));
}
function insertPrincipals(data) {
return db.prepare(`
INSERT OR IGNORE INTO principals
(tconst, ordering, nconst, category, job, characters)
VALUES (?,?,?,?,?,?)`).run(data);
}
function insertTitles(data) {
data = typeof data[0] !== 'object' ? [data] : data;
return db.prepare(`
INSERT OR IGNORE INTO titles
(tconst, titleType, primaryTitle, originalTitle, isAdult, startYear, endYear, runtimeMinutes, genres)
VALUES ${data.map(d => '(?,?,?,?,?,?,?,?,?)').join(',')}`).run([].concat.apply([], data));
}
function insertNames(data) {
data = typeof data[0] !== 'object' ? [data] : data;
return db.prepare(`
INSERT OR IGNORE INTO name
(nconst, primaryName, birthYear, deathYear, primaryProfession, knownForTitles)
VALUES ${data.map(d => '(?,?,?,?,?,?)').join(',')}`).run([].concat.apply([], data));
}
$.async();
createTables(db)
.then(() => importSQL(names, insertNames))
.then(r => $.sendResult(r))
.catch(e => $.sendError(e))
// close the database connection
.then(() => db.close())
/*
params = [[1,2],[3,4],[5,6],[7,8]]
stmt = db.prepare("INSERT INTO test (num1, num2) VALUES (?1,?2)");
stmt.all([params], function(err, res) {
...
});
*/
// Import required modules
const fs = require('fs');
const papa = require('papaparse');
const sqlite3 = require('better-sqlite3');
const importer = require('../Core');
// Define constants for file paths and database connection
const PRINCIPALS_FILE = '/Users/briancullinan/Downloads/title.principals.tsv';
const TITLES_FILE = '/Users/briancullinan/Downloads/title.basics.tsv';
const NAMES_FILE = '/Users/briancullinan/Downloads/name.basics.tsv';
const MOVIES_DB = './movies.sqlite.db';
// Import database schema creation function
const createTables = importer.import('create movie database tables');
// Create a new SQLite database connection
const db = new sqlite3(MOVIES_DB, {});
// Define a function to import data from a TSV file using Papa Parse
async function importTsvData(file, cb) {
try {
let fileStream = fs.createReadStream(file);
if (file.endsWith('.tsv')) {
fileStream = papa.parse(fileStream, {
worker: true,
delimiter: '\t',
fastMode: true,
skipEmptyLines: true,
});
}
// Process data in chunks to avoid memory issues
const promises = [];
const chunkSize = 1000;
// Process each chunk of data
fileStream.then((result) => {
let prev = 0;
let count = 0;
result.data.forEach((row) => {
count++;
if (count % chunkSize === 0 || count === result.data.length) {
prev = count;
promises.push(cb(row));
console.log(`${count} - ${result.data.length} - ${result.data[1]}`);
}
});
// Resolve the promise chain
return Promise.all(promises);
});
} catch (error) {
global.$.sendError(error);
}
}
// Define functions to insert data into the database
function insertPrincipals(data) {
const stmt = db.prepare(`
INSERT OR IGNORE INTO principals
(tconst, ordering, nconst, category, job, characters)
VALUES (?,?,?,?,?,?)
`);
return stmt.run(data);
}
function insertTitles(data) {
// Flatten the data array if it contains a single object
data = data.map((row) => row instanceof Object? [row] : row);
const stmt = db.prepare(`
INSERT OR IGNORE INTO titles
(tconst, titleType, primaryTitle, originalTitle, isAdult, startYear, endYear, runtimeMinutes, genres)
VALUES (?1,?2,?3,?4,?5,?6,?7,?8,?9)
`);
return data.map((row) => stmt.run(row));
}
function insertNames(data) {
// Flatten the data array if it contains a single object
data = data.map((row) => row instanceof Object? [row] : row);
const stmt = db.prepare(`
INSERT OR IGNORE INTO name
(nconst, primaryName, birthYear, deathYear, primaryProfession, knownForTitles)
VALUES (?1,?2,?3,?4,?5,?6)
`);
return data.map((row) => stmt.run(row));
}
// Create the database tables and import data
createTables(db)
.then(() => Promise.all([
importTsvData(NAMES_FILE, insertNames),
importTsvData(TITLES_FILE, insertTitles),
importTsvData(PRINCIPALS_FILE, insertPrincipals),
]))
.then((results) => {
global.$.sendResult(results);
})
.catch((error) => {
global.$.sendError(error);
})
.finally(() => {
// Close the database connection
db.close();
});
This code imports and processes data from IMDb TSV files into a SQLite database.
Here's a breakdown:
Dependencies:
fs
for file system operations, papaparse
for parsing TSV files, better-sqlite3
for SQLite database interaction, and utilities from ../Core
for creating database tables.File Paths:
principals
, titles
, names
) and the output SQLite database (movies
).Database Connection:
importSQL
Function:
fs.createReadStream
or directly if a string path is provided.papaparse
to parse the TSV file in chunks, providing progress updates.cb
) for each chunk of data, allowing for custom processing.Database Insertion Functions:
insertPrincipals
, insertTitles
, and insertNames
functions prepare and execute SQL INSERT statements to insert data into the respective database tables.Main Execution:
createTables
to create the necessary database tables.importSQL
to process each TSV file, inserting the data into the database.Let me know if you have any more questions!