movie database | | Cell 1 | Search

This code imports data from IMDb TSV files (principals, titles, and names) and stores it in a structured SQLite database.

Run example

npm run import -- "Find cross section of actors and movies"

Find cross section of actors and movies

// TODO: use papa parse to load tsv files from imdb
var fs = require('fs');
var papa = require('papaparse');
var sqlite3 = require('better-sqlite3');
var importer = require('../Core');
var createTables = importer.import("create movie database tables");

var principals = '/Users/briancullinan/Downloads/title.principals.tsv';
var titles = '/Users/briancullinan/Downloads/title.basics.tsv';
var names = '/Users/briancullinan/Downloads/name.basics.tsv';
var movies = './movies.sqlite.db';

var db = new sqlite3(movies, {});

function importSQL(file, cb) {
    if(typeof file === 'string') {
        file = fs.createReadStream(file);
    }
    var prev = 0;
    var count = 0;
    return new Promise(resolve => papa.parse(file, {
        worker: true,
        delimiter: '\t',
        fastMode: true,
        skipEmptyLines: true,
        chunk: function(results) {
            count++;
            if(count === 1 || count > prev + 1000) {
                prev = count;
                console.log(`${count} - ${results.data.length} - ${results.data[1]}`);
            }
            
            return results.data.map(r => cb(r));
        },
        complete: resolve
    }));
}

function insertPrincipals(data) {
    return db.prepare(`
INSERT OR IGNORE INTO principals
    (tconst, ordering, nconst, category, job, characters)
VALUES (?,?,?,?,?,?)`).run(data);
}

function insertTitles(data) {
    data = typeof data[0] !== 'object' ? [data] : data;
    return db.prepare(`
INSERT OR IGNORE INTO titles
    (tconst, titleType, primaryTitle, originalTitle, isAdult, startYear, endYear, runtimeMinutes, genres)
VALUES ${data.map(d => '(?,?,?,?,?,?,?,?,?)').join(',')}`).run([].concat.apply([], data));
}

function insertNames(data) {
    data = typeof data[0] !== 'object' ? [data] : data;
    return db.prepare(`
INSERT OR IGNORE INTO name
    (nconst, primaryName, birthYear, deathYear, primaryProfession, knownForTitles)
VALUES ${data.map(d => '(?,?,?,?,?,?)').join(',')}`).run([].concat.apply([], data));
}

$.async();
createTables(db)
    .then(() => importSQL(names, insertNames))
    .then(r => $.sendResult(r))
    .catch(e => $.sendError(e))
    // close the database connection
    .then(() => db.close())



/*
params = [[1,2],[3,4],[5,6],[7,8]]

stmt = db.prepare("INSERT INTO test (num1, num2) VALUES (?1,?2)");
stmt.all([params], function(err, res) {
    ...
});
*/

What the code could have been:

// Import required modules
const fs = require('fs');
const papa = require('papaparse');
const sqlite3 = require('better-sqlite3');
const importer = require('../Core');

// Define constants for file paths and database connection
const PRINCIPALS_FILE = '/Users/briancullinan/Downloads/title.principals.tsv';
const TITLES_FILE = '/Users/briancullinan/Downloads/title.basics.tsv';
const NAMES_FILE = '/Users/briancullinan/Downloads/name.basics.tsv';
const MOVIES_DB = './movies.sqlite.db';

// Import database schema creation function
const createTables = importer.import('create movie database tables');

// Create a new SQLite database connection
const db = new sqlite3(MOVIES_DB, {});

// Define a function to import data from a TSV file using Papa Parse
async function importTsvData(file, cb) {
  try {
    let fileStream = fs.createReadStream(file);
    if (file.endsWith('.tsv')) {
      fileStream = papa.parse(fileStream, {
        worker: true,
        delimiter: '\t',
        fastMode: true,
        skipEmptyLines: true,
      });
    }

    // Process data in chunks to avoid memory issues
    const promises = [];
    const chunkSize = 1000;

    // Process each chunk of data
    fileStream.then((result) => {
      let prev = 0;
      let count = 0;
      result.data.forEach((row) => {
        count++;
        if (count % chunkSize === 0 || count === result.data.length) {
          prev = count;
          promises.push(cb(row));
          console.log(`${count} - ${result.data.length} - ${result.data[1]}`);
        }
      });

      // Resolve the promise chain
      return Promise.all(promises);
    });
  } catch (error) {
    global.$.sendError(error);
  }
}

// Define functions to insert data into the database
function insertPrincipals(data) {
  const stmt = db.prepare(`
    INSERT OR IGNORE INTO principals
    (tconst, ordering, nconst, category, job, characters)
    VALUES (?,?,?,?,?,?)
  `);
  return stmt.run(data);
}

function insertTitles(data) {
  // Flatten the data array if it contains a single object
  data = data.map((row) => row instanceof Object? [row] : row);

  const stmt = db.prepare(`
    INSERT OR IGNORE INTO titles
    (tconst, titleType, primaryTitle, originalTitle, isAdult, startYear, endYear, runtimeMinutes, genres)
    VALUES (?1,?2,?3,?4,?5,?6,?7,?8,?9)
  `);
  return data.map((row) => stmt.run(row));
}

function insertNames(data) {
  // Flatten the data array if it contains a single object
  data = data.map((row) => row instanceof Object? [row] : row);

  const stmt = db.prepare(`
    INSERT OR IGNORE INTO name
    (nconst, primaryName, birthYear, deathYear, primaryProfession, knownForTitles)
    VALUES (?1,?2,?3,?4,?5,?6)
  `);
  return data.map((row) => stmt.run(row));
}

// Create the database tables and import data
createTables(db)
 .then(() => Promise.all([
    importTsvData(NAMES_FILE, insertNames),
    importTsvData(TITLES_FILE, insertTitles),
    importTsvData(PRINCIPALS_FILE, insertPrincipals),
  ]))
 .then((results) => {
    global.$.sendResult(results);
  })
 .catch((error) => {
    global.$.sendError(error);
  })
 .finally(() => {
    // Close the database connection
    db.close();
  });

This code imports and processes data from IMDb TSV files into a SQLite database.

Here's a breakdown:

  1. Dependencies:

  2. File Paths:

  3. Database Connection:

  4. importSQL Function:

  5. Database Insertion Functions:

  6. Main Execution:

Let me know if you have any more questions!