The get_cells function loads a Jupyter Notebook file, extracts its cells, and formats them into a list of dictionaries, each carrying the cell's own fields plus language, filename, and unique ID information. The function takes two parameters: the notebook path and a list of cell types to extract, and returns the formatted list of cells.
import json
import os
def get_cells(notebook_path, types=('*', 'code')):
    """Extract cells from a Jupyter Notebook with additional metadata.

    Args:
        notebook_path: Path to the notebook file; resolved to an absolute
            path before it is opened.
        types: Cell types to keep (for example ``'code'`` or ``'markdown'``).
            If the collection contains ``'*'`` every cell is kept. A single
            string is treated as a one-element collection.

    Returns:
        list: One dict per selected cell, containing all of the cell's own
        fields plus:

        * ``language`` - the cell's ``metadata.vscode.languageId``, else the
          notebook kernelspec's ``language``, else the notebook's
          ``language_info.name``, else ``''``.
        * ``filename`` - absolute path of the notebook.
        * ``id`` - ``"<notebook basename>[<index>]"``, where the index is the
          cell's position among all cells (before filtering). This replaces
          any ``id`` stored on the cell itself.

    Raises:
        OSError: If the notebook file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # A bare string would otherwise be tested by substring, and a cell
    # without a cell_type would raise TypeError on the `in` check.
    if isinstance(types, str):
        types = (types,)

    notebook_path = os.path.abspath(notebook_path)
    with open(notebook_path, 'r', encoding='utf-8') as f:
        notebook = json.load(f)

    # `or {}` tolerates keys that are present but null in the JSON.
    metadata = notebook.get('metadata') or {}
    kernel = metadata.get('kernelspec') or {}
    language_info = metadata.get('language_info') or {}
    # The notebook-wide fallback is identical for every cell: compute it once.
    default_language = kernel.get('language') or language_info.get('name', '')
    basename = os.path.basename(notebook_path)
    keep_all = '*' in types

    cells = []
    for i, cell in enumerate(notebook.get('cells') or []):
        if not (keep_all or cell.get('cell_type') in types):
            continue
        vscode = (cell.get('metadata') or {}).get('vscode') or {}
        cells.append({
            **cell,
            'language': vscode.get('languageId') or default_language,
            'filename': notebook_path,
            'id': f'{basename}[{i}]',
        })
    return cells
# Public API of this module. Python expects ``__all__`` to be a sequence of
# name strings; with a dict, ``from <module> import *`` raises TypeError in
# CPython because ``__all__`` is indexed by position during a star-import.
__all__ = ["get_cells"]
import json
import os
def load_notebook(notebook_path: str) -> dict:
    """Load a Jupyter Notebook from a file.

    Args:
        notebook_path (str): Path to the Jupyter Notebook; resolved to an
            absolute path before it is opened.

    Returns:
        dict: Parsed notebook content, or an empty dict if the file does not
        contain valid JSON (the decode error is printed, not raised).

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
    """
    notebook_path = os.path.abspath(notebook_path)
    with open(notebook_path, 'r', encoding='utf-8') as f:
        # Only the parse is guarded, so the handler cannot mask other errors.
        try:
            return json.load(f)
        except json.JSONDecodeError as e:
            # Best-effort: report the problem and hand back an empty notebook
            # so callers that use .get() lookups keep working.
            print(f"Error loading notebook: {e}")
            return {}
def get_kernel_metadata(notebook: dict) -> dict:
    """Extract kernel metadata from a Jupyter Notebook.

    Args:
        notebook (dict): Loaded Jupyter Notebook content.

    Returns:
        dict: The ``kernelspec`` entry of the notebook's ``metadata``, or an
        empty dict when either level is missing or null.
    """
    # `or {}` also covers keys that are present but hold null.
    metadata = notebook.get('metadata') or {}
    return metadata.get('kernelspec') or {}
def get_cell_metadata(cell: dict) -> dict:
    """Extract the descriptive entries of a Jupyter Notebook cell.

    Args:
        cell (dict): Cell content.

    Returns:
        dict: A new dict holding only the cell's ``cell_type`` entry and every
        entry whose key starts with ``metadata``; all other entries (such as
        ``source`` or ``outputs``) are left out. Values are not copied.
    """
    return {
        key: value
        for key, value in cell.items()
        if key == 'cell_type' or key.startswith('metadata')
    }
def get_cell_language(cell: dict, notebook: "dict | None" = None) -> str:
    """Determine the language of a Jupyter Notebook cell.

    The first non-empty value among the following is returned:

    1. the cell's ``metadata.vscode.languageId``;
    2. the cell's ``metadata.language_info.name``;
    3. the notebook's ``metadata.kernelspec.language``;
    4. the notebook's ``metadata.language_info.name``.

    Args:
        cell (dict): Cell content.
        notebook (dict, optional): Loaded notebook the cell belongs to. When
            omitted, only the cell's own metadata is consulted.

    Returns:
        str: Language of the cell, or ``''`` if none can be determined.
    """
    # .get() plus `or {}` keeps cells without (or with null) metadata from
    # raising; the kernel/language_info fallbacks live on the notebook, not
    # on the cell, so they are read from `notebook` when it is supplied.
    cell_meta = cell.get("metadata") or {}
    notebook_meta = (notebook or {}).get("metadata") or {}
    candidates = (
        (cell_meta.get("vscode") or {}).get("languageId"),
        (cell_meta.get("language_info") or {}).get("name"),
        (notebook_meta.get("kernelspec") or {}).get("language"),
        (notebook_meta.get("language_info") or {}).get("name"),
    )
    return next((language for language in candidates if language), "")
def extract_cells(notebook: dict, types: "list | tuple | str" = ('*', 'code'),
                  notebook_path: "str | None" = None) -> list:
    """Extract cells from a Jupyter Notebook with additional metadata.

    Args:
        notebook (dict): Loaded Jupyter Notebook content.
        types (list, optional): Types of cells to extract. Defaults to
            ('*', 'code'); a '*' entry selects every cell. A single string is
            treated as a one-element collection.
        notebook_path (str, optional): Location of the notebook file. When
            omitted, ``notebook['metadata']['path']`` is used.

    Returns:
        list: One dict per selected cell with all of the cell's own fields
        plus ``language``, ``filename`` (absolute notebook path) and ``id``
        (``"<notebook basename>[<index>]"``, index counted before filtering).

    Raises:
        KeyError: If at least one cell is selected but the notebook location
            is known neither from ``notebook_path`` nor from the metadata.
    """
    if isinstance(types, str):
        types = (types,)
    keep_all = '*' in types
    selected = [
        (i, cell)
        for i, cell in enumerate(notebook.get("cells") or [])
        if keep_all or cell.get("cell_type") in types
    ]
    # Nothing to annotate: as before, no path is needed in this case.
    if not selected:
        return []

    metadata = notebook.get('metadata') or {}
    if notebook_path is None:
        notebook_path = metadata.get('path')
    if notebook_path is None:
        raise KeyError(
            "notebook['metadata']['path'] is missing and no notebook_path was given"
        )

    # Loop-invariant values are computed once instead of once per cell.
    filename = os.path.abspath(notebook_path)
    basename = os.path.basename(notebook_path)
    # Notebook-wide fallback for cells that do not declare their own language.
    default_language = (
        (metadata.get('kernelspec') or {}).get('language')
        or (metadata.get('language_info') or {}).get('name', '')
    )
    return [
        {
            **cell,
            "language": get_cell_language(cell) or default_language,
            "filename": filename,
            "id": f"{basename}[{i}]",
        }
        for i, cell in selected
    ]
# Example usage:
if __name__ == "__main__":
    import sys

    # Take the notebook from the command line; the placeholder stays the
    # default so the example reads the same as before.
    notebook_path = sys.argv[1] if len(sys.argv) > 1 else 'path/to/notebook.ipynb'
    notebook = load_notebook(notebook_path)
    # extract_cells reads the notebook location from metadata['path'], which
    # load_notebook does not set, so record it before extracting.
    notebook.setdefault('metadata', {}).setdefault('path', notebook_path)
    cells = extract_cells(notebook)
    print(cells)
import json
import os
This code imports two Python libraries:
- json: for working with JSON data
- os: for interacting with the operating system (e.g., getting absolute paths)

get_cells
def get_cells(notebook_path, types=['*', 'code']):
This function takes two parameters:
- notebook_path: the path to a Jupyter Notebook file
- types: a list of cell types to extract; default is ['*', 'code']
notebook_path = os.path.abspath(notebook_path)
with open(notebook_path, 'r', encoding='utf-8') as f:
notebook = json.load(f)
This code:
- Converts notebook_path to its absolute path using os.path.abspath.
- Opens the notebook file as UTF-8 text and parses its JSON content using json.load.
kernel = notebook.get('metadata', {}).get('kernelspec', {})
This line retrieves the kernel metadata from the notebook, which is stored under the kernelspec key of the notebook's metadata; an empty dictionary is used when either key is missing.
cells = [
{
**cell,
"language": (cell.get("metadata", {}).get("vscode", {}).get("languageId") or
kernel.get("language") or
notebook.get("metadata", {}).get("language_info", {}).get("name", "")),
"filename": notebook_path,
"id": f"{os.path.basename(notebook_path)}[{i}]"
}
for i, cell in enumerate(notebook.get("cells", []))
if '*' in types or cell.get("cell_type") in types
]
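This code defines a list, cells, which contains dictionaries representing individual cells in the notebook. Each dictionary includes all of the cell's original fields, plus:
- language: determined based on several possible sources of language information (the cell's VS Code languageId, then the kernel's language, then the notebook's language_info name)
- filename: the path to the notebook file
- id: a unique identifier for the cell, built from the notebook's file name and the cell's index

The list comprehension iterates over the cells in the notebook, filtering them based on the specified types; a '*' entry in types selects every cell.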
return cells
Finally, the function returns the list of cell dictionaries.
__all__ = {
"get_cells": get_cells
}
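This statement declares get_cells as the module's public export. Note that an explicit import such as from module import get_cells works with or without __all__; __all__ only controls which names from module import * exports, and Python expects it to be a sequence of name strings (for example a list) rather than a dictionary.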