Skip to content

Commit eee0764

Browse files
committed
refactor: Update notebook ingestion to use standard Jupytext format
- Allows easier back-and-forth conversion between notebooks and Python scripts
1 parent 4e259a0 commit eee0764

File tree

4 files changed

+93
-285
lines changed

4 files changed

+93
-285
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ dependencies = [
88
"click>=8.0.0",
99
"gitpython>=3.1.0",
1010
"httpx",
11+
"jupytext>=1.16.0",
1112
"loguru>=0.7.0",
1213
"pathspec>=0.12.1",
1314
"pydantic",

src/gitingest/utils/notebook.py

Lines changed: 22 additions & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22

33
from __future__ import annotations
44

5-
import json
6-
from itertools import chain
7-
from typing import TYPE_CHECKING, Any
5+
from typing import TYPE_CHECKING
6+
7+
import jupytext
8+
from jupytext.config import JupytextConfiguration
89

910
from gitingest.utils.exceptions import InvalidNotebookError
1011
from gitingest.utils.logging_config import get_logger
@@ -24,7 +25,8 @@ def process_notebook(file: Path, *, include_output: bool = True) -> str:
2425
file : Path
2526
The path to the Jupyter notebook file.
2627
include_output : bool
27-
Whether to include cell outputs in the generated script (default: ``True``).
28+
Whether to include cell outputs in the generated script (Not supported by Jupytext).
29+
This parameter is kept for backward compatibility but is ignored.
2830
2931
Returns
3032
-------
@@ -37,123 +39,24 @@ def process_notebook(file: Path, *, include_output: bool = True) -> str:
3739
If the notebook file is invalid or cannot be processed.
3840
3941
"""
40-
try:
41-
with file.open(encoding="utf-8") as f:
42-
notebook: dict[str, Any] = json.load(f)
43-
except json.JSONDecodeError as exc:
44-
msg = f"Invalid JSON in notebook: {file}"
45-
raise InvalidNotebookError(msg) from exc
46-
47-
# Check if the notebook contains worksheets
48-
worksheets = notebook.get("worksheets")
49-
if worksheets:
50-
logger.warning(
51-
"Worksheets are deprecated as of IPEP-17. Consider updating the notebook. "
52-
"(See: https://github.com/jupyter/nbformat and "
53-
"https://github.com/ipython/ipython/wiki/IPEP-17:-Notebook-Format-4#remove-multiple-worksheets "
54-
"for more information.)",
42+
if include_output:
43+
# Jupytext does not support including outputs in the generated script
44+
# We log a debug message to inform the user
45+
logger.debug(
46+
"Jupytext does not support including outputs in the generated script. 'include_output' is ignored."
5547
)
5648

57-
if len(worksheets) > 1:
58-
logger.warning(
59-
"Multiple worksheets detected. Combining all worksheets into a single script.",
60-
)
61-
62-
cells = list(chain.from_iterable(ws["cells"] for ws in worksheets))
63-
64-
else:
65-
cells = notebook["cells"]
66-
67-
result = ["# Jupyter notebook converted to Python script."]
68-
69-
for cell in cells:
70-
cell_str = _process_cell(cell, include_output=include_output)
71-
if cell_str:
72-
result.append(cell_str)
73-
74-
return "\n\n".join(result) + "\n"
75-
76-
77-
def _process_cell(cell: dict[str, Any], *, include_output: bool) -> str | None:
78-
"""Process a Jupyter notebook cell and return the cell content as a string.
79-
80-
Parameters
81-
----------
82-
cell : dict[str, Any]
83-
The cell dictionary from a Jupyter notebook.
84-
include_output : bool
85-
Whether to include cell outputs in the generated script.
86-
87-
Returns
88-
-------
89-
str | None
90-
The cell content as a string, or ``None`` if the cell is empty.
91-
92-
Raises
93-
------
94-
ValueError
95-
If an unexpected cell type is encountered.
96-
97-
"""
98-
cell_type = cell["cell_type"]
99-
100-
# Validate cell type and handle unexpected types
101-
if cell_type not in ("markdown", "code", "raw"):
102-
msg = f"Unknown cell type: {cell_type}"
103-
raise ValueError(msg)
104-
105-
cell_str = "".join(cell["source"])
106-
107-
# Skip empty cells
108-
if not cell_str:
109-
return None
110-
111-
# Convert Markdown and raw cells to multi-line comments
112-
if cell_type in ("markdown", "raw"):
113-
return f'"""\n{cell_str}\n"""'
114-
115-
# Add cell output as comments
116-
outputs = cell.get("outputs")
117-
if include_output and outputs:
118-
# Include cell outputs as comments
119-
raw_lines: list[str] = []
120-
for output in outputs:
121-
raw_lines += _extract_output(output)
122-
123-
cell_str += "\n# Output:\n# " + "\n# ".join(raw_lines)
124-
125-
return cell_str
126-
127-
128-
def _extract_output(output: dict[str, Any]) -> list[str]:
129-
"""Extract the output from a Jupyter notebook cell.
130-
131-
Parameters
132-
----------
133-
output : dict[str, Any]
134-
The output dictionary from a Jupyter notebook cell.
135-
136-
Returns
137-
-------
138-
list[str]
139-
The output as a list of strings.
140-
141-
Raises
142-
------
143-
ValueError
144-
If an unknown output type is encountered.
145-
146-
"""
147-
output_type = output["output_type"]
148-
149-
if output_type == "stream":
150-
return output["text"]
49+
try:
50+
# Read the notebook using jupytext
51+
notebook = jupytext.read(file)
15152

152-
if output_type in ("execute_result", "display_data"):
153-
return output["data"]["text/plain"]
53+
# Convert to Python script
54+
# using "py:percent" format to preserve cell structure
55+
config = JupytextConfiguration()
56+
# We can add more config here if needed
15457

155-
if output_type == "error":
156-
return [f"Error: {output['ename']}: {output['evalue']}"]
58+
return jupytext.writes(notebook, fmt="py:percent")
15759

158-
msg = f"Unknown output type: {output_type}"
159-
raise ValueError(msg)
60+
except Exception as exc:
61+
msg = f"Error processing notebook {file}: {exc}"
62+
raise InvalidNotebookError(msg) from exc

tests/conftest.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,40 @@ def write_notebook(tmp_path: Path) -> WriteNotebookFunc:
150150
"""
151151

152152
def _write_notebook(name: str, content: dict[str, Any]) -> Path:
153+
# Add minimal required fields for valid notebook v4
154+
if "nbformat" not in content:
155+
content["nbformat"] = 4
156+
if "nbformat_minor" not in content:
157+
content["nbformat_minor"] = 5
158+
if "metadata" not in content:
159+
content["metadata"] = {
160+
"kernelspec": {
161+
"display_name": "Python 3",
162+
"language": "python",
163+
"name": "python3",
164+
},
165+
"language_info": {
166+
"codemirror_mode": {"name": "ipython", "version": 3},
167+
"file_extension": ".py",
168+
"mimetype": "text/x-python",
169+
"name": "python",
170+
"nbconvert_exporter": "python",
171+
"pygments_lexer": "ipython3",
172+
"version": "3.8.0",
173+
},
174+
}
175+
176+
# Ensure cells have required fields
177+
if "cells" in content:
178+
for cell in content["cells"]:
179+
if "metadata" not in cell:
180+
cell["metadata"] = {}
181+
if cell["cell_type"] == "code":
182+
if "outputs" not in cell:
183+
cell["outputs"] = []
184+
if "execution_count" not in cell:
185+
cell["execution_count"] = None
186+
153187
notebook_path = tmp_path / name
154188
with notebook_path.open(mode="w", encoding="utf-8") as f:
155189
json.dump(content, f)

0 commit comments

Comments
 (0)