Skip to content

Commit b94c00f

Browse files
feat: add optional parameter to include notebook cell outputs in generated script
1 parent dd8f1e0 commit b94c00f

File tree

4 files changed

+201
-41
lines changed

4 files changed

+201
-41
lines changed

src/gitingest/exceptions.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,10 @@ class AlreadyVisitedError(Exception):
4949

5050
def __init__(self, path: str) -> None:
5151
super().__init__(f"Symlink target already visited: {path}")
52+
53+
54+
class InvalidNotebookError(Exception):
    """
    Exception raised when a Jupyter notebook is invalid or cannot be processed.

    Parameters
    ----------
    message : str
        Human-readable description of why the notebook was rejected.
    """

    def __init__(self, message: str) -> None:
        super().__init__(message)

src/gitingest/notebook_utils.py

Lines changed: 116 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,23 @@
22

33
import json
44
import warnings
5+
from itertools import chain
56
from pathlib import Path
67
from typing import Any
78

9+
from gitingest.exceptions import InvalidNotebookError
810

9-
def process_notebook(file: Path, include_output: bool = True) -> str:
    """
    Process a Jupyter notebook file and return an executable Python script as a string.

    Parameters
    ----------
    file : Path
        The path to the Jupyter notebook file.
    include_output : bool
        Whether to include cell outputs in the generated script, by default True.

    Returns
    -------
    str
        A header comment followed by one chunk per non-empty cell, separated by blank
        lines and terminated by a single newline.

    Raises
    ------
    InvalidNotebookError
        If the file is not valid JSON, its root is not a JSON object, or no ``cells``
        list can be located.
    ValueError
        If an unexpected cell type or output type is encountered while converting cells.
    """
    try:
        with file.open(encoding="utf-8") as f:
            notebook: dict[str, Any] = json.load(f)
    except json.JSONDecodeError as e:
        raise InvalidNotebookError(f"Invalid JSON in notebook: {file}") from e

    # Structural problems must surface as InvalidNotebookError so callers that only
    # catch that type (plus OSError) do not crash on a malformed notebook.
    if not isinstance(notebook, dict):
        raise InvalidNotebookError(f"Notebook root is not a JSON object: {file}")

    # Check if the notebook contains worksheets
    if worksheets := notebook.get("worksheets"):
        warnings.warn(
            "Worksheets are deprecated as of IPEP-17. Consider updating the notebook. "
            "(See: https://github.com/jupyter/nbformat and "
            "https://github.com/ipython/ipython/wiki/IPEP-17:-Notebook-Format-4#remove-multiple-worksheets "
            "for more information.)",
            DeprecationWarning,
        )

        if len(worksheets) > 1:
            warnings.warn("Multiple worksheets detected. Combining all worksheets into a single script.", UserWarning)

        # Flatten every worksheet's cells, in order, into one list.
        try:
            cells = list(chain.from_iterable(ws["cells"] for ws in worksheets))
        except (KeyError, TypeError) as e:
            raise InvalidNotebookError(f"Worksheet without a valid 'cells' list in notebook: {file}") from e

    else:
        try:
            cells = notebook["cells"]
        except KeyError as e:
            raise InvalidNotebookError(f"Missing 'cells' in notebook: {file}") from e

    result = ["# Jupyter notebook converted to Python script."]

    for cell in cells:
        # Empty cells yield None and are left out of the script.
        if cell_str := _process_cell(cell, include_output=include_output):
            result.append(cell_str)

    return "\n\n".join(result) + "\n"
64+
65+
66+
def _process_cell(cell: dict[str, Any], include_output: bool) -> str | None:
67+
"""
68+
Process a Jupyter notebook cell and return the cell content as a string.
4469
45-
notebook = worksheets[0]
70+
Parameters
71+
----------
72+
cell : dict[str, Any]
73+
The cell dictionary from a Jupyter notebook.
74+
include_output : bool
75+
Whether to include cell outputs in the generated script
76+
77+
Returns
78+
-------
79+
str | None
80+
The cell content as a string, or None if the cell is empty.
81+
82+
Raises
83+
------
84+
ValueError
85+
If an unexpected cell type is encountered.
86+
"""
87+
cell_type = cell["cell_type"]
4688

47-
result = []
89+
# Validate cell type and handle unexpected types
90+
if cell_type not in ("markdown", "code", "raw"):
91+
raise ValueError(f"Unknown cell type: {cell_type}")
4892

49-
for cell in notebook["cells"]:
50-
cell_type = cell.get("cell_type")
93+
cell_str = "".join(cell["source"])
5194

52-
# Validate cell type and handle unexpected types
53-
if cell_type not in ("markdown", "code", "raw"):
54-
raise ValueError(f"Unknown cell type: {cell_type}")
95+
# Skip empty cells
96+
if not cell_str:
97+
return None
98+
99+
# Convert Markdown and raw cells to multi-line comments
100+
if cell_type in ("markdown", "raw"):
101+
return f'"""\n{cell_str}\n"""'
102+
103+
# Add cell output as comments
104+
if include_output and (outputs := cell.get("outputs")):
105+
106+
# Include cell outputs as comments
107+
output_lines = []
108+
109+
for output in outputs:
110+
output_lines += _extract_output(output)
111+
112+
for output_line in output_lines:
113+
if not output_line.endswith("\n"):
114+
output_line += "\n"
115+
116+
cell_str += "\n# Output:\n# " + "\n# ".join(output_lines)
117+
118+
return cell_str
119+
120+
121+
def _extract_output(output: dict[str, Any]) -> list[str]:
122+
"""
123+
Extract the output from a Jupyter notebook cell.
124+
125+
Parameters
126+
----------
127+
output : dict[str, Any]
128+
The output dictionary from a Jupyter notebook cell.
129+
130+
Returns
131+
-------
132+
list[str]
133+
The output as a list of strings.
134+
135+
Raises
136+
------
137+
ValueError
138+
If an unknown output type is encountered.
139+
"""
140+
output_type = output["output_type"]
55141

56-
str_ = "".join(cell.get("source", []))
57-
if not str_:
58-
continue
142+
match output_type:
143+
case "stream":
144+
return output["text"]
59145

60-
# Convert Markdown and raw cells to multi-line comments
61-
if cell_type in ("markdown", "raw"):
62-
str_ = f'"""\n{str_}\n"""'
146+
case "execute_result" | "display_data":
147+
return output["data"]["text/plain"]
63148

64-
result.append(str_)
149+
case "error":
150+
return [f"Error: {output['ename']}: {output['evalue']}"]
65151

66-
return "\n\n".join(result)
152+
case _:
153+
raise ValueError(f"Unknown output type: {output_type}")

src/gitingest/query_ingestion.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,12 @@
66

77
import tiktoken
88

9-
from gitingest.exceptions import AlreadyVisitedError, MaxFileSizeReachedError, MaxFilesReachedError
9+
from gitingest.exceptions import (
10+
AlreadyVisitedError,
11+
InvalidNotebookError,
12+
MaxFileSizeReachedError,
13+
MaxFilesReachedError,
14+
)
1015
from gitingest.notebook_utils import process_notebook
1116

1217
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB
@@ -164,7 +169,7 @@ def _read_file_content(file_path: Path) -> str:
164169

165170
with open(file_path, encoding="utf-8", errors="ignore") as f:
166171
return f.read()
167-
except OSError as e:
172+
except (OSError, InvalidNotebookError) as e:
168173
return f"Error reading file: {e}"
169174

170175

tests/test_notebook_utils.py

Lines changed: 71 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -70,16 +70,11 @@ def test_process_notebook_with_worksheets(write_notebook):
7070
def test_process_notebook_multiple_worksheets(write_notebook):
7171
"""
7272
Test a notebook containing multiple 'worksheets'.
73-
74-
If multiple worksheets are present:
75-
- Only process the first sheet's cells.
76-
- DeprecationWarning for worksheets
77-
- UserWarning for ignoring extra worksheets
7873
"""
7974
multi_worksheets = {
8075
"worksheets": [
8176
{"cells": [{"cell_type": "markdown", "source": ["# First Worksheet"]}]},
82-
{"cells": [{"cell_type": "code", "source": ['print("Ignored Worksheet")']}]},
77+
{"cells": [{"cell_type": "code", "source": ["# Second Worksheet"]}]},
8378
]
8479
}
8580

@@ -93,15 +88,26 @@ def test_process_notebook_multiple_worksheets(write_notebook):
9388
nb_multi = write_notebook("multiple_worksheets.ipynb", multi_worksheets)
9489
nb_single = write_notebook("single_worksheet.ipynb", single_worksheet)
9590

96-
with pytest.warns(DeprecationWarning, match="Worksheets are deprecated as of IPEP-17."):
97-
with pytest.warns(UserWarning, match="Multiple worksheets are not supported."):
91+
with pytest.warns(
92+
DeprecationWarning, match="Worksheets are deprecated as of IPEP-17. Consider updating the notebook."
93+
):
94+
with pytest.warns(
95+
UserWarning, match="Multiple worksheets detected. Combining all worksheets into a single script."
96+
):
9897
result_multi = process_notebook(nb_multi)
9998

100-
with pytest.warns(DeprecationWarning, match="Worksheets are deprecated as of IPEP-17."):
99+
with pytest.warns(
100+
DeprecationWarning, match="Worksheets are deprecated as of IPEP-17. Consider updating the notebook."
101+
):
101102
result_single = process_notebook(nb_single)
102103

103104
# The second worksheet (with code) should now be included in the combined output
104-
assert result_multi == result_single, "Second worksheet was ignored, results match."
105+
assert result_multi != result_single, "The multi-worksheet notebook should have more content."
106+
assert len(result_multi) > len(result_single), "The multi-worksheet notebook should have more content."
107+
assert "# First Worksheet" in result_single, "First worksheet content should be present."
108+
assert "# Second Worksheet" not in result_single, "Second worksheet content should be absent."
109+
assert "# First Worksheet" in result_multi, "First worksheet content should be present."
110+
assert "# Second Worksheet" in result_multi, "Second worksheet content should be present."
105111

106112

107113
def test_process_notebook_code_only(write_notebook):
@@ -204,3 +210,58 @@ def test_process_notebook_invalid_cell_type(write_notebook):
204210

205211
with pytest.raises(ValueError, match="Unknown cell type: unknown"):
206212
process_notebook(nb_path)
213+
214+
215+
def test_process_notebook_with_output(write_notebook):
    """
    Test a notebook with code cells and outputs.

    The outputs should be included as comments if `include_output=True` (which is also
    the default) and left out if `include_output=False`.
    """
    notebook_content = {
        "cells": [
            {
                "cell_type": "code",
                "source": [
                    "import matplotlib.pyplot as plt\n",
                    "print('my_data')\n",
                    "my_data = [1, 2, 3, 4, 5]\n",
                    "plt.plot(my_data)\n",
                    "my_data",
                ],
                "outputs": [
                    {"output_type": "stream", "text": ["my_data"]},
                    {"output_type": "execute_result", "data": {"text/plain": ["[1, 2, 3, 4, 5]"]}},
                    {"output_type": "display_data", "data": {"text/plain": ["<Figure size 640x480 with 1 Axes>"]}},
                ],
            }
        ]
    }

    nb_path = write_notebook("with_output.ipynb", notebook_content)
    with_output = process_notebook(nb_path, include_output=True)
    without_output = process_notebook(nb_path, include_output=False)
    # Calling without the keyword exercises the parameter's default value.
    default_output = process_notebook(nb_path)

    expected_source = "\n".join(
        [
            "# Jupyter notebook converted to Python script.\n",
            "import matplotlib.pyplot as plt",
            "print('my_data')",
            "my_data = [1, 2, 3, 4, 5]",
            "plt.plot(my_data)",
            "my_data\n",
        ]
    )
    expected_output = "\n".join(
        [
            "# Output:",
            "# my_data",
            "# [1, 2, 3, 4, 5]",
            "# <Figure size 640x480 with 1 Axes>\n",
        ]
    )

    expected_combined = expected_source + expected_output

    assert with_output == expected_combined, "Expected source code and output as comments."
    assert without_output == expected_source, "Expected source code only."
    assert default_output == with_output, "Outputs should be included by default."

0 commit comments

Comments
 (0)