22
33import json
44import warnings
5+ from itertools import chain
56from pathlib import Path
67from typing import Any
78
9+ from gitingest .exceptions import InvalidNotebookError
810
def process_notebook(file: Path, include_output: bool = True) -> str:
    """
    Process a Jupyter notebook file and return an executable Python script as a string.

    Parameters
    ----------
    file : Path
        The path to the Jupyter notebook file.
    include_output : bool
        Whether to include cell outputs in the generated script, by default True.

    Returns
    -------
    str
        The executable Python script as a string.

    Raises
    ------
    InvalidNotebookError
        If the notebook file is invalid or cannot be processed (undecodable or
        malformed JSON, a non-object JSON root, or missing cell lists).
    ValueError
        If a cell type or output type is not recognised while processing a cell.
    """
    try:
        with file.open(encoding="utf-8") as f:
            notebook: dict[str, Any] = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # A file that is not valid UTF-8 cannot be valid notebook JSON either.
        raise InvalidNotebookError(f"Invalid JSON in notebook: {file}") from e

    # Valid JSON is not necessarily a notebook: the root must be an object.
    if not isinstance(notebook, dict):
        raise InvalidNotebookError(f"Notebook root is not a JSON object: {file}")

    # Check if the notebook contains worksheets
    worksheets = notebook.get("worksheets")

    if worksheets:
        # stacklevel=2 attributes the warnings to the caller, not to this module.
        warnings.warn(
            "Worksheets are deprecated as of IPEP-17. Consider updating the notebook. "
            "(See: https://github.com/jupyter/nbformat and "
            "https://github.com/ipython/ipython/wiki/IPEP-17:-Notebook-Format-4#remove-multiple-worksheets "
            "for more information.)",
            DeprecationWarning,
            stacklevel=2,
        )

        if len(worksheets) > 1:
            warnings.warn(
                "Multiple worksheets detected. Combining all worksheets into a single script.",
                UserWarning,
                stacklevel=2,
            )

    # Structural problems are reported with the documented exception type
    # instead of leaking a bare KeyError/TypeError to the caller.
    try:
        if worksheets:
            cells = list(chain.from_iterable(ws["cells"] for ws in worksheets))
        else:
            cells = notebook["cells"]
    except (KeyError, TypeError) as e:
        raise InvalidNotebookError(f"Notebook has no readable cells: {file}") from e

    result = ["# Jupyter notebook converted to Python script."]

    for cell in cells:
        # Empty cells yield None and are left out of the script.
        if cell_str := _process_cell(cell, include_output=include_output):
            result.append(cell_str)

    return "\n\n".join(result) + "\n"
64+
65+
66+ def _process_cell (cell : dict [str , Any ], include_output : bool ) -> str | None :
67+ """
68+ Process a Jupyter notebook cell and return the cell content as a string.
4469
45- notebook = worksheets [0 ]
70+ Parameters
71+ ----------
72+ cell : dict[str, Any]
73+ The cell dictionary from a Jupyter notebook.
74+ include_output : bool
75+ Whether to include cell outputs in the generated script
76+
77+ Returns
78+ -------
79+ str | None
80+ The cell content as a string, or None if the cell is empty.
81+
82+ Raises
83+ ------
84+ ValueError
85+ If an unexpected cell type is encountered.
86+ """
87+ cell_type = cell ["cell_type" ]
4688
47- result = []
89+ # Validate cell type and handle unexpected types
90+ if cell_type not in ("markdown" , "code" , "raw" ):
91+ raise ValueError (f"Unknown cell type: { cell_type } " )
4892
49- for cell in notebook ["cells" ]:
50- cell_type = cell .get ("cell_type" )
93+ cell_str = "" .join (cell ["source" ])
5194
52- # Validate cell type and handle unexpected types
53- if cell_type not in ("markdown" , "code" , "raw" ):
54- raise ValueError (f"Unknown cell type: { cell_type } " )
95+ # Skip empty cells
96+ if not cell_str :
97+ return None
98+
99+ # Convert Markdown and raw cells to multi-line comments
100+ if cell_type in ("markdown" , "raw" ):
101+ return f'"""\n { cell_str } \n """'
102+
103+ # Add cell output as comments
104+ if include_output and (outputs := cell .get ("outputs" )):
105+
106+ # Include cell outputs as comments
107+ output_lines = []
108+
109+ for output in outputs :
110+ output_lines += _extract_output (output )
111+
112+ for output_line in output_lines :
113+ if not output_line .endswith ("\n " ):
114+ output_line += "\n "
115+
116+ cell_str += "\n # Output:\n # " + "\n # " .join (output_lines )
117+
118+ return cell_str
119+
120+
121+ def _extract_output (output : dict [str , Any ]) -> list [str ]:
122+ """
123+ Extract the output from a Jupyter notebook cell.
124+
125+ Parameters
126+ ----------
127+ output : dict[str, Any]
128+ The output dictionary from a Jupyter notebook cell.
129+
130+ Returns
131+ -------
132+ list[str]
133+ The output as a list of strings.
134+
135+ Raises
136+ ------
137+ ValueError
138+ If an unknown output type is encountered.
139+ """
140+ output_type = output ["output_type" ]
55141
56- str_ = "" . join ( cell . get ( "source" , []))
57- if not str_ :
58- continue
142+ match output_type :
143+ case "stream" :
144+ return output [ "text" ]
59145
60- # Convert Markdown and raw cells to multi-line comments
61- if cell_type in ("markdown" , "raw" ):
62- str_ = f'"""\n { str_ } \n """'
146+ case "execute_result" | "display_data" :
147+ return output ["data" ]["text/plain" ]
63148
64- result .append (str_ )
149+ case "error" :
150+ return [f"Error: { output ['ename' ]} : { output ['evalue' ]} " ]
65151
66- return "\n \n " .join (result )
152+ case _:
153+ raise ValueError (f"Unknown output type: { output_type } " )
0 commit comments