From 76df0fe82017090cfbf82951cd5354fe0df89442 Mon Sep 17 00:00:00 2001 From: IsNoobGrammer Date: Mon, 13 Jan 2025 21:07:28 +0530 Subject: [PATCH 1/2] Add Support for cell-output and cell-metadata for notebook-files --- src/gitingest/notebook_utils.py | 21 ++++++++++++++++++++- src/gitingest/query_ingestion.py | 6 ++++-- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/gitingest/notebook_utils.py b/src/gitingest/notebook_utils.py index c5590341..84116082 100644 --- a/src/gitingest/notebook_utils.py +++ b/src/gitingest/notebook_utils.py @@ -6,7 +6,7 @@ from typing import Any -def process_notebook(file: Path) -> str: +def process_notebook(file: Path,parse_output_notebook:bool) -> str: """ Process a Jupyter notebook file and return an executable Python script as a string. @@ -45,6 +45,7 @@ def process_notebook(file: Path) -> str: notebook = worksheets[0] result = [] + cell_count=0 for cell in notebook["cells"]: cell_type = cell.get("cell_type") @@ -61,6 +62,24 @@ def process_notebook(file: Path) -> str: if cell_type in ("markdown", "raw"): str_ = f'"""\n{str_}\n"""' + # Extract Output from cell + if parse_output_notebook and (("outputs" in cell) and (cell["outputs"] != [])): + sample_output="" + for output in cell["outputs"]: + if output["output_type"] == "stream" and output["text"] != []: + sample_output += "".join(output["text"]) + "\n" + elif (output["output_type"] in ["execute_result","display_data"]) and ("data" in output) and ("text/plain" in output["data"]): + sample_output += "".join(output["data"]["text/plain"]) + "\n" + elif (output["output_type"]=="error" and ("evalue" in output) ): + sample_output += f"{output.get("ename","Error")} : " + "".join(output["evalue"]) + "\n" + str_ += f'\n# Output:\n"""{sample_output}"""\n' + + # Add Cell Info + cell_count+=1 + str_ = f"# Cell {cell_count} ; Type : ({cell_type})\n" + str_ + + + result.append(str_) return "\n\n".join(result) diff --git a/src/gitingest/query_ingestion.py b/src/gitingest/query_ingestion.py index 3396ca6e..7d2d4327 100644 --- a/src/gitingest/query_ingestion.py +++ b/src/gitingest/query_ingestion.py @@ -140,7 +140,7 @@ def _is_text_file(file_path: Path) -> bool: return False -def _read_file_content(file_path: Path) -> str: +def _read_file_content(file_path: Path , parse_output_notebook: bool = True) -> str: """ Read the content of a file. @@ -152,6 +152,8 @@ def _read_file_content(file_path: Path) -> str: ---------- file_path : Path The path to the file to read. + parse_output_notebook : bool + Whether to parse the output of the notebook-cells. Returns ------- @@ -160,7 +162,7 @@ def _read_file_content(file_path: Path) -> str: """ try: if file_path.suffix == ".ipynb": - return process_notebook(file_path) + return process_notebook(file_path, parse_output_notebook) with open(file_path, encoding="utf-8", errors="ignore") as f: return f.read() From 60d7660442168ff6046f16b3e63ad1f8f5a96f31 Mon Sep 17 00:00:00 2001 From: IsNoobGrammer Date: Mon, 13 Jan 2025 21:52:41 +0530 Subject: [PATCH 2/2] Add gitingest to valid domains --- src/gitingest/notebook_utils.py | 4 ++-- src/gitingest/query_ingestion.py | 4 ++-- src/gitingest/query_parser.py | 1 + tests/query_parser/test_query_parser.py | 5 +++++ 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/gitingest/notebook_utils.py b/src/gitingest/notebook_utils.py index 84116082..8ac7f5f9 100644 --- a/src/gitingest/notebook_utils.py +++ b/src/gitingest/notebook_utils.py @@ -6,7 +6,7 @@ from typing import Any -def process_notebook(file: Path,parse_output_notebook:bool) -> str: +def process_notebook(file: Path , parse_notebook_output: bool = True) -> str: """ Process a Jupyter notebook file and return an executable Python script as a string. @@ -63,7 +63,7 @@ def process_notebook(file: Path,parse_output_notebook:bool) -> str: str_ = f'"""\n{str_}\n"""' # Extract Output from cell - if parse_output_notebook and (("outputs" in cell) and (cell["outputs"] != [])): + if parse_notebook_output and (("outputs" in cell) and (cell["outputs"] != [])): sample_output="" for output in cell["outputs"]: if output["output_type"] == "stream" and output["text"] != []: diff --git a/src/gitingest/query_ingestion.py b/src/gitingest/query_ingestion.py index 7d2d4327..ce29effd 100644 --- a/src/gitingest/query_ingestion.py +++ b/src/gitingest/query_ingestion.py @@ -140,7 +140,7 @@ def _is_text_file(file_path: Path) -> bool: return False -def _read_file_content(file_path: Path , parse_output_notebook: bool = True) -> str: +def _read_file_content(file_path: Path , parse_notebook_output: bool = True) -> str: """ Read the content of a file. @@ -162,7 +162,7 @@ def _read_file_content(file_path: Path , parse_output_notebook: bool = True) -> """ try: if file_path.suffix == ".ipynb": - return process_notebook(file_path, parse_output_notebook) + return process_notebook(file_path, parse_notebook_output) with open(file_path, encoding="utf-8", errors="ignore") as f: return f.read() diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py index 78dd6cff..4d7f87a2 100644 --- a/src/gitingest/query_parser.py +++ b/src/gitingest/query_parser.py @@ -21,6 +21,7 @@ "bitbucket.org", "gitea.com", "codeberg.org", + "gitingest.com" ] diff --git a/tests/query_parser/test_query_parser.py b/tests/query_parser/test_query_parser.py index 0db65d3b..1134a097 100644 --- a/tests/query_parser/test_query_parser.py +++ b/tests/query_parser/test_query_parser.py @@ -17,6 +17,9 @@ async def test_parse_url_valid_https() -> None: "https://github.com/user/repo", "https://gitlab.com/user/repo", "https://bitbucket.org/user/repo", + "https://gitea.com/user/repo", + "https://codeberg.com/user/repo", + "https://gitingest.com/user/repo", ] for url in test_cases: result = await _parse_repo_source(url) @@ -34,6 +37,8 @@ async def test_parse_url_valid_http() -> None: "http://github.com/user/repo", "http://gitlab.com/user/repo", "http://bitbucket.org/user/repo", + "https://gitingest.com/user/repo", + "http://gitea.com/user/repo", ] for url in test_cases: result = await _parse_repo_source(url)