From 7529a382dd76f8ada1ca84e6209842e2e9587ea4 Mon Sep 17 00:00:00 2001 From: Arsh Zahed Date: Wed, 4 Dec 2024 15:02:12 -0800 Subject: [PATCH 1/5] Add check for unneeded jsonl columns --- src/together/utils/files.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/together/utils/files.py b/src/together/utils/files.py index 7267ccbd..bcf11548 100644 --- a/src/together/utils/files.py +++ b/src/together/utils/files.py @@ -142,6 +142,18 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: error_source="format", ) + # Check that there are not extra columns + for column in json_line: + if ( + column + not in JSONL_REQUIRED_COLUMNS_MAP[possible_format] + ): + raise InvalidFileFormatError( + message=f"Found extra column {column} in the line {idx + 1}.", + line_number=idx + 1, + error_source="format", + ) + if current_format is None: raise InvalidFileFormatError( message=( From 2dd338dda1305bb4ffb4034616621ae82dc20232 Mon Sep 17 00:00:00 2001 From: Arsh Zahed Date: Wed, 4 Dec 2024 22:20:09 -0800 Subject: [PATCH 2/5] Add test_check_jsonl_extra_column --- tests/unit/test_files_checks.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py index 65f59f61..7abae4ad 100644 --- a/tests/unit/test_files_checks.py +++ b/tests/unit/test_files_checks.py @@ -279,3 +279,14 @@ def test_check_jsonl_wrong_turn_type(tmp_path: Path): "Invalid format on line 1 of the input file. 
Expected a dictionary" in report["message"] ) + + +def test_check_jsonl_extra_column(tmp_path: Path): + file = tmp_path / "extra_column.jsonl" + content = [{"text": "Hello, world!", "extra_column": "extra"}] + with file.open("w") as f: + f.write("\n".join(json.dumps(item) for item in content)) + + report = check_file(file) + assert not report["is_check_passed"] + assert "Found extra column" in report["message"] From 1313db2ac5caf33b428f895b381f90a6947af09a Mon Sep 17 00:00:00 2001 From: Arsh Zahed Date: Wed, 11 Dec 2024 11:02:03 -0800 Subject: [PATCH 3/5] Fix not -> no --- src/together/utils/files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/together/utils/files.py b/src/together/utils/files.py index bcf11548..4d2446b8 100644 --- a/src/together/utils/files.py +++ b/src/together/utils/files.py @@ -142,7 +142,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: error_source="format", ) - # Check that there are not extra columns + # Check that there are no extra columns for column in json_line: if ( column From 2e8650320676a97c137209b81d23f3f59d74be6e Mon Sep 17 00:00:00 2001 From: Arsh Zahed Date: Wed, 11 Dec 2024 11:12:00 -0800 Subject: [PATCH 4/5] Update invalid dataset error message --- src/together/utils/files.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/together/utils/files.py b/src/together/utils/files.py index 4d2446b8..1b620ac2 100644 --- a/src/together/utils/files.py +++ b/src/together/utils/files.py @@ -120,7 +120,8 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: raise InvalidFileFormatError( message=( f"Error parsing file. Invalid format on line {idx + 1} of the input file. " - 'Example of valid json: {"text": "my sample string"}. ' + "Datasets must follow text, conversational, or instruction format. 
For more " + "information, see https://docs.together.ai/docs/fine-tuning-data-preparation" ), line_number=idx + 1, error_source="line_type", From 784fd4fd2fa0042c42498f552d5b3bd28ac881f6 Mon Sep 17 00:00:00 2001 From: Arsh Zahed Date: Wed, 11 Dec 2024 11:18:38 -0800 Subject: [PATCH 5/5] update version, add quotes to error message --- pyproject.toml | 2 +- src/together/utils/files.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3911eb95..53d2739b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api" [tool.poetry] name = "together" -version = "1.3.6" +version = "1.3.7" authors = [ "Together AI " ] diff --git a/src/together/utils/files.py b/src/together/utils/files.py index 1b620ac2..6c5892f1 100644 --- a/src/together/utils/files.py +++ b/src/together/utils/files.py @@ -150,7 +150,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: not in JSONL_REQUIRED_COLUMNS_MAP[possible_format] ): raise InvalidFileFormatError( - message=f"Found extra column {column} in the line {idx + 1}.", + message=f'Found extra column "{column}" in the line {idx + 1}.', line_number=idx + 1, error_source="format", )