From 91d6f2b864cff43dc4ba14fc3202e68c77ea3397 Mon Sep 17 00:00:00 2001 From: ChanderG Date: Thu, 26 Jun 2025 11:01:59 +0530 Subject: [PATCH 1/3] data pre-processor: add to_json flag to jinja handler add a new flag to cast the output of the jinja handler to json --- tuning/data/data_handlers.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tuning/data/data_handlers.py b/tuning/data/data_handlers.py index e5a133f18..d033d01f4 100644 --- a/tuning/data/data_handlers.py +++ b/tuning/data/data_handlers.py @@ -17,6 +17,7 @@ # Standard from enum import Enum from typing import Any, Dict, List, Union +import json # import copy import logging @@ -183,6 +184,7 @@ def __wrap_jinja_rendering_with_exception_handling(render_template: callable, ** def apply_custom_jinja_template( element: Dict[str, str], formatted_text_column_name: str, + to_json: bool, template: str, **kwargs, ): @@ -193,6 +195,7 @@ def apply_custom_jinja_template( formatted_text_column_name: Name of the dataset column where formatted text is to be saved. If doesn't exist a new column will be created. + to_json: whether to cast the output as a json template: Template to format data with. Features of Dataset should be referred to by {{key}}. Returns: @@ -212,7 +215,16 @@ def render(): env = SandboxedEnvironment(undefined=StrictUndefined) jinja_template = env.from_string(template) template_kwargs = {**tokenizer.special_tokens_map, **element} - return jinja_template.render(element=element, **template_kwargs) + res = jinja_template.render(element=element, **template_kwargs) + if to_json: + try: + # this can easily fail if the individual values in the template are not already json encoded + res = json.loads(res) + except json.decoder.JSONDecodeError as e: + raise RuntimeError( + "Column data not in expected json format: %s" % (res) + ) from e + return res return { f"{formatted_text_column_name}": __wrap_jinja_rendering_with_exception_handling( From 71c544820a9fffe7cc518cdfc8d2b76dac6909be Mon Sep 17 00:00:00 2001 From: ChanderG Date: Thu, 26 Jun 2025 15:38:41 +0530 Subject: [PATCH 2/3] move json conversion to stand-alone handler Signed-off-by: ChanderG --- tuning/data/data_handlers.py | 43 +++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/tuning/data/data_handlers.py b/tuning/data/data_handlers.py index d033d01f4..5a3896520 100644 --- a/tuning/data/data_handlers.py +++ b/tuning/data/data_handlers.py @@ -180,11 +180,37 @@ def __wrap_jinja_rendering_with_exception_handling(render_template: callable, ** f"Error occurred while rendering the provided Jinja template. {e}" ) from e +def column_to_json( + element: Dict[str, str], + column_name: str, + **kwargs, +): + """Convert a column to json format. Meant for columns that already are in json but of string type. + Expects to be run as a HF Map API function. + Args: + element: the HF Dataset element + column_name: the name of the column to convert + """ + + def render(): + inp = element[column_name] + try: + # this can easily fail if the individual values in the template are not already json encoded + res = json.loads(inp) + return res + except json.decoder.JSONDecodeError as e: + raise RuntimeError( + "Column data not in expected json format: %s" % (inp) + ) from e + + return { + f"{column_name}": render() + } + def apply_custom_jinja_template( element: Dict[str, str], formatted_text_column_name: str, - to_json: bool, template: str, **kwargs, ): @@ -195,7 +221,6 @@ def apply_custom_jinja_template( formatted_text_column_name: Name of the dataset column where formatted text is to be saved. If doesn't exist a new column will be created. - to_json: whether to cast the output as a json template: Template to format data with. Features of Dataset should be referred to by {{key}}. Returns: @@ -216,14 +241,6 @@ def render(): jinja_template = env.from_string(template) template_kwargs = {**tokenizer.special_tokens_map, **element} res = jinja_template.render(element=element, **template_kwargs) - if to_json: - try: - # this can easily fail if the individual values in the template are not already json encoded - res = json.loads(res) - except json.decoder.JSONDecodeError as e: - raise RuntimeError( - "Column data not in expected json format: %s" % (res) - ) from e return res return { @@ -674,4 +691,10 @@ def tokenize_and_apply_chat_template_with_masking( allows_batching=False, desc="Processing multimodal data", ), + "column_to_json": DataHandler( + op=column_to_json, + handler_type=DataHandlerType.MAP, + allows_batching=False, + desc="Convert column to json format", + ), } From 76c72723524c52ecac92631dba17a953eb071b34 Mon Sep 17 00:00:00 2001 From: ChanderG Date: Mon, 30 Jun 2025 10:00:47 +0530 Subject: [PATCH 3/3] Revert "move json conversion to stand-alone handler" This reverts commit 71c544820a9fffe7cc518cdfc8d2b76dac6909be. --- tuning/data/data_handlers.py | 43 +++++++++--------------------------- 1 file changed, 10 insertions(+), 33 deletions(-) diff --git a/tuning/data/data_handlers.py b/tuning/data/data_handlers.py index 5a3896520..d033d01f4 100644 --- a/tuning/data/data_handlers.py +++ b/tuning/data/data_handlers.py @@ -180,37 +180,11 @@ def __wrap_jinja_rendering_with_exception_handling(render_template: callable, ** f"Error occurred while rendering the provided Jinja template. {e}" ) from e -def column_to_json( - element: Dict[str, str], - column_name: str, - **kwargs, -): - """Convert a column to json format. Meant for columns that already are in json but of string type. - Expects to be run as a HF Map API function. - Args: - element: the HF Dataset element - column_name: the name of the column to convert - """ - - def render(): - inp = element[column_name] - try: - # this can easily fail if the individual values in the template are not already json encoded - res = json.loads(inp) - return res - except json.decoder.JSONDecodeError as e: - raise RuntimeError( - "Column data not in expected json format: %s" % (inp) - ) from e - - return { - f"{column_name}": render() - } - def apply_custom_jinja_template( element: Dict[str, str], formatted_text_column_name: str, + to_json: bool, template: str, **kwargs, ): @@ -221,6 +195,7 @@ def apply_custom_jinja_template( formatted_text_column_name: Name of the dataset column where formatted text is to be saved. If doesn't exist a new column will be created. + to_json: whether to cast the output as a json template: Template to format data with. Features of Dataset should be referred to by {{key}}. Returns: @@ -241,6 +216,14 @@ def render(): jinja_template = env.from_string(template) template_kwargs = {**tokenizer.special_tokens_map, **element} res = jinja_template.render(element=element, **template_kwargs) + if to_json: + try: + # this can easily fail if the individual values in the template are not already json encoded + res = json.loads(res) + except json.decoder.JSONDecodeError as e: + raise RuntimeError( + "Column data not in expected json format: %s" % (res) + ) from e return res return { @@ -691,10 +674,4 @@ def tokenize_and_apply_chat_template_with_masking( allows_batching=False, desc="Processing multimodal data", ), - "column_to_json": DataHandler( - op=column_to_json, - handler_type=DataHandlerType.MAP, - allows_batching=False, - desc="Convert column to json format", - ), }