
Commit 5eeec0e

feat: ODM without categories integration with fms-accel (#641)
ODM without categories and unit tests

Signed-off-by: romit <romit@ibm.com>
1 parent 717decc commit 5eeec0e

File tree

5 files changed: +114 −1 lines changed


tests/artifacts/predefined_data_configs/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -34,6 +34,9 @@
 DATA_CONFIG_MULTIPLE_DATASETS_SAMPLING_AND_SPLIT_YAML = os.path.join(
     PREDEFINED_DATA_CONFIGS, "multiple_datasets_with_sampling_and_split.yaml"
 )
+DATA_CONFIG_SINGLE_DATASET_ODM_YAML = os.path.join(
+    PREDEFINED_DATA_CONFIGS, "single_dataset_with_odm.yaml"
+)
 DATA_CONFIG_MULTIPLE_DATASETS_ODM_YAML = os.path.join(
     PREDEFINED_DATA_CONFIGS, "multiple_datasets_with_odm.yaml"
 )
tests/artifacts/predefined_data_configs/single_dataset_with_odm.yaml

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+dataprocessor:
+  type: odm
+  sampling_stopping_strategy: first_exhausted # ignored
+  seed: 66
+  odm:
+    update_interval: 1 # update every step
+    sampling_interval: 1 # sample category for every sample
+    reward_type: validation_loss # uses eval loss of each dataset as reward
+    gamma: 0.1 # MAB hyper-parameter
+    eta: 0.2 # MAB hyper-parameter
+    auto_categorize_input_column: "input" # Required: input field on which clustering is applied to form pseudo categories
+    auto_categorize_num_categories: "3" # Optional: number of categories for clustering;
+    # if not provided, this will be inferred based on dataset size
+datasets:
+  - name: dataset_1
+    split:
+      train: 0.8
+      validation: 0.2 # validation set is also used in ODM reward computation when reward_type is validation_loss.
+    data_paths:
+      - "FILE_PATH"
+    data_handlers:
+      - name: tokenize_and_apply_input_masking
+        arguments:
+          remove_columns: all
+          batched: false
+          fn_kwargs:
+            input_column_name: input
+            output_column_name: output

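The gamma and eta values above are hyper-parameters of the multi-armed-bandit (MAB) sampler that ODM runs over dataset categories: each category is an arm, and the per-category reward (derived here from validation loss) drives the updates. The commit does not show the plugin's actual update rule; purely as a hedged illustration, a textbook EXP3-style sampler would use gamma for uniform exploration and eta as the exponential-weights learning rate. Exp3Sampler below is a name invented for this sketch, not a plugin API.

# Hedged sketch only: a textbook EXP3 bandit over dataset categories.
# The fms-accel ODM plugin's real update rule may differ.
import math
import random


class Exp3Sampler:
    def __init__(self, num_arms, gamma=0.1, eta=0.2, seed=66):
        self.num_arms = num_arms
        self.gamma = gamma  # uniform-exploration mixing weight
        self.eta = eta      # exponential-weights learning rate
        self.weights = [1.0] * num_arms
        self.rng = random.Random(seed)

    def probabilities(self):
        total = sum(self.weights)
        # Mix the normalized weights with uniform exploration (gamma).
        return [(1 - self.gamma) * w / total + self.gamma / self.num_arms
                for w in self.weights]

    def sample(self):
        # Pick the category to draw the next training example from.
        return self.rng.choices(range(self.num_arms), weights=self.probabilities())[0]

    def update(self, arm, reward):
        # Importance-weighted reward estimate, then multiplicative update.
        p = self.probabilities()[arm]
        self.weights[arm] *= math.exp(self.eta * (reward / p) / self.num_arms)

With update_interval: 1 and sampling_interval: 1 as configured above, sample() and update() would effectively run on every training step.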
tests/test_sft_trainer.py

Lines changed: 69 additions & 0 deletions
@@ -51,6 +51,7 @@
     DATA_CONFIG_MULTITURN_GRANITE_3_1B_DATA_YAML,
     DATA_CONFIG_PRETOKENIZE_DATA_YAML,
     DATA_CONFIG_RENAME_SELECT_COLUMNS,
+    DATA_CONFIG_SINGLE_DATASET_ODM_YAML,
     DATA_CONFIG_SKIP_LARGE_COLUMNS_HANDLER,
     DATA_CONFIG_TOKENIZE_AND_APPLY_INPUT_MASKING_YAML,
     DATA_CONFIG_TOKENIZE_AND_TRAIN_WITH_HANDLER,
@@ -2575,3 +2576,71 @@ def test_online_data_mixing_plugin_sample_training_no_validation_split(
         "What length of trench,\n25 m broad and 15 m deep can be dug in 30 days ?"
         in output_inference
     ), f"{output_inference} does not include the prompt"
+
+
+@pytest.mark.skipif(
+    not is_fms_accelerate_available(plugins="odm"),
+    reason="Only runs if fms-accelerate is installed along with online-data-mixing plugin",
+)
+@pytest.mark.parametrize(
+    "datafile, datasetconfigname, reward_type",
+    [
+        (
+            NESTFUL_DATA_INPUT_OUTPUT_JSONL,
+            DATA_CONFIG_SINGLE_DATASET_ODM_YAML,
+            "entropy",
+        ),
+        (
+            TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL,
+            DATA_CONFIG_SINGLE_DATASET_ODM_YAML,
+            "entropy",
+        ),
+    ],
+)
+def test_online_data_mixing_plugin_with_auto_categorization(
+    datafile, datasetconfigname, reward_type
+):
+    """Ensure fms_acceleration_odm plugin trains with auto-categorization"""
+    with tempfile.TemporaryDirectory() as tempdir:
+        data_formatting_args = copy.deepcopy(DATA_ARGS)
+
+        # set training_data_path and response_template to none
+        data_formatting_args.response_template = None
+        data_formatting_args.training_data_path = None
+
+        # add data_paths in data_config file
+        with tempfile.NamedTemporaryFile(
+            "w", delete=False, suffix=".yaml"
+        ) as temp_yaml_file:
+            with open(datasetconfigname, "r", encoding="utf-8") as f:
+                data = yaml.safe_load(f)
+                data["dataprocessor"]["odm"]["reward_type"] = reward_type
+
+                for d, df in zip(data["datasets"], [datafile]):
+                    d["data_paths"] = [df]
+
+                yaml.dump(data, temp_yaml_file)
+        data_formatting_args.data_config_path = temp_yaml_file.name
+
+        train_args = copy.deepcopy(TRAIN_ARGS)
+        train_args.output_dir = tempdir
+        train_args.logging_strategy = "steps"
+        train_args.max_steps = 10
+        train_args.eval_strategy = "steps"
+        train_args.eval_steps = 1
+
+        sft_trainer.train(MODEL_ARGS, data_formatting_args, train_args)
+
+        # validate full ft configs
+        _validate_training(tempdir)
+        _, checkpoint_path = _get_latest_checkpoint_trainer_state(tempdir)
+
+        # Load the model
+        loaded_model = TunedCausalLM.load(checkpoint_path, MODEL_NAME)
+
+        # Run inference on the text
+        output_inference = loaded_model.run(
+            "### Text: @NortonSupport Thanks much.\n\n### Label:", max_new_tokens=50
+        )
+        assert len(output_inference) > 0
+        assert "### Text: @NortonSupport Thanks much.\n\n### Label:" in output_inference

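Both parametrized cases above override reward_type to "entropy" before training. The commit does not show how that reward is computed inside the plugin; purely as a labeled assumption, a mean predictive-entropy reward over a batch of logits could look like the sketch below (entropy_reward is a hypothetical helper, not a plugin API).

# Assumption-labeled sketch: mean per-token predictive entropy as a reward.
# This is NOT the plugin's confirmed implementation.
import torch
import torch.nn.functional as F


def entropy_reward(logits: torch.Tensor, attention_mask: torch.Tensor) -> float:
    # logits: (batch, seq_len, vocab); attention_mask: (batch, seq_len)
    log_probs = F.log_softmax(logits, dim=-1)
    # Shannon entropy of the predictive distribution at each position.
    token_entropy = -(log_probs.exp() * log_probs).sum(dim=-1)
    mask = attention_mask.to(token_entropy.dtype)
    # Average over non-padding tokens only.
    return ((token_entropy * mask).sum() / mask.sum()).item()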
tuning/config/acceleration_configs/odm.py

Lines changed: 5 additions & 1 deletion
@@ -14,7 +14,7 @@

 # Standard
 from dataclasses import dataclass
-from typing import Union
+from typing import Optional, Union

 # Local
 from .utils import ensure_nested_dataclasses_initialized, parsable_dataclass
@@ -29,6 +29,10 @@ class ODM:
     gamma: float = 0.1
     eta: float = 0.1
     resume_from_checkpoint: Union[bool, str] = False
+    auto_categorize_input_column: str = None
+    auto_categorize_num_categories: Optional[int] = None
+    auto_categorize_model_name: str = "Qwen/Qwen3-Embedding-0.6B"
+    auto_categorize_batch_size: int = 64


 @dataclass

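Taken together, the four new fields describe an auto-categorization step: embed the configured input column with Qwen/Qwen3-Embedding-0.6B in batches of auto_categorize_batch_size, then cluster the embeddings into auto_categorize_num_categories pseudo categories. The sketch below illustrates that idea; the library choices (sentence-transformers, scikit-learn) and the auto_categorize function name are assumptions for illustration, not confirmed dependencies or APIs of the plugin.

# Illustrative sketch only; libraries and function name are assumptions.
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans


def auto_categorize(texts, model_name="Qwen/Qwen3-Embedding-0.6B",
                    num_categories=3, batch_size=64, seed=66):
    # Embed each input row, then k-means the embeddings into pseudo categories.
    model = SentenceTransformer(model_name)
    embeddings = model.encode(texts, batch_size=batch_size)
    labels = KMeans(n_clusters=num_categories, random_state=seed).fit_predict(embeddings)
    return labels  # one pseudo-category id per input row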
tuning/data/setup_dataprocessor.py

Lines changed: 9 additions & 0 deletions
@@ -548,6 +548,14 @@ def setup_train_dataset_for_odm(
         processor=processor,
     )

+    auto_categorize_config = {}
+    if hasattr(odm_config.odm, "auto_categorize_input_column"):
+        auto_categorize_config = {
+            "input_column": "input_ids",
+            "num_categories": int(odm_config.odm.auto_categorize_num_categories),
+            "tokenizer": tokenizer,
+        }
+
     train_dataset = OnlineMixingDataset(
         train_dataset,
         collators,
@@ -560,6 +568,7 @@ def setup_train_dataset_for_odm(
         sampling_interval=odm_config.odm.sampling_interval,
         eval_batch_size=train_args.per_device_eval_batch_size,
         reward_type=odm_config.odm.reward_type,
+        auto_categorize_config=auto_categorize_config,
     )
     train_args.accelerator_config = {"dispatch_batches": False}
     return (True, train_dataset, data_collator)

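One caveat on the new guard: hasattr() is always True for a field declared on the ODM dataclass, so auto_categorize_config is populated even when the user never set auto_categorize_input_column, and int(...) raises TypeError when auto_categorize_num_categories is left as None. A defensive variant, offered only as a sketch and not part of this commit:

# Sketch: enable auto-categorization only when the column is actually set,
# and pass num_categories through as None so it can be inferred downstream.
auto_categorize_config = {}
if getattr(odm_config.odm, "auto_categorize_input_column", None) is not None:
    num_cats = odm_config.odm.auto_categorize_num_categories
    auto_categorize_config = {
        "input_column": "input_ids",
        "num_categories": int(num_cats) if num_cats is not None else None,
        "tokenizer": tokenizer,
    }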