VectorInstitute · fatemetkl · Nov 12, 2025 · Nov 18, 2025 · Nov 18, 2025 · Nov 18, 2025
diff --git a/examples/ensemble_attack/configs/experiment_config.yaml b/examples/ensemble_attack/configs/experiment_config.yaml
@@ -1,34 +1,36 @@
 # Ensemble experiment configuration
-# This config can be used to run both the Ensemble attack training (``run_attack.py``) and testing phases (``tets_attack_model.py``).
-base_experiment_dir: examples/ensemble_attack/tabddpm_20k_experiment_data # Processed data, and experiment artifacts will be stored here
-base_data_config_dir: examples/ensemble_attack/data_configs # Training and data type configs are saved under this directory
+# This config can be used to run both the Ensemble attack training (``run_attack.py``) and testing phases (``test_attack_model.py``).
+base_experiment_dir: /projects/midst-experiments/ensemble_attack/tabddpm_10k_experiment_data/10k/ # Processed data, and experiment artifacts will be stored under this directory.
+base_data_config_dir: examples/ensemble_attack/data_configs # Training and data type configs are saved under this directory.
 
-# Pipeline control
+# Training Pipeline Control
 pipeline:
   run_data_processing: true # Set this to false if you have already saved the processed data
   run_shadow_model_training: true # Set this to false if shadow models are already trained and saved
   run_metaclassifier_training: true
 
 target_model: # This is only used for testing the attack on a real target model.
-  # This is for models trained on 20k data and generating 20k synthetic data
-  target_model_directory: /projects/midst-experiments/all_tabddpms/tabddpm_trained_with_20k/train/
+  target_model_directory: /projects/midst-experiments/all_tabddpms/tabddpm_trained_with_10k/test/
   target_model_id: 21  # Will be overridden per SLURM array task
   target_model_name: tabddpm_${target_model.target_model_id}
-  target_synthetic_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/synthetic_data/20k/20k.csv
+  target_synthetic_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/synthetic_data/10k/10k.csv
   challenge_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_with_id.csv
   challenge_label_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_label.csv
 
-  target_attack_artifact_dir: ${base_experiment_dir}/target_${target_model.target_model_id}_attack_artifacts/
-  attack_probabilities_result_path: ${target_model.target_attack_artifact_dir}/attack_model_${target_model.target_model_id}_proba
-  target_shadow_models_output_path: ${target_model.target_attack_artifact_dir}/tabddpm_${target_model.target_model_id}_shadows_dir
+  target_shadow_models_output_path: ${base_experiment_dir}/test_all_targets # Sub-directory to store test shadows and results
+  attack_probabilities_result_path: ${target_model.target_shadow_models_output_path}/test_probabilities/attack_model_${target_model.target_model_id}_proba
+  attack_rmia_shadow_training_data_choice: "combined" # Options: "combined", "only_challenge", "only_train". This determines which data to use for training RMIA attack model in testing phase.
+  # See select_challenge_data_for_training()'s docstring for more details.
 
 
 # Data paths
 data_paths:
-  midst_data_path: /projects/midst-experiments/all_tabddpms # Used to collect the data
-  population_path: ${base_experiment_dir}/population_data  # Path where the collected population data will be stored
-  processed_attack_data_path: ${base_experiment_dir}/attack_data # Path where the processed attack real train and evaluation data is stored
-  attack_evaluation_result_path: ${base_experiment_dir}/evaluation_results # Path where the attack evaluation results will be stored
+  midst_data_path: /projects/midst-experiments/all_tabddpms/ # Used to collect the data (input) as defined in data_processing_config
+  processed_base_data_dir: ${base_experiment_dir} # To save new processed data for training, or read from previously collected and processed data (testing phase).
+  population_path: ${data_paths.processed_base_data_dir}/population_data  # Path where the collected population data will be stored (output/input)
+  processed_attack_data_path: ${data_paths.processed_base_data_dir}/attack_data # Path where the processed attack real train and evaluation data is stored (output/input)
+  attack_evaluation_result_path: ${base_experiment_dir}/evaluation_results # Path where the attack (train phase) evaluation results will be stored (output)
+
 
 model_paths:
   metaclassifier_model_path: ${base_experiment_dir}/trained_models # Path where the trained metaclassifier model will be saved
@@ -38,23 +40,27 @@ model_paths:
 data_processing_config:
   population_attack_data_types_to_collect:
         [
-          "tabddpm_trained_with_20k",
+          "tabddpm_trained_with_10k",
         ]
   challenge_attack_data_types_to_collect:
         [
-          "tabddpm_trained_with_20k",
+          "tabddpm_trained_with_10k",
         ]
   population_splits: ["train"]  # Data splits to be collected for population data
-  challenge_splits: ["train"]  # Data splits to be collected for challenge points
+  challenge_splits: ["train" , "test"]  # Data splits to be collected for challenge points
+  original_population_data_path: /projects/midst-experiments/ensemble_attack/competition/population_data/ # This is where the original attack's population data (800k) will be read from, mainly to be used by DOMIAS
+  # You can download this data from https://github.com/CRCHUM-CITADEL/ensemble-mia/blob/main/input/population/population_all_with_challenge.csv
+
   # The column name in the data to be used for stratified splitting.
   column_to_stratify: "trans_type"  # Attention: This value is not documented in the original codebase.
-  folder_ranges: #Specify folder ranges for any of the mentioned splits.
-    train: [[1, 20]] # Folders to be used for train data collection in the experiments
+  folder_ranges: # Specify folder ranges for any of the mentioned splits.
+    train: [[1, 21]] # Folders to be used for train data collection in the experiments
+    test: [[21, 31] , [31, 41]]
   # File names in MIDST data directories.
   single_table_train_data_file_name: "train_with_id.csv"
   multi_table_train_data_file_name: "trans.csv"
   challenge_data_file_name: "challenge_with_id.csv"
-  population_sample_size: 40000 # Population size is the total data that your attack has access to.
+  population_sample_size: 20000 # Population size is the total data that your attack has access to.
   # In experiments, this is sampled out of all the collected training data in case the available data
   # is more than this number. Note that, half of this data is actually used for training, the other half
   # is used for evaluation. For example, with 40k population size, only 20k is used for training the attack model.
@@ -86,7 +92,7 @@ shadow_training:
     fine_tune_diffusion_iterations: 200000 # Original code: 200000
     fine_tune_classifier_iterations: 20000 # Original code: 20000
     pre_train_data_size: 60000 # Original code: 60000
-  number_of_points_to_synthesize: 20000 # Number of synthetic data samples to be generated by shadow models.
+  number_of_points_to_synthesize: 10000 # Number of synthetic data samples to be generated by shadow models.
   # Original code: 20000
 
 
@@ -104,7 +110,7 @@ metaclassifier:
   meta_classifier_model_name: ${metaclassifier.model_type}_metaclassifier_model
 
 attack_success_computation:
-  target_ids_to_test: [21,22,23] # List of target model IDs to compute the attack success for.
+  target_ids_to_test: [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40] # List of target model IDs to compute the attack success for.
 
 # General settings
 random_seed: 42 # Set to null for no seed, or an integer for a fixed seed
diff --git a/examples/ensemble_attack/real_data_collection.py b/examples/ensemble_attack/real_data_collection.py
@@ -4,12 +4,15 @@
 """
 
 from enum import Enum
+from logging import INFO
 from pathlib import Path
+from typing import Literal
 
 import pandas as pd
 from omegaconf import DictConfig
 
 from midst_toolkit.attacks.ensemble.data_utils import load_dataframe, save_dataframe
+from midst_toolkit.common.logger import log
 
 
 class AttackType(Enum):
@@ -29,6 +32,9 @@ class AttackType(Enum):
     TABDDPM_100K = "tabddpm_trained_with_100k"
 
 
+DatasetType = Literal["train", "challenge"]
+
+
 def expand_ranges(ranges: list[tuple[int, int]]) -> list[int]:
     """
     Reads a list of tuples representing ranges and expands them into a flat list of integers.
@@ -49,8 +55,8 @@ def expand_ranges(ranges: list[tuple[int, int]]) -> list[int]:
 def collect_midst_attack_data(
     attack_type: AttackType,
     data_dir: Path,
-    data_split: str,
-    dataset: str,
+    split_folder: str,
+    dataset: DatasetType,
     data_processing_config: DictConfig,
 ) -> pd.DataFrame:
     """
@@ -59,28 +65,29 @@ def collect_midst_attack_data(
     Args:
         attack_type: The attack setting.
         data_dir: The path where the data is stored.
-        data_split: Indicates if this is train, dev, or final data.
+        split_folder: Indicates the folder name to collect data from for a specific data split.
+            ``split_folder`` should exist under ``data_dir / attack_type.value`` and
+            f"{generation_name}_{i}" should be located under ``split_folder``.
         dataset: The dataset to be collected. Either "train" or "challenge".
         data_processing_config: Configuration dictionary containing data specific information.
 
     Returns:
         pd.DataFrame: The specified dataset in this setting.
     """
-    assert data_split in [
+    assert dataset in {
         "train",
-        "dev",
-        "final",
-    ], "data_split should be one of 'train', 'dev', or 'final'."
+        "challenge",
+    }, "Only 'train' and 'challenge' collection is supported."
     # `data_id` is the folder numbering of each training or challenge dataset,
     #  and is defined with the provided config.
-    data_id = expand_ranges(data_processing_config.folder_ranges[data_split])
+    data_id = expand_ranges(data_processing_config.folder_ranges[split_folder])
 
     # Get file name based on the kind of dataset to be collected (i.e. train vs challenge).
     # TODO: Make the below parsing a bit more robust and less brittle
     generation_name = attack_type.value.split("_")[0]
     if dataset == "challenge":
         file_name = data_processing_config.challenge_data_file_name
-    else:  # dataset == "train"
+    else:
         # Multi-table attacks have different file names.
         file_name = (
             data_processing_config.multi_table_train_data_file_name
@@ -90,7 +97,7 @@ def collect_midst_attack_data(
 
     df_real = pd.DataFrame()
     for i in data_id:
-        data_path_ith = data_dir / attack_type.value / data_split / f"{generation_name}_{i}"
+        data_path_ith = data_dir / attack_type.value / split_folder / f"{generation_name}_{i}"
         # Will raise FileNotFoundError if the file does not exist or if it is not a CSV file.
         df_real_ith = load_dataframe(data_path_ith, file_name)
         df_real = df_real_ith if df_real.empty else pd.concat([df_real, df_real_ith])
@@ -102,19 +109,24 @@ def collect_midst_attack_data(
 def collect_midst_data(
     midst_data_input_dir: Path,
     attack_types: list[AttackType],
-    data_splits: list[str],
-    dataset: str,
+    split_folders: list[str],
+    dataset: DatasetType,
     data_processing_config: DictConfig,
 ) -> pd.DataFrame:
     """
     Collect train or challenge data of the specified attack type from the provided data folders
-    in the MIDST competition.
+    in the MIDST competition. The data is collected from all the folders specified in the
+    ``split_folders`` argument under each attack type folder. For example, if ``split_folders``
+    contains `train` and `dev`, the function collects data from both `train` and `dev` folders
+    under each attack type folder. For more information about the data collection structure, see
+    the implementation of ``collect_midst_attack_data`` function.
 
     Args:
         midst_data_input_dir: The path where the MIDST data folders are stored.
         attack_types: List of attack types for data collection.
-        data_splits: A list indicating the data split to be collected.
-            Could be any of train, dev, or final data splits.
+        split_folders: A list indicating the folder names to collect data splits from. These folders should exist
+            under each attack type folder where we collect model's data from. For example, it could
+            contain strings like `train`, `dev`, `final`, or `test` based on the directory structure.
         dataset: The dataset to be collected. Either `train` or `challenge`.
         data_processing_config: Configuration dictionary containing data paths and file names.
 
@@ -124,16 +136,16 @@ def collect_midst_data(
     assert dataset in {"train", "challenge"}, "Only 'train' and 'challenge' collection is supported."
     population = []
     for attack_type in attack_types:
-        for data_split in data_splits:
+        for split_folder in split_folders:
             df_real = collect_midst_attack_data(
                 attack_type=attack_type,
                 data_dir=midst_data_input_dir,
-                data_split=data_split,
+                split_folder=split_folder,
                 dataset=dataset,
                 data_processing_config=data_processing_config,
             )
 
-        population.append(df_real)
+            population.append(df_real)
 
     return pd.concat(population).drop_duplicates()
 
@@ -142,26 +154,34 @@ def collect_population_data_ensemble(
     midst_data_input_dir: Path,
     data_processing_config: DictConfig,
     save_dir: Path,
+    base_population: pd.DataFrame | None = None,
     population_splits: list[str] | None = None,
     challenge_splits: list[str] | None = None,
 ) -> pd.DataFrame:
     """
     Collect the population data from the MIDST competition based on Ensemble Attack implementation.
     Returns real data population that consists of the train data of all the attacks
-    (black box and white box), and challenge points from `train`, `dev` and `final` of
-    "tabddpm_black_box" attack. The population data is saved in the provided path,
-    and returned as a dataframe.
+    (black box and white box) as specified in ``data_processing_config.population_attack_data_types_to_collect``,
+    and challenge points from `train`, `dev` and `final` of attacks as specified by
+    ``data_processing_config.challenge_attack_data_types_to_collect``. If ``base_population`` is not None,
+    the collected population data is concatenated with ``base_population`` so that it is large enough for
+    the attack (especially DOMIAS). The result is saved in the provided path and returned as a dataframe.
 
     Args:
         midst_data_input_dir: The path where the MIDST data folders are stored.
         data_processing_config: Configuration dictionary containing data information and file names.
         save_dir: The path where the collected population data should be saved.
-        population_splits: A list indicating the data splits to be collected for population data.
-            Could be any of `train`, `dev`, or `final` data splits. If None, the default list of ``["train"]``
+        base_population: A large dataframe to be concatenated with the collected population data
+            in this function. In experiments, the original attack's population data (800k records) collected by
+            the attacker team is used as the base population. This data is concatenated with the newly collected
+            population data to form a larger population for the attack (especially needed for DOMIAS). If None,
+            only the newly collected population data is used, which may not yield the expected attack performance.
+        population_splits: A list containing the folder names under attack folders that are
+            considered for population collection. If None, the default list of ``["train"]`` is set in the
+            function based on the original attack implementation.
+        challenge_splits: A list containing the folder names under attack folders that are
+            considered for challenge data collection. If None, the default list of ``["train", "dev", "final"]``
             is set in the function based on the original attack implementation.
-        challenge_splits: A list indicating the data splits to be collected for challenge points.
-            Could be any of `train`, `dev`, or `final` data splits. If None, the default list of
-            ``["train", "dev", "final"]`` is set in the function based on the original attack implementation.
 
     Returns:
         The collected population data as a dataframe.
@@ -176,17 +196,30 @@ def collect_population_data_ensemble(
         challenge_splits = ["train", "dev", "final"]
 
     # Ensemble Attack collects train data of all the attack types (black box and white box)
-    attack_names = data_processing_config.population_attack_data_types_to_collect
+    population_attack_names = data_processing_config.population_attack_data_types_to_collect
     # Provided attack name are valid based on AttackType enum
-    population_attack_types: list[AttackType] = [AttackType(attack_name) for attack_name in attack_names]
+    population_attack_types = [AttackType(attack_name) for attack_name in population_attack_names]
 
-    df_population = collect_midst_data(
+    df_population_experiment = collect_midst_data(
         midst_data_input_dir,
         population_attack_types,
-        data_splits=population_splits,
+        split_folders=population_splits,
         dataset="train",
         data_processing_config=data_processing_config,
     )
+
+    log(INFO, f"Collected experiment population data length before concatenation: {len(df_population_experiment)}")
+
+    if base_population is not None:
+        df_population = pd.concat([df_population_experiment, base_population]).drop_duplicates()
+        log(INFO, f"Concatenated population data length: {len(df_population)}")
+    else:
+        df_population = df_population_experiment
+        log(
+            INFO,
+            "base_population is None, only the newly collected population data is used.",
+        )
+
     # Drop ids.
     df_population_no_id = df_population.drop(columns=["trans_id", "account_id"])
     # Save the population data
@@ -195,13 +228,15 @@ def collect_population_data_ensemble(
 
     challenge_attack_names = data_processing_config.challenge_attack_data_types_to_collect
     challenge_attack_types = [AttackType(attack_name) for attack_name in challenge_attack_names]
+
     df_challenge = collect_midst_data(
         midst_data_input_dir,
         attack_types=challenge_attack_types,
-        data_splits=challenge_splits,
+        split_folders=challenge_splits,
         dataset="challenge",
         data_processing_config=data_processing_config,
     )
+    log(INFO, f"Collected challenge data length: {len(df_challenge)} from splits: {challenge_splits}")
     # Save the challenge points
     save_dataframe(df_challenge, save_dir, "challenge_points_all.csv")