igerber · igerber · Feb 19, 2026 · Feb 18, 2026 · Feb 18, 2026 · Feb 18, 2026
diff --git a/METHODOLOGY_REVIEW.md b/METHODOLOGY_REVIEW.md
@@ -428,15 +428,51 @@ variables appear to the left of the `|` separator.
 |-------|-------|
 | Module | `triple_diff.py` |
 | Primary Reference | Ortiz-Villavicencio & Sant'Anna (2025) |
-| R Reference | (forthcoming) |
-| Status | Not Started |
-| Last Review | - |
+| R Reference | `triplediff::ddd()` (v0.2.1, CRAN) |
+| Status | **Complete** |
+| Last Review | 2026-02-18 |
+
+**Verified Components:**
+- [x] ATT matches R `triplediff::ddd()` for all 3 methods (DR, REG, IPW) — <0.001% relative difference
+- [x] SE matches R `triplediff::ddd()` for all 3 methods — <0.001% relative difference
+- [x] With-covariates ATT matches R — <0.001% relative difference
+- [x] With-covariates SE matches R — <0.001% relative difference
+- [x] Verified across all 4 DGP types from `gen_dgp_2periods()` (different model misspecification scenarios)
+- [x] Influence function-based SE: `SE = std(w3*IF_3 + w2*IF_2 - w1*IF_1, ddof=1) / sqrt(n)`
+- [x] Three-DiD decomposition: `DDD = DiD_3 + DiD_2 - DiD_1` matching R's approach
+- [x] `safe_inference()` used for all inference fields (`t_stat`, `p_value`, `conf_int`)
 
 **Corrections Made:**
-- (None yet)
+1. **Complete rewrite of estimation methods** (was naive cell-mean approach, now three-DiD
+   decomposition). The original implementation computed DDD directly from 8 cell means with
+   a naive cell-variance SE. Replaced with R's decomposition into three pairwise DiD
+   comparisons (subgroup j vs reference subgroup 4), each using DR/IPW/RA methodology
+   from Callaway & Sant'Anna. This fixed:
+   - DR SE: was off by >100% (naive cell variance vs influence function)
+   - IPW SE: was off by >200% (incorrect cell-probability-ratio weights)
+   - With-covariates ATT: was off by >1000% for all methods (incorrect cell-by-cell regression)
+2. **Influence function SE** replaces naive cell variance for all methods:
+   `SE = std(w3*IF_3 + w2*IF_2 - w1*IF_1, ddof=1) / sqrt(n)` where
+   `w_j = n / n_j` and `IF_j` is the per-observation influence function for pairwise DiD j.
+3. **Propensity score estimation** now runs per-pairwise-comparison (P(subgroup=4|X) within
+   {j, 4} subset) instead of global P(G=1|X).
+4. **Outcome regression** now fits separate OLS per subgroup-time cell within each pairwise
+   comparison, matching R's `compute_outcome_regression_rc()`.
 
 **Outstanding Concerns:**
-- (None yet)
+- Implementation uses `panel=FALSE` (repeated cross-section) mode. Panel mode (`panel=TRUE`)
+  with differenced outcomes is not yet implemented.
+
+**R Comparison Results (panel=FALSE, n=500 per DGP):**
+| DGP | Method | Covariates | ATT Diff | SE Diff |
+|-----|--------|-----------|----------|---------|
+| 1 | DR | No | <0.001% | <0.001% |
+| 1 | DR | Yes | <0.001% | <0.001% |
+| 1 | REG | No | <0.001% | <0.001% |
+| 1 | REG | Yes | <0.001% | <0.001% |
+| 1 | IPW | No | <0.001% | <0.001% |
+| 1 | IPW | Yes | <0.001% | <0.001% |
+| 2-4 | All | Both | <0.001% | <0.001% |
 
 ---
 

diff --git a/benchmarks/R/benchmark_triplediff.R b/benchmarks/R/benchmark_triplediff.R
@@ -0,0 +1,105 @@
+#!/usr/bin/env Rscript
+# Benchmark: Triple Difference (R `triplediff` package)
+#
+# This uses triplediff::ddd() with panel=FALSE (repeated cross-section mode),
+# matching the Python TripleDifference estimator's approach.
+#
+# Usage:
+#   Rscript benchmark_triplediff.R --data path/to/data.csv --output path/to/results.json \
+#     [--method dr|reg|ipw] [--covariates true|false]
+
+library(triplediff)
+library(jsonlite)
+library(data.table)
+
+# Parse command line arguments
+args <- commandArgs(trailingOnly = TRUE)
+
+# Parse `--flag value` pairs from the command line into a config list.
+#
+# Args:
+#   args: character vector of command-line arguments, e.g. the result of
+#     commandArgs(trailingOnly = TRUE).
+#
+# Returns:
+#   A list with elements `data` and `output` (both required), `method`
+#   (default "dr") and `covariates` (logical, default FALSE; TRUE only when
+#   the supplied value is "true", compared case-insensitively).
+#
+# Stops with an error when a recognised flag has no value after it, when
+# --method is not one of dr/reg/ipw, or when --data / --output is absent.
+# Unrecognised arguments are skipped.
+parse_args <- function(args) {
+  usage <- "Usage: Rscript benchmark_triplediff.R --data <path> --output <path> [--method dr|reg|ipw] [--covariates true|false]"
+  value_flags <- c("--data", "--output", "--method", "--covariates")
+  valid_methods <- c("dr", "reg", "ipw")
+
+  result <- list(
+    data = NULL,
+    output = NULL,
+    method = "dr",
+    covariates = FALSE
+  )
+
+  i <- 1
+  while (i <= length(args)) {
+    flag <- args[i]
+    if (!(flag %in% value_flags)) {
+      i <- i + 1
+      next
+    }
+
+    # Indexing past the end of `args` yields NA rather than an error; an NA
+    # path would pass the is.null() check below and an NA `covariates` would
+    # break later `if` conditions, so reject a trailing flag here.
+    if (i + 1 > length(args)) {
+      stop(sprintf("Missing value for %s\n%s", flag, usage), call. = FALSE)
+    }
+    value <- args[i + 1]
+
+    if (flag == "--data") {
+      result$data <- value
+    } else if (flag == "--output") {
+      result$output <- value
+    } else if (flag == "--method") {
+      if (!(value %in% valid_methods)) {
+        stop(
+          sprintf("Unsupported --method '%s'\n%s", value, usage),
+          call. = FALSE
+        )
+      }
+      result$method <- value
+    } else {
+      # --covariates: anything other than "true" is treated as FALSE.
+      result$covariates <- tolower(value) == "true"
+    }
+    i <- i + 2
+  }
+
+  if (is.null(result$data) || is.null(result$output)) {
+    stop(usage, call. = FALSE)
+  }
+
+  result
+}
+
+config <- parse_args(args)
+
+# Read the input file named by --data.
+message(sprintf("Loading data from: %s", config$data))
+data <- fread(config$data)
+
+# Covariates are the columns whose names start with "cov". They enter the
+# model only when --covariates true was given and at least one such column
+# exists; otherwise an intercept-only formula is used.
+cov_cols <- names(data)[grepl("^cov", names(data))]
+use_covariates <- config$covariates && length(cov_cols) > 0
+if (!use_covariates) {
+  xformla <- ~1
+  message("No covariates")
+} else {
+  rhs <- paste(cov_cols, collapse = "+")
+  xformla <- as.formula(paste("~", rhs))
+  message(sprintf("Using covariates: %s", paste(cov_cols, collapse = ", ")))
+}
+
+# Time a single ddd() call in repeated cross-section mode (panel = FALSE),
+# without bootstrap.
+message(sprintf("Running DDD estimation (method=%s, panel=FALSE)...", config$method))
+timing <- system.time(
+  res <- ddd(
+    data = data,
+    yname = "y",
+    tname = "time",
+    idname = "id",
+    gname = "state",
+    pname = "partition",
+    xformla = xformla,
+    est_method = config$method,
+    control_group = "nevertreated",
+    panel = FALSE,
+    boot = FALSE
+  )
+)
+elapsed <- timing["elapsed"]
+
+# Gather the estimate, its standard error and confidence bounds, plus run
+# metadata, for serialisation.
+output <- list(
+  ATT = res$ATT,
+  se = res$se,
+  lci = res$lci,
+  uci = res$uci,
+  method = config$method,
+  covariates = config$covariates,
+  n_obs = nrow(data),
+  elapsed_seconds = elapsed
+)
+
+# Serialise to JSON at the --output path.
+message(sprintf("Writing results to: %s", config$output))
+json_text <- toJSON(output, pretty = TRUE, auto_unbox = TRUE, digits = 15)
+write(json_text, config$output)
+
+message("Done.")
+message(sprintf("  ATT = %.6f", res$ATT))
+message(sprintf("  SE  = %.6f", res$se))
+message(sprintf("  Time: %.3fs", elapsed))
diff --git a/benchmarks/R/requirements.R b/benchmarks/R/requirements.R
@@ -10,6 +10,7 @@ required_packages <- c(
   "didimputation", # Borusyak, Jaravel & Spiess (2024) imputation DiD
   "HonestDiD",     # Rambachan & Roth (2023) sensitivity analysis
   "fixest",        # Fast TWFE and basic DiD
+  "triplediff",    # Ortiz-Villavicencio & Sant'Anna (2025) triple difference
 
   # Utilities
   "jsonlite",      # JSON output for Python interop

diff --git a/benchmarks/data/synthetic/ddd_r_results.json b/benchmarks/data/synthetic/ddd_r_results.json
@@ -0,0 +1,146 @@
+{
+  "dgp1_dr_nocov": {
+    "ATT": -4.713891309648176,
+    "se": 15.32210646783081,
+    "lci": -34.744668153884774,
+    "uci": 25.316885534588423
+  },
+  "dgp1_dr_cov": {
+    "ATT": -0.370943148650857,
+    "se": 0.3629174762792657,
+    "lci": -1.082248331518387,
+    "uci": 0.340362034216673
+  },
+  "dgp1_reg_nocov": {
+    "ATT": -4.713891309648488,
+    "se": 15.322106467830805,
+    "lci": -34.74466815388507,
+    "uci": 25.3168855345881
+  },
+  "dgp1_reg_cov": {
+    "ATT": -0.3648022796066925,
+    "se": 12.25511177194079,
+    "lci": -24.384379979123477,
+    "uci": 23.65477541991009
+  },
+  "dgp1_ipw_nocov": {
+    "ATT": -4.713891309648119,
+    "se": 15.322106467830803,
+    "lci": -34.7446681538847,
+    "uci": 25.316885534588465
+  },
+  "dgp1_ipw_cov": {
+    "ATT": 0.1746221292513894,
+    "se": 14.84686185530122,
+    "lci": -28.92469239058052,
+    "uci": 29.2739366490833
+  },
+  "dgp2_dr_nocov": {
+    "ATT": -2.802437682226054,
+    "se": 15.162115011913007,
+    "lci": -32.51963703502963,
+    "uci": 26.914761670577523
+  },
+  "dgp2_dr_cov": {
+    "ATT": -0.131884249677628,
+    "se": 0.3615976238477585,
+    "lci": -0.8406025693144961,
+    "uci": 0.57683406995924
+  },
+  "dgp2_reg_nocov": {
+    "ATT": -2.802437682226468,
+    "se": 15.16211501191301,
+    "lci": -32.519637035030044,
+    "uci": 26.91476167057711
+  },
+  "dgp2_reg_cov": {
+    "ATT": -0.1264941935341142,
+    "se": 12.278681040167424,
+    "lci": -24.192266809917065,
+    "uci": 23.939278422848837
+  },
+  "dgp2_ipw_nocov": {
+    "ATT": -2.802437682225957,
+    "se": 15.16211501191301,
+    "lci": -32.51963703502953,
+    "uci": 26.914761670577622
+  },
+  "dgp2_ipw_cov": {
+    "ATT": 0.4425176545858278,
+    "se": 14.578330164503246,
+    "lci": -28.130484422574405,
+    "uci": 29.01551973174606
+  },
+  "dgp3_dr_nocov": {
+    "ATT": -4.047926092563451,
+    "se": 13.619021126223045,
+    "lci": -30.740717004650733,
+    "uci": 22.644864819523832
+  },
+  "dgp3_dr_cov": {
+    "ATT": -1.206339068198347,
+    "se": 5.715500553553746,
+    "lci": -12.408514306782427,
+    "uci": 9.995836170385735
+  },
+  "dgp3_reg_nocov": {
+    "ATT": -4.047926092563443,
+    "se": 13.61902112622304,
+    "lci": -30.740717004650715,
+    "uci": 22.64486481952383
+  },
+  "dgp3_reg_cov": {
+    "ATT": -1.506286210381859,
+    "se": 11.48877687437943,
+    "lci": -24.02387511058219,
+    "uci": 21.01130268981847
+  },
+  "dgp3_ipw_nocov": {
+    "ATT": -4.047926092563728,
+    "se": 13.61902112622304,
+    "lci": -30.740717004651,
+    "uci": 22.644864819523544
+  },
+  "dgp3_ipw_cov": {
+    "ATT": -0.797266162250736,
+    "se": 13.500012852552667,
+    "lci": -27.256805144081792,
+    "uci": 25.66227281958032
+  },
+  "dgp4_dr_nocov": {
+    "ATT": -5.281043961510922,
+    "se": 13.550738720691161,
+    "lci": -31.840003817977955,
+    "uci": 21.277915894956113
+  },
+  "dgp4_dr_cov": {
+    "ATT": -2.919555392612542,
+    "se": 5.682194173268706,
+    "lci": -14.05645132538255,
+    "uci": 8.217340540157466
+  },
+  "dgp4_reg_nocov": {
+    "ATT": -5.281043961511244,
+    "se": 13.550738720691157,
+    "lci": -31.84000381797827,
+    "uci": 21.277915894955783
+  },
+  "dgp4_reg_cov": {
+    "ATT": -3.131035790104079,
+    "se": 11.449511458993447,
+    "lci": -25.571665890309877,
+    "uci": 19.30959431010172
+  },
+  "dgp4_ipw_nocov": {
+    "ATT": -5.281043961510647,
+    "se": 13.550738720691161,
+    "lci": -31.84000381797768,
+    "uci": 21.277915894956386
+  },
+  "dgp4_ipw_cov": {
+    "ATT": -2.588437808164429,
+    "se": 13.347963293853894,
+    "lci": -28.749965131080682,
+    "uci": 23.573089514751825
+  }
+}