Add _logccdf dispatcher for numerically stable log survival function

maresb · maresb · commit 81da946672c6 · 2025-12-15T13:00:52.000+01:00
Add _logccdf (log complementary CDF / log survival function) support:

- pymc/logprob/abstract.py: Add _logccdf singledispatch and _logccdf_helper
- pymc/distributions/distribution.py: Register logccdf methods via metaclass
- pymc/distributions/continuous.py: Add logccdf to Normal using stable normal_lccdf
- pymc/logprob/censoring.py: Use _logccdf for right-censored distributions
- pymc/logprob/binary.py: Use _logccdf for comparison operations
- pymc/logprob/transforms.py: Use _logccdf_helper for monotonic transforms
- pymc/logprob/basic.py: Add public logccdf() function
- pymc/logprob/__init__.py: Export logccdf

This fixes numerical instability when computing log-probabilities for
censored Normal distributions at extreme tail values (e.g., 10+ sigma).
diff --git a/pymc/distributions/continuous.py b/pymc/distributions/continuous.py
@@ -512,6 +512,13 @@ def logcdf(value, mu, sigma):
             msg="sigma > 0",
         )
 
+    def logccdf(value, mu, sigma):
+        return check_parameters(
+            normal_lccdf(mu, sigma, value),
+            sigma > 0,
+            msg="sigma > 0",
+        )
+
     def icdf(value, mu, sigma):
         res = mu + sigma * -np.sqrt(2.0) * pt.erfcinv(2 * value)
         res = check_icdf_value(res, value)
diff --git a/pymc/distributions/distribution.py b/pymc/distributions/distribution.py
@@ -50,7 +50,7 @@
     rv_size_is_none,
     shape_from_dims,
 )
-from pymc.logprob.abstract import MeasurableOp, _icdf, _logcdf, _logprob
+from pymc.logprob.abstract import MeasurableOp, _icdf, _logccdf, _logcdf, _logprob
 from pymc.logprob.basic import logp
 from pymc.logprob.rewriting import logprob_rewrites_db
 from pymc.printing import str_for_dist
@@ -150,6 +150,17 @@ def logcdf(op, value, *dist_params, **kwargs):
                         dist_params = [dist_params[i] for i in params_idxs]
                     return class_logcdf(value, *dist_params)
 
+            class_logccdf = clsdict.get("logccdf")
+            if class_logccdf:
+
+                @_logccdf.register(rv_type)
+                def logccdf(op, value, *dist_params, **kwargs):
+                    if isinstance(op, RandomVariable):
+                        rng, size, *dist_params = dist_params
+                    elif params_idxs:
+                        dist_params = [dist_params[i] for i in params_idxs]
+                    return class_logccdf(value, *dist_params)
+
             class_icdf = clsdict.get("icdf")
             if class_icdf:
 
diff --git a/pymc/logprob/__init__.py b/pymc/logprob/__init__.py
@@ -39,6 +39,7 @@
 from pymc.logprob.basic import (
     conditional_logp,
     icdf,
+    logccdf,
     logcdf,
     logp,
     transformed_conditional_logp,
@@ -59,6 +60,7 @@
 
 __all__ = (
     "icdf",
+    "logccdf",
     "logcdf",
     "logp",
 )
diff --git a/pymc/logprob/abstract.py b/pymc/logprob/abstract.py
@@ -108,6 +108,36 @@ def _logcdf_helper(rv, value, **kwargs):
     return logcdf
 
 
+@singledispatch
+def _logccdf(
+    op: Op,
+    value: TensorVariable,
+    *inputs: TensorVariable,
+    **kwargs,
+):
+    """Create a graph for the log complementary CDF (log survival function) of a ``RandomVariable``.
+
+    This function dispatches on the type of ``op``, which should be a subclass
+    of ``RandomVariable``.  If you want to implement new logccdf graphs
+    for a ``RandomVariable``, register a new function on this dispatcher.
+
+    The log complementary CDF is defined as log(1 - CDF(x)), also known as the
+    log survival function. For distributions with a numerically stable implementation,
+    this should be used instead of computing log(1 - exp(logcdf)).
+    """
+    raise NotImplementedError(f"LogCCDF method not implemented for {op}")
+
+
+def _logccdf_helper(rv, value, **kwargs):
+    """Helper that calls `_logccdf` dispatcher."""
+    logccdf = _logccdf(rv.owner.op, value, *rv.owner.inputs, name=rv.name, **kwargs)
+
+    if rv.name:
+        logccdf.name = f"{rv.name}_logccdf"
+
+    return logccdf
+
+
 @singledispatch
 def _icdf(
     op: Op,
diff --git a/pymc/logprob/basic.py b/pymc/logprob/basic.py
@@ -53,6 +53,7 @@
 from pymc.logprob.abstract import (
     MeasurableOp,
     _icdf_helper,
+    _logccdf_helper,
     _logcdf_helper,
     _logprob,
     _logprob_helper,
@@ -302,6 +303,69 @@ def normal_logcdf(value, mu, sigma):
         return expr
 
 
+def logccdf(rv: TensorVariable, value: TensorLike, warn_rvs=True, **kwargs) -> TensorVariable:
+    """Create a graph for the log complementary CDF (log survival function) of a random variable.
+
+    The log complementary CDF is defined as log(1 - CDF(x)), also known as the
+    log survival function. For distributions with a numerically stable implementation,
+    this is more accurate than computing log(1 - exp(logcdf)).
+
+    Parameters
+    ----------
+    rv : TensorVariable
+    value : tensor_like
+        Should be the same type (shape and dtype) as the rv.
+    warn_rvs : bool, default True
+        Warn if RVs were found in the logccdf graph.
+        This can happen when a variable has other random variables as inputs.
+        In that case, those random variables should be replaced by their respective values.
+
+    Returns
+    -------
+    logccdf : TensorVariable
+
+    Raises
+    ------
+    NotImplementedError
+        If the logccdf cannot be derived.
+
+    Examples
+    --------
+    Create a compiled function that evaluates the logccdf of a variable
+
+    .. code-block:: python
+
+        import pymc as pm
+        import pytensor.tensor as pt
+
+        mu = pt.scalar("mu")
+        rv = pm.Normal.dist(mu, 1.0)
+
+        value = pt.scalar("value")
+        rv_logccdf = pm.logccdf(rv, value)
+
+        # Use .eval() for debugging
+        print(rv_logccdf.eval({value: 0.9, mu: 0.0}))  # -1.692493
+
+        # Compile a function for repeated evaluations
+        rv_logccdf_fn = pm.compile_pymc([value, mu], rv_logccdf)
+        print(rv_logccdf_fn(value=0.9, mu=0.0))  # -1.692493
+
+    """
+    value = pt.as_tensor_variable(value, dtype=rv.dtype)
+    try:
+        return _logccdf_helper(rv, value, **kwargs)
+    except NotImplementedError:
+        # Try to rewrite rv
+        fgraph, _, _ = construct_ir_fgraph({rv: value})
+        [ir_rv] = fgraph.outputs
+        expr = _logccdf_helper(ir_rv, value, **kwargs)
+        [expr] = cleanup_ir([expr])
+        if warn_rvs:
+            _warn_rvs_in_inferred_graph([expr])
+        return expr
+
+
 def icdf(rv: TensorVariable, value: TensorLike, warn_rvs=True, **kwargs) -> TensorVariable:
     """Create a graph for the inverse CDF of a random variable.
 
diff --git a/pymc/logprob/binary.py b/pymc/logprob/binary.py
@@ -25,6 +25,7 @@
 
 from pymc.logprob.abstract import (
     MeasurableElemwise,
+    _logccdf,
     _logcdf_helper,
     _logprob,
     _logprob_helper,
@@ -95,7 +96,12 @@ def comparison_logprob(op, values, base_rv, operand, **kwargs):
     base_rv_op = base_rv.owner.op
 
     logcdf = _logcdf_helper(base_rv, operand, **kwargs)
-    logccdf = pt.log1mexp(logcdf)
+    # Try to use a numerically stable logccdf if available, otherwise fall back
+    # to computing log(1 - exp(logcdf)) which can be unstable in the tails
+    try:
+        logccdf = _logccdf(base_rv_op, operand, *base_rv.owner.inputs, **kwargs)
+    except NotImplementedError:
+        logccdf = pt.log1mexp(logcdf)
 
     condn_exp = pt.eq(value, np.array(True))
 
diff --git a/pymc/logprob/censoring.py b/pymc/logprob/censoring.py
@@ -47,7 +47,7 @@
 from pytensor.tensor.math import ceil, clip, floor, round_half_to_even
 from pytensor.tensor.variable import TensorConstant
 
-from pymc.logprob.abstract import MeasurableElemwise, _logcdf, _logprob
+from pymc.logprob.abstract import MeasurableElemwise, _logccdf, _logcdf, _logprob
 from pymc.logprob.rewriting import measurable_ir_rewrites_db
 from pymc.logprob.utils import CheckParameterValue, filter_measurable_variables
 
@@ -119,7 +119,13 @@ def clip_logprob(op, values, base_rv, lower_bound, upper_bound, **kwargs):
     if not (isinstance(upper_bound, TensorConstant) and np.all(np.isinf(upper_bound.value))):
         is_upper_bounded = True
 
-        logccdf = pt.log1mexp(logcdf)
+        # Try to use a numerically stable logccdf if available, otherwise fall back
+        # to computing log(1 - exp(logcdf)) which can be unstable in the tails
+        try:
+            logccdf = _logccdf(base_rv_op, value, *base_rv_inputs, **kwargs)
+        except NotImplementedError:
+            logccdf = pt.log1mexp(logcdf)
+
         # For right clipped discrete RVs, we need to add an extra term
         # corresponding to the pmf at the upper bound
         if base_rv.dtype.startswith("int"):
diff --git a/pymc/logprob/transforms.py b/pymc/logprob/transforms.py
@@ -111,6 +111,7 @@
     MeasurableOp,
     _icdf,
     _icdf_helper,
+    _logccdf_helper,
     _logcdf,
     _logcdf_helper,
     _logprob,
@@ -248,9 +249,15 @@ def measurable_transform_logcdf(op: MeasurableTransform, value, *inputs, **kwarg
 
     logcdf = _logcdf_helper(measurable_input, backward_value)
     if is_discrete:
+        # For discrete distributions, use the logcdf at the previous value
         logccdf = pt.log1mexp(_logcdf_helper(measurable_input, backward_value - 1))
     else:
-        logccdf = pt.log1mexp(logcdf)
+        # Try to use a numerically stable logccdf if available, otherwise fall back
+        # to computing log(1 - exp(logcdf)) which can be unstable in the tails
+        try:
+            logccdf = _logccdf_helper(measurable_input, backward_value)
+        except NotImplementedError:
+            logccdf = pt.log1mexp(logcdf)
 
     if isinstance(op.scalar_op, MONOTONICALLY_INCREASING_OPS):
         pass
diff --git a/tests/logprob/test_abstract.py b/tests/logprob/test_abstract.py
@@ -45,8 +45,13 @@
 
 import pymc as pm
 
-from pymc.logprob.abstract import MeasurableElemwise, MeasurableOp, _logcdf_helper
-from pymc.logprob.basic import logcdf
+from pymc.logprob.abstract import (
+    MeasurableElemwise,
+    MeasurableOp,
+    _logccdf_helper,
+    _logcdf_helper,
+)
+from pymc.logprob.basic import logccdf, logcdf
 
 
 def assert_equal_hash(classA, classB):
@@ -80,6 +85,38 @@ def test_logcdf_helper():
     np.testing.assert_almost_equal(x_logcdf.eval(), sp.norm(0, 1).logcdf([0, 1]))
 
 
+def test_logccdf_helper():
+    value = pt.vector("value")
+    x = pm.Normal.dist(0, 1)
+
+    x_logccdf = _logccdf_helper(x, value)
+    np.testing.assert_almost_equal(x_logccdf.eval({value: [0, 1]}), sp.norm(0, 1).logsf([0, 1]))
+
+    x_logccdf = _logccdf_helper(x, [0, 1])
+    np.testing.assert_almost_equal(x_logccdf.eval(), sp.norm(0, 1).logsf([0, 1]))
+
+
+def test_logccdf_helper_numerical_stability():
+    """Test that logccdf is numerically stable in the far right tail.
+
+    This is where log(1 - exp(logcdf)) would lose precision because CDF is very close to 1.
+    """
+    x = pm.Normal.dist(0, 1)
+
+    # Test value far in the right tail where CDF is essentially 1
+    far_tail_value = 10.0
+
+    x_logccdf = _logccdf_helper(x, far_tail_value)
+    result = x_logccdf.eval()
+
+    # scipy.stats.norm.logsf uses a numerically stable implementation
+    expected = sp.norm(0, 1).logsf(far_tail_value)
+
+    # The naive computation would give log(1 - 1) = -inf or very wrong values
+    # The stable implementation should match scipy's logsf closely
+    np.testing.assert_almost_equal(result, expected, decimal=6)
+
+
 def test_logcdf_transformed_argument():
     with pm.Model() as m:
         sigma = pm.HalfFlat("sigma")
@@ -95,3 +132,31 @@ def test_logcdf_transformed_argument():
         pm.TruncatedNormal.dist(0, sigma_value, lower=None, upper=1.0), x_value
     ).eval()
     assert np.isclose(observed, expected)
+
+
+def test_logccdf():
+    """Test the public logccdf function."""
+    value = pt.vector("value")
+    x = pm.Normal.dist(0, 1)
+
+    x_logccdf = logccdf(x, value)
+    np.testing.assert_almost_equal(x_logccdf.eval({value: [0, 1]}), sp.norm(0, 1).logsf([0, 1]))
+
+
+def test_logccdf_numerical_stability():
+    """Test that pm.logccdf is numerically stable in the extreme right tail.
+
+    For a normal distribution, the log survival function at x=10 is very negative
+    (around -53). Using log(1 - exp(logcdf)) would fail because CDF(10) is essentially 1.
+    """
+    x = pm.Normal.dist(0, 1)
+
+    # Test value far in the right tail
+    far_tail_value = 10.0
+
+    result = logccdf(x, far_tail_value).eval()
+    expected = sp.norm(0, 1).logsf(far_tail_value)
+
+    # Should be around -53, not -inf or nan
+    assert np.isfinite(result)
+    np.testing.assert_almost_equal(result, expected, decimal=6)