Skip to content

Commit ce6fa49

Browse files
committed
fix(nanops): fix kurtosis computation on low variance
1 parent e817930 commit ce6fa49

File tree

2 files changed

+25
-10
lines changed

2 files changed

+25
-10
lines changed

pandas/core/nanops.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1261,7 +1261,8 @@ def nanskew(
12611261
return np.nan
12621262

12631263
with np.errstate(invalid="ignore", divide="ignore"):
1264-
mean = values.sum(axis, dtype=np.float64) / count
1264+
total = values.sum(axis, dtype=np.float64)
1265+
mean = total / count
12651266
if axis is not None:
12661267
mean = np.expand_dims(mean, axis)
12671268

@@ -1277,8 +1278,9 @@ def nanskew(
12771278
#
12781279
# #18044 in _libs/windows.pyx calc_skew follow this behavior
12791280
# to fix the fperr to treat m2 <1e-14 as zero
1280-
m2 = _zero_out_fperr(m2)
1281-
m3 = _zero_out_fperr(m3)
1281+
constant_tolerance = (np.finfo(m2.dtype).eps * total) ** 2
1282+
m2 = _zero_out_fperr(m2, constant_tolerance)
1283+
m3 = _zero_out_fperr(m3, constant_tolerance)
12821284

12831285
with np.errstate(invalid="ignore", divide="ignore"):
12841286
result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2**1.5)
@@ -1349,7 +1351,8 @@ def nankurt(
13491351
return np.nan
13501352

13511353
with np.errstate(invalid="ignore", divide="ignore"):
1352-
mean = values.sum(axis, dtype=np.float64) / count
1354+
total = values.sum(axis, dtype=np.float64)
1355+
mean = total / count
13531356
if axis is not None:
13541357
mean = np.expand_dims(mean, axis)
13551358

@@ -1370,8 +1373,12 @@ def nankurt(
13701373
#
13711374
# #18044 in _libs/windows.pyx calc_kurt follow this behavior
13721375
# to fix the fperr to treat denom <1e-14 as zero
1373-
numerator = _zero_out_fperr(numerator)
1374-
denominator = _zero_out_fperr(denominator)
1376+
# #57972 arbitrary <1e-14 tolerance leads to problematic behaviour on low variance.
1377+
# We adapted the tolerance to use one similar to scipy:
1378+
# https://github.com/scipy/scipy/blob/04d6d9c460b1fed83f2919ecec3d743cfa2e8317/scipy/stats/_stats_py.py#L1429
1379+
constant_tolerance = (np.finfo(m2.dtype).eps * total) ** 2
1380+
numerator = _zero_out_fperr(numerator, constant_tolerance)
1381+
denominator = _zero_out_fperr(denominator, constant_tolerance)
13751382

13761383
if not isinstance(denominator, np.ndarray):
13771384
# if ``denom`` is a scalar, check these corner cases first before
@@ -1587,12 +1594,12 @@ def check_below_min_count(
15871594
return False
15881595

15891596

1590-
def _zero_out_fperr(arg):
1597+
def _zero_out_fperr(arg, tol):
15911598
# #18044 reference this behavior to fix rolling skew/kurt issue
15921599
if isinstance(arg, np.ndarray):
1593-
return np.where(np.abs(arg) < 1e-14, 0, arg)
1600+
return np.where(np.abs(arg) < tol, 0, arg)
15941601
else:
1595-
return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg
1602+
return arg.dtype.type(0) if np.abs(arg) < tol else arg
15961603

15971604

15981605
@disallow("M8", "m8")

pandas/tests/test_nanops.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1072,7 +1072,7 @@ def test_constant_series(self, val):
10721072
# xref GH 11974
10731073
data = val * np.ones(300)
10741074
kurt = nanops.nankurt(data)
1075-
assert kurt == 0.0
1075+
tm.assert_equal(kurt, 0.0)
10761076

10771077
def test_all_finite(self):
10781078
alpha, beta = 0.3, 0.1
@@ -1102,6 +1102,14 @@ def test_nans_skipna(self, samples, actual_kurt):
11021102
kurt = nanops.nankurt(samples, skipna=True)
11031103
tm.assert_almost_equal(kurt, actual_kurt)
11041104

1105+
def test_low_variance(self):
1106+
# GH#57972
1107+
data_list = [-2.05191341e-05, -4.10391103e-05] + ([0.0] * 27)
1108+
data = np.array(data_list)
1109+
kurt = nanops.nankurt(data)
1110+
expected = 18.087646853025614 # scipy.stats.kurtosis(data, bias=False)
1111+
tm.assert_almost_equal(kurt, expected)
1112+
11051113
@property
11061114
def prng(self):
11071115
return np.random.default_rng(2)

0 commit comments

Comments
 (0)