138138from itertools import count , groupby , repeat
139139from bisect import bisect_left , bisect_right
140140from math import hypot , sqrt , fabs , exp , erf , tau , log , fsum , sumprod
141- from math import isfinite , isinf , pi , cos , cosh
141+ from math import isfinite , isinf , pi , cos , sin , cosh , atan
142142from functools import reduce
143143from operator import itemgetter
144144from collections import Counter , namedtuple , defaultdict
@@ -803,9 +803,9 @@ def multimode(data):
803803 return [value for value , count in counts .items () if count == maxcount ]
804804
805805
806- def kde (data , h , kernel = 'normal' ):
807- """Kernel Density Estimation: Create a continuous probability
808- density function from discrete samples.
806+ def kde (data , h , kernel = 'normal' , * , cumulative = False ):
807+ """Kernel Density Estimation: Create a continuous probability density
808+ function or cumulative distribution function from discrete samples.
809809
810810 The basic idea is to smooth the data using a kernel function
811811 to help draw inferences about a population from a sample.
@@ -820,20 +820,22 @@ def kde(data, h, kernel='normal'):
820820
821821 Kernels that give some weight to every sample point:
822822
823- normal or gauss
823+ normal ( gauss)
824824 logistic
825825 sigmoid
826826
827827 Kernels that only give weight to sample points within
828828 the bandwidth:
829829
830- rectangular or uniform
830+ rectangular ( uniform)
831831 triangular
832- parabolic or epanechnikov
833- quartic or biweight
832+ parabolic ( epanechnikov)
833+ quartic ( biweight)
834834 triweight
835835 cosine
836836
837+ If *cumulative* is true, a cumulative distribution function is returned instead of a probability density function.
838+
837839 A StatisticsError will be raised if the data sequence is empty.
838840
839841 Example
@@ -847,7 +849,8 @@ def kde(data, h, kernel='normal'):
847849
848850 Compute the area under the curve:
849851
850- >>> sum(f_hat(x) for x in range(-20, 20))
852+ >>> area = sum(f_hat(x) for x in range(-20, 20))
853+ >>> round(area, 4)
851854 1.0
852855
853856 Plot the estimated probability density function at
@@ -876,6 +879,13 @@ def kde(data, h, kernel='normal'):
876879 9: 0.009 x
877880 10: 0.002 x
878881
882+ Estimate P(4.5 < X <= 7.5), the probability that a new sample value
883+ will be between 4.5 and 7.5:
884+
885+ >>> cdf = kde(sample, h=1.5, cumulative=True)
886+ >>> round(cdf(7.5) - cdf(4.5), 2)
887+ 0.22
888+
879889 References
880890 ----------
881891
@@ -888,6 +898,9 @@ def kde(data, h, kernel='normal'):
888898 Interactive graphical demonstration and exploration:
889899 https://demonstrations.wolfram.com/KernelDensityEstimation/
890900
901+ Kernel estimation of cumulative distribution function of a random variable with bounded support:
902+ https://www.econstor.eu/bitstream/10419/207829/1/10.21307_stattrans-2016-037.pdf
903+
891904 """
892905
893906 n = len (data )
@@ -903,45 +916,56 @@ def kde(data, h, kernel='normal'):
903916 match kernel :
904917
905918 case 'normal' | 'gauss' :
906- c = 1 / sqrt (2 * pi )
907- K = lambda t : c * exp (- 1 / 2 * t * t )
919+ sqrt2pi = sqrt (2 * pi )
920+ sqrt2 = sqrt (2 )
921+ K = lambda t : exp (- 1 / 2 * t * t ) / sqrt2pi
922+ I = lambda t : 1 / 2 * (1.0 + erf (t / sqrt2 ))
908923 support = None
909924
910925 case 'logistic' :
911926 # 1.0 / (exp(t) + 2.0 + exp(-t))
912927 K = lambda t : 1 / 2 / (1.0 + cosh (t ))
928+ I = lambda t : 1.0 - 1.0 / (exp (t ) + 1.0 )
913929 support = None
914930
915931 case 'sigmoid' :
916932 # (2/pi) / (exp(t) + exp(-t))
917- c = 1 / pi
918- K = lambda t : c / cosh (t )
933+ c1 = 1 / pi
934+ c2 = 2 / pi
935+ K = lambda t : c1 / cosh (t )
936+ I = lambda t : c2 * atan (exp (t ))
919937 support = None
920938
921939 case 'rectangular' | 'uniform' :
922940 K = lambda t : 1 / 2
941+ I = lambda t : 1 / 2 * t + 1 / 2
923942 support = 1.0
924943
925944 case 'triangular' :
926945 K = lambda t : 1.0 - abs (t )
946+ I = lambda t : t * t * (1 / 2 if t < 0.0 else - 1 / 2 ) + t + 1 / 2
927947 support = 1.0
928948
929949 case 'parabolic' | 'epanechnikov' :
930950 K = lambda t : 3 / 4 * (1.0 - t * t )
951+ I = lambda t : - 1 / 4 * t ** 3 + 3 / 4 * t + 1 / 2
931952 support = 1.0
932953
933954 case 'quartic' | 'biweight' :
934955 K = lambda t : 15 / 16 * (1.0 - t * t ) ** 2
956+ I = lambda t : 3 / 16 * t ** 5 - 5 / 8 * t ** 3 + 15 / 16 * t + 1 / 2
935957 support = 1.0
936958
937959 case 'triweight' :
938960 K = lambda t : 35 / 32 * (1.0 - t * t ) ** 3
961+ I = lambda t : 35 / 32 * (- 1 / 7 * t ** 7 + 3 / 5 * t ** 5 - t ** 3 + t ) + 1 / 2
939962 support = 1.0
940963
941964 case 'cosine' :
942965 c1 = pi / 4
943966 c2 = pi / 2
944967 K = lambda t : c1 * cos (c2 * t )
968+ I = lambda t : 1 / 2 * sin (c2 * t ) + 1 / 2
945969 support = 1.0
946970
947971 case _:
@@ -952,6 +976,9 @@ def kde(data, h, kernel='normal'):
952976 def pdf (x ):
953977 return sum (K ((x - x_i ) / h ) for x_i in data ) / (n * h )
954978
979+ def cdf (x ):
980+ return sum (I ((x - x_i ) / h ) for x_i in data ) / n
981+
955982 else :
956983
957984 sample = sorted (data )
@@ -963,9 +990,19 @@ def pdf(x):
963990 supported = sample [i : j ]
964991 return sum (K ((x - x_i ) / h ) for x_i in supported ) / (n * h )
965992
966- pdf .__doc__ = f'PDF estimate with { h = !r} and { kernel = !r} '
993+ def cdf (x ):
994+ i = bisect_left (sample , x - bandwidth )
995+ j = bisect_right (sample , x + bandwidth )
996+ supported = sample [i : j ]
997+ return sum ((I ((x - x_i ) / h ) for x_i in supported ), i ) / n
967998
968- return pdf
999+ if cumulative :
1000+ cdf .__doc__ = f'CDF estimate with { h = !r} and { kernel = !r} '
1001+ return cdf
1002+
1003+ else :
1004+ pdf .__doc__ = f'PDF estimate with { h = !r} and { kernel = !r} '
1005+ return pdf
9691006
9701007
9711008# Notes on methods for computing quantiles
0 commit comments