Online new extinction

Merged: Prathamesh Joshi requested to merge o4b-online-new-extinction into o4b-online
@@ -59,6 +59,7 @@ import math
 import multiprocessing
 import numpy
 import random
+import scipy
 from scipy import interpolate
 import sys
 import time
@@ -519,6 +520,9 @@ def binned_log_likelihood_ratio_rates_from_samples(signal_lr_lnpdf, noise_lr_lnpdf,
 class RankingStatPDF(object):
     ligo_lw_name_suffix = u"gstlal_inspiral_rankingstatpdf"
+    extinction_fitting_limits = (1/2., 1/100.)
+    # the extinction curve fit is performed over the range of LRs
+    # corresponding to the 1/2 and 1/100 points of the zero-lag CCDF

     @staticmethod
     def density_estimate(lnpdf, name, kernel = rate.gaussian_window(4.)):
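
Note that the fitting limits are applied to the zero-lag CCDF, not to raw LR values: the fit range is wherever the observed zero-lag survival function falls to 1/2 and 1/100 of its total count. Below is a minimal sketch of that index selection, using a toy zero-lag histogram (`fg` stands in for `zero_lag_lr_lnpdf.array`; the exponential shape and bin count are made up for illustration). It mirrors the index computation that `ready_for_extinction()` and `new_with_extinction()` both perform in the next hunk.

```python
import numpy

extinction_fitting_limits = (1/2., 1/100.)

# toy zero-lag histogram over LR bins; any decreasing tail will do
fg = numpy.exp(-numpy.linspace(0., 10., 100))

# CCDF of the zero-lag counts: fg_ccdf[i] = total counts in bins >= i
fg_ccdf = numpy.cumsum(fg[::-1])[::-1]

# first bin where the CCDF drops below 1/2 (resp. 1/100) of its total
ix_min = (fg_ccdf < fg_ccdf[0] * extinction_fitting_limits[0]).argmax()
ix_max = (fg_ccdf < fg_ccdf[0] * extinction_fitting_limits[1]).argmax()

print(ix_min, ix_max)  # the curve fit uses bins ix_min .. ix_max inclusive
```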
@@ -712,89 +716,73 @@ WHERE
         return self

-    def new_with_extinction(self):
-        self = self.copy()
+    def ready_for_extinction(self):
+        # ensure we have sufficient zero-lag and noise samples before
+        # attempting the extinction curve fit.  none of the checks
+        # below should evaluate to True once even a small number of
+        # zero-lag and noise samples has accumulated; they should
+        # only evaluate to True at the start of an online analysis
+        bg = self.noise_lr_lnpdf.copy().array
+        fg = self.zero_lag_lr_lnpdf.copy().array
+        bg[:10] = 0.
+        fg[:10] = 0.
+        if fg.sum() == 0 or bg.sum() == 0:
+            return False
+        fg_ccdf = numpy.cumsum(fg[::-1])[::-1]
+        ix_min = (fg_ccdf < fg_ccdf[0] * self.extinction_fitting_limits[0]).argmax()
+        ix_max = (fg_ccdf < fg_ccdf[0] * self.extinction_fitting_limits[1]).argmax()
+        if ix_min == ix_max:
+            return False
+        if fg[ix_min: ix_max + 1].sum() == 0 or bg[ix_min: ix_max + 1].sum() == 0:
+            return False
+        if (fg_ccdf[ix_min: ix_max + 1] == 0).any():
+            # the log would evaluate to -inf and the curve fit would crash
+            return False
+        return True

-        # the probability that an event survives clustering is the
-        # probability that no event with a higher value of the
-        # ranking statistic is within +/- dt of the event in
-        # question.  .noise_lr_lnpdf.count contains an accurate
-        # model of the counts of pre-clustered events in each
-        # ranking statistic bin, but we know the events are not
-        # independent and so the total number of them cannot be
-        # used to form a Poisson rate for the purpose of computing
-        # the probability we desire.  it has been found,
-        # empirically, that if the CCDF of pre-clustered background
-        # event counts is raised to some power and the clustering
-        # survival probability is computed from that as though it
-        # were the CCDF of a Poisson process with some effective
-        # mean event rate, the result is a very good model for the
-        # post-clustering distribution of ranking statistics.  we
-        # have two unknown parameters:  the power to which to raise
-        # the pre-clustered ranking statistic's CCDF, and the mean
-        # event rate to assume.  we solve for these parameters by
-        # fitting to the observed zero-lag ranking statistic
-        # histogram
+    def new_with_extinction(self, verbose = False):
+        self = self.copy()

-        x = self.noise_lr_lnpdf.bins[0].centres()
-        assert (x == self.zero_lag_lr_lnpdf.bins[0].centres()).all()
-
-        # some candidates are rejected by the ranking statistic,
-        # causing a spike in the zero-lag density at ln L = -inf.
-        # if enough candidates get rejected, this spike becomes the
-        # mode of the PDF, which messes up the mask constructed
-        # below for the fit.  we zero the first 40 bins here to
-        # prevent that from happening (assuming a density
-        # estimation kernel 4 bins wide, with a 10 sigma impulse
-        # length)
-        zl_counts = self.zero_lag_lr_lnpdf.array.copy()
-        zl_counts[:40] = 0.
-        if not zl_counts.any():
-            raise ValueError("zero-lag counts are all zero")
-
-        # get the noise counts
-        noise_counts = self.noise_lr_lnpdf.array.copy()
-
-        # get the zero-lag counts.  we model the tail of the
-        # distribution - the top 0.1% - 1% - where clustering
-        # affects the result at below the 1% level.
-        if zl_counts.sum() < 100 * 100:
-            tail_zl_counts = zl_counts.sum() * 0.99
-        else:
-            tail_zl_counts = zl_counts.sum() - 100
-        onepercent = zl_counts.cumsum().searchsorted(tail_zl_counts)
-
-        # normalize the counts
-        noise_counts /= noise_counts.sum()
-        zl_counts /= zl_counts.sum()
-
-        # compute the survival probability
-        norm = zl_counts[onepercent] / noise_counts[onepercent]
-        zl_counts[onepercent:] = 0
-        noise_counts[onepercent:] = 0
-        survival_probability = zl_counts / noise_counts
-        survival_probability[onepercent:] = norm
-        survival_probability[numpy.isnan(survival_probability)] = 0.0
-
-        # apply to the background counts and the signal counts
-        self.noise_lr_lnpdf.array *= survival_probability
-        self.noise_lr_lnpdf.normalize()
-        self.signal_lr_lnpdf.array *= survival_probability
-        self.signal_lr_lnpdf.normalize()

         #FIXME: Add comprehensive explanation about the extinction model
         #
         # never allow PDFs that have had the extinction model
         # applied to be written to disk: on-disk files must only
         # ever provide the original data.  forbid PDFs that have
         # been extincted from being re-extincted.
         #
         def new_with_extinction(*args, **kwargs):
             raise NotImplementedError("re-extincting an extincted RankingStatPDF object is forbidden")
         self.new_with_extinction = new_with_extinction
         def to_xml(*args, **kwargs):
             raise NotImplementedError("writing extincted RankingStatPDF object to disk is forbidden")
         self.to_xml = to_xml

+        lrs = self.noise_lr_lnpdf.centres()[0]
+        bg = self.noise_lr_lnpdf.array
+        fg = self.zero_lag_lr_lnpdf.array
+
+        # zero out the beginning bins of each because they are
+        # notoriously bad and should just be ignored
+        bg[:10] = 0.
+        fg[:10] = 0.
+
+        # fitting is done between ix_min and ix_max
+        fg_ccdf = numpy.cumsum(fg[::-1])[::-1]
+        ix_min = (fg_ccdf < fg_ccdf[0] * self.extinction_fitting_limits[0]).argmax()
+        ix_max = (fg_ccdf < fg_ccdf[0] * self.extinction_fitting_limits[1]).argmax()
+
+        bgtotal = bg[ix_min: ix_max + 1].sum()
+        bg_ccdf = numpy.cumsum(bg[::-1])[::-1] / bgtotal
+
+        # define the extincted background CCDF model for scipy.optimize.curve_fit to call
+        def bg_ccdf_extinct_func(idx, c, A):
+            return numpy.log(A * bgtotal / c) + numpy.log1p(-numpy.exp(-1 * bg_ccdf[idx] * c))
+
+        # find the best-fit c for extinction
+        c = scipy.optimize.curve_fit(bg_ccdf_extinct_func, range(ix_min, ix_max + 1), numpy.log(fg_ccdf[ix_min: ix_max + 1]), bounds = [0, numpy.inf], sigma = numpy.sqrt(1 / (bg_ccdf[ix_min: ix_max + 1] * bgtotal)))#, maxfev = 5000)
+        if verbose:
+            print(f"Best value of c is {c[0][0]}, A is {c[0][1]} with covariance {c[1]}", file = sys.stderr)
+
+        # calculate the extincted PDF
+        bg_pdf_extinct = c[0][1] * bg * numpy.exp(-1 * bg_ccdf * c[0][0])
+        self.noise_lr_lnpdf.array = bg_pdf_extinct
+        self.noise_lr_lnpdf.normalize()

         return self
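
Read as a model, `bg_ccdf_extinct_func` says the logarithm of the clustered (zero-lag) CCDF should follow log(A * bgtotal / c) + log1p(-exp(-c * bg_ccdf)): where the background CCDF is small the zero-lag CCDF tracks A * bgtotal * bg_ccdf, and where it is large the zero-lag CCDF saturates at A * bgtotal / c, which is the extinction of low-LR candidates by clustering. The sketch below is a self-contained toy under that model (synthetic histogram, made-up c, A, and fit range; only the model function and the `curve_fit` call mirror the MR) showing the fit recovering the parameters from a CCDF generated by the model itself.

```python
import numpy
import scipy.optimize

# toy background histogram over LR bins (made up for illustration)
bg = numpy.exp(-numpy.linspace(0., 10., 100))

ix_min, ix_max = 10, 60  # stand-ins for the CCDF-derived fit limits
bgtotal = bg[ix_min: ix_max + 1].sum()
bg_ccdf = numpy.cumsum(bg[::-1])[::-1] / bgtotal

def bg_ccdf_extinct_func(idx, c, A):
    # log of the extincted background CCDF, as in the MR
    idx = numpy.asarray(idx, dtype = int)  # curve_fit may hand the x values back as floats
    return numpy.log(A * bgtotal / c) + numpy.log1p(-numpy.exp(-bg_ccdf[idx] * c))

# synthesize a zero-lag CCDF that exactly obeys the model with known c, A
c_true, A_true = 3., 0.5
fg_ccdf = numpy.exp(bg_ccdf_extinct_func(numpy.arange(len(bg)), c_true, A_true))

popt, pcov = scipy.optimize.curve_fit(
    bg_ccdf_extinct_func,
    numpy.arange(ix_min, ix_max + 1),
    numpy.log(fg_ccdf[ix_min: ix_max + 1]),
    bounds = [0, numpy.inf],
    sigma = numpy.sqrt(1 / (bg_ccdf[ix_min: ix_max + 1] * bgtotal)),
)
print(popt)  # ~ [3.0, 0.5]: the fit recovers (c, A)

# the extincted background PDF then follows as in new_with_extinction()
bg_pdf_extinct = popt[1] * bg * numpy.exp(-bg_ccdf * popt[0])
```

One design point worth noting: the fit is performed in log space, and sigma = 1/sqrt(N) with N = bg_ccdf * bgtotal is the Poisson uncertainty of log N, so each CCDF point is weighted by the expected background counts above that bin.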