From 3497394920138fb2571d339509b64cfb1258063c Mon Sep 17 00:00:00 2001 From: Kipp Cannon <kipp.cannon@ligo.org> Date: Tue, 14 Mar 2017 09:20:38 +0900 Subject: [PATCH] far.py: flatten ThincaCoincParamsDistributions - collapse inheritence hierarchy, moving all code from parent class here - this is the first step in a rewrite based on the new LR ranking statistic framework in pylal, but in the meantime this improves ranking statistic evaluation performance by reducing the number of method calls. --- gstlal-inspiral/python/far.py | 323 ++++++++++++++++++++++++++++++++-- 1 file changed, 304 insertions(+), 19 deletions(-) diff --git a/gstlal-inspiral/python/far.py b/gstlal-inspiral/python/far.py index 8a886103a0..8f6f88de20 100644 --- a/gstlal-inspiral/python/far.py +++ b/gstlal-inspiral/python/far.py @@ -46,6 +46,14 @@ # +try: + from fpconst import NaN, NegInf, PosInf +except ImportError: + # fpconst is not part of the standard library and might not be + # available + NaN = float("nan") + NegInf = float("-inf") + PosInf = float("+inf") import itertools import math import multiprocessing @@ -61,6 +69,7 @@ import sys from glue import segments from glue.ligolw import ligolw +from glue.ligolw import array as ligolw_array from glue.ligolw import param as ligolw_param from glue.ligolw import lsctables from glue.ligolw import utils as ligolw_utils @@ -98,9 +107,20 @@ class CoincParams(dict): __slots__ = ("horizons","t_offset","coa_phase") -class ThincaCoincParamsDistributions(snglcoinc.CoincParamsDistributions): +class ThincaCoincParamsDistributions(object): ligo_lw_name_suffix = u"gstlal_inspiral_coincparamsdistributions" + # + # Default content handler for loading CoincParamsDistributions + # objects from XML documents + # + + @ligolw_array.use_in + @ligolw_param.use_in + @lsctables.use_in + class LIGOLWContentHandler(ligolw.LIGOLWContentHandler): + pass + # range of SNRs covered by this object snr_min = 4. @@ -118,10 +138,10 @@ class ThincaCoincParamsDistributions(snglcoinc.CoincParamsDistributions): numerator_accidental_weight = 0. # binnings (initialized in .__init__() - binnings = { - } + binnings = {} + pdf_from_rates_func = {} - def __init__(self, instruments = frozenset(("H1", "L1", "V1")), min_instruments = 2, signal_rate = 1.0, delta_t = 0.005, **kwargs): + def __init__(self, instruments = frozenset(("H1", "L1", "V1")), min_instruments = 2, signal_rate = 1.0, delta_t = 0.005, process_id = None, **kwargs): # # check input # @@ -149,7 +169,16 @@ class ThincaCoincParamsDistributions(snglcoinc.CoincParamsDistributions): # this can't be done until the binnings attribute has been # populated - super(ThincaCoincParamsDistributions, self).__init__(**kwargs) + self.zero_lag_rates = dict((param, rate.BinnedArray(binning)) for param, binning in self.binnings.items()) + self.background_rates = dict((param, rate.BinnedArray(binning)) for param, binning in self.binnings.items()) + self.injection_rates = dict((param, rate.BinnedArray(binning)) for param, binning in self.binnings.items()) + self.zero_lag_pdf = {} + self.background_pdf = {} + self.injection_pdf = {} + self.zero_lag_lnpdf_interp = {} + self.background_lnpdf_interp = {} + self.injection_lnpdf_interp = {} + self.process_id = process_id # record of horizon distances for all instruments in the # network @@ -176,7 +205,34 @@ class ThincaCoincParamsDistributions(snglcoinc.CoincParamsDistributions): def instruments(self): return frozenset(self.horizon_history) + @staticmethod + def addbinnedarrays(rate_target_dict, rate_source_dict, pdf_target_dict, pdf_source_dict): + """ + For internal use. + """ + weight_target = {} + weight_source = {} + for name, binnedarray in rate_source_dict.items(): + if name in rate_target_dict: + weight_target[name] = rate_target_dict[name].array.sum() + weight_source[name] = rate_source_dict[name].array.sum() + rate_target_dict[name] += binnedarray + else: + rate_target_dict[name] = binnedarray.copy() + for name, binnedarray in pdf_source_dict.items(): + if name in pdf_target_dict: + binnedarray = binnedarray.copy() + binnedarray.array *= weight_source[name] + pdf_target_dict[name].array *= weight_target[name] + pdf_target_dict[name] += binnedarray + pdf_target_dict[name].array /= weight_source[name] + weight_target[name] + else: + pdf_target_dict[name] = binnedarray.copy() + def __iadd__(self, other): + if type(other) != type(self): + raise TypeError(other) + # NOTE: because we use custom PDF constructions, the stock # .__iadd__() method for this class will not result in # valid PDFs. the rates arrays *are* handled correctly by @@ -197,10 +253,29 @@ class ThincaCoincParamsDistributions(snglcoinc.CoincParamsDistributions): raise ValueError("incompatible minimum number of instruments") if self.delta_t != other.delta_t: raise ValueError("incompatible delta_t coincidence thresholds") - super(ThincaCoincParamsDistributions, self).__iadd__(other) + + self.addbinnedarrays(self.zero_lag_rates, other.zero_lag_rates, self.zero_lag_pdf, other.zero_lag_pdf) + self.addbinnedarrays(self.background_rates, other.background_rates, self.background_pdf, other.background_pdf) + self.addbinnedarrays(self.injection_rates, other.injection_rates, self.injection_pdf, other.injection_pdf) self.horizon_history += other.horizon_history + + # + # rebuild interpolators + # + + self._rebuild_interpolators() + + # + # done + # + return self + def copy(self): + new = type(self)(process_id = self.process_id) + new += self + return new + def coinc_params(self, events, offsetvector, mode = "ranking"): # # 2D (snr, \chi^2) values. @@ -271,13 +346,102 @@ class ThincaCoincParamsDistributions(snglcoinc.CoincParamsDistributions): return params + def add_zero_lag(self, param_dict, weight = 1.0): + """ + Increment a bin in one or more of the observed data (or + "zero lag") histograms by weight (default 1). The names of + the histograms to increment, and the parameters identifying + the bin in each histogram, are given by the param_dict + dictionary. + """ + for param, value in param_dict.items(): + try: + self.zero_lag_rates[param][value] += weight + except IndexError: + # param value out of range + pass + + def add_background(self, param_dict, weight = 1.0): + """ + Increment a bin in one or more of the noise (or + "background") histograms by weight (default 1). The names + of the histograms to increment, and the parameters + identifying the bin in each histogram, are given by the + param_dict dictionary. + """ + for param, value in param_dict.items(): + try: + self.background_rates[param][value] += weight + except IndexError: + # param value out of range + pass + + def add_injection(self, param_dict, weight = 1.0): + """ + Increment a bin in one or more of the signal (or + "injection") histograms by weight (default 1). The names + of the histograms to increment, and the parameters + identifying the bin in each histogram, are given by the + param_dict dictionary. + """ + for param, value in param_dict.items(): + try: + self.injection_rates[param][value] += weight + except IndexError: + # param value out of range + pass + def lnP_noise(self, params): + """ + From a parameter value dictionary as returned by + self.coinc_params(), compute and return the natural + logarithm of the noise probability density at that point in + parameter space. + + The .finish() method must have been invoked before this + method does meaningful things. No attempt is made to + ensure the .finish() method has been invoked nor, if it has + been invoked, that no manipulations have occured that might + require it to be re-invoked (e.g., the contents of the + parameter distributions have been modified and require + re-normalization). + + This default implementation assumes the individual PDFs + containined in the noise dictionary are for + statistically-independent random variables, and computes + and returns the logarithm of their product. Sub-classes + that require more sophisticated calculations can override + this method. + """ # evaluate dt and dphi parameters lnP_dt_dphi_noise = inspiral_extrinsics.lnP_dt_dphi(params, self.delta_t, model = "noise") - return lnP_dt_dphi_noise + super(ThincaCoincParamsDistributions, self).lnP_noise(params) + # evaluate the rest + __getitem__ = self.background_lnpdf_interp.__getitem__ + return lnP_dt_dphi_noise + sum(__getitem__(name)(*value) for name, value in params.items()) def lnP_signal(self, params): + """ + From a parameter value dictionary as returned by + self.coinc_params(), compute and return the natural + logarithm of the signal probability density at that point + in parameter space. + + The .finish() method must have been invoked before this + method does meaningful things. No attempt is made to + ensure the .finish() method has been invoked nor, if it has + been invoked, that no manipulations have occured that might + require it to be re-invoked (e.g., the contents of the + parameter distributions have been modified and require + re-normalization). + + This default implementation assumes the individual PDFs + containined in the signal dictionary are for + statistically-independent random variables, and computes + and returns the logarithm of their product. Sub-classes + that require more sophisticated calculations can override + this method. + """ # NOTE: lnP_signal() and lnP_noise() (not shown here) both # omit the factor P(horizon distance) = 1/T because it is # identical in the numerator and denominator and so factors @@ -299,7 +463,8 @@ class ThincaCoincParamsDistributions(snglcoinc.CoincParamsDistributions): # probabilities to is OK. we probably need to cache these # and save them in the XML file, too, like P(snrs | signal, # instruments) - return lnP_snr_signal + lnP_dt_dphi_signal + super(ThincaCoincParamsDistributions, self).lnP_signal(params) + __getitem__ = self.injection_lnpdf_interp.__getitem__ + return lnP_snr_signal + lnP_dt_dphi_signal + sum(__getitem__(name)(*value) for name, value in params.items()) def add_snrchi_prior(self, rates_dict, n, prefactors_range, df, inv_snr_pow = 4., verbose = False): if verbose: @@ -393,10 +558,33 @@ class ThincaCoincParamsDistributions(snglcoinc.CoincParamsDistributions): self.add_snrchi_prior(self.injection_rates, n, prefactors_range, df, inv_snr_pow = inv_snr_pow, verbose = verbose) def _rebuild_interpolators(self): + """ + Initialize the interp dictionaries from the discretely + sampled PDF data. For internal use only. + """ + self.zero_lag_lnpdf_interp.clear() + self.background_lnpdf_interp.clear() + self.injection_lnpdf_interp.clear() + # build interpolators keys = set(self.zero_lag_rates) keys.remove("instruments") keys.remove("singles") - super(ThincaCoincParamsDistributions, self)._rebuild_interpolators(keys) + def mkinterp(binnedarray): + with numpy.errstate(invalid = "ignore"): + assert not (binnedarray.array < 0.).any() + binnedarray = binnedarray.copy() + with numpy.errstate(divide = "ignore"): + binnedarray.array = numpy.log(binnedarray.array) + return rate.InterpBinnedArray(binnedarray, fill_value = NegInf) + for key, binnedarray in self.zero_lag_pdf.items(): + if key in keys: + self.zero_lag_lnpdf_interp[key] = mkinterp(binnedarray) + for key, binnedarray in self.background_pdf.items(): + if key in keys: + self.background_lnpdf_interp[key] = mkinterp(binnedarray) + for key, binnedarray in self.injection_pdf.items(): + if key in keys: + self.injection_lnpdf_interp[key] = mkinterp(binnedarray) # # the instrument combination "interpolators" are @@ -503,6 +691,15 @@ class ThincaCoincParamsDistributions(snglcoinc.CoincParamsDistributions): binnedarray.array[i] /= norm def finish(self, segs, verbose = False): + """ + Populate the discrete PDF dictionaries from the contents of + the rates dictionaries, and then the PDF interpolator + dictionaries from the discrete PDFs. The raw bin counts + from the rates dictionaries are copied verbatim, smoothed, + and converted to normalized PDFs using the bin volumes. + Finally the dictionary of PDF interpolators is populated + from the discretely sampled PDF data. + """ # # populate background instrument combination rates # @@ -545,25 +742,116 @@ class ThincaCoincParamsDistributions(snglcoinc.CoincParamsDistributions): self.injection_rates["instruments"][instruments,] = self.signal_rate * p # - # chain to parent class + # convert raw bin counts into normalized PDFs # - super(ThincaCoincParamsDistributions, self).finish(verbose = verbose) + self.zero_lag_pdf.clear() + self.background_pdf.clear() + self.injection_pdf.clear() + progressbar = ProgressBar(text = "Computing Parameter PDFs", max = len(self.zero_lag_rates) + len(self.background_rates) + len(self.injection_rates)) if verbose else None + for key, (msg, rates_dict, pdf_dict) in itertools.chain( + zip(self.zero_lag_rates, itertools.repeat(("zero lag", self.zero_lag_rates, self.zero_lag_pdf))), + zip(self.background_rates, itertools.repeat(("background", self.background_rates, self.background_pdf))), + zip(self.injection_rates, itertools.repeat(("injections", self.injection_rates, self.injection_pdf))) + ): + assert numpy.isfinite(rates_dict[key].array).all() and (rates_dict[key].array >= 0).all(), "%s %s counts are not valid" % (key, msg) + pdf_dict[key] = rates_dict[key].copy() + pdf_from_rates_func = self.pdf_from_rates_func[key] + if pdf_from_rates_func is not None: + pdf_from_rates_func(key, pdf_dict) + if progressbar is not None: + progressbar.increment() + + # + # rebuild interpolators + # + + self._rebuild_interpolators() + + @classmethod + def get_xml_root(cls, xml, name): + """ + Sub-classes can use this in their overrides of the + .from_xml() method to find the root element of the XML + serialization. + """ + name = u"%s:%s" % (name, cls.ligo_lw_name_suffix) + xml = [elem for elem in xml.getElementsByTagName(ligolw.LIGO_LW.tagName) if elem.hasAttribute(u"Name") and elem.Name == name] + if len(xml) != 1: + raise ValueError("XML tree must contain exactly one %s element named %s" % (ligolw.LIGO_LW.tagName, name)) + return xml[0] @classmethod def from_xml(cls, xml, name): + """ + In the XML document tree rooted at xml, search for the + serialized CoincParamsDistributions object named name, and + deserialize it. The return value is a two-element tuple. + The first element is the deserialized + CoincParamsDistributions object, the second is the process + ID recorded when it was written to XML. + """ + # find the root element of the XML serialization xml = cls.get_xml_root(xml, name) - self = super(ThincaCoincParamsDistributions, cls).from_xml(xml, name, + + # retrieve the process ID + process_id = ligolw_param.get_pyvalue(xml, u"process_id") + + # create an instance + self = cls( + process_id = process_id, instruments = lsctables.instrument_set_from_ifos(ligolw_param.get_pyvalue(xml, u"instruments")), min_instruments = ligolw_param.get_pyvalue(xml, u"min_instruments"), signal_rate = ligolw_param.get_pyvalue(xml, u"signal_rate"), delta_t = ligolw_param.get_pyvalue(xml, u"delta_t") ) + + # reconstruct the BinnedArray objects + def reconstruct(xml, prefix, target_dict): + for name in [elem.Name.split(u":")[1] for elem in xml.childNodes if elem.Name.startswith(u"%s:" % prefix)]: + target_dict[str(name)] = rate.BinnedArray.from_xml(xml, u"%s:%s" % (prefix, name)) + reconstruct(xml, u"zero_lag", self.zero_lag_rates) + reconstruct(xml, u"zero_lag_pdf", self.zero_lag_pdf) + reconstruct(xml, u"background", self.background_rates) + reconstruct(xml, u"background_pdf", self.background_pdf) + reconstruct(xml, u"injection", self.injection_rates) + reconstruct(xml, u"injection_pdf", self.injection_pdf) + + # reconstruct horizon history self.horizon_history = horizonhistory.HorizonHistories.from_xml(xml, name) + + # + # rebuild interpolators + # + + self._rebuild_interpolators() + + # + # done + # + return self def to_xml(self, name): - xml = super(ThincaCoincParamsDistributions, self).to_xml(name) + """ + Serialize this CoincParamsDistributions object to an XML + fragment and return the root element of the resulting XML + tree. The .process_id attribute of process will be + recorded in the serialized XML, and the object will be + given the name name. + """ + xml = ligolw.LIGO_LW({u"Name": u"%s:%s" % (name, self.ligo_lw_name_suffix)}) + xml.appendChild(ligolw_param.Param.from_pyvalue(u"process_id", self.process_id)) + def store(xml, prefix, source_dict): + for name, binnedarray in sorted(source_dict.items()): + xml.appendChild(binnedarray.to_xml(u"%s:%s" % (prefix, name))) + store(xml, u"zero_lag", self.zero_lag_rates) + store(xml, u"zero_lag_pdf", self.zero_lag_pdf) + store(xml, u"background", self.background_rates) + store(xml, u"background_pdf", self.background_pdf) + store(xml, u"injection", self.injection_rates) + store(xml, u"injection_pdf", self.injection_pdf) + xml.appendChild(ligolw_param.Param.from_pyvalue(u"instruments", lsctables.ifos_from_instrument_set(self.instruments))) xml.appendChild(ligolw_param.Param.from_pyvalue(u"min_instruments", self.min_instruments)) xml.appendChild(ligolw_param.Param.from_pyvalue(u"signal_rate", self.signal_rate)) @@ -781,9 +1069,6 @@ def binned_log_likelihood_ratio_rates_from_samples_wrapper(queue, signal_rates, # Class to compute ranking statistic PDFs for background-like and # signal-like populations # -# FIXME: this is really close to just being another subclass of -# CoincParamsDistributions. consider the wisdom of rewriting it to be such -# class RankingData(object): @@ -1008,9 +1293,9 @@ WHERE progressbar.increment() def __iadd__(self, other): - snglcoinc.CoincParamsDistributions.addbinnedarrays(self.background_likelihood_rates, other.background_likelihood_rates, self.background_likelihood_pdfs, other.background_likelihood_pdfs) - snglcoinc.CoincParamsDistributions.addbinnedarrays(self.signal_likelihood_rates, other.signal_likelihood_rates, self.signal_likelihood_pdfs, other.signal_likelihood_pdfs) - snglcoinc.CoincParamsDistributions.addbinnedarrays(self.zero_lag_likelihood_rates, other.zero_lag_likelihood_rates, self.zero_lag_likelihood_pdfs, other.zero_lag_likelihood_pdfs) + ThincaCoincParamsDistributions.addbinnedarrays(self.background_likelihood_rates, other.background_likelihood_rates, self.background_likelihood_pdfs, other.background_likelihood_pdfs) + ThincaCoincParamsDistributions.addbinnedarrays(self.signal_likelihood_rates, other.signal_likelihood_rates, self.signal_likelihood_pdfs, other.signal_likelihood_pdfs) + ThincaCoincParamsDistributions.addbinnedarrays(self.zero_lag_likelihood_rates, other.zero_lag_likelihood_rates, self.zero_lag_likelihood_pdfs, other.zero_lag_likelihood_pdfs) return self @classmethod -- GitLab