From ffba89f3b705ae8270ac4c2fee2d6a6070ce85ea Mon Sep 17 00:00:00 2001
From: Daichi Tsuna <daichi.tsuna@ligo.org>
Date: Thu, 29 Aug 2019 22:59:19 -0700
Subject: [PATCH] porting cosmic string ranking codes to gstlal-burst

---
 .../python/string/string_extrinsics.py      | 141 ++++
 gstlal-burst/python/string/string_lr_far.py | 626 ++++++++++++++++++
 gstlal-burst/python/string/stringutils.py   | 505 ++++++++++++++
 3 files changed, 1272 insertions(+)
 create mode 100644 gstlal-burst/python/string/string_extrinsics.py
 create mode 100644 gstlal-burst/python/string/string_lr_far.py
 create mode 100644 gstlal-burst/python/string/stringutils.py

diff --git a/gstlal-burst/python/string/string_extrinsics.py b/gstlal-burst/python/string/string_extrinsics.py
new file mode 100644
index 0000000000..b647251c04
--- /dev/null
+++ b/gstlal-burst/python/string/string_extrinsics.py
@@ -0,0 +1,141 @@
# Copyright (C) 2016,2017 Kipp Cannon
# Copyright (C) 2011--2014 Kipp Cannon, Chad Hanna, Drew Keppel
# Copyright (C) 2013 Jacob Peoples
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.


#
# =============================================================================
#
#                                   Preamble
#
# =============================================================================
#


import numpy

from ligo.lw import ligolw
from ligo.lw import array as ligolw_array
from glue.text_progress_bar import ProgressBar
from gstlal import stats as gstlalstats
from lal import rate


#
# =============================================================================
#
#                               SNR, \chi^2 PDF
#
# =============================================================================
#


class NumeratorSNRCHIPDF(rate.BinnedLnPDF):
	"""
	Reports ln P(chi^2/rho^2 | rho, signal)
	"""
	# NOTE: by happy coincidence numpy's broadcasting rules allow the
	# inherited .at_centres() method to be used as-is
	def __init__(self, *args, **kwargs):
		super(rate.BinnedLnPDF, self).__init__(*args, **kwargs)
		self.volume = self.bins[1].upper() - self.bins[1].lower()
		# FIXME: instead of .shape and repeat() use .broadcast_to()
		# when we can rely on numpy >= 1.8
		#self.volume = numpy.broadcast_to(self.volume, self.bins.shape)
		self.volume.shape = (1, len(self.volume))
		self.volume = numpy.repeat(self.volume, len(self.bins[0]), 0)
		self.norm = numpy.zeros((len(self.bins[0]), 1))

	def __getitem__(self, coords):
		return numpy.log(super(rate.BinnedLnPDF, self).__getitem__(coords)) - self.norm[self.bins(*coords)[0]]

	def marginalize(self, *args, **kwargs):
		raise NotImplementedError

	def __iadd__(self, other):
		# the total count is meaningless, it serves only to set the
		# scale by which the density estimation kernel chooses its
		# size, so we preserve the count across this operation.
		# if the two arguments have different counts, use the
		# geometric mean, unless one of the two is 0, in which
		# case don't alter the total count
		self_count, other_count = self.array.sum(), other.array.sum()
		super(rate.BinnedLnPDF, self).__iadd__(other)
		if self_count and other_count:
			self.array *= numpy.exp((numpy.log(self_count) + numpy.log(other_count)) / 2.) / self.array.sum()
		self.norm = numpy.log(numpy.exp(self.norm) + numpy.exp(other.norm))
		return self

	def normalize(self):
		# replace the vector with a new one so that we don't
		# interfere with any copies that might have been made
		with numpy.errstate(divide = "ignore"):
			self.norm = numpy.log(self.array.sum(axis = 1))
		self.norm.shape = (len(self.norm), 1)

	@staticmethod
	def add_signal_model(lnpdf, n, prefactors_range, df = 100, inv_snr_pow = 4., snr_min = 3.5, progressbar = None):
		if df <= 0.:
			raise ValueError("require df > 0: %s" % repr(df))
		pfs = numpy.linspace(prefactors_range[0], prefactors_range[1], 100)
		if progressbar is not None:
			progressbar.max = len(pfs)

		# FIXME: except for the low-SNR cut, the slicing is done
		# to work around various overflow and loss-of-precision
		# issues in the extreme parts of the domain of definition.
		# it would be nice to identify the causes of these and
		# either fix them or ignore them one-by-one with a comment
		# explaining why it's OK to ignore the ones being ignored.
		# for example, computing snrchi2 by exponentiating the sum
		# of the logs of the terms might permit its evaluation
		# everywhere on the domain.  can ncx2pdf() be made to work
		# everywhere?
		snrindices, rcossindices = lnpdf.bins[snr_min:1e10, 1e-10:1e10]
		snr, dsnr = lnpdf.bins[0].centres()[snrindices], lnpdf.bins[0].upper()[snrindices] - lnpdf.bins[0].lower()[snrindices]
		rcoss, drcoss = lnpdf.bins[1].centres()[rcossindices], lnpdf.bins[1].upper()[rcossindices] - lnpdf.bins[1].lower()[rcossindices]

		snr2 = snr**2.
		snrchi2 = numpy.outer(snr2, rcoss) * df

		arr = numpy.zeros_like(lnpdf.array)
		for pf in pfs:
			if progressbar is not None:
				progressbar.increment()
			arr[snrindices, rcossindices] += gstlalstats.ncx2pdf(snrchi2, df, numpy.array([pf * snr2]).T)

		# convert to counts by multiplying by bin volume, and also
		# multiply by an SNR power law
		arr[snrindices, rcossindices] *= numpy.outer(dsnr / snr**inv_snr_pow, drcoss)

		# normalize to a total count of n
		arr *= n / arr.sum()

		# add to lnpdf
		lnpdf.array += arr

	def to_xml(self, *args, **kwargs):
		elem = super(rate.BinnedLnPDF, self).to_xml(*args, **kwargs)
		elem.appendChild(ligolw_array.Array.build("norm", self.norm))
		return elem

	@classmethod
	def from_xml(cls, xml, name):
		xml = cls.get_xml_root(xml, name)
		self = super(rate.BinnedLnPDF, cls).from_xml(xml, name)
		self.norm = ligolw_array.get_array(xml, "norm").array
		return self
diff --git a/gstlal-burst/python/string/string_lr_far.py b/gstlal-burst/python/string/string_lr_far.py
new file mode 100644
index 0000000000..c4d7cb679c
--- /dev/null
+++ b/gstlal-burst/python/string/string_lr_far.py
@@ -0,0 +1,626 @@
####################
# Modules for calculating and storing the likelihood ratio densities and
# FAPFARs for the cosmic string search.
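#
# A rough usage sketch (illustrative only; the driver names used here are
# hypothetical, the real call sites live in the search's pipeline scripts):
#
#	rankingstat = stringutils.StringCoincParamsDistributions(instruments = ("H1", "L1"), snr_threshold = 4.0)
#	... populate the densities with triggers, then rankingstat.finish() ...
#	rankingstatpdf = string_lr_far.RankingStatPDF(rankingstat)
#	fapfar = string_lr_far.FAPFAR(rankingstatpdf, livetime)
#	far = fapfar.far_from_rank(ln_likelihood_ratio)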


#
# =============================================================================
#
#                                   Preamble
#
# =============================================================================
#


from __future__ import print_function

try:
	from fpconst import NegInf
except ImportError:
	NegInf = float("-inf")
import numpy
import random
import sys

from scipy import interpolate

from lal import rate
from lalburst import snglcoinc

from ligo.lw import ligolw
from ligo.lw import lsctables
from ligo.lw import param as ligolw_param
from ligo.lw import utils as ligolw_utils

from gstlal import stats as gstlalstats
# FIXME: confirm the correct home of the trigger-rate machinery for this
# port; gstlal-inspiral provides it as gstlal.stats.trigger_rate
from gstlal.stats import trigger_rate
from gstlal import string_extrinsics


#
# =============================================================================
#
#                         Likelihood ratio densities
#
# =============================================================================
#


#
# Numerator & denominator base class
#


class LnLRDensity(snglcoinc.LnLRDensity):
	# SNR, chi^2 binning definition
	snr2_chi2_binning = rate.NDBins((rate.ATanLogarithmicBins(10, 1e7, 801), rate.ATanLogarithmicBins(.1, 1e4, 801)))

	def __init__(self, instruments, snr_threshold, min_instruments = 2):
		# check input
		if min_instruments < 2:
			raise ValueError("min_instruments=%d must be >=2" % min_instruments)
		if min_instruments > len(instruments):
			raise ValueError("not enough instruments (%s) to satisfy min_instruments=%d" % (", ".join(sorted(instruments)), min_instruments))
		if snr_threshold <= 0.:
			raise ValueError("SNR threshold = %g must be >0" % snr_threshold)

		self.instruments = frozenset(instruments)
		self.min_instruments = min_instruments
		self.snr_threshold = snr_threshold
		self.densities = {}
		for instrument in instruments:
			self.densities["%s_snr2_chi2" % instrument] = rate.BinnedLnPDF(self.snr2_chi2_binning)

	def __call__(self, **params):
		try:
			interps = self.interps
		except AttributeError:
			self.mkinterps()
			interps = self.interps
		return sum(interps[param](*value) for param, value in params.items()) if params else NegInf

	def __iadd__(self, other):
		if type(self) != type(other) or set(self.densities) != set(other.densities):
			raise TypeError("cannot add %s and %s" % (type(self), type(other)))
		for key, lnpdf in self.densities.items():
			lnpdf += other.densities[key]
		try:
			del self.interps
		except AttributeError:
			pass
		return self

	def increment(self, weight = 1.0, **params):
		for instrument, snr in params["snrs"].items():
			self.densities["%s_snr2_chi2" % instrument].count[snr, params["chi2s_over_snr2s"][instrument]] += weight

	def copy(self):
		new = type(self)(self.instruments, self.snr_threshold, self.min_instruments)
		for key, lnpdf in self.densities.items():
			new.densities[key] = lnpdf.copy()
		return new

	def mkinterps(self):
		self.interps = dict((key, lnpdf.mkinterp()) for key, lnpdf in self.densities.items())

	def finish(self):
		for key, lnpdf in self.densities.items():
			if key.endswith("_snr2_chi2"):
				rate.filter_array(lnpdf.array, rate.gaussian_window(11, 11, sigma = 20))
			else:
				# shouldn't get here
				raise Exception
			lnpdf.normalize()
		self.mkinterps()

	def to_xml(self, name):
		xml = super(LnLRDensity, self).to_xml(name)
		instruments = set(key.split("_", 1)[0] for key in self.densities if key.endswith("_snr2_chi2"))
		xml.appendChild(ligolw_param.Param.from_pyvalue(u"instruments", lsctables.instrumentsproperty.set(instruments)))
		xml.appendChild(ligolw_param.Param.from_pyvalue(u"min_instruments", self.min_instruments))
		xml.appendChild(ligolw_param.Param.from_pyvalue(u"snr_threshold", self.snr_threshold))
		for key, lnpdf in self.densities.items():
			xml.appendChild(lnpdf.to_xml(key))
		return xml

	@classmethod
	def from_xml(cls, xml, name):
		xml = cls.get_xml_root(xml, name)
		self = cls(
			instruments = lsctables.instrumentsproperty.get(ligolw_param.get_pyvalue(xml, u"instruments")),
			min_instruments = ligolw_param.get_pyvalue(xml, u"min_instruments"),
			snr_threshold = ligolw_param.get_pyvalue(xml, u"snr_threshold")
		)
		for key in self.densities:
			self.densities[key] = rate.BinnedLnPDF.from_xml(xml, key)
		return self


#
# Likelihood ratio density (numerator)
#


class LnSignalDensity(LnLRDensity):
	# FIXME: this class does not yet override __call__(), so it cannot
	# yet be evaluated with the (snrs, chi2s_over_snr2s) parameters
	# accepted by LnNoiseDensity
	def add_signal_model(self, prefactors_range = (0.001, 0.30), inv_snr_pow = 4.):
		# normalize to 10 million signals.  this count makes the
		# density estimation code choose a suitable kernel size.
		# the signal model is added to each instrument's (snr^2,
		# chi^2/snr^2) PDF separately (the densities are keyed by
		# instrument)
		for key, lnpdf in self.densities.items():
			if key.endswith("_snr2_chi2"):
				string_extrinsics.NumeratorSNRCHIPDF.add_signal_model(lnpdf, 10000000., prefactors_range, inv_snr_pow = inv_snr_pow, snr_min = self.snr_threshold)
				lnpdf.normalize()

	def to_xml(self, name):
		xml = super(LnSignalDensity, self).to_xml(name)
		return xml

	@classmethod
	def from_xml(cls, xml, name):
		xml = cls.get_xml_root(xml, name)
		self = super(LnSignalDensity, cls).from_xml(xml, name)
		return self


#
# Likelihood ratio density (denominator)
#


class LnNoiseDensity(LnLRDensity):
	def __init__(self, *args, **kwargs):
		super(LnNoiseDensity, self).__init__(*args, **kwargs)
		# trigger rate data; must be populated by the calling code
		# before this object can be written to XML
		self.triggerrates = trigger_rate.triggerrates()

	def __call__(self, snrs, chi2s_over_snr2s):
		lnP = 0.0

		# evaluate P(snrs, chi2s | noise).  SNR and chi^2 values
		# are assumed to be independent between detectors, and
		# independent of the detectors' sensitivities.
		try:
			interps = self.interps
		except AttributeError:
			self.mkinterps()
			interps = self.interps
		return lnP + sum(interps["%s_snr2_chi2" % instrument](snrs[instrument], chi2_over_snr2) for instrument, chi2_over_snr2 in chi2s_over_snr2s.items())

	def candidate_count_model(self):
		# FIXME: not yet ported
		pass

	def random_params(self):
		"""
		Generator that yields an endless sequence of randomly
		generated candidate parameters.  NOTE:  the parameters will
		be within the domain of the respective binnings but are not
		drawn from the PDF stored in those binnings --- this is not
		an MCMC style sampler.  Each value in the sequence is a
		three-element tuple.  The first two elements of each tuple
		provide the *args and **kwargs values for calls to this PDF
		or the numerator PDF or the ranking statistic object.  The
		final element is the natural logarithm (up to an arbitrary
		constant) of the PDF from which the parameters have been
		drawn, evaluated at the point described by the *args and
		**kwargs.

		See also:

		random_sim_params()

		The sequence is suitable for input to the .ln_lr_samples()
		log likelihood ratio generator.
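
		For illustration (hypothetical names, not a doctest), a
		consumer of this generator might do:

		for args, kwargs, ln_P in noise_density.random_params():
			# evaluate a PDF at the sample point, weighting
			# the sample by exp(ln_pdf(*args, **kwargs) - ln_P)
			...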
+ """ + snr_slope = 0.8 / len(self.instruments)**3 + + snr2chi2gens = dict((instrument, iter(self.densities["%s_snr2_chi2" % instrument].bins.randcoord(ns = (snr_slope, 1.), domain = (slice(self.snr_threshold, None), slice(None, None)))).next) for instrument in self.instruments) + def nCk(n, k): + return math.factorial(n) // math.factorial(k) // math.factorial(n - k) + while 1: + instruments = tuple(instrument for instrument, rate in rates.items() if rate > 0) + assert len(instruments) < self.min_instruments, "number of instruments smaller than min_instruments" + seq = sum((snr2chi2gens[instrument]() for instrument in instruments), ()) + # set params + for instrument, value in zip(instruments, seq[0::2]): + params["%s_snr2_chi2" % instrument] = (value[0], value[1]) + yield (), params, seq[1::2] + + def to_xml(self, name): + xml = super(LnNoiseDensity, self).to_xml(name) + xml.appendChild(self.triggerrates.to_xml(u"triggerrates")) + return xml + + @classmethod + def from_xml(cls, xml, name): + xml = cls.get_xml_root(xml, name) + self = super(LnNoiseDensity, cls).from_xml(xml, name) + self.triggerrates = trigger_rate.triggerrates.from_xml(xml, u"triggerrates") + self.triggerrates.coalesce() # just in case + return self + + +# +# ============================================================================= +# +# Ranking Statistic PDF +# +# ============================================================================= +# + + +# +# Class to compute ranking statistic PDFs for background-like and +# signal-like populations +# + +class RankingStatPDF(object): + ligo_lw_name_suffix = u"gstlal_string_rankingstatpdf" + + @staticmethod + def density_estimate(lnpdf, name, kernel = rate.gaussian_window(4.)): + """ + For internal use only. + """ + assert not numpy.isnan(lnpdf.array).any(), "%s log likelihood ratio PDF contain NaNs" % name + rate.filter_array(lnpdf.array, kernel) + + @staticmethod + def binned_log_likelihood_ratio_rates_from_samples_wrapper(signal_lr_lnpdf, noise_lr_lnpdf, samples, nsamples): + """ + For internal use only. + """ + try: + # want the forked processes to use different random + # number sequences, so we re-seed Python and + # numpy's random number generators here in the + # wrapper in the hope that that takes care of it + random.seed() + numpy.random.seed() + binned_log_likelihood_ratio_rates_from_samples(signal_lr_lnpdf, noise_lr_lnpdf, samples, nsamples) + return signal_lr_lnpdf.array, noise_lr_lnpdf.array + except: + raise + + def __init__(self, rankingstat, signal_noise_pdfs = None, nsamples = 2**24, verbose = False): + # + # bailout used by .from_xml() class method to get an + # uninitialized instance + # + + if rankingstat is None: + return + + # + # initialize binnings + # + + self.noise_lr_lnpdf = rate.BinnedLnPDF(rate.NDBins((rate.ATanBins(0., 110., 6000),))) + self.signal_lr_lnpdf = rate.BinnedLnPDF(rate.NDBins((rate.ATanBins(0., 110., 6000),))) + self.zero_lag_lnpdf = rate.BinnedLnPDF(rate.NDBins((rate.ATanBins(0., 110., 6000),))) + + # + # bailout used by codes that want all-zeros histograms + # + + if not nsamples: + return + + # + # run importance-weighted random sampling to populate binnings. 
+ # + + if signal_noise_pdfs is None: + signal_noise_pdfs = rankingstat + + self.signal_lr_lnpdf.array, self.noise_lr_lnpdf.array = self.binned_log_likelihood_ratio_rates_from_samples_wrapper( + self.signal_lr_lnpdf, + self.noise_lr_lnpdf, + rankingstat.ln_lr_samples(rankingstat.denominator.random_params(), signal_noise_pdfs), + nsamples = nsamples) + + if verbose: + print >> sys.stderr, "done computing ranking statistic PDFs" + + # + # apply density estimation kernels to counts + # + + self.density_estimate(self.noise_lr_lnpdf, "noise model") + self.density_estimate(self.signal_lr_lnpdf, "signal model") + + # + # set the total sample count in the noise and signal + # ranking statistic histogram equal to the total expected + # count of the respective events from the experiment. This + # information is required so that when adding ranking + # statistic PDFs in our .__iadd__() method they are + # combined with the correct relative weights, so that + # .__iadd__() has the effect of marginalizing the + # distribution over the experients being combined. + # + + self.noise_lr_lnpdf.array /= self.noise_lr_lnpdf.array.sum() + self.noise_lr_lnpdf.normalize() + self.signal_lr_lnpdf.array /= self.signal_lr_lnpdf_array.sum() + self.signal_lr_lnpdf.normalize() + + + def copy(self): + new = self.__class__(None) + new.noise_lr_lnpdf = self.noise_lr_lnpdf.copy() + new.signal_lr_lnpdf = self.signal_lr_lnpdf.copy() + new.zero_lag_lr_lnpdf = self.zero_lag_lr_lnpdf.copy() + return new + + def collect_zero_lag_rates(self, connection, coinc_def_id): + for ln_likelihood_ratio in connection.cursor().execute(""" +SELECT + likelihood, +FROM + coinc_event +WHERE + coinc_def_id == ? + AND NOT EXISTS ( + SELECT + * + FROM + time_slide + WHERE + time_slide.time_slide_id == coinc_event.time_slide_id + AND time_slide.offset != 0 + ) +""", (coinc_def_id,)): + assert ln_likelihood_ratio is not None, "null likelihood ratio encountered. probably coincs have not been ranked" + self.zero_lag_lr_lnpdf.count[ln_likelihood_ratio,] += 1. + self.zero_lag_lr_lnpdf.normalize() + + + def density_estimate_zero_lag_rates(self): + # apply density estimation preserving total count, then + # normalize PDF + count_before = self.zero_lag_lr_lnpdf.array.sum() + # FIXME: should .normalize() be able to handle NaN? + if count_before: + self.density_estimate(self.zero_lag_lr_lnpdf, "zero lag") + self.zero_lag_lr_lnpdf.array *= count_before / self.zero_lag_lr_lnpdf.array.sum() + self.zero_lag_lr_lnpdf.normalize() + + + def __iadd__(self, other): + self.noise_lr_lnpdf += other.noise_lr_lnpdf + self.noise_lr_lnpdf.normalize() + self.signal_lr_lnpdf += other.signal_lr_lnpdf + self.signal_lr_lnpdf.normalize() + self.zero_lag_lr_lnpdf += other.zero_lag_lr_lnpdf + self.zero_lag_lr_lnpdf.normalize() + return self + + @classmethod + def get_xml_root(cls, xml, name): + """ + Sub-classes can use this in their overrides of the + .from_xml() method to find the root element of the XML + serialization. 
+ """ + name = u"%s:%s" % (name, cls.ligo_lw_name_suffix) + xml = [elem for elem in xml.getElementsByTagName(ligolw.LIGO_LW.tagName) if elem.hasAttribute(u"Name") and elem.Name == name] + if len(xml) != 1: + raise ValueError("XML tree must contain exactly one %s element named %s" % (ligolw.LIGO_LW.tagName, name)) + return xml[0] + + + @classmethod + def from_xml(cls, xml, name): + # find the root of the XML tree containing the + # serialization of this object + xml = xls.get_xml_root(xml, name) + # create a mostly uninitialized instance + self = cls(None) + # popuate from XML + self.noise_lr_lnpdf = rate.BinnedLnPDF.from_xml(xml, u"noise_lr_lnpdf") + self.signal_lr_lnpdf = rate.BinnedLnPDF.from_xml(xml, u"signal_lr_lnpdf") + self.zero_lag_lr_lnpdf = rate.BinnedLnPDF.from_xml(xml, u"zero_lag_lr_lnpdf") + return self + + def to_xml(self, name): + # do not allow ourselves to be written to disk without our + # PDF's internal normalization metadata being up to date + self.noise_lr_lnpdf.normalize() + self.signal_lr_lnpdf.normalize() + self.zero_lag_lr_lnpdf.normalize() + + xml = ligolw.LIGO_LW({u"Name": u"%s:%s" % (name, self.ligo_lw_name_suffix)}) + xml.appendChild(self.noise_lr_lnpdf.to_xml(u"noise_lr_lnpdf")) + xml.appendChild(self.signal_lr_lnpdf.to_xml(u"signal_lr_lnpdf")) + xml.appendChild(self.zero_lag_lr_lnpdf.to_xml(u"zero_lag_lr_lnpdf")) + return xml + + +# +# ============================================================================= +# +# False alarm rates/probabilities +# +# ============================================================================= +# + + +# +# Class to compute false-alarm probabilities and false-alarm rates from +# ranking statistic PDFs +# + +class FAPFAR(object): + def __init__(self, rankingstatpdf, livetime): + # input checks + if not rankingstatpdf.zero_lag_lr_lnpdf.array.any(): + raise ValueError("RankingStatPDF's zero-lag counts are all zero") + + self.livetime = livetime + + # set the rate normalization LR threshold to the mode of + # the zero-lag LR distribution. + zl = rankingstat.zero_lag_lr_lnpdf.copy() + rate_normalization_lr_threshold, = zl.argmax() + + # record trials factor, with safety checks + counts = rankingstatpdf.zero_lag_lr_lnpdf.count() + assert not numpy.isnan(counts.array).any(), "zero lag log likelihood ratio counts contain NaNs" + assert (counts.array >= 0.).all(), "zero lag log likelihood ratio rates contain negative values" + self.count_above_threshold = counts[rate_normalization_lr_threshold:,].sum() + + # get noise model ranking stat values and event counts from + # bins + threshold_index = rankingstatpdf.noise_lr_lnpdf.bins[0][rate_normalization_lr_threshold] + ranks = rankingstatpdf.noise_lr_lnpdf.bins[0].lower()[threshold_index:] + counts = rankingstatpdf.noise_lr_lnpdf.array[threshold_index:] + assert not numpy.isnan(counts).any(), "background log likelihood ratio rates contain NaNs" + assert (counts >= 0.).all(), "background log likelihood ratio rates contain negative values" + + # complementary cumulative distribution function + ccdf = counts[::-1].cumsum()[::-1] + ccdf /= ccdf[0] + + # ccdf is P(ranking statistic > threshold | a candidate). + # we need P(rankins statistic > threshold), i.e. need to + # correct for the possibility that no candidate is present. + # specifically, the ccdf needs to =1-1/e at the candidate + # identification threshold, and cdf=1/e at the candidate + # threshold, in order for FAR(threshold) * livetime to + # equal the actual observed number of candidates. 
		ccdf = gstlalstats.poisson_p_not_0(ccdf)

		# safety checks
		assert not numpy.isnan(ranks).any(), "log likelihood ratio co-ordinates contain NaNs"
		assert not numpy.isinf(ranks).any(), "log likelihood ratio co-ordinates are not all finite"
		assert not numpy.isnan(ccdf).any(), "log likelihood ratio CCDF contains NaNs"
		assert ((0. <= ccdf) & (ccdf <= 1.)).all(), "log likelihood ratio CCDF failed to be normalized"

		# build interpolator
		self.ccdf_interpolator = interpolate.interp1d(ranks, ccdf)

		# record min and max ranks so we know which end of the ccdf
		# to use when we're out of bounds
		self.minrank = ranks[0]
		self.maxrank = ranks[-1]

	@gstlalstats.assert_probability
	def ccdf_from_rank(self, rank):
		return self.ccdf_interpolator(numpy.clip(rank, self.minrank, self.maxrank))

	def fap_from_rank(self, rank):
		# implements equation (8) from Phys. Rev. D 88, 024025.
		# arXiv:1209.0718
		return gstlalstats.fap_after_trials(self.ccdf_from_rank(rank), self.count_above_threshold)

	def far_from_rank(self, rank):
		# implements equation (B4) of Phys. Rev. D 88, 024025.
		# arXiv:1209.0718.  the return value is divided by T to
		# convert events/experiment to events/second.  "tdp" =
		# true-dismissal probability = 1 - single-event FAP, the
		# integral in equation (B4)
		log_tdp = numpy.log1p(-self.ccdf_from_rank(rank))
		return self.count_above_threshold * -log_tdp / self.livetime

	# NOTE: do we need rank_from_fap and rank_from_far?

	def assign_fapfars(self, connection):
		# assign false-alarm probabilities and false-alarm rates.
		# FIXME: we should decide whether we use coinc_burst or
		# multi_burst, whichever is less work
		def as_float(f):
			def g(x):
				return float(f(x))
			return g
		connection.create_function("fap_from_rankingstat", 1, as_float(self.fap_from_rank))
		connection.create_function("far_from_rankingstat", 1, as_float(self.far_from_rank))
		connection.cursor().execute("""
UPDATE
	coinc_burst
SET
	false_alarm_probability = (
		SELECT
			fap_from_rankingstat(coinc_event.likelihood)
		FROM
			coinc_event
		WHERE
			coinc_event.coinc_event_id == coinc_burst.coinc_event_id
	),
	false_alarm_rate = (
		SELECT
			far_from_rankingstat(coinc_event.likelihood)
		FROM
			coinc_event
		WHERE
			coinc_event.coinc_event_id == coinc_burst.coinc_event_id
	)
""")


#
# =============================================================================
#
#                                     I/O
#
# =============================================================================
#


def gen_likelihood_control_doc(xmldoc, rankingstat, rankingstatpdf):
	name = u"gstlal_string_likelihood"
	node = xmldoc.childNodes[-1]
	assert node.tagName == ligolw.LIGO_LW.tagName

	if rankingstat is not None:
		node.appendChild(rankingstat.to_xml(name))

	if rankingstatpdf is not None:
		node.appendChild(rankingstatpdf.to_xml(name))

	return xmldoc


def parse_likelihood_control_doc(xmldoc):
	# FIXME: "RankingStat" here refers to the ranking statistic class
	# for this search (stringutils.StringCoincParamsDistributions);
	# it is not yet defined in or imported into this module
	name = u"gstlal_string_likelihood"
	try:
		rankingstat = RankingStat.from_xml(xmldoc, name)
	except ValueError:
		rankingstat = None
	try:
		rankingstatpdf = RankingStatPDF.from_xml(xmldoc, name)
	except ValueError:
		rankingstatpdf = None
	if rankingstat is None and rankingstatpdf is None:
		raise ValueError("document does not contain likelihood ratio data")
	return rankingstat, rankingstatpdf


def marginalize_pdf_urls(urls, which, ignore_missing_files = False, verbose = False):
	"""
	Implements marginalization of PDFs in ranking statistic data files.
	The marginalization is over the degree of freedom represented by
	the file collection.  One or both of the candidate parameter PDFs
	and ranking statistic PDFs can be processed, with errors thrown if
	one or more files is missing the required component.
	"""
	name = u"gstlal_string_likelihood"
	data = None
	for n, url in enumerate(urls, start = 1):
		#
		# load input document
		#

		if verbose:
			print("%d/%d:" % (n, len(urls)), end=' ', file=sys.stderr)
		try:
			xmldoc = ligolw_utils.load_url(url, verbose = verbose, contenthandler = RankingStat.LIGOLWContentHandler)
		except IOError:
			# IOError is raised when an on-disk file is
			# missing.  urllib2.URLError is raised when a URL
			# cannot be loaded, but this is subclassed from
			# IOError so IOError will catch those, too.
			if not ignore_missing_files:
				raise
			if verbose:
				print("Could not load \"%s\" ... skipping as requested" % url, file=sys.stderr)
			continue

		#
		# extract PDF objects and compute the weighted sum of the
		# ranking data PDFs
		#

		if which == "RankingStat":
			if data is None:
				data = RankingStat.from_xml(xmldoc, name)
			else:
				data += RankingStat.from_xml(xmldoc, name)
		elif which == "RankingStatPDF":
			if data is None:
				data = RankingStatPDF.from_xml(xmldoc, name)
			else:
				data += RankingStatPDF.from_xml(xmldoc, name)
		else:
			raise ValueError("invalid which (%s)" % which)
		xmldoc.unlink()

	return data
diff --git a/gstlal-burst/python/string/stringutils.py b/gstlal-burst/python/string/stringutils.py
new file mode 100644
index 0000000000..a74f4117f6
--- /dev/null
+++ b/gstlal-burst/python/string/stringutils.py
@@ -0,0 +1,505 @@
# Copyright (C) 2009--2018 Kipp Cannon
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.


#
# =============================================================================
#
#                                   Preamble
#
# =============================================================================
#


from __future__ import print_function


try:
	from fpconst import NegInf
except ImportError:
	NegInf = float("-inf")
import itertools
import math
import numpy
import scipy.stats
import sys


from ligo.lw import ligolw
from ligo.lw import array as ligolw_array
from ligo.lw import param as ligolw_param
from ligo.lw import lsctables
from ligo.lw import utils as ligolw_utils
from ligo.lw.utils import process as ligolw_process
import lal
from lal import rate
from ligo.segments import utils as segmentsUtils
from lalburst import offsetvector
from lalburst import snglcoinc

from gstlal import string_lr_far


__author__ = "Kipp Cannon <kipp.cannon@ligo.org>"
from .git_version import date as __date__
from .git_version import version as __version__


#
# =============================================================================
#
#                             Likelihood Machinery
#
# =============================================================================
#


#
# Make a look-up table of time-of-arrival triangulators
#


def triangulators(timing_uncertainties):
	"""
	Return a dictionary of snglcoinc.TOATriangulator objects
	initialized for a variety of instrument combinations.
	timing_uncertainties is a dictionary of instrument->$\\Delta t$
	pairs.  The return value is a dictionary of (instrument
	tuple)->TOATriangulator mappings.  The instrument names in each
	tuple are sorted in alphabetical order, and the triangulators are
	constructed with the instruments in that order (see the
	documentation for snglcoinc.TOATriangulator for more information).

	Example:

	>>> x = triangulators({"H1": 0.005, "L1": 0.005, "V1": 0.005})

	constructs a dictionary of triangulators for every combination of
	two or more instruments that can be constructed from those three.

	The program lalapps_string_plot_binj can be used to measure the
	timing uncertainties for the instruments in a search.
	"""
	allinstruments = sorted(timing_uncertainties.keys())

	triangulators = {}
	for n in range(2, len(allinstruments) + 1):
		for instruments in itertools.combinations(allinstruments, n):
			triangulators[instruments] = snglcoinc.TOATriangulator([lal.cached_detector_by_prefix[instrument].location for instrument in instruments], [timing_uncertainties[instrument] for instrument in instruments])

	return triangulators


#
# A binning for instrument combinations
#
# FIXME: we decided that the coherent and null stream naming convention
# would look like
#
#	H1H2:LSC-STRAIN_HPLUS, H1H2:LSC-STRAIN_HNULL
#
# and so on.  i.e., the +, x and null streams from a coherent network would
# be different channels from a single instrument whose name would be the
# mash-up of the names of the instruments in the network.  that is
# inconsistent with the "H1H2+", "H1H2-" shown here, so this needs to be
# fixed but I don't know how.  maybe it'll go away before it needs to be
# fixed.
#


class InstrumentBins(rate.HashableBins):
	"""
	Example:

	>>> x = InstrumentBins()
	>>> x[frozenset(("H1", "L1"))]
	55
	>>> x.centres()[55]
	frozenset(['H1', 'L1'])
	"""

	names = ("E0", "E1", "E2", "E3", "G1", "H1", "H2", "H1H2+", "H1H2-", "L1", "V1")

	def __init__(self, names = names):
		super(InstrumentBins, self).__init__(frozenset(combo) for n in range(len(names) + 1) for combo in itertools.combinations(names, n))

	# FIXME: hack to allow instrument binnings to be included as a
	# dimension in multi-dimensional PDFs by defining a volume for
	# them.  investigate more sensible ways to do this.  maybe NDBins
	# and BinnedDensity should understand the difference between
	# functional and parametric co-ordinates.
	def lower(self):
		return numpy.arange(0, len(self), dtype = "double")
	def upper(self):
		return numpy.arange(1, len(self) + 1, dtype = "double")

	xml_bins_name = u"instrumentbins"

# NOTE: side effect of importing this module:
rate.NDBins.xml_bins_name_mapping.update({
	InstrumentBins.xml_bins_name: InstrumentBins,
	InstrumentBins: InstrumentBins.xml_bins_name
})


#
# for ranking statistic
#


def kwarggeniter(d, min_instruments):
	d = tuple(sorted(d.items()))
	return map(dict, itertools.chain(*(itertools.combinations(d, i) for i in range(min_instruments, len(d) + 1))))


def kwarggen(snrs, chi2s_over_snr2s, min_instruments):
	for snrs, chi2s_over_snr2s in zip(
		kwarggeniter(snrs, min_instruments),
		kwarggeniter(chi2s_over_snr2s, min_instruments)
	):
		yield {
			"snrs": snrs,
			"chi2s_over_snr2s": chi2s_over_snr2s,
		}


#
# Parameter distributions
#


class StringCoincParamsDistributions(snglcoinc.LnLikelihoodRatioMixin):
	ligo_lw_name_suffix = u"stringcusp_coincparamsdistributions"

	@ligolw_array.use_in
	@ligolw_param.use_in
	@lsctables.use_in
	class LIGOLWContentHandler(ligolw.LIGOLWContentHandler):
		pass

	def __init__(self, instruments, snr_threshold, min_instruments = 2):
		self.triangulators = triangulators(dict.fromkeys(instruments, 8e-5))
		self.numerator = string_lr_far.LnSignalDensity(instruments = instruments, min_instruments = min_instruments, snr_threshold = snr_threshold)
		self.denominator = string_lr_far.LnNoiseDensity(instruments = instruments, min_instruments = min_instruments, snr_threshold = snr_threshold)
		self.candidates = string_lr_far.LnLRDensity(instruments = instruments, min_instruments = min_instruments, snr_threshold = snr_threshold)

	@property
	def min_instruments(self):
		return self.denominator.min_instruments

	def __call__(self, **kwargs):
		"""
		Evaluate the ranking statistic.
		"""
		# the full ln L ranking statistic is defined to be the
		# largest ln L over all allowed subsets of triggers,
		# i.e. it maximizes over combinations of 2 or more
		# instruments.
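		# for example (instrument names illustrative):  a
		# three-detector H1, L1, V1 candidate with min_instruments
		# = 2 is ranked by the largest of the four values
		# ln L(H1, L1), ln L(H1, V1), ln L(L1, V1) and
		# ln L(H1, L1, V1)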
		return max(super(StringCoincParamsDistributions, self).__call__(**kwargs) for kwargs in kwarggen(min_instruments = self.min_instruments, **kwargs))

	def __iadd__(self, other):
		if type(self) != type(other):
			raise TypeError(other)
		if set(self.triangulators.keys()) != set(other.triangulators.keys()):
			raise ValueError("incompatible instruments")
		self.numerator += other.numerator
		self.denominator += other.denominator
		self.candidates += other.candidates
		return self

	def copy(self):
		# construct an uninitialized instance, then populate it
		new = type(self).__new__(type(self))
		new.triangulators = self.triangulators	# share reference
		new.numerator = self.numerator.copy()
		new.denominator = self.denominator.copy()
		new.candidates = self.candidates.copy()
		return new

	def coinc_params(self, events, offsetvector):
		#
		# check for coincs that have been vetoed entirely
		#

		if len(events) < 2:
			return {}

		#
		# pick out the parameters for each event, sorting the
		# events by instrument name (the multi-instrument
		# parameters are defined for the instruments in this
		# order and the triangulators are constructed this way
		# too)
		#

		events = tuple(sorted(events, key = lambda event: event.ifo))

		return dict(
			snrs = dict((event.ifo, event.snr) for event in events),
			chi2s_over_snr2s = dict((event.ifo, event.chisq / event.chisq_dof / event.snr**2.) for event in events),
		)

	def ln_lr_from_triggers(self, events, offsetvector):
		return self(**self.coinc_params(events, offsetvector))

	def finish(self):
		self.numerator.finish()
		self.denominator.finish()
		self.candidates.finish()
		return self

	@classmethod
	def get_xml_root(cls, xml, name):
		name = u"%s:%s" % (name, cls.ligo_lw_name_suffix)
		xml = [elem for elem in xml.getElementsByTagName(ligolw.LIGO_LW.tagName) if elem.hasAttribute(u"Name") and elem.Name == name]
		if len(xml) != 1:
			raise ValueError("XML tree must contain exactly one %s element named %s" % (ligolw.LIGO_LW.tagName, name))
		return xml[0]

	@classmethod
	def from_xml(cls, xml, name):
		xml = cls.get_xml_root(xml, name)
		# construct an uninitialized instance, then populate it
		self = cls.__new__(cls)
		self.numerator = string_lr_far.LnSignalDensity.from_xml(xml, "numerator")
		self.denominator = string_lr_far.LnNoiseDensity.from_xml(xml, "denominator")
		self.candidates = string_lr_far.LnLRDensity.from_xml(xml, "candidates")
		instruments = self.candidates.instruments
		self.triangulators = triangulators(dict.fromkeys(instruments, 8e-5))
		return self

	def to_xml(self, name):
		xml = ligolw.LIGO_LW({u"Name": u"%s:%s" % (name, self.ligo_lw_name_suffix)})
		xml.appendChild(self.numerator.to_xml("numerator"))
		xml.appendChild(self.denominator.to_xml("denominator"))
		xml.appendChild(self.candidates.to_xml("candidates"))
		return xml


#
# I/O
#


def load_likelihood_data(filenames, verbose = False):
	coinc_params = None
	seglists = None
	for n, filename in enumerate(filenames, 1):
		if verbose:
			print("%d/%d:" % (n, len(filenames)), end=' ', file=sys.stderr)
		xmldoc = ligolw_utils.load_filename(filename, verbose = verbose, contenthandler = StringCoincParamsDistributions.LIGOLWContentHandler)
		this_coinc_params = StringCoincParamsDistributions.from_xml(xmldoc, u"string_cusp_likelihood")
		this_seglists = lsctables.SearchSummaryTable.get_table(xmldoc).get_out_segmentlistdict(lsctables.ProcessTable.get_table(xmldoc).get_ids_by_program(u"lalapps_string_meas_likelihood")).coalesce()
		xmldoc.unlink()
		if coinc_params is None:
			coinc_params = this_coinc_params
		else:
			coinc_params += this_coinc_params
		if seglists is None:
			seglists = this_seglists
		else:
			seglists |= this_seglists
	return coinc_params, seglists


#
# =============================================================================
#
#                                   Livetime
#
# =============================================================================
#


def time_slides_livetime(seglists, time_slides, min_instruments, verbose = False, clip = None):
	"""
	seglists is a segmentlistdict of times when each of a set of
	instruments were on, time_slides is a sequence of
	instrument-->offset dictionaries, each vector of offsets in the
	sequence is applied to the segmentlists and the total time during
	which at least min_instruments were on is summed and returned.  If
	clip is not None, after each offset vector is applied to seglists
	the result is intersected with clip before computing the livetime.
	If verbose is True then progress reports are printed to stderr.
	"""
	livetime = 0.0
	seglists = seglists.copy()	# don't modify original
	N = len(time_slides)
	if verbose:
		print("computing the live time for %d time slides:" % N, file=sys.stderr)
	for n, time_slide in enumerate(time_slides):
		if verbose:
			print("\t%.1f%%\r" % (100.0 * n / N), end=' ', file=sys.stderr)
		seglists.offsets.update(time_slide)
		if clip is None:
			livetime += float(abs(segmentsUtils.vote(seglists.values(), min_instruments)))
		else:
			livetime += float(abs(segmentsUtils.vote((seglists & clip).values(), min_instruments)))
	if verbose:
		print("\t100.0%", file=sys.stderr)
	return livetime


def time_slides_livetime_for_instrument_combo(seglists, time_slides, instruments, verbose = False, clip = None):
	"""
	like time_slides_livetime() except computes the time for which
	exactly the instruments given by the sequence instruments were on
	(and nothing else).
	"""
	livetime = 0.0
	# segments for instruments that must be on
	onseglists = seglists.copy(keys = instruments)
	# segments for instruments that must be off
	offseglists = seglists.copy(keys = set(seglists) - set(instruments))
	N = len(time_slides)
	if verbose:
		print("computing the live time for %s in %d time slides:" % (", ".join(instruments), N), file=sys.stderr)
	for n, time_slide in enumerate(time_slides):
		if verbose:
			print("\t%.1f%%\r" % (100.0 * n / N), end=' ', file=sys.stderr)
		onseglists.offsets.update(time_slide)
		offseglists.offsets.update(time_slide)
		if clip is None:
			livetime += float(abs(onseglists.intersection(onseglists.keys()) - offseglists.union(offseglists.keys())))
		else:
			livetime += float(abs((onseglists & clip).intersection(onseglists.keys()) - offseglists.union(offseglists.keys())))
	if verbose:
		print("\t100.0%", file=sys.stderr)
	return livetime


#
# =============================================================================
#
#                              Database Utilities
#
# =============================================================================
#


def create_recovered_ln_likelihood_ratio_table(connection, coinc_def_id):
	"""
	Create a temporary table named "recovered_ln_likelihood_ratio"
	containing two columns:  "simulation_id", the simulation_id of an
	injection, and "ln_likelihood_ratio", the highest log likelihood
	ratio at which that injection was recovered by a coincidence of
	type coinc_def_id.
+ """ + cursor = connection.cursor() + cursor.execute(""" +CREATE TEMPORARY TABLE recovered_ln_likelihood_ratio (simulation_id TEXT PRIMARY KEY, ln_likelihood_ratio REAL) + """) + cursor.execute(""" +INSERT OR REPLACE INTO + recovered_ln_likelihood_ratio +SELECT + sim_burst.simulation_id AS simulation_id, + MAX(coinc_event.likelihood) AS ln_likelihood_ratio +FROM + sim_burst + JOIN coinc_event_map AS a ON ( + a.table_name == "sim_burst" + AND a.event_id == sim_burst.simulation_id + ) + JOIN coinc_event_map AS b ON ( + b.coinc_event_id == a.coinc_event_id + ) + JOIN coinc_event ON ( + b.table_name == "coinc_event" + AND b.event_id == coinc_event.coinc_event_id + ) +WHERE + coinc_event.coinc_def_id == ? +GROUP BY + sim_burst.simulation_id + """, (coinc_def_id,)) + cursor.close() + + +def create_sim_burst_best_string_sngl_map(connection, coinc_def_id): + """ + Construct a sim_burst --> best matching coinc_event mapping. + """ + connection.cursor().execute(""" +CREATE TEMPORARY TABLE + sim_burst_best_string_sngl_map +AS + SELECT + sim_burst.simulation_id AS simulation_id, + ( + SELECT + sngl_burst.event_id + FROM + coinc_event_map AS a + JOIN coinc_event_map AS b ON ( + b.coinc_event_id == a.coinc_event_id + ) + JOIN coinc_event ON ( + coinc_event.coinc_event_id == a.coinc_event_id + ) + JOIN sngl_burst ON ( + b.table_name == 'sngl_burst' + AND b.event_id == sngl_burst.event_id + ) + WHERE + a.table_name == 'sim_burst' + AND a.event_id == sim_burst.simulation_id + AND coinc_event.coinc_def_id == ? + ORDER BY + (sngl_burst.chisq / sngl_burst.chisq_dof) / (sngl_burst.snr * sngl_burst.snr) + LIMIT 1 + ) AS event_id + FROM + sim_burst + WHERE + event_id IS NOT NULL + """, (coinc_def_id,)) + + +def create_sim_burst_best_string_coinc_map(connection, coinc_def_id): + """ + Construct a sim_burst --> best matching coinc_event mapping for + string cusp injections and coincs. + """ + # FIXME: this hasn't finished being ported from the inspiral code + connection.cursor().execute(""" +CREATE TEMPORARY TABLE + sim_burst_best_string_coinc_map +AS + SELECT + sim_burst.simulation_id AS simulation_id, + ( + SELECT + coinc_inspiral.coinc_event_id + FROM + coinc_event_map AS a + JOIN coinc_event_map AS b ON ( + b.coinc_event_id == a.coinc_event_id + ) + JOIN coinc_inspiral ON ( + b.table_name == 'coinc_event' + AND b.event_id == coinc_inspiral.coinc_event_id + ) + WHERE + a.table_name == 'sim_burst' + AND a.event_id == sim_burst.simulation_id + AND coinc_event.coinc_def_id == ? + ORDER BY + (sngl_burst.chisq / sngl_burst.chisq_dof) / (sngl_burst.snr * sngl_burst.snr) + LIMIT 1 + ) AS coinc_event_id + FROM + sim_burst + WHERE + coinc_event_id IS NOT NULL + """, (coinc_def_id,)) -- GitLab