diff --git a/gstlal-burst/python/string/lalapps_string_calc_likelihood.py b/gstlal-burst/python/string/lalapps_string_calc_likelihood.py new file mode 100644 index 0000000000000000000000000000000000000000..9ba591a782039651a32ef43d0138a4c7d907ecf2 --- /dev/null +++ b/gstlal-burst/python/string/lalapps_string_calc_likelihood.py @@ -0,0 +1,163 @@ +# +# Copyright (C) 2010 Kipp Cannon +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# + + +from __future__ import print_function + + +from optparse import OptionParser +import sqlite3 +import sys + + +from ligo.lw import dbtables + +from lal.utils import CacheEntry +from lalburst import git_version +from lalburst import calc_likelihood +from lalburst import SnglBurstUtils +from lalburst import stringutils + + +__author__ = "Kipp Cannon <kipp.cannon@ligo.org>" +__version__ = "git id %s" % git_version.id +__date__ = git_version.date + + +# +# ============================================================================= +# +# Command Line +# +# ============================================================================= +# + + +def parse_command_line(): + parser = OptionParser( + version = "Name: %%prog\n%s" % git_version.verbose_msg ) + parser.add_option("-c", "--input-cache", metavar = "filename", help = "Also process the files named in this LAL cache. See lalapps_path2cache for information on how to produce a LAL cache file.") + parser.add_option("-l", "--likelihood-file", metavar = "filename", action = "append", help = "Set the name of the likelihood ratio data file to use. Can be given more than once.") + parser.add_option("--likelihood-cache", metavar = "filename", help = "Also load the likelihood ratio data files listed in this LAL cache. See lalapps_path2cache for information on how to produce a LAL cache file.") + parser.add_option("-t", "--tmp-space", metavar = "path", help = "Path to a directory suitable for use as a work area while manipulating the database file. The database file will be worked on in this directory, and then moved to the final location when complete.
This option is intended to improve performance when running in a networked environment, where there might be a local disk with higher bandwidth than is available to the filesystem on which the final output will reside.") + parser.add_option("--vetoes-name", metavar = "name", help = "Set the name of the segment lists to use as vetoes (default = do not apply vetoes).") + parser.add_option("-v", "--verbose", action = "store_true", help = "Be verbose.") + options, filenames = parser.parse_args() + + options.likelihood_filenames = [] + if options.likelihood_file is not None: + options.likelihood_filenames += options.likelihood_file + if options.likelihood_cache is not None: + options.likelihood_filenames += [CacheEntry(line).path for line in open(options.likelihood_cache)] + if not options.likelihood_filenames: + raise ValueError("no ranking statistic likelihood data files specified") + + if options.input_cache: + filenames += [CacheEntry(line).path for line in open(options.input_cache)] + if not filenames: + raise ValueError("no candidate databases specified") + + return options, filenames + + +# +# ============================================================================= +# +# Main +# +# ============================================================================= +# + + +# +# command line +# + + +options, filenames = parse_command_line() + + +# +# load likelihood data +# + + +rankingstat = stringutils.marginalize_rankingstat(options.likelihood_filenames, verbose = options.verbose) +if options.verbose: + print("computing event densities ...", file=sys.stderr) +rankingstat.finish() + + +# +# iterate over files +# + + +for n, filename in enumerate(filenames): + # + # Open the database file. + # + + if options.verbose: + print("%d/%d: %s" % (n + 1, len(filenames), filename), file=sys.stderr) + + working_filename = dbtables.get_connection_filename(filename, tmp_path = options.tmp_space, verbose = options.verbose) + connection = sqlite3.connect(working_filename) + if options.tmp_space is not None: + dbtables.set_temp_store_directory(connection, options.tmp_space, verbose = options.verbose) + + # + # Summarize the database. + # + + contents = SnglBurstUtils.CoincDatabase(connection, live_time_program = "StringSearch", search = "StringCusp", veto_segments_name = options.vetoes_name) + if options.verbose: + SnglBurstUtils.summarize_coinc_database(contents) + + + # + # Run likelihood ratio calculation. + # + + calc_likelihood.assign_likelihood_ratios( + connection = contents.connection, + coinc_def_id = contents.bb_definer_id, + offset_vectors = contents.time_slide_table.as_dict(), + vetoseglists = contents.vetoseglists, + events_func = lambda cursor, coinc_event_id: calc_likelihood.sngl_burst_events_func(cursor, coinc_event_id, contents.sngl_burst_table.row_from_cols), + veto_func = calc_likelihood.sngl_burst_veto_func, + ln_likelihood_ratio_func = rankingstat.ln_lr_from_triggers, + verbose = options.verbose + ) + + # + # Clean up. 
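+	# (put_connection_filename() moves the scratch copy of the database +	# back to its original location; cf. the --tmp-space option above)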
+ # + + contents.xmldoc.unlink() + connection.close() + dbtables.put_connection_filename(filename, working_filename, verbose = options.verbose) diff --git a/gstlal-burst/python/string/lalapps_string_calc_rank_pdfs.py b/gstlal-burst/python/string/lalapps_string_calc_rank_pdfs.py new file mode 100644 index 0000000000000000000000000000000000000000..2c6263791275510974822dd714984508a7ff0407 --- /dev/null +++ b/gstlal-burst/python/string/lalapps_string_calc_rank_pdfs.py @@ -0,0 +1,121 @@ +# +# Copyright (C) 2019 Daichi Tsuna +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# a module to construct ranking statistic PDFs from the calc_likelihood outputs + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# + + +from optparse import OptionParser + +from ligo.lw import ligolw +from ligo.lw import utils as ligolw_utils +from ligo.lw.utils import process as ligolw_process + +from lal.utils import CacheEntry +from lalburst import git_version +from lalburst import stringutils +from lalburst import string_lr_far + + +__author__ = "Daichi Tsuna <daichi.tsuna@ligo.org>" +__version__ = "git id %s" % git_version.id +__date__ = git_version.date + + +# +# ============================================================================= +# +# Command Line +# +# ============================================================================= +# + + +def parse_command_line(): + parser = OptionParser( + version = "Name: %%prog\n%s" % git_version.verbose_msg + ) + parser.add_option("-c", "--input-cache", metavar = "filename", help = "Also process the files named in this LAL cache. 
See lalapps_path2cache for information on how to produce a LAL cache file.") + parser.add_option("-n", "--ranking-stat-samples", metavar = "N", default = 2**24, type = "int", help = "Construct ranking statistic histograms by drawing this many samples from the ranking statistic generator (default = 2^24).") + parser.add_option("-o", "--output", metavar = "filename", help = "Write merged likelihood ratio histograms to this LIGO Light-Weight XML file.") + parser.add_option("-v", "--verbose", action = "store_true", help = "Be verbose.") + options, filenames = parser.parse_args() + + paramdict = options.__dict__.copy() + + if options.input_cache is not None: + filenames += [CacheEntry(line).path for line in open(options.input_cache)] + if not filenames: + raise ValueError("no ranking statistic likelihood data files specified") + + if options.output is None: + raise ValueError("must set --output") + + return options, filenames, paramdict + + +# +# ============================================================================= +# +# Main +# +# ============================================================================= +# + + +# +# command line +# + + +options, filenames, paramdict = parse_command_line() + + +# +# load rankingstat data +# + + +rankingstat = stringutils.marginalize_rankingstat(filenames, verbose = options.verbose) + + +# +# generate rankingstatpdfs +# + + +rankingstatpdf = stringutils.RankingStatPDF(rankingstat, nsamples = options.ranking_stat_samples, verbose = options.verbose) + + +# +# write to output +# + + +xmldoc = ligolw.Document() +xmldoc.appendChild(ligolw.LIGO_LW()) +xmldoc.childNodes[-1].appendChild(rankingstatpdf.to_xml()) +process = ligolw_process.register_to_xmldoc(xmldoc, program = u"lalapps_string_calc_rank_pdfs", paramdict = paramdict, version = __version__, cvs_repository = "lscsoft", cvs_entry_time = __date__, comment = u"") +ligolw_process.set_process_end_time(process) +ligolw_utils.write_filename(xmldoc, options.output, gz = (options.output or "stdout").endswith(".gz"), verbose = options.verbose) diff --git a/gstlal-burst/python/string/lalapps_string_meas_likelihood.py b/gstlal-burst/python/string/lalapps_string_meas_likelihood.py new file mode 100644 index 0000000000000000000000000000000000000000..6a60edcab4e9de908880efc2ff1dc4109e842d72 --- /dev/null +++ b/gstlal-burst/python/string/lalapps_string_meas_likelihood.py @@ -0,0 +1,207 @@ +# Copyright (C) 2010--2018 Kipp Cannon +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
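For orientation, the PDF document written above can later be read back with the class's own from_xml() method. This is a minimal sketch (not part of the patch); the file name is hypothetical, and the element name assumes the default used by RankingStatPDF.to_xml():

from ligo.lw import utils as ligolw_utils
from lalburst import stringutils

# "ranking_data_pdf.xml.gz" stands in for whatever --output was given above
xmldoc = ligolw_utils.load_filename("ranking_data_pdf.xml.gz", contenthandler = stringutils.RankingStat.LIGOLWContentHandler)
rankingstatpdf = stringutils.RankingStatPDF.from_xml(xmldoc, u"string_cusp")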
+ + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# + + +from __future__ import print_function + + +import math +from optparse import OptionParser +import sqlite3 +import string +import sys + +from ligo.lw import ligolw +from ligo.lw import utils as ligolw_utils +from ligo.lw.utils import process as ligolw_process + +from lal.utils import CacheEntry + +from lalburst import SnglBurstUtils +from lalburst import git_version +from lalburst import stringutils +from lalburst import string_lr_far + + +__author__ = "Kipp Cannon <kipp.cannon@ligo.org>" +__version__ = "git id %s" % git_version.id +__date__ = git_version.date + + +# characters allowed to appear in the description string +T010150_letters = set(string.ascii_lowercase + string.ascii_uppercase + string.digits + "_+#") + + +# +# ============================================================================= +# +# Command Line +# +# ============================================================================= +# + + +def parse_command_line(): + parser = OptionParser( + version = "Name: %%prog\n%s" % git_version.verbose_msg, + usage = "%prog [options] [filename ...]", + description = "%prog analyzes a collection of sqlite3 database files containing lalapps_burca outputs of string-cusp coincidence events, and measures probability distributions for a variety of parameters computed from those coincidences. The distributions are written to a likelihood data file in XML format, which can later be used to assign likelihoods to the coincidences. The files to be processed can be named on the command line and/or provided by a LAL cache file." ) + parser.add_option("-o", "--output", metavar = "filename", default = None, help = "Set the name of the likelihood data file to write (default = stdout).") + parser.add_option("-c", "--input-cache", metavar = "filename", help = "Also process the files named in this LAL cache. See lalapps_path2cache for information on how to produce a LAL cache file.") + parser.add_option("-t", "--tmp-space", metavar = "path", help = "Path to a directory suitable for use as a work area while manipulating the database file. The database file will be worked on in this directory, and then moved to the final location when complete. This option is intended to improve performance when running in a networked environment, where there might be a local disk with higher bandwidth than is available to the filesystem on which the final output will reside.") + parser.add_option("--T010150", metavar = "description", default = None, help = "Write the output to a file whose name is compatible with the file name format described in LIGO-T010150-00-E, \"Naming Convention for Frame Files which are to be Processed by LDAS\". The description string will be used to form the second field in the file name.") + parser.add_option("--injection-reweight", metavar = "off|astrophysical", default = "off", help = "Set the weight function to be applied to the injections (default = \"off\"). When \"off\", the injections are all given equal weight and so the injection population is whatever was injected.
When set to \"astrophysical\", the injections are reweighted to simulate an amplitude^{-4} distribution.") + parser.add_option("--injection-reweight-cutoff", metavar = "amplitude", default = 1e-20, type = "float", help = "When using the astrophysical injection reweighting, do not allow the weight assigned to arbitrarily low-amplitude injections to grow without bound, instead clip the weight assigned to injections to the weight given to injections with this amplitude (default = 1e-20, 0 = disabled). This option is ignored when astrophysical reweighting is not being performed.") + parser.add_option("--vetoes-name", metavar = "name", help = "Set the name of the segment lists to use as vetoes (default = do not apply vetoes).") + parser.add_option("-v", "--verbose", action = "store_true", help = "Be verbose.") + options, filenames = parser.parse_args() + + paramdict = options.__dict__.copy() + + if options.T010150 is not None: + if options.output is not None: + raise ValueError("cannot set both --T010150 and --output") + if options.T010150 == "": + options.T010150 = "STRING_LIKELIHOOD" + elif set(options.T010150) - T010150_letters: + raise ValueError("invalid characters in description \"%s\"" % options.T010150) + + if options.input_cache: + filenames += [CacheEntry(line).path for line in open(options.input_cache)] + + if not filenames: + raise ValueError("no input files!") + + if options.injection_reweight not in ("off", "astrophysical"): + raise ValueError("--injection-reweight \"%s\" not recognized" % options.injection_reweight) + + return options, filenames, paramdict + + +# +# ============================================================================= +# +# Injection ReWeight Functions +# +# ============================================================================= +# + + +def get_injection_weight_func(contents, reweight_type, amplitude_cutoff): + if reweight_type == "off": + def weight_func(sim, amplitude_cutoff = amplitude_cutoff): + # amplitude cut-off is not used + return 1.0 + elif reweight_type == "astrophysical": + population, = ligolw_process.get_process_params(contents.xmldoc, "lalapps_string_injections", "--population") + if population != "string_cusp": + raise ValueError("lalapps_string_injections was not run with --population=\"string_cusp\"") + def weight_func(sim, amplitude_cutoff = amplitude_cutoff): + # the "string_cusp" injection population is uniform + # in log A, meaning the number of injections + # between log A and log A + d log A is independent + # of A. that corresponds to a distribution density + # in A of P(A) dA \propto A^{-1} dA. we want P(A) + # dA \propto A^{-4} dA, so each physical injection + # needs to be treated as if it were A^{-3} virtual + # injections, then the density of virtual + # injections will be P(A) dA \propto A^{-4} dA. + # the factor of 10^{21} simply renders the + # amplitudes closer to 1; since double-precision + # arithmetic is used this should have no effect on + # the results, but it might help make the numbers a + # little easier for humans to look at should anyone + # have occasion to do so. + return (max(sim.amplitude, amplitude_cutoff) * 1e21)**-3 + else: + raise ValueError(reweight_type) + return weight_func
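The reweighting above can be sanity-checked numerically. The following is an illustrative sketch only (not part of the patch): it draws a log-uniform amplitude population and applies the same A^{-3} weight with the cutoff disabled, confirming that the weighted density falls off as A^{-4}:

import numpy

# log-uniform population: P(A) dA \propto A^{-1} dA
A = 10.**numpy.random.uniform(-21., -19., 10**6)
weights = (A * 1e21)**-3	# weight_func with the amplitude cutoff disabled

# weighted counts per log-A bin should scale as A^{-3}, which is a
# density P(A) dA \propto A^{-4} dA
counts, edges = numpy.histogram(numpy.log10(A), bins = 20, weights = weights)
centres = 0.5 * (edges[1:] + edges[:-1])
slope = numpy.polyfit(centres, numpy.log10(counts), 1)[0]	# expect about -3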
+ + +# +# ============================================================================= +# +# Main +# +# ============================================================================= +# + + +# +# Command line. +# + + +options, filenames, paramdict = parse_command_line() + + +# +# Start output document +# + + +xmldoc = ligolw.Document() +xmldoc.appendChild(ligolw.LIGO_LW()) +process = ligolw_process.register_to_xmldoc(xmldoc, program = u"lalapps_string_meas_likelihood", paramdict = paramdict, version = __version__, cvs_repository = "lscsoft", cvs_entry_time = __date__, comment = u"") + + +# +# Iterate over files to obtain the denominator and zero-lag +# + +rankingstat = stringutils.marginalize_rankingstat(filenames, verbose = options.verbose) + + +# +# obtain the numerator +# FIXME for now we simply copy over the numerator from an existing table +# + + +adhoc_rankingstat = stringutils.marginalize_rankingstat(["/home/daichi.tsuna/playground/rankingstat_numerator/G1+H1+H2+L1+T1+V1-STRING_LIKELIHOOD_O2CUSP_INJ_0_0-1164600798-154406.xml.gz"], verbose = options.verbose) +adhoc_rankingstat.finish() +rankingstat.numerator = adhoc_rankingstat.numerator +rankingstat.finish() + + +# +# Output. +# + + +xmldoc.childNodes[-1].appendChild(rankingstat.to_xml()) +ligolw_process.set_process_end_time(process) + + +def T010150_basename(instruments, description, seg): + start = int(math.floor(seg[0])) + duration = int(math.ceil(seg[1] - start)) + return "%s-%s-%d-%d" % ("+".join(sorted(instruments)), description, start, duration) +# reconstruct the analyzed segments from the trigger rate records so the +# T010150-style file name can be formed +segs = rankingstat.denominator.triggerrates.segmentlistdict() +if options.T010150: + filename = "%s.xml.gz" % T010150_basename(segs.keys(), options.T010150, segs.extent_all()) +else: + filename = options.output +ligolw_utils.write_filename(xmldoc, filename, verbose = options.verbose, gz = (filename or "stdout").endswith(".gz")) diff --git a/gstlal-burst/python/string/string_extrinsics.py b/gstlal-burst/python/string/string_extrinsics.py index b647251c049057bfd35b4246117f4a5bcc20c971..67713cd0d3757b571d2aeb337322ef49ae8a09c1 100644 --- a/gstlal-burst/python/string/string_extrinsics.py +++ b/gstlal-burst/python/string/string_extrinsics.py @@ -28,13 +28,23 @@ import numpy -from ligo.lw import ligolw from ligo.lw import array as ligolw_array -from glue.text_progress_bar import ProgressBar from gstlal import stats as gstlalstats +import lal from lal import rate +__doc__ = """ + +The goal of this module is to implement the probability of getting a given set +of extrinsic parameters for a set of detectors parameterized by n-tuples of +trigger parameters: (snr, chi2) assuming that the event is a gravitational wave +signal, *s*, coming from an isotropic distribution in location, orientation and +the volume of space. The implementation of this in the calling code can be +found in :py:mod:`string_lr_far`.
+""" + + # # ============================================================================= # @@ -88,7 +98,7 @@ class NumeratorSNRCHIPDF(rate.BinnedLnPDF): self.norm.shape = (len(self.norm), 1) @staticmethod - def add_signal_model(lnpdf, n, prefactors_range, df = 100, inv_snr_pow = 4., snr_min = 3.5, progressbar = None): + def add_signal_model(lnpdf, n, prefactors_range, df, inv_snr_pow = 4., snr_min = 3.5, progressbar = None): if df <= 0.: raise ValueError("require df >= 0: %s" % repr(df)) pfs = numpy.linspace(prefactors_range[0], prefactors_range[1], 100) diff --git a/gstlal-burst/python/string/string_lr_far.py b/gstlal-burst/python/string/string_lr_far.py index 9fd154f79f294b8d5c2b7284c457424388193a8a..1cde0257816e30e8f7672966e36fcb95a0d815d0 100644 --- a/gstlal-burst/python/string/string_lr_far.py +++ b/gstlal-burst/python/string/string_lr_far.py @@ -13,20 +13,23 @@ # +import itertools +import math import numpy import random import sys from lal import rate -from lalburst import snglcoinc +from . import snglcoinc from ligo.lw import ligolw from ligo.lw import lsctables from ligo.lw import param as ligolw_param from ligo.lw import utils as ligolw_utils -from gstlal import stats as gstlalstats -from gstlal import string_extrinsics +# FIXME don't import gstlal modules in lalsuite +from gstlal.stats import trigger_rate +from . import string_extrinsics # # ============================================================================= @@ -43,31 +46,32 @@ from gstlal import string_extrinsics class LnLRDensity(snglcoinc.LnLRDensity): - # SNR threshold FIXME don't hardcode - snr_threshold = 3.75 # SNR, chi^2 binning definition snr2_chi2_binning = rate.NDBins((rate.ATanLogarithmicBins(10, 1e7, 801), rate.ATanLogarithmicBins(.1, 1e4, 801))) - - def __init__(self, instruments, min_instruments=2): + + def __init__(self, instruments, delta_t, snr_threshold, min_instruments = 2): # check input if min_instruments < 2: raise ValueError("min_instruments=%d must be >=2" % min_instruments) if min_instruments > len(instruments): raise ValueError("not enough instruments (%s) to satisfy min_instruments=%d" % (", ".join(sorted(instruments)), min_instruments)) + assert delta_t > 0 and snr_threshold > 0 + self.instruments = frozenset(instruments) + self.delta_t = delta_t + self.snr_threshold = snr_threshold self.min_instruments = min_instruments self.densities = {} - self.instruments = frozenset(instruments) for instrument in self.instruments: self.densities["%s_snr2_chi2" % instrument] = rate.BinnedLnPDF(self.snr2_chi2_binning) - def __call__(self, **params): + def __call__(self): try: interps = self.interps except AttributeError: self.mkinterps() interps = self.interps - return sum(interps[param](*value) for param, value in params.items()) if params else NegInf + #return sum(interps[param](*value) for param, value in params.items()) def __iadd__(self, other): if type(self) != type(other) or set(self.densities) != set(other.densities): @@ -80,12 +84,12 @@ class LnLRDensity(snglcoinc.LnLRDensity): pass return self - def increment(self, weight = 1.0, **params): - for instrument in self.instruments: - self.densities["%s_snr2_chi2" % instrument].count[params['snrs'][instrument], params['chi2s_over_snr2s'][instrument]] += weight + def increment(self, event): + #self.densities["%s_snr2_chi2" % event.ifo].count[event.snr, event.chisq / event.chisq_dof / event.snr**2.] 
+= 1.0 + self.densities["%s_snr2_chi2" % event.ifo].count[event.snr**2, event.chisq / event.chisq_dof] += 1.0 def copy(self): - new = type(self)([]) + new = type(self)(self.instruments, self.delta_t, self.snr_threshold, min_instruments = self.min_instruments) for key, lnpdf in self.densities.items(): new.densities[key] = lnpdf.copy() return new @@ -105,8 +109,9 @@ class LnLRDensity(snglcoinc.LnLRDensity): def to_xml(self, name): xml = super(LnLRDensity, self).to_xml(name) - instruments = set(key.split("_", 1)[0] for key in self.densities if key.endswith("_snr2_chi2")) - xml.appendChild(ligolw_param.Param.from_pyvalue(u"instruments", lsctables.instrumentsproperty.set(instruments))) + xml.appendChild(ligolw_param.Param.from_pyvalue(u"instruments", lsctables.instrumentsproperty.set(self.instruments))) + xml.appendChild(ligolw_param.Param.from_pyvalue(u"delta_t", self.delta_t)) + xml.appendChild(ligolw_param.Param.from_pyvalue(u"snr_threshold", self.snr_threshold)) xml.appendChild(ligolw_param.Param.from_pyvalue(u"min_instruments", self.min_instruments)) for key, lnpdf in self.densities.items(): xml.appendChild(lnpdf.to_xml(key)) @@ -117,7 +122,9 @@ xml = cls.get_xml_root(xml, name) self = cls( instruments = lsctables.instrumentsproperty.get(ligolw_param.get_pyvalue(xml, u"instruments")), - min_instruments = ligolw_param.get_pyvalue(xml, u"min_instruments"), + delta_t = ligolw_param.get_pyvalue(xml, u"delta_t"), + snr_threshold = ligolw_param.get_pyvalue(xml, u"snr_threshold"), + min_instruments = ligolw_param.get_pyvalue(xml, u"min_instruments") ) for key in self.densities: self.densities[key] = rate.BinnedLnPDF.from_xml(xml, key) @@ -130,13 +137,14 @@ class LnSignalDensity(LnLRDensity): - def __call__(self, snrs, chi2s_over_snr2s): - if len(snrs) < self.min_instruments: - return NegInf - lnP = 0.0 - # evalute the (snr, \chi^2 | snr) PDFs + def __init__(self, *args, **kwargs): + super(LnSignalDensity, self).__init__(*args, **kwargs) + + def __call__(self, snr2s, chi2s): + super(LnSignalDensity, self).__call__() interps = self.interps - return lnP + sum(interps["%s_snr2_chi2" % instrument](snrs[instrument], chi2_over_snr2) for instrument, chi2_over_snr2 in chi2s_over_snr2s.items()) + return sum(interps["%s_snr2_chi2" % instrument](snr2s[instrument], chi2) for instrument, chi2 in chi2s.items()) + #return sum(interps["%s_snr2_chi2" % instrument](snrs[instrument], chi2_over_snr2) for instrument, chi2_over_snr2 in chi2s_over_snr2s.items()) def add_signal_model(self, prefactors_range = (0.001, 0.30), inv_snr_pow = 4.): # normalize to 10 *mi*llion signals. This count makes the @@ -162,18 +170,40 @@ class LnNoiseDensity(LnLRDensity): - def __call__(self, snrs, chi2s_over_snr2s): - lnP = 0.0 - - # Evaluate P(snrs, chi2s | noise) - # It is assumed that SNR and chi^2 values are - # independent between detectors, and furthermore - # independent from their sensitivities.
+ def __init__(self, *args, **kwargs): + super(LnNoiseDensity, self).__init__(*args, **kwargs) + # record of trigger counts vs time for all instruments in + # the network + self.triggerrates = trigger_rate.triggerrates((instrument, trigger_rate.ratebinlist()) for instrument in self.instruments) + # initialize a CoincRates object + self.coinc_rates = snglcoinc.CoincRates( + instruments = self.instruments, + delta_t = self.delta_t, + min_instruments = self.min_instruments + ) + + def __call__(self, snr2s, chi2s): + # FIXME evaluate P(t|noise), P(ifos|t,noise) using the + # triggerrate record (cf inspiral_lr) + super(LnNoiseDensity, self).__call__() interps = self.interps - return lnP + sum(interps["%s_snr2_chi2" % instrument](snrs[instrument], chi2_over_snr2) for instrument, chi2_over_snr2 in chi2s_over_snr2s.items()) + return sum(interps["%s_snr2_chi2" % instrument](snr2s[instrument], chi2) for instrument, chi2 in chi2s.items()) + + #return sum(interps["%s_snr2_chi2" % instrument](snrs[instrument], chi2_over_snr2) for instrument, chi2_over_snr2 in chi2s_over_snr2s.items()) + + def __iadd__(self, other): + super(LnNoiseDensity, self).__iadd__(other) + self.triggerrates += other.triggerrates + return self - def candidate_count_model(self): - pass + def copy(self): + new = super(LnNoiseDensity, self).copy() + new.triggerrates = self.triggerrates.copy() + # NOTE: lnzerolagdensity in the copy is reset to None by + # this operation. it is left as an exercise to the calling + # code to re-connect it to the appropriate object if + # desired. + return new def random_params(self): """ @@ -200,431 +230,46 @@ snr_slope = 0.8 / len(self.instruments)**3 snr2chi2gens = dict((instrument, iter(self.densities["%s_snr2_chi2" % instrument].bins.randcoord(ns = (snr_slope, 1.), domain = (slice(self.snr_threshold, None), slice(None, None)))).next) for instrument in self.instruments) + t_and_rate_gen = iter(self.triggerrates.random_uniform()).next def nCk(n, k): return math.factorial(n) // math.factorial(k) // math.factorial(n - k) while 1: + # choose a t (not needed for params, but used to + # choose detector combo with the correct + # distribution). + t, rates, lnP_t = t_and_rate_gen() + # choose a set of instruments from among those that + # were generating triggers at t. instruments = tuple(instrument for instrument, rate in rates.items() if rate > 0) - assert len(instruments) < self.min_instruments, "number of instruments smaller than min_instruments" + if len(instruments) < self.min_instruments: + # FIXME doing this biases lnP_t to lower values, + # but the error is merely an overall normalization + # error that won't hurt. t_and_rate_gen() can be + # fixed to exclude from the sampling times at which + # not enough detectors were generating triggers. + continue + k = random.randint(self.min_instruments, len(instruments)) + lnP_instruments = -math.log((len(instruments) - self.min_instruments + 1) * nCk(len(instruments), k)) + instruments = frozenset(random.sample(instruments, k)) + # ((snr^2, chi^2), ln P, (snr^2, chi^2), ln P, ...)
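+			# each snr2chi2gens[instrument]() call yields a +			# ((snr^2, chi^2) co-ordinate, ln P) pair; the ln P +			# proposal densities are summed with lnP_t and +			# lnP_instruments into the total ln P(proposal) yielded +			# below, which importance-weighted samplers such as +			# .ln_lr_samples() use to weight each draw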
seq = sum((snr2chi2gens[instrument]() for instrument in instruments), ()) - # set params - for instrument, value in zip(instruments, seq[0::2]): - params["%s_snr2_chi2" % instrument] = (value[0], value[1]) - yield (), params, seq[1::2] + # set kwargs + kwargs = dict( + snr2s = dict((instrument, value[0]) for instrument, value in zip(instruments, seq[0::2])), + chi2s = dict((instrument, value[1]) for instrument, value in zip(instruments, seq[0::2])) + #chi2s_over_snr2s = dict((instrument, value[1]) for instrument, value in zip(instruments, seq[0::2])) + ) + yield (), kwargs, sum(seq[1::2], lnP_t + lnP_instruments) def to_xml(self, name): xml = super(LnNoiseDensity, self).to_xml(name) + xml.appendChild(self.triggerrates.to_xml(u"triggerrates")) return xml @classmethod def from_xml(cls, xml, name): xml = cls.get_xml_root(xml, name) self = super(LnNoiseDensity, cls).from_xml(xml, name) + self.triggerrates = trigger_rate.triggerrates.from_xml(xml, u"triggerrates") + self.triggerrates.coalesce() # just in case return self - - -# -# ============================================================================= -# -# Ranking Statistic PDF -# -# ============================================================================= -# - - -# -# Class to compute ranking statistic PDFs for background-like and -# signal-like populations -# - -class RankingStatPDF(object): - ligo_lw_name_suffix = u"gstlal_string_rankingstatpdf" - - @staticmethod - def density_estimate(lnpdf, name, kernel = rate.gaussian_window(4.)): - """ - For internal use only. - """ - assert not numpy.isnan(lnpdf.array).any(), "%s log likelihood ratio PDF contain NaNs" % name - rate.filter_array(lnpdf.array, kernel) - - @staticmethod - def binned_log_likelihood_ratio_rates_from_samples_wrapper(signal_lr_lnpdf, noise_lr_lnpdf, samples, nsamples): - """ - For internal use only. - """ - try: - # want the forked processes to use different random - # number sequences, so we re-seed Python and - # numpy's random number generators here in the - # wrapper in the hope that that takes care of it - random.seed() - numpy.random.seed() - binned_log_likelihood_ratio_rates_from_samples(signal_lr_lnpdf, noise_lr_lnpdf, samples, nsamples) - return signal_lr_lnpdf.array, noise_lr_lnpdf.array - except: - raise - - def __init__(self, rankingstat, signal_noise_pdfs = None, nsamples = 2**24, verbose = False): - # - # bailout used by .from_xml() class method to get an - # uninitialized instance - # - - if rankingstat is None: - return - - # - # initialize binnings - # - - self.noise_lr_lnpdf = rate.BinnedLnPDF(rate.NDBins((rate.ATanBins(0., 110., 6000),))) - self.signal_lr_lnpdf = rate.BinnedLnPDF(rate.NDBins((rate.ATanBins(0., 110., 6000),))) - self.zero_lag_lnpdf = rate.BinnedLnPDF(rate.NDBins((rate.ATanBins(0., 110., 6000),))) - - # - # bailout used by codes that want all-zeros histograms - # - - if not nsamples: - return - - # - # run importance-weighted random sampling to populate binnings. 
- # - - if signal_noise_pdfs is None: - signal_noise_pdfs = rankingstat - - self.signal_lr_lnpdf.array, self.noise_lr_lnpdf.array = self.binned_log_likelihood_ratio_rates_from_samples_wrapper( - self.signal_lr_lnpdf, - self.noise_lr_lnpdf, - rankingstat.ln_lr_samples(rankingstat.denominator.random_params(), signal_noise_pdfs), - nsamples = nsamples) - - if verbose: - print >> sys.stderr, "done computing ranking statistic PDFs" - - # - # apply density estimation kernels to counts - # - - self.density_estimate(self.noise_lr_lnpdf, "noise model") - self.density_estimate(self.signal_lr_lnpdf, "signal model") - - # - # set the total sample count in the noise and signal - # ranking statistic histogram equal to the total expected - # count of the respective events from the experiment. This - # information is required so that when adding ranking - # statistic PDFs in our .__iadd__() method they are - # combined with the correct relative weights, so that - # .__iadd__() has the effect of marginalizing the - # distribution over the experients being combined. - # - - self.noise_lr_lnpdf.array /= self.noise_lr_lnpdf.array.sum() - self.noise_lr_lnpdf.normalize() - self.signal_lr_lnpdf.array /= self.signal_lr_lnpdf_array.sum() - self.signal_lr_lnpdf.normalize() - - - def copy(self): - new = self.__class__(None) - new.noise_lr_lnpdf = self.noise_lr_lnpdf.copy() - new.signal_lr_lnpdf = self.signal_lr_lnpdf.copy() - new.zero_lag_lr_lnpdf = self.zero_lag_lr_lnpdf.copy() - return new - - def collect_zero_lag_rates(self, connection, coinc_def_id): - for ln_likelihood_ratio in connection.cursor().execute(""" -SELECT - likelihood, -FROM - coinc_event -WHERE - coinc_def_id == ? - AND NOT EXISTS ( - SELECT - * - FROM - time_slide - WHERE - time_slide.time_slide_id == coinc_event.time_slide_id - AND time_slide.offset != 0 - ) -""", (coinc_def_id,)): - assert ln_likelihood_ratio is not None, "null likelihood ratio encountered. probably coincs have not been ranked" - self.zero_lag_lr_lnpdf.count[ln_likelihood_ratio,] += 1. - self.zero_lag_lr_lnpdf.normalize() - - - def density_estimate_zero_lag_rates(self): - # apply density estimation preserving total count, then - # normalize PDF - count_before = self.zero_lag_lr_lnpdf.array.sum() - # FIXME: should .normalize() be able to handle NaN? - if count_before: - self.density_estimate(self.zero_lag_lr_lnpdf, "zero lag") - self.zero_lag_lr_lnpdf.array *= count_before / self.zero_lag_lr_lnpdf.array.sum() - self.zero_lag_lr_lnpdf.normalize() - - - def __iadd__(self, other): - self.noise_lr_lnpdf += other.noise_lr_lnpdf - self.noise_lr_lnpdf.normalize() - self.signal_lr_lnpdf += other.signal_lr_lnpdf - self.signal_lr_lnpdf.normalize() - self.zero_lag_lr_lnpdf += other.zero_lag_lr_lnpdf - self.zero_lag_lr_lnpdf.normalize() - return self - - @classmethod - def get_xml_root(cls, xml, name): - """ - Sub-classes can use this in their overrides of the - .from_xml() method to find the root element of the XML - serialization. 
- """ - name = u"%s:%s" % (name, cls.ligo_lw_name_suffix) - xml = [elem for elem in xml.getElementsByTagName(ligolw.LIGO_LW.tagName) if elem.hasAttribute(u"Name") and elem.Name == name] - if len(xml) != 1: - raise ValueError("XML tree must contain exactly one %s element named %s" % (ligolw.LIGO_LW.tagName, name)) - return xml[0] - - - @classmethod - def from_xml(cls, xml, name): - # find the root of the XML tree containing the - # serialization of this object - xml = xls.get_xml_root(xml, name) - # create a mostly uninitialized instance - self = cls(None) - # popuate from XML - self.noise_lr_lnpdf = rate.BinnedLnPDF.from_xml(xml, u"noise_lr_lnpdf") - self.signal_lr_lnpdf = rate.BinnedLnPDF.from_xml(xml, u"signal_lr_lnpdf") - self.zero_lag_lr_lnpdf = rate.BinnedLnPDF.from_xml(xml, u"zero_lag_lr_lnpdf") - return self - - def to_xml(self, name): - # do not allow ourselves to be written to disk without our - # PDF's internal normalization metadata being up to date - self.noise_lr_lnpdf.normalize() - self.signal_lr_lnpdf.normalize() - self.zero_lag_lr_lnpdf.normalize() - - xml = ligolw.LIGO_LW({u"Name": u"%s:%s" % (name, self.ligo_lw_name_suffix)}) - xml.appendChild(self.noise_lr_lnpdf.to_xml(u"noise_lr_lnpdf")) - xml.appendChild(self.signal_lr_lnpdf.to_xml(u"signal_lr_lnpdf")) - xml.appendChild(self.zero_lag_lr_lnpdf.to_xml(u"zero_lag_lr_lnpdf")) - return xml - - -# -# ============================================================================= -# -# False alarm rates/probabilities -# -# ============================================================================= -# - - -# -# Class to compute false-alarm probabilities and false-alarm rates from -# ranking statistic PDFs -# - -class FAPFAR(object): - def __init__(self, rankingstatpdf, livetime): - # input checks - if not rankingstatpdf.zero_lag_lr_lnpdf.array.any(): - raise ValueError("RankingStatPDF's zero-lag counts are all zero") - - self.livetime = livetime - - # set the rate normalization LR threshold to the mode of - # the zero-lag LR distribution. - zl = rankingstat.zero_lag_lr_lnpdf.copy() - rate_normalization_lr_threshold, = zl.argmax() - - # record trials factor, with safety checks - counts = rankingstatpdf.zero_lag_lr_lnpdf.count() - assert not numpy.isnan(counts.array).any(), "zero lag log likelihood ratio counts contain NaNs" - assert (counts.array >= 0.).all(), "zero lag log likelihood ratio rates contain negative values" - self.count_above_threshold = counts[rate_normalization_lr_threshold:,].sum() - - # get noise model ranking stat values and event counts from - # bins - threshold_index = rankingstatpdf.noise_lr_lnpdf.bins[0][rate_normalization_lr_threshold] - ranks = rankingstatpdf.noise_lr_lnpdf.bins[0].lower()[threshold_index:] - counts = rankingstatpdf.noise_lr_lnpdf.array[threshold_index:] - assert not numpy.isnan(counts).any(), "background log likelihood ratio rates contain NaNs" - assert (counts >= 0.).all(), "background log likelihood ratio rates contain negative values" - - # complementary cumulative distribution function - ccdf = counts[::-1].cumsum()[::-1] - ccdf /= ccdf[0] - - # ccdf is P(ranking statistic > threshold | a candidate). - # we need P(rankins statistic > threshold), i.e. need to - # correct for the possibility that no candidate is present. - # specifically, the ccdf needs to =1-1/e at the candidate - # identification threshold, and cdf=1/e at the candidate - # threshold, in order for FAR(threshold) * livetime to - # equal the actual observed number of candidates. 
- ccdf = gstlalstats.poisson_p_not_0(ccdf) - - # safety checks - assert not numpy.isnan(ranks).any(), "log likelihood ratio co-ordinates contain NaNs" - assert not numpy.isinf(ranks).any(), "log likelihood ratio co-ordinates are not all finite" - assert not numpy.isnan(ccdf).any(), "log likelihood ratio CCDF contains NaNs" - assert ((0. <= ccdf) & (ccdf <= 1.)).all(), "log likelihood ratio CCDF failed to be normalized" - - # build interpolator. - self.ccdf_interpolator = interpolate.interp1d(ranks, ccdf) - - # record min and max ranks so we know which end of the ccdf - # to use when we're out of bounds - self.minrank = ranks[0] - self.maxrank = ranks[-1] - - @gstlalstats.assert_probability - def ccdf_from_rank(self, rank): - return self.ccdf_interpolator(numpy.clip(rank, self.minrank, self.maxrank)) - - def fap_from_rank(self, rank): - # implements equation (8) from Phys. Rev. D 88, 024025. - # arXiv: 1209.0718 - return gstlalstats.fap_after_trials(self.ccdf_from_rank(rank), self.count_above_threshold) - - def far_from_rank(self, rank): - # implements equation (B4) of Phys. Rev. D 88, 024025. - # arXiv: 1209.0718. the return value is divided by T to - # convert events/experiment to events/second. "tdp" = - # true-dismissal probability = 1 - single-event FAP, the - # integral in equation (B4) - log_tdp = numpy.log1p(-self.ccdf_from_rank(rank)) - return self.count_above_threshold * -log_tdp / self.livetime - - # NOTE do we need rank_from_fap and rank_from_far? - - def assign_fapfars(self, connection): - # assign false-alarm probabilities and false-alarm rates - # FIXME we should fix whether we use coinc_burst or multi_burst, - # whichever is less work. - def as_float(f): - def g(x): - return float(f(x)) - return g - connection.create_function("fap_from_rankingstat", 1, as_float(self.fap_from_rank)) - connection.create_function("far_from_rankingstat", 1, as_float(self.far_from_rank)) - connection.cursor().execute(""" -UPDATE - coinc_burst -SET - false_alarm_probability = ( - SELECT - fap_from_rankingstat(coinc_event.likelihood) - FROM - coinc_event - WHERE - coinc_event.coinc_event_id == coinc_burst.coinc_event_id - ), - false_alarm_rate = ( - SELECT - fap_from_rankingstat(coinc_event.likelihood) - FROM - coinc_event - WHERE - coinc_event.coinc_event_id == coinc_burst.coinc_event_id - ) -""") - - -# -# ============================================================================= -# -# I/O -# -# ============================================================================= -# - - -def gen_likelihood_control_doc(xmldoc, rankingstat, rankingstatpdf): - name = u"gstlal_string_likelihood" - node = xmldoc.childNodes[-1] - assert node.tagName == ligolw.LIGO_LW.tagName - - if rankingstat is not None: - node.appendChild(rankingstat.to_xml(name)) - - if rankingstatpdf is not None: - node.appendChild(rankingstatpdf.to_xml(name)) - - return xmldoc - - -def parse_likelihood_control_doc(xmldoc): - name = u"gstlal_string_likelihood" - try: - rankingstat = RankingStat.from_xml(xmldoc, name) - except ValueError: - rankingstat = None - try: - rankingstatpdf = RankingStatPDF.from_xml(xmldoc, name) - except ValueError: - rankingstatpdf = None - if rankingstat is None and rankingstatpdf is None: - raise ValueError("document does not contain likelihood ratio data") - return rankingstat, rankingstatpdf - - -def marginalize_pdf_urls(urls, which, ignore_missing_files = False, verbose = False): - """ - Implements marginalization of PDFs in ranking statistic data files. 
- The marginalization is over the degree of freedom represented by - the file collection. One or both of the candidate parameter PDFs - and ranking statistic PDFs can be processed, with errors thrown if - one or more files is missing the required component. - """ - name = u"gstlal_string_likelihood" - data = None - for n, url in enumerate(urls, start = 1): - # - # load input document - # - - if verbose: - print >>sys.stderr, "%d/%d:" % (n, len(urls)), - try: - xmldoc = ligolw_utils.load_url(url, verbose = verbose, contenthandler = RankingStat.LIGOLWContentHandler) - except IOError: - # IOError is raised when an on-disk file is - # missing. urllib2.URLError is raised when a URL - # cannot be loaded, but this is subclassed from - # IOError so IOError will catch those, too. - if not ignore_missing_files: - raise - if verbose: - print >>sys.stderr, "Could not load \"%s\" ... skipping as requested" % url - continue - - # - # extract PDF objects compute weighted sum of ranking data - # PDFs - # - - if which == "RankingStat": - if data is None: - data = RankingStat.from_xml(xmldoc, name) - else: - data += RankingStat.from_xml(xmldoc, name) - elif which == "RankingStatPDF": - if data is None: - data = RankingStatPDF.from_xml(xmldoc, name) - else: - data += RankingStatPDF.from_xml(xmldoc, name) - else: - raise ValueError("invalid which (%s)" % which) - xmldoc.unlink() - - return data diff --git a/gstlal-burst/python/string/stringutils.py b/gstlal-burst/python/string/stringutils.py index 8b6e577f68de2b318174e0c3324c885d5dbf5aeb..604c88ac824cdec0c40fb0d1e3baea4c2fa8964e 100644 --- a/gstlal-burst/python/string/stringutils.py +++ b/gstlal-burst/python/string/stringutils.py @@ -34,7 +34,6 @@ try: except ImportError: import itertools import math import numpy -import scipy.stats +from scipy import interpolate import sys @@ -43,15 +42,13 @@ from ligo.lw import array as ligolw_array from ligo.lw import param as ligolw_param from ligo.lw import lsctables from ligo.lw import utils as ligolw_utils -from ligo.lw.utils import process as ligolw_process -import lal -from lal import rate from ligo.segments import utils as segmentsUtils -from lalburst import offsetvector -from lalburst import snglcoinc - -from gstlal import string_lr_far +from . import snglcoinc +from . import string_lr_far +from lal import rate +# FIXME don't import gstlal modules in lalsuite +from gstlal import stats as gstlalstats __author__ = "Kipp Cannon <kipp.cannon@ligo.org>" from .git_version import date as __date__ @@ -67,120 +64,13 @@ from .git_version import version as __version__ # -# -# Make a look-up table of time-of-arrival triangulators -# - - -def triangulators(timing_uncertainties): - """ - Return a dictionary of snglcoinc.TOATriangulator objects - initialized for a variety of instrument combinations. - timing_uncertainties is a dictionary of instrument->$\\Delta t$ - pairs. The return value is a dictionary of (instrument - tuple)->TOATrangulator mappings. The instrument names in each - tuple are sorted in alphabetical order, and the triangulators are - constructed with the instruments in that order (the the - documentation for snglcoinc.TOATriangulator for more information). - - Example: - - >>> x = triangulators({"H1": 0.005, "L1": 0.005, "V1": 0.005}) - - constructs a dictionary of triangulators for every combination of - two or more instruments that can be constructed from those three. - - The program lalapps_string_plot_binj can be used to measure the - timing uncertainties for the instruments in a search.
- """ - allinstruments = sorted(timing_uncertainties.keys()) - - triangulators = {} - for n in range(2, len(allinstruments) + 1): - for instruments in itertools.combinations(allinstruments, n): - triangulators[instruments] = snglcoinc.TOATriangulator([lal.cached_detector_by_prefix[instrument].location for instrument in instruments], [timing_uncertainties[instrument] for instrument in instruments]) - - return triangulators - - -# -# A binning for instrument combinations -# -# FIXME: we decided that the coherent and null stream naming convention -# would look like -# -# H1H2:LSC-STRAIN_HPLUS, H1H2:LSC-STRAIN_HNULL -# -# and so on. i.e., the +, x and null streams from a coherent network would -# be different channels from a single instrument whose name would be the -# mash-up of the names of the instruments in the network. that is -# inconsisntent with the "H1H2+", "H1H2-" shown here, so this needs to be -# fixed but I don't know how. maybe it'll go away before it needs to be -# fixed. -# - - -class InstrumentBins(rate.HashableBins): - """ - Example: - - >>> x = InstrumentBins() - >>> x[frozenset(("H1", "L1"))] - 55 - >>> x.centres()[55] - frozenset(['H1', 'L1']) - """ - - names = ("E0", "E1", "E2", "E3", "G1", "H1", "H2", "H1H2+", "H1H2-", "L1", "V1") - - def __init__(self, names): - super(InstrumentBins, self).__init__(frozenset(combo) for n in range(len(names) + 1) for combo in itertools.combinations(names, n)) - - # FIXME: hack to allow instrument binnings to be included as a - # dimension in multi-dimensional PDFs by defining a volume for - # them. investigate more sensible ways to do this. maybe NDBins - # and BinnedDensity should understand the difference between - # functional and parametric co-ordinates. - def lower(self): - return numpy.arange(0, len(self), dtype = "double") - def upper(self): - return numpy.arange(1, len(self) + 1, dtype = "double") - - xml_bins_name = u"instrumentbins" - -# NOTE: side effect of importing this module: -rate.NDBins.xml_bins_name_mapping.update({ - InstrumentBins.xml_bins_name: InstrumentBins, - InstrumentBins: InstrumentBins.xml_bins_name -}) - - -# -# for ranking statistic -# - -def kwarggeniter(d, min_instruments): - d = tuple(sorted(d.items())) - return map(dict, itertools.chain(*(itertools.combinations(d, i) for i in range(min_instruments, len(d) + 1)))) - -def kwarggen(snrs, chi2s_over_snr2s, min_instruments): - for snrs, chi2s_over_snr2s in zip( - kwarggeniter(snrs, min_instruments), - kwarggeniter(chi2s_over_snr2s, min_instruments) - ): - yield { - "snrs": snrs, - "chi2s_over_snr2s": chi2s_over_snr2s, - } - - # # Parameter distributions # -class StringCoincParamsDistributions(snglcoinc.LnLikelihoodRatioMixin): - ligo_lw_name_suffix = u"stringcusp_coincparamsdistributions" +class RankingStat(snglcoinc.LnLikelihoodRatioMixin): + ligo_lw_name_suffix = u"string_rankingstat" @ligolw_array.use_in @ligolw_param.use_in @@ -188,11 +78,10 @@ class StringCoincParamsDistributions(snglcoinc.LnLikelihoodRatioMixin): class LIGOLWContentHandler(ligolw.LIGOLWContentHandler): pass - def __init__(self, instruments=frozenset(("H1", "L1")), min_instruments=2): - self.triangulators = triangulators(dict.fromkeys(instruments, 8e-5)) - self.numerator = string_lr_far.LnSignalDensity(instruments = instruments, min_instruments = min_instruments) - self.denominator = string_lr_far.LnNoiseDensity(instruments = instruments, min_instruments = min_instruments) - self.candidates = string_lr_far.LnLRDensity(instruments = instruments, min_instruments = min_instruments) + def 
__init__(self, delta_t, snr_threshold, instruments = frozenset(("H1", "L1")), min_instruments = 2): + self.numerator = string_lr_far.LnSignalDensity(instruments = instruments, delta_t = delta_t, snr_threshold = snr_threshold, min_instruments = min_instruments) + self.denominator = string_lr_far.LnNoiseDensity(instruments = instruments, delta_t = delta_t, snr_threshold = snr_threshold, min_instruments = min_instruments) + self.candidates = string_lr_far.LnLRDensity(instruments = instruments, delta_t = delta_t, snr_threshold = snr_threshold, min_instruments = min_instruments) @property def instruments(self): @@ -203,51 +92,39 @@ class StringCoincParamsDistributions(snglcoinc.LnLikelihoodRatioMixin): return self.denominator.min_instruments def __iadd__(self, other): - if type(self) != type(other): - raise TypeError(other) - if set(self.triangulators.keys()) != set(other.triangulators.keys()): - raise ValueError("incompatible instruments") self.numerator += other.numerator self.denominator += other.denominator self.candidates += other.candidates return self + def __call__(self, *args, **kwargs): + # NOTE now we use the default definition, but the + # ranking statistic definition can be customized here + # (to include e.g. veto cuts, chisq cuts, ...) + return super(RankingStat, self).__call__(*args, **kwargs) + def copy(self): - new = type(self)([]) - new.triangulators = self.triangulators # share reference + new = type(self)( + instruments = self.instruments, + min_instruments = self.min_instruments, + delta_t = self.denominator.delta_t, + snr_threshold = self.denominator.snr_threshold + ) new.numerator = self.numerator.copy() new.denominator = self.denominator.copy() new.candidates = self.candidates.copy() return new - def coinc_params(self, events, offsetvector): - params = {} - - # - # check for coincs that have been vetoed entirely - # - - if len(events) < 2: - return params - - # - # Initialize the parameter dictionary, sort the events by - # instrument name (the multi-instrument parameters are defined for - # the instruments in this order and the triangulators are - # constructed this way too), and retrieve the sorted instrument - # names - # - - events = tuple(sorted(events, key = lambda event: event.ifo)) - instruments = tuple(event.ifo for event in events) - + def kwargs_from_triggers(self, events, offsetvector): + assert len(events) >= self.min_instruments return dict( - snrs = dict((event.ifo, event.snr) for event in events), - chi2s_over_snr2s = dict((event.ifo, event.chisq / event.chisq_dof / event.snr**2.) for event in events) + snr2s = dict((event.ifo, event.snr**2.0) for event in events), + #chi2s_over_snr2s = dict((event.ifo, event.chisq / event.chisq_dof / event.snr**2.)
for event in events) + chi2s = dict((event.ifo, event.chisq / event.chisq_dof) for event in events) ) def ln_lr_from_triggers(self, events, offsetvector): - return self(**self.coinc_params(events, offsetvector)) + return self(**self.kwargs_from_triggers(events, offsetvector)) def finish(self): self.numerator.finish() @@ -264,17 +141,15 @@ class StringCoincParamsDistributions(snglcoinc.LnLikelihoodRatioMixin): return xml[0] @classmethod - def from_xml(cls, xml, name): + def from_xml(cls, xml, name = u"string_cusp"): xml = cls.get_xml_root(xml, name) self = cls.__new__(cls) self.numerator = string_lr_far.LnSignalDensity.from_xml(xml, "numerator") self.denominator = string_lr_far.LnNoiseDensity.from_xml(xml, "denominator") self.candidates = string_lr_far.LnLRDensity.from_xml(xml, "candidates") - instruments = self.candidates.instruments - self.triangulators = triangulators(dict.fromkeys(instruments, 8e-5)) return self - def to_xml(self, name): + def to_xml(self, name = u"string_cusp"): xml = ligolw.LIGO_LW({u"Name": u"%s:%s" % (name, self.ligo_lw_name_suffix)}) xml.appendChild(self.numerator.to_xml("numerator")) xml.appendChild(self.denominator.to_xml("denominator")) @@ -283,95 +158,361 @@ class StringCoincParamsDistributions(snglcoinc.LnLikelihoodRatioMixin): # -# I/O +# ============================================================================= +# +# Ranking Statistic PDF +# +# ============================================================================= +# + + +def binned_log_likelihood_ratio_rates_from_samples(signal_lr_lnpdf, noise_lr_lnpdf, samples, nsamples): + """ + Populate signal and noise BinnedLnPDF densities from a sequence of + samples (which can be a generator). The first nsamples elements + from the sequence are used. The samples must be a sequence of + three-element tuples (or sequences) in which the first element is a + value of the ranking statistic (likelihood ratio) and the second + and third elements are the logs of the probabilities of obtaining + that value of the ranking statistic in the signal and noise populations + respectively. + """ + exp = math.exp + isnan = math.isnan + signal_lr_lnpdf_count = signal_lr_lnpdf.count + noise_lr_lnpdf_count = noise_lr_lnpdf.count + for ln_lamb, lnP_signal, lnP_noise in itertools.islice(samples, nsamples): + if isnan(ln_lamb): + raise ValueError("encountered NaN likelihood ratio") + if isnan(lnP_signal) or isnan(lnP_noise): + raise ValueError("encountered NaN signal or noise model probability densities") + signal_lr_lnpdf_count[ln_lamb,] += exp(lnP_signal) + noise_lr_lnpdf_count[ln_lamb,] += exp(lnP_noise) + + return signal_lr_lnpdf.array, noise_lr_lnpdf.array + + +# +# Class to compute ranking statistic PDFs for background-like and +# signal-like populations # -def load_likelihood_data(filenames, verbose = False): - coinc_params = None - seglists = None - for n, filename in enumerate(filenames, 1): +class RankingStatPDF(object): + ligo_lw_name_suffix = u"string_rankingstatpdf" + + @staticmethod + def density_estimate(lnpdf, name, kernel = rate.gaussian_window(4.)): + """ + For internal use only. 
+ """ + assert not numpy.isnan(lnpdf.array).any(), "%s log likelihood ratio PDF contains NaNs" % name + rate.filter_array(lnpdf.array, kernel) + + def __init__(self, rankingstat, nsamples = 2**24, verbose = False): + # + # bailout used by .from_xml() class method to get an + # uninitialized instance + # + + if rankingstat is None: + return + + # + # initialize binnings + # + + self.noise_lr_lnpdf = rate.BinnedLnPDF(rate.NDBins((rate.ATanBins(-10., 30., 3000),))) + self.signal_lr_lnpdf = rate.BinnedLnPDF(rate.NDBins((rate.ATanBins(-10., 30., 3000),))) + self.candidates_lr_lnpdf = rate.BinnedLnPDF(rate.NDBins((rate.ATanBins(-10., 30., 3000),))) + + # + # obtain analyzed segments that will be used to obtain livetime + # + + self.segments = segmentsUtils.vote(rankingstat.denominator.triggerrates.segmentlistdict().values(), rankingstat.min_instruments) + + # + # run importance-weighted random sampling to populate binnings. + # + + self.signal_lr_lnpdf.array, self.noise_lr_lnpdf.array = binned_log_likelihood_ratio_rates_from_samples( + self.signal_lr_lnpdf, + self.noise_lr_lnpdf, + rankingstat.ln_lr_samples(rankingstat.denominator.random_params(), rankingstat), + nsamples = nsamples) + + if verbose: + print("done computing ranking statistic PDFs", file=sys.stderr) + + # + # apply density estimation kernels to counts + # + + self.density_estimate(self.noise_lr_lnpdf, "noise model") + self.density_estimate(self.signal_lr_lnpdf, "signal model") + + # + # set the total sample count in the noise and signal + # ranking statistic histogram equal to the total expected + # count of the respective events from the experiment. This + # information is required so that when adding ranking + # statistic PDFs in our .__iadd__() method they are + # combined with the correct relative weights, so that + # .__iadd__() has the effect of marginalizing the + # distribution over the experiments being combined. + # + + self.noise_lr_lnpdf.array /= self.noise_lr_lnpdf.array.sum() + self.noise_lr_lnpdf.normalize() + self.signal_lr_lnpdf.array /= self.signal_lr_lnpdf.array.sum() + self.signal_lr_lnpdf.normalize() + + + def copy(self): + new = self.__class__(None) + new.noise_lr_lnpdf = self.noise_lr_lnpdf.copy() + new.signal_lr_lnpdf = self.signal_lr_lnpdf.copy() + new.candidates_lr_lnpdf = self.candidates_lr_lnpdf.copy() + return new + + def collect_zero_lag_rates(self, connection, coinc_def_id): + for ln_likelihood_ratio in connection.cursor().execute(""" +SELECT + likelihood +FROM + coinc_event +WHERE + coinc_def_id == ? + AND NOT EXISTS ( + SELECT + * + FROM + time_slide + WHERE + time_slide.time_slide_id == coinc_event.time_slide_id + AND time_slide.offset != 0 + ) +""", (coinc_def_id,)): + assert ln_likelihood_ratio is not None, "null likelihood ratio encountered.
+
+	def __iadd__(self, other):
+		self.noise_lr_lnpdf += other.noise_lr_lnpdf
+		self.noise_lr_lnpdf.normalize()
+		self.signal_lr_lnpdf += other.signal_lr_lnpdf
+		self.signal_lr_lnpdf.normalize()
+		self.candidates_lr_lnpdf += other.candidates_lr_lnpdf
+		self.candidates_lr_lnpdf.normalize()
+		return self
+
+	@classmethod
+	def get_xml_root(cls, xml, name):
+		"""
+		Sub-classes can use this in their overrides of the
+		.from_xml() method to find the root element of the XML
+		serialization.
+		"""
+		name = u"%s:%s" % (name, cls.ligo_lw_name_suffix)
+		xml = [elem for elem in xml.getElementsByTagName(ligolw.LIGO_LW.tagName) if elem.hasAttribute(u"Name") and elem.Name == name]
+		if len(xml) != 1:
+			raise ValueError("XML tree must contain exactly one %s element named %s" % (ligolw.LIGO_LW.tagName, name))
+		return xml[0]
+
+	@classmethod
+	def from_xml(cls, xml, name = u"string_cusp"):
+		# find the root of the XML tree containing the
+		# serialization of this object
+		xml = cls.get_xml_root(xml, name)
+		# create a mostly uninitialized instance
+		self = cls(None)
+		# populate from XML
+		self.noise_lr_lnpdf = rate.BinnedLnPDF.from_xml(xml, u"noise_lr_lnpdf")
+		self.signal_lr_lnpdf = rate.BinnedLnPDF.from_xml(xml, u"signal_lr_lnpdf")
+		self.candidates_lr_lnpdf = rate.BinnedLnPDF.from_xml(xml, u"candidates_lr_lnpdf")
+		return self
+
+	def to_xml(self, name = u"string_cusp"):
+		# do not allow ourselves to be written to disk without our
+		# PDF's internal normalization metadata being up to date
+		self.noise_lr_lnpdf.normalize()
+		self.signal_lr_lnpdf.normalize()
+		self.candidates_lr_lnpdf.normalize()
+
+		xml = ligolw.LIGO_LW({u"Name": u"%s:%s" % (name, self.ligo_lw_name_suffix)})
+		xml.appendChild(self.noise_lr_lnpdf.to_xml(u"noise_lr_lnpdf"))
+		xml.appendChild(self.signal_lr_lnpdf.to_xml(u"signal_lr_lnpdf"))
+		xml.appendChild(self.candidates_lr_lnpdf.to_xml(u"candidates_lr_lnpdf"))
+		return xml
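+
+
+# An illustrative sketch (hypothetical variable names; not part of
+# this patch's behaviour):  a RankingStatPDF is constructed from a
+# finished RankingStat, marginalized over analyses with in-place
+# addition, and round-tripped through the ligo.lw document machinery
+# roughly like
+#
+#	pdf = RankingStatPDF(rankingstat, nsamples = 2**24)
+#	pdf += other_pdf	# marginalize over another analysis
+#	xmldoc.childNodes[-1].appendChild(pdf.to_xml())
+#	pdf = RankingStatPDF.from_xml(xmldoc, u"string_cusp")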
 
 
 #
 # =============================================================================
 #
-#                                   Livetime
+#                      False alarm rates/probabilities
 #
 # =============================================================================
 #
 
 
-def time_slides_livetime(seglists, time_slides, min_instruments, verbose = False, clip = None):
-	"""
-	seglists is a segmentlistdict of times when each of a set of
-	instruments were on, time_slides is a sequence of
-	instrument-->offset dictionaries, each vector of offsets in the
-	sequence is applied to the segmentlists and the total time during
-	which at least min_instruments were on is summed and returned.  If
-	clip is not None, after each offset vector is applied to seglists
-	the result is intersected with clip before computing the livetime.
-	If verbose is True then progress reports are printed to stderr.
-	"""
-	livetime = 0.0
-	seglists = seglists.copy()	# don't modify original
-	N = len(time_slides)
-	if verbose:
-		print("computing the live time for %d time slides:" % N, file=sys.stderr)
-	for n, time_slide in enumerate(time_slides):
+#
+# Class to compute false-alarm probabilities and false-alarm rates from
+# ranking statistic PDFs
+#
+
+
+class FAPFAR(object):
+	def __init__(self, rankingstatpdf):
+		# input checks
+		if not rankingstatpdf.candidates_lr_lnpdf.array.any():
+			raise ValueError("RankingStatPDF's zero-lag counts are all zero")
+
+		# obtain livetime from rankingstatpdf
+		self.livetime = float(abs(rankingstatpdf.segments))
+
+		# set the rate normalization LR threshold to the mode of
+		# the zero-lag LR distribution.
+		zl = rankingstatpdf.candidates_lr_lnpdf.copy()
+		rate_normalization_lr_threshold, = zl.argmax()
+
+		# record trials factor, with safety checks
+		counts = rankingstatpdf.candidates_lr_lnpdf.count
+		assert not numpy.isnan(counts.array).any(), "zero lag log likelihood ratio counts contain NaNs"
+		assert (counts.array >= 0.).all(), "zero lag log likelihood ratio counts contain negative values"
+		self.count_above_threshold = counts[rate_normalization_lr_threshold:,].sum()
+
+		# get noise model ranking stat values and event counts
+		# from bins
+		threshold_index = rankingstatpdf.noise_lr_lnpdf.bins[0][rate_normalization_lr_threshold]
+		ranks = rankingstatpdf.noise_lr_lnpdf.bins[0].lower()[threshold_index:]
+		counts = rankingstatpdf.noise_lr_lnpdf.array[threshold_index:]
+		assert not numpy.isnan(counts).any(), "background log likelihood ratio rates contain NaNs"
+		assert (counts >= 0.).all(), "background log likelihood ratio rates contain negative values"
+
+		# complementary cumulative distribution function
+		ccdf = counts[::-1].cumsum()[::-1]
+		ccdf /= ccdf[0]
+
+		# ccdf is P(ranking statistic > threshold | a candidate).
+		# we need P(ranking statistic > threshold), i.e. need to
+		# correct for the possibility that no candidate is present.
+		# specifically, the ccdf needs to equal 1 - 1/e at the
+		# candidate identification threshold, and the cdf needs to
+		# equal 1/e at that threshold, in order for
+		# FAR(threshold) * livetime to equal the actual observed
+		# number of candidates.
+		ccdf = gstlalstats.poisson_p_not_0(ccdf)
+
+		# safety checks
+		assert not numpy.isnan(ranks).any(), "log likelihood ratio co-ordinates contain NaNs"
+		assert not numpy.isinf(ranks).any(), "log likelihood ratio co-ordinates are not all finite"
+		assert not numpy.isnan(ccdf).any(), "log likelihood ratio CCDF contains NaNs"
+		assert ((0. <= ccdf) & (ccdf <= 1.)).all(), "log likelihood ratio CCDF failed to be normalized"
+
+		# build interpolator.
+		self.ccdf_interpolator = interpolate.interp1d(ranks, ccdf)
+
+		# record min and max ranks so we know which end of the ccdf
+		# to use when we're out of bounds
+		self.minrank = ranks[0]
+		self.maxrank = ranks[-1]
+
+	@gstlalstats.assert_probability
+	def ccdf_from_rank(self, rank):
+		return self.ccdf_interpolator(numpy.clip(rank, self.minrank, self.maxrank))
+
+	def fap_from_rank(self, rank):
+		# implements equation (8) from Phys. Rev. D 88, 024025.
+		# arXiv: 1209.0718
+		return gstlalstats.fap_after_trials(self.ccdf_from_rank(rank), self.count_above_threshold)
+
+	def far_from_rank(self, rank):
+		# implements equation (B4) of Phys. Rev. D 88, 024025.
+		# arXiv: 1209.0718.  the return value is divided by T to
+		# convert events/experiment to events/second.  "tdp" =
+		# true-dismissal probability = 1 - single-event FAP, the
+		# integral in equation (B4)
+		log_tdp = numpy.log1p(-self.ccdf_from_rank(rank))
+		return self.count_above_threshold * -log_tdp / self.livetime
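+
+	# An illustrative sketch (hypothetical names; nothing in this
+	# module calls it this way):  given a RankingStatPDF carrying
+	# zero-lag counts, ranking statistic values map to significances
+	# roughly like
+	#
+	#	fapfar = FAPFAR(rankingstatpdf)
+	#	fap = fapfar.fap_from_rank(ln_likelihood_ratio)	# probability
+	#	far = fapfar.far_from_rank(ln_likelihood_ratio)	# events/s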
"tdp" = + # true-dismissal probability = 1 - single-event FAP, the + # integral in equation (B4) + log_tdp = numpy.log1p(-self.ccdf_from_rank(rank)) + return self.count_above_threshold * -log_tdp / self.livetime + + def assign_fapfars(self, connection): + # assign false-alarm probabilities and false-alarm rates + def as_float(f): + def g(x): + return float(f(x)) + return g + connection.create_function("fap_from_rankingstat", 1, as_float(self.fap_from_rank)) + connection.create_function("far_from_rankingstat", 1, as_float(self.far_from_rank)) + connection.cursor().execute(""" +UPDATE + coinc_burst +SET + false_alarm_probability = ( + SELECT + fap_from_rankingstat(coinc_event.likelihood) + FROM + coinc_event + WHERE + coinc_event.coinc_event_id == coinc_burst.coinc_event_id + ), + false_alarm_rate = ( + SELECT + fap_from_rankingstat(coinc_event.likelihood) + FROM + coinc_event + WHERE + coinc_event.coinc_event_id == coinc_burst.coinc_event_id + ) +""") + + +# +# ============================================================================= +# +# I/O +# +# ============================================================================= +# + + +def marginalize_rankingstat(filenames, verbose = False): + rankingstat = None + for n, filename in enumerate(filenames, 1): if verbose: - print("\t%.1f%%\r" % (100.0 * n / N), end=' ', file=sys.stderr) - seglists.offsets.update(time_slide) - if clip is None: - livetime += float(abs(segmentsUtils.vote(seglists.values(), min_instruments))) + print("%d/%d:" % (n, len(filenames)), end=' ', file=sys.stderr) + xmldoc = ligolw_utils.load_filename(filename, verbose = verbose, contenthandler = RankingStat.LIGOLWContentHandler) + if rankingstat is None: + rankingstat = RankingStat.from_xml(xmldoc) else: - livetime += float(abs(segmentsUtils.vote((seglists & clip).values(), min_instruments))) - if verbose: - print("\t100.0%", file=sys.stderr) - return livetime + rankingstat += RankingStat.from_xml(xmldoc) + xmldoc.unlink() + return rankingstat -def time_slides_livetime_for_instrument_combo(seglists, time_slides, instruments, verbose = False, clip = None): - """ - like time_slides_livetime() except computes the time for which - exactly the instruments given by the sequence instruments were on - (and nothing else). 
 
 
-def time_slides_livetime_for_instrument_combo(seglists, time_slides, instruments, verbose = False, clip = None):
-	"""
-	like time_slides_livetime() except computes the time for which
-	exactly the instruments given by the sequence instruments were on
-	(and nothing else).
-	"""
-	livetime = 0.0
-	# segments for instruments that must be on
-	onseglists = seglists.copy(keys = instruments)
-	# segments for instruments that must be off
-	offseglists = seglists.copy(keys = set(seglists) - set(instruments))
-	N = len(time_slides)
-	if verbose:
-		print("computing the live time for %s in %d time slides:" % (", ".join(instruments), N), file=sys.stderr)
-	for n, time_slide in enumerate(time_slides):
+def marginalize_rankingstatpdf(filenames, verbose = False):
+	rankingstatpdf = None
+	for n, filename in enumerate(filenames, start = 1):
 		if verbose:
-			print("\t%.1f%%\r" % (100.0 * n / N), end=' ', file=sys.stderr)
-		onseglists.offsets.update(time_slide)
-		offseglists.offsets.update(time_slide)
-		if clip is None:
-			livetime += float(abs(onseglists.intersection(onseglists.keys()) - offseglists.union(offseglists.keys())))
+			print("%d/%d:" % (n, len(filenames)), end=' ', file=sys.stderr)
+		xmldoc = ligolw_utils.load_filename(filename, verbose = verbose, contenthandler = RankingStat.LIGOLWContentHandler)
+		if rankingstatpdf is None:
+			rankingstatpdf = RankingStatPDF.from_xml(xmldoc)
 		else:
-			livetime += float(abs((onseglists & clip).intersection(onseglists.keys()) - offseglists.union(offseglists.keys())))
-	if verbose:
-		print("\t100.0%", file=sys.stderr)
-	return livetime
+			rankingstatpdf += RankingStatPDF.from_xml(xmldoc)
+		xmldoc.unlink()
+	return rankingstatpdf
 
 
 #
@@ -424,7 +565,7 @@ GROUP BY
 def create_sim_burst_best_string_sngl_map(connection, coinc_def_id):
 	"""
-	Construct a sim_burst --> best matching coinc_event mapping.
+	Construct a sim_burst --> best matching sngl_burst mapping.
 	"""
 	connection.cursor().execute("""
 CREATE TEMPORARY TABLE
@@ -460,42 +601,3 @@ AS
 	WHERE
 		event_id IS NOT NULL
 	""", (coinc_def_id,))
-
-
-def create_sim_burst_best_string_coinc_map(connection, coinc_def_id):
-	"""
-	Construct a sim_burst --> best matching coinc_event mapping for
-	string cusp injections and coincs.
-	"""
-	# FIXME: this hasn't finished being ported from the inspiral code
-	connection.cursor().execute("""
-CREATE TEMPORARY TABLE
-	sim_burst_best_string_coinc_map
-AS
-	SELECT
-		sim_burst.simulation_id AS simulation_id,
-		(
-			SELECT
-				coinc_inspiral.coinc_event_id
-			FROM
-				coinc_event_map AS a
-				JOIN coinc_event_map AS b ON (
-					b.coinc_event_id == a.coinc_event_id
-				)
-				JOIN coinc_inspiral ON (
-					b.table_name == 'coinc_event'
-					AND b.event_id == coinc_inspiral.coinc_event_id
-				)
-			WHERE
-				a.table_name == 'sim_burst'
-				AND a.event_id == sim_burst.simulation_id
-				AND coinc_event.coinc_def_id == ?
-			ORDER BY
-				(sngl_burst.chisq / sngl_burst.chisq_dof) / (sngl_burst.snr * sngl_burst.snr)
-			LIMIT 1
-		) AS coinc_event_id
-	FROM
-		sim_burst
-	WHERE
-		coinc_event_id IS NOT NULL
-	""", (coinc_def_id,))