From b8ec090c10ba753e059c34a262f2f893a97a830c Mon Sep 17 00:00:00 2001
From: Kipp Cannon <kipp.cannon@ligo.org>
Date: Wed, 21 Jun 2017 15:41:31 +0900
Subject: [PATCH] gstlal_inspiral_make_snr_pdf: new version

- add --full feature to enable population of all possible PDFs
- add --full-fragment feature to enable parallelization
- add --seed / --seed-cache feature to enable merging of PDFs generated in parallel
- add process metadata to the output file to record how it was constructed
- remove make_snr_pdf_cache.py from share/ as it is now redundant
---
 .../bin/gstlal_inspiral_make_snr_pdf          | 160 +++++++++++++++---
 gstlal-inspiral/share/make_snr_pdf_cache.py   |  35 ----
 2 files changed, 136 insertions(+), 59 deletions(-)
 delete mode 100644 gstlal-inspiral/share/make_snr_pdf_cache.py

diff --git a/gstlal-inspiral/bin/gstlal_inspiral_make_snr_pdf b/gstlal-inspiral/bin/gstlal_inspiral_make_snr_pdf
index 7c4c59b68f..322c62c01f 100755
--- a/gstlal-inspiral/bin/gstlal_inspiral_make_snr_pdf
+++ b/gstlal-inspiral/bin/gstlal_inspiral_make_snr_pdf
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 #
-# Copyright (C) 2016  Kipp Cannon, Chad Hanna
+# Copyright (C) 2016,2017  Kipp Cannon, Chad Hanna
 #
 # This program is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by the
@@ -33,16 +33,34 @@
 
 
 import itertools
+import logging
 from optparse import OptionParser
-import sys
+import time
 
 
 from glue.ligolw import ligolw
 from glue.ligolw import utils as ligolw_utils
+from glue.ligolw.utils import process as ligolw_process
 from glue.text_progress_bar import ProgressBar
 from gstlal import reference_psd
 from gstlal.stats import inspiral_extrinsics
 import lal.series
+from lal.utils import CacheEntry
+
+
+#
+# =============================================================================
+#
+#                                    Utils
+#
+# =============================================================================
+#
+
+
+def all_instrument_combos(instruments, min_number):
+	for n in range(min_number, len(instruments) + 1):
+		for combo in itertools.combinations(instruments, n):
+			yield combo
 
 
 #
@@ -59,16 +77,54 @@ def parse_command_line():
 		description = ""
 	)
 
+	parser.add_option("-f", "--full", action = "store_true", help = "Generate SNR PDFs for all possible horizon distance ratios as defined by the SNR PDF mechanism's internal quantization system.  Use --instruments to select the detector network to model (required).  A PDF will be constructed for every combination of --min-instruments or more instruments from the set.  When --full enabled, any --horizon-distances options and/or PSD files are ignored.  See also --full-fragment for information on parallelization.")
+	parser.add_option("--full-fragment", metavar = "n/m", default = "1/1", help = "Enable parallelization of --full by selecting a fragment of the full SNR PDFs sequence to generate.  When --full is enabled, the iteration over SNR PDFs employs a reproducible, deterministic, sequence.  That sequence is divided into m (m >= 1) approximately equal-sized intervals, and only the SNR PDFs from the n-th interval (1 <= n <= m) of the sequence will be constructed.  Afterwards, the files generated by several jobs can be combined by re-invoking the program using the --seed or --seed-cache options to obtain the complete SNR PDF collection.  The format of this option is \"n/m\", and the default is \"1/1\", which causes all SNR PDFs to be constructed.")
 	parser.add_option("--horizon-distances", metavar = "instrument=distance[,instrument=distance,...]", action = "append", help = "Cache SNR PDFs for these instruments and horizon distances.  Format is, e.g., H1=120,L1=120,V1=48.  Units for distance are irrelevant (PDFs depend only on their ratios).  A PDF will be constructed for every combination of --min-instruments or more instruments from the set.  All --horizon-distances must list the same instruments (if an instrument is off set its horizon distance to 0).")
-	parser.add_option("--horizon-distance-masses", metavar = "m1,m2", action = "append", default = ["1.4,1.4"], help = "When computing pre-cached SNR PDFs from a collection of PSD files, compute horizon distances for these masses in solar mass units (default = 1.4,1.4).  Can be given multiple times.")
-	parser.add_option("--horizon-distance-flow", metavar = "Hz", default = 10., type = "float", help = "When computing pre-cached SNR PDFs from a collection PSD files, start horizon distance integral at this frequency in Hertz (default = 10 Hz).")
-	parser.add_option("--instruments", metavar = "name[,name,...]", help = "Name the instruments explicitly.  If only PSD files are to be used to get horizon distances (no --horizon-distances options are given) then this options is required, otherwise it is optional.  If both --instruments and --horizon-distances are given, they must list the same instruments.")
+	parser.add_option("--horizon-distance-flow", metavar = "Hz", default = 10., type = "float", help = "When obtaining horizon distances from a collection PSD files, start horizon distance integral at this frequency in Hertz (default = 10 Hz).")
+	parser.add_option("--horizon-distance-masses", metavar = "m1,m2", action = "append", default = ["1.4,1.4"], help = "When obtaining horizon distances from a collection of PSD files, compute them for these masses in solar mass units (default = 1.4,1.4).  Can be given multiple times.")
+	parser.add_option("--instruments", metavar = "name[,name,...]", help = "Set the instruments to include in the network.  If PSD files alone are to be used to provide horizon distances, or --full is selected, then this option is required, otherwise it is optional.  If both --instruments and --horizon-distances are given, they must list the same instruments.")
 	parser.add_option("--min-instruments", metavar = "count", default = 2, type = "int", help = "Set the minimum number of instruments required to form a candidate (default = 2).")
 	parser.add_option("--output", "-o", metavar = "filename", help = "Write SNR PDF cache to this file (required).")
+	parser.add_option("--seed", metavar = "filename", action = "append", default = [], help = "Seed the SNR PDF cache by loading pre-computed SNR PDFs from this file.  Can be given multiple times.")
+	parser.add_option("--seed-cache", metavar = "filename", help = "Seed the SNR PDF cache by loading pre-computed SNR PDFs from the files named in this LAL cache.")
 	parser.add_option("--verbose", "-v", action = "store_true", help = "Be verbose.")
 
 	options, filenames = parser.parse_args()
 
+	processparams = options.__dict__.copy()
+
+	if options.verbose:
+		logging.basicConfig(format = "%(message)s", level = logging.INFO)
+	else:
+		logging.basicConfig(format = "%(message)s", level = logging.WARNING)
+
+	if options.seed_cache:
+		options.seed += [CacheEntry(line).path for line in open(options.seed_cache)]
+
+	if not options.full and not options.horizon_distances and not options.seed and not filenames:
+		raise ValueError("must either enable --full, or provide one or more --horizon-distances, or one or more --seed and/or a --seed-cache, or provide the names of one or more PSD files")
+
+	if options.full:
+		if not options.instruments:
+			raise ValueError("must set --instruments with --full enabled")
+
+		#
+		# disable PSD files and --horizon-distances options.  waste
+		# of time if we're already going to generate all possible
+		# PDFs
+		#
+
+		if filenames:
+			logging.warning("--full enabled, ignoring PSD files")
+		del filenames[:]
+		if options.horizon_distances is not None:
+			logging.warning("--full enabled, ignoring --horizon-distances")
+		options.horizon_distances = None
+
+	options.full_fragment = map(int, options.full_fragment.split("/"))
+	if len(options.full_fragment) != 2 or options.full_fragment[1] < 1 or not 1 <= options.full_fragment[0] <= options.full_fragment[1]:
+		raise ValueError("invalid --full-fragment")
+
 	if options.instruments:
 		options.instruments = set(options.instruments.split(","))
 		if len(options.instruments) < options.min_instruments:
@@ -89,14 +145,16 @@ def parse_command_line():
 			raise ValueError("--instruments does not match --horizon-distances")
 		if any(min(horizon_distances.values()) < 0. for horizon_distances in options.horizon_distances):
 			raise ValueError("all --horizon-distances must be >= 0.")
-	elif not filenames:
-		raise ValueError("must provide either one or more PSD files or one or more --horizon-distances, or one or more of both")
-	elif not options.instruments:
-		raise ValueError("must set --instruments when --horizon-distances not given")
 	else:
-		# we'll populate this later from the PSD files
+		# we'll populate this later
 		options.horizon_distances = []
 
+	# if --full was given it has unset filenames;  if any
+	# --horizon-distances were given then .instruments has been
+	# populated from those if it wasn't already set
+	if filenames and not options.instruments:
+		raise ValueError("must set --instruments if PSD files alone are used for horizon distances")
+
 	options.horizon_distance_masses = [map(float, s.split(",")) for s in options.horizon_distance_masses]
 	if any(len(masses) != 2 for masses in options.horizon_distance_masses):
 		raise ValueError("must provide exactly 2 masses for each --horizon-distance-masses")
@@ -109,7 +167,7 @@ def parse_command_line():
 	if not options.output:
 		raise ValueError("must set --output")
 
-	return options, filenames
+	return options, processparams, filenames
 
 
 #
@@ -126,36 +184,88 @@ def parse_command_line():
 #
 
 
-options, filenames = parse_command_line()
+options, processparams, filenames = parse_command_line()
 
 
 #
-# extract additional horizon distances from PSD files if given
+# initialize empty SNR PDF cache
+#
+
+
+snrpdf = inspiral_extrinsics.SNRPDF(snr_cutoff = 4.)
+snrpdf.snr_joint_pdf_cache.clear()	# just in case
+
+
+#
+# now load seed files if requested.  all SNRPDF instances share a single
+# global cache of PDFs, so loading one from disk has the effect of
+# inserting its PDFs into our existing cache
+#
+
+
+for n, filename in enumerate(options.seed, 1):
+	logging.info("loading seed PDFs %d/%d:" % (n, len(options.seed)))
+	seed_snrpdf = inspiral_extrinsics.SNRPDF.load(open(filename), verbose = options.verbose)
+	if seed_snrpdf.snr_cutoff != snrpdf.snr_cutoff:
+		raise ValueError("incompatible SNR cut-offs")
+	if seed_snrpdf.log_distance_tolerance != snrpdf.log_distance_tolerance:
+		raise ValueError("incompatible distance ratio quantizations")
+	if seed_snrpdf.min_ratio != snrpdf.min_ratio:
+		raise ValueError("incompatible minimum distance ratios")
+
+
+#
+# obtain horizon distances from PSD files if requested
 #
 
 
 for n, filename in enumerate(filenames, 1):
-	if options.verbose:
-		print >>sys.stderr, "%d/%d:" % (n, len(filenames)),
+	logging.info("loading PSD %d/%d:" % (n, len(filenames)))
 	psd = lal.series.read_psd_xmldoc(ligolw_utils.load_filename(filename, contenthandler = lal.series.PSDContentHandler, verbose = options.verbose))
 	for m1, m2 in options.horizon_distance_masses:
 		horizon_distances = dict((instrument, (0. if instrument not in psd else reference_psd.horizon_distance(psd[instrument], m1, m2, 8., options.horizon_distance_flow))) for instrument in options.instruments)
-		if options.verbose:
-			print >>sys.stderr, "\t%s" % ", ".join("%s = %.4g Mpc" % x for x in sorted(horizon_distances.items()))
+		logging.info("\t%s" % ", ".join("%s = %.4g Mpc" % x for x in sorted(horizon_distances.items())))
 		options.horizon_distances.append(horizon_distances)
 
 
 #
-# build and populate SNR PDF cache
+# construct (fragment of) full spectrum of horizon distances if requested
 #
 
 
-snrpdf = inspiral_extrinsics.SNRPDF(snr_cutoff = 4.)
+if options.full:
+	# the use of sorted() in both loops ensures the sequence is
+	# reproducible from one job to the next.  this is required by the
+	# --full-fragment feature.
+	for loudest in sorted(options.instruments):
+		other_instruments = sorted(options.instruments - set((loudest,)))
+		for other_distances in itertools.product(*([snrpdf.quants] * len(other_instruments))):
+			options.horizon_distances.append(snrpdf.quantized_horizon_distances([(loudest, 0.0)] + zip(other_instruments, other_distances)))
+	options.horizon_distances.append(dict.fromkeys(options.instruments, 0.0))
+	# clip total fragment count to total number of ratios
+	n = len(options.horizon_distances)
+	options.full_fragment[1] = min(options.full_fragment[1], n)
+	# clip fragment index to clipped fragment count
+	options.full_fragment[0] = min(*options.full_fragment)
+	# mean horizon distance ratios per fragment
+	ratios_per_fragment = n / float(options.full_fragment[1])
+	# find start and end of interval to compute
+	start = int(round(ratios_per_fragment * (options.full_fragment[0] - 1)))
+	end = int(round(ratios_per_fragment * options.full_fragment[0]))
+	# clip sequence
+	options.horizon_distances = options.horizon_distances[start : end]
+
+
+#
+# populate SNR PDF cache
+#
+
 
-for horizon_distances in options.horizon_distances:
-	for n in range(options.min_instruments, len(horizon_distances) + 1):
-		for instruments in itertools.combinations(horizon_distances, n):
-			snrpdf.add_to_cache(instruments, horizon_distances, verbose = options.verbose)
+start_time = time.time()
+for n, horizon_distances in enumerate(options.horizon_distances, 1):
+	logging.info("horizon distance ratio %d/%d, %g minutes elapsed" % (n, len(options.horizon_distances), (time.time() - start_time) / 60.))
+	for trigger_instruments in all_instrument_combos(options.instruments, options.min_instruments):
+		snrpdf.add_to_cache(trigger_instruments, horizon_distances, verbose = options.verbose)
 
 
 #
@@ -164,5 +274,7 @@ for horizon_distances in options.horizon_distances:
 
 
 xmldoc = ligolw.Document()
-xmldoc.appendChild(snrpdf.to_xml())
+xmldoc.appendChild(ligolw.LIGO_LW())
+process = ligolw_process.register_to_xmldoc(xmldoc, "gstlal_inspiral_make_snr_pdf", processparams, ifos = options.instruments)
+xmldoc.childNodes[0].appendChild(snrpdf.to_xml())
 ligolw_utils.write_filename(xmldoc, options.output, gz = (options.output or "stdout").endswith(".gz"), verbose = options.verbose)
diff --git a/gstlal-inspiral/share/make_snr_pdf_cache.py b/gstlal-inspiral/share/make_snr_pdf_cache.py
deleted file mode 100644
index 75d23a35af..0000000000
--- a/gstlal-inspiral/share/make_snr_pdf_cache.py
+++ /dev/null
@@ -1,35 +0,0 @@
-from glue import iterutils
-from glue.ligolw import ligolw
-from glue.ligolw import utils as ligolw_utils
-from glue.text_progress_bar import ProgressBar
-from gstlal import far
-from gstlal.stats import inspiral_extrinsics
-
-def all_instrument_combos(instruments, min_number):
-	for n in range(min_number, len(instruments) + 1):
-		for combo in iterutils.choices(instruments, n):
-			yield combo
-
-diststats, _, segs = far.parse_likelihood_control_doc(ligolw_utils.load_filename("all_o1_likelihood.xml.gz", contenthandler = far.ThincaCoincParamsDistributions.LIGOLWContentHandler, verbose = True))
-#diststats.finish(segs, verbose = True)
-if set(diststats.horizon_history) != set(diststats.instruments):
-	raise ValueError("horizon histories are for %s, need for %s" % (", ".join(sorted(diststats.horizon_history)), ", ".join(sorted(diststats.instruments)))
-
-snrpdf = inspiral_extrinsics.SNRPDF(diststats.snr_min)
-snrpdf.snr_joint_pdf_cache.clear()
-
-all_horizon_distances = diststats.horizon_history.all()
-with ProgressBar(max = len(all_horizon_distances)) as progressbar:
-	for horizon_distances in all_horizon_distances:
-		progressbar.increment(text = ",".join(sorted("%s=%.5g" % item for item in horizon_distances.items())))
-		progressbar.show()
-		for off_instruments in all_instrument_combos(horizon_distances, 0):
-			dists = horizon_distances.copy()
-			for instrument in off_instruments:
-				dists[instrument] = 0.
-			for instruments in all_instrument_combos(diststats.instruments, diststats.min_instruments):
-				snrpdf.add_to_cache(instruments, dists, verbose = True)
-
-xmldoc = ligolw.Document()
-xmldoc.appendChild(ligolw.LIGO_LW()).appendChild(snrpdf.to_xml())
-ligolw_utils.write_filename(xmldoc, "inspiral_snr_pdf.xml.gz", gz = True, verbose = True)
-- 
GitLab