
Online new extinction

Merged: Prathamesh Joshi requested to merge o4b-online-new-extinction into o4b-online
@@ -152,23 +152,8 @@ def main():
 	# set up the output paths
 	#
-	marg_pdf_exists = os.path.isfile(options.output)
-	pdfs = DataCache.find(DataType.DIST_STAT_PDFS, svd_bins = "*")
-	if marg_pdf_exists and len(pdfs) == len(registries):
-		files_exist = True
-	elif not marg_pdf_exists and len(pdfs) == 0:
-		files_exist = False
-	elif marg_pdf_exists and len(pdfs) != len(registries):
-		raise ValueError(f"Number of registry files provided ({len(registries)}) does not match number of DIST_STAT_PDF files found ({len(pdfs)})")
-	else:
-		raise ValueError("Could not find marg DIST_STAT_PDF file")
-
 	svd_bins = [reg[:4] for reg in registries]
-	if files_exist:
-		assert set(pdfs.groupby('svd_bin').keys()) == set(svd_bins), "svd bins of registry files are not the same as svd bins of found PDFs"
-	else:
-		pdfs = DataCache.generate(DataType.DIST_STAT_PDFS, CacheEntry.from_T050017(options.output).observatory, svd_bins = svd_bins)
+	pdfs = DataCache.generate(DataType.DIST_STAT_PDFS, CacheEntry.from_T050017(options.output).observatory, svd_bins = svd_bins)
 
 	pdfs = pdfs.groupby('svd_bin')
 
 	#
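With this change the script no longer probes for existing PDFs up front; it always builds the expected per-bin cache entries and defers the existence check to a per-file os.path.isfile() in the loop below. For context, a minimal sketch of the per-bin bookkeeping that DataCache.generate(...).groupby('svd_bin') is assumed to provide; the mock class and file-name pattern here are hypothetical, not the real gstlal DataCache API.

# Minimal sketch of the assumed per-bin bookkeeping: one entry (with a .files
# list) per 4-character SVD bin prefix taken from each registry file name.
from dataclasses import dataclass, field

@dataclass
class MockCache:
	files: list = field(default_factory=list)

def expected_pdfs_by_bin(registries, observatory = "H1L1"):
	# map each registry's 4-character SVD bin prefix to a hypothetical PDF path
	pdfs = {}
	for reg in registries:
		svd_bin = reg[:4]
		pdfs[svd_bin] = MockCache(files = [f"{observatory}-{svd_bin}_GSTLAL_DIST_STAT_PDFS-0-0.xml.gz"])
	return pdfs

# e.g. expected_pdfs_by_bin(["0001_registry.txt"])["0001"].files[0] gives the path
# that is later checked with os.path.isfile() before loading an old PDF.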
@@ -220,8 +205,12 @@ def main():
 			url = url_from_registry(reg, likelihood_path)
 			svd_bin = reg[:4]
 
-			# load the old ranking stat pdf for this bin:
-			old_pdf = far.parse_likelihood_control_doc(ligolw_utils.load_url(pdfs[svd_bin][0], verbose = options.verbose, contenthandler = far.RankingStat.LIGOLWContentHandler)) if files_exist else None
+			if os.path.isfile(pdfs[svd_bin].files[0]):
+				# load the old ranking stat pdf for this bin:
+				_, old_pdf = far.parse_likelihood_control_doc(ligolw_utils.load_url(pdfs[svd_bin].files[0], verbose = options.verbose, contenthandler = far.RankingStat.LIGOLWContentHandler))
+			else:
+				logging.warning(f"Couldn't find {pdfs[svd_bin].files[0]}, starting from scratch")
+				old_pdf = None
 
 			# create the new ranking stat pdf and marginalize as we go
 			new_pdf_status, pdf = calc_rank_pdfs(url, ranking_stat_samples, options.num_cores, verbose = options.verbose)
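This hunk replaces the global files_exist flag with a per-file check. A hedged sketch of that load-or-start-fresh pattern, with load_pdf as a hypothetical stand-in for the ligolw_utils.load_url() / far.parse_likelihood_control_doc() combination used in the diff:

import logging
import os

def load_old_pdf(path, load_pdf):
	# return the previously accumulated ranking stat PDF at `path`, or None if the
	# file is not on disk yet (first iteration, or a bin that never produced one)
	if os.path.isfile(path):
		return load_pdf(path)
	logging.warning(f"Couldn't find {path}, starting from scratch")
	return None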
@@ -256,16 +245,32 @@ def main():
 			# get the zerolag pdf for this bin and use it to perform bin-specific extinction
 			zerolag_counts_url = url_from_registry(reg, zerolag_counts_path)
 			pdf += far.RankingStatPDF.from_xml(ligolw_utils.load_url(zerolag_counts_url, verbose = options.verbose, contenthandler = far.RankingStat.LIGOLWContentHandler), u"gstlal_inspiral_likelihood")
 
-			if data:
-				data += pdf.new_with_extinction()
-			else:
-				data = pdf.new_with_extinction()
+			if pdf.ready_for_extinction():
+				# LR calculation has started and we are ready to perform first-round extinction
+				if data:
+					data += pdf.new_with_extinction()
+				else:
+					data = pdf.new_with_extinction()
+			else:
+				# add a zeroed-out PDF instead, so that the template ids get added to data
+				logging.warning(f'Skipping first-round extinction for {pdfs[svd_bin].files[0]}, using an empty PDF instead')
+				pdf.noise_lr_lnpdf.array[:] = 0.
+				pdf.signal_lr_lnpdf.array[:] = 0.
+				pdf.zero_lag_lr_lnpdf.array[:] = 0.
+				if data:
+					data += pdf
+				else:
+					data = pdf
 
 			# while looping through registries
 			# send heartbeat messages
 			if kafka_processor:
 				kafka_processor.heartbeat()
 
+		# zero out the zerolag after the first round of extinction is finished
+		if data:
+			data.zero_lag_lr_lnpdf.count.array[:] = 0
 
 		# if we fail to complete more than 1% of the bins,
 		# this is a serious problem and we should just quit
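The new ready_for_extinction() gate decides whether a bin contributes its extincted PDF or only an emptied one, so that its template ids still reach data. A toy illustration of that accumulate-or-initialize control flow, with bare numpy arrays standing in for RankingStatPDF histograms and the actual extinction step (new_with_extinction()) omitted:

import numpy

def accumulate(data, counts, ready):
	if not ready:
		# bin not ready for first-round extinction: contribute zeroed counts so
		# only the bin's structure is merged into the running marginalized sum
		counts = numpy.zeros_like(counts)
	if data is None:
		return counts.copy()
	return data + counts

# mirror the loop over registries: two ready bins and one that is not ready yet
data = None
for counts, ready in [(numpy.ones(4), True), (numpy.ones(4), True), (numpy.ones(4), False)]:
	data = accumulate(data, counts, ready)
print(data)  # [2. 2. 2. 2.] -- the not-ready bin contributed nothing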
@@ -284,7 +289,7 @@
 		# noise and signal model ranking statistic histograms in the
 		# zero-lag counts files downloaded from the jobs must be all 0, and
 		# the zero-lag counts in the output generated by
 		# gstlal_inspiral_calc_rank_pdfs must be 0. NOTE: this is where
 		# the zero-lag counts have the density estimation transform
 		# applied.
 		zerolag_counts_url = url_from_registry("gstlal_ll_inspiral_trigger_counter_registry.txt", zerolag_counts_path)
 
 		# add zerolag counts url to marginalized data
-		data += far.RankingStatPDF.from_xml(ligolw_utils.load_url(zerolag_counts_url, verbose = options.verbose, contenthandler = far.RankingStat.LIGOLWContentHandler), u"gstlal_inspiral_likelihood")
+		if data:
+			data += far.RankingStatPDF.from_xml(ligolw_utils.load_url(zerolag_counts_url, verbose = options.verbose, contenthandler = far.RankingStat.LIGOLWContentHandler), u"gstlal_inspiral_likelihood")
+		else:
+			data = far.RankingStatPDF.from_xml(ligolw_utils.load_url(zerolag_counts_url, verbose = options.verbose, contenthandler = far.RankingStat.LIGOLWContentHandler), u"gstlal_inspiral_likelihood")
 
 		if kafka_processor:
 			kafka_processor.heartbeat()
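A hedged sketch of a consistency check for the invariant stated in the comment above: the zero-lag counts contribution must carry only zero-lag counts, so its noise and signal histograms should be identically zero. Plain numpy arrays stand in for the noise_lr_lnpdf.array and signal_lr_lnpdf.array members.

import numpy

def assert_zerolag_only(noise_array, signal_array):
	assert not numpy.any(noise_array), "noise histogram in the zero-lag counts file must be all 0"
	assert not numpy.any(signal_array), "signal histogram in the zero-lag counts file must be all 0"

# example: an all-zero noise/signal pair passes the check
assert_zerolag_only(numpy.zeros((2, 2)), numpy.zeros((2, 2)))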
@@ -309,10 +317,6 @@ def main():
 		ligolw_utils.write_filename(xmldoc, options.output, verbose = options.verbose)
 		logging.info(f"Done marginalizing likelihoods.")
 
-		# we just created the bin-specific and marg DIST_STAT_PDFs,
-		# so the files definitely exist for the next iteration of the loop
-		files_exist = True
-
 		if kafka_processor:
 			kafka_processor.heartbeat()