#
# Copyright (C) 2006--2010  Kipp Cannon
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.


#
# =============================================================================
#
#                                   Preamble
#
# =============================================================================
#


"""
String cusp search final output rendering tool.
"""


import bisect
import heapq
import itertools
from optparse import OptionParser
import math
from matplotlib import patches
import numpy
import os
import cPickle as pickle
import random
import select
from scipy import interpolate
from scipy import optimize
import Queue as queue
import sqlite3
import sys
import traceback


from lal.utils import CacheEntry


from glue import segments
from glue import segmentsUtils
from glue.ligolw import dbtables
from glue.ligolw.utils import process as ligolwprocess
from lalburst import git_version
from lalburst import packing
from pylal import rate
from lalburst import SimBurstUtils
from lalburst import SnglBurstUtils
from lalburst import stringutils


SnglBurstUtils.matplotlib.rcParams.update({
	"font.size": 10.0,
	"axes.titlesize": 10.0,
	"axes.labelsize": 10.0,
	"xtick.labelsize": 8.0,
	"ytick.labelsize": 8.0,
	"legend.fontsize": 8.0,
	"grid.linestyle": "-"
})


__author__ = "Kipp Cannon <kipp.cannon@ligo.org>"
__version__ = "git id %s" % git_version.id
__date__ = git_version.date


#
# =============================================================================
#
#                                 Command Line
#
# =============================================================================
#


def parse_command_line():
	parser = OptionParser(
		version = "Name: %%prog\n%s" % git_version.verbose_msg,
		usage = "%prog [options] [file ...]",
		description = "%prog performs the final, summary, stages of the upper-limit string cusp search.  Input consists of a list of all sqlite format database files produced by all injection and non-injection runs of the analysis pipeline.  The file names can be given on the command line and/or provided in a LAL cache file."
	)
	parser.add_option("--cal-uncertainty", metavar = "fraction", type = "float", help = "Set the fractional uncertainty in amplitude due to calibration uncertainty (eg. 0.08).  This option is required, use 0 to disable calibration uncertainty.")
	parser.add_option("--injections-bin-size", metavar = "bins", type = "float", default = 16.7, help = "Set bin width for injection efficiency curves (default = 16.7).")
	parser.add_option("-c", "--input-cache", metavar = "filename", action = "append", help = "Also process the files named in this LAL cache.  See lalapps_path2cache for information on how to produce a LAL cache file.  This option can be given multiple times.")
	parser.add_option("--import-dump", metavar = "filename", action = "append", help = "Import additional rate vs. threshold or efficiency data from this dump file.  Dump files are one of the data products produced by this program.  Whether the file provides rate vs. threshold data or efficiency data will be determined automatically.  This option can be given multiple times")
	parser.add_option("--image-formats", metavar = "ext[,ext,...]", default = "png,pdf", help = "Set list of graphics formats to produce by providing a comma-delimited list of the filename extensions (default = \"png,pdf\").")
	parser.add_option("-p", "--live-time-program", metavar = "program", default = "StringSearch", help = "Set the name, as it appears in the process table, of the program whose search summary entries define the search live time (default = StringSearch).")
	parser.add_option("-o", "--open-box", action = "store_true", help = "Perform open-box analysis.  In a closed-box analysis (the default), information about the events seen at zero-lag is concealed:  the rate vs. threshold plot only shows the rate of events seen in the background, the detection threshold used to measure the efficiency curves is obtained from n-th loudest background event where n is (the integer closest to) the ratio of background livetime to zero-lag livetime, and messages to stdout and stderr that contain information about event counts at zero-lag are silenced.")
	parser.add_option("-t", "--tmp-space", metavar = "path", help = "Path to a directory suitable for use as a work area while manipulating the database file.  The database file will be worked on in this directory, and then moved to the final location when complete.  This option is intended to improve performance when running in a networked environment, where there might be a local disk with higher bandwidth than is available to the filesystem on which the final output will reside.")
	parser.add_option("--vetoes-name", metavar = "name", help = "Set the name of the segment lists to use as vetoes (default = do not apply vetoes).")
	parser.add_option("--detection-threshold", metavar = "likelihood", type = "float", help = "Override the detection threshold.  Only injection files will be processed, and the efficiency curve measured.")
	parser.add_option("--record-background", metavar = "N", type = "int", default = 10000000, help = "Set the number of background likelihood ratios to hold in memory for producing the rate vs. threshold plot (default = 10000000).")
	parser.add_option("--record-candidates", metavar = "N", type = "int", default = 100, help = "Set the number of highest-ranked zero-lag candidates to dump to the candidate file (default = 100).")
	parser.add_option("--threads", metavar = "N", type = "int", default = 1, help = "Set the maximum number of parallel threads to use for processing files (default = 1).  Contention for the global Python interpreter lock will throttle the true number that can run.  The number of threads will be automatically adjusted downwards if the number requested exceeds the number of input files.")
	parser.add_option("-v", "--verbose", action = "store_true", help = "Be verbose.")
	options, filenames = parser.parse_args()

	if options.cal_uncertainty is None:
		raise ValueError("must set --cal-uncertainty (use 0 to ignore calibration uncertainty)")

	options.image_formats = options.image_formats.split(",")

	if options.input_cache:
		filenames += [CacheEntry(line).path for input_cache in options.input_cache for line in file(input_cache)]

	if options.threads < 1:
		raise ValueError("--threads must be >= 1")

	return options, filenames


#
# =============================================================================
#
#                              Rate vs. Threshold
#
# =============================================================================
#


def ratevsthresh_bounds(background_x, background_y, zero_lag_x, zero_lag_y):
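	"""
	Compute the x and y axis bounds for the rate vs. threshold plot
	from the background and zero-lag curves (zero-lag is used when
	available, otherwise the background alone).  The lower x bound is
	then overridden with a hard-coded value and the upper y bound
	re-evaluated at that threshold so the visible portions of the
	curves fill the axes.
	"""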
	if len(zero_lag_x):
		if len(background_x):
			minX, maxX = min(min(zero_lag_x), min(background_x)), max(max(zero_lag_x), max(background_x))
			minY, maxY = min(min(zero_lag_y), min(background_y)), max(max(zero_lag_y), max(background_y))
		else:
			minX, maxX = min(zero_lag_x), max(zero_lag_x)
			minY, maxY = min(zero_lag_y), max(zero_lag_y)
	else:
		# don't check for background, if there's no zero-lag and no
		# background we're screwed anyway
		minX, maxX = min(background_x), max(background_x)
		minY, maxY = min(background_y), max(background_y)

	# override the plot's X axis lower bound
	minX = 1e-2	# FIXME:  don't hard-code
	if len(zero_lag_x):
		if len(background_x):
			maxY = max(zero_lag_y[bisect.bisect_left(zero_lag_x, minX)], background_y[bisect.bisect_left(background_x, minX)])
		else:
			maxY = zero_lag_y[bisect.bisect_left(zero_lag_x, minX)]
	else:
		maxY = background_y[bisect.bisect_left(background_x, minX)]

	return minX, maxX, minY, maxY


def expected_count(x, background_x, background_y):
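	"""
	Look up the value of the cumulative background curve
	(background_x, background_y) at threshold x.  background_x must
	be sorted in increasing order;  0 is returned if x exceeds the
	largest threshold recorded in background_x.
	"""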
	if x > background_x[-1]:
		return 0
	return background_y[bisect.bisect_left(background_x, x)]


def compress_ratevsthresh_curve(x, y, yerr):
	#
	# construct a background mask to retain the highest-ranked 10,000
	# elements, then every 10th until the 100,000th highest-ranked
	# element, then every 100th after that.  this is for reducing the
	# dataset size so matplotlib can handle it and vector graphics
	# output isn't ridiculous in size.
	#

	mask = numpy.arange(len(x))[::-1]
	mask = (mask < 10000) | ((mask < 100000) & (mask % 10 == 0)) | (mask % 100 == 0)

	return x.compress(mask), y.compress(mask), yerr.compress(mask)


class RateVsThreshold(object):
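	"""
	Book-keeping for the rate vs. threshold measurement.  Accumulates
	the ranking statistics of zero-lag and time-slide (background)
	coincs together with the corresponding live times, retains the
	highest-ranked candidates in each category, and renders the rate
	vs. threshold and count residual plots in .finish().
	"""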
	def __init__(self, open_box, record_background, record_candidates):
		self.zero_lag = []
		self.background = []
		self.n_background = 0
		self.record_background = record_background
		self.most_significant_background = []
		self.candidates = []
		self.record_candidates = record_candidates
		self.zero_lag_time = 0.0
		self.background_time = 0.0
		self.open_box = open_box

	def __iadd__(self, other):
		self.zero_lag += other.zero_lag
		self.n_background += other.n_background
		self.background[:] = heapq.nlargest(self.record_background, itertools.chain(self.background, other.background))
		self.most_significant_background[:] = heapq.nlargest(self.record_candidates, itertools.chain(self.most_significant_background, other.most_significant_background))
		self.candidates[:] = heapq.nlargest(self.record_candidates, itertools.chain(self.candidates, other.candidates))
		self.zero_lag_time += other.zero_lag_time
		self.background_time += other.background_time
		return self

	def add_contents(self, contents, verbose = False):
		#
		# retrieve the offset vectors, retain only instruments that
		# are available
		#

		zero_lag_time_slides, background_time_slides = SnglBurstUtils.get_time_slides(contents.connection)
		assert len(zero_lag_time_slides) == 1

		#
		# compute the live time
		#

		seglists = contents.seglists - contents.vetoseglists
		self.zero_lag_time += stringutils.time_slides_livetime(seglists, zero_lag_time_slides.values(), 2, clip = contents.coincidence_segments)
		self.background_time += stringutils.time_slides_livetime(seglists, background_time_slides.values(), 2, clip = contents.coincidence_segments)
		if set(("H1", "H2")).issubset(set(contents.seglists)):
			# we have segments for both H1 and H2, remove time
			# when exactly that pair are on
			self.zero_lag_time -= stringutils.time_slides_livetime_for_instrument_combo(seglists, zero_lag_time_slides.values(), ("H1", "H2"), clip = contents.coincidence_segments)
			self.background_time -= stringutils.time_slides_livetime_for_instrument_combo(seglists, background_time_slides.values(), ("H1", "H2"), clip = contents.coincidence_segments)

		#
		# count events
		#

		for likelihood_ratio, instruments, coinc_event_id, peak_time, is_background in contents.connection.cursor().execute("""
SELECT
	coinc_event.likelihood,
	coinc_event.instruments,
	coinc_event.coinc_event_id,
	(
		SELECT
			AVG(sngl_burst.peak_time) + 1e-9 * AVG(sngl_burst.peak_time_ns)
		FROM
			sngl_burst
			JOIN coinc_event_map ON (
				coinc_event_map.coinc_event_id == coinc_event.coinc_event_id
				AND coinc_event_map.table_name == 'sngl_burst'
				AND coinc_event_map.event_id == sngl_burst.event_id
			)
	),
	EXISTS (
		SELECT
			*
		FROM
			time_slide
		WHERE
			time_slide.time_slide_id == coinc_event.time_slide_id
			AND time_slide.offset != 0
	)
FROM
	coinc_event
WHERE
	coinc_event.coinc_def_id == ?
		""", (contents.bb_definer_id,)):
			# likelihood ratio must be listed first to
			# act as the sort key
			record = (likelihood_ratio, contents.filename, coinc_event_id, dbtables.lsctables.instrument_set_from_ifos(instruments), peak_time)
			if likelihood_ratio is None:
				# coinc got vetoed (unable to compute
				# likelihood)
				pass
			elif is_background:
				# non-vetoed background
				self.n_background += 1
				if len(self.background) < self.record_background:
					heapq.heappush(self.background, likelihood_ratio)
				else:
					heapq.heappushpop(self.background, likelihood_ratio)
				if len(self.most_significant_background) < self.record_candidates:
					heapq.heappush(self.most_significant_background, record)
				else:
					heapq.heappushpop(self.most_significant_background, record)
			else:
				# non-vetoed zero lag
				self.zero_lag.append(likelihood_ratio)
				if len(self.candidates) < self.record_candidates:
					heapq.heappush(self.candidates, record)
				else:
					heapq.heappushpop(self.candidates, record)

	def finish(self):
		#
		# dump info about highest-ranked zero-lag and background
		# events
		#

		self.most_significant_background.sort()
		self.candidates.sort()

		f = file("string_most_significant_background.txt", "w")
		print >>f, "Highest-Ranked Background Events"
		print >>f, "================================"
		for likelihood_ratio, filename, coinc_event_id, instruments, peak_time in self.most_significant_background:
			print >>f
			print >>f, "%s in %s:" % (str(coinc_event_id), filename)
			print >>f, "Recovered in: %s" % ", ".join(sorted(instruments or []))
			print >>f, "Mean peak time:  %.16g s GPS" % peak_time
			print >>f, "\\Lambda:  %.16g" % likelihood_ratio

		f = file("string_candidates.txt", "w")
		print >>f, "Highest-Ranked Zero-Lag Events"
		print >>f, "=============================="
		if self.open_box:
			for likelihood_ratio, filename, coinc_event_id, instruments, peak_time in self.candidates:
				print >>f
				print >>f, "%s in %s:" % (str(coinc_event_id), filename)
				print >>f, "Recovered in: %s" % ", ".join(sorted(instruments or []))
				print >>f, "Mean peak time:  %.16g s GPS" % peak_time
				print >>f, "\\Lambda:  %.16g" % likelihood_ratio
		else:
			print >>f
			print >>f, "List suppressed:  box is closed"

		#
		# sort the ranking statistics and convert to arrays.
		#

		self.background.sort()
		self.zero_lag.sort()
		self.zero_lag = numpy.array(self.zero_lag, dtype = "double")
		background_x = numpy.array(self.background, dtype = "double")

		#
		# convert counts to rates and their uncertainties
		#

		# background count expected in zero-lag and \sqrt{N}
		# standard deviation
		background_y = numpy.arange(len(background_x), 0.0, -1.0, dtype = "double") / self.background_time * self.zero_lag_time
		background_yerr = numpy.sqrt(background_y)

		# count observed in zero-lag
		zero_lag_y = numpy.arange(len(self.zero_lag), 0.0, -1.0, dtype = "double")

		# compute zero-lag - expected residual in units of \sqrt{N}
		zero_lag_residual = numpy.zeros((len(self.zero_lag),), dtype = "double")
		for i in xrange(len(self.zero_lag)):
			n = expected_count(self.zero_lag[i], background_x, background_y)
			zero_lag_residual[i] = (zero_lag_y[i] - n) / math.sqrt(n)

		# convert counts to rates
		background_y /= self.zero_lag_time
		background_yerr /= self.zero_lag_time
		zero_lag_y /= self.zero_lag_time

		#
		# determine the horizontal and vertical extent of the plot
		#

		if self.open_box:
			minX, maxX, minY, maxY = ratevsthresh_bounds(background_x, background_y, self.zero_lag, zero_lag_y)
		else:
			minX, maxX, minY, maxY = ratevsthresh_bounds(background_x, background_y, [], [])

		#
		# compress the background data
		#

		background_x, background_y, background_yerr = compress_ratevsthresh_curve(background_x, background_y, background_yerr)

		#
		# start the rate vs. threshold plot
		#

		ratefig, axes = SnglBurstUtils.make_burst_plot(r"Likelihood Ratio Threshold $\Lambda$", "Event Rate (Hz)", width = 108.0)
		axes.loglog()
		axes.set_position([0.125, 0.15, 0.83, 0.75])
		axes.xaxis.grid(True, which = "major,minor")
		axes.yaxis.grid(True, which = "major,minor")
		if self.open_box:
			axes.set_title(r"Event Rate vs.\ Likelihood Ratio Threshold")
		else:
			axes.set_title(r"Event Rate vs.\ Likelihood Ratio Threshold (Closed Box)")

		# warning:  the error bar polygon is not *really* clipped
		# to the axes' bounding box, the result will be incorrect
		# if the density of data points is small where the polygon
		# encounters the axes' bounding box.

		poly_x = numpy.concatenate((background_x, background_x[::-1]))
		poly_y = numpy.concatenate((background_y + 1 * background_yerr, (background_y - 1 * background_yerr)[::-1]))
		axes.add_patch(patches.Polygon(zip(poly_x, numpy.clip(poly_y, minY, maxY)), edgecolor = "k", facecolor = "k", alpha = 0.3))
		line1, = axes.loglog(background_x.repeat(2)[:-1], background_y.repeat(2)[1:], color = "k", linestyle = "--")
		if self.open_box:
			line2, = axes.loglog(self.zero_lag.repeat(2)[:-1], zero_lag_y.repeat(2)[1:], color = "k", linestyle = "-", linewidth = 2)
			axes.legend((line1, line2), (r"Expected background", r"Zero-lag"), loc = "lower left")
		else:
			axes.legend((line1,), (r"Expected background",), loc = "lower left")

		axes.set_xlim((minX, maxX))
		axes.set_ylim((minY, maxY))

		#
		# start the count residual vs. threshold plot
		#

		residualfig, axes = SnglBurstUtils.make_burst_plot(r"Likelihood Ratio Threshold $\Lambda$", r"Event Count Residual / $\sqrt{N}$", width = 108.0)
		axes.semilogx()
		axes.set_position([0.125, 0.15, 0.83, 0.75])
		axes.xaxis.grid(True, which = "major,minor")
		axes.yaxis.grid(True, which = "major,minor")
		if self.open_box:
			axes.set_title(r"Event Count Residual vs.\ Likelihood Ratio Threshold")
		else:
			axes.set_title(r"Event Count Residual vs.\ Likelihood Ratio Threshold (Closed Box)")

		axes.add_patch(patches.Polygon(((minX, -1), (maxX, -1), (maxX, +1), (minX, +1)), edgecolor = "k", facecolor = "k", alpha = 0.3))
		if self.open_box:
			line1, = axes.semilogx(self.zero_lag, zero_lag_residual, color = "k", linestyle = "-", linewidth = 2)

		axes.set_xlim((minX, maxX))

		#
		# done
		#

		return ratefig, residualfig


#
# =============================================================================
#
#                                  Efficiency
#
# =============================================================================
#


def slope(x, y):
	"""
	From the x and y arrays, compute the slope at the x co-ordinates
	using a first-order finite difference approximation.
	"""
	slope = numpy.zeros((len(x),), dtype = "double")
	slope[0] = (y[1] - y[0]) / (x[1] - x[0])
	for i in xrange(1, len(x) - 1):
		slope[i] = (y[i + 1] - y[i - 1]) / (x[i + 1] - x[i - 1])
	slope[-1] = (y[-1] - y[-2]) / (x[-1] - x[-2])
	return slope


def upper_err(y, yerr, deltax):
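	"""
	Return the upper error bar sizes:  for each sample, the distance
	from y to the largest value of y + yerr found within +/- deltax
	samples.
	"""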
	z = y + yerr
	deltax = int(deltax)
	upper = numpy.zeros((len(yerr),), dtype = "double")
	for i in xrange(len(yerr)):
		upper[i] = max(z[max(i - deltax, 0) : min(i + deltax, len(z))])
	return upper - y


def lower_err(y, yerr, deltax):
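	"""
	Return the lower error bar sizes:  for each sample, the distance
	from y to the smallest value of y - yerr found within +/- deltax
	samples.
	"""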
	z = y - yerr
	deltax = int(deltax)
	lower = numpy.zeros((len(yerr),), dtype = "double")
	for i in xrange(len(yerr)):
		lower[i] = min(z[max(i - deltax, 0) : min(i + deltax, len(z))])
	return y - lower


def write_efficiency(fileobj, bins, eff, yerr):
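	"""
	Write the efficiency curve to fileobj as three tab-delimited
	columns:  amplitude bin centre, efficiency, and efficiency
	uncertainty.
	"""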
	print >>fileobj, "# A	e	D[e]"
	for A, e, De in zip(bins.centres()[0], eff, yerr):
		print >>fileobj, "%.16g	%.16g	%.16g" % (A, e, De)


def render_data_from_bins(dump_file, axes, efficiency_num, efficiency_den, cal_uncertainty, filter_width, colour = "k", erroralpha = 0.3, linestyle = "-"):
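	"""
	Construct the efficiency curve from the numerator and denominator
	binnings, draw it and its uncertainty band on axes, write the
	curve to dump_file, and return the plotted line together with the
	50% efficiency amplitude and its uncertainty.
	"""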
	# extract array of x co-ordinates, and the factor by which x
	# increases from one sample to the next.
	(x,) = efficiency_den.centres()
	x_factor_per_sample = efficiency_den.bins()[0].delta

	# compute the efficiency, the slope (units = efficiency per
	# sample), the y uncertainty (units = efficiency) due to binomial
	# counting fluctuations, and the x uncertainty (units = samples)
	# due to the width of the smoothing filter.
	eff = efficiency_num.array / efficiency_den.array
	dydx = slope(numpy.arange(len(x), dtype = "double"), eff)
	yerr = numpy.sqrt(eff * (1 - eff) / efficiency_den.array)
	xerr = numpy.array([filter_width / 2] * len(yerr))

	# compute the net y err (units = efficiency) by (i) multiplying the
	# x err by the slope, (ii) dividing the calibration uncertainty
	# (units = percent) by the fractional change in x per sample and
	# multiplying by the slope, (iii) adding the two in quadrature with
	# the y err.
	net_yerr = numpy.sqrt((xerr * dydx)**2 + yerr**2 + (cal_uncertainty / x_factor_per_sample * dydx)**2)

	# compute net xerr (units = percent) by dividing yerr by slope and
	# then multiplying by the fractional change in x per sample.
	net_xerr = net_yerr / dydx * x_factor_per_sample

	# write the efficiency data to file
	write_efficiency(dump_file, efficiency_den.bins(), eff, net_yerr)

	# plot the efficiency curve and uncertainty region
	patch = patches.Polygon(zip(numpy.concatenate((x, x[::-1])), numpy.concatenate((eff + upper_err(eff, yerr, filter_width / 2), (eff - lower_err(eff, yerr, filter_width / 2))[::-1]))), edgecolor = colour, facecolor = colour, alpha = erroralpha)
	axes.add_patch(patch)
	line, = axes.plot(x, eff, colour + linestyle)

	# compute 50% point and its uncertainty
	A50 = optimize.bisect(interpolate.interp1d(x, eff - 0.5), x[0], x[-1], xtol = 1e-40)
	A50_err = interpolate.interp1d(x, net_xerr)(A50)

	# mark 50% point on graph
	#axes.axvline(A50, color = colour, linestyle = linestyle)

	# print some analysis
	num_injections = efficiency_den.array.sum()
	num_samples = len(efficiency_den)
	print >>sys.stderr, "Bins were %g samples wide, ideal would have been %g" % (filter_width, (num_samples / num_injections / interpolate.interp1d(x, dydx)(A50)**2.0)**(1.0/3.0))
	print >>sys.stderr, "Average number of injections in each bin = %g" % efficiency_den.array.mean()

	return line, A50, A50_err


class Efficiency(object):
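	"""
	Book-keeping for the injection efficiency measurement.
	Accumulates made and found injections from the injection
	databases, retains the loudest missed and quietest found
	injections for diagnostics, and renders the efficiency vs.
	amplitude plot in .finish().
	"""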
	def __init__(self, detection_threshold, cal_uncertainty, filter_width, open_box):
		self.detection_threshold = detection_threshold
		self.cal_uncertainty = cal_uncertainty
		self.filter_width = filter_width
		self.seglists = segments.segmentlistdict()
		self.vetoseglists = segments.segmentlistdict()
		self.found = []
		self.n_diagnostics = 100	# keep 100 loudest missed and quietest found injections
		self.loudest_missed = []
		self.quietest_found = []
		self.all = []
		self.open_box = open_box

	def __iadd__(self, other):
		assert other.detection_threshold == self.detection_threshold
		assert other.open_box == self.open_box
		self.seglists |= other.seglists
		self.vetoseglists |= other.vetoseglists
		self.found += other.found
		self.loudest_missed[:] = heapq.nlargest(self.n_diagnostics, itertools.chain(self.loudest_missed, other.loudest_missed))
		self.quietest_found[:] = heapq.nlargest(self.n_diagnostics, itertools.chain(self.quietest_found, other.quietest_found))
		self.all += other.all
		return self

	def add_contents(self, contents, verbose = False):
		#
		# update segment information
		#

		self.seglists |= contents.seglists
		self.vetoseglists |= contents.vetoseglists
		seglists = contents.seglists - contents.vetoseglists
		if set(("H1", "H2")).issubset(set(seglists)):
			# we have segments for both H1 and H2, remove time
			# when exactly that pair are on
			seglists -= segments.segmentlistdict.fromkeys(seglists, seglists.intersection(("H1", "H2")) - seglists.union(set(seglists) - set(("H1", "H2"))))

		# for each injection, retrieve the highest likelihood ratio
		# of the burst coincs that were found to match the
		# injection or null if no burst coincs matched the
		# injection
		offsetvectors = contents.time_slide_table.as_dict()
		stringutils.create_recovered_likelihood_table(contents.connection, contents.bb_definer_id)
		for values in contents.connection.cursor().execute("""
SELECT
	sim_burst.*,
	recovered_likelihood.likelihood
FROM
	sim_burst
	LEFT JOIN recovered_likelihood ON (
		recovered_likelihood.simulation_id == sim_burst.simulation_id
	)
		"""):
			sim = contents.sim_burst_table.row_from_cols(values[:-1])
			likelihood_ratio = values[-1]
			found = likelihood_ratio is not None
			# were at least 2 instruments on when the injection
			# was made?
			if len(SimBurstUtils.on_instruments(sim, seglists, offsetvectors[sim.time_slide_id])) >= 2:
				# yes
				self.all.append(sim)
				if found and likelihood_ratio > self.detection_threshold:
					self.found.append(sim)
					# 1/amplitude needs to be first so
					# that it acts as the sort key
					record = (1.0 / sim.amplitude, sim, offsetvectors[sim.time_slide_id], contents.filename, likelihood_ratio)
					if len(self.quietest_found) < self.n_diagnostics:
						heapq.heappush(self.quietest_found, record)
					else:
						heapq.heappushpop(self.quietest_found, record)
				else:
					# amplitude needs to be first so
					# that it acts as the sort key
					record = (sim.amplitude, sim, offsetvectors[sim.time_slide_id], contents.filename, likelihood_ratio)
					if len(self.loudest_missed) < self.n_diagnostics:
						heapq.heappush(self.loudest_missed, record)
					else:
						heapq.heappushpop(self.loudest_missed, record)
			elif found:
				# no, but it was found anyway
				print >>sys.stderr, "%s: odd, injection %s was found but not injected ..." % (contents.filename, sim.simulation_id)

	def finish(self):
		fig, axes = SnglBurstUtils.make_burst_plot(r"Injection Amplitude (\(\mathrm{s}^{-\frac{1}{3}}\))", "Detection Efficiency", width = 108.0)
		axes.set_title(r"Detection Efficiency vs.\ Amplitude")
		axes.semilogx()
		axes.set_position([0.10, 0.150, 0.86, 0.77])

		# set desired yticks
		axes.set_yticks((0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0))
		axes.set_yticklabels((r"\(0\)", r"\(0.1\)", r"\(0.2\)", r"\(0.3\)", r"\(0.4\)", r"\(0.5\)", r"\(0.6\)", r"\(0.7\)", r"\(0.8\)", r"\(0.9\)", r"\(1.0\)"))
		axes.xaxis.grid(True, which = "major,minor")
		axes.yaxis.grid(True, which = "major,minor")

		# put made and found injections in the denominators and
		# numerators of the efficiency bins
		bins = rate.NDBins((rate.LogarithmicBins(min(sim.amplitude for sim in self.all), max(sim.amplitude for sim in self.all), 400),))
		efficiency_num = rate.BinnedArray(bins)
		efficiency_den = rate.BinnedArray(bins)
		for sim in self.found:
			efficiency_num[sim.amplitude,] += 1
		for sim in self.all:
			efficiency_den[sim.amplitude,] += 1

		# generate and plot trend curves.  adjust window function
		# normalization so that denominator array correctly
		# represents the number of injections contributing to each
		# bin:  make w(0) = 1.0.  note that this factor has no
		# effect on the efficiency because it is common to the
		# numerator and denominator arrays.  we do this for the
		# purpose of computing the Poisson error bars, which
		# requires us to know the counts for the bins
		windowfunc = rate.gaussian_window(self.filter_width)
		windowfunc /= windowfunc[len(windowfunc) / 2 + 1]
		rate.filter_binned_array(efficiency_num, windowfunc)
		rate.filter_binned_array(efficiency_den, windowfunc)

		# regularize:  adjust unused bins so that the efficiency is
		# 0, not NaN
		assert (efficiency_num.array <= efficiency_den.array).all()
		efficiency_den.array[(efficiency_num.array == 0) & (efficiency_den.array == 0)] = 1

		line1, A50, A50_err = render_data_from_bins(file("string_efficiency.dat", "w"), axes, efficiency_num, efficiency_den, self.cal_uncertainty, self.filter_width, colour = "k", linestyle = "-", erroralpha = 0.2)
		print >>sys.stderr, "Pipeline's 50%% efficiency point for all detections = %g +/- %g%%\n" % (A50, A50_err * 100)

		# add a legend to the axes
		axes.legend((line1,), (r"\noindent Injections recovered with $\Lambda > %s$" % SnglBurstUtils.latexnumber("%.2e" % self.detection_threshold),), loc = "lower right")

		# adjust limits
		axes.set_xlim([1e-21, 2e-18])
		axes.set_ylim([0.0, 1.0])

		#
		# dump some information about the highest-amplitude missed
		# and quietest-amplitude found injections
		#

		self.loudest_missed.sort(reverse = True)
		self.quietest_found.sort(reverse = True)

		f = file("string_loud_missed_injections.txt", "w")
		print >>f, "Highest Amplitude Missed Injections"
		print >>f, "==================================="
		for amplitude, sim, offsetvector, filename, likelihood_ratio in self.loudest_missed:
			print >>f
			print >>f, "%s in %s:" % (str(sim.simulation_id), filename)
			if likelihood_ratio is None:
				print >>f, "Not recovered"
			else:
				print >>f, "Recovered with \\Lambda = %.16g, detection threshold was %.16g" % (likelihood_ratio, self.detection_threshold)
			for instrument in self.seglists:
				print >>f, "In %s:" % instrument
				print >>f, "\tInjected amplitude:\t%.16g" % SimBurstUtils.string_amplitude_in_instrument(sim, instrument, offsetvector)
				print >>f, "\tTime of injection:\t%s s" % sim.time_at_instrument(instrument, offsetvector)
			print >>f, "Amplitude in waveframe:\t%.16g" % sim.amplitude
			t = sim.get_time_geocent()
			print >>f, "Time at geocentre:\t%s s" % t
			print >>f, "Segments within 60 seconds:\t%s" % segmentsUtils.segmentlistdict_to_short_string(self.seglists & segments.segmentlistdict((instrument, segments.segmentlist([segments.segment(t-offsetvector[instrument]-60, t-offsetvector[instrument]+60)])) for instrument in self.seglists))
			print >>f, "Vetoes within 60 seconds:\t%s" % segmentsUtils.segmentlistdict_to_short_string(self.vetoseglists & segments.segmentlistdict((instrument, segments.segmentlist([segments.segment(t-offsetvector[instrument]-60, t-offsetvector[instrument]+60)])) for instrument in self.vetoseglists))

		f = file("string_quiet_found_injections.txt", "w")
		print >>f, "Lowest Amplitude Found Injections"
		print >>f, "================================="
		for inv_amplitude, sim, offsetvector, filename, likelihood_ratio in self.quietest_found:
			print >>f
			print >>f, "%s in %s:" % (str(sim.simulation_id), filename)
			if likelihood_ratio is None:
				print >>f, "Not recovered"
			else:
				print >>f, "Recovered with \\Lambda = %.16g, detection threshold was %.16g" % (likelihood_ratio, self.detection_threshold)
			for instrument in self.seglists:
				print >>f, "In %s:" % instrument
				print >>f, "\tInjected amplitude:\t%.16g" % SimBurstUtils.string_amplitude_in_instrument(sim, instrument, offsetvector)
				print >>f, "\tTime of injection:\t%s s" % sim.time_at_instrument(instrument, offsetvector)
			print >>f, "Amplitude in waveframe:\t%.16g" % sim.amplitude
			t = sim.get_time_geocent()
			print >>f, "Time at geocentre:\t%s s" % t
			print >>f, "Segments within 60 seconds:\t%s" % segmentsUtils.segmentlistdict_to_short_string(self.seglists & segments.segmentlistdict((instrument, segments.segmentlist([segments.segment(t-offsetvector[instrument]-60, t-offsetvector[instrument]+60)])) for instrument in self.seglists))
			print >>f, "Vetoes within 60 seconds:\t%s" % segmentsUtils.segmentlistdict_to_short_string(self.vetoseglists & segments.segmentlistdict((instrument, segments.segmentlist([segments.segment(t-offsetvector[instrument]-60, t-offsetvector[instrument]+60)])) for instrument in self.vetoseglists))

		#
		# done
		#

		return fig,


#
# =============================================================================
#
#                                     Main
#
# =============================================================================
#


def group_files(filenames, verbose = False):
	# figure out which files are injection runs and which aren't
	injection_filenames = []
	noninjection_filenames = []
	for n, filename in enumerate(filenames):
		if verbose:
			print >>sys.stderr, "%d/%d: %s" % (n + 1, len(filenames), filename)
		connection = sqlite3.connect(filename)
		if "sim_burst" in dbtables.get_table_names(connection):
			if verbose:
				print >>sys.stderr, "\t--> injections"
			injection_filenames.append(filename)
		else:
			if verbose:
				print >>sys.stderr, "\t--> non-injections"
			noninjection_filenames.append(filename)
		connection.close()
	return injection_filenames, noninjection_filenames


def pack_files(filenames, n_threads, verbose = False):
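	"""
	Distribute the file names among n_threads groups of approximately
	equal total size in bytes using a biggest-into-emptiest packing,
	and return the list of groups.
	"""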
	bins = packing.BiggestIntoEmptiest([packing.Bin() for n in range(n_threads)])
	bins.packlist((os.stat(filename).st_size, filename) for filename in filenames)
	return [bin.objects for bin in bins.bins]


def import_dumps(filenames, verbose = False):
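	"""
	Load pickled RateVsThreshold and Efficiency objects from the dump
	files, summing objects of the same type, and return the pair
	(rate_vs_threshold, efficiency).  Either element can be None if
	no dump file of that type was provided.
	"""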
	rate_vs_threshold = None
	efficiency = None
	for filename in filenames:
		if verbose:
			print >>sys.stderr, "\t%s ..." % filename,
		dump = pickle.load(open(filename))
		if type(dump) is RateVsThreshold:
			if verbose:
				print >>sys.stderr, "found rate vs. threshold data"
			if rate_vs_threshold is None:
				rate_vs_threshold = dump
			else:
				rate_vs_threshold += dump
		elif type(dump) is Efficiency:
			if verbose:
				print >>sys.stderr, "found efficiency data"
			if efficiency is None:
				efficiency = dump
			else:
				efficiency += dump
		else:
			raise ValueError("cannot determine contents of %s" % filename)
	return rate_vs_threshold, efficiency


def process_file(filename, products, live_time_program, tmp_path = None, veto_segments_name = None, verbose = False):
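	"""
	Open one database file (working on a copy in tmp_path if given),
	summarize its contents, and add the contents to each of the
	book-keeping objects in products.
	"""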
	#
	# connect to database and summarize contents
	#

	working_filename = dbtables.get_connection_filename(filename, tmp_path = tmp_path, verbose = verbose)
	contents = SnglBurstUtils.CoincDatabase(sqlite3.connect(working_filename), live_time_program, search = "StringCusp", veto_segments_name = veto_segments_name)
	if verbose:
		SnglBurstUtils.summarize_coinc_database(contents, filename = working_filename)

	#
	# augment summary with extra stuff we need.  the filename
	# is recorded for dumping debugging information related to
	# missed injections.  if burca was run with the
	# --coincidence-segments option then the value is copied
	# into a segmentlistdict to facilitate the computation of
	# livetime
	#

	contents.filename = filename

	contents.coincidence_segments = ligolwprocess.get_process_params(contents.xmldoc, "lalapps_burca", "--coincidence-segments")
	if contents.coincidence_segments:
		# as a side-effect, this enforces the rule that
		# burca has been run on the input file exactly once
		contents.coincidence_segments, = contents.coincidence_segments
		contents.coincidence_segments = segments.segmentlistdict.fromkeys(contents.seglists, segmentsUtils.from_range_strings(contents.coincidence_segments.split(","), boundtype = dbtables.lsctables.LIGOTimeGPS).coalesce())
	else:
		contents.coincidence_segments = None

	#
	# process contents
	#

	for n, product in enumerate(products):
		if verbose:
			print >>sys.stderr, "%s: adding to product %d ..." % (working_filename, n)
		product.add_contents(contents, verbose = verbose)

	#
	# close
	#

	contents.connection.close()
	dbtables.discard_connection_filename(filename, working_filename, verbose = verbose)


def process_files(filenames, products, live_time_program, tmp_path = None, veto_segments_name = None, verbose = False):
	for n, filename in enumerate(filenames):
		if verbose:
			print >>sys.stderr, "%d/%d: %s" % (n + 1, len(filenames), filename)
		process_file(filename, products, live_time_program, tmp_path = tmp_path, veto_segments_name = veto_segments_name, verbose = verbose)


def write_products(products, prefix, image_formats, verbose = False):
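	"""
	For each book-keeping object in products, write a pickled dump of
	the raw data and save the figures returned by its .finish()
	method in each of the requested image formats.
	"""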
	format = "%%s%%0%dd%%s.%%s" % (int(math.log10(max(len(products) - 1, 1))) + 1)
	n = 1
	while products:
		if verbose:
			print >>sys.stderr, "finishing product %d ..." % (n - 1)
		product = products.pop(0)
		# write dump of raw data
		filename = "%sdump_%d.pickle" % (prefix, n)
		if verbose:
			print >>sys.stderr, "\twriting %s ..." % filename,
		pickle.dump(product, open(filename, "w"))
		if verbose:
			print >>sys.stderr, "done"
		# write plots
		for m, fig in enumerate(product.finish()):
			for ext in image_formats:
				filename = format % (prefix, n, chr(m + 97), ext)
				if verbose:
					print >>sys.stderr, "\twriting %s ..." % filename,
				fig.savefig(filename)
				if verbose:
					print >>sys.stderr, "done"
		n += 1


options, filenames = parse_command_line()


print >>sys.stderr, """Command line:

$ %s
""" % " ".join(sys.argv)
if options.open_box:
	print >>sys.stderr, """

---=== !! BOX IS OPEN !! ===---

PRESS CTRL-C SOON IF YOU DIDN'T MEAN TO OPEN THE BOX

"""


#
# figure out which files are from injection runs and which aren't
#


if options.verbose:
	print >>sys.stderr, "Identifying injection and non-injection databases ..."
injection_filenames, noninjection_filenames = group_files(filenames, verbose = options.verbose)


#
# intialize the data collection objects
#


if options.import_dump:
	if options.verbose:
		print >>sys.stderr, "Loading dump files ..."
	rate_vs_threshold, efficiency = import_dumps(options.import_dump, verbose = options.verbose)
	# override box openness in rate-vs-threshold data
	if rate_vs_threshold is not None:
		rate_vs_threshold.open_box = options.open_box
	if efficiency is not None and options.open_box and not efficiency.open_box:
		raise ValueError("Box is open but one or more imjported efficiency dump file was generated in closed-box mode.  Efficiency must be re-measured in open-box mode to use correct detection threshold.")
else:
	rate_vs_threshold, efficiency = None, None


#
# collect zero-lag and background statistics
#


if options.detection_threshold is None:
	if options.verbose:
		print >>sys.stderr, "Collecting background and zero-lag statistics ..."

	children = {}
	rate_vs_thresholds = []
	# group files into bins of about the same total number of bytes.
	# don't try to create more groups than there are files
	for filenames in pack_files(noninjection_filenames, min(options.threads, len(noninjection_filenames)), verbose = options.verbose):
		# shuffle file names to avoid copying all the big files
		# into the scratch space at the same time
		random.shuffle(filenames)
		# launch thread
		r, w = os.pipe()
		r, w = os.fdopen(r, "r", 0), os.fdopen(w, "w", 0)
		pid = os.fork()
		if pid == 0:
			# we're the child process
			r.close()
			try:
				# create a new book-keeping object
				rate_vs_threshold = RateVsThreshold(options.open_box, record_background = options.record_background, record_candidates = options.record_candidates)
				process_files(filenames, [rate_vs_threshold], options.live_time_program, tmp_path = options.tmp_space, veto_segments_name = options.vetoes_name, verbose = options.verbose)
				pickle.dump(rate_vs_threshold, w)
			except:
				pickle.dump(traceback.format_exc(), w)
				w.close()
				sys.exit(1)
			w.close()
			sys.exit(0)
		# we're not the child process
		w.close()
		children[r] = pid
	# collect all children, report any exceptions that occurred, combine
	# results
	while children:
		for r in select.select(children.keys(), [], [])[0]:
			process_output = pickle.load(r)
			r.close()
			pid, exit_status = os.waitpid(children.pop(r), 0)
			if isinstance(process_output, RateVsThreshold):
				if rate_vs_threshold is None:
					rate_vs_threshold = process_output
				else:
					rate_vs_threshold += process_output
			else:
				print >>sys.stderr, process_output
			del process_output
			if exit_status:
				sys.exit(exit_status)
	if rate_vs_threshold is None:
		raise ValueError("no non-injection data available:  cannot determine detection threshold.  provide non-injection data or set an explicit detection treshold.  Consult --help for more information.")

	# write data products
	write_products([rate_vs_threshold], "string_rate_", options.image_formats, verbose = options.verbose)

	# print summary information, and set detection threshold
	if options.open_box:
		try:
			print >>sys.stderr, "Zero-lag events: %d" % len(rate_vs_threshold.zero_lag)
		except OverflowError:
			# python < 2.5 can't handle list-like things with more than 2**31 elements
			# FIXME:  remove when we can rely on python >= 2.5
			print >>sys.stderr, "Zero-lag events: %d" % rate_vs_threshold.zero_lag.n
	print >>sys.stderr, "Total time in zero-lag segments: %s s" % str(rate_vs_threshold.zero_lag_time)
	print >>sys.stderr, "Time-slide events: %d" % rate_vs_threshold.n_background
	print >>sys.stderr, "Total time in time-slide segments: %s s" % str(rate_vs_threshold.background_time)
	if options.open_box:
		detection_threshold = rate_vs_threshold.zero_lag[-1]
		print >>sys.stderr, "Likelihood ratio for highest-ranked zero-lag survivor: %.9g" % detection_threshold
	else:
		# the ranking statistic values retained in .background are
		# sorted in increasing order.  if the background and zero-lag
		# live times are identical, then the loudest zero-lag event
		# is simulated by the loudest background event so we want the
		# last entry in the list;  if the background livetime is
		# twice the zero-lag live time then the loudest zero-lag
		# event is simulated by the next-to-loudest background event,
		# and so on.
		detection_threshold = rate_vs_threshold.background[-int(round(rate_vs_threshold.background_time / rate_vs_threshold.zero_lag_time))]
		print >>sys.stderr, "Simulated likelihood ratio for highest-ranked zero-lag survivor: %.9g" % detection_threshold
else:
	detection_threshold = options.detection_threshold
	print >>sys.stderr, "Likelihood ratio for highest-ranked zero-lag survivor from command line: %.9g" % detection_threshold


#
# measure detection efficiency based on loudest (simulated, if box is
# closed) zero-lag survivor
#


if options.verbose:
	print >>sys.stderr, "Collecting efficiency statistics ..."
children = {}
# group files into bins of about the same total number of bytes.  don't try
# to create more groups than there are files