Commit 40e573e9 authored by Patrick Godwin

Makefile.gstlal_feature_extractor_offline: fix issue with generating segments from int8s conversion, add default safety info, clean up comments and unused variables
SHELL := /bin/bash # Use bash syntax
#################################################################################
#                                     GUIDE                                     #
#################################################################################
# Author: Patrick Godwin (patrick.godwin@ligo.org)
#
# * UNSAFE_CHANNEL_INCLUDE: specify unsafe channels to include, ignoring safety information.
#
# Waveform parameters:
# * WAVEFORM: type of waveform used to perform matched filtering.
# options: sine_gaussian/half_sine_gaussian/tapered_sine_gaussian
# * MISMATCH: maximum mismatch between templates (corresponding to Omicron's mismatch definition).
# * QHIGH: maximum value of Q
#
# Data transfer/saving:
# * OUTPATH: directory in which to save features.
# * SAMPLE_RATE: rate at which to aggregate features for a given channel.
# Can be sampled at 1 Hz or higher (powers of 2).
# * SAVE_CADENCE: span of a typical dataset within an hdf5 file.
# * PERSIST_CADENCE: span of a typical hdf5 file.
#
# In order to start up offline runs, you'll need an installation of gstlal. An installation
# Makefile that includes Kafka dependencies is located at:
#   gstlal/gstlal-burst/share/feature_extractor/Makefile.gstlal_idq_icc
#
# To generate the DAG needed to start an analysis, making sure that the correct
# environment is sourced, run:
#
# $ make -f Makefile.gstlal_feature_extractor_offline
#
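# Individual '=' assignments in this Makefile can be overridden from the make
# command line without editing the file. A minimal sketch against a throwaway
# inline makefile (the real invocation would target
# Makefile.gstlal_feature_extractor_offline):

```shell
# Demonstrate command-line variable override with a throwaway inline makefile;
# WAVEFORM mirrors the variable defined in this Makefile.
printf '%s\n' \
  'WAVEFORM = tapered_sine_gaussian' \
  'all: ; @echo waveform=$(WAVEFORM)' \
  | make -sf - WAVEFORM=half_sine_gaussian
```

# e.g. `make -f Makefile.gstlal_feature_extractor_offline WAVEFORM=half_sine_gaussian`
# would override the waveform the same way for the real workflow.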
#################################################################################
#                                 CONFIGURATION                                 #
#################################################################################
#-------------------------------------
### User/Accounting Tags
ACCOUNTING_TAG=ligo.dev.o2.detchar.onlinedq.idq
GROUP_USER=albert.einstein
CONDOR_COMMANDS:=--condor-command=accounting_group=$(ACCOUNTING_TAG) --condor-command=accounting_group_user=$(GROUP_USER)
# Set accounting tag at:
# https://ldas-gridmon.ligo.caltech.edu/ldg_accounting/user
#-------------------------------------
### Analysis configuration
# analysis times
START = 1187000000
SAMPLE_RATE = 16
SAVE_CADENCE = 20
PERSIST_CADENCE = 200
# parameter space for waveforms
WAVEFORM = tapered_sine_gaussian
MISMATCH = 0.03
QHIGH = 40
# Detector
CLUSTER:=$(shell hostname -d)
#-------------------------------------
### Channel list configuration
IFO = H1
#IFO = L1
CHANNEL_LIST = $(IFO)-$(EPOCH)-$(LEVEL).ini
SECTION_INCLUDE =
# if not specified, use defaults
SAFETY_INCLUDE = safe unsafe unsafeabove2kHz unknown
FIDELITY_EXCLUDE =
# if specified, override safety checks for these channels
SAFETY_INCLUDE_COMMANDS := $(addprefix --safety-include ,$(SAFETY_INCLUDE))
FIDELITY_EXCLUDE_COMMANDS := $(addprefix --fidelity-exclude ,$(FIDELITY_EXCLUDE))
UNSAFE_CHANNEL_INCLUDE_COMMANDS := $(addprefix --unsafe-channel-include ,$(UNSAFE_CHANNEL_INCLUDE))
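# The addprefix expansions above turn each space-separated value into a repeated
# command-line flag. A shell sketch of the same expansion, using the default
# SAFETY_INCLUDE values from this file:

```shell
# Emulate $(addprefix --safety-include ,$(SAFETY_INCLUDE)) in shell:
# prepend the flag to each whitespace-separated token.
SAFETY_INCLUDE="safe unsafe unsafeabove2kHz unknown"
cmds=""
for s in ${SAFETY_INCLUDE}; do
    cmds="${cmds}${cmds:+ }--safety-include ${s}"
done
echo "${cmds}"
```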
#-------------------------------------
### Segment configuration
# Info from https://wiki.ligo.org/viewauth/LSC/JRPComm/ObsRun2
# Select correct calibration type
# GSTLAL_SEGMENTS Options
SEG_SERVER=https://segments.ligo.org
# C00
LIGO_SEGMENTS="$(IFO):DMT-ANALYSIS_READY:1"
# C02
#LIGO_SEGMENTS="$*:DCS-ANALYSIS_READY_C02:1"
#################################################################################
#                          DAG CONFIGURATION (OPTIONAL)                         #
#################################################################################
# length of time to process for a given job
SEGMENT_LENGTH = 4000
SEGMENT_TRIM = 0
SEGMENT_MIN_LENGTH = 512
FSTART=$(shell echo $$((${START}-${SEG_PAD})))
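# FSTART appears to pad the analysis start so the first segment isn't clipped.
# A worked sketch of the shell arithmetic above (SEG_PAD is defined in an
# elided portion of this file; 1000 here is an illustrative value only):

```shell
# Hypothetical values: START as set in this file, SEG_PAD assumed for illustration.
START=1187000000
SEG_PAD=1000
FSTART=$(( START - SEG_PAD ))
echo "FSTART=${FSTART}"
```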
# Setting the number of streams (ADVANCED USAGE):
#
# * MAX_SERIAL_STREAMS: Maximum number of streams that a single gstlal_feature_extractor job
#     will process at once. This is the sum over all channels of the number of rates per
#     channel, where the number of rates for a given channel is log2(max_rate/min_rate) + 1.
# * MAX_PARALLEL_STREAMS: Maximum number of streams that a single job will run over its
#     lifespan. This is distinct from serial streams: when a job is first launched, it caches
#     auxiliary channel frames containing all channels that meet this criterion, and then
#     processes each channel subset sequentially as determined by the serial streams. This
#     saves on input I/O.
# * CONCURRENCY: Maximum number of concurrent reads from the same frame file. For most
#     purposes, it should be set to 1. Use this at your own risk.
#
# NOTE: These settings won't need to be changed for almost all use cases, and the current
#   configuration has been optimized for short run times.
#
# Definition: target number of streams (N_channels x N_rates_per_channel) that each CPU
#   will process.
#
# * If MAX_SERIAL_STREAMS > MAX_PARALLEL_STREAMS, all jobs will be parallelized by channel.
# * If MAX_PARALLEL_STREAMS > the number of channels in the channel list, all jobs will be
#     processed serially, with processing driven by MAX_SERIAL_STREAMS.
# * Any other combination will produce a mix of parallelization by channel and serial
#     processing of channels within each job.
#
# The combination of MAX_SERIAL_STREAMS, MAX_PARALLEL_STREAMS, and CONCURRENCY entirely
# determines the structure of the offline DAG. It also changes the memory usage of each
# job, so tread lightly. Changing CONCURRENCY in particular may cause I/O locks due to
# jobs competing to read from the same frame file.
MAX_PARALLEL_STREAMS = 600
MAX_SERIAL_STREAMS = 210
CONCURRENCY = 1
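# As a worked example of the stream counting above: a channel analyzed from
# min_rate = 32 Hz up to max_rate = 2048 Hz (hypothetical values) contributes
# log2(2048/32) + 1 = 7 streams. A shell sketch of that count:

```shell
# Count the number of analysis rates for one channel: each doubling from
# min_rate up to max_rate adds one stream, plus one for min_rate itself,
# i.e. log2(max_rate/min_rate) + 1.
max_rate=2048
min_rate=32
n_rates=1
rate=${min_rate}
while [ "${rate}" -lt "${max_rate}" ]; do
    rate=$(( rate * 2 ))
    n_rates=$(( n_rates + 1 ))
done
echo "streams for this channel: ${n_rates}"
```

# Summing this count over every channel in the channel list gives the total
# stream count that MAX_SERIAL_STREAMS and MAX_PARALLEL_STREAMS then split
# across jobs.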
#-------------------------------------
### Web directory
# A user tag for the run
#TAG = O2_C00
# Run number
#RUN = run_1
# A web directory for output (note difference between cit+uwm and Atlas)
# cit & uwm
#WEBDIR = ~/public_html/observing/$(TAG)/$(START)-$(STOP)-$(RUN)
#################################################################################
#                                   WORKFLOW                                    #
#################################################################################
all : dag
@echo "Submit with: condor_submit_dag feature_extractor_pipe.dag"
# Produce segments file
segments.xml.gz : frame.cache
ligolw_segment_query_dqsegdb --segment-url=${SEG_SERVER} -q --gps-start-time ${FSTART} --gps-end-time ${STOP} --include-segments=$(LIGO_SEGMENTS) --result-name=datasegments > $@
ligolw_no_ilwdchar $@
ligolw_cut --delete-column segment:segment_def_cdb --delete-column segment:creator_db --delete-column segment_definer:insertion_time $@
gstlal_segments_trim --trim $(SEGMENT_TRIM) --gps-start-time $(FSTART) --gps-end-time $(STOP) --min-length $(SEGMENT_MIN_LENGTH) --output $@ $@
frame.cache :
......