From 538848bfbc426a3aa26b3af693715c1a662e6ad8 Mon Sep 17 00:00:00 2001
From: Patrick Godwin <patrick.godwin@ligo.org>
Date: Thu, 19 Jul 2018 11:48:29 -0700
Subject: [PATCH] Makefile.gstlal_feature_extractor_offline: add guide to run
 offline DAG, update guide for online DAG

---
 .../Makefile.gstlal_feature_extractor_offline | 125 ++++++++++++++----
 .../Makefile.gstlal_feature_extractor_online  |   8 ++
 2 files changed, 104 insertions(+), 29 deletions(-)

diff --git a/gstlal-burst/share/feature_extractor/Makefile.gstlal_feature_extractor_offline b/gstlal-burst/share/feature_extractor/Makefile.gstlal_feature_extractor_offline
index a2fcd59b23..315d5ed5a2 100644
--- a/gstlal-burst/share/feature_extractor/Makefile.gstlal_feature_extractor_offline
+++ b/gstlal-burst/share/feature_extractor/Makefile.gstlal_feature_extractor_offline
@@ -1,5 +1,79 @@
-SHELL := /bin/bash
-# condor commands
+SHELL := /bin/bash # Use bash syntax
+
+########################
+#        Guide         #
+########################
+
+# Author: Patrick Godwin (patrick.godwin@ligo.org)
+#
+# This Makefile is designed to launch offline feature extractor jobs.
+#
+# For general use cases, the only configuration options that need to be changed are:
+#
+#  * User/Accounting tags: GROUP_USER, ACCOUNTING_TAG
+#  * Analysis times: START, STOP
+#  * Data ingestion: IFO, CHANNEL_LIST
+#  * Waveform parameters: WAVEFORM, MISMATCH, QHIGH
+#
+# Configuration options:
+#
+#   Analysis times:
+#     * START: set the analysis gps start time
+#     * STOP: set the analysis gps stop time
+#
+#   Data ingestion:
+#     * IFO: select the IFO for auxiliary channels to be ingested (H1/L1).
+#     * CHANNEL_LIST: a list of channels for the feature extractor to process. Channel
+#         lists for O1/O2 and H1/L1 are provided in gstlal/gstlal-burst/share/feature_extractor.
+#     * MAX_SERIAL_STREAMS: Maximum # of streams that a single gstlal_feature_extractor job will
+#         process at once. The stream count is sum_i(channel_i * # rates_i). The number of rates
+#         for a given channel is log2(max_rate/min_rate) + 1.
+#     * MAX_PARALLEL_STREAMS: Maximum # of streams that a single job will process over its lifespan.
+#         This is distinct from serial streams: when a job is first launched, it caches auxiliary
+#         channel frames containing all channels that meet the criterion here, and then processes each
+#         channel subset sequentially, as determined by the serial streams. This is to save on input I/O.
+#     * CONCURRENCY: determines the maximum # of concurrent reads from the same frame file. For most
+#         purposes, it will be set to 1. Use this at your own risk.
+#
+#   Waveform parameters:
+#     * WAVEFORM: type of waveform used to perform matched filtering (sine_gaussian/half_sine_gaussian).
+#     * MISMATCH: maximum mismatch between templates (corresponding to Omicron's mismatch definition).
+#     * QHIGH: maximum value of Q
+#
+#   Data transfer/saving:
+#     * OUTPATH: directory in which to save features.
+#     * SAVE_CADENCE: span of a typical dataset within an hdf5 file.
+#     * PERSIST_CADENCE: span of a typical hdf5 file.
+#
+# Setting the number of streams (ADVANCED USAGE):
+#
+#   NOTE: This won't have to be changed for almost all use cases, and the current configuration has been
+#     optimized to aim for short run times.
+#
+#   Definition: Target number of streams (N_channels x N_rates_per_channel) that each cpu will process.
+#
+#     * if max_serial_streams > max_parallel_streams, all jobs will be parallelized by channel
+#     * if max_parallel_streams > num_channels in channel list, all jobs will be processed serially,
+#         with processing driven by max_serial_streams.
+#     * any other combination will produce a mix of parallelization by channels and processing channels serially per job.
+#
+#   The combination of MAX_SERIAL_STREAMS, MAX_PARALLEL_STREAMS and CONCURRENCY entirely determines
+#   the structure of the offline DAG. Changing them also changes the memory usage of each job, so
+#   tread lightly. Changing CONCURRENCY in particular may cause I/O locks due to jobs fighting to
+#   read from the same frame file.
+#
+# In order to start up offline runs, you'll need an installation of gstlal. An installation Makefile that
+# includes Kafka dependencies is located at: gstlal/gstlal-burst/share/feature_extractor/Makefile.gstlal_idq_icc
+#
+# To run, first make sure that the correct environment is sourced, then:
+#
+#   $ make -f Makefile.gstlal_feature_extractor_offline
+#
+
+########################
+# User/Accounting Tags #
+########################
+
 # Set the accounting tag from https://ldas-gridmon.ligo.caltech.edu/ldg_accounting/user
 ACCOUNTING_TAG=ligo.dev.o3.detchar.onlinedq.idq
 GROUP_USER=albert.einstein
@@ -9,50 +83,38 @@ CONDOR_COMMANDS:=--condor-command=accounting_group=$(ACCOUNTING_TAG) --condor-co
 # Triggering parameters #
 #########################
 
-SEG_PAD = 1000
-
-# The GPS start time for analysis
+# analysis times
 START = 1187000000
-FSTART=$(shell echo $$((${START}-${SEG_PAD})))
-
-# The GPS end time for analysis
 STOP  = 1187100000
 
-OUTPATH = $(PWD)
+# IFO for auxiliary features
+IFO = H1
+#IFO = L1
 
 # channel list for analysis
 CHANNEL_LIST = H1_O2_standard_channel_list.txt
 
-# Target number of streams (N_channels x N_rates_per_channel) that each cpu will process
-# NOTE: * if max_serial_streams > max_parallel_streams, all jobs will be parallelized by channel
-#       * if max_parallel_streams > num_channels in channel list, all jobs will be processed serially, with processing driven by max_serial_streams
-#       * any other combination will produce a mix of parallelization by channels and processing channels serially per job
-MAX_PARALLEL_STREAMS = 600
-MAX_SERIAL_STREAMS = 210
-
-# Maximum number of concurrent reads from the same frame file, done to prevent I/O locks
-CONCURRENCY = 1
-
-# length of time to process for a given job
-SEGMENT_LENGTH = 4000
+# save preferences
+SAVE_CADENCE = 20
+PERSIST_CADENCE = 200
+OUTPATH = $(PWD)
 
 # Parameter space config of waveforms
+WAVEFORM = sine_gaussian
 MISMATCH = 0.03
 QHIGH = 40
 
-# waveforms
-WAVEFORM = sine_gaussian
+# DAG layout settings
+MAX_PARALLEL_STREAMS = 600
+MAX_SERIAL_STREAMS = 210
+CONCURRENCY = 1
 
-# save preferences
-SAVE_CADENCE = 20
-PERSIST_CADENCE = 200
+# length of time to process for a given job
+SEGMENT_LENGTH = 4000
 
 # Detector
 CLUSTER:=$(shell hostname -d)
 
-IFO = H1
-#IFO = L1
-
 ###############################
 # Segment and frame type info #
 ###############################
@@ -73,6 +135,11 @@ SEGMENT_MIN_LENGTH = 512
 
 FRAME_TYPE=R
 
+# You generally don't have to change this: FSTART pads the analysis start by SEG_PAD
+# to account for PSD estimation (SEG_PAD may follow FSTART because '=' expands lazily).
+FSTART=$(shell echo $$((${START}-${SEG_PAD})))
+SEG_PAD = 1000
+
 #################
 # Web directory #
 #################
diff --git a/gstlal-burst/share/feature_extractor/Makefile.gstlal_feature_extractor_online b/gstlal-burst/share/feature_extractor/Makefile.gstlal_feature_extractor_online
index 2f9f8b221d..6ae0d902a9 100644
--- a/gstlal-burst/share/feature_extractor/Makefile.gstlal_feature_extractor_online
+++ b/gstlal-burst/share/feature_extractor/Makefile.gstlal_feature_extractor_online
@@ -72,6 +72,14 @@ SHELL := /bin/bash # Use bash syntax
 #         is to prevent a single feature extractor job from holding up the online pipeline. This will
 #         also depend on the latency induced by the feature extractor, especially when using templates
 #         that have latencies associated with them such as Sine-Gaussians.
+#
+# In order to start up online runs, you'll need an installation of gstlal. An installation Makefile that
+# includes Kafka dependencies is located at: gstlal/gstlal-burst/share/feature_extractor/Makefile.gstlal_idq_icc
+#
+# To run, first make sure that the correct environment is sourced, then:
+#
+#   $ make -f Makefile.gstlal_feature_extractor_online
+#
 
 ########################
 # User/Accounting Tags #
-- 
GitLab