From 538848bfbc426a3aa26b3af693715c1a662e6ad8 Mon Sep 17 00:00:00 2001 From: Patrick Godwin <patrick.godwin@ligo.org> Date: Thu, 19 Jul 2018 11:48:29 -0700 Subject: [PATCH] Makefile.gstlal_feature_extractor_offline: add guide to run offline DAG, update guide for online DAG --- .../Makefile.gstlal_feature_extractor_offline | 125 ++++++++++++++---- .../Makefile.gstlal_feature_extractor_online | 8 ++ 2 files changed, 104 insertions(+), 29 deletions(-) diff --git a/gstlal-burst/share/feature_extractor/Makefile.gstlal_feature_extractor_offline b/gstlal-burst/share/feature_extractor/Makefile.gstlal_feature_extractor_offline index a2fcd59b23..315d5ed5a2 100644 --- a/gstlal-burst/share/feature_extractor/Makefile.gstlal_feature_extractor_offline +++ b/gstlal-burst/share/feature_extractor/Makefile.gstlal_feature_extractor_offline @@ -1,5 +1,79 @@ -SHELL := /bin/bash -# condor commands +SHELL := /bin/bash # Use bash syntax + +######################## +# Guide # +######################## + +# Author: Patrick Godwin (patrick.godwin@ligo.org) +# +# This Makefile is designed to launch offline feature extractor jobs. +# +# For general use cases, the only configuration options that need to be changed are: +# +# * User/Accounting tags: GROUP_USER, ACCOUNTING_TAG +# * Analysis times: START, STOP +# * Data ingestion: IFO, CHANNEL_LIST +# * Waveform parameters: WAVEFORM, MISMATCH, QHIGH +# +# Configuration options: +# +# Analysis times: +# * START: set the analysis gps start time +# * STOP: set the analysis gps stop time +# +# Data ingestion: +# * IFO: select the IFO for auxiliary channels to be ingested (H1/L1). +# * CHANNEL_LIST: a list of channels for the feature extractor to process. Provided +# lists for O1/O2 and H1/L1 are in gstlal/gstlal-burst/share/feature_extractor. +# * MAX_SERIAL_STREAMS: Maximum # of streams that a single gstlal_feature_extractor job will +# process at once. This is determined by sum_i(channel_i * # rates_i). 
Number of rates for a +# given channel is determined by log2(max_rate/min_rate) + 1. +# * MAX_PARALLEL_STREAMS: Maximum # of streams that a single job will run in the lifespan of a job. +# This is distinct from serial streams since when a job is first launched, it will cache +# auxiliary channel frames containing all channels that meet the criterion here, and then process +# each channel subset sequentially determined by the serial streams. This is to save on input I/O. +# * CONCURRENCY: determines the maximum # of concurrent reads from the same frame file. For most +# purposes, it will be set to 1. Use this at your own risk. +# +# Waveform parameters: +# * WAVEFORM: type of waveform used to perform matched filtering (sine_gaussian/half_sine_gaussian). +# * MISMATCH: maximum mismatch between templates (corresponding to Omicron's mismatch definition). +# * QHIGH: maximum value of Q +# +# Data transfer/saving: +# * OUTPATH: directory in which to save features. +# * SAVE_CADENCE: span of a typical dataset within an hdf5 file. +# * PERSIST_CADENCE: span of a typical hdf5 file. +# +# Setting the number of streams (ADVANCED USAGE): +# +# NOTE: This won't have to be changed for almost all use cases, and the current configuration has been +# optimized to aim for short run times. +# +# Definition: Target number of streams (N_channels x N_rates_per_channel) that each cpu will process. +# +# * if max_serial_streams > max_parallel_streams, all jobs will be parallelized by channel +# * if max_parallel_streams > num_channels in channel list, all jobs will be processed serially, +# with processing driven by max_serial_streams. +# * any other combination will produce a mix of parallelization by channels and processing channels serially per job. +# +# Playing around with combinations of MAX_SERIAL_STREAMS, MAX_PARALLEL_STREAMS, and CONCURRENCY will entirely +# determine the structure of the offline DAG. 
Doing so will also change the memory usage for each job, and so you'll +# need to tread lightly. Changing CONCURRENCY in particular may cause I/O locks due to jobs fighting to read from the same +# frame file. +# +# In order to start up offline runs, you'll need an installation of gstlal. An installation Makefile that +# includes Kafka dependencies is located at: gstlal/gstlal-burst/share/feature_extractor/Makefile.gstlal_idq_icc +# +# To run, first make sure that the correct environment is sourced: +# +# $ make -f Makefile.gstlal_feature_extractor_offline +# + +######################## +# User/Accounting Tags # +######################## + # Set the accounting tag from https://ldas-gridmon.ligo.caltech.edu/ldg_accounting/user ACCOUNTING_TAG=ligo.dev.o3.detchar.onlinedq.idq GROUP_USER=albert.einstein @@ -9,50 +83,38 @@ CONDOR_COMMANDS:=--condor-command=accounting_group=$(ACCOUNTING_TAG) --condor-co # Triggering parameters # ######################### -SEG_PAD = 1000 - -# The GPS start time for analysis +# analysis times START = 1187000000 -FSTART=$(shell echo $$((${START}-${SEG_PAD}))) - -# The GPS end time for analysis STOP = 1187100000 -OUTPATH = $(PWD) +# IFO for auxiliary features +IFO = H1 +#IFO = L1 # channel list for analysis CHANNEL_LIST = H1_O2_standard_channel_list.txt -# Target number of streams (N_channels x N_rates_per_channel) that each cpu will process -# NOTE: * if max_serial_streams > max_parallel_streams, all jobs will be parallelized by channel -# * if max_parallel_streams > num_channels in channel list, all jobs will be processed serially, with processing driven by max_serial_streams -# * any other combination will produce a mix of parallelization by channels and processing channels serially per job -MAX_PARALLEL_STREAMS = 600 -MAX_SERIAL_STREAMS = 210 - -# Maximum number of concurrent reads from the same frame file, done to prevent I/O locks -CONCURRENCY = 1 - -# length of time to process for a given job -SEGMENT_LENGTH = 4000 +# save preferences 
+SAVE_CADENCE = 20 +PERSIST_CADENCE = 200 +OUTPATH = $(PWD) # Parameter space config of waveforms +WAVEFORM = sine_gaussian MISMATCH = 0.03 QHIGH = 40 -# waveforms -WAVEFORM = sine_gaussian +# DAG layout settings +MAX_PARALLEL_STREAMS = 600 +MAX_SERIAL_STREAMS = 210 +CONCURRENCY = 1 -# save preferences -SAVE_CADENCE = 20 -PERSIST_CADENCE = 200 +# length of time to process for a given job +SEGMENT_LENGTH = 4000 # Detector CLUSTER:=$(shell hostname -d) -IFO = H1 -#IFO = L1 - ############################### # Segment and frame type info # ############################### @@ -73,6 +135,11 @@ SEGMENT_MIN_LENGTH = 512 FRAME_TYPE=R +# don't generally have to mess with this, provides padding +# to account for PSD estimation +FSTART=$(shell echo $$((${START}-${SEG_PAD}))) +SEG_PAD = 1000 + ################# # Web directory # ################# diff --git a/gstlal-burst/share/feature_extractor/Makefile.gstlal_feature_extractor_online b/gstlal-burst/share/feature_extractor/Makefile.gstlal_feature_extractor_online index 2f9f8b221d..6ae0d902a9 100644 --- a/gstlal-burst/share/feature_extractor/Makefile.gstlal_feature_extractor_online +++ b/gstlal-burst/share/feature_extractor/Makefile.gstlal_feature_extractor_online @@ -72,6 +72,14 @@ SHELL := /bin/bash # Use bash syntax # is to prevent a single feature extractor job from holding up the online pipeline. This will # also depend on the latency induced by the feature extractor, especially when using templates # that have latencies associated with them such as Sine-Gaussians. +# +# In order to start up online runs, you'll need an installation of gstlal. An installation Makefile that +# includes Kafka dependencies is located at: gstlal/gstlal-burst/share/feature_extractor/Makefile.gstlal_idq_icc +# +# To run, first make sure that the correct environment is sourced: +# +# $ make -f Makefile.gstlal_feature_extractor_online +# ######################## # User/Accounting Tags # -- GitLab