There will be maintenance performed on git.ligo.org, chat.ligo.org, containers.ligo.org, and docs.ligo.org starting at 9am PDT on Tuesday 18th August 2020. There will be an extremely small period of downtime at the start of the maintenance window as various services are restarted. Please address any comments, questions, or concerns to computing-help@igwn.org.

Commit 2a0806c8 authored by John Douglas Veitch, committed by Vivien Raymond

Add retries to LALInferenceMCMC checkpointing

parent b7136ff2
......@@ -460,13 +460,43 @@ void PTMCMCAlgorithm(struct tagLALInferenceRunState *runState) {
}
MPI_Bcast(&local_saveStateFlag, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&local_exitFlag, 1, MPI_INT, 0, MPI_COMM_WORLD);
INT4 saveattempts=0;
INT4 retrydelay=5; /* 5 seconds before initial retry */
INT4 retcode=XLAL_SUCCESS;
/* The following checkpoint code is wrapped in do {} while loops
* to allow 10 retries, in case of filesystem congestion
*/
if(local_saveStateFlag!=0)
{
LALInferenceCheckpointMCMC(runState);
LALInferenceWriteMCMCSamples(runState);
/* Wait for all processes to save */
MPI_Barrier(MPI_COMM_WORLD);
__master_saveStateFlag=0;
do
{
XLAL_TRY(LALInferenceCheckpointMCMC(runState), retcode);
if(retcode!=XLAL_SUCCESS)
{
saveattempts+=1;
fprintf(stderr,"Process %i failed to write checkpoint file %s \
at attempt %i, waiting to retry\n",MPIrank, runState->resumeOutFileName, saveattempts);
sleep(retrydelay*saveattempts); /* In case of IO failure wait progressively longer */
}
} while (retcode!=XLAL_SUCCESS && saveattempts<10);
if(retcode!=XLAL_SUCCESS) {fprintf(stderr,"Process %i failed to checkpoint\n", MPIrank);}
saveattempts=0;
do
{
XLAL_TRY(LALInferenceWriteMCMCSamples(runState), retcode);
if(retcode!=XLAL_SUCCESS)
{
saveattempts+=1;
fprintf(stderr,"Process %i failed to write samples file %s \
at attempt %i, waiting to retry\n",MPIrank, runState->outFileName, saveattempts);
sleep(retrydelay*saveattempts); /* In case of IO failure wait progressively longer */
}
} while (retcode!=XLAL_SUCCESS && saveattempts<10);
if(retcode!=XLAL_SUCCESS) {fprintf(stderr,"Process %i failed to checkpoint\n", MPIrank);}
/* Wait for all processes to save */
MPI_Barrier(MPI_COMM_WORLD);
__master_saveStateFlag=0;
local_saveStateFlag=0;
}
if(local_exitFlag) {
/* Wait for all processes to be ready to exit */
......
......@@ -2423,6 +2423,8 @@ class EngineJob(LALInferenceDAGJob,pipeline.CondorDAGJob,pipeline.AnalysisJob):
self.set_executable_installed(False)
# Set the options which are always used
self.set_sub_file(os.path.abspath(submitFile))
# 500 MB should be enough for anyone
self.add_condor_cmd('request_disk','500M')
if self.engine=='lalinferencemcmc':
self.binary=cp.get('condor',self.engine.replace('mpi',''))
self.mpirun=cp.get('condor','mpirun')
......@@ -2821,7 +2823,7 @@ def topdir(path):
return topdir(a)
else:
return b
class BayesWavePSDNode(EngineNode):
def __init__(self,bayeswavepsd_job):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment