Maintenance will be performed on git.ligo.org, chat.ligo.org, containers.ligo.org, and docs.ligo.org on Tuesday 26 May 2020 starting at approximately 10am CDT. It is expected to take around 30 minutes and will involve a short period of downtime, around 5 minutes, towards the end of the maintenance period. Please address any questions, comments, or concerns to uwm-help@cgca.uwm.edu.

Commit 2a0806c8 authored by John Douglas Veitch's avatar John Douglas Veitch Committed by Vivien Raymond

Add retries to LALInferenceMCMC checkpointing

parent b7136ff2
...@@ -460,13 +460,43 @@ void PTMCMCAlgorithm(struct tagLALInferenceRunState *runState) { ...@@ -460,13 +460,43 @@ void PTMCMCAlgorithm(struct tagLALInferenceRunState *runState) {
} }
MPI_Bcast(&local_saveStateFlag, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&local_saveStateFlag, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&local_exitFlag, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&local_exitFlag, 1, MPI_INT, 0, MPI_COMM_WORLD);
INT4 saveattempts=0;
INT4 retrydelay=5; /* 5 seconds before initial retry */
INT4 retcode=XLAL_SUCCESS;
/* The following checkpoint code is wrapped in do {} while loops
* to allow 10 retries, in case of filesystem congestion
*/
if(local_saveStateFlag!=0) if(local_saveStateFlag!=0)
{ {
LALInferenceCheckpointMCMC(runState); do
LALInferenceWriteMCMCSamples(runState); {
/* Wait for all processes to save */ XLAL_TRY(LALInferenceCheckpointMCMC(runState), retcode);
MPI_Barrier(MPI_COMM_WORLD); if(retcode!=XLAL_SUCCESS)
__master_saveStateFlag=0; {
saveattempts+=1;
fprintf(stderr,"Process %i failed to write checkpoint file %s \
at attempt %i, waiting to retry\n",MPIrank, runState->resumeOutFileName, saveattempts);
sleep(retrydelay*saveattempts); /* In case of IO failure wait progressively longer */
}
} while (retcode!=XLAL_SUCCESS && saveattempts<10);
if(retcode!=XLAL_SUCCESS) {fprintf(stderr,"Process %i failed to checkpoint\n", MPIrank);}
saveattempts=0;
do
{
XLAL_TRY(LALInferenceWriteMCMCSamples(runState), retcode);
if(retcode!=XLAL_SUCCESS)
{
saveattempts+=1;
fprintf(stderr,"Process %i failed to write samples file %s \
at attempt %i, waiting to retry\n",MPIrank, runState->outFileName, saveattempts);
sleep(retrydelay*saveattempts); /* In case of IO failure wait progressively longer */
}
} while (retcode!=XLAL_SUCCESS && saveattempts<10);
if(retcode!=XLAL_SUCCESS) {fprintf(stderr,"Process %i failed to checkpoint\n", MPIrank);}
/* Wait for all processes to save */
MPI_Barrier(MPI_COMM_WORLD);
__master_saveStateFlag=0;
local_saveStateFlag=0;
} }
if(local_exitFlag) { if(local_exitFlag) {
/* Wait for all processes to be ready to exit */ /* Wait for all processes to be ready to exit */
......
...@@ -2423,6 +2423,8 @@ class EngineJob(LALInferenceDAGJob,pipeline.CondorDAGJob,pipeline.AnalysisJob): ...@@ -2423,6 +2423,8 @@ class EngineJob(LALInferenceDAGJob,pipeline.CondorDAGJob,pipeline.AnalysisJob):
self.set_executable_installed(False) self.set_executable_installed(False)
# Set the options which are always used # Set the options which are always used
self.set_sub_file(os.path.abspath(submitFile)) self.set_sub_file(os.path.abspath(submitFile))
# 500 MB should be enough for anyone
self.add_condor_cmd('request_disk','500M')
if self.engine=='lalinferencemcmc': if self.engine=='lalinferencemcmc':
self.binary=cp.get('condor',self.engine.replace('mpi','')) self.binary=cp.get('condor',self.engine.replace('mpi',''))
self.mpirun=cp.get('condor','mpirun') self.mpirun=cp.get('condor','mpirun')
...@@ -2821,7 +2823,7 @@ def topdir(path): ...@@ -2821,7 +2823,7 @@ def topdir(path):
return topdir(a) return topdir(a)
else: else:
return b return b
class BayesWavePSDNode(EngineNode): class BayesWavePSDNode(EngineNode):
def __init__(self,bayeswavepsd_job): def __init__(self,bayeswavepsd_job):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment