Commit cac7e583 authored by Vivien Raymond

Merge branch 'lalinfmcmc_retry_checkpoint' into 'master'

Add retries to LALInferenceMCMC checkpointing

See merge request !1165
parents b7136ff2 2a0806c8
......@@ -460,13 +460,43 @@ void PTMCMCAlgorithm(struct tagLALInferenceRunState *runState) {
}
MPI_Bcast(&local_saveStateFlag, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&local_exitFlag, 1, MPI_INT, 0, MPI_COMM_WORLD);
INT4 saveattempts=0;
INT4 retrydelay=5; /* 5 seconds before initial retry */
INT4 retcode=XLAL_SUCCESS;
/* The following checkpoint code is wrapped in do {} while loops
* to allow 10 retries, in case of filesystem congestion
*/
if(local_saveStateFlag!=0)
{
LALInferenceCheckpointMCMC(runState);
LALInferenceWriteMCMCSamples(runState);
/* Wait for all processes to save */
MPI_Barrier(MPI_COMM_WORLD);
__master_saveStateFlag=0;
do
{
XLAL_TRY(LALInferenceCheckpointMCMC(runState), retcode);
if(retcode!=XLAL_SUCCESS)
{
saveattempts+=1;
fprintf(stderr,"Process %i failed to write checkpoint file %s \
at attempt %i, waiting to retry\n",MPIrank, runState->resumeOutFileName, saveattempts);
sleep(retrydelay*saveattempts); /* In case of IO failure wait progressively longer */
}
} while (retcode!=XLAL_SUCCESS && saveattempts<10);
if(retcode!=XLAL_SUCCESS) {fprintf(stderr,"Process %i failed to checkpoint\n", MPIrank);}
saveattempts=0;
do
{
XLAL_TRY(LALInferenceWriteMCMCSamples(runState), retcode);
if(retcode!=XLAL_SUCCESS)
{
saveattempts+=1;
fprintf(stderr,"Process %i failed to write samples file %s \
at attempt %i, waiting to retry\n",MPIrank, runState->outFileName, saveattempts);
sleep(retrydelay*saveattempts); /* In case of IO failure wait progressively longer */
}
} while (retcode!=XLAL_SUCCESS && saveattempts<10);
if(retcode!=XLAL_SUCCESS) {fprintf(stderr,"Process %i failed to checkpoint\n", MPIrank);}
/* Wait for all processes to save */
MPI_Barrier(MPI_COMM_WORLD);
__master_saveStateFlag=0;
local_saveStateFlag=0;
}
if(local_exitFlag) {
/* Wait for all processes to be ready to exit */
......
......@@ -2423,6 +2423,8 @@ class EngineJob(LALInferenceDAGJob,pipeline.CondorDAGJob,pipeline.AnalysisJob):
self.set_executable_installed(False)
# Set the options which are always used
self.set_sub_file(os.path.abspath(submitFile))
# 500 MB should be enough for anyone
self.add_condor_cmd('request_disk','500M')
if self.engine=='lalinferencemcmc':
self.binary=cp.get('condor',self.engine.replace('mpi',''))
self.mpirun=cp.get('condor','mpirun')
......@@ -2821,7 +2823,7 @@ def topdir(path):
return topdir(a)
else:
return b
class BayesWavePSDNode(EngineNode):
def __init__(self,bayeswavepsd_job):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment