From 8cf948592141eaf7a217e449a3be089d979a5308 Mon Sep 17 00:00:00 2001 From: Ezekiel Dohmen <ezekiel.dohmen@ligo.org> Date: Tue, 26 Apr 2022 16:10:11 -0700 Subject: [PATCH] Adding atomic vars to signal kernel models should exit --- src/fe/controllerApp.c | 13 +++- src/fe/controllerIop.c | 12 +++- src/fe/moduleLoad.c | 81 ++++++++++++++---------- src/include/util/kernel/exit_signaling.h | 24 +++++++ 4 files changed, 92 insertions(+), 38 deletions(-) create mode 100644 src/include/util/kernel/exit_signaling.h diff --git a/src/fe/controllerApp.c b/src/fe/controllerApp.c index 782a483b0..b18a103b4 100644 --- a/src/fe/controllerApp.c +++ b/src/fe/controllerApp.c @@ -36,7 +36,7 @@ #include "modelRateInfo.h" #include "fm10Gen.h" #include "util/printl.h" - +#include "util/kernel/exit_signaling.h" #include "../fe/timing_common.h" //captureEocTiming #include "../fe/timing_kernel.h" @@ -204,6 +204,8 @@ fe_start_controller( void ) if ( initVars( pDsp[ 0 ], pDsp[ 0 ], dspCoeff, MAX_MODULES, pCoeff[ 0 ] ) ) { pLocalEpics->epicsOutput.fe_status = FILT_INIT_ERROR; + wait_for_exit_signal(); + atomic_set(&g_atom_has_exited, 1); return; } @@ -243,6 +245,8 @@ fe_start_controller( void ) { pLocalEpics->epicsOutput.fe_status = DAQ_INIT_ERROR; vmeDone = 1; + wait_for_exit_signal(); + atomic_set(&g_atom_has_exited, 1); return; } #endif @@ -287,6 +291,8 @@ fe_start_controller( void ) if ( status ) { pLocalEpics->epicsOutput.fe_status = DAC_INIT_ERROR; + wait_for_exit_signal(); + atomic_set(&g_atom_has_exited, 1); return; } @@ -385,6 +391,8 @@ fe_start_controller( void ) pLocalEpics->epicsOutput.diagWord |= ADC_TIMEOUT_ERR; pLocalEpics->epicsOutput.fe_status = ADC_TO_ERROR; deallocate_dac_channels( ); + wait_for_exit_signal(); + atomic_set(&g_atom_has_exited, 1); return; } @@ -497,7 +505,7 @@ fe_start_controller( void ) // ***************************************************************** /// \> Check if code exit is requested if ( cycleNum == MAX_MODULES ) - vmeDone = stop_working_threads | 
+ vmeDone = atomic_read(&g_atom_should_exit) | checkEpicsReset( cycleNum, (struct CDS_EPICS*)pLocalEpics ); // ***************************************************************** @@ -673,5 +681,6 @@ fe_start_controller( void ) deallocate_dac_channels( ); /* System reset command received */ + atomic_set(&g_atom_has_exited, 1); return; } diff --git a/src/fe/controllerIop.c b/src/fe/controllerIop.c index 259f7b166..942d28f43 100644 --- a/src/fe/controllerIop.c +++ b/src/fe/controllerIop.c @@ -37,6 +37,7 @@ #include "fm10Gen_types.h" #include "controller.h" #include "modelRateInfo.h" +#include "util/kernel/exit_signaling.h" #include "drv/daqLib.h" #include "../fe/timing_kernel.h" #include "../fe/sync21pps.h" @@ -256,6 +257,8 @@ fe_start_controller( void ) { pLocalEpics->epicsOutput.fe_status = FILT_INIT_ERROR; fe_status_return = FILT_INIT_ERROR; + wait_for_exit_signal(); + atomic_set(&g_atom_has_exited, 1); return; } @@ -291,6 +294,8 @@ fe_start_controller( void ) pLocalEpics->epicsOutput.fe_status = DAQ_INIT_ERROR; fe_status_return = DAQ_INIT_ERROR; vmeDone = 1; + wait_for_exit_signal(); + atomic_set(&g_atom_has_exited, 1); return; } @@ -619,7 +624,7 @@ fe_start_controller( void ) pLocalEpics->epicsOutput.diagWord |= ADC_TIMEOUT_ERR; pLocalEpics->epicsOutput.fe_status = ADC_TO_ERROR; pLocalEpics->epicsOutput.epicsSync++; - stop_working_threads = 1; + atomic_set(&g_atom_should_exit, 1); vmeDone = 1; continue; } @@ -631,7 +636,7 @@ fe_start_controller( void ) feStatus |= adc_status_update( &adcinfo ); pLocalEpics->epicsOutput.stateWord = FE_ERROR_ADC; pLocalEpics->epicsOutput.fe_status = CHAN_HOP_ERROR; - stop_working_threads = 1; + atomic_set(&g_atom_should_exit, 1); vmeDone = 1; pLocalEpics->epicsOutput.epicsSync++; continue; @@ -884,7 +889,7 @@ fe_start_controller( void ) // ***************************************************************** /// \> Check if code exit is requested if ( cycleNum == MAX_MODULES ) - vmeDone = stop_working_threads | + vmeDone = 
atomic_read(&g_atom_should_exit) | checkEpicsReset( cycleNum, (struct CDS_EPICS*)pLocalEpics ); // ***************************************************************** @@ -1249,5 +1254,6 @@ fe_start_controller( void ) pLocalEpics->epicsOutput.cpuMeter = 0; /* System reset command received */ + atomic_set(&g_atom_has_exited, 1); return; } diff --git a/src/fe/moduleLoad.c b/src/fe/moduleLoad.c index 573c7dfca..58d6b4f76 100644 --- a/src/fe/moduleLoad.c +++ b/src/fe/moduleLoad.c @@ -8,6 +8,8 @@ #include "verify_card_count.h" #include "print_io_info.h" #include "util/printl.h" +#include "util/timing.h" +#include "util/kernel/exit_signaling.h" #include "drv/map.h" //mapPciModules() #include "drv/ligoPcieTiming.h" #include "../fe/verify_slots.h" @@ -29,10 +31,18 @@ extern void fe_start_controller( void ); // // File function prototypes // -void rt_fe_cleanup( void ); -int rt_fe_init( void ); +static void rt_fe_cleanup( void ); +static int rt_fe_init( void ); + +// +// Signaling variables for proper module shutdown logic +atomic_t g_atom_should_exit = ATOMIC_INIT(0); +atomic_t g_atom_has_exited = ATOMIC_INIT(0); +// +// This symbol is used to enforce the IOP model +// is first loaded, before any app models #ifdef IOP_MODEL int need_to_load_IOP_first; EXPORT_SYMBOL( need_to_load_IOP_first ); @@ -55,14 +65,13 @@ static int fe_start_controller_kthread( void * arg ) fe_start_controller(); return 0; } - #endif //NO_CPU_SHUTDOWN -// MAIN routine: Code starting point + +// Linux Module init: Code starting point // **************************************************************** /// Startup function for initialization of kernel module. -int -rt_fe_init( void ) +static int __init rt_fe_init( void ) { int jj, kk; /// @param ii,jj,kk default loop counters int cards; /// @param cards Number of PCIe cards found on bus @@ -85,7 +94,7 @@ rt_fe_init( void ) #ifndef NO_CPU_SHUTDOWN /// Verify requested core is free. 
- if ( is_cpu_taken_by_rcg_model( CPUID ) ) + if ( is_cpu_occupied( CPUID ) ) { printl( KERN_ALERT "Error: CPU %d already taken\n", CPUID ); return -1; @@ -141,7 +150,6 @@ rt_fe_init( void ) { pLocalEpics->epicsOutput.fe_status = IO_CARD_MAP_ERROR; rt_fe_cleanup( ); - detach_shared_memory(); return -5; } @@ -151,7 +159,6 @@ rt_fe_init( void ) printl( "" SYSTEM_NAME_STRING_LOWER ": ERROR: No ADC cards found - exiting\n" ); rt_fe_cleanup( ); - detach_shared_memory(); return -5; } @@ -231,7 +238,6 @@ rt_fe_init( void ) printl( "" SYSTEM_NAME_STRING_LOWER ": ERROR: Exit on incorrect card count \n" ); rt_fe_cleanup( ); - detach_shared_memory(); return -5; } #endif @@ -262,7 +268,7 @@ rt_fe_init( void ) printl( "" SYSTEM_NAME_STRING_LOWER ": Locking CPU core %d\n", CPUID ); // The code runs on the disabled CPU - set_fe_code_idle( fe_start_controller, CPUID ); + set_rt_callback( fe_start_controller, CPUID ); msleep( 100 ); rts_isolator_exec( CPUID ); @@ -270,40 +276,49 @@ rt_fe_init( void ) return 0; } +void wait_for_module_exit(void) +{ + uint64_t stop_sig_time_ns = getMonotonic_ns_utin64(); + if( is_cpu_occupied(CPUID) ) + { + set_rt_callback( 0, CPUID ); + + // Wait for the module to signal that it has exited + while (atomic_read(&g_atom_has_exited) == 0) + { + msleep( 1 ); + } + + printl("It took %llu ms for the RT code to exit.\n", + (getMonotonic_ns_utin64() - stop_sig_time_ns)/1000000); + + set_rt_callback( 0, CPUID ); + msleep( 1000 ); + + // Bring the CPU back up + rts_isolator_cleanup( CPUID ); + } +} + /// Kernel module cleanup function -void -rt_fe_cleanup( void ) +static void rt_fe_cleanup( void ) { -#ifndef NO_CPU_SHUTDOWN - /// Unset the code callback - set_fe_code_idle( 0, CPUID ); -#endif - // printl("Setting stop_working_threads to 1\n"); - // Stop the code and wait + // Signal the model to stop + atomic_set(&g_atom_should_exit, 1); + #ifdef NO_CPU_SHUTDOWN kthread_stop( sthread ); -#endif - stop_working_threads = 1; msleep( 1000 ); +#else +
wait_for_module_exit(); +#endif #ifdef DOLPHIN_TEST /// Cleanup Dolphin card connections finish_dolphin( ); #endif -#ifndef NO_CPU_SHUTDOWN - - /// Bring the CPU core back on line - // Unset the code callback - set_fe_code_idle( 0, CPUID ); - // printll("Will bring back CPU %d\n", CPUID); - msleep( 1000 ); - // Bring the CPU back up - rts_isolator_cleanup( CPUID ); - msleep( 1000 ); -#endif - // Print out any error messages from FE code on exit print_exit_messages( fe_status_return, fe_status_return_subcode, SYSTEM_NAME_STRING_LOWER ); detach_shared_memory( ); diff --git a/src/include/util/kernel/exit_signaling.h b/src/include/util/kernel/exit_signaling.h new file mode 100644 index 000000000..675b9c04f --- /dev/null +++ b/src/include/util/kernel/exit_signaling.h @@ -0,0 +1,24 @@ +#ifndef LIGO_EXIT_SIGNALING_H +#define LIGO_EXIT_SIGNALING_H + +#include <linux/types.h> +#include <linux/delay.h> +#include <linux/atomic.h> + +// Used to signal the model to exit, defined in fe/moduleLoad.c +extern atomic_t g_atom_should_exit; +// Used by fe_start_controller() (The model) to signal it has exited, +// defined in fe/moduleLoad.c +extern atomic_t g_atom_has_exited; + +// When a model experiences a nonrecoverable error and needs to exit, it busy-waits +// in this function until g_atom_should_exit is set, i.e. until the model is rmmod-ed +static inline void wait_for_exit_signal(void) +{ + while(atomic_read(&g_atom_should_exit) == 0) + { + } +} + + +#endif //LIGO_EXIT_SIGNALING_H -- GitLab