From 8cf948592141eaf7a217e449a3be089d979a5308 Mon Sep 17 00:00:00 2001
From: Ezekiel Dohmen <ezekiel.dohmen@ligo.org>
Date: Tue, 26 Apr 2022 16:10:11 -0700
Subject: [PATCH] Adding atomic vars to signal kernel models should exit

---
 src/fe/controllerApp.c                   | 13 +++-
 src/fe/controllerIop.c                   | 12 +++-
 src/fe/moduleLoad.c                      | 81 ++++++++++++++----------
 src/include/util/kernel/exit_signaling.h | 24 +++++++
 4 files changed, 92 insertions(+), 38 deletions(-)
 create mode 100644 src/include/util/kernel/exit_signaling.h

diff --git a/src/fe/controllerApp.c b/src/fe/controllerApp.c
index 782a483b0..b18a103b4 100644
--- a/src/fe/controllerApp.c
+++ b/src/fe/controllerApp.c
@@ -36,7 +36,7 @@
 #include "modelRateInfo.h"
 #include "fm10Gen.h"
 #include "util/printl.h"
-
+#include "util/kernel/exit_signaling.h"
 #include "../fe/timing_common.h" //captureEocTiming
 #include "../fe/timing_kernel.h"
 
@@ -204,6 +204,8 @@ fe_start_controller( void )
     if ( initVars( pDsp[ 0 ], pDsp[ 0 ], dspCoeff, MAX_MODULES, pCoeff[ 0 ] ) )
     {
         pLocalEpics->epicsOutput.fe_status = FILT_INIT_ERROR;
+        wait_for_exit_signal();
+        atomic_set(&g_atom_has_exited, 1);
         return;
     }
 
@@ -243,6 +245,8 @@ fe_start_controller( void )
     {
         pLocalEpics->epicsOutput.fe_status = DAQ_INIT_ERROR;
         vmeDone = 1;
+        wait_for_exit_signal();
+        atomic_set(&g_atom_has_exited, 1);
         return;
     }
 #endif
@@ -287,6 +291,8 @@ fe_start_controller( void )
     if ( status )
     {
         pLocalEpics->epicsOutput.fe_status = DAC_INIT_ERROR;
+        wait_for_exit_signal();
+        atomic_set(&g_atom_has_exited, 1);
         return;
     }
 
@@ -385,6 +391,8 @@ fe_start_controller( void )
                 pLocalEpics->epicsOutput.diagWord |= ADC_TIMEOUT_ERR;
                 pLocalEpics->epicsOutput.fe_status = ADC_TO_ERROR;
                 deallocate_dac_channels( );
+                wait_for_exit_signal();
+                atomic_set(&g_atom_has_exited, 1);
                 return;
             }
 
@@ -497,7 +505,7 @@ fe_start_controller( void )
         // *****************************************************************
         /// \> Check if code exit is requested
         if ( cycleNum == MAX_MODULES )
-            vmeDone = stop_working_threads |
+            vmeDone =  atomic_read(&g_atom_should_exit) |
                 checkEpicsReset( cycleNum, (struct CDS_EPICS*)pLocalEpics );
         // *****************************************************************
 
@@ -673,5 +681,6 @@ fe_start_controller( void )
     deallocate_dac_channels( );
 
     /* System reset command received */
+    atomic_set(&g_atom_has_exited, 1);
     return;
 }
diff --git a/src/fe/controllerIop.c b/src/fe/controllerIop.c
index 259f7b166..942d28f43 100644
--- a/src/fe/controllerIop.c
+++ b/src/fe/controllerIop.c
@@ -37,6 +37,7 @@
 #include "fm10Gen_types.h"
 #include "controller.h"
 #include "modelRateInfo.h"
+#include "util/kernel/exit_signaling.h"
 #include "drv/daqLib.h"
 #include "../fe/timing_kernel.h"
 #include "../fe/sync21pps.h"
@@ -256,6 +257,8 @@ fe_start_controller( void )
     {
         pLocalEpics->epicsOutput.fe_status = FILT_INIT_ERROR;
         fe_status_return = FILT_INIT_ERROR;
+        wait_for_exit_signal();
+        atomic_set(&g_atom_has_exited, 1);
         return;
     }
 
@@ -291,6 +294,8 @@ fe_start_controller( void )
         pLocalEpics->epicsOutput.fe_status = DAQ_INIT_ERROR;
         fe_status_return = DAQ_INIT_ERROR;
         vmeDone = 1;
+        wait_for_exit_signal();
+        atomic_set(&g_atom_has_exited, 1);
         return;
     }
 
@@ -619,7 +624,7 @@ fe_start_controller( void )
             pLocalEpics->epicsOutput.diagWord |= ADC_TIMEOUT_ERR;
             pLocalEpics->epicsOutput.fe_status = ADC_TO_ERROR;
             pLocalEpics->epicsOutput.epicsSync++;
-            stop_working_threads = 1;
+            atomic_set(&g_atom_should_exit, 1);
             vmeDone = 1;
             continue;
         }
@@ -631,7 +636,7 @@ fe_start_controller( void )
             feStatus |= adc_status_update( &adcinfo );
             pLocalEpics->epicsOutput.stateWord = FE_ERROR_ADC;
             pLocalEpics->epicsOutput.fe_status = CHAN_HOP_ERROR;
-            stop_working_threads = 1;
+            atomic_set(&g_atom_should_exit, 1);
             vmeDone = 1;
             pLocalEpics->epicsOutput.epicsSync++;
             continue;
@@ -884,7 +889,7 @@ fe_start_controller( void )
             // *****************************************************************
             /// \> Check if code exit is requested
             if ( cycleNum == MAX_MODULES )
-                vmeDone = stop_working_threads |
+                vmeDone = atomic_read(&g_atom_should_exit) |
                     checkEpicsReset( cycleNum, (struct CDS_EPICS*)pLocalEpics );
 
             // *****************************************************************
@@ -1249,5 +1254,6 @@ fe_start_controller( void )
     pLocalEpics->epicsOutput.cpuMeter = 0;
 
     /* System reset command received */
+    atomic_set(&g_atom_has_exited, 1);
     return;
 }
diff --git a/src/fe/moduleLoad.c b/src/fe/moduleLoad.c
index 573c7dfca..58d6b4f76 100644
--- a/src/fe/moduleLoad.c
+++ b/src/fe/moduleLoad.c
@@ -8,6 +8,8 @@
 #include "verify_card_count.h"
 #include "print_io_info.h"
 #include "util/printl.h"
+#include "util/timing.h"
+#include "util/kernel/exit_signaling.h"
 #include "drv/map.h" //mapPciModules()
 #include "drv/ligoPcieTiming.h"
 #include "../fe/verify_slots.h"
@@ -29,10 +31,18 @@ extern void  fe_start_controller( void );
 //
 // File function prototypes
 //
-void rt_fe_cleanup( void );
-int rt_fe_init( void );
+static void rt_fe_cleanup( void );
+static int rt_fe_init( void );
+
+//
+// Atomic flags coordinating shutdown between module cleanup and the model
+atomic_t g_atom_should_exit = ATOMIC_INIT(0);
+atomic_t g_atom_has_exited = ATOMIC_INIT(0);
 
 
+//
+// This symbol is used to enforce that the IOP model
+// is loaded first, before any app models
 #ifdef IOP_MODEL
 int need_to_load_IOP_first;
 EXPORT_SYMBOL( need_to_load_IOP_first );
@@ -55,14 +65,13 @@ static int  fe_start_controller_kthread( void * arg )
     fe_start_controller();
     return 0;
 }
-
 #endif //NO_CPU_SHUTDOWN
 
-// MAIN routine: Code starting point
+
+// Linux Module init: Code starting point
 // ****************************************************************
 /// Startup function for initialization of kernel module.
-int
-rt_fe_init( void )
+static int __init rt_fe_init( void )
 {
     int jj, kk; /// @param ii,jj,kk default loop counters
     int cards; /// @param cards Number of PCIe cards found on bus
@@ -85,7 +94,7 @@ rt_fe_init( void )
 
 #ifndef NO_CPU_SHUTDOWN
     /// Verify requested core is free.
-    if ( is_cpu_taken_by_rcg_model( CPUID ) )
+    if ( is_cpu_occupied( CPUID ) )
     {
         printl( KERN_ALERT "Error: CPU %d already taken\n", CPUID );
         return -1;
@@ -141,7 +150,6 @@ rt_fe_init( void )
     {
         pLocalEpics->epicsOutput.fe_status = IO_CARD_MAP_ERROR;
         rt_fe_cleanup( );
-        detach_shared_memory();
         return -5;
     }
 
@@ -151,7 +159,6 @@ rt_fe_init( void )
         printl( "" SYSTEM_NAME_STRING_LOWER
                 ": ERROR: No ADC cards found - exiting\n" );
         rt_fe_cleanup( );
-        detach_shared_memory();
         return -5;
     }
 
@@ -231,7 +238,6 @@ rt_fe_init( void )
         printl( "" SYSTEM_NAME_STRING_LOWER
                 ": ERROR: Exit on incorrect card count \n" );
         rt_fe_cleanup( );
-        detach_shared_memory();
         return -5;
     }
 #endif
@@ -262,7 +268,7 @@ rt_fe_init( void )
     printl( "" SYSTEM_NAME_STRING_LOWER ": Locking CPU core %d\n", CPUID );
 
     // The code runs on the disabled CPU
-    set_fe_code_idle( fe_start_controller, CPUID );
+    set_rt_callback( fe_start_controller, CPUID );
     msleep( 100 );
     rts_isolator_exec( CPUID );
 
@@ -270,40 +276,49 @@ rt_fe_init( void )
     return 0;
 }
 
+// Unset the RT callback, wait for the model to exit, then free the CPU
+void wait_for_module_exit(void)
+{
+    uint64_t stop_sig_time_ns = getMonotonic_ns_utin64();
+    uint64_t elapsed_ms;
+
+    // Nothing to do when the core is not occupied
+    if( !is_cpu_occupied(CPUID) )
+        return;
+
+    set_rt_callback( 0, CPUID );
+    // Sleep-poll until the model flags that it has exited
+    while (atomic_read(&g_atom_has_exited) == 0)
+        msleep( 1 );
+
+    elapsed_ms = (getMonotonic_ns_utin64() - stop_sig_time_ns)/1000000;
+    printl("It took %llu ms for the RT code to exit.\n",
+           (unsigned long long)elapsed_ms);
+
+    set_rt_callback( 0, CPUID );
+    msleep( 1000 );
+    rts_isolator_cleanup( CPUID ); // Bring the CPU back up
+}
+
 /// Kernel module cleanup function
-void
-rt_fe_cleanup( void )
+static void __exit rt_fe_cleanup( void )
 {
-#ifndef NO_CPU_SHUTDOWN
-    /// Unset the code callback
-    set_fe_code_idle( 0, CPUID );
-#endif
 
-    // printl("Setting stop_working_threads to 1\n");
-    // Stop the code and wait
+    // Signal the model to stop
+    atomic_set(&g_atom_should_exit, 1);
+
 #ifdef NO_CPU_SHUTDOWN
     kthread_stop( sthread );
-#endif
-    stop_working_threads = 1;
     msleep( 1000 );
+#else
+    wait_for_module_exit();
+#endif
 
 #ifdef DOLPHIN_TEST
     /// Cleanup Dolphin card connections
     finish_dolphin( );
 #endif
 
-#ifndef NO_CPU_SHUTDOWN
-
-    /// Bring the CPU core back on line
-    // Unset the code callback
-    set_fe_code_idle( 0, CPUID );
-    // printll("Will bring back CPU %d\n", CPUID);
-    msleep( 1000 );
-    // Bring the CPU back up
-    rts_isolator_cleanup( CPUID );
-    msleep( 1000 );
-#endif
-
     // Print out any error messages from FE code on exit
     print_exit_messages( fe_status_return, fe_status_return_subcode, SYSTEM_NAME_STRING_LOWER );
     detach_shared_memory( );
diff --git a/src/include/util/kernel/exit_signaling.h b/src/include/util/kernel/exit_signaling.h
new file mode 100644
index 000000000..675b9c04f
--- /dev/null
+++ b/src/include/util/kernel/exit_signaling.h
@@ -0,0 +1,24 @@
+#ifndef LIGO_EXIT_SIGNALING_H
+#define LIGO_EXIT_SIGNALING_H
+
+#include <linux/types.h>
+#include <linux/delay.h>
+#include <linux/atomic.h>
+#include <asm/processor.h> // cpu_relax()
+
+// Set to 1 to tell the model to exit, defined in fe/moduleLoad.c
+extern atomic_t g_atom_should_exit;
+// Set to 1 by fe_start_controller() (the model) once it has exited,
+// defined in fe/moduleLoad.c
+extern atomic_t g_atom_has_exited;
+
+// When a model experiences a nonrecoverable error and needs to exit,
+// it spins in this function until g_atom_should_exit is set (the module
+// cleanup path sets it on rmmod).
+static inline void wait_for_exit_signal(void)
+{
+    while(atomic_read(&g_atom_should_exit) == 0)
+        cpu_relax(); // spin-wait hint to the CPU
+}
+
+#endif //LIGO_EXIT_SIGNALING_H
-- 
GitLab