From c5c004615117642c70e740b16705eb10cc687ae5 Mon Sep 17 00:00:00 2001 From: "ezekiel.dohmen" <ezekiel.dohmen@ligo.org> Date: Wed, 13 Mar 2024 15:46:15 -0500 Subject: [PATCH] Saving work for IPC wait --- src/epics/util/skeleton.st | 1 + src/fe/commData3.c | 247 ++++++++++++++++++++++++++----------- src/fe/controllerApp.c | 16 ++- src/fe/controllerIop.c | 14 +++ src/include/commData3.h | 13 +- src/include/controller.h | 1 + 6 files changed, 216 insertions(+), 76 deletions(-) diff --git a/src/epics/util/skeleton.st b/src/epics/util/skeleton.st index e483321a9..794af8030 100644 --- a/src/epics/util/skeleton.st +++ b/src/epics/util/skeleton.st @@ -666,6 +666,7 @@ state monScreen{ pvPut(uptime_minute); festat = pEpics->epicsOutput.fe_status; +%% if(festat == -10) strcpy(msgFESTAT,"IPC Init ERROR "); %% if(festat == -9) strcpy(msgFESTAT,"IO Card Map ERROR "); %% if(festat == -8) strcpy(msgFESTAT,"IO Config ERROR "); %% if(festat == -7) strcpy(msgFESTAT,"ADC Timeout - exiting"); diff --git a/src/fe/commData3.c b/src/fe/commData3.c index 8c186b86a..5a00731ea 100644 --- a/src/fe/commData3.c +++ b/src/fe/commData3.c @@ -17,19 +17,84 @@ /// WARRANTY; without even the implied warranty of MERCHANTABILITY or /// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License /// for more details. + + +/// Information on Waiting for Late IPCs +/// +/// When an IPC is late, we would like to wait some additional time for +/// the IPC to arrive. The simplest approach might be to wait a fixed +/// amount of time, based on model cycle rate. But because commData3Receive() +/// is called first thing in the model's feCode() we don't know how much time +/// we need for the model logic to execute. A better approach would be +/// to take into account an estimate of how long the model has been taking +/// and use that to limit how long we can wait. 
This way models with lots of +/// headroom (lots of models like this) can wait for longer and models with +/// no headroom won't be impaired. +/// +/// The g_max_cycle_time_us variable will store the maximum time the model +/// has taken in the last second, and will be used to estimate the +/// headroom we have to wait for IPCs. It is updated by calling +/// commData3SetCycleMax(); the controller will do this once a second. +/// +/// The g_cycle_start_tsc variable will store the start time TSC counter. +/// This will be used to measure how long we have been waiting and when +/// we should abort our wait. This should be updated by the +/// commData3SetCycleStartTime() function every cycle before calling feCode(). +/// +/// Because waiting for IPCs will in turn make this model send its IPCs later +/// and thus make its receivers get their data later, we could start a chain +/// reaction where IPC errors become more common and then the IPC waiting +/// becomes the main reason for missed IPCs. So this solution will assume +/// mostly good IPCs with the occasional late IPC. That means we should only +/// wait when the last IPC was good, and once an IPC times out, we shouldn't +/// wait for IPCs until they start coming in on time again. +/// + #include "commData3.h" #include "controller.h" //cdsPciModules, _shmipc_shm #include "drv/rts-logger.h" +#include "util/timing.h" +#include "modelRateInfo.h" #ifdef __KERNEL__ #include <asm/cacheflush.h> +#include <linux/delay.h> #else #include <stdio.h> #include <stddef.h> #endif -static int localCycleTo65K; +#define MULT_HEADROOM_TO_WAIT 0.8 //We will wait for 80% of the expected headroom + +/** This is calculated in the init function, and stores a multiplier that will + * convert cycle numbers from models of different rates to the equivalent + * count for a 2^16 Hz model. 
2048Hz(190) -> 65536Hz(6080) + */ +static int g_localCycleTo65K; + +/** The g_max_cycle_time_us variable stores the model's max execution time + * (over the last second) and is updated by the controller once a second. + * This is used to calculate the expected headroom that we have to wait for + * late IPCs. + */ +static uint32_t g_max_cycle_time_us; + +/** The g_cycle_start_tsc is a captured monotonic clock from the start of the + * model's latest cycle. This is used to measure how long we have been waiting + * so we can time out. + */ +static uint64_t g_cycle_start_tsc; + +/** The g_in_error_state variable is used to mark when one or more IPCs have + * timed out. This is used to prevent us from timing out over and over again. + * After we go into this error state, the system will wait until all IPCs + * are received as expected (in a cycle) before starting to wait for + * late IPCs again. + */ +static bool g_in_error_state; + +static bool g_init_failed; /// This function is called from the user application to initialize /// communications structures and pointers. @@ -47,15 +112,21 @@ void commData3Init( //Set ipcMemOffset to default value, will re-set below unsigned long ipcMemOffset = IPC_PCIE_BASE_OFFSET + RFM0_OFFSET; + //We will default to a very short wait time, until commData3SetCycleMax() + //can update it. 
+ g_max_cycle_time_us = 1; + g_in_error_state = false; + g_init_failed = false; + //For now all cycle numbers passed are in terms of a 65K or less //so we don't need to support a divisor if (model_rate_hz >= IPC_MAX_RATE) { - localCycleTo65K = 1; + g_localCycleTo65K = 1; } else { - localCycleTo65K = IPC_MAX_RATE / model_rate_hz; + g_localCycleTo65K = IPC_MAX_RATE / model_rate_hz; } for ( ii = 0; ii < connects; ii++ ) @@ -84,12 +155,12 @@ void commData3Init( RTSLOG_DEBUG("New Recever : sendRate: %d, rcvCycle65k: %d, localCycleTo65K: %d\n", ipcInfo[ ii ].sendRate, ipcInfo[ii].rcvCycle65k, - localCycleTo65K); + g_localCycleTo65K); } // Clear the data point ipcInfo[ ii ].data = 0.0; - ipcInfo[ ii ].pIpcDataRead[ 0 ] = NULL; - ipcInfo[ ii ].pIpcDataWrite[ 0 ] = NULL; + ipcInfo[ ii ].pIpcDataRead = NULL; + ipcInfo[ ii ].pIpcDataWrite = NULL; // Save pointers to the IPC communications memory locations. if ( ipcInfo[ ii ].netType == IRFM0 ) @@ -102,52 +173,58 @@ void commData3Init( ( ipcInfo[ ii ].mode == ISND ) && ( cdsPciModules.dolphinRfmWritePtr ) ) { - ipcInfo[ ii ].pIpcDataWrite[ 0 ] = + ipcInfo[ ii ].pIpcDataWrite = (CDS_IPC_COMMS*)( (volatile char*)( cdsPciModules.dolphinRfmWritePtr ) + ipcMemOffset ); } - if ( ( ipcInfo[ ii ].netType == IRFM0 || + else if ( ( ipcInfo[ ii ].netType == IRFM0 || ipcInfo[ ii ].netType == IRFM1 ) && ( ipcInfo[ ii ].mode == IRCV ) && ( cdsPciModules.dolphinRfmReadPtr ) ) { - ipcInfo[ ii ].pIpcDataRead[ 0 ] = + ipcInfo[ ii ].pIpcDataRead = (CDS_IPC_COMMS*)( (volatile char*)( cdsPciModules.dolphinRfmReadPtr ) + ipcMemOffset ); } - if ( ipcInfo[ ii ].netType == + else if ( ipcInfo[ ii ].netType == ISHME ) // Computer shared memory ****************************** { if ( ipcInfo[ ii ].mode == ISND ) - ipcInfo[ ii ].pIpcDataWrite[ 0 ] = + ipcInfo[ ii ].pIpcDataWrite = (CDS_IPC_COMMS*)( _shmipc_shm + IPC_BASE_OFFSET ); else - ipcInfo[ ii ].pIpcDataRead[ 0 ] = + ipcInfo[ ii ].pIpcDataRead = (CDS_IPC_COMMS*)( _shmipc_shm + IPC_BASE_OFFSET ); } // 
PCIe communications requires one pointer for sending data and a // second one for receiving data. - if ( ( ipcInfo[ ii ].netType == IPCIE ) && + else if ( ( ipcInfo[ ii ].netType == IPCIE ) && ( ipcInfo[ ii ].mode == IRCV ) && ( cdsPciModules.dolphinPcieReadPtr ) ) { - ipcInfo[ ii ].pIpcDataRead[ 0 ] = + ipcInfo[ ii ].pIpcDataRead = (CDS_IPC_COMMS*)( (volatile char*)( cdsPciModules.dolphinPcieReadPtr ) + IPC_PCIE_BASE_OFFSET ); } - if ( ( ipcInfo[ ii ].netType == IPCIE ) && + else if ( ( ipcInfo[ ii ].netType == IPCIE ) && ( ipcInfo[ ii ].mode == ISND ) && ( cdsPciModules.dolphinPcieWritePtr ) ) { - ipcInfo[ ii ].pIpcDataWrite[ 0 ] = + ipcInfo[ ii ].pIpcDataWrite = (CDS_IPC_COMMS*)( (volatile char*)( cdsPciModules.dolphinPcieWritePtr ) + IPC_PCIE_BASE_OFFSET ); } + else //Error case, no match + { + RTSLOG_ERROR("commData3Init() - IPC index %d, was not matched as a supported type. " + "netType: %d, mode: %d",ii , ipcInfo[ ii ].netType, ipcInfo[ ii ].mode ); + g_init_failed = true; + } #if 0 // Following for diags, if desired. 
Otherwise, leave out as it fills dmesg if(ipcInfo[ii].mode == ISND && ipcInfo[ii].netType != ISHME) { @@ -156,12 +233,17 @@ void commData3Init( RTSLOG_DEBUG("Sender Model Name = %s\n",ipcInfo[ii].senderModelName); RTSLOG_DEBUG("RCV Rate = %d\n",ipcInfo[ii].rcvRate); RTSLOG_DEBUG("Send Computer Number = %d\n",ipcInfo[ii].sendNode); - RTSLOG_DEBUG("Send address = %lx\n",(unsigned long)&ipcInfo[ii].pIpcDataWrite[0]->dBlock[0][ipcInfo[ii].ipcNum].data); + RTSLOG_DEBUG("Send address = %lx\n",(unsigned long)&ipcInfo[ii].pIpcDataWrite->dBlock[0][ipcInfo[ii].ipcNum].data); } #endif } } +int commData3GetInitStatus( void ) +{ + return g_init_failed; +} + // ************************************************************************************************* /// This function is called once on FE code init and returns the number of /// IPC senders and receivers so they can be reported in a data channel @@ -265,21 +347,19 @@ void commData3Send( chan = ipcInfo[ ii ].ipcNum; // Determine next block to write in IPC_BLOCKS block buffer ipcIndex = ipcInfo[ ii ].ipcNum; - // Don't write to PCI RFM if network error detected by IOP - if ( ipcInfo[ ii ].pIpcDataWrite[ 0 ] != NULL ) - { - // Write Data - ipcInfo[ ii ] - .pIpcDataWrite[ 0 ] - ->dBlock[ sendBlock ][ ipcIndex ] - .data = ipcInfo[ ii ].data; - // Write timestamp/cycle counter word - ipcInfo[ ii ] - .pIpcDataWrite[ 0 ] - ->dBlock[ sendBlock ][ ipcIndex ] - .timestamp = syncWord; - lastPcie = ii; - } + + // Write Data + ipcInfo[ ii ] + .pIpcDataWrite + ->dBlock[ sendBlock ][ ipcIndex ] + .data = ipcInfo[ ii ].data; + // Write timestamp/cycle counter word + ipcInfo[ ii ] + .pIpcDataWrite + ->dBlock[ sendBlock ][ ipcIndex ] + .timestamp = syncWord; + lastPcie = ii; + } } @@ -289,7 +369,7 @@ void commData3Send( { clflush_cache_range( (void*)&( ipcInfo[ lastPcie ] - .pIpcDataWrite[ 0 ] + .pIpcDataWrite ->dBlock[ sendBlock ][ ipcInfo[ lastPcie ].ipcNum ] .data ), 16 ); @@ -326,24 +406,22 @@ void commData3Send( chan = ipcInfo[ ii 
].ipcNum; // Determine next block to write in IPC_BLOCKS block buffer ipcIndex = ipcInfo[ ii ].ipcNum; - // Don't write to PCI RFM if network error detected by IOP - if ( ipcInfo[ ii ].pIpcDataWrite[ 0 ] != NULL ) + + // Write Data + ipcInfo[ ii ] + .pIpcDataWrite + ->dBlock[ sendBlock ][ ipcIndex ] + .data = ipcInfo[ ii ].data; + // Write timestamp/cycle counter word + ipcInfo[ ii ] + .pIpcDataWrite + ->dBlock[ sendBlock ][ ipcIndex ] + .timestamp = syncWord; + if ( ipcInfo[ ii ].netType == IPCIE ) { - // Write Data - ipcInfo[ ii ] - .pIpcDataWrite[ 0 ] - ->dBlock[ sendBlock ][ ipcIndex ] - .data = ipcInfo[ ii ].data; - // Write timestamp/cycle counter word - ipcInfo[ ii ] - .pIpcDataWrite[ 0 ] - ->dBlock[ sendBlock ][ ipcIndex ] - .timestamp = syncWord; - if ( ipcInfo[ ii ].netType == IPCIE ) - { - lastPcie = ii; - } + lastPcie = ii; } + } } @@ -353,7 +431,7 @@ void commData3Send( { clflush_cache_range( (void*)&( ipcInfo[ lastPcie ] - .pIpcDataWrite[ 0 ] + .pIpcDataWrite ->dBlock[ sendBlock ][ ipcInfo[ lastPcie ].ipcNum ] .data ), 16 ); @@ -383,7 +461,7 @@ int commData3Receive( { unsigned long syncWord; // Combined GPS timestamp and cycle counter word // received with data - unsigned long mySyncWord; // Local version of syncWord for comparison and + unsigned long expectedSyncWord; // Local version of syncWord for comparison and // error detection int ipcIndex; // Pointer to next IPC data buffer int cycle65k; // All data sent with 64K cycle count; need to convert local @@ -392,12 +470,22 @@ int commData3Receive( int rcvBlock; // Which of the IPC_BLOCKS IPC data blocks to read from double tmp; // Temp location for data for checking NaN int numInError=0; + uint64_t max_time_to_wait_ns; + int ipc_good = false; + if ( g_max_cycle_time_us >= ((1.0/FE_RATE)*1000000)) { //Max is larger than cycle time (no headroom) + max_time_to_wait_ns = 0; + } + else { + max_time_to_wait_ns = ((((1.0/FE_RATE)*1000000) - g_max_cycle_time_us) * 1000) * MULT_HEADROOM_TO_WAIT; + } // 
Create local 65K cycle count - cycle65k = ( cycle * localCycleTo65K ); + cycle65k = ( cycle * g_localCycleTo65K ); // Calculate the block where the next data point is at rcvBlock = (cycle65k) % IPC_BLOCKS; + // Create local GPS time/cycle word for comparison to ipc + expectedSyncWord = ( (uint64_t)timeSec << 32 ) + cycle65k; for ( ii = 0; ii < connects; ii++ ) @@ -407,38 +495,42 @@ int commData3Receive( if ( (cycle65k % ipcInfo[ ii ].rcvCycle65k ) == 0 ) // Time to rcv { - if ( ipcInfo[ ii ].pIpcDataRead[ 0 ] != NULL ) - { + do { ipcIndex = ipcInfo[ ii ].ipcNum; // Read GPS time/cycle count tmp = ipcInfo[ ii ] - .pIpcDataRead[ 0 ] - ->dBlock[ rcvBlock ][ ipcIndex ] - .data; + .pIpcDataRead + ->dBlock[ rcvBlock ][ ipcIndex ] + .data; syncWord = ipcInfo[ ii ] - .pIpcDataRead[ 0 ] - ->dBlock[ rcvBlock ][ ipcIndex ] - .timestamp; - mySyncWord = timeSec; - // Create local GPS time/cycle word for comparison to ipc - mySyncWord = ( mySyncWord << 32 ) + cycle65k; + .pIpcDataRead + ->dBlock[ rcvBlock ][ ipcIndex ] + .timestamp; // If IPC syncword = local syncword, data is good - if ( syncWord == mySyncWord ) + if ( syncWord == expectedSyncWord ) { ipcInfo[ ii ].data = tmp; - // If IPC syncword != local syncword, data is BAD - // Set error and leave value same as last good receive + ipc_good = true; + break; //Got latest data, exit wait loop } else { - ipcInfo[ ii ].errFlag++; + //wait, in case IPC is almost here + if ( g_in_error_state == false ) + udelay(1); } - } - else - { + + } while ( timer_tock_ns(&g_cycle_start_tsc) < max_time_to_wait_ns && g_in_error_state == false ); + + //We timed out + if( ipc_good == false ) { ipcInfo[ ii ].errFlag++; + g_in_error_state = true; } + ipc_good = false; //Reset for next IPC + + } if (ipcInfo[ ii ].errFlag != 0) ++numInError; @@ -464,6 +556,21 @@ int commData3Receive( } } //if ( cycle == 0 ) + if( numInError == 0) { + //We got all our IPCs, so we are in a good state + //start waiting for IPCs again + g_in_error_state = false; + } + 
return numInError; } +void commData3SetCycleMax(unsigned measured_max_us) +{ + g_max_cycle_time_us = measured_max_us; +} + +void commData3SetCycleStartTime(uint64_t cycle_start_tsc) +{ + g_cycle_start_tsc = cycle_start_tsc; +} \ No newline at end of file diff --git a/src/fe/controllerApp.c b/src/fe/controllerApp.c index 38a1d9709..baf9eaa8d 100644 --- a/src/fe/controllerApp.c +++ b/src/fe/controllerApp.c @@ -280,7 +280,18 @@ fe_start_controller( void ) dspPtr[ 0 ], &dspCoeff[ 0 ], (struct CDS_EPICS*)pLocalEpics, - 1 ); + FE_CODE_INIT ); + + //Check the status of the IPC init called in feCode() + if ( commData3GetInitStatus() != 0) { + fe_status_return = IPC_INIT_ERROR; + pLocalEpics->epicsOutput.fe_status = IPC_INIT_ERROR; + RTSLOG_ERROR("Failed to initialize IPCs, waiting for an exit signal.\n"); + vmeDone = 1; + wait_for_exit_signal(); + atomic_set(&g_atom_has_exited, 1); + return; + } // Initialize timing info variables initializeTimingDiags( &timeinfo ); @@ -426,6 +437,7 @@ fe_start_controller( void ) /// - -- This is where the user application produced by RCG gets called /// and executed. 
\n\n cpuClock[ CPU_TIME_USR_START ] = rdtsc_ordered( ); + commData3SetCycleStartTime( cpuClock[ CPU_TIME_USR_START ] ); iopDacEnable = feCode( cycleNum, dWord, dacOut, @@ -472,6 +484,8 @@ fe_start_controller( void ) { sendTimingDiags2Epics( pLocalEpics, &timeinfo, &adcinfo ); + commData3SetCycleMax( timeinfo.cycleTimeSec ); + if ( ( adcinfo.adcHoldTime > CYCLE_TIME_ALRM_HI ) || ( adcinfo.adcHoldTime < CYCLE_TIME_ALRM_LO ) ) { diff --git a/src/fe/controllerIop.c b/src/fe/controllerIop.c index 3d37e26da..d03143ffe 100644 --- a/src/fe/controllerIop.c +++ b/src/fe/controllerIop.c @@ -339,6 +339,17 @@ fe_start_controller( void ) (struct CDS_EPICS*)pLocalEpics, FE_CODE_INIT ); + //Check the status of the IPC init called in feCode() + if ( commData3GetInitStatus() != 0) { + fe_status_return = IPC_INIT_ERROR; + pLocalEpics->epicsOutput.fe_status = IPC_INIT_ERROR; + RTSLOG_ERROR("Failed to initialize IPCs, waiting for an exit signal.\n"); + vmeDone = 1; + wait_for_exit_signal(); + atomic_set(&g_atom_has_exited, 1); + return; + } + // Initialize timing info variables initializeTimingDiags( &timeinfo ); @@ -767,6 +778,7 @@ fe_start_controller( void ) // Following captures time after ADC reads cpuClock[ CPU_TIME_USR_START ] = rdtsc_ordered( ); + commData3SetCycleStartTime( cpuClock[ CPU_TIME_USR_START ] ); // In normal operation, the following for loop runs only once per IOP // code cycle. 
This for loop runs > once per cycle if ADC is clocking @@ -1170,6 +1182,8 @@ fe_start_controller( void ) { sendTimingDiags2Epics( pLocalEpics, &timeinfo, &adcinfo ); + commData3SetCycleMax( timeinfo.cycleTimeSec ); + pLocalEpics->epicsOutput.dacEnable = dacEnable; if ( ( adcinfo.adcHoldTime > CYCLE_TIME_ALRM_HI ) || diff --git a/src/include/commData3.h b/src/include/commData3.h index 0c721da99..319b2dbc0 100644 --- a/src/include/commData3.h +++ b/src/include/commData3.h @@ -5,6 +5,8 @@ #ifndef __COMMDATA3_H__ #define __COMMDATA3_H__ +#include "util/fixed_width_types.h" + /// \file commData3.h /// \brief Header file with IPC communications structures /// @@ -61,18 +63,15 @@ typedef struct CDS_IPC_INFO int mode; /// Errors/sec detected for a single IPC int errFlag; - int errFlagS[ 2 ]; /// Marks error to IPC status by network type int errTotal; /// Name of the IPC signal from the user model char* name; /// Name of the model which contains the IPC sender part char* senderModelName; - unsigned long lastSyncWord[ 2 ]; - /// Pointer to the IPC data memory location - CDS_IPC_COMMS* pIpcDataRead[ 2 ]; - CDS_IPC_COMMS* pIpcDataWrite[ 2 ]; /// Pointer to the IPC data memory location + CDS_IPC_COMMS* pIpcDataRead; + CDS_IPC_COMMS* pIpcDataWrite; } CDS_IPC_INFO; @@ -107,6 +106,10 @@ void commData3GetIpcNums(int totalIPCs, CDS_IPC_INFO ipcInfo[], int * numSenders void commData3Send(int connects, CDS_IPC_INFO ipcInfo[], int timeSec, int cycle); int commData3Receive(int connects, CDS_IPC_INFO ipcInfo[], int timeSec, int cycle); +void commData3SetCycleMax(unsigned measured_max_us); +void commData3SetCycleStartTime(uint64_t cycle_start_tsc); +int commData3GetInitStatus( void ); + #ifdef __cplusplus } diff --git a/src/include/controller.h b/src/include/controller.h index d7f30184b..fdb38dbd4 100644 --- a/src/include/controller.h +++ b/src/include/controller.h @@ -76,6 +76,7 @@ extern char fp[ 64 * 1024 ]; #define CPU_TIME_ADC_WAIT 9 // fe_state defs +#define IPC_INIT_ERROR -10 
#define IO_CARD_MAP_ERROR -9 #define IO_CONFIG_ERROR -8 #define ADC_TO_ERROR -7 -- GitLab