There will be a short period of downtime for git.ligo.org, docs.ligo.org, and chat.ligo.org, starting at approximately 10am CDT on Tuesday 18th June 2019. This is to enable access controls for GitLab Pages. More information can be found here.

Commit 975e60a4 authored by Jameson Rollins

status channels for tracking subordinate node count of managers

SUBNODES_TOTAL and SUBNODES_NOT_OK give the total count of subnodes and
the subset that are not reporting OK status, respectively.  This info is
pulled from the manager interfaces in the worker subprocess.
parent db4a2b95
......@@ -500,6 +500,9 @@ guardian version: {}
if self['SPM_SNAP']:
self.worker.snapshot_request.set()
self['SPM_SNAP'] = False
# manager status
self['SUBNODES_TOTAL'] = self.worker['SUBNODES_TOTAL']
self['SUBNODES_NOT_OK'] = self.worker['SUBNODES_NOT_OK']
# log manager changes
if self['MANAGER'] != manager:
......
......@@ -166,6 +166,18 @@ guarddb = {
'archive': True,
},
# manager subordinate node count
'SUBNODES_TOTAL': {
'type': 'int',
'value': -1,
},
'SUBNODES_NOT_OK': {
'type': 'int',
'value': -1,
'low': -1,
'high': 1,
},
# daemon message
'GRDMSG': {
'type': 'char',
......
......@@ -77,6 +77,8 @@ class Worker(multiprocessing.Process):
self._shmem['SPM_DIFF'+str(i)+'_S'] = SharedMemString(size=USERMSG_STRING_LENGTH)
self._shmem['SPM_DIFF'+str(i)+'_C'] = SharedMemString(size=USERMSG_STRING_LENGTH)
self._shmem['SPM_DIFF'+str(i)+'_D'] = SharedMemString(size=USERMSG_STRING_LENGTH)
self._shmem['SUBNODES_TOTAL'] = multiprocessing.Value('H', lock=True)
self._shmem['SUBNODES_NOT_OK'] = multiprocessing.Value('H', lock=True)
# load request event
self.load_request = multiprocessing.Event()
......@@ -453,11 +455,18 @@ class Worker(multiprocessing.Process):
try:
# MANAGER: if this is a manager make sure all nodes
# are initialized (node.init() is idempotent)
# MANAGER
subnodes_total = set()
subnodes_not_ok = set()
for mngr in self._system.node_managers:
# make sure all nodes are initialized (node.init() is idempotent)
mngr.init()
# count subordinate nodes
subnodes_total |= set(mngr.nodes.keys())
subnodes_not_ok |= mngr.not_ok()
# FIXME: should we be running check_fault() here?
self['SUBNODES_TOTAL'] = len(subnodes_total)
self['SUBNODES_NOT_OK'] = len(subnodes_not_ok)
###########
# USER CODE
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment