manager.py 14.4 KB
Newer Older
1
import functools
2

Jameson Rollins's avatar
Jameson Rollins committed
3
from .state import GuardStateDecorator
4

5

6 7 8 9
class NodeError(Exception):
    """Error raised for a problem with a managed node.

    Raised by Node.set_request() when a requested state is not
    accepted, and re-raised by NodeManager.__setitem__() with the
    node name prepended to the message.

    """

class NodeConnectError(Exception):
    """Error type for node connection problems.

    Derives directly from Exception (not from NodeError), so handlers
    that catch NodeError do not catch this exception.

    """

12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35

def node_checker(node_manager, fail_return=None):
    """Build a GuardStateDecorator that checks Node fault status.

    `node_manager` is any object providing a check_fault() method
    (a Node or a NodeManager).  The returned GuardStateDecorator
    subclass has its pre_exec method wired to that check_fault()
    call.  When the check reports a fault, pre_exec returns
    `fail_return`, which should be the alternate return value for the
    decorated state method (i.e. a jump state name).  The default is
    None.  When no fault is reported pre_exec returns None.

    """
    # The decorator's pre_exec only needs the node_manager captured
    # from the enclosing scope.  The single positional argument it
    # receives would have been the GuardStateDecorator "self", which
    # is of no use here and is therefore ignored.
    class checker(GuardStateDecorator):
        def pre_exec(_ignored):
            return fail_return if node_manager.check_fault() else None

    return checker


# FIXME: handle rejected requests
# FIXME: check for dead node

class Node(object):
    """Manager interface to a single Guardian node.

    >>> SUS_ETMX = Node('SUS_ETMX')  # create the node object
    >>> SUS_ETMX.init()              # initialize (handled automatically in daemon)
    >>> SUS_ETMX.set_managed()       # set node to be in MANAGED mode
    >>> SUS_ETMX.set_request('DAMPED') # request DAMPED state from node
    >>> SUS_ETMX.arrived             # True if node arrived at requested state
    >>> SUS_ETMX.check_fault()       # Check for management-related "faults" in the Node
    >>> SUS_ETMX.release()           # release node from management

    All channel access goes through the `ezca` object, and the names
    `log`, `notify` and `_SYSTEM_` are likewise expected to be
    available in the execution environment (presumably injected by the
    guardian runtime; they are not imported by this module).

    """

    # node attributes: channel name suffixes connected by init().
    # NOTE(review): 'OK' is read by the OK property but is not listed
    # here, so init() does not pre-connect it -- confirm intentional.
    attrs = ('OP',
             'MODE',
             'MANAGER',
             'REQUEST',
             'STATE_S',
             'TARGET_S',
             'REQUEST_S',
             'STATUS',
             'STALLED',
             'ERROR',
             'NOTIFICATION',
             )

    def __init__(self, name):
        """Create the interface object for the node called `name`.

        No channel access happens here; see init().

        """
        self.__name = name
        # channel name prefix shared by all attributes of this node
        self.__prefix = ':GRD-{name}_'.format(name=self.name)
        # cache that we have set node to be managed
        self.__managed = False
        # cache of requested state
        self.__request = None
        # initialization state
        self.__initialized = False

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.__name)

    def __str__(self):
        if self.__initialized:
            s = 'initialized'
        else:
            s = 'uninitialized'
        return "<%s '%s', %s>" % (self.__class__.__name__, self.__name, s)

    @property
    def name(self):
        """Node name"""
        return self.__name

    def __get(self, attr):
        """Read attribute `attr` of the node, as a string."""
        return ezca.read(self.__prefix+attr, as_string=True)

    def __get_eval(self, attr):
        """Read attribute `attr` of the node and evaluate the string.

        NOTE(review): this runs eval() on a string read from a
        channel.  Presumably the value is always 'True' or 'False';
        if that is confirmed, ast.literal_eval() (or a plain string
        comparison) would be a safer equivalent.

        """
        return eval(self.__get(attr))

    def __put(self, attr, value):
        """Write `value` to attribute `attr` of the node.

        Only MANAGER, MODE and REQUEST may be written.  The assertion
        guards an internal invariant: this method is only called from
        within this class.

        """
        assert attr in ['MODE', 'MANAGER', 'REQUEST'], "MANAGER, MODE and REQUEST are the only settable node attributes."
        return ezca.write(self.__prefix+attr, value)

    ########################################

    def init(self):
        """Initialize the node.

        Under normal circumstances, i.e. in a running guardian daemon,
        node initialization is handled automatically.  This function
        therefore does not need to be executed in user code.

        Connects all channels listed in `attrs`.  If the node already
        lists this system as its manager, the internal "managed" and
        "request" caches are primed from the node's current values.
        Calling init() again after a successful initialization is a
        no-op.

        """
        # FIXME: could eliminate this entirely by just using the rote
        # ezca object
        if self.__initialized:
            return

        # initialize node PVs
        for attr in self.attrs:
            ezca.connect(self.__prefix+attr)

        # if the node is currently set to be managed by me
        if self.i_manage:

            # update the internal manage indicator
            self.__managed = True

            # set the internal request indicator based on the current
            # request (assuming it was set by me)
            if not self.__request:
                self.__request = self.REQUEST

        self.__initialized = True

    def __eq__(self, state):
        """True if node state string equals string."""
        return self.state == state

    def __ne__(self, state):
        """True if node state string does not equal string."""
        return not self.__eq__(state)

    # Defining __eq__ makes a class unhashable on Python 3 unless
    # __hash__ is also provided.  Equality here compares the node's
    # *state* against a string, so it is unrelated to object identity;
    # keep the default identity-based hash (which is what Python 2
    # uses anyway) so Node objects remain usable in sets/dict keys.
    __hash__ = object.__hash__

    @property
    def OP(self):
        """node OP"""
        return self.__get('OP')

    @property
    def MODE(self):
        """node MODE"""
        return self.__get('MODE')

    @property
    def managed(self):
        """True if node is MANAGED"""
        return self.MODE == 'MANAGED'

    @property
    def MANAGER(self):
        """MANAGER string of node"""
        return self.__get('MANAGER')

    def set_managed(self):
        """Set node to be managed by this manager.

        Writes this system's identifier (`_SYSTEM_`) to the node's
        MANAGER attribute and records internally that management was
        requested, so that check_fault() can later verify it.

        """
        # note that we have set node to be managed
        self.__managed = True
        self.__put('MANAGER', _SYSTEM_)

    def release(self):
        """Release node from management by this manager (MODE=>AUTO)."""
        log("Releasing node from management: %s" % self.name)
        self.__put('MODE', 'AUTO')
        self.__managed = False

    @property
    def i_manage(self):
        """True if node is being managed by this system"""
        return self.MANAGER == _SYSTEM_

    @property
    def ERROR(self):
        """True if node in ERROR."""
        return self.__get_eval('ERROR')

    @property
    def NOTIFICATION(self):
        """True if node NOTIFICATION present."""
        return self.__get_eval('NOTIFICATION')

    @property
    def OK(self):
        """Current OK status of node."""
        return self.__get_eval('OK')

    @property
    def REQUEST(self):
        """Current REQUEST state of node."""
        return self.__get('REQUEST_S')
    request = REQUEST

    def set_request(self, state):
        """Set REQUEST state for node.

        `state` must not be None.  After writing the request, the
        value is read back directly from the channel; a NodeError is
        raised if the read-back value differs from `state`, which is
        taken to mean the request was invalid.  On success the state
        is remembered as the last request made through this object.

        """
        assert state is not None
        self.__put('REQUEST', state)
        # HACK: We explicitly get the value without relying on the
        # monitor (use_monitor=False) to make sure we have the
        # post-update value.  If it doesn't match what we put then the
        # value was invalid.  Should be a better way to do this.
        pv = ezca.connect(self.__prefix+'REQUEST')
        if pv.get(use_monitor=False) != state:
            raise NodeError("Invalid REQUEST: {}".format(state))
        self.__request = state
        # FIXME: set call back to record when subordinate makes it to
        # request, for benefit of arrived.

    @property
    def STATE(self):
        """Current STATE of node."""
        return self.__get('STATE_S')
    state = STATE

    @property
    def TARGET(self):
        """Current TARGET state of node."""
        return self.__get('TARGET_S')

    @property
    def arrived(self):
        """True if node STATE equals the last manager-requested state.

        NOTE: This will be False if STATE == REQUEST but REQUEST was
        not last set by this Node manager object.  This prevents false
        positives in the case that the REQUEST has been changed out of
        band.

        """
        # FIXME: Maybe this should be a latching value?  It could be
        # reset when the request is set, and set True once it arrives,
        # and then not change when it loses the requested state.
        # Would need some sort of request callback for this.

        # assume true if no request has ever been issued for this
        # node.
        if not self.__request:
            return True
        return self.STATE == self.__request

    @property
    def STATUS(self):
        """Current STATUS of node."""
        return self.__get('STATUS')

    @property
    def done(self):
        """True if STATUS is DONE.

        A state is DONE if it is the requested state and the state
        method has returned True.

        """
        return self.STATUS == 'DONE'

    @property
    def completed(self):
        """True if node has arrived at the request state, and state is done.

        """
        return self.arrived and self.done

    @property
    def STALLED(self):
        """True if the node has stalled in the current state.

        This is true when STATE == TARGET != REQUEST, which is
        typically the result of a jump transition while in managed
        mode.

        """
        return self.__get_eval('STALLED')

    def revive(self):
        """Re-request last requested state.

        The last requested state in this case is the one requested
        from this Node object.

        Useful for reviving stalled nodes, basically counteracting the
        stalling that is the effect of a jump transition while being
        in MANAGED mode.  See the 'STALLED' property.

        Does nothing if no request has been recorded by this object or
        if the node is not currently stalled.

        """
        if self.__request and self.STALLED:
            self.set_request(self.__request)

    def check_fault(self):
        """Return fault status of node.

        Runs a series of checks on the "management status" of the
        node, and returns True if any of the following checks fail:
         * node still alive and running (OP is 'EXEC')
         * node does not show ERROR status
         * REQUEST hasn't deviated from last set value
         * if node had been set MANAGED, it is still set, and MANAGER
           hasn't changed
         * node has no notifications (failure does not produce fault)

        Any failure of the above also produces a NOTIFICATION message.

        """
        fault = False

        def snotify(msg):
            # prefix every message with the node name
            notify("{}: {}".format(self.name, msg))

        # a pending notification is reported but is not a fault
        if self.NOTIFICATION:
            snotify("has notification")

        if self.ERROR:
            snotify("ERROR!")
            fault = True

        if self.OP != 'EXEC':
            snotify("not EXEC")
            fault = True

        # only meaningful if a request was recorded by this object
        if self.__request:
            request = self.REQUEST
            if request != self.__request:
                snotify("REQUEST CHANGED (was: %s, now: %s)" % (self.__request, request))
                fault = True

        # only meaningful if this object set the node to be managed
        if self.__managed:
            if not self.managed:
                snotify("NOT IN MANAGED MODE")
                fault = True

            # if we're not listed as manager
            elif not self.i_manage:
                snotify("STOLEN (by: %s)" % self.MANAGER)
                fault = True

        return fault

    @functools.wraps(node_checker)
    def checker(self, fail_return=None):
        return node_checker(self, fail_return=fail_return)


349
class NodeManager(object):
    """Manager interface to a set of subordinate Guardian nodes.

    This should be instantiated with a list of node names to be
    managed.  Node objects are instantiated for each node.

    >>> nodes = NodeManager(['SUS_ITMX','SUS_ETMX'])
    >>> nodes.init()                   # initialize (handled automatically in daemon)
    >>> nodes.set_managed()            # set all nodes to be in MANAGED mode
    >>> nodes['SUS_ETMX'] = 'ALIGNED'  # request state of node
    >>> nodes['SUS_ITMX'] = 'ALIGNED'  # request state of node
    >>> nodes.arrived                  # True if all nodes have arrived at their
                                       # requested states
    >>> nodes.check_fault()            # Check for management-related "faults" in all nodes

    """
    def __init__(self, nodes):
        """Create a Node object for every name in `nodes`.

        The Node objects are stored in the `nodes` dictionary, keyed
        by node name.

        """
        self.nodes = {}
        for node in nodes:
            self.nodes[node] = Node(node)

    def __repr__(self):
        # list() gives the same output as keys() did on Python 2 and
        # avoids the "dict_keys([...])" wrapper on Python 3.
        return "%s(%r)" % (self.__class__.__name__, list(self.nodes))

    def __str__(self):
        return "<%s %s>" % (self.__class__.__name__, list(self.nodes))

    def __getitem__(self, node):
        """Retrieve Node object for named node."""
        return self.nodes[node]

    def __contains__(self, node):
        """True if manager contains node.

        `node` may be either a Node object (matched by its name) or a
        node name.

        """
        if isinstance(node, Node):
            return node.name in self.nodes
        return node in self.nodes

    def __setitem__(self, node, state):
        """Request state for named node.

        Raises NodeError, with the node name prepended to the message,
        if the node rejects the request.

        """
        try:
            self.nodes[node].set_request(state)
        except NodeError as e:
            raise NodeError("%s: %s" % (node, e))

    def __iter__(self):
        """Iterator of node objects."""
        # dict.itervalues() only exists on Python 2; values() works on
        # both Python 2 and Python 3.
        return iter(self.nodes.values())

    def _select_nodes(self, nodes=None):
        """Return the Node objects named in `nodes`, or all if empty/None."""
        if nodes:
            return [self[name] for name in nodes]
        return self

    def init(self):
        """Initialize all nodes.

        Under normal circumstances, i.e. in a running guardian daemon,
        node initialization is handled automatically.  This function
        therefore does not need to be executed in user code.

        """
        for node in self:
            node.init()

    def set_managed(self, nodes=None):
        """Set all nodes to be managed by this manager.

        `nodes` can be a list of node names to set managed.

        """
        for node in self._select_nodes(nodes):
            node.set_managed()

    def release(self, nodes=None):
        """Release all nodes from management by this manager.

        `nodes` can be a list of node names to release.

        """
        for node in self._select_nodes(nodes):
            node.release()

    @property
    def arrived(self):
        """Return True if all nodes have arrived at their requested state."""
        return all(node.arrived for node in self)

    @property
    def completed(self):
        """Return True if all nodes are arrived and done."""
        return all(node.completed for node in self)

    def get_stalled_nodes(self):
        """Return a list of all stalled nodes."""
        return [node for node in self if node.STALLED]

    def revive_all(self):
        """Revive all stalled nodes."""
        for node in self:
            if node.STALLED:
                log("Reviving stalled node: %s" % node.name)
                node.revive()

    def not_ok(self):
        """Return set of node names not currently reporting OK status."""
        return {node.name for node in self if not node.OK}

    def check_fault(self):
        """Check fault status of all nodes.

        Runs check_fault() method for all nodes.  Returns True if any
        nodes are in fault.

        """
        # Deliberately no short-circuit: every node's check_fault() is
        # run so that each faulty node emits its own notifications.
        faults = [node.check_fault() for node in self]
        return any(faults)

    @functools.wraps(node_checker)
    def checker(self, fail_return=None):
        return node_checker(self, fail_return=fail_return)