diff --git a/config/settings/base.py b/config/settings/base.py index c1d374608d37d2f3641b53dc7d6a9b577d1a8de4..a0337b4b1b2d62afc0a0faf6733549bd1ed7a299 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -150,6 +150,10 @@ SEND_MATTERMOST_ALERTS = False # IGWN_ALERT_GROUP environment variable. DEFAULT_IGWN_ALERT_GROUP = 'lvalert-dev' +# overseer timeout: +OVERSEER_TIMEOUT = float(get_from_env('IGWN_ALERT_OVERSEER_TIMEOUT', + fail_if_not_found=False, default_value=0.1)) + # Use LVAlert Overseer? USE_LVALERT_OVERSEER = True # For each LVAlert server, a separate instance of LVAlert Overseer diff --git a/docker/supervisord-igwn-alert-overseer.conf b/docker/supervisord-igwn-alert-overseer.conf index bcafa51bb295c33494d5cdc82175cf6b3b75e4a8..09e6fe4f712ac2c541b5485a1c8e63e2e9a1e0be 100644 --- a/docker/supervisord-igwn-alert-overseer.conf +++ b/docker/supervisord-igwn-alert-overseer.conf @@ -3,7 +3,7 @@ autostart=%(ENV_ENABLE_IGWN_OVERSEER)s command=igwn_alert_overseer -a %(ENV_IGWN_ALERT_USER)s -b %(ENV_IGWN_ALERT_PASSWORD)s -s %(ENV_IGWN_ALERT_SERVER)s -p %(ENV_IGWN_ALERT_OVERSEER_PORT)s -g %(ENV_IGWN_ALERT_GROUP)s - -l - -e - -q - -c + -l - -e - -q - -c -f -i %(ENV_IGWN_ALERT_FLUSH_INTERVAL)s user=gracedb group=www-data stdout_logfile=/dev/stdout diff --git a/gracedb/alerts/lvalert.py b/gracedb/alerts/lvalert.py index f840aa564231cc4a6bc46c713b199541d97febde..a40453b09582175afe18f0faf3518fdd9d0833c2 100644 --- a/gracedb/alerts/lvalert.py +++ b/gracedb/alerts/lvalert.py @@ -1,19 +1,28 @@ import logging -from multiprocessing import Process import os - +import asyncio +import datetime +import json +import time +import functools from django.conf import settings from igwn_alert import client from igwn_alert_overseer.overseer.overseer_client import overseer_client from tornado.ioloop import IOLoop -import asyncio -import json - +from tornado.iostream import StreamClosedError +from asyncio.exceptions import InvalidStateError # Set up logger logger = 
logging.getLogger(__name__) +# Local alias for settings.OVERSEER_TIMEOUT (seconds) +OVERSEER_TIMEOUT = settings.OVERSEER_TIMEOUT + +def timeout_and_stop(io_loop): + logger.critical(f'Overseer IO Loop timed out after {OVERSEER_TIMEOUT} seconds.') + io_loop.stop() + def send_with_lvalert_overseer(node_name, message, port): @@ -30,11 +39,44 @@ def send_with_lvalert_overseer(node_name, message, port): # Format message. FIXME maybe move this step into the overseer client? msg_dict = json.dumps(msg_dict) - # Start IOLoop: - asyncio.set_event_loop(asyncio.new_event_loop()) - resp = client.send_to_overseer(msg_dict, logger) - IOLoop.instance().start() - rdict = json.loads(resp.result()) + alert_loop = asyncio.new_event_loop() + try: + asyncio.set_event_loop(alert_loop) + + # Start the async request to push the message to the overseer, and + # await the success/failure response. + resp = client.send_to_overseer(msg_dict, logger) + + # Start the async I/O loop within the current thread + io_loop = IOLoop.instance() + + # Construct a callable that passes io_loop as an argument + overseer_timeout = functools.partial(timeout_and_stop, io_loop) + + # Add a timeout for the scenario where the overseer server isn't + # running or responding. This shouldn't actually happen, but hey. + io_loop.add_timeout(time.time() + OVERSEER_TIMEOUT, overseer_timeout) + + # Start the I/O loop + io_loop.start() + + # Interpret the response + rdict = json.loads(resp.result()) + + # Two scenarios here: the overseer client code gives a StreamClosedError + # when the I/O loop was stopped after it timed out. I think the + # InvalidStateError came as a result of prior implementation of this logic, + # so I don't think it would occur again... but if it does it still represents + # an invalid response from the overseer, so the alert should be sent again. 
+ except (StreamClosedError, InvalidStateError) as e: + # close the loop and free up the port: + alert_loop.close() + + # return false and then attempt to send with the client code. + return False + finally: + # close the loop and free up the port: + alert_loop.close() # Return a boolean indicating whether the message was sent # successfully or not diff --git a/requirements.txt b/requirements.txt index 38614bf6bcce3daf26fc0e5f5bf048837b0618d0..c1a615538f12b868c330aeb645840b4ce779933f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,10 +21,10 @@ flake8==3.9.2 gpstime==0.8.1 gssapi==1.8.2 gunicorn[gthread]==21.2.0 -hop-client==0.8.0 +hop-client==0.9.0 html5lib==1.1.0 -igwn-alert==0.4.0 -igwn-alert-overseer==0.6.1 +igwn-alert==0.5.0 +igwn-alert-overseer==0.7.0 ipdb==0.13.13 ipython==8.14.0 #jwt==1.3.1