#!/usr/bin/python # Copyright (C) 2004-2006, Christof Meerwald # http://cmeerw.org # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; version 2 dated June, 1991. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 # USA import apsw, bisect, errno, getopt, random, socket, string, sys, time import traceback, types import parseconf class CONF: conffile = 'downtime.conf' config = [] grace_period = 0 notify = '' verbose = 0 SOCKET_CONNECTTIMEOUT = 200 SOCKET_TIMEOUT = 60 class FailureScope: HOST = 0 SERVICE = 1 DOMAIN = 2 def __init__(self, scope): self._scope = scope def __eq__(self, obj): if isinstance(obj, FailureScope): return self._scope == obj._scope return self._scope == obj def __ne__(self, obj): return not (self == obj) class Failure: COMMUNICATION = 0 TEMPORARY = 1 PERMANENT = 2 def __init__(self, failure): self._failure = failure def __eq__(self, obj): if isinstance(obj, Failure): return self._failure == obj._failure return self._failure == obj def __ne__(self, obj): return not (self == obj) class Result: def __init__(self, failure, scope, msg, stats=None): self._failure = failure self._scope = scope self._msg = msg self._stats = stats def get_failure(self): return self._failure def get_scope(self): return self._scope def get_message(self): return self._msg def get_stats(self): return self._stats class MonitoringStatus: def __init__(self, target, t): self._target = target self._timestamp = t self._failure = None self._scope = None self._message = None self._alerted = False self._initial = True def get_failure(self): return self._failure def get_scope(self): return self._scope def get_timestamp(self): return self._timestamp def get_message(self): return self._message def get_target(self): return self._target def get_alerted(self): return self._alerted def get_initial(self): return self._initial def set_alerted(self): self._alerted = True def clear_alerted(self): self._alerted = False def clear_initial(self): self._initial = False def clear(self, t): self._failure, self._scope, self._message = None, None, None self._timestamp = t def set(self, failure, scope, t, message): self.clear_initial() self._failure, self._scope = failure, scope self._timestamp, self._message = t, message self._alerted = False def get_host(self): return self._target.get_host() def get_domain(self): return self._target.get_domain() def get_service_name(self): return self._target.get_service_name() status_host = {} def get_host_status(target, t): try: return status_host[target.get_host()] except KeyError: status = MonitoringStatus(target, t) status_host[host] = status return status status_domain = {} def get_domain_status(target, t): try: return status_domain[(target.get_host(), target.get_service_name(), target.get_domain())] except KeyError: status = MonitoringStatus(target, t) status_domain[(target.get_host(), target.get_service_name(), target.get_domain())] = status return status status_service = {} def get_service_status(target, t): try: return status_domain[(target.get_host(), target.get_service_name())] except KeyError: status = MonitoringStatus(target, t) status_domain[(target.get_host(), target.get_service_name())] = status return status class Cursor: def __init__(self, _db): self._txn = False self._cursor = _db.cursor() def __del__(self): if self._txn: self._cursor.execute('END') def begin(self): self._cursor.execute('BEGIN') self._txn = True def end(self): if self._txn: self._cursor.execute('END') self._txn = False def execute(self, stmt, bindings=None): if bindings == None: return self._cursor.execute(stmt) else: return self._cursor.execute(stmt, bindings) def next(self): return self._cursor.next() def transition_host_up(status): t = status.get_timestamp() host = status.get_host() if not status.get_initial(): print time.asctime(time.localtime(t)), 'Host %s is up again' % (host,) cursor = Cursor(db) cursor.execute('INSERT INTO state_transition (host, tstamp, state) VALUES (?, ?, 16364)', (host, t)) cursor.end() def alert_host_up(status): notify_up(status) def transition_host_down(status): t = status.get_timestamp() host = status.get_host() print time.asctime(time.localtime(t)), 'Host %s is unreachable' % (host,) cursor = Cursor(db) cursor.execute('INSERT INTO state_transition (host, tstamp, state) VALUES (?, ?, 0)', (host, t)) cursor.end() def alert_host_down(status): notify_down(status) def transition_service_up(status): t = status.get_timestamp() service_name, host = (status.get_service_name(), status.get_host()) if not status.get_initial(): print time.asctime(time.localtime(t)), 'Service %s on host %s is up again' % (service_name, host) cursor = Cursor(db) cursor.execute('INSERT INTO state_transition (host, service, tstamp, state) VALUES (?, ?, ?, 16384)', (host, service_name, t)) cursor.end() def alert_service_up(status): notify_up(status) def transition_service_down(status): t = status.get_timestamp() service_name, host = (status.get_service_name(), status.get_host()) print time.asctime(time.localtime(t)), 'Service %s on host %s is down' % (service_name, host) cursor = Cursor(db) cursor.execute('INSERT INTO state_transition (host, service, tstamp, state) VALUES (?, ?, ?, 0)', (host, service_name, t)) cursor.end() def alert_service_down(status): notify_down(status) def transition_domain_up(status): t = status.get_timestamp() service_name, host, domain = (status.get_service_name(), status.get_host(), status.get_domain()) if not status.get_initial(): print time.asctime(time.localtime(t)), 'Service %s for domain %s on host %s is up again' % (service_name, domain, host) cursor = Cursor(db) cursor.execute('INSERT INTO state_transition (host, service, domain, tstamp, state) VALUES (?, ?, ?, ?, 16384)', (host, service_name, domain, t)) cursor.end() def alert_domain_up(status): notify_up(status) def transition_domain_down(status): t = status.get_timestamp() service_name, host, domain = (status.get_service_name(), status.get_host(), status.get_domain()) print time.asctime(time.localtime(t)), 'Service %s for domain %s on host %s is down' % (service_name, domain, host) cursor = Cursor(db) cursor.execute('INSERT INTO state_transition (host, service, domain, tstamp, state) VALUES (?, ?, ?, ?, 16384)', (host, service_name, domain, t)) cursor.end() def alert_domain_down(status): notify_down(status) if hasattr(socket, 'setdefaulttimeout'): # Python >= 2.3 has native support for socket timeouts socket.setdefaulttimeout(CONF.SOCKET_CONNECTTIMEOUT) TimeoutException = socket.timeout else: # try to use timeoutsocket if it is available try: import timeoutsocket timeoutsocket.setDefaultSocketTimeout(CONF.SOCKET_CONNECTTIMEOUT) TimeoutException = timeoutsocket.Timeout except ImportError: class TimeoutException(Exception): pass def schedule(queue, item): t = item[0] + int(random.normalvariate(0, item[1].get_interval() / 30)) if CONF.verbose: print 'scheduling %s: domain=%s, host=%s for' % (item[1].get_service_name(), item[1].get_domain(), item[1].get_host()), time.asctime(time.localtime(t)) bisect.insort(queue, (t, item[1])) optlist, args = getopt.getopt(sys.argv[1:], 'c:g:n:hv', ['config=', 'grace=', 'grace-period=', 'notify=', 'verbose', 'help']) for opt, arg in optlist: if (opt == '-c') or (opt == '--config'): CONF.conffile = arg if (opt == '-g') or (opt == '--grace') or (opt == '--grace-period'): CONF.grace_period = string.atoi(arg) if (opt == '-n') or (opt == '--notify'): CONF.notify = arg if (opt == '-v') or (opt == '--verbose'): CONF.verbose += 1 if (opt == '-h') or (opt == '--help'): print """Syntax: %s [-c|--config=] [-g|--grace=] [-n|--notify=] [-v|--verbose]""" % (sys.argv[0],) sys.exit(0) if CONF.notify: import notify try: pos = CONF.notify.index(':') modname = CONF.notify[:pos] arg = CONF.notify[pos + 1:] except KeyError: modname = CONF.notify arg = '' __import__('notify.' + modname) module = getattr(notify, modname) module.CONF = CONF module.Failure = Failure module.FailureScope = FailureScope module.Result = Result init = getattr(module, 'init') init(arg) notify_up = getattr(module, 'notify_up') notify_down = getattr(module, 'notify_down') else: def notify_up(status): pass def notify_down(status): pass CONF.config = parseconf.parse(CONF.conffile) db = apsw.Connection('stats.db') db.setbusytimeout(10000) db.cursor().execute('PRAGMA synchronous=NORMAL') random.seed() queue = [] t = int(time.time()) import plugins for entry in CONF.config: modname = entry.get_service_name().lower() if not hasattr(plugins, modname): __import__('plugins.' + modname) module = getattr(plugins, modname) module.CONF = CONF module.Failure = Failure module.FailureScope = FailureScope module.Result = Result check_fn = getattr(plugins, modname).check entry.set_check_fn(check_fn) schedule(queue, (t + int(random.uniform(0, entry.get_interval())), entry)) while 1: t, entry = queue[0] service_name = entry.get_service_name() host = entry.get_host() domain = entry.get_domain() del queue[0] while time.time() < (t - 3): time.sleep(t - time.time()) if CONF.verbose: t = int(time.time()) print time.asctime(time.localtime(t)), '%s: checking domain=%s host=%s' % (service_name, entry.get_domain(), entry.get_host()) try: result = entry.get_check_fn()(entry) if isinstance(result, Result): failure = result.get_failure() scope = result.get_scope() message = result.get_message() stats = result.get_stats() elif type(result) == types.TupleType: failure, scope, message = result stats = None else: failure, scope, message, stats = None, None, None, None del result except socket.gaierror: failure = Failure(Failure.TEMPORARY) scope = FailureScope(FailureScope.HOST) message = 'Can\'t resolve "%s"' % (entry.get_host(),) except socket.error, e: if e.args[0] in [errno.ETIMEDOUT, errno.ENETDOWN, errno.ENETUNREACH, errno.EHOSTUNREACH]: failure = Failure(Failure.COMMUNICATION) scope = FailureScope(FailureScope.HOST) message = 'socket: ' + str(e) else: failure = Failure(Failure.PERMANENT) scope = FailureScope(FailureScope.SERVICE) message = 'socket: ' + str(e) except TimeoutException, e: failure = Failure(Failure.COMMUNICATION) scope = FailureScope(FailureScope.HOST) message = 'timeout: ' + str(e) except: exc_type, exc_value, exc_tb = sys.exc_info() failure = Failure(Failure.TEMPORARY) scope = FailureScope(FailureScope.SERVICE) message = 'Unknown exception caught\n' + ''.join(traceback.format_exception(exc_type, exc_value, exc_tb)) del exc_type, exc_value, exc_tb t = int(time.time()) suppress = False if failure != Failure.COMMUNICATION: status = get_host_status(entry, t) if status.get_failure() or status.get_initial(): status.clear(t) transition_host_up(status) status.clear_initial() if status.get_alerted(): alert_host_up(status) if failure == None: status = get_domain_status(entry, t) if status.get_failure() or status.get_initial(): status.clear(t) transition_domain_up(status) status.clear_initial() if status.get_alerted(): alert_domain_up(status) status = get_service_status(entry, t) if status.get_failure() or status.get_initial(): status.clear(t) transition_service_up(status) status.clear_initial() if status.get_alerted(): alert_service_up(status) elif failure == Failure.COMMUNICATION: suppress = True status = get_host_status(entry, t) if not status.get_failure(): status.set(failure, scope, t, message) transition_host_down(status) if not status.get_alerted() and (t - status.get_timestamp()) >= CONF.grace_period: status.set_alerted() alert_host_down(status) elif failure == Failure.TEMPORARY: pass elif failure == Failure.PERMANENT: if scope == FailureScope.SERVICE: status = get_service_status(entry, t) if not status.get_failure(): status.set(failure, scope, t, message) transition_service_down(status) else: suppress = True if not status.get_alerted(): status.set_alerted() alert_service_down(status) elif scope == FailureScope.DOMAIN: status = get_domain_status(entry, t) if not status.get_failure(): status.set(failure, scope, t, message) transition_domain_down(status) else: suppress = True if not status.get_alerted(): status.set_alerted() alert_domain_down(status) if failure != None and not suppress: print time.asctime(time.localtime(t)), '%s (domain=%s, host=%s): %s' % (service_name, entry.get_domain(), entry.get_host(), message) schedule(queue, (t + entry.get_interval(), entry))