[859] | 1 | #!/usr/bin/env python |
---|
| 2 | # -*- coding: ISO-8859-1 -*- |
---|
| 3 | |
---|
| 4 | ################################## |
---|
| 5 | # @program smon |
---|
| 6 | # @description simulation monitor |
---|
| 7 | # @copyright Copyright "(c)2009 Centre National de la Recherche Scientifique CNRS. |
---|
| 8 | # All Rights Reserved" |
---|
| 9 | # @svn_file $Id: analyzer 2545 2013-02-01 09:58:10Z jripsl $ |
---|
| 10 | # @version $Rev: 2545 $ |
---|
| 11 | # @lastrevision $Date: 2013-02-01 10:58:10 +0100 (Fri, 01 Feb 2013) $ |
---|
| 12 | # @license CeCILL (http://dods.ipsl.jussieu.fr/jripsl/smon/LICENSE) |
---|
| 13 | ################################## |
---|
| 14 | |
---|
| 15 | import sys |
---|
| 16 | import signal |
---|
| 17 | import traceback |
---|
| 18 | import smtplib |
---|
| 19 | from email.mime.text import MIMEText |
---|
[879] | 20 | import time |
---|
| 21 | import datetime |
---|
[859] | 22 | |
---|
| 23 | |
---|
# Add the directory containing the "smon" package to the module search path
# so that the "smon.*" imports that follow can be resolved.
# NOTE(review): this is a hard-coded, developer-specific absolute path; unless
# "smon" is importable some other way, those imports will fail on hosts where
# this directory does not exist -- confirm the deployment layout.
sys.path.append("/home/jripsl/snapshot/Monitoring")
---|
[859] | 26 | |
---|
[876] | 27 | import smon.repo_io as repo_io |
---|
| 28 | import smon.types as types |
---|
[859] | 29 | |
---|
| 30 | |
---|
| 31 | |
---|
class CheckList():
    """Checks that are run periodically against the running simulations."""

    # Maximum age (in seconds) of a simulation's last message before the
    # simulation is considered to have stopped sending its heartbeat.
    max_time_between_msg=20

    @classmethod
    def msg_timeout(cls,message):
        """Tell whether `message` is older than `max_time_between_msg`.

        The creation date is read from `message.crea_date`, converted to a
        string and parsed as local time in the "YYYY-MM-DD HH:MM:SS[.ffffff]"
        layout.

        Returns True when the message age (now - creation time, in seconds)
        is strictly greater than `cls.max_time_between_msg`, False otherwise.

        Raises ValueError if the creation date does not match the layout
        above.
        """
        raw_date = str(message.crea_date)

        # str() of a datetime drops the fractional part when microsecond is 0,
        # so the ".%f" directive must only be used when a fraction is present
        # (otherwise strptime raises ValueError).
        if "." in raw_date:
            date_format = "%Y-%m-%d %H:%M:%S.%f"
        else:
            date_format = "%Y-%m-%d %H:%M:%S"

        # struct_time has no sub-second field: the age is computed with a
        # one-second resolution on the message side.
        msg_epoch = time.mktime(time.strptime(raw_date, date_format))

        return (time.time() - msg_epoch) > cls.max_time_between_msg

    @classmethod
    def C0001(cls):
        """Check the heartbeat of every running simulation.

        The heartbeat is currently implemented using simulation progress
        messages. If progress messages suddenly stop, it is likely that the
        simulation was killed or that a segfault occurred. In such a case, the
        other components (failover, prodiguer GUI..) are informed by changing
        the simulation status to "error".
        """
        for simulation in repo_io.get_running_simulations():

            print("\nchecking heartbeat ('%s')" % simulation.name)

            try:
                message = repo_io.retrieve_last_message(simulation)
            except types.MessageNotFoundException:
                # We are in the interval where a new simulation has just been
                # inserted but its first message has not been inserted yet:
                # nothing to check for now.
                print("no message found for simulation ('%s')" % simulation.name)
                continue

            if cls.msg_timeout(message):
                # Heartbeat lost: flag the simulation and persist the new status.
                simulation.status = "error"
                repo_io.update_simulation_status(simulation)

                print("heartbeat NOK - simulation status set to 'error'\n")
            else:
                print("heartbeat OK\n")
---|
[876] | 99 | |
---|
class Analyzer():
    """Analyzer process: runs the simulation checks in an endless loop."""

    @classmethod
    def start(cls):
        """Initialise the repository layer, then enter the main loop."""
        repo_io.init() # open DB connection

        # Dispatch through cls (not the hard-coded class name) so that a
        # subclass overriding main() is honoured.
        cls.main()

    @classmethod
    def stop(cls):
        """Release the repository layer."""
        repo_io.free() # close DB connection

    @classmethod
    def main(cls):
        """Run the heartbeat check forever, sleeping 3 seconds between passes.

        This method never returns normally; it only ends through an exception
        (including SystemExit raised from a signal handler).
        """

        # Disabled start-up steps, kept for reference:
        #
        #   # parse args
        #   parser = argparse.ArgumentParser(prog='analyzer')
        #   parser.add_argument('-v', dest='verbose',required=False,action='store_true')
        #   args = parser.parse_args()
        #
        #   # check
        #   if not os.path.exists(SMON.smon_home):
        #       sys.exit(1)
        #
        #   SMON.init_singleton()

        print(' [*] Analyzer running. To exit press CTRL+C')

        while True:
            CheckList.C0001()

            time.sleep(3)

        # Disabled shutdown step, kept for reference (unreachable after the
        # endless loop above):
        #
        #   SMON.free_singleton()
---|
| 142 | |
---|
def signal_handler(signal, frame):
    """SIGINT handler: report the interruption, stop the analyzer and exit.

    Both arguments are supplied by the signal machinery and are not used.
    """
    print('You pressed Ctrl+C!')

    # Shut the analyzer down before terminating the process with status 0.
    Analyzer.stop()
    sys.exit(0)
---|
| 149 | |
---|
if __name__ == '__main__':

    # Route Ctrl+C to signal_handler before the analyzer starts.
    signal.signal(signal.SIGINT, signal_handler)

    try:
        Analyzer.start()
    except Exception:
        # Any unexpected failure: dump the traceback and exit with status 1.
        traceback.print_exc()
        sys.exit(1)
    else:
        sys.exit(0)
---|