Many small updates and fixes:
Stephen Soltesz [Wed, 13 Apr 2011 19:31:43 +0000 (19:31 +0000)]
better logging in plc.py

14 files changed:
Monitor.spec
commands/bootman.py
commands/checksync.py
commands/nodebad.py
commands/policy.py
commands/shconfig.py
config.d/init-bootman-sequence.py
cron.d/copy-logs.sh [deleted file]
monitor/bootman.py
monitor/common.py
monitor/generic.py
monitor/wrapper/plc.py
monitor/wrapper/plccache.py
web/MonitorWeb/monitorweb/static/images/favicon.ico

index 61fe0f1..32ecb44 100644 (file)
@@ -350,10 +350,12 @@ chkconfig --add monitor
 chkconfig monitor on
 
 %post runlevelagent
-chkconfig --add monitor-runlevelagent
-chkconfig monitor-runlevelagent on
-if [ "$PL_BOOTCD" != "1" ] ; then
-       service monitor-runlevelagent restart
+if [ -f /etc/planetlab/node_id ] ; then
+    chkconfig --add monitor-runlevelagent
+    chkconfig monitor-runlevelagent on
+    if [ "$PL_BOOTCD" != "1" ] ; then
+        service monitor-runlevelagent restart
+    fi
 fi
 
 
index 347199d..930c8fc 100755 (executable)
@@ -13,6 +13,7 @@ import traceback
 import subprocess
 from sets import Set
 from monitor.bootman import *
+from monitor.util import file 
 
 # MAIN -------------------------------------------------------------------
 
@@ -41,7 +42,7 @@ def main():
        config = parsermodule.parse_args(parser)
 
        if config.nodelist:
-               nodes = config.getListFromFile(config.nodelist)
+               nodes = file.getListFromFile(config.nodelist)
        elif config.node:
                nodes = [ config.node ]
        else:
index d92d60f..494f5f7 100755 (executable)
@@ -20,7 +20,7 @@ if True:
 
 
 
-if True:
+if False:
     fbquery = HistoryNodeRecord.query.all()
     hostnames = [ n.hostname for n in fbquery ]
 
@@ -35,7 +35,7 @@ if True:
     session.flush()
 
 
-if True:
+if False:
     fbquery = HistoryPCURecord.query.all()
     pcus = [ n.plc_pcuid for n in fbquery ]
 
index dc86664..d1b2d35 100755 (executable)
@@ -6,9 +6,9 @@ import string
 import time
 from datetime import datetime,timedelta
 
-from monitor.query import verify,query_to_dict,node_select
 
 from monitor.common import *
+from monitor.query import verify,query_to_dict,node_select
 
 from monitor import config
 from monitor.wrapper import plc,plccache
@@ -23,164 +23,171 @@ api = plc.getAuthAPI()
 round = 1
 count = 0
 def main():
-       main2(config)
+    main2(config)
 
 def main2(config):
 
-       l_plcnodes = plccache.l_nodes
-       l_nodes = get_nodeset(config)
-       
-       checkAndRecordState(l_nodes, l_plcnodes)
+    l_plcnodes = plccache.l_nodes
+    l_nodes = get_nodeset(config)
+    
+    checkAndRecordState(l_nodes, l_plcnodes)
 
 # Node states:
 
 def check_node_state(rec, node):
 
-       node_state = rec.observed_status
-       if rec.plc_node_stats:
-               print rec.plc_node_stats
-               boot_state = rec.plc_node_stats['boot_state']
-               run_level = rec.plc_node_stats['run_level']
-               last_contact = rec.plc_node_stats['last_contact']
-               node.plc_nodeid = rec.plc_node_stats['node_id']
-       else:
-               boot_state = "unknown"
-               last_contact = None
-
-       if boot_state == 'disable': boot_state = 'disabled'
-       if boot_state == 'diag' or boot_state == 'diagnose': boot_state = 'safeboot'
-
-       if len(rec.plc_node_stats['pcu_ids']) > 0:
-               node.haspcu = True
-       else:
-               node.haspcu = False
-
-       node.firewall = rec.firewall
-       node.plc_siteid = rec.plc_node_stats['site_id']
-
-       # NOTE: 'DOWN' and 'DEBUG'  are temporary states, so only need
-       #                       'translations' into the node.status state
-       #               'BOOT' is a permanent state, but we want it to have a bit of
-       #                       hysteresis (less than 0.5 days)
-       #################################################################
-       # "Initialize" the findbad states into nodebad status if they are not already set
-
-       if node_state == 'DOWN':
-               if boot_state == 'disabled' and changed_lessthan(node.last_changed, 60) and \
-                       node.status != 'disabled':
-                       # NOTE: if changed less than 2 months, then we can allow this. 
-                       # otherwise, apply 'down' status after greater than 2 months (below).
-
-                       print "changed status from %s to %s" % (node.status, boot_state)
-                       node.status = boot_state
-                       node.last_changed = datetime.now()
-
-               if node.status not in ['offline', 'down', 'disabled']:
-                       print "changed status from %s to offline" % node.status
-                       node.status = 'offline'
-                       node.last_changed = datetime.now()
-
-       if node_state == 'DEBUG':
-               if boot_state != 'disabled' and boot_state != 'safeboot':
-                       print "changed status from %s to failboot" % (node.status)
-                       current_status = "failboot"
-               else:
-                       print "changed status from %s to %s" % (node.status, boot_state)
-                       current_status = boot_state
-
-               if current_status != node.status and \
-                       current_status in ['failboot', 'disabled', 'safeboot']:
-
-                       node.status = current_status
-                       node.last_changed = datetime.now()
-
-       if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
-               print "changed status from %s to online" % node.status
-               node.status = 'online'
-               node.last_changed = datetime.now()
-
-       #################################################################
-       # Switch temporary hystersis states into their 'firm' states.
-       #         online -> good                after half a day
-       #         offline -> down               after two days
-       #         failboot -> down  after 30 days
-       #         safeboot -> failboot after 60 days
-       #         disabled -> down              after 60 days
-
-       if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
-               print "changed status from %s to good" % node.status
-               node.status = 'good'
-               # NOTE: do not reset last_changed, or you lose how long it's been up.
-
-       if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
-               print "changed status from %s to down" % node.status
-               node.status = 'down'
-               # NOTE: do not reset last_changed, or you lose how long it's been down.
-
-       if node.status == 'failboot' and changed_greaterthan(node.last_changed, 30):
-               print "changed status from %s to down" % node.status
-               node.status = 'down'
-               # NOTE: do not reset last_changed, or you lose how long it's been down.
-
-       if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60):
-               print "changed status from %s to down" % node.status
-               # NOTE: change an admin mode back into failboot after two months.
-               node.status = 'failboot'
-               node.last_changed = datetime.now()
-
-       # extreme cases of offline nodes
-       if ( boot_state == 'disabled' or last_contact == None ) and \
-                       changed_greaterthan(node.last_changed, 2*30) and \
-                       node.status != 'down':
-               print "changed status from %s to down" % node.status
-               node.status = 'down'
-               node.last_changed = datetime.now()
+    node_state = rec.observed_status
+    if rec.plc_node_stats:
+        print rec.plc_node_stats
+        boot_state = rec.plc_node_stats['boot_state']
+        run_level = rec.plc_node_stats['run_level']
+        last_contact = rec.plc_node_stats['last_contact']
+        node.plc_nodeid = rec.plc_node_stats['node_id']
+    else:
+        boot_state = "unknown"
+        last_contact = None
+
+    if boot_state == 'disable': boot_state = 'disabled'
+    if boot_state == 'diag' or boot_state == 'diagnose': boot_state = 'safeboot'
+
+    if rec.plc_node_stats and len(rec.plc_node_stats['pcu_ids']) > 0:
+        node.haspcu = True
+    else:
+        node.haspcu = False
+
+    node.firewall = rec.firewall
+    node.plc_siteid = rec.plc_node_stats['site_id']
+
+    # NOTE: 'DOWN' and 'DEBUG'  are temporary states, so only need
+    #             'translations' into the node.status state
+    #        'BOOT' is a permanent state, but we want it to have a bit of
+    #            hysteresis (less than 0.5 days)
+    #################################################################
+    # "Initialize" the findbad states into nodebad status if they are not already set
+
+    if node_state == 'DOWN':
+        if boot_state == 'disabled' and changed_lessthan(node.last_changed, 60) and \
+            node.status != 'disabled':
+            # NOTE: if changed less than 2 months, then we can allow this. 
+            # otherwise, apply 'down' status after greater than 2 months (below).
+
+            print "changed status from %s to %s" % (node.status, boot_state)
+            node.status = boot_state
+            node.last_changed = datetime.now()
+
+        if node.status not in ['offline', 'down', 'disabled']:
+            print "changed status from %s to offline" % node.status
+            node.status = 'offline'
+            node.last_changed = datetime.now()
+
+    if node_state == 'DEBUG':
+        if boot_state != 'disabled' and boot_state != 'safeboot':
+            print "changed status from %s to failboot" % (node.status)
+            current_status = "failboot"
+        else:
+            print "changed status from %s to %s" % (node.status, boot_state)
+            current_status = boot_state
+
+        if current_status != node.status and \
+            current_status in ['failboot', 'disabled', 'safeboot']:
+
+            node.status = current_status
+            node.last_changed = datetime.now()
+
+    if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
+        print "changed status from %s to online" % node.status
+        node.status = 'online'
+        node.last_changed = datetime.now()
+
+    #################################################################
+    # Switch temporary hysteresis states into their 'firm' states.
+    #      online -> good        after half a day
+    #      offline -> down        after two days
+    #      failboot -> down  after 30 days
+    #      safeboot -> failboot after 60 days
+    #      disabled -> down        after 60 days
+
+    if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
+        print "changed status from %s to good" % node.status
+        node.status = 'good'
+        # NOTE: do not reset last_changed, or you lose how long it's been up.
+
+    if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
+        print "changed status from %s to down" % node.status
+        node.status = 'down'
+        # NOTE: do not reset last_changed, or you lose how long it's been down.
+
+    if node.status == 'failboot' and changed_greaterthan(node.last_changed, 30):
+        print "changed status from %s to down" % node.status
+        node.status = 'down'
+        # NOTE: do not reset last_changed, or you lose how long it's been down.
+
+    if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60):
+        print "changed status from %s to down" % node.status
+        # NOTE: change an admin mode back into failboot after two months.
+        node.status = 'failboot'
+        node.last_changed = datetime.now()
+
+    # extreme cases of offline nodes
+    if ( boot_state == 'disabled' or last_contact == None ) and \
+            changed_greaterthan(node.last_changed, 2*30) and \
+            node.status != 'down':
+        print "changed status from %s to down" % node.status
+        node.status = 'down'
+        node.last_changed = datetime.now()
 
 def checkAndRecordState(l_nodes, l_plcnodes):
-       global count
-
-       for nodename in l_nodes:
-
-               nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename, 
-                                                       if_new_set={'status' : 'offline', 
-                                                                               'last_changed' : datetime.now()})
-               nodehist.last_checked = datetime.now()
-
-               try:
-                       # Find the most recent record
-                       noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
-               except:
-                       print "COULD NOT FIND %s" % nodename
-                       import traceback
-                       email_exception()
-                       print traceback.print_exc()
-                       continue
-
-               if not noderec:
-                       print "none object for %s"% nodename
-                       continue
-
-               check_node_state(noderec, nodehist)
-
-               count += 1
-               print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
-
-       # NOTE: this commits all pending operations to the DB.  Do not remove. 
-       session.flush()
-
-       return True
+    global count
+
+    for nodename in l_nodes:
+
+        nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename, 
+                            if_new_set={'status' : 'offline', 
+                                        'last_changed' : datetime.now()})
+        nodehist.last_checked = datetime.now()
+
+        try:
+            # Find the most recent record
+            noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
+        except:
+            print "COULD NOT FIND %s" % nodename
+            import traceback
+            email_exception()
+            print traceback.print_exc()
+            continue
+
+        if not noderec:
+            print "none object for %s"% nodename
+            continue
+
+        try:
+            check_node_state(noderec, nodehist)
+        except:
+            print "check_node_state failed %s" % nodename
+            import traceback
+            email_exception(nodename)
+            print traceback.print_exc()
+            continue
+
+        count += 1
+        print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
+
+    # NOTE: this commits all pending operations to the DB.  Do not remove. 
+    session.flush()
+
+    return True
 
 if __name__ == '__main__':
-       from monitor import parser as parsermodule
-       parser = parsermodule.getParser(['nodesets'])
-       parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, cachenodes=False)
-       parser = parsermodule.getParser(['defaults'], parser)
-       config = parsermodule.parse_args(parser)
-
-       try:
-               main2(config)
-       except Exception, err:
-               import traceback
-               print traceback.print_exc()
-               print "Exception: %s" % err
-               sys.exit(0)
+    from monitor import parser as parsermodule
+    parser = parsermodule.getParser(['nodesets'])
+    parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, cachenodes=False)
+    parser = parsermodule.getParser(['defaults'], parser)
+    config = parsermodule.parse_args(parser)
+
+    try:
+        main2(config)
+    except Exception, err:
+        import traceback
+        print traceback.print_exc()
+        print "Exception: %s" % err
+        sys.exit(0)
index 992e578..30b522a 100755 (executable)
@@ -78,12 +78,13 @@ def main(hostnames, sitenames):
        node_count = 1
        site_count = 1
        #print "hosts: %s" % hostnames
+       print "apply-policy"
        for i,host in enumerate(hostnames):
                try:
                        lb = plccache.plcdb_hn2lb[host]
                except:
                        print "unknown host in plcdb_hn2lb %s" % host
-                       email_exception(host)
+                       email_exception("%s %s" % (i,host))
                        continue
 
                nodeblack = BlacklistRecord.get_by(hostname=host)
@@ -105,7 +106,7 @@ def main(hostnames, sitenames):
                        not found_within(recent_actions, 'online_notice', 0.5):
                                # NOTE: chronicly flapping nodes will not get 'online' notices
                                #               since, they are never up long enough to be 'good'.
-                           # NOTE: searching for down_notice proves that the node has
+                               # NOTE: searching for down_notice proves that the node has
                                #               gone through a 'down' state first, rather than just
                                #               flapping through: good, offline, online, ...
                                #       
@@ -139,7 +140,7 @@ def main(hostnames, sitenames):
 
                                sitehist.attemptReboot(host)
                                print "send message for host %s try_reboot" % host
-                               if not fbpcu.test_is_ok() and \
+                               if False and not fbpcu.test_is_ok() and \
                                        not found_within(recent_actions, 'pcuerror_notice', 3.0):
 
                                        args = {}
@@ -159,7 +160,7 @@ def main(hostnames, sitenames):
 
                # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
                #               will be false for a day after the above condition is satisfied
-               if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
+               if False and nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
                        changed_greaterthan(nodehist.last_changed,1.5) and \
                        not nodehist.firewall and \
                        found_between(recent_actions, 'try_reboot', 3.5, 1) and \
@@ -198,11 +199,11 @@ def main(hostnames, sitenames):
                                        sitehist.sendMessage('down_notice', hostname=host)
                                        print "send message for host %s down" % host
 
-                               if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
+                               #if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
                                        # send down node notice
                                        #email_exception(host, "firewall_notice")
-                                       sitehist.sendMessage('firewall_notice', hostname=host)
-                                       print "send message for host %s down" % host
+                               #       sitehist.sendMessage('firewall_notice', hostname=host)
+                               #       print "send message for host %s down" % host
 
                node_count = node_count + 1
                print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
index ba2f5e5..0c599ab 100755 (executable)
@@ -4,5 +4,5 @@ from monitor import config
 
 for attr in dir(config):
        val = config.__getattribute__(attr)
-       if attr[0].isupper() and attr[1].isupper():
+       if (attr[0].isupper() and attr[1].isupper()) or ('email' in attr):
                print '%s="%s" ' % (attr, val)
index 59e0e8b..f261693 100755 (executable)
@@ -29,6 +29,7 @@ def getSequences():
                                "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
                                "bminit-cfg-auth-getplc-implementerror-update-debug-done",
                                "bminit-cfg-auth-authfail2-protoerror2-debug-done",
+                "bminit-cfg-auth-protoerror-protoerror2-exception-debug-validate-done",
                                ]:
                        sequences.update({n : "restart_bootmanager_boot"})
 
@@ -62,6 +63,7 @@ def getSequences():
                                "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-exception-missingkernel-debug-validate-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-nospace-debug-validate-done",
+                "bminit-cfg-auth-getplc-update-installinit-validate-netcfg-disk-update4-update3-rebuildinitrd-update3-implementerror-nospace-debug-validate-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-nospace-nospace-nospace-nospace-nospace-nospace-nospace-nospace-implementerror-nospace-debug-validate-done",
                                ]:
                        sequences.update({n : "restart_bootmanager_rins"})
diff --git a/cron.d/copy-logs.sh b/cron.d/copy-logs.sh
deleted file mode 100755 (executable)
index 5c13a00..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-cd /usr/share/monitor
-source agent.sh &> /dev/null
-
-rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/php.log /var/lib/monitor/httpd-log
-rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/httpd/*-* /var/lib/monitor/httpd-log
-rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/httpd/*error* /var/lib/monitor/httpd-log
-
-rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/*-filesystem* /var/lib/monitor/filesystem
-rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/*-checkrpm* /var/lib/monitor/checkrpm
-
-rsync -qv -az -e ssh root@amber.cs.princeton.edu:/vservers/db-current/var/log/*-filesystem* /var/lib/monitor/filesystem
-rsync -qv -az -e ssh root@amber.cs.princeton.edu:/vservers/db-current/var/log/*-checkrpm* /var/lib/monitor/checkrpm
-
-rsync -qv -az -e ssh root@janine.cs.princeton.edu:/vservers/boot-current/var/log/*-filesystem* /var/lib/monitor/filesystem
-rsync -qv -az -e ssh root@janine.cs.princeton.edu:/vservers/boot-current/var/log/*-checkrpm* /var/lib/monitor/checkrpm
-rsync -qv -az -e ssh root@janine.cs.princeton.edu:/vservers/boot-current/var/log/bm/ /var/lib/monitor/bmlogs/
index eac2761..2070e00 100755 (executable)
@@ -291,7 +291,7 @@ class PlanetLabSession:
 
                # COPY Rpyc files to host
                #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
-               cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
+               cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/monitor/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
                if self.verbose: print cmd
                print cmd
                # TODO: Add timeout
@@ -449,6 +449,7 @@ class DebugInterface:
 
        def getDiskSteps(self):
                steps = [
+                       ('scsierror2' , 'sd \d:\d:\d:\d: ioctl_internal_command return code = \d+'),
                        ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
                        ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
                        ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'),
index 2eb2bb7..5cf8151 100644 (file)
@@ -282,3 +282,14 @@ def found_within(recent_actions, action_type, within):
        print "%s NOT found_within %s in recent_actions" % (action_type, timedelta(within) )
        return False
        
+
+class Time:
+    @classmethod
+    def dt_to_ts(cls, dt):
+        t = time.mktime(dt.timetuple())
+        return t
+
+    @classmethod
+    def ts_to_dt(cls, ts):
+        d = datetime.fromtimestamp(ts)
+        return d
index 657c865..c1680d2 100644 (file)
@@ -38,6 +38,7 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes):
        lb2hn = {}
        dsn = {}
        hn2lb = {}
+       exclude = []
        for id in id2lb:
                if id2lb[id] not in lb2hn:
                        lb2hn[id2lb[id]] = []
@@ -48,6 +49,7 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes):
                        login_base = id2lb[node['site_id']]
                else:
                        print >>sys.stderr, "%s has a foreign site_id %s" % (node['hostname'], node['site_id'])
+                       exclude.append(node['hostname'])
                        continue
                        for i in id2lb:
                                print i, " ", id2lb[i]
@@ -66,7 +68,7 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes):
                dsn[login_base][hostname]['monitor'] = {}
 
                hn2lb[hostname] = login_base
-       return (dsn, hn2lb, lb2hn)
+       return (dsn, hn2lb, lb2hn, exclude)
 
 
 class Time:
index 97200d9..00632bf 100644 (file)
@@ -28,6 +28,21 @@ except:
        # NOTE: this host is used by default when there are no auth files.
        XMLRPC_SERVER="https://boot.planet-lab.org/PLCAPI/"
 
+global_log_api = True
+logging.basicConfig(level=logging.DEBUG,
+                    format='%(asctime)s %(levelname)s %(name)s : %(message)s',
+                    datefmt='%s %Y-%m-%dT%H:%M:%S',
+                    filename='/usr/share/monitor/myops-api-log.log',
+                    filemode='a')
+apilog = logging.getLogger("api")
+
+def log_api_call(name, *params):
+    logstr = "%s(" %name
+    for x in params:
+        logstr += "%s," % x
+    logstr = logstr[:-1] + ")"
+    if global_log_api: apilog.debug(logstr)
+
 logger = logging.getLogger("monitor")
        
 class Auth:
@@ -75,7 +90,11 @@ class PLC:
                        raise AssertionError("method does not exist")
 
                try:
-                       return lambda *params : method(self.auth, *params)
+                       def call_method(aut, *params):
+                               if global_log_api: log_api_call(name, *params)
+                               return method(aut, *params)
+                       return lambda *params : call_method(self.auth, *params)
+                       #return lambda *params : method(self.auth, *params)
                except xmlrpclib.ProtocolError:
                        traceback.print_exc()
                        global_error_count += 1
@@ -361,7 +380,7 @@ def suspendSiteSlices(loginbase):
                try:
                        if not debug:
                            if not isSliceExempt(slice):
-                                   api.AddSliceAttribute(auth.auth, slice, "enabled", "0")
+                                   api.AddSliceTag(auth.auth, slice, "enabled", "0")
                except Exception, exc:
                        logger.info("suspendSlices:  %s" % exc)
 
@@ -389,11 +408,11 @@ def enableSiteSlices(loginbase):
                                if len(slice_list) == 0:
                                        return
                                slice_id = slice_list[0]['slice_id']
-                               l_attr = api.GetSliceAttributes(auth.auth, {'slice_id': slice_id}, None)
+                               l_attr = api.GetSliceTags(auth.auth, {'slice_id': slice_id}, None)
                                for attr in l_attr:
-                                       if "enabled" == attr['name'] and attr['value'] == "0":
+                                       if "enabled" == attr['tagname'] and attr['value'] == "0":
                                                logger.info("Deleted enable=0 attribute from slice %s" % slice)
-                                               api.DeleteSliceAttribute(auth.auth, attr['slice_attribute_id'])
+                                               api.DeleteSliceTag(auth.auth, attr['slice_tag_id'])
                except Exception, exc:
                        logger.info("enableSiteSlices: %s" % exc)
                        print "exception: %s" % exc
@@ -411,7 +430,7 @@ def enableSlices(nodename):
 #      api = xmlrpclib.Server(auth.server, verbose=False)
 #      for slice in  slices(siteId(nodename)):
 #              logger.info("Suspending slice %s" % slice)
-#              api.SliceAttributeAdd(auth.auth, slice, "plc_slice_state", {"state" : "suspended"})
+#              api.SliceTagAdd(auth.auth, slice, "plc_slice_state", {"state" : "suspended"})
 #
 def enableSiteSliceCreation(loginbase):
        if isPendingSite(loginbase):
@@ -427,7 +446,8 @@ def enableSiteSliceCreation(loginbase):
                        site = api.GetSites(auth.auth, loginbase)[0]
                        if site['enabled'] == False:
                                logger.info("\tcalling UpdateSite(%s, enabled=True)" % loginbase)
-                               api.UpdateSite(auth.auth, loginbase, {'enabled': True})
+                               if not isSiteExempt(loginbase):
+                                       api.UpdateSite(auth.auth, loginbase, {'enabled': True})
        except Exception, exc:
                print "ERROR: enableSiteSliceCreation:  %s" % exc
                logger.info("ERROR: enableSiteSliceCreation:  %s" % exc)
@@ -444,9 +464,9 @@ def areSlicesEnabled(site):
                        return None
                for slice in slice_list:
                        slice_id = slice['slice_id']
-                       l_attr = api.GetSliceAttributes({'slice_id': slice_id})
+                       l_attr = api.GetSliceTags({'slice_id': slice_id})
                        for attr in l_attr:
-                               if "enabled" == attr['name'] and attr['value'] == "0":
+                               if "enabled" == attr['tagname'] and attr['value'] == "0":
                                        return False
 
        except Exception, exc:
index 60dbd22..4778a7d 100755 (executable)
@@ -5,9 +5,9 @@ from monitor.wrapper import plc
 from monitor.generic import *
 from monitor.database.info.model import *
 from monitor import database
+from monitor import config
 import profile
 
-
 l_sites = None
 l_nodes = None
 l_pcus = None
@@ -16,7 +16,7 @@ plcdb_hn2lb = None
 plcdb_lb2hn = None
 plcdb_id2lb = None
 
-class CachedPLC(PLC):
+class CachedPLC(plc.PLC):
 
        def _param_to_str(self, name, *params):
                fields = len(params)
@@ -98,11 +98,13 @@ def init():
        print >>sys.stderr, "building id2lb"
        (d_sites,id2lb) = dsites_from_lsites_id(l_sites)
        print >>sys.stderr, "building lb2hn"
-       (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
+       (plcdb, hn2lb, lb2hn, exclude) = dsn_from_dsln(d_sites, id2lb, l_nodes)
 
        plcdb_hn2lb = hn2lb
        plcdb_lb2hn = lb2hn
        plcdb_id2lb = id2lb
+
+       l_nodes = filter(lambda x: x['hostname'] not in exclude, l_nodes)
        
        return
 
@@ -146,6 +148,13 @@ def deleteExtra(l_plc, objectClass=PlcSite, dbKey='loginbase', plcKey='login_bas
                dbobj = objectClass.get_by(**{dbKey : obj})
                dbobj.delete()
 
+def conv(s):
+    # strip non-ascii characters to prevent errors
+    r = s
+    if type(s) in (str,unicode):
+        r = "".join([x for x in s if ord(x) < 128])
+    return r
+
 def sync():
        l_sites = plc.api.GetSites({'peer_id':None}, 
                                                ['login_base', 'site_id', 'abbreviated_name', 'latitude', 
@@ -172,8 +181,8 @@ def sync():
                dbpcu = PlcPCU2.findby_or_create(pcu_id=pcu['pcu_id'])
                dbpcu.date_checked = datetime.now()
                for key in pcu.keys():
-                       print >>sys.stderr, "setting %s  = %s" % (key, pcu[key])
-                       setattr(dbpcu, key, pcu[key])
+                       print >>sys.stderr, "setting %s  = %s" % (key, conv(pcu[key]))
+                       setattr(dbpcu, key, conv(pcu[key]))
 
        deleteExtra(l_pcus, PlcPCU2, 'pcu_id', 'pcu_id')
        deleteExtra(l_pcus, HistoryPCURecord, 'plc_pcuid', 'pcu_id')
index 332557b..eb03967 100644 (file)
Binary files a/web/MonitorWeb/monitorweb/static/images/favicon.ico and b/web/MonitorWeb/monitorweb/static/images/favicon.ico differ