added comonquery command-line tool.
Stephen Soltesz [Tue, 2 Jun 2009 21:30:28 +0000 (21:30 +0000)]
added session flush() and clear() calls at the beginning of each web entry
point in controllers.py; this should help address the IntegrityErrors seen
here and at PLE.
moved plccache imports into local function scope to speed up invocation of
some commands (e.g., nodequery).
added several tags to bootman.py to help with new 3.0 BootManager issue.
moved the bootman import in monitor/database/info/interface.py into function
scope due to an import error; this still needs to be investigated.
added extra RPM checks to the node environment checks.  The results are not
stored in the db, but the log files can be queried over time.

comonquery.py [new file with mode: 0755]
monitor/bootman.py
monitor/common.py
monitor/database/info/interface.py
monitor/model.py
monitor/scanapi.py
monitor/wrapper/plccache.py
nodequery.py
nodesets.py
web/MonitorWeb/monitorweb/controllers.py

diff --git a/comonquery.py b/comonquery.py
new file mode 100755 (executable)
index 0000000..72e5d13
--- /dev/null
@@ -0,0 +1,152 @@
+#!/usr/bin/python
+
+
+import sys
+from monitor import database
+from monitor.common import *
+from monitor.model import Record
+import glob
+import os
+import traceback
+
+import time
+import re
+import string
+
+from monitor.wrapper import plc
+api = plc.getAuthAPI()
+
+from monitor.util import file
+from monitor import config
+
+from monitor.sources import comon
+
+default_fields="name,resptime,sshstatus,date,uptime,lastcotop,cpuspeed,memsize,disksize"
+
+class NoKeyException(Exception): pass
+
+def daysdown_print_nodeinfo(co_nodeinfo, hostname):
+       co_nodeinfo['hostname'] = hostname
+       co_nodeinfo['daysdown'] = Record.getStrDaysDown(co_nodeinfo)
+       co_nodeinfo['intdaysdown'] = Record.getDaysDown(co_nodeinfo)
+
+       print "%(intdaysdown)5s %(hostname)-44s | %(state)10.10s | %(daysdown)s" % co_nodeinfo
+
+def co_print_nodeinfo(co_nodeinfo, hostname, fields=None):
+       
+       # co_nodeinfo['bootstate'] : unknown pattern
+       co_nodeinfo['name'] = hostname
+
+       if 'uptime' in co_nodeinfo and co_nodeinfo['uptime'] != "null":
+               co_nodeinfo['uptime'] = diff_time(time.time()-float(co_nodeinfo['uptime']))
+
+       if 'date' in co_nodeinfo and co_nodeinfo['date'] != "null":
+               co_nodeinfo['date'] = diff_time(float(co_nodeinfo['date']))
+
+       if fields == default_fields.split(','):
+
+               print "%(name)-40s %(sshstatus)5.5s %(resptime)6.6s %(lastcotop)6.6s %(uptime)s" % co_nodeinfo
+       else:
+               format = ""
+               for f in fields:
+                       format += "%%(%s)s " % f
+               print format % co_nodeinfo
+
+def main():
+
+       from monitor import parser as parsermodule
+       parser = parsermodule.getParser()
+
+       parser.set_defaults(node=None, 
+                               select=None, 
+                               list=None, 
+                               dns=False,
+                               listkeys=False,
+                               pcuselect=None, 
+                               nodelist=None, 
+                               daysdown=None, 
+                               fields=default_fields)
+       parser.add_option("", "--daysdown", dest="daysdown", action="store_true",
+                                               help="List the node state and days down...")
+
+       parser.add_option("", "--select", dest="select", metavar="key=value", 
+                                               help="List all nodes with the given key=value pattern")
+       parser.add_option("", "--fields", dest="fields", metavar="key,list,...", 
+                                               help="a list of keys to display for each entry.")
+       parser.add_option("", "--list", dest="list", action="store_true", 
+                                               help="Write only the hostnames as output.")
+       parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt", 
+                                               help="A list of nodes to bring out of debug mode.")
+       parser.add_option("", "--listkeys", dest="listkeys", action="store_true",
+                                               help="A list of nodes to bring out of debug mode.")
+
+       parser.add_option("", "--dns", dest="dns", action="store_true",
+                                               help="A convenience query for dns values")
+
+       parser = parsermodule.getParser(['defaults'], parser)
+       config = parsermodule.parse_args(parser)
+       
+       #if config.fromtime:
+       #       fb = None
+       #else:
+       #       fb = None
+
+       # lastcotop measures whether cotop is actually running.  this is a better
+       # metric than sshstatus, or other values from CoMon
+
+       COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
+                                       "table=table_nodeview&formatcsv"
+       if config.dns:
+               config.fields = "name,dns1udp,dns1tcp,dns2udp,dns2tcp"
+               config.select = "dns1udp>0||dns1tcp>0||dns2udp>0||dns2tcp>0"
+
+       if config.fields == "all":
+               cotop_url = COMON_COTOPURL
+       else:
+               cotop_url = COMON_COTOPURL + "&dumpcols='%s'" % config.fields
+
+       if config.select:
+               cotop_url = cotop_url + "&select='%s'" % config.select
+
+       if config.listkeys:
+               cotop_url = COMON_COTOPURL + "&limit=1"
+
+       cotop = comon.Comon()
+       cohash = cotop.coget(cotop_url)
+
+       if config.nodelist:
+               nodelist = file.getListFromFile(config.nodelist)
+       else:
+               # NOTE: list of nodes should come from comon query.   
+               nodelist = cohash.keys()
+
+       print "%(name)-40s %(sshstatus)5.5s %(resptime)6.6s %(lastcotop)6.6s %(uptime)s" % {
+                                       'name' : 'hostname', 
+                                       'sshstatus' : 'sshstatus', 
+                                       'resptime' : 'resptime', 
+                                       'lastcotop' : 'lastcotop', 
+                                       'uptime' : 'uptime'}
+       for node in nodelist:
+               config.node = node
+
+               if node not in cohash: continue
+
+               co_nodeinfo = cohash[node]
+
+               if config.listkeys:
+                       print "Primary keys available in the comon object:"
+                       for key in co_nodeinfo.keys():
+                               print "\t",key
+                       sys.exit(0)
+                       
+               if config.list:
+                       print node
+               else:
+                       if config.daysdown:
+                               daysdown_print_nodeinfo(co_nodeinfo, node)
+                       else:
+                               fields = config.fields.split(",")
+                               co_print_nodeinfo(co_nodeinfo, node, fields)
+               
+if __name__ == "__main__":
+       main()
index 531f883..4693315 100755 (executable)
@@ -430,11 +430,16 @@ class DebugInterface:
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
                                ]:
                        sequences.update({n : "restart_bootmanager_rins"})
 
                # repair_node_keys
-               sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
+               for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
+                                       "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
+                               ]:
+                       sequences.update({n: "repair_node_keys"})
 
                #   conn.restart_node('reinstall')
                for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
@@ -459,6 +464,7 @@ class DebugInterface:
                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                                 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
+                                "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
                                 ]:
                        sequences.update({n: "restart_node_boot"})
 
@@ -748,7 +754,8 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                        if conn.compare_and_repair_nodekeys():
                                # the keys either are in sync or were forced in sync.
                                # so try to reboot the node again.
-                               conn.restart_bootmanager('reinstall')
+                               # TODO: why was this originally 'reinstall' instead of 'boot'??
+                               conn.restart_bootmanager('boot')
                                pass
                        else:
                                # there was some failure to synchronize the keys.
index 9878d52..da174d8 100644 (file)
@@ -4,7 +4,7 @@ import struct
 from monitor import reboot
 from monitor import util
 from monitor import database
-from monitor.wrapper import plc, plccache
+from monitor.wrapper import plc
 
 from datetime import datetime, timedelta
 from monitor.model import Message
@@ -187,6 +187,7 @@ def get_nodeset(config):
                Given the config values passed in, return the set of hostnames that it
                evaluates to.
        """
+       from monitor.wrapper import plccache
        api = plc.getAuthAPI()
        l_nodes = plccache.l_nodes
 
index 47c7553..7a41b89 100644 (file)
@@ -1,4 +1,3 @@
-from monitor import bootman            # debug nodes
 
 from monitor import reboot
 from monitor.common import *
@@ -162,6 +161,7 @@ class SiteInterface(HistorySiteRecord):
                self.db.message_status = "new"
 
        def runBootManager(self, hostname):
+               from monitor import bootman
                print "attempting BM reboot of %s" % hostname
                ret = ""
                try:
index 2f2f5e3..5d0fc05 100755 (executable)
@@ -2,7 +2,7 @@
 
 from monitor import database
 
-from monitor.wrapper import plc, plccache
+from monitor.wrapper import plc
 from monitor.wrapper import mailer
 import time
 
@@ -413,6 +413,7 @@ class Target:
 class Record(object):
 
        def __init__(self, hostname, data):
+               from monitor.wrapper import plccache
                self.hostname = hostname
                self.data = data
                self.plcdb_hn2lb = plccache.plcdb_hn2lb
index 667c504..35f24ac 100644 (file)
@@ -212,6 +212,7 @@ class ScanNodeInternal(ScanInterface):
                                                echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
                                                echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
                                                echo '  "rpm_version":"'`rpm -q NodeManager`'",'
+                                               echo '  "rpm_versions":"'`rpm -q -a`'",'
                                                echo "}"
 EOF                            """)
                                        
@@ -227,6 +228,7 @@ EOF                         """)
                                                                                'fs_status' : '',
                                                                                'dns_status' : '',
                                                                                'rpm_version' : '',
+                                                                               'rpm_versions' : '',
                                                                                'princeton_comon_dir' : "", 
                                                                                'princeton_comon_running' : "", 
                                                                                'princeton_comon_procs' : "", 'ssh_portused' : None})
@@ -234,6 +236,8 @@ EOF                         """)
                                print traceback.print_exc()
                                sys.exit(1)
 
+                       print "ALLVERSIONS: %s %s" % (nodename, values['rpm_versions'])
+
                        print "RPMVERSION: %s %s" % (nodename, values['rpm_version'])
                        ### RUN SSH ######################
                        b_getbootcd_id = True
index f92fa85..fea4c72 100755 (executable)
@@ -62,6 +62,9 @@ plcdb_lb2hn = None
 plcdb_id2lb = None
 
 def init():
+       import traceback
+       print "IMPORTING PLCCACHE: ",
+       traceback.print_stack()
        global l_sites
        global l_nodes
        global l_pcus
index e9001a6..738e58d 100755 (executable)
@@ -13,11 +13,11 @@ import time
 import re
 import string
 
-from monitor.wrapper import plc, plccache
+from monitor.wrapper import plc
 api = plc.getAuthAPI()
 
-from monitor.database.info.model import FindbadNodeRecord, FindbadPCURecord, session
-from monitor import util
+from monitor.database.info.model import HistoryNodeRecord, FindbadNodeRecord, FindbadPCURecord, session
+from monitor.util import file as utilfile
 from monitor import config
 
 
@@ -383,13 +383,12 @@ def main():
                fb = None
 
        if config.nodelist:
-               nodelist = util.file.getListFromFile(config.nodelist)
+               nodelist = utilfile.getListFromFile(config.nodelist)
        else:
                # NOTE: list of nodes should come from findbad db.   Otherwise, we
                # don't know for sure that there's a record in the db..
-               plcnodes = plccache.l_nodes
-               nodelist = [ node['hostname'] for node in plcnodes ]
-               #nodelist = ['planetlab-1.cs.princeton.edu']
+               fbquery = HistoryNodeRecord.query.all()
+               nodelist = [ n.hostname for n in fbquery ]
 
        pculist = None
        if config.select is not None and config.pcuselect is not None:
index ea69d6b..6461dfb 100755 (executable)
@@ -3,8 +3,8 @@
 import sys
 import os
 from sets import Set
-import parser as parsermodule
-import util.file
+from monitor import parser as parsermodule
+from monitor.util import file
 
 def main():
        parser = parsermodule.getParser()
@@ -17,8 +17,8 @@ def main():
        f1 = config.args[0]
        f2 = config.args[1]
 
-       s1 = util.file.getListFromFile(f1)
-       s2 = util.file.getListFromFile(f2)
+       s1 = file.getListFromFile(f1)
+       s2 = file.getListFromFile(f2)
 
        s = nodesets(config.operation, s1, s2)
 
index 9bdb912..e2fb9bd 100644 (file)
@@ -15,6 +15,7 @@ from monitor_xmlrpc import MonitorXmlrpcServer
 
 from monitor import reboot
 from monitor import scanapi
+import time
 
 from monitor.wrapper.plccache import plcdb_id2lb as site_id2lb
 from monitor.wrapper.plccache import plcdb_hn2lb as site_hn2lb
@@ -155,7 +156,6 @@ def prep_node_for_display(node):
 class Root(controllers.RootController, MonitorXmlrpcServer):
        @expose(template="monitorweb.templates.welcome")
        def index(self):
-               import time
                # log.debug("Happy TurboGears Controller Responding For Duty")
                flash("Your application is now running")
                return dict(now=time.ctime())
@@ -173,7 +173,10 @@ class Root(controllers.RootController, MonitorXmlrpcServer):
 
        @expose(template="monitorweb.templates.nodelist")
        def node(self, filter='boot'):
-               import time
+               print "NODE------------------"
+               print "befor-len: ", len( [ i for i in session] )
+               session.flush(); session.clear()
+               print "after-len: ", len( [ i for i in session] )
                fbquery = FindbadNodeRecord.get_all_latest()
                query = []
                filtercount = {'down' : 0, 'boot': 0, 'debug' : 0, 'diagnose' : 0, 'disabled': 0, 
@@ -428,7 +431,10 @@ class Root(controllers.RootController, MonitorXmlrpcServer):
 
        @expose(template="monitorweb.templates.pculist")
        def pcu(self, filter='all'):
-               import time
+               print "PCUVIEW------------------"
+               print "befor-len: ", len( [ i for i in session] )
+               session.flush(); session.clear()
+               print "after-len: ", len( [ i for i in session] )
                fbquery = FindbadPCURecord.get_all_latest()
                query = []
                filtercount = {'ok' : 0, 'NetDown': 0, 'Not_Run' : 0, 'pending' : 0, 'all' : 0}
@@ -475,6 +481,10 @@ class Root(controllers.RootController, MonitorXmlrpcServer):
 
        @expose(template="monitorweb.templates.sitelist")
        def site(self, filter='all'):
+               print "SITE------------------"
+               print "befor-len: ", len( [ i for i in session] )
+               session.flush(); session.clear()
+               print "after-len: ", len( [ i for i in session] )
                filtercount = {'good' : 0, 'down': 0, 'online':0, 'offline' : 0, 'new' : 0, 'pending' : 0, 'all' : 0}
                fbquery = HistorySiteRecord.query.all()
                query = []