Stephen Soltesz [Fri, 20 Nov 2009 22:36:17 +0000 (22:36 +0000)]
added templating to the Google gadget XML file in monitor-server; previously it
was hard-coded to monitor.planet-lab.org; now PLE can have their own Google
gadget.
added policy to close tickets if all nodes & pcus at a site are ok, to prevent
some leaking tickets.

comonquery.py
cron.d/copy-logs.sh
monitor-server.init
monitor/common.py
monitor/wrapper/mailer.py
policy.py
todo [deleted file]
web/MonitorWeb/monitorweb/static/xml/gadget.xml.in [moved from web/MonitorWeb/monitorweb/static/xml/gadget.xml with 65% similarity]

index 72e5d13..db0bafe 100755 (executable)
@@ -94,7 +94,7 @@ def main():
        # lastcotop measures whether cotop is actually running.  this is a better
        # metric than sshstatus, or other values from CoMon
 
-       COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
+       COMON_COTOPURL= "http://comon.cs.princeton.edu/status/tabulator.cgi?" + \
                                        "table=table_nodeview&formatcsv"
        if config.dns:
                config.fields = "name,dns1udp,dns1tcp,dns2udp,dns2tcp"
index 61754b5..5c13a00 100755 (executable)
@@ -3,6 +3,7 @@
 cd /usr/share/monitor
 source agent.sh &> /dev/null
 
+rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/php.log /var/lib/monitor/httpd-log
 rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/httpd/*-* /var/lib/monitor/httpd-log
 rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/httpd/*error* /var/lib/monitor/httpd-log
 
index 8c26416..424c362 100644 (file)
@@ -18,6 +18,8 @@
 local_config=/etc/planetlab/configs/site.xml
 
 MONITORPATH=/usr/share/monitor
+WEB_ROOT_PATH=web/MonitorWeb/monitorweb
+WEB_XML_PATH=static/xml
 
 # Be verbose
 set -x
@@ -40,6 +42,39 @@ if [ -z "$PLC_MONITOR_IP" ] ; then
        PLC_MONITOR_IP=$( gethostbyname $PLC_MONITOR_HOST )
 fi
 
+function update_config ()
+{
+       # Replace every match of the sed regex $1 with the literal text $2,
+       # editing the file $3 in place.
+       local pattern=$1
+       local with=$2
+       local file=$3
+       local escaped
+
+       # '\', '/' and '&' are special in a sed replacement; escape them so a
+       # value such as a PLC name containing '/' or '&' is inserted verbatim
+       # instead of breaking (or silently altering) the substitution.
+       escaped=$(printf '%s' "$with" | sed -e 's|[\\/&]|\\&|g')
+
+       sed -i -e "s/$pattern/$escaped/g" "$file"
+}
+function apply_template ()
+{
+       # Copy the template $1 to $2, substituting the PLC_NAME,
+       # PLC_WWW_HOSTNAME and MONITOR_HOSTNAME placeholders with the values of
+       # $PLC_NAME, $PLC_WWW_HOST and $PLC_MONITOR_HOST.
+       local TEMPLATE=$1
+       local DESTFILE=$2
+       local tmp_file
+
+       # substitute in a scratch copy so DESTFILE is written only once, after
+       # all placeholders have been replaced.
+       tmp_file=$(mktemp) || return 1
+       cp "$TEMPLATE" "$tmp_file"
+
+       # quote every value: an empty or multi-word setting must remain a single
+       # argument, otherwise update_config receives the file name in the wrong
+       # position.
+       update_config PLC_NAME "$PLC_NAME" "$tmp_file"
+       update_config PLC_WWW_HOSTNAME "$PLC_WWW_HOST" "$tmp_file"
+       update_config MONITOR_HOSTNAME "$PLC_MONITOR_HOST" "$tmp_file"
+
+       # NOTE(review): mktemp creates the scratch file readable by its owner
+       # only; when DESTFILE does not exist yet, cp may carry that mode over.
+       # Confirm the web server can still read the generated file.
+       cp "$tmp_file" "$DESTFILE"
+       rm -f "$tmp_file"
+}
+
+function check_gadget_config ()
+{
+       # Regenerate each file in the gadget XML directory from its '*.in'
+       # template whenever the template is newer than the generated file
+       # (bash's -nt is also true when the generated file does not exist).
+       local xml_dir=$MONITORPATH/$WEB_ROOT_PATH/$WEB_XML_PATH
+       local input_file output_file
+
+       for input_file in "$xml_dir"/*.in ; do
+               # an unmatched glob leaves the literal pattern behind; skip it.
+               [ -e "$input_file" ] || continue
+
+               # strip the literal '.in' suffix to get the generated file name.
+               output_file=$xml_dir/$(basename "$input_file" .in)
+               if [ "$input_file" -nt "$output_file" ] ; then
+                       apply_template "$input_file" "$output_file"
+               fi
+       done
+}
+
 function check_monitor_schema_and_data() 
 {
        # NOTE: call create_all() to setup the database from the info model.
@@ -150,6 +185,7 @@ function create_httpd_conf ()
 
 # NOTE: redirect path without trailing '/' to path with.  Favor SSL.
 Redirect /monitor https://${PLC_MONITOR_HOST}:${PLC_WWW_SSL_PORT}/monitor/
+#RedirectMatch ^/$ https://${PLC_MONITOR_HOST}:${PLC_WWW_SSL_PORT}/monitor
 
 # NOTE: this directive strips '/monitor/' from the requested path and pastes
 #       the remaining part to the end of the ProxyPass url below.  All TG urls
@@ -201,6 +237,8 @@ case "$1" in
                # WRITE default /etc/monitor.conf
                check_monitor_conf
 
+               check_gadget_config
+
                if [ -n "$WROTE_PG_CONFIG" ] ; then
                        # NOTE: restart db to enable access by users granted above.
                        service plc restart postgresql
index 05a4ec2..850d36b 100644 (file)
@@ -43,12 +43,17 @@ def get_current_state(fbnode):
        return l
 
 def color_pcu_state(fbnode):
+       if fbnode['plc_pcuid'] is None:
+               return 'NOPCU'
+       else:
+               return 'PCU'
 
        if 'plcnode' in fbnode and 'pcu_ids' in fbnode['plcnode'] and len(fbnode['plcnode']['pcu_ids']) > 0 :
                values = reboot.get_pcu_values(fbnode['plcnode']['pcu_ids'][0])
                if values == None:
                        return fbnode['pcu']
        else:
+               print fbnode.keys()
                if 'pcu' not in fbnode:
                        return 'NOPCU'
                else:
index 1b45f0e..9f22c96 100755 (executable)
@@ -65,7 +65,11 @@ def getTicketStatus(ticket_id):
                r_values[key] = ":".join(vals[1:])
                r_values[key] = r_values[key].strip()
 
-       r_values['Created'] = calendar.timegm(time.strptime(r_values['Created']))
+       if 'Created' in r_values:
+               r_values['Created'] = calendar.timegm(time.strptime(r_values['Created']))
+       else:
+               r_values['Created'] = calendar.timegm(time.localtime())
+               
        #r_values['Told'] = calendar.timegm(time.strptime(r_values['Told']))
        return r_values
 
@@ -339,12 +343,7 @@ def email(subject, text, to):
                for mta in [MTA, 'golf.cs.princeton.edu']:
                        try:
                                # This is normal operation
-                               #print MTA
-                               #print FROM
-                               #print to
-                               #print msg
                                server = smtplib.SMTP(mta)
-                               #server = smtplib.SMTP('golf.cs.princeton.edu')
                                server.sendmail(FROM, to,  msg)
                                if config.bcc and not config.debug:
                                        server.sendmail(FROM, config.email,  msg)
@@ -361,17 +360,10 @@ def email(subject, text, to):
                        except Exception, err:
                                print "Mailer error2: failed using MTA(%s) with: %s" % (mta, err)
        else:
-               #print "Would mail %s" %to
                logger.debug("Would send mail to %s" % to)
 
 if __name__=="__main__":
        import smtplib
        import emailTxt
        import plc 
-       #email("[spam] bcc test from golf.cs.princeton.edu", 
-       #         "It gets to both recipients", 
-       #         "soltesz@cs.utk.edu")
        emailViaRT("mail via RT", "Let's see if this succeeds...", [FROM])
-       #email("Re: [PL #21323] TEST 7", 
-       #                  mailtxt.newbootcd_one[1] % {'hostname_list':"hostname list..."},
-       #                  [FROM])
index cdd311c..992e578 100755 (executable)
--- a/policy.py
+++ b/policy.py
@@ -21,6 +21,7 @@ from optparse import OptionParser
 from monitor import config
 from monitor import parser as parsermodule
 from monitor.common import *
+from monitor.const import MINUP
 from monitor.model import *
 from monitor.wrapper import plc
 from monitor.wrapper import plccache
@@ -36,6 +37,41 @@ def logic():
        plc.nodeBootState(host, 'reinstall')
        node_end_record(host)
 
+def check_node_and_pcu_status_for(loginbase):
+       """
+               Check whether all the nodes, and their associated pcus, for a
+               given site are considered 'good'.
+
+               loginbase -- site key used to index plccache.plcdb_lb2hn.
+
+               A node counts as ok when its history record has status 'good',
+               its pcu history record is either absent or also 'good', and it
+               has no blacklist entry that is still unexpired.
+
+               Returns True only if every node is ok and the number of nodes
+               checked is greater than MINUP.  Otherwise returns False,
+               including when the site has no nodes at all.
+       """
+
+       results = []
+       for node in plccache.plcdb_lb2hn[loginbase]:
+               hostname = node['hostname']
+
+               noderec  = FindbadNodeRecord.findby_or_create(hostname=hostname)
+               nodehist = HistoryNodeRecord.findby_or_create(hostname=hostname)
+               nodebl   = BlacklistRecord.get_by(hostname=hostname)
+               pcuhist  = HistoryPCURecord.get_by(plc_pcuid=noderec.plc_pcuid)
+
+               # NOTE: evaluated lazily, left to right: the blacklist entry is
+               #               only consulted once the node and pcu are known good.
+               node_ok = (nodehist is not None and nodehist.status == 'good'
+                       and (pcuhist is None or pcuhist.status == 'good')
+                       and (nodebl is None or nodebl.expired()))
+               results.append(bool(node_ok))
+
+       print "test: %s" % results
+       # NOTE: a site with no nodes must never count as 'all good'; test for
+       #               it explicitly rather than relying on an exception handler.
+       if not results:
+               return False
+       return all(results) and len(results) > MINUP
+
 def main(hostnames, sitenames):
        # commands:
        i = 1
@@ -231,7 +267,17 @@ def main(hostnames, sitenames):
                                sitehist.closeTicket()
 
                                print "send message for site %s penalty cleared" % site
-
+                               
+                       # check all nodes and pcus for this site; if they're all ok,
+                       #               close the ticket, else leave it open.
+                       # NOTE: in the case where a PCU reboots and fails, a message is
+                       #               sent, but the PCU may appear to be ok according to tests.
+                       # NOTE: Also, bootmanager sends messages regarding disks,
+                       #               configuration, etc.  So, the conditions here are 'good'
+                       #               rather than 'not down' as it is in sitebad.
+                       close_ticket = check_node_and_pcu_status_for(site)
+                       if close_ticket:
+                               sitehist.closeTicket()
 
                site_count = site_count + 1
 
diff --git a/todo b/todo
deleted file mode 100644 (file)
index f69785f..0000000
--- a/todo
+++ /dev/null
@@ -1,196 +0,0 @@
-Structure:
-
-monitor module
-       plc wrapper
-       util functions
-       pkl database access
-       database models
-       third-party data sources
-
-pcucontrol
-       maps types to code
-       reboot.py
-       interface.py
-
-       transport:
-               pyssh 
-               ssh
-               telnetlib
-       models:
-               hpilo cmds
-               intelamt cmds
-               racadm cmd
-               ipmitool cmd
-
-web
-       cgi scripts
-       tgweb
-               project...
-
-cmds
-       py scripts
-       node
-       site
-       pcu
-       query
-       grouprins
-
-bootman
-       rpyc
-       
-
-
-
-
-###############################
-for each node:
-       Check Status ->
-               if Pass Threshold -> 
-                       Create Issue -> 
-                               Take Action -> 
-                                       email
-                                       bm
-                                       pcu
-                                       plc reset
-                                       apply penalties
-                                       flag for admin
-
-for each issue
-       check issue.status
-       if issue.status is "open": 
-               issue.take_next_action()
-       if issue.closed:
-               issue.shutdown()
-       if issue.paused:
-               pass
-
-action_list for issuetype (pcudown)
-       send email
-               yield
-       send email, apply penalty
-               yield
-       send email, apply second penalty
-               yield
-       send email
-
-action_list for issuetype (badhardware)
-action_list for issuetype (dnserror)
-action_list for issuetype (nodeconfig)
-action_list for issuetype (oldbootcd)
-
-action_list for issuetype (nodedown)
-       if pcuok, reboot
-               yield
-       if pcuok, and reboot failed, set rins, reboot
-               yield
-       create_issue pcubroken
-       send email
-               yield
-       send email, apply penalty
-               yield
-       send email, apppy second penalty
-               yield
-       send email
-       
-
-TOOLS:
-  * add a '--nocache'  to the default set of options.
-  * add a cache parameter in the monitor.conf file.
-
-
-
-TODO:
- * install openssh-server, passwd, perl-libwww-perl (for rt), rt-3.4.1,  MySQL-python
-       * had to mount -t devpts devpts /dev/pts to get ssh to work inside the
-         chroot. also, disable the pam modules in /etc/pam.d/sshd
-
- * blue
- * auto configuration for php configuration.  
-       maybe run translation of monitor.conf before loading monitorconfig.php?
- * blue2
-
- * A setup script of some kind would be nice that walked through : 
-    - writing monitorconfig.py
-       - creation of monitorconfig.php
-       - run syncplcdb.py
-       - testapi.py
-       - findbad.py on sample site.
-       - nodebad.py
-       - findbadpcus.py
-       - nodequery.py
-       - nodegroups.py
-       - loads webpage for those retreived values to confirm setup succeeded.
-
- * reimplement the config.py / .config mechanism.  I'd like for many commands
-   to share very similar argument or argument sets, as well as have some
-   common config options.  I'm not sure the best way to do this.
-    
-        - features of config.py
-               * parse arguments and return an object with attributes equal to the
-                 parser values.
-               * maintain values consistently across modules at run time.
-               * have default values that are not specified at each run time.
-               * easy to import and use
-
-        - config module is available via 'import config' or as returned by
-                 parsermodule.parse_args()
-     - python supports load-once modules, so subsequent imports refer to the
-          same module object.
-          
- * have package pull in threadpool from easy_install
-
- * place PKL files in a real database
-
- * clean up plc.py; there's a lot of redundent code.
-
- * figure out python paths for user commands.
-   - directories for pickle files.
-   - add user in rpm install
-   - user permissions for data files for day-to-day operations.
-
- * fix BayTechCtrlCUnibe expect script.
-
- * separate modules into different, logical categories, and create a python
-   module as part of the install:
-               command line, 
-               configuration, 
-               policy, 
-               data model, 
-               data access,
-               object interfaces.
-
-Lower priority:
- * Add a more structured, 'automate' library of scripts and means of making
-   batch calls, etc.
-
- * add a third package for user tools that will interact with the Monitor
-   service.  Mostly, I'm guessing this would be queries for the live status of
-   nodes and a more reliable 'reboot' and 'reinstall' mechanism than currently
-   availble with PLC.
-
-Done:
- * Find a better location to place and pull the PKL files currently in the pdb
-   directory.  Ultimately, these should be stored in a real DB.  Until then,
-   they should sit in a location that is accessible from the www scripts,
-   backend scripts, and user utilities.
- * nodebad loads plc_hn2lb unconditionally
- * nodeinfo loads act_all unconditionally
- * change findbad.py default db name
- * remove deps on www.printbadnodes
- * reboot.py loads findbadpcus unconditionally.
- * nodequery loads findbad unconditionally
- * unified_model loads findbad unconditionally
-
- * threadpool package.
- * build cmdamt with g++ prior to packaging
-
- * www/*.py need appropriate access to database.py,  config.py, monitorconfig.py, etc.
-       - need to convert monitor.conf into monitorconf.sh and monitorconf.php
-
- * pull out global configuration information from various files, like rt_db,
-   mailer.py,  auth.py, and any others.  Create a single configuration file
-   from which all others pull.
-
-   - convert plc and other files to use the new monitorconfig.py rather than
-     auth, or plc.*
-   - need to alter all import 'auth' statements.
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <Module>
-<ModulePrefs title="MyOps Summary" title_url="http://www.planet-lab.org">
+<ModulePrefs title="MyOps Summary PLC_NAME" title_url="http://PLC_WWW_HOSTNAME">
 <Require feature="dynamic-height"/>
 </ModulePrefs>
 <Content type="html"><![CDATA[
@@ -10,7 +10,7 @@ var displaycontent = function (responseText) {
        _gel('content_div').innerHTML = responseText; 
        _IG_AdjustIFrameHeight();
 };
-_IG_FetchContent('http://monitor.planet-lab.org/monitor/summary', displaycontent, { refreshInterval: 300 }); 
+_IG_FetchContent('http://MONITOR_HOSTNAME/monitor/summary', displaycontent, { refreshInterval: 300 }); 
 </script>
 ]]></Content>
 </Module>