Several updates to policy, repair, and automate script.
Stephen Soltesz [Sun, 22 May 2011 01:37:13 +0000 (01:37 +0000)]
Add a check for the number of sshkeys added to the current agent
Separate policy.py and repair.py for failboot nodes

automate-default.sh
commands/policy.py
commands/repair.py [new file with mode: 0755]
monitor/scanapi.py
statistics/functions.r

index a51144a..db53fd4 100755 (executable)
@@ -11,6 +11,16 @@ set -e
 DATE=`date +%Y-%m-%d-%T`
 MONITOR_PID="${MONITOR_SCRIPT_ROOT}/SKIP"
 
+function send_mail ()
+{
+    subject=$1
+    body=$2
+    mail -s "$subject" $exception_email <<EOF
+$body
+EOF
+}
+
+
 echo "#######################################"; echo "Running Monitor at $DATE"; echo "######################################"
 echo "Performing API test"
 API=$(${MONITOR_SCRIPT_ROOT}/tools/testapi.py)
@@ -62,6 +72,13 @@ fi
 #TODO: should add a call to ssh-add -l to check if the keys are loaded or not.
 source ${MONITOR_SCRIPT_ROOT}/agent.sh
 
+# CHECK AGENT IS UP AND RUNNING
+count=$( ssh-add -l | wc -l ) 
+if [ $count -lt 3 ] ; then
+    send_mail "ssh-agent is not up and running." "Add keys before monitoring can continue"
+       exit
+fi
+
 ${MONITOR_SCRIPT_ROOT}/commands/syncwithplc.py $DATE || :
 service plc restart monitor
 
index 7f8c5a2..392746f 100755 (executable)
@@ -182,16 +182,6 @@ def main(hostnames, sitenames):
                 sitehist.sendMessage('pcufailed_notice', hostname=host, pcu_name=pcu_name)
                 print "send message for host %s PCU Failure" % host
 
-        if nodehist.status == 'failboot' and \
-            changed_greaterthan(nodehist.last_changed, 0.25) and \
-            not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
-                # send down node notice
-                # delay 0.5 days before retrying...
-
-                print "send message for host %s bootmanager_restore" % host
-                sitehist.runBootManager(host)
-            #    sitehist.sendMessage('retry_bootman', hostname=host)
-
         if nodehist.status == 'down' and \
             changed_greaterthan(nodehist.last_changed, 2):
                 if not nodehist.firewall and not found_within(recent_actions, 'down_notice', 3.5):
diff --git a/commands/repair.py b/commands/repair.py
new file mode 100755 (executable)
index 0000000..1a0d8ab
--- /dev/null
@@ -0,0 +1,124 @@
+#!/usr/bin/python
+
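+# This script is used to manipulate the operational state of nodes in
+# different node groups.  These are basically set operations on nodes via the
+# PLC api.
+# 
+# Take the ng name as an argument....
+# optionally, 
+#  * get a list of nodes in the given nodegroup.
+#  * set some or all in the set to rins.
+#  * restart them all.
+#  * do something else to them all.
+# 
+# NOTE(review): the description above does not match the code in this file.
+# As written, main() only looks for nodes whose recorded status is
+# 'failboot' and calls sitehist.runBootManager() on them; no nodegroup
+# argument is read and the --rins / --reboot options are never consulted.
+# 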
+
+import os
+import time
+import traceback
+import sys
+from optparse import OptionParser
+
+from monitor import config
+from monitor import parser as parsermodule
+from monitor.common import *
+from monitor.const import MINUP
+from monitor.model import *
+from monitor.wrapper import plc
+from monitor.wrapper import plccache
+from monitor.database.info.model import *
+from monitor.database.info.interface import *
+
+from monitor.query import verify,query_to_dict,node_select
+
+def main(hostnames, config):
+    # commands:
+    i = 1
+    node_count = 1
+    print "failboot-repair"
+    for i,host in enumerate(hostnames):
+        try:
+            lb = plccache.plcdb_hn2lb[host]
+        except:
+            print "unknown host in plcdb_hn2lb %s" % host
+            email_exception("%s %s" % (i,host))
+            continue
+
+        nodeblack = BlacklistRecord.get_by(hostname=host)
+
+        if nodeblack and not nodeblack.expired():
+            print "skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire() )
+            continue
+
+        sitehist = SiteInterface.get_or_make(loginbase=lb)
+
+        recent_actions = sitehist.getRecentActions(hostname=host)
+
+        nodehist = HistoryNodeRecord.findby_or_create(hostname=host)
+
+        print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
+
+        if nodehist.status == 'failboot' and \
+            changed_greaterthan(nodehist.last_changed, 0.25) and \
+            ( not found_between(recent_actions, 'bootmanager_restore', 0.5, 0) \
+                       or config.force ):
+                # send down node notice
+                # delay 0.5 days before retrying...
+                print "send message for host %s bootmanager_restore" % host
+                sitehist.runBootManager(host)
+
+        node_count = node_count + 1
+        print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
+        sys.stdout.flush()
+        session.flush()
+
+    session.flush()
+    return
+
+
+if __name__ == "__main__":
+    parser = parsermodule.getParser(['nodesets'])
+    parser.set_defaults(rins=False,
+                        reboot=False,
+                        force=False, 
+                        nosetup=False, 
+                        verbose=False, 
+                        quiet=False,)
+
+    parser.add_option("", "--force", dest="force", action="store_true", 
+                        help="Force action regardless of previous actions/logs.")
+    parser.add_option("", "--rins", dest="rins", action="store_true", 
+                        help="Set the boot_state to 'rins' for all nodes.")
+    parser.add_option("", "--reboot", dest="reboot", action="store_true", 
+                        help="Actively try to reboot the nodes, keeping a log of actions.")
+
+    parser.add_option("", "--verbose", dest="verbose", action="store_true", 
+                        help="Extra debug output messages.")
+
+    parser = parsermodule.getParser(['defaults'], parser)
+    config = parsermodule.parse_args(parser)
+
+    fbquery = HistoryNodeRecord.query.all()
+    hostnames = [ n.hostname for n in fbquery ]
+    
+    if config.site:
+        # TODO: replace with calls to local db.  the api fails so often that
+        #         these calls should be regarded as unreliable.
+        l_nodes = plccache.GetNodesBySite(config.site)
+        filter_hostnames = [ n['hostname'] for n in l_nodes ]
+
+        hostnames = filter(lambda x: x in filter_hostnames, hostnames)
+
+    if config.node:
+        hostnames = [ config.node ] 
+
+    try:
+        main(hostnames, config)
+        session.flush()
+    except KeyboardInterrupt:
+        print "Killed by interrupt"
+        session.flush()
+        sys.exit(0)
+    except:
+        email_exception()
+        print traceback.print_exc();
+        print "fail all..."
index 84bb6e0..eef53aa 100644 (file)
@@ -184,10 +184,10 @@ class ScanNodeInternal(ScanInterface):
                #               commands at once.
                values = {}
                nmap = command.CMD()
-               print "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename
-               (oval1,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename)
-               (oval2,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename)
-               (oval3,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename)
+               print "nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename
+               (oval1,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename)
+               (oval2,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename)
+               (oval3,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep -v Down | grep Ports:" % nodename)
                # NOTE: an empty / error value for oval, will still work.
                values['port_status'] = {}
                (o1,continue_probe) = nmap_port_status(oval1)
@@ -249,8 +249,8 @@ class ScanNodeInternal(ScanInterface):
                                        echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
                                        echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
                                        echo '  "fs_status":"'`grep proc /proc/mounts | grep ro, ; if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 20 touch /var/log/monitor 2>&1 ; if [ -d /vservers/ ] ; then timeout.pl 20 touch /vservers/monitor.log 2>&1  ; fi ; fi`'",'
-                                       echo '  "rpm_version":"'`if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 30 rpm -q NodeManager ; fi`'",'
-                                       echo '  "rpm_versions":"'`if [ -x /usr/bin/timeout.pl ] ; then timeout.pl 45 rpm -q -a ; fi`'",'
+                                       echo '  "rpm_version":"''",'
+                                       echo '  "rpm_versions":"''",'
                                        echo '  "md5sums":"'`md5sum /etc/yum.conf /etc/yum.myplc.d/myplc.repo /etc/yum.myplc.d/stock.repo  | awk '{print $1}'`'",'
                                        echo '  "md5sum_yum":"'`grep -v -E "^#" /etc/yum.myplc.d/myplc.repo | md5sum`'",'
                                        echo '  "nada":"'``'",'
@@ -529,13 +529,13 @@ class ScanPCU(ScanInterface):
                                traceback.print_exc()
                                continue_probe = False
 
-                       if b_except or not continue_probe: return (None, None, None)
+                       if b_except or not continue_probe: return (None, None)
 
                        #### RUN NMAP ###############################
                        if continue_probe:
                                nmap = command.CMD()
-                               print "nmap -oG - -P0 -p22,23,80,443,623,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats'])
-                               (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,623,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']))
+                               print "nmap -oG - -P0 -p22,23,80,443,623,5869,9100,16992 %s | grep -v Down | grep Ports:" % reboot.pcu_name(values['plc_pcu_stats'])
+                               (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,623,5869,9100,16992 %s | grep -v Down | grep Ports:" % reboot.pcu_name(values['plc_pcu_stats']))
                                # NOTE: an empty / error value for oval, will still work.
                                (values['port_status'], continue_probe) = nmap_port_status(oval)
                        else:
index e02ee67..8548e41 100644 (file)
@@ -96,7 +96,7 @@ slices_4 <- function (x, components=FALSE)
         a<-(m+d+c*r+b+p);
     }
 
-    return (a/5*5);
+    return (a/5*5);     # I know. Preserved for clarity and consistency with earlier examples
 }
 
 index_of_bin <- function (h, value)
@@ -194,112 +194,6 @@ year_hist <- function (t, year, from, to, max, type="week", title="Histogram for
     return (h);
 }
 
-year_hist_unique <- function (t, year, from, to, max, type="week", title="Histogram for Tickets in")
-{
-    dates <-seq(as.Date(from), as.Date(to), type)
-    months <- format(dates, "%b-%d")
-    hbreaks<-unclass(as.POSIXct(dates))
-
-    rows <- NULL
-    for ( d in hbreaks )
-    {
-        d_end <- d+60*60*24
-        t_sub <- t[which(t$start > d & t$start <= d_end),]
-        rows <- rbind(rows, c('start'=d, 'reboots'=length(unique(t_sub$hostname))) )
-    }
-    rows <- data.frame(rows)
-
-    if ( max == 0 ) {
-        max = max(rows$reboots)
-    }
-    main<-sprintf(paste(title, "%s: MEAN %s\n"), year, mean(rows$reboots))
-    print(main);
-    barplot(rows$reboots, ylim=c(0,max), main=main, axes=FALSE, space=0)
-    #plot(h, ylim=c(0,max), main=main, axes=FALSE)
-    axis(1, labels=months, at=seq(1,length(hbreaks)))
-    axis(2)
-    abline(mean(rows$reboots), 0, col='grey')
-    #qqnorm(h$counts)
-    #qqline(h$counts)
-    return (rows);
-}
-
-year_hist_unique_recent <- function (t, year, from, to, max, blocks=c(1,3,7,14,30), type="week", title="Histogram for Tickets in")
-{
-    dates <-seq(as.Date(from), as.Date(to), type)
-    months <- format(dates, "%b-%d")
-    hbreaks<-unclass(as.POSIXct(dates))
-
-    rows <- NULL
-
-
-    for ( d in hbreaks )
-    {
-        # initialize row for this iteration
-        row <- NULL
-        row[as.character(0)] <- 0
-        for ( block in blocks ) {
-            row[as.character(block)] <- 0
-        }
-
-        # find the range : d plus a day
-        d_end <- d+60*60*24
-        # find unique hosts in this day range
-        t_sub <- t[which(t$start > d & t$start <= d_end),]
-        unique_hosts <- unique(t_sub$hostname)
-        if (length(unique_hosts) == 0 ) { 
-            rows <- rbind(rows, c('start'=d, row))
-            next 
-        }
-
-        #print(sprintf("unique_hosts: %s\n", unique_hosts));
-        print(sprintf("unique_hosts: %s\n", length(unique_hosts)));
-
-        for ( host in as.character(unique_hosts) ) 
-        {
-            found <- 0
-            for ( block in blocks )
-            {
-                #print(sprintf("date: %s, block: -%s, %s\n", d, block, host));
-                #print(sprintf("row: %s\n", row));
-                # find the range : 'block' days ago to 'd'
-                d_back <- d - 60*60*24 * block
-                t_back_sub <- t[which(t$start > d_back & t$start <= d),]
-                u <- unique(t_back_sub$hostname)
-                if ( length(u[u==host]) >= 1) 
-                {
-    #               add to block_count and go to next host.
-                    found <- 1
-                    i <- as.character(block)
-                    row[i] <- row[i] + 1
-                    break
-                }
-            }
-            if ( found == 0 )
-            {
-                # no range found
-                row['0'] <- row['0'] + 1
-            }
-        }
-        rows <- rbind(rows, c('start'=d, row))
-    }
-
-    rows <- data.frame(rows)
-
-    if ( max == 0 ) {
-        max = max(rows['0'])
-    }
-    #main<-sprintf(paste(title, "%s: MEAN %s\n"), year, mean(rows$reboots))
-    #print(main);
-    #barplot(rows$reboots, ylim=c(0,max), main=main, axes=FALSE, space=0)
-    ##plot(h, ylim=c(0,max), main=main, axes=FALSE)
-    #axis(1, labels=months, at=seq(1,length(hbreaks)))
-    #axis(2)
-    #abline(mean(rows$reboots), 0, col='grey')
-    #qqnorm(h$counts)
-    #qqline(h$counts)
-    return (rows);
-}
 
 source("myImagePlot.R")
 reboot_image <- function (t, year, from, to, max=0, type="week", title="")
@@ -397,6 +291,17 @@ add_timestamp <- function (t)
     return (t);
 }
 
+# convert_datestr(t, format)
+# Add a numeric `start` column to `t`, obtained by parsing t$Date with
+# strptime() using `format` and converting the result with as.POSIXct().
+# Entries that fail to parse become NA.  Returns the modified `t`.
+convert_datestr <- function (t, format)
+{
+    # strptime() and as.POSIXct() are vectorised, so the whole column is
+    # converted in one call; unlike a 1:length() loop this also copes with
+    # a table that has no rows.
+    t$start <- as.numeric(as.POSIXct(strptime(t$Date, format)))
+    return (t);
+}
+
 abline_at_date <- function (date, col='black', lty=1, format="%Y-%m-%d", height=0)
 {
     ts <-unclass(as.POSIXct(date, format=format, origin="1970-01-01"))[1]
@@ -420,3 +325,152 @@ lowess_smooth <- function (x, y, delta=(60*60*24), f=0.02)
     a<-lowess(x, y, delta=delta, f=f)
     return (a);
 }
+
+# in_list(str, str_list)
+# TRUE when `str` equals at least one element of `str_list`, otherwise FALSE.
+in_list <- function ( str, str_list )
+{
+    # %in% never yields NA, so NA entries in str_list no longer abort the
+    # comparison; any() keeps the result a single TRUE/FALSE.
+    return (any(str %in% str_list));
+}
+
+# col2hex(colorname, alpha=1)
+# Convert a colour accepted by col2rgb() into the hex string produced by
+# rgb().  Only the first colour of `colorname` is used; `alpha` is passed to
+# rgb() unchanged, on the same 0..1 scale as the channels.
+col2hex <- function  (colorname, alpha=1)
+{
+    # col2rgb() yields 0..255 integers; rgb() is called on the 0..1 scale.
+    channels <- col2rgb(colorname) / 255
+    return (rgb(channels["red", 1], channels["green", 1], channels["blue", 1], alpha));
+}
+
+# printf(...)
+# Format the arguments with sprintf(), print() the result, and return it.
+printf <- function (...)
+{
+    formatted <- sprintf(...)
+    return(print(formatted));
+}
+
+# time_graph_setup(from, to)
+# Build axis limits and tick data for a plot spanning the dates from..to.
+#   from, to : date strings; each is passed to tstamp() with format
+#              "%Y/%m/%d" and to as.Date().
+# Returns a list holding xlim and, for each of day / week / month / year, a
+# *_str vector of labels and a *_ts vector of timestamps.  The year_ts
+# entries are shifted so that each label sits near the middle of the plotted
+# part of its year.  Several intermediate values are print()ed as a trace.
+time_graph_setup <- function (from, to)
+{
+    # find 'type' range of days
+    xlim <- c(tstamp(from, format="%Y/%m/%d"), tstamp(to, format="%Y/%m/%d"))
+
+    begin_date <- as.Date(from)
+    end_date <- as.Date(to)
+
+    # day of year (1..366) of both end points
+    begin_day <- as.numeric(format(begin_date, "%j"))
+    end_day <- as.numeric(format(end_date, "%j"))
+    print(begin_day)
+
+    date_days <-seq(begin_date, end_date, 'day')
+    date_weeks <-seq(begin_date, end_date, 'week')
+    date_months <-seq(begin_date, end_date, 'month')
+    date_years <-seq(begin_date, end_date, 'year')
+
+    # Each label vector is built from the same sequence as its timestamps so
+    # that the two always have the same length (day_str and week_str used to
+    # be derived from date_months).
+    day_str <- format(date_days, "%a")
+    day_ts <- unclass(as.POSIXct(date_days))
+
+    week_str <- format(date_weeks, "%W")
+    week_ts <- unclass(as.POSIXct(date_weeks))
+
+    month_str <- format(date_months, "%b")
+    month_ts <- unclass(as.POSIXct(date_months))
+
+    year_str <- format(date_years, "%Y")
+    year_ts <- unclass(as.POSIXct(date_years))
+    print(year_ts)
+    year_ts_before <- year_ts
+
+    l <- length(year_ts)
+    print(l)
+    if ( l == 1 ) {
+        # center year between begin_day and end_day
+        print("one year!")
+        year_ts[1] <- (xlim[1] + xlim[2]) / 2.0
+    } else 
+    {
+        print("multitple years!")
+        # center first year between start day and last day of that year.
+        print(year_ts)
+        year_ts[1] <- year_ts[1] + ((365 - begin_day)/2.0)*60*60*24
+        print(year_ts)
+        # last year: move back to the start of the year, then forward by
+        # half of end_day.
+        year_ts[l] <- year_ts[l] + ( -begin_day + end_day/2.0)*60*60*24
+        print(year_ts)
+        if ( l > 2 ) {
+            # years in between: move to day 180 of the year.
+            year_ts <- c(year_ts[1], year_ts[seq(2,l-1)] + (180 - begin_day)*60*60*24, year_ts[l])
+        }
+        print(year_ts)
+    }
+    print(year_ts - year_ts_before)
+        
+    return (list(xlim=xlim, day_str=day_str, day_ts=day_ts,
+                 week_str=week_str, week_ts=week_ts, 
+                 month_str=month_str, month_ts=month_ts, 
+                 year_str=year_str, year_ts=year_ts))
+}
+
+# planetlab_releases(height)
+# Calls abline_at_date() once for each date in the table below, in table
+# order, then uses text() to print the labels at y = height - 0.05*height
+# and the word "Releases" at y = height, at the 2005-03-01 position.
+planetlab_releases <- function (height) 
+{
+    # date, colour, line type, label.  A label of NA means the date is passed
+    # to abline_at_date() but takes no part in the text() call.
+    marks <- data.frame(
+        date  = c("2004-04-12", "2004-11-12", "2005-03-01", "2005-06-15", "2005-10-01",
+                  "2006-05-19", "2007-02-28", "2007-05-01", "2007-10-21", "2008-06-01",
+                  "2008-08-15", "2009-05-01", "2010-02-01", "2010-06-28", "2011-02-22"),
+        col   = c('white', 'white', 'grey60', 'white', 'grey60',
+                  'grey60', 'grey60', 'white', 'grey60', 'grey60',
+                  'white', 'grey60', 'white', 'white', 'grey60'),
+        lty   = c(0, 3, 3, 0, 3,
+                  3, 3, 0, 3, 3,
+                  0, 3, 0, 3, 3),
+        label = c('', '', '3.1', '', '3.2',
+                  '3.3', '4.0', NA, '4.1', '4.2',
+                  NA, '4.3', NA, '', '5.0'),
+        stringsAsFactors = FALSE)
+    # I think 5.0 was released 02/22/2011... not 03-09
+
+    stamps <- vector("list", nrow(marks))
+    for ( k in seq_len(nrow(marks)) )
+    {
+        stamps[[k]] <- abline_at_date(marks$date[k], col=marks$col[k], lty=marks$lty[k], height=height)
+    }
+
+    labelled <- !is.na(marks$label)
+    text(x=unlist(stamps[labelled], use.names=FALSE),
+         y=c(height-height*0.05),
+         labels=marks$label[labelled])
+    # the third row is 2005-03-01
+    text(x=c(stamps[[3]]), y=c(height), labels=c("Releases"))
+}
+
+# plc_releases(height)
+# Calls abline_at_date() (col 'grey60', lty 3) for each date listed below,
+# then uses text() to print the version numbers at y = height - 0.05*height
+# and the word "Releases" at y = height, at the "pre" position.
+plc_releases <- function (height) 
+{
+    # name = label, value = date; "pre" gets a marker but no version label
+    release_dates <- c("pre"="2004-10-01",
+                       "3.1"="2005-03-01",
+                       "3.2"="2005-10-01",
+                       "3.3"="2006-05-19",
+                       "4.0"="2007-02-28",
+                       "4.1"="2007-10-21",
+                       "4.2"="2008-06-01",
+                       "4.3"="2009-05-01",
+                       "5.0"="2011-02-22")
+
+    stamps <- list()
+    for ( version in names(release_dates) )
+    {
+        stamps[[version]] <- abline_at_date(release_dates[[version]], col='grey60', lty=3, height=height)
+    }
+
+    versions <- setdiff(names(release_dates), "pre")
+    text(x=unlist(stamps[versions], use.names=FALSE),
+         y=c(height-height*0.05),
+         labels=versions)
+    text(x=c(stamps[["pre"]]), y=c(height), labels=c("Releases"))
+}