rework check-tcp so that we first wait for the network to be ready in the sliver
Thierry Parmentelat [Tue, 3 Mar 2015 23:36:21 +0000 (00:36 +0100)]
system/Completer.py
system/TestNode.py
system/TestPlc.py
system/TestSliver.py
system/config_default.py
system/tcptest.py

index 5cf3c9b..3d49dac 100755 (executable)
@@ -13,7 +13,7 @@ class Completer:
         self.tasks=tasks
         self.verbose=verbose
         self.message="({})".format(message) if message else ""
-    def run (self, timeout_timedelta, silent_timedelta, period=None):
+    def run (self, timeout_timedelta, silent_timedelta, period):
         begin = datetime.now()
         timeout = begin+timeout_timedelta
         timeout_minutes = timeout_timedelta.total_seconds()/60
index 2b24ba8..6bf40ad 100644 (file)
@@ -21,9 +21,9 @@ class CompleterTaskNodeSsh (CompleterTask):
         self.test_ssh =  TestSsh (self.hostname,key=self.local_key)
     def run (self, silent):
         command = self.test_ssh.actual_command(self.command)
-        retcod=utils.system (command, silent=silent, dry_run=self.dry_run)
-        if self.expected:       return retcod==0
-        else:                   return retcod !=0
+        retcod = utils.system (command, silent=silent, dry_run=self.dry_run)
+        if self.expected:       return retcod == 0
+        else:                   return retcod != 0
     def failure_epilogue (self):
         print "Cannot reach %s in %s mode"%(self.hostname, self.boot_state)
 
index 6b910a7..9504b70 100644 (file)
@@ -1140,14 +1140,14 @@ class TestPlc:
     ### initscripts
     def do_check_initscripts(self):
         class CompleterTaskInitscript (CompleterTask):
-            def __init__ (self, test_sliver, stamp):
+            def __init__(self, test_sliver, stamp):
                 self.test_sliver=test_sliver
                 self.stamp=stamp
-            def actual_run (self):
-                return self.test_sliver.check_initscript_stamp (self.stamp)
-            def message (self):
+            def actual_run(self):
+                return self.test_sliver.check_initscript_stamp(self.stamp)
+            def message(self):
                 return "initscript checker for %s"%self.test_sliver.name()
-            def failure_epilogue (self):
+            def failure_epilogue(self):
                 print "initscript stamp %s not found in sliver %s"%(self.stamp,self.test_sliver.name())
             
         tasks=[]
@@ -1164,8 +1164,9 @@ class TestPlc:
                 test_slice = TestSlice (self,test_site,slice_spec)
                 test_node = TestNode (self,test_site,node)
                 test_sliver = TestSliver (self, test_node, test_slice)
-                tasks.append ( CompleterTaskInitscript (test_sliver, stamp))
-        return Completer (tasks, message='check_initscripts').run (timedelta(minutes=5), timedelta(minutes=4), timedelta(seconds=10))
+                tasks.append(CompleterTaskInitscript(test_sliver, stamp))
+        return Completer(tasks, message='check_initscripts').\
+            run (timedelta(minutes=5), timedelta(minutes=4), timedelta(seconds=10))
            
     def check_initscripts(self):
         "check that the initscripts have triggered"
@@ -1306,26 +1307,54 @@ class TestPlc:
             utils.header ("check_tcp: no/empty config found")
             return True
         specs = self.plc_spec['tcp_specs']
-        overall=True
+        overall = True
+
+        # first wait for the network to be up and ready from the slices
+        class CompleterTaskNetworkReadyInSliver(CompleterTask):
+            def __init__(self, test_sliver):
+                self.test_sliver = test_sliver
+            def actual_run(self):
+                return self.test_sliver.check_tcp_ready(port=9999)
+            def message(self):
+                return "network ready checker for %s" % self.test_sliver.name()
+            def failure_epilogue(self):
+                print "could not bind port from sliver %s" % self.test_sliver.name()
+
+        tasks = []
+        for spec in specs:
+            # locate the TestSliver instances involved, and cache them in the spec instance
+            spec['s_sliver'] = self.locate_sliver_obj_cross (spec['server_node'], spec['server_slice'], other_plcs)
+            spec['c_sliver'] = self.locate_sliver_obj_cross (spec['client_node'], spec['client_slice'], other_plcs)
+            message = "Will check TCP between s=%s and c=%s" % \
+                      (spec['s_sliver'].name(), spec['c_sliver'].name())
+            if 'client_connect' in spec:
+                message += " (using %s)" % spec['client_connect']
+            utils.header(message)
+            tasks.append(CompleterTaskNetworkReadyInSliver (spec['s_sliver']))
+
+        # wait for the network to be OK in all server sides
+        if not Completer(tasks, message='check for network readiness in slivers').\
+           run(timedelta(seconds=30), timedelta(seconds=24), period=timedelta(seconds=5)):
+            return False
+            
+        # run server and client
         for spec in specs:
             port = spec['port']
             # server side
             # the issue here is that we have the server run in background
             # and so we have no clue if it took off properly or not
             # looks like in some cases it does not
-            s_test_sliver = self.locate_sliver_obj_cross (spec['server_node'], spec['server_slice'], other_plcs)
-            if not s_test_sliver.run_tcp_server(port, timeout=20):
+            if not spec['s_sliver'].run_tcp_server(port, timeout=20):
                 overall = False
                 break
 
             # idem for the client side
-            c_test_sliver = self.locate_sliver_obj_cross (spec['client_node'], spec['client_slice'], other_plcs)
-            # use nodename from locatesd sliver, unless 'client_connect' is set
+            # use nodename from located sliver, unless 'client_connect' is set
             if 'client_connect' in spec:
                 destination = spec['client_connect']
             else:
-                destination = s_test_sliver.test_node.name()
-            if not c_test_sliver.run_tcp_client(destination, port):
+                destination = spec['s_sliver'].test_node.name()
+            if not spec['c_sliver'].run_tcp_client(destination, port):
                 overall = False
         return overall
 
index a82b27b..f69e4c5 100644 (file)
@@ -31,7 +31,7 @@ class TestSliver:
                         # so that copies end up in the home dir
                         buildname=".")
 
-    def name (self):
+    def name(self):
         return "%s@%s"%(self.test_slice.name(),self.test_node.name())
 
     def check_initscript_stamp(self, stamp):
@@ -39,19 +39,27 @@ class TestSliver:
         return self.test_ssh.run("ls -l /var/tmp/%s.stamp"%stamp)==0
     
     def run_tcp_server (self, port, timeout=10):
-        server_command = "./tcptest.py server -p %d -t %d"%(port,timeout)
-        return self.test_ssh.copy("tcptest.py")==0 and \
+        server_command = "./tcptest.py server -p %d -t %d"%(port, timeout)
+        return self.test_ssh.copy("tcptest.py") == 0 and \
             self.test_ssh.run(server_command, background=True)==0
 
+    def check_tcp_ready (self, port):
+        server_command = "./tcptest.py ready -p %d"%(port)
+        return self.test_ssh.copy("tcptest.py") == 0 and \
+            self.test_ssh.run(server_command) == 0
+
     def run_tcp_client (self, servername, port, retry=5):
         client_command="./tcptest.py client -a %s -p %d"%(servername, port)
-        if self.test_ssh.copy("tcptest.py")!=0: return False
-        utils.header ("tcp client - first attempt")
-        if self.test_ssh.run(client_command, background=False)==0: return True
-        # if first try has failed, wait for <retry> s an try again
-        time.sleep(retry)
-        utils.header ("tcp client - second attempt")
-        if self.test_ssh.run(client_command, background=False)==0: return True
+        if self.test_ssh.copy("tcptest.py") != 0:
+            return False
+        # allow for 2 attempts
+        attempts = 2
+        for attempt in range (attempts):
+            if attempt != 0:
+                time.sleep(retry)
+            utils.header ("tcp client - attempt # %s" % (attempt+1))
+            if self.test_ssh.run(client_command) == 0:
+                return True
         return False
 
     # use the node's main ssh root entrance, as the slice entrance might be down
index c2c1651..00ababf 100644 (file)
@@ -434,6 +434,9 @@ def tcp_specs (options,index):
     # with the addition of omf-friendly slices..
     slice3='%s_sl4'%login_base(2)
     slice4='%s_sl5'%login_base(2)
+
+# NOTE: port 9999 is hard-wired in the code to be used for checking network readiness
+# so it is not to be used here
 # bind on 0.0.0.0 and try to reach this on localhost
 # not expected to work
     same_node_same_slice_lo =   { 'server_node': 'node1', 'server_slice': slice1,
index dd2bc25..bff3c66 100755 (executable)
@@ -8,19 +8,19 @@ import time
 import subprocess
 import socket
 import SocketServer
+import threading
 from optparse import OptionParser    
 
-def myprint(message, is_client=True):
+def myprint(message, id='client'):
     now=time.strftime("%H:%M:%S", time.localtime())
-    id = 'tcpclient' if is_client else 'tcpserver'
     print "*",now,'(%s)' % id, '--',message
     sys.stdout.flush()
 
-def show_network_status(is_client):
-    myprint("ip address show", is_client=is_client)
-    subprocess.call(['ip','address','show'])
-    myprint("ip route show", is_client=is_client)
-    subprocess.call(['ip','route','show'])
+def show_network_status(id):
+    myprint("ip address show", id=id)
+    subprocess.call(['ip', 'address', 'show'])
+    myprint("ip route show", id=id)
+    subprocess.call(['ip', 'route', 'show'])
 
 class EchoRequestHandler(SocketServer.StreamRequestHandler):
     def handle(self):
@@ -33,10 +33,10 @@ class UppercaseRequestHandler(SocketServer.StreamRequestHandler):
         self.wfile.write(line.upper())
 
 class Server:
-
+    """
+    A TCP server, running for some finite amount of time
+    """
     def main(self):
-        import threading
-
         parser = OptionParser()
         parser.add_option("-p", "--port", action="store", dest="port", type="int",
                           default=10000, help="port number")
@@ -44,17 +44,15 @@ class Server:
                           default=socket.gethostname(), help="address")
         parser.add_option("-t", "--timeout", action="store", dest="timeout", type="int",
                           default="0")
-        
         (options, args) = parser.parse_args()
+
         if len(args) != 0:
             parser.print_help()
             sys.exit(1)
 
-        show_network_status(is_client=False)
-
+        show_network_status(id='server')
         server = SocketServer.TCPServer((options.address, options.port),
                                         UppercaseRequestHandler)
-
         try:
             if options.timeout:
                 t = threading.Thread(target=server.serve_forever)
@@ -68,7 +66,34 @@ class Server:
             print 'Bailing out on keyboard interrupt'
             sys.exit(1)
             
+class Ready:
+    """
+    A utility that does exit(0) iff network as perceived
+    from the sliver is ready. Designed to be run before Server,
+    so one can wait for the right conditions.
+    """
+    def main(self):
+        parser = OptionParser()
+        # by default use another port so we don't run into
+        # the SO_LINGER kind of trouble
+        parser.add_option("-p", "--port", action="store", dest="port", type="int",
+                          default=9999, help="port number")
+        parser.add_option("-a", "--address", action="store", dest="address", 
+                          default=socket.gethostname(), help="address")
+        (options, args) = parser.parse_args()
+
+        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        try:
+            s.bind((options.address, options.port))
+            sys.exit(0)
+        except Exception as e:
+            print e
+            sys.exit(1)
+        
 class Client:
+    """
+    Runs a client against a Server instance
+    """
     def main(self):
         parser = OptionParser()
         parser.add_option("-p","--port", action="store", dest="port", type="int",
@@ -88,7 +113,7 @@ class Client:
         result=True
         for i in range(1,options.loops+1):
             s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-            s.connect((options.address , options.port))
+            s.connect((options.address, options.port))
             mout=i*'ping ' + '\n'
             min=mout.upper()
             if s.send(mout) != len(mout):
@@ -113,12 +138,15 @@ class Client:
         sys.exit(exit_return)
 
 if __name__ == '__main__':
-    for argv in sys.argv[1:]:
-        if argv.find("client") >= 0:
-            sys.argv.remove(argv)
+    for arg in sys.argv[1:]:
+        if arg.find("client") >= 0:
+            sys.argv.remove(arg)
             Client().main()
-        elif argv.find("server") >= 0:
-            sys.argv.remove(argv)
+        elif arg.find("server") >= 0:
+            sys.argv.remove(arg)
             Server().main()
+        elif arg.find("ready") >= 0:
+            sys.argv.remove(arg)
+            Ready().main()
     print 'you must specify either --client or --server'
     sys.exit(1)