Changes from all commits
77 commits
fcb2ae1
updated monitoring to include pbench capability, also included a call…
acalhounRH Jun 4, 2018
68c92fe
fixed monitoring.py, included logging
acalhounRH Jun 5, 2018
72c2626
delcared logger
acalhounRH Jun 5, 2018
5187b81
included os
acalhounRH Jun 5, 2018
25bf904
Updated to call the starting and stopping of pbench collection tools
Aug 27, 2018
7f5136e
changed pbench dir to benchmark dir + mode
Aug 28, 2018
5850e9c
changed pbench dir to benchmark dir + mode
acalhounRH Aug 28, 2018
2c1453b
add pbench start/stop call for idle monitoring
acalhounRH Sep 18, 2018
bb8d379
Corrected tabing of pbench calls
acalhounRH Sep 18, 2018
73d3ee9
Merge branch 'master' of https://github.com/ceph/cbt into pbench-inte…
acalhounRH Nov 1, 2018
f3a19b1
integrate smallfile benchmark with CBT
bengland2 Nov 2, 2018
1b4cd9c
save response times in out_dir subdirectory
bengland2 Nov 5, 2018
7517ee5
changed pdsh cmd to ansible cmd
acalhounRH Mar 5, 2019
b468610
removed if statement for -S option for pdsh
acalhounRH Mar 5, 2019
e20dd3c
added all to ansible command
acalhounRH Mar 5, 2019
ee918b2
changed rpdcp to ansible fetch command
acalhounRH Mar 5, 2019
28322a0
set contine_if_error to false for rpdcp
acalhounRH Mar 5, 2019
18f66c3
corrected fetch command and add flat option
acalhounRH Mar 5, 2019
20e614c
temp change ansible adhoc cmd to recur fetch files.
acalhounRH Mar 5, 2019
4e8403b
corrected typo
acalhounRH Mar 5, 2019
9fbba17
removing * from remotefiles var
acalhounRH Mar 5, 2019
405fae2
added "" in command field
acalhounRH Mar 5, 2019
2f6624b
correction for nested quotes
acalhounRH Mar 5, 2019
308a9cb
removed nested quote on pdsh
acalhounRH Mar 5, 2019
5c46451
removed nested quotes from rpdcp command
acalhounRH Mar 5, 2019
21a8e23
updated get_fqdn_list to work with ansible output
acalhounRH Mar 5, 2019
b5a87c2
changed stdout to stdout.splitlines
acalhounRH Mar 5, 2019
9a915cf
created function that creates a tmp inventory file
acalhounRH Mar 5, 2019
8f33255
added debug statment
acalhounRH Mar 5, 2019
8d7ec5a
tryed to split hosts
acalhounRH Mar 5, 2019
61f0291
removed print statments
acalhounRH Mar 5, 2019
24ddffe
updated rbdfio to specify output file, hostname appended after volnum
acalhounRH Mar 5, 2019
f5b361a
added backtick to output file name
acalhounRH Mar 5, 2019
690d4fe
removed stripping of * on remotefiles of rpdcp
acalhounRH Mar 6, 2019
0ccdd0e
Added handle to result to fqdn if ip is provided
acalhounRH Mar 25, 2019
57ce7ac
added socket module
acalhounRH Mar 25, 2019
f0ef181
changed sscp for rpdcp to rsync to avoid conflicts with coping data
acalhounRH Apr 11, 2019
756e83d
reverted back to scp
acalhounRH Apr 20, 2019
99160b3
Merge branch 'master' of https://github.com/acalhounRH/cbt into
acalhounRH May 1, 2019
3470875
added gitignore
acalhounRH May 1, 2019
7647fcc
updated to append hostname to outfile
acalhounRH May 6, 2019
7917893
change continue if error to true
acalhounRH May 6, 2019
8eaedfc
correction to typo
acalhounRH May 6, 2019
ff9f821
correcte missing back tic
acalhounRH May 6, 2019
69cb2d2
Merge branch 'smallfile' of https://github.com/bengland2/cbt into pbe…
acalhounRH May 15, 2019
0ee3f02
Merged smallfile branch
acalhounRH May 15, 2019
babd3de
added pbench start and stop to smallfile benchmark
acalhounRH May 16, 2019
3c4bdf2
corrected pbench stop command
acalhounRH May 16, 2019
4f008b8
changed any_client to head iot reterive rsptime csv files
acalhounRH May 28, 2019
02da502
update except "error"
acalhounRH Jun 11, 2019
f10a3ac
corrected print statements
acalhounRH Jun 11, 2019
7d9d57e
added explicit import from benchmark file
acalhounRH Jun 11, 2019
864d879
corrected taberror on line 549
acalhounRH Jun 11, 2019
a5dd3ec
resolved TabError: inconsistent use of tabs and spaces in indentation
acalhounRH Jun 11, 2019
c343f32
specific cluster in import comand for Cluster
acalhounRH Jun 11, 2019
efdeeba
made updates to resovle import error on relative modules
acalhounRH Jun 11, 2019
3d23dee
corrected tab error on line 53
acalhounRH Jun 11, 2019
959a724
changed file() to open() on line 19.
acalhounRH Jun 11, 2019
b29fc39
changed back to file but added io.file()
acalhounRH Jun 11, 2019
dc582f1
revert back to open
acalhounRH Jun 11, 2019
d5f9ed1
added .read to open
acalhounRH Jun 11, 2019
e88b475
updated the way cbt reads in the configfile into a dict (config)
acalhounRH Jun 11, 2019
2ea85c1
updated init to map configfile with config dict
acalhounRH Jun 11, 2019
1da2a58
added debug print to print out config
acalhounRH Jun 11, 2019
4d9ab44
updated to load yaml file using open
acalhounRH Jun 11, 2019
e917fce
removed map() when loading config file
acalhounRH Jun 11, 2019
34815a7
updated to load highlevel dict loaded from yaml
acalhounRH Jun 11, 2019
73d797d
removed typo and corrected tap on line 24
acalhounRH Jun 11, 2019
32ed3a6
corrected way dict update is being performed to retain struct
acalhounRH Jun 11, 2019
5c9c4eb
removed loop for each k in dict, saving full dict into config
acalhounRH Jun 11, 2019
4ab8b81
cleaning up how configuration file is loaded.
acalhounRH Jun 11, 2019
e36f897
changed iteritems() to items(), iteritems is removed in python3
acalhounRH Jun 11, 2019
7be67be
changed xrange to range, xrange has been removed in python 3
acalhounRH Jun 11, 2019
90af4f7
decoded stdout in order to turn into a str and eval against check_list
acalhounRH Jun 11, 2019
a9f6e19
saved stdout to stdout.decode
acalhounRH Jun 11, 2019
0cd999e
corrected line 467 to decode stdout and save return value as stdout
acalhounRH Jun 11, 2019
5d840a5
updated to save stdout as str with the use of decode
acalhounRH Jun 11, 2019
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,2 +1,4 @@
*.pyc
*.pyo
+/.project
+/.pydevproject
4 changes: 2 additions & 2 deletions benchmark/cephtestrados.py
@@ -10,7 +10,7 @@
logger = logging.getLogger('cbt')

from cluster.ceph import Ceph
-from benchmark import Benchmark
+from benchmark.benchmark import Benchmark

class CephTestRados(Benchmark):

@@ -59,7 +59,7 @@ def addweight(self, weight):

def exists(self):
if os.path.exists(self.out_dir):
-print 'Skipping existing test in %s.' % self.out_dir
+print ('Skipping existing test in %s.' % self.out_dir)
return True
return False

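The `from benchmark import Benchmark` → `from benchmark.benchmark import Benchmark` change recurs in nearly every benchmark file because Python 3 dropped implicit relative imports (PEP 328): inside the `benchmark/` package, the bare package name no longer resolves to the sibling `benchmark.py` module. A runnable sketch of the Python 3 resolution, using a throwaway package that mirrors the layout:

```python
import importlib
import os
import sys
import tempfile

# Build a throwaway package shaped like CBT's benchmark/ directory.
root = tempfile.mkdtemp()
pkg = os.path.join(root, 'benchmark')
os.mkdir(pkg)
open(os.path.join(pkg, '__init__.py'), 'w').close()
with open(os.path.join(pkg, 'benchmark.py'), 'w') as f:
    f.write('class Benchmark:\n    pass\n')

sys.path.insert(0, root)
# The absolute dotted path always works in Python 3 ...
mod = importlib.import_module('benchmark.benchmark')
Benchmark = mod.Benchmark
# ... whereas "from benchmark import Benchmark" now finds only the
# package itself, which has no Benchmark attribute. An explicit
# relative "from .benchmark import Benchmark" would also work.
```

The PR uses the absolute spelling, which has the advantage of also working when the modules are run from the repository root.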
2 changes: 1 addition & 1 deletion benchmark/cosbench.py
@@ -11,7 +11,7 @@
import logging

from cluster.ceph import Ceph
-from benchmark import Benchmark
+from benchmark.benchmark import Benchmark

logger = logging.getLogger("cbt")

2 changes: 1 addition & 1 deletion benchmark/getput.py
@@ -9,7 +9,7 @@
import re

from cluster.ceph import Ceph
-from benchmark import Benchmark
+from benchmark.benchmark import Benchmark

logger = logging.getLogger("cbt")

2 changes: 1 addition & 1 deletion benchmark/kvmrbdfio.py
@@ -7,7 +7,7 @@
import string
import logging

-from benchmark import Benchmark
+from benchmark.benchmark import Benchmark

logger = logging.getLogger("cbt")

28 changes: 19 additions & 9 deletions benchmark/librbdfio.py
@@ -7,9 +7,10 @@
import threading
import logging
import json
+import socket

from cluster.ceph import Ceph
-from benchmark import Benchmark
+from benchmark.benchmark import Benchmark

logger = logging.getLogger("cbt")

@@ -49,14 +50,14 @@ def __init__(self, cluster, config):
self.pool_name = config.get("poolname", "cbt-librbdfio")
self.rbdname = config.get('rbdname', '')

-self.total_procs = self.procs_per_volume * self.volumes_per_client * len(settings.getnodes('clients').split(','))
+self.total_procs = self.procs_per_volume * self.volumes_per_client * len(settings.getnodes('clients').split(','))
self.run_dir = '%s/osd_ra-%08d/op_size-%08d/concurrent_procs-%03d/iodepth-%03d/%s' % (self.run_dir, int(self.osd_ra), int(self.op_size), int(self.total_procs), int(self.iodepth), self.mode)
self.out_dir = self.archive_dir

self.norandommap = config.get("norandommap", False)
# Make the file names string (repeated across volumes)
self.names = ''
-for proc_num in xrange(self.procs_per_volume):
+for proc_num in range(self.procs_per_volume):
rbd_name = 'cbt-librbdfio-`%s`-file-%d' % (common.get_fqdn_cmd(), proc_num)
self.names += '--name=%s ' % rbd_name

@@ -75,9 +76,10 @@ def initialize(self):

logger.info('Pausing for 60s for idle monitoring.')
monitoring.start("%s/idle_monitoring" % self.run_dir)
+monitoring.start_pbench("%s/idle" % self.out_dir)
time.sleep(60)
monitoring.stop()

+monitoring.stop_pbench("%s/idle" % self.out_dir)
common.sync_files('%s/*' % self.run_dir, self.out_dir)

self.mkimages()
@@ -86,7 +88,7 @@ def initialize(self):
ps = []
logger.info('Attempting to populating fio files...')
if (self.use_existing_volumes == False):
-for volnum in xrange(self.volumes_per_client):
+for volnum in range(self.volumes_per_client):
rbd_name = 'cbt-librbdfio-`%s`-%d' % (common.get_fqdn_cmd(), volnum)
pre_cmd = 'sudo %s --ioengine=rbd --clientname=admin --pool=%s --rbdname=%s --invalidate=0 --rw=write --numjobs=%s --bs=4M --size %dM %s --output-format=%s > /dev/null' % (self.cmd_path, self.pool_name, rbd_name, self.numjobs, self.vol_size, self.names, self.fio_out_format)
p = common.pdsh(settings.getnodes('clients'), pre_cmd)
@@ -105,6 +107,7 @@ def run(self):
self.cluster.dump_config(self.run_dir)

monitoring.start(self.run_dir)
+monitoring.start_pbench(self.out_dir)

time.sleep(5)

@@ -115,7 +118,7 @@

logger.info('Running rbd fio %s test.', self.mode)
ps = []
-for i in xrange(self.volumes_per_client):
+for i in range(self.volumes_per_client):
fio_cmd = self.mkfiocmd(i)
p = common.pdsh(settings.getnodes('clients'), fio_cmd)
ps.append(p)
@@ -125,6 +128,7 @@
if 'recovery_test' in self.cluster.config:
self.cluster.wait_recovery_done()

+monitoring.stop_pbench(self.out_dir)
monitoring.stop(self.run_dir)

# Finally, get the historic ops
@@ -139,7 +143,7 @@ def mkfiocmd(self, volnum):
rbdname = 'cbt-librbdfio-`%s`-%d' % (common.get_fqdn_cmd(), volnum)

logger.debug('Using rbdname %s', rbdname)
-out_file = '%s/output.%d' % (self.run_dir, volnum)
+out_file = '%s/output.%d.`%s`' % (self.run_dir, volnum, common.get_fqdn_cmd())

fio_cmd = 'sudo %s --ioengine=rbd --clientname=admin --pool=%s --rbdname=%s --invalidate=0' % (self.cmd_path_full, self.pool_name, rbdname)
fio_cmd += ' --rw=%s' % self.mode
@@ -191,7 +195,7 @@ def mkimages(self):
self.cluster.rmpool(self.data_pool, self.data_pool_profile)
self.cluster.mkpool(self.data_pool, self.data_pool_profile, 'rbd')
for node in common.get_fqdn_list('clients'):
-for volnum in xrange(0, self.volumes_per_client):
+for volnum in range(0, self.volumes_per_client):
node = node.rpartition("@")[2]
self.cluster.mkimage('cbt-librbdfio-%s-%d' % (node,volnum), self.vol_size, self.pool_name, self.data_pool, self.vol_object_size)
monitoring.stop()
@@ -201,7 +205,13 @@ def recovery_callback(self):

def parse(self, out_dir):
for client in settings.cluster.get('clients'):
-for i in xrange(self.volumes_per_client):
+try:
+    socket.inet_aton(client)
+    client = socket.gethostbyaddr(client)
+except:
+    pass
+
+for i in range(self.volumes_per_client):
found = 0
out_file = '%s/output.%d.%s' % (out_dir, i, client)
json_out_file = '%s/json_output.%d.%s' % (out_dir, i, client)
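The new `try`/`except` in `parse()` lets the client list hold IP addresses: `socket.inet_aton` raises if the entry is not an IPv4 address, and `socket.gethostbyaddr` reverse-resolves it so the per-host output filenames match. One caveat: `gethostbyaddr` returns a `(hostname, aliases, addresses)` tuple, so the diff as written stores the whole tuple; a sketch that keeps only the name (the `resolve_client` helper is mine, not CBT's):

```python
import socket

def resolve_client(client):
    """Return a hostname for `client`, reverse-resolving it only if it
    is an IPv4 address; anything else passes through unchanged."""
    try:
        socket.inet_aton(client)            # raises OSError for non-IPv4 input
    except OSError:
        return client                       # already a hostname
    try:
        return socket.gethostbyaddr(client)[0]   # [0] = primary hostname
    except socket.herror:
        return client                       # IP with no reverse DNS entry

print(resolve_client('ceph-client01'))      # not an IP -> 'ceph-client01'
```

Catching `socket.herror` explicitly (rather than a bare `except:`) keeps the fallback limited to reverse-DNS failures.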
2 changes: 1 addition & 1 deletion benchmark/nullbench.py
@@ -5,7 +5,7 @@
import os

from cluster.ceph import Ceph
-from benchmark import Benchmark
+from benchmark.benchmark import Benchmark

class Nullbench(Benchmark):

6 changes: 4 additions & 2 deletions benchmark/radosbench.py
@@ -10,7 +10,7 @@
import json

from cluster.ceph import Ceph
-from benchmark import Benchmark
+from benchmark.benchmark import Benchmark

logger = logging.getLogger("cbt")

@@ -121,10 +121,11 @@ def _run(self, mode, run_dir, out_dir):

# Run rados bench
monitoring.start(run_dir)
+monitoring.start_pbench("%s/%s" % (self.out_dir, mode))
logger.info('Running radosbench %s test.' % mode)
ps = []
for i in xrange(self.concurrent_procs):
-out_file = '%s/output.%s' % (run_dir, i)
+out_file = '%s/output.%s.`%s`' % (run_dir, i, common.get_fqdn_cmd())
objecter_log = '%s/objecter.%s.log' % (run_dir, i)
# default behavior is to use a single storage pool
pool_name = self.pool
@@ -139,6 +140,7 @@ def _run(self, mode, run_dir, out_dir):
ps.append(p)
for p in ps:
p.wait()
+monitoring.stop_pbench("%s/%s" % (self.out_dir, mode))
monitoring.stop(run_dir)

# If we were doing recovery, wait until it's done.
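Both radosbench and librbdfio now embed a backquoted hostname command in the output path, so when the ansible `fetch` pulls every client's files flat into one archive directory they no longer overwrite each other. `common.get_fqdn_cmd()` is assumed here to return a command such as `hostname -f`; a sketch of the filename construction:

```python
def make_out_file(run_dir, proc_num, fqdn_cmd='hostname -f'):
    """Build a per-host output path. The backticks are left intact for
    the remote shell to expand, so each client substitutes its own FQDN.

    Before this change the path was '%s/output.%s' % (run_dir, proc_num),
    identical on every client, so a flat fetch kept only one copy.
    """
    return '%s/output.%s.`%s`' % (run_dir, proc_num, fqdn_cmd)

print(make_out_file('/tmp/run', 0))   # /tmp/run/output.0.`hostname -f`
```

This is also why `parse()` later looks for `output.%d.%s` files keyed by client hostname.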
2 changes: 1 addition & 1 deletion benchmark/rawfio.py
@@ -7,7 +7,7 @@
import string
import logging

-from benchmark import Benchmark
+from benchmark.benchmark import Benchmark

logger = logging.getLogger("cbt")

2 changes: 1 addition & 1 deletion benchmark/rbdfio.py
@@ -6,7 +6,7 @@
import time
import logging

-from benchmark import Benchmark
+from benchmark.benchmark import Benchmark

logger = logging.getLogger("cbt")

175 changes: 175 additions & 0 deletions benchmark/smallfile.py
@@ -0,0 +1,175 @@
# Benchmark subclass to invoke smallfile
# this benchmark will iterate over smallfile test parameters
# something that smallfile cannot do today
#
# see examples/smallfile.yaml for how to use it
#
# at present, this does not create the filesystem or mount it,
# all clients and head node must have filesystem mounted
#
# it assumes that all hosts are accessed with the same user account
# so that user@hostname pdsh syntax is not needed
#
# it has only been tested with a single Cephfs mountpoint/host

import copy
import common
import monitoring
import os
import time
import logging
import settings
import yaml
import json
import subprocess

from benchmark.benchmark import Benchmark

logger = logging.getLogger("cbt")

# we do this so source of exception is really obvious
class CbtSmfExc(Exception):
pass

class Smallfile(Benchmark):

def __init__(self, cluster, config):
super(Smallfile, self).__init__(cluster, config)
self.out_dir = self.archive_dir
self.config = config
mons = settings.getnodes('mons').split(',')
self.any_mon = mons[0]
self.clients = settings.getnodes('clients').split(',')
self.any_client = self.clients[0]
self.head = settings.getnodes('head')
self.cephfs_data_pool_name = config.get('data_pool_name', 'cephfs_data')
self.cleandir()


# this function uses "ceph df" output to monitor
# cephfs_data pool object count, when that stops going down
# then the pool is stable and it's ok to start another test

def get_cephfs_data_objects(self):
(cephdf_out, cephdf_err) = common.pdsh(
self.any_mon, 'ceph -f json df', continue_if_error=False).communicate()
# pdsh prepends JSON output with IP address of host that did the command,
# we have to strip the IP address off before JSON parser will accept it
start_of_json = cephdf_out.index('{')
json_str = cephdf_out[start_of_json:]
cephdf = json.loads(json_str)
cephfs_data_objs = -1
for p in cephdf['pools']:
if p['name'] == self.cephfs_data_pool_name:
cephfs_data_objs = int(p['stats']['objects'])
break
if cephfs_data_objs == -1:
raise CbtSmfExc('could not find cephfs_data pool in ceph -f json df output')
logger.info('cephfs_data pool object count = %d' % cephfs_data_objs)
return cephfs_data_objs

def run(self):
super(Smallfile, self).run()

# someday we might want to allow the option
# to NOT drop cache
self.dropcaches()
# FIXME: if desired, drop cache on OSDs
# FIXME: if desired, drop cache on MDSs

# dump the cluster config
self.cluster.dump_config(self.run_dir)

# input YAML parameters for smallfile are subset
# extract parameters that you need

smfparams = copy.deepcopy(self.config)
del smfparams['benchmark']
del smfparams['iteration']
try:
del smfparams['data_pool_name']
except KeyError:
pass
operation = smfparams['operation']
topdir = smfparams['top'].split(',')[0]
yaml_input_pathname = os.path.join(self.out_dir, 'smfparams.yaml')
with open(yaml_input_pathname, 'w') as yamlf:
yamlf.write(yaml.dump(smfparams, default_flow_style=False))

# generate client list

client_list_path = os.path.join(self.out_dir, 'client.list')
with open(client_list_path, 'w') as client_f:
for c in self.clients:
client_f.write(c + '\n')

# ensure SMF directory exists
# for shared filesystem, we only need 1 client to
# initialize it

logger.info('using client %s to initialize shared filesystem' % self.any_client)
common.pdsh(self.any_client, 'mkdir -p -v -m 0777 ' + topdir, continue_if_error=False).communicate()

# Run the backfill testing thread if requested
if 'recovery_test' in self.cluster.config:
recovery_callback = self.recovery_callback
self.cluster.create_recovery_test(self.run_dir, recovery_callback)

# Run smallfile
monitoring.start(self.run_dir)
monitoring.start_pbench(self.out_dir)
logger.info('Running smallfile test, see %s for parameters' % yaml_input_pathname)
smfcmd = [ 'smallfile_cli.py',
'--host-set', client_list_path,
'--response-times', 'Y',
'--yaml-input-file', yaml_input_pathname,
'--verbose', 'Y',
'--output-json', '%s/smfresult.json' % self.out_dir ]
logger.info('smallfile command: %s' % ' '.join(smfcmd))
logger.info('YAML inputs: %s' % yaml.dump(smfparams))
smf_out_path = os.path.join(self.out_dir, 'smf-out.log')
(smf_out_str, smf_err_str) = common.pdsh(self.head, ' '.join(smfcmd), continue_if_error=False).communicate()
with open(smf_out_path, 'w') as smf_outf:
smf_outf.write(smf_out_str + '\n')
logger.info('smallfile result: %s' % smf_out_path)
monitoring.stop_pbench(self.out_dir)
monitoring.stop(self.run_dir)


# save response times
rsptimes_target_dir = os.path.join(self.out_dir, 'rsptimes')
common.mkdir_p(rsptimes_target_dir)
common.rpdcp(self.head, '',
os.path.join(os.path.join(topdir, 'network_shared'), 'rsptimes*csv'),
rsptimes_target_dir)

if operation == 'cleanup':
common.pdsh(self.any_client, 'rm -rf ' + topdir, continue_if_error=False).communicate()
common.pdsh(self.any_client, 'mkdir -v -m 0777 ' + topdir, continue_if_error=False).communicate()
# wait until cephfs_data pool stops decreasing
logger.info('wait for cephfs_data pool to empty')
pool_shrinking = True
old_data_objs = self.get_cephfs_data_objects()
while pool_shrinking:
time.sleep(10)
data_objs = self.get_cephfs_data_objects()
if old_data_objs == data_objs:
logger.info('pool stopped shrinking')
pool_shrinking = False
else:
logger.info('pool shrank by %d objects', old_data_objs - data_objs)
old_data_objs = data_objs

# If we were doing recovery, wait until it's done.
if 'recovery_test' in self.cluster.config:
self.cluster.wait_recovery_done()

# Finally, get the historic ops
self.cluster.dump_historic_ops(self.run_dir)
common.sync_files(self.run_dir, self.out_dir)

def recovery_callback(self):
pass

def __str__(self):
return "%s\n%s\n%s" % (self.run_dir, self.out_dir, super(Smallfile, self).__str__())
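`get_cephfs_data_objects()` above has to slice from the first `{` because pdsh prefixes every output line with the originating host before the JSON. A standalone sketch of that parse (the sample output string is fabricated for illustration):

```python
import json

# pdsh output looks like "host: <command output>", so json.loads()
# rejects it until the prefix before the first '{' is stripped.
cephdf_out = 'mon0: {"pools": [{"name": "cephfs_data", "stats": {"objects": 1234}}]}'
cephdf = json.loads(cephdf_out[cephdf_out.index('{'):])

# Same lookup the benchmark performs: object count of the data pool.
objs = next(int(p['stats']['objects'])
            for p in cephdf['pools']
            if p['name'] == 'cephfs_data')
print(objs)   # 1234
```

The benchmark polls this count every 10 seconds after a cleanup run and only proceeds once two consecutive readings match, i.e. the pool has stopped shrinking.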