From 6e342f7069f11881ef02c9f0ed36ed7a91b9c21d Mon Sep 17 00:00:00 2001 From: Simone Russo Date: Mon, 24 Nov 2014 17:28:01 +0000 Subject: [PATCH 1/3] Using multiple namespaces. Broken if not using namespaces. Need to fix unit test --- snakebite/config.py | 79 +++++++++++++++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 24 deletions(-) diff --git a/snakebite/config.py b/snakebite/config.py index 9582d74..164e567 100644 --- a/snakebite/config.py +++ b/snakebite/config.py @@ -35,50 +35,77 @@ def get_config_from_env(cls): @staticmethod def read_hadoop_config(hdfs_conf_path): + config_entries = [] if os.path.exists(hdfs_conf_path): try: tree = ET.parse(hdfs_conf_path) except: log.error("Unable to parse %s" % hdfs_conf_path) - return + return config_entries root = tree.getroot() - for p in root.findall("./property"): - yield p - + config_entries.extend(root.findall("./property")) + return config_entries @classmethod def read_core_config(cls, core_site_path): - config = [] - for property in cls.read_hadoop_config(core_site_path): + conf = cls.read_hadoop_config(core_site_path) + configs = [] + for property in conf: if property.findall('name')[0].text == 'fs.defaultFS': parse_result = urlparse(property.findall('value')[0].text) - log.debug("Got namenode '%s' from %s" % (parse_result.geturl(), core_site_path)) + configs.append(cls.get_namenode(parse_result)) - config.append({"namenode": parse_result.hostname, - "port": parse_result.port if parse_result.port - else Namenode.DEFAULT_PORT}) + cls.set_trash_mode(property) - if property.findall('name')[0].text == 'fs.trash.interval': - cls.use_trash = True - - return config + return configs @classmethod - def read_hdfs_config(cls, hdfs_site_path): + def read_hdfs_config(cls, config_path, conf, environment, environment_suffixes): configs = [] - for property in cls.read_hadoop_config(hdfs_site_path): - if property.findall('name')[0].text.startswith("dfs.namenode.rpc-address"): + for property in conf: + if cls.is_namenode_host(property, environment, environment_suffixes): parse_result = urlparse("//" + property.findall('value')[0].text) - log.debug("Got namenode '%s' from %s" % (parse_result.geturl(), hdfs_site_path)) - configs.append({"namenode": parse_result.hostname, - "port": parse_result.port if parse_result.port - else Namenode.DEFAULT_PORT}) + log.debug("Got namenode '%s' from %s" % (parse_result.geturl(), config_path)) + configs.append(cls.get_namenode(parse_result)) - if property.findall('name')[0].text == 'fs.trash.interval': - cls.use_trash = True + cls.set_trash_mode(property) return configs + @classmethod + def is_namenode_host(cls, property, environment, environment_suffixes): + name = property.findall('name')[0].text + for suffix in environment_suffixes: + if name.startswith("dfs.namenode.rpc-address." + environment + "." + suffix): + return True + return False + + @classmethod + def get_namenode(cls, parse_result): + return { + "namenode": parse_result.hostname, + "port": parse_result.port if parse_result.port else Namenode.DEFAULT_PORT + } + + @classmethod + def set_trash_mode(cls, property): + if property.findall('name')[0].text == 'fs.trash.interval': + cls.use_trash = True + + @classmethod + def get_environment(cls, core_config): + for config in core_config: + environment_name = config['namenode'] + if environment_name: + return environment_name + + @classmethod + def get_environment_suffixes(cls, environment, hadoop_config): + for property in hadoop_config: + if property.findall('name')[0].text == 'dfs.ha.namenodes.' + environment: + return property.findall('value')[0].text.split(',') + + core_try_paths = ('/etc/hadoop/conf/core-site.xml', '/usr/local/etc/hadoop/conf/core-site.xml', '/usr/local/hadoop/conf/core-site.xml') @@ -100,8 +127,12 @@ def get_external_config(cls): if configs: break + environment = cls.get_environment(configs) + for hdfs_conf_path in cls.hdfs_try_paths: - tmp_config = cls.read_hdfs_config(hdfs_conf_path) + hadoop_config = cls.read_hadoop_config(hdfs_conf_path) + environment_suffixes = cls.get_environment_suffixes(environment, hadoop_config) + tmp_config = cls.read_hdfs_config(hdfs_conf_path, hadoop_config, environment, environment_suffixes) if tmp_config: # if there is hdfs-site data available return it return tmp_config From d03444d30bc66941c4c2b6dbeeea5577e1265858 Mon Sep 17 00:00:00 2001 From: Simone Russo Date: Tue, 25 Nov 2014 10:24:01 +0000 Subject: [PATCH 2/3] Non namespace namenode config now supported as well. Minor refactoring --- snakebite/config.py | 68 ++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 41 deletions(-) diff --git a/snakebite/config.py b/snakebite/config.py index 164e567..cee0c37 100644 --- a/snakebite/config.py +++ b/snakebite/config.py @@ -1,5 +1,4 @@ import os -import sys import logging import xml.etree.ElementTree as ET @@ -11,28 +10,6 @@ class HDFSConfig(object): use_trash = False - @classmethod - def get_config_from_env(cls): - '''Gets configuration out of environment. - - Returns list of dicts - list of namenode representations - ''' - core_path = os.path.join(os.environ['HADOOP_HOME'], 'conf', 'core-site.xml') - configs = cls.read_core_config(core_path) - - hdfs_path = os.path.join(os.environ['HADOOP_HOME'], 'conf', 'hdfs-site.xml') - tmp_config = cls.read_hdfs_config(hdfs_path) - - if tmp_config: - # if config exists in hdfs - it's HA config, update configs - configs = tmp_config - - if not configs: - raise Exception("No config found in %s nor in %s" % (core_path, hdfs_path)) - - return configs - - @staticmethod def read_hadoop_config(hdfs_conf_path): config_entries = [] @@ -75,6 +52,8 @@ def read_hdfs_config(cls, config_path, conf, environment, environment_suffixes): @classmethod def is_namenode_host(cls, property, environment, environment_suffixes): name = property.findall('name')[0].text + if not environment_suffixes: + return name.startswith("dfs.namenode.rpc-address") for suffix in environment_suffixes: if name.startswith("dfs.namenode.rpc-address." + environment + "." + suffix): return True @@ -117,23 +96,30 @@ def get_environment_suffixes(cls, environment, hadoop_config): @classmethod def get_external_config(cls): if os.environ.get('HADOOP_HOME'): - configs = cls.get_config_from_env() - return configs + core_paths = [os.path.join(os.environ['HADOOP_HOME'], 'conf', 'core-site.xml')] + hdfs_paths = [os.path.join(os.environ['HADOOP_HOME'], 'conf', 'hdfs-site.xml')] else: # Try to find other paths - configs = [] - for core_conf_path in cls.core_try_paths: - configs = cls.read_core_config(core_conf_path) - if configs: - break - - environment = cls.get_environment(configs) - - for hdfs_conf_path in cls.hdfs_try_paths: - hadoop_config = cls.read_hadoop_config(hdfs_conf_path) - environment_suffixes = cls.get_environment_suffixes(environment, hadoop_config) - tmp_config = cls.read_hdfs_config(hdfs_conf_path, hadoop_config, environment, environment_suffixes) - if tmp_config: - # if there is hdfs-site data available return it - return tmp_config - return configs + core_paths = cls.core_try_paths + hdfs_paths = cls.hdfs_try_paths + + configs = [] + for core_conf_path in core_paths: + configs = cls.read_core_config(core_conf_path) + if configs: + break + + environment = cls.get_environment(configs) + + for hdfs_conf_path in hdfs_paths: + hadoop_config = cls.read_hadoop_config(hdfs_conf_path) + environment_suffixes = cls.get_environment_suffixes(environment, hadoop_config) + tmp_config = cls.read_hdfs_config(hdfs_conf_path, hadoop_config, environment, environment_suffixes) + if tmp_config: + # if there is hdfs-site data available return it + return tmp_config + + if not configs: + raise Exception("No configs found") + + return configs From 7924de864e3a4a016b8a9319ee184e819be5e46a Mon Sep 17 00:00:00 2001 From: Simone Russo Date: Tue, 25 Nov 2014 10:55:28 +0000 Subject: [PATCH 3/3] Fixed unit tests --- test/config_test.py | 36 ++++++++++++++++++++-- test/testconfig/conf/ha-port-hdfs-site.xml | 5 +++ 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/test/config_test.py b/test/config_test.py index e337b9c..3aa4abb 100644 --- a/test/config_test.py +++ b/test/config_test.py @@ -31,6 +31,19 @@ def _verify_hdfs_settings(self, config): self.assertEqual('namenode2.mydomain', config[1]['namenode']) self.assertEqual(8888, config[1]['port']) + # namenodes in ha-port-hdfs-site.xml with no namespace (so all of them expected) + def _verify_hdfs_settings_all(self, config): + self.assertEquals(len(config), 3) + # assert first NN + self.assertEqual('namenode1.mydomain', config[0]['namenode']) + self.assertEqual(8888, config[0]['port']) + # assert second NN + self.assertEqual('namenode2.mydomain', config[1]['namenode']) + self.assertEqual(8888, config[1]['port']) + # assert third NN + self.assertEqual('namenode.other-domain', config[2]['namenode']) + self.assertEqual(8888, config[2]['port']) + def _verify_hdfs_noport_settings(self, config): self.assertEquals(len(config), 2) # assert first NN @@ -40,10 +53,27 @@ def _verify_hdfs_noport_settings(self, config): self.assertEqual('namenode2.mydomain', config[1]['namenode']) self.assertEqual(8020, config[1]['port']) + # namenodes in ha-port-hdfs-site.xml using namespace in ha-core-site.xml + def _verify_hdfs_port_settings(self, config): + self.assertEquals(len(config), 2) + # assert first NN + self.assertEqual('namenode1.mydomain', config[0]['namenode']) + self.assertEqual(8888, config[0]['port']) + # assert second NN + self.assertEqual('namenode2.mydomain', config[1]['namenode']) + self.assertEqual(8888, config[1]['port']) + def test_read_hdfs_config_ha(self): - hdfs_site_path = self.get_config_path('ha-port-hdfs-site.xml') - config = HDFSConfig.read_hdfs_config(hdfs_site_path) - self._verify_hdfs_settings(config) + hdfs_core_path = self.get_config_path('ha-port-hdfs-site.xml') + conf = HDFSConfig.read_hadoop_config(hdfs_core_path) + config = HDFSConfig.read_hdfs_config('', conf, '', []) + self._verify_hdfs_settings_all(config) + + def test_read_hdfs_port_config_ha(self): + hdfs_core_path = self.get_config_path('ha-port-hdfs-site.xml') + conf = HDFSConfig.read_hadoop_config(hdfs_core_path) + config = HDFSConfig.read_hdfs_config('', conf, 'testha', ['namenode1-mydomain', 'namenode2-mydomain']) + self._verify_hdfs_port_settings(config) def test_read_core_config_ha(self): core_site_path = self.get_config_path('ha-core-site.xml') diff --git a/test/testconfig/conf/ha-port-hdfs-site.xml b/test/testconfig/conf/ha-port-hdfs-site.xml index 5aa344b..654ca51 100644 --- a/test/testconfig/conf/ha-port-hdfs-site.xml +++ b/test/testconfig/conf/ha-port-hdfs-site.xml @@ -22,6 +22,11 @@ namenode2.mydomain:8888 + + dfs.namenode.rpc-address.testha.other-domain + namenode.other-domain:8888 + + dfs.namenode.http-address.testha.namenode1-mydomain namenode1.mydomain:50070