diff --git a/snakebite/config.py b/snakebite/config.py index 9582d74..cee0c37 100644 --- a/snakebite/config.py +++ b/snakebite/config.py @@ -1,5 +1,4 @@ import os -import sys import logging import xml.etree.ElementTree as ET @@ -11,74 +10,81 @@ class HDFSConfig(object): use_trash = False - @classmethod - def get_config_from_env(cls): - '''Gets configuration out of environment. - - Returns list of dicts - list of namenode representations - ''' - core_path = os.path.join(os.environ['HADOOP_HOME'], 'conf', 'core-site.xml') - configs = cls.read_core_config(core_path) - - hdfs_path = os.path.join(os.environ['HADOOP_HOME'], 'conf', 'hdfs-site.xml') - tmp_config = cls.read_hdfs_config(hdfs_path) - - if tmp_config: - # if config exists in hdfs - it's HA config, update configs - configs = tmp_config - - if not configs: - raise Exception("No config found in %s nor in %s" % (core_path, hdfs_path)) - - return configs - - @staticmethod def read_hadoop_config(hdfs_conf_path): + config_entries = [] if os.path.exists(hdfs_conf_path): try: tree = ET.parse(hdfs_conf_path) except: log.error("Unable to parse %s" % hdfs_conf_path) - return + return config_entries root = tree.getroot() - for p in root.findall("./property"): - yield p - + config_entries.extend(root.findall("./property")) + return config_entries @classmethod def read_core_config(cls, core_site_path): - config = [] - for property in cls.read_hadoop_config(core_site_path): + conf = cls.read_hadoop_config(core_site_path) + configs = [] + for property in conf: if property.findall('name')[0].text == 'fs.defaultFS': parse_result = urlparse(property.findall('value')[0].text) - log.debug("Got namenode '%s' from %s" % (parse_result.geturl(), core_site_path)) + configs.append(cls.get_namenode(parse_result)) - config.append({"namenode": parse_result.hostname, - "port": parse_result.port if parse_result.port - else Namenode.DEFAULT_PORT}) - - if property.findall('name')[0].text == 'fs.trash.interval': - cls.use_trash = True + cls.set_trash_mode(property) - return config + return configs @classmethod - def read_hdfs_config(cls, hdfs_site_path): + def read_hdfs_config(cls, config_path, conf, environment, environment_suffixes): configs = [] - for property in cls.read_hadoop_config(hdfs_site_path): - if property.findall('name')[0].text.startswith("dfs.namenode.rpc-address"): + for property in conf: + if cls.is_namenode_host(property, environment, environment_suffixes): parse_result = urlparse("//" + property.findall('value')[0].text) - log.debug("Got namenode '%s' from %s" % (parse_result.geturl(), hdfs_site_path)) - configs.append({"namenode": parse_result.hostname, - "port": parse_result.port if parse_result.port - else Namenode.DEFAULT_PORT}) + log.debug("Got namenode '%s' from %s" % (parse_result.geturl(), config_path)) + configs.append(cls.get_namenode(parse_result)) - if property.findall('name')[0].text == 'fs.trash.interval': - cls.use_trash = True + cls.set_trash_mode(property) return configs + @classmethod + def is_namenode_host(cls, property, environment, environment_suffixes): + name = property.findall('name')[0].text + if not environment_suffixes: + return name.startswith("dfs.namenode.rpc-address") + for suffix in environment_suffixes: + if name.startswith("dfs.namenode.rpc-address." + environment + "." + suffix): + return True + return False + + @classmethod + def get_namenode(cls, parse_result): + return { + "namenode": parse_result.hostname, + "port": parse_result.port if parse_result.port else Namenode.DEFAULT_PORT + } + + @classmethod + def set_trash_mode(cls, property): + if property.findall('name')[0].text == 'fs.trash.interval': + cls.use_trash = True + + @classmethod + def get_environment(cls, core_config): + for config in core_config: + environment_name = config['namenode'] + if environment_name: + return environment_name + + @classmethod + def get_environment_suffixes(cls, environment, hadoop_config): + for property in hadoop_config: + if property.findall('name')[0].text == 'dfs.ha.namenodes.' + environment: + return property.findall('value')[0].text.split(',') + + core_try_paths = ('/etc/hadoop/conf/core-site.xml', '/usr/local/etc/hadoop/conf/core-site.xml', '/usr/local/hadoop/conf/core-site.xml') @@ -90,19 +96,30 @@ def read_hdfs_config(cls, hdfs_site_path): @classmethod def get_external_config(cls): if os.environ.get('HADOOP_HOME'): - configs = cls.get_config_from_env() - return configs + core_paths = [os.path.join(os.environ['HADOOP_HOME'], 'conf', 'core-site.xml')] + hdfs_paths = [os.path.join(os.environ['HADOOP_HOME'], 'conf', 'hdfs-site.xml')] else: # Try to find other paths - configs = [] - for core_conf_path in cls.core_try_paths: - configs = cls.read_core_config(core_conf_path) - if configs: - break - - for hdfs_conf_path in cls.hdfs_try_paths: - tmp_config = cls.read_hdfs_config(hdfs_conf_path) - if tmp_config: - # if there is hdfs-site data available return it - return tmp_config - return configs + core_paths = cls.core_try_paths + hdfs_paths = cls.hdfs_try_paths + + configs = [] + for core_conf_path in core_paths: + configs = cls.read_core_config(core_conf_path) + if configs: + break + + environment = cls.get_environment(configs) + + for hdfs_conf_path in hdfs_paths: + hadoop_config = cls.read_hadoop_config(hdfs_conf_path) + environment_suffixes = cls.get_environment_suffixes(environment, hadoop_config) + tmp_config = cls.read_hdfs_config(hdfs_conf_path, hadoop_config, environment, environment_suffixes) + if tmp_config: + # if there is hdfs-site data available return it + return tmp_config + + if not configs: + raise Exception("No configs found") + + return configs diff --git a/test/config_test.py b/test/config_test.py index e337b9c..3aa4abb 100644 --- a/test/config_test.py +++ b/test/config_test.py @@ -31,6 +31,19 @@ def _verify_hdfs_settings(self, config): self.assertEqual('namenode2.mydomain', config[1]['namenode']) self.assertEqual(8888, config[1]['port']) + # namenodes in ha-port-hdfs-site.xml with no namespace (so all of them expected) + def _verify_hdfs_settings_all(self, config): + self.assertEquals(len(config), 3) + # assert first NN + self.assertEqual('namenode1.mydomain', config[0]['namenode']) + self.assertEqual(8888, config[0]['port']) + # assert second NN + self.assertEqual('namenode2.mydomain', config[1]['namenode']) + self.assertEqual(8888, config[1]['port']) + # assert third NN + self.assertEqual('namenode.other-domain', config[2]['namenode']) + self.assertEqual(8888, config[2]['port']) + def _verify_hdfs_noport_settings(self, config): self.assertEquals(len(config), 2) # assert first NN @@ -40,10 +53,27 @@ def _verify_hdfs_noport_settings(self, config): self.assertEqual('namenode2.mydomain', config[1]['namenode']) self.assertEqual(8020, config[1]['port']) + # namenodes in ha-port-hdfs-site.xml using namespace in ha-core-site.xml + def _verify_hdfs_port_settings(self, config): + self.assertEquals(len(config), 2) + # assert first NN + self.assertEqual('namenode1.mydomain', config[0]['namenode']) + self.assertEqual(8888, config[0]['port']) + # assert second NN + self.assertEqual('namenode2.mydomain', config[1]['namenode']) + self.assertEqual(8888, config[1]['port']) + def test_read_hdfs_config_ha(self): - hdfs_site_path = self.get_config_path('ha-port-hdfs-site.xml') - config = HDFSConfig.read_hdfs_config(hdfs_site_path) - self._verify_hdfs_settings(config) + hdfs_core_path = self.get_config_path('ha-port-hdfs-site.xml') + conf = HDFSConfig.read_hadoop_config(hdfs_core_path) + config = HDFSConfig.read_hdfs_config('', conf, '', []) + self._verify_hdfs_settings_all(config) + + def test_read_hdfs_port_config_ha(self): + hdfs_core_path = self.get_config_path('ha-port-hdfs-site.xml') + conf = HDFSConfig.read_hadoop_config(hdfs_core_path) + config = HDFSConfig.read_hdfs_config('', conf, 'testha', ['namenode1-mydomain', 'namenode2-mydomain']) + self._verify_hdfs_port_settings(config) def test_read_core_config_ha(self): core_site_path = self.get_config_path('ha-core-site.xml') diff --git a/test/testconfig/conf/ha-port-hdfs-site.xml b/test/testconfig/conf/ha-port-hdfs-site.xml index 5aa344b..654ca51 100644 --- a/test/testconfig/conf/ha-port-hdfs-site.xml +++ b/test/testconfig/conf/ha-port-hdfs-site.xml @@ -22,6 +22,11 @@ namenode2.mydomain:8888 + + dfs.namenode.rpc-address.testha.other-domain + namenode.other-domain:8888 + + dfs.namenode.http-address.testha.namenode1-mydomain namenode1.mydomain:50070