From 51525629ca0668c65fe5af67b366c7cc67107710 Mon Sep 17 00:00:00 2001 From: cchung100m Date: Thu, 11 Dec 2025 23:03:18 +0800 Subject: [PATCH 01/10] HDDS-13955: Handle empty datanode.id file gracefully --- .../hadoop/ozone/HddsDatanodeService.java | 2 +- .../common/helpers/ContainerUtils.java | 25 ++++++++++++++-- .../common/helpers/DatanodeIdYaml.java | 5 ++++ .../helpers/EmptyDatanodeIdFileException.java | 29 +++++++++++++++++++ 4 files changed, 58 insertions(+), 3 deletions(-) create mode 100644 hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/EmptyDatanodeIdFileException.java diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java index 3efa83a55cd4..53136b517e63 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java @@ -433,7 +433,7 @@ private DatanodeDetails initializeDatanodeDetails() File idFile = new File(idFilePath); DatanodeDetails details; if (idFile.exists()) { - details = ContainerUtils.readDatanodeDetailsFrom(idFile); + details = ContainerUtils.readDatanodeDetailsFrom(idFile, conf); } else { // There is no datanode.id file, this might be the first time datanode // is started. diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java index 33d7dc9a3249..7ffb20bee070 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java @@ -35,12 +35,14 @@ import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.Objects; +import java.util.Properties; import java.util.UUID; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.codec.digest.DigestUtils; import org.apache.hadoop.fs.FileAlreadyExistsException; import org.apache.hadoop.hdds.HddsConfigKeys; +import org.apache.hadoop.hdds.scm.ScmConfigKeys; import org.apache.hadoop.hdds.conf.ConfigurationSource; import org.apache.hadoop.hdds.fs.SpaceUsageSource; import org.apache.hadoop.hdds.protocol.DatanodeDetails; @@ -166,13 +168,32 @@ public static synchronized void writeDatanodeDetailsTo( * @return {@link DatanodeDetails} * @throws IOException If the id file is malformed or other I/O exceptions */ - public static synchronized DatanodeDetails readDatanodeDetailsFrom(File path) - throws IOException { + public static synchronized DatanodeDetails readDatanodeDetailsFrom( + File path, ConfigurationSource conf) throws IOException { if (!path.exists()) { throw new IOException("Datanode ID file not found."); } try { return DatanodeIdYaml.readDatanodeIdFile(path); + } catch (EmptyDatanodeIdFileException e) { + LOG.warn("Datanode ID file is empty. Recovering from VERSION file.", e); + // Get the datanode UUID from the VERSION file + String dataNodeDir = conf.get(ScmConfigKeys.HDDS_DATANODE_DIR_KEY); + if (dataNodeDir == null || dataNodeDir.isEmpty()) { + throw new IOException("hdds.datanode.dir is not configured.", e); + } + File versionFile = new File(dataNodeDir, "hdds/VERSION"); + Properties props = DatanodeVersionFile.readFrom(versionFile); + String dnUuid = props.getProperty(OzoneConsts.DATANODE_UUID); + + // Create a new DatanodeDetails with the UUID + DatanodeDetails.Builder builder = DatanodeDetails.newBuilder(); + builder.setUuid(UUID.fromString(dnUuid)); + DatanodeDetails datanodeDetails = builder.build(); + + // Write the recovered Datanode ID to the datanode.id file + DatanodeIdYaml.createDatanodeIdFile(datanodeDetails, path, conf); + return datanodeDetails; } catch (IOException e) { LOG.warn("Error loading DatanodeDetails yaml from {}", path.getAbsolutePath(), e); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/DatanodeIdYaml.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/DatanodeIdYaml.java index d3fd432efef8..d12a19c28116 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/DatanodeIdYaml.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/DatanodeIdYaml.java @@ -87,6 +87,11 @@ public static DatanodeDetails readDatanodeIdFile(File path) throw new IOException("Unable to parse yaml file.", e); } + if (datanodeDetailsYaml == null) { + throw new EmptyDatanodeIdFileException( + "Datanode ID file is empty: " + path.getAbsolutePath()); + } + DatanodeDetails.Builder builder = DatanodeDetails.newBuilder(); builder.setUuid(UUID.fromString(datanodeDetailsYaml.getUuid())) .setIpAddress(datanodeDetailsYaml.getIpAddress()) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/EmptyDatanodeIdFileException.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/EmptyDatanodeIdFileException.java new file mode 100644 index 000000000000..957f5289fbe2 --- /dev/null +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/EmptyDatanodeIdFileException.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ozone.container.common.helpers; + +import java.io.IOException; + +/** + * Exception thrown when a Datanode ID file is empty + */ +public class EmptyDatanodeIdFileException extends IOException { + public EmptyDatanodeIdFileException(String message) { + super(message); + } +} From 68904d3bc3293c8930301c22cb954ef06bf51246 Mon Sep 17 00:00:00 2001 From: cchung100m Date: Fri, 12 Dec 2025 19:12:59 +0800 Subject: [PATCH 02/10] HDDS-13955: Fix checkstyle --- .../hadoop/ozone/container/common/helpers/ContainerUtils.java | 2 +- .../container/common/helpers/EmptyDatanodeIdFileException.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java index 7ffb20bee070..2cb6e9dab45e 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java @@ -42,7 +42,6 @@ import org.apache.commons.codec.digest.DigestUtils; import org.apache.hadoop.fs.FileAlreadyExistsException; import org.apache.hadoop.hdds.HddsConfigKeys; -import org.apache.hadoop.hdds.scm.ScmConfigKeys; import org.apache.hadoop.hdds.conf.ConfigurationSource; import org.apache.hadoop.hdds.fs.SpaceUsageSource; import org.apache.hadoop.hdds.protocol.DatanodeDetails; @@ -50,6 +49,7 @@ import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandRequestProto; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandResponseProto; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; +import org.apache.hadoop.hdds.scm.ScmConfigKeys; import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException; import org.apache.hadoop.ozone.OzoneConsts; import org.apache.hadoop.ozone.container.common.impl.ContainerData; diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/EmptyDatanodeIdFileException.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/EmptyDatanodeIdFileException.java index 957f5289fbe2..71ea19adbcca 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/EmptyDatanodeIdFileException.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/EmptyDatanodeIdFileException.java @@ -20,7 +20,7 @@ import java.io.IOException; /** - * Exception thrown when a Datanode ID file is empty + * Exception thrown when a Datanode ID file is empty. */ public class EmptyDatanodeIdFileException extends IOException { public EmptyDatanodeIdFileException(String message) { From ee916ad8c9f1929cea878b9d6f66ae9b4892f88b Mon Sep 17 00:00:00 2001 From: cchung100m Date: Fri, 12 Dec 2025 19:30:07 +0800 Subject: [PATCH 03/10] HDDS-13955: Fix compile error --- .../container/common/helpers/TestContainerUtils.java | 10 +++++----- .../repair/datanode/schemaupgrade/UpgradeUtils.java | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/helpers/TestContainerUtils.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/helpers/TestContainerUtils.java index 2a2d90ae18c2..d0239974b364 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/helpers/TestContainerUtils.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/helpers/TestContainerUtils.java @@ -125,13 +125,13 @@ public void testDatanodeIDPersistent(@TempDir File tempDir) throws Exception { // Read should return an empty value if file doesn't exist File nonExistFile = new File(tempDir, "non_exist.id"); assertThrows(IOException.class, - () -> ContainerUtils.readDatanodeDetailsFrom(nonExistFile)); + () -> ContainerUtils.readDatanodeDetailsFrom(nonExistFile, conf)); // Read should fail if the file is malformed File malformedFile = new File(tempDir, "malformed.id"); createMalformedIDFile(malformedFile); assertThrows(IOException.class, - () -> ContainerUtils.readDatanodeDetailsFrom(malformedFile)); + () -> ContainerUtils.readDatanodeDetailsFrom(malformedFile, conf)); // Test upgrade scenario - protobuf file instead of yaml File protoFile = new File(tempDir, "valid-proto.id"); @@ -139,7 +139,7 @@ public void testDatanodeIDPersistent(@TempDir File tempDir) throws Exception { HddsProtos.DatanodeDetailsProto proto = id1.getProtoBufMessage(); proto.writeTo(out); } - assertDetailsEquals(id1, ContainerUtils.readDatanodeDetailsFrom(protoFile)); + assertDetailsEquals(id1, ContainerUtils.readDatanodeDetailsFrom(protoFile, conf)); id1.setInitialVersion(1); assertWriteRead(tempDir, id1); @@ -152,7 +152,7 @@ private void assertWriteRead(@TempDir File tempDir, File file = new File(tempDir, "valid-values.id"); ContainerUtils.writeDatanodeDetailsTo(details, file, conf); - DatanodeDetails read = ContainerUtils.readDatanodeDetailsFrom(file); + DatanodeDetails read = ContainerUtils.readDatanodeDetailsFrom(file, conf); assertDetailsEquals(details, read); assertEquals(details.getCurrentVersion(), read.getCurrentVersion()); @@ -163,7 +163,7 @@ private void assertWriteReadWithChangedIpAddress(@TempDir File tempDir, // Write a single ID to the file and read it out File file = new File(tempDir, "valid-values.id"); ContainerUtils.writeDatanodeDetailsTo(details, file, conf); - DatanodeDetails read = ContainerUtils.readDatanodeDetailsFrom(file); + DatanodeDetails read = ContainerUtils.readDatanodeDetailsFrom(file, conf); assertEquals(details.getIpAddress(), read.getIpAddress()); read.validateDatanodeIpAddress(); assertEquals("127.0.0.1", read.getIpAddress()); diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/datanode/schemaupgrade/UpgradeUtils.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/datanode/schemaupgrade/UpgradeUtils.java index 09c2480e9ef6..e447f1011543 100644 --- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/datanode/schemaupgrade/UpgradeUtils.java +++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/datanode/schemaupgrade/UpgradeUtils.java @@ -73,7 +73,7 @@ public static DatanodeDetails getDatanodeDetails(OzoneConfiguration conf) File idFile = new File(idFilePath); Preconditions.checkState(idFile.exists(), "Datanode id file: " + idFilePath + " not exists"); - return ContainerUtils.readDatanodeDetailsFrom(idFile); + return ContainerUtils.readDatanodeDetailsFrom(idFile, conf); } public static File getVolumeUpgradeCompleteFile(HddsVolume volume) { From 3a923cd7b03d6864fa829e75cbb33e4a7f031b2a Mon Sep 17 00:00:00 2001 From: cchung100m Date: Sun, 14 Dec 2025 14:09:04 +0800 Subject: [PATCH 04/10] HDDS-13955: Refactor by review comments --- .../common/helpers/ContainerUtils.java | 27 ++++++++++++----- .../common/helpers/DatanodeIdYaml.java | 8 +++-- .../helpers/EmptyDatanodeIdFileException.java | 29 ------------------- 3 files changed, 25 insertions(+), 39 deletions(-) delete mode 100644 hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/EmptyDatanodeIdFileException.java diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java index 2cb6e9dab45e..cc2eb9ce7f43 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java @@ -34,6 +34,7 @@ import java.nio.file.Paths; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; +import java.util.Collection; import java.util.Objects; import java.util.Properties; import java.util.UUID; @@ -49,8 +50,8 @@ import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandRequestProto; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandResponseProto; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; -import org.apache.hadoop.hdds.scm.ScmConfigKeys; import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException; +import org.apache.hadoop.hdds.utils.HddsServerUtil; import org.apache.hadoop.ozone.OzoneConsts; import org.apache.hadoop.ozone.container.common.impl.ContainerData; import org.apache.hadoop.ozone.container.common.impl.ContainerDataYaml; @@ -175,16 +176,28 @@ public static synchronized DatanodeDetails readDatanodeDetailsFrom( } try { return DatanodeIdYaml.readDatanodeIdFile(path); - } catch (EmptyDatanodeIdFileException e) { + } catch (IOException e) { LOG.warn("Datanode ID file is empty. Recovering from VERSION file.", e); // Get the datanode UUID from the VERSION file - String dataNodeDir = conf.get(ScmConfigKeys.HDDS_DATANODE_DIR_KEY); - if (dataNodeDir == null || dataNodeDir.isEmpty()) { + String dnUuid = null; + Collection dataNodeDirs = HddsServerUtil.getDatanodeStorageDirs(conf); + if (dataNodeDirs.isEmpty()) { throw new IOException("hdds.datanode.dir is not configured.", e); } - File versionFile = new File(dataNodeDir, "hdds/VERSION"); - Properties props = DatanodeVersionFile.readFrom(versionFile); - String dnUuid = props.getProperty(OzoneConsts.DATANODE_UUID); + for (String dataNodeDir : dataNodeDirs) { + File versionFile = new File(dataNodeDir, "hdds/VERSION"); + if (versionFile.exists()) { + Properties props = DatanodeVersionFile.readFrom(versionFile); + dnUuid = props.getProperty(OzoneConsts.DATANODE_UUID); + if (dnUuid != null && !dnUuid.isEmpty()) { + break; + } + } + } + if (dnUuid == null) { + throw new IOException("Conuld not find a valid datanode UUID from " + + "any VERSION file in " + dataNodeDirs, e); + } // Create a new DatanodeDetails with the UUID DatanodeDetails.Builder builder = DatanodeDetails.newBuilder(); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/DatanodeIdYaml.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/DatanodeIdYaml.java index d12a19c28116..07bdedb4398e 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/DatanodeIdYaml.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/DatanodeIdYaml.java @@ -87,9 +87,11 @@ public static DatanodeDetails readDatanodeIdFile(File path) throw new IOException("Unable to parse yaml file.", e); } - if (datanodeDetailsYaml == null) { - throw new EmptyDatanodeIdFileException( - "Datanode ID file is empty: " + path.getAbsolutePath()); + if (datanodeDetailsYaml == null + || datanodeDetailsYaml.getUuid() == null + || datanodeDetailsYaml.getUuid().isEmpty()) { + throw new IOException( + "Datanode ID file is empty or has null UUID: " + path.getAbsolutePath()); } DatanodeDetails.Builder builder = DatanodeDetails.newBuilder(); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/EmptyDatanodeIdFileException.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/EmptyDatanodeIdFileException.java deleted file mode 100644 index 71ea19adbcca..000000000000 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/EmptyDatanodeIdFileException.java +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.ozone.container.common.helpers; - -import java.io.IOException; - -/** - * Exception thrown when a Datanode ID file is empty. - */ -public class EmptyDatanodeIdFileException extends IOException { - public EmptyDatanodeIdFileException(String message) { - super(message); - } -} From 6768196e554f7b92f62999e1a3ca161742d9e3ec Mon Sep 17 00:00:00 2001 From: cchung100m Date: Sun, 14 Dec 2025 14:30:05 +0800 Subject: [PATCH 05/10] HDDS-13955: Combine the recovery handling for Datanode ID file not found --- .../common/helpers/ContainerUtils.java | 72 +++++++++---------- 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java index cc2eb9ce7f43..60f5cfc62282 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java @@ -177,46 +177,42 @@ public static synchronized DatanodeDetails readDatanodeDetailsFrom( try { return DatanodeIdYaml.readDatanodeIdFile(path); } catch (IOException e) { - LOG.warn("Datanode ID file is empty. Recovering from VERSION file.", e); - // Get the datanode UUID from the VERSION file - String dnUuid = null; - Collection dataNodeDirs = HddsServerUtil.getDatanodeStorageDirs(conf); - if (dataNodeDirs.isEmpty()) { - throw new IOException("hdds.datanode.dir is not configured.", e); - } - for (String dataNodeDir : dataNodeDirs) { - File versionFile = new File(dataNodeDir, "hdds/VERSION"); - if (versionFile.exists()) { - Properties props = DatanodeVersionFile.readFrom(versionFile); - dnUuid = props.getProperty(OzoneConsts.DATANODE_UUID); - if (dnUuid != null && !dnUuid.isEmpty()) { - break; + LOG.warn("Failed to read Datanode ID file as YAML. Reason: {}. " + + "Attempting recovery.", e.getMessage()); + try { + LOG.info("Attempting to recover Datanode ID from VERSION file."); + String dnUuid = null; + Collection dataNodeDirs = HddsServerUtil.getDatanodeStorageDirs(conf); + if (dataNodeDirs.isEmpty()) { + throw new IOException("hdds.datanode.dir is not configured."); + } + for (String dataNodeDir : dataNodeDirs) { + File versionFile = new File(dataNodeDir, "hdds/VERSION"); + if (versionFile.exists()) { + Properties props = DatanodeVersionFile.readFrom(versionFile); + dnUuid = props.getProperty(OzoneConsts.DATANODE_UUID); + if (dnUuid != null && !dnUuid.isEmpty()) { + break; + } } } - } - if (dnUuid == null) { - throw new IOException("Conuld not find a valid datanode UUID from " + - "any VERSION file in " + dataNodeDirs, e); - } - - // Create a new DatanodeDetails with the UUID - DatanodeDetails.Builder builder = DatanodeDetails.newBuilder(); - builder.setUuid(UUID.fromString(dnUuid)); - DatanodeDetails datanodeDetails = builder.build(); - - // Write the recovered Datanode ID to the datanode.id file - DatanodeIdYaml.createDatanodeIdFile(datanodeDetails, path, conf); - return datanodeDetails; - } catch (IOException e) { - LOG.warn("Error loading DatanodeDetails yaml from {}", - path.getAbsolutePath(), e); - // Try to load as protobuf before giving up - try (InputStream in = Files.newInputStream(path.toPath())) { - return DatanodeDetails.getFromProtoBuf( - HddsProtos.DatanodeDetailsProto.parseFrom(in)); - } catch (IOException io) { - throw new IOException("Failed to parse DatanodeDetails from " - + path.getAbsolutePath(), io); + if (dnUuid == null) { + throw new IOException("Could not find a valid datanode UUID from " + + "any VERSION file in " + dataNodeDirs); + } + DatanodeDetails.Builder builder = DatanodeDetails.newBuilder(); + builder.setUuid(UUID.fromString(dnUuid)); + DatanodeDetails datanodeDetails = builder.build(); + DatanodeIdYaml.createDatanodeIdFile(datanodeDetails, path, conf); + LOG.info("Successfully recovered and rewrote datanode ID file."); + return datanodeDetails; + } catch (IOException recoveryEx) { + LOG.warn("Datanode ID recovery from VERSION file failed. Reason: {}. " + + "Falling back to reading as Protobuf.", recoveryEx.getMessage()); + try (InputStream in = Files.newInputStream(path.toPath())) { + return DatanodeDetails.getFromProtoBuf( + HddsProtos.DatanodeDetailsProto.parseFrom(in)); + } } } } From 89fec250f390d7a0b87c65c0214dca2079850636 Mon Sep 17 00:00:00 2001 From: cchung100m Date: Sun, 14 Dec 2025 16:39:06 +0800 Subject: [PATCH 06/10] HDDS-13955: Add test case: testDatanodeIdRecovery --- .../common/helpers/TestContainerUtils.java | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/helpers/TestContainerUtils.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/helpers/TestContainerUtils.java index d0239974b364..e262e795aa66 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/helpers/TestContainerUtils.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/helpers/TestContainerUtils.java @@ -26,6 +26,7 @@ import static org.apache.hadoop.ozone.container.ContainerTestHelper.getDummyCommandRequestProto; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.mockStatic; import static org.mockito.Mockito.when; @@ -37,6 +38,7 @@ import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.nio.file.Files; +import java.util.UUID; import org.apache.commons.lang3.RandomUtils; import org.apache.hadoop.hdds.HddsConfigKeys; import org.apache.hadoop.hdds.conf.OzoneConfiguration; @@ -45,6 +47,7 @@ import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandResponseProto; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.scm.ByteStringConversion; +import org.apache.hadoop.hdds.scm.ScmConfigKeys; import org.apache.hadoop.ozone.common.ChunkBuffer; import org.apache.ratis.thirdparty.com.google.protobuf.TextFormat; import org.junit.jupiter.api.BeforeEach; @@ -146,6 +149,43 @@ public void testDatanodeIDPersistent(@TempDir File tempDir) throws Exception { } } + @Test + public void testDatanodeIdRecovery(@TempDir File tempDir) throws IOException { + // 1. Setup storage directory and VERSION file + String datanodeUuid = UUID.randomUUID().toString(); + File storageDir = new File(tempDir, "datanode-storage"); + assertTrue(storageDir.mkdirs()); + conf.set(ScmConfigKeys.HDDS_DATANODE_DIR_KEY, storageDir.getAbsolutePath()); + + File hddsSubDir = new File(storageDir, "hdds"); + assertTrue(hddsSubDir.mkdirs()); + File versionFile = new File(hddsSubDir, "VERSION"); + DatanodeVersionFile dnVersionFile = new DatanodeVersionFile( + "storage-id", "cluster-id", datanodeUuid, System.currentTimeMillis(), 0); + dnVersionFile.createVersionFile(versionFile); + + // 2. Simulate a corrupted/empty datanode.id file + File datanodeIdFile = new File(tempDir, "datanode.id"); + assertTrue(datanodeIdFile.createNewFile()); + + assertEquals(0, datanodeIdFile.length(), "Datanode ID file should be empty initially"); + + // 3. Call readDatanodeDetailsFrom and verify recovery + DatanodeDetails recoveredDetails = + ContainerUtils.readDatanodeDetailsFrom(datanodeIdFile, conf); + + // 4. Assertions + // Recovered UUID matches the one in the VERSION file + assertEquals(datanodeUuid, recoveredDetails.getUuidString()); + + // datanode.id file is recreated and is not empty + assertTrue(datanodeIdFile.length() > 0, "Datanode ID file should have been recreated with content"); + + // The recreated file can be read normally and contains the correct UUID + DatanodeDetails finalDetails = ContainerUtils.readDatanodeDetailsFrom(datanodeIdFile, conf); + assertEquals(datanodeUuid, finalDetails.getUuidString()); + } + private void assertWriteRead(@TempDir File tempDir, DatanodeDetails details) throws IOException { // Write a single ID to the file and read it out From 61129827d43313b645dcaf23638fbd0b592e484a Mon Sep 17 00:00:00 2001 From: cchung100m Date: Tue, 16 Dec 2025 18:46:23 +0800 Subject: [PATCH 07/10] HDDS-13955: Add IOException to handle DatanodeDetails.getFromProtoBuf --- .../hadoop/ozone/container/common/helpers/ContainerUtils.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java index 60f5cfc62282..8b98b15c57d6 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java @@ -212,6 +212,9 @@ public static synchronized DatanodeDetails readDatanodeDetailsFrom( try (InputStream in = Files.newInputStream(path.toPath())) { return DatanodeDetails.getFromProtoBuf( HddsProtos.DatanodeDetailsProto.parseFrom(in)); + } catch (IOException io) { + throw new IOException("Failed to parse DatanodeDetails from " + + path.getAbsolutePath(), io); } } } From 46afa5bd2e02328bba9eb383a18ac8592949dd86 Mon Sep 17 00:00:00 2001 From: cchung100m Date: Sat, 20 Dec 2025 22:02:42 +0800 Subject: [PATCH 08/10] HDDS-13955: refactor the recovery code into separate methods --- .../common/helpers/ContainerUtils.java | 74 +++++++++++-------- 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java index 8b98b15c57d6..b5d0b6e09381 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java @@ -180,38 +180,12 @@ public static synchronized DatanodeDetails readDatanodeDetailsFrom( LOG.warn("Failed to read Datanode ID file as YAML. Reason: {}. " + "Attempting recovery.", e.getMessage()); try { - LOG.info("Attempting to recover Datanode ID from VERSION file."); - String dnUuid = null; - Collection dataNodeDirs = HddsServerUtil.getDatanodeStorageDirs(conf); - if (dataNodeDirs.isEmpty()) { - throw new IOException("hdds.datanode.dir is not configured."); - } - for (String dataNodeDir : dataNodeDirs) { - File versionFile = new File(dataNodeDir, "hdds/VERSION"); - if (versionFile.exists()) { - Properties props = DatanodeVersionFile.readFrom(versionFile); - dnUuid = props.getProperty(OzoneConsts.DATANODE_UUID); - if (dnUuid != null && !dnUuid.isEmpty()) { - break; - } - } - } - if (dnUuid == null) { - throw new IOException("Could not find a valid datanode UUID from " + - "any VERSION file in " + dataNodeDirs); - } - DatanodeDetails.Builder builder = DatanodeDetails.newBuilder(); - builder.setUuid(UUID.fromString(dnUuid)); - DatanodeDetails datanodeDetails = builder.build(); - DatanodeIdYaml.createDatanodeIdFile(datanodeDetails, path, conf); - LOG.info("Successfully recovered and rewrote datanode ID file."); - return datanodeDetails; + return recoverDatanodeDetailsFromVersionFile(path, conf); } catch (IOException recoveryEx) { LOG.warn("Datanode ID recovery from VERSION file failed. Reason: {}. " + "Falling back to reading as Protobuf.", recoveryEx.getMessage()); - try (InputStream in = Files.newInputStream(path.toPath())) { - return DatanodeDetails.getFromProtoBuf( - HddsProtos.DatanodeDetailsProto.parseFrom(in)); + try { + return readDatanodeDetailsFromProto(path); } catch (IOException io) { throw new IOException("Failed to parse DatanodeDetails from " + path.getAbsolutePath(), io); @@ -220,6 +194,48 @@ public static synchronized DatanodeDetails readDatanodeDetailsFrom( } } + /** + * Recover DatanodeDetails from VERSION file. + */ + private static DatanodeDetails recoverDatanodeDetailsFromVersionFile( + File path, ConfigurationSource conf) throws IOException { + LOG.info("Attempting to recover Datanode ID from VERSION file."); + String dnUuid = null; + Collection dataNodeDirs = + HddsServerUtil.getDatanodeStorageDirs(conf); + if (dataNodeDirs.isEmpty()) { + throw new IOException("hdds.datanode.dir is not configured."); + } + for (String dataNodeDir : dataNodeDirs) { + File versionFile = new File(dataNodeDir, "hdds/VERSION"); + if (versionFile.exists()) { + Properties props = DatanodeVersionFile.readFrom(versionFile); + dnUuid = props.getProperty(OzoneConsts.DATANODE_UUID); + if (dnUuid != null && !dnUuid.isEmpty()) { + break; + } + } + } + if (dnUuid == null) { + throw new IOException("Could not find a valid datanode UUID from " + + "any VERSION file in " + dataNodeDirs); + } + DatanodeDetails.Builder builder = DatanodeDetails.newBuilder(); + builder.setUuid(UUID.fromString(dnUuid)); + DatanodeDetails datanodeDetails = builder.build(); + DatanodeIdYaml.createDatanodeIdFile(datanodeDetails, path, conf); + LOG.info("Successfully recovered and rewrote datanode ID file."); + return datanodeDetails; + } + + private static DatanodeDetails readDatanodeDetailsFromProto(File path) + throws IOException { + try (InputStream in = Files.newInputStream(path.toPath())) { + return DatanodeDetails.getFromProtoBuf( + HddsProtos.DatanodeDetailsProto.parseFrom(in)); + } + } + /** * Verify that the checksum stored in containerData is equal to the * computed checksum. From 6ad2e09b17b6f3d1783446bd5c7420b1dace8a4a Mon Sep 17 00:00:00 2001 From: cchung100m Date: Tue, 30 Dec 2025 23:11:26 +0800 Subject: [PATCH 09/10] Remove the unnecessary HddsServerUtil.getDatanodeStorageDirs check --- .../container/common/helpers/ContainerUtils.java | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java index b5d0b6e09381..9dfcbc77f5e5 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java @@ -70,6 +70,8 @@ public final class ContainerUtils { private static final Logger LOG = LoggerFactory.getLogger(ContainerUtils.class); + private static final String VERSION_FILE = "VERSION"; + private ContainerUtils() { //never constructed. } @@ -177,13 +179,13 @@ public static synchronized DatanodeDetails readDatanodeDetailsFrom( try { return DatanodeIdYaml.readDatanodeIdFile(path); } catch (IOException e) { - LOG.warn("Failed to read Datanode ID file as YAML. Reason: {}. " + - "Attempting recovery.", e.getMessage()); + LOG.warn("Failed to read Datanode ID file as YAML. " + + "Attempting recovery.", e); try { return recoverDatanodeDetailsFromVersionFile(path, conf); } catch (IOException recoveryEx) { - LOG.warn("Datanode ID recovery from VERSION file failed. Reason: {}. " + - "Falling back to reading as Protobuf.", recoveryEx.getMessage()); + LOG.warn("Datanode ID recovery from VERSION file failed. " + + "Falling back to reading as Protobuf.", recoveryEx); try { return readDatanodeDetailsFromProto(path); } catch (IOException io) { @@ -203,11 +205,8 @@ private static DatanodeDetails recoverDatanodeDetailsFromVersionFile( String dnUuid = null; Collection dataNodeDirs = HddsServerUtil.getDatanodeStorageDirs(conf); - if (dataNodeDirs.isEmpty()) { - throw new IOException("hdds.datanode.dir is not configured."); - } for (String dataNodeDir : dataNodeDirs) { - File versionFile = new File(dataNodeDir, "hdds/VERSION"); + File versionFile = new File(dataNodeDir, HddsVolume.HDDS_VOLUME_DIR + "/" + VERSION_FILE); if (versionFile.exists()) { Properties props = DatanodeVersionFile.readFrom(versionFile); dnUuid = props.getProperty(OzoneConsts.DATANODE_UUID); From fe899ba3d2bc3966833528db8990b535157f3967 Mon Sep 17 00:00:00 2001 From: cchung100m Date: Wed, 31 Dec 2025 19:10:08 +0800 Subject: [PATCH 10/10] Refactor the VERSION_FILE --- .../ozone/container/common/helpers/ContainerUtils.java | 5 ++--- .../ozone/container/common/utils/StorageVolumeUtil.java | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java index 9dfcbc77f5e5..7d16546fb69f 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java @@ -56,6 +56,7 @@ import org.apache.hadoop.ozone.container.common.impl.ContainerData; import org.apache.hadoop.ozone.container.common.impl.ContainerDataYaml; import org.apache.hadoop.ozone.container.common.impl.ContainerSet; +import org.apache.hadoop.ozone.container.common.utils.StorageVolumeUtil; import org.apache.hadoop.ozone.container.common.volume.HddsVolume; import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData; import org.slf4j.Logger; @@ -70,8 +71,6 @@ public final class ContainerUtils { private static final Logger LOG = LoggerFactory.getLogger(ContainerUtils.class); - private static final String VERSION_FILE = "VERSION"; - private ContainerUtils() { //never constructed. } @@ -206,7 +205,7 @@ private static DatanodeDetails recoverDatanodeDetailsFromVersionFile( Collection dataNodeDirs = HddsServerUtil.getDatanodeStorageDirs(conf); for (String dataNodeDir : dataNodeDirs) { - File versionFile = new File(dataNodeDir, HddsVolume.HDDS_VOLUME_DIR + "/" + VERSION_FILE); + File versionFile = new File(dataNodeDir, HddsVolume.HDDS_VOLUME_DIR + "/" + StorageVolumeUtil.VERSION_FILE); if (versionFile.exists()) { Properties props = DatanodeVersionFile.readFrom(versionFile); dnUuid = props.getProperty(OzoneConsts.DATANODE_UUID); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/StorageVolumeUtil.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/StorageVolumeUtil.java index 5e6fe086a165..c71fc6cde6d3 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/StorageVolumeUtil.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/StorageVolumeUtil.java @@ -43,7 +43,7 @@ */ public final class StorageVolumeUtil { - private static final String VERSION_FILE = "VERSION"; + public static final String VERSION_FILE = "VERSION"; private static final String STORAGE_ID_PREFIX = "DS-"; private StorageVolumeUtil() {