diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java
index 3efa83a55cd4..53136b517e63 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java
@@ -433,7 +433,7 @@ private DatanodeDetails initializeDatanodeDetails()
     File idFile = new File(idFilePath);
     DatanodeDetails details;
     if (idFile.exists()) {
-      details = ContainerUtils.readDatanodeDetailsFrom(idFile);
+      details = ContainerUtils.readDatanodeDetailsFrom(idFile, conf);
     } else {
       // There is no datanode.id file, this might be the first time datanode
       // is started.
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java
index 33d7dc9a3249..7d16546fb69f 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java
@@ -34,7 +34,9 @@
 import java.nio.file.Paths;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
+import java.util.Collection;
 import java.util.Objects;
+import java.util.Properties;
 import java.util.UUID;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -49,10 +51,12 @@
 import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandResponseProto;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
 import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException;
+import org.apache.hadoop.hdds.utils.HddsServerUtil;
 import org.apache.hadoop.ozone.OzoneConsts;
 import org.apache.hadoop.ozone.container.common.impl.ContainerData;
 import org.apache.hadoop.ozone.container.common.impl.ContainerDataYaml;
 import org.apache.hadoop.ozone.container.common.impl.ContainerSet;
+import org.apache.hadoop.ozone.container.common.utils.StorageVolumeUtil;
 import org.apache.hadoop.ozone.container.common.volume.HddsVolume;
 import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData;
 import org.slf4j.Logger;
@@ -166,25 +170,68 @@ public static synchronized void writeDatanodeDetailsTo(
    * @return {@link DatanodeDetails}
    * @throws IOException If the id file is malformed or other I/O exceptions
    */
-  public static synchronized DatanodeDetails readDatanodeDetailsFrom(File path)
-      throws IOException {
+  public static synchronized DatanodeDetails readDatanodeDetailsFrom(
+      File path, ConfigurationSource conf) throws IOException {
     if (!path.exists()) {
       throw new IOException("Datanode ID file not found.");
     }
     try {
       return DatanodeIdYaml.readDatanodeIdFile(path);
     } catch (IOException e) {
-      LOG.warn("Error loading DatanodeDetails yaml from {}",
-          path.getAbsolutePath(), e);
-      // Try to load as protobuf before giving up
-      try (InputStream in = Files.newInputStream(path.toPath())) {
-        return DatanodeDetails.getFromProtoBuf(
-            HddsProtos.DatanodeDetailsProto.parseFrom(in));
-      } catch (IOException io) {
-        throw new IOException("Failed to parse DatanodeDetails from "
-            + path.getAbsolutePath(), io);
+      LOG.warn("Failed to read Datanode ID file as YAML. "
+          + "Attempting recovery.", e);
+      try {
+        return recoverDatanodeDetailsFromVersionFile(path, conf);
+      } catch (IOException recoveryEx) {
+        LOG.warn("Datanode ID recovery from VERSION file failed. "
+            + "Falling back to reading as Protobuf.", recoveryEx);
+        try {
+          return readDatanodeDetailsFromProto(path);
+        } catch (IOException io) {
+          throw new IOException("Failed to parse DatanodeDetails from "
+              + path.getAbsolutePath(), io);
+        }
       }
     }
   }
+
+  /**
+   * Recover DatanodeDetails from VERSION file.
+   */
+  private static DatanodeDetails recoverDatanodeDetailsFromVersionFile(
+      File path, ConfigurationSource conf) throws IOException {
+    LOG.info("Attempting to recover Datanode ID from VERSION file.");
+    String dnUuid = null;
+    Collection<String> dataNodeDirs =
+        HddsServerUtil.getDatanodeStorageDirs(conf);
+    for (String dataNodeDir : dataNodeDirs) {
+      File versionFile = new File(dataNodeDir, HddsVolume.HDDS_VOLUME_DIR + "/" + StorageVolumeUtil.VERSION_FILE);
+      if (versionFile.exists()) {
+        Properties props = DatanodeVersionFile.readFrom(versionFile);
+        dnUuid = props.getProperty(OzoneConsts.DATANODE_UUID);
+        if (dnUuid != null && !dnUuid.isEmpty()) {
+          break;
+        }
+      }
+    }
+    if (dnUuid == null) {
+      throw new IOException("Could not find a valid datanode UUID from "
+          + "any VERSION file in " + dataNodeDirs);
+    }
+    DatanodeDetails.Builder builder = DatanodeDetails.newBuilder();
+    builder.setUuid(UUID.fromString(dnUuid));
+    DatanodeDetails datanodeDetails = builder.build();
+    DatanodeIdYaml.createDatanodeIdFile(datanodeDetails, path, conf);
+    LOG.info("Successfully recovered and rewrote datanode ID file.");
+    return datanodeDetails;
+  }
+
+  private static DatanodeDetails readDatanodeDetailsFromProto(File path)
+      throws IOException {
+    try (InputStream in = Files.newInputStream(path.toPath())) {
+      return DatanodeDetails.getFromProtoBuf(
+          HddsProtos.DatanodeDetailsProto.parseFrom(in));
+    }
+  }
 
   /**
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/DatanodeIdYaml.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/DatanodeIdYaml.java
index d3fd432efef8..07bdedb4398e 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/DatanodeIdYaml.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/DatanodeIdYaml.java
@@ -87,6 +87,13 @@ public static DatanodeDetails readDatanodeIdFile(File path)
       throw new IOException("Unable to parse yaml file.", e);
     }
 
+    if (datanodeDetailsYaml == null
+        || datanodeDetailsYaml.getUuid() == null
+        || datanodeDetailsYaml.getUuid().isEmpty()) {
+      throw new IOException(
+          "Datanode ID file is empty or has null UUID: " + path.getAbsolutePath());
+    }
+
     DatanodeDetails.Builder builder = DatanodeDetails.newBuilder();
     builder.setUuid(UUID.fromString(datanodeDetailsYaml.getUuid()))
         .setIpAddress(datanodeDetailsYaml.getIpAddress())
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/StorageVolumeUtil.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/StorageVolumeUtil.java
index 5e6fe086a165..c71fc6cde6d3 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/StorageVolumeUtil.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/StorageVolumeUtil.java
@@ -43,7 +43,7 @@
  */
 public final class StorageVolumeUtil {
 
-  private static final String VERSION_FILE = "VERSION";
+  public static final String VERSION_FILE = "VERSION";
   private static final String STORAGE_ID_PREFIX = "DS-";
 
   private StorageVolumeUtil() {
diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/helpers/TestContainerUtils.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/helpers/TestContainerUtils.java
index 2a2d90ae18c2..e262e795aa66 100644
--- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/helpers/TestContainerUtils.java
+++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/helpers/TestContainerUtils.java
@@ -26,6 +26,7 @@
 import static org.apache.hadoop.ozone.container.ContainerTestHelper.getDummyCommandRequestProto;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.mockStatic;
 import static org.mockito.Mockito.when;
@@ -37,6 +38,7 @@
 import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
+import java.util.UUID;
 import org.apache.commons.lang3.RandomUtils;
 import org.apache.hadoop.hdds.HddsConfigKeys;
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
@@ -45,6 +47,7 @@
 import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandResponseProto;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
 import org.apache.hadoop.hdds.scm.ByteStringConversion;
+import org.apache.hadoop.hdds.scm.ScmConfigKeys;
 import org.apache.hadoop.ozone.common.ChunkBuffer;
 import org.apache.ratis.thirdparty.com.google.protobuf.TextFormat;
 import org.junit.jupiter.api.BeforeEach;
@@ -125,13 +128,13 @@ public void testDatanodeIDPersistent(@TempDir File tempDir) throws Exception {
       // Read should return an empty value if file doesn't exist
       File nonExistFile = new File(tempDir, "non_exist.id");
       assertThrows(IOException.class,
-          () -> ContainerUtils.readDatanodeDetailsFrom(nonExistFile));
+          () -> ContainerUtils.readDatanodeDetailsFrom(nonExistFile, conf));
 
       // Read should fail if the file is malformed
       File malformedFile = new File(tempDir, "malformed.id");
       createMalformedIDFile(malformedFile);
       assertThrows(IOException.class,
-          () -> ContainerUtils.readDatanodeDetailsFrom(malformedFile));
+          () -> ContainerUtils.readDatanodeDetailsFrom(malformedFile, conf));
 
       // Test upgrade scenario - protobuf file instead of yaml
       File protoFile = new File(tempDir, "valid-proto.id");
@@ -139,20 +142,57 @@ public void testDatanodeIDPersistent(@TempDir File tempDir) throws Exception {
         HddsProtos.DatanodeDetailsProto proto = id1.getProtoBufMessage();
         proto.writeTo(out);
       }
-      assertDetailsEquals(id1, ContainerUtils.readDatanodeDetailsFrom(protoFile));
+      assertDetailsEquals(id1, ContainerUtils.readDatanodeDetailsFrom(protoFile, conf));
 
       id1.setInitialVersion(1);
       assertWriteRead(tempDir, id1);
     }
   }
 
+  @Test
+  public void testDatanodeIdRecovery(@TempDir File tempDir) throws IOException {
+    // 1. Setup storage directory and VERSION file
+    String datanodeUuid = UUID.randomUUID().toString();
+    File storageDir = new File(tempDir, "datanode-storage");
+    assertTrue(storageDir.mkdirs());
+    conf.set(ScmConfigKeys.HDDS_DATANODE_DIR_KEY, storageDir.getAbsolutePath());
+
+    File hddsSubDir = new File(storageDir, "hdds");
+    assertTrue(hddsSubDir.mkdirs());
+    File versionFile = new File(hddsSubDir, "VERSION");
+    DatanodeVersionFile dnVersionFile = new DatanodeVersionFile(
+        "storage-id", "cluster-id", datanodeUuid, System.currentTimeMillis(), 0);
+    dnVersionFile.createVersionFile(versionFile);
+
+    // 2. Simulate a corrupted/empty datanode.id file
+    File datanodeIdFile = new File(tempDir, "datanode.id");
+    assertTrue(datanodeIdFile.createNewFile());
+
+    assertEquals(0, datanodeIdFile.length(), "Datanode ID file should be empty initially");
+
+    // 3. Call readDatanodeDetailsFrom and verify recovery
+    DatanodeDetails recoveredDetails =
+        ContainerUtils.readDatanodeDetailsFrom(datanodeIdFile, conf);
+
+    // 4. Assertions
+    // Recovered UUID matches the one in the VERSION file
+    assertEquals(datanodeUuid, recoveredDetails.getUuidString());
+
+    // datanode.id file is recreated and is not empty
+    assertTrue(datanodeIdFile.length() > 0, "Datanode ID file should have been recreated with content");
+
+    // The recreated file can be read normally and contains the correct UUID
+    DatanodeDetails finalDetails = ContainerUtils.readDatanodeDetailsFrom(datanodeIdFile, conf);
+    assertEquals(datanodeUuid, finalDetails.getUuidString());
+  }
+
   private void assertWriteRead(@TempDir File tempDir, DatanodeDetails details)
       throws IOException {
     // Write a single ID to the file and read it out
     File file = new File(tempDir, "valid-values.id");
     ContainerUtils.writeDatanodeDetailsTo(details, file, conf);
-    DatanodeDetails read = ContainerUtils.readDatanodeDetailsFrom(file);
+    DatanodeDetails read = ContainerUtils.readDatanodeDetailsFrom(file, conf);
     assertDetailsEquals(details, read);
 
     assertEquals(details.getCurrentVersion(), read.getCurrentVersion());
@@ -163,7 +203,7 @@ private void assertWriteReadWithChangedIpAddress(@TempDir File tempDir,
     // Write a single ID to the file and read it out
     File file = new File(tempDir, "valid-values.id");
     ContainerUtils.writeDatanodeDetailsTo(details, file, conf);
-    DatanodeDetails read = ContainerUtils.readDatanodeDetailsFrom(file);
+    DatanodeDetails read = ContainerUtils.readDatanodeDetailsFrom(file, conf);
     assertEquals(details.getIpAddress(), read.getIpAddress());
     read.validateDatanodeIpAddress();
     assertEquals("127.0.0.1", read.getIpAddress());
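Note on the empty-file case exercised by testDatanodeIdRecovery: the new guard in DatanodeIdYaml is what turns a zero-byte datanode.id into the IOException that the recovery path catches. SnakeYAML, which DatanodeIdYaml uses for parsing, does not throw on an empty document; it simply yields null, so without the guard the failure would only surface later as a NullPointerException in UUID.fromString(). A minimal sketch of that behavior (hypothetical class name, assuming SnakeYAML is on the classpath):

    import org.yaml.snakeyaml.Yaml;

    public final class EmptyYamlDemo {
      public static void main(String[] args) {
        Yaml yaml = new Yaml();
        // An empty document parses "successfully" ...
        Object parsed = yaml.load("");
        // ... but yields null instead of an object, which is why
        // readDatanodeIdFile must reject null or empty results explicitly.
        System.out.println(parsed); // prints: null
      }
    }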
diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/datanode/schemaupgrade/UpgradeUtils.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/datanode/schemaupgrade/UpgradeUtils.java
index 09c2480e9ef6..e447f1011543 100644
--- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/datanode/schemaupgrade/UpgradeUtils.java
+++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/datanode/schemaupgrade/UpgradeUtils.java
@@ -73,7 +73,7 @@ public static DatanodeDetails getDatanodeDetails(OzoneConfiguration conf)
     File idFile = new File(idFilePath);
     Preconditions.checkState(idFile.exists(),
         "Datanode id file: " + idFilePath + " not exists");
-    return ContainerUtils.readDatanodeDetailsFrom(idFile);
+    return ContainerUtils.readDatanodeDetailsFrom(idFile, conf);
   }
 
   public static File getVolumeUpgradeCompleteFile(HddsVolume volume) {
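Note on the recovery source: datanode.id is now treated as derived, re-creatable state, while the durable identity lives in each volume's VERSION file, a java.util.Properties-style key=value file that DatanodeVersionFile.readFrom() parses. A standalone sketch of the lookup that recoverDatanodeDetailsFromVersionFile() performs (class name and path handling are illustrative only):

    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.util.Properties;
    import java.util.UUID;

    public final class VersionFileUuidProbe {
      public static void main(String[] args) throws IOException {
        // e.g. <hdds.datanode.dir>/hdds/VERSION
        File versionFile = new File(args[0]);
        Properties props = new Properties();
        try (InputStream in = new FileInputStream(versionFile)) {
          props.load(in); // VERSION is a key=value Properties file
        }
        String dnUuid = props.getProperty("datanodeUuid"); // OzoneConsts.DATANODE_UUID
        if (dnUuid == null || dnUuid.isEmpty()) {
          throw new IOException("No datanodeUuid recorded in " + versionFile);
        }
        // UUID.fromString() validates the format before any rewrite is attempted.
        System.out.println("Recovered datanode UUID: " + UUID.fromString(dnUuid));
      }
    }

Because only the UUID is recovered, ancillary details such as ports and network location are left to be re-established on the next SCM registration, which mirrors the first-start path where datanode.id is created fresh.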