Skip to content

Commit

Permalink
Merge branch 'master' into HDDS-8968
Browse files Browse the repository at this point in the history
  • Loading branch information
Sadanand Shenoy committed Jul 12, 2023
2 parents d516343 + 92b49eb commit a94ab27
Show file tree
Hide file tree
Showing 89 changed files with 2,089 additions and 452 deletions.
17 changes: 17 additions & 0 deletions hadoop-hdds/common/src/main/resources/ozone-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2021,6 +2021,14 @@
This fallback approach is not recommended for production environments.
</description>
</property>
<property>
<name>ozone.om.ratis.snapshot.max.total.sst.size</name>
<value>100000000</value>
<tag>OZONE, OM, RATIS</tag>
<description>
Max size of SST files in OM Ratis Snapshot tarball.
</description>
</property>
<property>
<name>ozone.om.snapshot.provider.socket.timeout</name>
<value>5000s</value>
Expand Down Expand Up @@ -3961,6 +3969,15 @@
</description>
</property>

<property>
<name>ozone.om.snapshot.diff.disable.native.libs</name>
<value>false</value>
<tag>OZONE, OM</tag>
<description>
Flag to perform snapshot diff without using native libs (can be slow).
</description>
</property>

<property>
<name>ozone.om.snapshot.diff.max.page.size</name>
<value>1000</value>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ public abstract class ContainerData {

private boolean isEmpty;

private int replicaIndex;

/** Timestamp of last data scan (milliseconds since Unix Epoch).
* {@code null} if not yet scanned (or timestamp not recorded,
* eg. in prior versions). */
Expand Down Expand Up @@ -164,6 +166,7 @@ protected ContainerData(ContainerData source) {
this(source.getContainerType(), source.getContainerID(),
source.getLayoutVersion(), source.getMaxSize(),
source.getOriginPipelineId(), source.getOriginNodeId());
replicaIndex = source.replicaIndex;
}

/**
Expand Down Expand Up @@ -196,6 +199,14 @@ public synchronized ContainerDataProto.State getState() {
return state;
}

/**
 * Returns the replica index currently recorded for this container data.
 *
 * @return the value of the {@code replicaIndex} field
 */
public int getReplicaIndex() {
return replicaIndex;
}

/**
 * Records the replica index for this container data, overwriting any
 * previously stored value.
 *
 * @param replicaIndex the replica index to store
 */
public void setReplicaIndex(int replicaIndex) {
this.replicaIndex = replicaIndex;
}

/**
* Set the state of the container.
* @param state
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException;
import org.apache.hadoop.ozone.container.common.interfaces.Container;
import org.apache.hadoop.ozone.container.common.statemachine.StateContext;
import org.apache.hadoop.ozone.container.common.utils.ContainerLogger;
import org.apache.hadoop.ozone.container.common.volume.HddsVolume;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -194,13 +195,14 @@ public void handleVolumeFailures(StateContext context) {
AtomicBoolean failedVolume = new AtomicBoolean(false);
AtomicInteger containerCount = new AtomicInteger(0);
containerMap.values().forEach(c -> {
if (c.getContainerData().getVolume().isFailed()) {
removeContainer(c.getContainerData().getContainerID());
ContainerData data = c.getContainerData();
if (data.getVolume().isFailed()) {
removeContainer(data.getContainerID());
LOG.debug("Removing Container {} as the Volume {} " +
"has failed", c.getContainerData().getContainerID(),
c.getContainerData().getVolume());
"has failed", data.getContainerID(), data.getVolume());
failedVolume.set(true);
containerCount.incrementAndGet();
ContainerLogger.logLost(data, "Volume failure");
}
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,15 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.Map;
import java.util.Optional;
import java.util.Set;

import static org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.malformedRequest;
import static org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.unsupportedRequest;
import static org.apache.hadoop.ozone.container.common.interfaces.Container.ScanResult;

/**
* Ozone Container dispatcher takes a call from the netty server and routes it
Expand Down Expand Up @@ -359,8 +361,12 @@ private ContainerCommandResponseProto dispatchRequest(
|| containerState == State.RECOVERING);
// mark and persist the container state to be unhealthy
try {
// TODO HDDS-7096: Use on demand scanning here instead.
handler.markContainerUnhealthy(container);
// TODO HDDS-7096 + HDDS-8781: Use on demand scanning for the open
// container instead.
handler.markContainerUnhealthy(container,
ScanResult.unhealthy(ScanResult.FailureType.WRITE_FAILURE,
new File(container.getContainerData().getContainerPath()),
new StorageContainerException(result)));
LOG.info("Marked Container UNHEALTHY, ContainerID: {}", containerID);
} catch (IOException ioe) {
// just log the error here in case marking the container fails,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,65 @@
* Interface for Container Operations.
*/
public interface Container<CONTAINERDATA extends ContainerData> extends RwLock {
/**
 * Outcome of a container scan. A result is either healthy, or unhealthy
 * together with the failure category, the file that failed and the
 * exception that was raised. All fields are final; instances are created
 * only through {@link #healthy()} and
 * {@link #unhealthy(FailureType, File, Throwable)}.
 */
class ScanResult {
  /**
   * Categories describing why a scan failed and why the container should
   * therefore be marked unhealthy.
   */
  public enum FailureType {
    MISSING_CONTAINER_DIR,
    MISSING_METADATA_DIR,
    MISSING_CONTAINER_FILE,
    MISSING_CHUNKS_DIR,
    MISSING_CHUNK_FILE,
    CORRUPT_CONTAINER_FILE,
    CORRUPT_CHUNK,
    INCONSISTENT_CHUNK_LENGTH,
    INACCESSIBLE_DB,
    WRITE_FAILURE
  }

  private final FailureType failureType;
  private final File unhealthyFile;
  private final Throwable exception;
  private final boolean healthy;

  /**
   * Use the static factories instead of calling this directly.
   */
  private ScanResult(boolean isHealthy, FailureType type, File file,
      Throwable cause) {
    failureType = type;
    unhealthyFile = file;
    exception = cause;
    healthy = isHealthy;
  }

  /**
   * Creates a result for a scan that found no problems.
   *
   * @return a healthy result whose failure type, file and exception are
   * all {@code null}
   */
  public static ScanResult healthy() {
    return new ScanResult(true, null, null, null);
  }

  /**
   * Creates a result for a scan that detected a problem.
   *
   * @param type category of the failure
   * @param failingFile file associated with the failure
   * @param exception exception associated with the failure
   * @return an unhealthy result carrying the given details
   */
  public static ScanResult unhealthy(FailureType type, File failingFile,
      Throwable exception) {
    return new ScanResult(false, type, failingFile, exception);
  }

  /**
   * @return {@code true} if this result was created by {@link #healthy()}
   */
  public boolean isHealthy() {
    return healthy;
  }

  /**
   * @return the file given to the unhealthy factory, or {@code null} for
   * a healthy result
   */
  public File getUnhealthyFile() {
    return unhealthyFile;
  }

  /**
   * @return the failure category, or {@code null} for a healthy result
   */
  public FailureType getFailureType() {
    return failureType;
  }

  /**
   * @return the exception given to the unhealthy factory, or {@code null}
   * for a healthy result
   */
  public Throwable getException() {
    return exception;
  }
}

/**
* Creates a container.
Expand Down Expand Up @@ -174,7 +233,7 @@ ContainerReplicaProto getContainerReport()
* @return a {@link ScanResult} describing whether the integrity checks pass
* Scan the container metadata to detect corruption.
*/
boolean scanMetaData() throws InterruptedException;
ScanResult scanMetaData() throws InterruptedException;

/**
* Return if the container data should be checksum verified to detect
Expand All @@ -195,6 +254,6 @@ ContainerReplicaProto getContainerReport()
* false otherwise
* @throws InterruptedException if the scan is interrupted.
*/
boolean scanData(DataTransferThrottler throttler, Canceler canceler)
ScanResult scanData(DataTransferThrottler throttler, Canceler canceler)
throws InterruptedException;
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
import org.apache.hadoop.ozone.container.keyvalue.TarContainerPacker;
import org.apache.ratis.statemachine.StateMachine;

import static org.apache.hadoop.ozone.container.common.interfaces.Container.ScanResult;

/**
* Dispatcher sends ContainerCommandRequests to Handler. Each Container Type
* should have an implementation for Handler.
Expand Down Expand Up @@ -151,18 +153,22 @@ public abstract void markContainerForClose(Container container)
* Marks the container Unhealthy. Moves the container to UNHEALTHY state.
*
* @param container container to update
* @param reason The reason the container was marked unhealthy
* @throws IOException in case of exception
*/
public abstract void markContainerUnhealthy(Container container)
public abstract void markContainerUnhealthy(Container container,
ScanResult reason)
throws IOException;

/**
* Moves the Container to QUASI_CLOSED state.
*
* @param container container to be quasi closed
* @param reason The reason the container was quasi closed, for logging
* purposes.
* @throws IOException
*/
public abstract void quasiCloseContainer(Container container)
public abstract void quasiCloseContainer(Container container, String reason)
throws IOException;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ public void handle(SCMCommand command, OzoneContainer ozoneContainer,
// are moved to CLOSED immediately rather than going to quasi-closed.
controller.closeContainer(containerId);
} else {
controller.quasiCloseContainer(containerId);
controller.quasiCloseContainer(containerId,
"Ratis pipeline does not exist");
LOG.info("Marking Container {} quasi closed", containerId);
}
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1072,7 +1072,8 @@ public void notifyGroupRemove() {
for (Long cid : container2BCSIDMap.keySet()) {
try {
containerController.markContainerForClose(cid);
containerController.quasiCloseContainer(cid);
containerController.quasiCloseContainer(cid,
"Ratis group removed");
} catch (IOException e) {
LOG.debug("Failed to quasi-close container {}", cid);
}
Expand Down
Loading

0 comments on commit a94ab27

Please sign in to comment.