Skip to content

Commit

Permalink
GH-5148 improved soft fail on corruption for values.id and values.has…
Browse files Browse the repository at this point in the history
…h files.
  • Loading branch information
hmottestad committed Oct 24, 2024
1 parent 92f4fe4 commit be2a482
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 31 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
*******************************************************************************/
package org.eclipse.rdf4j.sail.nativerdf;

import static org.eclipse.rdf4j.sail.nativerdf.NativeStore.SOFT_FAIL_ON_CORRUPT_DATA;

import java.io.IOException;

import org.eclipse.rdf4j.common.io.ByteArrayUtil;
Expand All @@ -20,6 +22,9 @@
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.sail.SailException;
import org.eclipse.rdf4j.sail.nativerdf.btree.RecordIterator;
import org.eclipse.rdf4j.sail.nativerdf.model.CorruptIRI;
import org.eclipse.rdf4j.sail.nativerdf.model.CorruptIRIOrBNode;
import org.eclipse.rdf4j.sail.nativerdf.model.CorruptUnknownValue;

/**
* A statement iterator that wraps a RecordIterator containing statement records and translates these records to
Expand Down Expand Up @@ -74,6 +79,17 @@ public Statement getNextElement() throws SailException {
if (contextID != 0) {
context = valueStore.getResource(contextID);
}
if (SOFT_FAIL_ON_CORRUPT_DATA) {
if (subj == null) {
subj = new CorruptIRIOrBNode(valueStore.getRevision(), subjID, null);
}
if (pred == null) {
pred = new CorruptIRI(valueStore.getRevision(), predID, null, null);
}
if (obj == null) {
obj = new CorruptUnknownValue(valueStore.getRevision(), objID, null);
}
}

return valueStore.createStatement(subj, pred, obj, context);
} catch (IOException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.apache.commons.io.FileUtils;
import org.eclipse.rdf4j.collection.factory.api.CollectionFactory;
import org.eclipse.rdf4j.collection.factory.mapdb.MapDb3CollectionFactory;
import org.eclipse.rdf4j.common.annotation.InternalUseOnly;
import org.eclipse.rdf4j.common.concurrent.locks.Lock;
import org.eclipse.rdf4j.common.concurrent.locks.LockManager;
import org.eclipse.rdf4j.common.io.MavenUtil;
Expand Down Expand Up @@ -62,6 +63,15 @@ public class NativeStore extends AbstractNotifyingSail implements FederatedServi

private static final String VERSION = MavenUtil.loadVersion("org.eclipse.rdf4j", "rdf4j-sail-nativerdf", "devel");

/**
 * Do not throw an exception when corrupt data is detected. Instead, try to return as much data as possible.
 * <p>
 * The initial value is read from the system property
 * {@code org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptData}: it is {@code true} only when that property equals
 * "true" (ignoring case) and {@code false} otherwise, including when the property is not set. The field is not
 * final, so it can also be assigned directly at runtime.
 */
@InternalUseOnly
public static boolean SOFT_FAIL_ON_CORRUPT_DATA = "true"
		.equalsIgnoreCase(System.getProperty("org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptData"));

private static final Cleaner REMOVE_STORES_USED_FOR_MEMORY_OVERFLOW = Cleaner.create();

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
*******************************************************************************/
package org.eclipse.rdf4j.sail.nativerdf;

import static org.eclipse.rdf4j.sail.nativerdf.NativeStore.SOFT_FAIL_ON_CORRUPT_DATA;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
Expand Down Expand Up @@ -128,12 +130,6 @@ public class ValueStore extends SimpleValueFactory {
*/
private final ConcurrentCache<String, Integer> namespaceIDCache;

/**
 * Do not throw an exception in case a value cannot be loaded, e.g. due to a corrupt value store.
 * <p>
 * The initial value is {@code true} only when the system property
 * {@code org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptData} equals "true" (ignoring case); an unset property
 * yields {@code false}.
 */
public static boolean SOFT_FAIL_ON_CORRUPT_DATA = "true"
		.equalsIgnoreCase(System.getProperty("org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptData"));

/*--------------*
* Constructors *
*--------------*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
*******************************************************************************/
package org.eclipse.rdf4j.sail.nativerdf.datastore;

import static org.eclipse.rdf4j.sail.nativerdf.NativeStore.SOFT_FAIL_ON_CORRUPT_DATA;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
Expand All @@ -18,7 +20,6 @@
import java.util.NoSuchElementException;

import org.eclipse.rdf4j.common.io.NioFile;
import org.eclipse.rdf4j.sail.nativerdf.ValueStore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -203,38 +204,48 @@ public byte[] getData(long offset) throws IOException {
(data[3]) & 0x000000ff;

// If the data length is larger than 128MB, we are likely reading the wrong data. Probably data corruption.
if (dataLength > 750 * 1024 * 1024) {
if (ValueStore.SOFT_FAIL_ON_CORRUPT_DATA) {
if (dataLength > 128 * 1024 * 1024) {
if (SOFT_FAIL_ON_CORRUPT_DATA) {
logger.error(
"Data length is {}MB which is larger than 750MB. This is likely data corruption. Truncating length to 32 MB.",
dataLength / ((1024 * 1024)));
dataLength = 32 * 1024 * 1024;
}
}

// We have either managed to read enough data and can return the required subset of the data, or we have read
// too little so we need to execute another read to get the correct data.
if (dataLength <= data.length - 4) {
try {

// adjust the approximate average with 1 part actual length and 99 parts previous average up to a sensible
// max of 200
dataLengthApproximateAverage = (int) (Math.min(200,
((dataLengthApproximateAverage / 100.0) * 99) + (dataLength / 100.0)));
// We have either managed to read enough data and can return the required subset of the data,
// or we have read too little so we need to execute another read to get the correct data.
if (dataLength <= data.length - 4) {

return Arrays.copyOfRange(data, 4, dataLength + 4);
// adjust the approximate average with 1 part actual length and 99 parts previous average,
// up to a sensible max of 200
dataLengthApproximateAverage = (int) (Math.min(200,
((dataLengthApproximateAverage / 100.0) * 99) + (dataLength / 100.0)));

} else {
return Arrays.copyOfRange(data, 4, dataLength + 4);

// adjust the approximate average, but favour the actual dataLength since dataLength predictions misses are
// costly
dataLengthApproximateAverage = Math.min(200, (dataLengthApproximateAverage + dataLength) / 2);
} else {

// we didn't read enough data so we need to execute a new read
data = new byte[dataLength];
buf = ByteBuffer.wrap(data);
nioFile.read(buf, offset + 4L);
// adjust the approximate average, but favour the actual dataLength since dataLength prediction
// misses are costly
dataLengthApproximateAverage = Math.min(200, (dataLengthApproximateAverage + dataLength) / 2);

return data;
// we didn't read enough data so we need to execute a new read
data = new byte[dataLength];
buf = ByteBuffer.wrap(data);
nioFile.read(buf, offset + 4L);

return data;
}
} catch (OutOfMemoryError e) {
if (dataLength > 128 * 1024 * 1024) {
logger.error(
"Trying to read large amounts of data may be a sign of data corruption. Consider setting the system property org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptData to true");
}
throw e;
}

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ public void testCorruptValuesHashFile() throws IOException {
private List<Statement> getStatements() {
List<Statement> list = new ArrayList<>();

ValueStore.SOFT_FAIL_ON_CORRUPT_DATA = true;
NativeStore.SOFT_FAIL_ON_CORRUPT_DATA = true;

try (RepositoryConnection conn = repo.getConnection()) {
StringWriter stringWriter = new StringWriter();
Expand All @@ -267,15 +267,12 @@ private List<Statement> getStatements() {
}
return list;
} finally {
ValueStore.SOFT_FAIL_ON_CORRUPT_DATA = false;
NativeStore.SOFT_FAIL_ON_CORRUPT_DATA = false;
}
}

/**
 * Per-test teardown: shuts down the repository and then calls {@code restoreFile} with {@code dataDir} for
 * {@code values.hash}, {@code values.id} and {@code values.dat}, in that order.
 * <p>
 * NOTE(review): presumably {@code restoreFile} puts back an intact copy of a file that a test corrupted - confirm
 * against its definition, which is not visible here.
 *
 * @throws IOException declared by this method; which of the calls can raise it is not visible here
 */
@AfterEach
public void after() throws IOException {
// the repository is shut down before any of the value-store files are touched
repo.shutDown();
restoreFile(dataDir, "values.hash");
restoreFile(dataDir, "values.id");
restoreFile(dataDir, "values.dat");
}
}

0 comments on commit be2a482

Please sign in to comment.