From b6215bb4f40cefec9d9ecf5af768eab6b57ee838 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ha=CC=8Avard=20Ottestad?= Date: Thu, 24 Oct 2024 13:17:01 +0200 Subject: [PATCH] GH-5148 improved soft fail on corruption for values.id and values.hash files. --- .../nativerdf/NativeStatementIterator.java | 16 ++++++ .../rdf4j/sail/nativerdf/NativeStore.java | 10 ++++ .../rdf4j/sail/nativerdf/ValueStore.java | 8 +-- .../sail/nativerdf/datastore/DataFile.java | 52 ++++++++++++------- .../NativeSailStoreCorruptionTest.java | 7 +-- 5 files changed, 62 insertions(+), 31 deletions(-) diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeStatementIterator.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeStatementIterator.java index b8776bdaf8..9c18795f32 100644 --- a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeStatementIterator.java +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeStatementIterator.java @@ -10,6 +10,8 @@ *******************************************************************************/ package org.eclipse.rdf4j.sail.nativerdf; +import static org.eclipse.rdf4j.sail.nativerdf.NativeStore.SOFT_FAIL_ON_CORRUPT_DATA; + import java.io.IOException; import org.eclipse.rdf4j.common.io.ByteArrayUtil; @@ -20,6 +22,9 @@ import org.eclipse.rdf4j.model.Value; import org.eclipse.rdf4j.sail.SailException; import org.eclipse.rdf4j.sail.nativerdf.btree.RecordIterator; +import org.eclipse.rdf4j.sail.nativerdf.model.CorruptIRI; +import org.eclipse.rdf4j.sail.nativerdf.model.CorruptIRIOrBNode; +import org.eclipse.rdf4j.sail.nativerdf.model.CorruptUnknownValue; /** * A statement iterator that wraps a RecordIterator containing statement records and translates these records to @@ -74,6 +79,17 @@ public Statement getNextElement() throws SailException { if (contextID != 0) { context = valueStore.getResource(contextID); } + if (SOFT_FAIL_ON_CORRUPT_DATA) { + if (subj == null) 
{ + subj = new CorruptIRIOrBNode(valueStore.getRevision(), subjID, null); + } + if (pred == null) { + pred = new CorruptIRI(valueStore.getRevision(), predID, null, null); + } + if (obj == null) { + obj = new CorruptUnknownValue(valueStore.getRevision(), objID, null); + } + } return valueStore.createStatement(subj, pred, obj, context); } catch (IOException e) { diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeStore.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeStore.java index c82bca9d4d..36149498fe 100644 --- a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeStore.java +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeStore.java @@ -24,6 +24,7 @@ import org.apache.commons.io.FileUtils; import org.eclipse.rdf4j.collection.factory.api.CollectionFactory; import org.eclipse.rdf4j.collection.factory.mapdb.MapDb3CollectionFactory; +import org.eclipse.rdf4j.common.annotation.InternalUseOnly; import org.eclipse.rdf4j.common.concurrent.locks.Lock; import org.eclipse.rdf4j.common.concurrent.locks.LockManager; import org.eclipse.rdf4j.common.io.MavenUtil; @@ -62,6 +63,15 @@ public class NativeStore extends AbstractNotifyingSail implements FederatedServi private static final String VERSION = MavenUtil.loadVersion("org.eclipse.rdf4j", "rdf4j-sail-nativerdf", "devel"); + /** + * Do not throw an exception when corrupt data is detected. Instead, try to return as much data as possible. + * + * Variable can be set through the system property org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptData. 
+ */ + @InternalUseOnly + public static boolean SOFT_FAIL_ON_CORRUPT_DATA = "true" + .equalsIgnoreCase(System.getProperty("org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptData")); + private static final Cleaner REMOVE_STORES_USED_FOR_MEMORY_OVERFLOW = Cleaner.create(); /** diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/ValueStore.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/ValueStore.java index 59dbd4ea3f..8ed2f97948 100644 --- a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/ValueStore.java +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/ValueStore.java @@ -10,6 +10,8 @@ *******************************************************************************/ package org.eclipse.rdf4j.sail.nativerdf; +import static org.eclipse.rdf4j.sail.nativerdf.NativeStore.SOFT_FAIL_ON_CORRUPT_DATA; + import java.io.File; import java.io.IOException; import java.io.UnsupportedEncodingException; @@ -128,12 +130,6 @@ public class ValueStore extends SimpleValueFactory { */ private final ConcurrentCache namespaceIDCache; - /** - * Do not throw an exception in case a value cannot be loaded, e.g. due to a corrupt value store. 
- */ - public static boolean SOFT_FAIL_ON_CORRUPT_DATA = "true" - .equalsIgnoreCase(System.getProperty("org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptData"));; - /*--------------* * Constructors * *--------------*/ diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/DataFile.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/DataFile.java index bf3fb4c92d..10d98ab3b4 100644 --- a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/DataFile.java +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/DataFile.java @@ -10,6 +10,8 @@ *******************************************************************************/ package org.eclipse.rdf4j.sail.nativerdf.datastore; +import static org.eclipse.rdf4j.sail.nativerdf.NativeStore.SOFT_FAIL_ON_CORRUPT_DATA; + import java.io.Closeable; import java.io.File; import java.io.IOException; @@ -18,7 +20,6 @@ import java.util.NoSuchElementException; import org.eclipse.rdf4j.common.io.NioFile; -import org.eclipse.rdf4j.sail.nativerdf.ValueStore; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -203,8 +204,8 @@ public byte[] getData(long offset) throws IOException { (data[3]) & 0x000000ff; - // If the data length is larger than 750MB, we are likely reading the wrong data. Probably data corruption. - if (dataLength > 750 * 1024 * 1024) { - if (ValueStore.SOFT_FAIL_ON_CORRUPT_DATA) { + // If the data length is larger than 128MB, we are likely reading the wrong data. Probably data corruption. + if (dataLength > 128 * 1024 * 1024) { + if (SOFT_FAIL_ON_CORRUPT_DATA) { logger.error( - "Data length is {}MB which is larger than 750MB. This is likely data corruption. Truncating length to 32 MB.", + "Data length is {}MB which is larger than 128MB. This is likely data corruption. Truncating length to 32 MB.", dataLength / ((1024 * 1024))); @@ -212,29 +213,40 @@ public byte[] getData(long offset) throws IOException { } } - // We have either managed to read enough data and can return the required subset of the data, or we have read - // too little so we need to execute another read to get the correct data. 
- if (dataLength <= data.length - 4) { + try { - // adjust the approximate average with 1 part actual length and 99 parts previous average up to a sensible - // max of 200 - dataLengthApproximateAverage = (int) (Math.min(200, - ((dataLengthApproximateAverage / 100.0) * 99) + (dataLength / 100.0))); + // We have either managed to read enough data and can return the required subset of the data, or we have + // read + // too little so we need to execute another read to get the correct data. + if (dataLength <= data.length - 4) { - return Arrays.copyOfRange(data, 4, dataLength + 4); + // adjust the approximate average with 1 part actual length and 99 parts previous average up to a + // sensible + // max of 200 + dataLengthApproximateAverage = (int) (Math.min(200, + ((dataLengthApproximateAverage / 100.0) * 99) + (dataLength / 100.0))); - } else { + return Arrays.copyOfRange(data, 4, dataLength + 4); - // adjust the approximate average, but favour the actual dataLength since dataLength predictions misses are - // costly - dataLengthApproximateAverage = Math.min(200, (dataLengthApproximateAverage + dataLength) / 2); + } else { - // we didn't read enough data so we need to execute a new read - data = new byte[dataLength]; - buf = ByteBuffer.wrap(data); - nioFile.read(buf, offset + 4L); + // adjust the approximate average, but favour the actual dataLength since dataLength predictions misses + // are costly + dataLengthApproximateAverage = Math.min(200, (dataLengthApproximateAverage + dataLength) / 2); - return data; + // we didn't read enough data so we need to execute a new read + data = new byte[dataLength]; + buf = ByteBuffer.wrap(data); + nioFile.read(buf, offset + 4L); + + return data; + } + } catch (OutOfMemoryError e) { + if (dataLength > 128 * 1024 * 1024) { + logger.error( + "Trying to read large amounts of data may be a sign of data corruption. 
Consider setting the system property org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptData to true"); + } + throw e; } } diff --git a/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/NativeSailStoreCorruptionTest.java b/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/NativeSailStoreCorruptionTest.java index bff8735439..14fe7e3279 100644 --- a/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/NativeSailStoreCorruptionTest.java +++ b/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/NativeSailStoreCorruptionTest.java @@ -251,7 +251,7 @@ public void testCorruptValuesHashFile() throws IOException { private List getStatements() { List list = new ArrayList<>(); - ValueStore.SOFT_FAIL_ON_CORRUPT_DATA = true; + NativeStore.SOFT_FAIL_ON_CORRUPT_DATA = true; try (RepositoryConnection conn = repo.getConnection()) { StringWriter stringWriter = new StringWriter(); @@ -267,15 +267,12 @@ private List getStatements() { } return list; } finally { - ValueStore.SOFT_FAIL_ON_CORRUPT_DATA = false; + NativeStore.SOFT_FAIL_ON_CORRUPT_DATA = false; } } @AfterEach public void after() throws IOException { repo.shutDown(); - restoreFile(dataDir, "values.hash"); - restoreFile(dataDir, "values.id"); - restoreFile(dataDir, "values.dat"); } }