From 51db4476056c393e41933173ea1762cb9220f45e Mon Sep 17 00:00:00 2001
From: Piotr Findeisen
Date: Tue, 26 Sep 2023 10:47:46 +0200
Subject: [PATCH] PARQUET-2354: Fix race condition in CharsetValidator

The `CharsetValidator` has a static singleton instance at
`BinaryTruncator.DEFAULT_UTF8_TRUNCATOR.validator`, so it can be
accessed from multiple threads. Before the change, all threads would
operate on a shared "dummy buffer" for decoding.
---
 .../internal/column/columnindex/BinaryTruncator.java | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryTruncator.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryTruncator.java
index 8a6f0078bd..57fbb7966b 100644
--- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryTruncator.java
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryTruncator.java
@@ -40,7 +40,7 @@ enum Validity {
   }
 
   private static class CharsetValidator {
-    private final CharBuffer dummyBuffer = CharBuffer.allocate(1024);
+    private final ThreadLocal<CharBuffer> dummyBuffer = ThreadLocal.withInitial(() -> CharBuffer.allocate(1024));
     private final CharsetDecoder decoder;
 
     CharsetValidator(Charset charset) {
@@ -50,11 +50,13 @@ private static class CharsetValidator {
     }
 
     Validity checkValidity(ByteBuffer buffer) {
+      // TODO this is currently used for UTF-8 only, so validity check could be done without copying.
+      CharBuffer charBuffer = dummyBuffer.get();
       int pos = buffer.position();
       CoderResult result = CoderResult.OVERFLOW;
       while (result.isOverflow()) {
-        dummyBuffer.clear();
-        result = decoder.decode(buffer, dummyBuffer, true);
+        charBuffer.clear();
+        result = decoder.decode(buffer, charBuffer, true);
       }
       buffer.position(pos);
       if (result.isUnderflow()) {