Skip to content

Commit d1fc878

Browse files
committed
GH-3489: Fix Binary.copy() to materialize direct ByteBuffer-backed values to heap
Why Binary.copy() is a no-op for ByteBufferBackedBinary instances when isBackingBytesReused is false, returning this instead of a heap-backed copy. When the off-heap decompression path is enabled, column readers produce ByteBufferBackedBinary values that are zero-copy views into Arena-managed direct buffers. Code that calls copy() to take ownership of the data -- such as DictionaryValuesWriter.writeBytes() -- silently receives the original object still pointing into the direct buffer. When the source PageReadStore is closed and the Arena is freed, those Binary values become dangling references, causing IllegalStateException: Already closed on subsequent access. What Override copy() in Binary.ByteBufferBackedBinary to always materialize to a heap-backed Binary via fromConstantByteArray(getBytes()) when the backing ByteBuffer is direct. Heap-backed ByteBuffer values delegate to the existing super.copy() behavior to avoid unnecessary copies. The change is in parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java.
1 parent d96c669 commit d1fc878

2 files changed

Lines changed: 147 additions & 0 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,16 @@ public Binary slice(int start, int length) {
497497
return Binary.fromConstantByteArray(getBytesUnsafe(), start, length);
498498
}
499499

500+
@Override
501+
public Binary copy() {
502+
if (value.isDirect()) {
503+
// Direct ByteBuffers may be backed by memory that can be freed independently, so always materialize to
504+
// a heap-backed copy to avoid use-after-free.
505+
return Binary.fromConstantByteArray(getBytes());
506+
}
507+
return super.copy();
508+
}
509+
500510
@Override
501511
public int hashCode() {
502512
if (value.hasArray()) {

parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import static org.junit.Assert.assertArrayEquals;
2222
import static org.junit.Assert.assertEquals;
23+
import static org.junit.Assert.assertNotSame;
2324
import static org.junit.Assert.assertSame;
2425
import static org.junit.Assert.assertTrue;
2526
import static org.junit.Assert.fail;
@@ -106,6 +107,27 @@ public BinaryAndOriginal get(byte[] bytes, boolean reused) throws Exception {
106107
}
107108
};
108109

110+
private static final BinaryFactory DIRECT_BUFFER_BF = new BinaryFactory() {
111+
@Override
112+
public BinaryAndOriginal get(byte[] bytes, boolean reused) throws Exception {
113+
ByteBuffer direct = ByteBuffer.allocateDirect(bytes.length);
114+
direct.put(bytes);
115+
direct.flip();
116+
Binary b;
117+
118+
if (reused) {
119+
b = Binary.fromReusedByteBuffer(direct);
120+
} else {
121+
b = Binary.fromConstantByteBuffer(direct);
122+
}
123+
124+
assertArrayEquals(bytes, b.getBytes());
125+
// Return the backing byte[] so tests can mutate it, though for direct buffers
126+
// there is no accessible backing array. We return a copy of the original bytes.
127+
return new BinaryAndOriginal(b, bytes);
128+
}
129+
};
130+
109131
private static final BinaryFactory STRING_BF = new BinaryFactory() {
110132
@Override
111133
public BinaryAndOriginal get(byte[] bytes, boolean reused) throws Exception {
@@ -148,6 +170,71 @@ public void testByteBufferBackedBinary() throws Exception {
148170
testBinary(BUFFER_BF, false);
149171
}
150172

173+
@Test
174+
public void testDirectByteBufferBackedBinary() throws Exception {
175+
// Direct buffers have different copy() semantics (always materializes to heap),
176+
// so we test them separately instead of using the generic testBinary flow.
177+
testSlice(DIRECT_BUFFER_BF, true);
178+
testSlice(DIRECT_BUFFER_BF, false);
179+
testDirectConstantCopy(DIRECT_BUFFER_BF);
180+
testDirectReusedCopy(DIRECT_BUFFER_BF);
181+
testSerializable(DIRECT_BUFFER_BF, true);
182+
testSerializable(DIRECT_BUFFER_BF, false);
183+
}
184+
185+
@Test
186+
public void testDirectByteBufferCopyAlwaysMaterializesToHeap() throws Exception {
187+
// For constant (non-reused) direct ByteBuffers, copy() must return a new Binary
188+
// rather than 'this', because the direct memory can be freed independently.
189+
byte[] data = testString.getBytes(UTF8);
190+
ByteBuffer direct = ByteBuffer.allocateDirect(data.length);
191+
direct.put(data);
192+
direct.flip();
193+
194+
Binary binary = Binary.fromConstantByteBuffer(direct);
195+
Binary copy = binary.copy();
196+
197+
// The copy must NOT be the same object, even though the binary is constant
198+
assertNotSame("copy() of a direct ByteBuffer-backed constant Binary must not return 'this'", binary, copy);
199+
assertArrayEquals(data, copy.getBytes());
200+
assertArrayEquals(data, copy.getBytesUnsafe());
201+
}
202+
203+
@Test
204+
public void testDirectByteBufferCopyIsIndependentOfOriginalBuffer() throws Exception {
205+
// Verify the copied Binary is independent of the original direct ByteBuffer.
206+
// Simulates the scenario where direct memory is overwritten after copy.
207+
byte[] data = testString.getBytes(UTF8);
208+
ByteBuffer direct = ByteBuffer.allocateDirect(data.length);
209+
direct.put(data);
210+
direct.flip();
211+
212+
Binary binary = Binary.fromReusedByteBuffer(direct);
213+
Binary copy = binary.copy();
214+
215+
// Overwrite the direct buffer content to simulate memory reuse / free
216+
direct.clear();
217+
for (int i = 0; i < data.length; i++) {
218+
direct.put((byte) 0);
219+
}
220+
221+
// The copy should still hold the original data
222+
assertArrayEquals(data, copy.getBytes());
223+
assertArrayEquals(data, copy.getBytesUnsafe());
224+
}
225+
226+
@Test
227+
public void testHeapByteBufferConstantCopyReturnsSame() throws Exception {
228+
// For heap-backed constant ByteBuffers, copy() should return 'this' (existing behavior)
229+
byte[] data = testString.getBytes(UTF8);
230+
ByteBuffer heap = ByteBuffer.wrap(data);
231+
232+
Binary binary = Binary.fromConstantByteBuffer(heap);
233+
Binary copy = binary.copy();
234+
235+
assertSame("copy() of a heap ByteBuffer-backed constant Binary should return 'this'", binary, copy);
236+
}
237+
151238
@Test
152239
public void testEqualityMethods() throws Exception {
153240
Binary bin1 = Binary.fromConstantByteArray("alice".getBytes(), 1, 3);
@@ -223,6 +310,56 @@ private void testReusedCopy(BinaryFactory bf) throws Exception {
223310
assertArrayEquals(testString.getBytes(UTF8), copy.copy().getBytes());
224311
}
225312

313+
/**
314+
* Tests copy() on a constant (non-reused) direct ByteBuffer-backed Binary.
315+
* Unlike heap-backed binaries, copy() must return a new object because the direct
316+
* memory can be freed independently.
317+
*/
318+
private void testDirectConstantCopy(BinaryFactory bf) throws Exception {
319+
BinaryAndOriginal bao = bf.get(testString.getBytes(UTF8), false);
320+
assertEquals(false, bao.binary.isBackingBytesReused());
321+
322+
assertArrayEquals(testString.getBytes(UTF8), bao.binary.getBytes());
323+
assertArrayEquals(testString.getBytes(UTF8), bao.binary.getBytesUnsafe());
324+
assertArrayEquals(testString.getBytes(UTF8), bao.binary.copy().getBytesUnsafe());
325+
assertArrayEquals(testString.getBytes(UTF8), bao.binary.copy().getBytes());
326+
327+
bao = bf.get(testString.getBytes(UTF8), false);
328+
assertEquals(false, bao.binary.isBackingBytesReused());
329+
330+
Binary copy = bao.binary.copy();
331+
332+
// Direct ByteBuffer-backed constant Binary.copy() must NOT return 'this'
333+
assertNotSame(copy, bao.binary);
334+
// But the data must be equal
335+
assertEquals(bao.binary, copy);
336+
}
337+
338+
/**
339+
* Tests copy() on a reused direct ByteBuffer-backed Binary.
340+
* The copy must be fully independent and survive mutation of the original buffer.
341+
*/
342+
private void testDirectReusedCopy(BinaryFactory bf) throws Exception {
343+
BinaryAndOriginal bao = bf.get(testString.getBytes(UTF8), true);
344+
assertEquals(true, bao.binary.isBackingBytesReused());
345+
346+
assertArrayEquals(testString.getBytes(UTF8), bao.binary.getBytes());
347+
assertArrayEquals(testString.getBytes(UTF8), bao.binary.getBytesUnsafe());
348+
assertArrayEquals(testString.getBytes(UTF8), bao.binary.copy().getBytesUnsafe());
349+
assertArrayEquals(testString.getBytes(UTF8), bao.binary.copy().getBytes());
350+
351+
bao = bf.get(testString.getBytes(UTF8), true);
352+
assertEquals(true, bao.binary.isBackingBytesReused());
353+
354+
Binary copy = bao.binary.copy();
355+
assertNotSame(copy, bao.binary);
356+
357+
assertArrayEquals(testString.getBytes(UTF8), copy.getBytes());
358+
assertArrayEquals(testString.getBytes(UTF8), copy.getBytesUnsafe());
359+
assertArrayEquals(testString.getBytes(UTF8), copy.copy().getBytesUnsafe());
360+
assertArrayEquals(testString.getBytes(UTF8), copy.copy().getBytes());
361+
}
362+
226363
private void testSerializable(BinaryFactory bf, boolean reused) throws Exception {
227364
BinaryAndOriginal bao = bf.get("polygon".getBytes(UTF8), reused);
228365

0 commit comments

Comments
 (0)