From a53a04945df9a5c00ed82a67699e6e15e51f9053 Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Tue, 22 Apr 2025 10:45:01 +0200 Subject: [PATCH 1/4] GH-721: Allow using 1GB+ data buffers in variable width vectors Allow actually reaching MAX_BUFFER_SIZE at reallocating variable width vectors instead of exceeding it calculating the next power of 2. --- .../arrow/vector/BaseVariableWidthVector.java | 7 +++++-- .../arrow/vector/BaseVariableWidthViewVector.java | 14 ++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 7b8d2cdfda..cf63bfbc48 100644 --- a/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -571,10 +571,13 @@ public void reallocDataBuffer(long desiredAllocSize) { return; } - final long newAllocationSize = CommonUtil.nextPowerOfTwo(desiredAllocSize); + final long newAllocationSize = Math.min(CommonUtil.nextPowerOfTwo(desiredAllocSize), + MAX_BUFFER_SIZE); assert newAllocationSize >= 1; - checkDataBufferSize(newAllocationSize); + if (newAllocationSize < desiredAllocSize) { + checkDataBufferSize(desiredAllocSize); + } final ArrowBuf newBuf = allocator.buffer(newAllocationSize); newBuf.setBytes(0, valueBuffer, 0, valueBuffer.capacity()); diff --git a/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java b/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java index e0e16762f2..5425f06210 100644 --- a/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java +++ b/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java @@ -550,15 +550,18 @@ public void reallocViewBuffer(long desiredAllocSize) { if (desiredAllocSize == 0) { return; } - long newAllocationSize = CommonUtil.nextPowerOfTwo(desiredAllocSize); + long newAllocationSize = Math.min(CommonUtil.nextPowerOfTwo(desiredAllocSize), MAX_BUFFER_SIZE); assert newAllocationSize >= 1; - checkDataBufferSize(newAllocationSize); // for each set operation, we have to allocate 16 bytes // here we are adjusting the desired allocation-based allocation size // to align with the 16bytes requirement. newAllocationSize = roundUpToMultipleOf16(newAllocationSize); + if (newAllocationSize < desiredAllocSize) { + checkDataBufferSize(desiredAllocSize); + } + final ArrowBuf newBuf = allocator.buffer(newAllocationSize); newBuf.setBytes(0, viewBuffer, 0, viewBuffer.capacity()); @@ -587,10 +590,13 @@ public void reallocViewDataBuffer(long desiredAllocSize) { return; } - final long newAllocationSize = CommonUtil.nextPowerOfTwo(desiredAllocSize); + final long newAllocationSize = Math.min(CommonUtil.nextPowerOfTwo(desiredAllocSize), + MAX_BUFFER_SIZE); assert newAllocationSize >= 1; - checkDataBufferSize(newAllocationSize); + if (newAllocationSize < desiredAllocSize) { + checkDataBufferSize(desiredAllocSize); + } final ArrowBuf newBuf = allocator.buffer(newAllocationSize); dataBuffers.add(newBuf); From e84f8b60a42b132d2e690613bd25732d6eb522c1 Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Tue, 22 Apr 2025 10:53:08 +0200 Subject: [PATCH 2/4] fix format issues --- .../java/org/apache/arrow/vector/BaseVariableWidthVector.java | 4 ++-- .../org/apache/arrow/vector/BaseVariableWidthViewVector.java | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index cf63bfbc48..1609e64ca5 100644 --- a/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -571,8 +571,8 @@ public void reallocDataBuffer(long desiredAllocSize) { return; } - final long newAllocationSize = Math.min(CommonUtil.nextPowerOfTwo(desiredAllocSize), - MAX_BUFFER_SIZE); + final long newAllocationSize = + Math.min(CommonUtil.nextPowerOfTwo(desiredAllocSize), MAX_BUFFER_SIZE); assert newAllocationSize >= 1; if (newAllocationSize < desiredAllocSize) { diff --git a/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java b/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java index 5425f06210..beda91dc3f 100644 --- a/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java +++ b/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java @@ -590,8 +590,8 @@ public void reallocViewDataBuffer(long desiredAllocSize) { return; } - final long newAllocationSize = Math.min(CommonUtil.nextPowerOfTwo(desiredAllocSize), - MAX_BUFFER_SIZE); + final long newAllocationSize = + Math.min(CommonUtil.nextPowerOfTwo(desiredAllocSize), MAX_BUFFER_SIZE); assert newAllocationSize >= 1; if (newAllocationSize < desiredAllocSize) { From 2f93de2f4932bca9f9b3bbdfcda3736f5b3dec0f Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Tue, 22 Apr 2025 14:53:43 +0200 Subject: [PATCH 3/4] Add unit test --- pom.xml | 4 ++-- .../org/apache/arrow/vector/TestVectorReAlloc.java | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index cb189dc8f9..e781c9d233 100644 --- a/pom.xml +++ b/pom.xml @@ -327,8 +327,8 @@ under the License. true UTC - 1048576 + which in turn can cause OOM. Using 1MB - 1byte to simulate the defaul limit of 2^31 - 1 bytes. --> + 1048575 false diff --git a/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java b/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java index f5ec42c71c..bc47150376 100644 --- a/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java +++ b/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java @@ -24,6 +24,7 @@ import java.nio.charset.StandardCharsets; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.memory.util.CommonUtil; import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.ListVector; @@ -222,6 +223,17 @@ public void testVariableAllocateAfterReAlloc() throws Exception { } } + @Test + public void testVariableReAllocAbove1GB() throws Exception { + try (final VarCharVector vector = new VarCharVector("", allocator)) { + long desiredSizeAboveLastPowerOf2 = + CommonUtil.nextPowerOfTwo(BaseVariableWidthVector.MAX_ALLOCATION_SIZE) / 2 + 1; + vector.reallocDataBuffer(desiredSizeAboveLastPowerOf2); + + assertTrue(vector.getDataBuffer().capacity() >= desiredSizeAboveLastPowerOf2); + } + } + @Test public void testLargeVariableAllocateAfterReAlloc() throws Exception { try (final LargeVarCharVector vector = new LargeVarCharVector("", allocator)) { From 7961c0a45ea8733993987e6c25203873e01458bd Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Tue, 22 Apr 2025 16:21:53 +0200 Subject: [PATCH 4/4] Fix testing issues due to round ups --- pom.xml | 4 ++-- .../java/org/apache/arrow/vector/TestValueVector.java | 2 +- .../apache/arrow/vector/util/TestVectorAppender.java | 11 ++++++++++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index e781c9d233..ab988c7d05 100644 --- a/pom.xml +++ b/pom.xml @@ -327,8 +327,8 @@ under the License. true UTC - 1048575 + which in turn can cause OOM. Using 2MB - 1byte to simulate the defaul limit of 2^31 - 1 bytes. --> + 2097151 false diff --git a/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 83e470ae25..daec331831 100644 --- a/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -95,7 +95,7 @@ public void init() { private static final byte[] STR5 = "EEE5".getBytes(utf8Charset); private static final byte[] STR6 = "FFFFF6".getBytes(utf8Charset); private static final int MAX_VALUE_COUNT = - (int) (Integer.getInteger("arrow.vector.max_allocation_bytes", Integer.MAX_VALUE) / 7); + (int) (Integer.getInteger("arrow.vector.max_allocation_bytes", Integer.MAX_VALUE) / 9); private static final int MAX_VALUE_COUNT_8BYTE = (int) (MAX_VALUE_COUNT / 2); @AfterEach diff --git a/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java b/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java index e1b3889d85..4ee9630a4d 100644 --- a/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java +++ b/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java @@ -28,6 +28,7 @@ import java.util.stream.Stream; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.memory.util.CommonUtil; import org.apache.arrow.vector.BaseValueVector; import org.apache.arrow.vector.BaseVariableWidthViewVector; import org.apache.arrow.vector.BigIntVector; @@ -309,7 +310,15 @@ public void testAppendEmptyVariableWidthVector() { @Test public void testAppendLargeAndSmallVariableVectorsWithinLimit() { - int sixteenthOfMaxAllocation = Math.toIntExact(BaseValueVector.MAX_ALLOCATION_SIZE / 16); + // Using the max power of 2 allocation size to avoid hitting the max limit at round ups + long maxPowerOfTwoAllocationSize = + CommonUtil.nextPowerOfTwo(BaseValueVector.MAX_ALLOCATION_SIZE); + if (maxPowerOfTwoAllocationSize > BaseValueVector.MAX_ALLOCATION_SIZE) { + maxPowerOfTwoAllocationSize = + CommonUtil.nextPowerOfTwo(BaseValueVector.MAX_ALLOCATION_SIZE / 2); + } + + int sixteenthOfMaxAllocation = Math.toIntExact(maxPowerOfTwoAllocationSize / 16); try (VarCharVector target = makeVarCharVec(1, sixteenthOfMaxAllocation); VarCharVector delta = makeVarCharVec(sixteenthOfMaxAllocation, 1)) { new VectorAppender(delta).visit(target, null);