Skip to content

Commit

Permalink
apacheGH-37939: [C++] Use signed arithmetic for frame of reference wh…
Browse files Browse the repository at this point in the history
…en encoding DELTA_BINARY_PACKED (apache#37940)

Closes apache#37939. 

### What changes are included in this PR?

This PR changes values used in the `DELTA_BINARY_PACKED` encoder to signed types. To gracefully handle overflow, arithmetic is still performed in the unsigned domain, but other operations such as computing the min and max deltas are done in the signed domain.

Using signed types ensures the optimal number of bits is used when encoding the deltas, which was not the case before if any negative deltas were encountered (which is obviously common).

### Are these changes tested?
I've included two tests that result in overflow.

### Are there any user-facing changes?
No

* Closes: apache#37939

Authored-by: seidl <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
etseidl authored Oct 3, 2023
1 parent 92de9a3 commit 5514b22
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 12 deletions.
27 changes: 15 additions & 12 deletions cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ using arrow::VisitNullBitmapInline;
using arrow::internal::AddWithOverflow;
using arrow::internal::checked_cast;
using arrow::internal::MultiplyWithOverflow;
using arrow::internal::SafeSignedSubtract;
using arrow::internal::SubtractWithOverflow;
using arrow::util::SafeLoad;
using arrow::util::SafeLoadAs;
Expand Down Expand Up @@ -2189,9 +2190,9 @@ class DeltaBitPackEncoder : public EncoderImpl, virtual public TypedEncoder<DTyp
const uint32_t values_per_mini_block_;
uint32_t values_current_block_{0};
uint32_t total_value_count_{0};
UT first_value_{0};
UT current_value_{0};
ArrowPoolVector<UT> deltas_;
T first_value_{0};
T current_value_{0};
ArrowPoolVector<T> deltas_;
std::shared_ptr<ResizableBuffer> bits_buffer_;
::arrow::BufferBuilder sink_;
::arrow::bit_util::BitWriter bit_writer_;
Expand All @@ -2212,12 +2213,12 @@ void DeltaBitPackEncoder<DType>::Put(const T* src, int num_values) {
total_value_count_ += num_values;

while (idx < num_values) {
UT value = static_cast<UT>(src[idx]);
T value = src[idx];
// Calculate deltas. The possible overflow is handled by use of unsigned integers
// making subtraction operations well-defined and correct even in case of overflow.
// Encoded integers will wrap back around on decoding.
// See http://en.wikipedia.org/wiki/Modular_arithmetic#Integers_modulo_n
deltas_[values_current_block_] = value - current_value_;
deltas_[values_current_block_] = SafeSignedSubtract(value, current_value_);
current_value_ = value;
idx++;
values_current_block_++;
Expand All @@ -2233,9 +2234,11 @@ void DeltaBitPackEncoder<DType>::FlushBlock() {
return;
}

const UT min_delta =
// Calculate the frame of reference for this miniblock. This value will be subtracted
// from all deltas to guarantee all deltas are positive for encoding.
const T min_delta =
*std::min_element(deltas_.begin(), deltas_.begin() + values_current_block_);
bit_writer_.PutZigZagVlqInt(static_cast<T>(min_delta));
bit_writer_.PutZigZagVlqInt(min_delta);

// Call to GetNextBytePtr reserves mini_blocks_per_block_ bytes of space to write
// bit widths of miniblocks as they become known during the encoding.
Expand All @@ -2250,17 +2253,17 @@ void DeltaBitPackEncoder<DType>::FlushBlock() {
std::min(values_per_mini_block_, values_current_block_);

const uint32_t start = i * values_per_mini_block_;
const UT max_delta = *std::max_element(
const T max_delta = *std::max_element(
deltas_.begin() + start, deltas_.begin() + start + values_current_mini_block);

// The minimum number of bits required to write any of values in deltas_ vector.
// See overflow comment above.
const auto bit_width = bit_width_data[i] =
bit_util::NumRequiredBits(max_delta - min_delta);
const auto bit_width = bit_width_data[i] = bit_util::NumRequiredBits(
static_cast<UT>(max_delta) - static_cast<UT>(min_delta));

for (uint32_t j = start; j < start + values_current_mini_block; j++) {
// See overflow comment above.
const UT value = deltas_[j] - min_delta;
// Convert delta to frame of reference. See overflow comment above.
const UT value = static_cast<UT>(deltas_[j]) - static_cast<UT>(min_delta);
bit_writer_.PutValue(value, bit_width);
}
// If there are not enough values to fill the last mini block, we pad the mini block
Expand Down
64 changes: 64 additions & 0 deletions cpp/src/parquet/encoding_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1634,6 +1634,70 @@ TYPED_TEST(TestDeltaBitPackEncoding, NonZeroPaddedMiniblockBitWidth) {
}
}

// Test that the DELTA_BINARY_PACKED encoding works properply in the presence of values
// that will cause integer overflow (see GH-37939).
TYPED_TEST(TestDeltaBitPackEncoding, DeltaBitPackedWrapping) {
using T = typename TypeParam::c_type;

// Values that should wrap when converted to deltas, and then when converted to the
// frame of reference.
std::vector<T> int_values = {std::numeric_limits<T>::min(),
std::numeric_limits<T>::max(),
std::numeric_limits<T>::min(),
std::numeric_limits<T>::max(),
0,
-1,
0,
1,
-1,
1};
const int num_values = static_cast<int>(int_values.size());

auto const encoder = MakeTypedEncoder<TypeParam>(
Encoding::DELTA_BINARY_PACKED, /*use_dictionary=*/false, this->descr_.get());
encoder->Put(int_values, num_values);
auto const encoded = encoder->FlushValues();

auto const decoder =
MakeTypedDecoder<TypeParam>(Encoding::DELTA_BINARY_PACKED, this->descr_.get());

std::vector<T> decoded(num_values);
decoder->SetData(num_values, encoded->data(), static_cast<int>(encoded->size()));

const int values_decoded = decoder->Decode(decoded.data(), num_values);

ASSERT_EQ(num_values, values_decoded);
ASSERT_NO_FATAL_FAILURE(
VerifyResults<T>(decoded.data(), int_values.data(), num_values));
}

// Test that the DELTA_BINARY_PACKED encoding does not use more bits to encode than
// necessary (see GH-37939).
TYPED_TEST(TestDeltaBitPackEncoding, DeltaBitPackedSize) {
using T = typename TypeParam::c_type;
constexpr int num_values = 128;
// 128 values should be <= 1 block of values encoded with 2 bits
// delta header should be 0x8001|0x8002 0x04 0x8001 0x02 (6 bytes)
// mini-block header should be 0x01 0x02020202 (5 bytes)
constexpr int encoded_size = 2 * num_values / 8 + 6 + 5;

// Create a run of {1, 0, -1, 0, 1, 0, ...}.
// min_delta is -1, max_delta is 1, max_delta - min_delta is 2, so this requires 2 bits
// to encode.
std::vector<T> int_values(num_values);
std::iota(int_values.begin(), int_values.end(), 0);
std::transform(int_values.begin(), int_values.end(), int_values.begin(), [](T idx) {
return (idx % 2) == 1 ? 0 : (idx % 4) == 0 ? 1 : -1;
});

auto const encoder = MakeTypedEncoder<TypeParam>(
Encoding::DELTA_BINARY_PACKED, /*use_dictionary=*/false, this->descr_.get());
encoder->Put(int_values, num_values);
auto const encoded = encoder->FlushValues();

ASSERT_EQ(encoded->size(), encoded_size);
}

// ----------------------------------------------------------------------
// Rle for Boolean encode/decode tests.

Expand Down

0 comments on commit 5514b22

Please sign in to comment.