diff --git a/src/gribjump/GribJumpDataAccessor.h b/src/gribjump/GribJumpDataAccessor.h index 779ed2e..ea5480f 100644 --- a/src/gribjump/GribJumpDataAccessor.h +++ b/src/gribjump/GribJumpDataAccessor.h @@ -19,13 +19,13 @@ namespace gribjump { class GribJumpDataAccessor : public mc::DataAccessor { public: - GribJumpDataAccessor(eckit::DataHandle& dh, const mc::Range range) : dh_{dh}, data_section_range_{range} {} + GribJumpDataAccessor(eckit::DataHandle& dh, const mc::Block range) : dh_{dh}, data_section_range_{range} {} void write(const eckit::Buffer& buffer, const size_t offset) const override { NOTIMP; } - eckit::Buffer read(const mc::Range& range) const override { + eckit::Buffer read(const mc::Block& range) const override { eckit::Offset offset = range.first; eckit::Length size = range.second; @@ -51,7 +51,7 @@ class GribJumpDataAccessor : public mc::DataAccessor { private: eckit::DataHandle& dh_; - mc::Range data_section_range_; + mc::Block data_section_range_; }; } // namespace gribjump \ No newline at end of file diff --git a/src/gribjump/compression/DataAccessor.h b/src/gribjump/compression/DataAccessor.h index 9be5164..cfa6e1c 100644 --- a/src/gribjump/compression/DataAccessor.h +++ b/src/gribjump/compression/DataAccessor.h @@ -26,7 +26,7 @@ class DataAccessor { public: virtual ~DataAccessor() = default; virtual void write(const eckit::Buffer& buffer, const size_t offset) const = 0; - virtual eckit::Buffer read(const Range& range) const = 0; + virtual eckit::Buffer read(const Block& range) const = 0; virtual eckit::Buffer read() const = 0; virtual size_t eof() const = 0; }; @@ -43,7 +43,7 @@ class PosixAccessor : public DataAccessor { NOTIMP; } - eckit::Buffer read(const Range& range) const override { + eckit::Buffer read(const Block& range) const override { const auto [offset, size] = range; eckit::Buffer buf(size); ifs_.seekg(offset, std::ios::beg); @@ -89,7 +89,7 @@ class MemoryAccessor : public DataAccessor { NOTIMP; } - eckit::Buffer read(const Range& range) const override { + eckit::Buffer read(const Block& range) const override { const auto [offset, size] = range; if (offset + size > buf_.size()) { std::stringstream ss; diff --git a/src/gribjump/compression/NumericCompressor.h b/src/gribjump/compression/NumericCompressor.h index 95dadf5..c8274bb 100644 --- a/src/gribjump/compression/NumericCompressor.h +++ b/src/gribjump/compression/NumericCompressor.h @@ -37,23 +37,23 @@ class NumericDecompressor { using CompressedData = eckit::Buffer; using Values = std::vector; virtual Values decode(const CompressedData&) = 0; - virtual Values decode(const std::shared_ptr, const Range&) = 0; + virtual Values decode(const std::shared_ptr, const Block&) = 0; - virtual std::vector decode(const std::shared_ptr& accessor, const std::vector& ranges) { + virtual std::vector decode(const std::shared_ptr& accessor, const std::vector& ranges) { using Values = typename NumericDecompressor::Values; std::vector result; decode(accessor, ranges, result); return result; } - virtual void decode(const std::shared_ptr& accessor, const std::vector& ranges, std::vector& result) { + virtual void decode(const std::shared_ptr& accessor, const std::vector& ranges, std::vector& result) { using Values = typename NumericDecompressor::Values; - std::unordered_map>> ranges_map; + std::unordered_map>> ranges_map; // find which sub_ranges are in which buckets - RangeBuckets buckets; + BlockBuckets buckets; for (const auto& range : ranges) { buckets << range; } diff --git a/src/gribjump/compression/Range.cc b/src/gribjump/compression/Range.cc index 5895aa3..0e41db3 100644 --- a/src/gribjump/compression/Range.cc +++ b/src/gribjump/compression/Range.cc @@ -12,23 +12,23 @@ #include #include -std::pair begin_end(const mc::Range& range) +std::pair begin_end(const mc::Block& range) { const auto [offset, size] = range; return {offset, offset + size}; } -mc::Range operator+(const mc::Range& r1, const mc::Range& r2) +mc::Block operator+(const mc::Block& r1, const mc::Block& r2) { auto [b1, e1] = begin_end(r1); auto [b2, e2] = begin_end(r2); assert(!((b1 > e2) && (b2 > e1))); - return mc::Range{std::min(b1, b2), std::max(e1, e2) - std::min(b1, b2)}; + return mc::Block{std::min(b1, b2), std::max(e1, e2) - std::min(b1, b2)}; } -std::ostream& operator<<(std::ostream& os, const mc::Range& range) +std::ostream& operator<<(std::ostream& os, const mc::Block& range) { auto [rb, re] = begin_end(range); os << "[" << rb << ", " << re << "]"; @@ -36,7 +36,7 @@ std::ostream& operator<<(std::ostream& os, const mc::Range& range) } -std::ostream& operator<<(std::ostream& os, const mc::RangeBucket& range) +std::ostream& operator<<(std::ostream& os, const mc::BlockBucket& range) { os << range.first << std::endl; for (const auto& r : range.second) { @@ -47,50 +47,58 @@ std::ostream& operator<<(std::ostream& os, const mc::RangeBucket& range) } -mc::RangeBuckets& operator<<(mc::RangeBuckets& buckets, const mc::Range& r) -{ - const mc::Range sub_range{r}; - const auto [srb_tmp, sre_tmp] = begin_end(sub_range); - auto srb = srb_tmp; // not necessary in C++20 - auto sre = sre_tmp; // not necessary in C++20 - - auto r1 = std::find_if(buckets.begin(), buckets.end(), [&](const auto bucket) { - const auto [bucket_range, _] = bucket; - const auto [brb, bre] = begin_end(bucket_range); - return brb <= srb && srb <= bre; - }); - - if (r1 != buckets.end()) { - r1->first = sub_range + r1->first; - r1->second.push_back(sub_range); - if (std::next(r1) != buckets.end()) { - auto r2 = std::next(r1); - auto [r2_begin, r2_end] = begin_end(r2->first); - - if (r2_begin <= sre && sre <= r2_end) { - r1->first = r1->first + r2->first; - std::copy(r2->second.begin(), r2->second.end(), std::back_inserter(r1->second)); - buckets.erase(r2); - } - } - return buckets; - } - auto r2 = std::find_if(buckets.begin(), buckets.end(), [&](auto l) { - auto [l_begin, l_end] = begin_end(l.first); - return l_begin <= sre && sre <= l_end; - }); - if (r2 != buckets.end()) { - r2->first = sub_range + r1->first; - return buckets; +mc::BlockBuckets& operator<<(mc::BlockBuckets& buckets, const mc::Block& r) { + + const mc::Block sub_range{r}; + const auto [sub_start, sub_end] = begin_end(sub_range); + + // Find the position where the range might be inserted + auto it = std::lower_bound(buckets.begin(), buckets.end(), sub_range, + [](const mc::BlockBucket& bucket, const mc::Block& range) { + const auto [bucket_start, bucket_end] = begin_end(bucket.first); + return bucket_end < range.first; + }); + + mc::Block merged_range = sub_range; + mc::SubBlocks merged_subranges = {sub_range}; + + // Merge with any overlapping buckets before the insertion point + while (it != buckets.begin()) { + auto prev_it = std::prev(it); + const auto [prev_start, prev_end] = begin_end(prev_it->first); + + if (prev_end < sub_start) break; // No overlap + + // Expand the merged range + merged_range.first = std::min(merged_range.first, prev_start); + merged_range.second = (std::max(prev_end, sub_end) - merged_range.first); + + merged_subranges.insert(merged_subranges.end(), prev_it->second.begin(), prev_it->second.end()); + + it = buckets.erase(prev_it); } - buckets.push_back({sub_range, mc::SubRanges{sub_range}}); + // Merge with any overlapping buckets after the insertion point + while (it != buckets.end()) { + const auto [next_start, next_end] = begin_end(it->first); + + if (next_start > sub_end) break; // No overlap + + // Expand the merged range + merged_range.first = std::min(merged_range.first, next_start); + merged_range.second = (std::max(next_end, sub_end) - merged_range.first); + merged_subranges.insert(merged_subranges.end(), it->second.begin(), it->second.end()); + + // Erase the current bucket and move the iterator forward + it = buckets.erase(it); + } + + buckets.insert(it, {merged_range, merged_subranges}); return buckets; } - -std::size_t std::hash::operator() (const mc::Range& range) const +std::size_t std::hash::operator() (const mc::Block& range) const { static_assert(sizeof(std::size_t) == sizeof(std::uint64_t), "std::size_t must be 64 bits"); const auto [offset, size] = range; diff --git a/src/gribjump/compression/Range.h b/src/gribjump/compression/Range.h index 5342547..58779e9 100644 --- a/src/gribjump/compression/Range.h +++ b/src/gribjump/compression/Range.h @@ -17,31 +17,29 @@ #include #include #include - namespace mc { -using Range = std::pair; +using Block = std::pair; } template<> -struct std::hash { - std::size_t operator()(const mc::Range& range) const; +struct std::hash { + std::size_t operator()(const mc::Block& range) const; }; namespace mc { // A bucket is a continous range of data that can be decoded in one go -std::pair get_begin_end(const Range& range); -using SubRange = Range; -using SubRanges = std::vector; -using RangeBucket = std::pair; -using RangeBuckets = std::list; - +std::pair get_begin_end(const Block& range); +using SubBlock = Block; +using SubBlocks = std::vector; +using BlockBucket = std::pair; +using BlockBuckets = std::vector; // Sorted to allow binary search (std::lower_bound) } // namespace mc -std::ostream& operator<<(std::ostream& os, const mc::Range& range); -std::ostream& operator<<(std::ostream& os, const mc::RangeBucket& bucket); -mc::RangeBuckets& operator<<(mc::RangeBuckets& buckets, const mc::Range& r); -mc::Range operator+(const mc::Range& r1, const mc::Range& r2); +std::ostream& operator<<(std::ostream& os, const mc::Block& range); +std::ostream& operator<<(std::ostream& os, const mc::BlockBucket& bucket); +mc::BlockBuckets& operator<<(mc::BlockBuckets& buckets, const mc::Block& r); +mc::Block operator+(const mc::Block& r1, const mc::Block& r2); -std::pair begin_end(const mc::Range& range); +std::pair begin_end(const mc::Block& range); diff --git a/src/gribjump/compression/compressors/Aec.h b/src/gribjump/compression/compressors/Aec.h index 912868b..14b303d 100644 --- a/src/gribjump/compression/compressors/Aec.h +++ b/src/gribjump/compression/compressors/Aec.h @@ -193,7 +193,7 @@ class AecDecompressor : public NumericDecompressor, public AecParams } - Values decode(const std::shared_ptr accessor, const Range& range) override + Values decode(const std::shared_ptr accessor, const Block& range) override { if (sizeof(ValueType) == 1 && !(bits_per_sample_ > 0 && bits_per_sample_ <= 8)) throw eckit::Exception("bits_per_sample must be between 1 and 8 for 1-byte types", Here()); diff --git a/src/gribjump/compression/compressors/Ccsds.h b/src/gribjump/compression/compressors/Ccsds.h index 1b4a235..1cd5a3d 100644 --- a/src/gribjump/compression/compressors/Ccsds.h +++ b/src/gribjump/compression/compressors/Ccsds.h @@ -204,7 +204,7 @@ class CcsdsDecompressor : public mc::NumericDecompressor, public Ccsd } - Values decode(const std::shared_ptr accessor, const Range& range) override { + Values decode(const std::shared_ptr accessor, const Block& range) override { if (range.second == 0) return Values{}; @@ -245,7 +245,7 @@ class CcsdsDecompressor : public mc::NumericDecompressor, public Ccsd size_t n_elems_; template - Values decode_range_ (const std::shared_ptr accessor, const Range& simple_range, double bscale, double dscale) { + Values decode_range_ (const std::shared_ptr accessor, const Block& simple_range, double bscale, double dscale) { AecDecompressor aec{}; auto flags = modify_aec_flags(flags_); aec.flags(flags); diff --git a/src/gribjump/compression/compressors/Simple.h b/src/gribjump/compression/compressors/Simple.h index 6d218e6..1b0695b 100644 --- a/src/gribjump/compression/compressors/Simple.h +++ b/src/gribjump/compression/compressors/Simple.h @@ -103,7 +103,7 @@ class SimpleDecompressor : public NumericDecompressor, public SimpleP size_t buffer_size() const { return buffer_size_; } SimpleDecompressor& buffer_size(size_t buffer_size) { buffer_size_ = buffer_size; return *this; } - Values decode(const std::shared_ptr accessor, const Range& range) override { + Values decode(const std::shared_ptr accessor, const Block& range) override { //SP sp{}; using SP = SimplePacking; SimplePacking sp{}; @@ -121,12 +121,12 @@ class SimpleDecompressor : public NumericDecompressor, public SimpleP size_t end = offset + size; size_t new_end = (end + (chunk_nvals - 1)) / chunk_nvals * chunk_nvals; size_t new_size = new_end - new_offset; - Range inclusive_range{new_offset, new_size}; + Block inclusive_range{new_offset, new_size}; params.n_vals = new_size; size_t last_pos = std::min(bin_pos(new_size, bits_per_value_), accessor->eof() - bin_pos(new_offset, bits_per_value_)); - Range data_range{bin_pos(new_offset, bits_per_value_), last_pos}; + Block data_range{bin_pos(new_offset, bits_per_value_), last_pos}; eckit::Buffer compressed = accessor->read(data_range); typename SP::Buffer data{(unsigned char*) compressed.data(), (unsigned char*) compressed.data() + data_range.second}; diff --git a/src/gribjump/jumper/CcsdsJumper.cc b/src/gribjump/jumper/CcsdsJumper.cc index 192d07a..9348fd1 100644 --- a/src/gribjump/jumper/CcsdsJumper.cc +++ b/src/gribjump/jumper/CcsdsJumper.cc @@ -44,7 +44,7 @@ void CcsdsJumper::readValues(eckit::DataHandle& dh, const eckit::Offset offset, .offsets(info.ccsdsOffsets()); - auto data_range = mc::Range{offset + info.offsetBeforeData(), info.offsetAfterData() - info.offsetBeforeData()}; + auto data_range = mc::Block{offset + info.offsetBeforeData(), info.offsetAfterData() - info.offsetBeforeData()}; std::shared_ptr data_accessor = std::make_shared(dh, data_range); // TODO(maee): Optimize this diff --git a/src/gribjump/jumper/Jumper.cc b/src/gribjump/jumper/Jumper.cc index 5421372..3b14943 100644 --- a/src/gribjump/jumper/Jumper.cc +++ b/src/gribjump/jumper/Jumper.cc @@ -44,11 +44,11 @@ std::vector> toBitset(const Bitmap& bitmap) { // Convert ranges to intervals // TODO(maee): Simplification: Switch to intervals or ranges -std::vector toRanges(const std::vector& intervals) { - std::vector ranges; +std::vector toRanges(const std::vector& intervals) { + std::vector ranges; std::transform(intervals.begin(), intervals.end(), std::back_inserter(ranges), [](auto interval) { auto [begin, end] = interval; - return mc::Range{begin, end - begin}; + return mc::Block{begin, end - begin}; }); return ranges; } diff --git a/src/gribjump/jumper/Jumper.h b/src/gribjump/jumper/Jumper.h index 9aebd1b..c24f7aa 100644 --- a/src/gribjump/jumper/Jumper.h +++ b/src/gribjump/jumper/Jumper.h @@ -59,6 +59,6 @@ class BadJumpInfoException : public eckit::Exception { // Convert ranges to intervals // TODO(maee): Simplification: Switch to intervals or ranges -std::vector toRanges(const std::vector& intervals); +std::vector toRanges(const std::vector& intervals); } // namespace gribjump diff --git a/src/gribjump/jumper/SimpleJumper.cc b/src/gribjump/jumper/SimpleJumper.cc index 91b1a13..9f7b5d1 100644 --- a/src/gribjump/jumper/SimpleJumper.cc +++ b/src/gribjump/jumper/SimpleJumper.cc @@ -31,7 +31,7 @@ void SimpleJumper::readValues(eckit::DataHandle& dh, const eckit::Offset offset, const SimpleInfo& info = *psimple; - std::shared_ptr data_accessor = std::make_shared(dh, mc::Range{offset + info.offsetBeforeData(), info.offsetAfterData() - info.offsetBeforeData()}); // TODO XXX + std::shared_ptr data_accessor = std::make_shared(dh, mc::Block{offset + info.offsetBeforeData(), info.offsetAfterData() - info.offsetBeforeData()}); // TODO XXX mc::SimpleDecompressor simple{}; simple .bits_per_value(info.bitsPerValue()) diff --git a/tests/test_misc_units.cc b/tests/test_misc_units.cc index 35cf804..79bb582 100644 --- a/tests/test_misc_units.cc +++ b/tests/test_misc_units.cc @@ -13,6 +13,7 @@ #include "eckit/testing/Test.h" #include "gribjump/info/LRUCache.h" +#include "gribjump/compression/NumericCompressor.h" #include "metkit/mars/MarsParser.h" @@ -56,7 +57,47 @@ CASE( "test_lru" ){ EXPECT_THROWS_AS(cache.get("z"), eckit::BadValue); } + //----------------------------------------------------------------------------- +CASE( "test buckets" ){ + using namespace mc; + BlockBuckets buckets; + for (size_t i = 100; i < 1000; i += 100) { + mc::Block r{i, 10}; + buckets << r; + } + + // i.e. {100, 10}, {200, 10}, ..., {900, 10} + EXPECT_EQUAL(buckets.size(), 9); + + // Add a bucket that extends {100, 10} forward + buckets << Block{205, 20}; + + // Add a bucket that extends {100, 10} backward + buckets << Block{195, 10}; + + EXPECT_EQUAL(buckets.size(), 9); // We've only grown existing bucket + EXPECT_EQUAL(buckets[1].second.size(), 3); // The second bucket now has 3 subblocks + + // Check that it has the correct extents. Should be from 195 to 225 i.e. {195, 30} + EXPECT_EQUAL(buckets[1].first.first, 195); + EXPECT_EQUAL(buckets[1].first.second, 30); + + // Add a bucket that overlaps with nothing to the beginning, middle and end + buckets << Block{0, 10} << Block{150, 10} << Block{1500, 10}; + EXPECT_EQUAL(buckets.size(), 12); // We've added 3 new buckets + + // Add a bucket that merges two adjacent buckets + buckets << Block{305, 100}; + std::cout << buckets << std::endl; + EXPECT_EQUAL(buckets.size(), 11); // We've merged two buckets + + // Add a bucket that completely overlaps with all buckets + buckets << Block{0, 2000}; + EXPECT_EQUAL(buckets.size(), 1); // We've merged all buckets + EXPECT_EQUAL(buckets[0].first.first, 0); + EXPECT_EQUAL(buckets[0].first.second, 2000); +} } // namespace test } // namespace gribjump