Merge pull request ClickHouse#60255 from ClickHouse/dont-load-useless…

…-index-parts-in-memory Do not load useless columns from the index in memory
bigo-sg · Mar 12, 2024 · bdc884d · bdc884d
2 parents bd4f948 + cdb9c96
commit bdc884d
Show file tree

Hide file tree

Showing 12 changed files with 138 additions and 30 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -61,8 +61,8 @@ if (ENABLE_CHECK_HEAVY_BUILDS)
     # set CPU time limit to 1000 seconds
     set (RLIMIT_CPU 1000)
 
-    # -fsanitize=memory is too heavy
-    if (SANITIZE STREQUAL "memory")
+    # -fsanitize=memory and address are too heavy
+    if (SANITIZE)
        set (RLIMIT_DATA 10000000000) # 10G
     endif()
 

diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh
@@ -343,7 +343,7 @@ quit
         # which is confusing.
         task_exit_code=$fuzzer_exit_code
         echo "failure" > status.txt
-        echo "Achtung!" > description.txt
+        echo "Let op!" > description.txt
         echo "Fuzzer went wrong with error code: ($fuzzer_exit_code). Its process died somehow when the server stayed alive. The server log probably won't tell you much so try to find information in other files." >>description.txt
         { rg -ao "Found error:.*" fuzzer.log || rg -ao "Exception:.*" fuzzer.log; } | tail -1 >>description.txt
     fi

diff --git a/src/Core/Field.h b/src/Core/Field.h
@@ -497,7 +497,7 @@ class Field
 
         switch (which)
         {
-            case Types::Null:    return false;
+            case Types::Null:    return get<Null>() < rhs.get<Null>();
             case Types::Bool:    [[fallthrough]];
             case Types::UInt64:  return get<UInt64>()  < rhs.get<UInt64>();
             case Types::UInt128: return get<UInt128>() < rhs.get<UInt128>();
@@ -541,7 +541,7 @@ class Field
 
         switch (which)
         {
-            case Types::Null:    return true;
+            case Types::Null:    return get<Null>() <= rhs.get<Null>();
             case Types::Bool: [[fallthrough]];
             case Types::UInt64:  return get<UInt64>()  <= rhs.get<UInt64>();
             case Types::UInt128: return get<UInt128>() <= rhs.get<UInt128>();
@@ -590,7 +590,7 @@ class Field
 
         switch (which)
         {
-            case Types::Null: return true;
+            case Types::Null: return get<Null>() == rhs.get<Null>();
             case Types::Bool: [[fallthrough]];
             case Types::UInt64: return get<UInt64>() == rhs.get<UInt64>();
             case Types::Int64:   return get<Int64>() == rhs.get<Int64>();

diff --git a/src/Core/Types.h b/src/Core/Types.h
@@ -23,9 +23,9 @@ struct Null
 {
     enum class Value
     {
-        Null,
-        PositiveInfinity,
-        NegativeInfinity,
+        NegativeInfinity = -1,
+        Null = 0,
+        PositiveInfinity = 1,
     };
 
     Value value{Value::Null};
@@ -34,15 +34,12 @@ struct Null
     bool isPositiveInfinity() const { return value == Value::PositiveInfinity; }
     bool isNegativeInfinity() const { return value == Value::NegativeInfinity; }
 
-    bool operator==(const Null & other) const
+    auto operator<=>(const Null & other) const
     {
-        return value == other.value;
+        return static_cast<int>(value) <=> static_cast<int>(other.value);
     }
 
-    bool operator!=(const Null & other) const
-    {
-        return !(*this == other);
-    }
+    bool operator==(const Null &) const = default;
 };
 
 using UInt128 = ::UInt128;

diff --git a/src/Processors/QueryPlan/PartsSplitter.cpp b/src/Processors/QueryPlan/PartsSplitter.cpp
@@ -101,9 +101,9 @@ bool isSafePrimaryKey(const KeyDescription & primary_key)
 
 int compareValues(const Values & lhs, const Values & rhs)
 {
-    chassert(lhs.size() == rhs.size());
+    size_t size = std::min(lhs.size(), rhs.size());
 
-    for (size_t i = 0; i < lhs.size(); ++i)
+    for (size_t i = 0; i < size; ++i)
     {
         if (applyVisitor(FieldVisitorAccurateLess(), lhs[i], rhs[i]))
             return -1;
@@ -124,8 +124,9 @@ class IndexAccess
     Values getValue(size_t part_idx, size_t mark) const
     {
         const auto & index = parts[part_idx].data_part->getIndex();
-        Values values(index.size());
-        for (size_t i = 0; i < values.size(); ++i)
+        size_t size = index.size();
+        Values values(size);
+        for (size_t i = 0; i < size; ++i)
         {
             index[i]->get(mark, values[i]);
             if (values[i].isNull())

diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
@@ -837,6 +837,27 @@ void IMergeTreeDataPart::loadIndex() const
             for (size_t j = 0; j < key_size; ++j)
                 key_serializations[j]->deserializeBinary(*loaded_index[j], *index_file, {});
 
+        /// Cut useless suffix columns, if necessary.
+        Float64 ratio_to_drop_suffix_columns = storage.getSettings()->primary_key_ratio_of_unique_prefix_values_to_skip_suffix_columns;
+        if (key_size > 1 && ratio_to_drop_suffix_columns > 0 && ratio_to_drop_suffix_columns < 1)
+        {
+            chassert(marks_count > 0);
+            for (size_t j = 0; j < key_size - 1; ++j)
+            {
+                size_t num_changes = 0;
+                for (size_t i = 1; i < marks_count; ++i)
+                    if (0 != loaded_index[j]->compareAt(i, i - 1, *loaded_index[j], 0))
+                        ++num_changes;
+
+                if (static_cast<Float64>(num_changes) / marks_count >= ratio_to_drop_suffix_columns)
+                {
+                    key_size = j + 1;
+                    loaded_index.resize(key_size);
+                    break;
+                }
+            }
+        }
+
         for (size_t i = 0; i < key_size; ++i)
         {
             loaded_index[i]->shrinkToFit();

diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
@@ -1018,7 +1018,11 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
     DataTypes key_types;
     for (size_t i : key_indices)
     {
-        index_columns->emplace_back(ColumnWithTypeAndName{index[i], primary_key.data_types[i], primary_key.column_names[i]});
+        if (i < index.size())
+            index_columns->emplace_back(index[i], primary_key.data_types[i], primary_key.column_names[i]);
+        else
+            index_columns->emplace_back(); /// The column of the primary key was not loaded in memory - we'll skip it.
+
         key_types.emplace_back(primary_key.data_types[i]);
     }
 
@@ -1027,7 +1031,6 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
     std::function<void(size_t, size_t, FieldRef &)> create_field_ref;
     if (key_condition.hasMonotonicFunctionsChain())
     {
-
         create_field_ref = [index_columns](size_t row, size_t column, FieldRef & field)
         {
             field = {index_columns.get(), row, column};
@@ -1067,7 +1070,11 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
             {
                 for (size_t i = 0; i < used_key_size; ++i)
                 {
-                    create_field_ref(range.begin, i, index_left[i]);
+                    if ((*index_columns)[i].column)
+                        create_field_ref(range.begin, i, index_left[i]);
+                    else
+                        index_left[i] = NEGATIVE_INFINITY;
+
                     index_right[i] = POSITIVE_INFINITY;
                 }
             }
@@ -1078,8 +1085,17 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
 
                 for (size_t i = 0; i < used_key_size; ++i)
                 {
-                    create_field_ref(range.begin, i, index_left[i]);
-                    create_field_ref(range.end, i, index_right[i]);
+                    if ((*index_columns)[i].column)
+                    {
+                        create_field_ref(range.begin, i, index_left[i]);
+                        create_field_ref(range.end, i, index_right[i]);
+                    }
+                    else
+                    {
+                        /// If the PK column was not loaded in memory - exclude it from the analysis.
+                        index_left[i] = NEGATIVE_INFINITY;
+                        index_right[i] = POSITIVE_INFINITY;
+                    }
                 }
             }
             key_condition_maybe_true = key_condition.mayBeTrueInRange(used_key_size, index_left.data(), index_right.data(), key_types);
@@ -1114,6 +1130,7 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
     bool part_offset_condition_exact_range
         = !part_offset_condition || part_offset_condition->alwaysUnknownOrTrue() || part_offset_condition->matchesExactContinuousRange();
     const String & part_name = part->isProjectionPart() ? fmt::format("{}.{}", part->name, part->getParentPart()->name) : part->name;
+
     if (!key_condition_exact_range || !part_offset_condition_exact_range)
     {
         // Do exclusion search, where we drop ranges that do not match
@@ -1128,10 +1145,10 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
             part->index_granularity_info.index_granularity_bytes);
 
         /** There will always be disjoint suspicious segments on the stack, the leftmost one at the top (back).
-        * At each step, take the left segment and check if it fits.
-        * If fits, split it into smaller ones and put them on the stack. If not, discard it.
-        * If the segment is already of one mark length, add it to response and discard it.
-        */
+          * At each step, take the left segment and check if it fits.
+          * If fits, split it into smaller ones and put them on the stack. If not, discard it.
+          * If the segment is already of one mark length, add it to response and discard it.
+          */
         std::vector<MarkRange> ranges_stack = { {0, marks_count} };
 
         size_t steps = 0;
@@ -1141,7 +1158,7 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
             MarkRange range = ranges_stack.back();
             ranges_stack.pop_back();
 
-            steps++;
+            ++steps;
 
             if (!may_be_true_in_range(range))
                 continue;

diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h
@@ -204,7 +204,7 @@ struct Settings;
     M(UInt64, marks_compress_block_size, 65536, "Mark compress block size, the actual size of the block to compress.", 0) \
     M(UInt64, primary_key_compress_block_size, 65536, "Primary compress block size, the actual size of the block to compress.", 0) \
     M(Bool, primary_key_lazy_load, true, "Load primary key in memory on first use instead of on table initialization. This can save memory in the presence of a large number of tables.", 0) \
-    \
+    M(Float, primary_key_ratio_of_unique_prefix_values_to_skip_suffix_columns, 0.9f, "If the value of a column of the primary key in data part changes at least in this ratio of times, skip loading next columns in memory. This allows to save memory usage by not loading useless columns of the primary key.", 0) \
     /** Projection settings. */ \
     M(UInt64, max_projections, 25, "The maximum number of merge tree projections.", 0) \
 

diff --git a/tests/queries/0_stateless/02998_primary_key_skip_columns.reference b/tests/queries/0_stateless/02998_primary_key_skip_columns.reference
@@ -0,0 +1,18 @@
+100000
+14954
+798
+15908
+108
+120
+2334
+19
+Key size: 	2400000
+100000
+14954
+798
+15908
+108
+120
+2334
+19
+Key size: 	800000
diff --git a/tests/queries/0_stateless/02998_primary_key_skip_columns.sql b/tests/queries/0_stateless/02998_primary_key_skip_columns.sql
@@ -0,0 +1,33 @@
+DROP TABLE IF EXISTS test;
+
+CREATE TABLE test (a UInt64, b UInt64, c UInt64) ENGINE = MergeTree ORDER BY (a, b, c) SETTINGS index_granularity = 1, primary_key_ratio_of_unique_prefix_values_to_skip_suffix_columns = 1;
+INSERT INTO test SELECT sipHash64(number, 1), sipHash64(number, 2), sipHash64(number, 3) FROM numbers(100000);
+
+SELECT count() FROM test;
+SELECT count() FROM test WHERE a > 1849813033528774208 AND a < 4594276315503201760;
+SELECT count() FROM test WHERE b > 7898976344263989848 AND b < 8040320939819153137;
+SELECT count() FROM test WHERE c > 13239894303140990071 AND c < 16179795840886947236;
+SELECT count() FROM test WHERE a > 1849813033528774208 AND a < 4594276315503201760 AND b > 7898976344263989848 AND b < 8040320939819153137;
+SELECT count() FROM test WHERE b > 7898976344263989848 AND b < 8040320939819153137 AND c > 13239894303140990071 AND c < 16179795840886947236;
+SELECT count() FROM test WHERE a > 1849813033528774208 AND a < 4594276315503201760 AND c > 13239894303140990071 AND c < 16179795840886947236;
+SELECT count() FROM test WHERE a > 1849813033528774208 AND a < 4594276315503201760 AND b > 7898976344263989848 AND b < 8040320939819153137 AND c > 13239894303140990071 AND c < 16179795840886947236;
+
+SELECT 'Key size: ', round(sum(primary_key_bytes_in_memory), -5) FROM system.parts WHERE database = currentDatabase() AND table = 'test';
+
+ALTER TABLE test MODIFY SETTING primary_key_ratio_of_unique_prefix_values_to_skip_suffix_columns = 0.9;
+
+DETACH TABLE test;
+ATTACH TABLE test;
+
+SELECT count() FROM test;
+SELECT count() FROM test WHERE a > 1849813033528774208 AND a < 4594276315503201760;
+SELECT count() FROM test WHERE b > 7898976344263989848 AND b < 8040320939819153137;
+SELECT count() FROM test WHERE c > 13239894303140990071 AND c < 16179795840886947236;
+SELECT count() FROM test WHERE a > 1849813033528774208 AND a < 4594276315503201760 AND b > 7898976344263989848 AND b < 8040320939819153137;
+SELECT count() FROM test WHERE b > 7898976344263989848 AND b < 8040320939819153137 AND c > 13239894303140990071 AND c < 16179795840886947236;
+SELECT count() FROM test WHERE a > 1849813033528774208 AND a < 4594276315503201760 AND c > 13239894303140990071 AND c < 16179795840886947236;
+SELECT count() FROM test WHERE a > 1849813033528774208 AND a < 4594276315503201760 AND b > 7898976344263989848 AND b < 8040320939819153137 AND c > 13239894303140990071 AND c < 16179795840886947236;
+
+SELECT 'Key size: ', round(sum(primary_key_bytes_in_memory), -5) FROM system.parts WHERE database = currentDatabase() AND table = 'test';
+
+DROP TABLE test;
diff --git a/tests/queries/0_stateless/03008_index_small.reference b/tests/queries/0_stateless/03008_index_small.reference
@@ -0,0 +1,2 @@
+3
+3
diff --git a/tests/queries/0_stateless/03008_index_small.sql b/tests/queries/0_stateless/03008_index_small.sql
@@ -0,0 +1,19 @@
+DROP TABLE IF EXISTS test;
+
+CREATE TABLE test (a UInt8, b UInt8) ENGINE = MergeTree ORDER BY (a, b)
+SETTINGS index_granularity = 1, primary_key_ratio_of_unique_prefix_values_to_skip_suffix_columns = 0.01;
+
+SET optimize_move_to_prewhere = 0;
+
+INSERT INTO test
+SELECT number DIV 2, number
+FROM numbers(3);
+
+SELECT count() FROM test WHERE b >= 0;
+
+DETACH TABLE test;
+ATTACH TABLE test;
+
+SELECT count() FROM test WHERE b >= 0;
+
+DROP TABLE test;
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,18 @@
		100000
		14954
		798
		15908
		108
		120
		2334
		19
		Key size: 2400000
		100000
		14954
		798
		15908
		108
		120
		2334
		19
		Key size: 800000