diff --git a/60029 - Data Processing Systems/60029 - Data Processing Systems.pdf b/60029 - Data Processing Systems/60029 - Data Processing Systems.pdf index 4d0ba18..604c785 100644 Binary files a/60029 - Data Processing Systems/60029 - Data Processing Systems.pdf and b/60029 - Data Processing Systems/60029 - Data Processing Systems.pdf differ diff --git a/60029 - Data Processing Systems/60029 - Data Processing Systems.tex b/60029 - Data Processing Systems/60029 - Data Processing Systems.tex index eef5277..2e3d713 100644 --- a/60029 - Data Processing Systems/60029 - Data Processing Systems.tex +++ b/60029 - Data Processing Systems/60029 - Data Processing Systems.tex @@ -1,3 +1,5 @@ +% !TeX root = 60029 - Data Processing Systems.tex + \documentclass{report} \title{60029 - Data Processing Systems} diff --git a/60029 - Data Processing Systems/advanced_topics/advanced_topics.tex b/60029 - Data Processing Systems/advanced_topics/advanced_topics.tex index 089aa78..a5ee13f 100644 --- a/60029 - Data Processing Systems/advanced_topics/advanced_topics.tex +++ b/60029 - Data Processing Systems/advanced_topics/advanced_topics.tex @@ -88,7 +88,7 @@ \section{CodeGen} \begin{definitionbox}{voodoo} A \textit{Vector-Dataflow Language} used as a unified algebra for code generating DBMS - \href{https://www.cs.albany.edu/~jhh/courses/readings/pirk.pvldb16.pdf}{original paper} + \href{https://www.cs.albany.edu/~jhh/courses/readings/pirk.pvldb16.pdf}{(original paper)}. \end{definitionbox} \subsection{Vector Operations} @@ -223,43 +223,7 @@ \subsection{Hoare Partitioning} \item $O(n)$ time complexity \item Does not require extra memory / partitions in-place. \end{itemize} - -\begin{minted}{cpp} -// partition part of a vector in range [start-inc, end-exc) and return the pivot index -template -size_t partition(std::vector &sort_vec, size_t start, size_t end) { - // get pivot - T pivot = sort_vec[start]; - size_t count = 0; - - // determine where to partition / where to place pivot value - for (size_t i = start + 1; i < end; i++) { - if (sort_vec[i] <= pivot) - count++; - } - - // swap pivot into place, will partition around pivot - size_t pivotIndex = start + count; - std::swap(sort_vec[pivotIndex], sort_vec[start]); - - // start pointers i & j at ends of range - size_t i = start, j = end - 1; - - // advance pointers, swap and partition - while (i < pivotIndex && j > pivotIndex) { - while (sort_vec[i] <= pivot) i++; - while (sort_vec[j] > pivot) j--; - - if (i < pivotIndex && j >= pivotIndex) { - std::swap(sort_vec[i], sort_vec[j]); - i++; - j--; - } - } - - return pivotIndex; -} -\end{minted} +\inputminted[firstline=5]{cpp}{advanced_topics/code/partition_comparison/partitions/in_place_conditional.h} Consider the following section from the algorithm. \begin{minted}{cpp} @@ -298,95 +262,17 @@ \subsection{Predication} \\ In the below examples we \textit{predicate} by removing a jump/branch, rather than using predicated instructions. \end{sidenotebox} We can start with a basic out-of-place partition. 
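As a warm-up, a minimal sketch of the idea on a simpler loop (not part of the notes' benchmark code; the function names and \mintinline{cpp}{int32_t} element type are assumptions): both functions count the values smaller than a pivot, but the second adds the comparison result directly, leaving no data-dependent jump in the hot loop.
\begin{minted}{cpp}
#include <cstddef>
#include <cstdint>
#include <vector>

// Branching form: the comparison feeds a conditional jump that the
// branch predictor has to guess.
size_t count_less_branching(const std::vector<int32_t> &v, int32_t pivot) {
  size_t count = 0;
  for (size_t i = 0; i < v.size(); i++) {
    if (v[i] < pivot)
      count++;
  }
  return count;
}

// Predicated form: the comparison result (0 or 1) is added directly,
// so the loop body is branch-free.
size_t count_less_predicated(const std::vector<int32_t> &v, int32_t pivot) {
  size_t count = 0;
  for (size_t i = 0; i < v.size(); i++)
    count += v[i] < pivot;
  return count;
}
\end{minted}
The predicated partitions later in this section apply the same trick to their write cursors.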
-\begin{minted}{cpp} -template -size_t partition(const std::vector& input_vec, std::vector& output_vec, size_t start, size_t end) -{ - const T& pivot = input_vec[(start + end) / 2]; - size_t left_index = start; - size_t right_index = end - 1; - - for (auto i = start; i < end; i++) { - if (input_vec[i] < pivot) { - output_vec[left_index] = input_vec[i]; - left_index++; - } else { - output_vec[right_index] = input_vec[i]; - right_index--; - } - } - - return right_index; -} -\end{minted} +\inputminted[firstline=7]{cpp}{advanced_topics/code/partition_comparison/partitions/out_of_place_conditional.h} Here the \mintinline{cpp}{if (input_vec[i] < pivot)} condition has low selectivity, and is part of the hot loop. \\ \\ We can predicate this by always writing the \mintinline{cpp}{input_vec[i]}, and incrementing the pivot indexes based on the condition. -\begin{minted}{cpp} -template -size_t partition(const std::vector& input_vec, std::vector& output_vec, size_t start, size_t end) -{ - const T& pivot = input_vec[(start + end) / 2]; - size_t left_index = start; - size_t right_index = end - 1; - - for (auto i = start; i < end; i++) { - output_vec[left_index] = input_vec[i]; - output_vec[right_index] = input_vec[i]; - - // increment using boolean, if not incremented, value is overwritten on the next iteration - // of the loop - left_index += input_vec[i] < pivot; - right_index -= input_vec[i] >= pivot; - } - - return right_index; -} -\end{minted} +\inputminted[firstline=6]{cpp}{advanced_topics/code/partition_comparison/partitions/out_of_place_predicated.h} \subsection{Predicated Cracking} \begin{sidenotebox}{Cracking stuff!} The algorithm presented here is from the paper \href{https://core.ac.uk/download/pdf/301643658.pdf}{\textit{Database Cracking: Fancy Scan, not Poor Man's Sort!}} \end{sidenotebox} - -\begin{minted}{cpp} -template -size_t partition(std::vector &sort_vec, size_t start, size_t end) { - - bool cmp = false; - size_t left_ptr = start; - size_t right_ptr = end - 1; - T active = sort_vec[left_ptr]; - T backup = sort_vec[right_ptr]; - T pivot = sort_vec[(start + end) / 2]; // somewhat arbitrary pivot selection - - while (left_ptr < right_ptr) { - // compare and write - cmp = pivot > active; - sort_vec[left_ptr] = active; - sort_vec[right_ptr] = active; - - // advance cursor - left_ptr += cmp; - right_ptr -= 1 - cmp; - - // backup phase - active = cmp * sort_vec[left_ptr] + (1-cmp) * sort_vec[right_ptr]; - - // swap active - std::swap(active, backup); - } - - sort_vec[left_ptr] = active; - return left_ptr; -} -\end{minted} +\inputminted[firstline=5]{cpp}{advanced_topics/code/partition_comparison/partitions/in_place_predicated.h} \begin{sidenotebox}{Conditional Understanding} A python script in the included code for these notes can be used to generate apply and print steps of this algorithm. 
\end{sidenotebox} - -\section{Stream Processing} - - -\section{Composable Data Processing} - diff --git a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/CMakeLists.txt b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/CMakeLists.txt index 0976c16..546f0bf 100644 --- a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/CMakeLists.txt +++ b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.14) -project(Partition-Benchmarks) +project(Partition-Comparison) include(FetchContent) @@ -24,5 +24,10 @@ FetchContent_MakeAvailable( googlebenchmark ) -add_executable(${PROJECT_NAME} benchmarks.cpp) -target_link_libraries(${PROJECT_NAME} benchmark::benchmark) \ No newline at end of file +add_executable(Benchmark benchmarks.cpp) +target_link_libraries(Benchmark benchmark::benchmark) + +add_executable(Test tests.cpp) +target_link_libraries(Test gtest_main gmock) # Remove GTest::GTest + +add_executable(Examples examples.cpp) \ No newline at end of file diff --git a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/README.md b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/README.md new file mode 100644 index 0000000..4ecbb2d --- /dev/null +++ b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/README.md @@ -0,0 +1,11 @@ +## What is this? +[Tests](tests.cpp) & [benchmarks](benchmarks.cpp) for the partition algorithms discussed in the advanced topics lecture. + +## To build & Run +```bash +cmake -S . -B build -DCMAKE_BUILD_TYPE=Release +make -j -C build/ +./build/Test +./build/Benchmark +./build/Examples +``` diff --git a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/benchmarks.cpp b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/benchmarks.cpp index 27c006a..d84df39 100644 --- a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/benchmarks.cpp +++ b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/benchmarks.cpp @@ -1,39 +1,42 @@ -#include "in_place_conditional.h" -#include "in_place_predicated.h" -#include "out_of_place_conditional.h" -#include "out_of_place_predicated.h" +#include "partitions/in_place_conditional.h" +#include "partitions/in_place_predicated.h" +#include "partitions/out_of_place_conditional.h" +#include "partitions/out_of_place_predicated.h" #include - std::vector ordered_ints(size_t n) { - std::vector vec; - vec.reserve(n); - for (auto i = 0; i < n; i++) vec.push_back(i); - return vec; + std::vector vec; + vec.reserve(n); + for (auto i = 0; i < n; i++) + vec.push_back(i); + return vec; } std::vector alternating_ints(size_t n) { - std::vector vec; - vec.reserve(n); - for (auto i = 0; i < n; i++) vec.push_back(i % 2 == 0 ? i : n - i); - return vec; + std::vector vec; + vec.reserve(n); + for (auto i = 0; i < n; i++) + vec.push_back(i % 2 == 0 ? 
i : n - i); + return vec; } std::vector random_ints(size_t n) { - std::vector vec; - vec.reserve(n); - srand(n); - for (auto i = 0; i < n; i++) vec.push_back(rand()); - return vec; + std::vector vec; + vec.reserve(n); + srand(n); + for (auto i = 0; i < n; i++) + vec.push_back(rand()); + return vec; } // Benchmarking in place partitioning -template&, size_t, size_t), std::vector generate(size_t)> +template &, size_t, size_t), + std::vector generate(size_t)> static void partition_in_place(benchmark::State &state) { for (auto _ : state) { state.PauseTiming(); - std::vector workload(generate(state.range(0))); + std::vector workload(generate(state.range(0))); state.ResumeTiming(); partition(workload, 0, workload.size()); benchmark::DoNotOptimize(workload); @@ -41,38 +44,53 @@ static void partition_in_place(benchmark::State &state) { } // Benchmarking out of place (no including alloc time for aux vector) -template&, std::vector&, size_t, size_t), std::vector generate(size_t)> +template &, std::vector &, size_t, + size_t), + std::vector generate(size_t)> static void partition_out_of_place(benchmark::State &state) { for (auto _ : state) { state.PauseTiming(); std::vector workload(generate(state.range(0))); - std::vector aux(state.range(0)); + std::vector aux(state.range(0)); state.ResumeTiming(); partition(workload, aux, 0, workload.size()); benchmark::DoNotOptimize(aux); } } -BENCHMARK(partition_in_place)->Range(8 * 1024, 64 * 1024); -BENCHMARK(partition_in_place)->Range(8 * 1024, 64 * 1024); -BENCHMARK(partition_out_of_place)->Range(8 * 1024, 64 * 1024); -BENCHMARK(partition_out_of_place)->Range(8 * 1024, 64 * 1024); +BENCHMARK(partition_in_place) + ->Range(8 * 1024, 64 * 1024); +BENCHMARK(partition_in_place) + ->Range(8 * 1024, 64 * 1024); +BENCHMARK( + partition_out_of_place) + ->Range(8 * 1024, 64 * 1024); +BENCHMARK( + partition_out_of_place) + ->Range(8 * 1024, 64 * 1024); -BENCHMARK(partition_in_place)->Range(8 * 1024, 64 * 1024); -BENCHMARK(partition_in_place)->Range(8 * 1024, 64 * 1024); -BENCHMARK(partition_out_of_place)->Range(8 * 1024, 64 * 1024); -BENCHMARK(partition_out_of_place)->Range(8 * 1024, 64 * 1024); +BENCHMARK(partition_in_place) + ->Range(8 * 1024, 64 * 1024); +BENCHMARK( + partition_in_place) + ->Range(8 * 1024, 64 * 1024); +BENCHMARK( + partition_out_of_place) + ->Range(8 * 1024, 64 * 1024); +BENCHMARK( + partition_out_of_place) + ->Range(8 * 1024, 64 * 1024); -BENCHMARK(partition_in_place)->Range(8 * 1024, 64 * 1024); -BENCHMARK(partition_in_place)->Range(8 * 1024, 64 * 1024); -BENCHMARK(partition_out_of_place)->Range(8 * 1024, 64 * 1024); -BENCHMARK(partition_out_of_place)->Range(8 * 1024, 64 * 1024); +BENCHMARK(partition_in_place) + ->Range(8 * 1024, 64 * 1024); +BENCHMARK(partition_in_place) + ->Range(8 * 1024, 64 * 1024); +BENCHMARK( + partition_out_of_place) + ->Range(8 * 1024, 64 * 1024); +BENCHMARK( + partition_out_of_place) + ->Range(8 * 1024, 64 * 1024); -// To run this use: -// cd partition_comparison -// ```bash -// cmake -S . 
-B build -DCMAKE_BUILD_TYPE=Release -// make -j -C build/ -// ./build/Partition-Benchmarks -// ``` BENCHMARK_MAIN(); diff --git a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/examples.cpp b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/examples.cpp new file mode 100644 index 0000000..e49bfb8 --- /dev/null +++ b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/examples.cpp @@ -0,0 +1,40 @@ +#include "partitions/in_place_conditional.h" +#include "partitions/in_place_predicated.h" +#include "partitions/out_of_place_conditional.h" +#include "partitions/out_of_place_predicated.h" +#include +#include +#include + +template void print_vec(const std::vector &v) { + for (const auto &x : v) + std::cout << x << ","; + std::cout << std::endl; +} + +void example_1() { + std::vector is = {2, 3, 12, 5, 6, 7, 34, 3, 2, 1, 3, 5, 7, 23}; + // std::vector is = { + // 2, 1, 2, 6, 7, 34, 3, 5, 12, 3, 5, 7, 3, 23, + // }; + + print_vec(is); + auto part = predicated_cracking::partition(is, 0, is.size()); + print_vec(is); + std::cout << "partition at vec[" << part << "] = " << is[part] << std::endl; +} + +void example_2() { + std::vector is = {7}; + std::vector ans(is.size()); + + print_vec(is); + auto part = out_of_place_cond::partition(is, ans, 0, is.size()); + print_vec(ans); + std::cout << "partition at vec[" << part << "] = " << ans[part] << std::endl; +} + +int main() { + example_1(); + example_2(); +} diff --git a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/in_place_predicated.h b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/in_place_predicated.h deleted file mode 100644 index 67e209d..0000000 --- a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/in_place_predicated.h +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once -#include - -namespace predicated_cracking { -template -size_t partition(std::vector &sort_vec, size_t start, size_t end) { - - bool cmp = false; - size_t left_ptr = start; - size_t right_ptr = end-1; - T active = sort_vec[left_ptr]; - T backup = sort_vec[right_ptr]; - T pivot = sort_vec[(start + end) / 2]; // somewhat arbitrary pivot selection - - while (left_ptr < right_ptr) { - // compare and write - cmp = pivot > active; - sort_vec[left_ptr] = active; - sort_vec[right_ptr] = active; - - // advance cursor - left_ptr += cmp; - right_ptr -= 1 - cmp; - - // backup phase - active = cmp * sort_vec[left_ptr] + (1-cmp) * sort_vec[right_ptr]; - - // swap active - std::swap(active, backup); - } - sort_vec[left_ptr] = active; - return left_ptr; -} - -} \ No newline at end of file diff --git a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/out_of_place_conditional.h b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/out_of_place_conditional.h deleted file mode 100644 index 5945941..0000000 --- a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/out_of_place_conditional.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once -#include -#include - -namespace out_of_place_cond { -template -size_t partition(const std::vector& input_vec, std::vector& output_vec, size_t start, size_t end) -{ - const T& pivot = input_vec[(start + end) / 2]; - size_t left_index = start; - size_t right_index = end - 1; - - for (auto i = start; i < end; i++) { - if (input_vec[i] < pivot) { - output_vec[left_index] = input_vec[i]; - left_index++; - } else { - output_vec[right_index] = input_vec[i]; - 
right_index--; - } - } - - return right_index; -} -} \ No newline at end of file diff --git a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/out_of_place_predicated.h b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/out_of_place_predicated.h deleted file mode 100644 index df04763..0000000 --- a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/out_of_place_predicated.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once -#include -#include - -namespace out_of_place_pred { -template -size_t partition(const std::vector& input_vec, std::vector& output_vec, size_t start, size_t end) -{ - const T& pivot = input_vec[(start + end) / 2]; - size_t left_index = start; - size_t right_index = end - 1; - - for (auto i = start; i < end; i++) { - output_vec[left_index] = input_vec[i]; - output_vec[right_index] = input_vec[i]; - - // increment using boolean, if not incremented, value is overwritten on - // the next iteration of the loop - left_index += input_vec[i] < pivot; - right_index -= input_vec[i] >= pivot; - } - - return right_index; -} -} \ No newline at end of file diff --git a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/in_place_conditional.h b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/partitions/in_place_conditional.h similarity index 80% rename from 60029 - Data Processing Systems/advanced_topics/code/partition_comparison/in_place_conditional.h rename to 60029 - Data Processing Systems/advanced_topics/code/partition_comparison/partitions/in_place_conditional.h index 245a547..b8026e0 100644 --- a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/in_place_conditional.h +++ b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/partitions/in_place_conditional.h @@ -1,7 +1,9 @@ #pragma once +#include #include namespace hoare { +// INV: sort_vec.size() > 0 template size_t partition(std::vector &sort_vec, size_t start, size_t end) { // get pivot @@ -13,18 +15,20 @@ size_t partition(std::vector &sort_vec, size_t start, size_t end) { if (sort_vec[i] <= pivot) count++; } - + // swap pivot into place, will partition around pivot size_t pivotIndex = start + count; std::swap(sort_vec[pivotIndex], sort_vec[start]); - + // start pointers i & j at ends of range size_t i = start, j = end - 1; - + // advance pointers, swap and partition while (i < pivotIndex && j > pivotIndex) { - while (sort_vec[i] <= pivot) i++; - while (sort_vec[j] > pivot) j--; + while (sort_vec[i] <= pivot) + i++; + while (sort_vec[j] > pivot) + j--; if (i < pivotIndex && j >= pivotIndex) { std::swap(sort_vec[i], sort_vec[j]); @@ -35,4 +39,4 @@ size_t partition(std::vector &sort_vec, size_t start, size_t end) { return pivotIndex; } -} \ No newline at end of file +} // namespace hoare \ No newline at end of file diff --git a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/partitions/in_place_predicated.h b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/partitions/in_place_predicated.h new file mode 100644 index 0000000..41b3706 --- /dev/null +++ b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/partitions/in_place_predicated.h @@ -0,0 +1,54 @@ +#pragma once +#include +#include + +namespace predicated_cracking { + +constexpr bool USE_CONDITIONS = false; + +// INV: sort_vec.size() > 0 +template +size_t partition(std::vector &sort_vec, size_t start, size_t end) { + + bool cmp = false; + 
size_t left_ptr = start; + size_t right_ptr = end - 1; + T active = sort_vec[left_ptr]; + T backup = sort_vec[right_ptr]; + T pivot = sort_vec[(start + end) / 2]; // somewhat arbitrary pivot selection + + while (left_ptr < right_ptr) { + + // write active + sort_vec[left_ptr] = active; + sort_vec[right_ptr] = active; + + if constexpr (USE_CONDITIONS) { + if (pivot > active) { + left_ptr++; + active = sort_vec[left_ptr]; + } else { + right_ptr--; + active = sort_vec[right_ptr]; + } + } else { + // compare and write + cmp = pivot > active; + + // advance cursor + left_ptr += cmp; + right_ptr -= 1 - cmp; + + // backup phase + active = cmp * sort_vec[left_ptr] + (1 - cmp) * sort_vec[right_ptr]; + } + + // swap active + std::swap(active, backup); + } + + sort_vec[left_ptr] = active; + return left_ptr; +} + +} // namespace predicated_cracking \ No newline at end of file diff --git a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/partitions/out_of_place_conditional.h b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/partitions/out_of_place_conditional.h new file mode 100644 index 0000000..c031668 --- /dev/null +++ b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/partitions/out_of_place_conditional.h @@ -0,0 +1,28 @@ +#pragma once +#include +#include +#include // DEBUG +#include + +namespace out_of_place_cond { +// INV: input_vec.size() > 0 +template +size_t partition(const std::vector &input_vec, std::vector &output_vec, + size_t start, size_t end) { + const T &pivot = input_vec[(start + end) / 2]; + size_t left_index = start; + size_t right_index = end - 1; + + for (auto i = start; i < end; i++) { + if (input_vec[i] < pivot) { + output_vec[left_index] = input_vec[i]; + left_index++; + } else { + output_vec[right_index] = input_vec[i]; + right_index--; + } + } + + return left_index; +} +} // namespace out_of_place_cond \ No newline at end of file diff --git a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/partitions/out_of_place_predicated.h b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/partitions/out_of_place_predicated.h new file mode 100644 index 0000000..cacb71c --- /dev/null +++ b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/partitions/out_of_place_predicated.h @@ -0,0 +1,27 @@ +#pragma once +#include +#include +#include + +namespace out_of_place_pred { +// INV: input_vec.size() > 0 +template +size_t partition(const std::vector &input_vec, std::vector &output_vec, + size_t start, size_t end) { + const T &pivot = input_vec[(start + end) / 2]; + size_t left_index = start; + size_t right_index = end - 1; + + for (auto i = start; i < end; i++) { + output_vec[left_index] = input_vec[i]; + output_vec[right_index] = input_vec[i]; + + // increment using boolean, if not incremented, value is overwritten on + // the next iteration of the loop + left_index += input_vec[i] < pivot; + right_index -= input_vec[i] >= pivot; + } + + return left_index; +} +} // namespace out_of_place_pred \ No newline at end of file diff --git a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/test.cpp b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/test.cpp deleted file mode 100644 index e62e24b..0000000 --- a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/test.cpp +++ /dev/null @@ -1,15 +0,0 @@ -#include -#include "in_place_predicated.h" - -using namespace predicated_cracking; - 
-int main() { - std::vector is = {3, 2, 4, 2, 8, 1, 9, 3, 8, 1, 5, 0, 7, 5, 7}; - - for (const auto& x : is) std::cout << x << ","; - auto part = partition(is, 0, is.size()); - std::cout << std::endl; - for (const auto& x : is) std::cout << x << ","; - std::cout << std::endl << part; -} - diff --git a/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/tests.cpp b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/tests.cpp new file mode 100644 index 0000000..571310f --- /dev/null +++ b/60029 - Data Processing Systems/advanced_topics/code/partition_comparison/tests.cpp @@ -0,0 +1,63 @@ +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include +#include +#include +#include + +#include "partitions/in_place_conditional.h" +#include "partitions/in_place_predicated.h" +#include "partitions/out_of_place_conditional.h" +#include "partitions/out_of_place_predicated.h" + +#include + +void check_partitioned(const std::vector &orig, + const std::vector &part, size_t part_at) { + EXPECT_THAT(orig, ::testing::UnorderedElementsAreArray(part)); + + // the + auto part_at_val = part[part_at]; + auto top_min = + *std::min_element(std::next(part.begin(), part_at), part.end()); + auto bot_max = + *std::max_element(part.begin(), std::next(part.begin(), part_at)); + EXPECT_LE(bot_max, top_min); +} + +using inplace = size_t(std::vector &, size_t, size_t); +using outplace = size_t(const std::vector &, std::vector &, size_t, + size_t); + +void partition_check(outplace fn, std::vector vals) { + std::vector partitioned(vals.size()); + auto part_at = fn(vals, partitioned, 0, vals.size()); + check_partitioned(vals, partitioned, part_at); +} + +void partition_check(inplace fn, std::vector vals) { + std::vector orig = vals; + auto part_at = fn(vals, 0, vals.size()); + check_partitioned(orig, vals, part_at); +} + +#define NEW_TEST(PARTITION) \ + TEST(PARTITION, Single) { partition_check(PARTITION::partition, {1}); } \ + TEST(PARTITION, OrderedAsc) { \ + partition_check(PARTITION::partition, {1, 2, 3, 4, 5, 6}); \ + } \ + TEST(PARTITION, OrderedDesc) { \ + partition_check(PARTITION::partition, {6, 5, 4, 3, 2, 1}); \ + } \ + TEST(PARTITION, AllSame) { \ + partition_check(PARTITION::partition, {1, 1, 1, 1, 1, 1}); \ + } \ + TEST(PARTITION, Large) { \ + partition_check(PARTITION::partition, \ + {2, 3, 12, 5, 6, 7, 34, 3, 2, 1, 3, 5, 7, 23}); \ + } + +NEW_TEST(hoare) +NEW_TEST(predicated_cracking) +NEW_TEST(out_of_place_cond) +NEW_TEST(out_of_place_pred) \ No newline at end of file diff --git a/60029 - Data Processing Systems/algorithms_and_indices/algorithms_and_indices.tex b/60029 - Data Processing Systems/algorithms_and_indices/algorithms_and_indices.tex index 5374d80..525b019 100644 --- a/60029 - Data Processing Systems/algorithms_and_indices/algorithms_and_indices.tex +++ b/60029 - Data Processing Systems/algorithms_and_indices/algorithms_and_indices.tex @@ -7,7 +7,7 @@ \section{Sorting Algorithms (unassessed)} \subsection{Quicksort} \begin{center} - \includegraphics[width=.9\textwidth]{algorithms_and_indices/images/quicksort.drawio.png} + \includegraphics[width=.8\textwidth]{algorithms_and_indices/images/quicksort.drawio.png} \end{center} \inputminted{cpp}{algorithms_and_indices/code/sort_comparison/sorts/quicksort.h} \begin{center} @@ -84,7 +84,7 @@ \subsection{Radix Sort} \begin{center} \includegraphics[width=\textwidth]{algorithms_and_indices/images/radixsort.drawio.png} \end{center} -A non-comparative sorting algorithm. 
Rather than comparing elements to determine an order, continually split buckets by some place. +A non-comparative sorting algorithm. Rather than comparing elements to determine an order, partition into buckets based on some key (e.g digit). \begin{center} \begin{tabular}{c} \textbf{Worst-Case Complexity} \\ @@ -407,6 +407,7 @@ \subsection{Hashing} \end{tabular} \end{center} \end{sidenotebox} +Hashes can collide, and hence we need a way to resolve this. \begin{examplebox}{Hash it out} Write a basic Modulo-Division hash using the interface above provided. Take the modulus as a template parameter. @@ -418,8 +419,6 @@ \subsection{Hashing} \end{minted} \end{examplebox} -Hashes can collide, and hence we need a way to resolve this. - \subsection{Bucket Hashmap (Separate Chaining)} Collisions are resolved using linked-list buckets. \begin{center} diff --git a/60029 - Data Processing Systems/algorithms_and_indices/code/hashtable_comparison/tests.cpp b/60029 - Data Processing Systems/algorithms_and_indices/code/hashtable_comparison/tests.cpp index 1ebeaf0..9ddf4e9 100644 --- a/60029 - Data Processing Systems/algorithms_and_indices/code/hashtable_comparison/tests.cpp +++ b/60029 - Data Processing Systems/algorithms_and_indices/code/hashtable_comparison/tests.cpp @@ -1,13 +1,12 @@ #include "hashtables/bucket.h" -#include "hashtables/std_unordered_map.h" #include "hashtables/probing.h" +#include "hashtables/std_unordered_map.h" #include "hashers/const_hash.h" #include "hashers/std_hash.h" -#include "hashtable.h" #include "hasher.h" - +#include "hashtable.h" #include "gtest/gtest.h" #include @@ -18,57 +17,60 @@ using HashTables = ::testing::Types< HashMap::Buckets>, HashMap::STD>, HashMap::Probing, 1>>, - HashMap::Probing, 1>>, - HashMap::Probing>> ->; + HashMap::Probing, 1>>, + HashMap::Probing>>>; template class HashTableTests : public testing::Test {}; TYPED_TEST_SUITE(HashTableTests, HashTables); TYPED_TEST(HashTableTests, InitiallyEmpty) { - TypeParam hashmap; - EXPECT_EQ(hashmap.size(), 0); - EXPECT_FALSE(hashmap.find(-1)); - EXPECT_FALSE(hashmap.find(0)); + TypeParam hashmap; + EXPECT_EQ(hashmap.size(), 0); + EXPECT_FALSE(hashmap.find(-1)); + EXPECT_FALSE(hashmap.find(0)); } TYPED_TEST(HashTableTests, CanInsertUniqueKeys) { - TypeParam hashmap; - EXPECT_TRUE(hashmap.insert(3, 3)); - EXPECT_EQ(hashmap.size(), 1); - EXPECT_FALSE(hashmap.insert(3, 4)); - EXPECT_EQ(hashmap.size(), 1); + TypeParam hashmap; + EXPECT_TRUE(hashmap.insert(3, 3)); + EXPECT_EQ(hashmap.size(), 1); + EXPECT_FALSE(hashmap.insert(3, 4)); + EXPECT_EQ(hashmap.size(), 1); } TYPED_TEST(HashTableTests, CanEraseElements) { - TypeParam hashmap; - EXPECT_TRUE(hashmap.insert(3, 3)); - EXPECT_EQ(hashmap.size(), 1); - EXPECT_TRUE(hashmap.erase(3)); - EXPECT_EQ(hashmap.size(), 0); + TypeParam hashmap; + EXPECT_TRUE(hashmap.insert(3, 3)); + EXPECT_EQ(hashmap.size(), 1); + EXPECT_TRUE(hashmap.erase(3)); + EXPECT_EQ(hashmap.size(), 0); } TYPED_TEST(HashTableTests, CanFindElements) { - int k = 3; - int v = 3; - TypeParam hashmap; - EXPECT_TRUE(hashmap.insert(k, v)); - int* value = hashmap.find(k); - EXPECT_TRUE(value); - EXPECT_EQ(*value, v); + int k = 3; + int v = 3; + TypeParam hashmap; + EXPECT_TRUE(hashmap.insert(k, v)); + int *value = hashmap.find(k); + EXPECT_TRUE(value); + EXPECT_EQ(*value, v); } TYPED_TEST(HashTableTests, ManyInsertsAndErase) { - // intended to test resize - TypeParam hashmap; - const auto max = 1 << 16; - for (auto i = 0; i < max; i ++) EXPECT_TRUE(hashmap.insert(i, max - i)); - for (auto i = 0; i < max; i ++) { - 
int* val = hashmap.find(i); - EXPECT_TRUE(val); - EXPECT_EQ(*val, max - i); - }; - for (auto i = 0; i < max; i ++) EXPECT_TRUE(hashmap.erase(i)); - EXPECT_EQ(hashmap.size(), 0); + // intended to test resize + TypeParam hashmap; + const auto max = 1 << 16; + for (auto i = 0; i < max; i++) + EXPECT_TRUE(hashmap.insert(i, max - i)); + for (auto i = 0; i < max; i++) { + int *val = hashmap.find(i); + EXPECT_TRUE(val); + EXPECT_EQ(*val, max - i); + }; + for (auto i = 0; i < max; i++) + EXPECT_TRUE(hashmap.erase(i)); + EXPECT_EQ(hashmap.size(), 0); } diff --git a/60029 - Data Processing Systems/algorithms_and_indices/code/join_comparison/joins/unique_sort_merge.h b/60029 - Data Processing Systems/algorithms_and_indices/code/join_comparison/joins/unique_sort_merge.h index 0a2acfc..39c45e2 100644 --- a/60029 - Data Processing Systems/algorithms_and_indices/code/join_comparison/joins/unique_sort_merge.h +++ b/60029 - Data Processing Systems/algorithms_and_indices/code/join_comparison/joins/unique_sort_merge.h @@ -8,7 +8,8 @@ using namespace std; // A sort merge join, that assumes there are no duplicates on the left table. template -Table unique_sort_merge_join(const Table &leftT, const Table &rightT) { +Table +unique_sort_merge_join(const Table &leftT, const Table &rightT) { auto result = join_empty(leftT, rightT); // copy tables (so we can keep const, just reorder) diff --git a/60029 - Data Processing Systems/algorithms_and_indices/code/sort_comparison/tests.cpp b/60029 - Data Processing Systems/algorithms_and_indices/code/sort_comparison/tests.cpp index aff3d01..df07b3f 100644 --- a/60029 - Data Processing Systems/algorithms_and_indices/code/sort_comparison/tests.cpp +++ b/60029 - Data Processing Systems/algorithms_and_indices/code/sort_comparison/tests.cpp @@ -1,6 +1,7 @@ #include "sorts/heapsort.h" -#include "sorts/quicksort.h" #include "sorts/mergesort.h" +#include "sorts/quicksort.h" + #include "generate.h" @@ -10,35 +11,34 @@ using namespace std; -constexpr auto intcomp = [](const int& a, const int& b) -> bool {return a > b;}; +constexpr auto intcomp = [](const int &a, const int &b) -> bool { + return a > b; +}; -// Cannot pass non-type template params to typed tests, so we wrap with a type +// Cannot pass non-type template params to typed tests, so we wrap with a type // associated with the sort. 
-template struct Sorter { - static constexpr auto s = sort; +template struct Sorter { + static constexpr auto s = sort; }; - -using Sorts = ::testing::Types< - Sorter>, - Sorter>, - Sorter> ->; +using Sorts = ::testing::Types>, + Sorter>, + Sorter>>; template class SortTest : public testing::Test {}; TYPED_TEST_SUITE(SortTest, Sorts); TYPED_TEST(SortTest, EmptyList) { - vector input{}; - vector answer{}; - TypeParam::s(input); - EXPECT_EQ(answer, answer); + vector input{}; + vector answer{}; + TypeParam::s(input); + EXPECT_EQ(answer, answer); } TYPED_TEST(SortTest, OnetoFive) { - vector input{5,4,3,2,1}; - vector answer{1,2,3,4,5}; - TypeParam::s(input); - EXPECT_EQ(answer, answer); + vector input{5, 4, 3, 2, 1}; + vector answer{1, 2, 3, 4, 5}; + TypeParam::s(input); + EXPECT_EQ(answer, answer); } diff --git a/60029 - Data Processing Systems/algorithms_and_indices/diagrams/partitioning.drawio b/60029 - Data Processing Systems/algorithms_and_indices/diagrams/partitioning.drawio index 0ad2179..9dd386f 100644 --- a/60029 - Data Processing Systems/algorithms_and_indices/diagrams/partitioning.drawio +++ b/60029 - Data Processing Systems/algorithms_and_indices/diagrams/partitioning.drawio @@ -1 +1 @@ -7Vtdb9owFP01PHbCcZzAY0u7TdM2TeumtY8ecYk3E0eOGbBfPyexCUkgZP3ASUGqaHxjG/ucY9/rDwZwMl+9EzgOP/GAsIEzDFYDeD1wHDD0xupfallrywgMc8tM0EDbCsMt/UtMUW1d0IAkpYyScyZpXDZOeRSRqSzZsBB8Wc72wFn5W2M8IzXD7RSzuvUHDWSYW0eOX9jfEzoLzTcD0+M5Npl1FUmIA77MTVnn4M0ATgTnMn+aryaEpegZXHIE3u55u2mYIJFsU2Ax+fb9Zvr57/0P+hB9999efgIXFxDpxsm16TEJFAA6yYUM+YxHmN0U1ivBF1FA0mqHKlXk+ch5rIxAGX8RKdeaTbyQXJlCOWf6LVlRebf1fJ9W9Qbp1PVK15wl1iYRSbG+KzKmyfvtd0WxLGXK5f1LO7UXN0MQX4gpaQDL6A+LGZFNoLobetXAIHxOVINUQUEYlvRPuSFYC3S2yVdwqB40jf9Bqa73D2YL/U01issELkMqyW2Ms74v1Tguk6WrI0KSVTOE9R7rAq4ezHoSQFp0y2JAIZ0j3BpLUAv6+SFyYP9UX9J8MQReXvVOS9Xv08BxVO90T/XOsCx749IOyH70UqoH47PqW6setlQ9GNlUPeyB6q3L/jzZt5c9ait7x6bsUfdlv4l5bMl+ByZHlP3wMbIH1mTv9SLG8bone7djMY5ZMpxV30L1fi9iHL8Hqrcte+CeZd9a9uNexDjj7sveeozjNsc4EY/s67yyXQmOp3MzKR0Uums1qjHN7JLSkdexHUuwa3/LTf/QRHVTDvyrD5xGA/9am2vjgjEaJ+QwejiJ83OUB7pKEX8OOGF14vDqE4e7A0/3xeAcvSY4AbQMp+nC64DTNprQOzD/PcaF2d9agm5Ld4RseiPYvEn+bCFFj/iANvlwmxf0p8iH1WUJat5NP0U+3LHVCcs96Hu/YCGppLzBAReEgcNOWHCJs+rg9cW4QP1JTnhUiRCHdScMnF0h94utLY8sdHv7IW7bey2+VUfQvMV1inx4T+RDF/2iQnO5tctTGYrArwyxvAO6VIXVTTOeQHTzlbRTJNqqg0HngVeDxOqGmWlmzz2+izrn8ncdsGpgAy6TPUhmmJfgSqTgv8mEMy6KEfJAGauYMKOzFNOpgowo+1UKKJ1idqlfzGkQsH3bIuUh9wyMgDE6yIh3VEJ2nf2dECGwc4SMGn3B+S5VfV3Yxp1YXdGbZnbp/KV6mwraPmlEzdtQ5wP2enjUeeGjXXGUZeFXj9jtC9/Z74J7eLRj++ARNYSYGs6vmfibQvfXE99Uwpv6KftRoxvUEG6eHjkIdYsdrz5dX2alYp4k9KcCxhn+SqeirJMyIiJJmyFSe2aVYfo45UKQJOZRQKOZzpsvlZ1J5tzSLmYZcVSuMcuVZLUFJCbqI5Js/XoFAEZlAWwu124pwPF3SOARt2BUsvglbL6TWPygGN78Aw== \ No newline at end of file 
+7Vxbc5s4FP41fswON3F5TBz3stPudprtNHlUQNjKYsQKObb765eLMAZk7LYYZJOZjAcOQojv06dzdCQy0afLzXsKo8Vn4qFgoineZqLfTzTNUe3kNzVsc4PqGFpumVPscVtpeMA/EDcq3LrCHoorBRkhAcNR1eiSMEQuq9ggpWRdLeaToPrUCM5Rw/DgwqBp/Y49tsittmaV9g8IzxfFk1XTya8sYVGYVxEvoEfWuSl7OX020aeUEJYfLTdTFKTgFbjkCLw7cHXXMIpCdsoNq+k/32buXz+evmM//Ga9u/2s3tzogDeObYs3Rl4CAD8llC3InIQwmJXWO0pWoYfSapXkrCzziZAoMaqJ8QUxtuVswhUjiWnBlgG/ijaYPe4dP6VV/QH42f2G15ydbIuTkNHtY1kwPX3av1belp0V9zVhKvggK+qiFmyK7gbpHLE2DI28YArc3hM4C+8RWaKkQUkBigLI8Gu1Z0HeQee7ciWHyQGn8Sco5fW+wmDFn9SguErgeoEZeohgBsY6kXGVLB8HwZQEhGb36r6PTNdN7DGj5F+0d8WznGelFfJXRBnatGLErxpc/nzYALybrksJAl5isac+nUuge1A1/fJ0UlFJKZrOdaKdqJOiQkl0ol2DTjSlKpTCbR4Rin0unajOm04O+okTdVLETJLoRL9KnQwulDeHchAbcKpQNKmEAq5RKLtIbCihCFDsUSjKrwhF7Uso5mVGXuYgQvGQD1cB62p+IlfcVUx83lTSxMa6zLjLukKVDC0T1XiTySFsnMuMupzrk8ngMZfRxHBfJiEJh9dFLQ2snk0XxZh1VBiGXFFW0e5LVgYwJcsDF4NfBVQj/QPT5J3YxLr7k+BwYt1zc0NHQYCjGB1HG8ZRvp7l403KUB1+DyLbF04HTddGz343BOj1oclsDk2GgAHjbATY4yZA1QcmYPcKIyVgaPx188go/iuOuP9cuHGiUwVS+VS9fXmjs8BIXj50qfgw2tMeI+BDrskYaF/VGAEfhiMVIUW7W7z1F0gZZpi0uOySMPW426aEwaw6/f7GUQTzBttF4nWMZxsYoKN1DLsWtypNt61qoqnD2ebUPUujt7xRMe09Kg1LKmUY7anAEfBhds0Hv/VLEv6zvexWTYqqVZNY/kb8rhqru2b8BtHtWxxHQLRcLgmMXnhArkRh0W75Y4Rus+5AuhBBtDDOifAIiw8gn3FUgbeKGlfUPsTcBAM8TzlwExRRYr9L0cMuDG75hSX2vOBQqqYq0Q4YUR1wlBGzV0JEa7AjIkSXjhC71XeMetdcMfM87n7kyhkU7b6ufXP60Gu4oD01NuqtDuDkSE0uqQBRpHZxUqlvdxheKtphNz+KJa2hF3VBS+DLCfiaCaxtAnI9UVct6Grueeg15gItQfD4yAFALnbMpku4ze6KSBzj5wQYTXlJB6/svVmIaJw2g6b2zMoW6aFLKEVxREIPh3NeNp/wa9PMo6avmBWEYbXGrFSc1eahCCU/IQu219sBVLvaAXZbr/d6gGYJukAXe5JePn7yPmzXfz88ft3OlP/gg//RGuhDxnoupoGgAOeWkKCCaZ/bvISQir55u4iM2G+xMOSamZAG0eLlhfXsIb+nEmI6zFdqZ8W0z88KhJjK8UFTt5gODOnxoHPYKVmn4A+4x1AIvux7bM8Kfp87bIXoi/Kho0G/zw3mQvQl+Wyi08G8zwybGFTZN42fdzgfvE9fxRSxtz6dnJb/li3fhlT+czt99j8= \ No newline at end of file diff --git a/60029 - Data Processing Systems/algorithms_and_indices/images/partitioning.drawio.png b/60029 - Data Processing Systems/algorithms_and_indices/images/partitioning.drawio.png index 103bcb7..22e8a03 100644 Binary files a/60029 - Data Processing Systems/algorithms_and_indices/images/partitioning.drawio.png and b/60029 - Data Processing Systems/algorithms_and_indices/images/partitioning.drawio.png differ diff --git a/60029 - Data Processing Systems/introduction/introduction.tex b/60029 - Data Processing Systems/introduction/introduction.tex index 29d1318..d2b91b6 100644 --- a/60029 - Data Processing Systems/introduction/introduction.tex +++ b/60029 - Data Processing Systems/introduction/introduction.tex @@ -259,6 +259,7 @@ \subsection{Read Phenomena} \end{definitionbox} \end{tcbraster} \subsection{Isolation levels} +\textit{Discussed in detail in \autoref{chap:transactions}.} \begin{definitionbox}{Serialisable} \begin{center} \begin{tabular}{c | c | c} diff --git a/60029 - Data Processing Systems/optimisation/optimisation.tex b/60029 - Data Processing Systems/optimisation/optimisation.tex index a29c5f8..0541084 100644 --- a/60029 - Data Processing Systems/optimisation/optimisation.tex +++ b/60029 - Data Processing Systems/optimisation/optimisation.tex @@ -6,7 +6,7 @@ \section{Motivation} \begin{itemize} \item Users want zero-overhead, the system should be as fast as hand-written \& optimised code. 
\item The database is expected to learn from data (e.g second run of a query is faster) - \item System must be highly flexible (users can create relations, indices, build complex queries without needing to upgrade/reconfigure/recompile any part of the DBMS) + \item System must be highly flexible (users can create relations, indices, build complex queries without needing to upgrade/reconfigure/recompile any part of the DBMS) \end{itemize} In reality current \textit{DBMS} generally succeed in meeting these \textit{miraculous} expectations. @@ -20,9 +20,9 @@ \subsection{Query Optimisers vs Optimising Compilers} The main difference is timing of access to code and input data. \begin{center} \begin{tabular}{l l l} - & \textbf{Code/Query} & \textbf{Input Data} \\ - \textbf{Compiler Optimiser} & At compile time & Unknown \\ - \textbf{Query Optimiser} & At query time & Known before query \\ + & \textbf{Code/Query} & \textbf{Input Data} \\ + \textbf{Compiler Optimiser} & At compile time & Unknown \\ + \textbf{Query Optimiser} & At query time & Known before query \\ \end{tabular} \end{center} @@ -33,7 +33,7 @@ \subsection{Query Optimisers vs Optimising Compilers} ./myprog.cpp # Generates myprog.gcda g++ -fprofile-use myprog.cpp # Use profile when optimising \end{minted} -\end{sidenotebox} +\end{sidenotebox} Correctness is difficult. \begin{itemize} @@ -67,7 +67,7 @@ \subsection{Query Equivalence} \begin{itemize} \item We can determine equivalences between compositions of operators. \item Substitutions of a part of a plan with an equivalent, results in a new equivalent plan. - \item We can use this to transform plans into more optimal (but equivalent) plans. + \item We can use this to transform plans into more optimal (but equivalent) plans. \end{itemize} \begin{sidenotebox}{MonetDB Optimiser} @@ -102,7 +102,7 @@ \section{Peephole Transformations} \subsection{Avoiding Cycles} \begin{definitionbox}{Analytically Optimal Plan} - The final plan output of the optimiser (not necessarily the most optimal plan). + The final plan output of the optimiser (not necessarily the most optimal plan). \end{definitionbox} \begin{center} \includegraphics[width=.8\textwidth]{optimisation/images/optimiser_cycle.drawio.png} @@ -119,7 +119,7 @@ \subsection{Branches} \textbf{Simplicity} & Very easy to implement (particularly with pattern matching). \\ \textbf{Time} & Matching and applying rules is faster than more holistic approaches. \\ \textbf{Verifiability} & Can check each rule for correctness by checking if all rules produce semantically equivalent sub-plans. \\ - \textbf{Composability} & Can easily add new rules to be composed with previous, rules can enable new rules to be applied. \\ + \textbf{Composability} & Can easily add new rules to be composed with previous, rules can enable new rules to be applied. \\ \end{tabbox} \begin{tabbox}{consbox} \textbf{Loops} & Developer must be careful to not introduce potential loops in rule application. \\ @@ -129,27 +129,27 @@ \subsection{Branches} \section{Classifying Optimisation} \begin{center} \begin{tabular}{l p{.8\textwidth}} - \textbf{Algorithm} & The implementation of operators (e.g joins). \\ + \textbf{Algorithm} & The implementation of operators (e.g joins). 
\\ \textbf{Data} & Data \& metadata held by the system (e.g cardinalities, histograms) \\ \end{tabular} \end{center} \begin{center} \begin{tabular}{l l l c c} - & & &\multicolumn{2}{c}{\textbf{Algorithm}} \\ - & & & Agnostic & Aware \\ - & & & \textit{Logical} & \textit{Physical} \\ - \multirow{2}{*}{\textbf{Data}} & Agnostic & \textit{Rule-Based} & $\bullet$ & $\bullet$ \\ - & Aware & \textit{Cost-Based} & $\bullet$ & $\bullet$ \\ + & & & \multicolumn{2}{c}{\textbf{Algorithm}} \\ + & & & Agnostic & Aware \\ + & & & \textit{Logical} & \textit{Physical} \\ + \multirow{2}{*}{\textbf{Data}} & Agnostic & \textit{Rule-Based} & $\bullet$ & $\bullet$ \\ + & Aware & \textit{Cost-Based} & $\bullet$ & $\bullet$ \\ \end{tabular} \end{center} In DBMS optimisations are defined as operating on \textit{logical} or \textit{physical plans}, and are either \textit{rule-based} or \textit{cost-based}. \begin{center} \begin{tabular}{l l p{.7\textwidth}} - \textbf{Logical} & \textit{Algorithm-Agnostic} & Deals only with relational algebra. \\ - \textbf{Physical} & \textit{Algorithm-Aware} & Can use different operator implementations, indices etc. \\ - \textbf{Rule-Based} & \textit{Data-Agnostic} & Applying optimisation rules that are almost always beneficial. \\ - \textbf{Cost-Based} & \textit{Data-Aware} & Using data to estimate the cost of operations in order to determine which transformations to apply (e.g reordering selections based on each's estimated selectivity). \\ + \textbf{Logical} & \textit{Algorithm-Agnostic} & Deals only with relational algebra. \\ + \textbf{Physical} & \textit{Algorithm-Aware} & Can use different operator implementations, indices etc. \\ + \textbf{Rule-Based} & \textit{Data-Agnostic} & Applying optimisation rules that are almost always beneficial. \\ + \textbf{Cost-Based} & \textit{Data-Aware} & Using data to estimate the cost of operations in order to determine which transformations to apply (e.g reordering selections based on each's estimated selectivity). \\ \end{tabular} \end{center} @@ -157,11 +157,11 @@ \section{Classifying Optimisation} % diverging plans \section{Logical Optimisation} -In order to demonstrate logical optimisation we use a representation of +In order to demonstrate logical optimisation we use a representation of (pseudo) relational algebra in Haskell. \begin{center} -\begin{minipage}{.5\textwidth} - \begin{minted}{haskell} + \begin{minipage}{.5\textwidth} + \begin{minted}{haskell} data Operator = Scan Table | Select Operator Predicate @@ -172,14 +172,14 @@ \section{Logical Optimisation} | Union Operator Operator | Aggregation Operator AggFun | TopN Operator SortBy - \end{minted} -\end{minipage} \hfill \begin{minipage}{.49\textwidth} - \begin{itemize} - \item Purely logical representation, Processing model \& operator implementations not specified. - \item Other functions for predicting cost, ordering predicates defined - \item Using \mintinline{haskell}{data} to allow for easy pattern matching, rather than using an operator typeclass. - \end{itemize} -\end{minipage} + \end{minted} + \end{minipage} \hfill \begin{minipage}{.49\textwidth} + \begin{itemize} + \item Purely logical representation, Processing model \& operator implementations not specified. + \item Other functions for predicting cost, ordering predicates defined + \item Using \mintinline{haskell}{data} to allow for easy pattern matching, rather than using an operator typeclass. 
+ \end{itemize} + \end{minipage} \end{center} We include basic functions for applying transformations to the plan: \begin{minted}{haskell} @@ -202,7 +202,7 @@ \section{Logical Optimisation} = case peep orig of Just opt -> opt Nothing -> apply (root peep) orig -\end{minted} +\end{minted} All that remains is to determine the \mintinline{haskell}{Peephole}'s rules. \begin{sidenotebox}{Your turn!} One way to further simplify the representation is to embed RA as a DSL within another language. \href{https://racket-lang.org/}{Racket} (\textit{the language oriented programming language}) is designed for this. Have a go with your own implementation! @@ -244,7 +244,7 @@ \subsubsection{Selection Pushdown} \end{center} Selections can be \textit{pushed down} through joins if they only use attributed from one side of the join. \begin{itemize} - \item As selections are pipelineable, this often a good optimisation when the underlying processing model is volcano. + \item As selections are pipelineable, this often a good optimisation when the underlying processing model is volcano. \end{itemize} \begin{minted}{sql} SELECT * FROM opL JOIN opR WHERE p2; @@ -266,8 +266,8 @@ \subsubsection{Selection Pushdown} Is \textit{selection pushdown} ever not very beneficial, provide some edge cases? \tcblower \begin{itemize} - \item If the selectivity of the selection is $100\%$ and the join does not increase cardinality (no benefit). - \item If the join significantly reduces cardinality. + \item If the selectivity of the selection is $100\%$ and the join does not increase cardinality (no benefit). + \item If the join significantly reduces cardinality. \end{itemize} \unfinished % Lecture mentions function calls vs access & fk index based joins @@ -277,7 +277,7 @@ \subsubsection{Selection Ordering} \begin{center} \includegraphics[width=.4\textwidth]{optimisation/images/selection_reordering.drawio.png} \end{center} -Reordering selections to reduce cardinality at the earliest possible operator. +Reordering selections to reduce cardinality at the earliest possible operator. \begin{itemize} \item We infer which selection has the lowest selectivity using a heuristic \item A common heuristic for comparison operators: \mintinline{sql}{==} $ < $ ( \mintinline{sql}{<} and \mintinline{sql}{>} ) $ < $ ( \mintinline{sql}{<=} and \mintinline{sql}{>=} ) $ < $ \mintinline{sql}{<>}. @@ -351,9 +351,9 @@ \subsubsection{Histograms} \begin{minipage}{.49\textwidth} \begin{center} \begin{tabular}{l c c c c c} - \multicolumn{6}{c}{$\underline{histogram_a}$} \\ - \textbf{values} & $v_1$ & $v_2$ & $v_3$ & \dots & $v_n$ \\ - \textbf{frequency} & $c_1$ & $c_2$ & $c_3$ & \dots & $c_n$ \\ + \multicolumn{6}{c}{$\underline{histogram_a}$} \\ + \textbf{values} & $v_1$ & $v_2$ & $v_3$ & \dots & $v_n$ \\ + \textbf{frequency} & $c_1$ & $c_2$ & $c_3$ & \dots & $c_n$ \\ \end{tabular} \end{center} \end{minipage} \hfill \begin{minipage}{.49\textwidth} @@ -375,20 +375,20 @@ \subsubsection{Multidimensional Histograms} Often attribute values are correlated (e.g largest orders tend to be urgent). 
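Before the two-dimensional case, a minimal sketch of the one-dimensional estimate from the frequency histogram above (the \mintinline{cpp}{Histogram} type and function name are hypothetical, not from the accompanying code):
\begin{minted}{cpp}
#include <cstddef>
#include <unordered_map>

// Hypothetical per-attribute frequency histogram: value -> count.
struct Histogram {
  std::unordered_map<int, std::size_t> frequency;
  std::size_t total = 0; // number of tuples the counts were taken over
};

// Estimated selectivity of (attribute == value), i.e. c_i / total.
double equality_selectivity(const Histogram &h, int value) {
  if (h.total == 0)
    return 0.0;
  auto it = h.frequency.find(value);
  return it == h.frequency.end()
             ? 0.0
             : static_cast<double>(it->second) / static_cast<double>(h.total);
}
\end{minted}
A two-dimensional histogram extends this by keying the counts on a pair of attribute values, which gives the conditional probabilities used in the selectivity formula below.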
\begin{center} \begin{tabular}{l l c c c c c} - \multicolumn{7}{c}{$\underline{histogram_{(a_1, a_2)}}$} \\ - & & \multicolumn{5}{c}{attribute $a_1$} \\ - & & ${v_{a_1}}_1$ & ${v_{a_1}}_2$ & ${v_{a_1}}_3$ & \dots & ${v_{a_1}}_n$ \\ - \multirow{5}{*}{attribute $a_2$} & ${v_{a_2}}_1$ & $c_{(1,1)}$ & $c_{(2,1)}$ & $c_{(3,1)}$ & $c_{(4,1)}$ & $c_{(5,1)}$ \\ - & ${v_{a_2}}_2$ & $c_{(1,2)}$ & $c_{(2,2)}$ & $c_{(3,2)}$ & $c_{(4,2)}$ & $c_{(5,2)}$ \\ - & ${v_{a_2}}_3$ & $c_{(1,3)}$ & $c_{(2,3)}$ & $c_{(3,3)}$ & $c_{(4,3)}$ & $c_{(5,3)}$ \\ - & \vdots & $c_{(1,4)}$ & $c_{(2,4)}$ & $c_{(3,4)}$ & $c_{(4,4)}$ & $c_{(5,4)}$ \\ - & ${v_{a_2}}_n$ & $c_{(1,5)}$ & $c_{(2,5)}$ & $c_{(3,5)}$ & $c_{(4,5)}$ & $c_{(5,5)}$ \\ + \multicolumn{7}{c}{$\underline{histogram_{(a_1, a_2)}}$} \\ + & & \multicolumn{5}{c}{attribute $a_1$} \\ + & & ${v_{a_1}}_1$ & ${v_{a_1}}_2$ & ${v_{a_1}}_3$ & \dots & ${v_{a_1}}_n$ \\ + \multirow{5}{*}{attribute $a_2$} & ${v_{a_2}}_1$ & $c_{(1,1)}$ & $c_{(2,1)}$ & $c_{(3,1)}$ & $c_{(4,1)}$ & $c_{(5,1)}$ \\ + & ${v_{a_2}}_2$ & $c_{(1,2)}$ & $c_{(2,2)}$ & $c_{(3,2)}$ & $c_{(4,2)}$ & $c_{(5,2)}$ \\ + & ${v_{a_2}}_3$ & $c_{(1,3)}$ & $c_{(2,3)}$ & $c_{(3,3)}$ & $c_{(4,3)}$ & $c_{(5,3)}$ \\ + & \vdots & $c_{(1,4)}$ & $c_{(2,4)}$ & $c_{(3,4)}$ & $c_{(4,4)}$ & $c_{(5,4)}$ \\ + & ${v_{a_2}}_n$ & $c_{(1,5)}$ & $c_{(2,5)}$ & $c_{(3,5)}$ & $c_{(4,5)}$ & $c_{(5,5)}$ \\ \end{tabular} \end{center} \begin{itemize} \item Store multiple histograms to show frequencies of attribute values, given other attribute's value. - \item Number of histograms grows combinatorially with number of tables. \toimprove + \item Number of histograms grows combinatorially with number of tables. \item Reducing the number of histograms, but still producing good selectivity estimates is an open area of research. \end{itemize} \[selectivity(a_1 = v_1 \land a_2 = v_2) = P(a_1 = v_1 | a_2 = v_2) \times P(a_2 = v_2) = \cfrac{histogram_{(a_1, a_2)}.(v_1, v_2)}{histogram_{(a_1, a_2)}.total} \] @@ -400,7 +400,7 @@ \section{Physical Optimisation} \item Operator implementations (e.g which join: sort-merge, hash, nested loop, index based join etc) \item Costs of different implementations (e.g hash join vs nested-loop $\to$ time versus memory) \item Available indices \& data structure choices (e.g type of hashmap, hash function) - \end{itemize} + \end{itemize} Physical plan optimisation focuses on optimising the plan for the specific system the query is executed on. \end{definitionbox} @@ -418,10 +418,10 @@ \subsection{Rule Based Physical Optimisation} Much like \textit{logical rule-based optimisation}, (almost) universally beneficial (given the decided cost metric) rules to improve performance. \begin{center} \begin{tabular}{l p{.8\textwidth}} - \textbf{Data structures} & Always use hash map with rehashing for probe if expected collisions are high. \\ - \textbf{Parallelism} & always use parallel sort for \mintinline{sql}{ORDER BY}, always partition hash joins \\ + \textbf{Data structures} & Always use hash map with rehashing for probe if expected collisions are high. \\ + \textbf{Parallelism} & always use parallel sort for \mintinline{sql}{ORDER BY}, always partition hash joins \\ \textbf{Using Indices} & If foreign key index exists, always use for foreign key join, if getting range always use available bitmap or B+ tree index. \\ - \textbf{Cache} & Always use cache-conscious partitioning to improve locality. \\ + \textbf{Cache} & Always use cache-conscious partitioning to improve locality. 
\\ \end{tabular} \end{center} @@ -430,11 +430,11 @@ \subsection{Cost Based Physical Optimisation} \begin{center} \begin{tabular}{l p{.8\textwidth}} \textbf{Data} & Consider cardinalities \& how this affect operator choice (e.g choose sort-merge join over hash if the required hashtable is too large for the buffer pool). \\ - \textbf{Hardware} & Function call overhead (for this architecture), buffer pool size, access latencies, available parallelism (hardware threads). \\ - \textbf{Algorithm} & Must consider how algorithms expected costs change with parameters (e.g cardinality) \\ + \textbf{Hardware} & Function call overhead (for this architecture), buffer pool size, access latencies, available parallelism (hardware threads). \\ + \textbf{Algorithm} & Must consider how algorithms expected costs change with parameters (e.g cardinality) \\ \end{tabular} \end{center} -This is the current state of the art in optimisation. +This is the current state of the art in optimisation. \section{SparkSQL} \begin{sidenotebox}{SparkSQL Logical Optimiser} diff --git a/60029 - Data Processing Systems/processing_models/code/bulk_processing/.gitignore b/60029 - Data Processing Systems/processing_models/code/bulk_processing/.gitignore new file mode 100644 index 0000000..c795b05 --- /dev/null +++ b/60029 - Data Processing Systems/processing_models/code/bulk_processing/.gitignore @@ -0,0 +1 @@ +build \ No newline at end of file diff --git a/60029 - Data Processing Systems/processing_models/code/bulk_processing/CMakeLists.txt b/60029 - Data Processing Systems/processing_models/code/bulk_processing/CMakeLists.txt new file mode 100644 index 0000000..b640f23 --- /dev/null +++ b/60029 - Data Processing Systems/processing_models/code/bulk_processing/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.22) +project(Functions) + +include(FetchContent) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED YES) +set(CMAKE_CXX_EXTENSIONS NO) + +add_executable(Examples examples.cpp) diff --git a/60029 - Data Processing Systems/processing_models/code/bulk_processing/README.md b/60029 - Data Processing Systems/processing_models/code/bulk_processing/README.md new file mode 100644 index 0000000..d3715c7 --- /dev/null +++ b/60029 - Data Processing Systems/processing_models/code/bulk_processing/README.md @@ -0,0 +1,13 @@ +## What is this? +[Examples](examples.h) for a basic bulk processing example. + +## To build & Run +```bash +cmake -S . -B build -DCMAKE_BUILD_TYPE=Release +make -j -C build/ +./build/Examples +``` + +## Contribute! 
+- Adding tests, comparative benchmark against [volcano](./../volcano/) + diff --git a/60029 - Data Processing Systems/processing_models/code/bulk_operators.cpp b/60029 - Data Processing Systems/processing_models/code/bulk_processing/examples.cpp similarity index 63% rename from 60029 - Data Processing Systems/processing_models/code/bulk_operators.cpp rename to 60029 - Data Processing Systems/processing_models/code/bulk_processing/examples.cpp index d38a9a3..6043029 100644 --- a/60029 - Data Processing Systems/processing_models/code/bulk_operators.cpp +++ b/60029 - Data Processing Systems/processing_models/code/bulk_processing/examples.cpp @@ -1,12 +1,10 @@ #include #include -using namespace std; - // We create a basic NAry table type with type V (some std::variant) -template using Row = vector; +template using Row = std::vector; -template using Table = vector>; +template using Table = std::vector>; template size_t select_eq(Table &outputBuffer, const Table &inputBuffer, @@ -41,32 +39,34 @@ void example_bulk() { select_eq(UrgentAndPendingOrders, PendingOrders, URGENT, 2); for (auto &r : UrgentAndPendingOrders) { - cout << "id: " << r[0] << endl; + std::cout << "id: " << r[0] << std::endl; } } -using Candidates = vector; +using Candidates = std::vector; -template -size_t add_candidates(const Table& underlyingBuffer, Candidates& outputRows) { - for (uint32_t i = 0; i < underlyingBuffer.size(); i++) { - outputRows.push_back(i); - } - return outputRows.size(); +template +size_t add_candidates(const Table &underlyingBuffer, + Candidates &outputRows) { + for (uint32_t i = 0; i < underlyingBuffer.size(); i++) { + outputRows.push_back(i); + } + return outputRows.size(); } -template -size_t select_eq(const Table& underlyingBuffer, Candidates& outputRows, const Candidates& inputRows, V eq_value, size_t attribOffset) { - for (const uint32_t index : inputRows) { - if (underlyingBuffer[index][attribOffset] == eq_value) { - outputRows.push_back(index); - } +template +size_t select_eq(const Table &underlyingBuffer, Candidates &outputRows, + const Candidates &inputRows, V eq_value, size_t attribOffset) { + for (const uint32_t index : inputRows) { + if (underlyingBuffer[index][attribOffset] == eq_value) { + outputRows.push_back(index); } - return outputRows.size(); + } + return outputRows.size(); } void example_reference_bulk() { - // we can make a basic example with int only values as + // we can make a basic example with int only values as // CREATE TABLE Orders (orderId int, status int, urgency int); // C-style enums used for brevity enum Urgency { URGENT, NOT_URGENT, IGNORE }; @@ -80,17 +80,17 @@ void example_reference_bulk() { {4, PENDING, URGENT}, }; - Candidates OrdersCandidates, PendingOrders, UrgentAndPendingOrders; add_candidates(Orders, OrdersCandidates); select_eq(Orders, PendingOrders, OrdersCandidates, PENDING, 1); select_eq(Orders, UrgentAndPendingOrders, PendingOrders, URGENT, 2); for (auto &r : UrgentAndPendingOrders) { - cout << "id: " << Orders[r][0] << endl; + std::cout << "id: " << Orders[r][0] << std::endl; } } int main() { - example_reference_bulk(); + example_bulk(); + example_reference_bulk(); } diff --git a/60029 - Data Processing Systems/processing_models/code/functions/lambdas.cpp b/60029 - Data Processing Systems/processing_models/code/functions/lambdas.cpp index 67ef036..df6bca9 100644 --- a/60029 - Data Processing Systems/processing_models/code/functions/lambdas.cpp +++ b/60029 - Data Processing Systems/processing_models/code/functions/lambdas.cpp @@ -1,88 +1,90 @@ #include 
-#include -#include #include +#include +#include /* Helper code for getting type names */ -template -consteval auto t_location() { - const auto& loc = std::source_location::current(); - return loc.function_name(); +template consteval auto t_location() { + const auto &loc = std::source_location::current(); + return loc.function_name(); } -template -std::string type() { - constexpr auto prefix_len = std::char_traits::length("consteval auto t_location() [with T = "); - std::string s(t_location()); - return std::string(&s[prefix_len], &s[s.size()-1]); +template std::string type() { + constexpr auto prefix_len = + std::char_traits::length("consteval auto t_location() [with T = "); + std::string s(t_location()); + return std::string(&s[prefix_len], &s[s.size() - 1]); } // We define a lambda, each lambda has its own type. -// - We cannot write this type, but we left the compiler deduce it using `auto` -// - Everywhere we use pass & use add, we are effectively just making a normal -// function call (no need to pass function pointers (compiler can bake in the +// - We cannot write this type, but we left the compiler deduce it using `auto` +// - Everywhere we use pass & use add, we are effectively just making a normal +// function call (no need to pass function pointers (compiler can bake in the // jump), can inline. -// - However, this means we get no runtime polymorphism. When passing add to a -// function as an auto-parameter we are not `passing a lambda`, but rather +// - However, this means we get no runtime polymorphism. When passing add to a +// function as an auto-parameter we are not `passing a lambda`, but rather // telling the compiler to generate a version of the function that uses `add` -// (it is an implicit template) -auto add = [](int a, int b){ return a + b; }; +// (it is an implicit template) +auto add = [](int a, int b) { return a + b; }; // For example both below are equivalent -int apply_op(auto op) { return op(2,3); } -template int apply_op() { return op(2,3); } +int apply_op(auto op) { return op(2, 3); } +template int apply_op() { return op(2, 3); } -// But wait a minute! You said zero-sized, but `sizeof(add) = 1`, why is it a byte large?! -// - zero sized types are very useful, but can be a headache for compiler writers -auto lambda_1 = []{ return 1; }; -auto lambda_2 = []{ return 2; }; -// If `sizeof(lambda_1) = sizeof(lambda_2) = 0`, then `&lambda_1 = &lambda_2`, -// but they're different objects, and the cpp spec says different objects have +// But wait a minute! You said zero-sized, but `sizeof(add) = 1`, why is it a +// byte large?! +// - zero sized types are very useful for programmers, but a headache for +// compiler writers +auto lambda_1 = [] { return 1; }; +auto lambda_2 = [] { return 2; }; +// If `sizeof(lambda_1) = sizeof(lambda_2) = 0`, then `&lambda_1 = &lambda_2`, +// but they're different objects, and the cpp spec says different objects have // different addresses? 
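+// (Illustration of the consequence, assumed rather than asserted anywhere in
+//  this file: static_cast<const void*>(&lambda_1) != static_cast<const void*>(&lambda_2)
+//  must hold, so each captureless lambda object still occupies at least one
+//  byte. That is why sizeof(add) reports 1 rather than 0.)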
+// We could technically get around this in the context of a struct's members +// using [[no_unique_address]] -// C-stye function pointers +// C-style function pointers // - Jump to a pointer // - Runtime polymorphism -// - unary '+' is for type promotion (e.g bool -> int, lambda type -> function ptr) +// - unary '+' is for type promotion (e.g bool -> int, lambda type -> function +// ptr) int (*add_func_ptr)(int, int) = +add; -int apply_op_ptr(int op(int, int)) { - return op(2,3); -} +int apply_op_ptr(int op(int, int)) { return op(2, 3); } -// std::function is a generalised wrapper for lambdas, binds. +// std::function is a generalised wrapper for lambdas, binds. // - A function object // - Runtime polymorphism // - Preferable over using function pointers / more idiomatic cpp std::function add_std_func = add; - // virtuals can be used for runtime polymorphism // - struct's represented by: // my class { void *vtable; ... } // - the vtable is a const void*[] containing pointers to members. -// - each method has an implicit first argument of `this` (the class), some +// - each method has an implicit first argument of `this` (the class), some // languages do this explicitly (e.g Rust) struct VirtualAdd { - virtual int add(int a, int b) { return a + b; } + virtual int add(int a, int b) { return a + b; } }; VirtualAdd add_virtual; int main() { - std::cout << "for add:" << std::endl - << "Type: " << type() << std::endl - << "size: " << sizeof(decltype(add)) << std::endl; + std::cout << "for add:" << std::endl + << "Type: " << type() << std::endl + << "size: " << sizeof(decltype(add)) << std::endl; + + std::cout << "for add_func_ptr:" << std::endl + << "Type: " << type() << std::endl + << "size: " << sizeof(decltype(add_func_ptr)) << std::endl; - std::cout << "for add_func_ptr:" << std::endl - << "Type: " << type() << std::endl - << "size: " << sizeof(decltype(add_func_ptr)) << std::endl; - - std::cout << "for add_std_func:" << std::endl - << "Type: " << type() << std::endl - << "size: " << sizeof(decltype(add_std_func)) << std::endl; + std::cout << "for add_std_func:" << std::endl + << "Type: " << type() << std::endl + << "size: " << sizeof(decltype(add_std_func)) << std::endl; - std::cout << "for Virtuals:" << std::endl - << "Type: " << type() << std::endl - << "size: " << sizeof(decltype(add_virtual)) << std::endl - << "(size is just of the sizeof(void*) = " << sizeof(void*) << " vtable pointer)" << std::endl; + std::cout << "for Virtuals:" << std::endl + << "Type: " << type() << std::endl + << "size: " << sizeof(decltype(add_virtual)) << std::endl + << "(size is just of the sizeof(void*) = " << sizeof(void *) + << " vtable pointer)" << std::endl; } diff --git a/60029 - Data Processing Systems/processing_models/code/volcano/.gitignore b/60029 - Data Processing Systems/processing_models/code/volcano/.gitignore new file mode 100644 index 0000000..c795b05 --- /dev/null +++ b/60029 - Data Processing Systems/processing_models/code/volcano/.gitignore @@ -0,0 +1 @@ +build \ No newline at end of file diff --git a/60029 - Data Processing Systems/processing_models/code/volcano/CMakeLists.txt b/60029 - Data Processing Systems/processing_models/code/volcano/CMakeLists.txt new file mode 100644 index 0000000..5f172d8 --- /dev/null +++ b/60029 - Data Processing Systems/processing_models/code/volcano/CMakeLists.txt @@ -0,0 +1,35 @@ +cmake_minimum_required(VERSION 3.22) +project(Functions) + +include(FetchContent) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED YES) +set(CMAKE_CXX_EXTENSIONS NO) 
+ +FetchContent_Declare( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG main +) + +FetchContent_Declare( + googlebenchmark + GIT_REPOSITORY https://github.com/google/benchmark.git + GIT_TAG main +) + +FetchContent_MakeAvailable( + googletest + googlebenchmark +) + +add_executable(Benchmark benchmarks.cpp) +target_link_libraries(Benchmark benchmark::benchmark) + +add_executable(Test tests.cpp) +add_library(GTest::GTest INTERFACE IMPORTED) +target_link_libraries(GTest::GTest INTERFACE gtest_main) +target_link_libraries(Test GTest::GTest) + +add_executable(Examples examples.cpp) diff --git a/60029 - Data Processing Systems/processing_models/code/volcano/README.md b/60029 - Data Processing Systems/processing_models/code/volcano/README.md new file mode 100644 index 0000000..5972ddb --- /dev/null +++ b/60029 - Data Processing Systems/processing_models/code/volcano/README.md @@ -0,0 +1,18 @@ +## What is this? +[Operators](operators.h) for a basic volcano operator implementation. +- Uses templates for output types (for simplicity), an actual system would need to return variants. +- Tests and benchmarks currently empty. + +## To build & Run +```bash +cmake -S . -B build -DCMAKE_BUILD_TYPE=Release +make -j -C build/ +./build/Test +./build/Benchmark +./build/Examples +``` + +## Contribute! +- Adding new operators. +- Pretty printing the volcano operator structure / algebra from the operators themselves. +- Counting function calls (consider our use of templates here/inlining) diff --git a/60029 - Data Processing Systems/processing_models/code/volcano/benchmarks.cpp b/60029 - Data Processing Systems/processing_models/code/volcano/benchmarks.cpp new file mode 100644 index 0000000..8058a70 --- /dev/null +++ b/60029 - Data Processing Systems/processing_models/code/volcano/benchmarks.cpp @@ -0,0 +1,5 @@ +#include "operators.h" + +#include + +BENCHMARK_MAIN(); \ No newline at end of file diff --git a/60029 - Data Processing Systems/processing_models/code/volcano/examples.cpp b/60029 - Data Processing Systems/processing_models/code/volcano/examples.cpp new file mode 100644 index 0000000..7a42d77 --- /dev/null +++ b/60029 - Data Processing Systems/processing_models/code/volcano/examples.cpp @@ -0,0 +1,117 @@ +#include "operators.h" + +/* Prints for debugging purposes */ +template +std::ostream &operator<<(std::ostream &os, const std::vector &vec) { + os << "["; + for (const auto &iter : vec) { + os << " " << iter; + } + os << "]"; + return os; +} + +/* Note: need to unwrap the T (otherwise will match empty parameter + * pack/variadict template and fail of _Nth_type with 0 types). + */ +template +std::ostream &operator<<(std::ostream &os, const std::variant &var) { + std::visit([&os](auto &&arg) { os << arg; }, var); + return os; +} + +template void printConsume(Operator &op) { + op.open(); + for (auto val = op.next(); val.has_value(); val = op.next()) { + std::cout << val.value() << std::endl; + } + op.close(); +} + +void example1() { + std::cout << "SELECT *" << std::endl + << "FROM table CROSS JOIN table;" << std::endl; + + std::shared_ptr data = std::make_shared
(Table{{1, 'c', true}, + {1, 'c', false}, + {2, 'c', false}, + {1, 'd', true}, + {3, 'e', false}}); + + auto scan1 = std::make_unique>(data); + auto scan2 = std::make_unique>(data); + auto cross = std::make_unique>(std::move(scan1), + std::move(scan2)); + + Project> proj( + std::move(cross), [](std::tuple t) { + auto vec2 = std::get<1>(t); + auto vec1 = std::get<0>(t); + vec1.insert(vec1.end(), vec2.begin(), vec2.end()); + return vec1; + }); + + printConsume(proj); +} + +void example2() { + std::cout << "SELECT table.1, table.2" << std::endl + << "FROM table" << std::endl + << "WHERE table.0 = 1;" << std::endl; + std::shared_ptr
<Table> data = std::make_shared<Table>
(Table{{1, 'c', true}, + {1, 'c', false}, + {2, 'c', false}, + {1, 'd', true}, + {3, 'e', false}}); + + auto scan = std::make_unique>(data); + + auto filter = std::make_unique>( + std::move(scan), [](Row r) { return std::get(r[0]) == 1; }); + + Project proj(std::move(filter), [](Row r) { + return Row{r[1], r[2]}; + }); + + printConsume(proj); +} + +size_t hashValue(Value v) { return std::hash{}(v); } +size_t nextSlotLinear(size_t prev) { return prev + 1; } + +void example3() { + std::cout << "SELECT table.1, MAX(table.0)" << std::endl + << "FROM table" << std::endl + << "GROUP BY table.1;" << std::endl; + + std::shared_ptr
<Table> data = std::make_shared<Table>
(Table{{1, 'c', true}, + {1, 'd', true}, + {1, 'c', false}, + {2, 'c', false}, + {5, 'c', false}, + {3, 'e', false}}); + + auto scan = std::make_unique>(data); + + // Group by for single column + auto groupBySecondCol = [](Row r) { return r[1]; }; + auto aggregateSecondCol = [](std::optional r1, Row r2) { + if (r1.has_value()) { + return Row{std::max(std::get(r1.value()[0]), std::get(r2[0])), + r2[1]}; + } else { + return Row{r2[0], r2[1]}; + } + }; + + GroupBy groupby( + std::move(scan), groupBySecondCol, aggregateSecondCol); + + printConsume(groupby); +} + +int main() { + example1(); + example2(); + example3(); +} \ No newline at end of file diff --git a/60029 - Data Processing Systems/processing_models/code/volcano/operators.h b/60029 - Data Processing Systems/processing_models/code/volcano/operators.h new file mode 100644 index 0000000..5ebfb1f --- /dev/null +++ b/60029 - Data Processing Systems/processing_models/code/volcano/operators.h @@ -0,0 +1,361 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +template struct Operator { + virtual void open() = 0; + virtual void close() = 0; + virtual std::optional next() = 0; +}; + +template struct Scan : Operator { + using TableType = std::vector; + /* Many different operators can have a reference to and read the table. + * - shared_ptr drops table after it is no longer needed + * - must avoid copying very large table structure + */ + Scan(std::shared_ptr t) : _table(t), _index(0) { assert(_table); } + + /* No operation on open / close */ + void open() override {} + void close() override {} + + std::optional next() override { + if (_index < (*_table).size()) { + return (*_table)[_index++]; + } else { + return {}; + } + } + +private: + std::shared_ptr _table; + size_t _index; +}; + +template struct Project : Operator { + using Projection = std::function; + + Project(std::unique_ptr> child, Projection proj) + : _child(move(child)), _proj(proj) { + assert(_child); + } + + void open() override { _child->open(); } + void close() override { _child->close(); } + + std::optional next() override { + // Note: can be simplified with + // std::optional::and_then(std::function) in C++23 + auto next = _child->next(); + if (next.has_value()) { + return _proj(next.value()); + } else { + return {}; + } + } + +private: + std::unique_ptr> _child; + Projection _proj; +}; + +template struct Select : Operator { + using Predicate = std::function; + + Select(std::unique_ptr> child, Predicate pred) + : _child(move(child)), _pred(pred) { + assert(_child); + } + + void open() override { _child->open(); } + void close() override { _child->close(); } + + std::optional next() override { + auto candidate = _child->next(); + // keep getting candidates until there are no more, or one is valid. 
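+    // Each tuple rejected by the predicate costs a further virtual next()
+    // call on the child, so the number of calls per produced tuple grows as
+    // fewer tuples qualify (the per-tuple call overhead discussed in the
+    // notes on the Volcano model).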
+ while (candidate.has_value() && !_pred(candidate.value())) { + candidate = _child->next(); + } + return candidate; + } + +private: + std::unique_ptr> _child; + Predicate _pred; +}; + +template struct Union : Operator { + Union(std::unique_ptr> leftChild, + std::unique_ptr> rightChild) + : _leftChild(move(leftChild)), _rightChild(move(rightChild)) { + assert(_leftChild && _rightChild); + } + + void open() override { + _leftChild->open(); + _rightChild->open(); + } + void close() override { + _leftChild->close(); + _rightChild->close(); + } + + std::optional next() override { + auto candidate = _leftChild->next(); + if (candidate.has_value()) { + return candidate; + } else { + return _rightChild->next(); + } + } + +private: + std::unique_ptr> _leftChild, _rightChild; +}; + +/* The definition of difference forces the pipeline to be broken (buffering) */ +template struct Difference : Operator { + Difference(std::unique_ptr> fromChild, + std::unique_ptr> subChild) + : _fromChild(fromChild), _subChild(subChild), _subBuffer() { + assert(_fromChild && _subChild); + } + + void open() override { + _fromChild->open(); + _subChild->open(); + + // buffer all to subtract + for (auto candidate = _subChild->next(); candidate.has_value(); + candidate = _subChild->next()) { + _subBuffer.push_back(candidate); + } + } + void close() override { + _fromChild->close(); + _subChild->close(); + } + + std::optional next() override { + auto candidate = _fromChild->next(); + // keep gettihg next until there is no next candidate, or the candidate is + // not being subtracted + while (candidate.has_value() && _subBuffer.contains(candidate.value())) { + candidate = _fromChild->next(); + } + return candidate; + } + +private: + std::unique_ptr> _fromChild, _subChild; + std::unordered_set _subBuffer; +}; + +template +struct BreakingCrossProduct : Operator> { + BreakingCrossProduct(std::unique_ptr> leftChild, + std::unique_ptr> rightChild) + : _leftChild(move(leftChild)), _rightChild(move(rightChild)), + _leftCurrent(), _rightIndex(0), _rightBuffer() { + assert(_leftChild && _rightChild); + } + + void open() override { + _leftChild->open(); + _rightChild->open(); + + // set first left (can be none -> in which case next will never return + // anything) + _leftCurrent = _leftChild->next(); + + // buffer in the entirety of the right + for (auto candidate = _rightChild->next(); candidate.has_value(); + candidate = _rightChild->next()) { + _rightBuffer.push_back(candidate.value()); + } + } + + void close() override { + _leftChild->close(); + _rightChild->close(); + } + + std::optional> next() override { + // instd::variant: _rightBuffer.size() > _rightIndex >= 0 + if (_leftCurrent.has_value() && !_rightBuffer.empty()) { + auto next_val = + std::make_tuple(_leftCurrent.value(), _rightBuffer[_rightIndex]); + + _rightIndex++; + if (_rightIndex == _rightBuffer.size()) { + _rightIndex = 0; + _leftCurrent = _leftChild->next(); + } + + return next_val; + } else { + return {}; + } + } + +private: + std::unique_ptr> _leftChild; + std::unique_ptr> _rightChild; + std::optional _leftCurrent; + size_t _rightIndex; + std::vector _rightBuffer; +}; + +template +struct CrossProduct : Operator> { + CrossProduct(std::unique_ptr> leftChild, + std::unique_ptr> rightChild) + : _leftChild(move(leftChild)), _rightChild(move(rightChild)), + _leftCurrent(), _rightBuffered(), _rightOffset(0) { + assert(_leftChild && _rightChild); + } + + void open() override { + _leftChild->open(); + _rightChild->open(); + _leftCurrent = _leftChild->next(); + } + void 
close() override { + _leftChild->close(); + _rightChild->close(); + } + + std::optional> next() override { + /* invariants: + * - _leftCurrent is already set + * - if there are no more _rightChild to get, then we are iterating over the + * _leftChild + */ + auto rightCandidate = _rightChild->next(); + if (rightCandidate.has_value()) { + // still getting content from the right had side + _rightBuffered.push_back(rightCandidate.value()); + } else if (_rightOffset == _rightBuffered.size()) { + // all tuples have been taken from right hand side, now using buffer + _leftCurrent = _leftChild->next(); + _rightOffset = 0; + } + + // only return if both sides have values + if (_leftCurrent.has_value() && !_rightBuffered.empty()) { + // get tuple and increment _rightOffset + return std::make_tuple(_leftCurrent.value(), + _rightBuffered[_rightOffset++]); + } else { + return {}; + } + } + +private: + std::unique_ptr> _leftChild; + std::unique_ptr> _rightChild; + std::optional _leftCurrent; + std::vector _rightBuffered; + size_t _rightOffset; +}; + +/* We use the template to determine the hash and nextSlot implementations used + * T -> type of data provided by the child + * S -> data output by the groupBy & aggregation + * K -> the type grouped on, produced by a grouping function (K group(T)) + * hash -> a function to convert a key into a hash + * nextSlot -> to determine next slot in collisions + */ +template +struct GroupBy : Operator { + using Aggregation = std::function, T)>; + using Grouping = std::function; + + GroupBy(std::unique_ptr> child, Grouping grouping, + Aggregation aggregation) + : _child(move(child)), _grouping(grouping), _aggregation(aggregation), + _hashTable(), _hashTableCursor(0) { + assert(_child); + } + + void open() override { + _child->open(); + + std::vector childValues; + for (auto currentVal = _child->next(); currentVal.has_value(); + currentVal = _child->next()) { + childValues.push_back(currentVal.value()); + } + + _hashTable = std::vector>>( + childValues.size(), std::optional>()); + for (T val : childValues) { + K key = _grouping(val); + size_t slot = hashFun(key) % _hashTable.size(); + while (_hashTable[slot].has_value() && + _hashTable[slot].value().first != key) { + slot = nextSlot(slot) % _hashTable.size(); + } + + // slot is now correct, either a value present with the same key, or none. + auto prev_val = _hashTable[slot].has_value() + ? _hashTable[slot].value().second + : std::optional(); + _hashTable[slot] = std::optional>( + std::make_pair(move(key), _aggregation(prev_val, val))); + } + + // all values moved into the hashtable, so std::vector deallocated + } + + void close() override { _child->close(); } + + std::optional next() override { + while (_hashTableCursor < _hashTable.size()) { + auto slot = _hashTable[_hashTableCursor]; + _hashTableCursor++; + + if (slot.has_value()) { + return slot.value().second; + } + } + return {}; + } + +private: + Aggregation _aggregation; + Grouping _grouping; + std::unique_ptr> _child; + std::vector>> _hashTable; + size_t _hashTableCursor; +}; + +/* By templating our operators in terms of output, we can be more flexible with + * our implementation: + * Which types of values? e.g Scan>? + * Copy or indirection? e.g Scan or Scan + * + * When using operators, we will need to use some structure that encodes type at + * runtime, so we can use... 
+ */ + +/* Can only determine row size and types at runtime + * - length unknown -> use std::vector + * - types unknown -> use std::variant + * + * We use N-ary storage for the table (volcano processes row by row) + */ +using Value = std::variant; +using Row = std::vector; +using Table = std::vector; diff --git a/60029 - Data Processing Systems/processing_models/code/volcano/tests.cpp b/60029 - Data Processing Systems/processing_models/code/volcano/tests.cpp new file mode 100644 index 0000000..9361596 --- /dev/null +++ b/60029 - Data Processing Systems/processing_models/code/volcano/tests.cpp @@ -0,0 +1,5 @@ +#include "operators.h" + +#include "gtest/gtest.h" + +// Currently Untested - feel free to add your own! diff --git a/60029 - Data Processing Systems/processing_models/code/volcano_operators.cpp b/60029 - Data Processing Systems/processing_models/code/volcano_operators.cpp deleted file mode 100644 index efb9837..0000000 --- a/60029 - Data Processing Systems/processing_models/code/volcano_operators.cpp +++ /dev/null @@ -1,566 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -template -struct Operator -{ - virtual void open() = 0; - virtual void close() = 0; - virtual optional next() = 0; -}; - -template -struct Scan : Operator -{ - using TableType = vector; - /* Many different operators can have a reference to and read the table. - * - shared_ptr drops table after it is no longer needed - * - must avoid copying very large table structure - */ - Scan(shared_ptr t) : _table(t), _index(0) { assert(_table); } - - /* No operation on open / close */ - void open() override {} - void close() override {} - - optional next() override - { - if (_index < (*_table).size()) - { - return (*_table)[_index++]; - } - else - { - return {}; - } - } - -private: - shared_ptr _table; - size_t _index; -}; - -template -struct Project : Operator -{ - using Projection = function; - - Project(unique_ptr> child, Projection proj) - : _child(move(child)), _proj(proj) - { - assert(_child); - } - - void open() override { _child->open(); } - void close() override { _child->close(); } - - optional next() override - { - // Note: can be simplified with optional::and_then(function) in C++23 - auto next = _child->next(); - if (next.has_value()) - { - return _proj(next.value()); - } - else - { - return {}; - } - } - -private: - unique_ptr> _child; - Projection _proj; -}; - -template -struct Select : Operator -{ - using Predicate = function; - - Select(unique_ptr> child, Predicate pred) - : _child(move(child)), _pred(pred) - { - assert(_child); - } - - void open() override { _child->open(); } - void close() override { _child->close(); } - - optional next() override - { - auto candidate = _child->next(); - // keep getting candidates until there are no more, or one is valid. 
- while (candidate.has_value() && !_pred(candidate.value())) - { - candidate = _child->next(); - } - return candidate; - } - -private: - unique_ptr> _child; - Predicate _pred; -}; - -template -struct Union : Operator -{ - Union(unique_ptr> leftChild, unique_ptr> rightChild) - : _leftChild(move(leftChild)), _rightChild(move(rightChild)) - { - assert(_leftChild && _rightChild); - } - - void open() override - { - _leftChild->open(); - _rightChild->open(); - } - void close() override - { - _leftChild->close(); - _rightChild->close(); - } - - optional next() override - { - auto candidate = _leftChild->next(); - if (candidate.has_value()) - { - return candidate; - } - else - { - return _rightChild->next(); - } - } - -private: - unique_ptr> _leftChild, _rightChild; -}; - -/* The definition of difference forces the pipeline to be broken (buffering) */ -template -struct Difference : Operator -{ - Difference(unique_ptr> fromChild, - unique_ptr> subChild) - : _fromChild(fromChild), _subChild(subChild), _subBuffer() - { - assert(_fromChild && _subChild); - } - - void open() override - { - _fromChild->open(); - _subChild->open(); - - // buffer all to subtract - for (auto candidate = _subChild->next(); candidate.has_value(); - candidate = _subChild->next()) - { - _subBuffer.push_back(candidate); - } - } - void close() override - { - _fromChild->close(); - _subChild->close(); - } - - optional next() override - { - auto candidate = _fromChild->next(); - // keep gettihg next until there is no next candidate, or the candidate is - // not being subtracted - while (candidate.has_value() && _subBuffer.contains(candidate.value())) - { - candidate = _fromChild->next(); - } - return candidate; - } - -private: - unique_ptr> _fromChild, _subChild; - unordered_set _subBuffer; -}; - -template -struct BreakingCrossProduct : Operator> -{ - BreakingCrossProduct(unique_ptr> leftChild, - unique_ptr> rightChild) - : _leftChild(move(leftChild)), _rightChild(move(rightChild)), - _leftCurrent(), _rightIndex(0), _rightBuffer() - { - assert(_leftChild && _rightChild); - } - - void open() override - { - _leftChild->open(); - _rightChild->open(); - - // set first left (can be none -> in which case next will never return - // anything) - _leftCurrent = _leftChild->next(); - - // buffer in the entirety of the right - for (auto candidate = _rightChild->next(); candidate.has_value(); - candidate = _rightChild->next()) - { - _rightBuffer.push_back(candidate.value()); - } - } - - void close() override - { - _leftChild->close(); - _rightChild->close(); - } - - optional> next() override - { - // invariant: _rightBuffer.size() > _rightIndex >= 0 - if (_leftCurrent.has_value() && !_rightBuffer.empty()) - { - auto next_val = - make_tuple(_leftCurrent.value(), _rightBuffer[_rightIndex]); - - _rightIndex++; - if (_rightIndex == _rightBuffer.size()) - { - _rightIndex = 0; - _leftCurrent = _leftChild->next(); - } - - return next_val; - } - else - { - return {}; - } - } - -private: - unique_ptr> _leftChild; - unique_ptr> _rightChild; - optional _leftCurrent; - size_t _rightIndex; - vector _rightBuffer; -}; - -template -struct CrossProduct : Operator> -{ - CrossProduct(unique_ptr> leftChild, - unique_ptr> rightChild) - : _leftChild(move(leftChild)), _rightChild(move(rightChild)), - _leftCurrent(), _rightBuffered(), _rightOffset(0) - { - assert(_leftChild && _rightChild); - } - - void open() override - { - _leftChild->open(); - _rightChild->open(); - _leftCurrent = _leftChild->next(); - } - void close() override - { - _leftChild->close(); - 
_rightChild->close(); - } - - optional> next() override - { - /* invariants: - * - _leftCurrent is already set - * - if there are no more _rightChild to get, then we are iterating over the - * _leftChild - */ - auto rightCandidate = _rightChild->next(); - if (rightCandidate.has_value()) - { - // still getting content from the right had side - _rightBuffered.push_back(rightCandidate.value()); - } - else if (_rightOffset == _rightBuffered.size()) - { - // all tuples have been taken from right hand side, now using buffer - _leftCurrent = _leftChild->next(); - _rightOffset = 0; - } - - // only return if both sides have values - if (_leftCurrent.has_value() && !_rightBuffered.empty()) - { - // get tuple and increment _rightOffset - return make_tuple(_leftCurrent.value(), _rightBuffered[_rightOffset++]); - } - else - { - return {}; - } - } - -private: - unique_ptr> _leftChild; - unique_ptr> _rightChild; - optional _leftCurrent; - vector _rightBuffered; - size_t _rightOffset; -}; - -/* We use the template to determine the hash and nextSlot implementations used - * T -> type of data provided by the child - * S -> data output by the groupBy & aggregation - * K -> the type grouped on, produced by a grouping function (K group(T)) - * hash -> a function to convert a key into a hash - * nextSlot -> to determine next slot in collisions - */ -template -struct GroupBy : Operator -{ - using Aggregation = function, T)>; - using Grouping = function; - - GroupBy(unique_ptr> child, - Grouping grouping, - Aggregation aggregation) : _child(move(child)), _grouping(grouping), - _aggregation(aggregation), _hashTable(), _hashTableCursor(0) - { - assert(_child); - } - - void open() override - { - _child->open(); - - vector childValues; - for (auto currentVal = _child->next(); - currentVal.has_value(); - currentVal = _child->next()) - { - childValues.push_back(currentVal.value()); - } - - _hashTable = vector>>(childValues.size(), optional>()); - for (T val : childValues) - { - K key = _grouping(val); - size_t slot = hashFun(key) % _hashTable.size(); - while (_hashTable[slot].has_value() && _hashTable[slot].value().first != key) - { - slot = nextSlot(slot) % _hashTable.size(); - } - - // slot is now correct, either a value present with the same key, or none. - auto prev_val = _hashTable[slot].has_value() ? _hashTable[slot].value().second : optional(); - _hashTable[slot] = optional>(make_pair(move(key), _aggregation(prev_val, val))); - } - - // all values moved into the hashtable, so vector deallocated - } - - void close() override - { - _child->close(); - } - - optional next() override - { - while (_hashTableCursor < _hashTable.size()) - { - auto slot = _hashTable[_hashTableCursor]; - _hashTableCursor++; - - if (slot.has_value()) - { - return slot.value().second; - } - } - return {}; - } - -private: - Aggregation _aggregation; - Grouping _grouping; - unique_ptr> _child; - vector>> _hashTable; - size_t _hashTableCursor; -}; - -/* By templating our operators in terms of output, we can be more flexible with - * our implementation: - * Which types of values? e.g Scan>? - * Copy or indirection? e.g Scan or Scan - * - * When using operators, we will need to use some structure that encodes type at - * runtime, so we can use... 
- */ - -/* Can only determine row size and types at runtime - * - size unknown -> use vector - * - types unknown -> use variant - * - * We use N-ary storage for the table (volcano processes row by row) - */ -using Value = variant; -using Row = vector; -using Table = vector; - -/* Prints for debugging purposes */ -template -std::ostream &operator<<(std::ostream &os, const std::vector &vec) -{ - os << "["; - for (const auto &iter : vec) - { - os << " " << iter; - } - os << "]"; - return os; -} - -/* Note: need to unwrap the T (otherwise will match empty parameter - * pack/variadict template and fail of _Nth_type with 0 types). - */ -template -std::ostream &operator<<(std::ostream &os, const std::variant &var) -{ - std::visit([&os](auto &&arg) - { os << arg; }, - var); - return os; -} - -/* SELECT * - * FROM table cross join table; - */ -void example1() -{ - shared_ptr
<Table> data = make_shared<Table>
(Table{ - {1, 'c', true}, - {1, 'c', false}, - {2, 'c', false}, - {1, 'd', true}, - {3, 'e', false}}); - - auto scan1 = make_unique>(data); - auto scan2 = make_unique>(data); - auto cross = make_unique>(move(scan1), move(scan2)); - - Project> proj(move(cross), [](tuple t) - { - auto vec2 = get<1>(t); - auto vec1 = get<0>(t); - vec1.insert(vec1.end(), vec2.begin(), vec2.end()); - return vec1; }); - - proj.open(); - for (auto val = proj.next(); val.has_value(); val = proj.next()) - { - cout << val.value() << endl; - } - proj.close(); -} - -/* SELECT table.1, table.2 - * FROM table - * WHERE table.0 = 1; - */ -void example2() -{ - shared_ptr
<Table> data = make_shared<Table>
(Table{ - {1, 'c', true}, - {1, 'c', false}, - {2, 'c', false}, - {1, 'd', true}, - {3, 'e', false}}); - - auto scan = make_unique>(data); - - auto filter = make_unique>(move(scan), [](Row r) - { return get(r[0]) == 1; }); - - Project proj(move(filter), [](Row r) - { return Row{r[1], r[2]}; }); - - proj.open(); - for (auto val = proj.next(); val.has_value(); val = proj.next()) - { - cout << val.value() << endl; - } - proj.close(); -} - -/* SELECT table.1, MAX(table.0) - * FROM table - * GROUP BY table.1; - */ -size_t hashValue(Value v) -{ - return hash{}(v); -} -size_t nextSlotLinear(size_t prev) -{ - return prev + 1; -} - -void example3() -{ - shared_ptr
<Table> data = make_shared<Table>
(Table{ - {1, 'c', true}, - {1, 'd', true}, - {1, 'c', false}, - {2, 'c', false}, - {5, 'c', false}, - {3, 'e', false}}); - - auto scan = make_unique>(data); - - // Group by for single column - auto groupBySecondCol = [](Row r) - { return r[1]; }; - auto aggregateSecondCol = [](optional r1, Row r2) - { - if (r1.has_value()) - { - return Row{max(get(r1.value()[0]), get(r2[0])), r2[1]}; - } - else - { - return Row{r2[0], r2[1]}; - } - }; - - GroupBy groupby(move(scan), groupBySecondCol, aggregateSecondCol); - - groupby.open(); - for (auto val = groupby.next(); val.has_value(); val = groupby.next()) - { - cout << val.value() << endl; - } - groupby.close(); -} - -int main() -{ - // example1(); - // example2(); - example3(); -} \ No newline at end of file diff --git a/60029 - Data Processing Systems/processing_models/processing_models.tex b/60029 - Data Processing Systems/processing_models/processing_models.tex index 6106323..92066db 100644 --- a/60029 - Data Processing Systems/processing_models/processing_models.tex +++ b/60029 - Data Processing Systems/processing_models/processing_models.tex @@ -10,9 +10,7 @@ \section{Motivation} \begin{definitionbox}{Function Objects} References to code that can be passed, invoked, change state and produce values. - \toimprove - % add use of auto, lambda's own type, then std::function, also explain other captures. - % Also use of auto arguments + See these notes associated code for more detail. \begin{minted}{cpp} #include @@ -42,20 +40,12 @@ \section{Volcano Processing} \textbf{I/O Behaviour} & As tuples are consumed as soon as they are produced, no waiting for I/O to create and buffer the next tuple. \\ \end{tabbox} \begin{tabbox}{consbox} - \textbf{Lots of Calls!} & CPU spends much time loading and calling function pointers to operators, predicates and aggregate functions. \\ + \textbf{Lots of Calls!} & Function calls are expensive, virtual calls even more so! Operators work my virtual calls to on parent operators per tuple, so the number of calls grows with the table size. \\ \end{tabbox} \subsection{Operators} A basic interface for operators can be devised as: -\begin{minted}{cpp} -template -struct Operator -{ - virtual void open() = 0; - virtual void close() = 0; - virtual optional next() = 0; -}; -\end{minted} +\inputminted[firstline=14, lastline=18]{cpp}{processing_models/code/volcano/operators.h} In order to allow the greatest flexibility in using our operators, they are parameterised by \mintinline{cpp}{typename T}. In the concrete examples this is set as a \textit{runtime tracked} type \mintinline{cpp}{Row} which is variable size, and contains variants of \mintinline{cpp}{int}, \mintinline{cpp}{char}, \mintinline{cpp}{bool}, etc. \\ @@ -73,147 +63,16 @@ \subsection{Operators} \subsubsection{Scan} Scans a table already loaded into memory to return its rows. -\begin{minted}{cpp} -template -struct Scan : Operator -{ - using TableType = vector; - /* Many different operators can have a reference to and read the table. 
- * - shared_ptr drops table after it is no longer needed - * - must avoid copying very large table structure - */ - Scan(shared_ptr t) : _table(t), _index(0) { assert(_table); } - - /* No operation on open / close */ - void open() override {} - void close() override {} - - optional next() override - { - if (_index < (*_table).size()) - { - return (*_table)[_index++]; - } - else - { - return {}; - } - } - -private: - shared_ptr _table; - size_t _index; -}; -\end{minted} +\inputminted[firstline=20, lastline=38]{cpp}{processing_models/code/volcano/operators.h} \subsubsection{Project} -\begin{minted}{cpp} -template -struct Project : Operator -{ - using Projection = function; - - Project(unique_ptr> child, Projection proj) - : _child(move(child)), _proj(proj) - { - assert(_child); - } - - void open() override { _child->open(); } - void close() override { _child->close(); } - - optional next() override - { - // Note: can be simplified with optional::and_then(function) in C++23 - auto next = _child->next(); - if (next.has_value()) - { - return _proj(next.value()); - } - else - { - return {}; - } - } - -private: - unique_ptr> _child; - Projection _proj; -}; -\end{minted} +\inputminted[firstline=45, lastline=65]{cpp}{processing_models/code/volcano/operators.h} \subsubsection{Select} -\begin{minted}{cpp} -template -struct Select : Operator -{ - using Predicate = function; - - Select(unique_ptr> child, Predicate pred) - : _child(move(child)), _pred(pred) - { - assert(_child); - } - - void open() override { _child->open(); } - void close() override { _child->close(); } - - optional next() override - { - auto candidate = _child->next(); - // keep getting candidates until there are no more, or one is valid. - while (candidate.has_value() && !_pred(candidate.value())) - { - candidate = _child->next(); - } - return candidate; - } - -private: - unique_ptr> _child; - Predicate _pred; -}; -\end{minted} +\inputminted[firstline=72, lastline=90]{cpp}{processing_models/code/volcano/operators.h} \subsubsection{Union} -\begin{minted}{cpp} -template -struct Union : Operator -{ - Union(unique_ptr> leftChild, unique_ptr> rightChild) - : _leftChild(move(leftChild)), _rightChild(move(rightChild)) - { - assert(_leftChild && _rightChild); - } - - void open() override - { - _leftChild->open(); - _rightChild->open(); - } - void close() override - { - _leftChild->close(); - _rightChild->close(); - } - - optional next() override - { - auto candidate = _leftChild->next(); - if (candidate.has_value()) - { - return candidate; - } - else - { - return _rightChild->next(); - } - } - -private: - unique_ptr> _leftChild, _rightChild; -}; -\end{minted} +\inputminted[firstline=97, lastline=120]{cpp}{processing_models/code/volcano/operators.h} \subsubsection{Difference} @@ -226,193 +85,19 @@ \subsubsection{Difference} Difference breaks the pipeline as we need to know all tuples from one side (the subtracting set) before we can start to produce rows. 
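To make the buffering concrete before the operator listing, here is a minimal sketch (not part of the accompanying code, and assuming the element type works with \mintinline{cpp}{std::hash}) of the same idea with the operator plumbing stripped away: materialise the subtracting side into a hash set up front, then stream the other side through it.
\begin{minted}{cpp}
#include <unordered_set>
#include <vector>

// Relational difference as "buffer one side, stream the other": building the
// set plays the role of open() on the subtracting child, and the loop plays
// the role of repeated next() calls on the other child.
template <typename T>
std::vector<T> difference(const std::vector<T> &from, const std::vector<T> &sub) {
  std::unordered_set<T> subBuffer(sub.begin(), sub.end()); // pipeline broken here
  std::vector<T> out;
  for (const T &candidate : from) {
    if (!subBuffer.contains(candidate)) { // contains() is C++20
      out.push_back(candidate);
    }
  }
  return out;
}
\end{minted}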
-\begin{minted}{cpp} -/* The definition of difference forces the pipeline to be broken (buffering) */ -template -struct Difference : Operator -{ - Difference(unique_ptr> fromChild, - unique_ptr> subChild) - : _fromChild(fromChild), _subChild(subChild), _subBuffer() - { - assert(_fromChild && _subChild); - } - - void open() override - { - _fromChild->open(); - _subChild->open(); - - // buffer all to subtract - for (auto candidate = _subChild->next(); candidate.has_value(); - candidate = _subChild->next()) - { - _subBuffer.push_back(candidate); - } - } - void close() override - { - _fromChild->close(); - _subChild->close(); - } - - optional next() override - { - auto candidate = _fromChild->next(); - // keep getting next until there is no next candidate, or the candidate is - // not being subtracted - while (candidate.has_value() && _subBuffer.contains(candidate.value())) - { - candidate = _fromChild->next(); - } - return candidate; - } - -private: - unique_ptr> _fromChild, _subChild; - unordered_set _subBuffer; -}; -\end{minted} +\inputminted[firstline=127, lastline=162]{cpp}{processing_models/code/volcano/operators.h} \subsubsection{Cartesian/Cross Product} This can be optionally implemented as a \textit{pipeline breaker}. -\begin{minted}{cpp} - -template -struct BreakingCrossProduct : Operator> -{ - BreakingCrossProduct(unique_ptr> leftChild, - unique_ptr> rightChild) - : _leftChild(move(leftChild)), _rightChild(move(rightChild)), - _leftCurrent(), _rightIndex(0), _rightBuffer() - { - assert(_leftChild && _rightChild); - } - - void open() override - { - _leftChild->open(); - _rightChild->open(); - - // set first left (can be none -> in which case next will never return - // anything) - _leftCurrent = _leftChild->next(); - - // buffer in the entirety of the right - for (auto candidate = _rightChild->next(); candidate.has_value(); - candidate = _rightChild->next()) - { - _rightBuffer.push_back(candidate.value()); - } - } - - void close() override - { - _leftChild->close(); - _rightChild->close(); - } - - optional> next() override - { - // invariant: _rightBuffer.size() > _rightIndex >= 0 - if (_leftCurrent.has_value() && !_rightBuffer.empty()) - { - auto next_val = - make_tuple(_leftCurrent.value(), _rightBuffer[_rightIndex]); - - _rightIndex++; - if (_rightIndex == _rightBuffer.size()) - { - _rightIndex = 0; - _leftCurrent = _leftChild->next(); - } - - return next_val; - } - else - { - return {}; - } - } +\inputminted[firstline=164, lastline=217]{cpp}{processing_models/code/volcano/operators.h} -private: - unique_ptr> _leftChild; - unique_ptr> _rightChild; - optional _leftCurrent; - size_t _rightIndex; - vector _rightBuffer; -}; -\end{minted} A Non-pipeline breaking implementation has two phases: \begin{enumerate} \item Collecting rows from the right child operator, while using the same row from the left. \item The right child operator has been exhausted, slowly get tuples from the left while traversing tuples collected from the right. 
\end{enumerate} -\begin{minted}{cpp} -template -struct CrossProduct : Operator> -{ - CrossProduct(unique_ptr> leftChild, - unique_ptr> rightChild) - : _leftChild(move(leftChild)), _rightChild(move(rightChild)), - _leftCurrent(), _rightBuffered(), _rightOffset(0) - { - assert(_leftChild && _rightChild); - } - - void open() override - { - _leftChild->open(); - _rightChild->open(); - _leftCurrent = _leftChild->next(); - } - void close() override - { - _leftChild->close(); - _rightChild->close(); - } - - optional> next() override - { - /* invariants: - * - _leftCurrent is already set - * - if there are no more _rightChild to get, then we are iterating over the - * _leftChild - */ - auto rightCandidate = _rightChild->next(); - if (rightCandidate.has_value()) - { - // still getting content from the right had side - _rightBuffered.push_back(rightCandidate.value()); - } - else if (_rightOffset == _rightBuffered.size()) - { - // all tuples have been taken from right hand side, now using buffer - _leftCurrent = _leftChild->next(); - _rightOffset = 0; - } - - // only return if both sides have values - if (_leftCurrent.has_value() && !_rightBuffered.empty()) - { - // get tuple and increment _rightOffset - return make_tuple(_leftCurrent.value(), _rightBuffered[_rightOffset++]); - } - else - { - return {}; - } - } - -private: - unique_ptr> _leftChild; - unique_ptr> _rightChild; - optional _leftCurrent; - vector _rightBuffered; - size_t _rightOffset; -}; -\end{minted} +\inputminted[firstline=219, lastline=270]{cpp}{processing_models/code/volcano/operators.h} \subsubsection{Group Aggregation} This is fundamentally a \textit{pipeline breaker}, and must buffer rows prior to \mintinline{cpp}{next()}. @@ -422,87 +107,7 @@ \subsubsection{Group Aggregation} \item Get the key (column being grouped by e.g \mintinline{SQL}{GROUP BY column1}) and aggregation (e.g \mintinline{SQL}{SELECT MAX(column2)}) and place in a hashmap. \item Finally provide rows through \mintinline{cpp}{next()} \end{enumerate} -\begin{minted}{cpp} -/* We use the template to determine the hash and nextSlot implementations used - * T -> type of data provided by the child - * S -> data output by the groupBy & aggregation - * K -> the type grouped on, produced by a grouping function (K group(T)) - * hash -> a function to convert a key into a hash - * nextSlot -> to determine next slot in collisions - */ - template -struct GroupBy : Operator -{ - using Aggregation = function, T)>; - using Grouping = function; - - GroupBy(unique_ptr> child, - Grouping grouping, - Aggregation aggregation) : _child(move(child)), _grouping(grouping), - _aggregation(aggregation), _hashTable(), _hashTableCursor(0) - { - assert(_child); - } - - void open() override - { - _child->open(); - - vector childValues; - for (auto currentVal = _child->next(); - currentVal.has_value(); - currentVal = _child->next()) - { - childValues.push_back(currentVal.value()); - } - - _hashTable = vector>>(childValues.size(), optional>()); - for (T val : childValues) - { - K key = _grouping(val); - size_t slot = hashFun(key) % _hashTable.size(); - while (_hashTable[slot].has_value() && _hashTable[slot].value().first != key) - { - slot = nextSlot(slot) % _hashTable.size(); - } - - // slot is now correct, either a value present with the same key, or none. - auto prev_val = _hashTable[slot].has_value() ? 
_hashTable[slot].value().second : optional(); - _hashTable[slot] = optional>(make_pair(move(key), _aggregation(prev_val, val))); - } - - // all values moved into the hashtable, so vector deallocated - } - - void close() override - { - _child->close(); - } - - optional next() override - { - while (_hashTableCursor < _hashTable.size()) - { - auto slot = _hashTable[_hashTableCursor]; - _hashTableCursor++; - - if (slot.has_value()) - { - return slot.value().second; - } - } - return {}; - } - -private: - Aggregation _aggregation; - Grouping _grouping; - unique_ptr> _child; - vector>> _hashTable; - size_t _hashTableCursor; -}; -\end{minted} +\inputminted[firstline=272, lastline=342]{cpp}{processing_models/code/volcano/operators.h} \subsubsection{Operators Composed} We can finally define types to use with our operators. @@ -516,27 +121,30 @@ \subsubsection{Operators Composed} SELECT table.1, MAX(table.0) FROM table GROUP BY table.1; \end{minted} \begin{minted}{cpp} -shared_ptr
<Table> data = make_shared<Table>
(Table{ - {1, 'c', true}, - {1, 'c', false}, - {2, 'c', false}, - {1, 'd', true}, - {3, 'e', false}}); - -auto scan1 = make_unique>(data); -auto scan2 = make_unique>(data); -auto cross = make_unique>(move(scan1), move(scan2)); - -Project> proj(move(cross), [](tuple t) - { - auto vec2 = get<1>(t); - auto vec1 = get<0>(t); - vec1.insert(vec1.end(), vec2.begin(), vec2.end()); - return vec1; -}); - -GroupBy - groupby(move(scan), groupBySecondCol, aggregateSecondCol); +std::shared_ptr
<Table> data = std::make_shared<Table>
(Table{ + {1, 'c', true}, + {1, 'd', true}, + {1, 'c', false}, + {2, 'c', false}, + {5, 'c', false}, + {3, 'e', false}} +); + +auto scan = std::make_unique>(data); + +// Group by for single column +auto groupBySecondCol = [](Row r) { return r[1]; }; +auto aggregateSecondCol = [](std::optional r1, Row r2) { + if (r1.has_value()) { + return Row{std::max(std::get(r1.value()[0]), std::get(r2[0])), + r2[1]}; + } else { + return Row{r2[0], r2[1]}; + } +}; + +GroupBy groupby( + std::move(scan), groupBySecondCol, aggregateSecondCol); groupby.open(); for (auto val = groupby.next(); val.has_value(); val = groupby.next()) @@ -619,7 +227,7 @@ \subsubsection{IO Operations} \end{examplebox} \subsubsection{CPU Efficiency} -\begin{sidenotebox}{Slow Jumps} +\begin{sidenotebox}{Not all function calls are equal.} A jump to a function pointer (e.g a \mintinline{cpp}{std::function}, \mintinline{cpp}{virtual} method or \mintinline{cpp}{OUT (*function_ptr)(A, B, ...)}) is expensive. \begin{center} \includegraphics[width=.8\textwidth]{processing_models/images/jump_to_register.drawio.png} @@ -632,11 +240,11 @@ \subsubsection{CPU Efficiency} To avoid this cost: \begin{itemize} \item Jump to an immediate value (typically pc-relative immediate offset in the jump instruction), as the jump location is part of the instruction, there is no hazard. But the function to jump to must be known at compile time. Still affects returns (jump to link register/return address register) (though this should be very fast due to return-address stack branch predictors). - \item Inline a function (must be known at compile time) + \item Determine the function to call at compile time (jump to label in asm $\to$ jump to immediate pc-relative address). This is still costly (depending on calling convention), so we can go further an inline. \item Do fewer of these calls to function pointers/virtuals. \end{itemize} \end{sidenotebox} -For each operation we can count the function calls per tuple +For each operation we can count the function calls per tuple. \begin{center} \begin{tabular}{l l p{.6\textwidth}} Scan & 0 & Tuples read straight from buffer. \\ @@ -680,9 +288,9 @@ \section{Bulk Processing} \begin{definitionbox}{Bulk Processing} Queries are processed in batches. \begin{itemize} - \item Turn \textit{control dependencies} to \textit{data dependencies} \& buffer. - \item Pass references to buffers between operators. - \item Better locality for code (I-cache) \& data. + \item Turn \textit{control dependencies} to \textit{data dependencies}. + \item Apply operator to a buffer of tuples, copy or pass references to buffers between operators. + \item Reduces the number of function calls (e.g $1$ per tuple per operator $\to$ $1$ per operator). \end{itemize} \end{definitionbox} For example a basic select operator could be implemented on an Nary Table: diff --git a/60029 - Data Processing Systems/relational_algebra/relational_algebra.tex b/60029 - Data Processing Systems/relational_algebra/relational_algebra.tex index 5f759f8..61146f0 100644 --- a/60029 - Data Processing Systems/relational_algebra/relational_algebra.tex +++ b/60029 - Data Processing Systems/relational_algebra/relational_algebra.tex @@ -163,14 +163,40 @@ \subsection{Nomenclatures} \end{tabular} \end{center} +\subsection{Schemas} +\begin{definitionbox}{Database Schema} + The logical structure of the database that is exposed to users (e.g through SQL). + \begin{itemize} + \item defines the tables, their columns, relations, indexes and constraints in a database. 
+ \item In some systems (e.g postgres), can include permissions/access control, functions, views and more. + \item Analogous to the \textit{type} of the database. + \item Does not describe the physical layout, + \end{itemize} + \begin{minted}{sql} +-- one postgres database can have many different schemas +CREATE SCHEMA my_schema; + +CREATE TABLE foo( + id SERIAL PRIMARY KEY, + name VARCHAR(20) NOT NULL, + bestie SERIAL REFERENCES foo(id), + added_date DATE NOT NULL CHECK (added_date > '2000-01-01') +); + +CREATE INDEX idx_added_date_desc ON people (added_date DESC); + \end{minted} +\end{definitionbox} + + + \section{Implementing Relational Algebra in C++} \begin{sidenotebox}{A note on types\dots} - Here we will express operators \& relations in the C++ types system. + Here we will express operators \& relations in the C++ type system. \\ - \\ In real data processing systems (and in particular databases), the schema \& types are not know at compile time (i.e do not know the types of columns, tables until they are created, amended, and operated on at runtime). + \\ In real databases the schema \& types are not know when the database itself is compiled, but rather later at runtime (i.e do not know the types of columns, tables until they are created, amended, and operated on at runtime). \end{sidenotebox} \noindent -In order to implement relations we will make use of several containers from the +In order to implement a model of relational algebra we will make use of several containers from the \href{https://en.wikipedia.org/wiki/Standard_Template_Library}{STL (standard template library)}. \begin{minted}{cpp} #include diff --git a/60029 - Data Processing Systems/storage/diagrams/delta_main.drawio b/60029 - Data Processing Systems/storage/diagrams/delta_main.drawio index 971e097..42fce26 100644 --- a/60029 - Data Processing Systems/storage/diagrams/delta_main.drawio +++ b/60029 - Data Processing Systems/storage/diagrams/delta_main.drawio @@ -1 +1 @@ -5VnbctowEP0aHpPxBQw8JpBLZ5KWCZlp8qhiYasRXiqLAP36rmwZWxYhNCGBUj94vEdaWdqzu1rZDb83WVwJMo1vIaS84TnhouH3G57X9ly8K2CZA0GzkwORYGEOuSUwZL+pBh2NzlhIU6OjBOCSTU1wBElCR9LAiBAwN7uNgZtvnZKIWsBwRLiNfmehjHO047VL/JqyKC7e7AbdvGVCis56JWlMQphXIP+i4fcEgMyfJose5cp2hV1yvcsXWlcTEzSR2yicOL/ofd/pTOXVj8GT27sZPPCTwsypXBYrpiEaQIsgZAwRJIRflOi5gFkSUjWsg1LZ5wZgiqCL4E8q5VKzSWYSEIrlhOtWumDyofL8qIY6bWmpv9AjZ8KyEBIplg9VoaKlxFItkwq9VBIhz5QnIJBAohaQr1kt9EVbaiiFmRjRTQZsaqckIqJyQ8fminIMFQoTipNEPUE5kezZnAjRThut+pW84oOm9m9o9vZJc0ntY6XlFZpdk2bvNZ53yemWlLruXjnNx30mfKbf9PWEZO8fShAqrdUpNwmdx0zS4ZRkpphj8jbJGzPOe8BBZLo+8Rwn42AMiazg4+zK4kzAE620BJcOXitmnqmQdLGZG9uUhUKRRPUu0tLivEzJqy5xJR0HzkcZ3//XA6r9iQG1bY58d0Bp1QEwnOLKe2rOg1v0abd6mePls9RD1NxkNae3e07TCts+HcFkCil6xjHGrh8cWOy21jDAJbFMjkuWpl1NO+liompsDRHOogTFEZqIIn6uDMiwqDzTDRMWhllKWEekSbXiTCcEt72jbNoyGfFsRoI1hPgfRUhgEXJLWPL/8OF3D4uP7j73ts84E+xwa+tsu7Xts1TsWAH2JUnRYxFL6BzvfXKw+W8H8VXb/4PWfsPLtSv3OxyYUXSDY6fCrZVia4qBdVw0P4wLuxi4o8pWXE2aZVGS1WXhUdNS24DcNTtQ91NDxC4JLkmq8tW3+8Hx0uB2Do2HtsXDNUvxiKJsdLw8+P6h8WDv4EU83JwdcUA060f3vRNhH98HVDAI2QjRWxYJLOfgUM8uu6il2gdWTHUtQizjH9nphSbhB/7PaGx5nnnv/4wXPtU1Tffq1Pwmn7/1cc4axw+cU6dyeRtHffMnPxTLH3h59/IvqH/xBw== \ No newline at end of file 
+5VnbctowEP0aHpPxBQw8JpBLZ5KWCZkpeVSxsNXIXlcWAfr1lWwZWxYhNEDjUD94vEeri/fsrlZ2yx1EyxuGkvAefExbjuUvW+6w5TjdjiXuEljlgNdWQMCIn0N2CYzJb6zAQm1OfJxqihyAcpLo4BTiGE+5hiHGYKGrzYDqsyYowAYwniJqot+Jz8Mc7TndEr/FJAiLmW2vn7dEqFBWb5KGyIdFBXKvWu6AAfD8KVoOMJW2K+yS97t+pXW9MIZjvkuHM+sXfhxavYTf/Bg924O70YSeFWZO+ap4Y+wLAygRGA8hgBjRqxK9ZDCPfSyHtYRU6twBJAK0BfgTc75SbKI5BwGFPKKqFS8Jn1Sen+RQ5x0lDZdq5ExYFULM2WpSFSq9pFh2y6SiX8oR4xfSEwQQQyxfwDSdsmYKczbF2+zVVj6IWID5FkWlJ41ZmUExc4MhwmKRQoFhijh50b0NKacN1nolr+JBUfs3NDt70WzvR3NJ7VOl5Q2abZ1m5y2e9+B0R0ptu1mc5uO+IDpXM309Q9n8Yw5MprU65XrcLkLC8ThBmW0WInnr5M0IpQOgwLK+LnIsK+NgBjGv4LPsyuKMwTOutFjZtY2ZF8w4Xm41ZdFaJFG1ixS7yqJMyWuVsJKOPetYxnc/e0B1jxdQu+bIwweU6joCIta89p6a84gt+rxfvfTx8mWrIWpusl7T+z2nbYTtEE8hSiAVnnGKset6DYvdzgYGKEeGycX7cd2uup1UMVE1toIQJUEsxKmwGhb4pbQWEUXlhWqIiO9nKWETkTrVkjOVEOzugbJpR2fEMRnxNhDiHosQzyDkHpH4/+HD7TeLj/5ee9snOBO8f2vr7bq1NapU7BkB9iVOhXsKLMYLcR+ixua/A8RXbf/3Oh8bXrZZuT+IgQkWbnDqVNi1UmxDMbCJi/bRuDCLgQcsbUXlokkWJVld5p80LbUNyN6wA/X/aYiYJcE1SmW++vY4Ol0a7F7TeOgaPNySVBxRpI1OlwfXbRoP5g5exMPdxQkHRLt+dP9wIszj+wgzAj6ZCvSeBEyUc9DUs8shaqluw4qpvkGIYfwTO73g2D/c/4zWjueZg//PeOVTXVt3r17Nb/IXMj7OGeO4nnVuVS5n66jv/uQnxPIHXq5e/gV1r/4A \ No newline at end of file diff --git a/60029 - Data Processing Systems/storage/diagrams/kernel_architecture.drawio b/60029 - Data Processing Systems/storage/diagrams/kernel_architecture.drawio index 9c66bb0..f779ce0 100644 --- a/60029 - Data Processing Systems/storage/diagrams/kernel_architecture.drawio +++ b/60029 - Data Processing Systems/storage/diagrams/kernel_architecture.drawio @@ -1 +1 @@ -7Vxbd6o4FP41PrYrJFz0sWo7PTPtms7pWWvOeZqVQkSmSFwxVp1fPwkGuYS22IqA1YeWbAIk35e9dzbZoYdGs/VvDM+n99QjYQ8Cb91D4x6EhmMNxD8p2WwlfaQEPgs8VSkVPAb/ESUESroMPLLIVeSUhjyY54UujSLi8pwMM0ZX+WoTGuafOsc+0QSPLg516d+Bx6eqF9BJ5bck8KfJkw1b9W+Gk8qqJ4sp9ugqI0LXPTRilPLt0Ww9IqEEL8Fle93NK2d3DWMk4lUuiG5vNt+Hz+GPn/bvj2y6/GcwfriA9vY2Lzhcqh6r1vJNAgGjy8gj8i6gh4aracDJ4xy78uxKkC5kUz4LRckQhx5eTOO6sjAJwnBEQ8riGyEMAYCWlNOIZ+ST+CfkC87oM8mcAfFPnFGtJIyT9av9N3aoiuFI6IxwthFVkgtMRYQaiYatyquUV2Qp2TTDKUyEWI0lf3fvFG5xoBDfB31wYPQLgNsYgBjY/QHP8XgA9C8MkIcfGTr8xqAEfgPWBb+lgU08ofyqSBmfUp9GOLxOpcM8HWmdO0rnCq1/CecbZcnwktM8RWQd8J/y8ktLlX5lzozX6s5xYZMUItHdzEWy+Ct7Lr0sLiXXbfsnO/U2awIDumQueQOrxPZi5hP+1pAuHwWMhJgHL/l2HF6fmmU0w2fK7juMGjk+U3rbw6jdJKOGZiAfOWXSZ0M7FK0ePjFx5MujexwJOTuoAXX6ABjDxjyWVfBYTonJhCUm067NYZlnBausYLCqyQTlo+BINlPTsOFyMpF69PUUDJlNK9hZv6rrF6qoX6+MgeOoF9LU6885YVg4MV3Bvs3mIZkJHES7aLQ4IUVDdts82aHj3g5FXkaZlTtu5OXUi75pAnBlNzXYL/oFvAc63mYJ3GZdaPdPGm2ngHa/YbQHJ412BUN+VLSTt0inCbfRMkti6PH4CaENW2ZJDD02OyG0UdssiT5VH2GOQ+qLYhF20W2exzaPVUQjUgBciXAY+JEougImEUyjoQQxcHF4pU7MAs+L47oyMvN0F3i7iX/1TGJ2dGXosUvoQbXRY2r0jAU9X48Zo23E1BxKtWt6iUpWDZ2jmqnTjp0Ki7S7BdnG4D7t4KlgTMpegB4Xbj16esA+0d+FnbyZv0BWjhoTNGznoT73/351/xWJef912nGJ0cOEcbB4PjNTlmJyXGZ0Xy3nrE94ISIK8AdhkcyWayVN+TwuwZlaPTJKfNUhOYSF+Ra0LY1Dwygh0bJqIhE1uybXrTQhqPz3u4tyqNFFb9NollP4IVLhXqxmlRZ+TZb1qfpRWXY+xLJzZnk/J6uHDNdr4f8EZ0L6TbrCiXR0h4zaAOhjUDKbqRC1ZQkzD+MzreKiDLQvYYnbLFtgry2SQ+fs2j1SWUBVVWs0mcVEZ4P6BVhueMLb8clRR1hOmplxm3fUl9GiED6EODqww7QA+Vg+1DEc5qANDtM+O8zqSlZ19wKCjZrShidBXTelHWG5Yc3t+rSoGywnzcwuTU03i7o8ptcHwEFt9ZjQboPHdM4es7qWwapahhq1pQ1z2nWP2RGW+2ePefosJ83ca/nzMz7zEx+YOILPRKgNPlPPcr3FzFuJXh2Uik9sgDoCFSZoAxV6atnjX3c92QYbzyS8279C4hEWvMQau+jB0U6B6XaNA4c+eWJ1pMKGZCLvUEceATwMs3ZxsxVE0k5rzJZ92MWsK2XA1N+t3YrHyr3TICQv8SeS5nHAIJhduCx4CiJfFOiSz5f8zKMMMIBRqqFlPNaWv2OWLS0Sdyk1L6XQFWjgINpSGNJVjmaqdvsuZPsiTz4xt8tXMj0nbjAJ3DqSHDtIvNMC4i1dgccBIy4PZctn8YcQJKNevAMBBJLGGZlR+Whpnr04/04eEe6eaZVUmW2gVc/He3Xy04E0vAPQVNzPCS3nsumEO6tCyj2JvCv5db8U9RxSGdb07NOPBGU6gBl4ysZwIqscaaknPNBAPDiduBb
diff --git a/60029 - Data Processing Systems/storage/images/delta_main.drawio.png b/60029 - Data Processing Systems/storage/images/delta_main.drawio.png index 293978d..891bb41 100644 Binary files a/60029 - Data Processing Systems/storage/images/delta_main.drawio.png and b/60029 - Data Processing Systems/storage/images/delta_main.drawio.png differ diff --git a/60029 - Data Processing Systems/storage/images/kernel_architecture.drawio.png b/60029 - Data Processing Systems/storage/images/kernel_architecture.drawio.png index 0d6d9d8..d187a4b 100644 Binary files a/60029 - Data Processing Systems/storage/images/kernel_architecture.drawio.png and b/60029 - Data Processing Systems/storage/images/kernel_architecture.drawio.png differ diff --git a/60029 - Data Processing Systems/storage/storage.tex b/60029 - Data Processing Systems/storage/storage.tex index 2ac5e7b..9ee8786 100644 --- a/60029 - Data Processing Systems/storage/storage.tex +++ b/60029 - Data Processing Systems/storage/storage.tex @@ -19,6 +19,16 @@ \section{Database Management System Kernel} \end{definitionbox} \section{Storage} +\subsection{Buffer Manager} +\begin{definitionbox}{Buffer Manager} + Part of the database kernel that manages disk-resident data, and moves disk resident data required by the storage manager into pages in memory (the buffer pool).
+ \begin{itemize} + \item + \end{itemize} + +\end{definitionbox} + + \subsection{Storage Manager} Multi-dimensional data must be stored in a 1-dimensional memory. \begin{itemize} diff --git a/60029 - Data Processing Systems/streams/code/push_operator.cpp b/60029 - Data Processing Systems/streams/code/push_operator.cpp deleted file mode 100644 index c3f9f73..0000000 --- a/60029 - Data Processing Systems/streams/code/push_operator.cpp +++ /dev/null @@ -1,290 +0,0 @@ -#include -#include -#include -#include -#include -#include - -template -class PushOperator -{ -public: - virtual void process(Event data) = 0; -}; - -template -class Output : public PushOperator -{ - std::ostream &output_; - -public: - Output(std::ostream &output) : output_(output) {} - - void process(Event data) override - { - output_ << "->" << data << std::endl; - } -}; - -template -class Select : public PushOperator -{ - PushOperator *plan_; - std::function predicate_; - -public: - Select(PushOperator *plan, std::function predicate) : plan_(plan), predicate_(predicate) {} - - void process(Event data) override - { - if (predicate_(data)) - plan_->process(std::move(data)); - } -}; - -template -class Project : public PushOperator -{ - PushOperator *plan_; - std::function function_; - -public: - Project(PushOperator *plan, std::function function) : plan_(plan), function_(function) {} - - void process(InputEvent data) override - { - plan_->process(function_(std::move(data))); - } -}; - -template -class Source -{ -public: - virtual void run() = 0; -}; - -template -class UserInput : public Source -{ - PushOperator *plan_; - std::istream &src_; - -public: - UserInput(PushOperator *plan, std::istream &src) : plan_(plan), src_{src} {} - - void run() override - { - for (Event r;; src_ >> r) - plan_->process(std::move(r)); - } -}; - -class WindowSumAggregator : public PushOperator -{ - PushOperator *plan_; - - // a circular buffer window - // the next index after buffer_i_ is the start of the window - std::vector window_buffer_; - - size_t buffer_i_ = 0; - float aggregate = 0; - - // for checking the window is filled - size_t count_ = 0; - -public: - WindowSumAggregator(PushOperator *plan, size_t windowsize) : plan_(plan), window_buffer_(windowsize) {} - - void process(float f) override - { - buffer_i_ = (buffer_i_ + 1) % window_buffer_.size(); - aggregate += f; - count_++; - if (count_ > window_buffer_.size()) - { - aggregate -= window_buffer_[buffer_i_]; - window_buffer_[buffer_i_] = f; - plan_->process(aggregate); - } - else - { - window_buffer_[buffer_i_] = f; - } - } -}; - -class WindowMedianAggregator : public PushOperator -{ - PushOperator *plan_; - std::vector window_buffer_; - size_t buffer_i_ = 0; - - // for checking the window is filled - size_t count_ = 0; - -public: - WindowMedianAggregator(PushOperator *plan, size_t window_size) : plan_(plan), window_buffer_(window_size) {} - - void process(float f) override - { - const size_t size = window_buffer_.size(); - buffer_i_ = (buffer_i_ + 1) % size; - window_buffer_[buffer_i_] = f; - count_++; - if (count_ > size) - { - - // copy and sort, this can be made much more efficient using a multiset and vector - // see multiset median trick: https://codeforces.com/blog/entry/68300 - std::vector sorted = window_buffer_; - std::sort(sorted.begin(), sorted.end()); - - // if even size get average of two middle, else middle element - if (size % 2 == 0) - { - plan_->process((sorted[size / 2] + sorted[(size / 2) - 1]) / 2); - } - else - { - plan_->process(sorted[size / 2]); - } - } - } -}; - 
-template -class WindowTwoStackAggregator : public PushOperator -{ - PushOperator *plan_; - - // front stack - std::vector front_values_; - std::vector front_agg_; - - // back stack - std::vector back_values_; - std::vector back_agg_; - - // track the top of front and back stacks - size_t window_pos = 0; - - // to determine when to start outputting aggregates - size_t count_ = 0; - - // flip front stack to back stack, sets window_pos = 0 - // invariants: - // - Must have window_size items present - void flip() - { - size_t size = front_values_.size(); - assert(window_pos == size); - - for (size_t i = 0; i < size; i++) - { - back_values_[size - 1 - i] = front_values_[i]; - } - - back_agg_[0] = back_values_[0]; - for (size_t i = 1; i < size; i++) - { - back_agg_[i] = agg(back_agg_[i - 1], back_values_[i]); - } - - window_pos = 0; - } - - // Push an item to the front_stack - // leaves the window_pos untouched - void push_front(Event r) - { - if (window_pos == 0) - { - front_values_[0] = r; - front_agg_[0] = r; - } - else - { - front_values_[window_pos] = r; - front_agg_[window_pos] = agg(r, front_agg_[window_pos - 1]); - } - } - -public: - WindowTwoStackAggregator(PushOperator *plan, size_t window_size) : plan_(plan), - front_values_(window_size), - front_agg_(window_size), - back_values_(window_size), - back_agg_(window_size) {} - - void process(Event r) override - { - size_t max_size = front_values_.size(); - - if (count_ < max_size) - { - push_front(r); - window_pos++; - } - else - { - if (window_pos == max_size) - { - flip(); - } - - push_front(r); - plan_->process(agg(front_agg_[window_pos], back_agg_[max_size - 1 - window_pos])); - window_pos++; - } - - count_++; - } -}; - -void example_1() -{ - Output console(std::cout); - Project mult(&console, [](auto i) - { return i * 3; }); - Select even(&mult, [](auto &i) - { return i % 2 == 0; }); - UserInput user(&even, std::cin); - - user.run(); -} - -void example_2() -{ - Output console(std::cout); - WindowSumAggregator sum(&console, 4); - Project mult(&sum, [](auto i) - { return static_cast(i); }); - Select even(&mult, [](auto &i) - { return i % 2 == 0; }); - UserInput user(&even, std::cin); - - user.run(); -} - -int intmax(int &a, int &b) -{ - return std::max(a, b); -} - -void example_3() -{ - Output console(std::cout); - WindowTwoStackAggregator maxints(&console, 3); - UserInput user(&maxints, std::cin); - - user.run(); -} - -int main() -{ - // example_1(); - // example_2(); - example_3(); -} \ No newline at end of file diff --git a/60029 - Data Processing Systems/streams/code/push_operator_backpressure.cpp b/60029 - Data Processing Systems/streams/code/push_operator_backpressure.cpp deleted file mode 100644 index e4f67d7..0000000 --- a/60029 - Data Processing Systems/streams/code/push_operator_backpressure.cpp +++ /dev/null @@ -1,6 +0,0 @@ -template -class PushOperator { -public: - // return pressure on operator - virtual float process(Event data) = 0; -}; \ No newline at end of file diff --git a/60029 - Data Processing Systems/streams/code/streams/.gitignore b/60029 - Data Processing Systems/streams/code/streams/.gitignore new file mode 100644 index 0000000..c795b05 --- /dev/null +++ b/60029 - Data Processing Systems/streams/code/streams/.gitignore @@ -0,0 +1 @@ +build \ No newline at end of file diff --git a/60029 - Data Processing Systems/streams/code/streams/CMakeLists.txt b/60029 - Data Processing Systems/streams/code/streams/CMakeLists.txt new file mode 100644 index 0000000..5f172d8 --- /dev/null +++ b/60029 - Data Processing 
Systems/streams/code/streams/CMakeLists.txt @@ -0,0 +1,35 @@ +cmake_minimum_required(VERSION 3.22) +project(Functions) + +include(FetchContent) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED YES) +set(CMAKE_CXX_EXTENSIONS NO) + +FetchContent_Declare( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG main +) + +FetchContent_Declare( + googlebenchmark + GIT_REPOSITORY https://github.com/google/benchmark.git + GIT_TAG main +) + +FetchContent_MakeAvailable( + googletest + googlebenchmark +) + +add_executable(Benchmark benchmarks.cpp) +target_link_libraries(Benchmark benchmark::benchmark) + +add_executable(Test tests.cpp) +add_library(GTest::GTest INTERFACE IMPORTED) +target_link_libraries(GTest::GTest INTERFACE gtest_main) +target_link_libraries(Test GTest::GTest) + +add_executable(Examples examples.cpp) diff --git a/60029 - Data Processing Systems/streams/code/streams/README.md b/60029 - Data Processing Systems/streams/code/streams/README.md new file mode 100644 index 0000000..65115f7 --- /dev/null +++ b/60029 - Data Processing Systems/streams/code/streams/README.md @@ -0,0 +1,18 @@ +## What is this? +[Streams](streams.h) for a basic push operator playground. +- Pushes events from operator to operator +- Operators take successors references and push + +## To build & Run +```bash +cmake -S . -B build -DCMAKE_BUILD_TYPE=Release +make -j -C build/ +./build/Test +./build/Benchmark +./build/Examples +``` + +## Contribute! +- Adding new operators. +- Pretty printing the volcano operator structure / algebra from the operators themselves. +- Tests and benchmark window aggregation implementations diff --git a/60029 - Data Processing Systems/streams/code/streams/benchmarks.cpp b/60029 - Data Processing Systems/streams/code/streams/benchmarks.cpp new file mode 100644 index 0000000..8058a70 --- /dev/null +++ b/60029 - Data Processing Systems/streams/code/streams/benchmarks.cpp @@ -0,0 +1,5 @@ +#include "operators.h" + +#include + +BENCHMARK_MAIN(); \ No newline at end of file diff --git a/60029 - Data Processing Systems/streams/code/streams/examples.cpp b/60029 - Data Processing Systems/streams/code/streams/examples.cpp new file mode 100644 index 0000000..55e9e79 --- /dev/null +++ b/60029 - Data Processing Systems/streams/code/streams/examples.cpp @@ -0,0 +1,36 @@ +#include "streams.h" + +void example_1() { + Output console(std::cout); + Project mult(&console, [](auto i) { return i * 3; }); + Select even(&mult, [](auto &i) { return i % 2 == 0; }); + UserInput user(&even, std::cin); + + user.run(); +} + +void example_2() { + Output console(std::cout); + WindowSumAggregator sum(&console, 4); + Project mult(&sum, [](auto i) { return static_cast(i); }); + Select even(&mult, [](auto &i) { return i % 2 == 0; }); + UserInput user(&even, std::cin); + + user.run(); +} + +int intmax(int &a, int &b) { return std::max(a, b); } + +void example_3() { + Output console(std::cout); + WindowTwoStackAggregator maxints(&console, 3); + UserInput user(&maxints, std::cin); + + user.run(); +} + +int main() { + // example_1(); + // example_2(); + example_3(); +} \ No newline at end of file diff --git a/60029 - Data Processing Systems/streams/code/streams/operators/output.h b/60029 - Data Processing Systems/streams/code/streams/operators/output.h new file mode 100644 index 0000000..a8738ea --- /dev/null +++ b/60029 - Data Processing Systems/streams/code/streams/operators/output.h @@ -0,0 +1,8 @@ +template class Output : public PushOperator { + std::ostream &output_; + +public: + 
Output(std::ostream &output) : output_(output) {} + + void process(Event data) override { output_ << "->" << data << std::endl; } +}; \ No newline at end of file diff --git a/60029 - Data Processing Systems/streams/code/streams/operators/project.h b/60029 - Data Processing Systems/streams/code/streams/operators/project.h new file mode 100644 index 0000000..d776e89 --- /dev/null +++ b/60029 - Data Processing Systems/streams/code/streams/operators/project.h @@ -0,0 +1,14 @@ +template +class Project : public PushOperator { + PushOperator *plan_; + std::function function_; + +public: + Project(PushOperator *plan, + std::function function) + : plan_(plan), function_(function) {} + + void process(InputEvent data) override { + plan_->process(function_(std::move(data))); + } +}; \ No newline at end of file diff --git a/60029 - Data Processing Systems/streams/code/streams/operators/push_operator.h b/60029 - Data Processing Systems/streams/code/streams/operators/push_operator.h new file mode 100644 index 0000000..d5a88b6 --- /dev/null +++ b/60029 - Data Processing Systems/streams/code/streams/operators/push_operator.h @@ -0,0 +1,6 @@ +// templated by the Event data type (for easy testing), would be some +// vector> +template class PushOperator { +public: + virtual void process(Event data) = 0; +}; \ No newline at end of file diff --git a/60029 - Data Processing Systems/streams/code/streams/operators/select.h b/60029 - Data Processing Systems/streams/code/streams/operators/select.h new file mode 100644 index 0000000..387e583 --- /dev/null +++ b/60029 - Data Processing Systems/streams/code/streams/operators/select.h @@ -0,0 +1,13 @@ +template class Select : public PushOperator { + PushOperator *plan_; + std::function predicate_; + +public: + Select(PushOperator *plan, std::function predicate) + : plan_(plan), predicate_(predicate) {} + + void process(Event data) override { + if (predicate_(data)) + plan_->process(std::move(data)); + } +}; \ No newline at end of file diff --git a/60029 - Data Processing Systems/streams/code/streams/operators/source.h b/60029 - Data Processing Systems/streams/code/streams/operators/source.h new file mode 100644 index 0000000..a2022bd --- /dev/null +++ b/60029 - Data Processing Systems/streams/code/streams/operators/source.h @@ -0,0 +1,18 @@ +template class Source { +public: + virtual void run() = 0; +}; + +template class UserInput : public Source { + PushOperator *plan_; + std::istream &src_; + +public: + UserInput(PushOperator *plan, std::istream &src) + : plan_(plan), src_{src} {} + + void run() override { + for (Event r;; src_ >> r) + plan_->process(std::move(r)); + } +}; diff --git a/60029 - Data Processing Systems/streams/code/streams/operators/window_median.h b/60029 - Data Processing Systems/streams/code/streams/operators/window_median.h new file mode 100644 index 0000000..4d0ce44 --- /dev/null +++ b/60029 - Data Processing Systems/streams/code/streams/operators/window_median.h @@ -0,0 +1,34 @@ +class WindowMedianAggregator : public PushOperator { + PushOperator *plan_; + std::vector window_buffer_; + size_t buffer_i_ = 0; + + // for checking the window is filled + size_t count_ = 0; + +public: + WindowMedianAggregator(PushOperator *plan, size_t window_size) + : plan_(plan), window_buffer_(window_size) {} + + void process(float f) override { + const size_t size = window_buffer_.size(); + buffer_i_ = (buffer_i_ + 1) % size; + window_buffer_[buffer_i_] = f; + count_++; + if (count_ > size) { + + // copy and sort, this can be made much more efficient using a 
multiset + // and vector see multiset median trick: + // https://codeforces.com/blog/entry/68300 + std::vector sorted = window_buffer_; + std::sort(sorted.begin(), sorted.end()); + + // if even size get average of two middle, else middle element + if (size % 2 == 0) { + plan_->process((sorted[size / 2] + sorted[(size / 2) - 1]) / 2); + } else { + plan_->process(sorted[size / 2]); + } + } + } +}; \ No newline at end of file diff --git a/60029 - Data Processing Systems/streams/code/streams/operators/window_sum.h b/60029 - Data Processing Systems/streams/code/streams/operators/window_sum.h new file mode 100644 index 0000000..ba62c0f --- /dev/null +++ b/60029 - Data Processing Systems/streams/code/streams/operators/window_sum.h @@ -0,0 +1,30 @@ +class WindowSumAggregator : public PushOperator { + PushOperator *plan_; + + // a circular buffer window + // the next index after buffer_i_ is the start of the window + std::vector window_buffer_; + + size_t buffer_i_ = 0; + float aggregate = 0; + + // for checking the window is filled + size_t count_ = 0; + +public: + WindowSumAggregator(PushOperator *plan, size_t windowsize) + : plan_(plan), window_buffer_(windowsize) {} + + void process(float f) override { + buffer_i_ = (buffer_i_ + 1) % window_buffer_.size(); + aggregate += f; + count_++; + if (count_ > window_buffer_.size()) { + aggregate -= window_buffer_[buffer_i_]; + window_buffer_[buffer_i_] = f; + plan_->process(aggregate); + } else { + window_buffer_[buffer_i_] = f; + } + } +}; \ No newline at end of file diff --git a/60029 - Data Processing Systems/streams/code/streams/operators/window_two_stack.h b/60029 - Data Processing Systems/streams/code/streams/operators/window_two_stack.h new file mode 100644 index 0000000..3e19c79 --- /dev/null +++ b/60029 - Data Processing Systems/streams/code/streams/operators/window_two_stack.h @@ -0,0 +1,76 @@ + +// To improve: we can use one vector instead of two separate +template +class WindowTwoStackAggregator : public PushOperator { + PushOperator *plan_; + + // front stack + std::vector front_values_; + std::vector front_agg_; + + // back stack + std::vector back_values_; + std::vector back_agg_; + + // track the top of front and back stacks + size_t window_pos = 0; + + // to determine when to start outputting aggregates + size_t count_ = 0; + + // flip front stack to back stack, sets window_pos = 0 + // invariants: + // - Must have window_size items present + void flip() { + size_t size = front_values_.size(); + assert(window_pos == size); + + for (size_t i = 0; i < size; i++) { + back_values_[size - 1 - i] = front_values_[i]; + } + + back_agg_[0] = back_values_[0]; + for (size_t i = 1; i < size; i++) { + back_agg_[i] = agg(back_agg_[i - 1], back_values_[i]); + } + + window_pos = 0; + } + + // Push an item to the front_stack + // leaves the window_pos untouched + void push_front(Event r) { + if (window_pos == 0) { + front_values_[0] = r; + front_agg_[0] = r; + } else { + front_values_[window_pos] = r; + front_agg_[window_pos] = agg(r, front_agg_[window_pos - 1]); + } + } + +public: + WindowTwoStackAggregator(PushOperator *plan, size_t window_size) + : plan_(plan), front_values_(window_size), front_agg_(window_size), + back_values_(window_size), back_agg_(window_size) {} + + void process(Event r) override { + size_t max_size = front_values_.size(); + + if (count_ < max_size) { + push_front(r); + window_pos++; + } else { + if (window_pos == max_size) { + flip(); + } + + push_front(r); + plan_->process( + agg(front_agg_[window_pos], back_agg_[max_size - 1 
- window_pos])); + window_pos++; + } + + count_++; + } +}; diff --git a/60029 - Data Processing Systems/streams/code/streams/streams.h b/60029 - Data Processing Systems/streams/code/streams/streams.h new file mode 100644 index 0000000..78e8eb5 --- /dev/null +++ b/60029 - Data Processing Systems/streams/code/streams/streams.h @@ -0,0 +1,19 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +// For easy include of files in the notes, each operator is in a different file +// more maintainable that using line numbers with \inputminted +#include "operators/output.h" +#include "operators/project.h" +#include "operators/push_operator.h" +#include "operators/select.h" +#include "operators/source.h" +#include "operators/window_median.h" +#include "operators/window_sum.h" +#include "operators/window_two_stack.h" diff --git a/60029 - Data Processing Systems/streams/code/streams/tests.cpp b/60029 - Data Processing Systems/streams/code/streams/tests.cpp new file mode 100644 index 0000000..9361596 --- /dev/null +++ b/60029 - Data Processing Systems/streams/code/streams/tests.cpp @@ -0,0 +1,5 @@ +#include "operators.h" + +#include "gtest/gtest.h" + +// Currently Untested - feel free to add your own! diff --git a/60029 - Data Processing Systems/streams/streams.tex b/60029 - Data Processing Systems/streams/streams.tex index 77009b7..3a8cff9 100644 --- a/60029 - Data Processing Systems/streams/streams.tex +++ b/60029 - Data Processing Systems/streams/streams.tex @@ -12,14 +12,7 @@ \section{Motivation} \section{Push Operators} Rather than operators pulling in tuples (as in \textit{volcano processing}), operators push tuples to the next stage. -\begin{minted}{cpp} -// templated by the Event data type (for easy testing), would be some vector> -template -class PushOperator { -public: - virtual void process(Event data) = 0; -}; -\end{minted} +\inputminted[firstline=10, lastline=14]{cpp}{streams/code/streams/streams.h} \begin{itemize} \item As with volcano and bulk processing we can also send references to data (e.g indexes into a larger backing table) to avoid copies. \item Virtual method used to allow operators to be combined into queries at runtime. @@ -31,52 +24,13 @@ \subsubsection{Output to Console} Some form of output operator is required to send data to the user (e.g player positions sent over the network to a live sports match website). \\ \\ Here a basic Output operator pushes to a stream (e.g a file with \mintinline{cpp}{std::ofstream}, or to the console with \mintinline{cpp}{std::cout}). -\begin{minted}{cpp} -template -class Output : public PushOperator { - std::ostream &output_; -public: - Output(std::ostream &output) : output_(output) {} - - void process(Event data) override { output_ << "->" << data << std::endl; } -}; -\end{minted} +\inputminted{cpp}{streams/code/streams/operators/output.h} \subsubsection{Selection} -\begin{minted}{cpp} -template -class Select : public PushOperator { - PushOperator *plan_; - std::function predicate_; - -public: - Select(PushOperator *plan, std::function predicate) - : plan_(plan), predicate_(predicate) {} - - void process(Event data) override { - if (predicate_(data)) plan_->process(std::move(data)); - } -}; -\end{minted} - +\inputminted{cpp}{streams/code/streams/operators/select.h} \subsubsection{Project} Generalised here to just map a function over the stream. 
-\begin{minted}{cpp} -// by default maps to same data type -template -class Project : public PushOperator { - PushOperator *plan_; - std::function function_; - -public: - Project(PushOperator *plan, std::function function) - : plan_(plan), function_(function) {} - - void process(InputEvent data) override { - plan_->process(function_(std::move(data))); - } -}; -\end{minted} +\inputminted{cpp}{streams/code/streams/operators/project.h} \subsubsection{Data Source} We also need to be able to pipe data directly into a chain of operators. @@ -84,26 +38,7 @@ \subsubsection{Data Source} \item Can implement a class to directly call \mintinline{cpp}{PushOperator::process}. \item Here a convenient interface is used to demonstrate terminal input. \end{itemize} -\begin{minted}{cpp} -template -class Source { -public: - virtual void run() = 0; -}; - -template -class UserInput : public Source { - PushOperator *plan_; - std::istream &src_; - -public: - UserInput(PushOperator *plan, std::istream &src) : plan_(plan), src_{src} {} - - void run() override { - for (Row r;; src_ >> r) plan_->process(std::move(r)); - } -}; -\end{minted} +\inputminted{cpp}{streams/code/streams/operators/source.h} \subsubsection{Combining Operators} We can then combine operators to form queries. @@ -195,7 +130,7 @@ \section{Time} \begin{definitionbox}{Event-Time} Timestamps externally provided by the source supplying events to the data processing system as part of data input. \begin{itemize} - \item System needs to ensure timestamps are ordered (external provider may not be correct). + \item System needs to ensure timestamps are ordered (external provider may not be correct). \end{itemize} \begin{minted}{cpp} class NetworkSource : public Source { @@ -222,7 +157,7 @@ \subsection{In-Order Processing} \begin{definitionbox}{In-Order Processing} Events are assumed to be entered in timestamp order (or by some other monotonically progressing attribute - e.g counter). \begin{itemize} - \item Greatly simplifies stream system implementation, a powerful guarantee. + \item Greatly simplifies stream system implementation, a powerful guarantee. \item Difficult to ensure order guarantee holds (on a distributed, asynchronous system there is not global clock) \end{itemize} \end{definitionbox} @@ -278,7 +213,7 @@ \subsection{Windows} There are also \textit{Session Windows} open and closed by an event (e.g user loggin in \& out). \\ \\ \textit{Lateness bounds} are an implementation detail for ordering streams -\\ +\\ \\ \textit{Windows} are SQL supported abstractions for viewing a slice of a stream, and are part of the language semantics. \begin{sidenotebox}{SQL Windows} Despite being originally designed only for persistent databases, SQL added window functions in SQL 2003 (\href{https://en.wikipedia.org/wiki/SQL:2003}{see changelog}). @@ -311,78 +246,13 @@ \subsection{Windows} \subsection{Aggregate Implementations} We can implement basic aggregate functions using the previous \mintinline{cpp}{PushOperator} abstraction. 
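For example, a \textit{tumbling} (non-overlapping) window sum, which emits one result per full window, could be sketched as follows. This operator is illustrative only, it is not one of the operators included in the repository's streams code, and it assumes the \mintinline{cpp}{PushOperator} interface above.
\begin{minted}{cpp}
// Illustrative sketch (not in the repository): emits the sum of each
// non-overlapping (tumbling) window of window_size events.
class TumblingSumAggregator : public PushOperator<float> {
  PushOperator<float> *plan_;
  size_t window_size_;
  size_t count_ = 0;
  float sum_ = 0;

public:
  TumblingSumAggregator(PushOperator<float> *plan, size_t window_size)
      : plan_(plan), window_size_(window_size) {}

  void process(float f) override {
    sum_ += f;
    count_++;
    if (count_ == window_size_) {
      plan_->process(sum_); // one output per full window
      sum_ = 0;             // then reset for the next window
      count_ = 0;
    }
  }
};
\end{minted}
The sliding-window aggregates below are more involved, as old events must leave the window as new ones arrive.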
\subsubsection{Window Sum} -\begin{minted}{cpp} -class WindowSumAggregator : public PushOperator { - PushOperator *plan_; - - // a circular buffer window - // the next index after buffer_i_ is the start of the window - std::vector window_buffer_; - - size_t buffer_i_ = 0; - float aggregate = 0; - - // for checking the window is filled - size_t count_ = 0; - -public: - WindowSumAggregator(PushOperator *plan, size_t windowsize) - : plan_(plan), window_buffer_(windowsize) {} - - void process(float f) override { - buffer_i_ = (buffer_i_ + 1) % window_buffer_.size(); - aggregate += f; - count_++; - if (count_ > window_buffer_.size()) { - aggregate -= window_buffer_[buffer_i_]; - window_buffer_[buffer_i_] = f; - plan_->process(aggregate); - } else { - window_buffer_[buffer_i_] = f; - } - } -}; -\end{minted} +\inputminted{cpp}{streams/code/streams/operators/window_sum.h} \subsubsection{Window Median} \begin{sidenotebox}{Improve Me!} The provided algorithm must copy the entire window for every \mintinline{cpp}{WindowMedianAggregator::process}. For large window sizes this is very slow, this can be made much more efficient! \end{sidenotebox} -\begin{minted}{cpp} -class WindowMedianAggregator : public PushOperator { - PushOperator *plan_; - std::vector window_buffer_; - size_t buffer_i_ = 0; - - // for checking the window is filled - size_t count_ = 0; - -public: - WindowMedianAggregator(PushOperator *plan, size_t window_size) - : plan_(plan), window_buffer_(window_size) {} - - void process(float f) override { - const size_t size = window_buffer_.size(); - buffer_i_ = (buffer_i_ + 1) % size; - window_buffer_[buffer_i_] = f; - count_++; - if (count_ > size) { - - // copy and sort, this can be made much more efficient using a multiset and vector - // see multiset median trick: https://codeforces.com/blog/entry/68300 - std::vector sorted = window_buffer_; - std::sort(sorted.begin(), sorted.end()); - - // if even size get average of two middle, else middle element - if (size % 2 == 0) { - plan_->process((sorted[size / 2] + sorted[(size / 2) - 1]) / 2); - } else { - plan_->process(sorted[size / 2]); - } - } - } -}; -\end{minted} +\inputminted{cpp}{streams/code/streams/operators/window_median.h} \subsection{Two Stacks Algorithm} @@ -395,72 +265,7 @@ \subsection{Two Stacks Algorithm} \item When the front stack is full, and back stack empty (occurs every $\cfrac{1}{window \ size}$) flip the front stack, recalculate aggregates and set to back stack. \end{itemize} We can implement this using the previous \mintinline{cpp}{PushOperator} abstraction. 
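Note that although a flip recomputes the back stack's aggregates in $O(n)$ time for a window of size $n$, each event participates in at most one flip and a flip only occurs once every $n$ events, so the amortised number of \mintinline{cpp}{agg} calls per event is constant.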
-\begin{minted}{cpp} -template -class WindowTwoStackAggregator : public PushOperator { - PushOperator *plan_; - - // front stack - std::vector front_values_; - std::vector front_agg_; - - // back stack - std::vector back_values_; - std::vector back_agg_; - - // track the top of front and back stacks - size_t window_pos = 0; - - // to determine when to start outputting aggregates - size_t count_ = 0; - - // flip front stack to back stack, sets window_pos = 0 - // invariant: Must have window_size items present - void flip() { - size_t size = front_values_.size(); - assert(window_pos == size); - - for (size_t i = 0; i < size; i++) { back_values_[size - 1 - i] = front_values_[i]; } - - back_agg_[0] = back_values_[0]; - - for (size_t i = 1; i < size; i++) { back_agg_[i] = agg(back_agg_[i - 1], back_values_[i]); } - - window_pos = 0; - } - - // Push an item to the front_stack, leaves the window_pos untouched - void push_front(Event r) { - if (window_pos == 0) { - front_values_[0] = r; - front_agg_[0] = r; - } else { - front_values_[window_pos] = r; - front_agg_[window_pos] = agg(r, front_agg_[window_pos - 1]); - } - } - -public: - WindowTwoStackAggregator(PushOperator *plan, size_t window_size) - : plan_(plan), front_values_(window_size), front_agg_(window_size), - back_values_(window_size), back_agg_(window_size) {} - - void process(Event r) override { - size_t max_size = front_values_.size(); - if (count_ < max_size) { - push_front(r); - window_pos++; - } else { - if (window_pos == max_size) { flip(); } - - push_front(r); - plan_->process(agg(front_agg_[window_pos], back_agg_[max_size - 1 - window_pos])); - window_pos++; - } - count_++; - } -}; -\end{minted} +\inputminted{cpp}{streams/code/streams/operators/window_two_stack.h} For example: \begin{minted}{cpp} Output console(std::cout); @@ -514,13 +319,13 @@ \subsection{Bloom Filters} \subsubsection{Tuning Bloom Filters} \noindent Bloom filters have several parameters that can be tuned. \\ \begin{tabular}{l p{.7\textwidth}} - $m$ & bits in filter \\ - $n$ & expected number of distinct elements \\ - $k$ & number of hash functions \\ - $\epsilon$ & False-positive rate \\ + $m$ & bits in filter \\ + $n$ & expected number of distinct elements \\ + $k$ & number of hash functions \\ + $\epsilon$ & False-positive rate \\ \end{tabular} \[\begin{matrix} - m \cong -1.44 \times n \times \log_2(\epsilon) & \ & - k \cong \cfrac{m}{n} \times \log_e(2) & \ & - \epsilon = \left( 1 - e^{-\cfrac{k \times n}{m}}\right)^k \\ -\end{matrix}\] + m \cong -1.44 \times n \times \log_2(\epsilon) & \ & + k \cong \cfrac{m}{n} \times \log_e(2) & \ & + \epsilon = \left( 1 - e^{-\cfrac{k \times n}{m}}\right)^k \\ + \end{matrix}\] diff --git a/60029 - Data Processing Systems/transactions/transactions.tex b/60029 - Data Processing Systems/transactions/transactions.tex index 576460e..62561ce 100644 --- a/60029 - Data Processing Systems/transactions/transactions.tex +++ b/60029 - Data Processing Systems/transactions/transactions.tex @@ -409,8 +409,7 @@ \subsubsection{Lock Manager} \item Checks for conflicts in overlapping ranges \item Ensures locks are released properly \end{itemize} - -\unfinished +To maximise concurrency, we want to lock as little as is required for the given isolation level. \begin{tabbox}{prosbox} \textbf{Serializable} & \textit{2PL} ensures realizability (and hence no anomalies). \\ @@ -420,6 +419,19 @@ \subsubsection{Lock Manager} \textbf{Mutual Exclusion} & Ranges are locked, so cannot read \& write, or write \& write in parallel. 
\\ \end{tabbox} +\begin{sidenotebox}{Predicate Locking} + Rather than locking objects (e.g rows, tables), lock predicates. + \begin{minted}{sql} +-- locks updates, deletions and inserts of rows potentially covered by this predicate +SELECT * FROM people WHERE name <> "bob" AND age > 18; +UPDATE people SET cool = true WHERE name = "bob"; -- disjoint predicate, so it can run concurrently + \end{minted} + \begin{itemize} + \item When a query is attempted, its access predicate is compared against the currently held predicate locks to determine whether it conflicts. + \item Locking rows can prevent non-repeatable reads; predicate locks also cover inserts into a table, so they can prevent phantom reads. + \end{itemize} +\end{sidenotebox} + \subsection{Timestamp Ordering} Each tuple is timestamped for \textit{last read} and \textit{last write}, and every transaction is timestamped at the start of execution. \begin{center}