Skip to content

Commit

Permalink
60029: updated code samples + misc
Browse files Browse the repository at this point in the history
  • Loading branch information
OliverKillane committed Oct 1, 2023
1 parent 603e1c5 commit aec2013
Show file tree
Hide file tree
Showing 63 changed files with 1,499 additions and 1,973 deletions.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
% !TeX root = 60029 - Data Processing Systems.tex


\documentclass{report}
\title{60029 - Data Processing Systems}
Expand Down
124 changes: 5 additions & 119 deletions 60029 - Data Processing Systems/advanced_topics/advanced_topics.tex
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ \section{CodeGen}

\begin{definitionbox}{voodoo}
A \textit{Vector-Dataflow Language} used as a unified algebra for code generating DBMS
\href{https://www.cs.albany.edu/~jhh/courses/readings/pirk.pvldb16.pdf}{original paper}
\href{https://www.cs.albany.edu/~jhh/courses/readings/pirk.pvldb16.pdf}{(original paper)}.
\end{definitionbox}

\subsection{Vector Operations}
Expand Down Expand Up @@ -223,43 +223,7 @@ \subsection{Hoare Partitioning}
\item $O(n)$ time complexity
\item Does not require extra memory / partitions in-place.
\end{itemize}

\begin{minted}{cpp}
// Partition sort_vec[start, end) in place (start inclusive, end exclusive)
// around the value found at sort_vec[start], and return the pivot's final index.
//
// After the call every element in [start, pivotIndex) is <= pivot and every
// element in (pivotIndex, end) is > pivot. Elements outside the range are not
// touched. An empty range (start >= end) is left unchanged and `start` is
// returned.
template <typename T>
size_t partition(std::vector<T> &sort_vec, size_t start, size_t end) {
  // nothing to partition: avoids reading sort_vec[start] out of range and
  // wrapping `end - 1` below
  if (start >= end) {
    return start;
  }

  // get pivot
  T pivot = sort_vec[start];
  size_t count = 0;

  // determine where to partition / where to place pivot value
  for (size_t i = start + 1; i < end; i++) {
    if (sort_vec[i] <= pivot)
      count++;
  }

  // swap pivot into place, will partition around pivot
  size_t pivotIndex = start + count;
  std::swap(sort_vec[pivotIndex], sort_vec[start]);

  // start pointers i & j at ends of range
  size_t i = start, j = end - 1;

  // advance pointers, swap and partition
  while (i < pivotIndex && j > pivotIndex) {
    // the pivot sitting at pivotIndex stops j; i stops at the first element
    // greater than the pivot
    while (sort_vec[i] <= pivot) i++;
    while (sort_vec[j] > pivot) j--;

    // both pointers rest on misplaced elements: exchange them
    if (i < pivotIndex && j >= pivotIndex) {
      std::swap(sort_vec[i], sort_vec[j]);
      i++;
      j--;
    }
  }

  return pivotIndex;
}
\end{minted}
\inputminted[firstline=5]{cpp}{advanced_topics/code/partition_comparison/partitions/in_place_conditional.h}

Consider the following section from the algorithm.
\begin{minted}{cpp}
Expand Down Expand Up @@ -298,95 +262,17 @@ \subsection{Predication}
\\ In the below examples we \textit{predicate} by removing a jump/branch, rather than using predicated instructions.
\end{sidenotebox}
We can start with a basic out-of-place partition.
\begin{minted}{cpp}
// Out-of-place partition of input_vec[start, end) into output_vec[start, end),
// taking one branch per element.
//
// The pivot is the element at index (start + end) / 2 of input_vec. Elements
// that compare less than the pivot are written forwards from `start`; every
// other element (the pivot included) is written backwards from `end - 1`.
// output_vec is indexed directly, so it must already hold at least `end`
// elements.
//
// Returns right_index as left by the loop: the slot just below the lowest one
// filled from the back.
// NOTE(review): when no element is less than the pivot and start == 0,
// right_index is decremented past zero and wraps (size_t) - confirm callers do
// not subscript with the result unchecked.
// NOTE(review): an empty range still evaluates input_vec[(start + end) / 2] and
// `end - 1`; presumably callers always pass a non-empty range - confirm.
template<std::copy_constructible T>
size_t partition(const std::vector<T>& input_vec, std::vector<T>& output_vec, size_t start, size_t end)
{
// reference into input_vec, which is never written to here
const T& pivot = input_vec[(start + end) / 2];
size_t left_index = start;
size_t right_index = end - 1;

for (auto i = start; i < end; i++) {
// data-dependent branch inside the hot loop
if (input_vec[i] < pivot) {
output_vec[left_index] = input_vec[i];
left_index++;
} else {
output_vec[right_index] = input_vec[i];
right_index--;
}
}

return right_index;
}
\end{minted}
\inputminted[firstline=7]{cpp}{advanced_topics/code/partition_comparison/partitions/out_of_place_conditional.h}
Here the \mintinline{cpp}{if (input_vec[i] < pivot)} condition has low selectivity, and is part of the hot loop.
\\
\\ We can predicate this by always writing the \mintinline{cpp}{input_vec[i]}, and incrementing the pivot indexes based on the condition.
\begin{minted}{cpp}
// Out-of-place partition of input_vec[start, end) into output_vec[start, end)
// with no branch on the comparison inside the loop.
//
// The pivot is the element at index (start + end) / 2 of input_vec. Each element
// is written to both the current left and right slots; only the cursor on the
// side the element belongs to is then advanced, so the copy on the other side is
// overwritten by a later iteration. Elements less than the pivot end up at the
// front, all others at the back. output_vec is indexed directly, so it must
// already hold at least `end` elements.
//
// Returns right_index as left by the loop: the slot just below the lowest one
// kept at the back.
// NOTE(review): when no element is less than the pivot and start == 0,
// right_index is decremented past zero and wraps (size_t) - confirm callers do
// not subscript with the result unchecked.
// NOTE(review): an empty range still evaluates input_vec[(start + end) / 2] and
// `end - 1`; presumably callers always pass a non-empty range - confirm.
template<std::copy_constructible T>
size_t partition(const std::vector<T>& input_vec, std::vector<T>& output_vec, size_t start, size_t end)
{
// reference into input_vec, which is never written to here
const T& pivot = input_vec[(start + end) / 2];
size_t left_index = start;
size_t right_index = end - 1;

for (auto i = start; i < end; i++) {
// unconditional writes to both candidate slots
output_vec[left_index] = input_vec[i];
output_vec[right_index] = input_vec[i];

// move the cursors by the boolean result of the comparison (0 or 1); a cursor
// that does not move has its slot overwritten on the next iteration of the loop
left_index += input_vec[i] < pivot;
right_index -= input_vec[i] >= pivot;
}

return right_index;
}
\end{minted}
\inputminted[firstline=6]{cpp}{advanced_topics/code/partition_comparison/partitions/out_of_place_predicated.h}
\subsection{Predicated Cracking}
\begin{sidenotebox}{Cracking stuff!}
The algorithm presented here is from the paper \href{https://core.ac.uk/download/pdf/301643658.pdf}{\textit{Database Cracking: Fancy Scan, not Poor Man's Sort!}}
\end{sidenotebox}

\begin{minted}{cpp}
// Predicated (branch-free inner loop) in-place partition of sort_vec[start, end).
//
// The pivot is a copy of sort_vec[(start + end) / 2], taken before any write.
// With p the returned index: every element in [start, p) is < pivot and every
// element in (p, end) is >= pivot. The element at p is the last value placed and
// is not compared against the pivot.
//
// T must support multiplication by a bool/int and addition, because the next
// value is selected arithmetically instead of with a branch.
// An empty range (start >= end) is left unchanged and `start` is returned.
template <typename T>
size_t partition(std::vector<T> &sort_vec, size_t start, size_t end) {
  // nothing to partition: there is no first/last element to seed active/backup
  // with, and `end - 1` would wrap
  if (start >= end) {
    return start;
  }

  bool cmp = false;
  size_t left_ptr = start;
  size_t right_ptr = end - 1;
  T active = sort_vec[left_ptr];
  T backup = sort_vec[right_ptr];
  T pivot = sort_vec[(start + end) / 2]; // somewhat arbitrary pivot selection

  while (left_ptr < right_ptr) {
    // compare and write: active goes to both ends, only one copy is kept
    cmp = pivot > active;
    sort_vec[left_ptr] = active;
    sort_vec[right_ptr] = active;

    // advance cursor: exactly one of the two moves, chosen by cmp
    left_ptr += cmp;
    right_ptr -= 1 - cmp;

    // backup phase: read the element under the cursor that just moved
    active = cmp * sort_vec[left_ptr] + (1 - cmp) * sort_vec[right_ptr];

    // swap active: process the previously saved value next, keep the new one
    std::swap(active, backup);
  }

  // cursors have met; the one value still held goes into the remaining slot
  sort_vec[left_ptr] = active;
  return left_ptr;
}
\end{minted}
\inputminted[firstline=5]{cpp}{advanced_topics/code/partition_comparison/partitions/in_place_predicated.h}

\begin{sidenotebox}{Conditional Understanding}
A Python script in the code included with these notes can be used to apply this algorithm and print each of its steps.
\end{sidenotebox}

\section{Stream Processing}


\section{Composable Data Processing}

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.14)
project(Partition-Benchmarks)
project(Partition-Comparison)

include(FetchContent)

Expand All @@ -24,5 +24,10 @@ FetchContent_MakeAvailable(
googlebenchmark
)

add_executable(${PROJECT_NAME} benchmarks.cpp)
target_link_libraries(${PROJECT_NAME} benchmark::benchmark)
add_executable(Benchmark benchmarks.cpp)
target_link_libraries(Benchmark benchmark::benchmark)

add_executable(Test tests.cpp)
target_link_libraries(Test gtest_main gmock) # Remove GTest::GTest

add_executable(Examples examples.cpp)
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
## What is this?
[Tests](tests.cpp) & [benchmarks](benchmarks.cpp) for the partition algorithms discussed in the advanced topics lecture.

## To build & Run
```bash
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
make -j -C build/
./build/Test
./build/Benchmark
./build/Examples
```
Original file line number Diff line number Diff line change
@@ -1,78 +1,96 @@
#include "in_place_conditional.h"
#include "in_place_predicated.h"
#include "out_of_place_conditional.h"
#include "out_of_place_predicated.h"
#include "partitions/in_place_conditional.h"
#include "partitions/in_place_predicated.h"
#include "partitions/out_of_place_conditional.h"
#include "partitions/out_of_place_predicated.h"

#include <benchmark/benchmark.h>


// Workload generator: the ascending sequence 0, 1, ..., n - 1.
std::vector<int> ordered_ints(size_t n) {
  std::vector<int> vec;
  vec.reserve(n);
  // size_t index: the comparison against n is unsigned on both sides, and the
  // conversion to the int element type is spelled out
  for (size_t i = 0; i < n; i++)
    vec.push_back(static_cast<int>(i));
  return vec;
}

// Workload generator: n values where even positions hold their own index i and
// odd positions hold n - i, so neighbouring values alternate between the low
// and high ends of the range.
std::vector<int> alternating_ints(size_t n) {
  std::vector<int> vec;
  vec.reserve(n);
  // size_t index: both the comparison against n and the subtraction n - i are
  // done in one unsigned type, with a single explicit conversion to int
  for (size_t i = 0; i < n; i++)
    vec.push_back(static_cast<int>(i % 2 == 0 ? i : n - i));
  return vec;
}

// Workload generator: n values drawn from rand().
// The generator is re-seeded with n on every call, so two calls with the same n
// start from the same seed.
std::vector<int> random_ints(size_t n) {
  std::vector<int> vec;
  vec.reserve(n);
  // srand takes an unsigned int: make the narrowing of n explicit
  srand(static_cast<unsigned int>(n));
  // size_t index so the comparison against n is unsigned on both sides
  for (size_t i = 0; i < n; i++)
    vec.push_back(rand());
  return vec;
}

// Benchmark driver for in-place partitioning.
// Template parameters: T is the element type, `partition` the function under
// test and `generate` the workload generator.
// Every iteration builds a fresh workload of state.range(0) elements while the
// timer is paused, then times one partition call over the whole vector.
template<class T, size_t partition(std::vector<T>&, size_t, size_t), std::vector<T> generate(size_t)>
template <class T, size_t partition(std::vector<T> &, size_t, size_t),
std::vector<T> generate(size_t)>
static void partition_in_place(benchmark::State &state) {
for (auto _ : state) {
// workload construction is kept out of the measured time
state.PauseTiming();
std::vector<T> workload(generate(state.range(0)));
std::vector<T> workload(generate(state.range(0)));
state.ResumeTiming();
partition(workload, 0, workload.size());
// keep the partitioned vector observable so the call is not optimised away
benchmark::DoNotOptimize(workload);
}
}

// Benchmark driver for out-of-place partitioning (not including the allocation
// time for the aux vector).
// Template parameters: T is the element type, `partition` the function under
// test and `generate` the workload generator.
// Every iteration builds a fresh workload and an equally sized aux vector while
// the timer is paused, then times one partition call over the whole workload.
template<class T, size_t partition(const std::vector<T>&, std::vector<T>&, size_t, size_t), std::vector<T> generate(size_t)>
template <class T,
size_t partition(const std::vector<T> &, std::vector<T> &, size_t,
size_t),
std::vector<T> generate(size_t)>
static void partition_out_of_place(benchmark::State &state) {
for (auto _ : state) {
// workload and output buffer construction are kept out of the measured time
state.PauseTiming();
std::vector<T> workload(generate(state.range(0)));
std::vector<T> aux(state.range(0));
std::vector<T> aux(state.range(0));
state.ResumeTiming();
partition(workload, aux, 0, workload.size());
// keep the output vector observable so the call is not optimised away
benchmark::DoNotOptimize(aux);
}
}

// Registrations: each of the four partition implementations is benchmarked on
// int elements against each workload generator. Range(8 * 1024, 64 * 1024)
// provides the sizes the drivers read back through state.range(0).

// ordered workload (ordered_ints)
BENCHMARK(partition_in_place<int, hoare::partition, ordered_ints>)->Range(8 * 1024, 64 * 1024);
BENCHMARK(partition_in_place<int, predicated_cracking::partition, ordered_ints>)->Range(8 * 1024, 64 * 1024);
BENCHMARK(partition_out_of_place<int, out_of_place_cond::partition, ordered_ints>)->Range(8 * 1024, 64 * 1024);
BENCHMARK(partition_out_of_place<int, out_of_place_pred::partition, ordered_ints>)->Range(8 * 1024, 64 * 1024);
BENCHMARK(partition_in_place<int, hoare::partition, ordered_ints>)
->Range(8 * 1024, 64 * 1024);
BENCHMARK(partition_in_place<int, predicated_cracking::partition, ordered_ints>)
->Range(8 * 1024, 64 * 1024);
BENCHMARK(
partition_out_of_place<int, out_of_place_cond::partition, ordered_ints>)
->Range(8 * 1024, 64 * 1024);
BENCHMARK(
partition_out_of_place<int, out_of_place_pred::partition, ordered_ints>)
->Range(8 * 1024, 64 * 1024);

// alternating workload (alternating_ints)
BENCHMARK(partition_in_place<int, hoare::partition, alternating_ints>)->Range(8 * 1024, 64 * 1024);
BENCHMARK(partition_in_place<int, predicated_cracking::partition, alternating_ints>)->Range(8 * 1024, 64 * 1024);
BENCHMARK(partition_out_of_place<int, out_of_place_cond::partition, alternating_ints>)->Range(8 * 1024, 64 * 1024);
BENCHMARK(partition_out_of_place<int, out_of_place_pred::partition, alternating_ints>)->Range(8 * 1024, 64 * 1024);
BENCHMARK(partition_in_place<int, hoare::partition, alternating_ints>)
->Range(8 * 1024, 64 * 1024);
BENCHMARK(
partition_in_place<int, predicated_cracking::partition, alternating_ints>)
->Range(8 * 1024, 64 * 1024);
BENCHMARK(
partition_out_of_place<int, out_of_place_cond::partition, alternating_ints>)
->Range(8 * 1024, 64 * 1024);
BENCHMARK(
partition_out_of_place<int, out_of_place_pred::partition, alternating_ints>)
->Range(8 * 1024, 64 * 1024);

// random workload (random_ints)
BENCHMARK(partition_in_place<int, hoare::partition, random_ints>)->Range(8 * 1024, 64 * 1024);
BENCHMARK(partition_in_place<int, predicated_cracking::partition, random_ints>)->Range(8 * 1024, 64 * 1024);
BENCHMARK(partition_out_of_place<int, out_of_place_cond::partition, random_ints>)->Range(8 * 1024, 64 * 1024);
BENCHMARK(partition_out_of_place<int, out_of_place_pred::partition, random_ints>)->Range(8 * 1024, 64 * 1024);
BENCHMARK(partition_in_place<int, hoare::partition, random_ints>)
->Range(8 * 1024, 64 * 1024);
BENCHMARK(partition_in_place<int, predicated_cracking::partition, random_ints>)
->Range(8 * 1024, 64 * 1024);
BENCHMARK(
partition_out_of_place<int, out_of_place_cond::partition, random_ints>)
->Range(8 * 1024, 64 * 1024);
BENCHMARK(
partition_out_of_place<int, out_of_place_pred::partition, random_ints>)
->Range(8 * 1024, 64 * 1024);

// To run this use:
// cd partition_comparison
// ```bash
// cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
// make -j -C build/
// ./build/Partition-Benchmarks
// ```
BENCHMARK_MAIN();
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#include "partitions/in_place_conditional.h"
#include "partitions/in_place_predicated.h"
#include "partitions/out_of_place_conditional.h"
#include "partitions/out_of_place_predicated.h"
#include <cstddef>
#include <iostream>
#include <vector>

// Writes every element of `v` to standard output, each one followed by a comma,
// then ends the line and flushes the stream.
template <typename T> void print_vec(const std::vector<T> &v) {
  for (auto it = v.begin(); it != v.end(); ++it) {
    std::cout << *it << ',';
  }
  std::cout << '\n' << std::flush;
}

// Runs the predicated cracking partition over a fixed list of ints, printing
// the list before and after the call, followed by the returned index and the
// element stored at it.
void example_1() {
  // Alternative input to try:
  //   {2, 1, 2, 6, 7, 34, 3, 5, 12, 3, 5, 7, 3, 23}
  std::vector<int> values = {2, 3, 12, 5, 6, 7, 34, 3, 2, 1, 3, 5, 7, 23};

  print_vec(values);
  const auto boundary =
      predicated_cracking::partition(values, 0, values.size());
  print_vec(values);

  std::cout << "partition at vec[" << boundary << "] = " << values[boundary]
            << '\n'
            << std::flush;
}

// Runs the out-of-place conditional partition on a single-element input,
// printing the input, the output buffer and the returned index.
void example_2() {
  std::vector<int> is = {7};
  std::vector<int> ans(is.size());

  print_vec(is);
  auto part = out_of_place_cond::partition(is, ans, 0, is.size());
  print_vec(ans);

  // Only subscript `ans` when the returned index lies inside it.
  // NOTE(review): a partition that returns its right-hand cursor after
  // decrementing it once per element not less than the pivot ends up one below
  // `start` here (the single element is the pivot), which wraps for start == 0.
  // Confirm against partitions/out_of_place_conditional.h.
  if (part < ans.size()) {
    std::cout << "partition at vec[" << part << "] = " << ans[part]
              << std::endl;
  } else {
    std::cout << "partition at vec[" << part << "] is out of range"
              << std::endl;
  }
}

// Entry point: runs each example in turn.
int main() {
  example_1();
  example_2();
  return 0;
}
Loading

0 comments on commit aec2013

Please sign in to comment.