Skip to content

Commit

Permalink
Remove GSL in favor of stdlib
Browse files Browse the repository at this point in the history
We mostly used GSL for its `span` class. After moving to the C++20
standard, we now have `std::span` available to us. It is more explicit
in how it works because it's driven by the standard, and it allows us
not to rely on a third party library.

`std::span` does no bound checks, so indexed access had to be guarded
with explicit checks when necessary. Some helper functions were
introduced, including equality operator.

`Expect` macros were replaced with `if` statements that throw
`std::invalid_argument` if the contract is not upheld.

Changelog-changed: GSL is removed as dependency
Changelog-changed: gsl::span is replaced with std::span
Signed-off-by: Michal Siedlaczek <[email protected]>
  • Loading branch information
elshize committed Dec 11, 2024
1 parent d645b4e commit b33c8f8
Show file tree
Hide file tree
Showing 35 changed files with 209 additions and 171 deletions.
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ target_link_libraries(pisa
Boost::boost
mio
mio_base
GSL
spdlog
fmt::fmt
range-v3
Expand Down
3 changes: 0 additions & 3 deletions external/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,6 @@ add_library(simdcomp STATIC ${CMAKE_CURRENT_SOURCE_DIR}/simdcomp/src/simdbitpack
${CMAKE_CURRENT_SOURCE_DIR}/simdcomp/src/simdcomputil.c
)

# Add GSL
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/GSL EXCLUDE_FROM_ALL)

# Add Boost
if (NOT PISA_SYSTEM_BOOST)
add_subdirectory(boost-cmake)
Expand Down
5 changes: 3 additions & 2 deletions include/pisa/codec/block_codec_registry.hpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
#pragma once

#include <algorithm>
#include <memory>
#include <span>
#include <string_view>

#include <fmt/format.h>
#include <gsl/span>

#include "codec/block_codec.hpp"

Expand Down Expand Up @@ -44,6 +45,6 @@ struct BlockCodecRegistry {
/**
* Lists the names of all known block codecs.
*/
[[nodiscard]] constexpr auto get_block_codec_names() -> gsl::span<std::string_view const>;
[[nodiscard]] constexpr auto get_block_codec_names() -> std::span<std::string_view const>;

} // namespace pisa
1 change: 0 additions & 1 deletion include/pisa/concepts/container.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

// Copyright 2024 PISA developers
//
// Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
6 changes: 3 additions & 3 deletions include/pisa/invert.hpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#pragma once

#include <optional>
#include <span>
#include <thread>
#include <vector>

#include <gsl/span>
#include <range/v3/view/iota.hpp>
#include <tbb/blocked_range.h>

Expand All @@ -16,7 +16,7 @@ namespace pisa { namespace invert {
using PostingIterator = typename std::vector<Posting>::iterator;
using Documents = std::unordered_map<Term_Id, std::vector<Document_Id>>;
using Frequencies = std::unordered_map<Term_Id, std::vector<Frequency>>;
using DocumentRange = gsl::span<gsl::span<Term_Id const>>;
using DocumentRange = std::span<std::span<Term_Id const>>;

/// Inverted index abstraction used internally in the inverting process.
///
Expand All @@ -42,7 +42,7 @@ namespace pisa { namespace invert {

/// A single slice view over a chunk of a forward index.
struct ForwardIndexSlice {
gsl::span<gsl::span<Term_Id const>> documents;
std::span<std::span<Term_Id const>> documents;
ranges::iota_view<Document_Id, Document_Id> document_ids;
};

Expand Down
5 changes: 2 additions & 3 deletions include/pisa/io.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@
#include <exception>
#include <filesystem>
#include <iostream>
#include <span>
#include <string>
#include <vector>

#include <gsl/span>

namespace pisa::io {

/// Indicates that a file was not found.
Expand Down Expand Up @@ -45,6 +44,6 @@ void for_each_line(std::istream& is, Function fn) {
[[nodiscard]] auto load_data(std::string const& data_file) -> std::vector<char>;

/// Writes bytes to a file.
void write_data(std::string const& data_file, gsl::span<std::byte const> bytes);
void write_data(std::string const& data_file, std::span<std::byte const> bytes);

} // namespace pisa::io
1 change: 0 additions & 1 deletion include/pisa/linear_quantizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
#include <cstdint>

#include <fmt/format.h>
#include <gsl/gsl_assert>

namespace pisa {

Expand Down
10 changes: 5 additions & 5 deletions include/pisa/memory_source.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

#include <filesystem>
#include <memory>
#include <span>
#include <vector>

#include <gsl/span>
#include <mio/mmap.hpp>

namespace pisa {
Expand All @@ -29,7 +29,7 @@ class MemorySource {
/// Constructs a memory source from a vector.
///
/// NOTE: This is non-owning source, so tread carefully!
[[nodiscard]] static auto from_span(gsl::span<char> span) -> MemorySource;
[[nodiscard]] static auto from_span(std::span<char> span) -> MemorySource;

/// Constructs a memory source using a memory mapped file.
///
Expand Down Expand Up @@ -65,13 +65,13 @@ class MemorySource {
[[nodiscard]] auto size() const -> size_type;

/// Full span over memory.
[[nodiscard]] auto span() const -> gsl::span<value_type const>;
[[nodiscard]] auto span() const -> std::span<value_type const>;

/// Subspan of memory.
///
/// \throws std::out_of_range if offset + size is out of bounds
[[nodiscard]] auto subspan(size_type offset, size_type size = gsl::dynamic_extent) const
-> gsl::span<value_type const>;
[[nodiscard]] auto subspan(size_type offset, size_type size = std::dynamic_extent) const
-> std::span<value_type const>;

/// Type erasure interface. Any type implementing it are supported as memory source.
struct Interface {
Expand Down
40 changes: 20 additions & 20 deletions include/pisa/payload_vector.hpp
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
#pragma once

#include <algorithm>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <iterator>
#include <optional>
#include <span>
#include <string_view>
#include <vector>

#include <fmt/format.h>
#include <gsl/gsl_assert>
#include <gsl/span>

namespace pisa {

Expand All @@ -24,8 +24,8 @@ namespace detail {
using value_type = Payload_View;
using difference_type = std::make_signed_t<size_type>;

typename gsl::span<size_type const>::iterator offset_iter;
typename gsl::span<std::byte const>::iterator payload_iter;
typename std::span<size_type const>::iterator offset_iter;
typename std::span<std::byte const>::iterator payload_iter;

constexpr auto operator++() -> Payload_Vector_Iterator& {
++offset_iter;
Expand Down Expand Up @@ -194,7 +194,7 @@ auto encode_payload_vector(InputIterator first, InputIterator last, PayloadEncod
}

template <typename Payload, typename PayloadEncodingFn>
auto encode_payload_vector(gsl::span<Payload const> values, PayloadEncodingFn encoding_fn) {
auto encode_payload_vector(std::span<Payload const> values, PayloadEncodingFn encoding_fn) {
return encode_payload_vector(values.begin(), values.end(), encoding_fn);
}

Expand All @@ -207,13 +207,13 @@ auto encode_payload_vector(InputIterator first, InputIterator last) {
});
}

inline auto encode_payload_vector(gsl::span<std::string const> values) {
inline auto encode_payload_vector(std::span<std::string const> values) {
return encode_payload_vector(values.begin(), values.end());
}

template <typename... T>
constexpr auto unpack_head(gsl::span<std::byte const> mem)
-> std::tuple<T..., gsl::span<std::byte const>> {
constexpr auto unpack_head(std::span<std::byte const> mem)
-> std::tuple<T..., std::span<std::byte const>> {
static_assert(detail::all_pod<T...>::value);
auto offset = detail::sizeofs<T...>::value;
if (offset > mem.size()) {
Expand All @@ -223,10 +223,10 @@ constexpr auto unpack_head(gsl::span<std::byte const> mem)
}
auto tail = mem.subspan(offset);
auto head = detail::unpack<T...>(mem.data());
return std::tuple_cat(head, std::tuple<gsl::span<std::byte const>>(tail));
return std::tuple_cat(head, std::tuple<std::span<std::byte const>>(tail));
}

[[nodiscard]] inline auto split(gsl::span<std::byte const> mem, std::size_t offset) {
[[nodiscard]] inline auto split(std::span<std::byte const> mem, std::size_t offset) {
if (offset > mem.size()) {
throw std::runtime_error(
fmt::format("Cannot split span of size {} at position {}", mem.size(), offset)
Expand All @@ -236,14 +236,14 @@ constexpr auto unpack_head(gsl::span<std::byte const> mem)
}

template <typename T>
[[nodiscard]] auto cast_span(gsl::span<std::byte const> mem) -> gsl::span<T const> {
[[nodiscard]] auto cast_span(std::span<std::byte const> mem) -> std::span<T const> {
auto type_size = sizeof(T);
if (mem.size() % type_size != 0) {
throw std::runtime_error(
fmt::format("Failed to cast byte-span to span of T of size {}", type_size)
);
}
return gsl::make_span(reinterpret_cast<T const*>(mem.data()), mem.size() / type_size);
return std::span(reinterpret_cast<T const*>(mem.data()), mem.size() / type_size);
}

template <typename Payload_View = std::string_view>
Expand All @@ -257,17 +257,17 @@ class Payload_Vector {
explicit Payload_Vector(Payload_Vector_Buffer const& container)
: offsets_(container.offsets), payloads_(container.payloads) {}

Payload_Vector(gsl::span<size_type const> offsets, gsl::span<std::byte const> payloads)
Payload_Vector(std::span<size_type const> offsets, std::span<std::byte const> payloads)
: offsets_(offsets), payloads_(payloads) {}

template <typename ContiguousContainer>
[[nodiscard]] constexpr static auto from(ContiguousContainer&& mem) -> Payload_Vector {
return from(gsl::make_span(reinterpret_cast<std::byte const*>(mem.data()), mem.size()));
return from(std::span(reinterpret_cast<std::byte const*>(mem.data()), mem.size()));
}

[[nodiscard]] static auto from(gsl::span<std::byte const> mem) -> Payload_Vector {
[[nodiscard]] static auto from(std::span<std::byte const> mem) -> Payload_Vector {
size_type length;
gsl::span<std::byte const> tail;
std::span<std::byte const> tail;
try {
std::tie(length, tail) = unpack_head<size_type>(mem);
} catch (std::runtime_error const& err) {
Expand All @@ -276,7 +276,7 @@ class Payload_Vector {
);
}

gsl::span<std::byte const> offsets, payloads;
std::span<std::byte const> offsets, payloads;
try {
std::tie(offsets, payloads) = split(tail, (length + 1U) * sizeof(size_type));
} catch (std::runtime_error const& err) {
Expand Down Expand Up @@ -314,8 +314,8 @@ class Payload_Vector {
}

private:
gsl::span<size_type const> offsets_;
gsl::span<std::byte const> payloads_;
std::span<size_type const> offsets_;
std::span<std::byte const> payloads_;
};

/// Find the position of `value` in a sorted range.
Expand All @@ -339,7 +339,7 @@ auto binary_search(Iter begin, Iter end, T value, Compare cmp = std::less<>{})
/// It calls the function overload that takes iterators. See that overload's documentation for more
/// information.
template <typename T, typename Compare = std::less<T>>
auto binary_search(gsl::span<std::add_const_t<T>> range, T value, Compare cmp = std::less<T>{})
auto binary_search(std::span<std::add_const_t<T>> range, T value, Compare cmp = std::less<T>{})
-> std::optional<std::ptrdiff_t> {
return pisa::binary_search(range.begin(), range.end(), value, cmp);
}
Expand Down
22 changes: 13 additions & 9 deletions include/pisa/reorder_docids.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@
#include <cstdint>
#include <fstream>
#include <random>
#include <span>
#include <string>
#include <vector>

#include <gsl/span>
#include <spdlog/spdlog.h>

#include "binary_freq_collection.hpp"
#include "payload_vector.hpp"
#include "recursive_graph_bisection.hpp"
#include "span.hpp"
#include "util/inverted_index_utils.hpp"
#include "util/progress.hpp"

Expand Down Expand Up @@ -80,8 +81,8 @@ namespace detail {
forward_index fwd = options.input_fwd
? forward_index::read(*options.input_fwd)
: forward_index::from_inverted_index(
options.input_basename, options.min_length, options.compress_fwd
);
options.input_basename, options.min_length, options.compress_fwd
);

if (options.output_fwd) {
forward_index::write(fwd, *options.output_fwd);
Expand Down Expand Up @@ -137,7 +138,7 @@ struct ReorderOptions {
inline auto reorder_postings(
binary_freq_collection const& input,
std::string_view output_basename,
gsl::span<std::uint32_t const> mapping
std::span<std::uint32_t const> mapping
) {
pisa::progress progress("Reassigning IDs in posting lists", input.size());

Expand All @@ -149,7 +150,7 @@ inline auto reorder_postings(
std::vector<std::pair<std::uint32_t, std::uint32_t>> posting_list;
for (const auto& seq: input) {
for (size_t i = 0; i < seq.docs.size(); ++i) {
posting_list.emplace_back(mapping[seq.docs.begin()[i]], seq.freqs.begin()[i]);
posting_list.emplace_back(pisa::at(mapping, seq.docs.begin()[i]), seq.freqs.begin()[i]);
}

std::sort(posting_list.begin(), posting_list.end());
Expand All @@ -169,7 +170,7 @@ inline auto reorder_postings(
inline auto reorder_lexicon(
std::string const& input_lexicon,
std::string const& output_lexicon,
gsl::span<std::uint32_t const> mapping
std::span<std::uint32_t const> mapping
)

{
Expand All @@ -187,16 +188,19 @@ inline auto reorder_lexicon(
inline auto reorder_sizes(
binary_collection const& input_sizes,
std::uint64_t num_docs,
gsl::span<std::uint32_t const> mapping,
std::span<std::uint32_t const> mapping,
std::string_view output_basename
) {
pisa::progress progress("Reordering document sizes", num_docs);
auto sizes = *input_sizes.begin();
if (sizes.size() != num_docs) {
throw std::invalid_argument("Invalid sizes file");
}
if (mapping.size() != num_docs) {
throw std::invalid_argument("Invalid mapping size");
}

auto size_sequence = gsl::span(sizes.begin(), sizes.size());
auto size_sequence = std::span(sizes.begin(), sizes.size());
std::vector<std::uint32_t> new_sizes(num_docs);
for (size_t i = 0; i < num_docs; ++i) {
new_sizes[mapping[i]] = size_sequence[i];
Expand All @@ -212,7 +216,7 @@ inline void reorder_from_mapping(
binary_freq_collection const& input_collection,
binary_collection const& input_sizes,
ReorderOptions const& options,
gsl::span<std::uint32_t const> mapping
std::span<std::uint32_t const> mapping
) {
auto num_docs = input_collection.num_docs();
reorder_sizes(input_sizes, num_docs, mapping, options.output_basename);
Expand Down
7 changes: 3 additions & 4 deletions include/pisa/sharding.hpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
#pragma once

#include <optional>
#include <span>

#include <gsl/span>
#include <spdlog/spdlog.h>

#include "io.hpp"
#include "type_safe.hpp"
#include "vec_map.hpp"

Expand All @@ -18,10 +17,10 @@ format_shard(std::string_view basename, Shard_Id shard, std::string_view suffix

auto resolve_shards(std::string_view basename, std::string_view suffix = {}) -> std::vector<Shard_Id>;

auto mapping_from_files(std::istream* full_titles, gsl::span<std::istream*> shard_titles)
auto mapping_from_files(std::istream* full_titles, std::span<std::istream*> shard_titles)
-> VecMap<Document_Id, Shard_Id>;

auto mapping_from_files(std::string const& full_titles, gsl::span<std::string const> shard_titles)
auto mapping_from_files(std::string const& full_titles, std::span<std::string const> shard_titles)
-> VecMap<Document_Id, Shard_Id>;

auto create_random_mapping(
Expand Down
Loading

0 comments on commit b33c8f8

Please sign in to comment.