Skip to content

Commit

Permalink
Add memory mapped int_vector and simplify int_vec
Browse files Browse the repository at this point in the history
This commit introduces a non-const data() member which allows raw
write access to the data stored in an int_vector similar to what is
possible with the standard std::vector. This allows "unfriending"
several of the util:: helper functions that modify int_vectors.

The second addition to the library is a memory mapped int_vector
(int_vector_mapper) which provides the same functionality as a regular
int_vector but is memory mapped from a file. Thus, operations such as
util::bit_compress can now be performed without loading the int_vector
to memory. The int_vector_mapper is solely used as a resource handle to
the data stored in the file. All operations are forwarded to the
int_vector implementation. Thus, unlike the int_vector_buffer, the
mapper can be used in regular stl algorithms as it provides const and
non const access similar to the regular int_vector. The mapper
additionally supports the push_back and resize operations which can be
used to write data to disk.

Temporary storage on disk can be realized using the temp_file_buffer
class which creates an int_vector_mapper object from a temporary file
which is deleted after the int_vector_mapper object is destroyed.
  • Loading branch information
mpetri committed May 22, 2014
1 parent da06f5f commit 8e63247
Show file tree
Hide file tree
Showing 8 changed files with 692 additions and 24 deletions.
88 changes: 88 additions & 0 deletions examples/int-vector-mapper.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#include <sdsl/int_vector_mapper.hpp>
#include <string>
#include <iostream>

using namespace sdsl;
using namespace std;

//! Example/self-check for int_vector_mapper and temp_file_buffer.
/*! Steps performed:
 *   (1) fill an int_vector, an int_vector<64> and a std::vector with the same
 *       pseudo-random values, bit-compress the int_vector and store it to
 *       "tmp_file.sdsl";
 *   (2) open that file through an int_vector_mapper and check size, width,
 *       every element, and the comparison operators against all three
 *       in-memory containers;
 *   (3) remove the file;
 *   (4) push the same values into a temp_file_buffer<64> and compare it
 *       against the std::vector.
 *
 *  \param argc Number of command line arguments.
 *  \param argv Command line arguments; only argv[0] is read (usage text).
 *  \returns 0 if every check passed, 1 as soon as any check fails.
 */
int main(int argc, char* argv[])
{
    // NOTE(review): argc is normally >= 1, so this usage text is effectively
    // never printed; kept as in the original example.
    if (argc < 1) {
        cout << "Usage: " << argv[0] << endl;
        cout << "(1) Writes an int_vector sequentially to a file" << endl;
        cout << "(2) Streams the content from file" << endl;
        cout << "(3) Remove the file" << endl;
        return 1;
    }
    string tmp_file = "tmp_file.sdsl";
    size_t size = 10000000;
    std::mt19937_64 rng(13);
    uint8_t width = 0;
    int_vector<> iv(size,0,64);
    int_vector<64> ivf(size,0);
    std::vector<uint64_t> stdv(size,0);

    // (1) write an int vector to disk
    {
        // fill all three containers with the same sequence of random values
        for (uint64_t i=0; i<size; ++i) {
            iv[i] = rng();
            stdv[i] = iv[i];
            ivf[i] = iv[i];
        }

        util::bit_compress(iv);
        width = iv.width();
        store_to_file(iv,tmp_file);
    }

    // (2) memory map the content of tmp_file
    // Note: the error paths in this scope return before step (3), so
    // tmp_file is left on disk when a check fails.
    {
        int_vector_mapper<> ivm(tmp_file);
        if (ivm.size() != size) {
            std::cerr << "ERROR: ivm.size()="<< ivm.size() << " != " << size << std::endl;
            return 1;
        }
        if (ivm.width() != width) {
            // uint8_t is streamed as a character; cast so the widths are
            // printed as numbers.
            std::cerr << "ERROR: ivm.width()="<< static_cast<unsigned>(ivm.width())
                      << " != " << static_cast<unsigned>(width) << std::endl;
            return 1;
        }
        rng.seed(13); // To get the same values as before use the same seed
        for (uint64_t i=0; i<ivm.size(); ++i) {
            uint64_t expected_value = rng();
            if (ivm[i] != expected_value) {
                std::cerr << "ERROR: ivm["<< i << "]=" << ivm[i] << " != " << expected_value << "= expected_value" << std::endl;
                return 1;
            }
        }

        // Whole-container comparisons; a mismatch is a failure like the
        // checks above, so terminate the message and exit non-zero.
        if (ivm != stdv) {
            std::cerr << "ERROR: std::vector CMP failed." << std::endl;
            return 1;
        }
        if (ivm != iv) {
            std::cerr << "ERROR: iv CMP failed." << std::endl;
            return 1;
        }
        if (ivm != ivf) {
            std::cerr << "ERROR: ivf CMP failed." << std::endl;
            return 1;
        }
    }

    // (3) remove the file as the mapper does not do that
    {
        sdsl::remove(tmp_file);
    }

    // (4) same data written through a temporary file buffer
    {
        auto tmp_buf = temp_file_buffer<64>::create();
        for (const auto& val : stdv) {
            tmp_buf.push_back(val);
        }
        if (tmp_buf != stdv) {
            std::cerr << "ERROR: tmp_buf CMP failed." << std::endl;
            return 1;
        }

        // tmp buf file is deleted automatically
    }

    return 0;
}
6 changes: 3 additions & 3 deletions include/sdsl/construct_sa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ void calculate_sa(const unsigned char* c, typename int_vector<fixedIntWidth>::si
if (32 == fixedIntWidth or(0==fixedIntWidth and 32 >= oldIntWidth)) {
sa.width(32);
sa.resize(len);
divsufsort(c, (int32_t*)sa.m_data, len);
divsufsort(c, (int32_t*)sa.data(), len);
// copy integers back to the right positions
if (oldIntWidth!=32) {
for (size_type i=0; i<len; ++i) {
Expand All @@ -102,7 +102,7 @@ void calculate_sa(const unsigned char* c, typename int_vector<fixedIntWidth>::si
throw std::logic_error("width of int_vector is to small for the text!!!");
}
int_vector<> sufarray(len,0,32);
divsufsort(c, (int32_t*)sufarray.m_data, len);
divsufsort(c, (int32_t*)sufarray.data(), len);
for (size_type i=0; i<len; ++i) {
sa[i] = sufarray[i];
}
Expand All @@ -111,7 +111,7 @@ void calculate_sa(const unsigned char* c, typename int_vector<fixedIntWidth>::si
uint8_t oldIntWidth = sa.width();
sa.width(64);
sa.resize(len);
divsufsort64(c, (int64_t*)sa.m_data, len);
divsufsort64(c, (int64_t*)sa.data(), len);
// copy integers back to the right positions
if (oldIntWidth!=64) {
for (size_type i=0; i<len; ++i) {
Expand Down
19 changes: 11 additions & 8 deletions include/sdsl/int_vector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ class int_vector_iterator;
template<class t_int_vector>
class int_vector_const_iterator;

template<uint8_t t_width>
class int_vector_mapper;

template<uint8_t b, uint8_t t_patter_len> // forward declaration
class rank_support_v;

Expand Down Expand Up @@ -269,19 +272,12 @@ class int_vector
friend class int_vector_iterator_base<int_vector>;
friend class int_vector_iterator<int_vector>;
friend class int_vector_const_iterator<int_vector>;
friend class int_vector_mapper<t_width>;
friend class coder::elias_delta;
friend class coder::elias_gamma;
friend class coder::fibonacci;
friend class memory_manager;

friend void util::set_random_bits<int_vector>(int_vector& v, int);
friend void util::_set_zero_bits<int_vector>(int_vector&);
friend void util::_set_one_bits<int_vector>(int_vector&);
friend void util::bit_compress<int_vector>(int_vector&);
friend void util::set_to_value<int_vector>(int_vector&, uint64_t);
friend bool load_vector_from_file<int_vector>(int_vector&, const std::string&,uint8_t,uint8_t);
friend void algorithm::calculate_sa<t_width>(const unsigned char* c, typename int_vector<t_width>::size_type len, int_vector<t_width>& sa);

enum { fixed_int_width = t_width }; // make template parameter accessible

private:
Expand Down Expand Up @@ -377,6 +373,13 @@ for (auto x : il) {
return m_data;
}

//! Pointer to the raw data of the int_vector
/*! \returns pointer to the raw data of the int_vector
*/
uint64_t* data() {
return m_data;
}

//! Get the integer value of the binary string of length len starting at position idx in the int_vector.
/*! \param idx Starting index of the binary representation of the integer.
\param len Length of the binary representation of the integer. Default value is 64.
Expand Down
Loading

0 comments on commit 8e63247

Please sign in to comment.