Skip to content

Commit

Permalink
add: parse sas file
Browse files Browse the repository at this point in the history
  • Loading branch information
daliuzhen1 committed Dec 10, 2019
1 parent ceb346b commit f769bd6
Showing 1 changed file with 397 additions and 0 deletions.
397 changes: 397 additions & 0 deletions include/xframe/xio_sas.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,397 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XFRAME_IO_SAS_HPP
#define XFRAME_IO_SAS_HPP

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <type_traits>
#include <vector>

#include <xtl/xany.hpp>

#include "xvariable.hpp"

namespace xf
{
/// Selects which SAS container layout a reader should expect.
/// `sas7bdata` is the first enumerator (value 0); `xport` follows it (value 1).
enum class sas_format
{
    sas7bdata = 0,
    xport = 1
};
// Entry point for reading a SAS file: takes an input file stream plus the
// container format (defaulting to sas_format::sas7bdata) and returns an
// xvariable holding xtl::any values with fstring-keyed coordinates.
// NOTE(review): only this declaration is live; the definition near the end of
// the file is commented out, so any use of read_sas will not link yet.
// NOTE(review): the stream is taken by const reference, while
// detail::xsas7bdat_reader's constructor requires a mutable std::ifstream& --
// confirm the intended signature before implementing.
inline xf::xvariable<xtl::any, xf::xcoordinate<xf::fstring>> read_sas(const std::ifstream& ifs, const sas_format& format = sas_format::sas7bdata);

namespace detail
{
// Values of the `endian` byte in the file header (compared in parse_head).
constexpr uint8_t sas_endian_big = 0x00;
constexpr uint8_t sas_endian_little = 0x01;

// Values of the `file_format` header character.
// NOTE(review): not referenced by the parser yet.
constexpr char sas_file_format_unix = '1';
constexpr char sas_file_format_windows = '2';

// Values of the header alignment bytes `a1` / `a2` (compared in parse_head):
// 0x33 on `a1` adds 4 bytes of padding, 0x33 on `a2` selects the 64-bit layout.
constexpr uint8_t sas_aligment_offset_0 = 0x22;
constexpr uint8_t sas_aligment_offset_4 = 0x33;

// Column type tags. NOTE(review): not referenced by the parser yet.
constexpr uint8_t sas_column_type_number = 0x01;
constexpr uint8_t sas_column_type_char = 0x02;

// Subheader signatures used by parse_page to pick a subheader parser.
constexpr uint32_t sas_subheader_signature_row_size = 0xF7F7F7F7;
constexpr uint32_t sas_subheader_signature_column_size = 0xF6F6F6F6;
constexpr uint32_t sas_subheader_signature_counts = 0xFFFFFC00;
constexpr uint32_t sas_subheader_signature_column_format = 0xFFFFFBFE;

constexpr uint32_t sas_subheader_signature_column_attrs = 0xFFFFFFFC;
constexpr uint32_t sas_subheader_signature_column_text = 0xFFFFFFFD;
constexpr uint32_t sas_subheader_signature_column_list = 0xFFFFFFFE;
constexpr uint32_t sas_subheader_signature_column_name = 0xFFFFFFFF;

// Bits common to the four column_* signatures above (0xFFFFFFFC..0xFFFFFFFF).
// parse_page referenced this name but it was never defined.
constexpr uint32_t sas_subheader_signature_column_mask = 0xFFFFFFF8;

// Page type values and the mask used to extract the base type.
// NOTE(review): page-type filtering is still a TODO in parse_page.
constexpr uint16_t sas_page_type_meta = 0x0000;
constexpr uint16_t sas_page_type_data = 0x0100;
constexpr uint16_t sas_page_type_mix = 0x0200;
constexpr uint16_t sas_page_type_amd = 0x0400;
constexpr uint16_t sas_page_type_mask = 0x0F00;

constexpr uint16_t sas_page_type_meta2 = 0x4000;
constexpr uint16_t sas_page_type_comp = 0x9000;

// Size in bytes of one subheader pointer entry (32-bit / 64-bit layout).
constexpr uint64_t sas_subheader_pointer_size_32bit = 12;
constexpr uint64_t sas_subheader_pointer_size_64bit = 24;

// Size in bytes of the page header that precedes the subheader pointer table.
constexpr uint64_t sas_page_header_size_32bit = 24;
constexpr uint64_t sas_page_header_size_64bit = 40;

// Values of the `compression` byte of a subheader pointer.
// NOTE(review): not acted upon by the parser yet.
constexpr uint8_t sas_compression_none = 0x00;
constexpr uint8_t sas_compression_trunc = 0x01;
constexpr uint8_t sas_compression_row = 0x04;

constexpr uint64_t sas_default_file_version = 9;

// 32-byte signature expected at the very start of a SAS7BDAT file; parse_head
// compares the first 32 header bytes against this table with memcmp.
constexpr unsigned char sas7bdat_magic_number[32] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0xc2, 0xea, 0x81, 0x60,
0xb3, 0x14, 0x11, 0xcf, 0xbd, 0x92, 0x08, 0x00,
0x09, 0xc7, 0x31, 0x8c, 0x18, 0x1f, 0x10, 0x11
};

// Same layout as the table above except for byte 15 (0x63 instead of 0x60).
// NOTE(review): presumably the SAS7BCAT (catalog) signature; it is not
// referenced anywhere in this file yet.
constexpr unsigned char sas7bcat_magic_number[32] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0xc2, 0xea, 0x81, 0x63,
0xb3, 0x14, 0x11, 0xcf, 0xbd, 0x92, 0x08, 0x00,
0x09, 0xc7, 0x31, 0x8c, 0x18, 0x1f, 0x10, 0x11
};

/**
 * Parser for SAS7BDAT files read through a caller-owned std::ifstream.
 *
 * The reader only stores a reference to the stream, so the stream must
 * outlive the reader. Copying and moving are disabled.
 *
 * All member functions are declared inline because their definitions live in
 * this header; without it, including the header from two translation units
 * would violate the one-definition rule.
 */
class xsas7bdat_reader
{
public:

    /// Binds the reader to @p ifs.
    explicit inline xsas7bdat_reader(std::ifstream& ifs);

    xsas7bdat_reader(const xsas7bdat_reader&) = delete;
    xsas7bdat_reader(xsas7bdat_reader&&) = delete;
    xsas7bdat_reader& operator=(const xsas7bdat_reader&) = delete;
    xsas7bdat_reader& operator=(xsas7bdat_reader&&) = delete;

    /// Parses the file header, then walks every page of the file.
    inline std::vector<std::string> parse_meta();
    /// NOTE(review): declared only; no definition exists in this file yet.
    inline std::vector<std::string> parse_data();

private:

    /// Returns true when the host stores the least significant byte first.
    static bool little_endian() noexcept
    {
        const int probe{0x01};
        // Inspecting an object through unsigned char* is always permitted.
        return *reinterpret_cast<const unsigned char*>(&probe) == 0x01;
    }

    // NOTE(review): the two member templates below are declared but never
    // defined in this file; the parsers use the namespace-scope
    // swap_endian / read_sas_data helpers instead.
    template <typename T>
    inline auto swap_endian(const T&) -> T;
    template <typename T>
    inline auto read_data(const std::ifstream&, bool) -> T;

    inline void parse_head();
    // Takes the whole page buffer, matching the definition and the call in
    // parse_meta (the previous declaration took an iterator).
    inline void parse_page(std::string& page_memory);
    // NOTE(review): declared only; no definition exists in this file yet.
    inline void parse_subheader(std::string::iterator& it, uint16_t subheader_pointers_count);

    /// Decodes one pointer-table entry: (offset, length, compression, type).
    inline std::tuple<uint64_t, uint64_t, uint8_t, uint8_t> parse_subheader_pointer(std::string::iterator& it);

    // Per-subheader parsers; each receives an iterator into the page buffer.
    inline void parse_row_size_subheader(std::string::iterator& it);
    inline void parse_column_size_subheader(std::string::iterator& it);
    inline void parse_column_text_subheader(std::string::iterator& it);
    inline void parse_column_name_subheader(std::string::iterator& it, uint64_t length);

    uint64_t m_page_count {0};
    uint64_t m_header_size {0};
    uint64_t m_page_size {0};
    uint64_t m_max_row_count {0};
    uint64_t m_row_length {0};
    uint64_t m_total_row_count {0};  // previously left uninitialized
    uint64_t m_col_count {0};        // previously left uninitialized
    bool m_u64 {false};              // true when the file uses the 64-bit layout
    bool m_swap {false};             // true when file and host byte order differ
    std::ifstream& m_sas_ifs;        // not owned
};

/// Stores a reference to @p ifs; nothing is read from the stream here.
/// Defined inline because this definition lives in a header.
inline xsas7bdat_reader::xsas7bdat_reader(std::ifstream& ifs)
    : m_sas_ifs(ifs)
{
}

/**
 * Returns a copy of @p val with the order of its bytes reversed.
 *
 * The bytes are moved with std::memcpy rather than read through an inactive
 * union member, which is undefined behaviour in C++.
 *
 * @tparam T a trivially copyable, default-constructible type.
 */
template <typename T>
inline auto swap_endian(const T &val) -> T
{
    static_assert(std::is_trivially_copyable<T>::value,
                  "swap_endian requires a trivially copyable type");

    std::array<unsigned char, sizeof(T)> raw;
    std::memcpy(raw.data(), &val, sizeof(T));
    std::reverse(raw.begin(), raw.end());

    T swapped;
    std::memcpy(&swapped, raw.data(), sizeof(T));
    return swapped;
}

/**
 * Reads sizeof(T) raw bytes from @p ifs into a T and returns it.
 *
 * @param ifs  stream to read from; its position advances by sizeof(T).
 * @param swap when true the bytes are reversed with swap_endian before return.
 * @throws std::runtime_error if the stream cannot deliver sizeof(T) bytes.
 */
template <typename T>
inline auto read_sas_data(std::ifstream& ifs ,bool swap) -> T
{
    T data;
    if (!ifs.read(reinterpret_cast<char*>(&data), sizeof(data)))
        throw std::runtime_error("parse sas error: unexpected end of stream");
    return swap ? swap_endian(data) : data;
}

/**
 * Decodes a T from the sizeof(T) bytes starting at @p memory_it and advances
 * the iterator past them.
 *
 * The bytes are copied straight into the result instead of going through a
 * temporary std::string.
 *
 * @param memory_it position inside a byte buffer; the caller must guarantee
 *                  that at least sizeof(T) bytes are available, since no
 *                  bounds check is possible from an iterator alone.
 * @param swap      when true the bytes are reversed with swap_endian.
 */
template <typename T>
inline auto read_sas_data(std::string::iterator& memory_it, bool swap) -> T
{
    T data;
    std::copy_n(memory_it, sizeof(T), reinterpret_cast<char*>(&data));
    memory_it += sizeof(T);
    return swap ? swap_endian(data) : data;
}
/**
 * Parses the file header and then feeds every page of the file to parse_page.
 *
 * @return currently always an empty vector.
 *         TODO: nothing is collected from the pages yet, so there is nothing
 *         meaningful to return; the function previously fell off the end
 *         without returning at all.
 * @throws std::runtime_error if a page cannot be located or fully read.
 */
inline std::vector<std::string> xsas7bdat_reader::parse_meta()
{
    parse_head();

    // One buffer of exactly one page, reused for every page. The original
    // read m_page_size bytes into an empty string, writing past its storage.
    std::string page_memory(static_cast<std::size_t>(m_page_size), '\0');

    for (uint64_t idx = 0; idx < m_page_count; ++idx)
    {
        // Pages are laid out back to back after the file header.
        const auto page_offset = static_cast<std::streamoff>(m_header_size + idx * m_page_size);
        if (!m_sas_ifs.seekg(page_offset, std::ios_base::beg))
            throw std::runtime_error("parse sas error");

        if (!m_sas_ifs.read(&page_memory[0], static_cast<std::streamsize>(m_page_size)))
            throw std::runtime_error("parse sas error");

        parse_page(page_memory);
    }

    return {};
}
// template <typename T>
// inline void seekg(std::ifstream& ifs, std::ios_base::seekdir& seekdir)
// {
// ifs.seekg(sizeof(T), seekdir);
// return;
// }

// inline void seekg(std::ifstream& ifs, size_t pos, std::ios_base::seekdir& seekdir)
// {
// ifs.seekg(pos, seekdir);
// return;
// }

// template <typename T>
// inline void seekg(std::string::iterator& memory_it)
// {
// memory_it += sizeof(T);
// return;
// }

/// Moves @p memory_it forward by @p pos bytes. No bounds checking is done;
/// the caller must keep the iterator inside its buffer.
inline void seekg(std::string::iterator& memory_it, size_t pos)
{
    memory_it = memory_it + static_cast<std::string::difference_type>(pos);
}

/**
 * Reads the fixed file header from the current stream position and fills in
 * m_u64, m_swap, m_header_size, m_page_size and m_page_count.
 *
 * @throws std::runtime_error if the stream is too short, the magic number
 *         does not match, the endianness marker is unknown, or a size/count
 *         field is outside the accepted range.
 */
inline void xsas7bdat_reader::parse_head()
{
    // Byte-exact image of the first 164 bytes of the file.
#pragma pack(push, 1)
    struct sas_header_begin
    {
        unsigned char magic_number[32];
        unsigned char a2;            // 0x33 selects the 64-bit layout
        unsigned char mystery1[2];
        unsigned char a1;            // 0x33 adds 4 bytes of padding further on
        unsigned char mystery2[1];
        unsigned char endian;        // sas_endian_big or sas_endian_little
        unsigned char mystery3[1];
        char file_format;
        unsigned char mystery4[30];
        unsigned char encoding;
        unsigned char mystery5[13];
        char file_type[8];
        char file_label[64];
        char file_info[8];
    };
#pragma pack(pop)
    // Lock the layout in: the struct is read from disk as raw bytes.
    static_assert(sizeof(sas_header_begin) == 164, "sas_header_begin must be exactly 164 bytes");

    // The leading fields are single bytes / byte arrays, so no swap is needed.
    const auto header_begin = read_sas_data<sas_header_begin>(m_sas_ifs, false);
    if (std::memcmp(header_begin.magic_number, sas7bdat_magic_number, sizeof(sas7bdat_magic_number)) != 0)
        throw std::runtime_error("parse sas error: bad sas7bdat magic number");

    const std::streamoff alignment_pad = (header_begin.a1 == sas_aligment_offset_4) ? 4 : 0;
    m_u64 = (header_begin.a2 == sas_aligment_offset_4);

    // Swap multi-byte fields only when file and host byte order differ.
    if (header_begin.endian == sas_endian_big)
        m_swap = little_endian();
    else if (header_begin.endian == sas_endian_little)
        m_swap = !little_endian();
    else
        throw std::runtime_error("parse sas error");

    // Skip the optional alignment padding, two doubles and 16 further bytes
    // to reach the header-size field.
    const std::streamoff skip = alignment_pad + static_cast<std::streamoff>(sizeof(double) * 2 + 16);
    if (!m_sas_ifs.seekg(skip, std::ios_base::cur))
        throw std::runtime_error("parse sas error");

    m_header_size = read_sas_data<uint32_t>(m_sas_ifs, m_swap);
    m_page_size = read_sas_data<uint32_t>(m_sas_ifs, m_swap);
    if (m_header_size < 1024 || m_page_size < 1024)
        throw std::runtime_error("parse sas error: header or page size below 1024 bytes");
    if (m_header_size > (1 << 20) || m_page_size > (1 << 24))
        throw std::runtime_error("parse sas error: header or page size too large");

    // The page count is 8 bytes wide in the 64-bit layout, 4 bytes otherwise.
    if (m_u64)
        m_page_count = read_sas_data<uint64_t>(m_sas_ifs, m_swap);
    else
        m_page_count = read_sas_data<uint32_t>(m_sas_ifs, m_swap);
    if (m_page_count > (1 << 24))
        throw std::runtime_error("parse sas error: page count too large");
}

void xsas7bdat_reader::parse_page(std::string& page_memory)
{
auto it = page_memory.begin();
auto signature = read_sas_data<uint32_t>(it, m_swap);
m_u64 ? seekg(it, 12) : seekg(it, 28);

auto page_type = read_sas_data<uint16_t>(it, m_swap);
//TODO
// if ((page_type & sas_page_type_mask) == sas_page_type_data)
// break;
// if ((page_type & sas_page_type_comp))
// continue;
auto data_block_count = read_sas_data<uint16_t>(it, m_swap);
auto subheader_pointers_count = read_sas_data<uint16_t>(it, m_swap);
seekg(it, 2);
auto subheader_pointer_size = m_u64 ? sas_subheader_pointer_size_64bit : sas_subheader_pointer_size_32bit;
for (auto idx = 0; idx < subheader_pointers_count; idx++)
{
auto it = it + subheader_pointer_size;
auto ret = parse_subheader_pointer(it);
auto subheader_it = page_memory.begin() + std::get<0>(ret);
if (signature == sas_subheader_signature_row_size) {
if (std::get<1>(ret) < (m_u64 ? 128 : 64))
throw std::runtime_error("");
parse_row_size_subheader(subheader_it);
} else if (signature == sas_subheader_signature_column_size) {
if (std::get<1>(ret) < (m_u64 ? 16 : 8))
throw std::runtime_error("");
parse_column_size_subheader(subheader_it);
} else if (signature == sas_subheader_signature_counts) {
/* void */
} else if (signature == sas_subheader_signature_column_text) {
parse_column_text_subheader(subheader_it);
} else if (signature == sas_subheader_signature_column_name) {
parse_column_name_subheader(subheader_it, std::get<1>(ret));
} else if (signature == sas_subheader_signature_column_attrs) {
parse_column_attributes_subheader(subheader_it);
} else if (signature == sas_subheader_signature_column_format) {
parse_column_format_subheader(subheader_it);
} else if (signature == sas_subheader_signature_column_list) {
/* void */
} else if ((signature & sas_subheader_signature_column_mask) == sas_subheader_signature_column_mask) {
/* void */
} else {
/* void */
}
}
}

std::tuple<uint64_t, uint64_t, uint8_t, uint8_t> xsas7bdat_reader::parse_subheader_pointer()
{
auto offset_to_subhead = 0;
auto length = 0;
auto compression = 0;
auto subheader_type = 0;
if (m_u64)
{
offset_to_subhead = read_sas_data<uint64_t>(it, m_swap);
length = read_sas_data<uint64_t>(it, m_swap);
compression = read_sas_data<uint8_t>(it, m_swap);
subheader_type = read_sas_data<uint8_t>(it, m_swap);
seekg(it, 7);
}
else
{
offset_to_subhead = read_sas_data<uint32_t>(it, m_swap);
length = read_sas_data<uint32_t>(it, m_swap);
compression = read_sas_data<uint8_t>(it, m_swap);
subheader_type = read_sas_data<uint8_t>(it, m_swap);
seekg(it, 3);
}
return std::make_tuple(offset_to_subhead, length, compression, subheader_type);
}

/**
 * Extracts m_row_length, m_total_row_count and m_max_row_count from a
 * row-size subheader.
 *
 * @param it iterator at the start of the subheader; advanced past the last
 *           field read.
 *
 * Field offsets from the subheader start (32-bit | 64-bit layout):
 * row length 20 | 40, total row count 24 | 48, max row count 60 | 120.
 * The previous skip before the last field (36 | 72) was the distance between
 * field starts and ignored the bytes just consumed, so the read landed at
 * 64 | 128 -- outside the minimum length the caller validates.
 */
inline void xsas7bdat_reader::parse_row_size_subheader(std::string::iterator& it)
{
    if (m_u64)
    {
        seekg(it, 40);
        m_row_length = read_sas_data<uint64_t>(it, m_swap);       // bytes 40..47
        m_total_row_count = read_sas_data<uint64_t>(it, m_swap);  // bytes 48..55
        seekg(it, 120 - 56);
        m_max_row_count = read_sas_data<uint64_t>(it, m_swap);    // bytes 120..127
    }
    else
    {
        seekg(it, 20);
        m_row_length = read_sas_data<uint32_t>(it, m_swap);       // bytes 20..23
        m_total_row_count = read_sas_data<uint32_t>(it, m_swap);  // bytes 24..27
        seekg(it, 60 - 28);
        m_max_row_count = read_sas_data<uint32_t>(it, m_swap);    // bytes 60..63
    }
}

void xsas7bdat_reader::parse_column_size_subheader(std::string::iterator& it)
{
m_col_count = m_u64 ? read_sas_data<uint64_t>(it, m_swap) : read_sas_data<uint32_t>(it, m_swap);
}
/**
 * Placeholder for the column-text subheader parser.
 *
 * The previous body was a copy of parse_column_size_subheader and overwrote
 * m_col_count with bytes from a subheader that is not the column-size one,
 * corrupting the count whenever a text subheader followed.
 *
 * TODO: store the text block once the reader has a member to hold it.
 */
inline void xsas7bdat_reader::parse_column_text_subheader(std::string::iterator& it)
{
    static_cast<void>(it);
}

/**
 * Placeholder for the column-name subheader parser: validates the length and
 * computes how many 8-byte name entries the subheader holds.
 *
 * @param it     iterator at the start of the subheader (unused for now).
 * @param length subheader length taken from its pointer entry.
 * @throws std::runtime_error if @p length is smaller than the fixed overhead
 *         (20 bytes in the 32-bit layout, 28 in the 64-bit one); the
 *         subtraction would otherwise wrap around.
 *
 * The stray `readstat_error_t` / `READSTAT_OK` line was removed: neither name
 * is declared anywhere in this project.
 *
 * TODO: decode the name entries themselves.
 */
inline void xsas7bdat_reader::parse_column_name_subheader(std::string::iterator& it, uint64_t length)
{
    const uint64_t fixed_overhead = m_u64 ? 28 : 20;
    if (length < fixed_overhead)
        throw std::runtime_error("parse sas error: column name subheader too short");

    const uint64_t cmax = (length - fixed_overhead) / 8;
    static_cast<void>(cmax);
    static_cast<void>(it);
}

// void xsas7bdat_reader::parse_column_attrs_subheader()
// {
// }

// void xsas7bdat_reader::parse_column_format_subheader()
// {

// }


}
// inline xf::xvariable<xtl::any, xf::xcoordinate<xf::fstring>> xf::read_sas(const std::ifstream& ifs, const sas_format& format = sas_format::sas7bdata)
// {
// if (format == sas_format::sas7bdata)
// {
// detail::xsas7bdat_reader reader(ifs);
// return reader.parse();
// }
// else
// {

// }
// return xf::xvariable<xtl::any, xf::xcoordinate<xf::fstring>();
// }
}

#endif

0 comments on commit f769bd6

Please sign in to comment.