-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
ceb346b
commit f769bd6
Showing
1 changed file
with
397 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,397 @@ | ||
/*************************************************************************** | ||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * | ||
* Martin Renou * | ||
* Copyright (c) QuantStack * | ||
* * | ||
* Distributed under the terms of the BSD 3-Clause License. * | ||
* * | ||
* The full license is in the file LICENSE, distributed with this software. * | ||
****************************************************************************/ | ||
|
||
#ifndef XFRAME_IO_SAS_HPP | ||
#define XFRAME_IO_SAS_HPP | ||
|
||
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <type_traits>
#include <vector>
|
||
#include <xtl/xany.hpp> | ||
|
||
#include "xvariable.hpp" | ||
|
||
namespace xf | ||
{ | ||
// File formats that can be requested from read_sas.
enum class sas_format
{
    sas7bdata = 0,  // default value of read_sas's `format` argument
    xport = 1
};
inline xf::xvariable<xtl::any, xf::xcoordinate<xf::fstring>> read_sas(const std::ifstream& ifs, const sas_format& format = sas_format::sas7bdata); | ||
|
||
namespace detail | ||
{ | ||
// Values of the header's `endian` byte (checked in parse_head).
constexpr uint8_t sas_endian_big{0x00};
constexpr uint8_t sas_endian_little{0x01};

// File-format marker characters.
constexpr char sas_file_format_unix{'1'};
constexpr char sas_file_format_windows{'2'};

// Values of the header's `a1` / `a2` alignment bytes (checked in parse_head).
constexpr uint8_t sas_aligment_offset_0{0x22};
constexpr uint8_t sas_aligment_offset_4{0x33};

// Column type codes.
constexpr uint8_t sas_column_type_number{0x01};
constexpr uint8_t sas_column_type_char{0x02};

// Subheader signatures (dispatched on in parse_page).
constexpr uint32_t sas_subheader_signature_row_size{0xF7F7F7F7};
constexpr uint32_t sas_subheader_signature_column_size{0xF6F6F6F6};
constexpr uint32_t sas_subheader_signature_counts{0xFFFFFC00};
constexpr uint32_t sas_subheader_signature_column_format{0xFFFFFBFE};

constexpr uint32_t sas_subheader_signature_column_attrs{0xFFFFFFFC};
constexpr uint32_t sas_subheader_signature_column_text{0xFFFFFFFD};
constexpr uint32_t sas_subheader_signature_column_list{0xFFFFFFFE};
constexpr uint32_t sas_subheader_signature_column_name{0xFFFFFFFF};

// Page type codes and the mask applied to them.
constexpr uint16_t sas_page_type_meta{0x0000};
constexpr uint16_t sas_page_type_data{0x0100};
constexpr uint16_t sas_page_type_mix{0x0200};
constexpr uint16_t sas_page_type_amd{0x0400};
constexpr uint16_t sas_page_type_mask{0x0F00};

constexpr uint16_t sas_page_type_meta2{0x4000};
constexpr uint16_t sas_page_type_comp{0x9000};

// Size in bytes of one subheader pointer entry, per file layout.
constexpr uint64_t sas_subheader_pointer_size_32bit{12};
constexpr uint64_t sas_subheader_pointer_size_64bit{24};

// Size in bytes of the page header, per file layout.
constexpr uint64_t sas_page_header_size_32bit{24};
constexpr uint64_t sas_page_header_size_64bit{40};

// Compression codes.
constexpr uint8_t sas_compression_none{0x00};
constexpr uint8_t sas_compression_trunc{0x01};
constexpr uint8_t sas_compression_row{0x04};

constexpr uint64_t sas_default_file_version{9};

// Expected first 32 bytes of a sas7bdat file (compared in parse_head).
constexpr unsigned char sas7bdat_magic_number[32] = {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0xc2, 0xea, 0x81, 0x60,
    0xb3, 0x14, 0x11, 0xcf, 0xbd, 0x92, 0x08, 0x00,
    0x09, 0xc7, 0x31, 0x8c, 0x18, 0x1f, 0x10, 0x11
};

// Same layout as the sas7bdat magic number; differs only in byte 15 (0x63).
constexpr unsigned char sas7bcat_magic_number[32] = {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0xc2, 0xea, 0x81, 0x63,
    0xb3, 0x14, 0x11, 0xcf, 0xbd, 0x92, 0x08, 0x00,
    0x09, 0xc7, 0x31, 0x8c, 0x18, 0x1f, 0x10, 0x11
};
|
||
// Reader for sas7bdat files.
//
// The reader keeps a reference to a caller-owned std::ifstream, so the stream
// must outlive the reader. Instances can be neither copied nor moved.
class xsas7bdat_reader
{
public:
    // Binds the reader to `ifs`.
    explicit xsas7bdat_reader(std::ifstream& ifs);

    // The copy constructor takes a const reference so that the deleted
    // overload is the canonical copy constructor.
    xsas7bdat_reader(const xsas7bdat_reader&) = delete;
    xsas7bdat_reader(xsas7bdat_reader&&) = delete;
    xsas7bdat_reader& operator=(const xsas7bdat_reader&) = delete;
    xsas7bdat_reader& operator=(xsas7bdat_reader&&) = delete;

    std::vector<std::string> parse_meta();
    std::vector<std::string> parse_data();

private:
    // Returns true when the host stores the least significant byte of an
    // int at the lowest address.
    inline bool little_endian()
    {
        const int value { 0x01 };
        const void * address = static_cast<const void *>(&value);
        const unsigned char * least_significant_address = static_cast<const unsigned char *>(address);
        return (*least_significant_address == 0x01);
    }

    // NOTE(review): these two member templates are declared but not defined
    // in this file; the parsing code uses the free functions swap_endian and
    // read_sas_data instead.
    template <typename T>
    inline auto swap_endian(const T&) -> T;
    template <typename T>
    inline auto read_data(const std::ifstream& ,bool) -> T;

    inline void parse_head();
    // Takes the page buffer itself, matching the out-of-class definition.
    inline void parse_page(std::string& page_memory);
    inline void parse_subheader(std::string::iterator& it, uint16_t subheader_pointers_count);

    std::tuple<uint64_t, uint64_t, uint8_t, uint8_t> parse_subheader_pointer(std::string::iterator& it);

    // Subheader parsers defined further down in this header; they have to be
    // declared here for those definitions (and the calls in parse_page) to
    // compile.
    inline void parse_row_size_subheader(std::string::iterator& it);
    inline void parse_column_size_subheader(std::string::iterator& it);
    inline void parse_column_text_subheader(std::string::iterator& it);
    inline void parse_column_name_subheader(std::string::iterator& it, uint64_t length);

    uint64_t m_page_count {0};
    uint64_t m_header_size {0};
    uint64_t m_page_size {0};
    uint64_t m_max_row_count {0};
    uint64_t m_row_length {0};
    uint64_t m_total_row_count {0};  // previously left uninitialised
    uint64_t m_col_count {0};        // previously left uninitialised
    bool m_u64 {false};              // set by parse_head from the header's a2 byte
    bool m_swap {false};             // true when file and host byte order differ
    std::ifstream& m_sas_ifs;
};
|
||
// Stores a reference to `ifs`; nothing is read from the stream here.
// Marked inline because this definition lives in a header: without it every
// translation unit including the header would emit its own definition.
inline xsas7bdat_reader::xsas7bdat_reader(std::ifstream& ifs)
    : m_sas_ifs(ifs)
{
}
|
||
// Returns a copy of `val` with the order of its bytes reversed.
//
// The bytes are moved with std::memcpy instead of being read back through an
// inactive union member, which is undefined behaviour in C++.
// T must be trivially copyable and default constructible.
template <typename T>
inline auto swap_endian(const T &val) -> T
{
    static_assert(std::is_trivially_copyable<T>::value,
                  "swap_endian requires a trivially copyable type");

    unsigned char raw[sizeof(T)];
    std::memcpy(raw, &val, sizeof(T));
    std::reverse(raw, raw + sizeof(T));

    T swapped;
    std::memcpy(&swapped, raw, sizeof(T));
    return swapped;
}
|
||
// Reads sizeof(T) bytes from `ifs` into a T and returns it.
//
// ifs   stream to read from; its read position advances by sizeof(T).
// swap  when true, the bytes of the result are reversed with swap_endian.
//
// Throws std::runtime_error when the stream cannot supply sizeof(T) bytes.
template <typename T>
inline auto read_sas_data(std::ifstream& ifs ,bool swap) -> T
{
    T data{};
    if (!ifs.read(reinterpret_cast<char*>(&data), sizeof(T)))
    {
        // The original threw with an empty message, which made the failure
        // impossible to identify from the exception alone.
        throw std::runtime_error("parse sas error: failed to read from stream");
    }
    if (swap)
    {
        data = swap_endian(data);
    }
    return data;
}
|
||
// Reads sizeof(T) bytes starting at `memory_it` into a T and returns it.
//
// memory_it  position in an in-memory buffer; advanced by sizeof(T).
// swap       when true, the bytes of the result are reversed with swap_endian.
//
// No bounds checking is possible here: the caller must guarantee that at
// least sizeof(T) bytes are available from `memory_it`.
template <typename T>
inline auto read_sas_data(std::string::iterator& memory_it, bool swap) -> T
{
    T data{};
    // Copy straight from the buffer; the original built a temporary
    // std::string for every value read.
    std::copy_n(memory_it, sizeof(T), reinterpret_cast<char*>(&data));
    memory_it += sizeof(T);
    if (swap)
    {
        data = swap_endian(data);
    }
    return data;
}
std::vector<std::string> xsas7bdat_reader::parse_meta() | ||
{ | ||
parse_head(); | ||
auto page_header_size = m_u64 ? sas_page_header_size_64bit : sas_page_header_size_32bit; | ||
auto subheader_pointer_size = m_u64 ? sas_subheader_pointer_size_64bit : sas_subheader_pointer_size_32bit; | ||
auto subheader_signature_size = m_u64 ? 8 : 4; | ||
|
||
for (decltype(m_page_count) idx = 0; idx < m_page_count; idx++) | ||
{ | ||
auto page_offset = m_header_size + idx * m_page_size; | ||
if (!m_sas_ifs.seekg(page_offset, m_sas_ifs.beg)) | ||
throw std::runtime_error("parse sas error"); | ||
|
||
std::string page_memory; | ||
if (!m_sas_ifs.read(&page_memory[0], m_page_size)) | ||
throw std::runtime_error("parse sas error"); | ||
|
||
parse_page(page_memory); | ||
} | ||
|
||
} | ||
// template <typename T> | ||
// inline void seekg(std::ifstream& ifs, std::ios_base::seekdir& seekdir) | ||
// { | ||
// ifs.seekg(sizeof(T), seekdir); | ||
// return; | ||
// } | ||
|
||
// inline void seekg(std::ifstream& ifs, size_t pos, std::ios_base::seekdir& seekdir) | ||
// { | ||
// ifs.seekg(pos, seekdir); | ||
// return; | ||
// } | ||
|
||
// template <typename T> | ||
// inline void seekg(std::string::iterator& memory_it) | ||
// { | ||
// memory_it += sizeof(T); | ||
// return; | ||
// } | ||
|
||
// Moves `memory_it` forward by `pos` positions.
inline void seekg(std::string::iterator& memory_it, size_t pos)
{
    memory_it += static_cast<std::string::difference_type>(pos);
}
|
||
void xsas7bdat_reader::parse_head() | ||
{ | ||
#pragma pack(push, 1) | ||
struct sas_header_begin | ||
{ | ||
unsigned char magic_number[32]; | ||
unsigned char a2; | ||
unsigned char mystery1[2]; | ||
unsigned char a1; | ||
unsigned char mystery2[1]; | ||
unsigned char endian; | ||
unsigned char mystery3[1]; | ||
char file_format; | ||
unsigned char mystery4[30]; | ||
unsigned char encoding; | ||
unsigned char mystery5[13]; | ||
char file_type[8]; | ||
char file_label[64]; | ||
char file_info[8]; | ||
}; | ||
#pragma pack(pop) | ||
|
||
auto header_begin = read_sas_data<sas_header_begin>(m_sas_ifs, false); | ||
if (std::memcmp(header_begin.magic_number, sas7bdat_magic_number, sizeof(sas7bdat_magic_number)) != 0) | ||
throw std::runtime_error("error"); | ||
auto a1 = 0; | ||
if (header_begin.a1 == sas_aligment_offset_4) | ||
a1 = 4; | ||
if (header_begin.a2 == sas_aligment_offset_4) | ||
m_u64 = true; | ||
m_swap = false; | ||
if (header_begin.endian == sas_endian_big) | ||
m_swap = little_endian(); | ||
else if (header_begin.endian == sas_endian_little) | ||
m_swap = !little_endian(); | ||
else | ||
throw std::runtime_error("parse sas error"); | ||
if (!m_sas_ifs.seekg(a1 + sizeof(double) * 2 + 16, m_sas_ifs.cur)) | ||
throw std::runtime_error("parse sas error"); | ||
|
||
m_header_size = read_sas_data<uint32_t>(m_sas_ifs, m_swap); | ||
m_page_size = read_sas_data<uint32_t>(m_sas_ifs, m_swap); | ||
if (m_header_size < 1024 || m_page_size < 1024) | ||
throw std::runtime_error(""); | ||
if (m_header_size > (1 << 20) || m_page_size > (1 << 24)) | ||
throw std::runtime_error(""); | ||
|
||
if (m_u64) | ||
m_page_count = read_sas_data<uint64_t>(m_sas_ifs, m_swap); | ||
else | ||
m_page_count = read_sas_data<uint32_t>(m_sas_ifs, m_swap); | ||
if (m_page_count > (1 << 24)) | ||
throw std::runtime_error(""); | ||
} | ||
|
||
void xsas7bdat_reader::parse_page(std::string& page_memory) | ||
{ | ||
auto it = page_memory.begin(); | ||
auto signature = read_sas_data<uint32_t>(it, m_swap); | ||
m_u64 ? seekg(it, 12) : seekg(it, 28); | ||
|
||
auto page_type = read_sas_data<uint16_t>(it, m_swap); | ||
//TODO | ||
// if ((page_type & sas_page_type_mask) == sas_page_type_data) | ||
// break; | ||
// if ((page_type & sas_page_type_comp)) | ||
// continue; | ||
auto data_block_count = read_sas_data<uint16_t>(it, m_swap); | ||
auto subheader_pointers_count = read_sas_data<uint16_t>(it, m_swap); | ||
seekg(it, 2); | ||
auto subheader_pointer_size = m_u64 ? sas_subheader_pointer_size_64bit : sas_subheader_pointer_size_32bit; | ||
for (auto idx = 0; idx < subheader_pointers_count; idx++) | ||
{ | ||
auto it = it + subheader_pointer_size; | ||
auto ret = parse_subheader_pointer(it); | ||
auto subheader_it = page_memory.begin() + std::get<0>(ret); | ||
if (signature == sas_subheader_signature_row_size) { | ||
if (std::get<1>(ret) < (m_u64 ? 128 : 64)) | ||
throw std::runtime_error(""); | ||
parse_row_size_subheader(subheader_it); | ||
} else if (signature == sas_subheader_signature_column_size) { | ||
if (std::get<1>(ret) < (m_u64 ? 16 : 8)) | ||
throw std::runtime_error(""); | ||
parse_column_size_subheader(subheader_it); | ||
} else if (signature == sas_subheader_signature_counts) { | ||
/* void */ | ||
} else if (signature == sas_subheader_signature_column_text) { | ||
parse_column_text_subheader(subheader_it); | ||
} else if (signature == sas_subheader_signature_column_name) { | ||
parse_column_name_subheader(subheader_it, std::get<1>(ret)); | ||
} else if (signature == sas_subheader_signature_column_attrs) { | ||
parse_column_attributes_subheader(subheader_it); | ||
} else if (signature == sas_subheader_signature_column_format) { | ||
parse_column_format_subheader(subheader_it); | ||
} else if (signature == sas_subheader_signature_column_list) { | ||
/* void */ | ||
} else if ((signature & sas_subheader_signature_column_mask) == sas_subheader_signature_column_mask) { | ||
/* void */ | ||
} else { | ||
/* void */ | ||
} | ||
} | ||
} | ||
|
||
std::tuple<uint64_t, uint64_t, uint8_t, uint8_t> xsas7bdat_reader::parse_subheader_pointer() | ||
{ | ||
auto offset_to_subhead = 0; | ||
auto length = 0; | ||
auto compression = 0; | ||
auto subheader_type = 0; | ||
if (m_u64) | ||
{ | ||
offset_to_subhead = read_sas_data<uint64_t>(it, m_swap); | ||
length = read_sas_data<uint64_t>(it, m_swap); | ||
compression = read_sas_data<uint8_t>(it, m_swap); | ||
subheader_type = read_sas_data<uint8_t>(it, m_swap); | ||
seekg(it, 7); | ||
} | ||
else | ||
{ | ||
offset_to_subhead = read_sas_data<uint32_t>(it, m_swap); | ||
length = read_sas_data<uint32_t>(it, m_swap); | ||
compression = read_sas_data<uint8_t>(it, m_swap); | ||
subheader_type = read_sas_data<uint8_t>(it, m_swap); | ||
seekg(it, 3); | ||
} | ||
return std::make_tuple(offset_to_subhead, length, compression, subheader_type); | ||
} | ||
|
||
void xsas7bdat_reader::parse_row_size_subheader(std::string::iterator& it) | ||
{ | ||
m_u64 ? seekg(it, 40) : seekg(it, 20); | ||
m_row_length = m_u64 ? read_sas_data<uint64_t>(it, m_swap) : read_sas_data<uint32_t>(it, m_swap); | ||
m_total_row_count = m_u64 ? read_sas_data<uint64_t>(it, m_swap) : read_sas_data<uint32_t>(it, m_swap); | ||
m_u64 ? seekg(it, 72) : seekg(it, 36); | ||
m_max_row_count = m_u64 ? read_sas_data<uint64_t>(it, m_swap) : read_sas_data<uint32_t>(it, m_swap); | ||
} | ||
|
||
void xsas7bdat_reader::parse_column_size_subheader(std::string::iterator& it) | ||
{ | ||
m_col_count = m_u64 ? read_sas_data<uint64_t>(it, m_swap) : read_sas_data<uint32_t>(it, m_swap); | ||
} | ||
// Placeholder for parsing a column-text subheader.
//
// The first draft's body was a verbatim copy of parse_column_size_subheader
// and overwrote m_col_count with the leading bytes of this subheader, which
// corrupted the column count whenever a column-text subheader was seen. The
// reader has no member to store column text yet, so nothing is read here.
// TODO: decode the text block once storage for it exists.
inline void xsas7bdat_reader::parse_column_text_subheader(std::string::iterator& it)
{
    (void)it;
}
|
||
void xsas7bdat_reader::parse_column_name_subheader(std::string::iterator& it, uint64_t length) | ||
{ | ||
int cmax = m_u64 ? (length - 28) / 8 : (length - 20) / 8; | ||
readstat_error_t retval = READSTAT_OK; | ||
} | ||
|
||
// void xsas7bdat_reader::parse_column_attrs_subheader() | ||
// { | ||
// } | ||
|
||
// void xsas7bdat_reader::parse_column_format_subheader() | ||
// { | ||
|
||
// } | ||
|
||
|
||
} | ||
// inline xf::xvariable<xtl::any, xf::xcoordinate<xf::fstring>> xf::read_sas(const std::ifstream& ifs, const sas_format& format = sas_format::sas7bdata) | ||
// { | ||
// if (format == sas_format::sas7bdata) | ||
// { | ||
// detail::xsas7bdat_reader reader(ifs); | ||
// return reader.parse(); | ||
// } | ||
// else | ||
// { | ||
|
||
// } | ||
// return xf::xvariable<xtl::any, xf::xcoordinate<xf::fstring>(); | ||
// } | ||
} | ||
|
||
#endif |