From 7a90b2548add8c7fa45e4dd96ea6ee6af35a2365 Mon Sep 17 00:00:00 2001 From: visuve Date: Thu, 11 Apr 2024 21:06:57 +0300 Subject: [PATCH] MAJOR CHANGE: USE MEMORY MAPPED FILE - This just makes everything so much more simpler - The performance seems to have improved a bit - However, it's a file searcher hence pretty much everything is IO bound... --- PystykorvaCLI/Main.cpp | 8 +- PystykorvaLib/BufferedStream.cpp | 65 ---------- PystykorvaLib/BufferedStream.hpp | 28 ----- PystykorvaLib/MemoryMappedFile.cpp | 158 ++++++++++++++++++++++++ PystykorvaLib/MemoryMappedFile.hpp | 21 ++++ PystykorvaLib/Pystykorva.cpp | 1 - PystykorvaLib/Pystykorva.hpp | 8 +- PystykorvaLib/PystykorvaLib.vcxproj | 4 +- PystykorvaLib/TextProcessor.cpp | 97 +++++---------- PystykorvaLib/TextProcessor.hpp | 3 +- PystykorvaLib/UnicodeConverter.hpp | 2 +- PystykorvaTests/BufferedStreamTests.cpp | 28 ----- PystykorvaTests/PystykorvaTests.vcxproj | 1 - PystykorvaTests/TextProcessorTests.cpp | 56 ++++++--- 14 files changed, 267 insertions(+), 213 deletions(-) delete mode 100644 PystykorvaLib/BufferedStream.cpp delete mode 100644 PystykorvaLib/BufferedStream.hpp create mode 100644 PystykorvaLib/MemoryMappedFile.cpp create mode 100644 PystykorvaLib/MemoryMappedFile.hpp delete mode 100644 PystykorvaTests/BufferedStreamTests.cpp diff --git a/PystykorvaCLI/Main.cpp b/PystykorvaCLI/Main.cpp index 85b1215..bb44b70 100644 --- a/PystykorvaCLI/Main.cpp +++ b/PystykorvaCLI/Main.cpp @@ -26,9 +26,6 @@ Pystykorva::Options Deserialize(const CmdArgs& args) options.MinimumTime = args.Value("mintime"); options.MaximumTime = args.Value("maxtime"); - // 64 kib should be decent for most text files - options.BufferSize = args.Value("buffersize"); - // On my 16 core CPU, harware_concurrency returns 32, which is fine as I have SMT options.MaximumThreads = args.Value("maxthreads"); @@ -131,8 +128,7 @@ Console& operator << (Console& stream, const Pystykorva::Match& result) std::mutex _mutex; -void ReportProcessing( - const std::filesystem::path& path) +void ReportProcessing(const std::filesystem::path& path) { _CRT_UNUSED(path); #if _DEBUG @@ -174,6 +170,8 @@ void ReportResults( void ReportFinished(std::chrono::milliseconds ms) { + std::lock_guard guard(_mutex); + Cout << "\nPystykorva finished!\n\n"; Cout << "Statistics:\n"; Cout << "\tTook: " << std::format("{:%T}\n", ms); diff --git a/PystykorvaLib/BufferedStream.cpp b/PystykorvaLib/BufferedStream.cpp deleted file mode 100644 index 09eb30d..0000000 --- a/PystykorvaLib/BufferedStream.cpp +++ /dev/null @@ -1,65 +0,0 @@ -#include "PCH.hpp" -#include "BufferedStream.hpp" - -BufferedStream::BufferedStream( - std::istream& input, - size_t bufferSize, - uint64_t streamSize) : - _input(input.rdbuf()), - _streamSize(streamSize), - _bytesRead(0) -{ - assert(input); - assert(bufferSize > 0); - assert(streamSize > 0); - - if (streamSize < bufferSize) - { - bufferSize = streamSize; - } - - _buffer.resize(bufferSize); -} - -BufferedStream::~BufferedStream() -{ -} - -bool BufferedStream::HasData() const -{ - return _streamSize > 0; -} - -bool BufferedStream::Read() -{ - if (_streamSize == 0) - { - return false; - } - - std::streamsize extracted = _input->sgetn(_buffer.data(), _buffer.size()); - - if (extracted <= 0) - { - _streamSize = 0; - return false; - } - - size_t extractedReal = static_cast(extracted); - - if (extractedReal < _buffer.size()) - { - // This should happen only once, i.e. when the last chunk is read - _buffer.resize(extractedReal); - } - - _bytesRead += extractedReal; - _streamSize -= extractedReal; - - return true; -} - -std::string_view BufferedStream::Data() const -{ - return _buffer; -} diff --git a/PystykorvaLib/BufferedStream.hpp b/PystykorvaLib/BufferedStream.hpp deleted file mode 100644 index 39407d9..0000000 --- a/PystykorvaLib/BufferedStream.hpp +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -class BufferedStream -{ -public: - BufferedStream( - std::istream& input, - size_t bufferSize, - uint64_t streamSize); - - BufferedStream(const BufferedStream&) = delete; - BufferedStream(BufferedStream&&) = delete; - BufferedStream& operator = (const BufferedStream&) = delete; - BufferedStream& operator = (BufferedStream&&) = delete; - - ~BufferedStream(); - - bool HasData() const; - bool Read(); - std::string_view Data() const; - -private: - std::streambuf* _input; - uint64_t _streamSize; - std::string _buffer; - uint64_t _bytesRead; -}; - diff --git a/PystykorvaLib/MemoryMappedFile.cpp b/PystykorvaLib/MemoryMappedFile.cpp new file mode 100644 index 0000000..8934adf --- /dev/null +++ b/PystykorvaLib/MemoryMappedFile.cpp @@ -0,0 +1,158 @@ +#include "PCH.hpp" +#include "MemoryMappedFile.hpp" + +#ifdef _WIN32 + +#define NOMINMAX +#include + +class MemoryMappedFileImpl +{ +public: + MemoryMappedFileImpl(const std::filesystem::path& path, uint64_t fileSize) : + _file(CreateFileW( + path.c_str(), + GENERIC_READ | GENERIC_WRITE, + 0, + nullptr, + OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, + NULL)) + { + if (!_file || _file == INVALID_HANDLE_VALUE) + { + throw std::system_error(GetLastError(), std::system_category(), "CreateFileW"); + } + + LARGE_INTEGER mappingSize; + mappingSize.QuadPart = fileSize; + + _mapping = CreateFileMappingW( + _file, + nullptr, + PAGE_READWRITE, + mappingSize.HighPart, + mappingSize.LowPart, + nullptr); + + if (!_mapping) + { + throw std::system_error(GetLastError(), std::system_category(), "CreateFileMappingW"); + } + + _view = MapViewOfFile(_mapping, FILE_MAP_ALL_ACCESS, 0, 0, fileSize); + + if (!_view) + { + throw std::system_error(GetLastError(), std::system_category(), "MapViewOfFile"); + } + + _size = fileSize; + } + + ~MemoryMappedFileImpl() + { + if (_view) + { + UnmapViewOfFile(_view); + } + + if (_mapping) + { + CloseHandle(_mapping); + } + + if (_file) + { + CloseHandle(_file); + } + } + + std::string_view Data() const + { + return { reinterpret_cast(_view), _size }; + } + + std::string_view Sample(size_t size) const + { + return { reinterpret_cast(_view), std::min(size, _size) }; + } + +private: + HANDLE _file = nullptr; + HANDLE _mapping = nullptr; + void* _view = nullptr; + uint64_t _size = 0; +}; +#else + +#include +#include +#include +#include + +class MemoryMappedFileImpl +{ +public: + MemoryMappedFileImpl(const std::filesystem::path& path, uint64_t fileSize) : + _descriptor(open(path.c_str(), O_RDONLY)) + { + if (_descriptor == -1) + { + throw std::system_error(errno, std::system_category(), "open"); + } + + _view = mmap(nullptr, _size, PROT_READ, MAP_PRIVATE, _descriptor, 0); + + if (_view == MAP_FAILED) + { + throw std::system_error(errno, std::system_category(), "mmap"); + } + + _size = fileSize; + } + + ~MemoryMappedFileImpl() + { + if (_descriptor) + { + ::close(_descriptor); + } + } + + std::string_view Sample(size_t size) + { + return { reinterpret_cast(_view), std::min(_size, size) }; + } + + std::string_view Data() const + { + return { reinterpret_cast(_view), _size }; + } + +private: + int _descriptor = 0; + void* _view = nullptr; + uint64_t _size = 0; +}; +#endif + +MemoryMappedFile::MemoryMappedFile(const std::filesystem::path& path, uint64_t fileSize) : + _impl(new MemoryMappedFileImpl(path, fileSize)) +{ +} + +MemoryMappedFile::~MemoryMappedFile() +{ + delete _impl; +} + +std::string_view MemoryMappedFile::Sample(size_t size) const +{ + return _impl->Sample(size); +} + +std::string_view MemoryMappedFile::Data() const +{ + return _impl->Data(); +} \ No newline at end of file diff --git a/PystykorvaLib/MemoryMappedFile.hpp b/PystykorvaLib/MemoryMappedFile.hpp new file mode 100644 index 0000000..371829a --- /dev/null +++ b/PystykorvaLib/MemoryMappedFile.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include "Pystykorva.hpp" + +class MemoryMappedFileImpl; + +// This interface has no other meaning than to allow testing + +class MemoryMappedFile : public Pystykorva::IFile +{ +public: + MemoryMappedFile(const std::filesystem::path&, uint64_t); + ~MemoryMappedFile(); + + std::string_view Sample(size_t size = 0x400) const override; + std::string_view Data() const override; + +private: + MemoryMappedFileImpl* _impl; +}; + diff --git a/PystykorvaLib/Pystykorva.cpp b/PystykorvaLib/Pystykorva.cpp index af70420..3e0c6a9 100644 --- a/PystykorvaLib/Pystykorva.cpp +++ b/PystykorvaLib/Pystykorva.cpp @@ -8,7 +8,6 @@ Pystykorva::Pystykorva(const Options& options, const Callbacks& callbacks) : _callbacks(callbacks), _rdi(options.Directory) { - assert(_options.BufferSize > _options.SearchExpression.size()); } Pystykorva::~Pystykorva() diff --git a/PystykorvaLib/Pystykorva.hpp b/PystykorvaLib/Pystykorva.hpp index d52f7d0..169ffbd 100644 --- a/PystykorvaLib/Pystykorva.hpp +++ b/PystykorvaLib/Pystykorva.hpp @@ -28,12 +28,16 @@ class Pystykorva std::chrono::time_point MinimumTime; std::chrono::time_point MaximumTime; - uint32_t BufferSize = 0; - // Zero will default to std::thread::hardware_concurrency or 1 uint32_t MaximumThreads = 0; }; + struct IFile + { + virtual std::string_view Sample(size_t size = 0x400) const = 0; + virtual std::string_view Data() const = 0; + }; + enum Status : uint32_t { Ok = 0x00, diff --git a/PystykorvaLib/PystykorvaLib.vcxproj b/PystykorvaLib/PystykorvaLib.vcxproj index 78f7ac2..0c203b3 100644 --- a/PystykorvaLib/PystykorvaLib.vcxproj +++ b/PystykorvaLib/PystykorvaLib.vcxproj @@ -14,7 +14,7 @@ - + @@ -27,7 +27,7 @@ - + diff --git a/PystykorvaLib/TextProcessor.cpp b/PystykorvaLib/TextProcessor.cpp index e275756..5d52347 100644 --- a/PystykorvaLib/TextProcessor.cpp +++ b/PystykorvaLib/TextProcessor.cpp @@ -1,6 +1,7 @@ #include "PCH.hpp" #include "TextProcessor.hpp" #include "Wildcard.hpp" +#include "MemoryMappedFile.hpp" TextProcessor::TextProcessor(std::stop_token token, const Pystykorva::Options& options) : _token(token), @@ -62,10 +63,9 @@ Pystykorva::Result TextProcessor::ProcessFile(const std::filesystem::path& path) return result; } - std::ifstream file(path, std::ios::binary); - BufferedStream stream(file, _options.BufferSize, fileSize); + MemoryMappedFile file(path, fileSize); - FindAll(stream, result.Matches, result.Encoding); + FindAll(file, result.Matches, result.Encoding); } catch (const EncodingException&) { @@ -91,82 +91,53 @@ Pystykorva::Result TextProcessor::ProcessFile(const std::filesystem::path& path) return result; } -void TextProcessor::FindAll(BufferedStream& stream, std::vector& matches, Pystykorva::EncodingGuess& encoding) +void TextProcessor::FindAll(Pystykorva::IFile& file, std::vector& matches, Pystykorva::EncodingGuess& encoding) { - std::unique_ptr converter; + if (!_encodingDetector.DetectEncoding(file.Sample(), encoding)) + { + throw EncodingException("Failed to detect encoding"); + } + UnicodeConverter converter(encoding.Name); + uint8_t charSize = converter.CharSize(); uint32_t lineNumber = 0; - uint64_t offset = 0; - uint8_t charSize = 0; - while (!_token.stop_requested()) + if (_token.stop_requested()) return; + + converter.Convert(file.Data()); + + if (_token.stop_requested()) return; + + auto boundaries = _lineAnalyzer.Boundaries(converter.Data()); + + for (Pystykorva::Position& boundary : boundaries) { - if (!stream.Read()) + if (_token.stop_requested()) return; + + // Check if the boundary is "incomplete" + if (boundary.End == Pystykorva::Position::Unknown) { - return; + boundary.End = converter.End(); } - if (!converter) - { - if (!_encodingDetector.DetectEncoding(stream.Data(), encoding)) - { - throw EncodingException("Failed to detect encoding"); - } + // Do not increment on "incomplete" lines + ++lineNumber; - converter = std::make_unique(encoding.Name); - charSize = converter->CharSize(); - } + std::u16string_view line = + converter.View(boundary.Begin, boundary.End); - // NOTE: the converter's back buffer might grow larger than defined in the options - converter->Convert(stream.Data(), stream.HasData() == false); + Pystykorva::Match match = ProcessLine(offset, lineNumber, line); - auto boundaries = _lineAnalyzer.Boundaries(converter->Data()); + // I smell problems here with variable-length character encoding... + offset += line.size() * charSize; - for (Pystykorva::Position& boundary : boundaries) + if (match.Positions.empty()) { - // Check if the boundary is "incomplete" - if (boundary.End == Pystykorva::Position::Unknown) - { - if (stream.HasData()) - { - // Fetch more data by breaking out of the loop - break; - } - else - { - // There is no more data: - // force the boundary end to the data - boundary.End = converter->End(); - } - } - - // Do not increment on "incomplete" lines - ++lineNumber; - - std::u16string_view line = - converter->View(boundary.Begin, boundary.End); - - Pystykorva::Match match = ProcessLine(offset, lineNumber, line); - - // I smell problems here with variable-length character encoding... - offset += line.size() * charSize; - - if (match.Positions.empty()) - { - continue; - } - - matches.emplace_back(match); + continue; } - if (!boundaries.empty() && stream.HasData()) - { - // Erase data which has already been analyzed - // Note: there is no point in erasing the data - // if the stream is already at the end - converter->Erase(boundaries.back().Begin); - } + matches.emplace_back(match); } } diff --git a/PystykorvaLib/TextProcessor.hpp b/PystykorvaLib/TextProcessor.hpp index abe3306..dc20501 100644 --- a/PystykorvaLib/TextProcessor.hpp +++ b/PystykorvaLib/TextProcessor.hpp @@ -5,7 +5,6 @@ #include "UnicodeConverter.hpp" #include "LineAnalyzer.hpp" #include "TextSearcher.hpp" -#include "BufferedStream.hpp" class TextProcessor { @@ -14,7 +13,7 @@ class TextProcessor ~TextProcessor(); Pystykorva::Result ProcessFile(const std::filesystem::path&); - void FindAll(BufferedStream& stream, std::vector& matches, Pystykorva::EncodingGuess& encoding); + void FindAll(Pystykorva::IFile& file, std::vector& matches, Pystykorva::EncodingGuess& encoding); Pystykorva::Match ProcessLine(uint64_t offset, uint32_t lineNumber, std::u16string_view line); private: diff --git a/PystykorvaLib/UnicodeConverter.hpp b/PystykorvaLib/UnicodeConverter.hpp index 361c31e..cae10f6 100644 --- a/PystykorvaLib/UnicodeConverter.hpp +++ b/PystykorvaLib/UnicodeConverter.hpp @@ -17,7 +17,7 @@ class UnicodeConverter ~UnicodeConverter(); uint8_t CharSize() const; - void Convert(std::string_view sample, bool flush); + void Convert(std::string_view sample, bool flush = true); std::u16string_view Data() const; size_t End() const; std::u16string_view View(size_t from, size_t to) const; diff --git a/PystykorvaTests/BufferedStreamTests.cpp b/PystykorvaTests/BufferedStreamTests.cpp deleted file mode 100644 index 188ccbd..0000000 --- a/PystykorvaTests/BufferedStreamTests.cpp +++ /dev/null @@ -1,28 +0,0 @@ -#include "PCH.hpp" -#include "BufferedStream.hpp" - -TEST(StreamTests, ReadSimple) -{ - { - std::istringstream iss("foobar"); - BufferedStream stream(iss, 3, 6); - - EXPECT_TRUE(stream.Read()); - EXPECT_TRUE(stream.Data() == "foo"); - EXPECT_TRUE(stream.Read()); - EXPECT_TRUE(stream.Data() == "bar"); - EXPECT_FALSE(stream.HasData()); - EXPECT_FALSE(stream.Read()); - } - { - std::istringstream iss("foobar"); - BufferedStream stream(iss, 4, 6); - - EXPECT_TRUE(stream.Read()); - EXPECT_TRUE(stream.Data() == "foob"); - EXPECT_TRUE(stream.Read()); - EXPECT_TRUE(stream.Data() == "ar"); - EXPECT_FALSE(stream.HasData()); - EXPECT_FALSE(stream.Read()); - } -} \ No newline at end of file diff --git a/PystykorvaTests/PystykorvaTests.vcxproj b/PystykorvaTests/PystykorvaTests.vcxproj index 866b805..bcf8087 100644 --- a/PystykorvaTests/PystykorvaTests.vcxproj +++ b/PystykorvaTests/PystykorvaTests.vcxproj @@ -28,7 +28,6 @@ Create - diff --git a/PystykorvaTests/TextProcessorTests.cpp b/PystykorvaTests/TextProcessorTests.cpp index 83d9cf2..de3cf96 100644 --- a/PystykorvaTests/TextProcessorTests.cpp +++ b/PystykorvaTests/TextProcessorTests.cpp @@ -1,6 +1,43 @@ #include "PCH.hpp" #include "TextProcessor.hpp" +class FakeFile : public Pystykorva::IFile +{ +public: + template + FakeFile(T(&data)[N]) + { + _size = sizeof(T) * (N - 1); // Exclude the trailing nulls + _data = static_cast(malloc(_size)); + + if (_data) + { + memcpy(_data, data, _size); + } + } + + ~FakeFile() + { + if (_data) + { + free(_data); + } + } + + std::string_view Sample(size_t size = 0x400) const override + { + return { _data, std::min(size, _size) }; + } + + std::string_view Data() const override + { + return { _data, _size }; + } +private: + char* _data = nullptr; + size_t _size = 0; +}; + TEST(TextProcessorTests, RegexSearchUTF8) { std::stop_token token; @@ -8,20 +45,13 @@ TEST(TextProcessorTests, RegexSearchUTF8) Pystykorva::Options options = {}; options.Mode = Pystykorva::MatchMode::RegexCaseInsensitive; options.SearchExpression = u"\\w+"; - options.BufferSize = 3; TextProcessor processor(token, options); - // I do not understand why this only works with UTF-16 _BE_ BOM... - std::istringstream iss({ reinterpret_cast(u8"\uFEFFAAAA\nBBB\nCC"), 14 }); - - // 0 1 2 3 4 5 6 7 8 9 10 11 - // x A A A A \n B B B \n C C - - BufferedStream stream(iss, 3, 14); + FakeFile file(u8"\uFEFFAAAA\nBBB\nCC"); std::vector matches; Pystykorva::EncodingGuess encoding; - processor.FindAll(stream, matches, encoding); + processor.FindAll(file, matches, encoding); EXPECT_STREQ(encoding.Name.c_str(), "UTF-8"); EXPECT_EQ(encoding.Confidence, 100); @@ -38,7 +68,6 @@ TEST(TextProcessorTests, RegexSearchUTF8) EXPECT_TRUE(matches[2].LineContent == u"CC"); ASSERT_EQ(matches[2].Positions.size(), 1); EXPECT_EQ(matches[2].Positions[0], Pystykorva::RelAbsPosPair(0, 2, 10, 12)); - } TEST(TextProcessorTests, RegexSearchUTF16LE) @@ -48,18 +77,15 @@ TEST(TextProcessorTests, RegexSearchUTF16LE) Pystykorva::Options options = {}; options.Mode = Pystykorva::MatchMode::RegexCaseInsensitive; options.SearchExpression = u"\\w+"; - options.BufferSize = 3; TextProcessor processor(token, options); // I do not understand why this only works with UTF-16 _BE_ BOM... - std::istringstream iss({ reinterpret_cast(u"\uFEFFAAAA\nBBB\nCC"), 24 }); - // It is simply impossible to use buffer sizes, which are not divisible by two when the source data is unicode - BufferedStream stream(iss, 3, 24); + FakeFile file(u"\uFEFFAAAA\nBBB\nCC"); std::vector matches; Pystykorva::EncodingGuess encoding; - processor.FindAll(stream, matches, encoding); + processor.FindAll(file, matches, encoding); EXPECT_STREQ(encoding.Name.c_str(), "UTF-16LE"); EXPECT_EQ(encoding.Confidence, 100); ASSERT_EQ(matches.size(), 3);