diff --git a/PystykorvaLib/MemoryMappedFile.cpp b/PystykorvaLib/MemoryMappedFile.cpp index 0fdc95a..67f51fd 100644 --- a/PystykorvaLib/MemoryMappedFile.cpp +++ b/PystykorvaLib/MemoryMappedFile.cpp @@ -14,15 +14,15 @@ IOException::IOException(const std::string& message) : class MemoryMappedFileImpl { public: - MemoryMappedFileImpl(const std::filesystem::path& path, uint64_t fileSize) : + MemoryMappedFileImpl(const std::filesystem::path& path, uint64_t fileSize, bool readOnly) : _file(CreateFileW( path.c_str(), - GENERIC_READ | GENERIC_WRITE, + readOnly ? GENERIC_READ : GENERIC_READ | GENERIC_WRITE, 0, nullptr, - OPEN_EXISTING, + readOnly ? OPEN_EXISTING : CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, - NULL)) + nullptr)) { if (!_file || _file == INVALID_HANDLE_VALUE) { @@ -35,7 +35,7 @@ class MemoryMappedFileImpl _mapping = CreateFileMappingW( _file, nullptr, - PAGE_READWRITE, + readOnly ? PAGE_READONLY : PAGE_READWRITE, mappingSize.HighPart, mappingSize.LowPart, nullptr); @@ -45,7 +45,12 @@ class MemoryMappedFileImpl throw IOException("CreateFileMappingW"); } - _view = MapViewOfFile(_mapping, FILE_MAP_ALL_ACCESS, 0, 0, fileSize); + _view = MapViewOfFile( + _mapping, + readOnly ? FILE_MAP_READ : FILE_MAP_ALL_ACCESS, + 0, + 0, + fileSize); if (!_view) { @@ -74,16 +79,7 @@ class MemoryMappedFileImpl } NonCopyable(MemoryMappedFileImpl); - - std::string_view Data() const - { - return { reinterpret_cast(_view), _size }; - } - - std::string_view Sample(size_t size) const - { - return { reinterpret_cast(_view), std::min(size, _size) }; - } + friend class MemoryMappedFile; private: HANDLE _file = nullptr; @@ -106,15 +102,21 @@ IOException::IOException(const std::string& message) : class MemoryMappedFileImpl { public: - MemoryMappedFileImpl(const std::filesystem::path& path, uint64_t fileSize) : - _descriptor(open(path.c_str(), O_RDONLY)) + MemoryMappedFileImpl(const std::filesystem::path& path, uint64_t fileSize, bool readOnly) : + _descriptor(open(path.c_str(), readOnly ? 
O_RDONLY : O_RDWR)) { if (_descriptor == -1) { throw IOException("open"); } - _view = mmap(nullptr, _size, PROT_READ, MAP_PRIVATE, _descriptor, 0); + _view = mmap( + nullptr, + _size, + readOnly ? PROT_READ : PROT_READ | PROT_WRITE, + readOnly ? MAP_PRIVATE : MAP_SHARED, /* a private mapping is copy-on-write, so writes through it would never reach the file */ + _descriptor, + 0); if (_view == MAP_FAILED) { @@ -138,17 +140,7 @@ class MemoryMappedFileImpl } NonCopyable(MemoryMappedFileImpl); - - std::string_view Sample(size_t size) - { - return { reinterpret_cast(_view), std::min(_size, size) }; - } - - std::string_view Data() const - { - return { reinterpret_cast(_view), _size }; - } - + friend class MemoryMappedFile; private: int _descriptor = 0; void* _view = nullptr; @@ -156,8 +148,8 @@ class MemoryMappedFileImpl }; #endif -MemoryMappedFile::MemoryMappedFile(const std::filesystem::path& path, uint64_t fileSize) : - _impl(new MemoryMappedFileImpl(path, fileSize)) +MemoryMappedFile::MemoryMappedFile(const std::filesystem::path& path, uint64_t fileSize, bool readOnly) : + _impl(new MemoryMappedFileImpl(path, fileSize, readOnly)) { } @@ -166,12 +158,58 @@ MemoryMappedFile::~MemoryMappedFile() delete _impl; } +uint64_t MemoryMappedFile::Size() const +{ + return _impl->_size; +} + std::string_view MemoryMappedFile::Sample(size_t size) const { - return _impl->Sample(size); + size = std::min(_impl->_size, size); + return { reinterpret_cast(_impl->_view), size }; +} + +std::string_view MemoryMappedFile::Chunk(uint64_t offset, uint64_t size) const +{ + if (offset > _impl->_size || size > _impl->_size - offset) /* same bound as offset + size > _size, but cannot wrap around in uint64_t */ + { + throw std::out_of_range("chunk out of bounds"); + } + + return { reinterpret_cast(_impl->_view) + offset, size }; } std::string_view MemoryMappedFile::Data() const { - return _impl->Data(); + return { reinterpret_cast(_impl->_view), _impl->_size }; +} + +void MemoryMappedFile::Read(void* data, uint64_t size) +{ + if (size > _impl->_size - _offset) /* wrap-free form of _offset + size > _size; _offset only advances after this check passes, so it never exceeds _size */ + { + throw std::out_of_range("read out of bounds"); + } + + for (uint64_t offset = 0; offset < size; ++offset, ++_offset) + { + auto source = 
reinterpret_cast(_impl->_view) + _offset; + auto target = reinterpret_cast(data) + offset; + *target = *source; + } } + +void MemoryMappedFile::Write(const void* data, uint64_t size) +{ + if (size > _impl->_size - _offset) /* wrap-free form of _offset + size > _size; _offset only advances after this check passes, so it never exceeds _size */ + { + throw std::out_of_range("write out of bounds"); + } + + for (uint64_t offset = 0; offset < size; ++offset, ++_offset) + { + auto source = reinterpret_cast(data) + offset; + auto target = reinterpret_cast(_impl->_view) + _offset; + *target = *source; + } +} \ No newline at end of file diff --git a/PystykorvaLib/MemoryMappedFile.hpp b/PystykorvaLib/MemoryMappedFile.hpp index a290015..f88cb9a 100644 --- a/PystykorvaLib/MemoryMappedFile.hpp +++ b/PystykorvaLib/MemoryMappedFile.hpp @@ -13,14 +13,19 @@ struct IOException : std::system_error class MemoryMappedFile : public Pystykorva::IFile { public: - MemoryMappedFile(const std::filesystem::path&, uint64_t); + MemoryMappedFile(const std::filesystem::path& path, uint64_t size, bool readOnly); ~MemoryMappedFile(); NonCopyable(MemoryMappedFile); - std::string_view Sample(size_t size = 0x400) const override; + uint64_t Size() const override; /* override already implies virtual, matching the sibling declarations */ + std::string_view Sample(uint64_t size) const override; + std::string_view Chunk(uint64_t from, uint64_t size) const override; std::string_view Data() const override; + void Read(void* data, uint64_t size) override; + void Write(const void* data, uint64_t size) override; private: MemoryMappedFileImpl* _impl; + uint64_t _offset = 0; }; diff --git a/PystykorvaLib/Pystykorva.hpp b/PystykorvaLib/Pystykorva.hpp index 45e41d1..97cd947 100644 --- a/PystykorvaLib/Pystykorva.hpp +++ b/PystykorvaLib/Pystykorva.hpp @@ -34,8 +34,12 @@ class Pystykorva struct IFile { - virtual std::string_view Sample(size_t size = 0x400) const = 0; + virtual uint64_t Size() const = 0; + virtual std::string_view Sample(uint64_t size = 0x400) const = 0; + virtual std::string_view Chunk(uint64_t offset, uint64_t size) const = 0; virtual std::string_view Data() const = 0; + virtual void 
Read(void* data, uint64_t size) = 0; + virtual void Write(const void* data, uint64_t size) = 0; }; enum Status : uint32_t diff --git a/PystykorvaLib/TextProcessor.cpp b/PystykorvaLib/TextProcessor.cpp index ed34959..55cd9d7 100644 --- a/PystykorvaLib/TextProcessor.cpp +++ b/PystykorvaLib/TextProcessor.cpp @@ -63,9 +63,9 @@ Pystykorva::Result TextProcessor::ProcessFile(const std::filesystem::path& path) return result; } - MemoryMappedFile file(path, fileSize); + MemoryMappedFile input(path, fileSize, true); - FindAll(file, result.Matches, result.Encoding); + FindAll(input, result.Matches, result.Encoding); } catch (const IOException&) { diff --git a/PystykorvaLib/TextReplacer.cpp b/PystykorvaLib/TextReplacer.cpp index 5201334..308926b 100644 --- a/PystykorvaLib/TextReplacer.cpp +++ b/PystykorvaLib/TextReplacer.cpp @@ -4,31 +4,108 @@ class TextReplacerImpl { public: - TextReplacerImpl() + TextReplacerImpl(const Pystykorva::IFile& input, Pystykorva::Result& result) : + _input(input), + _result(result), + _converter(ucnv_open(result.Encoding.Name.data(), &_status)) { - // TODO! + if (U_FAILURE(_status)) + { + throw ReplaceException("ucnv_open failed"); + } } ~TextReplacerImpl() { + if (_converter) + { + ucnv_close(_converter); + } } NonCopyable(TextReplacerImpl); + void ReplaceAll( + Pystykorva::IFile& output, + std::u16string_view replacement) + { + uint64_t offset = 0; + + std::string sourceEncodedReplacement = SourceEncode(replacement); + + for (Pystykorva::Match& match : _result.Matches) + { + for (const auto& [relative, absolute] : match.Positions) + { + // TODO: progress reporting? 
+ + const uint64_t chunkSize = std::max(absolute.Begin, offset) - std::min(absolute.Begin, offset); + + if (chunkSize) + { + auto chunk = _input.Chunk(offset, chunkSize); + output.Write(chunk.data(), chunkSize); + offset += chunkSize; + } + + output.Write(sourceEncodedReplacement.data(), sourceEncodedReplacement.size()); + offset += absolute.Size(); + + // In case someone wants to render the end result + match.LineContent.replace(relative.Begin, relative.Size(), replacement); + } + } + + const uint64_t bytesLeft = _input.Size() - offset; + + if (bytesLeft) + { + auto chunk = _input.Chunk(offset, bytesLeft); + output.Write(chunk.data(), bytesLeft); + } + } + private: + std::string SourceEncode(std::u16string_view replacement) + { + std::string buffer(UCNV_GET_MAX_BYTES_FOR_STRING(replacement.size(), ucnv_getMaxCharSize(_converter)), '\0'); /* worst-case output size for this converter; two bytes per UTF-16 unit is too small for e.g. UTF-8 */ + char* target = buffer.data(); + char* targetLimit = buffer.data() + buffer.size(); + + const char16_t* source = replacement.data(); + const char16_t* sourceLimit = replacement.data() + replacement.size(); + + ucnv_fromUnicode(_converter, &target, targetLimit, &source, sourceLimit, nullptr, true, &_status); + + if (U_FAILURE(_status)) + { + throw ReplaceException("ucnv_fromUnicode failed"); /* name the call that actually failed */ + } + + buffer.resize(target - buffer.data()); /* bytes actually written; targetLimit - target is the unused tail */ + + return buffer; + } + + const Pystykorva::IFile& _input; + Pystykorva::Result& _result; + UErrorCode _status = U_ZERO_ERROR; + UConverter* _converter = nullptr; }; -TextReplacer::TextReplacer() : - _impl(new TextReplacerImpl()) +TextReplacer::TextReplacer(const Pystykorva::IFile& file, Pystykorva::Result& result) : + _impl(new TextReplacerImpl(file, result)) { } TextReplacer::~TextReplacer() { + delete _impl; } - -void TextReplacer::ReplaceAll(Pystykorva::IFile&, Pystykorva::Match&, std::string_view) +void TextReplacer::ReplaceAll( + Pystykorva::IFile& output, + std::u16string_view replacement) { - // TODO! 
+ _impl->ReplaceAll(output, replacement); } diff --git a/PystykorvaLib/TextReplacer.hpp b/PystykorvaLib/TextReplacer.hpp index 0f9a5b8..16c9305 100644 --- a/PystykorvaLib/TextReplacer.hpp +++ b/PystykorvaLib/TextReplacer.hpp @@ -16,11 +16,11 @@ class TextReplacerImpl; class TextReplacer { public: - TextReplacer(); + TextReplacer(const Pystykorva::IFile&, Pystykorva::Result&); ~TextReplacer(); NonCopyable(TextReplacer); - void ReplaceAll(Pystykorva::IFile&, Pystykorva::Match&, std::string_view); + void ReplaceAll(Pystykorva::IFile& output, std::u16string_view); private: TextReplacerImpl* _impl; diff --git a/PystykorvaLib/UnicodeConverter.cpp b/PystykorvaLib/UnicodeConverter.cpp index 76a2c55..ca9da8c 100644 --- a/PystykorvaLib/UnicodeConverter.cpp +++ b/PystykorvaLib/UnicodeConverter.cpp @@ -7,7 +7,10 @@ class UnicodeConverterImpl UnicodeConverterImpl(std::string_view encoding) : _converter(ucnv_open(encoding.data(), &_status)) { - assert(U_SUCCESS(_status)); + if (U_FAILURE(_status)) + { + throw ConversionException("ucnv_open failed"); + } } ~UnicodeConverterImpl() diff --git a/PystykorvaTests/MockFile.hpp b/PystykorvaTests/MockFile.hpp new file mode 100644 index 0000000..bc29582 --- /dev/null +++ b/PystykorvaTests/MockFile.hpp @@ -0,0 +1,81 @@ +#pragma once + +class MockFile : public Pystykorva::IFile +{ +public: + MockFile(size_t size) : + _buffer(size, '\0') + { + } + + template + MockFile(T(&data)[N]) : + _buffer(sizeof(T)* (N - 1), '\0') + { + std::memcpy(_buffer.data(), data, _buffer.size()); + } + + inline ~MockFile() + { + _buffer.clear(); + } + + inline uint64_t Size() const override + { + return _buffer.size(); + } + + inline std::string_view Sample(size_t size = 0x400) const override + { + return { _buffer.data(), std::min(_buffer.size(), size) }; + } + + std::string_view Chunk(uint64_t offset, uint64_t size) const override + { + if (offset + size > _buffer.size()) + { + throw std::out_of_range("chunk out of bounds"); + } + + return { _buffer.data() + 
offset, size }; + } + + inline std::string_view Data() const override + { + return _buffer; + } + + inline void Read(void* data, uint64_t size) override + { + if (_offset + size > _buffer.size()) + { + throw std::out_of_range("read out of bounds"); + } + + for (uint64_t offset = 0; offset < size; ++offset, ++_offset) + { + auto source = _buffer.data() + _offset; + auto target = reinterpret_cast(data) + offset; + *target = *source; + } + } + + inline void Write(const void* data, uint64_t size) override + { + if (_offset + size > _buffer.size()) + { + throw std::out_of_range("write out of bounds"); /* was "write of bounds"; now matches the "read out of bounds" wording above */ + } + + for (uint64_t offset = 0; offset < size; ++offset, ++_offset) + { + auto source = reinterpret_cast(data) + offset; + auto target = _buffer.data() + _offset; + *target = *source; + } + } + +private: + std::string _buffer; + size_t _offset = 0; /* only the non-const Read/Write modify the cursor, so mutable is not needed */ +}; diff --git a/PystykorvaTests/PystykorvaTests.vcxproj b/PystykorvaTests/PystykorvaTests.vcxproj index fc59ce7..e5617b2 100644 --- a/PystykorvaTests/PystykorvaTests.vcxproj +++ b/PystykorvaTests/PystykorvaTests.vcxproj @@ -19,6 +19,7 @@ + @@ -31,6 +32,7 @@ + diff --git a/PystykorvaTests/TextProcessorTests.cpp b/PystykorvaTests/TextProcessorTests.cpp index 0e37c21..e311d38 100644 --- a/PystykorvaTests/TextProcessorTests.cpp +++ b/PystykorvaTests/TextProcessorTests.cpp @@ -1,42 +1,6 @@ #include "PystykorvaTests.pch" #include "TextProcessor.hpp" - -class FakeFile : public Pystykorva::IFile -{ -public: - template - FakeFile(T(&data)[N]) - { - _size = sizeof(T) * (N - 1); // Exclude the trailing nulls - _data = static_cast(malloc(_size)); - - if (_data) - { - memcpy(_data, data, _size); - } - } - - ~FakeFile() - { - if (_data) - { - free(_data); - } - } - - std::string_view Sample(size_t size = 0x400) const override - { - return { _data, std::min(size, _size) }; - } - - std::string_view Data() const override - { - return { _data, _size }; - } -private: - char* _data = nullptr; - size_t _size = 0; -}; +#include "MockFile.hpp" 
TEST(TextProcessorTests, RegexSearchUTF8) { @@ -48,7 +12,7 @@ TEST(TextProcessorTests, RegexSearchUTF8) TextProcessor processor(token, options); - FakeFile file(u8"\uFEFFAAAA\nBBB\nCC"); + MockFile file(u8"\uFEFFAAAA\nBBB\nCC"); std::vector matches; Pystykorva::EncodingGuess encoding; processor.FindAll(file, matches, encoding); @@ -82,7 +46,7 @@ TEST(TextProcessorTests, RegexSearchUTF16LE) // I do not understand why this only works with UTF-16 _BE_ BOM... - FakeFile file(u"\uFEFFAAAA\nBBB\nCC"); + MockFile file(u"\uFEFFAAAA\nBBB\nCC"); std::vector matches; Pystykorva::EncodingGuess encoding; processor.FindAll(file, matches, encoding); diff --git a/PystykorvaTests/TextReplacerTests.cpp b/PystykorvaTests/TextReplacerTests.cpp new file mode 100644 index 0000000..858e4ed --- /dev/null +++ b/PystykorvaTests/TextReplacerTests.cpp @@ -0,0 +1,72 @@ +#include "PystykorvaTests.pch" +#include "TextReplacer.hpp" +#include "MockFile.hpp" + +TEST(TextReplacedTests, ReplaceSameSize) +{ + MockFile input(u8"foo\nbar\nxyz"); + MockFile output(11); + + Pystykorva::Result result; + result.Encoding = { 100, "UTF-8" }; + std::vector foo = { { 0, 3, 0, 3 } }; + std::vector bar = { { 0, 3, 4, 7 } }; + std::vector xyz = { { 0, 3, 8, 11 } }; + result.Matches.emplace_back(0, u"foo\n", foo); + result.Matches.emplace_back(1, u"bar\n", bar); + result.Matches.emplace_back(2, u"xyz\n", xyz); + + TextReplacer replacer(input, result); + replacer.ReplaceAll(output, u"abc"); + + EXPECT_TRUE(output.Data() == "abc\nabc\nabc"); + EXPECT_TRUE(result.Matches[0].LineContent == u"abc\n"); + EXPECT_TRUE(result.Matches[1].LineContent == u"abc\n"); + EXPECT_TRUE(result.Matches[2].LineContent == u"abc\n"); +} + +TEST(TextReplacedTests, ReplaceSmallerSize) +{ + MockFile input(u8"foo\nbar\nxyz"); + MockFile output(8); + + Pystykorva::Result result; + result.Encoding = { 100, "UTF-8" }; + std::vector foo = { { 0, 3, 0, 3 } }; + std::vector bar = { { 0, 3, 4, 7 } }; + std::vector xyz = { { 0, 3, 8, 11 } }; + 
result.Matches.emplace_back(0, u"foo\n", foo); + result.Matches.emplace_back(1, u"bar\n", bar); + result.Matches.emplace_back(2, u"xyz\n", xyz); + + TextReplacer replacer(input, result); + replacer.ReplaceAll(output, u"ab"); + + EXPECT_TRUE(output.Data() == "ab\nab\nab"); + EXPECT_TRUE(result.Matches[0].LineContent == u"ab\n"); + EXPECT_TRUE(result.Matches[1].LineContent == u"ab\n"); + EXPECT_TRUE(result.Matches[2].LineContent == u"ab\n"); +} + +TEST(TextReplacedTests, ReplaceLargerSize) +{ + MockFile input(u8"foo\nbar\nxyz"); + MockFile output(14); + + Pystykorva::Result result; + result.Encoding = { 100, "UTF-8" }; + std::vector foo = { { 0, 3, 0, 3 } }; + std::vector bar = { { 0, 3, 4, 7 } }; + std::vector xyz = { { 0, 3, 8, 11 } }; + result.Matches.emplace_back(0, u"foo\n", foo); + result.Matches.emplace_back(1, u"bar\n", bar); + result.Matches.emplace_back(2, u"xyz\n", xyz); + + TextReplacer replacer(input, result); + replacer.ReplaceAll(output, u"abcd"); + + EXPECT_TRUE(output.Data() == "abcd\nabcd\nabcd"); + EXPECT_TRUE(result.Matches[0].LineContent == u"abcd\n"); + EXPECT_TRUE(result.Matches[1].LineContent == u"abcd\n"); + EXPECT_TRUE(result.Matches[2].LineContent == u"abcd\n"); +} \ No newline at end of file