Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Centralized metadata constraints #339

Merged
merged 16 commits into from
Apr 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
246 changes: 246 additions & 0 deletions src/metadata.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
/*
* Copyright 2023 Veloman Yunkan <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/

#include "metadata.h"

#include <sstream>
#include <regex>
#include <unicode/unistr.h>

#include <cctype>
#include <iomanip>


namespace zim
{

namespace
{

const bool MANDATORY = true;
const bool OPTIONAL = false;

const std::string LANGS_REGEXP = "\\w{3}(,\\w{3})*";
const std::string DATE_REGEXP = R"(\d\d\d\d-\d\d-\d\d)";

// PNG regexp has to be defined in such a tricky way because it includes
// a NUL character
const char PNG_REGEXP_DATA[] = "^\x89\x50\x4e\x47\x0d\x0a\x1a\x0a(.|\\s|\0)+";
const std::string PNG_REGEXP(PNG_REGEXP_DATA, sizeof(PNG_REGEXP_DATA)-1);

bool matchRegex(const std::string& regexStr, const std::string& text)
{
const std::regex regex(regexStr);
return std::regex_match(text.begin(), text.end(), regex);
}

size_t getTextLength(const std::string& utf8EncodedString)
{
return icu::UnicodeString::fromUTF8(utf8EncodedString).length();
}

class MetadataComplexCheckBase
{
public:
const std::string description;
const MetadataComplexCheckBase* const prev;

public: // functions
explicit MetadataComplexCheckBase(const std::string& desc);

MetadataComplexCheckBase(const MetadataComplexCheckBase&) = delete;
MetadataComplexCheckBase(MetadataComplexCheckBase&&) = delete;
void operator=(const MetadataComplexCheckBase&) = delete;
void operator=(MetadataComplexCheckBase&&) = delete;

virtual ~MetadataComplexCheckBase();

virtual bool checkMetadata(const Metadata& m) const = 0;

static const MetadataComplexCheckBase* getLastCheck() { return last; }

private: // functions
static const MetadataComplexCheckBase* last;
mgautierfr marked this conversation as resolved.
Show resolved Hide resolved
};

const MetadataComplexCheckBase* MetadataComplexCheckBase::last = nullptr;

MetadataComplexCheckBase::MetadataComplexCheckBase(const std::string& desc)
: description(desc)
, prev(last)
{
last = this;
}

MetadataComplexCheckBase::~MetadataComplexCheckBase()
{
// Ideally, we should de-register this object from the list of live objects.
// However, in the current implementation MetadataComplexCheckBase objects
// are only constructed in static storage and the list of active objects
// isn't supposed to be accessed after any MetadataComplexCheckBase object
// has been destroyed as part of program termination clean-up actions.
}

#define ADD_METADATA_COMPLEX_CHECK(DESC, CLSNAME) \
class CLSNAME : public MetadataComplexCheckBase \
{ \
public: \
CLSNAME() : MetadataComplexCheckBase(DESC) {} \
bool checkMetadata(const Metadata& data) const override; \
}; \
\
const CLSNAME CONCAT(obj, CLSNAME); \
\
bool CLSNAME::checkMetadata(const Metadata& data) const \
/* should be followed by the check body */



#define CONCAT(X, Y) X##Y
#define GENCLSNAME(UUID) CONCAT(MetadataComplexCheck, UUID)

#define METADATA_ASSERT(DESC) ADD_METADATA_COMPLEX_CHECK(DESC, GENCLSNAME(__LINE__))


#include "metadata_constraints.cpp"

// This function is intended for pretty printing of regexps with non-printable
// characters.
// In a general purpose/rigorous version we should escape the escape symbol
// (backslash) too, but that doesn't play well with the purpose stated above.
std::string escapeNonPrintableChars(const std::string& s)
{
std::ostringstream os;
os << std::hex;
for (const char c : s) {
if (std::isprint(c)) {
os << c;
} else {
const unsigned int charVal = static_cast<unsigned char>(c);
os << "\\x" << std::setw(2) << std::setfill('0') << charVal;
}
}
return os.str();
}

mgautierfr marked this conversation as resolved.
Show resolved Hide resolved
Metadata::Errors concat(Metadata::Errors e1, const Metadata::Errors& e2)
{
e1.insert(e1.end(), e2.begin(), e2.end());
return e1;
}

} // unnamed namespace

const Metadata::ReservedMetadataTable& Metadata::reservedMetadataInfo = reservedMetadataInfoTable;

const Metadata::ReservedMetadataRecord&
Metadata::getReservedMetadataRecord(const std::string& name)
{
for ( const auto& x : reservedMetadataInfo ) {
if ( x.name == name )
return x;
}

throw std::out_of_range(name + " is not a reserved metadata name");
}

bool Metadata::has(const std::string& name) const
{
return data.find(name) != data.end();
}

const std::string& Metadata::operator[](const std::string& name) const
{
return data.at(name);
}

void Metadata::set(const std::string& name, const std::string& value)
{
data[name] = value;
}

bool Metadata::valid() const
{
return check().empty();
}

Metadata::Errors Metadata::checkMandatoryMetadata() const
{
Errors errors;
for ( const auto& rmr : reservedMetadataInfo ) {
if ( rmr.isMandatory && data.find(rmr.name) == data.end() ) {
errors.push_back("Missing mandatory metadata: " + rmr.name );
}
}

return errors;
}

Metadata::Errors Metadata::checkSimpleConstraints() const
{
Errors errors;
for ( const auto& nv : data ) {
const auto& name = nv.first;
const auto& value = nv.second;
try {
const auto& rmr = getReservedMetadataRecord(name);
if ( rmr.minLength != 0 && getTextLength(value) < rmr.minLength ) {
std::ostringstream oss;
oss << name << " must contain at least " << rmr.minLength << " characters";
errors.push_back(oss.str());
}
if ( rmr.maxLength != 0 && getTextLength(value) > rmr.maxLength ) {
std::ostringstream oss;
oss << name << " must contain at most " << rmr.maxLength << " characters";
errors.push_back(oss.str());
}
if ( !rmr.regex.empty() && !matchRegex(rmr.regex, value) ) {
const std::string regex = escapeNonPrintableChars(rmr.regex);
errors.push_back(name + " doesn't match regex: " + regex);
}
} catch ( const std::out_of_range& ) {
// ignore non-reserved metadata
}
}
return errors;
}

Metadata::Errors Metadata::checkComplexConstraints() const
{
Errors errors;
const MetadataComplexCheckBase* c = MetadataComplexCheckBase::getLastCheck();
for ( ; c != nullptr ; c = c->prev ) {
if ( ! c->checkMetadata(*this) ) {
errors.push_back(c->description);
}
}
return errors;
}

Metadata::Errors Metadata::check() const
{
const Errors e1 = checkMandatoryMetadata();
const Errors e2 = checkSimpleConstraints();
if ( !e1.empty() || !e2.empty() )
return concat(e1, e2);

return checkComplexConstraints();
}

} // namespace zim
77 changes: 77 additions & 0 deletions src/metadata.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* Copyright 2023 Veloman Yunkan <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/

#ifndef OPENZIM_METADATA_H
#define OPENZIM_METADATA_H

#include <string>
#include <vector>
#include <map>

namespace zim
{

class Metadata
{
typedef std::map<std::string, std::string> KeyValueMap;

public: // types
struct ReservedMetadataRecord
{
const std::string name;
const bool isMandatory;
const size_t minLength;
const size_t maxLength;
const std::string regex;
};

typedef std::vector<ReservedMetadataRecord> ReservedMetadataTable;

typedef std::vector<std::string> Errors;

typedef KeyValueMap::const_iterator Iterator;

public: // data
static const ReservedMetadataTable& reservedMetadataInfo;

public: // functions
void set(const std::string& name, const std::string& value);
bool has(const std::string& name) const;
const std::string& operator[](const std::string& name) const;

bool valid() const;
Errors check() const;

static const ReservedMetadataRecord& getReservedMetadataRecord(const std::string& name);

Iterator begin() const { return data.begin(); }
Iterator end() const { return data.end(); }

private: // functions
Errors checkMandatoryMetadata() const;
Errors checkSimpleConstraints() const;
Errors checkComplexConstraints() const;

private: // data
KeyValueMap data;
};

} // namespace zim

#endif // OPENZIM_METADATA_H
32 changes: 32 additions & 0 deletions src/metadata_constraints.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
const Metadata::ReservedMetadataTable reservedMetadataInfoTable = {
// name isMandatory minLength maxLength regex
{ "Name", MANDATORY, 1, 0, "" },
{ "Title", MANDATORY, 1, 30, "" },
{ "Language", MANDATORY, 3, 0, LANGS_REGEXP },
{ "Creator", MANDATORY, 1, 0, "" },
{ "Publisher", MANDATORY, 1, 0, "" },
{ "Date", MANDATORY, 10, 10, DATE_REGEXP },
{ "Description", MANDATORY, 1, 80, "" },
{ "LongDescription", OPTIONAL, 0, 4000, "" },
{ "License", OPTIONAL, 0, 0, "" },
{ "Tags", OPTIONAL, 0, 0, "" },
{ "Relation", OPTIONAL, 0, 0, "" },
{ "Flavour", OPTIONAL, 0, 0, "" },
{ "Source", OPTIONAL, 0, 0, "" },
{ "Counter", OPTIONAL, 0, 0, "" },
{ "Scraper", OPTIONAL, 0, 0, "" },

{
"Illustration_48x48@1",
MANDATORY,
0, // There are no constraints on the illustration metadata size
0, // in order to avoid decoding it as UTF-8 encoded text
PNG_REGEXP
},
};

METADATA_ASSERT("LongDescription shouldn't be shorter than Description")
{
return !data.has("LongDescription")
|| data["LongDescription"].size() >= data["Description"].size();
}
Loading