From 79251f5fa21eece6f537f24606f314b77a2677fd Mon Sep 17 00:00:00 2001 From: Sebastian Ullrich Date: Mon, 27 Nov 2023 11:24:43 +0100 Subject: [PATCH] feat: embed and check githash in .olean (#2766) This is an additional safety net on top of #2749: it protects users that circumvent the build system (e.g. with `lake env`) as well as obviates the need for TOCTOU-like race condition checks in the build system. The check is activated by `CHECK_OLEAN_VERSION=ON`, which now defaults to `OFF` as the sensible default for local development. When activated, `USE_GITHASH=ON` is also force-enabled for stage 0 in order to make sure that stage 1 can load its own core library. --- .github/workflows/ci.yml | 3 +- CMakeLists.txt | 4 ++- src/CMakeLists.txt | 16 +++------- src/library/module.cpp | 60 ++++++++++++++++++++++++----------- src/library/util.cpp | 6 +--- src/version.h.in | 4 --- stage0/src/CMakeLists.txt | 16 +++------- stage0/src/library/module.cpp | 60 ++++++++++++++++++++++++----------- 8 files changed, 98 insertions(+), 71 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1ef6a5c25945..111d9c89916d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -259,7 +259,8 @@ jobs: mkdir build cd build ulimit -c unlimited # coredumps - OPTIONS=() + # this also enables githash embedding into stage 1 library + OPTIONS=(-DCHECK_OLEAN_VERSION=ON) if [[ -n '${{ matrix.prepare-llvm }}' ]]; then wget -q ${{ matrix.llvm-url }} PREPARE="$(${{ matrix.prepare-llvm }})" diff --git a/CMakeLists.txt b/CMakeLists.txt index 271ef9c4cdd3..b5654a6f76dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ foreach(var ${vars}) list(APPEND STAGE0_ARGS "-D${CMAKE_MATCH_1}=${${var}}") elseif("${currentHelpString}" MATCHES "No help, variable specified on the command line." OR "${currentHelpString}" STREQUAL "") list(APPEND CL_ARGS "-D${var}=${${var}}") - if("${var}" STREQUAL "USE_GMP") + if("${var}" MATCHES "USE_GMP|CHECK_OLEAN_VERSION") # must forward options that generate incompatible .olean format list(APPEND STAGE0_ARGS "-D${var}=${${var}}") endif() @@ -35,6 +35,8 @@ ExternalProject_add(stage0 SOURCE_SUBDIR src BINARY_DIR stage0 # do not rebuild stage0 when git hash changes; it's not from this commit anyway + # (however, `CHECK_OLEAN_VERSION=ON` in CI will override this as we need to + # embed the githash into the stage 1 library built by stage 0) CMAKE_ARGS -DSTAGE=0 -DUSE_GITHASH=OFF ${PLATFORM_ARGS} ${STAGE0_ARGS} BUILD_ALWAYS ON # cmake doesn't auto-detect changes without a download method INSTALL_COMMAND "" # skip install diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8806e6add72e..86fed0d474e3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -64,7 +64,7 @@ option(BSYMBOLIC "Link with -Bsymbolic to reduce call overhead in shared librari option(USE_GMP "USE_GMP" ON) # development-specific options -option(CHECK_OLEAN_VERSION "Only load .olean files compiled with the current version of Lean" ON) +option(CHECK_OLEAN_VERSION "Only load .olean files compiled with the current version of Lean" OFF) set(LEAN_EXTRA_MAKE_OPTS "" CACHE STRING "extra options to lean --make") set(LEANC_CC ${CMAKE_C_COMPILER} CACHE STRING "C compiler to use in `leanc`") @@ -93,8 +93,9 @@ if ("${RUNTIME_STATS}" MATCHES "ON") string(APPEND LEAN_EXTRA_CXX_FLAGS " -D LEAN_RUNTIME_STATS") endif() -if (NOT("${CHECK_OLEAN_VERSION}" MATCHES "ON")) - string(APPEND LEAN_EXTRA_CXX_FLAGS " -D LEAN_IGNORE_OLEAN_VERSION") +if ("${CHECK_OLEAN_VERSION}" MATCHES "ON") + set(USE_GITHASH ON) + string(APPEND LEAN_EXTRA_CXX_FLAGS " -D LEAN_CHECK_OLEAN_VERSION") endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Emscripten") @@ -401,26 +402,17 @@ if(MULTI_THREAD AND NOT MSVC AND (NOT ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin")) endif() # Git HASH -set(LEAN_PACKAGE_VERSION "NOT-FOUND") if(USE_GITHASH) include(GetGitRevisionDescription) get_git_head_revision(GIT_REFSPEC GIT_SHA1) if(${GIT_SHA1} MATCHES "GITDIR-NOTFOUND") message(STATUS "Failed to read git_sha1") set(GIT_SHA1 "") - if(EXISTS "${LEAN_SOURCE_DIR}/bin/package_version") - file(STRINGS "${LEAN_SOURCE_DIR}/bin/package_version" LEAN_PACKAGE_VERSION) - message(STATUS "Package version detected: ${LEAN_PACKAGE_VERSION}") - endif() else() message(STATUS "git commit sha1: ${GIT_SHA1}") endif() else() set(GIT_SHA1 "") - if(EXISTS "${LEAN_SOURCE_DIR}/bin/package_version") - file(STRINGS "${LEAN_SOURCE_DIR}/bin/package_version" LEAN_PACKAGE_VERSION) - message(STATUS "Package version detected: ${LEAN_PACKAGE_VERSION}") - endif() endif() configure_file("${LEAN_SOURCE_DIR}/githash.h.in" "${LEAN_BINARY_DIR}/githash.h") diff --git a/src/library/module.cpp b/src/library/module.cpp index 7d93ca857c4f..33138b2cc3cc 100644 --- a/src/library/module.cpp +++ b/src/library/module.cpp @@ -3,6 +3,8 @@ Copyright (c) 2014-2015 Microsoft Corporation. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. Authors: Leonardo de Moura, Gabriel Ebner, Sebastian Ullrich + +.olean serialization and deserialization. */ #include #include @@ -25,6 +27,7 @@ Authors: Leonardo de Moura, Gabriel Ebner, Sebastian Ullrich #include "library/constants.h" #include "library/time_task.h" #include "library/util.h" +#include "githash.h" #ifdef LEAN_WINDOWS #include @@ -41,8 +44,22 @@ Authors: Leonardo de Moura, Gabriel Ebner, Sebastian Ullrich #endif namespace lean { -// manually padded to multiple of word size, see `initialize_module` -static char const * g_olean_header = "oleanfile!!!!!!!"; + +/** On-disk format of a .olean file. */ +struct olean_header { + // 5 bytes: magic number + char marker[5] = {'o', 'l', 'e', 'a', 'n'}; + // 1 byte: version, currently always `1` + uint8_t version = 1; + // 42 bytes: build githash, padded with `\0` to the right + char githash[42]; + // address at which the beginning of the file (including header) is attempted to be mmapped + size_t base_addr; + // payload, a serialize Lean object graph; `size_t` has same alignment requirements as Lean objects + size_t data[]; +}; +// make sure we don't have any padding bytes, which also ensures `data` is properly aligned +static_assert(sizeof(olean_header) == 5 + 1 + 42 + sizeof(size_t), "olean_header must be packed"); extern "C" LEAN_EXPORT object * lean_save_module_data(b_obj_arg fname, b_obj_arg mod, b_obj_arg mdata, object *) { std::string olean_fn(string_cstr(fname)); @@ -73,10 +90,14 @@ extern "C" LEAN_EXPORT object * lean_save_module_data(b_obj_arg fname, b_obj_arg // `MapViewOfFileEx` addresses must be aligned to the "memory allocation granularity", which is 64KB. base_addr = base_addr & ~((1LL<<16) - 1); - object_compactor compactor(reinterpret_cast(base_addr + strlen(g_olean_header) + sizeof(base_addr))); + object_compactor compactor(reinterpret_cast(base_addr + offsetof(olean_header, data))); compactor(mdata); - out.write(g_olean_header, strlen(g_olean_header)); - out.write(reinterpret_cast(&base_addr), sizeof(base_addr)); + + // see/sync with file format description above + olean_header header = {}; + header.base_addr = base_addr; + strncpy(header.githash, LEAN_GITHASH, sizeof(header.githash)); + out.write(reinterpret_cast(&header), sizeof(header)); out.write(static_cast(compactor.data()), compactor.size()); out.close(); while (std::rename(olean_tmp_fn.c_str(), olean_fn.c_str()) != 0) { @@ -116,19 +137,21 @@ extern "C" LEAN_EXPORT object * lean_read_module_data(object * fname, object *) in.seekg(0, in.end); size_t size = in.tellg(); in.seekg(0); - size_t header_size = strlen(g_olean_header); - if (size < header_size) { + + olean_header default_header = {}; + olean_header header; + if (!in.read(reinterpret_cast(&header), sizeof(header))) { return io_result_mk_error((sstream() << "failed to read file '" << olean_fn << "', invalid header").str()); } - char * header = new char[header_size]; - in.read(header, header_size); - if (strncmp(header, g_olean_header, header_size) != 0) { + if (memcmp(header.marker, default_header.marker, sizeof(header.marker)) != 0 + || header.version != default_header.version +#ifdef LEAN_CHECK_OLEAN_VERSION + || strncmp(header.githash, LEAN_GITHASH, sizeof(header.githash)) != 0 +#endif + ) { return io_result_mk_error((sstream() << "failed to read file '" << olean_fn << "', invalid header").str()); } - delete[] header; - char * base_addr; - in.read(reinterpret_cast(&base_addr), sizeof(base_addr)); - header_size += sizeof(base_addr); + char * base_addr = reinterpret_cast(header.base_addr); char * buffer = nullptr; bool is_mmap = false; std::function free_data; @@ -166,24 +189,25 @@ extern "C" LEAN_EXPORT object * lean_read_module_data(object * fname, object *) }; #endif if (buffer && buffer == base_addr) { - buffer += header_size; + buffer += sizeof(olean_header); is_mmap = true; } else { #ifdef LEAN_MMAP free_data(); #endif - buffer = static_cast(malloc(size - header_size)); + buffer = static_cast(malloc(size - sizeof(olean_header))); free_data = [=]() { free(buffer); }; - in.read(buffer, size - header_size); + in.read(buffer, size - sizeof(olean_header)); if (!in) { return io_result_mk_error((sstream() << "failed to read file '" << olean_fn << "'").str()); } } in.close(); - compacted_region * region = new compacted_region(size - header_size, buffer, base_addr + header_size, is_mmap, free_data); + compacted_region * region = + new compacted_region(size - sizeof(olean_header), buffer, base_addr + sizeof(olean_header), is_mmap, free_data); #if defined(__has_feature) #if __has_feature(address_sanitizer) // do not report as leak diff --git a/src/library/util.cpp b/src/library/util.cpp index 9ce0383c6f38..eb22dd48cb8d 100644 --- a/src/library/util.cpp +++ b/src/library/util.cpp @@ -873,11 +873,7 @@ void initialize_library_util() { if (std::strlen(LEAN_SPECIAL_VERSION_DESC) > 0) { out << "-" << LEAN_SPECIAL_VERSION_DESC; } - if (std::strlen(LEAN_GITHASH) == 0) { - if (std::strcmp(LEAN_PACKAGE_VERSION, "NOT-FOUND") != 0) { - out << ", package " << LEAN_PACKAGE_VERSION; - } - } else { + if (std::strlen(LEAN_GITHASH) > 0) { out << ", commit " << std::string(LEAN_GITHASH).substr(0, 12); } g_version_string = new std::string(out.str()); diff --git a/src/version.h.in b/src/version.h.in index 3128fad3119c..7f343adda530 100644 --- a/src/version.h.in +++ b/src/version.h.in @@ -6,7 +6,3 @@ // Additional version description like "nightly-2018-03-11" #define LEAN_SPECIAL_VERSION_DESC "@LEAN_SPECIAL_VERSION_DESC@" - -// When git_sha1 is not available, lean reads bin/version file and -// assign its contents to LEAN_PACKAGE_VERSION -#define LEAN_PACKAGE_VERSION "@LEAN_PACKAGE_VERSION@" diff --git a/stage0/src/CMakeLists.txt b/stage0/src/CMakeLists.txt index 8806e6add72e..86fed0d474e3 100644 --- a/stage0/src/CMakeLists.txt +++ b/stage0/src/CMakeLists.txt @@ -64,7 +64,7 @@ option(BSYMBOLIC "Link with -Bsymbolic to reduce call overhead in shared librari option(USE_GMP "USE_GMP" ON) # development-specific options -option(CHECK_OLEAN_VERSION "Only load .olean files compiled with the current version of Lean" ON) +option(CHECK_OLEAN_VERSION "Only load .olean files compiled with the current version of Lean" OFF) set(LEAN_EXTRA_MAKE_OPTS "" CACHE STRING "extra options to lean --make") set(LEANC_CC ${CMAKE_C_COMPILER} CACHE STRING "C compiler to use in `leanc`") @@ -93,8 +93,9 @@ if ("${RUNTIME_STATS}" MATCHES "ON") string(APPEND LEAN_EXTRA_CXX_FLAGS " -D LEAN_RUNTIME_STATS") endif() -if (NOT("${CHECK_OLEAN_VERSION}" MATCHES "ON")) - string(APPEND LEAN_EXTRA_CXX_FLAGS " -D LEAN_IGNORE_OLEAN_VERSION") +if ("${CHECK_OLEAN_VERSION}" MATCHES "ON") + set(USE_GITHASH ON) + string(APPEND LEAN_EXTRA_CXX_FLAGS " -D LEAN_CHECK_OLEAN_VERSION") endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Emscripten") @@ -401,26 +402,17 @@ if(MULTI_THREAD AND NOT MSVC AND (NOT ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin")) endif() # Git HASH -set(LEAN_PACKAGE_VERSION "NOT-FOUND") if(USE_GITHASH) include(GetGitRevisionDescription) get_git_head_revision(GIT_REFSPEC GIT_SHA1) if(${GIT_SHA1} MATCHES "GITDIR-NOTFOUND") message(STATUS "Failed to read git_sha1") set(GIT_SHA1 "") - if(EXISTS "${LEAN_SOURCE_DIR}/bin/package_version") - file(STRINGS "${LEAN_SOURCE_DIR}/bin/package_version" LEAN_PACKAGE_VERSION) - message(STATUS "Package version detected: ${LEAN_PACKAGE_VERSION}") - endif() else() message(STATUS "git commit sha1: ${GIT_SHA1}") endif() else() set(GIT_SHA1 "") - if(EXISTS "${LEAN_SOURCE_DIR}/bin/package_version") - file(STRINGS "${LEAN_SOURCE_DIR}/bin/package_version" LEAN_PACKAGE_VERSION) - message(STATUS "Package version detected: ${LEAN_PACKAGE_VERSION}") - endif() endif() configure_file("${LEAN_SOURCE_DIR}/githash.h.in" "${LEAN_BINARY_DIR}/githash.h") diff --git a/stage0/src/library/module.cpp b/stage0/src/library/module.cpp index 7d93ca857c4f..33138b2cc3cc 100644 --- a/stage0/src/library/module.cpp +++ b/stage0/src/library/module.cpp @@ -3,6 +3,8 @@ Copyright (c) 2014-2015 Microsoft Corporation. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. Authors: Leonardo de Moura, Gabriel Ebner, Sebastian Ullrich + +.olean serialization and deserialization. */ #include #include @@ -25,6 +27,7 @@ Authors: Leonardo de Moura, Gabriel Ebner, Sebastian Ullrich #include "library/constants.h" #include "library/time_task.h" #include "library/util.h" +#include "githash.h" #ifdef LEAN_WINDOWS #include @@ -41,8 +44,22 @@ Authors: Leonardo de Moura, Gabriel Ebner, Sebastian Ullrich #endif namespace lean { -// manually padded to multiple of word size, see `initialize_module` -static char const * g_olean_header = "oleanfile!!!!!!!"; + +/** On-disk format of a .olean file. */ +struct olean_header { + // 5 bytes: magic number + char marker[5] = {'o', 'l', 'e', 'a', 'n'}; + // 1 byte: version, currently always `1` + uint8_t version = 1; + // 42 bytes: build githash, padded with `\0` to the right + char githash[42]; + // address at which the beginning of the file (including header) is attempted to be mmapped + size_t base_addr; + // payload, a serialize Lean object graph; `size_t` has same alignment requirements as Lean objects + size_t data[]; +}; +// make sure we don't have any padding bytes, which also ensures `data` is properly aligned +static_assert(sizeof(olean_header) == 5 + 1 + 42 + sizeof(size_t), "olean_header must be packed"); extern "C" LEAN_EXPORT object * lean_save_module_data(b_obj_arg fname, b_obj_arg mod, b_obj_arg mdata, object *) { std::string olean_fn(string_cstr(fname)); @@ -73,10 +90,14 @@ extern "C" LEAN_EXPORT object * lean_save_module_data(b_obj_arg fname, b_obj_arg // `MapViewOfFileEx` addresses must be aligned to the "memory allocation granularity", which is 64KB. base_addr = base_addr & ~((1LL<<16) - 1); - object_compactor compactor(reinterpret_cast(base_addr + strlen(g_olean_header) + sizeof(base_addr))); + object_compactor compactor(reinterpret_cast(base_addr + offsetof(olean_header, data))); compactor(mdata); - out.write(g_olean_header, strlen(g_olean_header)); - out.write(reinterpret_cast(&base_addr), sizeof(base_addr)); + + // see/sync with file format description above + olean_header header = {}; + header.base_addr = base_addr; + strncpy(header.githash, LEAN_GITHASH, sizeof(header.githash)); + out.write(reinterpret_cast(&header), sizeof(header)); out.write(static_cast(compactor.data()), compactor.size()); out.close(); while (std::rename(olean_tmp_fn.c_str(), olean_fn.c_str()) != 0) { @@ -116,19 +137,21 @@ extern "C" LEAN_EXPORT object * lean_read_module_data(object * fname, object *) in.seekg(0, in.end); size_t size = in.tellg(); in.seekg(0); - size_t header_size = strlen(g_olean_header); - if (size < header_size) { + + olean_header default_header = {}; + olean_header header; + if (!in.read(reinterpret_cast(&header), sizeof(header))) { return io_result_mk_error((sstream() << "failed to read file '" << olean_fn << "', invalid header").str()); } - char * header = new char[header_size]; - in.read(header, header_size); - if (strncmp(header, g_olean_header, header_size) != 0) { + if (memcmp(header.marker, default_header.marker, sizeof(header.marker)) != 0 + || header.version != default_header.version +#ifdef LEAN_CHECK_OLEAN_VERSION + || strncmp(header.githash, LEAN_GITHASH, sizeof(header.githash)) != 0 +#endif + ) { return io_result_mk_error((sstream() << "failed to read file '" << olean_fn << "', invalid header").str()); } - delete[] header; - char * base_addr; - in.read(reinterpret_cast(&base_addr), sizeof(base_addr)); - header_size += sizeof(base_addr); + char * base_addr = reinterpret_cast(header.base_addr); char * buffer = nullptr; bool is_mmap = false; std::function free_data; @@ -166,24 +189,25 @@ extern "C" LEAN_EXPORT object * lean_read_module_data(object * fname, object *) }; #endif if (buffer && buffer == base_addr) { - buffer += header_size; + buffer += sizeof(olean_header); is_mmap = true; } else { #ifdef LEAN_MMAP free_data(); #endif - buffer = static_cast(malloc(size - header_size)); + buffer = static_cast(malloc(size - sizeof(olean_header))); free_data = [=]() { free(buffer); }; - in.read(buffer, size - header_size); + in.read(buffer, size - sizeof(olean_header)); if (!in) { return io_result_mk_error((sstream() << "failed to read file '" << olean_fn << "'").str()); } } in.close(); - compacted_region * region = new compacted_region(size - header_size, buffer, base_addr + header_size, is_mmap, free_data); + compacted_region * region = + new compacted_region(size - sizeof(olean_header), buffer, base_addr + sizeof(olean_header), is_mmap, free_data); #if defined(__has_feature) #if __has_feature(address_sanitizer) // do not report as leak