From 50b39e5f8977671888ab33c961de6e6e669c6ae9 Mon Sep 17 00:00:00 2001 From: Gavin Halliday Date: Wed, 25 Sep 2024 15:52:25 +0100 Subject: [PATCH 1/2] HPCC-32723 Add support for storage planes using the azure blob api Signed-off-by: Gavin Halliday --- common/remote/hooks/azure/azurefile.cpp | 390 ++++++++++++------------ helm/hpcc/values.schema.json | 1 - 2 files changed, 190 insertions(+), 201 deletions(-) diff --git a/common/remote/hooks/azure/azurefile.cpp b/common/remote/hooks/azure/azurefile.cpp index df4b7ebb2cf..5a2797dae99 100644 --- a/common/remote/hooks/azure/azurefile.cpp +++ b/common/remote/hooks/azure/azurefile.cpp @@ -49,6 +49,7 @@ using namespace std::chrono; * an IFileIO. E.g., append blobs */ constexpr const char * azureFilePrefix = "azure:"; +constexpr const char * azureBlobPrefix = "azureblob:"; // Syntax azureblob:storageplane[/device]/apth #ifdef TEST_AZURE_PAGING constexpr offset_t azureReadRequestSize = 50; #else @@ -58,10 +59,14 @@ constexpr offset_t azureReadRequestSize = 0x400000; // Default to requesting 4M //--------------------------------------------------------------------------------------------------------------------- class AzureFile; -class AzureFileReadIO : implements CInterfaceOf + +//The base class for AzureFileIO. This class performs NO caching of the data - to avoid problems with +//copying the data too many times. It is the responsibility of the caller to implement a cache if necessary. +class AzureFileIO : implements CInterfaceOf { public: - AzureFileReadIO(AzureFile * _file, const FileIOStats & _stats); + AzureFileIO(AzureFile * _file, const FileIOStats & _stats); + AzureFileIO(AzureFile * _file) : file(_file) {} virtual size32_t read(offset_t pos, size32_t len, void * data) override; virtual offset_t size() override; @@ -69,6 +74,19 @@ class AzureFileReadIO : implements CInterfaceOf { } + unsigned __int64 getStatistic(StatisticKind kind) override; + +protected: + Linked file; + FileIOStats stats; +}; + + +class AzureFileReadIO : public AzureFileIO +{ +public: + AzureFileReadIO(AzureFile * _file, const FileIOStats & _stats); + // Write methods not implemented - this is a read-only file virtual size32_t write(offset_t pos, size32_t len, const void * data) override { @@ -85,42 +103,21 @@ class AzureFileReadIO : implements CInterfaceOf virtual void flush() override { } - unsigned __int64 getStatistic(StatisticKind kind) override; - -protected: - size_t extractDataFromResult(size_t offset, size_t length, void * target); - -protected: - Linked file; - CriticalSection cs; - offset_t startResultOffset = 0; - offset_t endResultOffset = 0; - MemoryBuffer contents; - FileIOStats stats; }; -class AzureFileWriteIO : implements CInterfaceOf +class AzureFileWriteIO : public AzureFileIO { public: AzureFileWriteIO(AzureFile * _file); virtual void beforeDispose() override; - virtual size32_t read(offset_t pos, size32_t len, void * data) override - { - throwUnexpected(); - } - virtual offset_t size() override; virtual void setSize(offset_t size) override; virtual void flush() override; - virtual unsigned __int64 getStatistic(StatisticKind kind) override; - protected: - Linked file; CriticalSection cs; - FileIOStats stats; offset_t offset = 0; }; @@ -146,11 +143,8 @@ class AzureFileBlockBlobWriteIO final : implements AzureFileWriteIO }; -class AzureFile : implements CInterfaceOf +class AzureFile final : implements CInterfaceOf { - friend class AzureFileReadIO; - friend class AzureFileAppendBlobWriteIO; - friend class AzureFileBlockBlobWriteIO; public: AzureFile(const char *_azureFileName); virtual bool exists() override @@ -252,27 +246,33 @@ class AzureFile : implements CInterfaceOf virtual void copyTo(IFile *dest, size32_t buffersize=DEFAULT_COPY_BLKSIZE, ICopyFileProgress *progress=NULL, bool usetmp=false, CFflags copyFlags=CFnone) override { UNIMPLEMENTED_X("AzureFile::copyTo"); } virtual IMemoryMappedFile *openMemoryMapped(offset_t ofs=0, memsize_t len=(memsize_t)-1, bool write=false) override { UNIMPLEMENTED_X("AzureFile::openMemoryMapped"); } +//Helper functions for the azureFileIO classes + offset_t read(offset_t pos, size32_t len, void * data, FileIOStats & stats); + + void createAppendBlob(); + void appendToAppendBlob(size32_t len, const void * data); + void createBlockBlob(); + void appendToBlockBlob(size32_t len, const void * data); + protected: std::shared_ptr getCredentials() const; std::string getBlobUrl() const; std::shared_ptr getBlobContainerClient() const; template std::shared_ptr getClient() const; - void createAppendBlob(); - void appendToAppendBlob(size32_t len, const void * data); - void createBlockBlob(); - void appendToBlockBlob(size32_t len, const void * data); - offset_t readBlock(MemoryBuffer & contents, FileIOStats & stats, offset_t from = 0, offset_t length = unknownFileSize); + void ensureMetaData(); void gatherMetaData(); IFileIO * createFileReadIO(); IFileIO * createFileWriteIO(); void setProperties(int64_t _blobSize, Azure::DateTime _lastModified, Azure::DateTime _createdOn); + protected: StringBuffer fullName; StringAttr accountName; StringAttr accountKey; StringAttr containerName; + StringBuffer secretName; StringAttr blobName; offset_t fileSize = unknownFileSize; bool haveMeta = false; @@ -287,108 +287,47 @@ class AzureFile : implements CInterfaceOf //--------------------------------------------------------------------------------------------------------------------- -AzureFileReadIO::AzureFileReadIO(AzureFile * _file, const FileIOStats & _firstStats) +AzureFileIO::AzureFileIO(AzureFile * _file, const FileIOStats & _firstStats) : file(_file), stats(_firstStats) { - startResultOffset = 0; - endResultOffset = 0; } -size32_t AzureFileReadIO::read(offset_t pos, size32_t len, void * data) + +size32_t AzureFileIO::read(offset_t pos, size32_t len, void * data) { - if (pos > file->fileSize) + offset_t fileSize = file->size(); + if (pos > fileSize) return 0; - if (pos + len > file->fileSize) - len = file->fileSize - pos; + if (pos + len > fileSize) + len = fileSize - pos; if (len == 0) return 0; - size32_t sizeRead = 0; - offset_t lastOffset = pos + len; - - // MORE: Do we ever read file IO from more than one thread? I'm not convinced we do, and the critical blocks waste space and slow it down. - //It might be worth revisiting (although I'm not sure what effect stranding has) - CriticalBlock block(cs); - for(;;) - { - //Check if part of the request can be fulfilled from the current read block - if (pos >= startResultOffset && pos < endResultOffset) - { - size_t copySize = ((lastOffset > endResultOffset) ? endResultOffset : lastOffset) - pos; - size_t extractedSize = extractDataFromResult((pos - startResultOffset), copySize, data); - assertex(copySize == extractedSize); - pos += copySize; - len -= copySize; - data = (byte *)data + copySize; - sizeRead += copySize; - if (len == 0) - return sizeRead; - } - -#ifdef TEST_AZURE_PAGING - offset_t readSize = azureReadRequestSize; -#else - offset_t readSize = (len > azureReadRequestSize) ? len : azureReadRequestSize; -#endif - - offset_t contentSize = file->readBlock(contents, stats, pos, readSize); - //If the results are inconsistent then do not loop forever - if (contentSize == 0) - return sizeRead; - - startResultOffset = pos; - endResultOffset = pos + contentSize; - } + return file->read(pos, len, data, stats); } -offset_t AzureFileReadIO::size() +offset_t AzureFileIO::size() { return file->size(); } -size_t AzureFileReadIO::extractDataFromResult(size_t offset, size_t length, void * target) -{ - if (offset>=contents.length()) - return 0; - const byte * base = (byte *)(contents.bufferBase())+offset; - unsigned len = std::min(length, contents.length()-offset); - memcpy(target, base, len); - return len; -} - -unsigned __int64 AzureFileReadIO::getStatistic(StatisticKind kind) +unsigned __int64 AzureFileIO::getStatistic(StatisticKind kind) { return stats.getStatistic(kind); } -unsigned __int64 FileIOStats::getStatistic(StatisticKind kind) +//--------------------------------------------------------------------------------------------------------------------- + + +AzureFileReadIO::AzureFileReadIO(AzureFile * _file, const FileIOStats & _firstStats) +: AzureFileIO(_file, _firstStats) { - switch (kind) - { - case StCycleDiskReadIOCycles: - return ioReadCycles.load(); - case StCycleDiskWriteIOCycles: - return ioWriteCycles.load(); - case StTimeDiskReadIO: - return cycle_to_nanosec(ioReadCycles.load()); - case StTimeDiskWriteIO: - return cycle_to_nanosec(ioWriteCycles.load()); - case StSizeDiskRead: - return ioReadBytes.load(); - case StSizeDiskWrite: - return ioWriteBytes.load(); - case StNumDiskReads: - return ioReads.load(); - case StNumDiskWrites: - return ioWrites.load(); - } - return 0; } //--------------------------------------------------------------------------------------------------------------------- AzureFileWriteIO::AzureFileWriteIO(AzureFile * _file) -: file(_file) +: AzureFileIO(_file) { } @@ -417,16 +356,11 @@ void AzureFileWriteIO::flush() { } -unsigned __int64 AzureFileWriteIO::getStatistic(StatisticKind kind) -{ - return stats.getStatistic(kind); -} - //--------------------------------------------------------------------------------------------------------------------- AzureFileAppendBlobWriteIO::AzureFileAppendBlobWriteIO(AzureFile * _file) : AzureFileWriteIO(_file) { - file->createAppendBlob(); + file->createBlockBlob(); } void AzureFileAppendBlobWriteIO::close() @@ -501,45 +435,6 @@ static bool isBase64Char(char c) return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || (c == '+') || (c == '/') || (c == '='); } -static std::shared_ptr getCredentials(const char * accountName, const char * key) -{ - //MORE: The client should be cached and shared between different file access - implement when secret storage is added. - StringBuffer keyTemp; - if (!accountName) - accountName = getenv("AZURE_ACCOUNT_NAME"); - if (!key) - { - key = getenv("AZURE_ACCOUNT_KEY"); - if (!key) - { - StringBuffer secretName; - secretName.append("azure-").append(accountName); - getSecretValue(keyTemp, "storage", secretName, "key", true); - //Trim trailing whitespace/newlines in case the secret has been entered by hand e.g. on bare metal - size32_t len = keyTemp.length(); - for (;;) - { - if (!len) - break; - if (isBase64Char(keyTemp.charAt(len-1))) - break; - len--; - } - keyTemp.setLength(len); - key = keyTemp.str(); - } - } - try - { - return std::make_shared(accountName, key); - } - catch (const Azure::Core::RequestFailedException& e) - { - IException * error = makeStringExceptionV(-1, "Azure access: %s (%d)", e.ReasonPhrase.c_str(), static_cast(e.StatusCode)); - throw error; - } -} - static std::string getContainerUrl(const char *account, const char * container) { std::string url("https://"); @@ -554,59 +449,153 @@ static std::string getBlobUrl(const char *account, const char * container, const AzureFile::AzureFile(const char *_azureFileName) : fullName(_azureFileName) { - const char * filename = fullName + strlen(azureFilePrefix); - if (filename[0] != '/' || filename[1] != '/') - throw makeStringException(99, "// missing from azure: file reference"); + if (startsWith(fullName, azureBlobPrefix)) + { + //format is azureblob:plane[/device]/path + const char * filename = fullName + strlen(azureBlobPrefix); + const char * slash = strchr(filename, '/'); + if (!slash) + throw makeStringException(99, "Missing / in azureblob: file reference"); - //Allow the access key to be provided after the // before a @ i.e. azure://:@... - filename += 2; + StringBuffer planeName(slash-filename, filename); + Owned plane = getStoragePlane(planeName); + const char * api = plane->queryProp("storageapi/@type"); + if (!api) + throw makeStringExceptionV(99, "No storage api defined for plane %s", planeName.str()); - //Allow the account and key to be quoted so that it can support slashes within the access key (since they are part of base64 encoding) - //e.g. i.e. azure://':'@... - StringBuffer accessExtra; - if (filename[0] == '"' || filename[0] == '\'') - { - const char * endQuote = strchr(filename + 1, filename[0]); - if (!endQuote) - throw makeStringException(99, "access key is missing terminating quote"); - accessExtra.append(endQuote - (filename + 1), filename + 1); - filename = endQuote+1; - if (*filename != '@') - throw makeStringException(99, "missing @ following quoted access key"); - filename++; - } + constexpr size_t lenPrefix = strlen(azureBlobPrefix); + if ((strncmp(api, azureBlobPrefix, lenPrefix-1) != 0) || api[lenPrefix-1] != ':') + throw makeStringExceptionV(99, "Storage api for plane %s is not azureblob", planeName.str()); - const char * at = strchr(filename, '@'); - const char * slash = strchr(filename, '/'); - assertex(slash); // could probably relax this.... + unsigned numDevices = plane->getPropInt("@numDevices", 1); + if (numDevices != 1) + { + if (slash[1] != 'd') + throw makeStringExceptionV(99, "Expected a device number in the filename %s", fullName.str()); - //Possibly pedantic - only spot @s before the first leading / - if (at && (!slash || at < slash)) - { - accessExtra.append(at - filename, filename); - filename = at+1; - } + char * endDevice = nullptr; + unsigned device = strtod(slash+2, &endDevice); + if ((device == 0) || (device > numDevices)) + throw makeStringExceptionV(99, "Device %d out of range for plane %s", device, planeName.str()); - if (accessExtra) + if (!endDevice || (*endDevice != '/')) + throw makeStringExceptionV(99, "Unexpected end of device partition %s", fullName.str()); + + VStringBuffer childPath("containers[%d]", device-1); + IPropertyTree * deviceInfo = plane->queryPropTree(childPath); + if (deviceInfo) + { + accountName.set(deviceInfo->queryProp("@account")); + secretName.set(deviceInfo->queryProp("@secret")); + } + + //If device-specific information is not provided all defaults come from the storage plane + if (!accountName) + accountName.set(plane->queryProp("@account")); + if (!secretName) + secretName.set(plane->queryProp("@secret")); + + filename = endDevice+1; + } + else + { + accountName.set(plane->queryProp("@account")); + secretName.set(plane->queryProp("@secret")); + filename = slash+1; + } + + if (isEmptyString(accountName) || isEmptyString(secretName)) + throw makeStringExceptionV(99, "Missing container or secret name for plane %s", planeName.str()); + + //I am not at all sure we need to split this apart, only to join in back together again. + slash = strchr(filename, '/'); + assertex(slash); // could probably relax this.... + containerName.set(filename, slash-filename); + blobName.set(slash+1); + } + else if (startsWith(fullName, azureFilePrefix)) { - const char * colon = strchr(accessExtra, ':'); - if (colon) + const char * filename = fullName + strlen(azureFilePrefix); + if (filename[0] != '/' || filename[1] != '/') + throw makeStringException(99, "// missing from azure: file reference"); + + //Allow the access key to be provided after the // before a @ i.e. azure://:@... + filename += 2; + + //Allow the account and key to be quoted so that it can support slashes within the access key (since they are part of base64 encoding) + //e.g. i.e. azure://':'@... + StringBuffer accessExtra; + if (filename[0] == '"' || filename[0] == '\'') { - accountName.set(accessExtra, colon-accessExtra); - accountKey.set(colon+1); + const char * endQuote = strchr(filename + 1, filename[0]); + if (!endQuote) + throw makeStringException(99, "access key is missing terminating quote"); + accessExtra.append(endQuote - (filename + 1), filename + 1); + filename = endQuote+1; + if (*filename != '@') + throw makeStringException(99, "missing @ following quoted access key"); + filename++; } - else - accountName.set(accessExtra); // Key is retrieved from the secrets + + const char * at = strchr(filename, '@'); + const char * slash = strchr(filename, '/'); + assertex(slash); // could probably relax this.... + + //Possibly pedantic - only spot @s before the first leading / + if (at && (!slash || at < slash)) + { + accessExtra.append(at - filename, filename); + filename = at+1; + } + + if (accessExtra) + { + const char * colon = strchr(accessExtra, ':'); + if (colon) + { + accountName.set(accessExtra, colon-accessExtra); + secretName.set(colon+1); + } + else + { + accountName.set(accessExtra); // Key is retrieved from the secrets + secretName.set("azure-").append(accountName); + } + } + containerName.set(filename, slash-filename); + blobName.set(slash+1); } + else + throw makeStringExceptionV(99, "Unexpected prefix on azure filename %s", fullName.str()); - containerName.set(filename, slash-filename); - blobName.set(slash+1); blobUrl = ::getBlobUrl(accountName, containerName, blobName); } std::shared_ptr AzureFile::getCredentials() const { - return ::getCredentials(accountName, accountKey); + StringBuffer key; + getSecretValue(key, "storage", secretName, "key", true); + //Trim trailing whitespace/newlines in case the secret has been entered by hand e.g. on bare metal + size32_t len = key.length(); + for (;;) + { + if (!len) + break; + if (isBase64Char(key.charAt(len-1))) + break; + len--; + } + key.setLength(len); + + try + { + return std::make_shared(accountName.str(), key.str()); + } + catch (const Azure::Core::RequestFailedException& e) + { + IException * error = makeStringExceptionV(-1, "Azure access: %s (%d)", e.ReasonPhrase.c_str(), static_cast(e.StatusCode)); + throw error; + } } std::string AzureFile::getBlobUrl() const @@ -731,21 +720,20 @@ bool AzureFile::getTime(CDateTime * createTime, CDateTime * modifiedTime, CDateT return false; } -offset_t AzureFile::readBlock(MemoryBuffer & contents, FileIOStats & stats, offset_t from, offset_t length) +offset_t AzureFile::read(offset_t pos, size32_t len, void * data, FileIOStats & stats) { CCycleTimer timer; auto blockBlobClient = getClient(); Azure::Storage::Blobs::DownloadBlobToOptions options; options.Range = Azure::Core::Http::HttpRange(); - options.Range.Value().Offset = from; - options.Range.Value().Length = length; - contents.ensureCapacity(length); - uint8_t * buffer = reinterpret_cast(contents.bufferBase()); + options.Range.Value().Offset = pos; + options.Range.Value().Length = len; + uint8_t * buffer = reinterpret_cast(data); long int sizeRead = 0; try { - Azure::Response result = blockBlobClient->DownloadTo(buffer, length, options); + Azure::Response result = blockBlobClient->DownloadTo(buffer, len, options); Azure::Core::Http::HttpRange range = result.Value.ContentRange; if (range.Length.HasValue()) sizeRead = range.Length.Value(); @@ -1106,7 +1094,9 @@ class AzureFileHook : public CInterfaceOf protected: static bool isAzureFileName(const char *fileName) { - if (!startsWith(fileName, azureFilePrefix)) + if (startsWith(fileName, azureBlobPrefix)) + return true; + if (startsWith(fileName, azureFilePrefix)) return false; const char * filename = fileName + strlen(azureFilePrefix); const char * slash = strchr(filename, '/'); diff --git a/helm/hpcc/values.schema.json b/helm/hpcc/values.schema.json index 1d853f13619..1dd0366e51b 100644 --- a/helm/hpcc/values.schema.json +++ b/helm/hpcc/values.schema.json @@ -714,7 +714,6 @@ } }, "additionalProperties": false, - "required": [ "name" ] } } }, From 83c53468d4cb9eb9958f0b713e8637a72e520294 Mon Sep 17 00:00:00 2001 From: Jack Del Vecchio Date: Thu, 14 Nov 2024 10:53:22 -0500 Subject: [PATCH 2/2] HPCC-32723 Add support for storage planes using the Azure blob api --- common/remote/hooks/azure/azurefile.cpp | 17 ++++++++++------- dali/base/dautils.cpp | 9 +++++++++ helm/hpcc/values.schema.json | 2 +- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/common/remote/hooks/azure/azurefile.cpp b/common/remote/hooks/azure/azurefile.cpp index 5a2797dae99..f92d0fec77d 100644 --- a/common/remote/hooks/azure/azurefile.cpp +++ b/common/remote/hooks/azure/azurefile.cpp @@ -360,7 +360,7 @@ void AzureFileWriteIO::flush() AzureFileAppendBlobWriteIO::AzureFileAppendBlobWriteIO(AzureFile * _file) : AzureFileWriteIO(_file) { - file->createBlockBlob(); + file->createAppendBlob(); } void AzureFileAppendBlobWriteIO::close() @@ -404,7 +404,7 @@ size32_t AzureFileAppendBlobWriteIO::write(offset_t pos, size32_t len, const voi AzureFileBlockBlobWriteIO::AzureFileBlockBlobWriteIO(AzureFile * _file) : AzureFileWriteIO(_file) { - file->createAppendBlob(); + file->createBlockBlob(); } void AzureFileBlockBlobWriteIO::close() @@ -459,12 +459,15 @@ AzureFile::AzureFile(const char *_azureFileName) : fullName(_azureFileName) StringBuffer planeName(slash-filename, filename); Owned plane = getStoragePlane(planeName); + if (!plane) + throw makeStringExceptionV(99, "Unknown storage plane %s", planeName.str()); + const char * api = plane->queryProp("storageapi/@type"); if (!api) throw makeStringExceptionV(99, "No storage api defined for plane %s", planeName.str()); - constexpr size_t lenPrefix = strlen(azureBlobPrefix); - if ((strncmp(api, azureBlobPrefix, lenPrefix-1) != 0) || api[lenPrefix-1] != ':') + StringBuffer azureBlobAPI(strlen(azureBlobPrefix) - 1, azureBlobPrefix); + if (!strieq(api, azureBlobAPI.str())) throw makeStringExceptionV(99, "Storage api for plane %s is not azureblob", planeName.str()); unsigned numDevices = plane->getPropInt("@numDevices", 1); @@ -624,8 +627,8 @@ bool AzureFile::createDirectory() { Azure::Response result = blobContainerClient->CreateIfNotExists(); if (result.Value.Created==false) - OERRLOG("Azure create container: container not created"); - return result.Value.Created; + DBGLOG("AzureFile::createDirectory: container not created because it already exists"); + return true; } catch (const Azure::Core::RequestFailedException& e) { @@ -1096,7 +1099,7 @@ class AzureFileHook : public CInterfaceOf { if (startsWith(fileName, azureBlobPrefix)) return true; - if (startsWith(fileName, azureFilePrefix)) + if (!startsWith(fileName, azureFilePrefix)) return false; const char * filename = fileName + strlen(azureFilePrefix); const char * slash = strchr(filename, '/'); diff --git a/dali/base/dautils.cpp b/dali/base/dautils.cpp index 5864e62676c..7ad40dfd672 100644 --- a/dali/base/dautils.cpp +++ b/dali/base/dautils.cpp @@ -1484,6 +1484,15 @@ StringBuffer &CDfsLogicalFileName::getGroupName(StringBuffer &grp) const if (e) grp.append(e-s,s); } + else + { + const char *s = skipScope(lfn,PLANE_SCOPE); + if (s) { + const char *e = strstr(s,"::"); + if (e) + grp.append(e-s,s); + } + } } return grp; } diff --git a/helm/hpcc/values.schema.json b/helm/hpcc/values.schema.json index 1dd0366e51b..0caa33ccd2f 100644 --- a/helm/hpcc/values.schema.json +++ b/helm/hpcc/values.schema.json @@ -713,7 +713,7 @@ "type" : "string" } }, - "additionalProperties": false, + "additionalProperties": false } } },