From c3cc3a26721486345f87de5d64d6f4cafadbfe42 Mon Sep 17 00:00:00 2001 From: Greg Eisenhauer Date: Wed, 30 Oct 2024 07:20:12 -0400 Subject: [PATCH] Start developer_docs directory with some content (#4385) --- developer_docs/bp5format.md | 310 +++++++++++++++++++++ developer_docs/bp5reader.md | 307 ++++++++++++++++++++ source/adios2/toolkit/format/bp5/BP5Base.h | 269 +----------------- 3 files changed, 622 insertions(+), 264 deletions(-) create mode 100644 developer_docs/bp5format.md create mode 100644 developer_docs/bp5reader.md diff --git a/developer_docs/bp5format.md b/developer_docs/bp5format.md new file mode 100644 index 0000000000..9cfce39b8e --- /dev/null +++ b/developer_docs/bp5format.md @@ -0,0 +1,310 @@ +# BP5 Metadata Marshaling, writer-side focus + +BP5 Metadata Marshalling is based upon FFS, which provides the ability +to serialize a C-style pointer-based data structure (starting with a +base struct) and to deserialize it in-place on the receiving side. +This is what we'll do to encode BP5 Metadata, create a custom C-style +struct on the writer side and then use FFS to make that same struct +available to the reader. + +Normally, in order to use FFS, an application must fully describe +the base structure using an FMFieldList, where each element +describes a field in the structure, including the field's name, +basic type (integer, float, etc.), size and offset from the start +of the structure. In "normal" scenarios, like in SST this is +straightforward because we're describing a structure that exists +at compile-time and all of those things are compile-time static. +However, ADIOS metadata represents information about variables +that we don't know about until run-time, so if we're going to use +FFS here, things have to be a bit more dynamic. In particular, +we'll represent ADIOS metadata with a "virtual" structure, one +whose description we'll construct on the fly and which will only +ever exist virtually, making up offsets as we go. 
We just have to +be careful about keeping things aligned appropriately because we +want this to land on the receiver and be appropriately aligned +there. (Normally the compiler takes care of this, but this +virtual structure is never seen by a compiler, so we're doing it.) +The field name that we specify to FFS is also important because we +use it to communicate a lot of information between writer and +reader. While it always contains the variable name, it also +encodes the variable type (local or global, atomic or array, +compressed, derived, etc.). Because the variable name only +appears in the metametadata (ffs format), this is a great place to +put more static information about the variable, specifically +anything that is fixed after definition and doesn't change on a +per-timestep basis. More on names later. + +To accomplish managing the structure on the writer side, we +principally track two things, the FMFieldList that represents the +description of the virtual struct, and a malloc'd region where we +build the virtual struct itself. While the description is +interpreted by FFS, the most important thing for BP5 to remember +is this field's offset because that's where the (meta)data will +go. When we Marshal a simple atomic value (local or global), we +calculate an appropriately aligned new offset in the buffer, add +to the FMFieldList (maintained in Info.MetaFields on the writer) +and copy the data into the virtual field at that offset in the +buffer. On future timesteps, the field already exists, so we just +use the offset and copy the data into the buffer. Arrays are a +bit more complex, but lets start with the simple case. FFS +supports substructures, I.E. fields which themselves are a +structure and we use that feature for all array representations. 
+There are several things that may change on a per-timestep basis +for arrays, including Shape, Count and Offset values (which are +themselves arrays), and we also need to track the location of the +related data block (offset in this rank's data segment). Except +for Shape (which we assume is set for at least this timestep), all +of these things are per-block. + +Back to FFS capabilities for a moment. FFS's pointer-based +structures include dynamically-sized arrays, and the size of those +arrays must be specified by an integer-typed field in that +structure. There are three different array lengths required here. +Shape is of length Dims (how many dimensions the array has), +DataBlockLocation is of length BlockCount (how many blocks were +written on this rank), and for Count and Offsets we must have +those per-block, so the length is Dims*BlockCount. To satisfy +FFS's constraints, that means we must have integer fields +representing all three lengths in the array metadata struct, and +we need pointers to the dynamic arrays representing Shape, Count, +Offsets, and DataBlockLocation. These are the BASE_FIELDS below +and the FFS FMField entries are BASE_FIELD_ENTRIES in BP5Base.cpp. 
+``` +#define BASE_FIELDS \ + size_t Dims; /* How many dimensions does this array have */ \ + size_t BlockCount; /* How many blocks are written */ \ + size_t DBCount; /* Dimens * BlockCount */ \ + size_t *Shape; /* Global dimensionality [Dims] NULL for local */ \ + size_t *Count; /* Per-block Counts [DBCount] */ \ + size_t *Offsets; /* Per-block Offsets [DBCount] NULL for local */ \ + size_t *DataBlockLocation; /* Per-block Offset in PG [BlockCount] */ +``` +``` +#define BASE_FIELD_ENTRIES \ + {"Dims", "integer", sizeof(size_t), FMOffset(BP5Base::MetaArrayRec *, Dims)}, \ + {"BlockCount", "integer", sizeof(size_t), FMOffset(BP5Base::MetaArrayRec *, BlockCount)}, \ + {"DBCount", "integer", sizeof(size_t), FMOffset(BP5Base::MetaArrayRec *, DBCount)}, \ + {"Shape", "integer[Dims]", sizeof(size_t), FMOffset(BP5Base::MetaArrayRec *, Shape)}, \ + {"Count", "integer[DBCount]", sizeof(size_t), FMOffset(BP5Base::MetaArrayRec *, Count)}, \ + {"Offset", "integer[DBCount]", sizeof(size_t), \ + FMOffset(BP5Base::MetaArrayRec *, Offsets)}, \ + {"DataBlockLocation", "integer[BlockCount]", sizeof(size_t), \ + FMOffset(BP5Base::MetaArrayRec *, DataBlockLocation)}, +``` +While more complex array metadata entries are necessary, these +must be the first fields in those structures. While there can't +be a static struct declaration for all of the metadata, there is a +static declaration for the array metadata substructure, +`MetaArrayRec` below. +``` + typedef struct _MetaArrayRec + { + BASE_FIELDS + } MetaArrayRec; +``` +Mostly you'll see this used like this: +``` +MetaArrayRec *MetaEntry = (MetaArrayRec *)((char *)(MetadataBuf) + Rec->MetaOffset); +``` +This gives us a nice way of accessing the key fields in an array's +metadata entry. + +So, what about more complex arrays? All of our compression +operators require the length of the compressed field as input to +the uncompress operator.
Generally we don't include data block +length as part of metadata because it's easily calculated from the +Count values and the length of the data type, but in order to +support compression we have to communicate it from the writer to +the reader so we can uncompress. Therefore every field with an +operator has as its next field (after BASE_FIELDS) DataBlockSize. +Like DataBlockLocation, this is per block (and so its FFS +description also uses BlockCount). This arrangement is +represented by the `struct MetaArrayRecOperator` below. Note that +BP5 does not itself use the DataBlockSize in the metadata. The +size of the compressed data is returned from the compression +operator, and is used by BP5 to copy that data into the data +block, but after that it is only passed to the Uncompress operator +on the receiving side, so operators like MGard may choose to use +this differently. +``` + typedef struct _MetaArrayRecOperator + { + BASE_FIELDS + size_t *DataBlockSize; // Per-block Lengths [BlockCount] + } MetaArrayRecOperator; +``` +The last case is arrays that also have Min/Max stats associated +with them. Since this can be combined with operators, that gives +us two more possible structs for array metadata, a plain array +with Min/Max or an array with an operator and Min/Max, these are +represented by the structs `MetaArrayRecMM` and +`MetaArrayRecOperatorMM` below. Note that MinMax in that struct is +a `char*`, but obviously the data type of Min/Max depends upon the +element type of the array. How does that work? The actual size +in bytes of the MinMax array is `BlockCount * sizeof(array element) * 2`, but in order to avoid introducing yet another integer-typed +size value into the structure we've gone to some effort in order +to leverage the existing BlockCount value. In particular, there +are a number of FMField lists for the MM and OperatorMM arrays, +each giving FFS a different element size for the MinMax Array.
+ADIOS types of size 1 use `MetaArrayRecMM1List`, those of size 2 use +`MetaArrayRecMM2List`, etc., up to `MetaArrayRecMM16List`, which would +be used by long double. Note that BP5 doesn't define or support +MinMax for string, complex, or structure types. +``` + typedef struct _MetaArrayRecMM + { + BASE_FIELDS + char *MinMax; // char[TYPESIZE][BlockCount] varies by type + } MetaArrayRecMM; + + typedef struct _MetaArrayRecOperatorMM + { + BASE_FIELDS + size_t *DataBlockSize; // Per-block Lengths [BlockCount] + char *MinMax; // char[TYPESIZE][BlockCount] varies by type + } MetaArrayRecOperatorMM; +``` +For each of the array variations above, when we add the field +associated with that array to the metadata field list, we specify +the appropriate FieldList in the FFS "field_type" value, and +allocate space for the relevant structure in the virtual metadata +struct we're building. (Example MetaArrayRecOperatorMM8List below.) +``` +static FMField MetaArrayRecOperatorMM8List[] = { + BASE_FIELD_ENTRIES + {"DataBlockSize", "integer[BlockCount]", sizeof(size_t), + FMOffset(BP5Base::MetaArrayRecOperator *, DataBlockSize)}, + {"MinMax", "char[16][BlockCount]", 1, FMOffset(BP5Base::MetaArrayRecOperatorMM *, MinMax)}, + {NULL, NULL, 0, 0}}; +``` +We mentioned field names above, we actually encode a lot of +information into the FFS field names, including the variable name, +shape, element_size, ADIOS type, any operator that might be +applied, the name of the substructure (if the array is a struct +type), and even the expression that is to be used for derived +variables. These are all encoded in different ways, for example +the basic shape of the variable is encoded in the three letter +prefix of the FFS fieldname: GlobalValue = "BPg", GlobalArray = +"BPG", JoinedArray = "BPJ", LocalValue = "BPl", LocalArray = "BPL".
+The details of the encoding are buried in the logic, but the important +bit is knowing that there's a lot of information there and some of +it (like the expression) is base64 encoded to avoid having special +characters in the FFS field name. From the BP5 point of view, +anything that can be encoded in the field name is a good thing +because it travels in the metametadata, not the metadata, so it +only gets moved around if the field set changes. + +Speaking of changes, there are some details that are omitted above +to get the main points across, but let's talk about other details. +First, when you put a first block of an array, we fill out the +Dims field, init BlockCount to 1, DBCount (the `Dims*BlockCount` +value) to Dims and then we malloc memory to hold a copy of the +Shape, Count and Offset values. (We need to copy these anyway as +part of serialization as they must be captured at the time of Put, +so we can't, say, just reference the values in the VariableBase +class.) For LocalArrays, the Shape value stays at a NULL pointer, +as does the Start value. If after the first there's another Put() +on that variable, we add 1 to BlockCount, increment DBCount by +Dims, and realloc() the Count and Offset arrays so that we can add +the new Count and Offset values after the ones that are already +there. This means that the Count values for block 1 start at +`Count[Dims]`, for block 2 they start at `Count[2*Dims]`, etc. At the +end of the timestep after using FFSencode() to serialize the +metadata, `FMfree_var_rec_elements()` is used to free() all these +subarrays that we've malloc'd. It understands the structure of +our entire Metadata structure, walks the field list and +deallocates appropriately. Once this has been done, we can +memset() the whole metadata structure back to zeros and we're +ready to start again. (All pointers NULL and counts are zero.)
+ +When we do start again with the next timestep, we don't start from +scratch with a new Fieldlist and virtual structure, but instead +try to reuse the old one. The anticipation is that step-based HPC +applications are highly regular and the set of variables that are +output on step N+1 are likely the same as what they output for +step N. So when we get a Put() for a variable, we look up its +entry in internal bookkeeping and if it has an entry in the +structure we reuse it, putting the appropriate data in the virtual +structure as described above. This is fine if we write the exact +same set of variables in subsequent steps, but what if we don't? +Well, if we write a new variable, then the procedure above +happens, but we also take steps to make sure that we generate new +MetaMetaData (I.E. re-register the format with FFS). We do this +by setting the Info.MetaFormat value to NULL. + +Handling a non-written variable is done differently. We don't +really want to bear the cost of new MetaMetaData frequently +(because MetaMetaData can be big), so instead we're willing to +bear the costs of not using some of the data in the virtual +structure. So if the app Puts an atomic variable on timestep N, +but skips it on N+1, we essentially leave that fraction of the +metadata buffer unused in N+1. It's transmitted or stored, but it +doesn't contain anything useful. But the reader still needs to +know that it wasn't written, so BP5 metadata carries with it a +bitmap showing if a variable that is part of the metadata has +actually been written and is valid. This bitmap, contained in the +BitField[BitFieldCount] fields in the MetadataFieldList is the +ultimate authority as to what has been written. Variables are +assigned an index in order when they are first entered into +metadata and if the bit at that index isn't set, that variable +wasn't written on that timestep. + +Now, this does bring up a vulnerability with BP5. 
If an application +were to write a lot of variables on one step and then never use them +again, we might end up with a big metadata block that mostly carried +unused (junk) bytes. We have not yet run into this in a real +application, so it isn't specifically handled. In an ideal world, one +would look at the "occupancy rate" of metadata in EndStep() and make +a decision that for either this timestep or the next, we'd start from +scratch with an empty field list. There's a tradeoff here. Do this +too often and we've got big MetaMetadata costs, do it too little and +our metadata has a lot of useless bytes. Future work. Note that this +is mostly a writer-side thing to fix/optimize. The reader will +appropriately handle new metadata, including new metametadata. + +The stuff above applies to ADIOS variables, but attributes are always +handled separately. In the initial FFS-marshalling implementation, +Attributes, while separate, were handled very similarly to variables. +That is, there was a field list and virtual structure maintained where +we entered attributes much like Global and local values are described +above. There was a metametadata generated for it and it was moved +around like other metametadata blocks. This old way of doing things +is still present in the code and gets used if `MarshalAttribute()` is +called by the engine. Engines that use this marshall all attributes +in `EndStep()`, calling MarshalAttribute for all attributes and only +doing this when some attribute has changed. The resulting Attribute +data always contains **all** the current attribute values, a situation +that works out well for engines like SST where readers might join +after timestep 0. The SST writer can save the most recent Attribute +data block and provide it to a newly-joined reader so that it has all +available attributes. + +However, this encoding mechanism has some significant disadvantages +under almost all situations.
This separation of metametadata and +metadata was designed for Variables, where the set of variables was +likely to be reused without changes repeatedly. However, attributes +aren't like that, particularly in the original situation where +attributes once set can never change. Then we're only doing this when +we add an attribute, we're always generating new MetaMetadata whenever +we have a change, and MetaMetadata + Metadata size is always going to +be bigger than some simpler encoding mechanism. So, BP5 file engine +now does things differently. It calls OnetimeMarshalAttribute() which +uses a simpler FFS representation for attributes with the attribute +"name" being part of the data, not part of the metametadata as it is +with variables. This means that the metametadata never changes, so we +don't have the same issues as with the prior approach. That +metametadata struct (BP5AttrStruct) describes a relatively simple +structure with two lists, one for attributes of any non-string type, +and the other a list of string and array-of-string attributes. +Generally we only want attributes to appear here when they change, so +the BP5Writer calls OnetimeMarshalAttribute whenever it gets the +NotifyEngineAttribute call (whenever an attribute changes). However +it also gets called in BeginStep if that step is the first ever +called, because some attributes may have been defined before the +engine was ever created. In BP5 file, attribute blocks then only +ever contain an attribute once, unless the attribute changes in which +case it will appear again. This is not such a good situation for SST +because of the late-coming-reader issue, so that still uses the old +marshaling mechanism.
+ + diff --git a/developer_docs/bp5reader.md b/developer_docs/bp5reader.md new file mode 100644 index 0000000000..ca64f074f4 --- /dev/null +++ b/developer_docs/bp5reader.md @@ -0,0 +1,307 @@ +# BP5 Metadata handling, reader-side focus + +This document is to be read in the context of [BP5 Metadata +Marshalling](bp5format.md), which covers metadata creation in BP5. + +BP5 Metadata overall setup includes MetaMetaData (which is just an FFS +Format Block, essentially a marshalled version of the Metadata +FieldList created in [BP5 Metadata Marshalling](bp5format.md)) and +Metadata block itself (I.E. the result of FFSEncoding the virtual +structure created in [BP5 Metadata Marshalling](bp5format.md)). + +First, some FFS basics. You'll notice that InstallMetaMetaData in +BP5Deserializer.cpp mostly just consists of some copying and a call to +load_external_format_FMcontext(). This just loads the format +information (I.E. the marshalled version of the Metadata FieldList) +into FFS. This is a necessary first step for deserializing metadata, +however MetaMetaData is also used for Attributes (at least for the original +version of attribute encoding where they were done with a custom +structure (fields named for the attributes) rather than a generic one +(attribute names in data)). Because of this we don't process +MetaMetaData on installation, but wait to see how it is used. + +Next, lets look at the start of InstallMetaData. This basically takes +in an encoded metadata block and does everything necessary to setup +newly read variables, etc. The first part of this is "re-inflating" +the virtual metadata structure from its encoded form to something just +like it was on the writer, a C-style structure with pointers. Note +that while these are all valid pointers, this is not a classic C +structure where each pointed-to entity is separately malloc'd. That +would be terribly inefficient. Instead FFS keeps this as a single +data block but with internal pointers.
Trying to free() them +individually would not go well. Note that this is true whether we can +decode an incoming block *in-place* or not. + +That may require some explanation. FFS' goal is to efficiently move +pointer-based structures from one memory space to another. In order +to do that, it doesn't do the classic thing, copying each field +individually into the encode buffer. Instead it copies the base +structure into the encode buffer, followed by the things pointed to by +fields in the base structure, then recursively down the data structure +until everything is in the buffer. As this happens, pointers in +copied structures are turned into the integer offset of the pointed-to +copy, and all copied structures are appropriately aligned within the +encode buffer so that hopefully when they "land" in the receiving +memory space they'll have an appropriate alignment on that processor +too. However, this isn't always possible. For example when +transferring from a 32-bit machine to a 64-bit, lots of things change +including the size of pointers and the required alignment of data +types. FFS was designed for this situation, but heterogeneity isn't +what it used to be and a lot of that code hasn't been seriously +exercised in some time, which is why the FMlocalize_structs() call in +InstallMetaData() is commented out. Normally that call would take the +FMformatList from the encoding host, "localize" it to be suitable for +the decoding host, and then FFS would take care of the unpleasant +details. However at present the world is pretty uniformly 64-bit +little-endian and none of this should be necessary. The +localize_structs is commented out because for some reason that I +couldn't quite work out, FFS still thought it was necessary and that +change was the easiest way to avoid the problem. Should we support +32-bit architectures or this code survive to run on 128-bit +architectures, things will have to change. 
As it is, we should always +be following the `FFSdecode_in_place_possible() == TRUE` code path. + +Something useful to note: setting the environment variable +"BP5DumpMetadata" will cause the output of the raw incoming metadata +by the Deserializer. This might be a little ugly, but it can be useful. For example, this is the portion of output for the 'c32' variable in staging_common/TestCommonWrite: +``` +BPG_8_12_c32 = + BPG_8_12_c32 = +Dims = 1 ,BlockCount = 1 ,DBCount = 1 ,Shape = 0x11e817388 10 ,Count = 0x11e817390 10 ,Offset = 0x11e817398 0 ,DataBlockLocation = 0x11e8173a0 272 ,MinMax = NULL, , +, +``` +You see the field name with the "BPG" prefix indicating a global +array, element size of 8, ADIOS type of 12 (maps to complex float), +and the actual variable name at the end. Dims, BlockCount and DBCount +are all 1. The Shape, Count and Offset are arrays, so they are +represented by their base address (after decoding) followed by their +elements. DataBlockLocation is similar, showing the datablock at +offset 272. Finally there is no MinMax for complex, so that pointer +is NULL. + +Let's step back a bit. When using BP5, we expect the engine to provide +the Deserializer with all of the MetaMetaData, and then the Metadata +block from each rank. For the BP5 file reader in random access mode, +we also expect to be given the Metadata blocks for every step. We +don't need all the MetaMetaData up front _per se_, but we have to have +it _before_ any MetaData block that it was associated with. The +FFSdecode_* calls in InstallMetaData() produce what are essentially +copies of the metadata structure that was created in the writer, and +because we don't do the sort of metadata "merging" that BP3/4 did on +the writer side, we'll have a copy of the metadata from **EACH** +writer, and in BP5 file reader random access mode, also for each step.
+**These C-style pointer-based data structures are the core of +in-memory BP5 metadata.** We don't do aggregation, turn variable count +arrays into std::vector-based structures or really anything like +that. Instead most everything in BP5Deserializer is just support for +accessing those data structures as they are. NOTE: Remember that the +entry for an array variable in each of these blocks is a +`MetaArrayRec` as described in [BP5 Metadata +Marshalling](bp5format.md). That's a C structure with pointers to +the Shape, Count, Offsets, etc for all the blocks that were written on +that rank on that step. The problem? Each rank may have different +metadata structure and therefore the MetaArrayRec structure for +Variable X may live at a different offset in each MetaData block. So +coming up with the right offset to find a variable's data given rank +and step is key to making this work. + +The principal data structure that the BP5 deserializer maintains is +the BP5VarRec. This is BP5's internal per-variable record and it +matches one-to-one with a Variable class object in the IO, except that +the BP5VarRec is persistent for the life of the BP5Deserializer +object, where the Variable object may be deleted and recreated on +every timestep in streaming mode. Note that BP5 tries to be much more +careful than other engines about storing engine-specific information +in the shared IO and variable objects (I.E. it doesn't do it). In +order to maintain this separation, the deserializer maintains two maps +with which it associates IO Variable objects with their persistent +BP5VarRec entries, `VarByName` and `VarByKey`, which are indexed by +the variable name and by the Variable instance address, respectively. +One of these calls is often the first call upon entry to the +deserializer's public methods. Additionally, each BP5VarRec has a +VarNum field. These numbers are assigned sequentially starting with 0 +for the first Variable encountered when processing metadata.
The +VarNum is an important value used for indexing into various arrays. + +The details of the BP5VarRec entries are in BP5Deserializer.h. There +are too many entries to go through individually, but most are obvious +from code context, so here we'll focus on the creation and indexing +mechanisms that drive metadata use. BP5VarRec entries are created +during parsing of MetaMetaData entries (FFS Formats), which happens +the first time we encounter a MetaData entry that was encoded with +that MetaMetaData (FFS Format). This happens in the BuildControl() +routine which creates a ControlInfo struct for the MetaMetaData. The +ControlInfo struct looks like this: +``` + struct ControlInfo + { + FMFormat Format; + int ControlCount; + struct ControlInfo *Next; + std::vector<size_t> *MetaFieldOffset; + std::vector<size_t> *CIVarIndex; + struct ControlStruct Controls[1]; + }; +``` + +The Format field is essentially the MetaMetaID and is what this is +indexed by. I.E. when we get a new MetaData block, we determine its +Format and look up the ControlInfo struct, which tells us everything +we need to know about the MetaData block without parsing it. The +ControlCount is how many Variables are represented in this block and +the MetaFieldOffset gives us the starting offset of each one in the +MetaData block. Recall from [BP5 Metadata Marshalling](bp5format.md), +that that's either the offset of the atomic value, or for arrays the +offset of the MetaArrayRec structure. So, MetaFieldOffset[i] is the +offset of the i'th variable in this block. But that `i` index is of +the variables that are actually in this block, and it may not +correspond to the VarNum of that variable (which as per above is +assigned the first time we see a Variable), so the CIVarIndex maps +from the VarNum index to the i'th entry in this block.
+ +The Controls array is the per-variable entry in the ControlInfo struct +and it contains info directly parsed from the FMFieldList for this +entry plus a pointer to the VarRec that this is associated with: +``` + struct ControlStruct + { + int FieldOffset; + BP5VarRec *VarRec; + ShapeID OrigShapeID; + DataType Type; + int ElementSize; + }; +``` +Please forgive the C-style structs and code. Much of BP5 code was +derived from the C-based FFS marshaling method in SST. Not everything +was converted to a more C++ style. + +Now, let's first talk about the simple situation, non-random access +mode (I.E. step mode). In this situation if we have N ranks in the +MPI cohort, we expect to have InstallMetaData() called for each one, +and the BP5Deserializer keeps a simple vector m_MetadataBaseAddrs +indexed by rank number. Each VarRec also has a +PerWriterMetaFieldOffset array (filled in as we did each Install()), so +the address of the metadata for a particular variable from a +particular rank is basically +`m_MetadataBaseAddrs[Rank]+VarRec->PerWriterMetaFieldOffset[Rank]`. +You'll see this code in BP5Deserializer::GetMetadataBase(), with the +added protection that if `VarRec->PerWriterMetaFieldOffset[Rank] == +0`, that WriterRank didn't write that variable on that timestep. + +Random access mode, where we have the metadata for a bunch of steps in +memory at the same time, is vastly more complex. We didn't want to +mess up the speed and simplicity of the step-based mode, so this code +is split out in a separate `if` in most places, but let's step through +that branch in GetMetadataBase() because it hits on important points +in the BP5Deserializer code. The first few lines of this branch are: +``` + if (Step >= m_ControlArray.size() || WriterRank >= m_ControlArray[Step].size()) + { + return NULL; // we don't have this rank in this step + } +``` + +These are bounds checks. Like several other data structures in +BP5Deserializer, m_ControlArray is a vector of vectors.
The first +"dimension" here is the step, so the first predicate of this if checks +to see if the requested Step is larger than the size of m_ControlArray +which has entries for each step for which we have metadata. If it is +larger, we've got no metadata and return NULL. The second predicate +is maybe a little less obvious. It turns out that the number of +writer ranks contributing to a BP5 file is not necessarily constant. +It is constant for a single write session, but you can close a BP5 +file and reopen it in append mode with a different number of writers. +So the second "dimension" of the m_ControlArray is the number of +writer ranks that was in use for that step. If we're asking for the +metadata for a writer rank that is larger than what was used for that +step, we don't have it and return NULL. + +OK, the next bit: +``` + ControlInfo *CI = m_ControlArray[Step][WriterRank]; // writer control array + if (((*CI->MetaFieldOffset).size() <= VarRec->VarNum) || + ((*CI->MetaFieldOffset)[VarRec->VarNum] == 0)) + { + // Var does not appear in this record + return NULL; + } +``` + +`CI` here is the ControlInfo block for this WriterRank on this Step. +Like all FMFormats, it's really a template and lots of metadata blocks +likely have the same template, so this pointer is not unique, but it +is the template that applies to the metadata block for this Rank and +Step. But we have a couple more checks. MetaFieldOffset is indexed +by VarNum, and its size corresponds to the highest VarNum we had seen +at the time that this CI was produced (I.E. the corresponding +MetaMetaData was parsed). If the VarNum we're interested in is larger +than the MetaFieldOffset array, that Var was unknown when this was +parsed, therefore it's not in this CI. On the other hand, if the +VarNum was known, but simply didn't appear in this CI, the +MetaFieldOffset is 0, and we also don't have metadata here.
(Note +that there are headers like the BitField that appear first in +metadata, so a zero offset is never valid for a Var field.) + +OK, we've gotten to the point where we have a CI for this metadata +block and the template contains the variable we're interested in, +there's one more check: +``` + size_t CI_VarIndex = (*CI->CIVarIndex)[VarRec->VarNum]; + BP5MetadataInfoStruct *BaseData = + (BP5MetadataInfoStruct *)(*MetadataBaseArray[Step])[WriterRank]; + if (!BP5BitfieldTest(BaseData, (int)CI_VarIndex)) + { + // Var appears in CI, but wasn't written on this step + return NULL; + } +``` + +MetadataBaseArray, like m_ControlArray, is a vector of vectors, and it +contains a pointer to the metadata block for this rank/step (I.E. the +virtual structure that we build in [BP5 Metadata +Marshalling](bp5format.md)). We need to check to see if this variable, +while described in the MetaMetaData, was actually written on this +step, and to do that we have to check the bitfield. Because the +bitfield is indexed not by VarNum but by the index of the Variable in +that block, we first have to lookup that index using the CIVarIndex +vector in the CI. This is indexed by VarNum and maps it back to +CI_VarIndex. Given that and the address of the metadatablock, we use +BP5BitfieldTest to see if the variable was actually written on this +step and return NULL if not. + +Finally, we're done with checks and mapping. The address of whatever +metadata is associated with this variable on this step and rank is the +base address of the metadata block plus the MetadataFieldOffset: +``` + size_t MetadataFieldOffset = (*CI->MetaFieldOffset)[VarRec->VarNum]; + writer_meta_base = (MetaArrayRec *)(((char *)(*MetadataBaseArray[Step])[WriterRank]) + + MetadataFieldOffset); +``` + +GetMetadataBase() is the workhorse of BP5 reader-side metadata. The +ReadRandomAccess code path may seem like a lot, but mostly the most +complex operations there are indexes into arrays and adding offsets.
+It's got a lot of checks, but it runs pretty quickly. + +Most of the rest of BP5Deserializer is pretty straightforward if you +understand how GetMetadataBase() works, but there is one more +complexity that is somewhat the bane of BP5. Every time we mention +"Step" in random access mode above, we mean an absolute step number. +That is, we start with 0 at the writer's first Begin/EndStep and increment +by one on every subsequent Begin/EndStep (handling appending +appropriately by starting with the number of steps already in the +file). However, many (all?) things in the ADIOS random access mode API +have traditionally been in terms of "relative" steps. Relative steps +don't increment if the variable isn't written on that step. So if you +write 10 steps into a file, but only write variable X on the even +absolute steps (0,2,4,6,8), then BP metadata must show 5 steps for +that variable and they should be steps 0-4 (_FOR THAT VARIABLE_). So +if the user asks, for example, for the Shape of that variable on Step +4, we must internally map that RelativeStep specification to an +AbsoluteStep before applying the logic above. We've tried to use the +variable name RelStep when dealing with a relative step spec, but +there are probably places that have been missed. (Hopefully there's not +logic that has been missed too.) + +# BP5 Read logic \ No newline at end of file diff --git a/source/adios2/toolkit/format/bp5/BP5Base.h b/source/adios2/toolkit/format/bp5/BP5Base.h index d19be1df73..d4c290885c 100644 --- a/source/adios2/toolkit/format/bp5/BP5Base.h +++ b/source/adios2/toolkit/format/bp5/BP5Base.h @@ -19,264 +19,6 @@ #pragma warning(disable : 4250) #endif -/* - * BP5 Metadata Marshalling is based upon FFS, which provides the - * ability to serialize a C-style pointer-based data structure - * (starting with a base struct) and to deserialize it in-place on - * the receiving side. 
- * - * Normally, in order to use FFS, an application must fully describe - * the base structure using an FMFieldList, where each element - * describes a field in the structure, including the field's name, - * basic type (integer, float, etc.), size and offset from the start - * of the structure. In "normal" scenarios, like in SST this is - * straightforward because we're describing a structure that exists - * at compile-time and all of those things are compile-time static. - * However, ADIOS metadata represents information about variables - * that we don't know about until run-time, so if we're going to use - * FFS here, things have to be a bit more dynamic. In particular, - * we'll represent ADIOS metadata with a "virtual" structure, one - * whose description we'll construct on the fly and which will only - * ever exist virtually, making up offsets as we go. We just have to - * be careful about keeping things aligned appropriately because we - * want this to land on the receiver and be appropriately aligned - * there. (Normally the compiler takes care of this, but this - * virtual structure is never seen by a compiler, so we're doing it.) - * The field name that we specify to FFS is also important because we - * use it to communicate a lot of information between writer and - * reader. While it always contains the variable name, it also - * encodes the variable type (local or global, atomic or array, - * compressed, derived, etc.). Because the variable name only - * appears in the metametadata (ffs format), this is a great place to - * put more static information about the variable, specifically - * anything that is fixed after definition and doesn't change on a - * per-timestep basis. More on names later. - * - * To accomplish managing the structure on the writer side, we - * principally track two things, the FMFieldList that represents the - * description of the virtual struct, and a malloc'd region where we - * build the virtual struct itself. 
While the description is - * interpreted by FFS, the most important thing for BP5 to remember - * is this field's offset because that's where the (meta)data will - * go. When we Marshal a simple atomic value (local or global), we - * calculate an appropriately aligned new offset in the buffer, add - * to the FMFieldList (maintained in Info.MetaFields on the writer) - * and copy the data into the virtual field at that offset in the - * buffer. On future timesteps, the field already exists, so we just - * use the offset and copy the data into the buffer. Arrays are a - * bit more complex, but lets start with the simple case. FFS - * supports substructures, I.E. fields which themselves are a - * structure and we use that feature for all array representations. - * There are several things that may change on a per-timestep basis - * for arrays, including Shape, Count and Offset values (which are - * themselves arrays), and we also need to track the location of the - * related data block (offset in this rank's data segment). Except - * for Shape (which we assume is set for at least this timestep), all - * of these things are per-block. - * - * Back to FFS capabilities for a moment. FFS's pointer-based - * structures include dynamically-sized arrays, and the size of those - * arrays must be specified by an integer-typed field in that - * structure. There are three different array lengths required here. - * Shape is of length Dims (how many dimensions the array has), - * DataBlockLocation is of length BlockCount (how many blocks were - * written on this rank), and for Count and Offsets we must have - * those per-block, so the length is Dims*BlockCount. To satisfy - * FFS's constraints, that means we must have integer fields - * representing all three lengths in the array metadata struct, and - * we need pointers to the dynamic arrays representing Shape, Count, - * Offsets, and DataBlockLocation. 
These are the BASE_FIELDS below - * and the FFS FMField entries are BASE_FIELD_ENTRIES in BP5Base.cpp. - * While more complex arrays metadata entries are necessary, these - * must be the first fields in those structures. While there can't - * be a static struct declaration for all of the metadata, there is a - * static declaration for the array metadata substructure, - * MetaArrayRec below. Mostly you'll see this used like this: - * - * MetaArrayRec *MetaEntry = (MetaArrayRec *)((char *)(MetadataBuf) + Rec->MetaOffset); - * - * This gives us a nice way of accessing the key fields in an array's - * metadata entry. - * - * So, what about more complex arrays? All of our compression - * operators require the length of the encrypted field as input to - * the uncompress operator. Generally we don't include data block - * length as part of metadata because it's easily calculated from the - * Count values and the length of the data type, but in order to - * support compression we have to communicate it from the writer to - * the reader so we can uncompress. Therefore every field with an - * operator has as its next field (after BASE_FIELDS) DataBlockSize. - * Like DataBlockLocation, this is per block (and so it's FFS - * description also uses BlockCount). This arrangement is - * represented by the struct MetaArrayRecOperator below. Note that - * BP5 does not itself use the DataBlockSize in the metadata. The - * size of the compressed data is returned from the compression - * operator, and is used by BP5 to copy that data into the data - * block, but after that it is only passed to the Uncompress operator - * on the receiving side, so operators like MGard may choose to use - * this differently. - * - * The last case is arrays that also have Min/Max stats associated - * with them. 
Since this can be combined with operators, that gives - * us two more possible structs for array metadata, a plain array - * with Min/Max or an array with an operator and Min/Max, these are - * represented by the structs MetaArrayRecMM and - * MetaArrayRecOperatorMM below. Note that MinMax in that struct is - * a char*, but obviously the data type of Min/Max depends upon the - * element type of the array. How does that work? The actual size - * in bytes of the MinMax array is BlockCount * sizeof(array element) - * * 2, but in order to avoid introducing yet another integer-typed - * size value into the structure we've gone to some effort in order - * to leverage the existing BlockCount value. In particular, there - * are a number of FMField lists for The MM and OperatorMM arrays, - * each giving FFS a different element size for the MinMax Array. - * ADIOS types of size 1 use MetarrayRecMM1List, those of size 2 use - * MetaArrayRecMM2List, etc., up to MetaArrayRecMM16List, which would - * be used by long double. Note that BP5 doesn't define or support - * MinMax for string, complex, or structure types. - * - * For each of the array variations above, when we add the field - * associated with that array to the metadata field list, we specify - * the appropriate FieldList in the FFS "field_type" value, and - * allocate space for the relevant structure in the virtual metadata - * struct we're building. - * - * We mentioned field names above, we actually encode a lot of - * information into the FFS field names, including the variable name, - * shape, element_size, ADIOS type, any operator that might be - * applied, the name of the substructure (if the array is a struct - * type), and even the expression that is to be used for derived - * variables. 
These are all encoded in different ways, for example - * the basic shape of the variable is encoded in the three letter - * prefix of the FFS fieldname: GlobalValue: = "BPg", GlobalArray = - * "BPG"JoinedArray = "BPJ", LocalValue = "BPl", LocalArray = "BPL". - * The details of the encoding are buried in the logic, but important - * bit is knowing that there's a lot of information there and some of - * it (like the expression) is base64 encoded to avoid having special - * characters in the FFS field name. From the BP5 point of view, - * anything that can be encoded in the field name is a good thing - * because it travels in the metametadata, not the metadata, so it - * only gets moved around if the field set changes. - * - * Speaking of changes, there are some details that are omitted above - * to get the main points across, but lets talk about other details. - * First, when you put a first block of an array, we fill out the - * Dims field, init BlockCount to 1, DBCount (the Dims*BlockCount - * value) to Dims and then we malloc memory to hold a copy of the - * Shape, Count and Offset values. (We need to copy these anyway as - * part of serialization as they must be captured at the time of Put, - * so we can't, say, just reference the values in the VariableBase - * class.) For LocalArrays, the Shape value stays at a NULL pointer, - * as does the Start value. If after the first there's another Put() - * on that variable, we add 1 to BlockCount, increment DBCount by - * Dims, and realloc() the Count and Offset arrays so that we can add - * the new Count and Offset values after the ones that are already - * there. This means that the Count values for block 1 start at - * Count[Dims], for block 2 they start at Count[2*Dims], etc. At the - * end of the timestep after using FFSencode() to serialize the - * metadata, FMfree_var_rec_elements() is used to free() all these - * subarrays that we've malloc'd. 
It understands the structure of - * our entire Metadata structure, walks the field list and - * deallocates appropriately. Once this has been done, we can - * memset() the whole metadata structure back to zeros and we're - * ready to start again. (All pointers NULL and counts are zero.) - * - * When we do start again with the next timestep, we don't start from - * scratch with a new Fieldlist and virtual structure, but instead - * try to reuse the old one. The anticipation is that step-based HPC - * applications are highly regular and the set of variables that are - * output on step N+1 are likely the same as what they output for - * step N. So when we get a Put() for a variable, we look up it's - * entry in internal bookkeeping and if it has an entry in the - * structure we reuse it, putting the appropriate data in the virtual - * structure as described above. This is fine if we write the exact - * same set of variables in subsequent steps, but what if we don't? - * Well, if we write a new variable, then the procedure above - * happens, but we also take steps to make sure that we generate new - * MetaMetaData (I.E. re-register the format with FFS). We do this - * by setting the Info.MetaFormat value to NULL. - * - * Handling a non-written variable is done differently. We don't - * really want to bear the cost of new MetaMetaData frequently - * (because MetaMetaData can be big), so instead we're willing to - * bear the costs of not using some of the data in the virtual - * structure. So if the app Puts an atomic variable on timestep N, - * but skips it on N+1, we essentially leave that fraction of the - * metadata buffer unused in N+1. It's transmitted or stored, but it - * doesn't contain anything useful. But the reader still needs to - * know that it wasn't written, so BP5 metadata carries with it a - * bitmap showing if a variable that is part of the metadata has - * actually been written and is valid. 
This bitmap, contained in the - * BitField[BitFieldCount] fields in the MetadataFieldList is the - * ultimate authority as to what has been written. Variables are - * assigned an index in order when they are first entered into - * metadata and if the bit at that index isn't set, that variable - * wasn't written on that timestep. - * - * Now, this does bring up a vulnerability with BP5. If an - * application were to write a lot of variables on one step and then - * never use them again, we might end up with a big metadata block - * that mostly carried unused (junk) bytes. We have not yet run into - * this in a real application, so it isn't specifically handled. In - * an ideal world, one would look at the "occcupancy rate" of - * metadata in EndStep() and make a decision that for either this - * timestep or the next, we'd start from scratch with an empty field - * list. There's a tradeoff here. Do this too often and we've got - * big MetaMetadata costs, do it too little and our metadata has a - * lot of useless bytes. Future work. Note that this is mostly a - * writer-side thing to fix/optimize. The reader will appropriately - * handle new metadata, including new metametadata. - * - * The stuff above applies to ADIOS variables, but attributes are - * always handled separately. In the initial FFS-marshalling - * implementation, Attributes, while separate, were handled very - * similarly to variables. That is, there was a field list and - * virtual structure maintained where we entered attributes much like - * Global and local values are described above. There was a - * metametadata generated it it and it was moved around like other - * metametadata blocks. This old way of doing things is still - * present in the code and gets used if MarshalAttribute is called by - * the engine. Engines that use this marshall all attributes in - * Endstep(), calling MarshalAttribute for all attributes and only - * doing this when some attribute has changed. 
The resulting - * Attribute data always contains *all* the current attribute values, - * a situation that works out well for engines like SST where readers - * might join after timestep 0. The SST writer can save the most - * recent Attribute data block and provide it to a newly-joined - * reader so that it has all available attributes. - * - * However, this encoding mechanism has some significant - * disadvantages under almost all situations. This separation of - * metametadata and metadata was designed for Variables, where the - * set of variables was likely to be reused without changes - * repeatedly. However, attributes aren't like that, particularly in - * the original situation where attributes once set can never change. - * Then we're only doing this when we add an attribute, we're always - * generating new MetaMetadata whenever we have a change, and - * MetaMetadata + Metadata size is always going to be bigger than - * some simpler encoding mechanism. So, BP5 file engine now does - * things differently. It calls OnetimeMarshalAttribute() which uses - * a simpler FFS representation for attributes with the attribute - * "name" being part of the data, not part of the metametadata as it - * is with variables. This means that the metametadata never - * changes, so we don't have the same issues as with the prior - * approach. That metametadata struct (BP5AttrStruct) describes a - * relatively simple structure with two lists, one for attributes of - * any non-string type, and the other a list of string and - * array-of-string attributes. Generally we only want attributes to - * appear here when they change, so the BP5Writer calls - * OnetimeMarshlAttribute whenever it gets the NotifyEngineAttribute - * call (whenever an attribute changes). However it also gets called - * in BeginStep if that step is the first every called, because some - * attributes may have been defined before the engine was ever - * created. 
In BP5 file, attribute blocks then only every contain an - * attribute once, unless the attribute changes in which case it will - * appear again. This is not such a good situation for SST because - * of the late-coming-reader issue, so that still uses the old - * marshaling mechanism. - * - */ - namespace adios2 { namespace format @@ -297,12 +39,11 @@ class BP5Base #define BASE_FIELDS \ size_t Dims; /* How many dimensions does this array have */ \ - size_t BlockCount; /* How many blocks are written */ \ - size_t DBCount; /* Dimens * BlockCount */ \ - size_t *Shape; /* Global dimensionality [Dims] NULL for local */ \ - size_t *Count; /* Per-block Counts [DBCount] */ \ - size_t *Offsets; /* Per-block Offsets [DBCount] NULL for local \ - */ \ + size_t BlockCount; /* How many blocks are written */ \ + size_t DBCount; /* Dimens * BlockCount */ \ + size_t *Shape; /* Global dimensionality [Dims] NULL for local */ \ + size_t *Count; /* Per-block Counts [DBCount] */ \ + size_t *Offsets; /* Per-block Offsets [DBCount] NULL for local */ \ size_t *DataBlockLocation; /* Per-block Offset in PG [BlockCount] */ typedef struct _MetaArrayRec