Skip to content

Commit

Permalink
Add sorting like in a user friendly file system option
Browse files Browse the repository at this point in the history
Summary:
When sorting files in macOS or Windows, a group of digits is compared as a number (not character by character) when everything before it is equal, and if the numbers match even though they look different ("1" vs "01"), then the rest of the string determines the order.
We implement this logic in a helper compare function that gar can then use, so files can be sorted as one expects in the archive, when adding a folder.
This will help with GHS data, which has 70,000 images in a folder but only up to 3 leading zeros, giving names such as image0001.bin, image9999.bin, and image10000.bin. The order of files in a gar archive controls how records are sorted on disk, which impacts streaming caching, so it's important to get this order right.

Reviewed By: kiminoue7

Differential Revision: D51785203

fbshipit-source-id: 64a3d732f80e5e604144d8af3933954ecd54816e
  • Loading branch information
Georges Berenger authored and facebook-github-bot committed Dec 5, 2023
1 parent b619362 commit 414c7b5
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 1 deletion.
51 changes: 51 additions & 0 deletions vrs/helpers/Strings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cstring>
#include <ctime>

#include <algorithm>
#include <sstream>

#include <fmt/format.h>
Expand Down Expand Up @@ -62,6 +63,56 @@ bool endsWith(const string& text, const string& suffix) {
text.c_str() + text.length() - suffix.length(), suffix.c_str(), suffix.length()) == 0;
}

/// Tell if a char is a decimal digit.
/// std::isdigit() has undefined behavior for negative values other than EOF, which a plain char
/// can hold when char is signed, hence the conversion to an unsigned type first.
inline bool isdigit(char c) {
  return std::isdigit(static_cast<uint8_t>(c)) != 0;
}

/// Find the end of a run of digits.
/// @param str: null-terminated string.
/// @param index: index in str of a character that is part of a run of digits.
/// @return The index in str of the last digit of that run.
static uint32_t lastDigitIndex(const char* str, uint32_t index) {
  // The null terminator is not a digit, so the scan always stops within the string.
  while (isdigit(str[index + 1])) {
    index++;
  }
  return index;
}

/// Read one character of a run of digits, as if the run were left-padded with '0' characters.
/// @param str: string containing the run of digits.
/// @param pos: index in str of the first digit of the run.
/// @param pad: number of virtual '0' characters placed in front of the run.
/// @param index: index of the character to read in the padded run.
/// @return '0' when index falls in the padding, the matching character of the run otherwise.
inline char paddedChar(const char* str, uint32_t pos, uint32_t pad, uint32_t index) {
  return index < pad ? '0' : str[pos + index - pad];
}

/// Tell if left sorts strictly before right, comparing runs of digits by numeric value.
/// Leading zeros do not change the value of a run of digits, so "10" and "0010" are equivalent.
/// Characters outside of digit runs are compared as unsigned bytes, so that the result does not
/// depend on the signedness of char, and so that an empty remainder always sorts first.
/// @param left: null-terminated string.
/// @param right: null-terminated string.
/// @return True if left sorts strictly before right, false otherwise.
bool beforeFileName(const char* left, const char* right) {
  uint32_t leftPos = 0;
  uint32_t rightPos = 0;
  while (true) {
    const bool bothDigits = isdigit(left[leftPos]) && isdigit(right[rightPos]);
    if (bothDigits) {
      // Offset of the last digit of each run relative to its first digit, i.e. run length - 1.
      const uint32_t leftLastOffset = lastDigitIndex(left, leftPos) - leftPos;
      const uint32_t rightLastOffset = lastDigitIndex(right, rightPos) - rightPos;
      // Virtually left-pad the shorter run with zeros, so both runs have the same length and
      // can be compared digit by digit, which is then the same as comparing their values.
      const uint32_t leftPad =
          leftLastOffset < rightLastOffset ? rightLastOffset - leftLastOffset : 0;
      const uint32_t rightPad =
          rightLastOffset < leftLastOffset ? leftLastOffset - rightLastOffset : 0;
      const uint32_t maxLastOffset = std::max(leftLastOffset, rightLastOffset);
      for (uint32_t digitIndex = 0; digitIndex <= maxLastOffset; digitIndex++) {
        const char lc = paddedChar(left, leftPos, leftPad, digitIndex);
        const char rc = paddedChar(right, rightPos, rightPad, digitIndex);
        if (lc != rc) {
          return lc < rc;
        }
      }
      // Same numeric value: move to the last digit of each run, the increment below skips it.
      leftPos += leftLastOffset;
      rightPos += rightLastOffset;
    } else if (left[leftPos] != right[rightPos] || left[leftPos] == 0) {
      // First difference found, or both strings ended at the same time.
      break;
    }
    leftPos++;
    rightPos++;
  }
  // Compare as unsigned bytes: with a signed char, a non-ASCII byte is negative and would sort
  // before the null terminator, making both (a, b) and (b, a) report "before".
  return static_cast<uint8_t>(left[leftPos]) < static_cast<uint8_t>(right[rightPos]);
}

string humanReadableFileSize(int64_t bytes) {
const int64_t unit = 1024;
if (bytes < unit) {
Expand Down
12 changes: 11 additions & 1 deletion vrs/helpers/Strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

#include <cstdint>

#include <algorithm>
#include <map>
#include <string>
#include <vector>
Expand Down Expand Up @@ -53,6 +52,17 @@ inline int strncasecmp(const char* first, const char* second, size_t size) {
}
#endif

/// Compare strings, as you'd expect in a modern desktop OS (Explorer/Finder), treating digit
/// sections as numbers, so that "image1.png" is before "image02.png", and "image010.png" is the
/// same as "image00010.png".
/// Note: This is not a total order, since beforeFileName("image1.png", "image01.png") and
/// beforeFileName("image01.png", "image1.png") are both false!
/// @param left: null-terminated string to compare.
/// @param right: null-terminated string to compare left with.
/// @return True if left sorts strictly before right. False otherwise, which includes the case
/// of strings that only differ by the number of leading zeros in their digit sections.
bool beforeFileName(const char* left, const char* right);

/// Convenience wrapper of beforeFileName() for std::string arguments.
/// Note that the name of this function is spelled with a lowercase 'f' ("beforefileName").
/// @param left: string to compare.
/// @param right: string to compare left with.
/// @return The result of beforeFileName() for the characters of left and right.
inline bool beforefileName(const std::string& left, const std::string& right) {
  const char* const leftChars = left.data();
  const char* const rightChars = right.data();
  return beforeFileName(leftChars, rightChars);
}

/// Returns a copy of the string from which all the characters in whiteChars
/// at the beginning or at the end of the string have been removed.
/// @param text: some utf8 text string to trim
Expand Down
62 changes: 62 additions & 0 deletions vrs/helpers/test/StringsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -323,3 +323,65 @@ TEST_F(StringsHelpersTester, splitTest) {
helpers::split(str, 'l', actualTokens, true, " ");
EXPECT_EQ(actualTokens, expectedTokens);
}

// Each multi-statement macro is wrapped in do/while(false), so that it expands to a single
// statement and stays correct when used as the body of an unbraced if/else or loop.

// Expect a to sort strictly before b, and b not to sort before a.
#define CHECK_BEFORE(a, b)                       \
  do {                                           \
    EXPECT_TRUE(helpers::beforeFileName(a, b));  \
    EXPECT_FALSE(helpers::beforeFileName(b, a)); \
  } while (false)

// Expect a and b to be equivalent: neither one sorts before the other.
#define CHECK_SAME(a, b)                         \
  do {                                           \
    EXPECT_FALSE(helpers::beforeFileName(a, b)); \
    EXPECT_FALSE(helpers::beforeFileName(b, a)); \
  } while (false)

// Expect a not to sort before itself.
#define CHECK_BEFORE_SELF(a) EXPECT_FALSE(helpers::beforeFileName(a, a))

// Exercise helpers::beforeFileName(): a string is never before itself, digit sections compare by
// numeric value regardless of leading zeros, and the rest of the string breaks ties.
TEST_F(StringsHelpersTester, beforeFileNameTest) {
  // a string is never before itself
  CHECK_BEFORE_SELF("");
  CHECK_BEFORE_SELF("a");
  CHECK_BEFORE_SELF("abcd");
  CHECK_BEFORE_SELF("abcd000z");

  // shorter strings and smaller numbers go first
  CHECK_BEFORE("", "a");
  CHECK_BEFORE("", "0");
  CHECK_BEFORE("00", "001");
  CHECK_BEFORE("00", "0a");
  CHECK_BEFORE("10", "011");

  // leading zeros don't change the value of a number
  CHECK_SAME("0", "00");
  CHECK_SAME("0", "0000000");
  CHECK_SAME("10", "0010");
  CHECK_SAME("123", "123");
  CHECK_SAME("123", "0123");
  CHECK_SAME("0123", "00000000123");
  CHECK_SAME("image0123section3z", "image000123section003z");
  CHECK_SAME("02image0123section3z", "2image0123section03z");

  CHECK_SAME("image10.png", "image10.png");
  CHECK_SAME("image010.png", "image10.png");
  CHECK_SAME("image0010.png", "image10.png");
  CHECK_SAME("image010.png", "image000010.png");

  // when the numbers match, the rest of the string decides
  CHECK_BEFORE("image10a", "image10b");
  CHECK_BEFORE("image010a", "image10b");
  CHECK_BEFORE("image010a", "image0010b");

  // numbers are compared by value, not character by character
  CHECK_BEFORE("image10.png", "image11.png");
  CHECK_BEFORE("image010.png", "image11.png");
  CHECK_BEFORE("image10.png", "image011.png");
  CHECK_BEFORE("image90.png", "image0110.png");
  CHECK_BEFORE("image90.png", "image0190.png");
  CHECK_BEFORE("image19.png", "image90.png");
  CHECK_BEFORE("image019.png", "image90.png");
  CHECK_BEFORE("image019.png", "image0090.png");
  CHECK_BEFORE("image1901.png", "image19010.png");

  // file names with more digits than the zero padding accounts for
  CHECK_BEFORE("image0001.bin", "image9999.bin");
  CHECK_BEFORE("image9999.bin", "image10000.bin");

  // several digit sections in the same name
  CHECK_BEFORE("part0image10.png", "part0image11.png");
  CHECK_BEFORE("part00image010.png", "part0image11.png");
  CHECK_BEFORE("part0image10.png", "part0000image011.png");
  CHECK_BEFORE("part0image90.png", "part000image0110.png");
  CHECK_BEFORE("part0image90.png", "part0image0190.png");
  CHECK_BEFORE("part0image19.png", "part00image90.png");
  CHECK_BEFORE("part0image019.png", "part0image90.png");
  CHECK_BEFORE("part0image019.png", "part0image0090.png");
}

0 comments on commit 414c7b5

Please sign in to comment.