From d6d8275802d7d99060f225d8890bb2ba777c33e0 Mon Sep 17 00:00:00 2001
From: Jeremy Leibs <jeremy@rerun.io>
Date: Thu, 17 Oct 2024 01:46:32 -0400
Subject: [PATCH] Allow splitting entity path expressions with whitespace
 (#7782)

### What
- Resolves: https://github.com/rerun-io/rerun/issues/7740

This is a little tricky since we don't want to split on whitespace that
happens to show up between a + or - and the expression.

Logic seems correct from testing.

### Checklist
* [x] I have read and agree to [Contributor
Guide](https://github.com/rerun-io/rerun/blob/main/CONTRIBUTING.md) and
the [Code of
Conduct](https://github.com/rerun-io/rerun/blob/main/CODE_OF_CONDUCT.md)
* [x] I've included a screenshot or gif (if applicable)
* [x] I have tested the web demo (if applicable):
* Using examples from latest `main` build:
[rerun.io/viewer](https://rerun.io/viewer/pr/7782?manifest_url=https://app.rerun.io/version/main/examples_manifest.json)
* Using full set of examples from `nightly` build:
[rerun.io/viewer](https://rerun.io/viewer/pr/7782?manifest_url=https://app.rerun.io/version/nightly/examples_manifest.json)
* [x] The PR title and labels are set such as to maximize their
usefulness for the next release's CHANGELOG
* [x] If applicable, add a new check to the [release
checklist](https://github.com/rerun-io/rerun/blob/main/tests/python/release_checklist)!
* [x] If have noted any breaking changes to the log API in
`CHANGELOG.md` and the migration guide

- [PR Build Summary](https://build.rerun.io/pr/7782)
- [Recent benchmark results](https://build.rerun.io/graphs/crates.html)
- [Wasm size tracking](https://build.rerun.io/graphs/sizes.html)

To run all checks from `main`, comment on the PR with `@rerun-bot
full-check`.

---------

Co-authored-by: Emil Ernerfeldt <emil.ernerfeldt@gmail.com>
---
 .github/workflows/labels.yml                  |   2 +-
 .../src/path/entity_path_filter.rs            | 104 +++++++++++++++++-
 rerun_py/tests/unit/test_dataframe.py         |  19 ++++
 3 files changed, 122 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/labels.yml b/.github/workflows/labels.yml
index b6f86391c1e2..a0e7ac40f10c 100644
--- a/.github/workflows/labels.yml
+++ b/.github/workflows/labels.yml
@@ -39,4 +39,4 @@ jobs:
         with:
           mode: minimum
           count: 1
-          labels: "📊 analytics, 🟦 blueprint, 🪳 bug, 🌊 C++ API, CLI, codegen/idl, 🧑‍💻 dev experience, dependencies, 📖 documentation, 💬 discussion, examples, exclude from changelog, 🪵 Log-API, 📉 performance, 🐍 Python API, ⛃ re_datastore, 🔍 re_query, 📺 re_viewer, 🔺 re_renderer, 🚜 refactor, ⛴ release, 🦀 Rust API, 🔨 testing, ui, 🕸️ web"
+          labels: "📊 analytics, 🟦 blueprint, 🪳 bug, 🌊 C++ API, CLI, codegen/idl, 🧑‍💻 dev experience, dependencies, 📖 documentation, 💬 discussion, examples, exclude from changelog, 🪵 Log & send APIs, 📉 performance, 🐍 Python API, ⛃ re_datastore, 🔍 re_query, 📺 re_viewer, 🔺 re_renderer, 🚜 refactor, ⛴ release, 🦀 Rust API, 🔨 testing, ui, 🕸️ web"
diff --git a/crates/store/re_log_types/src/path/entity_path_filter.rs b/crates/store/re_log_types/src/path/entity_path_filter.rs
index 220e70c64165..a59629c1e089 100644
--- a/crates/store/re_log_types/src/path/entity_path_filter.rs
+++ b/crates/store/re_log_types/src/path/entity_path_filter.rs
@@ -169,6 +169,75 @@ impl TryFrom<&str> for EntityPathFilter {
     }
 }
 
+/// Split a string into whitespace-separated tokens with extra logic.
+///
+/// Specifically, we allow for whitespace between `+`/`-` and the following token.
+///
+/// Additional rules:
+///  - Escaped whitespace never results in a split.
+///  - Otherwise always split on `\n` (even if it follows a `+` or `-` character).
+///  - Only consider `+` and `-` characters as special if they are the first character of a token.
+///  - Split on whitespace does not following a relevant `+` or `-` character.
+fn split_whitespace_smart(path: &'_ str) -> Vec<&'_ str> {
+    #![allow(clippy::unwrap_used)]
+
+    // We parse on bytes, and take care to only split on either side of a one-byte ASCII,
+    // making the `from_utf8(…)`s below safe to unwrap.
+    let mut bytes = path.as_bytes();
+
+    let mut tokens = vec![];
+
+    // Start by ignoring any leading whitespace
+    while !bytes.is_empty() {
+        let mut i = 0;
+        let mut is_in_escape = false;
+        let mut is_include_exclude = false;
+        let mut new_token = true;
+
+        // Find the next unescaped whitespace character not following a '+' or '-' character
+        while i < bytes.len() {
+            let is_unescaped_whitespace = !is_in_escape && bytes[i].is_ascii_whitespace();
+
+            let is_unescaped_newline = !is_in_escape && bytes[i] == b'\n';
+
+            if is_unescaped_newline || (!is_include_exclude && is_unescaped_whitespace) {
+                break;
+            }
+
+            is_in_escape = bytes[i] == b'\\';
+
+            if bytes[i] == b'+' || bytes[i] == b'-' {
+                is_include_exclude = new_token;
+            } else if !is_unescaped_whitespace {
+                is_include_exclude = false;
+                new_token = false;
+            }
+
+            i += 1;
+        }
+        if i > 0 {
+            tokens.push(&bytes[..i]);
+        }
+
+        // Continue skipping whitespace characters
+        while i < bytes.len() {
+            if is_in_escape || !bytes[i].is_ascii_whitespace() {
+                break;
+            }
+            is_in_escape = bytes[i] == b'\\';
+            i += 1;
+        }
+
+        bytes = &bytes[i..];
+    }
+
+    // Safety: we split at proper character boundaries
+    tokens
+        .iter()
+        .map(|token| std::str::from_utf8(token).unwrap())
+        .collect()
+}
+
 impl EntityPathFilter {
     /// Parse an entity path filter from a string while ignore syntax errors.
     ///
@@ -187,7 +256,8 @@ impl EntityPathFilter {
     ///
     /// Conflicting rules are resolved by the last rule.
     pub fn parse_forgiving(rules: &str, subst_env: &EntityPathSubs) -> Self {
-        Self::from_query_expressions_forgiving(rules.split('\n'), subst_env)
+        let split_rules = split_whitespace_smart(rules);
+        Self::from_query_expressions_forgiving(split_rules, subst_env)
     }
 
     /// Build a filter from a list of query expressions.
@@ -243,7 +313,8 @@ impl EntityPathFilter {
         rules: &str,
         subst_env: &EntityPathSubs,
     ) -> Result<Self, EntityPathFilterParseError> {
-        Self::from_query_expressions_strict(rules.split('\n'), subst_env)
+        let split_rules = split_whitespace_smart(rules);
+        Self::from_query_expressions_strict(split_rules, subst_env)
     }
 
     /// Build a filter from a list of query expressions.
@@ -891,3 +962,32 @@ mod tests {
         }
     }
 }
+
+#[test]
+fn test_split_whitespace_smart() {
+    assert_eq!(split_whitespace_smart("/world"), vec!["/world"]);
+    assert_eq!(split_whitespace_smart("a b c"), vec!["a", "b", "c"]);
+    assert_eq!(split_whitespace_smart("a\nb\tc  "), vec!["a", "b", "c"]);
+    assert_eq!(split_whitespace_smart(r"a\ b c"), vec![r"a\ b", "c"]);
+
+    assert_eq!(
+        split_whitespace_smart(" + a - b + c"),
+        vec!["+ a", "- b", "+ c"]
+    );
+    assert_eq!(
+        split_whitespace_smart("+ a -\n b + c"),
+        vec!["+ a", "-", "b", "+ c"]
+    );
+    assert_eq!(
+        split_whitespace_smart("/weird/path- +/oth- erpath"),
+        vec!["/weird/path-", "+/oth-", "erpath"]
+    );
+    assert_eq!(
+        split_whitespace_smart(r"+world/** -/world/points"),
+        vec!["+world/**", "-/world/points"]
+    );
+    assert_eq!(
+        split_whitespace_smart(r"+ world/** - /world/points"),
+        vec!["+ world/**", "- /world/points"]
+    );
+}
diff --git a/rerun_py/tests/unit/test_dataframe.py b/rerun_py/tests/unit/test_dataframe.py
index be1134c96199..c5ccf970ea1d 100644
--- a/rerun_py/tests/unit/test_dataframe.py
+++ b/rerun_py/tests/unit/test_dataframe.py
@@ -173,6 +173,25 @@ def test_full_view(self) -> None:
         assert table.num_columns == 1
         assert table.num_rows == 1
 
+    def test_content_filters(self) -> None:
+        filter_expressions = [
+            "+/** -/static_text",
+            """
+            +/**
+            -/static_text
+            """,
+            {"/** -/static_text": ["Position3D", "Color"]},
+        ]
+
+        for expr in filter_expressions:
+            view = self.recording.view(index="my_index", contents=expr)
+
+            table = view.select().read_all()
+
+            # my_index, log_time, log_tick, points, colors
+            assert table.num_columns == 5
+            assert table.num_rows == 2
+
     def test_select_columns(self) -> None:
         view = self.recording.view(index="my_index", contents="points")
         index_col = rr.dataframe.IndexColumnSelector("my_index")