Skip to content

Commit

Permalink
ref(grouping): Tweak comments and docstrings (#82045)
Browse files Browse the repository at this point in the history
This is a collection of tweaks to (and additions of) comments and docstrings, built up over the last month or so of my work on grouping. No behavior changes.
  • Loading branch information
lobsterkatie authored Dec 12, 2024
1 parent 6021578 commit 247bdff
Show file tree
Hide file tree
Showing 14 changed files with 78 additions and 49 deletions.
9 changes: 5 additions & 4 deletions src/sentry/grouping/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def get_grouping_config_dict_for_project(project) -> GroupingConfig:
ingestion so that the grouping algorithm can be re-run later.
This is called early on in normalization so that everything that is needed
to group the project is pulled into the event.
to group the event is pulled into the event data.
"""
loader = PrimaryGroupingConfigLoader()
return loader.get_config_dict(project)
Expand Down Expand Up @@ -285,8 +285,10 @@ def _get_component_trees_for_variants(
precedence_hint: str | None = None
all_strategies_components_by_variant: dict[str, list[BaseGroupingComponent]] = {}

# `iter_strategies` presents strategies in priority order, which allows us to go with the first
# one which produces a result. (See `src/sentry/grouping/strategies/configurations.py` for the
# strategies used by each config.)
for strategy in context.config.iter_strategies():
# Defined in src/sentry/grouping/strategies/base.py
current_strategy_components_by_variant = strategy.get_grouping_components(
event, context=context
)
Expand Down Expand Up @@ -334,8 +336,7 @@ def get_grouping_variants_for_event(
event: Event, config: StrategyConfiguration | None = None
) -> dict[str, BaseVariant]:
"""Returns a dict of all grouping variants for this event."""
# If a checksum is set the only variant that comes back from this
# event is the checksum variant.
# If a checksum is set the only variant that comes back from this event is the checksum variant.
#
# TODO: Is there a reason we don't treat a checksum like a custom fingerprint, and run the other
# strategies but mark them as non-contributing, with explanations why?
Expand Down
16 changes: 12 additions & 4 deletions src/sentry/grouping/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,13 @@ def _calculate_contributes[ValuesType](values: Sequence[ValuesType]) -> bool:


class BaseGroupingComponent[ValuesType: str | int | BaseGroupingComponent[Any]](ABC):
"""A grouping component is a recursive structure that is flattened
into components to make a hash for grouping purposes.
"""
A grouping component is a node in a tree describing the event data (exceptions, stacktraces,
messages, etc.) which can contribute to grouping. Each node's children, stored in the `values`
attribute, are either other grouping components or primitives representing the actual data.
For example, an exception component might have type, value, and stacktrace components as
children, and the type component might have the string "KeyError" as its child.
"""

hint: str | None = None
Expand All @@ -45,6 +50,8 @@ def __init__(
contributes: bool | None = None,
values: Sequence[ValuesType] | None = None,
):
# Use `update` to set attribute values because it ensures `contributes` is set (if
# `contributes` is not provided, `update` will derive it from the `values` value)
self.update(
hint=hint,
contributes=contributes,
Expand Down Expand Up @@ -143,8 +150,9 @@ def shallow_copy(self) -> Self:
return copy

def iter_values(self) -> Generator[str | int]:
"""Recursively walks the component and flattens it into a list of
values.
"""
Recursively walks the component tree, gathering literal values from contributing
branches into a flat list.
"""
if self.contributes:
for value in self.values:
Expand Down
14 changes: 11 additions & 3 deletions src/sentry/grouping/fingerprinting/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,9 @@ def _get_release(self) -> list[_ReleaseInfo]:
return self._release

def get_values(self, match_type: str) -> list[dict[str, Any]]:
"""
Pull values from all the spots in the event appropriate to the given match type.
"""
return getattr(self, "_get_" + match_type)()


Expand Down Expand Up @@ -375,7 +378,12 @@ def _from_config_structure(


class FingerprintMatcher:
def __init__(self, key: str, pattern: str, negated: bool = False) -> None:
def __init__(
self,
key: str, # The event attribute on which to match
pattern: str, # The value to match (or to not match, depending on `negated`)
negated: bool = False, # If True, match when `event[key]` does NOT equal `pattern`
) -> None:
if key.startswith("tags."):
self.key = key
else:
Expand Down Expand Up @@ -422,7 +430,7 @@ def _positive_path_match(self, value: str | None) -> bool:
return False

def _positive_match(self, values: dict[str, Any]) -> bool:
# path is special in that it tests against two values (abs_path and path)
# `path` is special in that it tests against two values (`abs_path` and `filename`)
if self.key == "path":
value = values.get("abs_path")
if self._positive_path_match(value):
Expand All @@ -433,7 +441,7 @@ def _positive_match(self, values: dict[str, Any]) -> bool:
return True
return False

# message tests against value as well as this is what users expect
# message tests against exception value also, as this is what users expect
if self.key == "message":
for key in ("message", "value"):
value = values.get(key)
Expand Down
12 changes: 6 additions & 6 deletions src/sentry/grouping/parameterization.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,19 @@
@dataclasses.dataclass
class ParameterizationRegex:

name: str # name of the pattern also used as group name in combined regex
name: str # name of the pattern (also used as group name in combined regex)
raw_pattern: str # regex pattern w/o matching group name
lookbehind: str | None = None # positive lookbehind prefix if needed
lookahead: str | None = None # positive lookahead postfix if needed
counter: int = 0

# These need to be used with `(?x)` tells the regex compiler to ignore comments
# These need to be used with `(?x)`, to tell the regex compiler to ignore comments
# and unescaped whitespace, so we can use newlines and indentation for better legibility.

@property
def pattern(self) -> str:
"""
Returns the regex pattern for with as a named matching group and lookbehind/lookahead if needed.
Returns the regex pattern with a named matching group and lookbehind/lookahead if needed.
"""
prefix = rf"(?<={self.lookbehind})" if self.lookbehind else ""
postfix = rf"(?={self.lookahead})" if self.lookahead else ""
Expand All @@ -41,7 +41,7 @@ def pattern(self) -> str:
@property
def compiled_pattern(self) -> re.Pattern[str]:
"""
Returns the compiled regex pattern for with as a named matching group and lookbehind/lookahead if needed.
Returns the compiled regex pattern with a named matching group and lookbehind/lookahead if needed.
"""
if not hasattr(self, "_compiled_pattern"):
self._compiled_pattern = re.compile(rf"(?x){self.pattern}")
Expand Down Expand Up @@ -189,8 +189,8 @@ class ParameterizationCallable:
us more flexibility than just using regex.
"""

name: str # name of the pattern also used as group name in combined regex
apply: Callable[[str], tuple[str, int]] # function to modifying the input string
name: str # name of the pattern (also used as group name in combined regex)
apply: Callable[[str], tuple[str, int]] # function for modifying the input string
counter: int = 0


Expand Down
8 changes: 5 additions & 3 deletions src/sentry/grouping/strategies/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def strategy(
"""
Registers a strategy
:param ids: The strategy/delegate IDs with which to register
:param ids: The strategy/delegate IDs to register
:param interface: Which interface type should be dispatched to this strategy
:param score: Determines precedence of strategies. For example exception
strategy scores higher than message strategy, so if both interfaces are
Expand Down Expand Up @@ -90,6 +90,7 @@ def decorator(f: StrategyFunc[ConcreteInterface]) -> Strategy[ConcreteInterface]

class GroupingContext:
def __init__(self, strategy_config: "StrategyConfiguration"):
# The initial context is essentially the grouping config options
self._stack = [strategy_config.initial_context]
self.config = strategy_config
self.push()
Expand All @@ -99,6 +100,7 @@ def __setitem__(self, key: str, value: ContextValue) -> None:
self._stack[-1][key] = value

def __getitem__(self, key: str) -> ContextValue:
# Walk down the stack from the top and return the first instance of `key` found
for d in reversed(self._stack):
if key in d:
return d[key]
Expand Down Expand Up @@ -200,9 +202,9 @@ def __repr__(self) -> str:
def _invoke(
self, func: Callable[..., ReturnedVariants], *args: Any, **kwargs: Any
) -> ReturnedVariants:
# We forcefully override strategy here. This lets a strategy
# We forcefully override strategy here. This lets a strategy
# function always access its metadata and directly forward it to
# subcomponents without having to filter out strategy.
# subcomponents.
kwargs["strategy"] = self
return func(*args, **kwargs)

Expand Down
35 changes: 17 additions & 18 deletions src/sentry/grouping/strategies/newstyle.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,8 +351,9 @@ def frame(
if context["javascript_fuzzing"] and get_behavior_family_for_platform(platform) == "javascript":
func = frame.raw_function or frame.function
if func:
# Strip leading namespacing, i.e., turn `some.module.path.someFunction` into
# `someFunction` and `someObject.someMethod` into `someMethod`
func = func.rsplit(".", 1)[-1]
# special case empty functions not to have a hint
if not func:
function_component.update(contributes=False)
elif func in (
Expand Down Expand Up @@ -518,15 +519,9 @@ def single_exception(

if exception.mechanism:
if exception.mechanism.synthetic:
# Ignore synthetic exceptions as they are produced from platform
# specific error codes.
#
# For example there can be crashes with EXC_ACCESS_VIOLATION_* on Windows with
# the same exact stacktrace as a crash with EXC_BAD_ACCESS on macOS.
#
# Do not update type component of system variant, such that regex
# can be continuously modified without unnecessarily creating new
# groups.
# Ignore the error type for synthetic exceptions as it can vary by platform and doesn't
# actually carry any meaning with respect to what went wrong. (Synthetic exceptions
# are dummy exceptions created by the SDK in order to harvest a stacktrace.)
type_component.update(contributes=False, hint="ignored because exception is synthetic")
system_type_component.update(
contributes=False, hint="ignored because exception is synthetic"
Expand Down Expand Up @@ -615,7 +610,7 @@ def chained_exception(
# Get all the exceptions to consider.
all_exceptions = interface.exceptions()

# Get the grouping components for all exceptions up front, as we'll need them in a few places and only want to compute them once.
# For each exception, create a dictionary of grouping components by variant name
exception_components_by_exception = {
id(exception): context.get_grouping_components_by_variant(exception, event=event, **meta)
for exception in all_exceptions
Expand All @@ -638,12 +633,16 @@ def chained_exception(
if main_exception_id:
event.data["main_exception_id"] = main_exception_id

# Case 1: we have a single exception, use the single exception
# component directly to avoid a level of nesting
# Cases 1 and 2: Either this never was a chained exception (this is our entry point for single
# exceptions, too), or this is a chained exception consisting solely of an exception group and a
# single inner exception. In the former case, all we have is the single exception component, so
# return it. In the latter case, the there's no value-add to the wrapper, so discard it and just
# return the component for the inner exception.
if len(exceptions) == 1:
return exception_components_by_exception[id(exceptions[0])]

# Case 2: produce a component for each chained exception
# Case 3: This is either a chained exception or an exception group containing at least two inner
# exceptions. Either way, we need to wrap our exception components in a chained exception component.
exception_components_by_variant: dict[str, list[ExceptionGroupingComponent]] = {}

for exception in exceptions:
Expand Down Expand Up @@ -710,8 +709,8 @@ def get_child_exceptions(exception: SingleException) -> list[SingleException]:
node = exception_tree.get(exception_id)
return node.children if node else []

# This recursive generator gets the "top-level exceptions", and is used below.
# "Top-level exceptions are those that are the first descendants of the root that are not exception groups.
# This recursive generator gets the "top-level exceptions," and is used below.
# Top-level exceptions are those that are the first descendants of the root that are not exception groups.
# For examples, see https://github.com/getsentry/rfcs/blob/main/text/0079-exception-groups.md#sentry-issue-grouping
def get_top_level_exceptions(
exception: SingleException,
Expand Down Expand Up @@ -753,8 +752,8 @@ def get_first_path(exception: SingleException) -> Generator[SingleException]:
# If there's only one distinct top-level exception in the group,
# use it and its first-path children, but throw out the exception group and any copies.
# For example, Group<['Da', 'Da', 'Da']> should just be treated as a single 'Da'.
# We'll also set the main_exception_id, which is used in the extract_metadata function
# in src/sentry/eventtypes/error.py - which will ensure the issue is titled by this
# We'll also set `main_exception_id`, which is used in the `extract_metadata` function
# in `src/sentry/eventtypes/error.py`, in order to ensure the issue is titled by this
# item rather than the exception group.
if len(distinct_top_level_exceptions) == 1:
main_exception = distinct_top_level_exceptions[0]
Expand Down
4 changes: 2 additions & 2 deletions src/sentry/grouping/strategies/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ def remove_non_stacktrace_variants(variants: ReturnedVariants) -> ReturnedVarian
non_contributing_components = []
stacktrace_variants = set()

# In case any of the variants has a contributing stacktrace, we want
# to make all other variants non contributing.
# If at least one variant has a contributing stacktrace, we want to mark all variants without a
# stacktrace as non-contributing.
for variant_name, component in variants.items():
stacktrace_iter = component.iter_subcomponents(
id="stacktrace", recursive=True, only_contributing=True
Expand Down
10 changes: 10 additions & 0 deletions src/sentry/grouping/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,22 @@ def is_default_fingerprint_var(value):


def hash_from_values(values):
"""
Primarily used at the end of the grouping process, to get a final hash value once all of the
variants have been constructed, but also used as a hack to compare exception components (by
stringifying their reprs) when calculating variants for chained exceptions.
"""
result = md5()
for value in values:
result.update(force_bytes(value, errors="replace"))
return result.hexdigest()


def bool_from_string(value):
"""
Convert various string representations of boolean values ("1", "yes", "true", "0", "no",
"false") into actual booleans.
"""
if value:
value = value.lower()
if value in ("1", "yes", "true"):
Expand Down Expand Up @@ -78,6 +87,7 @@ def get_fingerprint_value(var, data):
elif var == "logger":
return data.get("logger") or "<no-logger>"
elif var.startswith("tags."):
# Turn "tags.some_tag" into just "some_tag"
tag = var[5:]
for t, value in data.get("tags") or ():
if t == tag:
Expand Down
8 changes: 4 additions & 4 deletions src/sentry/grouping/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,7 @@ def _get_metadata_as_dict(self):


class ComponentVariant(BaseVariant):
"""A component variant is a variant that produces a hash from the
`BaseGroupingComponent` it encloses.
"""
"""A variant that produces a hash from the `BaseGroupingComponent` it encloses."""

type = "component"

Expand Down Expand Up @@ -201,7 +199,7 @@ def _get_metadata_as_dict(self) -> FingerprintVariantMetadata:


class BuiltInFingerprintVariant(CustomFingerprintVariant):
"""A built-in, Sentry defined fingerprint."""
"""A built-in, Sentry-defined fingerprint."""

type = "built_in_fingerprint"

Expand Down Expand Up @@ -235,6 +233,8 @@ def get_hash(self) -> str | None:
return None
final_values: list[str | int] = []
for value in self.values:
# If we've hit the `{{ default }}` part of the fingerprint, pull in values from the
# original grouping method (message, stacktrace, etc.)
if is_default_fingerprint_var(value):
final_values.extend(self.component.iter_values())
else:
Expand Down
2 changes: 1 addition & 1 deletion src/sentry/interfaces/security.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ class Csp(SecurityReport):
"""
A CSP violation report.
See also: http://www.w3.org/TR/CSP/#violation-reports
See also: https://www.w3.org/TR/CSP/#violation-events
>>> {
>>> "document_uri": "http://example.com/",
Expand Down
4 changes: 2 additions & 2 deletions src/sentry/options/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -906,13 +906,13 @@
register(
"seer.similarity.global-rate-limit",
type=Dict,
default={"limit": 20, "window": 1},
default={"limit": 20, "window": 1}, # window is in seconds
flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
)
register(
"seer.similarity.per-project-rate-limit",
type=Dict,
default={"limit": 5, "window": 1},
default={"limit": 5, "window": 1}, # window is in seconds
flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
)

Expand Down
2 changes: 1 addition & 1 deletion src/sentry/utils/tag_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def normalized_sdk_tag_from_event(data: Mapping[str, Any]) -> str:
Note: Some platforms may keep their framework-specific values, as needed for analytics.
This is done to reduce the cardinality of the `sdk.name` tag, while keeping
the ones interesinting to us as granual as possible.
the ones interesting to us as granular as possible.
"""
try:
return normalize_sdk_tag((data.get("sdk") or {}).get("name") or "other")
Expand Down
1 change: 1 addition & 0 deletions tests/sentry/grouping/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ def _dump_component(component: BaseGroupingComponent, indent: int) -> None:

lines.append("{}hash: {}".format(" " * indent, to_json(variant.get_hash())))

# Note that this prints `__dict__`, not `as_dict()`, so if something seems missing, that's probably why
for key, value in sorted(variant.__dict__.items()):
if isinstance(value, BaseGroupingComponent):
lines.append("{}{}:".format(" " * indent, key))
Expand Down
2 changes: 1 addition & 1 deletion tests/sentry/grouping/test_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def test_variants_with_legacy_configs(
config_name: str, grouping_input: GroupingInput, insta_snapshot: InstaSnapshotter
) -> None:
"""
Run the variant snapshot tests using an minimal (and much more performant) save process.
Run the variant snapshot tests using a minimal (and much more performant) save process.
Because manually cherry-picking only certain parts of the save process to run makes us much more
likely to fall out of sync with reality, for safety we only do this when testing legacy,
Expand Down

0 comments on commit 247bdff

Please sign in to comment.