Skip to content

Commit

Permalink
ref(grouping): Tweak comments and docstrings (#82045)
Browse files Browse the repository at this point in the history
This is a collection of tweaks to (and additions of) comments and docstrings, built up over the last month or so of my work on grouping. No behavior changes.
  • Loading branch information
lobsterkatie authored Dec 12, 2024
1 parent 6021578 commit 247bdff
Show file tree
Hide file tree
Showing 14 changed files with 78 additions and 49 deletions.
9 changes: 5 additions & 4 deletions src/sentry/grouping/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def get_grouping_config_dict_for_project(project) -> GroupingConfig:
ingestion so that the grouping algorithm can be re-run later.
This is called early on in normalization so that everything that is needed
to group the project is pulled into the event.
to group the event is pulled into the event data.
"""
loader = PrimaryGroupingConfigLoader()
return loader.get_config_dict(project)
Expand Down Expand Up @@ -285,8 +285,10 @@ def _get_component_trees_for_variants(
precedence_hint: str | None = None
all_strategies_components_by_variant: dict[str, list[BaseGroupingComponent]] = {}

# `iter_strategies` presents strategies in priority order, which allows us to go with the first
# one which produces a result. (See `src/sentry/grouping/strategies/configurations.py` for the
# strategies used by each config.)
for strategy in context.config.iter_strategies():
# Defined in src/sentry/grouping/strategies/base.py
current_strategy_components_by_variant = strategy.get_grouping_components(
event, context=context
)
Expand Down Expand Up @@ -334,8 +336,7 @@ def get_grouping_variants_for_event(
event: Event, config: StrategyConfiguration | None = None
) -> dict[str, BaseVariant]:
"""Returns a dict of all grouping variants for this event."""
# If a checksum is set the only variant that comes back from this
# event is the checksum variant.
# If a checksum is set the only variant that comes back from this event is the checksum variant.
#
# TODO: Is there a reason we don't treat a checksum like a custom fingerprint, and run the other
# strategies but mark them as non-contributing, with explanations why?
Expand Down
16 changes: 12 additions & 4 deletions src/sentry/grouping/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,13 @@ def _calculate_contributes[ValuesType](values: Sequence[ValuesType]) -> bool:


class BaseGroupingComponent[ValuesType: str | int | BaseGroupingComponent[Any]](ABC):
"""A grouping component is a recursive structure that is flattened
into components to make a hash for grouping purposes.
"""
A grouping component is a node in a tree describing the event data (exceptions, stacktraces,
messages, etc.) which can contribute to grouping. Each node's children, stored in the `values`
attribute, are either other grouping components or primitives representing the actual data.
For example, an exception component might have type, value, and stacktrace components as
children, and the type component might have the string "KeyError" as its child.
"""

hint: str | None = None
Expand All @@ -45,6 +50,8 @@ def __init__(
contributes: bool | None = None,
values: Sequence[ValuesType] | None = None,
):
# Use `update` to set attribute values because it ensures `contributes` is set (if
# `contributes` is not provided, `update` will derive it from the `values` value)
self.update(
hint=hint,
contributes=contributes,
Expand Down Expand Up @@ -143,8 +150,9 @@ def shallow_copy(self) -> Self:
return copy

def iter_values(self) -> Generator[str | int]:
"""Recursively walks the component and flattens it into a list of
values.
"""
Recursively walks the component tree, gathering literal values from contributing
branches into a flat list.
"""
if self.contributes:
for value in self.values:
Expand Down
14 changes: 11 additions & 3 deletions src/sentry/grouping/fingerprinting/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,9 @@ def _get_release(self) -> list[_ReleaseInfo]:
return self._release

def get_values(self, match_type: str) -> list[dict[str, Any]]:
"""
Pull values from all the spots in the event appropriate to the given match type.
"""
return getattr(self, "_get_" + match_type)()


Expand Down Expand Up @@ -375,7 +378,12 @@ def _from_config_structure(


class FingerprintMatcher:
def __init__(self, key: str, pattern: str, negated: bool = False) -> None:
def __init__(
self,
key: str, # The event attribute on which to match
pattern: str, # The value to match (or to not match, depending on `negated`)
negated: bool = False, # If True, match when `event[key]` does NOT equal `pattern`
) -> None:
if key.startswith("tags."):
self.key = key
else:
Expand Down Expand Up @@ -422,7 +430,7 @@ def _positive_path_match(self, value: str | None) -> bool:
return False

def _positive_match(self, values: dict[str, Any]) -> bool:
# path is special in that it tests against two values (abs_path and path)
# `path` is special in that it tests against two values (`abs_path` and `filename`)
if self.key == "path":
value = values.get("abs_path")
if self._positive_path_match(value):
Expand All @@ -433,7 +441,7 @@ def _positive_match(self, values: dict[str, Any]) -> bool:
return True
return False

# message tests against value as well as this is what users expect
# message tests against exception value also, as this is what users expect
if self.key == "message":
for key in ("message", "value"):
value = values.get(key)
Expand Down
12 changes: 6 additions & 6 deletions src/sentry/grouping/parameterization.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,19 @@
@dataclasses.dataclass
class ParameterizationRegex:

name: str # name of the pattern also used as group name in combined regex
name: str # name of the pattern (also used as group name in combined regex)
raw_pattern: str # regex pattern w/o matching group name
lookbehind: str | None = None # positive lookbehind prefix if needed
lookahead: str | None = None # positive lookahead postfix if needed
counter: int = 0

# These need to be used with `(?x)` tells the regex compiler to ignore comments
# These need to be used with `(?x)`, to tell the regex compiler to ignore comments
# and unescaped whitespace, so we can use newlines and indentation for better legibility.

@property
def pattern(self) -> str:
"""
Returns the regex pattern for with as a named matching group and lookbehind/lookahead if needed.
Returns the regex pattern with a named matching group and lookbehind/lookahead if needed.
"""
prefix = rf"(?<={self.lookbehind})" if self.lookbehind else ""
postfix = rf"(?={self.lookahead})" if self.lookahead else ""
Expand All @@ -41,7 +41,7 @@ def pattern(self) -> str:
@property
def compiled_pattern(self) -> re.Pattern[str]:
"""
Returns the compiled regex pattern for with as a named matching group and lookbehind/lookahead if needed.
Returns the compiled regex pattern with a named matching group and lookbehind/lookahead if needed.
"""
if not hasattr(self, "_compiled_pattern"):
self._compiled_pattern = re.compile(rf"(?x){self.pattern}")
Expand Down Expand Up @@ -189,8 +189,8 @@ class ParameterizationCallable:
us more flexibility than just using regex.
"""

name: str # name of the pattern also used as group name in combined regex
apply: Callable[[str], tuple[str, int]] # function to modifying the input string
name: str # name of the pattern (also used as group name in combined regex)
apply: Callable[[str], tuple[str, int]] # function for modifying the input string
counter: int = 0


Expand Down
8 changes: 5 additions & 3 deletions src/sentry/grouping/strategies/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def strategy(
"""
Registers a strategy
:param ids: The strategy/delegate IDs with which to register
:param ids: The strategy/delegate IDs to register
:param interface: Which interface type should be dispatched to this strategy
:param score: Determines precedence of strategies. For example exception
strategy scores higher than message strategy, so if both interfaces are
Expand Down Expand Up @@ -90,6 +90,7 @@ def decorator(f: StrategyFunc[ConcreteInterface]) -> Strategy[ConcreteInterface]

class GroupingContext:
def __init__(self, strategy_config: "StrategyConfiguration"):
# The initial context is essentially the grouping config options
self._stack = [strategy_config.initial_context]
self.config = strategy_config
self.push()
Expand All @@ -99,6 +100,7 @@ def __setitem__(self, key: str, value: ContextValue) -> None:
self._stack[-1][key] = value

def __getitem__(self, key: str) -> ContextValue:
# Walk down the stack from the top and return the first instance of `key` found
for d in reversed(self._stack):
if key in d:
return d[key]
Expand Down Expand Up @@ -200,9 +202,9 @@ def __repr__(self) -> str:
def _invoke(
self, func: Callable[..., ReturnedVariants], *args: Any, **kwargs: Any
) -> ReturnedVariants:
# We forcefully override strategy here. This lets a strategy
# We forcefully override strategy here. This lets a strategy
# function always access its metadata and directly forward it to
# subcomponents without having to filter out strategy.
# subcomponents.
kwargs["strategy"] = self
return func(*args, **kwargs)

Expand Down
35 changes: 17 additions & 18 deletions src/sentry/grouping/strategies/newstyle.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,8 +351,9 @@ def frame(
if context["javascript_fuzzing"] and get_behavior_family_for_platform(platform) == "javascript":
func = frame.raw_function or frame.function
if func:
# Strip leading namespacing, i.e., turn `some.module.path.someFunction` into
# `someFunction` and `someObject.someMethod` into `someMethod`
func = func.rsplit(".", 1)[-1]
# special case empty functions not to have a hint
if not func:
function_component.update(contributes=False)
elif func in (
Expand Down Expand Up @@ -518,15 +519,9 @@ def single_exception(

if exception.mechanism:
if exception.mechanism.synthetic:
# Ignore synthetic exceptions as they are produced from platform
# specific error codes.
#
# For example there can be crashes with EXC_ACCESS_VIOLATION_* on Windows with
# the same exact stacktrace as a crash with EXC_BAD_ACCESS on macOS.
#
# Do not update type component of system variant, such that regex
# can be continuously modified without unnecessarily creating new
# groups.
# Ignore the error type for synthetic exceptions as it can vary by platform and doesn't
# actually carry any meaning with respect to what went wrong. (Synthetic exceptions
# are dummy exceptions created by the SDK in order to harvest a stacktrace.)
type_component.update(contributes=False, hint="ignored because exception is synthetic")
system_type_component.update(
contributes=False, hint="ignored because exception is synthetic"
Expand Down Expand Up @@ -615,7 +610,7 @@ def chained_exception(
# Get all the exceptions to consider.
all_exceptions = interface.exceptions()

# Get the grouping components for all exceptions up front, as we'll need them in a few places and only want to compute them once.
# For each exception, create a dictionary of grouping components by variant name
exception_components_by_exception = {
id(exception): context.get_grouping_components_by_variant(exception, event=event, **meta)
for exception in all_exceptions
Expand All @@ -638,12 +633,16 @@ def chained_exception(
if main_exception_id:
event.data["main_exception_id"] = main_exception_id

# Case 1: we have a single exception, use the single exception
# component directly to avoid a level of nesting
# Cases 1 and 2: Either this never was a chained exception (this is our entry point for single
# exceptions, too), or this is a chained exception consisting solely of an exception group and a
# single inner exception. In the former case, all we have is the single exception component, so
# return it. In the latter case, the there's no value-add to the wrapper, so discard it and just
# return the component for the inner exception.
if len(exceptions) == 1:
return exception_components_by_exception[id(exceptions[0])]

# Case 2: produce a component for each chained exception
# Case 3: This is either a chained exception or an exception group containing at least two inner
# exceptions. Either way, we need to wrap our exception components in a chained exception component.
exception_components_by_variant: dict[str, list[ExceptionGroupingComponent]] = {}

for exception in exceptions:
Expand Down Expand Up @@ -710,8 +709,8 @@ def get_child_exceptions(exception: SingleException) -> list[SingleException]:
node = exception_tree.get(exception_id)
return node.children if node else []

# This recursive generator gets the "top-level exceptions", and is used below.
# "Top-level exceptions are those that are the first descendants of the root that are not exception groups.
# This recursive generator gets the "top-level exceptions," and is used below.
# Top-level exceptions are those that are the first descendants of the root that are not exception groups.
# For examples, see https://github.com/getsentry/rfcs/blob/main/text/0079-exception-groups.md#sentry-issue-grouping
def get_top_level_exceptions(
exception: SingleException,
Expand Down Expand Up @@ -753,8 +752,8 @@ def get_first_path(exception: SingleException) -> Generator[SingleException]:
# If there's only one distinct top-level exception in the group,
# use it and its first-path children, but throw out the exception group and any copies.
# For example, Group<['Da', 'Da', 'Da']> should just be treated as a single 'Da'.
# We'll also set the main_exception_id, which is used in the extract_metadata function
# in src/sentry/eventtypes/error.py - which will ensure the issue is titled by this
# We'll also set `main_exception_id`, which is used in the `extract_metadata` function
# in `src/sentry/eventtypes/error.py`, in order to ensure the issue is titled by this
# item rather than the exception group.
if len(distinct_top_level_exceptions) == 1:
main_exception = distinct_top_level_exceptions[0]
Expand Down
4 changes: 2 additions & 2 deletions src/sentry/grouping/strategies/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ def remove_non_stacktrace_variants(variants: ReturnedVariants) -> ReturnedVarian
non_contributing_components = []
stacktrace_variants = set()

# In case any of the variants has a contributing stacktrace, we want
# to make all other variants non contributing.
# If at least one variant has a contributing stacktrace, we want to mark all variants without a
# stacktrace as non-contributing.
for variant_name, component in variants.items():
stacktrace_iter = component.iter_subcomponents(
id="stacktrace", recursive=True, only_contributing=True
Expand Down
10 changes: 10 additions & 0 deletions src/sentry/grouping/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,22 @@ def is_default_fingerprint_var(value):


def hash_from_values(values):
"""
Primarily used at the end of the grouping process, to get a final hash value once all of the
variants have been constructed, but also used as a hack to compare exception components (by
stringifying their reprs) when calculating variants for chained exceptions.
"""
result = md5()
for value in values:
result.update(force_bytes(value, errors="replace"))
return result.hexdigest()


def bool_from_string(value):
"""
Convert various string representations of boolean values ("1", "yes", "true", "0", "no",
"false") into actual booleans.
"""
if value:
value = value.lower()
if value in ("1", "yes", "true"):
Expand Down Expand Up @@ -78,6 +87,7 @@ def get_fingerprint_value(var, data):
elif var == "logger":
return data.get("logger") or "<no-logger>"
elif var.startswith("tags."):
# Turn "tags.some_tag" into just "some_tag"
tag = var[5:]
for t, value in data.get("tags") or ():
if t == tag:
Expand Down
8 changes: 4 additions & 4 deletions src/sentry/grouping/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,7 @@ def _get_metadata_as_dict(self):


class ComponentVariant(BaseVariant):
"""A component variant is a variant that produces a hash from the
`BaseGroupingComponent` it encloses.
"""
"""A variant that produces a hash from the `BaseGroupingComponent` it encloses."""

type = "component"

Expand Down Expand Up @@ -201,7 +199,7 @@ def _get_metadata_as_dict(self) -> FingerprintVariantMetadata:


class BuiltInFingerprintVariant(CustomFingerprintVariant):
"""A built-in, Sentry defined fingerprint."""
"""A built-in, Sentry-defined fingerprint."""

type = "built_in_fingerprint"

Expand Down Expand Up @@ -235,6 +233,8 @@ def get_hash(self) -> str | None:
return None
final_values: list[str | int] = []
for value in self.values:
# If we've hit the `{{ default }}` part of the fingerprint, pull in values from the
# original grouping method (message, stacktrace, etc.)
if is_default_fingerprint_var(value):
final_values.extend(self.component.iter_values())
else:
Expand Down
2 changes: 1 addition & 1 deletion src/sentry/interfaces/security.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ class Csp(SecurityReport):
"""
A CSP violation report.
See also: http://www.w3.org/TR/CSP/#violation-reports
See also: https://www.w3.org/TR/CSP/#violation-events
>>> {
>>> "document_uri": "http://example.com/",
Expand Down
4 changes: 2 additions & 2 deletions src/sentry/options/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -906,13 +906,13 @@
register(
"seer.similarity.global-rate-limit",
type=Dict,
default={"limit": 20, "window": 1},
default={"limit": 20, "window": 1}, # window is in seconds
flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
)
register(
"seer.similarity.per-project-rate-limit",
type=Dict,
default={"limit": 5, "window": 1},
default={"limit": 5, "window": 1}, # window is in seconds
flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
)

Expand Down
2 changes: 1 addition & 1 deletion src/sentry/utils/tag_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def normalized_sdk_tag_from_event(data: Mapping[str, Any]) -> str:
Note: Some platforms may keep their framework-specific values, as needed for analytics.
This is done to reduce the cardinality of the `sdk.name` tag, while keeping
the ones interesinting to us as granual as possible.
the ones interesting to us as granular as possible.
"""
try:
return normalize_sdk_tag((data.get("sdk") or {}).get("name") or "other")
Expand Down
1 change: 1 addition & 0 deletions tests/sentry/grouping/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ def _dump_component(component: BaseGroupingComponent, indent: int) -> None:

lines.append("{}hash: {}".format(" " * indent, to_json(variant.get_hash())))

# Note that this prints `__dict__`, not `as_dict()`, so if something seems missing, that's probably why
for key, value in sorted(variant.__dict__.items()):
if isinstance(value, BaseGroupingComponent):
lines.append("{}{}:".format(" " * indent, key))
Expand Down
2 changes: 1 addition & 1 deletion tests/sentry/grouping/test_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def test_variants_with_legacy_configs(
config_name: str, grouping_input: GroupingInput, insta_snapshot: InstaSnapshotter
) -> None:
"""
Run the variant snapshot tests using an minimal (and much more performant) save process.
Run the variant snapshot tests using a minimal (and much more performant) save process.
Because manually cherry-picking only certain parts of the save process to run makes us much more
likely to fall out of sync with reality, for safety we only do this when testing legacy,
Expand Down

0 comments on commit 247bdff

Please sign in to comment.