Skip to content

Commit

Permalink
allow to set instance in diff operator (#7634)
Browse files Browse the repository at this point in the history
GitOrigin-RevId: 9c968d8a6b7a816efbd271a5b60d7ca4e81e71f2
  • Loading branch information
KamilPiechowiak authored and Manul from Pathway committed Nov 13, 2024
1 parent 0dfeb04 commit e7cee98
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 2 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
### Added
- `pw.io.kafka.read` now supports reading entries starting from a specified timestamp.

### Changed
- `pw.Table.diff` now supports setting `instance` parameter that allows computing differences for multiple groups.

## [0.15.3] - 2024-11-07

### Added
Expand Down
29 changes: 27 additions & 2 deletions python/pathway/stdlib/ordered/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ def diff(
self: pw.Table,
timestamp: pw.ColumnReference,
*values: pw.ColumnReference,
instance: pw.ColumnReference | None = None,
) -> pw.Table:
"""
Compute the difference between the values in the ``values`` columns and the previous values
Expand All @@ -22,6 +23,9 @@ def diff(
The column reference to the ``timestamp`` column on which the order is computed.
- *values (pw.ColumnReference[int | float | datetime]):
Variable-length argument representing the column references to the ``values`` columns.
- instance (pw.ColumnReference):
Can be used to group the values. The difference is only computed between rows with
the same ``instance`` value.
Returns:
``Table``: A new table where each column is replaced with a new column containing
Expand All @@ -31,7 +35,7 @@ def diff(
ValueError: If the columns are not ColumnReference.
Note:
- The value of the "first" value (the row with the lower value \
- The value of the "first" value (the row with the lowest value \
in the ``timestamp`` column) is ``None``.
Example:
Expand All @@ -55,6 +59,27 @@ def diff(
4 | 7 | 3
5 | 11 | 4
6 | 16 | 5
>>> table = pw.debug.table_from_markdown(
... '''
... timestamp | instance | values
... 1 | 0 | 1
... 2 | 1 | 2
... 3 | 1 | 4
... 3 | 0 | 7
... 6 | 1 | 11
... 6 | 0 | 16
... '''
... )
>>> table += table.diff(pw.this.timestamp, pw.this.values, instance=pw.this.instance)
>>> pw.debug.compute_and_print(table, include_id=False)
timestamp | instance | values | diff_values
1 | 0 | 1 |
2 | 1 | 2 |
3 | 0 | 7 | 6
3 | 1 | 4 | 2
6 | 0 | 16 | 9
6 | 1 | 11 | 7
"""

if isinstance(timestamp, pw.ColumnReference):
Expand All @@ -69,7 +94,7 @@ def diff(
"statistical.diff(): Invalid column reference for the parameter timestamp."
)

ordered_table = self.sort(key=timestamp)
ordered_table = self.sort(key=timestamp, instance=instance)

for value in values:
if isinstance(value, pw.ColumnReference):
Expand Down
35 changes: 35 additions & 0 deletions python/pathway/tests/ordered/test_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,38 @@ def test_diff_multiple_columns():
)

assert_table_equality_wo_index(res, expected)


def test_diff_instance():
t = T(
"""
| t | i | v
1 | 1 | 0 | 1
2 | 2 | 1 | 2
3 | 3 | 1 | 4
4 | 3 | 0 | 7
5 | 5 | 1 | 11
6 | 5 | 0 | 16
7 | 7 | 0 | 22
8 | 8 | 1 | 29
9 | 9 | 0 | 37
"""
)
res = t.diff(t.t, t.v, instance=t.i)

expected = T(
"""
| diff_v
1 |
2 |
3 | 2
4 | 6
5 | 7
6 | 9
7 | 6
8 | 18
9 | 15
"""
)

assert_table_equality_wo_index(res, expected)

0 comments on commit e7cee98

Please sign in to comment.