Fix missing index column in /filter (#1935)

* Fix missing index column in /filter * Fix test * Fix e2e test
huggingface · Oct 5, 2023 · 6e19231 · 6e19231
1 parent 41ea211
commit 6e19231
Show file tree

Hide file tree

Showing 3 changed files with 8 additions and 7 deletions.
diff --git a/e2e/tests/test_53_filter.py b/e2e/tests/test_53_filter.py
@@ -15,7 +15,7 @@ def test_filter_endpoint(
     headers = auth_headers[auth]
     offset = 1
     length = 2
-    where = "col_4 = 'B'"
+    where = "col_4='B'"
     filter_response = poll_until_ready_and_assert(
         relative_url=(
             f"/filter?dataset={dataset}&config={config}&split={split}&offset={offset}&length={length}&where={where}"
@@ -47,7 +47,7 @@ def test_filter_endpoint(
         "truncated_cells": [],
     }, rows[0]
     assert rows[1] == {
-        "row_idx": 2,
+        "row_idx": 3,
         "row": {
             "col_1": "The wingman spots the pirateship coming at him and warns the Dark Lord",
             "col_2": 3,
@@ -77,15 +77,13 @@ def test_filter_endpoint(
         ("col_2<3 OR col_4='B'", 4),
     ],
 )
-def test_where_parameter_in_filter_endpoint(
+def test_filter_endpoint_parameter_where(
     where: str, expected_num_rows: int, hf_public_dataset_repo_csv_data: str
 ) -> None:
     dataset = hf_public_dataset_repo_csv_data
     config, split = get_default_config_split()
     response = poll_until_ready_and_assert(
         relative_url=f"/filter?dataset={dataset}&config={config}&split={split}&where={where}",
-        expected_status_code=200,
-        expected_error_code=None,
         check_x_revision=True,
     )
     content = response.json()

diff --git a/services/search/src/search/routes/filter.py b/services/search/src/search/routes/filter.py
@@ -222,7 +222,10 @@ def execute_filter_query(
 ) -> tuple[int, pa.Table]:
     with duckdb_connect(database=index_file_location) as con:
         filter_query = FILTER_QUERY.format(
-            columns=",".join([f'"{column}"' for column in columns]), where=where, limit=limit, offset=offset
+            columns=",".join([f'"{column}"' for column in [ROW_IDX_COLUMN] + columns]),
+            where=where,
+            limit=limit,
+            offset=offset,
         )
         filter_count_query = FILTER_COUNT_QUERY.format(where=where)
         try:

diff --git a/services/search/tests/routes/test_filter.py b/services/search/tests/routes/test_filter.py
@@ -64,7 +64,7 @@ def test_execute_filter_query(index_file_location: str) -> None:
         index_file_location=index_file_location, columns=columns, where=where, limit=limit, offset=offset
     )
     assert num_rows_total == 2
-    assert pa_table == pa.Table.from_pydict({"name": ["Simone"], "age": [30]})
+    assert pa_table == pa.Table.from_pydict({"__hf_index_id": [3], "name": ["Simone"], "age": [30]})
 
 
 @pytest.mark.parametrize("where", ["non-existing-column=30", "name=30", "name>30"])