Skip to content

Commit

Permalink
[Chore] Improve test flakes (#404)
Browse files Browse the repository at this point in the history
## Problem

I frequently see recurring test flakes, most often at the cleanup stage
of `test_configure_index_with_deletion_protection` because the index
cannot be deleted while still in an "upgrading" state. The index is in
that state while configuration changes are being applied.

## Solution

- In deletion protection test, wait for the index to be ready before
attempting the delete.
- In dependency tests which do a basic sanity test, loosen the
assertions. We're not really intending to test the backend functionality
with these tests; we just want to make sure network calls are
successfully made with a variety of different grpc / protobuf / etc
dependency versions.
- Remove some assertions that rely on non-deterministic behavior on the
backend. E.g. how long it takes for a serverless index to be considered
ready.
- Check for deletion protection status before trying to delete.
- Don't fail the job when cleanup fails; even if we re-run the entire
job, there will still be an orphaned index from a previous run that
needs to be cleaned up. Best to just attempt to delete, then let the
nightly cleanup job garbage collect anything left over.

## Type of Change

- [x] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to not work as expected)
- [ ] This change requires a documentation update
- [x] Infrastructure change (CI configs, etc)
- [ ] Non-code change (docs, etc)
- [ ] None of the above: (explain here)
  • Loading branch information
jhamon authored Oct 25, 2024
1 parent 3780924 commit d1fda39
Show file tree
Hide file tree
Showing 9 changed files with 120 additions and 32 deletions.
7 changes: 6 additions & 1 deletion .github/actions/cleanup-all/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,16 @@ inputs:
PINECONE_API_KEY:
description: 'The Pinecone API key'
required: true
DELETE_ALL:
description: 'Delete all indexes and collections'
required: false
default: 'false'

runs:
using: 'composite'
steps:
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: 3.9
- name: Setup Poetry
Expand All @@ -20,3 +24,4 @@ runs:
run: poetry run python3 scripts/cleanup-all.py
env:
PINECONE_API_KEY: ${{ inputs.PINECONE_API_KEY }}
DELETE_ALL: ${{ inputs.DELETE_ALL }}
25 changes: 25 additions & 0 deletions scripts/cleanup-all.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import re
from pinecone import Pinecone
from datetime import datetime, timedelta
import time


def delete_everything(pc):
Expand All @@ -16,6 +17,30 @@ def delete_everything(pc):
for index in pc.list_indexes().names():
try:
print("Deleting index: " + index)
desc = pc.describe_index(index)

# Check whether index can be deleted
if desc.deletion_protection == "enabled":
pc.configure_index(index, deletion_protection="disabled")

# Wait for index to be ready before deleting
ready_to_delete = False
max_wait = 60
time_waited = 0
while not ready_to_delete:
desc = pc.describe_index(index)
if desc.status.state == "Ready":
ready_to_delete = True
break
else:
print("Index is not ready yet. Waiting for 2 seconds.")
time.sleep(2)
time_waited += 2

if time_waited > max_wait:
print(f"Timed out waiting for index {index} to be ready")
break

pc.delete_index(index)
except Exception as e:
print("Failed to delete index: " + index + " " + str(e))
Expand Down
2 changes: 1 addition & 1 deletion tests/dependency/grpc/test_sanity.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def test_sanity(self, index_name, client):
# Check the vector count reflects some data has been upserted
description = idx.describe_index_stats()
assert description.dimension == 2
assert description.total_vector_count == 3
assert description.total_vector_count >= 3

# Query for results
query_results = idx.query(id="1", top_k=10, include_values=True)
Expand Down
2 changes: 1 addition & 1 deletion tests/dependency/rest/test_sanity.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def test_sanity(self, index_name, client):
# Check the vector count reflects some data has been upserted
description = idx.describe_index_stats()
assert description.dimension == 2
assert description.total_vector_count == 3
assert description.total_vector_count >= 3

# Query for results
query_results = idx.query(id="1", top_k=10, include_values=True)
Expand Down
52 changes: 40 additions & 12 deletions tests/integration/control/pod/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def ready_index(client, index_name, create_index_params):
client.create_index(**create_index_params)
time.sleep(10) # Extra wait, since status is sometimes inaccurate
yield index_name
client.delete_index(index_name, -1)
attempt_delete_index(client, index_name)


@pytest.fixture()
Expand All @@ -64,10 +64,6 @@ def notready_index(client, index_name, create_index_params):
yield index_name


def index_exists(index_name, client):
return index_name in client.list_indexes().names()


@pytest.fixture(scope="session")
def reusable_collection():
pc = Pinecone(
Expand Down Expand Up @@ -113,20 +109,43 @@ def reusable_collection():
raise Exception(f"Collection {collection_name} is not ready after 120 seconds")

print(f"Collection {collection_name} is ready. Deleting index {index_name}...")
pc.delete_index(index_name)
attempt_delete_index(pc, index_name)

yield collection_name

print(f"Deleting collection {collection_name}...")
pc.delete_collection(collection_name)
attempt_delete_collection(pc, collection_name)


@pytest.fixture(autouse=True)
def cleanup(client, index_name):
yield
def attempt_delete_collection(client, collection_name):
    """Best-effort deletion of a collection, retrying for up to 120 seconds.

    A collection in a transient backend state cannot be deleted right away,
    so we poll-and-retry rather than failing the test run. Anything that
    still cannot be deleted after the timeout is left behind for the nightly
    cleanup job to garbage collect.
    """
    time_waited = 0
    while collection_name in client.list_collections().names() and time_waited < 120:
        print(
            f"Waiting for collection {collection_name} to be ready to delete. Waited {time_waited} seconds.."
        )
        time_waited += 5
        time.sleep(5)
        try:
            print(f"Attempting delete of collection {collection_name}")
            client.delete_collection(collection_name)
            print(f"Deleted collection {collection_name}")
            break
        except Exception as e:
            # Delete fails while the backend still considers the collection
            # busy; log and retry on the next loop iteration.
            # (Removed a redundant `pass` that followed this print.)
            print(f"Unable to delete collection {collection_name}: {e}")

    if time_waited >= 120:
        # Things that fail to delete due to transient statuses will be garbage
        # collected by the nightly cleanup script
        print(f"Collection {collection_name} could not be deleted after 120 seconds")


def attempt_delete_index(client, index_name):
time_waited = 0
while index_exists(index_name, client) and time_waited < 120:
while client.has_index(index_name) and time_waited < 120:
if client.describe_index(index_name).delete_protection == "enabled":
client.configure_index(index_name, deletion_protection="disabled")

print(
f"Waiting for index {index_name} to be ready to delete. Waited {time_waited} seconds.."
)
Expand All @@ -142,4 +161,13 @@ def cleanup(client, index_name):
pass

if time_waited >= 120:
raise Exception(f"Index {index_name} could not be deleted after 120 seconds")
# Things that fail to delete due to transient statuses will be garbage
# collected by the nightly cleanup script
print(f"Index {index_name} could not be deleted after 120 seconds")


@pytest.fixture(autouse=True)
# Runs automatically around every test in this package: the test body
# executes at the yield point, then a best-effort delete of the test's
# index is attempted so runs don't leak resources. Deletion failures are
# logged (not raised) inside attempt_delete_index; leftovers are swept by
# the nightly cleanup job.
def cleanup(client, index_name):
    yield

    attempt_delete_index(client, index_name)
29 changes: 25 additions & 4 deletions tests/integration/control/pod/test_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,28 @@
from ...helpers import generate_index_name, generate_collection_name


def attempt_cleanup_collection(client, collection_name):
    """Best-effort delete of *collection_name*; never raises.

    Pauses briefly so any in-flight backend operation can settle before
    the delete is attempted. Failures are logged and swallowed — orphaned
    collections are garbage collected later by the scheduled cleanup job.
    """
    try:
        time.sleep(10)
        client.delete_collection(collection_name)
    except Exception as err:
        # The backend usually reports a still-pending operation on the
        # resource; leave the orphan for the cleanup job to collect.
        print(f"Failed to cleanup collection: {err}")


def attempt_cleanup_index(client, index_name):
    """Best-effort delete of *index_name*; never raises.

    Pauses briefly so any in-flight backend operation can settle, then
    deletes with timeout -1 (do not block waiting for confirmation).
    Failures are logged and swallowed — orphaned indexes are garbage
    collected later by the scheduled cleanup job.
    """
    try:
        time.sleep(10)
        client.delete_index(index_name, -1)
    except Exception as e:
        # Failures here usually happen because the backend thinks there is still some
        # operation pending on the resource.
        # These orphaned resources will get cleaned up by the cleanup job later.
        # Fixed copy-paste bug: message previously said "collection".
        print(f"Failed to cleanup index: {e}")


class TestCollectionsHappyPath:
def test_index_to_collection_to_index_happy_path(
self, client, environment, dimension, metric, ready_index, random_vector
Expand Down Expand Up @@ -78,8 +100,8 @@ def test_index_to_collection_to_index_happy_path(
assert results.vectors[v[0]].values == pytest.approx(v[1], rel=0.01)

# Cleanup
client.delete_collection(collection_name)
client.delete_index(index_name)
attempt_cleanup_collection(client, collection_name)
attempt_cleanup_index(client, index_name)

def test_create_index_with_different_metric_from_orig_index(
self, client, dimension, metric, environment, reusable_collection
Expand All @@ -94,5 +116,4 @@ def test_create_index_with_different_metric_from_orig_index(
metric=target_metric,
spec=PodSpec(environment=environment, source_collection=reusable_collection),
)
time.sleep(10)
client.delete_index(index_name, -1)
attempt_cleanup_index(client, index_name)
24 changes: 22 additions & 2 deletions tests/integration/control/pod/test_deletion_protection.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pytest
import time
from pinecone import PodSpec


Expand Down Expand Up @@ -47,5 +48,24 @@ def test_configure_index_with_deletion_protection(self, client, index_name, envi
assert desc.spec.pod.replicas == 3
assert desc.deletion_protection == "disabled"

# Cleanup
client.delete_index(index_name)
# Wait up to 30*2 seconds for the index to be ready before attempting to delete
for t in range(1, 30):
delta = 2
desc = client.describe_index(index_name)
if desc.status.state == "Ready":
print(f"Index {index_name} is ready after {(t-1)*delta} seconds")
break
print("Index is not ready yet. Waiting for 2 seconds.")
time.sleep(delta)

attempts = 0
while attempts < 12:
try:
client.delete_index(index_name)
break
except Exception as e:
attempts += 1
print(f"Failed to delete index {index_name} on attempt {attempts}.")
print(f"Error: {e}")
client.describe_index(index_name)
time.sleep(10)
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
import pytest


class TestCreateIndexWithTimeout:
def test_create_index_default_timeout(self, client, create_sl_index_params):
create_sl_index_params["timeout"] = None
Expand All @@ -17,11 +14,6 @@ def test_create_index_when_timeout_set(self, client, create_sl_index_params):
desc = client.describe_index(create_sl_index_params["name"])
assert desc.status.ready == True

def test_create_index_when_timeout_error(self, client, create_sl_index_params):
create_sl_index_params["timeout"] = 1
with pytest.raises(TimeoutError):
client.create_index(**create_sl_index_params)

def test_create_index_with_negative_timeout(self, client, create_sl_index_params):
create_sl_index_params["timeout"] = -1
client.create_index(**create_sl_index_params)
Expand Down
3 changes: 0 additions & 3 deletions tests/integration/control/serverless/test_describe_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,3 @@ def test_describe_index_when_not_ready(self, client, notready_sl_index, create_s
assert isinstance(description.host, str)
assert description.host != ""
assert notready_sl_index in description.host

assert description.status.ready == False
assert description.status.state in ["Ready", "Initializing"]

0 comments on commit d1fda39

Please sign in to comment.