diff --git a/scripts/migrations/migration.004.s3_name_with_id.py b/scripts/migrations/migration.004.s3_name_with_id.py
index 0a2e203..21db0f5 100644
--- a/scripts/migrations/migration.004.s3_name_with_id.py
+++ b/scripts/migrations/migration.004.s3_name_with_id.py
@@ -29,15 +29,14 @@ def s3_name_with_id(bucket: str):
     contents = res["Contents"]
     moved_files = 0
     for s3_file in contents:
-        if s3_file["Key"].split("/")[0] == "aggregates":
-            key = s3_file["Key"]
-            key_array = key.split("/")
-            if len(key_array[3]) == 3:
-                key_array[3] = f"{key_array[2]}_{key_array[3]}"
-                new_key = "/".join(key_array)
-                client.copy({"Bucket": bucket, "Key": key}, bucket, new_key)
-                client.delete_object(Bucket=bucket, Key=key)
-                moved_files += 1
+        key = s3_file["Key"]
+        key_array = key.split("/")
+        if key_array[0] == "aggregates" and len(key_array[3]) == 3:
+            key_array[3] = f"{key_array[2]}__{key_array[3]}"
+            new_key = "/".join(key_array)
+            client.copy({"Bucket": bucket, "Key": key}, bucket, new_key)
+            client.delete_object(Bucket=bucket, Key=key)
+            moved_files += 1
     print(f"Updated {moved_files} aggregates")
diff --git a/src/handlers/dashboard/get_chart_data.py b/src/handlers/dashboard/get_chart_data.py
index de9d9dd..4da4f61 100644
--- a/src/handlers/dashboard/get_chart_data.py
+++ b/src/handlers/dashboard/get_chart_data.py
@@ -26,10 +26,11 @@ def _get_table_cols(dp_id: str, version: str | None = None) -> list:
     """
     s3_bucket_name = os.environ.get("BUCKET_NAME")
-    prefix = f"{enums.BucketPath.CSVAGGREGATE.value}/{dp_id.split('__')[0]}/{dp_id[:-4]}"
+    dp_name = dp_id.rsplit("__", 1)[0]
+    prefix = f"{enums.BucketPath.CSVAGGREGATE.value}/{dp_id.split('__')[0]}/{dp_name}"
     if version is None:
         version = functions.get_latest_data_package_version(s3_bucket_name, prefix)
-    s3_key = f"{prefix}/{version}/{dp_id[:-4]}__aggregate.csv"
+    s3_key = f"{prefix}/{version}/{dp_name}__aggregate.csv"
     s3_client = boto3.client("s3")
     try:
         s3_iter = s3_client.get_object(
@@ -117,6 +118,8 @@ def chart_data_handler(event, context):
         res = _format_payload(df, query_params, filters)
         res = functions.http_response(200, res)
     except errors.AggregatorS3Error:
+        # while the API is publicly accessible, we've been asked to not pass
+        # helpful error messages back. revisit when dashboard is in AWS.
         res = functions.http_response(
             404, f"Aggregate for {path_params['data_package_id']} not found"
         )
diff --git a/src/handlers/site_upload/powerset_merge.py b/src/handlers/site_upload/powerset_merge.py
index d4cacc1..18b6786 100644
--- a/src/handlers/site_upload/powerset_merge.py
+++ b/src/handlers/site_upload/powerset_merge.py
@@ -44,7 +44,7 @@ def __init__(self, event):
         self.study = s3_key_array[1]
         self.data_package = s3_key_array[2].split("__")[1]
         self.site = s3_key_array[3]
-        self.version = s3_key_array[4][-3:]
+        self.version = s3_key_array[4].split("__")[-1]
         self.metadata = functions.read_metadata(self.s3_client, self.s3_bucket_name)
         self.types_metadata = functions.read_metadata(
             self.s3_client,
@@ -87,7 +87,7 @@ def write_parquet(self, df: pandas.DataFrame, is_new_data_package: bool) -> None
         parquet_aggregate_path = (
             f"s3://{self.s3_bucket_name}/{enums.BucketPath.AGGREGATE.value}/"
             f"{self.study}/{self.study}__{self.data_package}/"
-            f"{self.study}__{self.data_package}_{self.version}/"
+            f"{self.study}__{self.data_package}__{self.version}/"
             f"{self.study}__{self.data_package}__aggregate.parquet"
         )
         awswrangler.s3.to_parquet(df, parquet_aggregate_path, index=False)
diff --git a/template.yaml b/template.yaml
index 6a654bb..3d8c21e 100644
--- a/template.yaml
+++ b/template.yaml
@@ -350,7 +350,7 @@ Resources:
           Type: Api
           Properties:
             RestApiId: !Ref DashboardApiGateway
-            Path: /data_packages/{data_package_id}/chart
+            Path: /data-packages/{data_package_id}/chart
             Method: GET
             RequestParameters:
               - method.request.querystring.column:
@@ -417,7 +417,7 @@ Resources:
          Type: Api
          Properties:
            RestApiId: !Ref DashboardApiGateway
-           Path: /last_valid/{study}/{data_package}/{site}/{version}/{filename}
+           Path: /last-valid/{study}/{data_package}/{site}/{version}/{filename}
            Method: GET
      Policies:
        - S3ReadPolicy:
@@ -463,7 +463,7 @@ Resources:
          Type: Api
          Properties:
            RestApiId: !Ref DashboardApiGateway
-           Path: /last_valid/
+           Path: /last-valid/
            Method: GET
      Policies:
        - S3ReadPolicy:
@@ -571,7 +571,7 @@ Resources:
          Type: Api
          Properties:
            RestApiId: !Ref DashboardApiGateway
-           Path: /data_packages
+           Path: /data-packages
            Method: GET
            RequestParameters:
              - method.request.querystring.name:
@@ -580,7 +580,7 @@ Resources:
          Type: Api
          Properties:
            RestApiId: !Ref DashboardApiGateway
-           Path: /data_packages/{data_package_id}
+           Path: /data-packages/{data_package_id}
           Method: GET
  # TODO: it :should: be possible to move these policies to a central role/policy
  # set that can be referenced in multiple places; see
diff --git a/tests/conftest.py b/tests/conftest.py
index 97a496a..a7abc6e 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -49,7 +49,7 @@ def _init_mock_data(s3_client, bucket, study, data_package, version):
         "./tests/test_data/count_synthea_patient_agg.parquet",
         bucket,
         f"{enums.BucketPath.AGGREGATE.value}/{study}/"
-        f"{study}__{data_package}/{study}__{data_package}_{version}/"
+        f"{study}__{data_package}/{study}__{data_package}__{version}/"
         f"{study}__{data_package}__aggregate.parquet",
     )
     s3_client.upload_file(
diff --git a/tests/dashboard/test_get_chart_data.py b/tests/dashboard/test_get_chart_data.py
index 51201e1..7acdbdf 100644
--- a/tests/dashboard/test_get_chart_data.py
+++ b/tests/dashboard/test_get_chart_data.py
@@ -104,8 +104,8 @@ def test_format_payload(query_params, filters, expected_payload):


 def test_get_data_cols(mock_bucket):
-    table_name = f"{EXISTING_STUDY}__{EXISTING_DATA_P}_{EXISTING_VERSION}"
-    res = get_chart_data._get_table_cols(table_name)
+    table_id = f"{EXISTING_STUDY}__{EXISTING_DATA_P}__{EXISTING_VERSION}"
f"{EXISTING_STUDY}__{EXISTING_DATA_P}__{EXISTING_VERSION}" + res = get_chart_data._get_table_cols(table_id) cols = pandas.read_csv("./tests/test_data/count_synthea_patient_agg.csv").columns assert res == list(cols) diff --git a/tests/site_upload/test_powerset_merge.py b/tests/site_upload/test_powerset_merge.py index 45b1499..106c4da 100644 --- a/tests/site_upload/test_powerset_merge.py +++ b/tests/site_upload/test_powerset_merge.py @@ -92,17 +92,17 @@ 200, ITEM_COUNT + 4, ), - # ( # ensuring that a data package that is a substring does not get - # # merged by substr match - # "./tests/test_data/count_synthea_patient.parquet", - # f"/{EXISTING_STUDY}/{EXISTING_STUDY}__{EXISTING_DATA_P[0:-2]}/{EXISTING_SITE}/" - # f"{EXISTING_STUDY}__{EXISTING_DATA_P[0:-2]}_{EXISTING_VERSION}/encount.parquet", - # f"/{EXISTING_STUDY}/{EXISTING_STUDY}__{EXISTING_DATA_P[0:-2]}/" - # f"{EXISTING_STUDY}__{EXISTING_DATA_P[0:-2]}_{EXISTING_VERSION}/encount.parquet", - # False, - # 200, - # ITEM_COUNT + 4, - # ), + ( # ensuring that a data package that is a substring does not get + # merged by substr match + "./tests/test_data/count_synthea_patient.parquet", + f"/{EXISTING_STUDY}/{EXISTING_STUDY}__{EXISTING_DATA_P[0:-2]}/{EXISTING_SITE}/" + f"{EXISTING_VERSION}/encount.parquet", + f"/{EXISTING_STUDY}/{EXISTING_STUDY}__{EXISTING_DATA_P[0:-2]}/{EXISTING_SITE}/" + f"{EXISTING_VERSION}/encount.parquet", + False, + 200, + ITEM_COUNT + 4, + ), ( # Empty file upload None, f"/{NEW_STUDY}/{NEW_STUDY}__{EXISTING_DATA_P}/{EXISTING_SITE}" @@ -136,7 +136,6 @@ def test_powerset_merge_single_upload( mock_notification, ): s3_client = boto3.client("s3", region_name="us-east-1") - s3_res = s3_client.list_objects_v2(Bucket=TEST_BUCKET) if upload_file is not None: s3_client.upload_file( upload_file,