diff --git a/research_datastream/terraform/GETTING_STARTED.md b/research_datastream/terraform/GETTING_STARTED.md index f72757e72..68789abd3 100644 --- a/research_datastream/terraform/GETTING_STARTED.md +++ b/research_datastream/terraform/GETTING_STARTED.md @@ -100,13 +100,19 @@ Starting from execution_template_general_purpose. Make sure to wrap commands in ``` ### Edit Run Options -The state machine is capable of confirming a complete execution by checking for the existence output data in the form of an s3 object. Set booleans here. If `s3_bucket` and `s3_prefix` are provided in `datastream_command_options`, `ngen-datastream` will create a `ngen-run.tar.gz` file that can be found at `s3://<s3_bucket>/<s3_prefix>/ngen-run.tar.gz` ``` "run_options":{ - "ii_delete_volume" : false, - "ii_check_s3" : true + "ii_terminate_instance" : true, + "ii_delete_volume" : false, + "ii_check_s3" : true, + "timeout_s" : 3600 }, ``` +If `s3_bucket` and `s3_prefix` are provided in `datastream_command_options` and `ii_check_s3` is set to `true`, the state machine will confirm that at least one object exists at `s3://<s3_bucket>/<s3_prefix>/`. + +`ii_terminate_instance` and `ii_delete_volume` allow the user to clean up AWS resources to avoid needless costs. While stopped instances do not incur costs, detached volumes do incur costs until deleted. + +`timeout_s` is a timeout for the commands issued during execution. This is valuable for shutting down hanging instances that may become unresponsive due to memory overflow, etc. The default is 3600 seconds. ### Edit Instance Options 4) Define the AMI ID. 
diff --git a/research_datastream/terraform/executions/execution_datastream_example.json b/research_datastream/terraform/executions/execution_datastream_example.json index 773f7103d..48b84f0d1 100644 --- a/research_datastream/terraform/executions/execution_datastream_example.json +++ b/research_datastream/terraform/executions/execution_datastream_example.json @@ -12,8 +12,10 @@ "s3_prefix" : "test_directory" }, "run_options":{ - "ii_delete_volume" : true, - "ii_check_s3" : true + "ii_delete_volume" : true, + "ii_check_s3" : true, + "ii_terminate_instance" : false, + "timeout_s" : 3600 }, "instance_parameters": { "ImageId": "ami-062bdcbb454b8d833", diff --git a/research_datastream/terraform/executions/execution_gp_example.json b/research_datastream/terraform/executions/execution_gp_example.json index 1db212df8..065dd0601 100644 --- a/research_datastream/terraform/executions/execution_gp_example.json +++ b/research_datastream/terraform/executions/execution_gp_example.json @@ -3,12 +3,14 @@ "runuser -l ec2-user -c 'ls -la'" ], "run_options":{ - "ii_delete_volume" : false, - "ii_check_s3" : false + "ii_delete_volume" : false, + "ii_terminate_instance" : true, + "ii_check_s3" : false, + "timeout_s" : 3600 }, "instance_parameters" : { - "ImageId" : "ami-03b72f226b125860d", + "ImageId" : "ami-07161bb3f4b6e5b6d", "InstanceType" : "t4g.large", "KeyName" : "jlaser_west2", "SecurityGroupIds" : ["sg-04365a4248fe126bc"], diff --git a/research_datastream/terraform/executions/execution_template_datastream.json b/research_datastream/terraform/executions/execution_template_datastream.json index 5b5cc5e51..c5cb31595 100644 --- a/research_datastream/terraform/executions/execution_template_datastream.json +++ b/research_datastream/terraform/executions/execution_template_datastream.json @@ -12,8 +12,10 @@ "s3_prefix" : "" }, "run_options":{ - "ii_delete_volume" : true, - "ii_check_s3" : true + "ii_terminate_instance" : true, + "ii_delete_volume" : true, + "ii_check_s3" : true, + 
"timeout_s" : 3600 }, "instance_parameters": { "ImageId": "", diff --git a/research_datastream/terraform/lambda_functions/checker_lambda.zip b/research_datastream/terraform/lambda_functions/checker_lambda.zip deleted file mode 100644 index 8b1549618..000000000 Binary files a/research_datastream/terraform/lambda_functions/checker_lambda.zip and /dev/null differ diff --git a/research_datastream/terraform/lambda_functions/commander_lambda.zip b/research_datastream/terraform/lambda_functions/commander_lambda.zip deleted file mode 100644 index 7c2fcd703..000000000 Binary files a/research_datastream/terraform/lambda_functions/commander_lambda.zip and /dev/null differ diff --git a/research_datastream/terraform/lambda_functions/poller/lambda_function.py b/research_datastream/terraform/lambda_functions/poller/lambda_function.py index fdc7bcde6..fc39800e6 100644 --- a/research_datastream/terraform/lambda_functions/poller/lambda_function.py +++ b/research_datastream/terraform/lambda_functions/poller/lambda_function.py @@ -20,6 +20,7 @@ def lambda_handler(event, context): Generic Poller funcion """ t0 = time.perf_counter() + timeout_s = event['run_options']['timeout_s'] global client_ssm, client_ec2 client_ssm = boto3.client('ssm',region_name=event['region']) @@ -40,9 +41,13 @@ def lambda_handler(event, context): elif output['Status'] == 'InProgress': ii_pass = False print(f'Commands are still in progress. 
Waiting 5 seconds and checking again') - if (time.perf_counter() - t0) > 850: + if (time.perf_counter() - t0) > 800: print(f'Cycling...') ii_time = True + duration = time.time() - event['t0'] + if duration >= timeout_s: + print(f'Duration -> {duration}\nTimeout -> {timeout_s}') + raise Exception(f'Commands duration have exceed the timeout specified in the execution') time.sleep(5) else: raise Exception(f'Command failed {output}') diff --git a/research_datastream/terraform/lambda_functions/poller_lambda.zip b/research_datastream/terraform/lambda_functions/poller_lambda.zip deleted file mode 100644 index 54256bf28..000000000 Binary files a/research_datastream/terraform/lambda_functions/poller_lambda.zip and /dev/null differ diff --git a/research_datastream/terraform/lambda_functions/start_ami/lambda_function.py b/research_datastream/terraform/lambda_functions/start_ami/lambda_function.py index 0958ea543..9ac4fc57b 100644 --- a/research_datastream/terraform/lambda_functions/start_ami/lambda_function.py +++ b/research_datastream/terraform/lambda_functions/start_ami/lambda_function.py @@ -34,6 +34,12 @@ def replace_in_dict(d, pattern, replacement): def lambda_handler(event, context): + t0 = time.time() + event['t0'] = t0 + if not "timeout_s" in event['run_options']: + print(f'Setting timeout_s to default 3600 seconds') + event['run_options']['timeout_s'] = 3600 + event['region'] = os.environ['AWS_REGION'] global client_ec2 client_ec2 = boto3.client('ec2',region_name=event['region']) diff --git a/research_datastream/terraform/lambda_functions/starter_lambda.zip b/research_datastream/terraform/lambda_functions/starter_lambda.zip deleted file mode 100644 index bda47efd5..000000000 Binary files a/research_datastream/terraform/lambda_functions/starter_lambda.zip and /dev/null differ diff --git a/research_datastream/terraform/lambda_functions/stopper/lambda_function.py b/research_datastream/terraform/lambda_functions/stopper/lambda_function.py index 7a0281ad7..afa554761 
100644 --- a/research_datastream/terraform/lambda_functions/stopper/lambda_function.py +++ b/research_datastream/terraform/lambda_functions/stopper/lambda_function.py @@ -17,6 +17,20 @@ def confirm_detach(volume_id): time.sleep(1) else: return + +def confirm_instance_termination(instance_id): + while True: + response = client_ec2.describe_instances( + InstanceIds=[ + instance_id + ] + ) + if response['Reservations'][0]['Instances'][0]['State']['Name'] != 'terminated': + print(f'Instance not yet terminated') + time.sleep(1) + else: + print(f'Instance {instance_id} terminated') + return def lambda_handler(event, context): """ @@ -41,20 +55,28 @@ def lambda_handler(event, context): ],) print(response) volume_id=event['volume_id'] - if event["run_options"]["ii_delete_volume"]: - print(f'Instance VolumeId {volume_id} located.') - response = client_ec2.detach_volume( - InstanceId=instance_id, - VolumeId=volume_id, - DryRun=False + if event["run_options"]["ii_terminate_instance"]: + response = client_ec2.terminate_instances( + InstanceIds=[ + instance_id, + ], ) - confirm_detach(volume_id) - print(f'EBS volume {instance_id} has been successfully detached.') - response = client_ec2.delete_volume( - VolumeId=volume_id, - DryRun=False - ) - print(f'EBS volume {instance_id} has been successfully deleted.') + confirm_instance_termination(instance_id) else: - print(f"Volume {volume_id} remains attached or available and is still incurring costs.") + if event["run_options"]["ii_delete_volume"]: + print(f'Instance VolumeId {volume_id} located.') + response = client_ec2.detach_volume( + InstanceId=instance_id, + VolumeId=volume_id, + DryRun=False + ) + confirm_detach(volume_id) + print(f'EBS volume {instance_id} has been successfully detached.') + response = client_ec2.delete_volume( + VolumeId=volume_id, + DryRun=False + ) + print(f'EBS volume {volume_id} has been successfully deleted.') + else: + print(f"Volume {volume_id} remains attached or available and is still incurring 
costs.") diff --git a/research_datastream/terraform/lambda_functions/stopper_lambda.zip b/research_datastream/terraform/lambda_functions/stopper_lambda.zip deleted file mode 100644 index 01ba661c4..000000000 Binary files a/research_datastream/terraform/lambda_functions/stopper_lambda.zip and /dev/null differ diff --git a/research_datastream/terraform/main.tf b/research_datastream/terraform/main.tf index 5142988e6..a9a662e9a 100644 --- a/research_datastream/terraform/main.tf +++ b/research_datastream/terraform/main.tf @@ -124,6 +124,7 @@ resource "aws_iam_policy" "datastreamlambda_policy" { "ec2:StartInstances", "ec2:StopInstances", "ec2:DescribeInstances", + "ec2:TerminateInstances", "ec2:DescribeVolumes", "ec2:DeleteVolume", "ec2:DetachVolume", diff --git a/research_datastream/terraform/test/execution_gp_arm_docker_buildNtester.json b/research_datastream/terraform/test/execution_gp_arm_docker_buildNtester.json index b0c7d6580..490c1d879 100644 --- a/research_datastream/terraform/test/execution_gp_arm_docker_buildNtester.json +++ b/research_datastream/terraform/test/execution_gp_arm_docker_buildNtester.json @@ -9,12 +9,14 @@ "runuser -l ec2-user -c 'aws s3 cp /home/ec2-user/ngen-datastream/docker_build_log.txt s3://ciroh-community-ngen-datastream/docker_build_log.txt'" ], "run_options":{ - "ii_delete_volume" : true, - "check_s3" : false + "ii_terminate_instance" : true, + "ii_delete_volume" : true, + "ii_check_s3" : false, + "timeout_s" : 3600 }, "instance_parameters" : { - "ImageId" : "ami-03b72f226b125860d", + "ImageId" : "ami-07161bb3f4b6e5b6d", "InstanceType" : "t4g.large", "KeyName" : "actions_key_arm", "SecurityGroupIds" : ["sg-0fcbe0c6d6faa0117"], diff --git a/research_datastream/terraform/test/execution_gp_test.json b/research_datastream/terraform/test/execution_gp_test.json index c088dc03e..24d45b30a 100644 --- a/research_datastream/terraform/test/execution_gp_test.json +++ b/research_datastream/terraform/test/execution_gp_test.json @@ -3,12 +3,14 @@ 
"runuser -l ec2-user -c 'ls -la'" ], "run_options":{ - "ii_delete_volume" : true, - "check_s3" : false + "ii_terminate_instance" : true, + "ii_delete_volume" : true, + "ii_check_s3" : false, + "timeout_s" : 3600 }, "instance_parameters" : { - "ImageId" : "ami-03b72f226b125860d", + "ImageId" : "ami-07161bb3f4b6e5b6d", "InstanceType" : "t4g.nano", "KeyName" : "actions_key", "SecurityGroupIds" : ["sg-06f57f883e902d7bc"],