Skip to content

Commit

Permalink
Merge pull request #142 from CIROH-UA/infra_updates
Browse files Browse the repository at this point in the history
add timeout
  • Loading branch information
JordanLaserGit authored Nov 18, 2024
2 parents e403c74 + 245a010 commit b45f9c1
Show file tree
Hide file tree
Showing 15 changed files with 81 additions and 31 deletions.
12 changes: 9 additions & 3 deletions research_datastream/terraform/GETTING_STARTED.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,13 +100,19 @@ Starting from execution_template_general_purpose. Make sure to wrap commands in
```

### Edit Run Options
The state machine is capable of confirming a complete execution by checking for the existence of output data in the form of an s3 object. Set booleans here. If `s3_bucket` and `s3_prefix` are provided in `datastream_command_options`, `ngen-datastream` will create a `ngen-run.tar.gz` file that can be found at `s3://<s3_bucket>/<s3_prefix>/ngen-run.tar.gz`
```
"run_options":{
"ii_delete_volume" : false,
"ii_check_s3" : true
"ii_terminate_instance" : true,
"ii_delete_volume" : false,
"ii_check_s3" : true,
"timeout_s" : 3600
},
```
If `s3_bucket` and `s3_prefix` are provided in `datastream_command_options` and `ii_check_s3` is set to `true`, the state machine will confirm that at least one object exists at `s3://<s3_bucket>/<s3_prefix>/`.

`ii_terminate_instance` and `ii_delete_volume` allow the user to clean up AWS resources to avoid needless costs. While stopped instances do not incur costs, detached volumes do incur costs until deleted.

`timeout_s` is a timeout for the commands issued during execution. This is valuable for shutting down hanging instances that may become unresponsive due to memory overflow, etc. Default is 3600.

### Edit Instance Options
4) Define the AMI ID.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@
"s3_prefix" : "test_directory"
},
"run_options":{
"ii_delete_volume" : true,
"ii_check_s3" : true
"ii_delete_volume" : true,
"ii_check_s3" : true,
"ii_terminate_instance" : false,
"timeout_s" : 3600
},
"instance_parameters": {
"ImageId": "ami-062bdcbb454b8d833",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
"runuser -l ec2-user -c 'ls -la'"
],
"run_options":{
"ii_delete_volume" : false,
"ii_check_s3" : false
"ii_delete_volume" : false,
"ii_terminate_instance" : true,
"ii_check_s3" : false,
"timeout_s" : 3600
},
"instance_parameters" :
{
"ImageId" : "ami-03b72f226b125860d",
"ImageId" : "ami-07161bb3f4b6e5b6d",
"InstanceType" : "t4g.large",
"KeyName" : "jlaser_west2",
"SecurityGroupIds" : ["sg-04365a4248fe126bc"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@
"s3_prefix" : ""
},
"run_options":{
"ii_delete_volume" : true,
"ii_check_s3" : true
"ii_terminate_instance" : true,
"ii_delete_volume" : true,
"ii_check_s3" : true,
"timeout_s" : 3600
},
"instance_parameters": {
"ImageId": "",
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def lambda_handler(event, context):
Generic Poller funcion
"""
t0 = time.perf_counter()
timeout_s = event['run_options']['timeout_s']

global client_ssm, client_ec2
client_ssm = boto3.client('ssm',region_name=event['region'])
Expand All @@ -40,9 +41,13 @@ def lambda_handler(event, context):
elif output['Status'] == 'InProgress':
ii_pass = False
print(f'Commands are still in progress. Waiting 5 seconds and checking again')
if (time.perf_counter() - t0) > 850:
if (time.perf_counter() - t0) > 800:
print(f'Cycling...')
ii_time = True
duration = time.time() - event['t0']
if duration >= timeout_s:
print(f'Duration -> {duration}\nTimeout -> {timeout_s}')
raise Exception(f'Commands duration have exceed the timeout specified in the execution')
time.sleep(5)
else:
raise Exception(f'Command failed {output}')
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ def replace_in_dict(d, pattern, replacement):

def lambda_handler(event, context):

t0 = time.time()
event['t0'] = t0
if not "timeout_s" in event['run_options']:
print(f'Setting timeout_s to default 3600 seconds')
event['run_options']['timeout_s'] = 3600

event['region'] = os.environ['AWS_REGION']
global client_ec2
client_ec2 = boto3.client('ec2',region_name=event['region'])
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,20 @@ def confirm_detach(volume_id):
time.sleep(1)
else:
return

def confirm_instance_termination(instance_id):
    """Block until EC2 reports ``instance_id`` in the ``terminated`` state.

    Repeatedly calls ``describe_instances`` for the single instance and
    inspects the state name of the first instance in the first reservation
    of the response. While that state is anything other than
    ``'terminated'`` a progress message is printed and the function sleeps
    for one second before asking again.

    There is no upper bound on the number of polls: the loop only ends once
    the terminated state is observed.

    NOTE(review): relies on a ``client_ec2`` name that is not defined in
    this function -- presumably a module-level boto3 EC2 client set up by
    the handler before this is called; confirm against the caller.

    Args:
        instance_id: ID of the EC2 instance to wait on.
    """
    while True:
        description = client_ec2.describe_instances(InstanceIds=[instance_id])
        # Only one instance id is requested, so only the first entry is read.
        instance = description['Reservations'][0]['Instances'][0]
        state_name = instance['State']['Name']

        if state_name == 'terminated':
            print(f'Instance {instance_id} terminated')
            return

        print(f'Instance not yet terminated')
        time.sleep(1)

def lambda_handler(event, context):
"""
Expand All @@ -41,20 +55,28 @@ def lambda_handler(event, context):
],)
print(response)
volume_id=event['volume_id']
if event["run_options"]["ii_delete_volume"]:
print(f'Instance VolumeId {volume_id} located.')
response = client_ec2.detach_volume(
InstanceId=instance_id,
VolumeId=volume_id,
DryRun=False
if event["run_options"]["ii_terminate_instance"]:
response = client_ec2.terminate_instances(
InstanceIds=[
instance_id,
],
)
confirm_detach(volume_id)
print(f'EBS volume {instance_id} has been successfully detached.')
response = client_ec2.delete_volume(
VolumeId=volume_id,
DryRun=False
)
print(f'EBS volume {instance_id} has been successfully deleted.')
confirm_instance_termination(instance_id)
else:
print(f"Volume {volume_id} remains attached or available and is still incurring costs.")
if event["run_options"]["ii_delete_volume"]:
print(f'Instance VolumeId {volume_id} located.')
response = client_ec2.detach_volume(
InstanceId=instance_id,
VolumeId=volume_id,
DryRun=False
)
confirm_detach(volume_id)
print(f'EBS volume {instance_id} has been successfully detached.')
response = client_ec2.delete_volume(
VolumeId=volume_id,
DryRun=False
)
print(f'EBS volume {volume_id} has been successfully deleted.')
else:
print(f"Volume {volume_id} remains attached or available and is still incurring costs.")

Binary file not shown.
1 change: 1 addition & 0 deletions research_datastream/terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ resource "aws_iam_policy" "datastreamlambda_policy" {
"ec2:StartInstances",
"ec2:StopInstances",
"ec2:DescribeInstances",
"ec2:TerminateInstances",
"ec2:DescribeVolumes",
"ec2:DeleteVolume",
"ec2:DetachVolume",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@
"runuser -l ec2-user -c 'aws s3 cp /home/ec2-user/ngen-datastream/docker_build_log.txt s3://ciroh-community-ngen-datastream/docker_build_log.txt'"
],
"run_options":{
"ii_delete_volume" : true,
"check_s3" : false
"ii_terminate_instance" : true,
"ii_delete_volume" : true,
"ii_check_s3" : false,
"timeout_s" : 3600
},
"instance_parameters" :
{
"ImageId" : "ami-03b72f226b125860d",
"ImageId" : "ami-07161bb3f4b6e5b6d",
"InstanceType" : "t4g.large",
"KeyName" : "actions_key_arm",
"SecurityGroupIds" : ["sg-0fcbe0c6d6faa0117"],
Expand Down
8 changes: 5 additions & 3 deletions research_datastream/terraform/test/execution_gp_test.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
"runuser -l ec2-user -c 'ls -la'"
],
"run_options":{
"ii_delete_volume" : true,
"check_s3" : false
"ii_terminate_instance" : true,
"ii_delete_volume" : true,
"ii_check_s3" : false,
"timeout_s" : 3600
},
"instance_parameters" :
{
"ImageId" : "ami-03b72f226b125860d",
"ImageId" : "ami-07161bb3f4b6e5b6d",
"InstanceType" : "t4g.nano",
"KeyName" : "actions_key",
"SecurityGroupIds" : ["sg-06f57f883e902d7bc"],
Expand Down

0 comments on commit b45f9c1

Please sign in to comment.