Skip to content

Commit

Permalink
Prune hanging instances longer than 2 hours (#2061)
Browse files Browse the repository at this point in the history
Some EC2 instances aren't successfully terminated along with the pruned
commits/jobs, and we don't have any additional retry logic to handle
that. This adds logic to the job pruner to terminate any
ec2-test-framework instances that have been hanging for longer than two
hours. There were also some intermittent failures caused by the instance
wanting to be restarted; removing `needrestart` from the instance should
solve that.

By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache 2.0 license and the ISC license.
  • Loading branch information
samuel40791765 authored Dec 17, 2024
1 parent c23791a commit 92b771f
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 9 deletions.
4 changes: 1 addition & 3 deletions tests/ci/cdk/cdk/ssm/general_test_run_ssm_document.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,6 @@ mainSteps:
inputs:
timeoutSeconds: '7200'
runCommand:
# TODO (P131897680): Parallelize the FIPS and sanitizer tests. The instance timeout can be lowered
# once we do so.
#
# Fallback plan to shut down the ec2 instance in 90 minutes in case it's not terminated.
# Codebuild just "stops" the instance calling the script, so "trap cleanup" is not executed.
- shutdown -P +90
Expand All @@ -28,6 +25,7 @@ mainSteps:
# install aws-cli
- killall apt apt-get
- apt-get update
- apt-get -y remove needrestart
- apt-get -y install unzip
- curl "https://awscli.amazonaws.com/awscli-exe-linux-${AWS_CLI_PREFIX}64.zip" -o "awscliv2.zip"
- unzip awscliv2.zip
Expand Down
30 changes: 24 additions & 6 deletions tests/ci/lambda/src/bin/purge-stale-builds.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use std::time::{SystemTime, UNIX_EPOCH};

use aws_sdk_codebuild::types::BuildBatchFilter;
use aws_sdk_ec2::operation::describe_instances::DescribeInstancesOutput;
use aws_sdk_ec2::primitives::DateTime;
use aws_sdk_ec2::types::Filter;
use aws_sdk_ssm::types::DocumentKeyValuesFilter;
use lambda_runtime::{service_fn, Error, LambdaEvent};
Expand Down Expand Up @@ -34,7 +35,7 @@ async fn handle(_event: LambdaEvent<Value>) -> Result<(), Error> {
std::env::var("GITHUB_TOKEN_SECRET_NAME")
.map_err(|_| "failed to find github access token secret name")?,
)
.await?;
.await?;

let github = octocrab::initialise(octocrab::Octocrab::builder().personal_token(github_token))
.map_err(|e| format!("failed to build github client: {e}"))?;
Expand Down Expand Up @@ -115,6 +116,22 @@ async fn handle(_event: LambdaEvent<Value>) -> Result<(), Error> {
let mut ec2_terminated_instances: Vec<String> = vec![];
let mut stopped_builds: u64 = 0;

let now_as_secs = DateTime::from(SystemTime::now()).secs();
// Instances occasionally fail to shut down properly. Gather the ids of all ec2 instances
// that were launched more than 2 hours (7200 seconds) ago so that they can be terminated.
if let Some(ref ec2_describe_response) = ec2_describe_response_optional {
for reservation in ec2_describe_response.reservations() {
for instance in reservation.instances() {
let launch_elapsed_time = now_as_secs - instance.launch_time().unwrap().secs();
if launch_elapsed_time > 7200 {
ec2_terminated_instances.push(instance.instance_id().unwrap().to_string());
log::info!("Instance {:?} will be terminated.", reservation.instances());
}
}
}
}
log::info!("Instances {:?}", ec2_terminated_instances);

for (k, v) in &pull_requests {
if v.len() <= 1 {
continue;
Expand Down Expand Up @@ -181,12 +198,13 @@ async fn handle(_event: LambdaEvent<Value>) -> Result<(), Error> {
}

if !ssm_deleted_documents.is_empty() && is_ec2_test_framework {
log::info!("Query for list of documents to delete with: {:?}",ssm_deleted_documents);
log::info!(
"Query for list of documents to delete with: {:?}",
ssm_deleted_documents
);

let all_documents = get_ssm_document_list(
&ssm_client_optional,
ssm_deleted_documents.clone(),
).await?;
let all_documents =
get_ssm_document_list(&ssm_client_optional, ssm_deleted_documents.clone()).await?;

// Prune hanging ssm documents corresponding to commits.
for document in all_documents {
Expand Down

0 comments on commit 92b771f

Please sign in to comment.