From: Anil Belur Date: Mon, 31 Jan 2022 00:48:51 +0000 (+1000) Subject: Feat: Process orphaned coe clusters for K8S jobs X-Git-Tag: v0.74.0^0 X-Git-Url: https://gerrit.linuxfoundation.org/infra/gitweb?p=releng%2Fglobal-jjb.git;a=commitdiff_plain;h=70b380fccde98eaf35cc894795ff6736561ce6ce Feat: Process orphaned coe clusters for K8S jobs K8s jobs by default create stack names that do not match JOB_NAME, therefore ignore them while processing orphaned stacks and handle them separately when cleaning up the orphaned clusters. The stack naming scheme takes only the first 20 chars from the JOB_NAME while the rest is randomly generated for uniqueness, which breaks the openstack cron jobs. Ref: https://github.com/openstack/magnum/blob/master/magnum/drivers/heat/driver.py#L202-L212 Issue-ID: RELENG-4106 Change-Id: Id3d9b74c3e6e2a0abbddb771b7fc7d5ba2b59ca5 Signed-off-by: Anil Belur --- diff --git a/jjb/lf-ci-jobs.yaml b/jjb/lf-ci-jobs.yaml index 7759168f..bb0baafa 100644 --- a/jjb/lf-ci-jobs.yaml +++ b/jjb/lf-ci-jobs.yaml @@ -1372,6 +1372,12 @@ - lf-infra-pre-build - inject: properties-content: OS_CLOUD={openstack-cloud} + # K8s Clusters + - conditional-step: + condition-kind: boolean-expression + condition-expression: "{openstack-stack-cleanup}" + steps: + - shell: !include-raw-escape: ../shell/openstack-cleanup-orphaned-k8s-clusters.sh # Stacks - conditional-step: condition-kind: boolean-expression diff --git a/releasenotes/notes/cleanup-coe-k8s-clusters-ac491a6ebc0ba7db.yaml b/releasenotes/notes/cleanup-coe-k8s-clusters-ac491a6ebc0ba7db.yaml new file mode 100644 index 00000000..8725be23 --- /dev/null +++ b/releasenotes/notes/cleanup-coe-k8s-clusters-ac491a6ebc0ba7db.yaml @@ -0,0 +1,13 @@ +--- +features: + - | + Process orphaned coe clusters for K8S jobs + + K8s (COE cluster) jobs by default create stack names that do not match + JOB_NAME, therefore ignore them while processing orphaned stacks and handle + them separately when cleaning up the orphaned clusters.
+
+    The stack naming scheme is limited to take first 20 chars from the JOB_NAME
+    while the rest is randomly generated for uniqueness:
+    https://github.com/openstack/magnum/blob/master/magnum/drivers/heat/driver.py#L202-L212
+    This breaks the openstack cron jobs.
diff --git a/shell/openstack-cleanup-orphaned-k8s-clusters.sh b/shell/openstack-cleanup-orphaned-k8s-clusters.sh
new file mode 100644
index 00000000..b1f3923e
--- /dev/null
+++ b/shell/openstack-cleanup-orphaned-k8s-clusters.sh
@@ -0,0 +1,85 @@
+#!/bin/bash -l
+# SPDX-License-Identifier: EPL-1.0
+##############################################################################
+# Copyright (c) 2017, 2022 The Linux Foundation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Eclipse Public License v1.0
+# which accompanies this distribution, and is available at
+# http://www.eclipse.org/legal/epl-v10.html
+##############################################################################
+# Scans OpenStack for orphaned COE clusters
+echo "---> Orphaned k8s clusters"
+
+os_cloud="${OS_CLOUD:-vex}"
+jenkins_urls="${JENKINS_URLS:-}"
+
+cluster_in_jenkins() {
+    # Usage: cluster_in_jenkins CLUSTER_NAME JENKINS_URL [JENKINS_URL...]
+    # Returns: 0 if CLUSTER_NAME occurs (literal substring) in the id of a build
+    #          currently running on one of the Jenkins systems, 1 otherwise.
+    # Exits the script with status 1 if a Jenkins API call does not answer 200.
+    local cluster_name="${1}"
+    local jenkins params api_url resp json_data status
+    local builds=()
+    for jenkins in "${@:2}"; do
+        params="tree=computer[executors[currentExecutable[url]],"
+        params+="oneOffExecutors[currentExecutable[url]]]"
+        params+="&xpath=//url&wrapper=builds"
+        api_url="$jenkins/computer/api/json?$params"
+        resp=$(curl -s -w "\\n\\n%{http_code}" --globoff -H "Content-Type:application/json" "$api_url")
+        json_data=$(echo "$resp" | head -n1)
+        status=$(echo "$resp" | awk 'END {print $NF}')
+
+        if [ "$status" != 200 ]; then
+            >&2 echo "ERROR: Failed to fetch data from $api_url with status code $status"
+            >&2 echo "$resp"
+            exit 1
+        fi
+
+        if [[ "${jenkins}" == *"jenkins."*".org" ]] || [[ "${jenkins}" == *"jenkins."*".io" ]]; then
+            silo="production"
+        else
+            silo=$(echo "$jenkins" | sed 's/\/*$//' | awk -F'/' '{print $NF}')
+        fi
+        export silo
+        # We purposely want to wordsplit here to combine the arrays
+        # shellcheck disable=SC2206,SC2207
+        builds=(${builds[@]} $(echo "$json_data" | \
+            jq -r '.computer[].executors[].currentExecutable.url' \
+            | grep -v null | awk -F'/' '{print ENVIRON["silo"] "-" $6 "-" $7}')
+        )
+    done
+
+    if [[ "${builds[*]}" == *"${cluster_name}"* ]]; then
+        return 0
+    fi
+
+    return 1
+}
+
+########################
+## FETCH COE CLUSTERS ##
+########################
+# Fetch the COE cluster names (2nd output column) before querying Jenkins.
+mapfile -t OS_COE_CLUSTERS < <(openstack --os-cloud "$os_cloud" coe cluster list \
+    -f value -c "uuid" -c "name" -c "status" -c "health_status" \
+    | awk '{print $2}')
+
+############################
+## DELETE UNUSED CLUSTERS ##
+############################
+echo "-----> Delete orphaned cluster"
+
+# Search for COE clusters not in use by any active Jenkins systems and remove them.
+for CLUSTER_NAME in "${OS_COE_CLUSTERS[@]}"; do
+    # jenkins_urls is intentionally unquoted so it word-splits into separate params.
+    # shellcheck disable=SC2153,SC2086
+    if cluster_in_jenkins "$CLUSTER_NAME" $jenkins_urls; then
+        # No need to delete clusters if there exists an active build for them
+        continue
+    else
+        echo "Deleting orphaned k8s cluster: $CLUSTER_NAME"
+        openstack --os-cloud "$os_cloud" coe cluster delete "$CLUSTER_NAME"
+    fi
+done
diff --git a/shell/openstack-cleanup-orphaned-stacks.sh b/shell/openstack-cleanup-orphaned-stacks.sh
index e4f457ae..cc150ac4 100644
--- a/shell/openstack-cleanup-orphaned-stacks.sh
+++ b/shell/openstack-cleanup-orphaned-stacks.sh
@@ -56,13 +56,45 @@ stack_in_jenkins() {
 
     return 1
 }
 
 #########################
 ## FETCH ACTIVE BUILDS ##
 #########################
+# Fetch the COE cluster list before fetching active stacks. A K8s cluster
+# creates a stack whose name does not match JOB_NAME, therefore every stack
+# backing a COE cluster is skipped below and handled separately.
+# The stack naming scheme is limited in the source code to take only the first
+# 20 chars from the JOB_NAME, and the rest is a randomly generated value:
+# https://github.com/openstack/magnum/blob/master/magnum/drivers/heat/driver.py#L202-L212
+# All clusters are taken; only uuid and name are listed, so no status filter.
+mapfile -t OS_COE_CLUSTERS_ID < <(openstack --os-cloud "${os_cloud}" coe cluster list \
+    -f value -c "uuid" -c "name" \
+    | awk '{print $1}')
+
+echo "-----> Active clusters -> stacks"
+# Names of the stacks that back a COE cluster; matched against STACK_NAME below.
+OS_COE_STACKS=()
+for cluster_id in "${OS_COE_CLUSTERS_ID[@]}"; do
+    # find the id of the stack associated with the COE cluster
+    stack_id=$(openstack --os-cloud "${os_cloud}" coe cluster show "${cluster_id}" \
+        -f value -c "stack_id")
+    # get the stack name associated with the COE cluster
+    stack_name=$(openstack --os-cloud "${os_cloud}" stack show "${stack_id}" \
+        -f value -c "stack_name")
+    OS_COE_STACKS+=("${stack_name}")
+    echo "clusterid:${cluster_id} -> stackid:${stack_id} stack_name: ${stack_name}"
+done
+
+if [[ ${#OS_COE_STACKS[@]} -gt "0" ]]; then
+    echo "${OS_COE_STACKS[*]}"
+    echo "-----> Active COE cluster stacks"
+    for cstack in "${OS_COE_STACKS[@]}"; do
+        echo "$cstack"
+    done
+fi
+
 # Fetch stack list before fetching active builds to minimize race condition
 # where we might be try to delete stacks while jobs are trying to start
-
 mapfile -t OS_STACKS < <(openstack --os-cloud "$os_cloud" stack list \
     -f value -c "Stack Name" -c "Stack Status" \
     --property "stack_status=CREATE_COMPLETE" \
@@ -83,13 +115,17 @@ echo "-----> Delete orphaned stacks"
 
 # Search for stacks not in use by any active Jenkins systems and remove them.
 for STACK_NAME in "${OS_STACKS[@]}"; do
-    # jenkins_urls intentially needs globbing to be passed a separate params.
+    # Skip stacks that back a COE cluster (exact, literal stack-name match).
     # shellcheck disable=SC2153,SC2086
-    if stack_in_jenkins "$STACK_NAME" $jenkins_urls; then
+    if [[ ${#OS_COE_STACKS[@]} -gt "0" ]] && [[ " ${OS_COE_STACKS[*]} " == *" ${STACK_NAME} "* ]]; then
+        # Do not delete a stack linked to a COE cluster; handle those separately.
+        continue
+    # jenkins_urls is intentionally unquoted so it word-splits into separate params.
+    elif stack_in_jenkins "$STACK_NAME" $jenkins_urls; then
         # No need to delete stacks if there exists an active build for them
         continue
     else
-        echo "Deleting orphaned stack: $STACK_NAME"
-        lftools openstack --os-cloud "$os_cloud" stack delete --force "$STACK_NAME"
+        echo "Deleting orphaned stack: ${STACK_NAME}"
+        lftools openstack --os-cloud "${os_cloud}" stack delete --force "${STACK_NAME}"
     fi
 done