Feat: Process orphaned coe clusters for K8S jobs 95/69595/6 v0.74.0
authorAnil Belur <abelur@linuxfoundation.org>
Mon, 31 Jan 2022 00:48:51 +0000 (10:48 +1000)
committerAnil Belur <abelur@linuxfoundation.org>
Wed, 2 Feb 2022 03:38:41 +0000 (13:38 +1000)
K8s jobs by default creates stacks names that does not match
JOB_NAME, therefore ignore them while processing orphaned stacks
and handle them separatly when cleaning up the orphaned clusters.

The stack naming scheme is limited to take first 20 chars from the
JOB_NAME while the rest is randomly generated for uniqueness which
breaks the openstack cron jobs.

Ref: https://github.com/openstack/magnum/blob/master/magnum/
drivers/heat/driver.py#L202-L212

Issue-ID: RELENG-4106
Change-Id: Id3d9b74c3e6e2a0abbddb771b7fc7d5ba2b59ca5
Signed-off-by: Anil Belur <abelur@linuxfoundation.org>
jjb/lf-ci-jobs.yaml
releasenotes/notes/cleanup-coe-k8s-clusters-ac491a6ebc0ba7db.yaml [new file with mode: 0644]
shell/openstack-cleanup-orphaned-k8s-clusters.sh [new file with mode: 0644]
shell/openstack-cleanup-orphaned-stacks.sh

index 7759168..bb0baaf 100644 (file)
       - lf-infra-pre-build
       - inject:
           properties-content: OS_CLOUD={openstack-cloud}
+      # K8s Clusters
+      - conditional-step:
+          condition-kind: boolean-expression
+          condition-expression: "{openstack-stack-cleanup}"
+          steps:
+            - shell: !include-raw-escape: ../shell/openstack-cleanup-orphaned-k8s-clusters.sh
       # Stacks
       - conditional-step:
           condition-kind: boolean-expression
diff --git a/releasenotes/notes/cleanup-coe-k8s-clusters-ac491a6ebc0ba7db.yaml b/releasenotes/notes/cleanup-coe-k8s-clusters-ac491a6ebc0ba7db.yaml
new file mode 100644 (file)
index 0000000..8725be2
--- /dev/null
@@ -0,0 +1,13 @@
+---
+features:
+  - |
+    Process orphaned coe clusters for K8S jobs
+
+    K8s (COE cluster) jobs by default creates stacks names that does not match
+    JOB_NAME, therefore ignore them while processing orphaned stacks and handle
+    them separatly when cleaning up the orphaned clusters.
+
+    The stack naming scheme is limited to take first 20 chars from the JOB_NAME
+    while the rest is randomly generated for uniqueness:
+    https://github.com/openstack/magnum/blob/master/magnum/drivers/heat/driver.py#L202-L212
+    This breaks the openstack cron jobs.
diff --git a/shell/openstack-cleanup-orphaned-k8s-clusters.sh b/shell/openstack-cleanup-orphaned-k8s-clusters.sh
new file mode 100644 (file)
index 0000000..b1f3923
--- /dev/null
@@ -0,0 +1,85 @@
+#!/bin/bash -l
+# SPDX-License-Identifier: EPL-1.0
+##############################################################################
+# Copyright (c) 2017, 2022 The Linux Foundation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Eclipse Public License v1.0
+# which accompanies this distribution, and is available at
+# http://www.eclipse.org/legal/epl-v10.html
+##############################################################################
+# Scans OpenStack for orphaned COE clusters
+echo "---> Orphaned k8s clusters"
+
+os_cloud="${OS_CLOUD:-vex}"
+jenkins_urls="${JENKINS_URLS:-}"
+
+cluster_in_jenkins() {
+    # Usage: cluster_in_jenkins CLUSTER_NAME JENKINS_URL [JENKINS_URL...]
+    # Returns: 0 If CLUSTER_NAME is in Jenkins and 1 if CLUSTER_NAME is not
+    # in Jenkins.
+
+    CLUSTER_NAME="${1}"
+
+    builds=()
+    for jenkins in "${@:2}"; do
+        PARAMS="tree=computer[executors[currentExecutable[url]],"
+        PARAMS=$PARAMS"oneOffExecutors[currentExecutable[url]]]"
+        PARAMS=$PARAMS"&xpath=//url&wrapper=builds"
+        JENKINS_URL="$jenkins/computer/api/json?$PARAMS"
+        resp=$(curl -s -w "\\n\\n%{http_code}" --globoff -H "Content-Type:application/json" "$JENKINS_URL")
+        json_data=$(echo "$resp" | head -n1)
+        status=$(echo "$resp" | awk 'END {print $NF}')
+
+        if [ "$status" != 200 ]; then
+            >&2 echo "ERROR: Failed to fetch data from $JENKINS_URL with status code $status"
+            >&2 echo "$resp"
+            exit 1
+        fi
+
+        if [[ "${jenkins}" == *"jenkins."*".org" ]] || [[ "${jenkins}" == *"jenkins."*".io" ]]; then
+            silo="production"
+        else
+            silo=$(echo "$jenkins" | sed 's/\/*$//' | awk -F'/' '{print $NF}')
+        fi
+        export silo
+        # We purposely want to wordsplit here to combine the arrays
+        # shellcheck disable=SC2206,SC2207
+        builds=(${builds[@]} $(echo "$json_data" | \
+            jq -r '.computer[].executors[].currentExecutable.url' \
+            | grep -v null | awk -F'/' '{print ENVIRON["silo"] "-" $6 "-" $7}')
+        )
+    done
+
+    if [[ "${builds[*]}" =~ $CLUSTER_NAME ]]; then
+        return 0
+    fi
+
+    return 1
+}
+
+#########################
+## FETCH ACTIVE BUILDS ##
+#########################
+# Fetch coe cluster list before fetching active stacks.
+mapfile -t OS_COE_CLUSTERS < <(openstack --os-cloud "$os_cloud" coe cluster list \
+            -f value -c "uuid" -c "name" -c "status" -c "health_status" \
+            | awk '{print $2}')
+
+##########################
+## DELETE UNUSED STACKS ##
+##########################
+echo "-----> Delete orphaned cluster"
+
+# Search for COE clusters not in use by any active Jenkins systems and remove them.
+for CLUSTER_NAME in "${OS_COE_CLUSTERS[@]}"; do
+    # jenkins_urls intentially needs globbing to be passed a separate params.
+    # shellcheck disable=SC2153,SC2086
+    if cluster_in_jenkins "$CLUSTER_NAME" $jenkins_urls; then
+        # No need to delete stacks if there exists an active build for them
+        continue
+    else
+        echo "Deleting orphaned k8s cluster: $CLUSTER_NAME"
+        openstack --os-cloud "$os_cloud" coe cluster delete "$CLUSTER_NAME"
+    fi
+done
index e4f457a..cc150ac 100644 (file)
@@ -56,13 +56,45 @@ stack_in_jenkins() {
 
     return 1
 }
-
+set -x
 #########################
 ## FETCH ACTIVE BUILDS ##
 #########################
+# Fetch COE cluster list before fetching active stacks. K8s cluster creates
+# stack that does not match JOB_NAME, therefore ignore them while processing
+# orphaned stacks and handle them separatly.
+# The stack naming scheme is limited in the source code to take only first 20
+# chars from the JOB_NAME, and the rest is randomly generated value for
+# uniqueness:
+# https://github.com/openstack/magnum/blob/master/magnum/drivers/heat/driver.py#L202-L212
+mapfile -t OS_COE_CLUSTERS_ID < <(openstack --os-cloud "${os_cloud}" coe cluster list \
+            -f value -c "uuid" -c "name" \
+            | grep -E '(DELETE_FAILED|UNKNOWN|UNHEALTHY)' | awk '{print $1}')
+
+echo "-----> Active clusters -> stacks"
+# mapfile -t OS_COE_STACKS_ID
+OS_COE_STACKS=()
+for cluster_id in "${OS_COE_CLUSTERS_ID[@]}"; do
+    # find active stacks id associated with the COE cluster
+    stack_id=$(openstack --os-cloud "${os_cloud}" coe cluster show "${cluster_id}" \
+                -f value -c "stack_id")
+    # get the stack name associated with the COE cluster
+    stack_name=$(openstack --os-cloud "${os_cloud}" stack show "${stack_id}" \
+                -f value -c "stack_name")
+    OS_COE_STACKS+=("${stack_id}")
+    echo "clusterid:${cluster_id} -> stackid:${stack_id} stack_name: ${stack_name}"
+done
+
+if [[ ${#OS_COE_STACKS[@]} -gt "0" ]]; then
+    echo "${OS_COE_STACKS[*]}"
+    echo "-----> Active COE cluster stacks"
+    for cstack in "${OS_COE_STACKS[@]}"; do
+        echo "$cstack"
+    done
+fi
+
 # Fetch stack list before fetching active builds to minimize race condition
 # where we might be try to delete stacks while jobs are trying to start
-
 mapfile -t OS_STACKS < <(openstack --os-cloud "$os_cloud" stack list \
             -f value -c "Stack Name" -c "Stack Status" \
             --property "stack_status=CREATE_COMPLETE" \
@@ -83,13 +115,17 @@ echo "-----> Delete orphaned stacks"
 
 # Search for stacks not in use by any active Jenkins systems and remove them.
 for STACK_NAME in "${OS_STACKS[@]}"; do
-    # jenkins_urls intentially needs globbing to be passed a separate params.
+    # Check for COE cluster stack is present
     # shellcheck disable=SC2153,SC2086
-    if stack_in_jenkins "$STACK_NAME" $jenkins_urls; then
+    if [[ ${#OS_COE_STACKS[@]} -gt "0" ]] && [[ ${OS_COE_STACKS[*]} =~ ${STACK_NAME} ]]; then
+        # Do not delete a stack linked to COE cluster, handle them separatly.
+        continue
+    # jenkins_urls intentially needs globbing to be passed a separate params.
+    elif stack_in_jenkins "$STACK_NAME" $jenkins_urls; then
         # No need to delete stacks if there exists an active build for them
         continue
     else
-        echo "Deleting orphaned stack: $STACK_NAME"
-        lftools openstack --os-cloud "$os_cloud" stack delete --force "$STACK_NAME"
+        echo "Deleting orphaned stack: ${STACK_NAME}"
+        lftools openstack --os-cloud "${os_cloud}" stack delete --force "${STACK_NAME}"
     fi
 done