From 3702b8680891a0392a53a2e2ce8060fb75b2bbdf Mon Sep 17 00:00:00 2001
From: Tim Johnson
Date: Mon, 9 Dec 2019 08:13:09 -0800
Subject: [PATCH] Add script to scrape cost data

Issue: RELENG-2555
Change-Id: I46e719f88721fd8cf0060181297d2655995e47f7
Signed-off-by: Tim Johnson
---
 .../notes/scrape-job-cost-2a43bea721775985.yaml |   9 ++
 shell/scrape-job-cost.sh                        | 125 +++++++++++++++++++++
 2 files changed, 134 insertions(+)
 create mode 100644 releasenotes/notes/scrape-job-cost-2a43bea721775985.yaml
 create mode 100755 shell/scrape-job-cost.sh

diff --git a/releasenotes/notes/scrape-job-cost-2a43bea721775985.yaml b/releasenotes/notes/scrape-job-cost-2a43bea721775985.yaml
new file mode 100644
index 00000000..6994dfe9
--- /dev/null
+++ b/releasenotes/notes/scrape-job-cost-2a43bea721775985.yaml
@@ -0,0 +1,9 @@
+---
+features:
+  - |
+    New script has been added: scrape-job-cost. This script will be executed by
+    cron as nexus on the Nexus Server for each project. It will extract cost
+    data from the nexus directory for each Jenkins Builder (production &
+    sandbox). The cost data for each builder will be appended to separate cost
+    files located in ~nexus/cost on the nexus server. The files will be named
+    sandbox-YYYY.csv and production-YYYY.csv.
diff --git a/shell/scrape-job-cost.sh b/shell/scrape-job-cost.sh
new file mode 100755
index 00000000..9bd5205e
--- /dev/null
+++ b/shell/scrape-job-cost.sh
@@ -0,0 +1,125 @@
+#! /bin/bash
+# SPDX-License-Identifier: EPL-1.0
+##############################################################################
+# Copyright (c) 2020 The Linux Foundation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Eclipse Public License v1.0
+# which accompanies this distribution, and is available at
+# http://www.eclipse.org/legal/epl-v10.html
+##############################################################################
+
+# This script will be run on the Nexus Server for each project. Typically each
+# project will have multiple nexus silos. This script will find all the job
+# cost files (job_name/job_num/cost.csv) in each silo and append them to the
+# annual cost file (~nexus/cost/$silo-YYYY.csv). It will then hide (rename)
+# the processed job cost files so they are not counted again.
+
+# Because this script is meant to be run by cron, normal info logging is
+# restricted to a single line. Errors may span multiple lines.
+#
+# Each cost file contains one or more CSV records in the following format:
+#
+# JobName , BuildNumber , Date , InstanceType , Uptime , Cost1 , Cost2
+#
+# Date format: '%Y-%m-%d %H:%M:%S'
+#
+##############################################################################
+#
+# Testing/Validation
+#
+# You can validate this script by running it as yourself on the Nexus server.
+# You should not have write permission anywhere in the Silo Directory. The
+# Silo Cost File from your test will be created/updated in:
+# ~/cost/$silo-$year.csv. If you run multiple times, duplicate records
+# will be created.
+#
+# To enable debug, set the environment variable DEBUG=true. If this is done
+# on the command line, you do not have to edit this file.
+debug=${DEBUG:-false}
+$debug && echo "DEBUG Enabled"
+#
+##############################################################################
+
+set -eufo pipefail
+
+function get-year-list()
+{
+    # Grab the year from each cost record; use sort | uniq to get the
+    # list of unique years found
+    local list
+    list=$(awk -F',' '{print $3}' "$cost_file_records" \
+        | awk -F'-' '{print $1}' | sort | uniq)
+    echo "$list"
+}
+
+########### End of Function Definitions ######################################
+
+if [[ $# != 2 ]]; then
+    echo "usage: $(basename "$0") silo silo_dir"
+    exit 1
+fi
+
+# The Silo Dir is the top-level directory that will contain the job
+# directories, which in turn contain the cost files (cost.csv)
+silo=$1
+silo_dir=$2
+
+cost_file_records=/tmp/cost-file-records$$
+cost_file_list=/tmp/cost-file-list$$
+# The directory where the annual cost file will be located
+cost_dir=~/cost
+[[ -d $cost_dir ]] || mkdir $cost_dir
+
+# The Silo Directory for sandbox will get deleted periodically, so
+# gracefully handle that
+if [[ -d $silo_dir ]]; then
+    cd "$silo_dir"
+else
+    echo "$(date +'%Y-%m-%d %H:%M') No Silo Directory, nothing to do"
+    exit 0
+fi
+
+find . -maxdepth 3 -name cost.csv > $cost_file_list
+xargs cat < $cost_file_list | \
+    sort --field-separator=',' --key=3 > $cost_file_records
+num_of_records=$(wc -l < $cost_file_records)
+echo -n "$(date +'%Y-%m-%d %H:%M') Records: $num_of_records "
+
+if [[ $num_of_records == 0 ]]; then
+    echo "Nothing to do"
+    set +f
+    rm -rf /tmp/cost-file-* || true
+    exit 0
+fi
+
+# Append each entry to the silo cost file based on date
+year_list=$(get-year-list)
+for year in $year_list; do
+    echo -n "cost-$year.csv: $(grep -Fc ",$year-" $cost_file_records) "
+    grep -F ",$year-" $cost_file_records >> \
+        "$cost_dir/$silo-$year.csv"
+done
+
+# Rename the job cost files (make them hidden)
+while read -r p; do
+    job_dir=$(dirname "$p")
+    $debug || (cd "$job_dir" ; mv cost.csv .cost.csv)
+done < $cost_file_list
+
+rm -r $cost_file_list $cost_file_records
+
+echo -n "Sorting: "
+# Sort the silo cost file by 'date' (column 3)
+for year in $year_list; do
+    echo -n "cost-$year.csv "
+    sort --field-separator=',' --key=3 \
+        -o "$cost_dir/$silo-$year.csv" \
+        "$cost_dir/$silo-$year.csv"
+done
+
+set +f
+rm -rf /tmp/cost-file-* || true
+
+# SECONDS (a bash builtin) tracks elapsed time since the script started
+echo "Complete $SECONDS Secs"
-- 
2.16.6
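
A minimal validation sketch of the new script, not part of the patch itself:
the silo name "sandbox", the /tmp/fake-silo directory, and the sample cost
record values below are placeholders chosen only for demonstration, and the
script is assumed to be invoked from the repository root.

    # Create a throwaway silo containing one job cost file, laid out in the
    # job_name/job_num/cost.csv structure the script searches for.
    mkdir -p /tmp/fake-silo/demo-job/1
    # One record in the documented field order:
    # JobName, BuildNumber, Date, InstanceType, Uptime, Cost1, Cost2
    echo 'demo-job,1,2019-12-09 08:13:09,m1.large,3600,0.10,0.02' \
        > /tmp/fake-silo/demo-job/1/cost.csv

    # Run with debug enabled; debug mode skips renaming cost.csv to .cost.csv
    DEBUG=true ./shell/scrape-job-cost.sh sandbox /tmp/fake-silo

    # The record should now have been appended to the annual silo cost file
    cat ~/cost/sandbox-2019.csv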