Static Jenkins Site Generator

- Private Jenkins job scraping with API key (example invocation below)
- Add Gilroy font to match the main public website
- Link back to the ONF website for products
- Add more products
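
Example invocation for scraping private jobs (credentials.yaml is an
illustrative path; scrape.yaml is the default scrape file):

  python3 buildcollector.py scrape.yaml --credentials credentials.yaml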

Change-Id: I3ed2dc1e371c564ee483ab83fd110a88d818bca7
diff --git a/buildcollector.py b/buildcollector.py
new file mode 100644
index 0000000..7091197
--- /dev/null
+++ b/buildcollector.py
@@ -0,0 +1,443 @@
+#!/usr/bin/env python3
+
+# SPDX-FileCopyrightText: © 2020 Open Networking Foundation <support@opennetworking.org>
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import absolute_import
+
+import argparse
+import base64
+import json
+import logging
+import os
+import re
+import sys
+import urllib.error
+import urllib.request
+
+import yaml
+from jsonpath_ng.ext import parse as jpparse
+
+# create shared logger
+logging.basicConfig()
+logger = logging.getLogger("sjsgc")
+
+# global dict of jsonpath expressions -> compiled jsonpath parsers, as
+# reparsing expressions in each loop results in 100x longer execution time
+gjpaths = {}
+
+# credentials are loaded in __main__ when the --credentials option is passed
+
+
+def parse_collector_args():
+    """
+    parse CLI arguments
+    """
+
+    parser = argparse.ArgumentParser(description="Jenkins job results collector")
+
+    # Positional args
+    parser.add_argument(
+        "scrape_file",
+        default="scrape.yaml",
+        type=argparse.FileType("r"),
+        help="YAML file describing Jenkins job and data to scrape",
+    )
+
+    # Flags
+    parser.add_argument(
+        "--credentials",
+        type=argparse.FileType("r"),
+        help="Credentials to use for private jenkins jobs",
+    )
+
+    parser.add_argument(
+        "--local", action="store_true", help="Prefer local copies of build lists"
+    )
+
+    parser.add_argument(
+        "--product_dir", default="products", help="Directory to save per-product output"
+    )
+
+    parser.add_argument(
+        "--jobs_dir", default="jobs", help="Directory to save raw Jenkins job output"
+    )
+
+    parser.add_argument(
+        "--debug", action="store_true", help="Print additional debugging information"
+    )
+
+    return parser.parse_args()
+
+
+def jenkins_job_list_url(server_url, job_name):
+    """
+    create a Jenkins JSON API URL for a job (list of builds)
+    """
+
+    url = "%s/job/%s/api/json" % (server_url, job_name)
+    return url
+
+
+def jenkins_job_build_url(server_url, job_name, build_number):
+    """
+    create a Jenkins JSON API URL for a specific build of a job
+    """
+
+    url = "%s/job/%s/%d/api/json" % (server_url, job_name, build_number)
+    return url
+
+
+def basic_auth_header(username, password):
+    """
+    returns a tuple containing an HTTP basic auth header
+    """
+    creds_str = "%s:%s" % (username, password)
+    creds_b64 = base64.standard_b64encode(creds_str.encode("utf-8"))
+
+    return ("Authorization", "Basic %s" % creds_b64.decode("utf-8"))
+
+
+def jenkins_api_get(url, headers=[]):
+    """
+    Get data from Jenkins JSON API endpoint, return data as a dict
+    """
+
+    request = urllib.request.Request(url)
+
+    # add headers tuples
+    for header in headers:
+        request.add_header(*header)
+
+    try:
+        response = urllib.request.urlopen(request)
+    except urllib.error.HTTPError:
+        logger.exception("Server encountered an HTTPError at URL: '%s'", url)
+        sys.exit(1)
+    except urllib.error.URLError:
+        logger.exception("An URLError occurred at URL: '%s'", url)
+        sys.exit(1)
+
+    # docs: https://docs.python.org/3/library/json.html
+    jsondata = response.read()
+    logger.debug("API response: %s", jsondata)
+
+    try:
+        data = json.loads(jsondata)
+    except json.decoder.JSONDecodeError:
+        logger.exception("Unable to decode JSON")
+        sys.exit(1)
+
+    logger.debug("JSON decoded: %s", data)
+
+    return data
+
+
+def json_file_load(path):
+    """
+    Get data from local file, return data as a dict
+    """
+
+    with open(path) as jf:
+        try:
+            data = json.loads(jf.read())
+        except json.decoder.JSONDecodeError:
+            logger.exception("Unable to decode JSON from file: '%s'", path)
+
+    return data
+
+
+def json_file_dump(path, data):
+    """
+    Write JSON file out to a path, creating directories in path as needed
+    """
+
+    # create directory if it doesn't already exist
+    parent_dir = os.path.dirname(path)
+    os.makedirs(parent_dir, exist_ok=True)
+
+    # write file, pretty printed
+    with open(path, "w") as jf:
+        json.dump(data, jf, indent=2)
+
+
+def parse_scrape_file(scrape_file):
+    """
+    Load and check the YAML scrape file, returning a list of one or more documents
+    """
+
+    yout = list(yaml.safe_load_all(scrape_file))  # safe_load_all returns a generator
+    logger.debug("YAML decoded: %s", yout)
+
+    def check_required_keys(to_check, req_keys):
+        """
+        check that all required keys are found in the dict to check
+        """
+        for rk in req_keys:
+            if rk not in to_check:
+                logger.error("Required key '%s' not found in: '%s'", rk, to_check)
+                sys.exit(1)
+
+    # check that required keys exist in each YAML document
+    for ydoc in yout:
+        check_required_keys(ydoc, ["jenkins_jobs", "product_name", "onf_project"])
+
+        for group in ydoc["jenkins_jobs"]:
+            check_required_keys(group, ["group", "jenkins_url", "jobs"])
+
+            for job in group["jobs"]:
+                check_required_keys(job, ["name", "extract"])
+
+    return yout
+
+
+def jsonpath_extract(json_in, extract_list):
+    """
+    Extract data from json using list of jsonpath expressions
+    """
+
+    ret = {}
+
+    for name, jpath in extract_list.items():
+
+        # parsing jsonpath is expensive, so store compiled expressions in the
+        # global dict of parsed jsonpath expressions
+        if jpath not in gjpaths:
+            gjpaths[jpath] = jpparse(jpath)
+
+        jexpr = gjpaths[jpath]
+
+        matches = [match.value for match in jexpr.find(json_in)]
+
+        # If only a single match, unwrap from list
+        if len(matches) == 1:
+            ret[name] = matches[0]
+        else:
+            ret[name] = matches
+
+    logger.debug("extracted data: %s", ret)
+
+    return ret
+
+
+def get_builds_for_job(jobs_dir, local, jenkins_url, job_name, headers=[]):
+    """
+    Download list of builds from a Jenkins job, return list of build ids
+    """
+
+    # where to store jenkins JSON output with builds list
+    jbuildlist = "%s/%s/%s/0_list.json" % (jobs_dir, clean_url(jenkins_url), job_name)
+
+    if os.path.isfile(jbuildlist) and local:
+        # if already downloaded and the local copy is preferred, load it
+        jl = json_file_load(jbuildlist)
+    else:
+        # if not, query jenkins for the list of job builds
+        jlu = jenkins_job_list_url(jenkins_url, job_name)
+        jl = jenkins_api_get(jlu, headers)
+
+        # save to disk
+        json_file_dump(jbuildlist, jl)
+
+    # JSONPath for list of builds in the job
+    jexpr = jpparse("builds[*].number")
+
+    # get a list of builds
+    buildlist = [build.value for build in jexpr.find(jl)]
+
+    return buildlist
+
+
+def get_jenkins_build(jobs_dir, jenkins_url, job_name, build_id, headers=[]):
+    """
+    Download a single build and store it on disk, if job has completed
+    """
+
+    # path to store a copy of the JSON received from Jenkins
+    jjson = "%s/%s/%s/%d_build.json" % (
+        jobs_dir,
+        clean_url(jenkins_url),
+        job_name,
+        build_id,
+    )
+
+    if os.path.isfile(jjson):
+        # if have already run and local copy exists, read/return local copy
+        braw = json_file_load(jjson)
+    else:
+        # make an API call to get the JSON, store locally
+        burl = jenkins_job_build_url(jenkins_url, job_name, build_id)
+        braw = jenkins_api_get(burl, headers)
+
+        # if the build is still in progress the result field is null, so don't
+        # return the build or save a copy, as the build status is not final.
+        if not braw["result"]:
+            return None
+
+        # save to disk
+        json_file_dump(jjson, braw)
+
+    return braw
+
+
+def get_all_jenkins_builds(jobs_dir, jenkins_url, job_name, build_ids, headers=[]):
+    """
+    Get a list of all jenkins build data, for completed builds
+    """
+
+    builds_list = []
+
+    # download build data for all builds
+    for build_id in build_ids:
+
+        build = get_jenkins_build(
+            jobs_dir, jenkins_url, job_name, build_id, headers,
+        )
+
+        # may return None if build is in progress
+        if build:
+            builds_list.append(build)
+
+    return builds_list
+
+
+def clean_name(name):
+    """
+    Clean up a name string. Currently only replaces spaces with underscores
+    """
+    return name.replace(" ", "_")
+
+
+def clean_url(url):
+    """
+    remove prefix and any non-path friendly characters from URL
+    """
+    return re.sub(r"\W", "_", re.sub(r"\w+://", "", url))
+
+
+def save_product_builds(product_doc, product_dir, builds):
+    """
+    save the product-specific build information, if it's applicable to this
+    product based on the filters
+    """
+
+    # duplicate the scrape doc into final product data
+    product_data = dict(product_doc)
+
+    # used to hold groups of jobs
+    groups = {}
+
+    # each doc can have multiple job groups (usually version-specific)
+    for jobgroup in product_doc["jenkins_jobs"]:
+
+        groups[jobgroup["group"]] = {}
+
+        # each job group can have multiple jobs
+        for job in jobgroup["jobs"]:
+
+            pbuilds = []
+
+            # get the build data for the job
+            for build in builds[job["name"]]:
+
+                jpedata = jsonpath_extract(build, job["extract"])
+
+                # filter builds
+                save = True
+                if "filter" in job:
+                    for k, v in job["filter"].items():
+                        # if data doesn't match the filter value given, don't save it
+                        if jpedata[k] != v:
+                            save = False
+
+                if save:
+                    pbuilds.append(jpedata)
+
+            # allow job name to be overridden, for private jobs
+            if "name_override" in job:
+                groups[jobgroup["group"]][job["name_override"]] = pbuilds
+            else:
+                groups[jobgroup["group"]][job["name"]] = pbuilds
+
+    product_data["groups"] = groups
+
+    product_filename = "%s/%s.json" % (
+        product_dir,
+        clean_name(product_doc["product_name"]),
+    )
+
+    json_file_dump(product_filename, product_data)
+
+
+# main entry point, calls the other functions
+if __name__ == "__main__":
+
+    args = parse_collector_args()
+
+    if not os.path.isdir(args.product_dir):
+        logger.error("Output directory is not a directory: '%s'", args.product_dir)
+        sys.exit(1)
+
+    # only print log messages if debugging
+    if args.debug:
+        logger.setLevel(logging.DEBUG)
+    else:
+        logger.setLevel(logging.CRITICAL)
+
+    # read in the credentials file, if the argument was passed
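+    # the credentials file is expected to contain a top-level "credentials"
+    # map of named credential sets, e.g. (names and values illustrative):
+    #
+    #   credentials:
+    #     example_private_jenkins:
+    #       jenkins_api_user: user@example.org
+    #       jenkins_api_token: exampletoken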
+    credentials = {}
+    if args.credentials:
+        cred_file = yaml.safe_load(args.credentials)
+        credentials = cred_file["credentials"]
+
+    # read in the Scrape File
+    sfile = parse_scrape_file(args.scrape_file)
+
+    # dict of job name -> build data
+    builds = {}
+
+    # Scrape File YAML may contain multiple documents
+    for sdoc in sfile:
+
+        # phase 1 - identify all the Jenkins jobs
+        # each doc can have multiple job groups (usually version-specific)
+        for jobgroup in sdoc["jenkins_jobs"]:
+
+            api_headers = []
+
+            if "credentials" in jobgroup:
+                if jobgroup["credentials"] in credentials:
+                    api_headers = [
+                        basic_auth_header(
+                            credentials[jobgroup["credentials"]]["jenkins_api_user"],
+                            credentials[jobgroup["credentials"]]["jenkins_api_token"],
+                        )
+                    ]
+                else:
+                    logger.error(
+                        "Credentials for '%s' not supplied", jobgroup["credentials"]
+                    )
+                    sys.exit(1)
+
+            # each job group can have multiple jobs
+            for job in jobgroup["jobs"]:
+
+                # only download jobs that haven't already been downloaded
+                if job["name"] not in builds:
+
+                    # get list of all Job ID's
+                    build_ids = get_builds_for_job(
+                        args.jobs_dir,
+                        args.local,
+                        jobgroup["jenkins_url"],
+                        job["name"],
+                        api_headers,
+                    )
+
+                    # get build info - either download or load from disk
+                    builds[job["name"]] = get_all_jenkins_builds(
+                        args.jobs_dir,
+                        jobgroup["jenkins_url"],
+                        job["name"],
+                        build_ids,
+                        api_headers,
+                    )
+
+        # phase 2 - create per-product (document) lists of build extracted data
+        save_product_builds(sdoc, args.product_dir, builds)