#!/usr/bin/env python3

# SPDX-FileCopyrightText: © 2020 Open Networking Foundation <support@opennetworking.org>
# SPDX-License-Identifier: Apache-2.0

from __future__ import absolute_import

import argparse
import base64
import json
import logging
import os
import re
import sys
import urllib.error
import urllib.request

import yaml
from jsonpath_ng.ext import parse as jpparse

# create shared logger
logging.basicConfig()
logger = logging.getLogger("sjsgc")

# global dict of jsonpath expressions -> compiled jsonpath parsers, as
# reparsing expressions in each loop results in 100x longer execution time
gjpaths = {}

# credentials dict is populated in __main__ when --credentials is passed


def parse_collector_args():
    """
    parse CLI arguments
    """

    parser = argparse.ArgumentParser(description="Jenkins job results collector")

    # Positional args
    parser.add_argument(
        "scrape_file",
        default="scrape.yaml",
        type=argparse.FileType("r"),
        help="YAML file describing Jenkins job and data to scrape",
    )

    # Flags
    parser.add_argument(
        "--credentials",
        type=argparse.FileType("r"),
        help="Credentials to use for private jenkins jobs",
    )

    parser.add_argument(
        "--local", action="store_true", help="Prefer local copies of build lists"
    )

    parser.add_argument(
        "--product_dir", default="products", help="Directory to save per-product output"
    )

    parser.add_argument(
        "--jobs_dir", default="jobs", help="Directory to save raw Jenkins job output"
    )

    parser.add_argument(
        "--debug", action="store_true", help="Print additional debugging information"
    )

    return parser.parse_args()
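

# An illustrative invocation (the script filename here is hypothetical); a
# typical run of this collector might look like:
#
#   ./sjsgc.py scrape.yaml --credentials credentials.yaml --debug
#
# where scrape.yaml describes the Jenkins jobs and data to scrape, and
# credentials.yaml holds Jenkins API credentials for any private jobs.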


def jenkins_job_list_url(server_url, job_name):
    """
    create a Jenkins JSON API URL for a job (list of builds)
    """

    url = "%s/job/%s/api/json" % (server_url, job_name)
    return url


def jenkins_job_build_url(server_url, job_name, build_number):
    """
    create a Jenkins JSON API URL for a specific build of a job
    """

    url = "%s/job/%s/%d/api/json" % (server_url, job_name, build_number)
    return url
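

# For illustration, with a made-up server_url "https://jenkins.example.org"
# and job_name "my_job", the helpers above produce:
#
#   jenkins_job_list_url(...)      -> "https://jenkins.example.org/job/my_job/api/json"
#   jenkins_job_build_url(..., 7)  -> "https://jenkins.example.org/job/my_job/7/api/json"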


def basic_auth_header(username, password):
    """
    returns a tuple containing an HTTP basic auth header
    """
    creds_str = "%s:%s" % (username, password)
    creds_b64 = base64.standard_b64encode(creds_str.encode("utf-8"))

    return ("Authorization", "Basic %s" % creds_b64.decode("utf-8"))
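

# Example (made-up credentials): basic_auth_header("user", "token") returns
# ("Authorization", "Basic dXNlcjp0b2tlbg=="), where the second element is
# the base64 encoding of "user:token".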


def jenkins_api_get(url, headers=[]):
    """
    Get data from Jenkins JSON API endpoint, return data as a dict, or None
    if the request or JSON decode fails
    """

    request = urllib.request.Request(url)

    # add header tuples to the request
    for header in headers:
        request.add_header(*header)

    try:
        response = urllib.request.urlopen(request)
    except urllib.error.HTTPError:
        logger.exception("Server encountered an HTTPError at URL: '%s'", url)
        return None
    except urllib.error.URLError:
        logger.exception("A URLError occurred at URL: '%s'", url)
        return None

    # docs: https://docs.python.org/3/library/json.html
    jsondata = response.read()
    logger.debug("API response: %s", jsondata)

    try:
        data = json.loads(jsondata)
    except json.decoder.JSONDecodeError:
        logger.exception("Unable to decode JSON")
        return None

    logger.debug("JSON decoded: %s", data)

    return data


def json_file_load(path):
    """
    Get data from a local JSON file, return data as a dict, or None if the
    JSON decode fails
    """

    with open(path) as jf:
        try:
            data = json.loads(jf.read())
        except json.decoder.JSONDecodeError:
            logger.exception("Unable to decode JSON from file: '%s'", path)
            return None

    return data


def json_file_dump(path, data):
    """
    Write JSON file out to a path, creating directories in path as needed
    """
    logger.debug("writing JSON file: %s", path)

    # create directory if it doesn't already exist
    parent_dir = os.path.dirname(path)
    os.makedirs(parent_dir, exist_ok=True)

    # write file, pretty printed
    with open(path, "w") as jf:
        json.dump(data, jf, indent=2)


def parse_scrape_file(scrape_file):
    """
    Load and check the YAML scrape file, returning a list of one or more
    documents
    """

    yout = list(yaml.safe_load_all(scrape_file))  # safe_load_all returns a generator
    logger.debug("YAML decoded: %s", yout)

    def check_required_keys(to_check, req_keys):
        """
        check that all required keys are found in the dict to check
        """
        for rk in req_keys:
            if rk not in to_check:
                logger.error("Required key '%s' not found in: '%s'", rk, to_check)
                sys.exit(1)

    # check that required keys exist in each YAML document
    for ydoc in yout:
        check_required_keys(ydoc, ["jenkins_jobs", "product_name", "onf_project"])

        for group in ydoc["jenkins_jobs"]:
            check_required_keys(group, ["group", "jenkins_url", "jobs"])

            for job in group["jobs"]:
                check_required_keys(job, ["name", "extract"])

    return yout
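

# A minimal sketch of one scrape file document, using made-up names and URLs;
# only the keys checked above are required, while "filter", "name_override",
# and "credentials" are optional keys consumed elsewhere in this script:
#
#   product_name: Example Product
#   onf_project: example
#   jenkins_jobs:
#     - group: "1.0"
#       jenkins_url: "https://jenkins.example.org"
#       jobs:
#         - name: "example_premerge"
#           extract:
#             result: "result"
#             timestamp: "timestamp"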


def jsonpath_extract(json_in, extract_list):
    """
    Extract data from json using a list of jsonpath expressions
    """

    ret = {}

    for name, jpath in extract_list.items():

        # parsing jsonpath is expensive, so cache parsed expressions in the
        # global gjpaths dict
        if jpath not in gjpaths:
            gjpaths[jpath] = jpparse(jpath)

        jexpr = gjpaths[jpath]

        matches = [match.value for match in jexpr.find(json_in)]

        # If only a single match, unwrap it from the list
        if len(matches) == 1:
            ret[name] = matches[0]
        else:
            ret[name] = matches

    logger.debug("extracted data: %s", ret)

    return ret
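

# For illustration (hypothetical build data): with
#
#   json_in = {"result": "SUCCESS", "actions": [{"failCount": 0}]}
#   extract_list = {"result": "result", "failCount": "actions[*].failCount"}
#
# jsonpath_extract returns {"result": "SUCCESS", "failCount": 0}, the single
# match for each expression unwrapped from its list.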


def get_builds_for_job(jobs_dir, local, jenkins_url, job_name, headers=[]):
    """
    Download list of builds from a Jenkins job, return list of build ids
    """

    # where to store jenkins JSON output with builds list
    jbuildlist = "%s/%s/%s/0_list.json" % (jobs_dir, clean_url(jenkins_url), job_name)

    if os.path.isfile(jbuildlist) and local:
        # if already downloaded and want to use the local copy, load it
        jl = json_file_load(jbuildlist)
    else:
        # if not, query jenkins for the list of job builds
        jlu = jenkins_job_list_url(jenkins_url, job_name)
        jl = jenkins_api_get(jlu, headers)

        # save to disk
        json_file_dump(jbuildlist, jl)

    # JSONPath for list of builds in the job
    jexpr = jpparse("builds[*].number")

    # get a list of builds
    buildlist = [build.value for build in jexpr.find(jl)]

    return buildlist
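

# The Jenkins job API JSON includes, among many other fields, a "builds"
# list; a trimmed, illustrative response might look like:
#
#   {"builds": [{"number": 12, "url": "..."}, {"number": 11, "url": "..."}]}
#
# which the "builds[*].number" JSONPath above reduces to [12, 11].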


def get_jenkins_build(jobs_dir, jenkins_url, job_name, build_id, headers=[]):
    """
    Download a single build and store it on disk, if job has completed
    """

    # path to store a copy of the JSON received from Jenkins
    jjson = "%s/%s/%s/%d_build.json" % (
        jobs_dir,
        clean_url(jenkins_url),
        job_name,
        build_id,
    )

    if os.path.isfile(jjson):
        # if have already run and local copy exists, read/return local copy
        braw = json_file_load(jjson)
    else:
        # make an API call to get the JSON, store locally
        burl = jenkins_job_build_url(jenkins_url, job_name, build_id)
        braw = jenkins_api_get(burl, headers)

        # if the build is still in progress the result field is null, so
        # don't save a copy or return the build, as its status is not final
        if not braw["result"]:
            return None

        # save to disk
        json_file_dump(jjson, braw)

    return braw


def get_all_jenkins_builds(jobs_dir, jenkins_url, job_name, build_ids, headers=[]):
    """
    Get a list of all jenkins build data, for completed builds
    """

    builds_list = []

    # download build data for all builds
    for build_id in build_ids:

        # use the function parameters, not the module-level globals that
        # happen to exist when this module is run as a script
        build = get_jenkins_build(jobs_dir, jenkins_url, job_name, build_id, headers)

        # may return None if build is in progress
        if build:
            builds_list.append(build)

    return builds_list


def clean_name(name):
    """
    Clean up a name string. Currently only replaces spaces with underscores
    """
    return name.replace(" ", "_")


def clean_url(url):
    """
    remove prefix and any non-path friendly characters from URL
    """
    return re.sub(r"\W", "_", re.sub(r"\w+://", "", url))
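

# Examples (made-up inputs):
#
#   clean_name("Example Product")             -> "Example_Product"
#   clean_url("https://jenkins.example.org")  -> "jenkins_example_org"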


def save_product_builds(product_doc, product_dir, builds):
    """
    save the product-specific build information, if it's applicable to this
    product based on the filters
    """

    # duplicate the scrape doc into final product data
    product_data = dict(product_doc)

    # used to hold groups of jobs
    groups = {}

    # each doc can have multiple job groups (usually version-specific)
    for jobgroup in product_doc["jenkins_jobs"]:

        groups[jobgroup["group"]] = {}

        # each job group can have multiple jobs
        for job in jobgroup["jobs"]:

            pbuilds = []

            # get the build data for the job
            for build in builds[job["name"]]:

                jpedata = jsonpath_extract(build, job["extract"])

                # filter builds
                save = True
                if "filter" in job:
                    for k, v in job["filter"].items():
                        # if data doesn't match the filter value given, don't save it
                        if jpedata[k] != v:
                            save = False

                if save:
                    pbuilds.append(jpedata)

            # allow job name to be overridden, for private jobs
            if "name_override" in job:
                groups[jobgroup["group"]][job["name_override"]] = pbuilds
            else:
                groups[jobgroup["group"]][job["name"]] = pbuilds

    product_data["groups"] = groups

    product_filename = "%s/%s/%s.json" % (
        product_dir,
        product_doc["onf_project"],
        clean_name(product_doc["product_name"]),
    )

    json_file_dump(product_filename, product_data)


# main function that calls other functions
if __name__ == "__main__":

    args = parse_collector_args()

    if not os.path.isdir(args.product_dir):
        logger.error("Output directory is not a directory: '%s'", args.product_dir)
        sys.exit(1)

    # only print log messages if debugging
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.CRITICAL)

    # read in the credentials file, if that argument was passed
    credentials = {}
    if args.credentials:
        cred_file = yaml.safe_load(args.credentials)
        credentials = cred_file["credentials"]
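
    # A sketch of the credentials file layout implied by the lookups below
    # (names and values are placeholders):
    #
    #   credentials:
    #     my_private_jenkins:
    #       jenkins_api_user: "user"
    #       jenkins_api_token: "token"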

    # read in the Scrape File
    sfile = parse_scrape_file(args.scrape_file)

    # dict of job name -> build data
    builds = {}

    # Scrape File YAML may contain multiple documents
    for sdoc in sfile:

        # phase 1 - identify all the Jenkins jobs
        # each doc can have multiple job groups (usually version-specific)
        for jobgroup in sdoc["jenkins_jobs"]:

            api_headers = []

            if "credentials" in jobgroup:
                if jobgroup["credentials"] in credentials:
                    api_headers = [
                        basic_auth_header(
                            credentials[jobgroup["credentials"]]["jenkins_api_user"],
                            credentials[jobgroup["credentials"]]["jenkins_api_token"],
                        )
                    ]
                else:
                    logger.error(
                        "Credentials for '%s' not supplied", jobgroup["credentials"]
                    )
                    sys.exit(1)

            # each job group can have multiple jobs
            for job in jobgroup["jobs"]:

                # only download jobs that haven't been downloaded before
                if job["name"] not in builds:

                    # get the list of all build IDs for the job
                    build_ids = get_builds_for_job(
                        args.jobs_dir,
                        args.local,
                        jobgroup["jenkins_url"],
                        job["name"],
                        api_headers,
                    )

                    # get build info - either download or load from disk
                    builds[job["name"]] = get_all_jenkins_builds(
                        args.jobs_dir,
                        jobgroup["jenkins_url"],
                        job["name"],
                        build_ids,
                        api_headers,
                    )

        # phase 2 - create per-product (document) lists of build extracted data
        save_product_builds(sdoc, args.product_dir, builds)