#!/bin/bash

# This script will collect all of the pertinent logs from a voltha
# HA swarm cluster host and place them in replicated storage.

volthaDir="/cord/incubator/voltha"
hName=$(hostname)
declare -A lNames
declare -A lPids
declare -A lSizes
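# lNames tracks the containers whose logs are still being collected,
# lPids holds the PID of each background "docker logs" job, and lSizes
# holds the last observed size of each container's log file.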

# Returns success (0) if the first argument does not appear among the
# remaining arguments, failure (1) otherwise.
notIn() {
    local e match=$1
    shift
    for e; do [[ "$e" == "$match" ]] && return 1; done
    return 0
}
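# For example, the call `notIn "${lPids[$i]}" $pids` below succeeds only
# when the recorded collector PID is no longer among the running jobs.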

pushd "${volthaDir}/registry_data/registry_volume/log_tmp"

# Get the image list from this host
echo "Getting docker image ls from ${hName}"
docker image ls > "docker_image_ls_${hName}" 2>&1
# Get the memory info for this host
echo "Getting memory info from ${hName}"
cat /proc/meminfo > "meminfo_${hName}" 2>&1
# Get the disk info for this host
echo "Getting disk info from ${hName}"
df -h > "df_${hName}" 2>&1

#
# When a service produces a large volume of logs, "docker service logs"
# can hang and never return the complete log for the service. To capture
# as much information as possible, collect the individual container logs
# for each container on each host instead.
#
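# Each container's logs are therefore collected in a background job and the
# resulting files are monitored for growth until every collector finishes
# or stalls.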

# Get the container logs for this host.
# Compute the cut range that isolates the NAMES column of "docker ps":
# st is the character offset where the NAMES header begins and ed bounds
# the field 100 characters further on.
st=$(docker ps | head -n 1 | sed -e 's/NAMES.*//' | wc -c)
ed=$((st + 100))
containers=$(docker ps | tail -n +2 | awk '{print $1}')
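# Launch one background "docker logs" collector per running container and
# record its name, PID, and initial (zero) log size for tracking below.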
for i in $containers
do
    cont=$(docker ps | grep "$i" | cut -c ${st}-${ed})
    lNames[$cont]=$cont
    lSizes[$cont]=0
    echo "Getting logs for ${cont} on host ${hName}"
    docker logs "$i" > "docker_logs_${hName}_${cont}" 2>&1 &
    lPids[$cont]=$!
done
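# Wait for the collectors to finish. Every 10 seconds any collector whose
# background job has exited is dropped from the tracking arrays. The
# "patience" counter is reset whenever a remaining log file grows; if it
# ever reaches zero the surviving collectors are assumed to be stuck and
# are terminated.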

patience=5
while [ "${#lNames[*]}" -ne 0 ]
do
    echo "*** Waiting on log collection to complete (patience = ${patience}). Outstanding jobs: ${#lNames[*]} (${lNames[@]})"
    sleep 10
    # Check which collectors are done and remove them from the list.
    # The bare "jobs" call below reaps finished background jobs; without it,
    # "jobs -p" on the next line would erroneously report the PID of a
    # completed collector as still running.
    jobs > /dev/null
    pids=$(jobs -p)
    for i in "${lNames[@]}"
    do
        if notIn "${lPids[$i]}" $pids; then
            unset lPids[$i]
            unset lNames[$i]
            unset lSizes[$i]
        fi
    done
    unset pids
    # For all remaining jobs, check the size of the log file and reset the
    # timeout if the file is still growing. If no file is growing, leave
    # the timeout alone so it continues to count down.
    for i in "${lNames[@]}"
    do
        fsz=$(stat --format=%s "docker_logs_${hName}_${i}")
        if [ ${lSizes[$i]} -lt $fsz ]; then
            patience=5
            lSizes[$i]=$fsz
        fi
    done
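    # One 10-second interval has passed; once the patience counter reaches
    # zero without any log file having grown, the remaining collectors are
    # assumed to be stuck and are killed below.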
    patience=$((patience - 1))
    if [ $patience -eq 0 ]; then
        echo "Log collection stuck, killing any active collectors"
        for i in "${lNames[@]}"
        do
            echo "${i}:${lNames[$i]}:${lSizes[$i]}:${lPids[$i]}"
            kill -s TERM ${lPids[$i]}
        done
        break
    fi
done


popd