#!/bin/bash

# This script collects all of the pertinent logs from a voltha HA swarm
# cluster, then tars and bzip2-compresses them to facilitate sending them
# to the suspected issue owner. The replicated storage is used to
# allow all hosts to place the logs in a single place.

# Directory holding the voltha helper scripts and the registry volume that
# the rest of this script writes into.
volthaDir="/cord/incubator/voltha"

# Bookkeeping for the background "docker service logs" collectors.
# All three maps are keyed by service name.
declare -A lNames=()   # service name -> service name (the set of outstanding jobs)
declare -A lPids=()    # service name -> PID of its background collector
declare -A lSizes=()   # service name -> last observed size of its log file

# Checks if a value is not in an array.
# Arguments: $1 - the value to look for
#            $2.. - the candidate values to compare against
# Returns:   0 if $1 matches none of the candidates, 1 if it matches one
notIn() {
  local e match=$1
  shift
  # "for e" with no "in" list iterates over the remaining positional params.
  for e; do
    if [[ "$e" == "$match" ]]; then
      return 1
    fi
  done
  return 0
}

# Get the list of the other hosts that make up the cluster: drop the header
# row, the row containing "*" (docker marks the current node that way) and
# any row containing "Down", then keep the second column.
hosts=$(docker node ls | tail -n +2 | grep -v -F '*' | grep -v 'Down' | awk '{print $2}')

echo "Collecting logs for hosts: $(hostname) ${hosts}"

# Create a temporary directory for temporary storage of all the logs.
# Everything below writes relative to this directory, so bail out if we
# cannot get into it rather than scattering logs in the caller's cwd.
mkdir -p "${volthaDir}/registry_data/registry_volume/log_tmp"
pushd "${volthaDir}/registry_data/registry_volume/log_tmp" || {
  echo "Unable to enter ${volthaDir}/registry_data/registry_volume/log_tmp" >&2
  exit 1
}

# Docker health in general.

echo "Getting docker node ls"
docker node ls > docker_node_ls.log 2>&1
echo "Getting docker service ls"
docker service ls > docker_service_ls.log 2>&1

# Get the list of services to ps each one and get logs for each one.
# The header row is skipped and the second column is kept.
svcs=$(docker service ls | tail -n +2 | awk '{print $2}')

# Get the PS information
# $svcs is intentionally unquoted: it is a whitespace-separated list.
for i in $svcs; do
  echo "Getting docker service ps $i"
  docker service ps "${i}" > "docker_service_ps_${i}" 2>&1
done

# Get the logs for each service. Every collector is started in the
# background and tracked by service name in lNames/lPids/lSizes so the
# loop below can watch them.
# $svcs is intentionally unquoted: it is a whitespace-separated list.
for i in $svcs; do
  echo "Getting docker service logs $i"
  lNames[$i]=$i
  lSizes[$i]=0
  docker service logs "${i}" > "docker_service_logs_${i}" 2>&1 &
  lPids[$i]=$!
done

# Wait for the collectors. "patience" counts down once per 10 second pass
# and is reset whenever any outstanding log file has grown; when it reaches
# zero the remaining collectors are considered stuck and are terminated.
patience=5
while [ "${#lNames[@]}" -ne 0 ]; do
  echo "*** Waiting on log collection to complete (patience = ${patience}). Outstanding jobs: ${#lNames[@]} (${lNames[*]})"
  sleep 10
  # Check which collectors are done and remove them from the list
  jobs > /dev/null # Don't delete this useless line or the next one will erroneously report a PID
  pids=$(jobs -p)
  for i in "${lNames[@]}"; do
    # $pids is intentionally unquoted so each PID becomes its own argument.
    if notIn "${lPids[$i]}" $pids; then
      # Quote the subscripts so they are never treated as glob patterns.
      unset "lPids[$i]"
      unset "lNames[$i]"
      unset "lSizes[$i]"
    fi
  done
  unset pids
  # Everything finished on this pass: leave now so an expiring patience
  # counter cannot report a stuck collection that is in fact complete.
  if [ "${#lNames[@]}" -eq 0 ]; then
    break
  fi
  # Now for all remaining jobs check the file size of the log file for growth
  # reset the timeout if the file is still growing. If no files are still growing
  # then don't touch the timeout.
  for i in "${lNames[@]}"; do
    # Treat an unreadable log file as empty instead of feeding "" to the test.
    fsz=$(stat --format=%s "docker_service_logs_${i}" 2> /dev/null) || fsz=0
    if [ "${lSizes[$i]}" -lt "$fsz" ]; then
      patience=5
      lSizes[$i]=$fsz
    fi
  done
  patience=$((patience - 1))
  if [ "$patience" -le 0 ]; then
    echo "Log collection stuck, killing any active collectors"
    for i in "${lNames[@]}"; do
      echo "${i}:${lNames[$i]}:${lSizes[$i]}:${lPids[$i]}"
      kill -s TERM "${lPids[$i]}"
    done
    break
  fi
done

# Get the image list from this host
#echo "Getting docker image ls from `hostname`"
#docker image ls > docker_image_ls_`hostname` 2>&1
# Get the memory info for this host
#echo "Getting memory info from `hostname`"
#cat /proc/meminfo > meminfo_`hostname` 2>&1
# Get the disk info for this host
#echo "Getting disk info from `hostname`"
#df -h > df_`hostname` 2>&1

#
# If too many logs are generated it's not unusual that docker service logs
# hangs and never produces the totality of logs for a service. In order
# to get as much information as possible get the individual container logs
# for each container on each host
#

# Get the logs for this host
"${volthaDir}/get-host-logs.sh"


# Get the logs for the other hosts
# $hosts is intentionally unquoted: it is a whitespace-separated list.
for i in $hosts; do
  ssh "voltha@${i}" "${volthaDir}/get-host-logs.sh"
done

# Leave the log_tmp directory entered earlier in the script.
popd

# Archive from the parent directory so the paths inside the tarball are
# relative ("log_tmp/..."). Abort if we cannot get there: the cleanup below
# uses a relative path and must never run from an unexpected directory.
pushd "${volthaDir}/registry_data/registry_volume" || {
  echo "Unable to enter ${volthaDir}/registry_data/registry_volume" >&2
  exit 1
}
# Only discard the collected logs once they are safely in the archive.
if tar cjvf "${volthaDir}/logs.tar.bz2" log_tmp/*; then
  rm -fr log_tmp
else
  echo "Failed to create ${volthaDir}/logs.tar.bz2, leaving log_tmp in place" >&2
fi
popd