Sergio Slobodrian | ee417fa | 2017-08-11 09:34:50 -0400 | [diff] [blame] | 1 | #!/bin/bash |
Zack Williams | 41513bf | 2018-07-07 20:08:35 -0700 | [diff] [blame] | 2 | # Copyright 2017-present Open Networking Foundation |
| 3 | # |
| 4 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | # you may not use this file except in compliance with the License. |
| 6 | # You may obtain a copy of the License at |
| 7 | # |
| 8 | # http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | # |
| 10 | # Unless required by applicable law or agreed to in writing, software |
| 11 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | # See the License for the specific language governing permissions and |
| 14 | # limitations under the License. |
Sergio Slobodrian | ee417fa | 2017-08-11 09:34:50 -0400 | [diff] [blame] | 15 | |
| 16 | # This script will collect all of the pertinent logs from a voltha |
# HA swarm cluster, tar, and bzip2 them to facilitate sending them
Sergio Slobodrian | bcd30b1 | 2017-08-22 22:32:00 -0400 | [diff] [blame] | 18 | # to the suspected issue owner. The replicated storage is used to |
| 19 | # allow all hosts to place the logs in a single place. |
Sergio Slobodrian | ee417fa | 2017-08-11 09:34:50 -0400 | [diff] [blame] | 20 | |
# Root of the voltha installation; other paths in this script are built from it.
volthaDir="/cord/incubator/voltha"

# Bookkeeping for the background "docker service logs" collectors, all keyed
# by service name: the name itself, the collector PID, and the last observed
# size of its output file.
declare -A lNames lPids lSizes
| 25 | |
# Checks if a value is not in a list.
# Arguments: $1    - the value to look for
#            $2... - the candidate values
# Returns:   0 when $1 matches none of the candidates (or there are none),
#            1 as soon as an exact string match is found.
notIn() {
    local needle=$1
    local candidate
    shift
    for candidate in "$@"; do
        if [[ "$candidate" == "$needle" ]]; then
            return 1
        fi
    done
    return 0
}
Sergio Slobodrian | ee417fa | 2017-08-11 09:34:50 -0400 | [diff] [blame] | 33 | |
# Get the list of the other hosts that make up the cluster: skip the header
# row, drop any row containing "*" (docker marks the current node that way)
# or "Down", and keep the second column of what is left.
hosts=$(docker node ls | awk 'NR > 1 && !/\*/ && !/Down/ {print $2}')

echo "Collecting logs for hosts: $(hostname) ${hosts}"
Sergio Slobodrian | ee417fa | 2017-08-11 09:34:50 -0400 | [diff] [blame] | 38 | |
# Create a temporary directory for temporary storage of all the logs and make
# it the working directory. Everything below writes its output with relative
# paths, so abort if the directory cannot be created or entered rather than
# scattering log files over (and later cleaning up in) the caller's directory.
if [ ! -d "${volthaDir}/registry_data/registry_volume/log_tmp" ]; then
    mkdir "${volthaDir}/registry_data/registry_volume/log_tmp" || {
        echo "Unable to create ${volthaDir}/registry_data/registry_volume/log_tmp" >&2
        exit 1
    }
fi
pushd "${volthaDir}/registry_data/registry_volume/log_tmp" || {
    echo "Unable to enter ${volthaDir}/registry_data/registry_volume/log_tmp" >&2
    exit 1
}
Sergio Slobodrian | ee417fa | 2017-08-11 09:34:50 -0400 | [diff] [blame] | 42 | |
# Docker health in general: snapshot the node list and the service list.

echo "Getting docker node ls"
docker node ls > docker_node_ls.log 2>&1
echo "Getting docker service ls"
docker service ls > docker_service_ls.log 2>&1

# Get the list of services to ps each one and get logs for each one
# (second column of "docker service ls", header row skipped).
svcs=$(docker service ls | awk 'NR > 1 {print $2}')

# Get the PS information, one output file per service.
# $svcs is deliberately unquoted so it splits into one word per service.
for svc in $svcs; do
    echo "Getting docker service ps $svc"
    docker service ps "${svc}" > "docker_service_ps_${svc}" 2>&1
done
| 59 | |
# Get the logs for each service. Each collector runs in the background; its
# PID, name and last observed output size (0 to start with) are recorded per
# service so the wait loop that follows can track progress.
# $svcs is deliberately unquoted so it splits into one word per service.
for svc in $svcs; do
    echo "Getting docker service logs $svc"
    docker service logs "${svc}" > "docker_service_logs_${svc}" 2>&1 &
    lPids[$svc]=$!
    lNames[$svc]=$svc
    lSizes[$svc]=0
done
| 69 | |
# Wait for the background collectors. Every 10 seconds:
#   1. drop collectors whose PID is no longer reported by "jobs -p";
#   2. for the remaining ones, compare the output file size with the size
#      seen on the previous poll and reset the patience counter if any grew;
#   3. decrement patience, and when it reaches 0 send TERM to whatever is
#      still running and stop waiting.
patience=5
while [ "${#lNames[*]}" -ne 0 ]
do
    echo "*** Waiting on log collection to complete (patience = ${patience}). Outstanding jobs: ${#lNames[*]} (${lNames[*]})"
    sleep 10
    # Check which collectors are done and remove them from the list
    jobs > /dev/null # Don't delete this seemingly useless line or the next one will erroneously report a PID
    pids=$(jobs -p)
    for i in "${lNames[@]}"
    do
        # $pids is deliberately unquoted: one argument per running PID.
        if notIn "${lPids[$i]}" $pids; then
            # Quote the subscripts so the shell cannot glob-expand them
            # against files in the current directory.
            unset "lPids[$i]"
            unset "lNames[$i]"
            unset "lSizes[$i]"
        fi
    done
    unset pids
    # Now for all remaining jobs check the file size of the log file for growth
    # reset the timeout if the file is still growing. If no files are still growing
    # then don't touch the timeout.
    for i in "${lNames[@]}"
    do
        # If stat fails, treat the file as empty (no growth) instead of
        # feeding an empty operand to the numeric comparison below.
        fsz=$(stat --format=%s "docker_service_logs_${i}") || fsz=0
        if [ "${lSizes[$i]}" -lt "${fsz:-0}" ]; then
            patience=5
            lSizes[$i]=$fsz
        fi
    done
    patience=$((patience - 1))
    if [ "$patience" -eq 0 ]; then
        echo "Log collection stuck, killing any active collectors"
        for i in "${lNames[@]}"
        do
            echo "${i}:${lNames[$i]}:${lSizes[$i]}:${lPids[$i]}"
            kill -s TERM "${lPids[$i]}"
        done
        break
    fi
done
| 109 | |
| 110 | # Get the image list from this host |
Sergio Slobodrian | bcd30b1 | 2017-08-22 22:32:00 -0400 | [diff] [blame] | 111 | #echo "Getting docker image ls from `hostname`" |
| 112 | #docker image ls > docker_image_ls_`hostname` 2>&1 |
| 113 | # Get the memory info for this host |
| 114 | #echo "Getting memory info from `hostname`" |
| 115 | #cat /proc/meminfo > meminfo_`hostname` 2>&1 |
| 116 | # Get the disk info for this host |
| 117 | #echo "Getting disk info from `hostname`" |
| 118 | #df -h > df_`hostname` 2>&1 |
| 119 | |
| 120 | # |
| 121 | # If too many logs are generated it's not unusual that docker service logs |
| 122 | # hangs and never produces the totality of logs for a service. In order |
| 123 | # to get as much information as possible get the individual container logs |
| 124 | # for each container on each host |
| 125 | # |
| 126 | |
# Get the logs for this host
"${volthaDir}/get-host-logs.sh"


# Get the logs for the other hosts by running the same script on each peer
# over ssh as the voltha user.
# $hosts is deliberately unquoted so it splits into one word per host.
for peer in $hosts; do
    ssh "voltha@${peer}" "${volthaDir}/get-host-logs.sh"
done
| 136 | |
# Leave the log_tmp working directory.
popd

# Archive from the parent of log_tmp so the tar entries are stored under
# log_tmp/. Abort if that directory cannot be entered: both the tar glob and
# the cleanup below use relative paths and must not run anywhere else.
pushd "${volthaDir}/registry_data/registry_volume" || {
    echo "Unable to enter ${volthaDir}/registry_data/registry_volume" >&2
    exit 1
}
# Only remove the collected logs once they are safely in the archive.
if tar cjvf "${volthaDir}/logs.tar$(date "+%Y%m%d-%H:%M:%S").bz2" log_tmp/*; then
    rm -fr log_tmp
else
    echo "Failed to create the log archive; leaving log_tmp in place" >&2
fi
popd
Sergio Slobodrian | ee417fa | 2017-08-11 09:34:50 -0400 | [diff] [blame] | 142 | |
| 143 | |
| 144 | |