blob: e40d2210c2809b713bfe70712b4a959b5de0f29e [file] [log] [blame]
Sergio Slobodrianee417fa2017-08-11 09:34:50 -04001#!/bin/bash
Zack Williams41513bf2018-07-07 20:08:35 -07002# Copyright 2017-present Open Networking Foundation
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
Sergio Slobodrianee417fa2017-08-11 09:34:50 -040015
# This script will collect all of the pertinent logs from a voltha
# HA swarm cluster, tar, and bzip2 them to facilitate sending them
# to the suspected issue owner. The replicated storage is used to
# allow all hosts to place the logs in a single place.
Sergio Slobodrianee417fa2017-08-11 09:34:50 -040020
# Base directory that holds get-host-logs.sh and the registry volume used
# to stage the collected logs.
volthaDir='/cord/incubator/voltha'

# Bookkeeping for the background "docker service logs" collectors, each map
# keyed by service name: the name itself, the collector PID, and the last
# observed size of its output file.
declare -A lNames lPids lSizes
25
#######################################
# Reports whether a value is absent from a list of words.
# Arguments:
#   $1   - the value to look for
#   $2.. - the words to compare against (exact string comparison)
# Returns:
#   0 if the value matches none of the words, 1 if it matches any of them
#######################################
notIn() {
  local needle=$1
  local candidate
  shift
  for candidate in "$@"; do
    if [[ "$candidate" == "$needle" ]]; then
      return 1
    fi
  done
  return 0
}
Sergio Slobodrianee417fa2017-08-11 09:34:50 -040033
# Build the list of the other hosts in the cluster: skip the header row of
# "docker node ls", any row containing "*" (presumably the marker docker puts
# on the current node) and any row containing "Down", then keep column 2.
hosts=$(docker node ls | awk 'NR > 1 && !/[*]/ && !/Down/ { print $2 }')

echo "Collecting logs for hosts: $(hostname) ${hosts}"
Sergio Slobodrianee417fa2017-08-11 09:34:50 -040038
# Create a temporary directory for temporary storage of all the logs and make
# it the working directory; every log file below is written with a relative
# path. -p keeps a directory left behind by an earlier run from being an
# error, and the script stops if the directory cannot be entered, because
# continuing would scatter the logs into whatever the current directory is.
mkdir -p "${volthaDir}/registry_data/registry_volume/log_tmp"
if ! pushd "${volthaDir}/registry_data/registry_volume/log_tmp"; then
  echo "Unable to enter ${volthaDir}/registry_data/registry_volume/log_tmp, aborting" >&2
  exit 1
fi
Sergio Slobodrianee417fa2017-08-11 09:34:50 -040042
# Docker health in general: capture the node list and the service list,
# stdout and stderr together, into docker_node_ls.log and
# docker_service_ls.log.
for what in node service; do
  echo "Getting docker ${what} ls"
  docker "${what}" ls > "docker_${what}_ls.log" 2>&1
done

49
# Collect the service names (column 2 of "docker service ls", header row
# skipped). The list is reused further down to fetch each service's logs.
svcs=$(docker service ls | awk 'NR > 1 { print $2 }')

# Save the "docker service ps" output of every service in its own file.
for svc in $svcs; do
  echo "Getting docker service ps $svc"
  docker service ps "${svc}" > "docker_service_ps_${svc}" 2>&1
done
59
# Start one background "docker service logs" collector per service and
# record, keyed by service name, the name, a starting file size of 0, and
# the PID of the collector so the wait loop below can track it.
for svc in $svcs; do
  echo "Getting docker service logs $svc"
  lNames["$svc"]="$svc"
  lSizes["$svc"]=0
  docker service logs "${svc}" > "docker_service_logs_${svc}" 2>&1 &
  lPids["$svc"]=$!
done
69
# Wait for the background "docker service logs" collectors to finish.
#
# Every 10 seconds the loop forgets collectors whose PID is no longer listed
# by "jobs -p", then checks whether the output file of any remaining
# collector has grown. ${patience} is reset to 5 whenever a file grew and is
# decremented once per pass; when it reaches 0 the remaining collectors are
# sent SIGTERM and the wait is abandoned.
patience=5
while [ "${#lNames[@]}" -ne 0 ]; do
  echo "*** Waiting on log collection to complete (patience = ${patience}). Outstanding jobs: ${#lNames[*]} (${lNames[*]})"
  sleep 10
  # Check which collectors are done and remove them from the list
  jobs > /dev/null # Don't delete this useless line or the next one will erroneously report a PID
  pids=$(jobs -p)
  for i in "${lNames[@]}"; do
    # $pids is intentionally unquoted: it must split into one word per PID.
    # shellcheck disable=SC2086
    if notIn "${lPids[$i]}" $pids; then
      # Quote the subscripted names so they cannot be treated as glob patterns.
      unset "lPids[$i]" "lNames[$i]" "lSizes[$i]"
    fi
  done
  unset pids
  # Every collector has finished: leave now instead of falling through to
  # the patience check, which could otherwise report a stuck collection
  # even though nothing is outstanding.
  if [ "${#lNames[@]}" -eq 0 ]; then
    break
  fi
  # Now for all remaining jobs check the file size of the log file for growth
  # reset the timeout if the file is still growing. If no files are still growing
  # then don't touch the timeout.
  for i in "${lNames[@]}"; do
    # Treat an unreadable file as size 0 so the comparison below never sees
    # an empty operand.
    fsz=$(stat --format=%s "docker_service_logs_${i}") || fsz=0
    if [ "${lSizes[$i]}" -lt "${fsz}" ]; then
      patience=5
      lSizes[$i]=$fsz
    fi
  done
  patience=$((patience - 1))
  if [ "${patience}" -eq 0 ]; then
    echo "Log collection stuck, killing any active collectors"
    for i in "${lNames[@]}"; do
      echo "${i}:${lNames[$i]}:${lSizes[$i]}:${lPids[$i]}"
      kill -s TERM "${lPids[$i]}"
    done
    break
  fi
done
109
110# Get the image list from this host
Sergio Slobodrianbcd30b12017-08-22 22:32:00 -0400111#echo "Getting docker image ls from `hostname`"
112#docker image ls > docker_image_ls_`hostname` 2>&1
113# Get the memory info for this host
114#echo "Getting memory info from `hostname`"
115#cat /proc/meminfo > meminfo_`hostname` 2>&1
116# Get the disk info for this host
117#echo "Getting disk info from `hostname`"
118#df -h > df_`hostname` 2>&1
119
120#
121# If too many logs are generated it's not unusual that docker service logs
122# hangs and never produces the totality of logs for a service. In order
123# to get as much information as possible get the individual container logs
124# for each container on each host
125#
126
# Run the host-level collector on this host first...
"${volthaDir}/get-host-logs.sh"


# ...then run the same script, over ssh as user voltha, on each of the
# other hosts found earlier.
for node in $hosts; do
  ssh "voltha@${node}" "${volthaDir}/get-host-logs.sh"
done
136
# Leave the staging directory, then bundle it from its parent so that the
# archive members are stored under log_tmp/.
popd
# Stop if the parent directory cannot be entered: the tar and, more
# importantly, the "rm -fr log_tmp" below must not run anywhere else.
if ! pushd "${volthaDir}/registry_data/registry_volume"; then
  echo "Unable to enter ${volthaDir}/registry_data/registry_volume, aborting" >&2
  exit 1
fi
archive="${volthaDir}/logs.tar$(date "+%Y%m%d-%H:%M:%S").bz2"
# Only discard the staged logs once they are safely in the archive; if tar
# fails they are left in place so nothing that was collected is lost.
if tar cjvf "${archive}" log_tmp/*; then
  rm -fr log_tmp
else
  echo "tar failed, leaving collected logs in ${volthaDir}/registry_data/registry_volume/log_tmp" >&2
fi
popd
Sergio Slobodrianee417fa2017-08-11 09:34:50 -0400142
143
144