#!/bin/bash

# This script collects all of the pertinent logs from a voltha
# HA swarm cluster, then tars and bzips them to facilitate sending them
# to the suspected issue owner. The replicated storage is used to
# allow all hosts to place the logs in a single place.

# Root of the voltha installation. The log staging directory
# (registry_data/registry_volume/log_tmp), the get-host-logs.sh helper and
# the resulting logs.tar.bz2 archive are all addressed relative to it.
volthaDir="/cord/incubator/voltha"

# Book-keeping for the background "docker service logs" collectors.
# All three maps are keyed by service name.
declare -A lNames=()  # service name -> service name (outstanding collectors)
declare -A lPids=()   # service name -> PID of its background collector
declare -A lSizes=()  # service name -> last observed size of its log file

# Checks if a value is not in an array.
# Arguments:
#   $1   - the value to look for
#   $2.. - the candidate values (typically an expanded array or word list)
# Returns:
#   0 if $1 matches none of the candidates (including an empty list),
#   1 if $1 is equal to at least one of them.
notIn() {
  local e match=$1
  shift
  # "for e" with no list iterates over the remaining positional parameters.
  for e; do
    # Both sides are quoted, so this is a literal string comparison rather
    # than a glob pattern match.
    [[ "$e" == "$match" ]] && return 1
  done
  return 0
}

# Get the list of the other hosts that make up the cluster: skip the header
# row, the row flagged with "*" and any row containing "Down", then keep the
# second column. -F makes grep treat "*" as a literal character instead of
# relying on how a leading "*" is interpreted in a regular expression.
hosts=$(docker node ls | tail -n +2 | grep -vF '*' | grep -v "Down" | awk '{print $2}')

echo "Collecting logs for hosts: $(hostname) ${hosts}"

# Create a temporary directory for temporary storage of all the logs and
# make it the working directory. Every log file below is written with a
# relative path, so abort if the directory cannot be entered rather than
# scattering files into whatever the current directory happens to be.
mkdir -p "${volthaDir}/registry_data/registry_volume/log_tmp"
pushd "${volthaDir}/registry_data/registry_volume/log_tmp" || exit 1

# Docker health in general.

echo "Getting docker node ls"
docker node ls > docker_node_ls.log 2>&1
echo "Getting docker service ls"
docker service ls > docker_service_ls.log 2>&1

# Get the list of services to ps each one and get logs for each one.
# The header row is skipped and the second column of each row is kept.
svcs=$(docker service ls | tail -n +2 | awk '{print $2}')

# Get the PS information
# $svcs is deliberately unquoted: it is a whitespace-separated list that
# must be split into one word per service.
for i in $svcs; do
  echo "Getting docker service ps $i"
  docker service ps "${i}" > "docker_service_ps_${i}" 2>&1
done

# Get the logs for each service
# Each collector runs in the background; its name, PID and an initial size
# of 0 are recorded so the wait loop can track completion and file growth.
# $svcs is deliberately unquoted so it splits into one word per service.
for i in $svcs; do
  echo "Getting docker service logs $i"
  lNames[$i]=$i
  lSizes[$i]=0
  docker service logs "${i}" > "docker_service_logs_${i}" 2>&1 &
  lPids[$i]=$!
done

# Wait for the background collectors, polling every 10 seconds. "patience"
# is decremented on every pass and reset to 5 whenever any outstanding log
# file has grown; when it reaches 0 the remaining collectors are sent TERM
# and the loop is abandoned.
patience=5
while [ "${#lNames[@]}" -ne 0 ]; do
  echo "*** Waiting on log collection to complete (patience = ${patience}). Outstanding jobs: ${#lNames[*]} (${lNames[*]})"
  sleep 10
  # Check which collectors are done and remove them from the list
  jobs > /dev/null # Don't delete this useless line or the next one will erroneously report a PID
  pids=$(jobs -p)
  for i in "${lNames[@]}"; do
    # $pids is deliberately unquoted: it is a whitespace-separated PID list.
    if notIn "${lPids[$i]}" $pids; then
      # Quoted so the subscripted names are never subject to glob expansion.
      unset "lPids[$i]"
      unset "lNames[$i]"
      unset "lSizes[$i]"
    fi
  done
  unset pids
  # Now for all remaining jobs check the file size of the log file for growth
  # reset the timeout if the file is still growing. If no files are still growing
  # then don't touch the timeout.
  for i in "${lNames[@]}"; do
    fsz=$(stat --format=%s "docker_service_logs_${i}")
    # Treat an unreadable file as size 0 instead of breaking the comparison.
    if [ "${lSizes[$i]}" -lt "${fsz:-0}" ]; then
      patience=5
      lSizes[$i]=$fsz
    fi
  done
  patience=$((patience - 1))
  if [ "$patience" -eq 0 ]; then
    echo "Log collection stuck, killing any active collectors"
    for i in "${lNames[@]}"; do
      echo "${i}:${lNames[$i]}:${lSizes[$i]}:${lPids[$i]}"
      kill -s TERM "${lPids[$i]}"
    done
    break
  fi
done

# Get the image list from this host
#echo "Getting docker image ls from `hostname`"
#docker image ls > docker_image_ls_`hostname` 2>&1
# Get the memory info for this host
#echo "Getting memory info from `hostname`"
#cat /proc/meminfo > meminfo_`hostname` 2>&1
# Get the disk info for this host
#echo "Getting disk info from `hostname`"
#df -h > df_`hostname` 2>&1

#
# If too many logs are generated, it's not unusual for docker service logs
# to hang and never produce the totality of the logs for a service. In order
# to get as much information as possible, get the individual container logs
# for each container on each host.
#

# Get the logs for this host
"${volthaDir}/get-host-logs.sh"


# Get the logs for the other hosts
# The same helper is run over ssh as user "voltha" on every other host.
# $hosts is deliberately unquoted so it splits into one word per host.
for i in $hosts; do
  ssh "voltha@${i}" "${volthaDir}/get-host-logs.sh"
done

# Leave the temporary log directory, bundle its contents into
# ${volthaDir}/logs.tar.bz2 and remove it.
popd
# Abort if the parent directory cannot be entered: the tar and rm below use
# the relative path "log_tmp" and must not run against some other directory.
pushd "${volthaDir}/registry_data/registry_volume" || exit 1
tar cjvf "${volthaDir}/logs.tar.bz2" log_tmp/*
rm -fr log_tmp
popd