VOL-384 This update improves the issue-reporting log collector so that if a specific docker log request hangs, it is skipped, allowing as many logs as possible to be collected rather than blocking on the hung request. Change-Id: I5e7a38fbfbc781d7a7d8825458106ea7463f3e17

commit: bcd30b17108c5a2eb2b861efc70c68c2834313d6 [log] [tgz]
author: Sergio Slobodrian <sslobodr@ciena.com> Tue Aug 22 22:32:00 2017 -0400
committer: Sergio Slobodrian <sslobodr@ciena.com> Wed Aug 23 09:18:57 2017 -0400
tree: 2c7cabd83f579d8fda5718f62ef9614bb3f17a95
parent: 92136d00878bdfa6929d2bbff8ddc0da7749ae52 [diff]
diff --git a/install/ansible/roles/installer/tasks/main.yml b/install/ansible/roles/installer/tasks/main.yml
index cbf2206..5f39f16 100644
--- a/install/ansible/roles/installer/tasks/main.yml
+++ b/install/ansible/roles/installer/tasks/main.yml

@@ -3,7 +3,7 @@
     repo: 'ppa:ansible/ansible'
   tags: [installer]
 
-- name: Debian ansible is present
+- name: Ansible is present
   apt:
     name: ansible
     state: latest
@@ -25,6 +25,7 @@
     - install/voltha-swarm-start.sh
     - install/voltha-swarm-stop.sh
     - install/get-logs.sh
+    - install/get-host-logs.sh
     - install/ansible
     - compose
   tags: [installer]
@@ -97,5 +98,6 @@
     - voltha-swarm-start.sh
     - voltha-swarm-stop.sh
     - get-logs.sh
+    - get-host-logs.sh
     - docker-compose-Linux-x86_64
   tags: [installer]

diff --git a/install/ansible/roles/voltha/tasks/main.yml b/install/ansible/roles/voltha/tasks/main.yml
index fcdefe4..eba2325 100644
--- a/install/ansible/roles/voltha/tasks/main.yml
+++ b/install/ansible/roles/voltha/tasks/main.yml

@@ -45,6 +45,7 @@
     - voltha-swarm-start.sh
     - voltha-swarm-stop.sh
     - get-logs.sh
+    - get-host-logs.sh
   when: target == "cluster"
   tags: [voltha]
 
@@ -83,6 +84,7 @@
     - voltha-swarm-start.sh
     - voltha-swarm-stop.sh
     - get-logs.sh
+    - get-host-logs.sh
   when: target == "cluster"
   tags: [voltha]
 

diff --git a/install/get-host-logs.sh b/install/get-host-logs.sh
new file mode 100755
index 0000000..3e0e443
--- /dev/null
+++ b/install/get-host-logs.sh

@@ -0,0 +1,95 @@
+#!/bin/bash
+
+# This script will collect all of the pertinent logs from a voltha
+# HA swarm cluster host and place them in replicated storage.
+
+volthaDir="/cord/incubator/voltha"
+hName=`hostname`
+declare -A lNames
+declare -A lPids
+declare -A lSizes
+
+# Checks if a value is not in an array.
+notIn() {
+	local e match=$1
+	shift
+	for e; do [[ "$e" == "$match" ]] && return 1; done
+	return 0
+}
+
+pushd ${volthaDir}/registry_data/registry_volume/log_tmp
+
+# Get the image list from this host
+echo "Getting docker image ls from ${hName}"
+docker image ls > docker_image_ls_${hName} 2>&1
+# Get the memory info for this host
+echo "Getting memory info from ${hName}"
+cat /proc/meminfo > meminfo_${hName} 2>&1
+# Get the disk info for this host
+echo "Getting disk info from ${hName}"
+df -h > df_${hName} 2>&1
+
+#
+# If too many logs are generated it's not unusual that docker service logs
+# hangs and never produces the totality of logs for a service. In order
+# to get as much information as possible get the individual container logs
+# for each container on each host
+#
+
+# Get the container logs for this host 
+# Start of cut range
+st=`docker ps | head -n 1 | sed -e 's/NAMES.*//' | wc -c`
+ed=`expr $st + 100`
+containers=`docker ps | tail -n +2 | awk '{print $1}'`
+for i in $containers
+do
+	cont=`docker ps | grep $i | cut -c ${st}-${ed}`
+	lNames[$cont]=$cont
+	lSizes[$cont]=0
+	echo "Getting logs for ${cont} on host ${hName}"
+	docker logs $i > "docker_logs_${hName}_${cont}" 2>&1 &
+	lPids[$cont]=$!
+done
+
+patience=5
+while [ "${#lNames[*]}" -ne 0  ]
+do
+	echo "*** Waiting on log collection to complete. Outstanding jobs: ${#lNames[*]} (${lNames[@]})"
+	sleep 10
+	# Check which collectors are done and remove them from the list
+	jobs > /dev/null # Don't delete this useless line or the next one will erroneously report a PID
+	pids=`jobs -p`
+	for i in "${lNames[@]}"
+	do
+		if notIn "${lPids[$i]}" $pids; then
+			unset lPids[$i]
+			unset lNames[$i]
+			unset lSizes[$i]
+		fi
+	done
+	unset pids
+	# Now, for all remaining jobs, check the log file size for growth and
+	# reset the timeout if the file is still growing. If no files are still
+	# growing then don't touch the timeout.
+	for i in "${lNames[@]}"
+	do
+		fsz=`stat --format=%s "docker_logs_${hName}_${i}"`
+		if [ ${lSizes[$i]} -lt $fsz ]; then
+			patience=5
+			lSizes[$i]=$fsz
+		fi
+	done
+	patience=`expr $patience - 1`
+	if [ $patience -eq 0 ]; then
+		echo "Log collection stuck, killing any active collectors"
+		for i in "${lNames[@]}"
+		do
+			echo "${i}:${lNames[$i]}:${lSizes[$i]}:${lPids[$i]}"
+			kill -s TERM ${lPids[$i]}
+		done
+		break
+	fi
+done
+
+
+popd

diff --git a/install/get-logs.sh b/install/get-logs.sh
index 37d0755..8843497 100644
--- a/install/get-logs.sh
+++ b/install/get-logs.sh

@@ -2,16 +2,30 @@
 
 # This script will collect all of the pertinent logs from a voltha
 # HA swarm cluster, tar, and bizip them to facilitate sending them
-# to the suspected issue owner.
+# to the suspected issue owner. The replicated storage is used to
+# allow all hosts to place the logs in a single place.
 
 volthaDir="/cord/incubator/voltha"
+declare -A lNames
+declare -A lPids
+declare -A lSizes
+
+# Checks if a value is not in an array.
+notIn() {
+	local e match=$1
+	shift
+	for e; do [[ "$e" == "$match" ]] && return 1; done
+	return 0
+}
 
 # Get the list of the other hosts that make up the cluster
-hosts=`docker node ls | tail -n +2 | awk '{print $2}' | grep -v "*"`
+hosts=`docker node ls | tail -n +2 | grep -v "*" | grep -v "Down" | awk '{print $2}'`
+
+echo "Collecting logs for hosts: `hostname` ${hosts}"
 
 # Create a temporary directory for temporary storage of all the logs
-mkdir ${volthaDir}/log_tmp
-pushd ${volthaDir}/log_tmp
+mkdir ${volthaDir}/registry_data/registry_volume/log_tmp
+pushd ${volthaDir}/registry_data/registry_volume/log_tmp
 
 # Docker health in general.
 
@@ -34,38 +48,84 @@
 for i in $svcs
 do
 	echo "Getting docker service logs $i"
+	lNames[$i]=$i
+	lSizes[$i]=0
 	docker service logs ${i} > docker_service_logs_${i} 2>&1 &
+	lPids[$i]=$!
 done
 
-patience=10
-while [ ! -z "`jobs -p`" ]
+patience=5
+while [ "${#lNames[*]}" -ne 0  ]
 do
- echo "*** Waiting on log collection to complete. Outstanding jobs: `jobs -p | wc -l`"
- sleep 10
- patience=`expr $patience - 1`
- if [ $patience -eq 0 ]; then
-  echo "Log collection stuck, killing any active collectors"
-  for i in `jobs -p`
-  do
-   kill -s TERM $i
-  done
-  break
- fi
+	echo "*** Waiting on log collection to complete (patience = ${patience}). Outstanding jobs: ${#lNames[*]} (${lNames[@]})"
+	sleep 10
+	# Check which collectors are done and remove them from the list
+	jobs > /dev/null # Don't delete this useless line or the next one will erroneously report a PID
+	pids=`jobs -p`
+	for i in "${lNames[@]}"
+	do
+		if notIn "${lPids[$i]}" $pids; then
+			unset lPids[$i]
+			unset lNames[$i]
+			unset lSizes[$i]
+		fi
+	done
+	unset pids
+	# Now, for all remaining jobs, check the log file size for growth and
+	# reset the timeout if the file is still growing. If no files are still
+	# growing then don't touch the timeout.
+	for i in "${lNames[@]}"
+	do
+		fsz=`stat --format=%s "docker_service_logs_${i}"`
+		if [ ${lSizes[$i]} -lt $fsz ]; then
+			patience=5
+			lSizes[$i]=$fsz
+		fi
+	done
+	patience=`expr $patience - 1`
+	if [ $patience -eq 0 ]; then
+		echo "Log collection stuck, killing any active collectors"
+		for i in "${lNames[@]}"
+		do
+			echo "${i}:${lNames[$i]}:${lSizes[$i]}:${lPids[$i]}"
+			kill -s TERM ${lPids[$i]}
+		done
+		break
+	fi
 done
 
 # Get the image list from this host
-echo "Getting docker image ls from `hostname`"
-docker image ls > docker_image_ls_`hostname` 2>&1
+#echo "Getting docker image ls from `hostname`"
+#docker image ls > docker_image_ls_`hostname` 2>&1
+# Get the memory info for this host
+#echo "Getting memory info from `hostname`"
+#cat /proc/meminfo > meminfo_`hostname` 2>&1
+# Get the disk info for this host
+#echo "Getting disk info from `hostname`"
+#df -h > df_`hostname` 2>&1
+
+#
+# If too many logs are generated it's not unusual that docker service logs
+# hangs and never produces the totality of logs for a service. In order
+# to get as much information as possible get the individual container logs
+# for each container on each host
+#
+
+# Get the logs for this host
+${volthaDir}/get-host-logs.sh
+
+
+# Get the logs for the other hosts
 for i in $hosts
 do
-	echo "Getting docker image ls from $i"
-	ssh voltha@$i "docker image ls" > docker_image_ls_$i 2>&1
+	ssh voltha@$i ${volthaDir}/get-host-logs.sh
 done
 
-
 popd
-tar cjvf logs.tar.bz2 log_tmp/*
+pushd ${volthaDir}/registry_data/registry_volume
+tar cjvf ${volthaDir}/logs.tar.bz2 log_tmp/*
 rm -fr log_tmp
+popd
commit	bcd30b17108c5a2eb2b861efc70c68c2834313d6	[log] [tgz]
author	Sergio Slobodrian <sslobodr@ciena.com>	Tue Aug 22 22:32:00 2017 -0400
committer	Sergio Slobodrian <sslobodr@ciena.com>	Wed Aug 23 09:18:57 2017 -0400
tree	2c7cabd83f579d8fda5718f62ef9614bb3f17a95
parent	92136d00878bdfa6929d2bbff8ddc0da7749ae52 [diff]