Blame - install/get-logs.sh - voltha

blob: 2733b885899b310ba256ac2e0f8eca2680607059 [file] [log] [blame]

Sergio Slobodrian	ee417fa	2017-08-11 09:34:50 -0400	[diff] [blame]	1	#!/bin/bash
				2
				3	# This script will collect all of the pertinent logs from a voltha
				4	# HA swarm cluster, tar, and bzip2 them to facilitate sending them
Sergio Slobodrian	bcd30b1	2017-08-22 22:32:00 -0400	[diff] [blame]	5	# to the suspected issue owner. The replicated storage is used to
				6	# allow all hosts to place the logs in a single place.
Sergio Slobodrian	ee417fa	2017-08-11 09:34:50 -0400	[diff] [blame]	7
				8	volthaDir="/cord/incubator/voltha"
Sergio Slobodrian	bcd30b1	2017-08-22 22:32:00 -0400	[diff] [blame]	9	declare -A lNames
				10	declare -A lPids
				11	declare -A lSizes
				12
				13	# Checks if a value is not in an array.
				14	notIn() {
				15	local e match=$1
				16	shift
				17	for e; do [[ "$e" == "$match" ]] && return 1; done
				18	return 0
				19	}
Sergio Slobodrian	ee417fa	2017-08-11 09:34:50 -0400	[diff] [blame]	20
				21	# Get the list of the other hosts that make up the cluster
Sergio Slobodrian	bcd30b1	2017-08-22 22:32:00 -0400	[diff] [blame]	22	hosts=`docker node ls \| tail -n +2 \| grep -v "*" \| grep -v "Down" \| awk '{print $2}'`
				23
				24	echo "Collecting logs for hosts: `hostname` ${hosts}"
Sergio Slobodrian	ee417fa	2017-08-11 09:34:50 -0400	[diff] [blame]	25
				26	# Create a temporary directory for temporary storage of all the logs
Sergio Slobodrian	bcd30b1	2017-08-22 22:32:00 -0400	[diff] [blame]	27	mkdir ${volthaDir}/registry_data/registry_volume/log_tmp
				28	pushd ${volthaDir}/registry_data/registry_volume/log_tmp
Sergio Slobodrian	ee417fa	2017-08-11 09:34:50 -0400	[diff] [blame]	29
				30	# Docker health in general.
				31
				32	echo "Getting docker node ls"
				33	docker node ls > docker_node_ls.log 2>&1
				34	echo "Getting docker service ls"
				35	docker service ls > docker_service_ls.log 2>&1
				36
				37	# Get the list of services to ps each one and get logs for each one.
				38	svcs=`docker service ls \| tail -n +2 \| awk '{print $2}'`
				39
				40	# Get the PS information
				41	for i in $svcs
				42	do
				43	echo "Getting docker service ps $i"
				44	docker service ps ${i} > docker_service_ps_${i} 2>&1
				45	done
				46
				47	# Get the logs for each service
				48	for i in $svcs
				49	do
				50	echo "Getting docker service logs $i"
Sergio Slobodrian	bcd30b1	2017-08-22 22:32:00 -0400	[diff] [blame]	51	lNames[$i]=$i
				52	lSizes[$i]=0
Sergio Slobodrian	ee417fa	2017-08-11 09:34:50 -0400	[diff] [blame]	53	docker service logs ${i} > docker_service_logs_${i} 2>&1 &
Sergio Slobodrian	bcd30b1	2017-08-22 22:32:00 -0400	[diff] [blame]	54	lPids[$i]=$!
Sergio Slobodrian	ee417fa	2017-08-11 09:34:50 -0400	[diff] [blame]	55	done
				56
# Wait for the background collectors to finish. Every 10 seconds the finished
# collectors are dropped from the bookkeeping maps and the log files of the
# remaining ones are checked for growth. "patience" is decremented on every
# poll and reset whenever at least one file grew; when it reaches zero the
# remaining collectors are considered stuck and are terminated.
patience=5
while [ "${#lNames[@]}" -ne 0 ]; do
  echo "*** Waiting on log collection to complete (patience = ${patience}). Outstanding jobs: ${#lNames[*]} (${lNames[*]})"
  sleep 10
  # Check which collectors are done and remove them from the list
  jobs > /dev/null # Don't delete this useless line or the next one will erroneously report a PID
  pids=$(jobs -p)
  for i in "${lNames[@]}"; do
    # ${pids} is deliberately unquoted: one argument per running PID.
    if notIn "${lPids[$i]}" $pids; then
      # Quote the subscripts so the shell cannot glob-expand "name[key]".
      unset "lPids[$i]"
      unset "lNames[$i]"
      unset "lSizes[$i]"
    fi
  done
  unset pids
  # Now for all remaining jobs check the file size of the log file for growth
  # reset the timeout if the file is still growing. If no files are still growing
  # then don't touch the timeout.
  for i in "${lNames[@]}"; do
    # Treat an unreadable/missing file as size 0 instead of feeding an empty
    # operand to the numeric comparison below.
    fsz=$(stat --format=%s "docker_service_logs_${i}") || fsz=0
    if [ "${lSizes[$i]}" -lt "${fsz}" ]; then
      patience=5
      lSizes[$i]=$fsz
    fi
  done
  patience=$(( patience - 1 ))
  # Only report a stall when collectors are actually still outstanding; if the
  # last one finished during this poll the loop condition ends the wait.
  if [ "${patience}" -le 0 ] && [ "${#lNames[@]}" -ne 0 ]; then
    echo "Log collection stuck, killing any active collectors"
    for i in "${lNames[@]}"; do
      echo "${i}:${lNames[$i]}:${lSizes[$i]}:${lPids[$i]}"
      kill -s TERM "${lPids[$i]}"
    done
    break
  fi
done
				96
				97	# Get the image list from this host
Sergio Slobodrian	bcd30b1	2017-08-22 22:32:00 -0400	[diff] [blame]	98	#echo "Getting docker image ls from `hostname`"
				99	#docker image ls > docker_image_ls_`hostname` 2>&1
				100	# Get the memory info for this host
				101	#echo "Getting memory info from `hostname`"
				102	#cat /proc/meminfo > meminfo_`hostname` 2>&1
				103	# Get the disk info for this host
				104	#echo "Getting disk info from `hostname`"
				105	#df -h > df_`hostname` 2>&1
				106
				107	#
				108	# If too many logs are generated it's not unusual that docker service logs
				109	# hangs and never produces the totality of logs for a service. In order
				110	# to get as much information as possible get the individual container logs
				111	# for each container on each host
				112	#
				113
				114	# Get the logs for this host
				115	${volthaDir}/get-host-logs.sh
				116
				117
				118	# Get the logs for the other hosts
Sergio Slobodrian	ee417fa	2017-08-11 09:34:50 -0400	[diff] [blame]	119	for i in $hosts
				120	do
Sergio Slobodrian	bcd30b1	2017-08-22 22:32:00 -0400	[diff] [blame]	121	ssh voltha@$i ${volthaDir}/get-host-logs.sh
Sergio Slobodrian	ee417fa	2017-08-11 09:34:50 -0400	[diff] [blame]	122	done
				123
# Leave the log_tmp directory and move to its parent so the archive members
# are stored with the relative prefix "log_tmp/".
popd
# Abort if the parent cannot be entered: the relative tar and rm below must
# never run against a log_tmp in some other directory.
pushd "${volthaDir}/registry_data/registry_volume" || exit 1
# Bundle everything into a timestamped bzip2-compressed tarball and remove the
# temporary directory only once the archive was written successfully, so a
# failed tar does not throw away the collected logs.
if tar cjvf "${volthaDir}/logs.tar$(date "+%Y%m%d-%H:%M:%S").bz2" log_tmp/*; then
  rm -fr log_tmp
else
  echo "Archiving failed, logs left in ${volthaDir}/registry_data/registry_volume/log_tmp" >&2
fi
popd
Sergio Slobodrian	ee417fa	2017-08-11 09:34:50 -0400	[diff] [blame]	129
				130
				131