Skip to content

Commit

Permalink
Add support debug info script
Browse files Browse the repository at this point in the history
  • Loading branch information
jvoravong committed Oct 11, 2024
1 parent c76617f commit ac645aa
Showing 1 changed file with 328 additions and 0 deletions.
328 changes: 328 additions & 0 deletions tools/splunk_kubernetes_debug_info.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,328 @@
#!/bin/bash

# Description:
# This script collects debugging information from a Kubernetes cluster.
# It retrieves networking, firewall, security policies, custom resource definitions (CRDs),
# and logs from specified pods and secrets (sanitized). The outputs are saved to files for each namespace and object type.
# This helps in diagnosing and troubleshooting cluster configurations.
# Finally, it compresses all the collected files into a ZIP archive.
#
# Input Parameters:
# - NAMESPACES: Comma-separated list of namespaces to collect data from. If not specified, the script collects data from all namespaces.
# - K8S_OBJECT_NAME_FILTER: Filter for Kubernetes object names (default: 'splunk|collector|otel|certmanager|test|sck|sock').
#
# Usage:
# 1. Ensure you have `kubectl`, `yq`, `helm`, and `zip` installed, and that `kubectl` and `helm` are configured to access your Kubernetes cluster.
# 2. Save the script to a file called `splunk_kubernetes_debug_info.sh`.
# 3. Make the script executable:
# chmod +x splunk_kubernetes_debug_info.sh
# 4. Run the script:
# 4.1. Via Terminal and Curl:
# curl -s https://raw.githubusercontent.com/signalfx/splunk-otel-collector-chart/main/tools/splunk_kubernetes_debug_info.sh | bash
# 4.2. Via Terminal and Local Code:
# ./splunk_kubernetes_debug_info.sh [NAMESPACES=namespace1,namespace2,...] [K8S_OBJECT_NAME_FILTER='splunk|collector|otel|certmanager|test|sck|sock|customname']
# (Quote the filter value so the shell does not interpret "|" as a pipe.)
# Note: If no namespaces are specified, the script will collect information from all namespaces.
# Sensitive Data Handling:
# The script attempts to redact sensitive information where possible, including tokens, passwords, and certificates.
# However, users should review the files for any sensitive data before sharing.
#
# Objects Scraped:
# - Pod logs for agent, cluster-receiver, certmanager, operator, gateway, splunk pods
# - Deployments, daemonsets, secrets, Helm releases matching K8S_OBJECT_NAME_FILTER
# - NetworkPolicies, Services, Ingress resources, Endpoints, Roles, RoleBindings, Security contexts
# - OpenTelemetry Instrumentation objects
# - Custom Resource Definitions (CRDs), Pod Security Policies (PSPs), Security Context Constraints (SCCs)
# - Cert-manager related objects
# - MutatingWebhookConfiguration objects

#######################################
# Redact sensitive content from captured command output and save it to a file.
#
# Nothing is written when the output is empty, starts with "No resources found",
# "error" or "Error", or is an empty Kubernetes List; the skip is logged to
# $temp_dir/errors.txt instead.
#
# Redaction is line based:
#   - PEM certificate and private key bodies are replaced; only the BEGIN/END
#     marker lines are kept. A PEM object that sits entirely on one line (for
#     example inside a JSON-escaped annotation) is replaced as a whole, because
#     printing the marker line would print the key material with it.
#   - Lines mentioning ca.crt, client.crt or client.key, and lines containing
#     "token" or "password" in any letter case, are replaced entirely.
#
# Globals:
#   temp_dir (read) - directory that holds errors.txt
# Arguments:
#   $1 - captured command output
#   $2 - path of the file to write
#   $3 - command line that produced the output (recorded as a header comment)
# Outputs:
#   Writes "# Command: <cmd>" followed by the redacted output to $2.
#######################################
write_output() {
  local output="$1"
  local file_name="$2"
  local cmd="$3"

  # Skip empty results and kubectl/helm failure messages.
  if [[ -z "$output" || "$output" == "No resources found"* || "$output" == "error"* || "$output" == "Error"* ]]; then
    echo "[$(date)] Skipping $file_name: $output" >> "$temp_dir/errors.txt"
    return
  fi

  # Only inspect kind/items when yq accepts the text as YAML.
  # printf is used instead of echo so output starting with "-n"/"-e" is not
  # mistaken for echo options.
  if printf '%s\n' "$output" | yq eval '.' - > /dev/null 2>&1; then
    if [[ $(printf '%s\n' "$output" | yq eval '.kind' -) == "List" ]] \
      && [[ $(printf '%s\n' "$output" | yq eval '.items | length' -) -eq 0 ]]; then
      echo "[$(date)] Skipping $file_name: Empty list" >> "$temp_dir/errors.txt"
      return
    fi
  fi

  # Redact sensitive information
  output=$(printf '%s\n' "$output" | awk '
    BEGIN {
      pem_begin = "BEGIN ([A-Z0-9]+ )*(CERTIFICATE|PRIVATE KEY)"
      pem_end = "END ([A-Z0-9]+ )*(CERTIFICATE|PRIVATE KEY)"
      in_pem = 0
    }
    # Inside a PEM object: hide every line until the END marker line.
    in_pem {
      if ($0 ~ pem_end) {
        in_pem = 0
        print
      } else {
        print pem_label
      }
      next
    }
    # Start of a PEM object.
    $0 ~ pem_begin {
      pem_label = ($0 ~ /PRIVATE KEY/) ? " [PRIVATE KEY REDACTED]" : " [CERTIFICATE REDACTED]"
      if ($0 ~ pem_end) {
        # BEGIN and END on the same line: the payload is on this line too.
        print pem_label
      } else {
        in_pem = 1
        print
      }
      next
    }
    /ca\.crt|client\.crt|client\.key/ {
      print " [SENSITIVE DATA REDACTED]"
      next
    }
    /[Tt][Oo][Kk][Ee][Nn]/ {
      print " [TOKEN REDACTED]"
      next
    }
    /[Pp][Aa][Ss][Ss][Ww][Oo][Rr][Dd]/ {
      print " [PASSWORD REDACTED]"
      next
    }
    { print }
  ')

  # Write command and output to file
  printf '%s\n' "# Command: $cmd" > "$file_name"
  printf '%s\n' "$output" >> "$file_name"
}

#######################################
# Run one kubectl command and hand its output to write_output.
# Arguments:
#   $1   - destination file
#   $2.. - kubectl arguments (everything after "kubectl")
#######################################
_collect_kubectl_output() {
  local file_name=$1
  shift
  local output
  output=$(kubectl "$@")
  # "kubectl $*" is only recorded in the file header; the command itself is run
  # from the argument list above, so no eval of interpolated names is needed.
  write_output "$output" "$file_name" "kubectl $*"
}

#######################################
# Collect object manifests and pod logs for a single namespace.
#
# For every object type the objects are dumped one per file as YAML.
# Deployments, daemonsets, configmaps and secrets are restricted to names
# matching k8s_object_name_filter; the remaining types are dumped in full.
# Logs are collected from: one "agent" pod, one "cluster-receiver" pod, all
# "certmanager" pods, all "operator" pods (all taken from the name-filtered pod
# list), one pod labelled app=splunk, and one pod per OpenTelemetry
# auto-instrumentation annotation (together with that pod's spec).
#
# NOTE(review): the file header also lists gateway pods, but there is no
# gateway-specific log collection here - confirm whether that is intended.
#
# Globals:
#   temp_dir (read)               - output directory
#   k8s_object_name_filter (read) - extended regex used with grep -E
# Arguments:
#   $1 - namespace to collect from
#######################################
collect_data_namespace() {
  local ns=$1
  local type object names pattern pod annotation
  local pods agent_pod cluster_receiver_pod splunk_pod pod_with_annotation
  local -a object_types=("deployments" "daemonsets" "configmaps" "secrets" "networkpolicies" "svc" "ingress" "endpoints" "roles" "rolebindings" "otelinst")
  local -a annotations=(
    "instrumentation.opentelemetry.io/inject-java"
    "instrumentation.opentelemetry.io/inject-python"
    "instrumentation.opentelemetry.io/inject-dotnet"
    "instrumentation.opentelemetry.io/inject-go"
    "instrumentation.opentelemetry.io/inject-nodejs"
    "instrumentation.opentelemetry.io/inject-nginx"
    "instrumentation.opentelemetry.io/inject-sdk"
    "instrumentation.opentelemetry.io/inject-apache-httpd"
  )

  for type in "${object_types[@]}"; do
    echo "Collecting $type data for $ns namespace with $k8s_object_name_filter name filter"
    names=$(kubectl get "$type" -n "$ns" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')

    # The type names must match the plural spellings used in object_types,
    # otherwise the filter is silently skipped for that type.
    case "$type" in
      deployments | daemonsets | configmaps | secrets)
        names=$(grep -E -- "$k8s_object_name_filter" <<< "$names")
        ;;
    esac

    while IFS= read -r object; do
      [[ -n "$object" ]] || continue
      _collect_kubectl_output "$temp_dir/namespace_${ns}_${type}_${object}.yaml" \
        get "$type" "$object" -n "$ns" -o yaml
    done <<< "$names"
  done

  # Collect logs from specific pods
  pods=$(kubectl get pods -n "$ns" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep -E -- "$k8s_object_name_filter")

  # Collect logs from a single agent pod
  agent_pod=$(grep -m 1 "agent" <<< "$pods")
  if [[ -n "$agent_pod" ]]; then
    _collect_kubectl_output "$temp_dir/namespace_${ns}_logs_pod_${agent_pod}.log" \
      logs "$agent_pod" -n "$ns"
    # Drop exactly that pod so later groups do not pick it up again.
    pods=$(grep -vxF -- "$agent_pod" <<< "$pods")
  fi

  # Collect logs from a single cluster-receiver pod
  cluster_receiver_pod=$(grep -m 1 "cluster-receiver" <<< "$pods")
  if [[ -n "$cluster_receiver_pod" ]]; then
    _collect_kubectl_output "$temp_dir/namespace_${ns}_logs_pod_${cluster_receiver_pod}.log" \
      logs "$cluster_receiver_pod" -n "$ns"
    pods=$(grep -vxF -- "$cluster_receiver_pod" <<< "$pods")
  fi

  # Collect logs from all certmanager pods, then all operator pods. Each group
  # is removed from the list once handled so a pod is only collected once.
  for pattern in "certmanager" "operator"; do
    while IFS= read -r pod; do
      [[ -n "$pod" ]] || continue
      _collect_kubectl_output "$temp_dir/namespace_${ns}_logs_pod_${pod}.log" \
        logs "$pod" -n "$ns"
    done < <(grep -- "$pattern" <<< "$pods")
    pods=$(grep -v -- "$pattern" <<< "$pods")
  done

  # Collect logs from a single Splunk pod
  splunk_pod=$(kubectl get pods -n "$ns" -l app=splunk -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
  if [[ -n "$splunk_pod" ]]; then
    echo "Getting logs for pod $splunk_pod in namespace ${ns}"
    _collect_kubectl_output "$temp_dir/namespace_${ns}_logs_pod_${splunk_pod}.log" \
      logs -n "$ns" "$splunk_pod"
  fi

  # Collect pod spec and logs for the first pod carrying each annotation
  for annotation in "${annotations[@]}"; do
    pod_with_annotation=$(kubectl get pods -n "$ns" -o jsonpath="{range .items[?(@.metadata.annotations['$annotation'])]}{.metadata.name}{'\n'}{end}" | head -n 1)
    if [[ -n "$pod_with_annotation" ]]; then
      _collect_kubectl_output "$temp_dir/namespace_${ns}_pod_spec_${pod_with_annotation}.yaml" \
        get pod "$pod_with_annotation" -n "$ns" -o yaml
      _collect_kubectl_output "$temp_dir/namespace_${ns}_logs_pod_${pod_with_annotation}.log" \
        logs "$pod_with_annotation" -n "$ns"
    fi
  done
}

#######################################
# Collect cluster-wide information.
#
# Appends a plain-text summary (cluster name, version, namespace/node/pod
# counts, pods matching the name filter) to $output_file, then stores CRDs,
# PodSecurityPolicies, SecurityContextConstraints, mutating webhook
# configurations, cert-manager objects (only when a pod labelled
# app=cert-manager exists) and Helm values of matching releases through
# write_output.
#
# Globals:
#   output_file (read)            - summary file that is appended to
#   temp_dir (read)               - output directory
#   k8s_object_name_filter (read) - extended regex for pod and release names
#######################################
collect_data_cluster() {
  local cmd output cert_manager_pods release namespace

  echo "Collecting cluster-wide data..."

  {
    echo "Basic Cluster Configurations:"
    echo "Cluster Name: $(kubectl config view --minify -o jsonpath='{.clusters[].name}')"
    echo "Kubernetes Version:"
    kubectl version
    # --no-headers keeps the table header row out of the counts.
    echo "Number of Namespaces:"
    kubectl get namespaces --no-headers | wc -l
    echo "Namespaces: $(kubectl get namespaces -o jsonpath='{.items[*].metadata.name}')"
    echo "Number of Running Nodes:"
    kubectl get nodes --no-headers | wc -l
    echo "Number of Running Pods:"
    kubectl get pods --all-namespaces --field-selector=status.phase=Running --no-headers | wc -l
    echo "Splunk Related Pods:"
    # A single awk pass keeps the header row and the matching rows. Chaining
    # "head -n 1" and grep on the same pipe loses rows, because head reads more
    # than one line from the pipe. The filter is passed through the environment
    # so awk does not apply escape processing to it.
    kubectl get pods --all-namespaces \
      | NAME_FILTER="$k8s_object_name_filter" awk 'NR == 1 || $0 ~ ENVIRON["NAME_FILTER"]'
    echo "---"
  } >> "$output_file"

  # The command strings below are fixed literals (nothing is interpolated into
  # them), so evaluating them is safe and keeps the recorded command identical
  # to the one that ran.
  echo "Collecting custom resource definitions..."
  cmd="kubectl get crds -o yaml"
  output=$(eval "$cmd")
  write_output "$output" "$temp_dir/cluster_custom_resource_definitions.yaml" "$cmd"

  echo "Collecting pod security policies..."
  cmd="kubectl get psp -o yaml"
  output=$(eval "$cmd")
  write_output "$output" "$temp_dir/cluster_pod_security_policies.yaml" "$cmd"

  echo "Collecting security context constraints..."
  cmd="kubectl get scc -o yaml"
  output=$(eval "$cmd")
  write_output "$output" "$temp_dir/cluster_security_context_constraints.yaml" "$cmd"

  echo "Collecting MutatingWebhookConfiguration objects..."
  cmd="kubectl get mutatingwebhookconfiguration.admissionregistration.k8s.io -o yaml; kubectl describe mutatingwebhookconfiguration.admissionregistration.k8s.io; kubectl get --raw /metrics | grep apiserver_admission_webhook_rejection_count;"
  output=$(eval "$cmd")
  write_output "$output" "$temp_dir/cluster_webhooks.yaml" "$cmd"

  echo "Checking for cert-manager installation..."
  cert_manager_pods=$(kubectl get pods --all-namespaces -l app=cert-manager --no-headers)
  if [[ -n "$cert_manager_pods" ]]; then
    echo "Cert-manager is installed. Collecting related objects..."
    cmd="kubectl get Issuers,ClusterIssuers,Certificates,CertificateRequests,Orders,Challenges --all-namespaces -o yaml; kubectl describe Issuers,ClusterIssuers,Certificates,CertificateRequests,Orders,Challenges --all-namespaces"
    output=$(eval "$cmd")
    write_output "$output" "$temp_dir/cluster_cert_manager_objects.yaml" "$cmd"
  fi

  echo "Collecting Helm values for relevant releases..."
  # Columns 1 and 2 of "helm list -A" are the release name and its namespace.
  helm list -A | grep -E -- "$k8s_object_name_filter" | awk '{print $1, $2}' \
    | while read -r release namespace; do
      cmd="helm get values $release -n $namespace"
      # Run from arguments instead of eval: the names come from cluster data.
      output=$(helm get values "$release" -n "$namespace")
      write_output "$output" "$temp_dir/helm_values_${release}_${namespace}.yaml" "$cmd"
    done
}

# Parse input parameters.
# Arguments are given as KEY=value words:
#   NAMESPACES=ns1,ns2,...        - namespaces to inspect (empty means all)
#   K8S_OBJECT_NAME_FILTER=regex  - extended regex matched against object names
# Anything else aborts the script with a diagnostic on stderr.
namespaces=""
k8s_object_name_filter="splunk|collector|otel|certmanager|test|sck|sock"

for arg in "$@"; do
  case "$arg" in
    NAMESPACES=*)
      # Keep everything after the first "=".
      namespaces="${arg#*=}"
      ;;
    K8S_OBJECT_NAME_FILTER=*)
      k8s_object_name_filter="${arg#*=}"
      ;;
    *)
      echo "Unknown parameter: $arg" >&2
      echo "Usage: $0 [NAMESPACES=namespace1,namespace2,...] [K8S_OBJECT_NAME_FILTER='regex']" >&2
      exit 1
      ;;
  esac
done

# Build namespaces_array: an explicit comma-separated list wins; otherwise every
# namespace reported by the cluster is used.
if [[ -n "$namespaces" ]]; then
  # Split the specified namespaces string on commas
  IFS=',' read -r -a namespaces_array <<< "$namespaces"
else
  # kubectl prints all namespace names on one line, separated by spaces
  IFS=' ' read -r -a namespaces_array <<< "$(kubectl get namespaces -o jsonpath='{.items[*].metadata.name}')"
fi

printf 'Namespaces: %s\n' "${namespaces_array[*]}"
printf 'Kubernetes object name filter: %s\n' "$k8s_object_name_filter"

# Create a temporary directory with a unique name
temp_dir=$(mktemp -d -t splunk_kubernetes_debug_info_XXXXXX)
if [[ -z "$temp_dir" || ! -d "$temp_dir" ]]; then
  echo "Failed to create temporary directory" >&2
  exit 1
fi

# The directory holds cluster configuration and logs, so make sure it does not
# linger when the script stops before reaching its own cleanup step. Removing an
# already-deleted directory is harmless.
trap 'rm -rf -- "$temp_dir"' EXIT

# Output file for basic cluster information
output_file="$temp_dir/cluster.txt"

# Print script start time to the terminal and record it in the summary file
script_start_time=$(date +"%Y-%m-%d %H:%M:%S")
echo "Script start time: $script_start_time" | tee -a "$output_file"

# Collect cluster-wide data
collect_data_cluster

#######################################
# Run collect_data_namespace for every entry of namespaces_array, in batches of
# background jobs. A batch is started, then all of its jobs are awaited before
# the next batch begins.
#
# Globals:
#   namespaces_array (read) - namespaces to process
# Arguments:
#   $1 - optional batch size (positive integer); defaults to 20, which is also
#        used when the value is not a positive integer
#######################################
collect_data_namespace_namespaces() {
  local parallelism=${1:-20}
  local -a pids=()
  local ns

  if ! [[ "$parallelism" =~ ^[1-9][0-9]*$ ]]; then
    parallelism=20
  fi

  for ns in "${namespaces_array[@]}"; do
    collect_data_namespace "$ns" &
    pids+=("$!")

    # Batch is full: wait for all of its jobs before starting more.
    if ((${#pids[@]} >= parallelism)); then
      wait "${pids[@]}"
      pids=()
    fi
  done

  # Wait for any remaining background processes to complete
  if ((${#pids[@]} > 0)); then
    wait "${pids[@]}"
  fi
}

# Process namespaces in parallel
collect_data_namespace_namespaces

# Print script end time
script_end_time=$(date +"%Y-%m-%d %H:%M:%S")
echo "Script end time: $script_end_time"
echo "Script end time: $script_end_time" >> "$output_file"

# Create a ZIP archive of all the collected files
output_zip="splunk_kubernetes_debug_info_$(date +%Y%m%d_%H%M%S).zip"
echo "Creating ZIP archive: $output_zip"

# Find and delete empty files before creating the ZIP archive
find "$temp_dir" -type f -empty -delete

# -j stores the files without their directory path. The temporary directory is
# only removed once the archive was written; otherwise the collected data would
# be lost without any usable output.
if zip -j -r "$output_zip" "$temp_dir"; then
  # Clean up the temporary directory
  rm -rf -- "$temp_dir"
  echo "Data collection complete. Output files are available in the ZIP archive: $output_zip"
else
  # Keep the files for manual retrieval, and disarm any EXIT cleanup handler
  # that would otherwise delete them on the way out.
  trap - EXIT
  echo "Failed to create ZIP archive: $output_zip (is 'zip' installed?)" >&2
  echo "The collected files were kept in: $temp_dir" >&2
fi

0 comments on commit ac645aa

Please sign in to comment.