Skip to content

Commit bd395f3

Browse files
authored
Make PEM heap dump scripts compatible with Kelvin (#2220)
1 parent fc1a4fa commit bd395f3

File tree

3 files changed

+106
-34
lines changed

3 files changed

+106
-34
lines changed

scripts/collect_heap_pprofs.sh

Lines changed: 43 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -19,43 +19,68 @@
1919
usage() {
2020
echo "This script downloads all of the files listed in the mappings section of a heap profile."
2121
echo ""
22-
echo "Usage: $0 <heap_profile_dir> <vizier_cluster_id> [<gcloud ssh opts>...]"
22+
echo "Usage: $0 <heap_profile_dir> <vizier_cluster_id> [--gcloud-common-args] [--gcloud-ssh-args...]"
2323
echo "<heap_profile_dir> : the directory where the heap profile and memory mapped files will be stored. It will be created if it does not exist."
2424
echo "<vizier_cluster_id> : the ID of the Vizier cluster to connect to."
25-
echo "Common gcloud ssh options include --project."
25+
echo "[--gcloud-common-args] : common arguments to pass to gcloud commands, such as --project."
26+
echo "[--gcloud-ssh-args] : additional arguments to pass to gcloud compute ssh commands, such as --internal-ip."
2627
exit 1
2728
}
2829

29-
heap_profile_dir="$1"
30-
cluster_id="$2"
30+
# Captures the two required positional arguments into the globals
# heap_profile_dir and cluster_id. When fewer than two arguments are
# supplied, usage is invoked (it prints the help text and exits).
# The shift only consumes this function's own positional parameters.
parse_args() {
  [[ $# -ge 2 ]] || usage

  heap_profile_dir="$1"
  cluster_id="$2"
  shift 2
}
41+
42+
# Validates that both required globals (heap_profile_dir, cluster_id) are
# non-empty; otherwise usage is invoked (it prints the help text and exits).
check_args() {
  local required
  for required in "$heap_profile_dir" "$cluster_id"; do
    if [[ -z "$required" ]]; then
      usage
    fi
  done
}
47+
48+
parse_args "${@}"
49+
check_args
50+
3151
script_dir=$(dirname "$(realpath "$0")")
3252
repo_root=$(git rev-parse --show-toplevel)
3353

34-
if [ -z "$heap_profile_dir" ] || [ -z "$cluster_id" ]; then
35-
usage
36-
fi
37-
3854
mkdir -p "$heap_profile_dir"
3955

40-
pxl_heap_output_file="${heap_profile_dir}/raw_output_from_hot_table_test.json"
56+
pxl_heap_output_file="${heap_profile_dir}/heap_dump_raw_output.json"
4157

4258
px run -o json -c "$cluster_id" -f "${repo_root}/src/pxl_scripts/px/collect_heap_dumps.pxl" > "$pxl_heap_output_file"
4359

4460
while IFS= read -r line; do
61+
asid=$(echo "$line" | jq -r '.asid')
4562
hostname=$(echo "$line" | jq -r '.hostname')
4663
heap_content=$(echo "$line" | jq -r '.heap')
47-
echo "$heap_content" > "${heap_profile_dir}/${hostname}.txt"
48-
echo "Wrote ${heap_profile_dir}/${hostname}.txt"
64+
filename="${heap_profile_dir}/${hostname}-${asid}.txt"
65+
echo "$heap_content" > "$filename"
66+
echo "Wrote heap profile for ASID $asid on host $hostname to $filename"
4967
done < "$pxl_heap_output_file"
5068

51-
nodes=()
69+
declare -a agents
5270
for file in "${heap_profile_dir}"/*.txt; do
53-
hostname=$(basename "${file%.*}")
54-
nodes+=("$hostname")
55-
hostname_dir="${heap_profile_dir}/${hostname}"
56-
mkdir -p "$hostname_dir"
71+
filename=$(basename "$file")
72+
filename_noext="${filename%.txt}"
73+
hostname="${filename_noext%-*}"
74+
asid="${filename_noext##*-}"
75+
76+
agents["$asid"]="$hostname"
77+
agent_dir="${heap_profile_dir}/${hostname}-${asid}"
78+
mkdir -p "$agent_dir"
5779
done
5880

59-
for node in "${nodes[@]}"; do
60-
"${script_dir}/download_heap_prof_mapped_files.sh" "${heap_profile_dir}/${node}.txt" "$node" "${@:3}"
81+
for agent in "${!agents[@]}"; do
82+
asid="${agent}"
83+
hostname="${agents[$agent]}"
84+
filename="${heap_profile_dir}/${hostname}-${asid}.txt"
85+
"${script_dir}/download_heap_prof_mapped_files.sh" "$filename" "$hostname" "${@:3}"
6186
done

scripts/download_heap_prof_mapped_files.sh

Lines changed: 57 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,61 @@
1919
usage() {
2020
echo "This script downloads all of the files listed in the mappings section of a heap profile."
2121
echo ""
22-
echo "Usage: $0 <heap_profile> <node_name> [<gcloud ssh opts>...]"
23-
echo "Common gcloud ssh options include --project."
22+
echo "Usage: $0 <heap_profile> <node_name> [--gcloud-common-args] [--gcloud-ssh-args...]"
23+
echo "<heap_profile> : the path to the heap profile file (in pprof format) to analyze."
24+
echo "<node_name> : the name of the node to connect to (e.g., the name of a Vizier agent node)."
25+
echo "[--gcloud-common-args] : common arguments to pass to gcloud commands, such as --project."
26+
echo "[--gcloud-ssh-args] : additional arguments to pass to gcloud compute ssh commands, such as --internal-ip."
2427
exit 1
2528
}
2629

27-
heap_profile="$1"
28-
node_name="$2"
29-
output_dir="${heap_profile%.txt}"
30+
parse_args() {
31+
if [ $# -lt 2 ]; then
32+
usage
33+
fi
34+
35+
heap_profile="$1"
36+
shift
37+
38+
node_name="$1"
39+
shift
40+
41+
while test $# -gt 0; do
42+
case "$1" in
43+
--gcloud-common-args=*)
44+
GCLOUD_COMMON_ARGS="${1#*=}"
45+
shift
46+
;;
47+
--gcloud-common-args)
48+
GCLOUD_COMMON_ARGS="$2"
49+
shift
50+
shift
51+
;;
52+
--gcloud-ssh-args=*)
53+
GCLOUD_SSH_ARGS="${1#*=}"
54+
shift
55+
;;
56+
--gcloud-ssh-args)
57+
GCLOUD_SSH_ARGS="$2"
58+
shift
59+
shift
60+
;;
61+
*) usage ;;
62+
esac
63+
done;
64+
}
65+
3066

31-
if [ -z "$heap_profile" ] || [ -z "$node_name" ]; then
32-
usage
33-
fi
67+
check_args() {
68+
if [ -z "$heap_profile" ] || [ -z "$node_name" ]; then
69+
usage
70+
fi
71+
}
72+
73+
parse_args "${@}"
74+
check_args
75+
76+
output_dir="${heap_profile%.txt}"
3477

3578
# Create the output directory at the beginning of the script.
3679
mkdir -p "$output_dir"
@@ -44,16 +87,16 @@ mkdir -p "$output_dir"
4487
mappings=$(awk 'BEGIN{m=0} /MAPPED_LIBRARIES/{m=1} { if(m) { print $6 }}' "$heap_profile" | grep "^/" | sort | uniq)
4588

4689
err_file="$output_dir/gcloud_error.log"
47-
zone=$(gcloud compute instances list "${@:3}" --filter="$node_name" --format="table(name, zone)"| tail -n 1 | awk '{print $2}')
48-
procs=$(gcloud compute ssh --zone "$zone" --command='ps ax' "$node_name" "${@:3}" 2> "$err_file") || cat "$err_file" && rm "$err_file"
90+
# GCLOUD_COMMON_ARGS and GCLOUD_SSH_ARGS each hold a whitespace-separated list
# of gcloud flags and may be empty or unset. They are expanded unquoted on
# purpose: an empty value then contributes no argument at all (a quoted empty
# string would be passed to gcloud as a literal '' argument), and a value such
# as "--project foo" is split into separate words.
# Look up the zone of the instance matching $node_name (last row of the table).
# shellcheck disable=SC2086
zone=$(gcloud compute instances list ${GCLOUD_COMMON_ARGS:-} --filter="$node_name" --format="table(name, zone)"| tail -n 1 | awk '{print $2}')
# Capture the node's process list; on failure show gcloud's stderr. The error
# log is removed afterwards in either case.
# shellcheck disable=SC2086
procs=$(gcloud compute ssh --zone "$zone" --command='ps ax' "$node_name" ${GCLOUD_COMMON_ARGS:-} ${GCLOUD_SSH_ARGS:-} 2> "$err_file") || cat "$err_file" && rm "$err_file"
4992

5093
# Find the mapping that corresponds to a process on the node.
5194
# We assume that the process was started by running one of the files in the mappings
5295
# (which is commonly the case and certainly the case for the binaries we care about, eg. pem, kelvin).
5396
for fname in $mappings
5497
do
5598
bname=$(basename "$fname")
56-
matching_proc=$(echo "$procs" | grep "$bname" || true)
99+
matching_proc=$(echo "$procs" | grep -E "$bname$" || true)
57100
if [[ -n "$matching_proc" ]]; then
58101
pid=$(echo "$matching_proc" | awk '{ print $1 }')
59102
matched_file="$fname"
@@ -80,13 +123,13 @@ output_on_err() {
80123
}
81124

82125
# Create tar archive on node.
83-
output_on_err gcloud compute ssh --zone "$zone" --command="$create_tar_cmd" "$node_name" "${@:3}"
126+
# GCLOUD_COMMON_ARGS / GCLOUD_SSH_ARGS are whitespace-separated flag lists that
# may be empty or unset; they are expanded unquoted on purpose so that an empty
# value adds no argument (instead of a literal '') and multiple flags are split
# into separate words.
# shellcheck disable=SC2086
output_on_err gcloud compute ssh --zone "$zone" --command="$create_tar_cmd" "$node_name" ${GCLOUD_COMMON_ARGS:-} ${GCLOUD_SSH_ARGS:-}
84127

85128
# Copy archive to local machine.
86-
output_on_err gcloud compute scp --zone "$zone" "${@:3}" "$USER@$node_name:~/$tar_file" "${output_dir}/$tar_file"
129+
# GCLOUD_COMMON_ARGS / GCLOUD_SSH_ARGS are whitespace-separated flag lists that
# may be empty or unset; they are expanded unquoted on purpose so that an empty
# value adds no argument (instead of a literal '') and multiple flags are split
# into separate words.
# shellcheck disable=SC2086
output_on_err gcloud compute scp --zone "$zone" ${GCLOUD_COMMON_ARGS:-} ${GCLOUD_SSH_ARGS:-} "$USER@$node_name:~/$tar_file" "${output_dir}/$tar_file"
87130

88131
# Cleanup tar archive on node.
89-
output_on_err gcloud compute ssh --zone "$zone" --command="rm ~/$tar_file" "$node_name" "${@:3}"
132+
# GCLOUD_COMMON_ARGS / GCLOUD_SSH_ARGS are whitespace-separated flag lists that
# may be empty or unset; they are expanded unquoted on purpose so that an empty
# value adds no argument (instead of a literal '') and multiple flags are split
# into separate words.
# shellcheck disable=SC2086
output_on_err gcloud compute ssh --zone "$zone" --command="rm ~/$tar_file" "$node_name" ${GCLOUD_COMMON_ARGS:-} ${GCLOUD_SSH_ARGS:-}
90133

91134
tar --strip-components=1 -C "$output_dir" -xzf "${output_dir}/$tar_file"
92135

src/pxl_scripts/px/collect_heap_dumps.pxl

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,13 @@
1616

1717
import px
1818

19-
df = px.GetAgentStatus(False)
19+
df = px.GetAgentStatus()
20+
df.ip_address = px.pluck_array(px.split(df.ip_address, ":"), 0)
21+
df.hostname_by_ip = px.pod_id_to_node_name(px.ip_to_pod_id(df.ip_address))
22+
df.hostname = px.select(df.hostname_by_ip == "", df.hostname, df.hostname_by_ip)
2023
df = df[['asid', 'hostname']]
2124
heap_stats = px._HeapGrowthStacks()
2225
df = df.merge(heap_stats, how='inner', left_on='asid', right_on='asid')
23-
df = df[['hostname', 'heap']]
26+
df.asid = df.asid_x
27+
df = df[['asid', 'hostname', 'heap']]
2428
px.display(df)

0 commit comments

Comments
 (0)