Skip to content

Commit 101b851

Browse files
authored
Merge pull request #18 from perifaws/main
ParallelCluster 3 Compatibility, thanks @perifaws
2 parents e5a664e + dea8c10 commit 101b851

File tree

3 files changed

+31
-23
lines changed

3 files changed

+31
-23
lines changed

parallelcluster-setup/install-monitoring.sh

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,15 @@ usermod -a -G docker $cfn_cluster_user
1818
curl -L "https://github.com/docker/compose/releases/download/1.27.4/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
1919
chmod +x /usr/local/bin/docker-compose
2020

21-
monitoring_dir_name=$(echo ${cfn_postinstall_args}| cut -d ',' -f 2 )
21+
monitoring_dir_name=${cfn_postinstall_args[1]}
2222
monitoring_home="/home/${cfn_cluster_user}/${monitoring_dir_name}"
2323

24+
echo "$> variable monitoring_dir_name -> ${monitoring_dir_name}"
25+
echo "$> variable monitoring_home -> ${monitoring_home}"
26+
27+
2428
case "${cfn_node_type}" in
25-
MasterServer)
29+
HeadNode)
2630

2731
#cfn_efs=$(cat /etc/chef/dna.json | grep \"cfn_efs\" | awk '{print $2}' | sed "s/\",//g;s/\"//g")
2832
#cfn_cluster_cw_logging_enabled=$(cat /etc/chef/dna.json | grep \"cfn_cluster_cw_logging_enabled\" | awk '{print $2}' | sed "s/\",//g;s/\"//g")
@@ -37,32 +41,32 @@ case "${cfn_node_type}" in
3741

3842
aws s3api get-object --bucket $cluster_s3_bucket --key $cluster_config_s3_key --region $cfn_region --version-id $cluster_config_version ${monitoring_home}/parallelcluster-setup/cluster-config.json
3943

40-
yum -y install golang-bin
44+
yum -y install golang-bin
4145

4246
chown $cfn_cluster_user:$cfn_cluster_user -R /home/$cfn_cluster_user
43-
chmod +x ${monitoring_home}/custom-metrics/*
47+
chmod +x ${monitoring_home}/custom-metrics/*
4448

4549
cp -rp ${monitoring_home}/custom-metrics/* /usr/local/bin/
4650
mv ${monitoring_home}/prometheus-slurm-exporter/slurm_exporter.service /etc/systemd/system/
4751

4852
(crontab -l -u $cfn_cluster_user; echo "*/1 * * * * /usr/local/bin/1m-cost-metrics.sh") | crontab -u $cfn_cluster_user -
49-
(crontab -l -u $cfn_cluster_user; echo "*/60 * * * * /usr/local/bin/1h-cost-metrics.sh") | crontab -u $cfn_cluster_user -
53+
(crontab -l -u $cfn_cluster_user; echo "*/60 * * * * /usr/local/bin/1h-cost-metrics.sh") | crontab -u $cfn_cluster_user -
5054

5155

52-
# replace tokens
56+
# replace tokens
5357
sed -i "s/_S3_BUCKET_/${s3_bucket}/g" ${monitoring_home}/grafana/dashboards/ParallelCluster.json
54-
sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" ${monitoring_home}/grafana/dashboards/ParallelCluster.json
58+
sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" ${monitoring_home}/grafana/dashboards/ParallelCluster.json
5559
sed -i "s/__FSX_ID__/${cfn_fsx_fs_id}/g" ${monitoring_home}/grafana/dashboards/ParallelCluster.json
56-
sed -i "s/__AWS_REGION__/${cfn_region}/g" ${monitoring_home}/grafana/dashboards/ParallelCluster.json
60+
sed -i "s/__AWS_REGION__/${cfn_region}/g" ${monitoring_home}/grafana/dashboards/ParallelCluster.json
5761

5862
sed -i "s/__AWS_REGION__/${cfn_region}/g" ${monitoring_home}/grafana/dashboards/logs.json
5963
sed -i "s/__LOG_GROUP__NAMES__/${log_group_names}/g" ${monitoring_home}/grafana/dashboards/logs.json
6064

61-
sed -i "s/__Application__/${stack_name}/g" ${monitoring_home}/prometheus/prometheus.yml
65+
sed -i "s/__Application__/${stack_name}/g" ${monitoring_home}/prometheus/prometheus.yml
6266

6367
sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" ${monitoring_home}/grafana/dashboards/master-node-details.json
64-
sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" ${monitoring_home}/grafana/dashboards/compute-node-list.json
65-
sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" ${monitoring_home}/grafana/dashboards/compute-node-details.json
68+
sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" ${monitoring_home}/grafana/dashboards/compute-node-list.json
69+
sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" ${monitoring_home}/grafana/dashboards/compute-node-details.json
6670

6771
sed -i "s/__MONITORING_DIR__/${monitoring_dir_name}/g" ${monitoring_home}/docker-compose/docker-compose.master.yml
6872

@@ -73,13 +77,13 @@ case "${cfn_node_type}" in
7377
echo -e "\nDNS.1=$(ec2-metadata -p | awk '{print $2}')" >> "${nginx_dir}/openssl.cnf"
7478
openssl req -new -x509 -nodes -newkey rsa:4096 -days 3650 -keyout "${nginx_ssl_dir}/nginx.key" -out "${nginx_ssl_dir}/nginx.crt" -config "${nginx_dir}/openssl.cnf"
7579

76-
#give $cfn_cluster_user ownership
80+
#give $cfn_cluster_user ownership
7781
chown -R $cfn_cluster_user:$cfn_cluster_user "${nginx_ssl_dir}/nginx.key"
7882
chown -R $cfn_cluster_user:$cfn_cluster_user "${nginx_ssl_dir}/nginx.crt"
7983

8084
/usr/local/bin/docker-compose --env-file /etc/parallelcluster/cfnconfig -f ${monitoring_home}/docker-compose/docker-compose.master.yml -p monitoring-master up -d
8185

82-
# Download and build prometheus-slurm-exporter
86+
# Download and build prometheus-slurm-exporter
8387
##### Please note this software package is under GPLv3 License #####
8488
# More info here: https://github.com/vpenso/prometheus-slurm-exporter/blob/master/LICENSE
8589
cd ${monitoring_home}
@@ -98,6 +102,8 @@ case "${cfn_node_type}" in
98102
ComputeFleet)
99103
compute_instance_type=$(ec2-metadata -t | awk '{print $2}')
100104
gpu_instances="[pg][2-9].*\.[0-9]*[x]*large"
105+
echo "$> Compute Instances Type EC2 -> ${compute_instance_type}"
106+
echo "$> GPUS Instances EC2 -> ${gpu_instances}"
101107
if [[ $compute_instance_type =~ $gpu_instances ]]; then
102108
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
103109
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | tee /etc/yum.repos.d/nvidia-docker.repo

post-install.sh

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,14 @@
1010
. /etc/parallelcluster/cfnconfig
1111

1212
#get GitHub repo to clone and the installation script
13-
monitoring_url=$(echo ${cfn_postinstall_args}| cut -d ',' -f 1 )
14-
monitoring_dir_name=$(echo ${cfn_postinstall_args}| cut -d ',' -f 2 )
13+
monitoring_url=${cfn_postinstall_args[0]}
14+
monitoring_dir_name=${cfn_postinstall_args[1]}
1515
monitoring_tarball="${monitoring_dir_name}.tar.gz"
16-
setup_command=$(echo ${cfn_postinstall_args}| cut -d ',' -f 3 )
16+
setup_command=${cfn_postinstall_args[2]}
1717
monitoring_home="/home/${cfn_cluster_user}/${monitoring_dir_name}"
1818

1919
case ${cfn_node_type} in
20-
MasterServer)
20+
HeadNode)
2121
wget ${monitoring_url} -O ${monitoring_tarball}
2222
mkdir -p ${monitoring_home}
2323
tar xvf ${monitoring_tarball} -C ${monitoring_home} --strip-components 1
@@ -29,4 +29,4 @@ esac
2929

3030
#Execute the monitoring installation script
3131
bash -x "${monitoring_home}/parallelcluster-setup/${setup_command}" >/tmp/monitoring-setup.log 2>&1
32-
exit $?
32+
exit $?

prometheus/prometheus.yml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@ scrape_configs:
3131
- port: 9400
3232
refresh_interval: 10s
3333
filters:
34+
- name: instance-state-name
35+
values:
36+
- running
37+
- name: tag:Name
38+
values:
39+
- Compute
3440
- name: instance-type
3541
values:
3642
- p2.xlarge
@@ -57,17 +63,13 @@ scrape_configs:
5763
target_label: instance_name
5864
- source_labels: [__meta_ec2_tag_Application]
5965
target_label: instance_grafana
60-
regex: __Application__
61-
action: keep
6266
- source_labels: [__meta_ec2_instance_id]
6367
target_label: instance_id
6468
- source_labels: [__meta_ec2_availability_zone]
6569
target_label: instance_az
6670
- source_labels: [__meta_ec2_instance_state]
67-
regex: running
68-
action: keep
6971
target_label: instance_state
7072
- source_labels: [__meta_ec2_instance_type]
7173
target_label: instance_type
7274
- source_labels: [__meta_ec2_vpc_id]
73-
target_label: instance_vpc
75+
target_label: instance_vpc

0 commit comments

Comments
 (0)