@@ -18,11 +18,15 @@ usermod -a -G docker $cfn_cluster_user
1818curl -L " https://github.com/docker/compose/releases/download/1.27.4/docker-compose-$( uname -s) -$( uname -m) " -o /usr/local/bin/docker-compose
1919chmod +x /usr/local/bin/docker-compose
2020
21- monitoring_dir_name=$( echo $ {cfn_postinstall_args} | cut -d ' , ' -f 2 )
21+ monitoring_dir_name=${cfn_postinstall_args[1]}
2222monitoring_home=" /home/${cfn_cluster_user} /${monitoring_dir_name} "
2323
24+ echo " $> variable monitoring_dir_name -> ${monitoring_dir_name} "
25+ echo " $> variable monitoring_home -> ${monitoring_home} "
26+
27+
2428case " ${cfn_node_type} " in
25- MasterServer )
29+ HeadNode )
2630
2731 # cfn_efs=$(cat /etc/chef/dna.json | grep \"cfn_efs\" | awk '{print $2}' | sed "s/\",//g;s/\"//g")
2832 # cfn_cluster_cw_logging_enabled=$(cat /etc/chef/dna.json | grep \"cfn_cluster_cw_logging_enabled\" | awk '{print $2}' | sed "s/\",//g;s/\"//g")
@@ -37,32 +41,32 @@ case "${cfn_node_type}" in
3741
3842 aws s3api get-object --bucket $cluster_s3_bucket --key $cluster_config_s3_key --region $cfn_region --version-id $cluster_config_version ${monitoring_home} /parallelcluster-setup/cluster-config.json
3943
40- yum -y install golang-bin
44+ yum -y install golang-bin
4145
4246 chown $cfn_cluster_user :$cfn_cluster_user -R /home/$cfn_cluster_user
43- chmod +x ${monitoring_home} /custom-metrics/*
47+ chmod +x ${monitoring_home} /custom-metrics/*
4448
4549 cp -rp ${monitoring_home} /custom-metrics/* /usr/local/bin/
4650 mv ${monitoring_home} /prometheus-slurm-exporter/slurm_exporter.service /etc/systemd/system/
4751
4852 (crontab -l -u $cfn_cluster_user ; echo " */1 * * * * /usr/local/bin/1m-cost-metrics.sh" ) | crontab -u $cfn_cluster_user -
49- (crontab -l -u $cfn_cluster_user ; echo " */60 * * * * /usr/local/bin/1h-cost-metrics.sh" ) | crontab -u $cfn_cluster_user -
53+ (crontab -l -u $cfn_cluster_user ; echo " */60 * * * * /usr/local/bin/1h-cost-metrics.sh" ) | crontab -u $cfn_cluster_user -
5054
5155
52- # replace tokens
56+ # replace tokens
5357 sed -i " s/_S3_BUCKET_/${s3_bucket} /g" ${monitoring_home} /grafana/dashboards/ParallelCluster.json
54- sed -i " s/__INSTANCE_ID__/${master_instance_id} /g" ${monitoring_home} /grafana/dashboards/ParallelCluster.json
58+ sed -i " s/__INSTANCE_ID__/${master_instance_id} /g" ${monitoring_home} /grafana/dashboards/ParallelCluster.json
5559 sed -i " s/__FSX_ID__/${cfn_fsx_fs_id} /g" ${monitoring_home} /grafana/dashboards/ParallelCluster.json
56- sed -i " s/__AWS_REGION__/${cfn_region} /g" ${monitoring_home} /grafana/dashboards/ParallelCluster.json
60+ sed -i " s/__AWS_REGION__/${cfn_region} /g" ${monitoring_home} /grafana/dashboards/ParallelCluster.json
5761
5862 sed -i " s/__AWS_REGION__/${cfn_region} /g" ${monitoring_home} /grafana/dashboards/logs.json
5963 sed -i " s/__LOG_GROUP__NAMES__/${log_group_names} /g" ${monitoring_home} /grafana/dashboards/logs.json
6064
61- sed -i " s/__Application__/${stack_name} /g" ${monitoring_home} /prometheus/prometheus.yml
65+ sed -i " s/__Application__/${stack_name} /g" ${monitoring_home} /prometheus/prometheus.yml
6266
6367 sed -i " s/__INSTANCE_ID__/${master_instance_id} /g" ${monitoring_home} /grafana/dashboards/master-node-details.json
64- sed -i " s/__INSTANCE_ID__/${master_instance_id} /g" ${monitoring_home} /grafana/dashboards/compute-node-list.json
65- sed -i " s/__INSTANCE_ID__/${master_instance_id} /g" ${monitoring_home} /grafana/dashboards/compute-node-details.json
68+ sed -i " s/__INSTANCE_ID__/${master_instance_id} /g" ${monitoring_home} /grafana/dashboards/compute-node-list.json
69+ sed -i " s/__INSTANCE_ID__/${master_instance_id} /g" ${monitoring_home} /grafana/dashboards/compute-node-details.json
6670
6771 sed -i " s/__MONITORING_DIR__/${monitoring_dir_name} /g" ${monitoring_home} /docker-compose/docker-compose.master.yml
6872
@@ -73,13 +77,13 @@ case "${cfn_node_type}" in
7377 echo -e " \nDNS.1=$( ec2-metadata -p | awk ' {print $2}' ) " >> " ${nginx_dir} /openssl.cnf"
7478 openssl req -new -x509 -nodes -newkey rsa:4096 -days 3650 -keyout " ${nginx_ssl_dir} /nginx.key" -out " ${nginx_ssl_dir} /nginx.crt" -config " ${nginx_dir} /openssl.cnf"
7579
76- # give $cfn_cluster_user ownership
80+ # give $cfn_cluster_user ownership
7781 chown -R $cfn_cluster_user :$cfn_cluster_user " ${nginx_ssl_dir} /nginx.key"
7882 chown -R $cfn_cluster_user :$cfn_cluster_user " ${nginx_ssl_dir} /nginx.crt"
7983
8084 /usr/local/bin/docker-compose --env-file /etc/parallelcluster/cfnconfig -f ${monitoring_home} /docker-compose/docker-compose.master.yml -p monitoring-master up -d
8185
82- # Download and build prometheus-slurm-exporter
86+ # Download and build prometheus-slurm-exporter
8387 # #### Please note this software package is under GPLv3 License #####
8488 # More info here: https://github.com/vpenso/prometheus-slurm-exporter/blob/master/LICENSE
8589 cd ${monitoring_home}
@@ -98,6 +102,8 @@ case "${cfn_node_type}" in
98102 ComputeFleet)
99103 compute_instance_type=$( ec2-metadata -t | awk ' {print $2}' )
100104 gpu_instances=" [pg][2-9].*\.[0-9]*[x]*large"
105+ echo " $> Compute Instances Type EC2 -> ${compute_instance_type} "
106+ echo " $> GPUS Instances EC2 -> ${gpu_instances} "
101107 if [[ $compute_instance_type =~ $gpu_instances ]]; then
102108 distribution=$( . /etc/os-release; echo $ID$VERSION_ID )
103109 curl -s -L https://nvidia.github.io/nvidia-docker/$distribution /nvidia-docker.repo | tee /etc/yum.repos.d/nvidia-docker.repo
0 commit comments