Skip to content

Commit c4c7f11

Browse files
authored
enable indefinite duration telemetry and flamegraph collection on remote targets (#595)
* enable indefinite duration telemetry collection on remote targets Signed-off-by: Harper, Jason M <jason.m.harper@intel.com> * improve comments Signed-off-by: Harper, Jason M <jason.m.harper@intel.com> * wait is not necessary and causes issue when signal not received Signed-off-by: Harper, Jason M <jason.m.harper@intel.com> * Enhance signal handler to support progress updates and increase timeout duration Signed-off-by: Harper, Jason M <jason.m.harper@intel.com> * rm pid file when script completes Signed-off-by: Harper, Jason M <jason.m.harper@intel.com> * add flamegraph Signed-off-by: Harper, Jason M <jason.m.harper@intel.com> * fix comment Signed-off-by: Harper, Jason M <jason.m.harper@intel.com> * spaces for bash Signed-off-by: Harper, Jason M <jason.m.harper@intel.com> * remove commented code Signed-off-by: Harper, Jason M <jason.m.harper@intel.com> * align help Signed-off-by: Harper, Jason M <jason.m.harper@intel.com> * typo in flame help Signed-off-by: Harper, Jason M <jason.m.harper@intel.com> * remove unnecessary comments from configureSignalHandler function Signed-off-by: Harper, Jason M <jason.m.harper@intel.com> --------- Signed-off-by: Harper, Jason M <jason.m.harper@intel.com>
1 parent 2dea050 commit c4c7f11

File tree

13 files changed

+281
-166
lines changed

13 files changed

+281
-166
lines changed

cmd/flamegraph/flamegraph.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ const (
5959
func init() {
6060
Cmd.Flags().StringVar(&common.FlagInput, common.FlagInputName, "", "")
6161
Cmd.Flags().StringSliceVar(&common.FlagFormat, common.FlagFormatName, []string{report.FormatAll}, "")
62-
Cmd.Flags().IntVar(&flagDuration, flagDurationName, 30, "")
62+
Cmd.Flags().IntVar(&flagDuration, flagDurationName, 0, "")
6363
Cmd.Flags().IntVar(&flagFrequency, flagFrequencyName, 11, "")
6464
Cmd.Flags().IntSliceVar(&flagPids, flagPidsName, nil, "")
6565
Cmd.Flags().BoolVar(&flagNoSystemSummary, flagNoSystemSummaryName, false, "")
@@ -100,7 +100,7 @@ func getFlagGroups() []common.FlagGroup {
100100
flags := []common.Flag{
101101
{
102102
Name: flagDurationName,
103-
Help: "number of seconds to run the collection",
103+
Help: "number of seconds to run the collection. If 0, the collection will run indefinitely. Ctrl+c to stop.",
104104
},
105105
{
106106
Name: flagFrequencyName,
@@ -155,19 +155,19 @@ func validateFlags(cmd *cobra.Command, args []string) error {
155155
return common.FlagValidationError(cmd, fmt.Sprintf("input file %s does not exist", common.FlagInput))
156156
}
157157
}
158-
if flagDuration <= 0 {
159-
return common.FlagValidationError(cmd, "duration must be greater than 0")
158+
if flagDuration < 0 {
159+
return common.FlagValidationError(cmd, "duration must be 0 or greater")
160160
}
161161
if flagFrequency <= 0 {
162-
return common.FlagValidationError(cmd, "frequency must be greater than 0")
162+
return common.FlagValidationError(cmd, "frequency must be 1 or greater")
163163
}
164164
for _, pid := range flagPids {
165165
if pid < 0 {
166-
return common.FlagValidationError(cmd, "PID must be greater than or equal to 0")
166+
return common.FlagValidationError(cmd, "PID must be 0 or greater")
167167
}
168168
}
169169
if flagMaxDepth < 0 {
170-
return common.FlagValidationError(cmd, "max depth must be greater than or equal to 0")
170+
return common.FlagValidationError(cmd, "max depth must be 0 or greater")
171171
}
172172
// common target flags
173173
if err := common.ValidateTargetFlags(cmd); err != nil {

cmd/metrics/metadata.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -692,7 +692,7 @@ func getUncoreDeviceIDs(isAMDArchitecture bool, scriptOutputs map[string]script.
692692
// getCPUInfo - reads and returns all data from /proc/cpuinfo
693693
func getCPUInfo(t target.Target) (cpuInfo []map[string]string, err error) {
694694
cmd := exec.Command("cat", "/proc/cpuinfo")
695-
stdout, stderr, exitcode, err := t.RunCommand(cmd, 0, true)
695+
stdout, stderr, exitcode, err := t.RunCommand(cmd)
696696
if err != nil {
697697
err = fmt.Errorf("failed to get cpuinfo: %s, %d, %v", stderr, exitcode, err)
698698
return
@@ -717,7 +717,7 @@ func getCPUInfo(t target.Target) (cpuInfo []map[string]string, err error) {
717717
// getLscpu - runs lscpu on the target and returns the output
718718
func getLscpu(t target.Target) (output string, err error) {
719719
cmd := exec.Command("lscpu")
720-
output, stderr, exitcode, err := t.RunCommand(cmd, 0, true)
720+
output, stderr, exitcode, err := t.RunCommand(cmd)
721721
if err != nil || exitcode != 0 {
722722
err = fmt.Errorf("failed to run lscpu: %s, %d, %v", stderr, exitcode, err)
723723
return

cmd/metrics/nmi_watchdog.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ func getNMIWatchdog(myTarget target.Target) (setting string, err error) {
4949
return
5050
}
5151
cmd := exec.Command(sysctl, "kernel.nmi_watchdog") // #nosec G204 // nosemgrep
52-
stdout, _, _, err := myTarget.RunCommand(cmd, 0, true)
52+
stdout, _, _, err := myTarget.RunCommand(cmd)
5353
if err != nil {
5454
return
5555
}
@@ -88,7 +88,7 @@ func setNMIWatchdog(myTarget target.Target, setting string, localTempDir string)
8888
// findSysctl - gets a useable path to sysctl or error
8989
func findSysctl(myTarget target.Target) (path string, err error) {
9090
cmd := exec.Command("which", "sysctl")
91-
stdout, _, _, err := myTarget.RunCommand(cmd, 0, true)
91+
stdout, _, _, err := myTarget.RunCommand(cmd)
9292
if err == nil {
9393
//found it
9494
path = strings.TrimSpace(stdout)
@@ -97,7 +97,7 @@ func findSysctl(myTarget target.Target) (path string, err error) {
9797
// didn't find it on the path, try being specific
9898
sbinPath := "/usr/sbin/sysctl"
9999
cmd = exec.Command("which", sbinPath)
100-
_, _, _, err = myTarget.RunCommand(cmd, 0, true)
100+
_, _, _, err = myTarget.RunCommand(cmd)
101101
if err == nil {
102102
// found it
103103
path = sbinPath

cmd/metrics/process.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ func GetHotProcesses(myTarget target.Target, maxProcesses int, filter string) (p
6767
}
6868
// run ps to get list of processes sorted by cpu utilization (descending)
6969
cmd := exec.Command("ps", "-a", "-x", "-h", "-o", "pid,ppid,comm,cmd", "--sort=-%cpu")
70-
stdout, stderr, exitcode, err := myTarget.RunCommand(cmd, 0, true)
70+
stdout, stderr, exitcode, err := myTarget.RunCommand(cmd)
7171
if err != nil {
7272
err = fmt.Errorf("failed to get hot processes: %s, %d, %v", stderr, exitcode, err)
7373
return
@@ -177,7 +177,7 @@ done | sort -nr | head -n %d
177177

178178
func processExists(myTarget target.Target, pid string) (exists bool) {
179179
cmd := exec.Command("ps", "-p", pid)
180-
_, _, _, err := myTarget.RunCommand(cmd, 0, true)
180+
_, _, _, err := myTarget.RunCommand(cmd)
181181
if err != nil {
182182
exists = false
183183
return
@@ -188,7 +188,7 @@ func processExists(myTarget target.Target, pid string) (exists bool) {
188188

189189
func getProcess(myTarget target.Target, pid string) (process Process, err error) {
190190
cmd := exec.Command("ps", "-q", pid, "h", "-o", "pid,ppid,comm,cmd", "ww")
191-
stdout, stderr, exitcode, err := myTarget.RunCommand(cmd, 0, true)
191+
stdout, stderr, exitcode, err := myTarget.RunCommand(cmd)
192192
if err != nil {
193193
err = fmt.Errorf("failed to get process: %s, %d, %v", stderr, exitcode, err)
194194
return

cmd/telemetry/telemetry.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ func init() {
122122
Cmd.Flags().StringVar(&common.FlagInput, common.FlagInputName, "", "")
123123
Cmd.Flags().BoolVar(&flagAll, flagAllName, true, "")
124124
Cmd.Flags().StringSliceVar(&common.FlagFormat, common.FlagFormatName, []string{report.FormatAll}, "")
125-
Cmd.Flags().IntVar(&flagDuration, flagDurationName, 30, "")
125+
Cmd.Flags().IntVar(&flagDuration, flagDurationName, 0, "")
126126
Cmd.Flags().IntVar(&flagInterval, flagIntervalName, 2, "")
127127
Cmd.Flags().IntVar(&flagInstrMixPid, flagInstrMixPidName, 0, "")
128128
Cmd.Flags().IntVar(&flagInstrMixFrequency, flagInstrMixFrequencyName, instrmixFrequencyDefaultSystemWide, "")
@@ -244,9 +244,6 @@ func validateFlags(cmd *cobra.Command, args []string) error {
244244
if flagDuration < 0 {
245245
return common.FlagValidationError(cmd, "duration must be 0 or greater")
246246
}
247-
if flagDuration == 0 && (cmd.Flags().Lookup(common.FlagTargetsFileName).Changed || cmd.Flags().Lookup(common.FlagTargetHostName).Changed) {
248-
return common.FlagValidationError(cmd, "duration must be greater than 0 when collecting from a remote target")
249-
}
250247
if flagInstrMixFrequency < 100000 { // 100,000 instructions is the minimum frequency
251248
return common.FlagValidationError(cmd, "instruction mix frequency must be 100,000 or greater to limit overhead")
252249
}

internal/common/common.go

Lines changed: 98 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,12 @@ package common
66
// SPDX-License-Identifier: BSD-3-Clause
77

88
import (
9+
"context"
910
"errors"
1011
"fmt"
1112
"log/slog"
1213
"os"
14+
"os/exec"
1315
"os/signal"
1416
"path/filepath"
1517
"perfspect/internal/progress"
@@ -20,6 +22,7 @@ import (
2022
"perfspect/internal/util"
2123
"strings"
2224
"syscall"
25+
"time"
2326

2427
"slices"
2528

@@ -118,22 +121,6 @@ func (rc *ReportingCommand) Run() error {
118121
localTempDir := appContext.LocalTempDir
119122
outputDir := appContext.OutputDir
120123
logFilePath := appContext.LogFilePath
121-
// handle signals
122-
// child processes will exit when the signals are received which will
123-
// allow this app to exit normally
124-
sigChannel := make(chan os.Signal, 1)
125-
signal.Notify(sigChannel, syscall.SIGINT, syscall.SIGTERM)
126-
go func() {
127-
sig := <-sigChannel
128-
slog.Info("received signal", slog.String("signal", sig.String()))
129-
// when perfspect receives ctrl-c while in the shell, the shell makes sure to propogate the
130-
// signal to all our children. But when perfspect is run in the background or disowned and
131-
// then receives SIGINT, e.g., from a script, we need to send the signal to our children
132-
err := util.SignalChildren(syscall.SIGINT)
133-
if err != nil {
134-
slog.Error("error sending signal to children", slog.String("error", err.Error()))
135-
}
136-
}()
137124
// create output directory
138125
err := util.CreateDirectoryIfNotExists(outputDir, 0755) // #nosec G301
139126
if err != nil {
@@ -144,8 +131,8 @@ func (rc *ReportingCommand) Run() error {
144131
return err
145132
}
146133

147-
var orderedTargetScriptOutputs []TargetScriptOutputs
148134
var myTargets []target.Target
135+
var orderedTargetScriptOutputs []TargetScriptOutputs
149136
if FlagInput != "" {
150137
var err error
151138
orderedTargetScriptOutputs, err = outputsFromInput(rc.Tables, rc.SummaryTableName)
@@ -203,6 +190,8 @@ func (rc *ReportingCommand) Run() error {
203190
for i := len(indicesToRemove) - 1; i >= 0; i-- {
204191
myTargets = slices.Delete(myTargets, indicesToRemove[i], indicesToRemove[i]+1)
205192
}
193+
// set up signal handler to help with cleaning up child processes on ctrl-c/SIGINT or SIGTERM
194+
configureSignalHandler(myTargets, multiSpinner.Status)
206195
// collect data from targets
207196
orderedTargetScriptOutputs, err = outputsFromTargets(rc.Cmd, myTargets, rc.Tables, rc.ScriptParams, multiSpinner.Status, localTempDir)
208197
if err != nil {
@@ -299,6 +288,94 @@ func (rc *ReportingCommand) Run() error {
299288
return nil
300289
}
301290

291+
// configureSignalHandler sets up a signal handler to catch SIGINT and SIGTERM
292+
//
293+
// When perfspect receives ctrl-c while in the shell, the shell propagates the
294+
// signal to all our children. But when perfspect is run in the background or disowned and
295+
// then receives SIGINT, e.g., from a script, we need to send the signal to our children
296+
//
297+
// Also, when running scripts in parallel using the parallel_master.sh script, we need to
298+
// send the signal to the parallel_master.sh script on each target so that it can clean up
299+
// its child processes. This is because the parallel_master.sh script is run in its own process group
300+
// and does not receive the signal when perfspect receives it.
301+
//
302+
// Parameters:
303+
// - myTargets: The list of targets to send the signal to.
304+
// - statusFunc: A function to update the status of the progress indicator.
305+
func configureSignalHandler(myTargets []target.Target, statusFunc progress.MultiSpinnerUpdateFunc) {
306+
sigChannel := make(chan os.Signal, 1)
307+
signal.Notify(sigChannel, syscall.SIGINT, syscall.SIGTERM)
308+
go func() {
309+
sig := <-sigChannel
310+
slog.Debug("received signal", slog.String("signal", sig.String()))
311+
// Scripts that are run in parallel using the parallel_master.sh script and a few other sequential scripts need to be handled specially
312+
// because they are run in their own process group, we need to send the signal directly to the PID of the script.
313+
// For every target, look for the primary_collection_script PID file and send SIGINT to it.
314+
for _, t := range myTargets {
315+
if statusFunc != nil {
316+
_ = statusFunc(t.GetName(), "Signal received, cleaning up...")
317+
}
318+
pidFilePath := filepath.Join(t.GetTempDirectory(), "primary_collection_script.pid")
319+
stdout, _, exitcode, err := t.RunCommandEx(exec.Command("cat", pidFilePath), 5, false, true) // #nosec G204
320+
if err != nil {
321+
slog.Error("error retrieving target primary_collection_script PID", slog.String("target", t.GetName()), slog.String("error", err.Error()))
322+
}
323+
if exitcode == 0 {
324+
pidStr := strings.TrimSpace(stdout)
325+
_, _, _, err := t.RunCommandEx(exec.Command("sudo", "kill", "-SIGINT", pidStr), 5, false, true) // #nosec G204
326+
if err != nil {
327+
slog.Error("error sending signal to target primary_collection_script", slog.String("target", t.GetName()), slog.String("error", err.Error()))
328+
}
329+
}
330+
}
331+
// now wait until all primary collection scripts have exited
332+
slog.Debug("waiting for primary_collection_script scripts to exit")
333+
for _, t := range myTargets {
334+
// create a per-target timeout context
335+
targetTimeout := 10 * time.Second
336+
ctx, cancel := context.WithTimeout(context.Background(), targetTimeout)
337+
timedOut := false
338+
pidFilePath := filepath.Join(t.GetTempDirectory(), "primary_collection_script.pid")
339+
for {
340+
// check for timeout
341+
select {
342+
case <-ctx.Done():
343+
if statusFunc != nil {
344+
_ = statusFunc(t.GetName(), "cleanup timeout exceeded")
345+
}
346+
slog.Warn("signal handler cleanup timeout exceeded for target", slog.String("target", t.GetName()))
347+
timedOut = true
348+
default:
349+
}
350+
if timedOut {
351+
break
352+
}
353+
// read the pid file
354+
stdout, _, exitcode, err := t.RunCommandEx(exec.Command("cat", pidFilePath), 5, false, true) // #nosec G204
355+
if err != nil || exitcode != 0 {
356+
// pid file doesn't exist
357+
break
358+
}
359+
pidStr := strings.TrimSpace(stdout)
360+
// determine if the process still exists
361+
_, _, exitcode, err = t.RunCommandEx(exec.Command("ps", "-p", pidStr), 5, false, true) // #nosec G204
362+
if err != nil || exitcode != 0 {
363+
break // process no longer exists, script has exited
364+
}
365+
// sleep for a short time before checking again
366+
time.Sleep(500 * time.Millisecond)
367+
}
368+
cancel()
369+
}
370+
371+
// send SIGINT to perfspect's children
372+
err := util.SignalChildren(syscall.SIGINT)
373+
if err != nil {
374+
slog.Error("error sending signal to children", slog.String("error", err.Error()))
375+
}
376+
}()
377+
}
378+
302379
// DefaultInsightsFunc returns the insights table values from the table values
303380
func DefaultInsightsFunc(allTableValues []table.TableValues, scriptOutputs map[string]script.ScriptOutput) table.TableValues {
304381
insightsTableValues := table.TableValues{
@@ -554,7 +631,8 @@ func outputsFromTargets(cmd *cobra.Command, myTargets []target.Target, tables []
554631
scriptsToRunOnTarget = append(scriptsToRunOnTarget, script)
555632
}
556633
// run the selected scripts on the target
557-
go collectOnTarget(target, scriptsToRunOnTarget, localTempDir, scriptParams["Duration"], cmd.Name() == "telemetry", channelTargetScriptOutputs, channelError, statusUpdate)
634+
ctrlCToStop := cmd.Name() == "telemetry" || cmd.Name() == "flamegraph"
635+
go collectOnTarget(target, scriptsToRunOnTarget, localTempDir, scriptParams["Duration"], ctrlCToStop, channelTargetScriptOutputs, channelError, statusUpdate)
558636
}
559637
// wait for scripts to run on all targets
560638
var allTargetScriptOutputs []TargetScriptOutputs
@@ -631,10 +709,10 @@ func elevatedPrivilegesRequired(tables []table.TableDefinition) bool {
631709
}
632710

633711
// collectOnTarget runs the scripts on the target and sends the results to the appropriate channels
634-
func collectOnTarget(myTarget target.Target, scriptsToRun []script.ScriptDefinition, localTempDir string, duration string, isTelemetry bool, channelTargetScriptOutputs chan TargetScriptOutputs, channelError chan error, statusUpdate progress.MultiSpinnerUpdateFunc) {
712+
func collectOnTarget(myTarget target.Target, scriptsToRun []script.ScriptDefinition, localTempDir string, duration string, ctrlCToStop bool, channelTargetScriptOutputs chan TargetScriptOutputs, channelError chan error, statusUpdate progress.MultiSpinnerUpdateFunc) {
635713
// run the scripts on the target
636714
status := "collecting data"
637-
if isTelemetry && duration == "0" { // telemetry is the only command that uses this common code that can run indefinitely
715+
if ctrlCToStop && duration == "0" {
638716
status += ", press Ctrl+c to stop"
639717
} else if duration != "0" && duration != "" {
640718
status += fmt.Sprintf(" for %s seconds", duration)

internal/common/targets.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -495,7 +495,7 @@ func parseMountOutput(mountOutput string) ([]mountRecord, error) {
495495
// isDirNoExec checks if the target directory is on a file system that is mounted with noexec.
496496
func isDirNoExec(t target.Target, dir string) (bool, error) {
497497
dfCmd := exec.Command("df", "-P", dir)
498-
dfOutput, _, _, err := t.RunCommand(dfCmd, 0, true)
498+
dfOutput, _, _, err := t.RunCommand(dfCmd)
499499
if err != nil {
500500
err = fmt.Errorf("failed to run df command: %w", err)
501501
return false, err
@@ -509,7 +509,7 @@ func isDirNoExec(t target.Target, dir string) (bool, error) {
509509
return false, err
510510
}
511511
mountCmd := exec.Command("mount")
512-
mountOutput, _, _, err := t.RunCommand(mountCmd, 0, true)
512+
mountOutput, _, _, err := t.RunCommand(mountCmd)
513513
if err != nil {
514514
err = fmt.Errorf("failed to run mount command: %w", err)
515515
return false, err
@@ -553,7 +553,7 @@ func GetTargetVendor(t target.Target) (string, error) {
553553
if vendor == "" {
554554
cmd := exec.Command("bash", "-c", "lscpu | grep -i \"^Vendor ID:\" | awk '{print $NF}'")
555555
var err error
556-
vendor, _, _, err = t.RunCommand(cmd, 0, true)
556+
vendor, _, _, err = t.RunCommand(cmd)
557557
if err != nil {
558558
return "", fmt.Errorf("failed to get target CPU vendor: %v", err)
559559
}
@@ -568,7 +568,7 @@ func GetTargetFamily(t target.Target) (string, error) {
568568
if family == "" {
569569
cmd := exec.Command("bash", "-c", "lscpu | grep -i \"^CPU family:\" | awk '{print $NF}'")
570570
var err error
571-
family, _, _, err = t.RunCommand(cmd, 0, true)
571+
family, _, _, err = t.RunCommand(cmd)
572572
if err != nil {
573573
return "", fmt.Errorf("failed to get target CPU family: %v", err)
574574
}
@@ -583,7 +583,7 @@ func GetTargetModel(t target.Target) (string, error) {
583583
if model == "" {
584584
cmd := exec.Command("bash", "-c", "lscpu | grep -i \"^Model:\" | awk '{print $NF}'")
585585
var err error
586-
model, _, _, err = t.RunCommand(cmd, 0, true)
586+
model, _, _, err = t.RunCommand(cmd)
587587
if err != nil {
588588
return "", fmt.Errorf("failed to get target CPU model: %v", err)
589589
}
@@ -598,7 +598,7 @@ func GetTargetStepping(t target.Target) (string, error) {
598598
if stepping == "" {
599599
cmd := exec.Command("bash", "-c", "lscpu | grep -i \"^Stepping:\" | awk '{print $NF}'")
600600
var err error
601-
stepping, _, _, err = t.RunCommand(cmd, 0, true)
601+
stepping, _, _, err = t.RunCommand(cmd)
602602
if err != nil {
603603
return "", fmt.Errorf("failed to get target CPU stepping: %v", err)
604604
}

0 commit comments

Comments
 (0)