Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion docs/operator-manual/config-management-plugins.md
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,9 @@ If you don't need to set any environment variables, you can set an empty plugin
if version was mentioned in the `ConfigManagementPlugin` spec or else just use `<metadata.name>`. You can also remove the name altogether
and let the automatic discovery to identify the plugin.
!!! note
If a CMP renders blank manfiests, and `prune` is set to `true`, Argo CD will automatically remove resources. CMP plugin authors should ensure errors are part of the exit code. Commonly something like `kustomize build . | cat` won't pass errors because of the pipe. Consider setting `set -o pipefail` so anything piped will pass errors on failure.
If a CMP renders blank manifests, and `prune` is set to `true`, Argo CD will automatically remove resources. CMP plugin authors should ensure errors are part of the exit code. Commonly something like `kustomize build . | cat` won't pass errors because of the pipe. Consider setting `set -o pipefail` so anything piped will pass errors on failure.
!!! note
If a CMP command fails to gracefully exit on `ARGOCD_EXEC_TIMEOUT`, it will be forcefully killed after an additional timeout of `ARGOCD_EXEC_FATAL_TIMEOUT`.

## Debugging a CMP

Expand Down
2 changes: 2 additions & 0 deletions docs/operator-manual/high_availability.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ and might fail. To avoid failed syncs use the `ARGOCD_GIT_ATTEMPTS_COUNT` enviro

* `argocd-repo-server` executes config management tools such as `helm` or `kustomize` and enforces a 90 second timeout. This timeout can be changed by using the `ARGOCD_EXEC_TIMEOUT` env variable. The value should be in the Go time duration string format, for example, `2m30s`.

* `argocd-repo-server` will issue a `SIGTERM` signal to a command that has elapsed the `ARGOCD_EXEC_TIMEOUT`. In most cases, well-behaved commands will exit immediately when receiving the signal. However, if this does not happen, `argocd-repo-server` will wait an additional timeout of `ARGOCD_EXEC_FATAL_TIMEOUT` and then forcefully exit the command with a `SIGKILL` to prevent stalling. Note that a failure to exit with `SIGTERM` is usually a bug in either the offending command or in the way `argocd-repo-server` calls it and should be reported to the issue tracker for further investigation.

**metrics:**

* `argocd_git_request_total` - Number of git requests. This metric provides two tags: `repo` - Git repo URL; `request_type` - `ls-remote` or `fetch`.
Expand Down
2 changes: 1 addition & 1 deletion test/e2e/fixture/applicationsets/utils/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import (
"os/exec"
"strings"

argoexec "github.com/argoproj/pkg/exec"
argoexec "github.com/argoproj/argo-cd/v3/util/exec"
)

func Run(workDir, name string, args ...string) (string, error) {
Expand Down
2 changes: 1 addition & 1 deletion test/e2e/fixture/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import (
"os/exec"
"strings"

argoexec "github.com/argoproj/pkg/exec"
argoexec "github.com/argoproj/argo-cd/v3/util/exec"
)

func Run(workDir, name string, args ...string) (string, error) {
Expand Down
3 changes: 1 addition & 2 deletions test/fixture/revision_metadata/author.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,8 @@ import (
"fmt"
"strings"

argoexec "github.com/argoproj/pkg/exec"

"github.com/argoproj/argo-cd/v3/util/errors"
argoexec "github.com/argoproj/argo-cd/v3/util/exec"
)

var Author string
Expand Down
2 changes: 1 addition & 1 deletion test/manifests_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ import (
"path/filepath"
"testing"

argoexec "github.com/argoproj/pkg/exec"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/argoproj/argo-cd/v3/test/fixture/test"
argoexec "github.com/argoproj/argo-cd/v3/util/exec"
)

func TestKustomizeVersion(t *testing.T) {
Expand Down
207 changes: 202 additions & 5 deletions util/exec/exec.go
Original file line number Diff line number Diff line change
@@ -1,27 +1,35 @@
package exec

import (
"bytes"
"errors"
"fmt"
"os"
"os/exec"
"strconv"
"strings"
"syscall"
"time"
"unicode"

"github.com/argoproj/gitops-engine/pkg/utils/tracing"
argoexec "github.com/argoproj/pkg/exec"
"github.com/sirupsen/logrus"

"github.com/argoproj/argo-cd/v3/util/log"
"github.com/argoproj/argo-cd/v3/util/rand"
)

var timeout time.Duration
var (
timeout time.Duration
fatalTimeout time.Duration
Unredacted = Redact(nil)
)

type ExecRunOpts struct {
// Redactor redacts tokens from the output
Redactor func(text string) string
// TimeoutBehavior configures what to do in case of timeout
TimeoutBehavior argoexec.TimeoutBehavior
TimeoutBehavior TimeoutBehavior
// SkipErrorLogging determines whether to skip logging of execution errors (rc > 0)
SkipErrorLogging bool
// CaptureStderr determines whether to capture stderr in addition to stdout
Expand All @@ -38,6 +46,10 @@ func initTimeout() {
if err != nil {
timeout = 90 * time.Second
}
fatalTimeout, err = time.ParseDuration(os.Getenv("ARGOCD_EXEC_FATAL_TIMEOUT"))
if err != nil {
fatalTimeout = 10 * time.Second
}
}

func Run(cmd *exec.Cmd) (string, error) {
Expand All @@ -50,7 +62,7 @@ func RunWithRedactor(cmd *exec.Cmd, redactor func(text string) string) (string,
}

func RunWithExecRunOpts(cmd *exec.Cmd, opts ExecRunOpts) (string, error) {
cmdOpts := argoexec.CmdOpts{Timeout: timeout, Redactor: opts.Redactor, TimeoutBehavior: opts.TimeoutBehavior, SkipErrorLogging: opts.SkipErrorLogging}
cmdOpts := CmdOpts{Timeout: timeout, FatalTimeout: fatalTimeout, Redactor: opts.Redactor, TimeoutBehavior: opts.TimeoutBehavior, SkipErrorLogging: opts.SkipErrorLogging}
span := tracing.NewLoggingTracer(log.NewLogrusLogger(log.NewWithCurrentConfig())).StartSpan(fmt.Sprintf("exec %v", cmd.Args[0]))
span.SetBaggageItem("dir", cmd.Dir)
if cmdOpts.Redactor != nil {
Expand All @@ -59,7 +71,7 @@ func RunWithExecRunOpts(cmd *exec.Cmd, opts ExecRunOpts) (string, error) {
span.SetBaggageItem("args", fmt.Sprintf("%v", cmd.Args))
}
defer span.Finish()
return argoexec.RunCommandExt(cmd, cmdOpts)
return RunCommandExt(cmd, cmdOpts)
}

// GetCommandArgsToLog represents the given command in a way that we can copy-and-paste into a terminal
Expand Down Expand Up @@ -88,3 +100,188 @@ func GetCommandArgsToLog(cmd *exec.Cmd) string {
args := strings.Join(argsToLog, " ")
return args
}

type CmdError struct {
Args string
Stderr string
Cause error
}

func (ce *CmdError) Error() string {
res := fmt.Sprintf("`%v` failed %v", ce.Args, ce.Cause)
if ce.Stderr != "" {
res = fmt.Sprintf("%s: %s", res, ce.Stderr)
}
return res
}

func (ce *CmdError) String() string {
return ce.Error()
}

func newCmdError(args string, cause error, stderr string) *CmdError {
return &CmdError{Args: args, Stderr: stderr, Cause: cause}
}

// TimeoutBehavior defines behavior for when the command takes longer than the passed in timeout to exit
// By default, SIGKILL is sent to the process and it is not waited upon
type TimeoutBehavior struct {
// Signal determines the signal to send to the process
Signal syscall.Signal
// ShouldWait determines whether to wait for the command to exit once timeout is reached
ShouldWait bool
}

type CmdOpts struct {
// Timeout determines how long to wait for the command to exit
Timeout time.Duration
// FatalTimeout is the amount of additional time to wait after Timeout before fatal SIGKILL
FatalTimeout time.Duration
// Redactor redacts tokens from the output
Redactor func(text string) string
// TimeoutBehavior configures what to do in case of timeout
TimeoutBehavior TimeoutBehavior
// SkipErrorLogging defines whether to skip logging of execution errors (rc > 0)
SkipErrorLogging bool
// CaptureStderr defines whether to capture stderr in addition to stdout
CaptureStderr bool
}

var DefaultCmdOpts = CmdOpts{
Timeout: time.Duration(0),
FatalTimeout: time.Duration(0),
Redactor: Unredacted,
TimeoutBehavior: TimeoutBehavior{syscall.SIGKILL, false},
SkipErrorLogging: false,
CaptureStderr: false,
}

func Redact(items []string) func(text string) string {
return func(text string) string {
for _, item := range items {
text = strings.ReplaceAll(text, item, "******")
}
return text
}
}

// RunCommandExt is a convenience function to run/log a command and return/log stderr in an error upon
// failure.
func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) {
execId, err := rand.RandHex(5)
if err != nil {
return "", err
}
logCtx := logrus.WithFields(logrus.Fields{"execID": execId})

redactor := DefaultCmdOpts.Redactor
if opts.Redactor != nil {
redactor = opts.Redactor
}

// log in a way we can copy-and-paste into a terminal
args := strings.Join(cmd.Args, " ")
logCtx.WithFields(logrus.Fields{"dir": cmd.Dir}).Info(redactor(args))

var stdout bytes.Buffer
var stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr

start := time.Now()
err = cmd.Start()
if err != nil {
return "", err
}

done := make(chan error)
go func() { done <- cmd.Wait() }()

// Start timers for timeout
timeout := DefaultCmdOpts.Timeout
fatalTimeout := DefaultCmdOpts.FatalTimeout

if opts.Timeout != time.Duration(0) {
timeout = opts.Timeout
}

if opts.FatalTimeout != time.Duration(0) {
fatalTimeout = opts.FatalTimeout
}

var timoutCh <-chan time.Time
if timeout != 0 {
timoutCh = time.NewTimer(timeout).C
}

var fatalTimeoutCh <-chan time.Time
if fatalTimeout != 0 {
fatalTimeoutCh = time.NewTimer(timeout + fatalTimeout).C
}

timeoutBehavior := DefaultCmdOpts.TimeoutBehavior
fatalTimeoutBehaviour := syscall.SIGKILL
if opts.TimeoutBehavior.Signal != syscall.Signal(0) {
timeoutBehavior = opts.TimeoutBehavior
}

select {
// noinspection ALL
case <-timoutCh:
// send timeout signal
_ = cmd.Process.Signal(timeoutBehavior.Signal)
// wait on timeout signal and fallback to fatal timeout signal
if timeoutBehavior.ShouldWait {
select {
case <-done:
case <-fatalTimeoutCh:
// upgrades to SIGKILL if cmd does not respect SIGTERM
_ = cmd.Process.Signal(fatalTimeoutBehaviour)
// now original cmd should exit immediately after SIGKILL
<-done
// return error with a marker indicating that cmd exited only after fatal SIGKILL
output := stdout.String()
if opts.CaptureStderr {
output += stderr.String()
}
logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output))
err = newCmdError(redactor(args), fmt.Errorf("fatal timeout after %v", timeout+fatalTimeout), "")
logCtx.Error(err.Error())
return strings.TrimSuffix(output, "\n"), err
}
}
// either did not wait for timeout or cmd did respect SIGTERM
output := stdout.String()
if opts.CaptureStderr {
output += stderr.String()
}
logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output))
err = newCmdError(redactor(args), fmt.Errorf("timeout after %v", timeout), "")
logCtx.Error(err.Error())
return strings.TrimSuffix(output, "\n"), err
case err := <-done:
if err != nil {
output := stdout.String()
if opts.CaptureStderr {
output += stderr.String()
}
logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output))
err := newCmdError(redactor(args), errors.New(redactor(err.Error())), strings.TrimSpace(redactor(stderr.String())))
if !opts.SkipErrorLogging {
logCtx.Error(err.Error())
}
return strings.TrimSuffix(output, "\n"), err
}
}
output := stdout.String()
if opts.CaptureStderr {
output += stderr.String()
}
logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output))

return strings.TrimSuffix(output, "\n"), nil
}

func RunCommand(name string, opts CmdOpts, arg ...string) (string, error) {
return RunCommandExt(exec.Command(name, arg...), opts)
}
44 changes: 44 additions & 0 deletions util/exec/exec_norace_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
//go:build !race

package exec

import (
"testing"
"time"

log "github.com/sirupsen/logrus"
"github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// FIXME: there's a race in RunCommand that causes the test to fail when -race is enabled. The race is in the timeout
// handling, which shares bytes buffers between the exec goroutine and the timeout handler code.
func TestRunCommandTimeout(t *testing.T) {
hook := test.NewGlobal()
log.SetLevel(log.DebugLevel)
defer log.SetLevel(log.InfoLevel)

output, err := RunCommand("sh", CmdOpts{Timeout: 500 * time.Millisecond}, "-c", "echo hello world && echo my-error >&2 && sleep 2")
assert.Equal(t, "hello world", output)
require.EqualError(t, err, "`sh -c echo hello world && echo my-error >&2 && sleep 2` failed timeout after 500ms")

assert.Len(t, hook.Entries, 3)

entry := hook.Entries[0]
assert.Equal(t, log.InfoLevel, entry.Level)
assert.Equal(t, "sh -c echo hello world && echo my-error >&2 && sleep 2", entry.Message)
assert.Contains(t, entry.Data, "dir")
assert.Contains(t, entry.Data, "execID")

entry = hook.Entries[1]
assert.Equal(t, log.DebugLevel, entry.Level)
assert.Equal(t, "hello world\n", entry.Message)
assert.Contains(t, entry.Data, "duration")
assert.Contains(t, entry.Data, "execID")

entry = hook.Entries[2]
assert.Equal(t, log.ErrorLevel, entry.Level)
assert.Equal(t, "`sh -c echo hello world && echo my-error >&2 && sleep 2` failed timeout after 500ms", entry.Message)
assert.Contains(t, entry.Data, "execID")
}
Loading
Loading