Re-introduce use of px/agent_status_diagnostics in px cli to detect missing kernel headers (#2091)
Summary: Re-introduce use of `px/agent_status_diagnostics` in px cli to detect missing kernel headers

The original version of this was reverted since it resulted in hung `px deploy` and `px collect-logs` commands on larger clusters. This PR reintroduces the change with the fixes necessary to prevent the previous issue and is best reviewed commit by commit as outlined below:

- Commit 1: Cherry-pick of #2065
- Commit 2: Fix for goroutine deadlock
- Commit 3: Add bundle flag to the `px collect-logs` command
- Commit 4: Introduce `PX_LOG_FILE` env var for redirecting `px` log output to a file -- useful for debugging the cli since its terminal spinners complicate logging to stdout

Commits 1, 3 and 4 should be self-explanatory. As for Commit 2, the goroutine deadlock came from the `streamCh` channel consumer. The previous version read a single value from the `streamCh` channel, parsed the result and [terminated](2ec63c8#diff-4da8f48b4c664d330cff34e70f907d6015289797c832587b0b14004875ef0831R363) its goroutine. Future sends to the `streamCh` channel could therefore block and prevent the pipe receiving the pxl script results from being fully consumed. Since the stream adapter writes to the pipe, it couldn't flush all of its results and the deadlock occurred.

The original testing was performed on clusters with 1 and 2 nodes -- a maximum of 2 PEMs and 2 results from `px/agent_status`. The deadlock didn't surface in those situations because `streamCh` was a buffered channel with a capacity of 1 and the consumer would read a single record before terminating, so the pipe reader would hit EOF before any channel send could block, as outlined below.

2-node cluster situation:

1. `px` cli executes `px/agent_status`, as `px/agent_status_diagnostics` is not in the canonical bundle yet
2. streamCh producer sends the 1st PEM's result -- streamCh at capacity
3. streamCh consumer reads the value and exits -- streamCh ready to accept 1 value
4. streamCh producer sends the 2nd and final PEM's result -- streamCh at capacity, and any further sends would block!
5. Program exits since the pxl script is complete

Relevant Issues: #2051

Type of change: /kind feature

Test Plan: Verified that the deadlock no longer occurs on clusters with 3-6 nodes

- [x] Used the [following](https://github.com/user-attachments/files/18457105/deadlocked-goroutines.txt) pprof goroutine stack dump to understand the deadlock described above -- see the blocked goroutine on the `streamCh` channel send at `script.go:337`
- [x] Re-tested all of the scenarios from #2065

Changelog Message: Re-introduce enhanced diagnostics for the `px deploy` and `px collect-logs` commands, used to detect common sources of environment incompatibilities

---------

Signed-off-by: Dom Del Nano <ddelnano@gmail.com>
1 parent d9e18b4 commit 042e356
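
For readers tracing the Commit 2 fix, here is a minimal, self-contained Go sketch of the pattern described in the commit message; the names (`streamCh`, per-PEM results) mirror the description above, but this is illustrative code, not the actual px cli source. It assumes a buffered channel of capacity 1 with one producer goroutine; the fixed consumer keeps draining until the channel is closed instead of returning after the first value.

package main

import "fmt"

func main() {
    // streamCh mirrors the buffered, capacity-1 channel from the commit message.
    streamCh := make(chan string, 1)
    done := make(chan struct{})

    // Producer: one result per PEM. With the old single-read consumer, any send
    // after the buffer filled with no reader left would block forever, so the
    // stream adapter could never finish flushing the pxl script results.
    go func() {
        defer close(streamCh)
        for _, row := range []string{"pem-1", "pem-2", "pem-3"} {
            streamCh <- row
        }
    }()

    // Fixed consumer: drain until the channel is closed rather than reading a
    // single value and terminating the goroutine.
    go func() {
        defer close(done)
        for row := range streamCh {
            fmt.Println("received", row)
        }
    }()

    <-done
}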

File tree

10 files changed: +358 / -148 lines


src/pixie_cli/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ go_library(
         "@com_github_getsentry_sentry_go//:sentry-go",
         "@com_github_segmentio_analytics_go_v3//:analytics-go",
         "@com_github_sirupsen_logrus//:logrus",
+        "@com_github_spf13_viper//:viper",
     ],
 )

src/pixie_cli/pkg/cmd/collect_logs.go

Lines changed: 4 additions & 2 deletions
@@ -27,11 +27,12 @@ import (
     "github.com/spf13/viper"

     "px.dev/pixie/src/pixie_cli/pkg/utils"
-    "px.dev/pixie/src/utils/shared/k8s"
+    "px.dev/pixie/src/pixie_cli/pkg/vizier"
 )

 func init() {
     CollectLogsCmd.Flags().StringP("namespace", "n", "", "The namespace vizier is deployed in")
+    CollectLogsCmd.Flags().StringP("bundle", "b", "", "Path/URL to bundle file")
 }

 // CollectLogsCmd is the "deploy" command.
@@ -40,9 +41,10 @@ var CollectLogsCmd = &cobra.Command{
     Short: "Collect Pixie logs on the cluster",
     PreRun: func(cmd *cobra.Command, args []string) {
         viper.BindPFlag("namespace", cmd.Flags().Lookup("namespace"))
+        viper.BindPFlag("bundle", cmd.Flags().Lookup("bundle"))
     },
     Run: func(cmd *cobra.Command, args []string) {
-        c := k8s.NewLogCollector()
+        c := vizier.NewLogCollector(mustCreateBundleReader(), viper.GetString("cloud_addr"))
         fName := fmt.Sprintf("pixie_logs_%s.zip", time.Now().Format("20060102150405"))
         err := c.CollectPixieLogs(fName)
         if err != nil {

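A hedged usage note on the new flag above: something like `px collect-logs --bundle ./bundle.json` (or a URL) should point the CLI at a specific script bundle rather than the default; how the flag's value is resolved happens inside `mustCreateBundleReader()`, which is not part of this diff.
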
src/pixie_cli/pkg/cmd/deploy.go

Lines changed: 19 additions & 64 deletions
@@ -22,7 +22,6 @@ import (
2222
"context"
2323
"errors"
2424
"fmt"
25-
"io"
2625
"os"
2726
"strings"
2827
"time"
@@ -72,6 +71,7 @@ var BlockListedLabels = []string{
7271
}
7372

7473
func init() {
74+
DeployCmd.Flags().StringP("bundle", "b", "", "Path/URL to bundle file")
7575
DeployCmd.Flags().StringP("extract_yaml", "e", "", "Directory to extract the Pixie yamls to")
7676
DeployCmd.Flags().StringP("vizier_version", "v", "", "Pixie version to deploy")
7777
DeployCmd.Flags().BoolP("check", "c", true, "Check whether the cluster can run Pixie")
@@ -106,6 +106,7 @@ var DeployCmd = &cobra.Command{
106106
Use: "deploy",
107107
Short: "Deploys Pixie on the current K8s cluster",
108108
PreRun: func(cmd *cobra.Command, args []string) {
109+
viper.BindPFlag("bundle", cmd.Flags().Lookup("bundle"))
109110
viper.BindPFlag("extract_yaml", cmd.Flags().Lookup("extract_yaml"))
110111
viper.BindPFlag("vizier_version", cmd.Flags().Lookup("vizier_version"))
111112
viper.BindPFlag("check", cmd.Flags().Lookup("check"))
@@ -604,61 +605,6 @@ func deploy(cloudConn *grpc.ClientConn, clientset *kubernetes.Clientset, vzClien
604605
return clusterID
605606
}
606607

607-
func runSimpleHealthCheckScript(cloudAddr string, clusterID uuid.UUID) error {
608-
v, err := vizier.ConnectionToVizierByID(cloudAddr, clusterID)
609-
br := mustCreateBundleReader()
610-
if err != nil {
611-
return err
612-
}
613-
execScript := br.MustGetScript(script.AgentStatusScript)
614-
615-
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
616-
defer cancel()
617-
618-
resp, err := v.ExecuteScriptStream(ctx, execScript, nil)
619-
if err != nil {
620-
return err
621-
}
622-
623-
// TODO(zasgar): Make this use the Null output. We can't right now
624-
// because of fatal message on vizier failure.
625-
errCh := make(chan error)
626-
// Eat all responses.
627-
go func() {
628-
for {
629-
select {
630-
case <-ctx.Done():
631-
if ctx.Err() != nil {
632-
errCh <- ctx.Err()
633-
return
634-
}
635-
errCh <- nil
636-
return
637-
case msg := <-resp:
638-
if msg == nil {
639-
errCh <- nil
640-
return
641-
}
642-
if msg.Err != nil {
643-
if msg.Err == io.EOF {
644-
errCh <- nil
645-
return
646-
}
647-
errCh <- msg.Err
648-
return
649-
}
650-
if msg.Resp.Status != nil && msg.Resp.Status.Code != 0 {
651-
errCh <- errors.New(msg.Resp.Status.Message)
652-
}
653-
// Eat messages.
654-
}
655-
}
656-
}()
657-
658-
err = <-errCh
659-
return err
660-
}
661-
662608
func waitForHealthCheckTaskGenerator(cloudAddr string, clusterID uuid.UUID) func() error {
663609
return func() error {
664610
timeout := time.NewTimer(5 * time.Minute)
@@ -668,10 +614,15 @@ func waitForHealthCheckTaskGenerator(cloudAddr string, clusterID uuid.UUID) func
668614
case <-timeout.C:
669615
return errors.New("timeout waiting for healthcheck (it is possible that Pixie stabilized after the healthcheck timeout. To check if Pixie successfully deployed, run `px debug pods`)")
670616
default:
671-
err := runSimpleHealthCheckScript(cloudAddr, clusterID)
617+
_, err := vizier.RunSimpleHealthCheckScript(mustCreateBundleReader(), cloudAddr, clusterID)
672618
if err == nil {
673619
return nil
674620
}
621+
// The health check warning error indicates the cluster successfully deployed, but there are some warnings.
622+
// Return the error to end the polling and show the warnings.
623+
if _, ok := err.(*vizier.HealthCheckWarning); ok {
624+
return err
625+
}
675626
time.Sleep(5 * time.Second)
676627
}
677628
}
@@ -691,13 +642,17 @@ func waitForHealthCheck(cloudAddr string, clusterID uuid.UUID, clientset *kubern
691642
hc := utils.NewSerialTaskRunner(healthCheckJobs)
692643
err := hc.RunAndMonitor()
693644
if err != nil {
694-
_ = pxanalytics.Client().Enqueue(&analytics.Track{
695-
UserId: pxconfig.Cfg().UniqueClientID,
696-
Event: "Deploy Healthcheck Failed",
697-
Properties: analytics.NewProperties().
698-
Set("err", err.Error()),
699-
})
700-
utils.WithError(err).Fatal("Failed Pixie healthcheck")
645+
if _, ok := err.(*vizier.HealthCheckWarning); ok {
646+
utils.WithError(err).Error("Pixie healthcheck detected the following warnings:")
647+
} else {
648+
_ = pxanalytics.Client().Enqueue(&analytics.Track{
649+
UserId: pxconfig.Cfg().UniqueClientID,
650+
Event: "Deploy Healthcheck Failed",
651+
Properties: analytics.NewProperties().
652+
Set("err", err.Error()),
653+
})
654+
utils.WithError(err).Fatal("Failed Pixie healthcheck")
655+
}
701656
}
702657
_ = pxanalytics.Client().Enqueue(&analytics.Track{
703658
UserId: pxconfig.Cfg().UniqueClientID,

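The `*vizier.HealthCheckWarning` checks above rely on a standard Go pattern: a dedicated error type distinguishes "deployed, but with warnings" from a hard failure. The definition of `HealthCheckWarning` is not part of this diff, so the sketch below uses a stand-in type purely to illustrate the branching; the real type may look different.

package main

import (
    "errors"
    "fmt"
)

// healthCheckWarning is a stand-in for a type like vizier.HealthCheckWarning:
// an error meaning the cluster deployed successfully but with warnings.
type healthCheckWarning struct{ msg string }

func (w *healthCheckWarning) Error() string { return w.msg }

func report(err error) {
    if err == nil {
        fmt.Println("healthcheck passed")
        return
    }
    // The diff uses a direct type assertion (err.(*vizier.HealthCheckWarning));
    // errors.As does the same job and also handles wrapped errors.
    var w *healthCheckWarning
    if errors.As(err, &w) {
        fmt.Println("warning only:", w.msg) // surface warnings, don't treat as fatal
        return
    }
    fmt.Println("fatal:", err) // real failure path
}

func main() {
    report(nil)
    report(&healthCheckWarning{"missing kernel headers on 2 nodes"})
    report(errors.New("vizier unreachable"))
}
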
src/pixie_cli/pkg/cmd/root.go

Lines changed: 6 additions & 3 deletions
@@ -50,6 +50,7 @@ var (
 func init() {
     // Flags that are relevant to all sub-commands.

+    RootCmd.PersistentFlags().StringP("log_file", "", "", "The log file to redirect output to. if not set, logs will be printed to stdout.")
     RootCmd.PersistentFlags().StringP("cloud_addr", "a", defaultCloudAddr, "The address of Pixie Cloud")
     viper.BindPFlag("cloud_addr", RootCmd.PersistentFlags().Lookup("cloud_addr"))

@@ -92,6 +93,9 @@ func init() {
     RootCmd.AddCommand(DebugCmd)

     RootCmd.PersistentFlags().MarkHidden("cloud_addr")
+    // log_file is accessed in the cli's main func and as a result only works via the env var.
+    // Hide it from the help text to prevent confusion that the flag can be used.
+    RootCmd.PersistentFlags().MarkHidden("log_file")
     RootCmd.PersistentFlags().MarkHidden("dev_cloud_namespace")
     RootCmd.PersistentFlags().MarkHidden("do_not_track")

@@ -203,7 +207,6 @@ var RootCmd = &cobra.Command{

 // Name a variable to store a slice of commands that don't require cloudAddr
 var cmdsCloudAddrNotReqd = []*cobra.Command{
-    CollectLogsCmd,
     VersionCmd,
 }

@@ -245,7 +248,7 @@ func checkAuthForCmd(c *cobra.Command) {
         os.Exit(1)
     }
     switch c {
-    case DeployCmd, UpdateCmd, GetCmd, DeployKeyCmd, APIKeyCmd:
+    case CollectLogsCmd, DeployCmd, UpdateCmd, GetCmd, DeployKeyCmd, APIKeyCmd:
         utils.Errorf("These commands are unsupported in Direct Vizier mode.")
         os.Exit(1)
     default:
@@ -254,7 +257,7 @@ func checkAuthForCmd(c *cobra.Command) {
     }

     switch c {
-    case DeployCmd, UpdateCmd, RunCmd, LiveCmd, GetCmd, ScriptCmd, DeployKeyCmd, APIKeyCmd:
+    case CollectLogsCmd, DeployCmd, UpdateCmd, RunCmd, LiveCmd, GetCmd, ScriptCmd, DeployKeyCmd, APIKeyCmd:
         authenticated := auth.IsAuthenticated(viper.GetString("cloud_addr"))
         if !authenticated {
             utils.Errorf("Failed to authenticate. Please retry `px auth login`.")

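A hedged usage note for the hidden `log_file` flag above: per the commit message it is only honored through the `PX_LOG_FILE` environment variable, so an invocation along the lines of `PX_LOG_FILE=/tmp/px.log px deploy` should send the CLI's log output to `/tmp/px.log` instead of stdout, keeping the terminal spinners from garbling the logs while debugging.
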
src/pixie_cli/pkg/vizier/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ go_library(
         "data_formatter.go",
         "errors.go",
         "lister.go",
+        "logs.go",
         "script.go",
         "stream_adapter.go",
         "utils.go",

src/pixie_cli/pkg/vizier/logs.go

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2018- The Pixie Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package vizier
+
+import (
+    "archive/zip"
+    "context"
+    "errors"
+    "os"
+    "strings"
+
+    log "github.com/sirupsen/logrus"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/client-go/kubernetes"
+    "k8s.io/client-go/rest"
+
+    "px.dev/pixie/src/utils/script"
+    "px.dev/pixie/src/utils/shared/k8s"
+)
+
+// LogCollector collect logs for Pixie and cluster setup information.
+type LogCollector struct {
+    k8sConfig    *rest.Config
+    k8sClientSet *kubernetes.Clientset
+    cloudAddr    string
+    br           *script.BundleManager
+    k8s.LogCollector
+}
+
+// NewLogCollector creates a new log collector.
+func NewLogCollector(br *script.BundleManager, cloudAddr string) *LogCollector {
+    cfg := k8s.GetConfig()
+    cs := k8s.GetClientset(cfg)
+    return &LogCollector{
+        cfg,
+        cs,
+        cloudAddr,
+        br,
+        *k8s.NewLogCollector(),
+    }
+}
+
+// CollectPixieLogs collects logs for all Pixie pods and write them to the zip file fName.
+func (c *LogCollector) CollectPixieLogs(fName string) error {
+    if !strings.HasSuffix(fName, ".zip") {
+        return errors.New("fname must have .zip suffix")
+    }
+    f, err := os.Create(fName)
+    if err != nil {
+        return err
+    }
+    defer f.Close()
+
+    zf := zip.NewWriter(f)
+    defer zf.Close()
+
+    vls := k8s.VizierLabelSelector()
+    vizierLabelSelector := metav1.FormatLabelSelector(&vls)
+
+    // We check across all namespaces for the matching pixie pods.
+    vizierPodList, err := c.k8sClientSet.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{LabelSelector: vizierLabelSelector})
+    if err != nil {
+        return err
+    }
+
+    // We also need to get the logs the operator logs.
+    // As the LabelSelectors are ANDed, we need to make a new query and merge
+    // the results.
+    ols := k8s.OperatorLabelSelector()
+    operatorLabelSelector := metav1.FormatLabelSelector(&ols)
+
+    operatorPodList, err := c.k8sClientSet.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{LabelSelector: operatorLabelSelector})
+    if err != nil {
+        return err
+    }
+
+    // Merge the two pod lists
+    pods := append(vizierPodList.Items, operatorPodList.Items...)
+
+    for _, pod := range pods {
+        for _, containerStatus := range pod.Status.ContainerStatuses {
+            // Ignore prev logs, they might not exist.
+            _ = c.LogPodInfoToZipFile(zf, pod, containerStatus.Name, true)
+
+            err := c.LogPodInfoToZipFile(zf, pod, containerStatus.Name, false)
+            if err != nil {
+                log.WithError(err).Warnf("Failed to log pod: %s", pod.Name)
+            }
+        }
+        err = c.WritePodDescription(zf, pod)
+        if err != nil {
+            log.WithError(err).Warnf("failed to write pod description")
+        }
+    }
+
+    err = c.LogKubeCmd(zf, "nodes.log", "describe", "node")
+    if err != nil {
+        log.WithError(err).Warn("failed to log node info")
+    }
+
+    err = c.LogKubeCmd(zf, "services.log", "describe", "services", "--all-namespaces", "-l", vizierLabelSelector)
+    if err != nil {
+        log.WithError(err).Warnf("failed to log services")
+    }
+
+    // Describe vizier and write it to vizier.log
+    err = c.LogKubeCmd(zf, "vizier.log", "describe", "vizier", "--all-namespaces")
+    if err != nil {
+        log.WithError(err).Warnf("failed to log vizier crd")
+    }
+
+    clusterID, err := GetCurrentVizier(c.cloudAddr)
+    if err != nil {
+        log.WithError(err).Warnf("failed to get cluster ID")
+    }
+    outputCh, err := RunSimpleHealthCheckScript(c.br, c.cloudAddr, clusterID)
+
+    if err != nil {
+        entry := log.WithError(err)
+        if _, ok := err.(*HealthCheckWarning); ok {
+            entry.Warn("healthcheck script detected the following warnings:")
+        } else {
+            entry.Warn("failed to run healthcheck script")
+        }
+    }
+
+    return c.LogOutputToZipFile(zf, "px_agent_diagnostics.txt", <-outputCh)
+}

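Taken together with the collect_logs.go change above, the new collector is constructed as `vizier.NewLogCollector(mustCreateBundleReader(), viper.GetString("cloud_addr"))`, and `CollectPixieLogs` now finishes by running `RunSimpleHealthCheckScript` and writing whatever arrives on its output channel to `px_agent_diagnostics.txt` inside the zip, so the kernel-header diagnostics ride along with the usual pod and node logs.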