Skip to content

Commit

Permalink
Unexpectedly deleted pod metrics
Browse files Browse the repository at this point in the history
* Count when we see unexpectedly terminated pods

Signed-off-by: Andrew Dye <andrewwdye@gmail.com>
  • Loading branch information
iaroslav-ciupin authored and andrewwdye committed Sep 25, 2024
1 parent 693dfb8 commit 69991e1
Showing 1 changed file with 13 additions and 4 deletions.
17 changes: 13 additions & 4 deletions flytepropeller/pkg/controller/nodes/task/k8s/plugin_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"
"time"

"github.com/prometheus/client_golang/prometheus"
"golang.org/x/time/rate"
v1 "k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
Expand Down Expand Up @@ -64,6 +65,7 @@ type PluginMetrics struct {
GetCacheHit labeled.StopWatch
GetAPILatency labeled.StopWatch
ResourceDeleted labeled.Counter
TaskPodErrors *prometheus.CounterVec
}

func newPluginMetrics(s promutils.Scope) PluginMetrics {
Expand All @@ -77,6 +79,8 @@ func newPluginMetrics(s promutils.Scope) PluginMetrics {
time.Millisecond, s),
ResourceDeleted: labeled.NewCounter("pods_deleted", "Counts how many times CheckTaskStatus is"+
" called with a deleted resource.", s),
TaskPodErrors: s.MustNewCounterVec("task_pod_errors", "Counts how many times task pods failed in given phase with given code",
"phase", "error_code"),
}
}

Expand Down Expand Up @@ -350,14 +354,19 @@ func (e PluginManager) Handle(ctx context.Context, tCtx pluginsCore.TaskExecutio
return transition, err
}

phaseInfo := transition.Info()
if phaseInfo.Err() != nil {
e.metrics.TaskPodErrors.WithLabelValues(phaseInfo.Phase().String(), phaseInfo.Err().GetCode()).Inc()
}

// Add events since last update
- version := transition.Info().Version()
+ version := phaseInfo.Version()
lastEventUpdate := pluginState.LastEventUpdate
if e.eventWatcher != nil && o != nil {
nsName := k8stypes.NamespacedName{Namespace: o.GetNamespace(), Name: o.GetName()}
recentEvents := e.eventWatcher.List(nsName, lastEventUpdate)
if len(recentEvents) > 0 {
- taskInfo := transition.Info().Info()
+ taskInfo := phaseInfo.Info()
taskInfo.AdditionalReasons = make([]pluginsCore.ReasonInfo, 0, len(recentEvents))
for _, event := range recentEvents {
taskInfo.AdditionalReasons = append(taskInfo.AdditionalReasons,
Expand All @@ -373,9 +382,9 @@ func (e PluginManager) Handle(ctx context.Context, tCtx pluginsCore.TaskExecutio
newPluginState := PluginState{
Phase: pluginPhase,
K8sPluginState: k8s.PluginState{
- Phase: transition.Info().Phase(),
+ Phase: phaseInfo.Phase(),
  PhaseVersion: version,
- Reason: transition.Info().Reason(),
+ Reason: phaseInfo.Reason(),
},
LastEventUpdate: lastEventUpdate,
}
Expand Down

0 comments on commit 69991e1

Please sign in to comment.