From 0b3ac78d1922e05141ca2b3095bec0997b628062 Mon Sep 17 00:00:00 2001 From: Damien Grisonnet Date: Mon, 12 Jul 2021 15:35:36 +0200 Subject: [PATCH] pkg/resourceprovider: guard from negative metrics When serving the resource metrics API, prometheus-adapter may return negative values for pods/nodes memory and CPU usage. This happens because Prometheus sees counter resets which results in Prometheus interpolating data incorrectly to avoid the counter value going down. To prevent that, we need to add some guards in prometheus-adapter to replace the negative value by zero whenever it detects one. Signed-off-by: Damien Grisonnet --- pkg/resourceprovider/provider.go | 15 ++++-- pkg/resourceprovider/provider_test.go | 70 +++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 5 deletions(-) diff --git a/pkg/resourceprovider/provider.go b/pkg/resourceprovider/provider.go index 13d8c787..483377fa 100644 --- a/pkg/resourceprovider/provider.go +++ b/pkg/resourceprovider/provider.go @@ -19,6 +19,7 @@ package resourceprovider import ( "context" "fmt" + "math" "sync" "time" @@ -390,13 +391,17 @@ func (p *resourceProvider) runQuery(now pmodel.Time, queryInfo resourceQuery, re // associate the results back to each given pod or node res := make(queryResults, len(*rawRes.Vector)) - for _, val := range *rawRes.Vector { - if val == nil { - // skip empty values + for _, sample := range *rawRes.Vector { + // skip empty samples + if sample == nil { continue } - resKey := string(val.Metric[resourceLbl]) - res[resKey] = append(res[resKey], val) + // replace NaN and negative values by zero + if math.IsNaN(float64(sample.Value)) || sample.Value < 0 { + sample.Value = 0 + } + resKey := string(sample.Metric[resourceLbl]) + res[resKey] = append(res[resKey], sample) } return res, nil diff --git a/pkg/resourceprovider/provider_test.go b/pkg/resourceprovider/provider_test.go index cd005409..f3fe8d7b 100644 --- a/pkg/resourceprovider/provider_test.go +++ b/pkg/resourceprovider/provider_test.go @@ -17,6 +17,7 @@ limitations under the License. package resourceprovider import ( + "math" "time" corev1 "k8s.io/api/core/v1" @@ -206,6 +207,47 @@ var _ = Describe("Resource Metrics Provider", func() { Expect(times[0]).To(Equal(api.TimeInfo{Timestamp: pmodel.Time(10).Time(), Window: 1 * time.Minute})) }) + It("should return metrics of value zero when pod metrics have NaN or negative values", func() { + fakeProm.QueryResults = map[prom.Selector]prom.QueryResult{ + mustBuild(cpuQueries.contQuery.Build("", podResource, "some-ns", []string{cpuQueries.containerLabel}, labels.Everything(), "pod1", "pod3")): buildQueryRes("container_cpu_usage_seconds_total", + buildPodSample("some-ns", "pod1", "cont1", -1100.0, 10), + buildPodSample("some-ns", "pod1", "cont2", math.NaN(), 20), + buildPodSample("some-ns", "pod3", "cont1", -1300.0, 10), + buildPodSample("some-ns", "pod3", "cont2", 1310.0, 20), + ), + mustBuild(memQueries.contQuery.Build("", podResource, "some-ns", []string{cpuQueries.containerLabel}, labels.Everything(), "pod1", "pod3")): buildQueryRes("container_memory_working_set_bytes", + buildPodSample("some-ns", "pod1", "cont1", 3100.0, 11), + buildPodSample("some-ns", "pod1", "cont2", -3110.0, 21), + buildPodSample("some-ns", "pod3", "cont1", math.NaN(), 11), + buildPodSample("some-ns", "pod3", "cont2", -3310.0, 21), + ), + } + + By("querying for metrics for some pods") + times, metricVals, err := prov.GetPodMetrics( + types.NamespacedName{Namespace: "some-ns", Name: "pod1"}, + types.NamespacedName{Namespace: "some-ns", Name: "pod3"}, + ) + Expect(err).NotTo(HaveOccurred()) + + By("verifying that the reported times for each are the earliest times for each pod") + Expect(times).To(Equal([]api.TimeInfo{ + {Timestamp: pmodel.Time(10).Time(), Window: 1 * time.Minute}, + {Timestamp: pmodel.Time(10).Time(), Window: 1 * time.Minute}, + })) + + By("verifying that NaN and negative values were replaced by zero") + Expect(metricVals).To(HaveLen(2)) + Expect(metricVals[0]).To(ConsistOf( + metrics.ContainerMetrics{Name: "cont1", Usage: buildResList(0, 3100.0)}, + metrics.ContainerMetrics{Name: "cont2", Usage: buildResList(0, 0)}, + )) + Expect(metricVals[1]).To(ConsistOf( + metrics.ContainerMetrics{Name: "cont1", Usage: buildResList(0, 0)}, + metrics.ContainerMetrics{Name: "cont2", Usage: buildResList(1310.0, 0)}, + )) + }) + It("should be able to list metrics for nodes", func() { fakeProm.QueryResults = map[prom.Selector]prom.QueryResult{ mustBuild(cpuQueries.nodeQuery.Build("", nodeResource, "", nil, labels.Everything(), "node1", "node2")): buildQueryRes("container_cpu_usage_seconds_total", @@ -265,4 +307,32 @@ var _ = Describe("Resource Metrics Provider", func() { {}, })) }) + + It("should return metrics of value zero when node metrics have NaN or negative values", func() { + fakeProm.QueryResults = map[prom.Selector]prom.QueryResult{ + mustBuild(cpuQueries.nodeQuery.Build("", nodeResource, "", nil, labels.Everything(), "node1", "node2")): buildQueryRes("container_cpu_usage_seconds_total", + buildNodeSample("node1", -1100.0, 10), + buildNodeSample("node2", 1200.0, 14), + ), + mustBuild(memQueries.nodeQuery.Build("", nodeResource, "", nil, labels.Everything(), "node1", "node2")): buildQueryRes("container_memory_working_set_bytes", + buildNodeSample("node1", 2100.0, 11), + buildNodeSample("node2", math.NaN(), 12), + ), + } + By("querying for metrics for some nodes") + times, metricVals, err := prov.GetNodeMetrics("node1", "node2") + Expect(err).NotTo(HaveOccurred()) + + By("verifying that the reported times for each are the earliest times for each pod") + Expect(times).To(Equal([]api.TimeInfo{ + {Timestamp: pmodel.Time(10).Time(), Window: 1 * time.Minute}, + {Timestamp: pmodel.Time(12).Time(), Window: 1 * time.Minute}, + })) + + By("verifying that NaN and negative values were replaced by zero") + Expect(metricVals).To(Equal([]corev1.ResourceList{ + buildResList(0, 2100.0), + buildResList(1200.0, 0), + })) + }) })