pkg/resourceprovider: guard from negative metrics

When serving the resource metrics API, prometheus-adapter may return
negative values for pods/nodes memory and CPU usage. This happens
because Prometheus sees counter resets which results in Prometheus
interpolating data incorrectly to avoid the counter value going down.
To prevent that, we need to add some guards in prometheus-adapter to
replace the negative value by zero whenever it detects one.

Signed-off-by: Damien Grisonnet <dgrisonn@redhat.com>
This commit is contained in:
Damien Grisonnet 2021-07-12 15:35:36 +02:00
parent 93450fc29f
commit 0b3ac78d19
2 changed files with 80 additions and 5 deletions

View file

@ -19,6 +19,7 @@ package resourceprovider
import (
"context"
"fmt"
"math"
"sync"
"time"
@ -390,13 +391,17 @@ func (p *resourceProvider) runQuery(now pmodel.Time, queryInfo resourceQuery, re
// associate the results back to each given pod or node
res := make(queryResults, len(*rawRes.Vector))
for _, val := range *rawRes.Vector {
if val == nil {
// skip empty values
for _, sample := range *rawRes.Vector {
// skip empty samples
if sample == nil {
continue
}
resKey := string(val.Metric[resourceLbl])
res[resKey] = append(res[resKey], val)
// replace NaN and negative values by zero
if math.IsNaN(float64(sample.Value)) || sample.Value < 0 {
sample.Value = 0
}
resKey := string(sample.Metric[resourceLbl])
res[resKey] = append(res[resKey], sample)
}
return res, nil

View file

@ -17,6 +17,7 @@ limitations under the License.
package resourceprovider
import (
"math"
"time"
corev1 "k8s.io/api/core/v1"
@ -206,6 +207,47 @@ var _ = Describe("Resource Metrics Provider", func() {
Expect(times[0]).To(Equal(api.TimeInfo{Timestamp: pmodel.Time(10).Time(), Window: 1 * time.Minute}))
})
It("should return metrics of value zero when pod metrics have NaN or negative values", func() {
fakeProm.QueryResults = map[prom.Selector]prom.QueryResult{
mustBuild(cpuQueries.contQuery.Build("", podResource, "some-ns", []string{cpuQueries.containerLabel}, labels.Everything(), "pod1", "pod3")): buildQueryRes("container_cpu_usage_seconds_total",
buildPodSample("some-ns", "pod1", "cont1", -1100.0, 10),
buildPodSample("some-ns", "pod1", "cont2", math.NaN(), 20),
buildPodSample("some-ns", "pod3", "cont1", -1300.0, 10),
buildPodSample("some-ns", "pod3", "cont2", 1310.0, 20),
),
mustBuild(memQueries.contQuery.Build("", podResource, "some-ns", []string{cpuQueries.containerLabel}, labels.Everything(), "pod1", "pod3")): buildQueryRes("container_memory_working_set_bytes",
buildPodSample("some-ns", "pod1", "cont1", 3100.0, 11),
buildPodSample("some-ns", "pod1", "cont2", -3110.0, 21),
buildPodSample("some-ns", "pod3", "cont1", math.NaN(), 11),
buildPodSample("some-ns", "pod3", "cont2", -3310.0, 21),
),
}
By("querying for metrics for some pods")
times, metricVals, err := prov.GetPodMetrics(
types.NamespacedName{Namespace: "some-ns", Name: "pod1"},
types.NamespacedName{Namespace: "some-ns", Name: "pod3"},
)
Expect(err).NotTo(HaveOccurred())
By("verifying that the reported times for each are the earliest times for each pod")
Expect(times).To(Equal([]api.TimeInfo{
{Timestamp: pmodel.Time(10).Time(), Window: 1 * time.Minute},
{Timestamp: pmodel.Time(10).Time(), Window: 1 * time.Minute},
}))
By("verifying that NaN and negative values were replaced by zero")
Expect(metricVals).To(HaveLen(2))
Expect(metricVals[0]).To(ConsistOf(
metrics.ContainerMetrics{Name: "cont1", Usage: buildResList(0, 3100.0)},
metrics.ContainerMetrics{Name: "cont2", Usage: buildResList(0, 0)},
))
Expect(metricVals[1]).To(ConsistOf(
metrics.ContainerMetrics{Name: "cont1", Usage: buildResList(0, 0)},
metrics.ContainerMetrics{Name: "cont2", Usage: buildResList(1310.0, 0)},
))
})
It("should be able to list metrics for nodes", func() {
fakeProm.QueryResults = map[prom.Selector]prom.QueryResult{
mustBuild(cpuQueries.nodeQuery.Build("", nodeResource, "", nil, labels.Everything(), "node1", "node2")): buildQueryRes("container_cpu_usage_seconds_total",
@ -265,4 +307,32 @@ var _ = Describe("Resource Metrics Provider", func() {
{},
}))
})
It("should return metrics of value zero when node metrics have NaN or negative values", func() {
fakeProm.QueryResults = map[prom.Selector]prom.QueryResult{
mustBuild(cpuQueries.nodeQuery.Build("", nodeResource, "", nil, labels.Everything(), "node1", "node2")): buildQueryRes("container_cpu_usage_seconds_total",
buildNodeSample("node1", -1100.0, 10),
buildNodeSample("node2", 1200.0, 14),
),
mustBuild(memQueries.nodeQuery.Build("", nodeResource, "", nil, labels.Everything(), "node1", "node2")): buildQueryRes("container_memory_working_set_bytes",
buildNodeSample("node1", 2100.0, 11),
buildNodeSample("node2", math.NaN(), 12),
),
}
By("querying for metrics for some nodes")
times, metricVals, err := prov.GetNodeMetrics("node1", "node2")
Expect(err).NotTo(HaveOccurred())
By("verifying that the reported times for each are the earliest times for each pod")
Expect(times).To(Equal([]api.TimeInfo{
{Timestamp: pmodel.Time(10).Time(), Window: 1 * time.Minute},
{Timestamp: pmodel.Time(12).Time(), Window: 1 * time.Minute},
}))
By("verifying that NaN and negative values were replaced by zero")
Expect(metricVals).To(Equal([]corev1.ResourceList{
buildResList(0, 2100.0),
buildResList(1200.0, 0),
}))
})
})