From c912ebed77f7a6d08a34a5c27cdb995dea04801a Mon Sep 17 00:00:00 2001 From: Solly Ross Date: Tue, 7 Aug 2018 13:45:16 -0400 Subject: [PATCH 1/8] Fix naming typo in the config docs The config docs had a find/replace error in the naming section, leading to an erroneus `as` clause. This fixes that. --- docs/config.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/config.md b/docs/config.md index 43d5dee2..d5fccdf1 100644 --- a/docs/config.md +++ b/docs/config.md @@ -150,7 +150,7 @@ For example: # e.g. http_requests_total becomes http_requests_per_second name: matches: "^(.*)_total$" - as: "<<1}_per_second" + as: "${1}_per_second" ``` Querying From 9d7157f7ccc0a47ee7a15c36ede16725f283d5e0 Mon Sep 17 00:00:00 2001 From: Solly Ross Date: Mon, 20 Aug 2018 16:53:51 -0400 Subject: [PATCH 2/8] Reduce metric namer label-GVR logspam It's fairly common to have a label pattern that matches lots of irrelevant labels, so this makes the "error" about being unable to normalize/resolve a label to a GVR a V(9) info log, instead of an error log. 
--- pkg/custom-provider/metric_namer.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/custom-provider/metric_namer.go b/pkg/custom-provider/metric_namer.go index 7de4f69d..0c17090b 100644 --- a/pkg/custom-provider/metric_namer.go +++ b/pkg/custom-provider/metric_namer.go @@ -275,7 +275,8 @@ func (n *metricNamer) ResourcesForSeries(series prom.Series) ([]schema.GroupReso if groupRes, ok = n.labelResExtractor.GroupResourceForLabel(lbl); ok { info, _, err := provider.CustomMetricInfo{GroupResource: groupRes}.Normalized(n.mapper) if err != nil { - glog.Errorf("unable to normalize group-resource %s from label %q, skipping: %v", groupRes.String(), lbl, err) + // this is likely to show up for a lot of labels, so make it a verbose info log + glog.V(9).Infof("unable to normalize group-resource %s from label %q, skipping: %v", groupRes.String(), lbl, err) continue } From cfba614544840ce6c02d2dc178d07830942beeb9 Mon Sep 17 00:00:00 2001 From: Solly Ross Date: Mon, 20 Aug 2018 17:12:27 -0400 Subject: [PATCH 3/8] Remove extraneous `/adapter` from deployment args There was an extraneous `/adapter` in the deployment args that was causing people issues. This removes it. 
--- deploy/manifests/custom-metrics-apiserver-deployment.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/deploy/manifests/custom-metrics-apiserver-deployment.yaml b/deploy/manifests/custom-metrics-apiserver-deployment.yaml index 0930fa03..b36d517e 100644 --- a/deploy/manifests/custom-metrics-apiserver-deployment.yaml +++ b/deploy/manifests/custom-metrics-apiserver-deployment.yaml @@ -21,7 +21,6 @@ spec: - name: custom-metrics-apiserver image: directxman12/k8s-prometheus-adapter-amd64 args: - - /adapter - --secure-port=6443 - --tls-cert-file=/var/run/serving-cert/serving.crt - --tls-private-key-file=/var/run/serving-cert/serving.key From 9f08038f07f523a5a56343d4e9b10b4adc927e6f Mon Sep 17 00:00:00 2001 From: Solly Ross Date: Tue, 7 Aug 2018 15:11:47 -0400 Subject: [PATCH 4/8] Add a config walkthrough and update the readme A helful community member rightly pointed out that configuring the adapter was a bit confusing, and a step-by-step example would be useful. This adds such an example, and links to it from relevant places. --- README.md | 7 ++ docs/config-walkthrough.md | 230 +++++++++++++++++++++++++++++++++++++ docs/config.md | 7 ++ docs/walkthrough.md | 5 +- 4 files changed, 248 insertions(+), 1 deletion(-) create mode 100644 docs/config-walkthrough.md diff --git a/README.md b/README.md index 34da55c6..04207b3c 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,13 @@ metrics API suitable for use with the autoscaling/v2 Horizontal Pod Autoscaler in Kubernetes 1.6+. +Quick Links +----------- + +- [Config walkthrough](docs/config-walkthrough.md) and [config reference](docs/config.md). 
+- [End-to-end walkthrough](docs/walkthrough.md) +- [Deployment info and files](deploy/README.md) + Configuration ------------- diff --git a/docs/config-walkthrough.md b/docs/config-walkthrough.md new file mode 100644 index 00000000..e71ae1a5 --- /dev/null +++ b/docs/config-walkthrough.md @@ -0,0 +1,230 @@ +Configuration Walkthroughs +========================== + +*If you're looking for reference documentation on configuration, please +read the the [configuration reference](/docs/config.md)* + +Per-pod HTTP Requests +--------------------- + +### Background + +*The [full walkthrough](/docs/walkthrough.md) sets up a the background for +something like this* + +Suppose we have some frontend webserver, and we're trying to write an +configuration for the Promtheus adapter so that we can autoscale it based +on the HTTP requests per second that it receives. + +Before starting, we've gone and instrumented our frontend server with +a metric, `http_requests_total`. It is exposed with a single label, +`method`, breaking down the requests by HTTP verb. + +We've configured our Prometheus to collect the metric, and our promethues +adds the `kubernetes_namespace` and `kubernetes_pod_name` labels, +representing namespace and pod, respectively. + +If we query Prometheus, we see series that look like + +``` +http_requests_total{method="GET",kubernetes_namespace="production",kubernetes_pod_name="frontend-server-abcd-0123"} +``` + +### Configuring the adapter + +The adapter considers metrics in the following ways: + +1. First, It discovers the metrics available (*Discovery*) + +2. Then, it figures out which Kubernetes resources each metric is + associated with (*Association*) + +3. Then, it figures out how it should expose them to the custom metrics + API (*Naming*) + +4. 
Finally, it figures out how it should query Prometheus to get the + actual numbers (*Querying*) + +We need to inform the adapter how it should perform each of these steps +for our metric, `http_requests_total`, so we'll need to add a new +***rule***. Each rule in the adapter encodes these steps. Let's add a new +one to our configuration: + +```yaml +rules: +- {} +``` + +If we want to find all `http_requests_total` series ourselves in the +Prometheus dashboard, we'd write +`http_requests_total{kubernetes_namespace!="",kubernetes_pod_name!=""}` to +find all find all `http_requests_total` series that were associated with +a namespace and pod. + +We can add this to our rule in the `seriesQuery` field, to tell the +adapter how *discover* the right series itself: + +```yaml +rules: +- seriesQuery: 'http_requests_total{kubernetes_namespace!="",kubernetes_pod_name!=""}' +``` + +Next, we'll need to tell the adapter how to figure out which Kubernetes +resources are associated with the metric. We've already said that +`kubernetes_namespace` represents the namespace name, and +`kubernetes_pod_name` represents the pod name. Since these names don't +quite follow a consistent pattern, we use the `overrides` section of the +`resources` field in our rule: + +```yaml +rules: +- seriesQuery: 'http_requests_total{kubernetes_namespace!="",kubernetes_pod_name!=""}' + resources: + overrides: + kubernetes_namespace: {resource: "namespace"} + kubernetes_pod_name: {resource: "pod"} +``` + +This says that each label represents its corresponding resource. Since the +resources are in the "core" kubernetes API, we don't need to specify +a group. The adapter will automatically take care of pluralization, so we +can specify either `pod` or `pods`, just the same way as in `kubectl get`. +The resources can be any resource available in your kubernetes cluster, as +long as you've got a corresponding label. 
If our labels followed a consistent pattern, like `kubernetes_<resource>`,
+ +We can write something similar in the adapter, using the `metricsQuery` +field: + +```yaml +rules: +- seriesQuery: 'http_requests_total{kubernetes_namespace!="",kubernetes_pod_name!=""}' + resources: + overrides: + kubernetes_namespace: {resource: "namespace"} + kubernetes_pod_name: {resource: "pod"} + name: + matches: "^(.*)_total" + as: "${1}_per_second" + metricsQuery: 'sum(rate(<<.Series>>{<<.LabelMatchers>>}[2m])) by (<<.GroupBy>>)' +``` + +The adapter will automatically fill in the right series name, label +matchers, and group-by clause, depending on what we put into the API. +Since we're only working with a single metric anyway, we could replace +`<<.Series>>` with `http_requests_total`. + +Now, if we run an instance of the Prometheus adapter with this +configuration, we should see discovery information at +`$KUBERNETES/apis/custom.metrics.k8s.io/v1beta1/` of + +```json +{ + "kind": "APIResourceList", + "apiVersion": "v1", + "groupVersion": "custom.metrics.k8s.io/v1beta1", + "resources": [ + { + "name": "pods/http_requests_total", + "singularName": "", + "namespaced": true, + "kind": "MetricValueList", + "verbs": ["get"] + }, + { + "name": "namespaces/http_requests_total", + "singularName": "", + "namespaced": false, + "kind": "MetricValueList", + "verbs": ["get"] + } + ] +} +``` + +Notice that we get an entry for both "pods" and "namespaces" -- the +adapter exposes the metric on each resource that we've associated the +metric with (and all namespaced resources must be associated with +a namespace), and will fill in the `<<.GroupBy>>` section with the +appropriate label depending on which we ask for. 
+ +We can now connect to +`$KUBERNETES/apis/custom.metrics.k8s.io/v1beta1/namespaces/production/pods/*/http_requests_per_second`, +and we should see + +```json +{ + "kind": "MetricValueList", + "apiVersion": "custom.metrics.k8s.io/v1beta1", + "metadata": { + "selfLink": "/apis/custom.metrics.k8s.io/v1beta1/namespaces/production/pods/*/http_requests_per_second", + }, + "items": [ + { + "describedObject": { + "kind": "Pod", + "name": "frontend-server-abcd-0123", + "apiVersion": "/__internal", + }, + "metricName": "http_requests_per_second", + "timestamp": "2018-08-07T17:45:22Z", + "value": "16m" + }, + { + "describedObject": { + "kind": "Pod", + "name": "frontend-server-abcd-4567", + "apiVersion": "/__internal", + }, + "metricName": "http_requests_per_second", + "timestamp": "2018-08-07T17:45:22Z", + "value": "22m" + } + ] +} +``` + +This says that our server pods are receiving 16 and 22 milli-requests per +second (depending on the pod), which is 0.016 and 0.022 requests per +second, written out as a decimal. That's about what we'd expect with +little-to-no traffic except for the Prometheus scrape. + +If we added some traffic to our pods, we might see `1` or `20` instead of +`16m`, which would be `1` or `20` requests per second. We might also see +`20500m`, which would mean 20500 milli-requests per second, or 20.5 +requests per second in decimal form. diff --git a/docs/config.md b/docs/config.md index d5fccdf1..e4abb029 100644 --- a/docs/config.md +++ b/docs/config.md @@ -1,6 +1,10 @@ Metrics Discovery and Presentation Configuration ================================================ +*If you want a full walkthrough of configuring the adapter for a sample +metric, please read the [configuration +walkthrough](/docs/config-walkthrough.md)* + The adapter determines which metrics to expose, and how to expose them, through a set of "discovery" rules. 
Each rule is executed independently (so make sure that your rules are mutually exclusive), and specifies each @@ -123,6 +127,9 @@ resource: These two can be combined, so you can specify both a template and some individual overrides. +The resources mentioned can be any resource available in your kubernetes +cluster, as long as you've got a corresponding label. + Naming ------ diff --git a/docs/walkthrough.md b/docs/walkthrough.md index fb285463..19b6ae10 100644 --- a/docs/walkthrough.md +++ b/docs/walkthrough.md @@ -146,7 +146,10 @@ You may also need to modify the ConfigMap containing the metrics discovery configuration. If you're using the Prometheus configuration described above, it should work out of the box in common cases. Otherwise, read the [configuration documentation](/docs/config.md) to learn how to configure -the adapter for your particular metrics and labels. +the adapter for your particular metrics and labels. The [configuration +walkthrough](/docs/config-walkthrough.md) gives an end-to-end +configuration tutorial for configure the adapter for a scenario similar to +this one. ### The Registered API ### From 77614d151a89a6490eacafa07dcf7a84252c30c9 Mon Sep 17 00:00:00 2001 From: Solly Ross Date: Tue, 7 Aug 2018 15:19:06 -0400 Subject: [PATCH 5/8] Add a note on resource quanities to walkthrough Quantities continue to confuse people, so adding concrete examples should help. --- docs/walkthrough.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/walkthrough.md b/docs/walkthrough.md index 19b6ae10..42b8ad6e 100644 --- a/docs/walkthrough.md +++ b/docs/walkthrough.md @@ -273,6 +273,18 @@ Try fetching the metrics again. You should see an increase in the rate after the collection interval specified in your Prometheus configuration has elapsed. If you leave it for a bit, the rate will go back down again. +Notice that the API uses Kubernetes-style quantities to describe metric +values. 
These quantities use SI suffixes instead of decimal points. The +most common to see in the metrics API is the `m` suffix, which means +milli-units, or 1000ths of a unit. If your metric is exactly a whole +number of units on the nose, you might not see a suffix. Otherwise, you'll +probably see an `m` suffix to represent fractions of a unit. + +For example, here, `500m` would be half a request per second, `10` would +be 10 requests per second, and `10500m` would be `10.5` requests per +second. + + ### Troubleshooting Missing Metrics If the metric does not appear, or is not registered with the right From 5118c9ee1eaa669926f94d15b836154469f56175 Mon Sep 17 00:00:00 2001 From: Solly Ross Date: Tue, 7 Aug 2018 15:47:49 -0400 Subject: [PATCH 6/8] Add some FAQs to the README This adds some FAQs to the README containing information about certs, labelling, configuration, quantities, and multiple metrics. --- README.md | 90 +++++++++++++++++++++++++++++++++++++++++++++ docs/config.md | 2 +- docs/walkthrough.md | 2 + 3 files changed, 93 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 04207b3c..44dfa484 100644 --- a/README.md +++ b/README.md @@ -83,3 +83,93 @@ attention to: Operator](https://github.com/luxas/kubeadm-workshop#deploying-the-prometheus-operator-for-monitoring-services-in-the-cluster) - [Setting up the custom metrics adapter and sample app](https://github.com/luxas/kubeadm-workshop#deploying-a-custom-metrics-api-server-and-a-sample-app) + +FAQs +---- + +### Why do my metrics keep jumping between a normal value and a very large number? + +You're probably switching between whole numbers (e.g. `10`) and milli-quantities (e.g. `10500m`). +Worry not! This is just how Kubernetes represents fractional values. See the +[Quantity Values](/docs/walkthrough.md#quantity-values) section of the walkthrough for a bit more +information. + +### Why isn't my metric showing up? + +First, check your configuration. Does it select your metric? 
You can +find the [default configuration](/deploy/custom-metrics-config-map.yaml) +in the deploy directory, and more information about configuring the +adapter in the [docs](/docs/config.md). + +Next, check if the discovery information looks right. You should see the +metrics showing up as associated with the resources you expect at +`/apis/custom.metrics.k8s.io/v1beta1/` (you can use `kubectl get --raw +/apis/custom.metrics.k8s.io/v1beta1` to check, and can pipe to `jq` to +pretty-print the results, if you have it installed). If not, make sure +your series are labeled correctly. Consumers of the custom metrics API +(especially the HPA) don't do any special logic to associate a particular +resource to a particular series, so you have to make sure that the adapter +does it instead. + +For example, if you want a series `foo` to be associated with deployment +`bar` in namespace `somens`, make sure there's some label that represents +deployment name, and that the adapter is configured to use it. With the +default config, that means you'd need the query +`foo{namespace="somens",deployment="bar"}` to return some results in +Prometheus. + +Next, try using the `--v=6` flag on the adapter to see the exact queries +being made by the adapter. Try url-decoding the query and pasting it into +the Prometheus web console to see if the query looks wrong. + +### My query contains multiple metrics, how do I make that work? + +It's actually fairly straightforward, if a bit non-obvious. Simply choose one +metric to act as the "discovery" and "naming" metric, and use that to configure +the "discovery" and "naming" parts of the configuration. Then, you can write +whichever metrics you want in the `metricsQuery`! The series query can contain +whichever metrics you want, as long as they have the right set of labels. 
+ +For example, if you have two metrics `foo_total` and `foo_count`, you might write + +```yaml +rules: +- seriesQuery: 'foo_total' + resources: {overrides: {system_name: {resource: "node"}}} + name: + matches: 'foo_total' + as: 'foo' + metricsQuery: 'sum(foo_total) by (<<.GroupBy>>) / sum(foo_count) by (<<.GroupBy>>)' +``` + +### I get errors about SubjectAccessReviews/system:anonymous/TLS/Certificates/RequestHeader! + +It's important to understand the role of TLS in the Kubernetes cluster. There's a high-level +overview here: https://github.com/kubernetes-incubator/apiserver-builder/blob/master/docs/concepts/auth.md. + +All of the above errors generally boil down to misconfigured certificates. +Specifically, you'll need to make sure your cluster's aggregation layer is +properly configured, with requestheader certificates set up properly. + +Errors about SubjectAccessReviews failing for system:anonymous generally mean +that your cluster's given requestheader CA doesn't trust the proxy certificates +from the API server aggregator. + +On the other hand, if you get an error from the aggregator about invalid certificates, +it's probably because the CA specified in the `caBundle` field of your APIService +object doesn't trust the serving certificates for the adapter. + +If you're seeing SubjectAccessReviews failures for non-anonymous users, check your +RBAC rules -- you probably haven't given users permission to operate on resources in +the `custom.metrics.k8s.io` API group. + +### My metrics appear and disappear + +You probably have a Prometheus collection interval or computation interval +that's larger than your adapter's discovery interval. If the metrics +appear in discovery but occaisionally return not-found, those intervals +are probably larger than one of the rate windows used in one of your +queries. 
The adapter only considers metrics with datapoints in the window +`[now-discoveryInterval, now]` (in order to only capture metrics that are +still present), so make sure that your discovery interval is at least as +large as your collection interval. diff --git a/docs/config.md b/docs/config.md index e4abb029..64a3c6cd 100644 --- a/docs/config.md +++ b/docs/config.md @@ -188,7 +188,7 @@ Kubernetes resources. Then, if someone requested the metric `pods/http_request_per_second` for the pods `pod1` and `pod2` in the `somens` namespace, we'd have: -- `Series: "http_requests_total" +- `Series: "http_requests_total"` - `LabelMatchers: "pod=~\"pod1|pod2",namespace="somens"` - `GroupBy`: `pod` diff --git a/docs/walkthrough.md b/docs/walkthrough.md index 42b8ad6e..e2da48d5 100644 --- a/docs/walkthrough.md +++ b/docs/walkthrough.md @@ -273,6 +273,8 @@ Try fetching the metrics again. You should see an increase in the rate after the collection interval specified in your Prometheus configuration has elapsed. If you leave it for a bit, the rate will go back down again. +### Quantity Values + Notice that the API uses Kubernetes-style quantities to describe metric values. These quantities use SI suffixes instead of decimal points. The most common to see in the metrics API is the `m` suffix, which means From a1f4aab6d4e8d02cc30f0743ed3e91bfec473c09 Mon Sep 17 00:00:00 2001 From: Solly Ross Date: Wed, 8 Aug 2018 17:09:47 -0400 Subject: [PATCH 7/8] Restructure walkthrough This restructures the walkthrough to focus on the goal of scaling an application from the start. We start out with an application and an autoscaler, and then walk through how we can make the autoscaler actually able to function. 
--- docs/walkthrough.md | 472 ++++++++++++++++++++++---------------------- 1 file changed, 231 insertions(+), 241 deletions(-) diff --git a/docs/walkthrough.md b/docs/walkthrough.md index e2da48d5..e0236091 100644 --- a/docs/walkthrough.md +++ b/docs/walkthrough.md @@ -8,7 +8,7 @@ metrics sourced from the adapter. Prerequisites ------------- -### Cluster Configuration ### +### Cluster Configuration Before getting started, ensure that the main components of your cluster are configured for autoscaling on custom metrics. As of @@ -29,193 +29,52 @@ Note that most of the API versions in this walkthrough target Kubernetes Kubernetes 1.8+. Version 0.1.0 works with Kubernetes 1.7, but is significantly different. -### Binaries and Images ### +### Binaries and Images In order to follow this walkthrough, you'll need container images for Prometheus and the custom metrics adapter. -Prometheus can be found at `prom/prometheus` on Dockerhub. The adapter -has different images for each arch, and can be found at +It's easiest to deploy Prometheus with the [Prometheus +Operator](https://coreos.com/operators/prometheus/docs/latest/), which +makes it easy to get up and running with Prometheus. This walkthrough +will assume you're planning on doing that -- if you've deployed it by hand +instead, you'll need to make a few adjustments to the way you expose +metrics to Prometheus. + +The adapter has different images for each arch, and can be found at `directxman12/k8s-prometheus-adapter-${ARCH}`. For instance, if you're on an x86_64 machine, use the `directxman12/k8s-prometheus-adapter-amd64` image. If you're feeling adventurous, you can build the latest version of the -custom metrics adapter by running `make docker-build`. +custom metrics adapter by running `make docker-build` or `make +build-local-image`. 
-Launching Prometheus and the Adapter ------------------------------------- +Special thanks to [@luxas](https://github.com/luxas) for providing the +demo application for this walkthrough. -In this walkthrough, it's assumed that you're deploying Prometheus into -its own namespace called `prom`. Most of the sample commands and files -are namespace-agnostic, but there are a few commands that rely on -namespace. If you're using a different namespace, simply substitute that -in for `prom` when it appears. +The Scenario +------------ -### Prometheus Configuration ### +Suppose that you've written some new web application, and you know it's +the next best thing since sliced bread. It's ready to unveil to the +world... except you're not sure that just one instance will handle all the +traffic once it goes viral. Thankfully, you've got Kubernetes. -It's reccomended to use the [Prometheus -Operator](https://coreos.com/operators/prometheus/docs/latest/) to deploy -Prometheus. It's a lot easier than configuring Prometheus by hand. Note -that the Prometheus operator rules rename some labels if they conflict -with its automatic labels, so you may have to tweak the adapter -configuration slightly. - -If you don't want to use the Prometheus Operator, you'll have to deploy -Prometheus with a hand-written configuration. Below, you can find the -relevant parts of the configuration that are expected for this -walkthrough. See the Prometheus documentation on [configuring -Prometheus](https://prometheus.io/docs/operating/configuration/) for more -information. - -For the purposes of this walkthrough, you'll need the following -configuration options to be set: - -
- -prom-cfg.yaml - -```yaml -# a short scrape interval means you can respond to changes in -# metrics more quickly -global: - scrape_interval: 15s - -# you need a scrape configuration for scraping from pods -scrape_configs: -- job_name: 'kubernetes-pods' - # if you want to use metrics on jobs, set the below field to - # true to prevent Prometheus from setting the `job` label - # automatically. - honor_labels: false - kubernetes_sd_configs: - - role: pod - # skip verification so you can do HTTPS to pods - tls_config: - insecure_skip_verify: true - # make sure your labels are in order - relabel_configs: - # these labels tell Prometheus to automatically attach source - # pod and namespace information to each collected sample, so - # that they'll be exposed in the custom metrics API automatically. - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - # these labels tell Prometheus to look for - # prometheus.io/{scrape,path,port} annotations to configure - # how to scrape - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] - action: replace - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - target_label: __address__ - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme] - action: replace - target_label: __scheme__ - regex: (.+) -``` - -
- -Ensure that your Prometheus is up and running by accessing the Prometheus -dashboard, and checking on the labels on those metrics. You'll need the -label names for configuring the adapter. - -### Creating the Resources and Launching the Deployment ### - -The [deploy/manifests](/deploy/manifests) directory contains the -appropriate files for creating the Kubernetes objects to deploy the -adapter. - -See the [deployment README](/deploy/README.md) for more information about -the steps to deploy the adapter. Note that if you're deploying on -a non-x86_64 (amd64) platform, you'll need to change the `image` field in -the Deployment to be the appropriate image for your platform. - -You may also need to modify the ConfigMap containing the metrics discovery -configuration. If you're using the Prometheus configuration described -above, it should work out of the box in common cases. Otherwise, read the -[configuration documentation](/docs/config.md) to learn how to configure -the adapter for your particular metrics and labels. The [configuration -walkthrough](/docs/config-walkthrough.md) gives an end-to-end -configuration tutorial for configure the adapter for a scenario similar to -this one. - -### The Registered API ### - -As part of the creation of the adapter Deployment and associated objects -(performed above), we registered the API with the API aggregator (part of -the main Kubernetes API server). - -The API is registered as `custom.metrics.k8s.io/v1beta1`, and you can find -more information about aggregation at [Concepts: -Aggregation](https://github.com/kubernetes-incubator/apiserver-builder/blob/master/docs/concepts/aggregation.md). - -If you're deploying into production, you'll probably want to modify the -APIService object to contain the CA used to sign your serving -certificates. 
- -To do this, first base64-encode the CA (assuming it's stored in -/tmp/ca.crt): - -```shell -$ base64 -w 0 < /tmp/ca.crt -``` - -Then, edit the APIService and place the encoded contents into the -`caBundle` field under `spec`, and removing the `insecureSkipTLSVerify` -field in the same location: - -```shell -$ kubectl edit apiservice v1beta1.custom.metrics.k8s.io -``` - -This ensures that the API aggregator checks that the API is being served -by the server that you expect, by verifying the certificates. - -### Double-Checking Your Work ### - -With that all set, your custom metrics API should show up in discovery. - -Try fetching the discovery information for it: - -```shell -$ kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 -``` - -Since you don't have any metrics collected yet, you shouldn't see any -available resources, but the request should return successfully. Keep -this command in mind -- you'll want to use it later once you have a pod -producing custom metrics. - -Collecting Application Metrics ------------------------------- - -Now that you have a working pipeline for ingesting application metrics, -you'll need an application that produces some metrics. Any application -which produces Prometheus-formatted metrics will do. For the purposes of -this walkthrough, try out [@luxas](https://github.com/luxas)'s simple HTTP -counter in the `luxas/autoscale-demo` image on Dockerhub: +Deploy your app into your cluster, exposed via a service so that you can +send traffic to it and fetch metrics from it:
sample-app.deploy.yaml ```yaml -apiVersion: apps/v1beta1 +apiVersion: apps/v1 kind: Deployment metadata: name: sample-app + labels: + app: sample-app spec: replicas: 1 selector: @@ -225,13 +84,6 @@ spec: metadata: labels: app: sample-app - annotations: - # if you're not using the Operator, you'll need these annotations - # otherwise, configure the operator to collect metrics from - # the sample-app service on port 80 at /metrics - prometheus.io/scrape: true - prometheus.io/port: 8080 - prometheus.io/path: "/metrics" spec: containers: - image: luxas/autoscale-demo:v0.1.2 @@ -243,75 +95,23 @@ spec:
-Create this deployment, and expose it so that you can easily trigger -increases in metrics: - -```yaml +```shell $ kubectl create -f sample-app.deploy.yaml $ kubectl create service clusterip sample-app --tcp=80:8080 ``` -This sample application provides some metrics on the number of HTTP -requests it receives. Consider the metric `http_requests_total`. First, -check that it appears in discovery using the command from [Double-Checking -Yor Work](#double-checking-your-work). The cumulative Prometheus metric -`http_requests_total` should have become the custom-metrics-API rate -metric `pods/http_requests`. Check out its value: - -```shell -$ kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/default/pods/*/http_requests?selector=app%3Dsample-app" -``` - -It should be zero, since you're not currently accessing it. Now, create -a few requests with curl: +Now, check your app, which exposes metrics and counts the number of +accesses to the metrics page via the `http_requests_total` metric: ```shell $ curl http://$(kubectl get service sample-app -o jsonpath='{ .spec.clusterIP }')/metrics ``` -Try fetching the metrics again. You should see an increase in the rate -after the collection interval specified in your Prometheus configuration -has elapsed. If you leave it for a bit, the rate will go back down again. +Notice that each time you access the page, the counter goes up. -### Quantity Values - -Notice that the API uses Kubernetes-style quantities to describe metric -values. These quantities use SI suffixes instead of decimal points. The -most common to see in the metrics API is the `m` suffix, which means -milli-units, or 1000ths of a unit. If your metric is exactly a whole -number of units on the nose, you might not see a suffix. Otherwise, you'll -probably see an `m` suffix to represent fractions of a unit. 
- -For example, here, `500m` would be half a request per second, `10` would -be 10 requests per second, and `10500m` would be `10.5` requests per -second. - - -### Troubleshooting Missing Metrics - -If the metric does not appear, or is not registered with the right -resources, you might need to modify your [metrics discovery -configuration](/docs/config.md), as mentioned above. Check your labels via -the Prometheus dashboard, and then modify the configuration appropriately. - -As noted in the main [README](/README.md), you'll need to also make sure -your metrics relist interval is at least your Prometheus scrape interval. -If it's less that that, you'll see metrics periodically appear and -disappear from the adapter. - -Autoscaling ------------ - -Now that you have an application which produces custom metrics, you'll be -able to autoscale on it. As noted in the [HorizontalPodAutoscaler -walkthrough](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics), -there are three different types of metrics that the -HorizontalPodAutoscaler can handle. - -In this walkthrough, you've exposed some metrics that can be consumed -using the `Pods` metric type. - -Create a description for the HorizontalPodAutoscaler (HPA): +Now, you'll want to make sure you can autoscale your application on that +metric, so that you're ready for your launch. You can use +a HorizontalPodAutoscaler like this to accomplish the autoscaling:
@@ -346,25 +146,215 @@ spec:
-Create the HorizontalPodAutoscaler with
+If you try creating that now (and take a look at your controller-manager
+logs), you'll see that the HorizontalPodAutoscaler controller is
+attempting to fetch metrics from
+`/apis/custom.metrics.k8s.io/v1beta1/namespaces/default/pods/*/http_requests?selector=app%3Dsample-app`,
+but right now, nothing's serving that API.
+Before you can autoscale your application, you'll need to make sure that
+Kubernetes can read the metrics that your application exposes.
+
+Launching Prometheus and the Adapter
+------------------------------------
+
+In order to expose metrics beyond CPU and memory to Kubernetes for
+autoscaling, you'll need an "adapter" that serves the custom metrics API.
+Since you've got Prometheus metrics, it makes sense to use the
+Prometheus adapter to serve metrics out of Prometheus.
+
+### Launching Prometheus
+
+First, you'll need to deploy the Prometheus Operator. Check out the
+[getting started
+guide](https://coreos.com/operators/prometheus/docs/latest/user-guides/getting-started.html)
+for the Operator to deploy a copy of Prometheus.
+
+This walkthrough assumes that Prometheus is deployed in the `prom`
+namespace. Most of the sample commands and files are namespace-agnostic,
+but there are a few commands or pieces of configuration that rely on
+namespace. If you're using a different namespace, simply substitute that
+in for `prom` when it appears.
+
+### Monitoring Your Application
+
+In order to monitor your application, you'll need to set up
+a ServiceMonitor pointing at the application. Assuming you've set up your
+Prometheus instance to use ServiceMonitors with the `app: sample-app`
+label, create a ServiceMonitor to monitor the app's metrics via the
+service:
+
+
+ +service-monitor.yaml + +```yaml +kind: ServiceMonitor +apiVersion: monitoring.coreos.com/v1 +metadata: + name: sample-app + labels: + app: sample-app +spec: + selector: + matchLabels: + app: sample-app + endpoints: + - port: http ``` + +
+
+```shell
+$ kubectl create -f service-monitor.yaml
+```
+
+Now, you should see your metrics appear in your Prometheus instance. Look
+them up via the dashboard, and make sure they have the `namespace` and
+`pod` labels.
+
+### Launching the Adapter
+
+Now that you've got a running copy of Prometheus that's monitoring your
+application, you'll need to deploy the adapter, which knows how to
+communicate with both Kubernetes and Prometheus, acting as a translator
+between the two.
+
+The [deploy/manifests](/deploy/manifests) directory contains the
+appropriate files for creating the Kubernetes objects to deploy the
+adapter.
+
+See the [deployment README](/deploy/README.md) for more information about
+the steps to deploy the adapter. Note that if you're deploying on
+a non-x86_64 (amd64) platform, you'll need to change the `image` field in
+the Deployment to be the appropriate image for your platform.
+
+The default adapter configuration should work for this walkthrough and
+a standard Prometheus Operator configuration, but if you've got custom
+relabelling rules, or your labels above weren't exactly `namespace` and
+`pod`, you may need to edit the configuration in the ConfigMap. The
+[configuration walkthrough](/docs/config-walkthrough.md) provides an
+overview of how configuration works.
+
+### The Registered API
+
+As part of the creation of the adapter Deployment and associated objects
+(performed above), we registered the API with the API aggregator (part of
+the main Kubernetes API server).
+
+The API is registered as `custom.metrics.k8s.io/v1beta1`, and you can find
+more information about aggregation at [Concepts:
+Aggregation](https://github.com/kubernetes-incubator/apiserver-builder/blob/master/docs/concepts/aggregation.md).
+
+### Double-Checking Your Work
+
+With that all set, your custom metrics API should show up in discovery.
+
+Try fetching the discovery information for it:
+
+```shell
+$ kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1
+```
+
+Since you've set up Prometheus to collect your app's metrics, you should
+see a `pods/http_requests` resource show up. This represents the
+`http_requests_total` metric, converted into a rate, aggregated to have
+one datapoint per pod. Notice that this translates to the same API that
+our HorizontalPodAutoscaler was trying to use above.
+
+You can check the value of the metric using `kubectl get --raw`, which
+sends a raw GET request to the Kubernetes API server, automatically
+injecting auth information:
+
+```shell
+$ kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/default/pods/*/http_requests?selector=app%3Dsample-app"
+```
+
+Because of the adapter's configuration, the cumulative metric
+`http_requests_total` has been converted into a rate metric,
+`pods/http_requests`, which measures requests per second over a 2 minute
+interval. The value should currently be close to zero, since there's no
+traffic to your app, except for the regular metrics collection from
+Prometheus.
+
+Try generating some traffic using cURL a few times, like before:
+
+```shell
+$ curl http://$(kubectl get service sample-app -o jsonpath='{ .spec.clusterIP }')/metrics
+```
+
+Now, if you fetch the metrics again, you should see an increase in the
+value. If you leave it alone for a bit, the value should go back down
+again.
+
+### Quantity Values
+
+Notice that the API uses Kubernetes-style quantities to describe metric
+values. These quantities use SI suffixes instead of decimal points. The
+most common to see in the metrics API is the `m` suffix, which means
+milli-units, or 1000ths of a unit. If your metric is exactly a whole
+number of units on the nose, you might not see a suffix. Otherwise, you'll
+probably see an `m` suffix to represent fractions of a unit.
+
+For example, here, `500m` would be half a request per second, `10` would
+be 10 requests per second, and `10500m` would be `10.5` requests per
+second.
+
+### Troubleshooting Missing Metrics
+
+If the metric does not appear, or is not registered with the right
+resources, you might need to modify your [adapter
+configuration](/docs/config.md), as mentioned above. Check your labels
+via the Prometheus dashboard, and then modify the configuration
+appropriately.
+
+As noted in the main [README](/README.md), you'll need to also make sure
+your metrics relist interval is at least your Prometheus scrape interval.
+If it's less than that, you'll see metrics periodically appear and
+disappear from the adapter.
+
+Autoscaling
+-----------
+
+Now that you finally have the metrics API set up, your
+HorizontalPodAutoscaler should be able to fetch the appropriate metric,
+and make decisions based on it.
+
+If you didn't create the HorizontalPodAutoscaler above, create it now:
+
+```shell
 $ kubectl create -f sample-app-hpa.yaml
 ```
 
-Then, like before, make some requests to the sample app's service. If you
-describe the HPA, after the HPA sync interval has elapsed, you should see
-the number of pods increase proportionally to the ratio between the actual
-requests per second and your target of 1 request every 2 seconds.
-
-You can examine the HPA with
+Wait a little bit, and then examine the HPA:
 
 ```shell
 $ kubectl describe hpa sample-app
 ```
 
-You should see the HPA's last observed metric value, which should roughly
-correspond to the rate of requests that you made.
+You should see that it successfully fetched the metric, but it hasn't tried
+to scale, since there's no traffic.
+ +Since your app is going to need to scale in response to traffic, generate +some via cURL like above: + +```shell +$ curl http://$(kubectl get service sample-app -o jsonpath='{ .spec.clusterIP }')/metrics +``` + +Recall from the configuration at the start that you configured your HPA to +have each replica handle 500 milli-requests, or 1 request every two +seconds (ok, so *maybe* you still have some performance issues to work out +before your beta period ends). Thus, if you generate a few requests, you +should see the HPA scale up your app relatively quickly. + +If you describe the HPA again, you should see that the last observed +metric value roughly corresponds to your rate of requests, and that the +HPA has recently scaled your app. + +Now that you've got your app autoscaling on the HTTP requests, you're all +ready to launch! If you leave the app alone for a while, the HPA should +scale it back down, so you can save precious budget for the launch party. Next Steps ---------- From 262493780f1b31af8f2301ee59405a8627970f16 Mon Sep 17 00:00:00 2001 From: Solly Ross Date: Tue, 28 Aug 2018 13:45:55 -0400 Subject: [PATCH 8/8] Clarify multi-metric queries The example on the multi-metric query was a bit misleading, so this clarifies it a bit so it looks closer to a real-world example. --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 44dfa484..662caa37 100644 --- a/README.md +++ b/README.md @@ -130,7 +130,9 @@ the "discovery" and "naming" parts of the configuration. Then, you can write whichever metrics you want in the `metricsQuery`! The series query can contain whichever metrics you want, as long as they have the right set of labels. -For example, if you have two metrics `foo_total` and `foo_count`, you might write +For example, suppose you have two metrics `foo_total` and `foo_count`, +both with the label `system_name`, which represents the `node` resource. 
+Then, you might write ```yaml rules: @@ -139,7 +141,7 @@ rules: name: matches: 'foo_total' as: 'foo' - metricsQuery: 'sum(foo_total) by (<<.GroupBy>>) / sum(foo_count) by (<<.GroupBy>>)' + metricsQuery: 'sum(foo_total{<<.LabelMatchers>>}) by (<<.GroupBy>>) / sum(foo_count{<<.LabelMatchers>>}) by (<<.GroupBy>>)' ``` ### I get errors about SubjectAccessReviews/system:anonymous/TLS/Certificates/RequestHeader!