diff --git a/cmd/stackit-csi-plugin/main.go b/cmd/stackit-csi-plugin/main.go index d952bd60..f1640dae 100644 --- a/cmd/stackit-csi-plugin/main.go +++ b/cmd/stackit-csi-plugin/main.go @@ -1,11 +1,17 @@ package main import ( + "context" "fmt" "os" + "os/signal" + "syscall" + "github.com/prometheus/client_golang/prometheus" "github.com/spf13/cobra" "github.com/spf13/pflag" + "github.com/stackitcloud/cloud-provider-stackit/pkg/metrics" + sdkconfig "github.com/stackitcloud/stackit-sdk-go/core/config" "k8s.io/component-base/cli" "k8s.io/klog/v2" @@ -21,7 +27,7 @@ var ( endpoint string cloudConfig string cluster string - httpEndpoint string + metricsAddress string provideControllerService bool provideNodeService bool ) @@ -31,7 +37,10 @@ func main() { Use: "stackit-csi-plugin", Short: "STACKIT block-storage CSI plugin", Run: func(_ *cobra.Command, _ []string) { - handle() + ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGTERM, syscall.SIGINT) + defer cancel() + + handle(ctx) }, PersistentPreRunE: func(cmd *cobra.Command, _ []string) error { f := cmd.Flags() @@ -64,7 +73,7 @@ func main() { cmd.Flags().StringVar(&cloudConfig, "cloud-config", "", "CSI driver cloud config. This option can be given multiple times") cmd.PersistentFlags().StringVar(&cluster, "cluster", "", "The identifier of the cluster that the plugin is running in.") - cmd.PersistentFlags().StringVar(&httpEndpoint, "http-endpoint", "", + cmd.PersistentFlags().StringVar(&metricsAddress, "metrics-address", "", "The TCP network address where the HTTP server for providing metrics for diagnostics, will listen (example: `:8080`)."+ "The default is empty string, which means the server is disabled.") @@ -79,7 +88,16 @@ func main() { os.Exit(code) } -func handle() { +func handle(ctx context.Context) { + if metricsAddress != "" { + metricsExporter := metrics.NewExporter() + prometheus.MustRegister(metricsExporter) + go func() { + if err := metrics.Run(ctx, metricsAddress); err != nil { + klog.Fatalf("Run metrics returned an error: %v", err) + } + }() + } // Initialize cloud d := blockstorage.NewDriver(&blockstorage.DriverOpts{ Endpoint: endpoint, @@ -94,7 +112,15 @@ func handle() { klog.Fatal(err) } - iaasClient, err := stackit.CreateIaaSClient(&cfg) + iaasHTTPClient, err := metrics.NewInstrumentedHTTPClient(metrics.APINameIaaS) + if err != nil { + klog.Fatalf("create IaaS metrics HTTP client: %v", err) + } + iaasOpts := []sdkconfig.ConfigurationOption{ + sdkconfig.WithHTTPClient(iaasHTTPClient), + } + + iaasClient, err := stackit.CreateIaaSClient(&cfg, iaasOpts...) if err != nil { klog.Fatalf("Failed to create IaaS client: %v", err) } diff --git a/go.mod b/go.mod index 909ce101..2301a6be 100644 --- a/go.mod +++ b/go.mod @@ -11,6 +11,7 @@ require ( github.com/onsi/ginkgo/v2 v2.30.0 github.com/onsi/gomega v1.41.0 github.com/prometheus/client_golang v1.23.2 + github.com/prometheus/client_model v0.6.2 github.com/spf13/cobra v1.10.2 github.com/spf13/pflag v1.0.10 github.com/stackitcloud/stackit-sdk-go/core v0.26.0 @@ -88,7 +89,6 @@ require ( github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.67.5 // indirect github.com/prometheus/procfs v0.19.2 // indirect github.com/stackitcloud/stackit-sdk-go/services/resourcemanager v0.24.0 // indirect diff --git a/pkg/ccm/stackit.go b/pkg/ccm/stackit.go index 94e72246..bf25bd68 100644 --- a/pkg/ccm/stackit.go +++ b/pkg/ccm/stackit.go @@ -119,8 +119,12 @@ func BuildObservability() (*MetricsRemoteWrite, error) { // NewCloudControllerManager creates a new instance of the stackit struct from a stackitconfig struct func NewCloudControllerManager(cfg *stackitconfig.CCMConfig, obs *MetricsRemoteWrite) (*CloudControllerManager, error) { + lbHTTPClient, err := metrics.NewInstrumentedHTTPClient(metrics.APINameLoadBalancer) + if err != nil { + return nil, fmt.Errorf("create load balancer metrics HTTP client: %w", err) + } lbOpts := []sdkconfig.ConfigurationOption{ - sdkconfig.WithHTTPClient(metrics.NewInstrumentedHTTPClient()), + sdkconfig.WithHTTPClient(lbHTTPClient), } if cfg.Global.APIEndpoints.LoadBalancerAPI != "" { @@ -144,8 +148,12 @@ func NewCloudControllerManager(cfg *stackitconfig.CCMConfig, obs *MetricsRemoteW return nil, err } + iaasHTTPClient, err := metrics.NewInstrumentedHTTPClient(metrics.APINameIaaS) + if err != nil { + return nil, fmt.Errorf("create IaaS metrics HTTP client: %w", err) + } iaasOpts := []sdkconfig.ConfigurationOption{ - sdkconfig.WithHTTPClient(metrics.NewInstrumentedHTTPClient()), + sdkconfig.WithHTTPClient(iaasHTTPClient), } if cfg.Global.APIEndpoints.IaasAPI != "" { diff --git a/pkg/metrics/http.go b/pkg/metrics/http.go index 047032e6..27f7028f 100644 --- a/pkg/metrics/http.go +++ b/pkg/metrics/http.go @@ -1,21 +1,31 @@ package metrics import ( + "errors" "fmt" "net/http" + "strconv" "strings" "time" "github.com/prometheus/client_golang/prometheus" ) -func NewInstrumentedHTTPClient() *http.Client { - return &http.Client{ - Transport: &InstrumentedRoundTripper{http.DefaultTransport}, +func NewInstrumentedHTTPClient(api string) (*http.Client, error) { + if api == "" { + return nil, errors.New("api name is required") } + + return &http.Client{ + Transport: &InstrumentedRoundTripper{ + api: api, + base: http.DefaultTransport, + }, + }, nil } type InstrumentedRoundTripper struct { + api string base http.RoundTripper } @@ -26,15 +36,25 @@ func (rt *InstrumentedRoundTripper) RoundTrip(request *http.Request) (*http.Resp response, err := rt.base.RoundTrip(request) duration := time.Since(startTime) - LoadBalancerResponseTimeHistogram. - With(prometheus.Labels{operationLabel: operation}). + HTTPRequestDurationHistogram. + With(prometheus.Labels{ + apiLabel: rt.api, + operationLabel: operation, + }). Observe(float64(duration.Seconds())) - LoadBalancerRequestCount. - With(prometheus.Labels{operationLabel: operation}). + HTTPRequestCount. + With(prometheus.Labels{ + apiLabel: rt.api, + operationLabel: operation, + }). Inc() - if response != nil && response.StatusCode >= http.StatusInternalServerError { - LoadBalancerErrorCount.Inc() + if response != nil && response.StatusCode >= 400 { + HTTPErrorCount.With(prometheus.Labels{ + apiLabel: rt.api, + methodLabel: request.Method, + codeLabel: strconv.Itoa(response.StatusCode), + }).Inc() } return response, err diff --git a/pkg/metrics/http_test.go b/pkg/metrics/http_test.go index 1ffcc08f..62f3bdb2 100644 --- a/pkg/metrics/http_test.go +++ b/pkg/metrics/http_test.go @@ -2,10 +2,14 @@ package metrics import ( "net/http" + "net/http/httptest" "net/url" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + dto "github.com/prometheus/client_model/go" ) var _ = Describe("Metrics", func() { @@ -22,4 +26,140 @@ var _ = Describe("Metrics", func() { Entry("get load-balancers", "GET", "/v2/projects/6-a-4-8-c/regions/eu01/load-balancers", "get_load-balancers"), Entry("get load-balancers instance", "GET", "/v2/projects/6-a-4-8-c/regions/eu01/load-balancers/id", "get_load-balancers_instance"), ) + + Describe("InstrumentedRoundTripper", func() { + It("requires an API name", func() { + client, err := NewInstrumentedHTTPClient("") + Expect(err).To(MatchError("api name is required")) + Expect(client).To(BeNil()) + }) + + It("increments HTTPRequestCount for responses", func() { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer server.Close() + + labels := prometheus.Labels{ + apiLabel: "test", + operationLabel: "get_request-count-test", + } + before := testutil.ToFloat64(HTTPRequestCount.With(labels)) + + client, err := NewInstrumentedHTTPClient("test") + Expect(err).NotTo(HaveOccurred()) + + response, err := client.Get(server.URL + "/request-count-test") + Expect(err).NotTo(HaveOccurred()) + defer response.Body.Close() + + after := testutil.ToFloat64(HTTPRequestCount.With(labels)) + Expect(after - before).To(Equal(float64(1))) + }) + + It("records HTTPRequestDurationHistogram observations for responses", func() { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer server.Close() + + labels := prometheus.Labels{ + apiLabel: "test", + operationLabel: "get_request-duration-test", + } + before := histogramSampleCount(HTTPRequestDurationHistogram.With(labels)) + + client, err := NewInstrumentedHTTPClient("test") + Expect(err).NotTo(HaveOccurred()) + + response, err := client.Get(server.URL + "/request-duration-test") + Expect(err).NotTo(HaveOccurred()) + defer response.Body.Close() + + after := histogramSampleCount(HTTPRequestDurationHistogram.With(labels)) + Expect(after - before).To(Equal(uint64(1))) + }) + + It("increments HTTPErrorCount for 400 responses", func() { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadRequest) + })) + defer server.Close() + + labels := prometheus.Labels{ + apiLabel: "test", + methodLabel: http.MethodGet, + codeLabel: "400", + } + before := testutil.ToFloat64(HTTPErrorCount.With(labels)) + + client, err := NewInstrumentedHTTPClient("test") + Expect(err).NotTo(HaveOccurred()) + + response, err := client.Get(server.URL) + Expect(err).NotTo(HaveOccurred()) + defer response.Body.Close() + + after := testutil.ToFloat64(HTTPErrorCount.With(labels)) + Expect(after - before).To(Equal(float64(1))) + }) + + It("increments HTTPErrorCount for 500 responses", func() { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer server.Close() + + labels := prometheus.Labels{ + apiLabel: "test", + methodLabel: http.MethodPost, + codeLabel: "500", + } + before := testutil.ToFloat64(HTTPErrorCount.With(labels)) + + client, err := NewInstrumentedHTTPClient("test") + Expect(err).NotTo(HaveOccurred()) + + response, err := client.Post(server.URL, "application/json", nil) + Expect(err).NotTo(HaveOccurred()) + defer response.Body.Close() + + after := testutil.ToFloat64(HTTPErrorCount.With(labels)) + Expect(after - before).To(Equal(float64(1))) + }) + + It("does not increment HTTPErrorCount for successful responses", func() { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer server.Close() + + labels := prometheus.Labels{ + apiLabel: "test", + methodLabel: http.MethodGet, + codeLabel: "200", + } + before := testutil.ToFloat64(HTTPErrorCount.With(labels)) + + client, err := NewInstrumentedHTTPClient("test") + Expect(err).NotTo(HaveOccurred()) + + response, err := client.Get(server.URL) + Expect(err).NotTo(HaveOccurred()) + defer response.Body.Close() + + after := testutil.ToFloat64(HTTPErrorCount.With(labels)) + Expect(after - before).To(Equal(float64(0))) + }) + }) }) + +func histogramSampleCount(observer prometheus.Observer) uint64 { + metric, ok := observer.(prometheus.Metric) + Expect(ok).To(BeTrue()) + + dtoMetric := &dto.Metric{} + Expect(metric.Write(dtoMetric)).To(Succeed()) + + return dtoMetric.GetHistogram().GetSampleCount() +} diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index dd21347f..f15ef09a 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -6,35 +6,37 @@ import ( const ( cloudProviderMetricPrefix = "cloud_provider_stackit" - loadBalancerSubSystem = "lb" + apiLabel = "api" + methodLabel = "method" + codeLabel = "code" operationLabel = "op" + + APINameLoadBalancer = "loadbalancer" + APINameIaaS = "iaas" ) var ( - LoadBalancerRequestCount = prometheus.NewCounterVec(prometheus.CounterOpts{ + HTTPRequestCount = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: cloudProviderMetricPrefix, - Subsystem: loadBalancerSubSystem, - Name: "requests_total", - Help: "the number of requests to the load balancer API", + Name: "http_requests_total", + Help: "The number of requests to external APIs", ConstLabels: nil, - }, []string{operationLabel}) + }, []string{apiLabel, operationLabel}) - LoadBalancerErrorCount = prometheus.NewCounter(prometheus.CounterOpts{ + HTTPErrorCount = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: cloudProviderMetricPrefix, - Subsystem: loadBalancerSubSystem, - Name: "errors_total", - Help: "the number of server errors reported when calling the load balancer API", + Name: "http_errors_total", + Help: "Number of HTTP errors returned by external APIs", ConstLabels: nil, - }) + }, []string{apiLabel, methodLabel, codeLabel}) - LoadBalancerResponseTimeHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + HTTPRequestDurationHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: cloudProviderMetricPrefix, - Subsystem: loadBalancerSubSystem, - Name: "request_duration_seconds", - Help: "the response times of the load balancer API", + Name: "http_request_duration_seconds", + Help: "The response times of external API requests", ConstLabels: nil, Buckets: nil, - }, []string{operationLabel}) + }, []string{apiLabel, operationLabel}) ) type Exporter struct { @@ -55,13 +57,13 @@ func (e *Exporter) Collect(metrics chan<- prometheus.Metric) { } func (e *Exporter) describeCloudProvider(descs chan<- *prometheus.Desc) { - LoadBalancerRequestCount.Describe(descs) - LoadBalancerErrorCount.Describe(descs) - LoadBalancerResponseTimeHistogram.Describe(descs) + HTTPRequestCount.Describe(descs) + HTTPErrorCount.Describe(descs) + HTTPRequestDurationHistogram.Describe(descs) } func (e *Exporter) collectCloudProvider(metrics chan<- prometheus.Metric) { - LoadBalancerRequestCount.Collect(metrics) - LoadBalancerErrorCount.Collect(metrics) - LoadBalancerResponseTimeHistogram.Collect(metrics) + HTTPRequestCount.Collect(metrics) + HTTPErrorCount.Collect(metrics) + HTTPRequestDurationHistogram.Collect(metrics) } diff --git a/pkg/stackit/client.go b/pkg/stackit/client.go index 12960832..f676b6d4 100644 --- a/pkg/stackit/client.go +++ b/pkg/stackit/client.go @@ -171,7 +171,7 @@ func CreateSTACKITProvider(client iaas.DefaultAPI, cfg *stackitconfig.CSIConfig) return instance, nil } -func CreateIaaSClient(cfg *stackitconfig.CSIConfig) (iaas.DefaultAPI, error) { +func CreateIaaSClient(cfg *stackitconfig.CSIConfig, clientOpts ...sdkconfig.ConfigurationOption) (iaas.DefaultAPI, error) { var userAgent []string var opts []sdkconfig.ConfigurationOption userAgent = append(userAgent, fmt.Sprintf("%s/%s", "block-storage-csi-driver", version.Version)) @@ -186,6 +186,7 @@ func CreateIaaSClient(cfg *stackitconfig.CSIConfig) (iaas.DefaultAPI, error) { } opts = append(opts, sdkconfig.WithUserAgent(strings.Join(userAgent, " "))) + opts = append(opts, clientOpts...) client, err := iaas.NewAPIClient(opts...) if err != nil {