From e1d3d1be339a69e6806b983061f53f79819bbd30 Mon Sep 17 00:00:00 2001
From: JM-elastic
Date: Wed, 1 Jul 2026 14:49:00 -0700
Subject: [PATCH] [aws_lambda_otel] Add ML anomaly detection module

Add ML anomaly detection modules for Lambda function performance (duration, concurrency) and errors/throttles.
---
 packages/aws_lambda_otel/changelog.yml        |   5 +
 .../aws_lambda_otel-activity-ml.json          | 201 ++++++++++++++++++
 .../ml_module/aws_lambda_otel-metrics-ml.json | 188 ++++++++++++++++++
 packages/aws_lambda_otel/manifest.yml         |   2 +-
 4 files changed, 395 insertions(+), 1 deletion(-)
 create mode 100644 packages/aws_lambda_otel/kibana/ml_module/aws_lambda_otel-activity-ml.json
 create mode 100644 packages/aws_lambda_otel/kibana/ml_module/aws_lambda_otel-metrics-ml.json

diff --git a/packages/aws_lambda_otel/changelog.yml b/packages/aws_lambda_otel/changelog.yml
index a24a958b6cb..aaf5cfa5c1a 100644
--- a/packages/aws_lambda_otel/changelog.yml
+++ b/packages/aws_lambda_otel/changelog.yml
@@ -1,4 +1,9 @@
 # newer versions go on top
+- version: "0.8.0"
+  changes:
+    - description: Add ML anomaly detection modules for Lambda function performance (duration, concurrency) and errors/throttles.
+      type: enhancement
+      link: https://github.com/elastic/integrations/pull/19923
 - version: "0.7.0"
   changes:
     - description: Improve ESQL queries in dashboards
diff --git a/packages/aws_lambda_otel/kibana/ml_module/aws_lambda_otel-activity-ml.json b/packages/aws_lambda_otel/kibana/ml_module/aws_lambda_otel-activity-ml.json
new file mode 100644
index 00000000000..af994f488d2
--- /dev/null
+++ b/packages/aws_lambda_otel/kibana/ml_module/aws_lambda_otel-activity-ml.json
@@ -0,0 +1,201 @@
+{
+    "id": "aws_lambda_otel-activity-ml",
+    "type": "ml-module",
+    "migrationVersion": {
+        "search": "7.9.3"
+    },
+    "references": [],
+    "attributes": {
+        "id": "aws_lambda_otel-activity-ml",
+        "title": "AWS Lambda function errors and throttles (OpenTelemetry)",
+        "description": "Detect anomalous rates of Lambda errors, throttles, and dead-letter failures (the per-bucket Sum statistic) streamed from CloudWatch via the OpenTelemetry firehose receiver. Each function is modelled against its own history, so an elevation in error or throttle rate that is abnormal for that function is caught even when it stays below the fixed error-rate alert thresholds.",
+        "type": "AWS metrics",
+        "logo": {
+            "icon": "logoAWSMono"
+        },
+        "defaultIndexPattern": "metrics-aws.lambda.otel-*",
+        "query": {
+            "bool": {
+                "filter": [
+                    {
+                        "term": {
+                            "Namespace": "AWS/Lambda"
+                        }
+                    },
+                    {
+                        "exists": {
+                            "field": "FunctionName"
+                        }
+                    }
+                ],
+                "must_not": {
+                    "terms": {
+                        "_tier": [
+                            "data_frozen",
+                            "data_cold"
+                        ]
+                    }
+                }
+            }
+        },
+        "jobs": [
+            {
+                "id": "aws_lambda_function_error_anomaly",
+                "config": {
+                    "groups": [
+                        "aws",
+                        "lambda",
+                        "otel"
+                    ],
+                    "description": "AWS Lambda: detect functions producing unusual rates of errors, throttles, or dead-letter failures relative to that function's own history. The threshold-based alert rules cover hard error-rate and throttle-rate breaches; this job catches per-function rate elevations that drift below any fixed threshold, with the function, region, and account surfaced as influencers for attribution.",
+                    "analysis_config": {
+                        "bucket_span": "15m",
+                        "summary_count_field_name": "doc_count",
+                        "detectors": [
+                            {
+                                "detector_description": "Anomalous error rate",
+                                "function": "high_mean",
+                                "field_name": "metrics.amazonaws.com/AWS/Lambda/Errors",
+                                "by_field_name": "FunctionName",
+                                "partition_field_name": "cloud.region"
+                            },
+                            {
+                                "detector_description": "Anomalous throttle rate",
+                                "function": "high_mean",
+                                "field_name": "metrics.amazonaws.com/AWS/Lambda/Throttles",
+                                "by_field_name": "FunctionName",
+                                "partition_field_name": "cloud.region"
+                            },
+                            {
+                                "detector_description": "Anomalous dead-letter delivery failures",
+                                "function": "high_mean",
+                                "field_name": "metrics.amazonaws.com/AWS/Lambda/DeadLetterErrors",
+                                "by_field_name": "FunctionName",
+                                "partition_field_name": "cloud.region"
+                            }
+                        ],
+                        "influencers": [
+                            "FunctionName",
+                            "cloud.region",
+                            "cloud.account.id"
+                        ]
+                    },
+                    "analysis_limits": {
+                        "model_memory_limit": "64mb"
+                    },
+                    "data_description": {
+                        "time_field": "@timestamp",
+                        "time_format": "epoch_ms"
+                    },
+                    "model_plot_config": {
+                        "enabled": false,
+                        "annotations_enabled": true
+                    },
+                    "custom_settings": {
+                        "created_by": "ml-module-aws-lambda-otel-activity-ml"
+                    }
+                }
+            }
+        ],
+        "datafeeds": [
+            {
+                "id": "datafeed-aws_lambda_function_error_anomaly",
+                "job_id": "aws_lambda_function_error_anomaly",
+                "config": {
+                    "job_id": "aws_lambda_function_error_anomaly",
+                    "indices": [
+                        "INDEX_PATTERN_NAME"
+                    ],
+                    "indices_options": {
+                        "allow_no_indices": true
+                    },
+                    "query": {
+                        "bool": {
+                            "filter": [
+                                {
+                                    "term": {
+                                        "stat": "Sum"
+                                    }
+                                },
+                                {
+                                    "terms": {
+                                        "MetricName": [
+                                            "Errors",
+                                            "Throttles",
+                                            "DeadLetterErrors"
+                                        ]
+                                    }
+                                },
+                                {
+                                    "exists": {
+                                        "field": "FunctionName"
+                                    }
+                                }
+                            ]
+                        }
+                    },
+                    "aggregations": {
+                        "buckets": {
+                            "composite": {
+                                "size": 1000,
+                                "sources": [
+                                    {
+                                        "date": {
+                                            "date_histogram": {
+                                                "field": "@timestamp",
+                                                "fixed_interval": "900s"
+                                            }
+                                        }
+                                    },
+                                    {
+                                        "cloud.region": {
+                                            "terms": {
+                                                "field": "cloud.region"
+                                            }
+                                        }
+                                    },
+                                    {
+                                        "FunctionName": {
+                                            "terms": {
+                                                "field": "FunctionName"
+                                            }
+                                        }
+                                    },
+                                    {
+                                        "cloud.account.id": {
+                                            "terms": {
+                                                "field": "cloud.account.id"
+                                            }
+                                        }
+                                    }
+                                ]
+                            },
+                            "aggregations": {
+                                "@timestamp": {
+                                    "max": {
+                                        "field": "@timestamp"
+                                    }
+                                },
+                                "metrics.amazonaws.com/AWS/Lambda/Errors": {
+                                    "avg": {
+                                        "field": "metrics.amazonaws.com/AWS/Lambda/Errors"
+                                    }
+                                },
+                                "metrics.amazonaws.com/AWS/Lambda/Throttles": {
+                                    "avg": {
+                                        "field": "metrics.amazonaws.com/AWS/Lambda/Throttles"
+                                    }
+                                },
+                                "metrics.amazonaws.com/AWS/Lambda/DeadLetterErrors": {
+                                    "avg": {
+                                        "field": "metrics.amazonaws.com/AWS/Lambda/DeadLetterErrors"
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        ]
+    }
+}
diff --git a/packages/aws_lambda_otel/kibana/ml_module/aws_lambda_otel-metrics-ml.json b/packages/aws_lambda_otel/kibana/ml_module/aws_lambda_otel-metrics-ml.json
new file mode 100644
index 00000000000..a5cc94679bf
--- /dev/null
+++ b/packages/aws_lambda_otel/kibana/ml_module/aws_lambda_otel-metrics-ml.json
@@ -0,0 +1,188 @@
+{
+    "id": "aws_lambda_otel-metrics-ml",
+    "type": "ml-module",
+    "migrationVersion": {
+        "search": "7.9.3"
+    },
+    "references": [],
+    "attributes": {
+        "id": "aws_lambda_otel-metrics-ml",
+        "title": "AWS Lambda function performance (OpenTelemetry)",
+        "description": "Detect anomalies in Lambda function duration and concurrency streamed from CloudWatch via the OpenTelemetry firehose receiver. Each function is modelled against its own history, so latency drift and concurrency climbing toward the account limit are caught before they cross the static alert thresholds or begin to throttle.",
+        "type": "AWS metrics",
+        "logo": {
+            "icon": "logoAWSMono"
+        },
+        "defaultIndexPattern": "metrics-aws.lambda.otel-*",
+        "query": {
+            "bool": {
+                "filter": [
+                    {
+                        "term": {
+                            "Namespace": "AWS/Lambda"
+                        }
+                    },
+                    {
+                        "exists": {
+                            "field": "FunctionName"
+                        }
+                    }
+                ],
+                "must_not": {
+                    "terms": {
+                        "_tier": [
+                            "data_frozen",
+                            "data_cold"
+                        ]
+                    }
+                }
+            }
+        },
+        "jobs": [
+            {
+                "id": "aws_lambda_function_performance_anomaly",
+                "config": {
+                    "groups": [
+                        "aws",
+                        "lambda",
+                        "otel"
+                    ],
+                    "description": "AWS Lambda: detect functions whose average duration or concurrency has drifted unusually relative to that function's own history - latency creeping up (a slow dependency, cold-start regression, or larger payloads) or concurrency climbing toward the account limit before throttling begins. The static alert rules cover hard duration/concurrency breaches; this job covers the sub-threshold drift, with the function, region, and account as influencers.",
+                    "analysis_config": {
+                        "bucket_span": "15m",
+                        "summary_count_field_name": "doc_count",
+                        "detectors": [
+                            {
+                                "detector_description": "Anomalous average duration (latency drift)",
+                                "function": "high_mean",
+                                "field_name": "metrics.amazonaws.com/AWS/Lambda/Duration",
+                                "by_field_name": "FunctionName",
+                                "partition_field_name": "cloud.region"
+                            },
+                            {
+                                "detector_description": "Anomalous concurrent executions (saturation trajectory)",
+                                "function": "high_mean",
+                                "field_name": "metrics.amazonaws.com/AWS/Lambda/ConcurrentExecutions",
+                                "by_field_name": "FunctionName",
+                                "partition_field_name": "cloud.region"
+                            }
+                        ],
+                        "influencers": [
+                            "FunctionName",
+                            "cloud.region",
+                            "cloud.account.id"
+                        ]
+                    },
+                    "analysis_limits": {
+                        "model_memory_limit": "128mb"
+                    },
+                    "data_description": {
+                        "time_field": "@timestamp",
+                        "time_format": "epoch_ms"
+                    },
+                    "model_plot_config": {
+                        "enabled": false,
+                        "annotations_enabled": true
+                    },
+                    "custom_settings": {
+                        "created_by": "ml-module-aws-lambda-otel-metrics-ml"
+                    }
+                }
+            }
+        ],
+        "datafeeds": [
+            {
+                "id": "datafeed-aws_lambda_function_performance_anomaly",
+                "job_id": "aws_lambda_function_performance_anomaly",
+                "config": {
+                    "job_id": "aws_lambda_function_performance_anomaly",
+                    "indices": [
+                        "INDEX_PATTERN_NAME"
+                    ],
+                    "indices_options": {
+                        "allow_no_indices": true
+                    },
+                    "query": {
+                        "bool": {
+                            "filter": [
+                                {
+                                    "term": {
+                                        "stat": "Average"
+                                    }
+                                },
+                                {
+                                    "terms": {
+                                        "MetricName": [
+                                            "Duration",
+                                            "ConcurrentExecutions"
+                                        ]
+                                    }
+                                },
+                                {
+                                    "exists": {
+                                        "field": "FunctionName"
+                                    }
+                                }
+                            ]
+                        }
+                    },
+                    "aggregations": {
+                        "buckets": {
+                            "composite": {
+                                "size": 1000,
+                                "sources": [
+                                    {
+                                        "date": {
+                                            "date_histogram": {
+                                                "field": "@timestamp",
+                                                "fixed_interval": "900s"
+                                            }
+                                        }
+                                    },
+                                    {
+                                        "cloud.region": {
+                                            "terms": {
+                                                "field": "cloud.region"
+                                            }
+                                        }
+                                    },
+                                    {
+                                        "FunctionName": {
+                                            "terms": {
+                                                "field": "FunctionName"
+                                            }
+                                        }
+                                    },
+                                    {
+                                        "cloud.account.id": {
+                                            "terms": {
+                                                "field": "cloud.account.id"
+                                            }
+                                        }
+                                    }
+                                ]
+                            },
+                            "aggregations": {
+                                "@timestamp": {
+                                    "max": {
+                                        "field": "@timestamp"
+                                    }
+                                },
+                                "metrics.amazonaws.com/AWS/Lambda/Duration": {
+                                    "avg": {
+                                        "field": "metrics.amazonaws.com/AWS/Lambda/Duration"
+                                    }
+                                },
+                                "metrics.amazonaws.com/AWS/Lambda/ConcurrentExecutions": {
+                                    "avg": {
+                                        "field": "metrics.amazonaws.com/AWS/Lambda/ConcurrentExecutions"
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        ]
+    }
+}
diff --git a/packages/aws_lambda_otel/manifest.yml b/packages/aws_lambda_otel/manifest.yml
index 3a3c73afc15..e98d9d8fb36 100644
--- a/packages/aws_lambda_otel/manifest.yml
+++ b/packages/aws_lambda_otel/manifest.yml
@@ -1,7 +1,7 @@
 format_version: 3.6.0
 name: aws_lambda_otel
 title: "AWS Lambda Metrics OpenTelemetry Assets"
-version: 0.7.0
+version: 0.8.0
 source:
   license: "Elastic-2.0"
 description: "AWS Lambda Metrics OpenTelemetry Assets"