Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 14 additions & 9 deletions obp-api/src/main/scala/code/api/v7_0_0/Http4s700.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3281,16 +3281,21 @@ object Http4s700 {
| recorded yet.
|* `checks` — a list of integrity checks, each with a `status` of `OK`,
| `WARNING`, or `ERROR`:
| * `write_metrics_enabled`
| * `metrics_scheduler_enabled`
| * `metric_oldest_within_retention` — flags if the oldest live metric
| is older than the retention window (move job not keeping up / stopped).
| * `archive_oldest_within_retention` — flags if the oldest archived
| * `check_metrics_are_being_written` — warns if `write_metrics` is off,
| so no new metrics are being recorded.
| * `check_archive_scheduler_is_enabled` — errors if `enable_metrics_scheduler`
| is off, so old metrics are never archived nor deleted.
| * `check_metric_retention_policy_is_respected` — flags if the oldest live
| metric is older than the retention window (move job not keeping up / stopped).
| * `check_all_old_metrics_can_be_archived` — warns if old metric rows have an
| empty correlation id and so cannot be moved to the archive.
| * `check_archive_retention_policy_is_respected` — flags if the oldest archived
| metric is older than the archive retention (cleanup not keeping up / stopped).
| * `archive_recently_updated` — flags if a backlog exists but the newest
| archived record is stale (move job stopped).
| * `archive_job_last_run` — reports the outcome of the most recent run
| from the run log (errors if the last run failed; warns if none recorded).
| * `check_archive_metrics_is_fresh_enough` — flags if a backlog exists but
| the newest archived record is stale (move job stopped). "enough" because
| a fresh record is only required when there is a backlog to move.
| * `check_last_archive_run_succeeded` — reports the outcome of the most recent
| run from the run log (errors if the last run failed; warns if none recorded).
|* `everything_as_expected` — `true` only when every check is `OK`.
|
|${userAuthenticationMessage(true)}""".stripMargin,
Expand Down
52 changes: 26 additions & 26 deletions obp-api/src/main/scala/code/api/v7_0_0/JSONFactory7.0.0.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1238,30 +1238,30 @@ object JSONFactory700 extends MdcLoggable with code.api.util.CustomJsonFormats {
val checks = scala.collection.mutable.ListBuffer[MetricsIntegrityCheckJsonV700]()

checks += (if (writeMetrics)
MetricsIntegrityCheckJsonV700("write_metrics_enabled", "OK",
MetricsIntegrityCheckJsonV700("check_metrics_are_being_written", "OK",
"write_metrics=true: API calls are being recorded into the metric table.")
else
MetricsIntegrityCheckJsonV700("write_metrics_enabled", "WARNING",
MetricsIntegrityCheckJsonV700("check_metrics_are_being_written", "WARNING",
"write_metrics=false: no new API metrics are being written, so the metric table count will not grow."))

checks += (if (schedulerEnabled)
MetricsIntegrityCheckJsonV700("metrics_scheduler_enabled", "OK",
MetricsIntegrityCheckJsonV700("check_archive_scheduler_is_enabled", "OK",
"enable_metrics_scheduler=true: the archive/cleanup scheduler is active.")
else
MetricsIntegrityCheckJsonV700("metrics_scheduler_enabled", "ERROR",
MetricsIntegrityCheckJsonV700("check_archive_scheduler_is_enabled", "ERROR",
"enable_metrics_scheduler=false: old metrics are never moved to the archive nor deleted; the metric table will grow without bound."))

metricOldest match {
case Some(d) =>
val age = metricsAgeInDays(d, now)
if (age <= retainMetricsDaysEffective + graceDays)
checks += MetricsIntegrityCheckJsonV700("metric_oldest_within_retention", "OK",
checks += MetricsIntegrityCheckJsonV700("check_metric_retention_policy_is_respected", "OK",
s"Oldest metric is $age days old, within the effective retention of $retainMetricsDaysEffective days (+${graceDays}d grace).")
else
checks += MetricsIntegrityCheckJsonV700("metric_oldest_within_retention", "ERROR",
checks += MetricsIntegrityCheckJsonV700("check_metric_retention_policy_is_respected", "ERROR",
s"Oldest metric is $age days old but the effective retention is $retainMetricsDaysEffective days. Records older than this should have been moved to the archive — the archive move job is not keeping up or has stopped.")
case None =>
checks += MetricsIntegrityCheckJsonV700("metric_oldest_within_retention", "OK", "The metric table is empty.")
checks += MetricsIntegrityCheckJsonV700("check_metric_retention_policy_is_respected", "OK", "The metric table is empty.")
}

// Old metric rows with an empty correlation id can't be archived (the archive
Expand All @@ -1273,23 +1273,23 @@ object JSONFactory700 extends MdcLoggable with code.api.util.CustomJsonFormats {
By(MappedMetric.correlationId, "")
)
if (unarchivableOldMetricCount == 0)
checks += MetricsIntegrityCheckJsonV700("metric_unarchivable_rows", "OK",
checks += MetricsIntegrityCheckJsonV700("check_all_old_metrics_can_be_archived", "OK",
"No metric rows older than the retention window are blocked from archiving.")
else
checks += MetricsIntegrityCheckJsonV700("metric_unarchivable_rows", "WARNING",
checks += MetricsIntegrityCheckJsonV700("check_all_old_metrics_can_be_archived", "WARNING",
s"$unarchivableOldMetricCount metric row(s) older than the retention window have an empty correlation id and cannot be archived (the archive requires a UUID). They remain in the metric table and are excluded from the move job — typically legacy rows that predate correlation ids.")

archiveOldest match {
case Some(d) =>
val age = metricsAgeInDays(d, now)
if (age <= retainArchiveMetricsDaysEffective + graceDays)
checks += MetricsIntegrityCheckJsonV700("archive_oldest_within_retention", "OK",
checks += MetricsIntegrityCheckJsonV700("check_archive_retention_policy_is_respected", "OK",
s"Oldest archived metric is $age days old, within the effective archive retention of $retainArchiveMetricsDaysEffective days (+${graceDays}d grace).")
else
checks += MetricsIntegrityCheckJsonV700("archive_oldest_within_retention", "ERROR",
checks += MetricsIntegrityCheckJsonV700("check_archive_retention_policy_is_respected", "ERROR",
s"Oldest archived metric is $age days old but the effective archive retention is $retainArchiveMetricsDaysEffective days. Records older than this should have been deleted — the archive cleanup job is not keeping up or has stopped.")
case None =>
checks += MetricsIntegrityCheckJsonV700("archive_oldest_within_retention", "OK", "The metricarchive table is empty.")
checks += MetricsIntegrityCheckJsonV700("check_archive_retention_policy_is_respected", "OK", "The metricarchive table is empty.")
}

// If a backlog of metrics older than the retention window exists, the move
Expand All @@ -1300,13 +1300,13 @@ object JSONFactory700 extends MdcLoggable with code.api.util.CustomJsonFormats {
case (Some(mo), Some(an)) if metricsAgeInDays(mo, now) > retainMetricsDaysEffective + graceDays =>
val newestArchiveAge = metricsAgeInDays(an, now)
if (newestArchiveAge <= retainMetricsDaysEffective + graceDays)
checks += MetricsIntegrityCheckJsonV700("archive_recently_updated", "OK",
checks += MetricsIntegrityCheckJsonV700("check_archive_metrics_is_fresh_enough", "OK",
s"Newest archived metric is $newestArchiveAge days old, consistent with an active move job.")
else
checks += MetricsIntegrityCheckJsonV700("archive_recently_updated", "ERROR",
checks += MetricsIntegrityCheckJsonV700("check_archive_metrics_is_fresh_enough", "ERROR",
s"There are metric rows older than the retention window, yet the newest archived record is $newestArchiveAge days old. The move job appears to have stopped roughly ${newestArchiveAge - retainMetricsDaysEffective} days ago.")
case _ =>
checks += MetricsIntegrityCheckJsonV700("archive_recently_updated", "OK",
checks += MetricsIntegrityCheckJsonV700("check_archive_metrics_is_fresh_enough", "OK",
"No backlog of metrics older than the retention window — nothing to move right now.")
}

Expand All @@ -1316,20 +1316,20 @@ object JSONFactory700 extends MdcLoggable with code.api.util.CustomJsonFormats {
lastRun match {
case Some(r) if r.Success.get =>
val ageDays = metricsAgeInDays(r.StartedAt.get, now)
checks += MetricsIntegrityCheckJsonV700("archive_job_last_run", "OK",
checks += MetricsIntegrityCheckJsonV700("check_last_archive_run_succeeded", "OK",
s"Last archive run succeeded $ageDays days ago (moved ${r.RowsMovedToArchive.get} rows, deleted ${r.RowsDeletedFromArchive.get} outdated archive rows).")
case Some(r) =>
val ageDays = metricsAgeInDays(r.StartedAt.get, now)
val lastOkNote = lastSuccessfulRun
.map(s => s" Last successful run was ${metricsAgeInDays(s.StartedAt.get, now)} days ago.")
.getOrElse(" No successful run has ever been recorded.")
checks += MetricsIntegrityCheckJsonV700("archive_job_last_run", "ERROR",
checks += MetricsIntegrityCheckJsonV700("check_last_archive_run_succeeded", "ERROR",
s"The most recent archive run ($ageDays days ago) failed: ${r.Remark.get}.$lastOkNote")
case None if schedulerEnabled =>
checks += MetricsIntegrityCheckJsonV700("archive_job_last_run", "WARNING",
checks += MetricsIntegrityCheckJsonV700("check_last_archive_run_succeeded", "WARNING",
"No archive run has been recorded yet. The scheduler is enabled but may not have completed its first run since this table was introduced.")
case None =>
checks += MetricsIntegrityCheckJsonV700("archive_job_last_run", "OK",
checks += MetricsIntegrityCheckJsonV700("check_last_archive_run_succeeded", "OK",
"No archive run recorded — the scheduler is disabled, so this is expected.")
}

Expand Down Expand Up @@ -1394,19 +1394,19 @@ object JSONFactory700 extends MdcLoggable with code.api.util.CustomJsonFormats {
remark = ""
)),
checks = List(
MetricsIntegrityCheckJsonV700("write_metrics_enabled", "OK",
MetricsIntegrityCheckJsonV700("check_metrics_are_being_written", "OK",
"write_metrics=true: API calls are being recorded into the metric table."),
MetricsIntegrityCheckJsonV700("metrics_scheduler_enabled", "OK",
MetricsIntegrityCheckJsonV700("check_archive_scheduler_is_enabled", "OK",
"enable_metrics_scheduler=true: the archive/cleanup scheduler is active."),
MetricsIntegrityCheckJsonV700("metric_oldest_within_retention", "OK",
MetricsIntegrityCheckJsonV700("check_metric_retention_policy_is_respected", "OK",
"Oldest metric is 85 days old, within the effective retention of 90 days (+7d grace)."),
MetricsIntegrityCheckJsonV700("metric_unarchivable_rows", "OK",
MetricsIntegrityCheckJsonV700("check_all_old_metrics_can_be_archived", "OK",
"No metric rows older than the retention window are blocked from archiving."),
MetricsIntegrityCheckJsonV700("archive_oldest_within_retention", "OK",
MetricsIntegrityCheckJsonV700("check_archive_retention_policy_is_respected", "OK",
"Oldest archived metric is 700 days old, within the effective archive retention of 730 days (+7d grace)."),
MetricsIntegrityCheckJsonV700("archive_recently_updated", "OK",
MetricsIntegrityCheckJsonV700("check_archive_metrics_is_fresh_enough", "OK",
"Newest archived metric is 92 days old, consistent with an active move job."),
MetricsIntegrityCheckJsonV700("archive_job_last_run", "OK",
MetricsIntegrityCheckJsonV700("check_last_archive_run_succeeded", "OK",
"Last archive run succeeded 0 days ago (moved 4000 rows, deleted 1500 outdated archive rows).")
),
everything_as_expected = true
Expand Down
Loading