Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 31 additions & 10 deletions crates/hotblocks/src/cli.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::{
collections::{BTreeSet, HashSet},
sync::Arc
sync::Arc,
time::Duration
};

use anyhow::Context;
Expand All @@ -10,7 +11,7 @@ use sqd_storage::db::{DatabaseSettings, DatasetId};
use crate::{
data_service::{DataService, DataServiceRef},
dataset_config::{DatasetConfig, RetentionConfig},
metrics::DatasetMetricsCollector,
metrics::{ColumnFamilySizes, DatasetMetricsCollector, StorageMetricsCollector},
query::{QueryService, QueryServiceRef},
types::DBRef
};
Expand Down Expand Up @@ -64,7 +65,11 @@ pub struct CLI {
/// Known client IDs for metrics labeling. Client IDs not in this list
/// will be reported as "unknown" to prevent metrics cardinality abuse.
#[arg(long = "known-client", value_name = "ID")]
pub known_clients: Vec<String>
pub known_clients: Vec<String>,

/// Refresh interval for hotblocks_dataset_size_bytes and hotblocks_column_family_size_bytes
#[arg(long, value_name = "SECONDS", default_value = "60")]
pub storage_stats_interval_secs: u64
}

pub struct App {
Expand All @@ -73,7 +78,9 @@ pub struct App {
pub query_service: QueryServiceRef,
pub api_controlled_datasets: BTreeSet<DatasetId>,
pub metrics_registry: prometheus_client::registry::Registry,
pub known_clients: HashSet<String>
pub known_clients: HashSet<String>,
pub storage_metrics_sender: tokio::sync::watch::Sender<ColumnFamilySizes>,
pub storage_stats_interval: Duration
}

impl CLI {
Expand All @@ -91,17 +98,29 @@ impl CLI {
.context("failed to open rocksdb database")?;

let mut metrics_registry = crate::metrics::build_metrics_registry();
metrics_registry.register_collector(Box::new(DatasetMetricsCollector {
db: db.clone(),
datasets: datasets.keys().copied().collect()
}));

let dataset_ids: Vec<DatasetId> = datasets.keys().copied().collect();

let api_controlled_datasets = datasets
.iter()
.filter_map(|(id, cfg)| (cfg.retention_strategy == RetentionConfig::Api).then_some(*id))
.collect();

let data_service = DataService::start(db.clone(), datasets).await.map(Arc::new)?;
let storage_stats_interval = Duration::from_secs(self.storage_stats_interval_secs);

let data_service = DataService::start(db.clone(), datasets, storage_stats_interval)
.await
.map(Arc::new)?;

metrics_registry.register_collector(Box::new(DatasetMetricsCollector {
data_service: data_service.clone(),
datasets: dataset_ids
}));

let (storage_metrics_sender, storage_metrics_receiver) = tokio::sync::watch::channel(ColumnFamilySizes::new());
metrics_registry.register_collector(Box::new(StorageMetricsCollector {
receiver: storage_metrics_receiver
}));

let query_service = {
let mut builder = QueryService::builder(db.clone());
Expand All @@ -128,7 +147,9 @@ impl CLI {
query_service,
api_controlled_datasets,
metrics_registry,
known_clients
known_clients,
storage_metrics_sender,
storage_stats_interval
})
}
}
7 changes: 4 additions & 3 deletions crates/hotblocks/src/data_service.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::{
collections::{BTreeMap, HashMap},
sync::Arc
sync::Arc,
time::Duration
};

use anyhow::{Context, anyhow};
Expand All @@ -23,7 +24,7 @@ pub struct DataService {
}

impl DataService {
pub async fn start(db: DBRef, datasets: BTreeMap<DatasetId, DatasetConfig>) -> anyhow::Result<Self> {
pub async fn start(db: DBRef, datasets: BTreeMap<DatasetId, DatasetConfig>, stats_interval: Duration) -> anyhow::Result<Self> {
let all_datasets = db.get_all_datasets()?;
for dataset in all_datasets {
if !datasets.contains_key(&dataset.id) {
Expand Down Expand Up @@ -55,7 +56,7 @@ impl DataService {
};

tokio::task::spawn_blocking(move || {
DatasetController::new(db, dataset_id, cfg.kind, retention, data_sources).map(|c| {
DatasetController::new(db, dataset_id, cfg.kind, retention, data_sources, stats_interval).map(|c| {
c.enable_compaction(!cfg.disable_compaction);
Arc::new(c)
})
Expand Down
80 changes: 76 additions & 4 deletions crates/hotblocks/src/dataset_controller/dataset_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use anyhow::{Context, anyhow};
use futures::{FutureExt, StreamExt, future::BoxFuture, stream::FuturesUnordered};
use sqd_data_client::reqwest::ReqwestDataClient;
use sqd_primitives::{BlockNumber, BlockRef};
use sqd_storage::db::{Chunk, CompactionStatus, DatasetId};
use sqd_storage::db::{Chunk, CompactionStatus, Database, DatasetId};
use tokio::{select, task::JoinHandle, time::Instant};
use tracing::{Instrument, debug, error, info, info_span, instrument, warn};

Expand All @@ -24,25 +24,38 @@ pub struct DatasetController {
head_receiver: tokio::sync::watch::Receiver<Option<BlockRef>>,
finalized_head_receiver: tokio::sync::watch::Receiver<Option<BlockRef>>,
compaction_enabled_sender: tokio::sync::watch::Sender<bool>,
stats_receiver: tokio::sync::watch::Receiver<DatasetStats>,
task: JoinHandle<()>,
compaction_task: JoinHandle<()>
compaction_task: JoinHandle<()>,
stats_task: JoinHandle<()>
}

/// Dropping a controller cancels every background task it spawned.
impl Drop for DatasetController {
    fn drop(&mut self) {
        // Abort order: main controller task, then compaction, then stats.
        for handle in [&self.task, &self.compaction_task, &self.stats_task] {
            handle.abort();
        }
    }
}

/// Dataset metrics refreshed by [`dataset_stats_loop`] on the configured stats interval.
///
/// The `Default` value (every field `None`) is what readers observe until the first
/// successful refresh has been published.
#[derive(Clone, Debug, Default)]
pub struct DatasetStats {
/// First block of the dataset's first chunk, as seen in the snapshot used for the refresh.
/// `None` when the dataset has no first chunk.
pub first_block: Option<BlockNumber>,
/// Last-block time reported by the dataset's last chunk. `None` when there is no last chunk
/// or the chunk reports no time.
/// NOTE(review): the unit/epoch of this `i64` is not visible here — confirm before relying on it.
pub last_block_time: Option<i64>,
/// Estimated dataset size as returned by `estimate_dataset_size`.
/// `None` until computed once, distinguishing a fresh process from an empty dataset.
pub size_bytes: Option<u64>
}

impl DatasetController {
#[instrument(name = "dataset", skip_all, fields(dataset_id = %dataset_id))]
pub fn new(
db: DBRef,
dataset_id: DatasetId,
dataset_kind: DatasetKind,
retention: RetentionStrategy,
data_sources: Vec<ReqwestDataClient>
data_sources: Vec<ReqwestDataClient>,
stats_interval: Duration
) -> anyhow::Result<Self> {
let mut write = WriteController::new(db.clone(), dataset_id, dataset_kind)?;

Expand All @@ -54,6 +67,7 @@ impl DatasetController {
let (head_sender, head_receiver) = tokio::sync::watch::channel(None);
let (finalized_head_sender, finalized_head_receiver) = tokio::sync::watch::channel(None);
let (compaction_enabled_sender, compaction_enabled_receiver) = tokio::sync::watch::channel(false);
let (stats_sender, stats_receiver) = tokio::sync::watch::channel(DatasetStats::default());

let _ = head_sender.send(write.head().cloned());
let _ = finalized_head_sender.send(write.finalized_head().cloned());
Expand All @@ -70,6 +84,9 @@ impl DatasetController {

let task = tokio::spawn(ctl.run(write).in_current_span());

let stats_task =
tokio::spawn(dataset_stats_loop(db.clone(), dataset_id, stats_interval, stats_sender).in_current_span());

let compaction_task =
tokio::spawn(compaction_loop(db, dataset_id, compaction_enabled_receiver).in_current_span());

Expand All @@ -80,8 +97,10 @@ impl DatasetController {
head_receiver,
finalized_head_receiver,
compaction_enabled_sender,
stats_receiver,
task,
compaction_task
compaction_task,
stats_task
})
}

Expand Down Expand Up @@ -117,6 +136,10 @@ impl DatasetController {
self.retention_sender.borrow().clone()
}

pub fn get_stats(&self) -> DatasetStats {
self.stats_receiver.borrow().clone()
}

pub async fn wait_for_block(&self, block_number: BlockNumber) -> BlockNumber {
let mut recv = self.head_receiver.clone();
loop {
Expand Down Expand Up @@ -591,6 +614,55 @@ async fn fetch_chain_top(clients: Vec<ReqwestDataClient>) -> BlockNumber {
}
}

/// Periodically recomputes [`DatasetStats`] for `dataset_id` and publishes them through `sender`.
///
/// Each pass runs [`compute_dataset_stats`] via `spawn_blocking`, then sleeps for `interval`
/// before the next pass, so the effective period is `interval` plus the time the pass took.
/// A failed or panicked pass is logged and publishes nothing, which leaves the previously sent
/// value in the channel. The loop has no exit condition; it runs until its task is aborted.
///
/// NOTE(review): with a zero `interval` the startup offset is skipped and the trailing sleep is
/// zero-length, so passes would run back-to-back — confirm a zero interval is rejected upstream.
#[instrument(name = "dataset_stats", skip_all)]
async fn dataset_stats_loop(db: DBRef, dataset_id: DatasetId, interval: Duration, sender: tokio::sync::watch::Sender<DatasetStats>) {
// Delay the first run by a per-dataset offset so the loops don't hit RocksDB together.
// The offset is the wrapping sum of the dataset id's bytes, modulo the interval in whole
// seconds; the `> 0` guard also keeps that modulo from dividing by zero.
if interval.as_secs() > 0 {
let offset = dataset_id
.as_ref()
.iter()
.fold(0u64, |acc, &b| acc.wrapping_add(b as u64))
% interval.as_secs();
tokio::time::sleep(Duration::from_secs(offset)).await;
}

loop {
let db = db.clone();
// Capture the current span so events logged inside the blocking closure stay attributed to it.
let span = tracing::Span::current();
let result = tokio::task::spawn_blocking(move || {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One more blocking task per dataset, per minute doesn't look right to me.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've measured the time of running, and it's <40 ms per dataset at worst, without blocking anything.
The time of running the global metadata estimator is <10 ms and holds the global locks for writes, but doesn't affect reads.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good, would you mind sharing some details related to the overall load, size of the db, etc?

let _s = span.enter();
compute_dataset_stats(&db, dataset_id)
})
.await;

// Outer `Result` is the join outcome of the blocking task, inner one is the stats computation.
match result {
Ok(Ok(stats)) => {
// The send result is deliberately discarded; a watch send only fails once every
// receiver has been dropped.
let _ = sender.send(stats);
}
Ok(Err(err)) => error!(reason =? err, "failed to estimate dataset stats"),
Err(err) => error!(reason =? err, "dataset stats task panicked")
}

// Fixed pause between passes (measured from the end of the previous pass).
tokio::time::sleep(interval).await;
}
}

/// Builds a fresh [`DatasetStats`] for `dataset_id` from a single database snapshot.
///
/// Reads the first block of the first chunk, the last-block time of the last chunk, and an
/// estimated dataset size. The time spent on the size estimate is logged at debug level.
///
/// # Errors
///
/// Propagates any error from the two chunk lookups or from the size estimate.
fn compute_dataset_stats(db: &Database, dataset_id: DatasetId) -> anyhow::Result<DatasetStats> {
    let view = db.snapshot();

    let first_chunk = view.get_first_chunk(dataset_id)?;
    let first_block = first_chunk.map(|chunk| chunk.first_block());

    let last_chunk = view.get_last_chunk(dataset_id)?;
    let last_block_time = last_chunk.and_then(|chunk| chunk.last_block_time());

    // Only the size estimate is timed; the chunk lookups above are excluded.
    let timer = std::time::Instant::now();
    let estimated = view.estimate_dataset_size(dataset_id)?;
    debug!(elapsed_us = timer.elapsed().as_micros(), "estimated dataset size");

    let stats = DatasetStats {
        first_block,
        last_block_time,
        size_bytes: Some(estimated)
    };
    Ok(stats)
}

#[instrument(name = "compaction", skip_all)]
async fn compaction_loop(db: DBRef, dataset_id: DatasetId, mut enabled: tokio::sync::watch::Receiver<bool>) {
let mut skips = 0;
Expand Down
5 changes: 5 additions & 0 deletions crates/hotblocks/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ fn main() -> anyhow::Result<()> {
let app = args.build_app().await?;

tokio::spawn(db_cleanup_task(app.db.clone()));
tokio::spawn(metrics::storage_metrics_loop(
app.db.clone(),
app.storage_stats_interval,
app.storage_metrics_sender.clone()
));

let api = build_api(app);

Expand Down
Loading
Loading