Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
4e50143
feat: Added a new trait to expose SchemaProvider
parmesant Apr 15, 2026
2a23ced
rebase with main
parmesant May 4, 2026
c6be14c
update Cargo.toml for release v2.7.2 (#1642)
nikhilsinhaparseable May 6, 2026
64a34ba
update helm chart to v2.7.2 (#1643)
nitisht May 6, 2026
8f19a7b
add quantile values to known field list (#1644)
nikhilsinhaparseable May 6, 2026
bd9a679
Hottier fix (#1645)
parmesant May 15, 2026
44dcb24
chore: stdout for span exporter (#1647)
nikhilsinhaparseable May 19, 2026
b67b828
feat: add series hash in metrics ingestion, bounded streaming merge (…
nikhilsinhaparseable May 21, 2026
073f9ac
fix: Surface ingestion errors (#1652)
parmesant May 27, 2026
b726d61
add helm chart for parseable-enterprise 2.7.3 (#1654)
AdheipSingh May 28, 2026
bc5d7b1
update Cargo.toml for release v2.8.0 (#1656)
nikhilsinhaparseable May 28, 2026
6b3f8b5
update to helm 2.8.0 (#1658)
nitisht May 29, 2026
ce0ae60
update registry to quay (#1659)
nikhilsinhaparseable May 30, 2026
3f0f21d
fix: load IRSA web identity config for S3 (#1649)
ygndotgg Jun 1, 2026
8fd4b1a
Ingestion optimization (#1661)
parmesant Jun 2, 2026
a9b4847
chore: cleanup readme and update helm to point to new container regis…
nitisht Jun 2, 2026
5684bee
chore: readme cleanup (#1664)
nitisht Jun 2, 2026
0a594b0
fix: commit schema bug (#1666)
parmesant Jun 3, 2026
f6eb4f5
helm: support tolerations (plural) and affinity in standalone deploym…
AdheipSingh Jun 8, 2026
d28b619
chore: update to new helm chart with tolerations fix (#1670)
nitisht Jun 8, 2026
abf47a0
Further optimizations for ingestion flow (#1668)
parmesant Jun 8, 2026
57dd9e9
Remove async (#1671)
parmesant Jun 10, 2026
dea2cb9
helm: make terminationGracePeriodSeconds configurable for ingestor an…
nitisht Jun 10, 2026
8f65159
fix: deadlock in metric pruning (#1674)
parmesant Jun 11, 2026
fe8acea
fix: field stats parquet aggregation (#1675)
nikhilsinhaparseable Jun 11, 2026
b0d1154
Ingest api key (#1667)
nikhilsinhaparseable Jun 12, 2026
41db1e1
chore: reuse time bin logic across apis (#1660)
nikhilsinhaparseable Jun 12, 2026
8c40db1
chore: remove unwanted error and warning logs (#1676)
nikhilsinhaparseable Jun 12, 2026
832f3ca
update Cargo.toml for release v2.9.0 (#1677)
nikhilsinhaparseable Jun 13, 2026
29a0f89
perf: field stats more performant for high volume ingestion (#1679)
nikhilsinhaparseable Jun 15, 2026
f26f594
update Cargo.toml for release v2.9.1 (#1680)
nikhilsinhaparseable Jun 15, 2026
5d155ce
update release wf to run linux builds on self hosted runner (#1681)
nikhilsinhaparseable Jun 16, 2026
960915d
track insertion time instead of data time for eviction (#1650)
ygndotgg Jun 17, 2026
9d0643c
chore: Update deps (#1683)
parmesant Jun 17, 2026
236d7ef
dashboard: allow optional fields in dashboard json (#1684)
nikhilsinhaparseable Jun 17, 2026
831cea4
bugfix: schema updation issue (#1686)
parmesant Jun 18, 2026
ea262fc
feat: log context api (#1687)
nikhilsinhaparseable Jun 19, 2026
80ead6b
chore: update to latest pai helm chart v0.3.0 (#1692)
nitisht Jun 19, 2026
8d79243
update: remove historic sync from hottier (#1694)
parmesant Jun 22, 2026
07a09b9
add optional ingestion quota to tenant metadata (#1691)
nikhilsinhaparseable Jun 22, 2026
955f53b
add stream name to the ingest script param (#1696)
nikhilsinhaparseable Jun 23, 2026
12eb6c6
update: context api support for start and end time (#1695)
nikhilsinhaparseable Jun 23, 2026
2cea7b0
Restore and harden credential masking for /targets (#1698)
ygndotgg Jun 23, 2026
01ef055
fix windows ingest script (#1699)
nikhilsinhaparseable Jun 23, 2026
dd14314
update Cargo.toml for release v2.9.2 (#1700)
nikhilsinhaparseable Jun 23, 2026
67681e5
fix: create checksum release action (#1701)
nikhilsinhaparseable Jun 24, 2026
89a07f3
cargo fmt, rebase oss
parmesant Jun 24, 2026
f4334ca
Merge branch 'main' into query-updates
parmesant Jun 24, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ object_store = { version = "0.13.1", features = [
"azure",
"gcp",
] }
datafusion-proto = "53.1.0"
parquet = "58.0.0"

# Web server and HTTP-related
Expand Down
6 changes: 6 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,12 @@ pub mod validator;
use std::time::Duration;

// Public re-exports of crates being used in enterprise
pub use arrow_array;
pub use arrow_flight;
pub use arrow_ipc;
pub use catalog as parseable_catalog;
pub use datafusion;
pub use datafusion_proto;
pub use handlers::http::modal::{
ParseableServer, ingest_server::IngestServer, query_server::QueryServer, server::Server,
};
Expand All @@ -68,6 +73,7 @@ use once_cell::sync::Lazy;
pub use openid;
use parseable::PARSEABLE;
use reqwest::{Client, ClientBuilder};
pub use utils as parseable_utils;
pub use {opentelemetry, opentelemetry_otlp, opentelemetry_proto, opentelemetry_sdk};
pub use {tracing_actix_web, tracing_opentelemetry, tracing_subscriber};

Expand Down
23 changes: 23 additions & 0 deletions src/metrics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,18 @@ pub static TOTAL_QUERY_CALLS_BY_DATE: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("metric can be created")
});

/// Counter vector for "Total files scanned in hottier by date".
///
/// Created under `METRICS_NAMESPACE` with the labels `stream`, `date` and
/// `tenant_id`, in that order. Panics on first access if the metric cannot be
/// constructed.
///
/// NOTE(review): confirm this counter is also registered with the metrics
/// registry (see `custom_metrics`); registration is not visible from here.
pub static TOTAL_FILES_SCANNED_IN_HOTTIER_BY_DATE: Lazy<IntCounterVec> = Lazy::new(|| {
    let opts = Opts::new(
        "total_files_scanned_in_hottier_by_date",
        "Total files scanned in hottier by date",
    )
    .namespace(METRICS_NAMESPACE);

    let created = IntCounterVec::new(opts, &["stream", "date", "tenant_id"]);
    created.expect("metric can be created")
});
Comment on lines +260 to +270

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

FILE="src/metrics/mod.rs"

echo "Occurrences of TOTAL_FILES_SCANNED_IN_HOTTIER_BY_DATE:"
rg -nC2 'TOTAL_FILES_SCANNED_IN_HOTTIER_BY_DATE' "$FILE"

echo
echo "Registration block in custom_metrics:"
sed -n '512,580p' "$FILE"

Repository: parseablehq/parseable

Length of output: 3289


🏁 Script executed:

#!/bin/bash
set -euo pipefail

FILE="src/metrics/mod.rs"

# Find the custom_metrics function definition and check its bounds
echo "=== custom_metrics function definition and scope ==="
sed -n '463,470p' "$FILE"

# Get the complete custom_metrics function (find where it ends)
echo
echo "=== End of custom_metrics function ==="
sed -n '575,585p' "$FILE"

# Search for any other registration attempts with the new metric
echo
echo "=== All occurrences of register and TOTAL_FILES_SCANNED_IN_HOTTIER_BY_DATE in same context ==="
rg -n 'register.*TOTAL_FILES_SCANNED_IN_HOTTIER_BY_DATE|TOTAL_FILES_SCANNED_IN_HOTTIER_BY_DATE.*register' "$FILE"

# Confirm the metric is not already registered elsewhere
echo
echo "=== Checking if the metric appears in any register calls ==="
rg -n '\.register\(' "$FILE" | grep -i 'hottier' || echo "No register calls found for hottier metric"

Repository: parseablehq/parseable

Length of output: 920


Register this counter in custom_metrics.

TOTAL_FILES_SCANNED_IN_HOTTIER_BY_DATE is declared at line 260 and incremented in the helper at lines 704-706, but it is never added to METRICS_REGISTRY inside custom_metrics. With the custom registry setup in this file, the metric will not be exposed on /metrics.

Proposed fix
     registry
         .register(Box::new(TOTAL_QUERY_CALLS_BY_DATE.clone()))
         .expect("metric can be registered");
+    registry
+        .register(Box::new(TOTAL_FILES_SCANNED_IN_HOTTIER_BY_DATE.clone()))
+        .expect("metric can be registered");
     registry
         .register(Box::new(TOTAL_FILES_SCANNED_IN_QUERY_BY_DATE.clone()))
         .expect("metric can be registered");
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/metrics/mod.rs` around lines 260 - 270, The new IntCounterVec
TOTAL_FILES_SCANNED_IN_HOTTIER_BY_DATE is never registered with the custom
prometheus registry, so it won't be exposed; update the custom_metrics function
to call
METRICS_REGISTRY.register(Box::new(TOTAL_FILES_SCANNED_IN_HOTTIER_BY_DATE.clone()))
(or the equivalent register method used in this module) alongside the other
metrics registration, handling any register errors consistently with existing
patterns so the counter is available on /metrics.


pub static TOTAL_FILES_SCANNED_IN_QUERY_BY_DATE: Lazy<IntCounterVec> = Lazy::new(|| {
IntCounterVec::new(
Opts::new(
Expand Down Expand Up @@ -716,6 +728,17 @@ pub fn increment_files_scanned_in_query_by_date(count: u64, date: &str, tenant_i
.inc_by(count);
}

/// Adds `count` to `TOTAL_FILES_SCANNED_IN_HOTTIER_BY_DATE` for one label set.
///
/// The label values are passed as `[stream_name, date, tenant_id]`, which is
/// the order the counter's labels (`stream`, `date`, `tenant_id`) are declared
/// in. Note that this differs from the parameter order of this function.
pub fn increment_files_scanned_in_hottier_by_date(
    count: u64,
    date: &str,
    tenant_id: &str,
    stream_name: &str,
) {
    // Resolve the child counter for this label combination, then bump it.
    let counter =
        TOTAL_FILES_SCANNED_IN_HOTTIER_BY_DATE.with_label_values(&[stream_name, date, tenant_id]);
    counter.inc_by(count);
}

pub fn increment_bytes_scanned_in_query_by_date(bytes: u64, date: &str, tenant_id: &str) {
TOTAL_BYTES_SCANNED_IN_QUERY_BY_DATE
.with_label_values(&[date, tenant_id])
Expand Down
104 changes: 71 additions & 33 deletions src/query/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ use arrow_schema::SchemaRef;
use chrono::NaiveDateTime;
use chrono::{DateTime, Duration, Utc};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::catalog::SchemaProvider;
use datafusion::common::tree_node::Transformed;
use datafusion::execution::disk_manager::DiskManager;
use datafusion::execution::{
Expand All @@ -45,7 +46,7 @@ use datafusion::sql::sqlparser::dialect::PostgreSqlDialect;
use futures::Stream;
use futures::StreamExt;
use itertools::Itertools;
use once_cell::sync::Lazy;
use once_cell::sync::{Lazy, OnceCell};
use serde::{Deserialize, Serialize};
use serde_json::{Value, json};
use std::ops::Bound;
Expand All @@ -59,7 +60,6 @@ use tokio_stream::wrappers::UnboundedReceiverStream;
use tracing::Instrument;

use self::error::ExecuteError;
use self::stream_schema_provider::GlobalSchemaProvider;
pub use self::stream_schema_provider::PartialTimeFilter;
use crate::alerts::alert_structs::Conditions;
use crate::alerts::alerts_utils::get_filter_string;
Expand All @@ -72,7 +72,8 @@ use crate::handlers::http::query::QueryError;
use crate::metrics::increment_bytes_scanned_in_query_by_date;
use crate::option::Mode;
use crate::parseable::{DEFAULT_TENANT, PARSEABLE};
use crate::storage::{ObjectStorageProvider, ObjectStoreFormat};
use crate::query::stream_schema_provider::GlobalSchemaProvider;
use crate::storage::{ObjectStorage, ObjectStorageProvider, ObjectStoreFormat};
use crate::utils::time::{DATE_BIN_EPOCH_ANCHOR, TimeRange, count_api_bin_interval};

/// Boxed record-batch stream used as the streaming half of query results.
Expand All @@ -81,8 +82,13 @@ type BoxedBatchStream = SendableRecordBatchStream;
/// Result type returned by query execution: either collected batches or a streaming adapter, plus field names.
type QueryResult = Result<(Either<Vec<RecordBatch>, BoxedBatchStream>, Vec<String>), ExecuteError>;

// pub static QUERY_SESSION: Lazy<SessionContext> =
// Lazy::new(|| Query::create_session_context(PARSEABLE.storage()));
/// Optional, set-once factory for the DataFusion schema providers registered
/// on the query session.
///
/// When a value is present, schema registration calls
/// `ParseableSchemaProvider::new_provider` on it; when the cell is empty the
/// registration code falls back to building a `GlobalSchemaProvider` directly.
pub static SCHEMA_PROVIDER: OnceCell<Box<dyn ParseableSchemaProvider>> = OnceCell::new();

/// Additional physical optimizer rules registered by enterprise/plugins.
///
/// Must be populated BEFORE `QUERY_SESSION_STATE` is first accessed: the list
/// is read inside `Query::create_session_state` while the `SessionState` is
/// being built, and `QUERY_SESSION_STATE` is a `Lazy` that runs that function
/// on first access. That read uses `if let Ok(..)`, so if the lock cannot be
/// read at that point the extra rules are skipped without an error.
pub static ADDITIONAL_PHYSICAL_OPTIMIZER_RULES: Lazy<
RwLock<Vec<Arc<dyn datafusion::physical_optimizer::PhysicalOptimizerRule + Send + Sync>>>,
> = Lazy::new(|| RwLock::new(Vec::new()));

/// Process-wide DataFusion `SessionState`.
///
/// Built lazily, on first access, by calling `Query::create_session_state`
/// with `PARSEABLE.storage()`.
pub static QUERY_SESSION_STATE: Lazy<SessionState> =
Lazy::new(|| Query::create_session_state(PARSEABLE.storage()));
Expand All @@ -98,6 +104,15 @@ pub static QUERY_SESSION: Lazy<InMemorySessionContext> = Lazy::new(|| {
}
});

/// Factory trait for supplying a custom DataFusion [`SchemaProvider`].
///
/// An implementation stored in `SCHEMA_PROVIDER` is consulted whenever a schema
/// is registered on the query session; without one, a `GlobalSchemaProvider`
/// is used instead. Implementations must be `Send + Sync` because the factory
/// lives in a `static`.
pub trait ParseableSchemaProvider: Send + Sync {
/// Builds the schema provider to register for one schema.
///
/// * `storage` - optional object store handle; the callers in this file
///   always pass `Some(PARSEABLE.storage().get_object_store())`.
/// * `tenant_id` - the tenant the schema belongs to, or `None` when a single
///   default schema is being registered.
fn new_provider(
&self,
storage: Option<Arc<dyn ObjectStorage>>,
tenant_id: &Option<String>,
) -> Box<dyn SchemaProvider>;
}

/// Wrapper around a DataFusion `SessionContext` held behind an
/// `Arc<RwLock<..>>`.
pub struct InMemorySessionContext {
// The wrapped context; `add_schema` takes the write lock on it to register
// a new schema.
session_context: Arc<RwLock<SessionContext>>,
}
Expand All @@ -112,18 +127,23 @@ impl InMemorySessionContext {
}

pub fn add_schema(&self, tenant_id: &str) {
let schema_provider = if let Some(provider) = SCHEMA_PROVIDER.get() {
provider.new_provider(
Some(PARSEABLE.storage().get_object_store()),
&Some(tenant_id.to_owned()),
)
} else {
Box::new(GlobalSchemaProvider {
storage: PARSEABLE.storage().get_object_store(),
tenant_id: Some(tenant_id.to_owned()),
})
};
self.session_context
.write()
.expect("SessionContext should be writeable")
.catalog("datafusion")
.expect("Default catalog should be available")
.register_schema(
tenant_id,
Arc::new(GlobalSchemaProvider {
storage: PARSEABLE.storage().get_object_store(),
tenant_id: Some(tenant_id.to_owned()),
}),
)
.register_schema(tenant_id, schema_provider.into())
.expect("Should be able to register new schema");
}

Expand Down Expand Up @@ -179,29 +199,41 @@ impl Query {
// register multiple schemas
if let Some(tenants) = PARSEABLE.list_tenants() {
for t in tenants.iter() {
let schema_provider = Arc::new(GlobalSchemaProvider {
storage: storage.get_object_store(),
tenant_id: Some(t.clone()),
});
let _ = catalog.register_schema(t, schema_provider);
let schema_provider = if let Some(provider) = SCHEMA_PROVIDER.get() {
provider.new_provider(
Some(PARSEABLE.storage().get_object_store()),
&Some(t.to_owned()),
)
} else {
Box::new(GlobalSchemaProvider {
storage: PARSEABLE.storage().get_object_store(),
tenant_id: Some(t.to_owned()),
})
};
let _ = catalog.register_schema(t, schema_provider.into());
}
}
} else {
// register just one schema
let schema_provider = Arc::new(GlobalSchemaProvider {
storage: storage.get_object_store(),
tenant_id: None,
});
let schema_provider = if let Some(provider) = SCHEMA_PROVIDER.get() {
provider.new_provider(Some(PARSEABLE.storage().get_object_store()), &None)
} else {
Box::new(GlobalSchemaProvider {
storage: PARSEABLE.storage().get_object_store(),
tenant_id: None,
})
};

let _ = catalog.register_schema(
&state.config_options().catalog.default_schema,
schema_provider,
schema_provider.into(),
);
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}

SessionContext::new_with_state(state)
}

fn create_session_state(storage: Arc<dyn ObjectStorageProvider>) -> SessionState {
pub fn create_session_state(storage: Arc<dyn ObjectStorageProvider>) -> SessionState {
let runtime_config = storage
.get_datafusion_runtime()
.with_disk_manager_builder(DiskManager::builder());
Expand Down Expand Up @@ -260,11 +292,19 @@ impl Query {
.parquet
.schema_force_view_types = true;

SessionStateBuilder::new()
let mut builder = SessionStateBuilder::new()
.with_default_features()
.with_config(config)
.with_runtime_env(runtime)
.build()
.with_runtime_env(runtime);

// Append any additional physical optimizer rules (e.g., enterprise partial agg pushdown)
if let Ok(rules) = ADDITIONAL_PHYSICAL_OPTIMIZER_RULES.read() {
for rule in rules.iter() {
builder = builder.with_physical_optimizer_rule(Arc::clone(rule));
}
}

builder.build()
}

/// this function returns the result of the query
Expand Down Expand Up @@ -296,14 +336,12 @@ impl Query {
return Ok((Either::Left(vec![]), fields));
}

let plan = QUERY_SESSION
.get_ctx()
.state()
.create_physical_plan(df.logical_plan())
.await?;
let ctx = QUERY_SESSION.get_ctx();

let plan = ctx.state().create_physical_plan(df.logical_plan()).await?;

let results = if !is_streaming {
let task_ctx = QUERY_SESSION.get_ctx().task_ctx();
let task_ctx = ctx.task_ctx();

let batches = collect_partitioned(plan.clone(), task_ctx.clone())
.await?
Expand All @@ -319,7 +357,7 @@ impl Query {

Either::Left(batches)
} else {
let task_ctx = QUERY_SESSION.get_ctx().task_ctx();
let task_ctx = ctx.task_ctx();

let output_partitions = plan.output_partitioning().partition_count();

Expand Down
Loading
Loading