unexge · unexge · Feb 1, 2026 · Feb 1, 2026
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -14,5 +14,7 @@ jobs:
       - uses: actions/checkout@v4
         with:
           submodules: recursive
+      - name: Download public datasets
+        run: ./scripts/download-public-datasets.sh --all
       - uses: mlugg/setup-zig@v2
-      - run: zig build test --summary all
+      - run: zig build test -Doptimize=ReleaseSafe -Dci-tests=true --summary all
diff --git a/build.zig b/build.zig
@@ -89,10 +89,26 @@ pub fn build(b: *std.Build) void {
     parquet_testing.root_module.addImport("parzig", lib);
     const run_parquet_testing = b.addRunArtifact(parquet_testing);
 
+    const public_datasets_options = b.addOptions();
+    const ci_tests = b.option(bool, "ci-tests", "Enable CI-only tests for large public datasets") orelse false;
+    public_datasets_options.addOption(bool, "ci_tests", ci_tests);
+
+    const public_datasets_testing = b.addTest(.{
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("src/public_datasets_testing.zig"),
+            .target = target,
+            .optimize = optimize,
+        }),
+    });
+    public_datasets_testing.root_module.addImport("parzig", lib);
+    public_datasets_testing.root_module.addOptions("build_options", public_datasets_options);
+    const run_public_datasets_testing = b.addRunArtifact(public_datasets_testing);
+
     const test_step = b.step("test", "Run unit tests");
     test_step.dependOn(&run_lib_unit_tests.step);
     test_step.dependOn(&run_exe_unit_tests.step);
     test_step.dependOn(&run_parquet_testing.step);
+    test_step.dependOn(&run_public_datasets_testing.step);
 
     const test_lldb_step = b.step("test-lldb", "Debug unit tests with LLDB");
     const lldb = b.addSystemCommand(&.{"lldb"});

diff --git a/scripts/download-public-datasets.sh b/scripts/download-public-datasets.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(dirname "$SCRIPT_DIR")"
+DEST_DIR="$REPO_ROOT/testdata/public-datasets"
+
+download_file() {
+    local url="$1"
+    local dest="$2"
+
+    if [[ -f "$dest" ]]; then
+        echo "  Already exists: $(basename "$dest")"
+        return
+    fi
+
+    echo "  Downloading: $(basename "$dest")"
+    curl -fSL "$url" -o "$dest"
+}
+
+# =============================================================================
+# NYC Taxi Dataset
+# Source: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
+# =============================================================================
+# Fetch the NYC taxi Parquet files into "$DEST_DIR/nyc-taxi".
+#
+# Arguments:
+#   $1 - "all" also fetches the big files; any other value fetches only the
+#        small ones.
+download_nyc_taxi() {
+    local selection="$1"
+    local url_prefix="https://d37ci6vzurychx.cloudfront.net/trip-data"
+    local out_dir="$DEST_DIR/nyc-taxi"
+    local small_files=("green_tripdata_2025-10.parquet" "fhv_tripdata_2025-10.parquet")
+    local big_files=("yellow_tripdata_2025-10.parquet" "fhvhv_tripdata_2025-10.parquet")
+
+    mkdir -p "$out_dir"
+    echo "=== NYC Taxi Dataset ==="
+
+    echo "Downloading small files..."
+    for file in "${small_files[@]}"; do
+        download_file "$url_prefix/$file" "$out_dir/$file"
+    done
+
+    # The big files are opt-in; stop here unless they were requested.
+    [[ "$selection" == "all" ]] || return 0
+
+    echo "Downloading big files..."
+    for file in "${big_files[@]}"; do
+        download_file "$url_prefix/$file" "$out_dir/$file"
+    done
+}
+
+# =============================================================================
+# Add more datasets here following the same pattern
+# =============================================================================
+
+usage() {
+    echo "Usage: $0 [--small|--all]"
+    echo ""
+    echo "Download public Parquet datasets for testing."
+    echo ""
+    echo "Options:"
+    echo "  --small    Download only small files (default)"
+    echo "  --all      Download all files including large ones"
+    exit 0
+}
+
+MODE="small"
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --small)
+            MODE="small"
+            shift
+            ;;
+        --all)
+            MODE="all"
+            shift
+            ;;
+        --help|-h)
+            usage
+            ;;
+        *)
+            echo "Unknown option: $1"
+            usage
+            ;;
+    esac
+done
+
+# Make sure the destination root exists, then fetch each dataset.
+mkdir -p "$DEST_DIR"
+
+download_nyc_taxi "$MODE"
+
+# Blank separator line followed by the completion message.
+printf '\n%s\n' "Done!"
diff --git a/src/public_datasets_testing.zig b/src/public_datasets_testing.zig
@@ -0,0 +1,117 @@
+const std = @import("std");
+const parzig = @import("parzig");
+const build_options = @import("build_options");
+
+const File = parzig.parquet.File;
+const testing = std.testing;
+const io = testing.io;
+const Io = std.Io;
+
+const ci_tests = build_options.ci_tests;
+
+/// Reads every column of every row group in `file` through
+/// `readColumnDynamic`, discarding the returned values.
+/// The first read error is propagated to the caller.
+fn readAllRowGroups(file: *File) !void {
+    const row_groups = file.metadata.row_groups;
+    for (0..row_groups.len) |rg_idx| {
+        var rg = file.rowGroup(rg_idx);
+
+        const column_count = row_groups[rg_idx].columns.len;
+        for (0..column_count) |col_idx| {
+            _ = try rg.readColumnDynamic(col_idx);
+        }
+    }
+}
+
+// =============================================================================
+// Small datasets - always run
+// =============================================================================
+
+// Reads the green-taxi file, checks its row-group and row counts, spot-checks
+// the first three values of several columns, then reads every column.
+test "nyc taxi: green tripdata 2025-10" {
+    // Bind the handle to a name so it can be closed; chaining `.reader(...)`
+    // onto the `openFile` result would leave the descriptor open.
+    const parquet_file = try Io.Dir.cwd().openFile(io, "testdata/public-datasets/nyc-taxi/green_tripdata_2025-10.parquet", .{ .mode = .read_only });
+    defer parquet_file.close(io);
+
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = parquet_file.reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    // Declared after the close above, so it runs first (defers are LIFO).
+    defer file.deinit();
+
+    try testing.expectEqual(1, file.metadata.row_groups.len);
+    try testing.expectEqual(49416, file.metadata.num_rows);
+
+    var rg = file.rowGroup(0);
+
+    // VendorID (i32)
+    const vendor_ids = try rg.readColumn(i32, 0);
+    try testing.expectEqualSlices(i32, &[_]i32{ 2, 2, 2 }, vendor_ids[0..3]);
+
+    // PULocationID (i32)
+    const pu_location_ids = try rg.readColumn(i32, 5);
+    try testing.expectEqualSlices(i32, &[_]i32{ 247, 66, 244 }, pu_location_ids[0..3]);
+
+    // DOLocationID (i32)
+    const do_location_ids = try rg.readColumn(i32, 6);
+    try testing.expectEqualSlices(i32, &[_]i32{ 69, 25, 244 }, do_location_ids[0..3]);
+
+    // passenger_count (i64, nullable)
+    const passenger_counts = try rg.readColumn(?i64, 7);
+    try testing.expectEqualSlices(?i64, &[_]?i64{ 1, 1, 1 }, passenger_counts[0..3]);
+
+    // trip_distance (f64)
+    const trip_distances = try rg.readColumn(f64, 8);
+    try testing.expectEqualSlices(f64, &[_]f64{ 0.7, 1.61, 0.0 }, trip_distances[0..3]);
+
+    // fare_amount (f64)
+    const fare_amounts = try rg.readColumn(f64, 9);
+    try testing.expectEqualSlices(f64, &[_]f64{ 5.8, 11.4, 10.0 }, fare_amounts[0..3]);
+
+    try readAllRowGroups(&file);
+}
+
+// Reads the FHV file, checks its row-group and row counts, spot-checks the
+// first value of two string columns, then reads every column.
+test "nyc taxi: fhv tripdata 2025-10" {
+    // Bind the handle to a name so it can be closed; chaining `.reader(...)`
+    // onto the `openFile` result would leave the descriptor open.
+    const parquet_file = try Io.Dir.cwd().openFile(io, "testdata/public-datasets/nyc-taxi/fhv_tripdata_2025-10.parquet", .{ .mode = .read_only });
+    defer parquet_file.close(io);
+
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = parquet_file.reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    // Declared after the close above, so it runs first (defers are LIFO).
+    defer file.deinit();
+
+    try testing.expectEqual(3, file.metadata.row_groups.len);
+    try testing.expectEqual(2446615, file.metadata.num_rows);
+
+    var rg = file.rowGroup(0);
+
+    // dispatching_base_num (string)
+    const base_nums = try rg.readColumn([]const u8, 0);
+    try testing.expectEqualDeep(@as([]const u8, "B00009"), base_nums[0]);
+
+    // Affiliated_base_number (string)
+    const affiliated_base_nums = try rg.readColumn(?[]const u8, 6);
+    try testing.expectEqualDeep(@as(?[]const u8, "B00009"), affiliated_base_nums[0]);
+
+    try readAllRowGroups(&file);
+}
+
+// =============================================================================
+// Big datasets - CI only
+// =============================================================================
+
+// Skipped unless the `ci_tests` build option is set. Checks that the file has
+// at least one row and one row group, then reads every column.
+test "nyc taxi: yellow tripdata 2025-10 (ci only)" {
+    if (!ci_tests) return error.SkipZigTest;
+
+    // Bind the handle to a name so it can be closed; chaining `.reader(...)`
+    // onto the `openFile` result would leave the descriptor open.
+    const parquet_file = try Io.Dir.cwd().openFile(io, "testdata/public-datasets/nyc-taxi/yellow_tripdata_2025-10.parquet", .{ .mode = .read_only });
+    defer parquet_file.close(io);
+
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = parquet_file.reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    // Declared after the close above, so it runs first (defers are LIFO).
+    defer file.deinit();
+
+    try testing.expect(file.metadata.num_rows > 0);
+    try testing.expect(file.metadata.row_groups.len > 0);
+
+    try readAllRowGroups(&file);
+}
+
+// Skipped unless the `ci_tests` build option is set. Only the metadata is
+// checked here; reading the column data is disabled (see the TODO below).
+test "nyc taxi: fhvhv tripdata 2025-10 (ci only)" {
+    if (!ci_tests) return error.SkipZigTest;
+
+    // Bind the handle to a name so it can be closed; chaining `.reader(...)`
+    // onto the `openFile` result would leave the descriptor open.
+    const parquet_file = try Io.Dir.cwd().openFile(io, "testdata/public-datasets/nyc-taxi/fhvhv_tripdata_2025-10.parquet", .{ .mode = .read_only });
+    defer parquet_file.close(io);
+
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = parquet_file.reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    // Declared after the close above, so it runs first (defers are LIFO).
+    defer file.deinit();
+
+    try testing.expect(file.metadata.num_rows > 0);
+    try testing.expect(file.metadata.row_groups.len > 0);
+
+    // TODO: This causes OOM on the CI. We probably need to have a separate arena for each row group and de-allocate it between.
+    // try readAllRowGroups(&file);
+}
diff --git a/testdata/public-datasets/.gitignore b/testdata/public-datasets/.gitignore
@@ -0,0 +1 @@
+**/*.parquet
diff --git a/testdata/public-datasets/README.md b/testdata/public-datasets/README.md
@@ -0,0 +1,48 @@
+# Public Datasets
+
+This directory holds public Parquet datasets used for testing. The `.parquet`
+files are git-ignored rather than committed; they are downloaded in CI or
+manually with the download script.
+
+## Directory Structure
+
+```
+public-datasets/
+├── nyc-taxi/           # NYC Taxi trip data
+│   ├── green_tripdata_2025-10.parquet
+│   ├── fhv_tripdata_2025-10.parquet
+│   ├── yellow_tripdata_2025-10.parquet (CI only)
+│   └── fhvhv_tripdata_2025-10.parquet (CI only)
+└── <future-dataset>/   # More datasets can be added
+```
+
+## Datasets
+
+### NYC Taxi Data
+
+Source: [TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
+
+| File | Size | CI Only |
+|------|------|---------|
+| `green_tripdata_2025-10.parquet` | ~1MB | No |
+| `fhv_tripdata_2025-10.parquet` | ~25MB | No |
+| `yellow_tripdata_2025-10.parquet` | ~50MB | Yes |
+| `fhvhv_tripdata_2025-10.parquet` | ~400MB | Yes |
+
+## Download
+
+Run the download script from the repository root:
+
+```bash
+# Download small files only (for local testing)
+./scripts/download-public-datasets.sh
+
+# Download all files including large ones (for CI)
+./scripts/download-public-datasets.sh --all
+```
+
+## Adding New Datasets
+
+1. Add a new `download_<dataset>()` function to `scripts/download-public-datasets.sh`
+2. Call it from the main section of the script
+3. Create corresponding tests in `src/public_datasets_testing.zig`
+4. Update this README with dataset documentation