diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 56846a3..25cf613 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -14,5 +14,7 @@ jobs:
       - uses: actions/checkout@v4
         with:
           submodules: recursive
+      - name: Download public datasets
+        run: ./scripts/download-public-datasets.sh --all
       - uses: mlugg/setup-zig@v2
-      - run: zig build test --summary all
+      - run: zig build test -Doptimize=ReleaseSafe -Dci-tests=true --summary all
diff --git a/build.zig b/build.zig
index 8b7a1ea..ebc6c7b 100644
--- a/build.zig
+++ b/build.zig
@@ -89,10 +89,26 @@ pub fn build(b: *std.Build) void {
     parquet_testing.root_module.addImport("parzig", lib);
     const run_parquet_testing = b.addRunArtifact(parquet_testing);
 
+    const public_datasets_options = b.addOptions();
+    const ci_tests = b.option(bool, "ci-tests", "Enable CI-only tests for large public datasets") orelse false;
+    public_datasets_options.addOption(bool, "ci_tests", ci_tests);
+
+    const public_datasets_testing = b.addTest(.{
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("src/public_datasets_testing.zig"),
+            .target = target,
+            .optimize = optimize,
+        }),
+    });
+    public_datasets_testing.root_module.addImport("parzig", lib);
+    public_datasets_testing.root_module.addOptions("build_options", public_datasets_options);
+    const run_public_datasets_testing = b.addRunArtifact(public_datasets_testing);
+
     const test_step = b.step("test", "Run unit tests");
     test_step.dependOn(&run_lib_unit_tests.step);
     test_step.dependOn(&run_exe_unit_tests.step);
     test_step.dependOn(&run_parquet_testing.step);
+    test_step.dependOn(&run_public_datasets_testing.step);
 
     const test_lldb_step = b.step("test-lldb", "Debug unit tests with LLDB");
     const lldb = b.addSystemCommand(&.{"lldb"});
diff --git a/scripts/download-public-datasets.sh b/scripts/download-public-datasets.sh
new file mode 100755
index 0000000..249dbc6
--- /dev/null
+++ b/scripts/download-public-datasets.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+# Download the public Parquet datasets read by src/public_datasets_testing.zig
+# into testdata/public-datasets/.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(dirname "$SCRIPT_DIR")"
+DEST_DIR="$REPO_ROOT/testdata/public-datasets"
+
+# Download URL $1 to path $2 unless $2 already exists.
+# curl writes to "$2.part" and the file is renamed only after curl succeeds,
+# so a failed or interrupted transfer is never treated as "Already exists".
+download_file() {
+    local url="$1"
+    local dest="$2"
+    local partial="$dest.part"
+
+    if [[ -f "$dest" ]]; then
+        echo " Already exists: $(basename "$dest")"
+        return
+    fi
+
+    echo " Downloading: $(basename "$dest")"
+    if ! curl -fSL "$url" -o "$partial"; then
+        rm -f "$partial"
+        return 1
+    fi
+    mv "$partial" "$dest"
+}
+
+# =============================================================================
+# NYC Taxi Dataset
+# Source: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
+# =============================================================================
+# $1 is the mode: "all" also fetches the big files; any other value fetches
+# only the small ones.
+download_nyc_taxi() {
+    local mode="$1"
+    local base_url="https://d37ci6vzurychx.cloudfront.net/trip-data"
+    local dest="$DEST_DIR/nyc-taxi"
+    local file
+    mkdir -p "$dest"
+
+    echo "=== NYC Taxi Dataset ==="
+
+    local small_files=(
+        "green_tripdata_2025-10.parquet"
+        "fhv_tripdata_2025-10.parquet"
+    )
+
+    local big_files=(
+        "yellow_tripdata_2025-10.parquet"
+        "fhvhv_tripdata_2025-10.parquet"
+    )
+
+    echo "Downloading small files..."
+    for file in "${small_files[@]}"; do
+        download_file "$base_url/$file" "$dest/$file"
+    done
+
+    if [[ "$mode" == "all" ]]; then
+        echo "Downloading big files..."
+        for file in "${big_files[@]}"; do
+            download_file "$base_url/$file" "$dest/$file"
+        done
+    fi
+}
+
+# =============================================================================
+# Add more datasets here following the same pattern
+# =============================================================================
+
+# Print the help text and exit with status $1 (0 when omitted).
+usage() {
+    echo "Usage: $0 [--small|--all]"
+    echo ""
+    echo "Download public Parquet datasets for testing."
+    echo ""
+    echo "Options:"
+    echo " --small Download only small files (default)"
+    echo " --all Download all files including large ones"
+    exit "${1:-0}"
+}
+
+MODE="small"
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --small)
+            MODE="small"
+            shift
+            ;;
+        --all)
+            MODE="all"
+            shift
+            ;;
+        --help|-h)
+            usage
+            ;;
+        *)
+            # Report the bad option on stderr and fail so callers notice.
+            echo "Unknown option: $1" >&2
+            usage 1 >&2
+            ;;
+    esac
+done
+
+mkdir -p "$DEST_DIR"
+
+download_nyc_taxi "$MODE"
+
+echo ""
+echo "Done!"
diff --git a/src/public_datasets_testing.zig b/src/public_datasets_testing.zig
new file mode 100644
index 0000000..75566db
--- /dev/null
+++ b/src/public_datasets_testing.zig
@@ -0,0 +1,131 @@
+//! Integration tests that read real-world public Parquet files with parzig.
+//! The files are expected under testdata/public-datasets/, relative to the
+//! working directory the tests run in.
+
+const std = @import("std");
+const parzig = @import("parzig");
+const build_options = @import("build_options");
+
+const File = parzig.parquet.File;
+const testing = std.testing;
+const io = testing.io;
+const Io = std.Io;
+
+const ci_tests = build_options.ci_tests;
+
+/// Reads every column of every row group in `file` and discards the values.
+/// Returns the first error reported by `readColumnDynamic`.
+fn readAllRowGroups(file: *File) !void {
+    for (file.metadata.row_groups, 0..) |rg_metadata, rg_idx| {
+        var rg = file.rowGroup(rg_idx);
+
+        for (rg_metadata.columns, 0..) |_, col_idx| {
+            _ = try rg.readColumnDynamic(col_idx);
+        }
+    }
+}
+
+// =============================================================================
+// Small datasets - always run
+// =============================================================================
+
+test "nyc taxi: green tripdata 2025-10" {
+    var reader_buf: [4096]u8 = undefined;
+    const handle = try Io.Dir.cwd().openFile(io, "testdata/public-datasets/nyc-taxi/green_tripdata_2025-10.parquet", .{ .mode = .read_only });
+    defer handle.close(io);
+    var file_reader = handle.reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expectEqual(1, file.metadata.row_groups.len);
+    try testing.expectEqual(49416, file.metadata.num_rows);
+
+    var rg = file.rowGroup(0);
+
+    // VendorID (i32)
+    const vendor_ids = try rg.readColumn(i32, 0);
+    try testing.expectEqualSlices(i32, &[_]i32{ 2, 2, 2 }, vendor_ids[0..3]);
+
+    // PULocationID (i32)
+    const pu_location_ids = try rg.readColumn(i32, 5);
+    try testing.expectEqualSlices(i32, &[_]i32{ 247, 66, 244 }, pu_location_ids[0..3]);
+
+    // DOLocationID (i32)
+    const do_location_ids = try rg.readColumn(i32, 6);
+    try testing.expectEqualSlices(i32, &[_]i32{ 69, 25, 244 }, do_location_ids[0..3]);
+
+    // passenger_count (i64, nullable)
+    const passenger_counts = try rg.readColumn(?i64, 7);
+    try testing.expectEqualSlices(?i64, &[_]?i64{ 1, 1, 1 }, passenger_counts[0..3]);
+
+    // trip_distance (f64)
+    const trip_distances = try rg.readColumn(f64, 8);
+    try testing.expectEqualSlices(f64, &[_]f64{ 0.7, 1.61, 0.0 }, trip_distances[0..3]);
+
+    // fare_amount (f64)
+    const fare_amounts = try rg.readColumn(f64, 9);
+    try testing.expectEqualSlices(f64, &[_]f64{ 5.8, 11.4, 10.0 }, fare_amounts[0..3]);
+
+    try readAllRowGroups(&file);
+}
+
+test "nyc taxi: fhv tripdata 2025-10" {
+    var reader_buf: [4096]u8 = undefined;
+    const handle = try Io.Dir.cwd().openFile(io, "testdata/public-datasets/nyc-taxi/fhv_tripdata_2025-10.parquet", .{ .mode = .read_only });
+    defer handle.close(io);
+    var file_reader = handle.reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expectEqual(3, file.metadata.row_groups.len);
+    try testing.expectEqual(2446615, file.metadata.num_rows);
+
+    var rg = file.rowGroup(0);
+
+    // dispatching_base_num (string)
+    const base_nums = try rg.readColumn([]const u8, 0);
+    try testing.expectEqualDeep(@as([]const u8, "B00009"), base_nums[0]);
+
+    // Affiliated_base_number (string)
+    const affiliated_base_nums = try rg.readColumn(?[]const u8, 6);
+    try testing.expectEqualDeep(@as(?[]const u8, "B00009"), affiliated_base_nums[0]);
+
+    try readAllRowGroups(&file);
+}
+
+// =============================================================================
+// Big datasets - CI only
+// =============================================================================
+
+test "nyc taxi: yellow tripdata 2025-10 (ci only)" {
+    if (!ci_tests) return error.SkipZigTest;
+
+    var reader_buf: [4096]u8 = undefined;
+    const handle = try Io.Dir.cwd().openFile(io, "testdata/public-datasets/nyc-taxi/yellow_tripdata_2025-10.parquet", .{ .mode = .read_only });
+    defer handle.close(io);
+    var file_reader = handle.reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expect(file.metadata.num_rows > 0);
+    try testing.expect(file.metadata.row_groups.len > 0);
+
+    try readAllRowGroups(&file);
+}
+
+test "nyc taxi: fhvhv tripdata 2025-10 (ci only)" {
+    if (!ci_tests) return error.SkipZigTest;
+
+    var reader_buf: [4096]u8 = undefined;
+    const handle = try Io.Dir.cwd().openFile(io, "testdata/public-datasets/nyc-taxi/fhvhv_tripdata_2025-10.parquet", .{ .mode = .read_only });
+    defer handle.close(io);
+    var file_reader = handle.reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expect(file.metadata.num_rows > 0);
+    try testing.expect(file.metadata.row_groups.len > 0);
+
+    // TODO: This causes OOM on the CI. We probably need to have a separate arena for each row group and deallocate it in between.
+    // try readAllRowGroups(&file);
+}
diff --git a/testdata/public-datasets/.gitignore b/testdata/public-datasets/.gitignore
new file mode 100644
index 0000000..ae5182d
--- /dev/null
+++ b/testdata/public-datasets/.gitignore
@@ -0,0 +1,2 @@
+**/*.parquet
+**/*.parquet.part
diff --git a/testdata/public-datasets/README.md b/testdata/public-datasets/README.md
new file mode 100644
index 0000000..973e621
--- /dev/null
+++ b/testdata/public-datasets/README.md
@@ -0,0 +1,51 @@
+# Public Datasets
+
+This directory contains public Parquet datasets for testing. Files are downloaded
+during CI or manually using the download script.
+
+## Directory Structure
+
+```
+public-datasets/
+├── nyc-taxi/ # NYC Taxi trip data
+│   ├── green_tripdata_2025-10.parquet
+│   ├── fhv_tripdata_2025-10.parquet
+│   ├── yellow_tripdata_2025-10.parquet (CI only)
+│   └── fhvhv_tripdata_2025-10.parquet (CI only)
+└── <dataset>/ # More datasets can be added
+```
+
+## Datasets
+
+### NYC Taxi Data
+
+Source: [TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
+
+| File | Size | CI Only |
+|------|------|---------|
+| `green_tripdata_2025-10.parquet` | ~1MB | No |
+| `fhv_tripdata_2025-10.parquet` | ~25MB | No |
+| `yellow_tripdata_2025-10.parquet` | ~50MB | Yes |
+| `fhvhv_tripdata_2025-10.parquet` | ~400MB | Yes |
+
+## Download
+
+Run the download script from the repository root:
+
+```bash
+# Download small files only (for local testing)
+./scripts/download-public-datasets.sh
+
+# Download all files including large ones (for CI)
+./scripts/download-public-datasets.sh --all
+```
+
+The small-dataset tests are part of `zig build test` and are not skipped when a
+file is missing, so download at least the small files before running the tests.
+
+## Adding New Datasets
+
+1. Add a new `download_<dataset>()` function to `scripts/download-public-datasets.sh`
+2. Call it from the main section of the script
+3. Create corresponding tests in `src/public_datasets_testing.zig`
+4. Update this README with dataset documentation