Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,7 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Download public datasets
run: ./scripts/download-public-datasets.sh --all
- uses: mlugg/setup-zig@v2
- run: zig build test --summary all
- run: zig build test -Doptimize=ReleaseSafe -Dci-tests=true --summary all
16 changes: 16 additions & 0 deletions build.zig
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,26 @@ pub fn build(b: *std.Build) void {
parquet_testing.root_module.addImport("parzig", lib);
const run_parquet_testing = b.addRunArtifact(parquet_testing);

const public_datasets_options = b.addOptions();
const ci_tests = b.option(bool, "ci-tests", "Enable CI-only tests for large public datasets") orelse false;
public_datasets_options.addOption(bool, "ci_tests", ci_tests);

const public_datasets_testing = b.addTest(.{
.root_module = b.createModule(.{
.root_source_file = b.path("src/public_datasets_testing.zig"),
.target = target,
.optimize = optimize,
}),
});
public_datasets_testing.root_module.addImport("parzig", lib);
public_datasets_testing.root_module.addOptions("build_options", public_datasets_options);
const run_public_datasets_testing = b.addRunArtifact(public_datasets_testing);

const test_step = b.step("test", "Run unit tests");
test_step.dependOn(&run_lib_unit_tests.step);
test_step.dependOn(&run_exe_unit_tests.step);
test_step.dependOn(&run_parquet_testing.step);
test_step.dependOn(&run_public_datasets_testing.step);

const test_lldb_step = b.step("test-lldb", "Debug unit tests with LLDB");
const lldb = b.addSystemCommand(&.{"lldb"});
Expand Down
98 changes: 98 additions & 0 deletions scripts/download-public-datasets.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/bin/bash
# Abort on command failures, on use of unset variables, and on a failure in
# any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Absolute directory that contains this script, regardless of the caller's
# working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# The repository root is the parent of the script directory.
REPO_ROOT="$(dirname "$SCRIPT_DIR")"
# Every dataset is stored somewhere below this directory.
DEST_DIR="$REPO_ROOT/testdata/public-datasets"

# Download the URL in $1 to the path in $2 unless that path already exists.
#
# The payload is written to "$dest.part" first and renamed into place only
# after curl succeeds, so a failed or interrupted transfer never leaves a
# truncated file behind that a later run would report as "Already exists".
#
# Returns 0 on success (or when the file is already present) and curl's exit
# status when the transfer fails.
download_file() {
    local url="$1"
    local dest="$2"
    local partial="$dest.part"
    local rc=0

    if [[ -f "$dest" ]]; then
        echo " Already exists: $(basename "$dest")"
        return
    fi

    echo " Downloading: $(basename "$dest")"
    curl -fSL "$url" -o "$partial" || rc=$?
    if [[ "$rc" -ne 0 ]]; then
        # Remove whatever curl managed to write before it failed.
        rm -f "$partial"
        return "$rc"
    fi

    mv -f "$partial" "$dest"
}

# =============================================================================
# NYC Taxi Dataset
# Source: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
# =============================================================================
# Fetch the NYC taxi Parquet files into "$DEST_DIR/nyc-taxi".
#
# $1 - download mode (default "small"): any value fetches the small files;
#      "all" additionally fetches the big ones.
#
# Each file is fetched through download_file, so files that are already
# present are skipped.
download_nyc_taxi() {
    local mode="${1:-small}"
    local base_url="https://d37ci6vzurychx.cloudfront.net/trip-data"
    local dest="$DEST_DIR/nyc-taxi"
    # Keep the loop variable out of the caller's scope.
    local file
    mkdir -p "$dest"

    echo "=== NYC Taxi Dataset ==="

    local small_files=(
        "green_tripdata_2025-10.parquet"
        "fhv_tripdata_2025-10.parquet"
    )

    local big_files=(
        "yellow_tripdata_2025-10.parquet"
        "fhvhv_tripdata_2025-10.parquet"
    )

    echo "Downloading small files..."
    for file in "${small_files[@]}"; do
        download_file "$base_url/$file" "$dest/$file"
    done

    if [[ "$mode" == "all" ]]; then
        echo "Downloading big files..."
        for file in "${big_files[@]}"; do
            download_file "$base_url/$file" "$dest/$file"
        done
    fi
}

# =============================================================================
# Add more datasets here following the same pattern
# =============================================================================

# Print the command-line help and terminate the script.
#
# $1 - exit status to terminate with (default 0, which is what --help uses).
usage() {
    local status="${1:-0}"

    echo "Usage: $0 [--small|--all]"
    echo ""
    echo "Download public Parquet datasets for testing."
    echo ""
    echo "Options:"
    echo " --small Download only small files (default)"
    echo " --all Download all files including large ones"
    exit "$status"
}

# Dataset selection; the flags parsed below may override it. When both
# --small and --all are given, the last one wins.
MODE="small"

while [[ $# -gt 0 ]]; do
    case "$1" in
        --small)
            MODE="small"
            shift
            ;;
        --all)
            MODE="all"
            shift
            ;;
        --help|-h)
            usage
            ;;
        *)
            # An unrecognized argument is an error: report it on stderr and
            # exit non-zero so a mistyped flag (e.g. in CI) is not mistaken
            # for a successful run.
            echo "Unknown option: $1" >&2
            usage 1 >&2
            ;;
    esac
done

mkdir -p "$DEST_DIR"

download_nyc_taxi "$MODE"

echo ""
echo "Done!"
117 changes: 117 additions & 0 deletions src/public_datasets_testing.zig
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
const std = @import("std");
const parzig = @import("parzig");
const build_options = @import("build_options");

const File = parzig.parquet.File;
const testing = std.testing;
const io = testing.io;
const Io = std.Io;

const ci_tests = build_options.ci_tests;

/// Reads every column of every row group in `file` through
/// `readColumnDynamic`, discarding the results.
///
/// Returns the first error reported by `readColumnDynamic`, if any.
fn readAllRowGroups(file: *File) !void {
    for (file.metadata.row_groups, 0..) |row_group_metadata, row_group_index| {
        var row_group = file.rowGroup(row_group_index);

        const column_count = row_group_metadata.columns.len;
        var column_index: usize = 0;
        while (column_index < column_count) : (column_index += 1) {
            _ = try row_group.readColumnDynamic(column_index);
        }
    }
}

// =============================================================================
// Small datasets - always run
// =============================================================================

// Opens the green-taxi dataset, checks its row-group and row counts,
// spot-checks the first three values of several typed columns in row group 0,
// and finally reads every column of every row group.
test "nyc taxi: green tripdata 2025-10" {
    var reader_buf: [4096]u8 = undefined;
    const handle = try Io.Dir.cwd().openFile(io, "testdata/public-datasets/nyc-taxi/green_tripdata_2025-10.parquet", .{ .mode = .read_only });
    // Release the file handle when the test ends. Registered before
    // `file.deinit()` so that it runs after it (defers run in LIFO order).
    defer handle.close(io);
    var file_reader = handle.reader(io, &reader_buf);
    var file = try File.read(testing.allocator, &file_reader);
    defer file.deinit();

    try testing.expectEqual(1, file.metadata.row_groups.len);
    try testing.expectEqual(49416, file.metadata.num_rows);

    var rg = file.rowGroup(0);

    // VendorID (i32)
    const vendor_ids = try rg.readColumn(i32, 0);
    try testing.expectEqualSlices(i32, &[_]i32{ 2, 2, 2 }, vendor_ids[0..3]);

    // PULocationID (i32)
    const pu_location_ids = try rg.readColumn(i32, 5);
    try testing.expectEqualSlices(i32, &[_]i32{ 247, 66, 244 }, pu_location_ids[0..3]);

    // DOLocationID (i32)
    const do_location_ids = try rg.readColumn(i32, 6);
    try testing.expectEqualSlices(i32, &[_]i32{ 69, 25, 244 }, do_location_ids[0..3]);

    // passenger_count (i64, nullable)
    const passenger_counts = try rg.readColumn(?i64, 7);
    try testing.expectEqualSlices(?i64, &[_]?i64{ 1, 1, 1 }, passenger_counts[0..3]);

    // trip_distance (f64)
    const trip_distances = try rg.readColumn(f64, 8);
    try testing.expectEqualSlices(f64, &[_]f64{ 0.7, 1.61, 0.0 }, trip_distances[0..3]);

    // fare_amount (f64)
    const fare_amounts = try rg.readColumn(f64, 9);
    try testing.expectEqualSlices(f64, &[_]f64{ 5.8, 11.4, 10.0 }, fare_amounts[0..3]);

    // Make sure every remaining column in the file can be read as well.
    try readAllRowGroups(&file);
}

// Opens the FHV dataset, checks its row-group and row counts, spot-checks the
// first value of two string columns in row group 0, and finally reads every
// column of every row group.
test "nyc taxi: fhv tripdata 2025-10" {
    var reader_buf: [4096]u8 = undefined;
    const handle = try Io.Dir.cwd().openFile(io, "testdata/public-datasets/nyc-taxi/fhv_tripdata_2025-10.parquet", .{ .mode = .read_only });
    // Release the file handle when the test ends. Registered before
    // `file.deinit()` so that it runs after it (defers run in LIFO order).
    defer handle.close(io);
    var file_reader = handle.reader(io, &reader_buf);
    var file = try File.read(testing.allocator, &file_reader);
    defer file.deinit();

    try testing.expectEqual(3, file.metadata.row_groups.len);
    try testing.expectEqual(2446615, file.metadata.num_rows);

    var rg = file.rowGroup(0);

    // dispatching_base_num (string)
    const base_nums = try rg.readColumn([]const u8, 0);
    try testing.expectEqualStrings("B00009", base_nums[0]);

    // Affiliated_base_number (string, nullable)
    const affiliated_base_nums = try rg.readColumn(?[]const u8, 6);
    try testing.expectEqualDeep(@as(?[]const u8, "B00009"), affiliated_base_nums[0]);

    // Make sure every remaining column in the file can be read as well.
    try readAllRowGroups(&file);
}

// =============================================================================
// Big datasets - CI only
// =============================================================================

// CI-only: opens the yellow-taxi dataset, checks that it reports at least one
// row and one row group, and reads every column of every row group. Skipped
// unless the `ci_tests` build option is enabled.
test "nyc taxi: yellow tripdata 2025-10 (ci only)" {
    if (!ci_tests) return error.SkipZigTest;

    var reader_buf: [4096]u8 = undefined;
    const handle = try Io.Dir.cwd().openFile(io, "testdata/public-datasets/nyc-taxi/yellow_tripdata_2025-10.parquet", .{ .mode = .read_only });
    // Release the file handle when the test ends. Registered before
    // `file.deinit()` so that it runs after it (defers run in LIFO order).
    defer handle.close(io);
    var file_reader = handle.reader(io, &reader_buf);
    var file = try File.read(testing.allocator, &file_reader);
    defer file.deinit();

    try testing.expect(file.metadata.num_rows > 0);
    try testing.expect(file.metadata.row_groups.len > 0);

    try readAllRowGroups(&file);
}

// CI-only: opens the high-volume FHV dataset and checks that it reports at
// least one row and one row group. Skipped unless the `ci_tests` build option
// is enabled. Column data is not read here; see the TODO below.
test "nyc taxi: fhvhv tripdata 2025-10 (ci only)" {
    if (!ci_tests) return error.SkipZigTest;

    var reader_buf: [4096]u8 = undefined;
    const handle = try Io.Dir.cwd().openFile(io, "testdata/public-datasets/nyc-taxi/fhvhv_tripdata_2025-10.parquet", .{ .mode = .read_only });
    // Release the file handle when the test ends. Registered before
    // `file.deinit()` so that it runs after it (defers run in LIFO order).
    defer handle.close(io);
    var file_reader = handle.reader(io, &reader_buf);
    var file = try File.read(testing.allocator, &file_reader);
    defer file.deinit();

    try testing.expect(file.metadata.num_rows > 0);
    try testing.expect(file.metadata.row_groups.len > 0);

    // TODO: This causes OOM on the CI. We probably need to have a separate arena for each row group and de-allocate it between.
    // try readAllRowGroups(&file);
}
1 change: 1 addition & 0 deletions testdata/public-datasets/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
**/*.parquet
48 changes: 48 additions & 0 deletions testdata/public-datasets/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Public Datasets

This directory contains public Parquet datasets for testing. The `.parquet` files
are not checked into the repository (they are git-ignored); they are downloaded
during CI or manually using the download script.

## Directory Structure

```
public-datasets/
├── nyc-taxi/ # NYC Taxi trip data
│ ├── green_tripdata_2025-10.parquet
│ ├── fhv_tripdata_2025-10.parquet
│ ├── yellow_tripdata_2025-10.parquet (CI only)
│ └── fhvhv_tripdata_2025-10.parquet (CI only)
└── <future-dataset>/ # More datasets can be added
```

## Datasets

### NYC Taxi Data

Source: [TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page)

| File | Size | CI Only |
|------|------|---------|
| `green_tripdata_2025-10.parquet` | ~1MB | No |
| `fhv_tripdata_2025-10.parquet` | ~25MB | No |
| `yellow_tripdata_2025-10.parquet` | ~50MB | Yes |
| `fhvhv_tripdata_2025-10.parquet` | ~400MB | Yes |

## Download

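Run the download script from the repository root. The small files must be present
before running `zig build test`, because the always-on tests open them directly: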

```bash
# Download small files only (for local testing)
./scripts/download-public-datasets.sh

# Download all files including large ones (for CI)
./scripts/download-public-datasets.sh --all
```

## Adding New Datasets

1. Add a new `download_<dataset>()` function to `scripts/download-public-datasets.sh`
2. Call it from the main section of the script
3. Create corresponding tests in `src/public_datasets_testing.zig`
4. Update this README with dataset documentation