diff --git a/.actlignore b/.actlignore new file mode 100644 index 0000000..07b529c --- /dev/null +++ b/.actlignore @@ -0,0 +1,15 @@ +# Keep large experiment data on /mnt/diffuse-shared/waterflow (also exposed as +# /data inside the ACTL image), not in the synced source checkout. +data/ +flow_cache/ +wandb/ +logs/ +outputs/ +checkpoints/ +*.ckpt +*.h5 +*.hdf5 +*.pt +*.pth +*.safetensors +*.wandb diff --git a/.github/workflows/astera-docker.yml b/.github/workflows/astera-docker.yml new file mode 100644 index 0000000..5e8d565 --- /dev/null +++ b/.github/workflows/astera-docker.yml @@ -0,0 +1,66 @@ +name: Astera ACTL Docker image + +on: + push: + branches: [main] + paths: + - Dockerfile.astera + - .github/workflows/astera-docker.yml + workflow_dispatch: + +env: + ASTERA_REGISTRY: harbor.astera.sh + ASTERA_IMAGE_NAME: library/waterflow + ACTL_VERSION_TAG: main-actl-2026-06-09 + WATERFLOW_BASE_IMAGE: ${{ vars.WATERFLOW_BASE_IMAGE || 'docker.io/diffuseproject/waterflow:latest@sha256:cfa4d600c88adf5223814e2c1861de85bf6047fe279c0df44f44cb4a8e6c65dc' }} + +jobs: + astera: + name: Build Astera ACTL image + runs-on: astera-sh-builder + permissions: + contents: read + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v4 + + - name: Login to Harbor + uses: docker/login-action@v4 + with: + registry: ${{ env.ASTERA_REGISTRY }} + username: ${{ secrets.HARBOR_USERNAME }} + password: ${{ secrets.HARBOR_PASSWORD }} + + - name: Docker metadata + id: meta + uses: docker/metadata-action@v6 + with: + images: ${{ env.ASTERA_REGISTRY }}/${{ env.ASTERA_IMAGE_NAME }} + tags: | + type=raw,value=${{ env.ACTL_VERSION_TAG }} + type=raw,value=main-actl + type=raw,value=actl + type=sha,prefix=sha- + + - name: Build and push Astera image + id: build + uses: docker/build-push-action@v7 + with: + context: . 
+ file: Dockerfile.astera + platforms: linux/amd64 + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + build-args: | + WATERFLOW_BASE_IMAGE=${{ env.WATERFLOW_BASE_IMAGE }} + cache-from: type=registry,ref=${{ env.ASTERA_REGISTRY }}/${{ env.ASTERA_IMAGE_NAME }}:buildcache + cache-to: type=registry,ref=${{ env.ASTERA_REGISTRY }}/${{ env.ASTERA_IMAGE_NAME }}:buildcache,mode=max + provenance: false + + - name: Image digest + run: echo "Astera WaterFlow image pushed with digest ${{ steps.build.outputs.digest }}" diff --git a/Dockerfile.astera b/Dockerfile.astera new file mode 100644 index 0000000..9198d9e --- /dev/null +++ b/Dockerfile.astera @@ -0,0 +1,132 @@ +# syntax=docker/dockerfile:1 +# WaterFlow - Astera ACTL overlay. +# +# The public diffuseproject/waterflow image bakes the scientific stack and the +# gated ESM3 model. This overlay keeps that stack intact and only adds ACTL +# workspace conventions: /home/dev as the persisted home, editor/sync tooling, +# a shell-friendly command, and shared-volume data defaults. +# +# Build locally: +# docker buildx build --platform linux/amd64 \ +# -f Dockerfile.astera \ +# --build-arg WATERFLOW_BASE_IMAGE=docker.io/diffuseproject/waterflow:latest@sha256:cfa4d600c88adf5223814e2c1861de85bf6047fe279c0df44f44cb4a8e6c65dc \ +# -t harbor.astera.sh/library/waterflow:main-actl-2026-06-09 \ +# . 
+ +ARG WATERFLOW_BASE_IMAGE=docker.io/diffuseproject/waterflow:latest@sha256:cfa4d600c88adf5223814e2c1861de85bf6047fe279c0df44f44cb4a8e6c65dc +FROM ${WATERFLOW_BASE_IMAGE} AS astera + +USER root + +ARG ACTL_PACKAGES="bash ca-certificates curl wget rsync tini vim nano emacs-nox git zsh htop tmux ncdu iputils-ping dnsutils" + +ENV DEBIAN_FRONTEND=noninteractive \ + HOME=/home/dev \ + XDG_CONFIG_HOME=/home/dev/.config \ + XDG_CACHE_HOME=/home/dev/.cache \ + XDG_DATA_HOME=/home/dev/.local/share \ + SHELL=/bin/bash \ + VIRTUAL_ENV=/app/.venv \ + PYTHONPATH=/home/dev/workspace:/app \ + PATH="/app/.venv/bin:${PATH}" \ + WATERFLOW_DATA_DIR=/mnt/diffuse-shared/waterflow \ + WATERFLOW_PDB_DIR=/data/pdb \ + WATERFLOW_CACHE_DIR=/data/cache \ + WATERFLOW_CHECKPOINT_DIR=/data/checkpoints \ + WATERFLOW_OUTPUT_DIR=/data/outputs \ + WATERFLOW_LOG_DIR=/data/logs \ + WATERFLOW_SPLITS_DIR=/data/splits + +RUN apt-get update && apt-get install -y --no-install-recommends \ + ${ACTL_PACKAGES} \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean \ + && mkdir -p \ + /home/dev/.config \ + /home/dev/.cache \ + /home/dev/.local/share \ + /home/dev/workspace \ + /mnt/diffuse-shared/waterflow/pdb \ + /mnt/diffuse-shared/waterflow/cache \ + /mnt/diffuse-shared/waterflow/checkpoints \ + /mnt/diffuse-shared/waterflow/outputs \ + /mnt/diffuse-shared/waterflow/logs \ + /mnt/diffuse-shared/waterflow/splits \ + /etc/zsh \ + && rm -rf /data \ + && ln -s /mnt/diffuse-shared/waterflow /data \ + && cat > /usr/local/bin/waterflow <<'EOF' +#!/usr/bin/env bash +if [ -d /mnt/diffuse-shared ]; then + mkdir -p \ + /mnt/diffuse-shared/waterflow/pdb \ + /mnt/diffuse-shared/waterflow/cache \ + /mnt/diffuse-shared/waterflow/checkpoints \ + /mnt/diffuse-shared/waterflow/outputs \ + /mnt/diffuse-shared/waterflow/logs \ + /mnt/diffuse-shared/waterflow/splits 2>/dev/null || true +fi +entrypoint=/app/entrypoint.sh +if [ -x /home/dev/workspace/docker/entrypoint.sh ] && [ -d /home/dev/workspace/scripts ]; then + export 
WATERFLOW_APP_DIR=/home/dev/workspace + entrypoint=/home/dev/workspace/docker/entrypoint.sh +fi +exec "${entrypoint}" "$@" +EOF +RUN chmod 0755 /usr/local/bin/waterflow \ + && cat > /usr/local/share/actl-waterflow-shell-init.sh <<'EOF' +# Keep the baked WaterFlow virtualenv active while letting ACTL's synced checkout +# at /home/dev/workspace override the baked /app source for edits. +if [ -d /app/.venv/bin ]; then + VIRTUAL_ENV=/app/.venv + export VIRTUAL_ENV + case ":${PATH}:" in + *:/app/.venv/bin:*) ;; + *) PATH="/app/.venv/bin:${PATH}" ;; + esac + export PATH +fi + +if [ -d /home/dev/workspace ]; then + case ":${PYTHONPATH:-}:" in + *:/home/dev/workspace:*) ;; + *) PYTHONPATH="/home/dev/workspace${PYTHONPATH:+:${PYTHONPATH}}" ;; + esac +fi +if [ -d /app ]; then + case ":${PYTHONPATH:-}:" in + *:/app:*) ;; + *) PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}/app" ;; + esac +fi +export PYTHONPATH + +if [ -d /mnt/diffuse-shared ]; then + mkdir -p \ + /mnt/diffuse-shared/waterflow/pdb \ + /mnt/diffuse-shared/waterflow/cache \ + /mnt/diffuse-shared/waterflow/checkpoints \ + /mnt/diffuse-shared/waterflow/outputs \ + /mnt/diffuse-shared/waterflow/logs \ + /mnt/diffuse-shared/waterflow/splits 2>/dev/null || true +fi + +case "$-" in + *i*) + if [ -d /home/dev/workspace ]; then + cd /home/dev/workspace || true + fi + ;; +esac +EOF +RUN chmod 0644 /usr/local/share/actl-waterflow-shell-init.sh \ + && printf '\n# Astera WaterFlow environment\n[ -r /usr/local/share/actl-waterflow-shell-init.sh ] && . /usr/local/share/actl-waterflow-shell-init.sh\n' >> /etc/bash.bashrc \ + && printf '\n# Astera WaterFlow environment\n[ -r /usr/local/share/actl-waterflow-shell-init.sh ] && . 
/usr/local/share/actl-waterflow-shell-init.sh\n' >> /etc/zsh/zshrc \ + && for cmd in \ + waterflow python curl wget rsync tini vim nano emacs git zsh htop tmux ncdu ping dig; do \ + command -v "${cmd}" >/dev/null || { echo "Astera image is missing required command: ${cmd}" >&2; exit 1; }; \ + done + +WORKDIR /home/dev +ENTRYPOINT [] +CMD ["bash"] diff --git a/README.md b/README.md index 3ee5e48..ffed2ee 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,63 @@ Predicting water molecule placements on protein surfaces using flow matching conditioned on learned protein structure embeddings. +## Running on ACTL + +The Astera ACTL overlay image is published as: + +```bash +harbor.astera.sh/library/waterflow:main-actl-2026-06-09 +``` + +Once the ACTL catalog alias is available, launch from this checkout with: + +```bash +actl pod profiles -n diffuse +actl pod up waterflow --profile single --image waterflow --pvc-size 100Gi -n diffuse --yes +``` + +Before the alias lands, use the full image reference: + +```bash +actl pod up waterflow --profile single \ + --image harbor.astera.sh/library/waterflow:main-actl-2026-06-09 \ + --pvc-size 100Gi -n diffuse --yes +``` + +The selected diffuse profile auto-mounts the shared volume at `/mnt/diffuse-shared`. The ACTL image exposes `/mnt/diffuse-shared/waterflow` as `/data`, so the container defaults are persistent: + +```text +/data/pdb # input PDB tree +/data/cache # preprocessed geometry/ESM/SLAE caches +/data/checkpoints # training checkpoints +/data/outputs # inference outputs +/data/logs # W&B/offline logs +/data/splits # train/val/test split files +``` + +Inside the ACTL shell: + +```bash +waterflow train \ + --encoder_type gvp \ + --train_list /data/splits/train_list_0.95.txt \ + --val_list /data/splits/valid_list_0.05.txt \ + --batch_size 4 +``` + +`waterflow` uses the synced checkout under `/home/dev/workspace` when available, +so edits to `src/` and `scripts/` are picked up without rebuilding the image. 
+ +To build the ACTL overlay locally: + +```bash +docker buildx build --platform linux/amd64 \ + -f Dockerfile.astera \ + --build-arg WATERFLOW_BASE_IMAGE=docker.io/diffuseproject/waterflow:latest@sha256:cfa4d600c88adf5223814e2c1861de85bf6047fe279c0df44f44cb4a8e6c65dc \ + -t harbor.astera.sh/library/waterflow:main-actl-2026-06-09 \ + . +``` + ## Project Structure ``` diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index ecd8950..d190c8f 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -6,6 +6,11 @@ set -e +# ACTL overlays sync the editable checkout to /home/dev/workspace while the +# production image keeps the baked source in /app. Let the wrapper select the +# source tree without changing normal Docker behavior. +APP_DIR="${WATERFLOW_APP_DIR:-/app}" + # Show help if no arguments show_help() { cat << EOF @@ -59,7 +64,7 @@ COMMAND="${1:-}" case "$COMMAND" in train) shift - exec python /app/scripts/train.py \ + exec python "${APP_DIR}/scripts/train.py" \ --base_pdb_dir "${WATERFLOW_PDB_DIR}" \ --processed_dir "${WATERFLOW_CACHE_DIR}" \ --save_dir "${WATERFLOW_CHECKPOINT_DIR}" \ @@ -69,7 +74,7 @@ case "$COMMAND" in inference) shift - exec python /app/scripts/inference.py \ + exec python "${APP_DIR}/scripts/inference.py" \ --base_pdb_dir "${WATERFLOW_PDB_DIR}" \ --processed_dir "${WATERFLOW_CACHE_DIR}" \ --output_dir "${WATERFLOW_OUTPUT_DIR}" \ @@ -78,7 +83,7 @@ case "$COMMAND" in generate-esm) shift - exec python /app/scripts/generate_esm_embeddings.py \ + exec python "${APP_DIR}/scripts/generate_esm_embeddings.py" \ --base_pdb_dir "${WATERFLOW_PDB_DIR}" \ --cache_dir "${WATERFLOW_CACHE_DIR}" \ "$@"