Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
195 changes: 195 additions & 0 deletions .github/workflows/zmqml-hybrid.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
name: ZMQML Hybrid

on:
pull_request:
branches:
- master
push:
branches:
- master

env:
ROSS_REF: 9b6ccb18f9b9db438bf41b5b221d0ef16a4dac48
ZMQML_IMAGE: ghcr.io/codes-org/codes-ci-full:latest

jobs:
zmqml-hybrid:
name: zmqml hybrid workflows
runs-on: ubuntu-24.04

steps:
- name: Checkout CODES
uses: actions/checkout@v4
with:
path: codes

- name: Checkout ROSS
uses: actions/checkout@v4
with:
repository: ROSS-org/ROSS
ref: ${{ env.ROSS_REF }}
path: ross

- name: Pull full dependency image
run: docker pull "$ZMQML_IMAGE"

- name: Create Docker network
run: docker network create codes-zmqml-ci

- name: Start ZMQML server container
run: |
mkdir -p "$PWD/zmqml-artifacts"

docker run -d \
--name zmqml-server \
--network codes-zmqml-ci \
-v "$PWD/codes:/work/codes" \
-v "$PWD/zmqml-artifacts:/work/zmqml-artifacts" \
-w /work/codes/src/surrogate/zmqml \
-e ZMQML_ITERATION_HISTORY_LEN=2 \
-e ZMQML_ITERATION_HORIZON=3 \
-e ZMQML_ITERATION_TRAIN_STRIDE=1 \
-e ZMQML_EVENT_TIME_MIN_ROWS=4 \
-e ZMQML_EVENT_TIME_EPOCHS=2 \
-e ZMQML_RECORD_LOG_PATH=/work/zmqml-artifacts/iteration-records.csv \
-e ZMQML_EVENT_TIME_RECORD_LOG_PATH=/work/zmqml-artifacts/event-time-records.csv \
"$ZMQML_IMAGE" \
bash -euxo pipefail -c '
apt-get update
apt-get install -y python3-zmq python3-numpy python3-sklearn python3-pandas python3-pip gettext-base
Comment on lines +57 to +59

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With #255 updating the CI image, this is no longer needed; the same goes for the other places this is done. In the future we should update the Docker image with dependencies like this instead of grabbing them in every job. Just note that it has to be done in a separate PR, because the job that creates the Docker image runs on push to master only when the Dockerfile changes.


python3 -c "import importlib.util, subprocess, sys; sys.exit(0) if importlib.util.find_spec(\"torch\") else subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"--break-system-packages\", \"torch\", \"--index-url\", \"https://download.pytorch.org/whl/cpu\"])"

exec python3 -u zmqmlserver.py
'

sleep 5
docker ps --filter name=zmqml-server
docker logs zmqml-server

- name: Build ROSS and CODES with ZMQML
run: |
docker run --rm \
--name codes-zmqml-build \
--network codes-zmqml-ci \
-v "$PWD/codes:/work/codes" \
-v "$PWD/ross:/work/ross" \
-v "$PWD/ross-install:/work/ross-install" \
-w /work \
"$ZMQML_IMAGE" \
bash -euxo pipefail -c '
apt-get update
apt-get install -y python3-zmq python3-numpy python3-sklearn python3-pandas python3-pip gettext-base
python3 -c "import importlib.util, subprocess, sys; sys.exit(0) if importlib.util.find_spec(\"torch\") else subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"--break-system-packages\", \"torch\", \"--index-url\", \"https://download.pytorch.org/whl/cpu\"])"

cmake -S ross -B ross/build -G Ninja \
-DCMAKE_BUILD_TYPE=Debug \
-DROSS_BUILD_MODELS=ON \
-DCMAKE_INSTALL_PREFIX=/work/ross-install

cmake --build ross/build --target install -j

cd /work/codes
rm -rf build

cmake -S . -B build -G Ninja \
-DCMAKE_BUILD_TYPE=Debug \
-DBUILD_TESTING=ON \
-DCODES_USE_SWM=ON \
-DCODES_USE_TORCH=ON \
-DCODES_USE_ZEROMQ=ON \
-DCODES_ENABLE_ZMQML_HYBRID_TESTS=ON \
-DCMAKE_C_COMPILER=mpicc \
-DCMAKE_CXX_COMPILER=mpicxx \
-DCMAKE_PREFIX_PATH="/work/ross-install;/opt/swm;/opt/argobots" \
-DTorch_DIR="$(python3 -c "import torch; print(torch.utils.cmake_prefix_path)")/Torch"

cmake --build build -j
'

- name: Run ZMQML hybrid tests
run: |
docker run --rm \
--name codes-zmqml-tests \
--network codes-zmqml-ci \
-v "$PWD/codes:/work/codes" \
-v "$PWD/zmqml-artifacts:/work/zmqml-artifacts" \
-w /work/codes \
-e ZMQML_ENDPOINT=tcp://zmqml-server:5555 \
-e ZMQML_TEST_NP=1 \
-e ZMQML_CTL_TIMEOUT=30 \
"$ZMQML_IMAGE" \
bash -euxo pipefail -c '
apt-get update
apt-get install -y python3-zmq python3-numpy python3-sklearn python3-pandas python3-pip gettext-base
python3 -c "import importlib.util, subprocess, sys; sys.exit(0) if importlib.util.find_spec(\"torch\") else subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"--break-system-packages\", \"torch\", \"--index-url\", \"https://download.pytorch.org/whl/cpu\"])"

# List the matching tests without running them (-N) and keep the listing, so
# the grep below can fail the step if neither hybrid workflow test was
# registered. The dot in ".sh" is escaped so it only matches a literal dot.
ctest --test-dir build -N \
-R "zmqml-(iteration-time|event-time)-hybrid-workflow\.sh" \
| tee /tmp/zmqml-ctest-list.txt

# Under "bash -e" a grep miss aborts the script before any test is run.
grep -E "Test #[0-9]+: zmqml-(iteration-time|event-time)-hybrid-workflow\.sh" \
/tmp/zmqml-ctest-list.txt

# Run the same selection for real, with full output for debugging.
ctest --test-dir build \
-R "zmqml-(iteration-time|event-time)-hybrid-workflow\.sh" \
--output-on-failure \
--timeout 1200 \
-VV
'

- name: Validate ZMQML server logs
run: |
mkdir -p "$PWD/zmqml-artifacts"
docker logs zmqml-server 2>&1 | tee "$PWD/zmqml-artifacts/zmqml-server.log"

# Assert that the captured server log contains a line matching a pattern.
# Arguments: $1 - extended regular expression to search for
#            $2 - human-readable description used in the failure message
# Outputs:   matching lines, prefixed with their line numbers, on stdout;
#            on a miss, one "::error::" line on stdout
# Returns:   0 when at least one line matches; on a miss it does not return
#            but exits the shell with status 1
require_log() {
  local pattern="$1"
  local description="$2"

  # "--" ends option parsing, so a pattern that starts with "-" is still
  # treated as a pattern instead of being read as grep options.
  if ! grep -nE -- "$pattern" "$PWD/zmqml-artifacts/zmqml-server.log"; then
    echo "::error::Missing server-side ZMQML evidence: $description"
    exit 1
  fi
}

require_log '\[zmqmlserver\] director_debug_prints=1' \
'simulation configured the server debug flag'

require_log '\[iteration-time records\]' \
'iteration-time records reached the server'

require_log '\[iteration-time inference\].*predictions=' \
'iteration-time inference reached the server and returned predictions'

require_log '\[event-time records\]' \
'event-time records reached the server'

require_log '\[event-time inference\].*predictions=' \
'event-time inference reached the server and returned predictions'

test -s "$PWD/zmqml-artifacts/iteration-records.csv"
test -s "$PWD/zmqml-artifacts/event-time-records.csv"

- name: Dump ZMQML server logs
if: always()
run: docker logs zmqml-server || true

- name: Stop ZMQML server
if: always()
run: |
docker rm -f zmqml-server || true
docker network rm codes-zmqml-ci || true

- name: Upload ZMQML logs
if: always()
uses: actions/upload-artifact@v4
with:
name: zmqml-hybrid-logs
path: |
codes/build/Testing/Temporary/LastTest.log
codes/build/Testing/Temporary/LastTestsFailed.log
codes/build/testing-output/**
zmqml-artifacts/**
if-no-files-found: ignore
retention-days: 14
53 changes: 53 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,59 @@ set(ARGOBOTS_PKG_CONFIG_PATH "" CACHE PATH "DEPRECATED: use CMAKE_PREFIX_PATH. W
# dirs, the MPI dependency, and link libraries (linked in src/CMakeLists.txt).
find_package(ROSS CONFIG REQUIRED)

# Compatibility for older ROSS CMake package configs.
# Some ROSS installs provide ROSSConfig.cmake but do not define the modern
# imported target ROSS::ROSS. This CODES tree links against ROSS::ROSS, so
# synthesize that target from the installed ROSS prefix when needed.
if(NOT TARGET ROSS::ROSS)
message(WARNING "ROSS package did not define ROSS::ROSS; creating compatibility imported target.")
Comment on lines +84 to +85

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a CI job (build-only is fine; I don't think it's necessary to run the test suite) that builds CODES against an old ROSS. That way we can be sure this continues to work.

Also change the warning message to a deprecation message (see elsewhere in this file where deprecation is used), because I don't think we should keep this around forever.


if(DEFINED ROSS_DIR)
get_filename_component(_ROSS_CONFIG_DIR "${ROSS_DIR}" ABSOLUTE)
else()
set(_ROSS_CONFIG_DIR "")
endif()

# In this install layout, ROSSConfig.cmake is under:
# <prefix>/lib/ROSSConfig.cmake
# so the prefix is one directory above ROSS_DIR.
get_filename_component(_ROSS_PREFIX "${_ROSS_CONFIG_DIR}/.." ABSOLUTE)

find_library(_ROSS_COMPAT_LIBRARY
NAMES ROSS ross
PATHS
"${_ROSS_PREFIX}/lib"
"${_ROSS_CONFIG_DIR}"
NO_DEFAULT_PATH
)

# Directories searched for ross.h, in order: the include dir of the prefix
# derived from ROSS_DIR, a "ross" checkout next to this source tree, and a
# "ross" checkout under $HOME. Kept in one list so the search and the error
# message below cannot drift apart.
set(_ROSS_COMPAT_INCLUDE_SEARCH_DIRS
  "${_ROSS_PREFIX}/include"
  "${CMAKE_CURRENT_SOURCE_DIR}/../ross/core"
  "$ENV{HOME}/ross/core"
)

find_path(_ROSS_COMPAT_INCLUDE_DIR
  NAMES ross.h
  PATHS ${_ROSS_COMPAT_INCLUDE_SEARCH_DIRS}
  NO_DEFAULT_PATH
)

# Both pieces are required to synthesize the imported target; stop configuring
# with a message that names every location that was actually searched.
if(NOT _ROSS_COMPAT_LIBRARY)
  message(FATAL_ERROR "Could not locate ROSS library for compatibility target. Checked ${_ROSS_PREFIX}/lib and ${_ROSS_CONFIG_DIR}.")
endif()

if(NOT _ROSS_COMPAT_INCLUDE_DIR)
  string(REPLACE ";" ", " _ROSS_COMPAT_INCLUDE_SEARCH_TEXT "${_ROSS_COMPAT_INCLUDE_SEARCH_DIRS}")
  message(FATAL_ERROR "Could not locate ross.h for compatibility target. Checked ${_ROSS_COMPAT_INCLUDE_SEARCH_TEXT}.")
endif()

add_library(ROSS::ROSS UNKNOWN IMPORTED)
set_target_properties(ROSS::ROSS PROPERTIES
IMPORTED_LOCATION "${_ROSS_COMPAT_LIBRARY}"
INTERFACE_INCLUDE_DIRECTORIES "${_ROSS_COMPAT_INCLUDE_DIR}"
)

message(STATUS "Using compatibility ROSS::ROSS library: ${_ROSS_COMPAT_LIBRARY}")
message(STATUS "Using compatibility ROSS::ROSS include dir: ${_ROSS_COMPAT_INCLUDE_DIR}")
endif()

# PkgConfig discovers the optional SWM/UNION/ARGOBOTS deps below (as imported
# targets). The recommended way to point at a non-standard install is
# CMAKE_PREFIX_PATH (pkg_check_modules searches <prefix>/lib/pkgconfig etc. under
Expand Down
13 changes: 12 additions & 1 deletion CODES-compile-instructions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -333,12 +333,23 @@ INNERPY
fi
fi

cmake_prefix_path="$(realpath "$CUR_DIR/ross/build/bin")"
# Locate the CMake package config file produced by the ROSS build; either
# spelling of the file name is accepted. "-print -quit" makes find stop at the
# first match itself, so there is no pipe into head: the rest of the tree is
# not walked and find cannot be killed by SIGPIPE (which would fail the
# assignment if the script runs with pipefail).
ross_config="$(find "$CUR_DIR/ross/build" \( -name ROSSConfig.cmake -o -name ross-config.cmake \) -print -quit)"
if [ -z "$ross_config" ]; then
  echo "ERROR: Could not find built ROSSConfig.cmake under $CUR_DIR/ross/build." >&2
  echo "       Try rebuilding ROSS or check whether make install completed." >&2
  exit 1
fi

# The directory holding the config file is what CMake expects in ROSS_DIR.
ross_dir="$(dirname "$ross_config")"
echo "Using ROSS_DIR=${ross_dir}"

cmake_prefix_path="${ross_dir}"
if [ "$torch_enable" = 1 ]; then
cmake_prefix_path="${cmake_prefix_path};${torch_cmake_prefix}"
fi

make_args_codes=(
-DROSS_DIR="${ross_dir}"
-DCMAKE_PREFIX_PATH="${cmake_prefix_path}"
-DCMAKE_C_FLAGS="-g -Wall"
-DCMAKE_CXX_FLAGS="-g -Wall"
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ This repo uses [clang-format](https://clang.llvm.org/docs/ClangFormat.html) to k
- **Emacs:** see [clang-format.el](https://clang.llvm.org/docs/ClangFormat.html#emacs-integration).

To reformat a file manually: `clang-format -i path/to/file.c`. CI runs `clang-format --dry-run --Werror` on every PR and rejects any drift, so PRs with unformatted code don't merge.
Note: CI runs clang-format major version 20, so format your files with that same major version.

### Determinism

Expand Down
4 changes: 1 addition & 3 deletions codes/surrogate/director-client.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,7 @@ extern "C" {


extern void director_lp_register_model(const char*);


extern void director_record_external_zmq_latency(double processing_sec, double total_sec);
/*
extern void director_parse_args(char *args, int **args_array, int *length);
static void director_issue_codes_event(director_state * s, tw_lpid nw_lpid, int dir_registered_event_type, tw_stime ts, tw_lp* lp);
Expand All @@ -142,5 +141,4 @@ extern void dir_test_finalize(director_state* s, tw_lp* lp);
#ifdef __cplusplus
}
#endif

#endif
16 changes: 8 additions & 8 deletions doc/example/kb.dfdally-72-zeromq-director.conf.in
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,19 @@ LPGROUPS

DIRECTOR
{
start_iter="${DIRECTOR_START_ITER}";
end_iter="${DIRECTOR_END_ITER}";
start_iter="${START_ITER}";
end_iter="${END_ITER}";

# Optional one-shot pause/retrain/resume pipeline.
# First implementation is intended for --synch=1.
retrain_enabled="${DIRECTOR_RETRAIN_ENABLED}";
retrain_iter="${DIRECTOR_RETRAIN_ITER}";
retrain_save_path="${DIRECTOR_RETRAIN_SAVE_PATH}";
retrain_enabled="${RETRAIN_ENABLED}";
retrain_iter="${RETRAIN_ITER}";
retrain_save_path="${RETRAIN_SAVE_PATH}";

# Optional second surrogate window after retraining.
second_surrogate_enabled="${DIRECTOR_SECOND_SURROGATE_ENABLED}";
second_start_iter="${DIRECTOR_SECOND_START_ITER}";
second_end_iter="${DIRECTOR_SECOND_END_ITER}";
second_surrogate_enabled="${SECOND_SURROGATE_ENABLED}";
second_start_iter="${SECOND_START_ITER}";
second_end_iter="${SECOND_END_ITER}";

# Common modes:
#
Expand Down
Loading
Loading