From aa05104287d8ff7dd8f2c256649956d2f38bb757 Mon Sep 17 00:00:00 2001 From: Sanjay Chari Date: Tue, 30 Jun 2026 11:54:14 -0400 Subject: [PATCH 1/3] Restore dfdally CI test config --- .../dfdally-72-par.conf.in | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/tests/conf/union-milc-jacobi-workload/dfdally-72-par.conf.in b/tests/conf/union-milc-jacobi-workload/dfdally-72-par.conf.in index 9e2648ff..ade97ef2 100644 --- a/tests/conf/union-milc-jacobi-workload/dfdally-72-par.conf.in +++ b/tests/conf/union-milc-jacobi-workload/dfdally-72-par.conf.in @@ -5,33 +5,11 @@ LPGROUPS repetitions="36"; # name of this lp changes according to the model nw-lp="2"; - dir-nw-lp="${DIRECTOR_LP_PER_REP}"; # these lp names will be the same for dragonfly-custom model modelnet_dragonfly_dally="2"; modelnet_dragonfly_dally_router="1"; } } -DIRECTOR -{ - start_iter="${DIRECTOR_START_ITER}"; - end_iter="${DIRECTOR_END_ITER}"; - - retrain_enabled="${DIRECTOR_RETRAIN_ENABLED}"; - retrain_iter="${DIRECTOR_RETRAIN_ITER}"; - retrain_save_path="${DIRECTOR_RETRAIN_SAVE_PATH}"; - - second_surrogate_enabled="${DIRECTOR_SECOND_SURROGATE_ENABLED}"; - second_start_iter="${DIRECTOR_SECOND_START_ITER}"; - second_end_iter="${DIRECTOR_SECOND_END_ITER}"; - - inferencing_enabled="${INFERENCING_ENABLED}"; - surrogate_enabled="${SURROGATE_ENABLED}"; - training_enabled="${TRAINING_ENABLED}"; - - debug_prints="${DIRECTOR_DEBUG_PRINTS}"; - shutdown_zmqml_server_on_finalize="${SHUTDOWN_ZMQML_SERVER_ON_FINALIZE}"; -} - PARAMS { # packet size in the network From 49e307baec772cfaa2dc7fb0d6ed04fccf7d242d Mon Sep 17 00:00:00 2001 From: Sanjay Chari Date: Tue, 30 Jun 2026 15:21:35 -0400 Subject: [PATCH 2/3] Fix union test scripts --- tests/union-workload-test-surrogate-parallel.sh | 12 +++++++++--- ...ion-workload-test-surrogate-smaller-chunk-size.sh | 12 +++++++++--- tests/union-workload-test-surrogate.sh | 12 +++++++++--- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git 
a/tests/union-workload-test-surrogate-parallel.sh b/tests/union-workload-test-surrogate-parallel.sh index f84bccad..57ca0f03 100644 --- a/tests/union-workload-test-surrogate-parallel.sh +++ b/tests/union-workload-test-surrogate-parallel.sh @@ -77,10 +77,16 @@ grep 'Net Events Processed' model-output.txt err=$? [[ $err -ne 0 ]] && exit $err -# Checking both milc and jacobi ran -grep 'MILC: Iteration 119/120' model-output.txt +# Checking both Jacobi and MILC ran. App 0 is Jacobi and App 1 is MILC in +# jacobi_MILC.workload.conf. In surrogate mode, MILC may not print the final +# high-fidelity iteration line, so check that at least one MILC rank finished. +grep -E 'Network node [0-9]+ Rank [0-9]+ App 1 finished' model-output.txt err=$? -[[ $err -ne 0 ]] && exit $err +if [[ $err -ne 0 ]]; then + echo "MILC/App 1 completion output not found" + grep -nE 'MILC|App 1|All non-synthetic|application iteration' model-output.txt | tail -80 + exit $err +fi grep 'Jacobi3D: Completed 39 iterations' model-output.txt err=$? diff --git a/tests/union-workload-test-surrogate-smaller-chunk-size.sh b/tests/union-workload-test-surrogate-smaller-chunk-size.sh index d7266ccc..6b7b0e58 100644 --- a/tests/union-workload-test-surrogate-smaller-chunk-size.sh +++ b/tests/union-workload-test-surrogate-smaller-chunk-size.sh @@ -77,10 +77,16 @@ grep 'Net Events Processed' model-output.txt err=$? [[ $err -ne 0 ]] && exit $err -# Checking both milc and jacobi ran -grep 'MILC: Iteration 119/120' model-output.txt +# Checking both Jacobi and MILC ran. App 0 is Jacobi and App 1 is MILC in +# jacobi_MILC.workload.conf. In surrogate mode, MILC may not print the final +# high-fidelity iteration line, so check that at least one MILC rank finished. +grep -E 'Network node [0-9]+ Rank [0-9]+ App 1 finished' model-output.txt err=$? 
-[[ $err -ne 0 ]] && exit $err +if [[ $err -ne 0 ]]; then + echo "MILC/App 1 completion output not found" + grep -nE 'MILC|App 1|All non-synthetic|application iteration' model-output.txt | tail -80 + exit $err +fi grep 'Jacobi3D: Completed 39 iterations' model-output.txt err=$? diff --git a/tests/union-workload-test-surrogate.sh b/tests/union-workload-test-surrogate.sh index 0a34b2c8..00927c18 100644 --- a/tests/union-workload-test-surrogate.sh +++ b/tests/union-workload-test-surrogate.sh @@ -77,10 +77,16 @@ grep 'Net Events Processed' model-output.txt err=$? [[ $err -ne 0 ]] && exit $err -# Checking both milc and jacobi ran -grep 'MILC: Iteration 119/120' model-output.txt +# Checking both Jacobi and MILC ran. App 0 is Jacobi and App 1 is MILC in +# jacobi_MILC.workload.conf. In surrogate mode, MILC may not print the final +# high-fidelity iteration line, so check that at least one MILC rank finished. +grep -E 'Network node [0-9]+ Rank [0-9]+ App 1 finished' model-output.txt err=$? -[[ $err -ne 0 ]] && exit $err +if [[ $err -ne 0 ]]; then + echo "MILC/App 1 completion output not found" + grep -nE 'MILC|App 1|All non-synthetic|application iteration' model-output.txt | tail -80 + exit $err +fi grep 'Jacobi3D: Completed 39 iterations' model-output.txt err=$? 
From 5c8ce938bfa7f56f01665ab8b8c157bbc0854562 Mon Sep 17 00:00:00 2001 From: Sanjay Chari Date: Wed, 1 Jul 2026 09:23:07 -0400 Subject: [PATCH 3/3] Add dfdally director test config file --- .../dfdally-director-72-par.conf.in | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 tests/conf/union-milc-jacobi-workload/dfdally-director-72-par.conf.in diff --git a/tests/conf/union-milc-jacobi-workload/dfdally-director-72-par.conf.in b/tests/conf/union-milc-jacobi-workload/dfdally-director-72-par.conf.in new file mode 100644 index 00000000..ade97ef2 --- /dev/null +++ b/tests/conf/union-milc-jacobi-workload/dfdally-director-72-par.conf.in @@ -0,0 +1,98 @@ +LPGROUPS +{ + MODELNET_GRP + { + repetitions="36"; +# name of this lp changes according to the model + nw-lp="2"; +# these lp names will be the same for dragonfly-custom model + modelnet_dragonfly_dally="2"; + modelnet_dragonfly_dally_router="1"; + } +} +PARAMS +{ +# packet size in the network + packet_size="4096"; + modelnet_order=( "dragonfly_dally","dragonfly_dally_router" ); + # scheduler options + modelnet_scheduler="fcfs"; +# chunk size in the network (when chunk size = packet size, packets will not be +# divided into chunks) + chunk_size="${CHUNK_SIZE}"; +# modelnet_scheduler="round-robin"; +# number of routers in group + num_routers="4"; +# number of groups in the network + num_groups="9"; +# buffer size in bytes for local virtual channels + local_vc_size="16384"; +#buffer size in bytes for global virtual channels + global_vc_size="16384"; +#buffer size in bytes for compute node virtual channels + cn_vc_size="32768"; +#bandwidth in GiB/s for local channels + local_bandwidth="5.25"; +# bandwidth in GiB/s for global channels + global_bandwidth="4.7"; +# bandwidth in GiB/s for compute node-router channels + cn_bandwidth="5.25"; +# ROSS message size + message_size="840"; +# number of compute nodes connected to router, dictated by dragonfly config +# file + num_cns_per_router="2"; +# number of 
global channels per router + num_global_channels="2"; +# network config file for intra-group connections + intra-group-connections="@CMAKE_SOURCE_DIR@/src/network-workloads/conf/dragonfly-dally/dfdally-72-intra"; +# network config file for inter-group connections + inter-group-connections="@CMAKE_SOURCE_DIR@/src/network-workloads/conf/dragonfly-dally/dfdally-72-inter"; +# routing protocol to be used + routing="prog-adaptive"; + minimal-bias="1"; + df-dally-vc = "1"; +# counting msgs recv to/send from routers + counting_bool="0"; + counting_start="0"; + counting_windows="1800"; + #interval in us + counting_interval="300"; + num_apps="2"; + #offset for app_id: model-net-mpi-replay is 88, synthetic-dfly-plus is 24 + offset="144"; +} +NETWORK_SURROGATE { + enable="${NETWORK_SURR_ON}"; # either "0" or "1" +# determines the director switching from surrogate to high-def simulation strategy + director_mode="delegate-to-app-director"; + #director_mode="at-fixed-virtual-times"; + +# director configuration for: director_mode == "at-fixed-virtual-times" +# timestamps at which to switch to surrogate-mode and back + fixed_switch_timestamps=( "25.0e6", "400.0e6" ); + +# latency predictor to use. Options: average, torch-jit + packet_latency_predictor="average"; +# some workload models need some time to stabilize, a point where the network behaviour stabilizes. The predictor will ignore all packet latencies that arrive during this period + ignore_until="2.0e6"; + +# parameters for torch-jit latency predictor + torch_jit_mode="single-static-model-for-all-terminals"; + torch_jit_model_path=""; + +# selecting network treatment on switching to surrogate. 
Options: freeze, nothing + network_treatment_on_switch="${NETWORK_MODE}"; +} +APPLICATION_SURROGATE { + enable="${APP_SURR_ON}"; # either 0 or 1 + + # Configuring director + director_mode="${APP_DIRECTOR_MODE}"; # Opts: "every-n-gvt", "every-n-nanoseconds" + director_num_gvt="${EVERY_N_GVT}"; + director_num_ns="${EVERY_NSECS}"; # 1e6 means 1ms + + # Configuring predictor + # Minimum number of iterations to collect data from before skipping ahead in the simulation + num_iters_to_collect="2"; +}