Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 0 additions & 22 deletions tests/conf/union-milc-jacobi-workload/dfdally-72-par.conf.in
Original file line number Diff line number Diff line change
Expand Up @@ -5,33 +5,11 @@ LPGROUPS
repetitions="36";
# name of this lp changes according to the model
nw-lp="2";
dir-nw-lp="${DIRECTOR_LP_PER_REP}";
# these lp names will be the same for dragonfly-custom model
modelnet_dragonfly_dally="2";
modelnet_dragonfly_dally_router="1";
}
}
DIRECTOR
{
Comment thread
sanjaychari marked this conversation as resolved.
start_iter="${DIRECTOR_START_ITER}";
end_iter="${DIRECTOR_END_ITER}";

retrain_enabled="${DIRECTOR_RETRAIN_ENABLED}";
retrain_iter="${DIRECTOR_RETRAIN_ITER}";
retrain_save_path="${DIRECTOR_RETRAIN_SAVE_PATH}";

second_surrogate_enabled="${DIRECTOR_SECOND_SURROGATE_ENABLED}";
second_start_iter="${DIRECTOR_SECOND_START_ITER}";
second_end_iter="${DIRECTOR_SECOND_END_ITER}";

inferencing_enabled="${INFERENCING_ENABLED}";
surrogate_enabled="${SURROGATE_ENABLED}";
training_enabled="${TRAINING_ENABLED}";

debug_prints="${DIRECTOR_DEBUG_PRINTS}";
shutdown_zmqml_server_on_finalize="${SHUTDOWN_ZMQML_SERVER_ON_FINALIZE}";
}

PARAMS
{
# packet size in the network
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
LPGROUPS
{
MODELNET_GRP
{
repetitions="36";
# name of this lp changes according to the model
nw-lp="2";
# these lp names will be the same for dragonfly-custom model
modelnet_dragonfly_dally="2";
modelnet_dragonfly_dally_router="1";
}
}
PARAMS
{
# packet size in the network
packet_size="4096";
modelnet_order=( "dragonfly_dally","dragonfly_dally_router" );
# scheduler options
modelnet_scheduler="fcfs";
# chunk size in the network (when chunk size = packet size, packets will not be
# divided into chunks)
chunk_size="${CHUNK_SIZE}";
# modelnet_scheduler="round-robin";
# number of routers in group
num_routers="4";
# number of groups in the network
num_groups="9";
# buffer size in bytes for local virtual channels
local_vc_size="16384";
#buffer size in bytes for global virtual channels
global_vc_size="16384";
#buffer size in bytes for compute node virtual channels
cn_vc_size="32768";
#bandwidth in GiB/s for local channels
local_bandwidth="5.25";
# bandwidth in GiB/s for global channels
global_bandwidth="4.7";
# bandwidth in GiB/s for compute node-router channels
cn_bandwidth="5.25";
# ROSS message size
message_size="840";
# number of compute nodes connected to router, dictated by dragonfly config
# file
num_cns_per_router="2";
# number of global channels per router
num_global_channels="2";
# network config file for intra-group connections
intra-group-connections="@CMAKE_SOURCE_DIR@/src/network-workloads/conf/dragonfly-dally/dfdally-72-intra";
# network config file for inter-group connections
inter-group-connections="@CMAKE_SOURCE_DIR@/src/network-workloads/conf/dragonfly-dally/dfdally-72-inter";
# routing protocol to be used
routing="prog-adaptive";
minimal-bias="1";
df-dally-vc = "1";
# counting msgs recv to/send from routers
counting_bool="0";
counting_start="0";
counting_windows="1800";
#interval in us
counting_interval="300";
num_apps="2";
#offset for app_id: model-net-mpi-replay is 88, synthetic-dfly-plus is 24
offset="144";
}
NETWORK_SURROGATE {
enable="${NETWORK_SURR_ON}"; # either "0" or "1"
# determines the strategy the director uses to switch between high-definition and surrogate simulation
director_mode="delegate-to-app-director";
#director_mode="at-fixed-virtual-times";

# director configuration for: director_mode == "at-fixed-virtual-times"
# timestamps at which to switch to surrogate-mode and back
fixed_switch_timestamps=( "25.0e6", "400.0e6" );

# latency predictor to use. Options: average, torch-jit
packet_latency_predictor="average";
# some workload models need some time before the network behaviour stabilizes. The predictor will ignore all packet latencies that arrive during this warm-up period
ignore_until="2.0e6";

# parameters for torch-jit latency predictor
torch_jit_mode="single-static-model-for-all-terminals";
torch_jit_model_path="";

# selecting network treatment on switching to surrogate. Options: freeze, nothing
network_treatment_on_switch="${NETWORK_MODE}";
}
APPLICATION_SURROGATE {
enable="${APP_SURR_ON}"; # either "0" or "1"

# Configuring director
director_mode="${APP_DIRECTOR_MODE}"; # Opts: "every-n-gvt", "every-n-nanoseconds"
director_num_gvt="${EVERY_N_GVT}";
director_num_ns="${EVERY_NSECS}"; # in nanoseconds: 1e6 means 1 ms

# Configuring predictor
# Minimum number of iterations to collect data from before skipping ahead in the simulation
num_iters_to_collect="2";
}
12 changes: 9 additions & 3 deletions tests/union-workload-test-surrogate-parallel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,16 @@ grep 'Net Events Processed' model-output.txt
err=$?
[[ $err -ne 0 ]] && exit $err

# Checking both milc and jacobi ran
grep 'MILC: Iteration 119/120' model-output.txt
# Checking both Jacobi and MILC ran. App 0 is Jacobi and App 1 is MILC in
# jacobi_MILC.workload.conf. In surrogate mode, MILC may not print the final
# high-fidelity iteration line, so check that at least one MILC rank finished.
grep -E 'Network node [0-9]+ Rank [0-9]+ App 1 finished' model-output.txt
err=$?
[[ $err -ne 0 ]] && exit $err
if [[ $err -ne 0 ]]; then
echo "MILC/App 1 completion output not found"
grep -nE 'MILC|App 1|All non-synthetic|application iteration' model-output.txt | tail -80
exit $err
fi

grep 'Jacobi3D: Completed 39 iterations' model-output.txt
err=$?
Expand Down
12 changes: 9 additions & 3 deletions tests/union-workload-test-surrogate-smaller-chunk-size.sh
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,16 @@ grep 'Net Events Processed' model-output.txt
err=$?
[[ $err -ne 0 ]] && exit $err

# Checking both milc and jacobi ran
grep 'MILC: Iteration 119/120' model-output.txt
# Checking both Jacobi and MILC ran. App 0 is Jacobi and App 1 is MILC in
# jacobi_MILC.workload.conf. In surrogate mode, MILC may not print the final
# high-fidelity iteration line, so check that at least one MILC rank finished.
grep -E 'Network node [0-9]+ Rank [0-9]+ App 1 finished' model-output.txt
err=$?
[[ $err -ne 0 ]] && exit $err
if [[ $err -ne 0 ]]; then
echo "MILC/App 1 completion output not found"
grep -nE 'MILC|App 1|All non-synthetic|application iteration' model-output.txt | tail -80
exit $err
fi

grep 'Jacobi3D: Completed 39 iterations' model-output.txt
err=$?
Expand Down
12 changes: 9 additions & 3 deletions tests/union-workload-test-surrogate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,16 @@ grep 'Net Events Processed' model-output.txt
err=$?
[[ $err -ne 0 ]] && exit $err

# Checking both milc and jacobi ran
grep 'MILC: Iteration 119/120' model-output.txt
# Checking both Jacobi and MILC ran. App 0 is Jacobi and App 1 is MILC in
# jacobi_MILC.workload.conf. In surrogate mode, MILC may not print the final
# high-fidelity iteration line, so check that at least one MILC rank finished.
grep -E 'Network node [0-9]+ Rank [0-9]+ App 1 finished' model-output.txt
err=$?
[[ $err -ne 0 ]] && exit $err
if [[ $err -ne 0 ]]; then
echo "MILC/App 1 completion output not found"
grep -nE 'MILC|App 1|All non-synthetic|application iteration' model-output.txt | tail -80
exit $err
fi

grep 'Jacobi3D: Completed 39 iterations' model-output.txt
err=$?
Expand Down
Loading