diff --git a/.github/workflows/ci-package.yml b/.github/workflows/ci-package.yml index 98125014932..8652bb3f028 100644 --- a/.github/workflows/ci-package.yml +++ b/.github/workflows/ci-package.yml @@ -127,7 +127,8 @@ jobs: CI_BRANCH: ${{ github.ref }} - name: Upload Artifacts - uses: actions/upload-artifact@v2 + # This points to the latest upload-artifact v4.x.x. + uses: actions/upload-artifact@v4 with: name: linux-tarball path: DynamoRIO-Linux-${{ steps.version.outputs.version_number }}.tar.gz @@ -215,7 +216,8 @@ jobs: CI_BRANCH: ${{ github.ref }} - name: Upload AArch64 - uses: actions/upload-artifact@v2 + # This points to the latest upload-artifact v4.x.x. + uses: actions/upload-artifact@v4 with: name: aarch64-tarball path: DynamoRIO-AArch64-Linux-${{ steps.version.outputs.version_number }}.tar.gz @@ -303,7 +305,8 @@ jobs: CI_BRANCH: ${{ github.ref }} - name: Upload ARM - uses: actions/upload-artifact@v2 + # This points to the latest upload-artifact v4.x.x. + uses: actions/upload-artifact@v4 with: name: arm-tarball path: DynamoRIO-ARM-Linux-EABIHF-${{ steps.version.outputs.version_number }}.tar.gz @@ -393,7 +396,8 @@ jobs: CI_BRANCH: ${{ github.ref }} - name: Upload Artifacts - uses: actions/upload-artifact@v2 + # This points to the latest upload-artifact v4.x.x. + uses: actions/upload-artifact@v4 with: name: android-tarball path: DynamoRIO-ARM-Android-EABI-${{ steps.version.outputs.version_number }}.tar.gz @@ -484,7 +488,8 @@ jobs: CI_BRANCH: ${{ github.ref }} - name: Upload Artifacts - uses: actions/upload-artifact@v2 + # This points to the latest upload-artifact v4.x.x. + uses: actions/upload-artifact@v4 with: name: windows-zip path: DynamoRIO-Windows-${{ steps.version.outputs.version_number }}.zip @@ -562,7 +567,8 @@ jobs: prerelease: false - name: Download Linux - uses: actions/download-artifact@v4.1.7 + # This points to the latest download-artifact v4.x.x. 
+ uses: actions/download-artifact@v4 with: name: linux-tarball - name: Upload Linux @@ -577,7 +583,8 @@ jobs: asset_content_type: application/x-gzip - name: Download AArch64 - uses: actions/download-artifact@v4.1.7 + # This points to the latest download-artifact v4.x.x. + uses: actions/download-artifact@v4 with: name: aarch64-tarball - name: Upload AArch64 @@ -592,7 +599,8 @@ jobs: asset_content_type: application/x-gzip - name: Download ARM - uses: actions/download-artifact@v4.1.7 + # This points to the latest download-artifact v4.x.x. + uses: actions/download-artifact@v4 with: name: arm-tarball - name: Upload ARM @@ -607,7 +615,8 @@ jobs: asset_content_type: application/x-gzip - name: Download Android - uses: actions/download-artifact@v4.1.7 + # This points to the latest download-artifact v4.x.x. + uses: actions/download-artifact@v4 with: name: android-tarball - name: Upload Android @@ -622,7 +631,8 @@ jobs: asset_content_type: application/x-gzip - name: Download Windows - uses: actions/download-artifact@v4.1.7 + # This points to the latest download-artifact v4.x.x. + uses: actions/download-artifact@v4 with: name: windows-zip - name: Upload Windows diff --git a/clients/drcachesim/analyzer_multi.cpp b/clients/drcachesim/analyzer_multi.cpp index aa7c6936b16..c567f328f4e 100644 --- a/clients/drcachesim/analyzer_multi.cpp +++ b/clients/drcachesim/analyzer_multi.cpp @@ -558,13 +558,17 @@ analyzer_multi_tmpl_t::init_dynamic_schedule() op_sched_order_time.get_value() ? 
sched_type_t::DEPENDENCY_TIMESTAMPS : sched_type_t::DEPENDENCY_IGNORE, sched_type_t::SCHEDULER_DEFAULTS, op_verbose.get_value()); - sched_ops.quantum_duration = op_sched_quantum.get_value(); - if (op_sched_time.get_value()) + sched_ops.time_units_per_us = op_sched_time_per_us.get_value(); + if (op_sched_time.get_value()) { sched_ops.quantum_unit = sched_type_t::QUANTUM_TIME; + sched_ops.quantum_duration_us = op_sched_quantum.get_value(); + } else { + sched_ops.quantum_duration_instrs = op_sched_quantum.get_value(); + } sched_ops.syscall_switch_threshold = op_sched_syscall_switch_us.get_value(); sched_ops.blocking_switch_threshold = op_sched_blocking_switch_us.get_value(); - sched_ops.block_time_scale = op_sched_block_scale.get_value(); - sched_ops.block_time_max = op_sched_block_max_us.get_value(); + sched_ops.block_time_multiplier = op_sched_block_scale.get_value(); + sched_ops.block_time_max_us = op_sched_block_max_us.get_value(); sched_ops.randomize_next_input = op_sched_randomize.get_value(); sched_ops.honor_direct_switches = !op_sched_disable_direct_switches.get_value(); #ifdef HAS_ZIP diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp index f14c89490ff..f13bdded9b5 100644 --- a/clients/drcachesim/common/options.cpp +++ b/clients/drcachesim/common/options.cpp @@ -897,13 +897,19 @@ droption_t op_core_serial( "How the scheduling is performed is controlled by a set " "of options with the prefix \"sched_\" along with -cores."); +droption_t + op_sched_time_per_us(DROPTION_SCOPE_ALL, "sched_time_per_us", 1000., + "Wall-clock microseconds per simulated microsecond", + "Wall-clock microseconds per simulated microsecond."); + droption_t - // We pick 6 million to match 2 instructions per nanosecond with a 3ms quantum. - op_sched_quantum(DROPTION_SCOPE_ALL, "sched_quantum", 6 * 1000 * 1000, + // We pick 10 million to match 2 instructions per nanosecond with a 5ms quantum. 
+ op_sched_quantum(DROPTION_SCOPE_ALL, "sched_quantum", 10 * 1000 * 1000, "Scheduling quantum", "Applies to -core_sharded and -core_serial. " - "Scheduling quantum: in microseconds of wall-clock " - "time if -sched_time is set; otherwise in instructions." + "Scheduling quantum in instructions, unless -sched_time is set in " + "which case this value is multiplied by -sched_time_per_us to " + "produce a quantum in wall-clock microseconds."); droption_t op_sched_time(DROPTION_SCOPE_ALL, "sched_time", false, @@ -932,14 +938,18 @@ droption_t op_sched_blocking_switch_us( "-core_serial. "); droption_t op_sched_block_scale( - DROPTION_SCOPE_ALL, "sched_block_scale", 10., "Input block time scale factor", - "The scale applied to the microsecond latency of blocking system calls. A higher " - "value here results in blocking syscalls keeping inputs unscheduled for longer. " - "This should roughly equal the slowdown of instruction record processing versus the " - "original (untraced) application execution."); - -// We have a max to avoid outlier latencies that are already a second or more from -// scaling up to tens of minutes. We assume a cap is representative as the outliers + DROPTION_SCOPE_ALL, "sched_block_scale", 0.1, "Input block time scale factor", + "A system call considered to block (see -sched_blocking_switch_us) will " + "block in the trace scheduler for an amount of simulator time equal to its " + "as-traced latency in trace-time microseconds multiplied by this parameter " + "and by -sched_time_per_us in simulated microseconds, subject to a " + "maximum of -sched_block_max_us. A higher value here results in blocking " + "syscalls keeping inputs unscheduled for longer. There is indirect " + "overhead inflating the as-traced times, so a value below 1 is typical."); + +// We have a max to avoid outlier latencies from scaling up to extreme times. There is +// some inflation in the as-traced latencies and some can be inflated more than others. 
+// We assume a cap is representative as the outliers // likely were not part of key dependence chains. Without a cap the other threads all // finish and the simulation waits for tens of minutes further for a couple of outliers. // The cap remains a flag and not a constant as different length traces and different @@ -947,8 +957,8 @@ droption_t op_sched_block_scale( // to achieve desired cpu usage targets. The default value was selected to avoid unduly // long idle times with local analyzers; it may need to be increased with more // heavyweight analyzers/simulators. -droption_t op_sched_block_max_us(DROPTION_SCOPE_ALL, "sched_block_max_us", - 250000, +// TODO i#6959: Once we have -exit_if_all_unscheduled raise this. +droption_t op_sched_block_max_us(DROPTION_SCOPE_ALL, "sched_block_max_us", 2500, "Maximum blocked input time, in microseconds", "The maximum blocked time, after scaling with " "-sched_block_scale."); @@ -995,6 +1005,13 @@ droption_t op_sched_disable_direct_switches( "switch being determined by latency and the next input in the queue. The " "TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH markers are not removed from the trace."); +droption_t op_sched_time_units_per_us( + DROPTION_SCOPE_ALL, "sched_time_units_per_us", 100., + "Time units per simulated microsecond", + "Time units (currently wall-clock time) per simulated microsecond. This scales all " + "of the -sched_*_us values as it converts wall-clock time into the simulated " + "microseconds measured by those options."); + // Schedule_stats options. 
droption_t op_schedule_stats_print_every(DROPTION_SCOPE_ALL, "schedule_stats_print_every", diff --git a/clients/drcachesim/common/options.h b/clients/drcachesim/common/options.h index 1b54c9d89a4..f08bc7f9969 100644 --- a/clients/drcachesim/common/options.h +++ b/clients/drcachesim/common/options.h @@ -200,6 +200,7 @@ extern dynamorio::droption::droption_t op_kernel_trace_buffer_size_shift; #endif extern dynamorio::droption::droption_t op_core_sharded; extern dynamorio::droption::droption_t op_core_serial; +extern dynamorio::droption::droption_t op_sched_time_per_us; extern dynamorio::droption::droption_t op_sched_quantum; extern dynamorio::droption::droption_t op_sched_time; extern dynamorio::droption::droption_t op_sched_order_time; diff --git a/clients/drcachesim/docs/drcachesim.dox.in b/clients/drcachesim/docs/drcachesim.dox.in index a1097a54a16..406e3293530 100644 --- a/clients/drcachesim/docs/drcachesim.dox.in +++ b/clients/drcachesim/docs/drcachesim.dox.in @@ -66,6 +66,7 @@ targets are provided up front. These join the recent features of - \subpage sec_drcachesim_analyzer - \subpage sec_drcachesim_phys - \subpage sec_drcachesim_core + - \subpage sec_drcachesim_sched - \subpage sec_drcachesim_extend - \subpage sec_drcachesim_tracer - \subpage sec_drcachesim_funcs @@ -180,7 +181,7 @@ Some of the more important markers are: - #dynamorio::drmemtrace::TRACE_MARKER_TYPE_TIMESTAMP - The marker value provides a timestamp for this point of the trace (in units of microseconds since Jan 1, 1601 UTC). This value can be used to synchronize records from different threads as well as analyze latencies (however, tracing overhead inflates time unevenly, so time deltas should not be considered perfectly representative). It is used in the sequential analysis of a multi-threaded trace. -- #dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID - The marker value contains the CPU identifier on which the subsequent records were collected. 
It is useful to help track thread migrations occurring during execution. This marker is written to the header of each trace buffer when the buffer is flushed. Note that if the thread migrates to a different CPU due to preemption by the kernel before a buffer is full, we do not output a separate #dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID marker to capture the previous CPU identifier. However, we expect such cases to be rare. +- #dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID - The marker value contains the CPU identifier on which the subsequent records were collected. It can be used to identify the "as traced" schedule, indicating which threads were on which cores at which times. However, this schedule is not representative and should not be treated as indicating how the application behaves without tracing. See \ref sec_drcachesim_as_traced for further information. - #dynamorio::drmemtrace::TRACE_MARKER_TYPE_FUNC_ID, #dynamorio::drmemtrace::TRACE_MARKER_TYPE_FUNC_RETADDR, #dynamorio::drmemtrace::TRACE_MARKER_TYPE_FUNC_ARG, #dynamorio::drmemtrace::TRACE_MARKER_TYPE_FUNC_RETVAL - These markers are used to capture information about function calls. Which functions to capture must be explicitly selected at tracing time. Typical candiates are heap allocation and freeing functions. See \ref sec_drcachesim_funcs. @@ -188,12 +189,16 @@ Some of the more important markers are: - #dynamorio::drmemtrace::TRACE_MARKER_TYPE_SYSCALL - This identifies a system call. A timestamp is inserted in the trace before and after marker of this type. This marker should be considered to be the actual system call invocation by the kernel, rather than the prior system call gateway instruction fetch record. Thus, these timestamps provide the system call latency. +- #dynamorio::drmemtrace::TRACE_MARKER_TYPE_CORE_IDLE - This is inserted by the trace scheduler (see \ref sec_drcachesim_sched) when there is no work available on a core in core-sharded mode. 
This is meant to simulate actual idle time on a core. + +- #dynamorio::drmemtrace::TRACE_MARKER_TYPE_CORE_WAIT - This is inserted by the trace scheduler (see \ref sec_drcachesim_sched) during replay of a previously recorded schedule when one core gets too far ahead of another according to the recorded timestamps. This is an artificial wait to keep the replay on track, as opposed to the natural idle time of #dynamorio::drmemtrace::TRACE_MARKER_TYPE_CORE_IDLE. + The full set of markers is listed under the enum #dynamorio::drmemtrace::trace_marker_type_t. **************************************************************************** -\page sec_drcachesim_run Running the Cache Simulator +\page sec_drcachesim_run Running Tools -To launch \p drcachesim, use the \p -t flag to \p drrun and specify the +To launch \p drmemtrace, use the \p -t flag to \p drrun and specify the \p drmemtrace framework where the default tool to run is the cache simulator: \code @@ -867,7 +872,7 @@ on program counter continuity and guarantees around kernel control transfer interruptions. It optionally checks for restricted behavior that technically is legal but is not expected to happen in the target trace, helping to identify tracing problems and suitability for use of -a trace for core simulation. +a trace for core simulation (see \ref sec_drcachesim_core). \section sec_tool_syscall_mix System Call Mix @@ -1286,76 +1291,79 @@ are user-specified (see \ref sec_drcachesim_ops). Neither simulator has a simple way to know which core any particular thread executed on for each of its instructions. The tracer records which core a thread is on each time it writes out a full trace buffer, giving an -approximation of the actual scheduling (at the granularity of the trace -buffer size). By default, these cache and TLB simulators ignore that +approximation of the actual scheduling: but this is not representative +due to overhead (see \ref sec_drcachesim_as_traced). 
By default, these cache and TLB +simulators ignore that information and schedule threads to simulated cores in a static round-robin fashion with load balancing to fill in gaps with new threads after threads exit. The option "-cpu_scheduling" (see \ref sec_drcachesim_ops) can be used to instead map each physical cpu to a simulated core and use the recorded cpu that each segment of thread execution occurred on to schedule -execution in a manner that more closely resembles the traced execution on -the physical machine. Below is an example of the output using this option -running an application with many threads on a pysical machine with 8 cpus. -The 8 cpus are mapped to the 4 simulated cores: +execution following the "as traced" schedule, but as just noted this is not +representative. Instead, we recommend using offline traces and dynamic +re-scheduling as explained in \ref sec_drcachesim_sched_dynamic using the +`-core_serial` parameter. Here is an example: \code -$ bin64/drrun -t drmemtrace -cpu_scheduling -- ~/test/pi_estimator 20 +$ bin64/drrun -t drmemtrace -offline -- ~/test/pi_estimator 8 20 Estimation of pi is 3.141592653798125 - ----- ---- +$ bin64/drrun -t drcachesim -core_serial -cores 3 -indir drmemtrace.pi_estimator.*.dir Cache simulation results: -Core #0 (2 traced CPU(s): #2, #5) - L1I stats: - Hits: 2,756,429 - Misses: 1,190 - Miss rate: 0.04% - L1D stats: - Hits: 1,747,822 - Misses: 13,511 - Prefetch hits: 2,354 - Prefetch misses: 11,157 - Miss rate: 0.77% -Core #1 (2 traced CPU(s): #4, #0) - L1I stats: - Hits: 472,948 - Misses: 299 - Miss rate: 0.06% - L1D stats: - Hits: 895,099 - Misses: 1,224 - Prefetch hits: 253 - Prefetch misses: 971 +Core #0 (traced CPU(s): #0) + L1I0 (size=32768, assoc=8, block=64, LRU) stats: + Hits: 1,853,727 + Misses: 2,152 + Compulsory misses: 2,045 + Invalidations: 0 + Miss rate: 0.12% + L1D0 (size=32768, assoc=8, block=64, LRU) stats: + Hits: 605,114 + Misses: 11,973 + Compulsory misses: 9,845 + Invalidations: 0 + 
Prefetch hits: 1,880 + Prefetch misses: 10,093 + Miss rate: 1.94% +Core #1 (traced CPU(s): #1) + L1I1 (size=32768, assoc=8, block=64, LRU) stats: + Hits: 942,992 + Misses: 461 + Compulsory misses: 366 + Invalidations: 0 + Miss rate: 0.05% + L1D1 (size=32768, assoc=8, block=64, LRU) stats: + Hits: 385,134 + Misses: 534 + Compulsory misses: 775 + Invalidations: 0 + Prefetch hits: 144 + Prefetch misses: 390 Miss rate: 0.14% -Core #2 (2 traced CPU(s): #1, #7) - L1I stats: - Hits: 448,581 - Misses: 649 +Core #2 (traced CPU(s): #2) + L1I2 (size=32768, assoc=8, block=64, LRU) stats: + Hits: 944,622 + Misses: 453 + Compulsory misses: 365 + Invalidations: 0 + Miss rate: 0.05% + L1D2 (size=32768, assoc=8, block=64, LRU) stats: + Hits: 385,808 + Misses: 537 + Compulsory misses: 791 + Invalidations: 0 + Prefetch hits: 140 + Prefetch misses: 397 Miss rate: 0.14% - L1D stats: - Hits: 811,483 - Misses: 1,723 - Prefetch hits: 378 - Prefetch misses: 1,345 - Miss rate: 0.21% -Core #3 (2 traced CPU(s): #6, #3) - L1I stats: - Hits: 275,192 - Misses: 154 - Miss rate: 0.06% - L1D stats: - Hits: 522,655 - Misses: 850 - Prefetch hits: 173 - Prefetch misses: 677 - Miss rate: 0.16% -LL stats: - Hits: 12,491 - Misses: 7,109 - Prefetch hits: 8,922 - Prefetch misses: 5,228 - Local miss rate: 36.27% - Child hits: 7,933,367 - Total miss rate: 0.09% +LL (size=8388608, assoc=16, block=64, LRU) stats: + Hits: 8,091 + Misses: 8,019 + Compulsory misses: 13,173 + Invalidations: 0 + Prefetch hits: 5,693 + Prefetch misses: 5,187 + Local miss rate: 49.78% + Child hits: 5,119,561 + Total miss rate: 0.16% \endcode The memory access traces contain some optimizations that combine references @@ -1452,9 +1460,17 @@ instruction information to go along with each load and store, while cache simulators can ignore these "no-fetch" entries and avoid incorrectly inflating instruction fetch statistics. 
-Traces include scheduling markers providing the timestamp and hardware -thread identifier on each thread transition, allowing a simulator to more -closely match the actual hardware if so desired. +Traces include scheduling markers providing the timestamp and hardware thread identifier +on each thread transition, allowing a simulator to more closely match the actual +hardware if so desired: but be aware that this "as-traced" schedule is not +representative, as shown in \ref sec_drcachesim_as_traced. We recommend instead using +dynamic re-scheduling of the software threads: see \ref sec_drcachesim_sched_dynamic. +While we suggest keeping traces stored as thread-sharded and using the dynamic scheduler +in each run, there is support for running the scheduler once and creating a new set of +stored traces in core-sharded format: essentially switching to hardware-thread-oriented +traces. This is done using the \ref sec_tool_record_filter tool in `-core_sharded` mode. +The #dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID markers are not modified by the +dynamic scheduler, and should be ignored in a newly created core-sharded trace. Traces also include markers indicating disruptions in user mode control flow such as signal handler entry and exit. @@ -1480,6 +1496,168 @@ Filtered traces (filtered via -L0_filter) include the dynamic #dynamorio::drmemtrace::TRACE_MARKER_TYPE_INSTRUCTION_COUNT marker at each thread buffer boundary and at thread exit. + +**************************************************************************** +\page sec_drcachesim_sched Trace Scheduler + +In addition to the analysis tool framework, which targets running +multiple tools at once either in parallel across all traced threads or +in a serial fashion, we provide a scheduler which will map inputs to a +given set of outputs in a specified manner. 
This allows a tool such +as a core simulator, or just a tool wanting its own control over +advancing the trace stream (unlike the analysis tool framework where +the framework controls the iteration), to request the next trace +record for each output on its own. This scheduling is also available to any analysis tool +when the input traces are sharded by core (see the `-core_sharded` and `-core_serial` +and various `-sched_*` option documentation under \ref sec_drcachesim_ops as well as +core-sharded notes when \ref sec_drcachesim_newtool). + +******************** +\section sec_drcachesim_as_traced As-Traced Schedule Limitations + +During tracing, marker records (see \ref sec_drcachesim_format_other) of type +#dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID record the "as traced" schedule, +indicating which threads were on which cores at which times. However, this schedule is +not representative and should not be treated as indicating how the application behaves +without tracing. In addition to only containing coarse-grain information at the top and +bottom of trace buffers and missing any context switches occurring in the between, the +indicated switches do not always correlate with where the untraced application would +switch. This is due to tracing overhead, where heavyweight instrumentation is +interspersed with application code and heavyweight i/o operations to write out the trace +data cause delays. This extra overhead causes additional quantum preempts and additional +switches due to blocking system calls for i/o. The resulting as-traced schedule can +contain from 2x to 10x as many context switches as the untraced +application. Consequently, we do not recommend using the as-traced schedule to study the +application itself, though our scheduler does support replaying the as-traced schedule +through the -cpu_schedule_file option. 
+ +******************** +\section sec_drcachesim_sched_dynamic Dynamic Scheduling + +Instead of using the as-traced schedule, we recommend re-scheduling the traced software +threads using our trace scheduler. Our scheduler essentially serves as an operating +system scheduler for this purpose, though using simpler schemes. It models separate +runqueues per core with support for binding inputs to certain cores, priorities, idle +time from blocking system calls, migration thresholds, rebalancing runqueues, etc. It +exposes a number of knobs in the form of -sched_* parameters for the command-line \p +drmemtrace launcher or programmatically through the #dynamorio::drmemtrace::scheduler_t +API. + +Dynamic scheduling provides the following benefits: + +- Deflation of the as-traced context switch rate (see \ref sec_drcachesim_as_traced) to + provide a representative context switch rate. + +- Support for different numbers of cores than were present during tracing. + +- Multi-tenant support where separately traced applications are combined, with the + dynamic scheduler interleaving them. This simulates a multi-tenant machine with a mix + of processes running. + +The downsides include: + +- Risk of incorrect ordering between application software threads. Today, our scheduler + does use the in-trace timestamps (when requested via + #dynamorio::drmemtrace::scheduler_t::DEPENDENCY_TIMESTAMPS) to keep things in relative + order. However, enforcing representative context switch rates is considered more + important than honoring precise trace-buffer-based timestamp inter-input dependencies: + thus, timestamp ordering will be followed at context switch points for picking the + next input, but timestamps will not preempt an input. + +The #dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID markers are not modified by the +dynamic scheduler, and should be ignored in a dynamic rescheduling. 
+ +******************** +\section sec_drcachesim_sched_time Simulated Time + +As the simulator, rather than the scheduler, tracks simulated time, yet the scheduler +needs to make some decisions based on time (such as when to preempt, when to migrate +across cores, etc.), the simulator should pass in the current time when it queries the +scheduler for the next record. The simulator tells the scheduler how many units of this +simulated time comprise one microsecond so that the scheduler can scale its other +parameters appropriately. + +******************** +\section sec_drcachesim_sched_idle Idle Time + +The dynamic scheduler inserts markers of type +#dynamorio::drmemtrace::TRACE_MARKER_TYPE_CORE_IDLE when there is no work available on a +core, simulating actual idle time. This can happen even when there are inputs +potentially available as the scheduler simulates i/o by blocking inputs from executing +for a period of time when they make blocking system calls. This time is based on the +system call latency recorded in the trace, but since this can be indirectly inflated due +to tracing overhead the scheduler provides parameters to scale this time, exposed as +`-sched_block_scale` and `-sched_block_max_us` to the \p drmemtrace launcher. These can +be modified to try to achieve a desired level of idle time during simulation. + +******************** +\section sec_drcachesim_sched_replay Record and Replay + +The scheduler supports recording a schedule and replaying it later, allowing for +repeated execution of the same schedule. Timestamps in the recorded schedule help to +align the cores during replay. If one gets too far ahead, markers of type +#dynamorio::drmemtrace::TRACE_MARKER_TYPE_CORE_WAIT are inserted to indicate an +artificial wait in order for the replay to get back on track. + +******************** +\section sec_drcachesim_sched_roi Regions of Interest + +The scheduler supports running a subset of each input. 
A list of start and stop +endpoints delimiting the regions of interest can be supplied with each input. The end +result is as though the inputs had been edited to remove all content not inside the +target regions. + +******************** +\section sec_drcachesim_sched_speculation Speculation Support + +The scheduler contains preliminary speculation support for wrong-path execution. +Currently it only feeds nops, but future versions plan to fill in content based on prior +trace paths. + +******************** +\section sec_drcachesim_sched_ex Scheduler Interface Example + +Here is a simple example of using the scheduler interface directly. + +\code +void +simulate_core(scheduler_t::stream_t *stream) +{ + memref_t record; + for (scheduler_t::stream_status_t status = stream->next_record(record); + status != scheduler_t::STATUS_EOF; status = stream->next_record(record)) { + if (status == scheduler_t::STATUS_WAIT || status == scheduler_t::STATUS_IDLE) { + std::this_thread::yield(); + continue; + } + assert(status == scheduler_t::STATUS_OK); + // Process "record" here. + } +} + +void +run_scheduler(const std::string &trace_directory) +{ + scheduler_t scheduler; + std::vector sched_inputs; + sched_inputs.emplace_back(trace_directory); + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_TIMESTAMPS, + scheduler_t::SCHEDULER_DEFAULTS); + constexpr int NUM_CORES = 4; + if (scheduler.init(sched_inputs, NUM_CORES, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector threads; + threads.reserve(NUM_CORES); + for (int i = 0; i < NUM_CORES; ++i) { + threads.emplace_back(std::thread(&simulate_core, scheduler.get_stream(i))); + } + for (std::thread &thread : threads) + thread.join(); +} +\endcode + **************************************************************************** \page sec_drcachesim_extend Extending the Simulator @@ -1722,45 +1900,6 @@ example, see minimal external analysis tool. 
-\section sec_drcachesim_sched Scheduler - -In addition to the analysis tool framework, which targets running -multiple tools at once either in parallel across all traced threads or -in a serial fashion, we provide a scheduler which will map inputs to a -given set of outputs in a specified manner. This allows a tool such -as a core simulator, or just a tool wanting its own control over -advancing the trace stream (unlike the analysis tool framework where -the framework controls the iteration), to request the next trace -record for each output on its own. This scheduling is also available to any analysis tool -when the input traces are sharded by core (see the `-core_sharding` option documentation -under \ref sec_drcachesim_ops as well as \ref sec_drcachesim_newtool). - -Here is a simple example of a single-output, serial stream. This also -serves as an example of how to replace the now-removed old analysis -tool framework's "external iterator" interface: - -\code - scheduler_t scheduler; - std::vector sched_inputs; - sched_inputs.emplace_back(trace_directory); - if (scheduler.init(sched_inputs, 1, scheduler_t::make_scheduler_serial_options()) != - scheduler_t::STATUS_SUCCESS) { - FATAL_ERROR("failed to initialize scheduler: %s", - scheduler.get_error_string().c_str()); - } - auto *stream = scheduler.get_stream(0); - memref_t record; - for (scheduler_t::stream_status_t status = stream->next_record(record); - status != scheduler_t::STATUS_EOF; status = stream->next_record(record)) { - if (status != scheduler_t::STATUS_OK) - FATAL_ERROR("scheduler failed to advance: %d", status); - if (!my_tool->process_memref(record)) { - FATAL_ERROR("tool failed to process entire trace: %s", - my_tool->get_error_string().c_str()); - } - } -\endcode - **************************************************************************** \page sec_drcachesim_ops Simulator Parameters diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 
294bd488287..331df93b419 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -842,6 +843,11 @@ scheduler_tmpl_t::init( } } + // Legacy field support. + sched_type_t::scheduler_status_t res = legacy_field_support(); + if (res != sched_type_t::STATUS_SUCCESS) + return res; + if (TESTANY(sched_type_t::SCHEDULER_USE_SINGLE_INPUT_ORDINALS, options_.flags) && inputs_.size() == 1 && output_count == 1) { options_.flags = static_cast( @@ -881,13 +887,75 @@ scheduler_tmpl_t::init( VPRINT(this, 1, "%zu inputs\n", inputs_.size()); live_input_count_.store(static_cast(inputs_.size()), std::memory_order_release); - sched_type_t::scheduler_status_t res = read_switch_sequences(); + res = read_switch_sequences(); if (res != sched_type_t::STATUS_SUCCESS) return STATUS_ERROR_INVALID_PARAMETER; return set_initial_schedule(workload2inputs); } +template +typename scheduler_tmpl_t::scheduler_status_t +scheduler_tmpl_t::legacy_field_support() +{ + if (options_.time_units_per_us == 0) { + error_string_ = "time_units_per_us must be > 0"; + return STATUS_ERROR_INVALID_PARAMETER; + } + if (options_.quantum_duration > 0) { + if (options_.struct_size > offsetof(scheduler_options_t, quantum_duration_us)) { + error_string_ = "quantum_duration is deprecated; use quantum_duration_us and " + "time_units_per_us or quantum_duration_instrs"; + return STATUS_ERROR_INVALID_PARAMETER; + } + if (options_.quantum_unit == QUANTUM_INSTRUCTIONS) { + options_.quantum_duration_instrs = options_.quantum_duration; + } else { + options_.quantum_duration_us = + static_cast(static_cast(options_.quantum_duration) / + options_.time_units_per_us); + VPRINT(this, 2, + "Legacy support: setting quantum_duration_us to %" PRIu64 "\n", + options_.quantum_duration_us); + } + } + if (options_.quantum_duration_us == 0) { + error_string_ = "quantum_duration_us must be > 0"; + return 
STATUS_ERROR_INVALID_PARAMETER; + } + if (options_.block_time_scale > 0) { + if (options_.struct_size > offsetof(scheduler_options_t, block_time_multiplier)) { + error_string_ = "block_time_scale is deprecated; use block_time_multiplier " + "and time_units_per_us"; + return STATUS_ERROR_INVALID_PARAMETER; + } + options_.block_time_multiplier = + static_cast(options_.block_time_scale) / options_.time_units_per_us; + VPRINT(this, 2, "Legacy support: setting block_time_multiplier to %6.3f\n", + options_.block_time_multiplier); + } + if (options_.block_time_multiplier == 0) { + error_string_ = "block_time_multiplier must != 0"; + return STATUS_ERROR_INVALID_PARAMETER; + } + if (options_.block_time_max > 0) { + if (options_.struct_size > offsetof(scheduler_options_t, block_time_max_us)) { + error_string_ = "block_time_max is deprecated; use block_time_max_us " + "and time_units_per_us"; + return STATUS_ERROR_INVALID_PARAMETER; + } + options_.block_time_max_us = static_cast( + static_cast(options_.block_time_max) / options_.time_units_per_us); + VPRINT(this, 2, "Legacy support: setting block_time_max_us to %" PRIu64 "\n", + options_.block_time_max_us); + } + if (options_.block_time_max_us == 0) { + error_string_ = "block_time_max_us must be > 0"; + return STATUS_ERROR_INVALID_PARAMETER; + } + return STATUS_SUCCESS; +} + template typename scheduler_tmpl_t::scheduler_status_t scheduler_tmpl_t::set_initial_schedule( @@ -2551,17 +2619,17 @@ template uint64_t scheduler_tmpl_t::scale_blocked_time(uint64_t initial_time) const { - uint64_t scaled = static_cast(static_cast(initial_time) * - options_.block_time_scale); - if (scaled > options_.block_time_max) { + uint64_t scaled_us = static_cast(static_cast(initial_time) * + options_.block_time_multiplier); + if (scaled_us > options_.block_time_max_us) { // We have a max to avoid outlier latencies that are already a second or // more from scaling up to tens of minutes. 
We assume a cap is representative // as the outliers likely were not part of key dependence chains. Without a // cap the other threads all finish and the simulation waits for tens of // minutes further for a couple of outliers. - scaled = options_.block_time_max; + scaled_us = options_.block_time_max_us; } - return scaled; + return static_cast(scaled_us * options_.time_units_per_us); } template @@ -2587,11 +2655,11 @@ scheduler_tmpl_t::syscall_incurs_switch(input_info_t *in : options_.syscall_switch_threshold; blocked_time = scale_blocked_time(latency); VPRINT(this, 3, - "input %d %ssyscall latency %" PRIu64 " * scale %5.1f => blocked time %" PRIu64 + "input %d %ssyscall latency %" PRIu64 " * scale %6.3f => blocked time %" PRIu64 "\n", input->index, input->processing_maybe_blocking_syscall ? "maybe-blocking " : "", latency, - options_.block_time_scale, blocked_time); + options_.block_time_multiplier, blocked_time); return latency >= threshold; } @@ -3279,6 +3347,8 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, // It's more efficient for QUANTUM_INSTRUCTIONS to get the time here instead of // in get_output_time(). This also makes the two more similarly behaved with // respect to blocking system calls. + // TODO i#6971: Use INSTRS_PER_US to replace .cur_time completely + // with a counter-based time, weighted appropriately for STATUS_IDLE. cur_time = get_time_micros(); } outputs_[output].cur_time = cur_time; // Invalid values are checked below. @@ -3492,7 +3562,7 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, record_type_is_instr_boundary(record, outputs_[output].last_record) && !outputs_[output].in_kernel_code) { ++input->instrs_in_quantum; - if (input->instrs_in_quantum > options_.quantum_duration) { + if (input->instrs_in_quantum > options_.quantum_duration_instrs) { // We again prefer to switch to another input even if the current // input has the oldest timestamp, prioritizing context switches // over timestamp ordering. 
@@ -3516,7 +3586,10 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, input->time_spent_in_quantum += cur_time - input->prev_time_in_quantum; prev_time_in_quantum = input->prev_time_in_quantum; input->prev_time_in_quantum = cur_time; - if (input->time_spent_in_quantum >= options_.quantum_duration && + double elapsed_micros = + static_cast(input->time_spent_in_quantum) / + options_.time_units_per_us; + if (elapsed_micros >= options_.quantum_duration_us && // We only switch on instruction boundaries. We could possibly switch // in between (e.g., scatter/gather long sequence of reads/writes) by // setting input->switching_pre_instruction. @@ -3759,13 +3832,14 @@ scheduler_tmpl_t::eof_or_idle(output_ordinal_t output, outputs_[output].wait_start_time = get_output_time(output); } else { uint64_t now = get_output_time(output); - if (now - outputs_[output].wait_start_time > - options_.block_time_max) { + double elapsed_micros = (now - outputs_[output].wait_start_time) / + options_.time_units_per_us; + if (elapsed_micros > options_.block_time_max_us) { // XXX i#6822: We may want some other options here for what to // do. We could release just one input at a time, which would be // the same scheduling order (as we have FIFO in // unscheduled_priority_) but may take a long time at - // block_time_max each; we could declare we're done and just + // block_time_max_us each; we could declare we're done and just // exit, maybe under a flag or if we could see what % of total // records we've processed. VPRINT(this, 1, diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index 4184ffa1a86..821644b5c38 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -595,12 +595,15 @@ template class scheduler_tmpl_t { /** The unit of the schedule time quantum. */ quantum_unit_t quantum_unit = QUANTUM_INSTRUCTIONS; /** - * The scheduling quantum duration for preemption. 
The units are - * specified by - * #dynamorio::drmemtrace::scheduler_tmpl_t::scheduler_options_t::quantum_unit. + * Deprecated: use #quantum_duration_us and #time_units_per_us for #QUANTUM_TIME, + * or #quantum_duration_instrs for #QUANTUM_INSTRUCTIONS, instead. It + * is an error to set this to a non-zero value when #struct_size includes + * #quantum_duration_us. When #struct_size does not include + * #quantum_duration_us and this value is non-zero, the value in + * #quantum_duration_us is replaced with this value divided by the default + * value of #time_units_per_us. */ - // We pick 6 million to match 2 instructions per nanosecond with a 3ms quantum. - uint64_t quantum_duration = 6 * 1000 * 1000; + uint64_t quantum_duration = 0; /** * If > 0, diagnostic messages are printed to stderr. Higher values produce * more frequent diagnostics. @@ -643,37 +646,21 @@ template class scheduler_tmpl_t { */ uint64_t blocking_switch_threshold = 100; /** - * Controls the amount of time inputs are considered blocked at a syscall whose - * latency exceeds #syscall_switch_threshold or #blocking_switch_threshold. The - * syscall latency (in microseconds) is multiplied by this field to produce the - * blocked time. For #QUANTUM_TIME, that blocked time in the units reported by - * the time parameter to next_record() must pass before the input is no longer - * considered blocked. Since the system call latencies are in microseconds, this - * #block_time_scale should be set to the number of next_record() time units in - * one simulated microsecond. For #QUANTUM_INSTRUCTIONS, the blocked time in - * wall-clock microseconds must pass before the input is actually selected - * (wall-clock time is used as there is no reasonable alternative with no other - * uniform notion of time); thus, the #block_time_scale value here should equal - * the slowdown of the instruction record processing versus the original - * (untraced) application execution. 
The blocked time is clamped to a maximum - * value controlled by #block_time_max. - * - * The default value is meant to be reasonable for simple analyzers. It may - * result in too much or too little idle time depending on the analyzer or - * simulator and its speed; it is meant to be tuned and modified. - */ - double block_time_scale = 10.; - /** - * The maximum time, in the units explained by #block_time_scale (either - * #QUANTUM_TIME simulator time or wall-clock microseconds for - * #QUANTUM_INSTRUCTIONS), for an input to be considered blocked for any one - * system call. This is applied after multiplying by #block_time_scale. - * This is also used as a fallback to avoid hangs when there are no scheduled - * inputs: if the only inputs left are "unscheduled" (see - * #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE), after this amount of time those - * inputs are all re-scheduled. - */ - uint64_t block_time_max = 250000; + * Deprecated: use #block_time_multiplier instead. It is an error to set + * this to a non-zero value when #struct_size includes #block_time_multiplier. + * When #struct_size does not include #block_time_multiplier and this value is + * non-zero, the value in #block_time_multiplier is replaced with this value + * divided by the default value of #time_units_per_us. + */ + double block_time_scale = 0.; + /** + * Deprecated: use #block_time_max_us and #time_units_per_us instead. It is + * an error to set this to a non-zero value when #struct_size includes + * #block_time_max_us. When #struct_size does not include #block_time_max_us + * and this value is non-zero, the value in #block_time_max_us is replaced + * with this value divided by the default value of #time_units_per_us. + */ + uint64_t block_time_max = 0; // XXX: Should we share the file-to-reader code currently in the scheduler // with the analyzer and only then need reader interfaces and not pass paths // to the scheduler? 
@@ -740,6 +727,59 @@ template class scheduler_tmpl_t { * (these markers remain: they are not removed from the trace). */ bool honor_direct_switches = true; + /** + * How many time units for the "cur_time" value passed to next_record() are + * equivalent to one simulated microsecond. E.g., if the time units are in + * picoseconds, pass one million here. This is used to scale all of the + * other parameters that are in microseconds (they all end in "_us": e.g., + * #quantum_duration_us) so that they operate on the right time scale for the + * passed-in simulator time (or wall-clock microseconds if no time is passed). + */ + double time_units_per_us = 100.; + /** + * The scheduling quantum duration for preemption, in simulated microseconds, + * for #QUANTUM_TIME. This value is multiplied by #time_units_per_us to + * produce a value that is compared to the "cur_time" parameter to + * next_record() to determine when to force a quantum switch. + */ + uint64_t quantum_duration_us = 5000; + /** + * The scheduling quantum duration for preemption, in instruction count, + * for #QUANTUM_INSTRUCTIONS. The time passed to next_record() is ignored + * for purposes of quantum preempts. + */ + // We pick 10 million to match 2 instructions per nanosecond with a 5ms quantum. + uint64_t quantum_duration_instrs = 10 * 1000 * 1000; + /** + * Controls the amount of time inputs are considered blocked at a syscall + * whose as-traced latency (recorded in timestamp records in the trace) + * exceeds #syscall_switch_threshold or #blocking_switch_threshold. The + * as-traced syscall latency (which is in traced microseconds) is multiplied + * by this field to produce the blocked time in simulated microseconds. Once + * that many simulated microseconds have passed according to the "cur_time" + * value passed to next_record() (multiplied by #time_units_per_us), the + * input will be no longer considered blocked. The blocked time is clamped + * to a maximum value controlled by #block_time_max_us. 
+ * + * While there is no direct overhead during tracing, indirect overhead + * does result in some inflation of recorded system call latencies. + * Thus, a value below 1 is typically used here. This value, in combination + * with #block_time_max_us, can be tuned to achieve a desired idle rate. + * The default value errs on the side of less idle time. + */ + double block_time_multiplier = 0.1; + /** + * The maximum time in microseconds for an input to be considered blocked for + * any one system call. This value is multiplied by #time_units_per_us to + * produce a value that is compared to the "cur_time" parameter to + * next_record(). If any block time (see #block_time_multiplier) exceeds + * this value, it is capped to this value. This value is also used as a + * fallback to avoid hangs when there are no scheduled inputs: if the only + * inputs left are "unscheduled" (see #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE), + * after this amount of time those inputs are all re-scheduled. + */ + // TODO i#6959: Once we have -exit_if_all_unscheduled raise this. + uint64_t block_time_max_us = 2500; }; /** @@ -1324,11 +1364,11 @@ template class scheduler_tmpl_t { uint64_t syscall_timeout_arg = 0; // Used to switch before we've read the next instruction. bool switching_pre_instruction = false; - // Used for time-based quanta. + // Used for time-based quanta. The units are simulation time. uint64_t prev_time_in_quantum = 0; uint64_t time_spent_in_quantum = 0; // These fields model waiting at a blocking syscall. - // The units are us for instr quanta and simuilation time for time quanta. + // The units are in simulation time. 
uint64_t blocked_time = 0; uint64_t blocked_start_time = 0; // An input can be "unscheduled" and not on the ready_priority_ run queue at all @@ -1532,6 +1572,9 @@ template class scheduler_tmpl_t { process_next_initial_record(input_info_t &input, RecordType record, bool &found_filetype, bool &found_timestamp); + scheduler_status_t + legacy_field_support(); + // Opens readers for each file in 'path', subject to the constraints in // 'reader_info'. 'path' may be a directory. // Updates the ti2dinput, unfiltered_tids, and input_count fields of 'reader_info'. diff --git a/clients/drcachesim/tests/scheduler_launcher.cpp b/clients/drcachesim/tests/scheduler_launcher.cpp index 16554a9daa9..1944029c96d 100644 --- a/clients/drcachesim/tests/scheduler_launcher.cpp +++ b/clients/drcachesim/tests/scheduler_launcher.cpp @@ -307,10 +307,13 @@ _tmain(int argc, const TCHAR *targv[]) op_honor_stamps.get_value() ? scheduler_t::DEPENDENCY_TIMESTAMPS : scheduler_t::DEPENDENCY_IGNORE, scheduler_t::SCHEDULER_DEFAULTS, op_verbose.get_value()); - sched_ops.quantum_duration = op_sched_quantum.get_value(); - if (op_sched_time.get_value()) + if (op_sched_time.get_value()) { sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; - sched_ops.block_time_scale = op_block_time_scale.get_value(); + sched_ops.quantum_duration_us = op_sched_quantum.get_value(); + } else { + sched_ops.quantum_duration_instrs = op_sched_quantum.get_value(); + } + sched_ops.block_time_multiplier = op_block_time_scale.get_value(); #ifdef HAS_ZIP std::unique_ptr record_zip; std::unique_ptr replay_zip; diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index 26e4310d679..d9515676db9 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -34,14 +34,20 @@ #undef NDEBUG #include #include +#include #include #include +#include +#include #include #include +#include #include "dr_api.h" #include 
"scheduler.h" #include "mock_reader.h" +#include "memref.h" +#include "trace_entry.h" #ifdef HAS_ZIP # include "zipfile_istream.h" # include "zipfile_ostream.h" @@ -160,6 +166,104 @@ verify_scheduler_stats(scheduler_t::stream_t *stream, int64_t switch_input_to_in migrations); } +// Returns a vector of strings, one per output, where each string has one char per input +// showing the order of inputs scheduled onto that output. +// Assumes the input threads are all tid_base plus an offset < 26. +// When send_time=true, the record count is passed to the scheduler as the current +// time, to avoid relying on wall-clock time. For this use case of send_time=true, +// typically time_units_per_us should be set to 1 to avoid any scaling of the record +// count for simpler small tests. +static std::vector +run_lockstep_simulation(scheduler_t &scheduler, int num_outputs, memref_tid_t tid_base, + bool send_time = false, bool print_markers = true) +{ + // Walk the outputs in lockstep for crude but deterministic concurrency. + std::vector outputs(num_outputs, nullptr); + std::vector eof(num_outputs, false); + for (int i = 0; i < num_outputs; i++) + outputs[i] = scheduler.get_stream(i); + int num_eof = 0; + int64_t meta_records = 0; + // Record the threads, one char each. + std::vector sched_as_string(num_outputs); + static constexpr char THREAD_LETTER_START = 'A'; + static constexpr char WAIT_SYMBOL = '-'; + static constexpr char IDLE_SYMBOL = '_'; + static constexpr char NON_INSTR_SYMBOL = '.'; + while (num_eof < num_outputs) { + for (int i = 0; i < num_outputs; i++) { + if (eof[i]) + continue; + memref_t memref; + scheduler_t::stream_status_t status; + if (send_time) { + // We assume IPC=1 and so send the instruction count (+1 to avoid an + // invalid time of 0) which allows apples-to-apples comparisons with + // instruction quanta. 
This is a per-output time which technically + // violates the globally-increasing requirement, so this will not work + // perfectly with i/o waits, but should work fine for basic tests. + // We add the wait and idle records to make progress with idle time. + status = outputs[i]->next_record( + memref, outputs[i]->get_instruction_ordinal() + 1 + meta_records); + } else { + status = outputs[i]->next_record(memref); + } + if (status == scheduler_t::STATUS_EOF) { + ++num_eof; + eof[i] = true; + continue; + } + if (status == scheduler_t::STATUS_WAIT) { + sched_as_string[i] += WAIT_SYMBOL; + ++meta_records; + continue; + } + if (status == scheduler_t::STATUS_IDLE) { + sched_as_string[i] += IDLE_SYMBOL; + ++meta_records; + continue; + } + assert(status == scheduler_t::STATUS_OK); + if (type_is_instr(memref.instr.type)) { + sched_as_string[i] += + THREAD_LETTER_START + static_cast(memref.instr.tid - tid_base); + } else { + // While this makes the string longer, it is just too confusing + // with the same letter seemingly on 2 cores at once without these + // fillers to line everything up in time. + sched_as_string[i] += NON_INSTR_SYMBOL; + } + assert(outputs[i]->get_shard_index() == + outputs[i]->get_output_stream_ordinal()); + } + } + // Ensure we never see the same output on multiple cores in the same timestep. + size_t max_size = 0; + for (int i = 0; i < num_outputs; ++i) + max_size = std::max(max_size, sched_as_string[i].size()); + for (int step = 0; step < static_cast(max_size); ++step) { + std::set inputs; + for (int out = 0; out < num_outputs; ++out) { + if (static_cast(sched_as_string[out].size()) <= step) + continue; + if (sched_as_string[out][step] < 'A' || sched_as_string[out][step] > 'Z') + continue; + assert(inputs.find(sched_as_string[out][step]) == inputs.end()); + inputs.insert(sched_as_string[out][step]); + } + } + if (!print_markers) { + // We kept the dots internally for our same-timestep check above. 
+ for (int i = 0; i < num_outputs; ++i) { + sched_as_string[i].erase(std::remove(sched_as_string[i].begin(), + sched_as_string[i].end(), + NON_INSTR_SYMBOL), + sched_as_string[i].end()); + } + } + return sched_as_string; +} + static void test_serial() { @@ -294,7 +398,7 @@ test_parallel() } static void -test_param_checks() +test_invalid_regions() { std::vector readers; readers.emplace_back(std::unique_ptr(new mock_reader_t()), @@ -333,6 +437,143 @@ test_param_checks() scheduler_t::STATUS_ERROR_INVALID_PARAMETER); } +static void +test_legacy_fields() +{ + std::cerr << "\n----------------\nTesting legacy fields\n"; + static constexpr int NUM_INPUTS = 7; + static constexpr int NUM_OUTPUTS = 2; + static constexpr int NUM_INSTRS = 9; + static constexpr int QUANTUM_DURATION = 3; + // We do not want to block for very long. + static constexpr uint64_t BLOCK_LATENCY = 200; + static constexpr double BLOCK_SCALE = 0.01; + static constexpr uint64_t BLOCK_MAX = 50; + static constexpr memref_tid_t TID_BASE = 100; + static constexpr uint64_t START_TIME = 20; + std::vector inputs[NUM_INPUTS]; + for (int i = 0; i < NUM_INPUTS; i++) { + memref_tid_t tid = TID_BASE + i; + inputs[i].push_back(make_thread(tid)); + inputs[i].push_back(make_pid(1)); + inputs[i].push_back(make_version(TRACE_ENTRY_VERSION)); + inputs[i].push_back(make_timestamp(START_TIME)); // All the same time priority. + for (int j = 0; j < NUM_INSTRS; j++) { + inputs[i].push_back(make_instr(42 + j * 4)); + // Including blocking syscalls. + if ((i == 0 || i == 1) && j == 1) { + inputs[i].push_back(make_timestamp(START_TIME * 2)); + inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 42)); + inputs[i].push_back( + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0)); + inputs[i].push_back(make_timestamp(START_TIME * 2 + BLOCK_LATENCY)); + } + } + inputs[i].push_back(make_exit(tid)); + } + { + // Test invalid quantum. 
+ std::vector sched_inputs; + std::vector readers; + readers.emplace_back(std::unique_ptr(new mock_reader_t(inputs[0])), + std::unique_ptr(new mock_reader_t()), + TID_BASE); + sched_inputs.emplace_back(std::move(readers)); + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS); + sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.quantum_duration = QUANTUM_DURATION; + scheduler_t scheduler; + assert(scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) == + scheduler_t::STATUS_ERROR_INVALID_PARAMETER); + } + { + // Test invalid block scale. + std::vector sched_inputs; + std::vector readers; + readers.emplace_back(std::unique_ptr(new mock_reader_t(inputs[0])), + std::unique_ptr(new mock_reader_t()), + TID_BASE); + sched_inputs.emplace_back(std::move(readers)); + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS); + sched_ops.block_time_scale = BLOCK_SCALE; + scheduler_t scheduler; + assert(scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) == + scheduler_t::STATUS_ERROR_INVALID_PARAMETER); + } + { + // Test invalid block max. + std::vector sched_inputs; + std::vector readers; + readers.emplace_back(std::unique_ptr(new mock_reader_t(inputs[0])), + std::unique_ptr(new mock_reader_t()), + TID_BASE); + sched_inputs.emplace_back(std::move(readers)); + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS); + sched_ops.block_time_max = BLOCK_MAX; + scheduler_t scheduler; + assert(scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) == + scheduler_t::STATUS_ERROR_INVALID_PARAMETER); + } + { + // Test valid legacy fields. 
+ std::vector sched_inputs; + for (int i = 0; i < NUM_INPUTS; i++) { + std::vector readers; + readers.emplace_back( + std::unique_ptr(new mock_reader_t(inputs[i])), + std::unique_ptr(new mock_reader_t()), TID_BASE + i); + sched_inputs.emplace_back(std::move(readers)); + } + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/4); + // Simulate binary compatibility with a legacy struct. + sched_ops.struct_size = + offsetof(scheduler_t::scheduler_options_t, time_units_per_us); + sched_ops.quantum_duration_us = QUANTUM_DURATION; + sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_max = BLOCK_MAX; + // To do our test we use instrs-as-time for deterministic block times. + sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); + // Hardcoding here for the 2 outputs and 7 inputs. + // We expect 3 letter sequences (our quantum) alternating every-other as each + // core alternates. The dots are markers and thread exits. + // A and B have a voluntary switch after their 1st 2 letters, but we expect + // the usage to persist to their next scheduling which should only have + // a single letter. 
+ static const char *const CORE0_SCHED_STRING = + "..AA......CCC..EEE..GGGDDDFFFBBBCCC.EEE.AAA.GGG."; + static const char *const CORE1_SCHED_STRING = + "..BB......DDD..FFFABCCCEEEAAAGGGDDD.FFF.BBB.____"; + for (int i = 0; i < NUM_OUTPUTS; i++) { + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + assert(sched_as_string[0] == CORE0_SCHED_STRING); + assert(sched_as_string[1] == CORE1_SCHED_STRING); + } +} + +static void +test_param_checks() +{ + test_invalid_regions(); + test_legacy_fields(); +} + // Tests regions without timestamps for a simple, direct test. static void test_regions_bare() @@ -947,99 +1188,6 @@ test_real_file_queries_and_filters(const char *testdir) #endif } -// Returns a string with one char per input. -// Assumes the input threads are all tid_base plus an offset < 26. -static std::vector -run_lockstep_simulation(scheduler_t &scheduler, int num_outputs, memref_tid_t tid_base, - bool send_time = false, bool print_markers = true) -{ - // Walk the outputs in lockstep for crude but deterministic concurrency. - std::vector outputs(num_outputs, nullptr); - std::vector eof(num_outputs, false); - for (int i = 0; i < num_outputs; i++) - outputs[i] = scheduler.get_stream(i); - int num_eof = 0; - int64_t meta_records = 0; - // Record the threads, one char each. - std::vector sched_as_string(num_outputs); - static constexpr char THREAD_LETTER_START = 'A'; - static constexpr char WAIT_SYMBOL = '-'; - static constexpr char IDLE_SYMBOL = '_'; - static constexpr char NON_INSTR_SYMBOL = '.'; - while (num_eof < num_outputs) { - for (int i = 0; i < num_outputs; i++) { - if (eof[i]) - continue; - memref_t memref; - scheduler_t::stream_status_t status; - if (send_time) { - // We assume IPC=1 and so send the instruction count (+1 to avoid an - // invalid time of 0) which allows apples-to-apples comparisons with - // instruction quanta. 
This is a per-output time which technically - // violates the globally-increasing requirement, so this will not work - // perfectly with i/o waits, but should work fine for basic tests. - // We add the wait and idle records to make progress with idle time. - status = outputs[i]->next_record( - memref, outputs[i]->get_instruction_ordinal() + 1 + meta_records); - } else { - status = outputs[i]->next_record(memref); - } - if (status == scheduler_t::STATUS_EOF) { - ++num_eof; - eof[i] = true; - continue; - } - if (status == scheduler_t::STATUS_WAIT) { - sched_as_string[i] += WAIT_SYMBOL; - ++meta_records; - continue; - } - if (status == scheduler_t::STATUS_IDLE) { - sched_as_string[i] += IDLE_SYMBOL; - ++meta_records; - continue; - } - assert(status == scheduler_t::STATUS_OK); - if (type_is_instr(memref.instr.type)) { - sched_as_string[i] += - THREAD_LETTER_START + static_cast(memref.instr.tid - tid_base); - } else { - // While this makes the string longer, it is just too confusing - // with the same letter seemingly on 2 cores at once without these - // fillers to line everything up in time. - sched_as_string[i] += NON_INSTR_SYMBOL; - } - assert(outputs[i]->get_shard_index() == - outputs[i]->get_output_stream_ordinal()); - } - } - // Ensure we never see the same output on multiple cores in the same timestep. - size_t max_size = 0; - for (int i = 0; i < num_outputs; ++i) - max_size = std::max(max_size, sched_as_string[i].size()); - for (int step = 0; step < static_cast(max_size); ++step) { - std::set inputs; - for (int out = 0; out < num_outputs; ++out) { - if (static_cast(sched_as_string[out].size()) <= step) - continue; - if (sched_as_string[out][step] < 'A' || sched_as_string[out][step] > 'Z') - continue; - assert(inputs.find(sched_as_string[out][step]) == inputs.end()); - inputs.insert(sched_as_string[out][step]); - } - } - if (!print_markers) { - // We kept the dots internally for our same-timestep check above. 
- for (int i = 0; i < num_outputs; ++i) { - sched_as_string[i].erase(std::remove(sched_as_string[i].begin(), - sched_as_string[i].end(), - NON_INSTR_SYMBOL), - sched_as_string[i].end()); - } - } - return sched_as_string; -} - static void test_synthetic() { @@ -1095,8 +1243,9 @@ test_synthetic() scheduler_t::DEPENDENCY_IGNORE, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = QUANTUM_DURATION; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.quantum_duration_instrs = QUANTUM_DURATION; + sched_ops.block_time_multiplier = BLOCK_SCALE; + sched_ops.time_units_per_us = 1.; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1160,8 +1309,9 @@ test_synthetic() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; - sched_ops.quantum_duration = QUANTUM_DURATION; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.time_units_per_us = 1.; + sched_ops.quantum_duration_us = QUANTUM_DURATION; + sched_ops.block_time_multiplier = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1223,9 +1373,10 @@ test_synthetic_time_quanta() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/4); sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; - sched_ops.quantum_duration = 3; + sched_ops.time_units_per_us = 1.; + sched_ops.quantum_duration_us = 3; // Ensure it waits 10 steps. - sched_ops.block_time_scale = 10. / (POST_BLOCK_TIME - PRE_BLOCK_TIME); + sched_ops.block_time_multiplier = 10. 
/ (POST_BLOCK_TIME - PRE_BLOCK_TIME); zipfile_ostream_t outfile(record_fname); sched_ops.schedule_record_ostream = &outfile; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != @@ -1408,7 +1559,7 @@ test_synthetic_with_timestamps() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = 3; + sched_ops.quantum_duration_instrs = 3; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1506,7 +1657,7 @@ test_synthetic_with_priorities() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = 3; + sched_ops.quantum_duration_instrs = 3; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1592,7 +1743,7 @@ test_synthetic_with_bindings_time(bool time_deps) time_deps ? scheduler_t::DEPENDENCY_TIMESTAMPS : scheduler_t::DEPENDENCY_IGNORE, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = 3; + sched_ops.quantum_duration_instrs = 3; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1646,7 +1797,7 @@ test_synthetic_with_bindings_more_out() scheduler_t::DEPENDENCY_IGNORE, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = 3; + sched_ops.quantum_duration_instrs = 3; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1716,7 +1867,7 @@ test_synthetic_with_bindings_weighted() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = 3; + sched_ops.quantum_duration_instrs = 3; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1828,11 +1979,12 @@ 
test_synthetic_with_syscalls_multiple() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = 3; + sched_ops.quantum_duration_us = 3; // We use our mock's time==instruction count for a deterministic result. sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_multiplier = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1936,11 +2088,12 @@ test_synthetic_with_syscalls_single() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/4); - sched_ops.quantum_duration = 3; + sched_ops.quantum_duration_us = 3; // We use our mock's time==instruction count for a deterministic result. sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_multiplier = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -2140,8 +2293,9 @@ test_synthetic_with_syscalls_latencies() /*verbosity=*/4); // We use a mock time for a deterministic result. 
sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_multiplier = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, 1, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -2247,11 +2401,12 @@ test_synthetic_with_syscalls_idle() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = 3; + sched_ops.quantum_duration_us = 3; // We use a mock time for a deterministic result. sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_multiplier = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -2312,7 +2467,7 @@ test_synthetic_multi_threaded(const char *testdir) /*verbosity=*/2); static constexpr int NUM_OUTPUTS = 4; static constexpr int QUANTUM_DURATION = 2000; - sched_ops.quantum_duration = QUANTUM_DURATION; + sched_ops.quantum_duration_instrs = QUANTUM_DURATION; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) assert(false); @@ -2544,7 +2699,7 @@ test_replay() scheduler_t::DEPENDENCY_IGNORE, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = QUANTUM_INSTRS; + sched_ops.quantum_duration_instrs = QUANTUM_INSTRS; zipfile_ostream_t outfile(record_fname); sched_ops.schedule_record_ostream = &outfile; @@ -2672,7 +2827,7 @@ test_replay_multi_threaded(const char *testdir) zipfile_ostream_t outfile(record_fname); sched_ops.schedule_record_ostream = &outfile; static constexpr int QUANTUM_DURATION = 2000; - sched_ops.quantum_duration = QUANTUM_DURATION; + sched_ops.quantum_duration_instrs = QUANTUM_DURATION; if 
(scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) assert(false); @@ -3300,7 +3455,7 @@ test_replay_limit() /*verbosity=*/2); zipfile_ostream_t outfile(record_fname); sched_ops.schedule_record_ostream = &outfile; - sched_ops.quantum_duration = NUM_INSTRS / 10; + sched_ops.quantum_duration_instrs = NUM_INSTRS / 10; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) assert(false); @@ -4029,7 +4184,7 @@ test_inactive() scheduler_t::DEPENDENCY_IGNORE, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/4); - sched_ops.quantum_duration = 2; + sched_ops.quantum_duration_instrs = 2; zipfile_ostream_t outfile(record_fname); sched_ops.schedule_record_ostream = &outfile; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != @@ -4253,11 +4408,12 @@ test_direct_switch() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = QUANTUM_DURATION; + sched_ops.quantum_duration_us = QUANTUM_DURATION; // We use our mock's time==instruction count for a deterministic result. sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_multiplier = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -4293,11 +4449,12 @@ test_direct_switch() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = QUANTUM_DURATION; + sched_ops.quantum_duration_us = QUANTUM_DURATION; // We use our mock's time==instruction count for a deterministic result. 
sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_multiplier = BLOCK_SCALE; sched_ops.honor_direct_switches = false; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != @@ -4460,11 +4617,12 @@ test_unscheduled() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = QUANTUM_DURATION; + sched_ops.quantum_duration_us = QUANTUM_DURATION; // We use our mock's time==instruction count for a deterministic result. sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_multiplier = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -4497,11 +4655,12 @@ test_unscheduled() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = QUANTUM_DURATION; + sched_ops.quantum_duration_us = QUANTUM_DURATION; // We use our mock's time==instruction count for a deterministic result. 
sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_multiplier = BLOCK_SCALE; sched_ops.honor_direct_switches = false; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != @@ -4644,12 +4803,13 @@ test_unscheduled_fallback() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = QUANTUM_DURATION; + sched_ops.quantum_duration_us = QUANTUM_DURATION; // We use our mock's time==instruction count for a deterministic result. sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; - sched_ops.block_time_max = BLOCK_TIME_MAX; + sched_ops.block_time_multiplier = BLOCK_SCALE; + sched_ops.block_time_max_us = BLOCK_TIME_MAX; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -4681,12 +4841,13 @@ test_unscheduled_fallback() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = QUANTUM_DURATION; + sched_ops.quantum_duration_us = QUANTUM_DURATION; // We use our mock's time==instruction count for a deterministic result. 
sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; - sched_ops.block_time_max = BLOCK_TIME_MAX; + sched_ops.block_time_multiplier = BLOCK_SCALE; + sched_ops.block_time_max_us = BLOCK_TIME_MAX; sched_ops.honor_direct_switches = false; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != @@ -4768,9 +4929,10 @@ test_unscheduled_initially() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; - sched_ops.block_time_max = BLOCK_TIME_MAX; + sched_ops.block_time_multiplier = BLOCK_SCALE; + sched_ops.block_time_max_us = BLOCK_TIME_MAX; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -4801,9 +4963,10 @@ test_unscheduled_initially() /*verbosity=*/3); // We use our mock's time==instruction count for a deterministic result. 
sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; - sched_ops.block_time_max = BLOCK_TIME_MAX; + sched_ops.block_time_multiplier = BLOCK_SCALE; + sched_ops.block_time_max_us = BLOCK_TIME_MAX; sched_ops.honor_direct_switches = false; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != @@ -5014,7 +5177,7 @@ test_kernel_switch_sequences() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/4); - sched_ops.quantum_duration = INSTR_QUANTUM; + sched_ops.quantum_duration_instrs = INSTR_QUANTUM; sched_ops.kernel_switch_reader = std::move(switch_reader); sched_ops.kernel_switch_reader_end = std::move(switch_reader_end); scheduler_t scheduler; @@ -5242,7 +5405,7 @@ test_random_schedule() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); sched_ops.randomize_next_input = true; - sched_ops.quantum_duration = QUANTUM_DURATION; + sched_ops.quantum_duration_instrs = QUANTUM_DURATION; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -5328,8 +5491,8 @@ test_record_scheduler() record_scheduler_t::MAP_TO_ANY_OUTPUT, record_scheduler_t::DEPENDENCY_IGNORE, record_scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/4); - sched_ops.quantum_duration = 2; - sched_ops.block_time_scale = 0.001; // Do not stay blocked. + sched_ops.quantum_duration_instrs = 2; + sched_ops.block_time_multiplier = 0.001; // Do not stay blocked. 
if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != record_scheduler_t::STATUS_SUCCESS) assert(false); @@ -5457,6 +5620,7 @@ test_main(int argc, const char *argv[]) test_kernel_switch_sequences(); test_random_schedule(); test_record_scheduler(); + dr_standalone_exit(); return 0; } diff --git a/clients/drcachesim/tests/trace_interval_analysis_unit_tests.cpp b/clients/drcachesim/tests/trace_interval_analysis_unit_tests.cpp index 7ead1275a13..9cfaada137c 100644 --- a/clients/drcachesim/tests/trace_interval_analysis_unit_tests.cpp +++ b/clients/drcachesim/tests/trace_interval_analysis_unit_tests.cpp @@ -249,7 +249,7 @@ class dummy_analysis_tool_t : public analysis_tool_t { } bool finalize_interval_snapshots( - std::vector &interval_snapshots) + std::vector &interval_snapshots) override { if (saw_serial_generate_snapshot_) { error_string_ = "Did not expect finalize_interval_snapshots call in serial " diff --git a/core/loader_shared.c b/core/loader_shared.c index 8c9233db577..8e7d68fb00a 100644 --- a/core/loader_shared.c +++ b/core/loader_shared.c @@ -178,6 +178,7 @@ loader_init_prologue(void) privmod_t *mod = privload_insert(NULL, privmod_static[i].base, privmod_static[i].size, privmod_static[i].name, privmod_static[i].path); + mod->is_top_level_client = true; mod->is_client = true; } @@ -520,6 +521,7 @@ privload_insert(privmod_t *after, app_pc base, size_t size, const char *name, } mod->ref_count = 1; mod->externally_loaded = false; + mod->is_top_level_client = false; /* up to caller to set later */ mod->is_client = false; /* up to caller to set later */ mod->called_proc_entry = false; mod->called_proc_exit = false; diff --git a/core/module_shared.h b/core/module_shared.h index 0cc997c8d8f..a9f64e23a11 100644 --- a/core/module_shared.h +++ b/core/module_shared.h @@ -413,7 +413,9 @@ typedef struct _privmod_t { char path[MAXIMUM_PATH]; uint ref_count; bool externally_loaded; - bool is_client; /* or Extension */ + /* XXX i#6982: Perhaps replace 
is_client with is_top_level_client. */ + bool is_top_level_client; /* set for command-line clients */ + bool is_client; /* set for command-line client or extension */ bool called_proc_entry; bool called_proc_exit; struct _privmod_t *next; diff --git a/core/unix/loader.c b/core/unix/loader.c index 2794dde0448..f011287725a 100644 --- a/core/unix/loader.c +++ b/core/unix/loader.c @@ -581,9 +581,9 @@ privload_process_imports(privmod_t *mod) SYSLOG_INTERNAL_WARNING( "private libpthread.so loaded but not fully supported (i#956)"); } - /* i#852: identify all libs that import from DR as client libs. - * XXX: this code seems stale as libdynamorio.so is already loaded - * (xref #3850). + /* i#852: Identify all libs that import from DR as client libs. + * XXX i#6982: The following condition is never true as + * libdynamorio.so has already been loaded (xref #3850). */ if (impmod->base == get_dynamorio_dll_start()) mod->is_client = true; diff --git a/core/unix/module_elf.c b/core/unix/module_elf.c index 75c943d3d8c..2a3ae3977cc 100644 --- a/core/unix/module_elf.c +++ b/core/unix/module_elf.c @@ -1104,7 +1104,6 @@ module_lookup_symbol(ELF_SYM_TYPE *sym, os_privmod_data_t *pd) { app_pc res; const char *name; - privmod_t *mod; bool is_ifunc; dcontext_t *dcontext = get_thread_private_dcontext(); @@ -1133,11 +1132,21 @@ module_lookup_symbol(ELF_SYM_TYPE *sym, os_privmod_data_t *pd) * FIXME: i#461 We do not tell weak/global, but return on the first we see. */ ASSERT_OWN_RECURSIVE_LOCK(true, &privload_lock); - mod = privload_first_module(); /* FIXME i#3850: Symbols are currently looked up following the dependency chain * depth-first instead of breadth-first. */ - while (mod != NULL) { + for (privmod_t *mod = privload_first_module(); mod != NULL; + mod = privload_next_module(mod)) { + /* Skip other client modules at this point because some will not be + * initialised and clients should be leaves of the dependency tree and + * not provide symbols for other modules. 
Skipping just the uninitialised + * client modules should also work but might introduce an element of + * unpredictability if we are unsure in what order modules will be + * initialised. Skipping all uninitialised modules should also work but + * might hide a more serious problem. See i#4501. + */ + if (mod->is_top_level_client) + continue; pd = mod->os_privmod_data; ASSERT(pd != NULL && name != NULL); @@ -1172,7 +1181,6 @@ module_lookup_symbol(ELF_SYM_TYPE *sym, os_privmod_data_t *pd) } return res; } - mod = privload_next_module(mod); } return NULL; } diff --git a/suite/tests/CMakeLists.txt b/suite/tests/CMakeLists.txt index f065df88277..9e30c5bda27 100644 --- a/suite/tests/CMakeLists.txt +++ b/suite/tests/CMakeLists.txt @@ -3666,6 +3666,19 @@ if (BUILD_SAMPLES) "CFLAGS=-m32;CXXFLAGS=-m32") endif () endif () + if (UNIX) + # XXX: Change this to go through torun() like the other tests to share + # output, multiplexing, _timeout, etc. features: not entirely + # straightforward because this test uses two clients. + # For now, "-s 90 -quiet -killpg" is added explicitly. + get_target_path_for_execution(drrun_path drrun "${location_suffix}") + get_client_path(client1 bbcount bbcount) + get_client_path(client2 opcodes opcodes) + get_target_path_for_execution(app_path "${ci_shared_app}" "${location_suffix}") + add_test(two_clients ${drrun_path} -s 90 -quiet -killpg + -client ${client1} 0 '' + -client ${client2} 1 '' ${dr_test_ops} -- ${app_path}) + endif (UNIX) endif (BUILD_SAMPLES) if (BUILD_CLIENTS)