i#6585: Add drcachesim vector length trace marker (#6603)

Adds a new trace marker 'TRACE_MARKER_TYPE_VECTOR_LENGTH'
to drcachesim that indicates the current vector length for architectures which have a dynamic vector length that can't be statically determined from the instruction.

The marker is emitted as part of the thread header when running on AArch64 with SVE support, but in the future could also be used to track changes in the vector length after prctl(PR_SVE_SET_VL, ..) system calls (i#6625).

Some SVE load and store instructions such as
    LDR <Zt>, [<Xn|SP>{, #<imm>, MUL VL}]
or
    ST1D { <Zt>.D }, <Pg>, [<Xn|SP>{, #<imm>, MUL VL}]

scale the immediate offset based on the hardware vector length so knowing the correct vector length for the traced application is important to properly decode and analyse these instructions.

Fixes: #6585
Issues: #6625
diff --git a/api/docs/release.dox b/api/docs/release.dox
index d74f8f5..3d864c9 100644
--- a/api/docs/release.dox
+++ b/api/docs/release.dox
@@ -193,6 +193,9 @@
  - Added instr_is_opnd_store_source().
  - Added kernel context switch sequence injection support to the drmemtrace scheduler.
  - Added dr_running_under_dynamorio().
+ - Added #dynamorio::drmemtrace::TRACE_MARKER_TYPE_VECTOR_LENGTH marker to indicate the
+   current vector length for architectures with a hardware defined or runtime changeable
+   vector length (such as AArch64's SVE scalable vectors).
 
 **************************************************
 <hr>
diff --git a/clients/drcachesim/common/trace_entry.h b/clients/drcachesim/common/trace_entry.h
index a34a296..454dedd 100644
--- a/clients/drcachesim/common/trace_entry.h
+++ b/clients/drcachesim/common/trace_entry.h
@@ -612,6 +612,21 @@
      */
     TRACE_MARKER_TYPE_CONTEXT_SWITCH_END,
 
+    /**
+     * This marker's value is the current thread's vector length in bytes, for
+     * architectures with a dynamic vector length. It is currently only used on AArch64.
+     *
+     * On AArch64 the marker's value contains the SVE vector length. The marker is
+     * emitted with the thread header to establish the initial vector length for that
+     * thread. In the future it will also be emitted later in the trace if the app
+     * changes the vector length at runtime (TODO i#6625). In all cases the vector
+     * length value is specific to the current thread.
+     * The vector length affects how some SVE instructions are decoded so any tools which
+     * decode instructions should clear any cached data and set the vector length used by
+     * the decoder using dr_set_sve_vector_length().
+     */
+    TRACE_MARKER_TYPE_VECTOR_LENGTH,
+
     // ...
     // These values are reserved for future built-in marker types.
     // ...
diff --git a/clients/drcachesim/docs/drcachesim.dox.in b/clients/drcachesim/docs/drcachesim.dox.in
index 447fd74..13f6bab 100644
--- a/clients/drcachesim/docs/drcachesim.dox.in
+++ b/clients/drcachesim/docs/drcachesim.dox.in
@@ -125,7 +125,11 @@
 information should be invalidated due to possibly changed application
 code.  (For online traces, encodings are not provided unless the
 option `-instr_encodings` is passed, as encodings add overhead and
-are not needed for many tools.)
+are not needed for many tools.) Cached decoding information might also
+need to be discarded if there is a
+#dynamorio::drmemtrace::TRACE_MARKER_TYPE_VECTOR_LENGTH marker entry
+indicating a change of vector length on architectures such as AArch64
+which have a dynamic vector length.
 
 Older legacy traces may not contain instruction encodings.  For those
 traces, encodings for static code can be obtained by
diff --git a/clients/drcachesim/tests/allasm-scattergather-vl-view-aarch64.templatex b/clients/drcachesim/tests/allasm-scattergather-vl-view-aarch64.templatex
new file mode 100644
index 0000000..e609dea
--- /dev/null
+++ b/clients/drcachesim/tests/allasm-scattergather-vl-view-aarch64.templatex
@@ -0,0 +1,9 @@
+.*
+#if __ARM_FEATURE_SVE_BITS == 128
+.*<marker: vector length 16 bytes>
+#elif __ARM_FEATURE_SVE_BITS == 256
+.*<marker: vector length 32 bytes>
+#elif __ARM_FEATURE_SVE_BITS == 512
+.*<marker: vector length 64 bytes>
+#endif
+.*
diff --git a/clients/drcachesim/tests/allasm_scattergather_aarch64.asm b/clients/drcachesim/tests/allasm_scattergather_aarch64.asm
index 658e12a..8e08cee 100644
--- a/clients/drcachesim/tests/allasm_scattergather_aarch64.asm
+++ b/clients/drcachesim/tests/allasm_scattergather_aarch64.asm
@@ -292,75 +292,75 @@
 
 
 test_scalar_plus_immediate:
-        ld1b    DEST_REG1.b, B_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 16
-        ld1b    DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 8
-        ld1b    DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 4
-        ld1b    DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 2
-        ldnt1b  DEST_REG1.b, B_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 16
-        ld1sb   DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 8
-        ld1sb   DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 4
-        ld1sb   DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 2
-        ld1h    DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 8
-        ld1h    DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 4
-        ld1h    DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 2
-        ldnt1h  DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 8
-        ld1sh   DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 4
-        ld1sh   DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 2
-        ld1w    DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 4
-        ld1w    DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 2
-        ldnt1w  DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 4
-        ld1sw   DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 2
-        ld1d    DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 2
-        ldnt1d  DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 2
+        ld1b    DEST_REG1.b, B_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 16
+        ld1b    DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8
+        ld1b    DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
+        ld1b    DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
+        ldnt1b  DEST_REG1.b, B_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 16
+        ld1sb   DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8
+        ld1sb   DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
+        ld1sb   DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
+        ld1h    DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8
+        ld1h    DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
+        ld1h    DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
+        ldnt1h  DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8
+        ld1sh   DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
+        ld1sh   DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
+        ld1w    DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
+        ld1w    DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
+        ldnt1w  DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
+        ld1sw   DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
+        ld1d    DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
+        ldnt1d  DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
                                                                     // Total: 104
 
-        ld2b { DEST_REG1.b, DEST_REG2.b }, B_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 32
-        ld2h { DEST_REG1.h, DEST_REG2.h }, H_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 16
-        ld2w { DEST_REG1.s, DEST_REG2.s }, S_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 8
-        ld2d { DEST_REG1.d, DEST_REG2.d }, D_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 4
+        ld2b { DEST_REG1.b, DEST_REG2.b }, B_MASK_REG/z, [BUFFER_REG, #2, mul vl] // 32
+        ld2h { DEST_REG1.h, DEST_REG2.h }, H_MASK_REG/z, [BUFFER_REG, #2, mul vl] // 16
+        ld2w { DEST_REG1.s, DEST_REG2.s }, S_MASK_REG/z, [BUFFER_REG, #2, mul vl] // 8
+        ld2d { DEST_REG1.d, DEST_REG2.d }, D_MASK_REG/z, [BUFFER_REG, #2, mul vl] // 4
                                                                                   // Total: 60
 
-        ld3b { DEST_REG1.b, DEST_REG2.b, DEST_REG3.b }, B_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 48
-        ld3h { DEST_REG1.h, DEST_REG2.h, DEST_REG3.h }, H_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 24
-        ld3w { DEST_REG1.s, DEST_REG2.s, DEST_REG3.s }, S_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 12
-        ld3d { DEST_REG1.d, DEST_REG2.d, DEST_REG3.d }, D_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 6
+        ld3b { DEST_REG1.b, DEST_REG2.b, DEST_REG3.b }, B_MASK_REG/z, [BUFFER_REG, #3, mul vl] // 48
+        ld3h { DEST_REG1.h, DEST_REG2.h, DEST_REG3.h }, H_MASK_REG/z, [BUFFER_REG, #3, mul vl] // 24
+        ld3w { DEST_REG1.s, DEST_REG2.s, DEST_REG3.s }, S_MASK_REG/z, [BUFFER_REG, #3, mul vl] // 12
+        ld3d { DEST_REG1.d, DEST_REG2.d, DEST_REG3.d }, D_MASK_REG/z, [BUFFER_REG, #3, mul vl] // 6
                                                                                                // Total: 90
 
-        ld4b { DEST_REG1.b, DEST_REG2.b, DEST_REG3.b, DEST_REG4.b }, B_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 64
-        ld4h { DEST_REG1.h, DEST_REG2.h, DEST_REG3.h, DEST_REG4.h }, H_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 32
-        ld4w { DEST_REG1.s, DEST_REG2.s, DEST_REG3.s, DEST_REG4.s }, S_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 16
-        ld4d { DEST_REG1.d, DEST_REG2.d, DEST_REG3.d, DEST_REG4.d }, D_MASK_REG/z, [BUFFER_REG, #0, mul vl] // 8
+        ld4b { DEST_REG1.b, DEST_REG2.b, DEST_REG3.b, DEST_REG4.b }, B_MASK_REG/z, [BUFFER_REG, #4, mul vl] // 64
+        ld4h { DEST_REG1.h, DEST_REG2.h, DEST_REG3.h, DEST_REG4.h }, H_MASK_REG/z, [BUFFER_REG, #4, mul vl] // 32
+        ld4w { DEST_REG1.s, DEST_REG2.s, DEST_REG3.s, DEST_REG4.s }, S_MASK_REG/z, [BUFFER_REG, #4, mul vl] // 16
+        ld4d { DEST_REG1.d, DEST_REG2.d, DEST_REG3.d, DEST_REG4.d }, D_MASK_REG/z, [BUFFER_REG, #4, mul vl] // 8
                                                                                                             // Total: 120
         // Total loads: 104 + 60 + 90 + 120 = 374
 
-        st1b SRC_REG1.b, B_MASK_REG, [BUFFER_REG, #0, mul vl] // 16
-        st1b SRC_REG1.h, H_MASK_REG, [BUFFER_REG, #0, mul vl] // 8
-        st1b SRC_REG1.s, S_MASK_REG, [BUFFER_REG, #0, mul vl] // 4
-        st1b SRC_REG1.d, D_MASK_REG, [BUFFER_REG, #0, mul vl] // 2
-        st1h SRC_REG1.h, H_MASK_REG, [BUFFER_REG, #0, mul vl] // 8
-        st1h SRC_REG1.s, S_MASK_REG, [BUFFER_REG, #0, mul vl] // 4
-        st1h SRC_REG1.d, D_MASK_REG, [BUFFER_REG, #0, mul vl] // 2
-        st1w SRC_REG1.s, S_MASK_REG, [BUFFER_REG, #0, mul vl] // 4
-        st1w SRC_REG1.d, D_MASK_REG, [BUFFER_REG, #0, mul vl] // 2
-        st1d SRC_REG1.d, D_MASK_REG, [BUFFER_REG, #0, mul vl] // 2
+        st1b SRC_REG1.b, B_MASK_REG, [BUFFER_REG, #1, mul vl] // 16
+        st1b SRC_REG1.h, H_MASK_REG, [BUFFER_REG, #1, mul vl] // 8
+        st1b SRC_REG1.s, S_MASK_REG, [BUFFER_REG, #1, mul vl] // 4
+        st1b SRC_REG1.d, D_MASK_REG, [BUFFER_REG, #1, mul vl] // 2
+        st1h SRC_REG1.h, H_MASK_REG, [BUFFER_REG, #1, mul vl] // 8
+        st1h SRC_REG1.s, S_MASK_REG, [BUFFER_REG, #1, mul vl] // 4
+        st1h SRC_REG1.d, D_MASK_REG, [BUFFER_REG, #1, mul vl] // 2
+        st1w SRC_REG1.s, S_MASK_REG, [BUFFER_REG, #1, mul vl] // 4
+        st1w SRC_REG1.d, D_MASK_REG, [BUFFER_REG, #1, mul vl] // 2
+        st1d SRC_REG1.d, D_MASK_REG, [BUFFER_REG, #1, mul vl] // 2
                                                               // Total: 52
 
-        st2b { SRC_REG1.b, SRC_REG2.b }, B_MASK_REG, [BUFFER_REG, #0, mul vl] // 32
-        st2h { SRC_REG1.h, SRC_REG2.h }, H_MASK_REG, [BUFFER_REG, #0, mul vl] // 16
-        st2w { SRC_REG1.s, SRC_REG2.s }, S_MASK_REG, [BUFFER_REG, #0, mul vl] // 8
-        st2d { SRC_REG1.d, SRC_REG2.d }, D_MASK_REG, [BUFFER_REG, #0, mul vl] // 4
+        st2b { SRC_REG1.b, SRC_REG2.b }, B_MASK_REG, [BUFFER_REG, #2, mul vl] // 32
+        st2h { SRC_REG1.h, SRC_REG2.h }, H_MASK_REG, [BUFFER_REG, #2, mul vl] // 16
+        st2w { SRC_REG1.s, SRC_REG2.s }, S_MASK_REG, [BUFFER_REG, #2, mul vl] // 8
+        st2d { SRC_REG1.d, SRC_REG2.d }, D_MASK_REG, [BUFFER_REG, #2, mul vl] // 4
                                                                               // Total: 60
 
-        st3b { SRC_REG1.b, SRC_REG2.b, SRC_REG3.b }, B_MASK_REG, [BUFFER_REG, #0, mul vl] // 48
-        st3h { SRC_REG1.h, SRC_REG2.h, SRC_REG3.h }, H_MASK_REG, [BUFFER_REG, #0, mul vl] // 24
-        st3w { SRC_REG1.s, SRC_REG2.s, SRC_REG3.s }, S_MASK_REG, [BUFFER_REG, #0, mul vl] // 12
-        st3d { SRC_REG1.d, SRC_REG2.d, SRC_REG3.d }, D_MASK_REG, [BUFFER_REG, #0, mul vl] // 6
+        st3b { SRC_REG1.b, SRC_REG2.b, SRC_REG3.b }, B_MASK_REG, [BUFFER_REG, #3, mul vl] // 48
+        st3h { SRC_REG1.h, SRC_REG2.h, SRC_REG3.h }, H_MASK_REG, [BUFFER_REG, #3, mul vl] // 24
+        st3w { SRC_REG1.s, SRC_REG2.s, SRC_REG3.s }, S_MASK_REG, [BUFFER_REG, #3, mul vl] // 12
+        st3d { SRC_REG1.d, SRC_REG2.d, SRC_REG3.d }, D_MASK_REG, [BUFFER_REG, #3, mul vl] // 6
                                                                                           // Total: 90
 
-        st4b { SRC_REG1.b, SRC_REG2.b, SRC_REG3.b, SRC_REG4.b }, B_MASK_REG, [BUFFER_REG, #0, mul vl] // 64
-        st4h { SRC_REG1.h, SRC_REG2.h, SRC_REG3.h, SRC_REG4.h }, H_MASK_REG, [BUFFER_REG, #0, mul vl] // 32
-        st4w { SRC_REG1.s, SRC_REG2.s, SRC_REG3.s, SRC_REG4.s }, S_MASK_REG, [BUFFER_REG, #0, mul vl] // 16
-        st4d { SRC_REG1.d, SRC_REG2.d, SRC_REG3.d, SRC_REG4.d }, D_MASK_REG, [BUFFER_REG, #0, mul vl] // 8
+        st4b { SRC_REG1.b, SRC_REG2.b, SRC_REG3.b, SRC_REG4.b }, B_MASK_REG, [BUFFER_REG, #4, mul vl] // 64
+        st4h { SRC_REG1.h, SRC_REG2.h, SRC_REG3.h, SRC_REG4.h }, H_MASK_REG, [BUFFER_REG, #4, mul vl] // 32
+        st4w { SRC_REG1.s, SRC_REG2.s, SRC_REG3.s, SRC_REG4.s }, S_MASK_REG, [BUFFER_REG, #4, mul vl] // 16
+        st4d { SRC_REG1.d, SRC_REG2.d, SRC_REG3.d, SRC_REG4.d }, D_MASK_REG, [BUFFER_REG, #4, mul vl] // 8
                                                                                                       // Total: 120
         // Total stores: 52 + 60 + 90 + 120 = 322
 
@@ -557,4 +557,9 @@
         .ascii   "Hello, world!\n"
 
 buffer:
-        .zero   1024                // Maximum size of an SVE Z register * 4.
+        .zero   2048                // Maximum size of an SVE Z register * 8.
+                                    // This gives us enough space to use an offset of
+                                    // #1, mul vl for scalar+immediate/vector+immediate
+                                    // instructions which lets us check the VL scaling of
+                                    // offsets in the IR in
+                                    // tool.drcacheoff.allasm-scattergather-vl-view
diff --git a/clients/drcachesim/tests/offline-allasm-scattergather-vl-view-aarch64.templatex b/clients/drcachesim/tests/offline-allasm-scattergather-vl-view-aarch64.templatex
new file mode 100644
index 0000000..7e7d070
--- /dev/null
+++ b/clients/drcachesim/tests/offline-allasm-scattergather-vl-view-aarch64.templatex
@@ -0,0 +1,15 @@
+.*
+#if __ARM_FEATURE_SVE_BITS == 128
+.*<marker: vector length 16 bytes>
+.*a401a03c   ld1b   \+0x10\(%x1\)\[1byte\] %p0/z -> %z28\.b
+.*e5d1ec3c   st3d   %z28\.d %z29\.d %z30\.d %p3 -> \+0x30\(%x1\)\[8byte\]
+#elif __ARM_FEATURE_SVE_BITS == 256
+.*<marker: vector length 32 bytes>
+.*a401a03c   ld1b   \+0x20\(%x1\)\[1byte\] %p0/z -> %z28\.b
+.*e5d1ec3c   st3d   %z28\.d %z29\.d %z30\.d %p3 -> \+0x60\(%x1\)\[8byte\]
+#elif __ARM_FEATURE_SVE_BITS == 512
+.*<marker: vector length 64 bytes>
+.*a401a03c   ld1b   \+0x40\(%x1\)\[1byte\] %p0/z -> %z28\.b
+.*e5d1ec3c   st3d   %z28\.d %z29\.d %z30\.d %p3 -> \+0xc0\(%x1\)\[8byte\]
+#endif
+.*
diff --git a/clients/drcachesim/tests/offline-view.templatex b/clients/drcachesim/tests/offline-view.templatex
index c1f4943..c2a3dde 100644
--- a/clients/drcachesim/tests/offline-view.templatex
+++ b/clients/drcachesim/tests/offline-view.templatex
@@ -7,9 +7,16 @@
            3           0: +[0-9]+ <marker: cache line size [0-9]*>
            4           0: +[0-9]+ <marker: chunk instruction count [0-9]*>
            5           0: +[0-9]+ <marker: page size [0-9]*>
+#ifdef __ARM_FEATURE_SVE
+           6           0: +[0-9]+ <marker: vector length [0-9]* bytes>
+           7           0: +[0-9]+ <marker: timestamp [0-9]*>
+           8           0: +[0-9]+ <marker: tid [0-9]* on core [0-9]*>
+           9           1: +[0-9]+ ifetch      .*
+#else
            6           0: +[0-9]+ <marker: timestamp [0-9]*>
            7           0: +[0-9]+ <marker: tid [0-9]* on core [0-9]*>
            8           1: +[0-9]+ ifetch      .*
+#endif
 .*
 View tool results:
     *[0-9]* : total instructions
diff --git a/clients/drcachesim/tools/invariant_checker.cpp b/clients/drcachesim/tools/invariant_checker.cpp
index 66cac4b..73fbd2c 100644
--- a/clients/drcachesim/tools/invariant_checker.cpp
+++ b/clients/drcachesim/tools/invariant_checker.cpp
@@ -367,6 +367,28 @@
                         "Stream interface page size != trace marker");
     }
     if (memref.marker.type == TRACE_TYPE_MARKER &&
+        memref.marker.marker_type == TRACE_MARKER_TYPE_VECTOR_LENGTH) {
+#ifdef AARCH64
+        static const int MAX_VL_BYTES = 256; // SVE's maximum vector length is 2048-bit
+        // Vector length must be a multiple of 16 bytes between 16 and 256.
+        report_if_false(shard,
+                        memref.marker.marker_value > 0 &&
+                            memref.marker.marker_value <= MAX_VL_BYTES &&
+                            memref.marker.marker_value % 16 == 0,
+                        "Vector length marker has invalid size");
+
+        const int new_vl_bits = memref.marker.marker_value * 8;
+        if (dr_get_sve_vector_length() != new_vl_bits) {
+            dr_set_sve_vector_length(new_vl_bits);
+            // Changing the vector length can change the IR representation of some SVE
+            // instructions but it doesn't affect any of the metadata that is stored
+            // in decode_cache_ so we don't need to flush the cache.
+        }
+#else
+        report_if_false(shard, false, "Unexpected vector length marker");
+#endif
+    }
+    if (memref.marker.type == TRACE_TYPE_MARKER &&
         memref.marker.marker_type == TRACE_MARKER_TYPE_VERSION) {
         shard->trace_version_ = memref.marker.marker_value;
         report_if_false(shard,
diff --git a/clients/drcachesim/tools/opcode_mix.cpp b/clients/drcachesim/tools/opcode_mix.cpp
index 5389829..18743ff 100644
--- a/clients/drcachesim/tools/opcode_mix.cpp
+++ b/clients/drcachesim/tools/opcode_mix.cpp
@@ -164,6 +164,17 @@
                 " but tool built for " + trace_arch_string(build_target_arch_type());
             return false;
         }
+    } else if (memref.marker.type == TRACE_TYPE_MARKER &&
+               memref.marker.marker_type == TRACE_MARKER_TYPE_VECTOR_LENGTH) {
+#ifdef AARCH64
+        const int new_vl_bits = memref.marker.marker_value * 8;
+        if (dr_get_sve_vector_length() != new_vl_bits) {
+            dr_set_sve_vector_length(new_vl_bits);
+            // Changing the vector length can change the IR representation of some SVE
+            // instructions but it will never change the opcode so we don't need to
+            // flush the opcode cache.
+        }
+#endif
     }
     if (!type_is_instr(memref.instr.type) &&
         memref.data.type != TRACE_TYPE_INSTR_NO_FETCH) {
diff --git a/clients/drcachesim/tools/view.cpp b/clients/drcachesim/tools/view.cpp
index b98e4d6..2d7391d 100644
--- a/clients/drcachesim/tools/view.cpp
+++ b/clients/drcachesim/tools/view.cpp
@@ -443,6 +443,10 @@
             std::cerr << "<marker: wait for another core>\n";
             break;
         case TRACE_MARKER_TYPE_CORE_IDLE: std::cerr << "<marker: core is idle>\n"; break;
+        case TRACE_MARKER_TYPE_VECTOR_LENGTH:
+            std::cerr << "<marker: vector length " << memref.marker.marker_value
+                      << " bytes>\n";
+            break;
         default:
             std::cerr << "<marker: type " << memref.marker.marker_type << "; value "
                       << memref.marker.marker_value << ">\n";
diff --git a/clients/drcachesim/tracer/instru_offline.cpp b/clients/drcachesim/tracer/instru_offline.cpp
index 96ff48a..bf3e635 100644
--- a/clients/drcachesim/tracer/instru_offline.cpp
+++ b/clients/drcachesim/tracer/instru_offline.cpp
@@ -397,6 +397,15 @@
     new_buf += append_marker(new_buf, TRACE_MARKER_TYPE_CACHE_LINE_SIZE,
                              proc_get_cache_line_size());
     new_buf += append_marker(new_buf, TRACE_MARKER_TYPE_PAGE_SIZE, dr_page_size());
+#if defined(AARCH64)
+    // TRACE_MARKER_TYPE_VECTOR_LENGTH is emitted in the thread header to establish the
+    // initial vector length for the thread. In the future the marker will also be
+    // emitted again later if the app changes the vector length (TODO i#6625).
+    if (proc_has_feature(FEATURE_SVE)) {
+        new_buf += append_marker(new_buf, TRACE_MARKER_TYPE_VECTOR_LENGTH,
+                                 proc_get_vector_length_bytes());
+    }
+#endif
     return (int)(new_buf - buf_ptr);
 }
 
diff --git a/clients/drcachesim/tracer/instru_online.cpp b/clients/drcachesim/tracer/instru_online.cpp
index c2d7805..58b234f 100644
--- a/clients/drcachesim/tracer/instru_online.cpp
+++ b/clients/drcachesim/tracer/instru_online.cpp
@@ -179,6 +179,15 @@
     new_buf += append_marker(new_buf, TRACE_MARKER_TYPE_CACHE_LINE_SIZE,
                              proc_get_cache_line_size());
     new_buf += append_marker(new_buf, TRACE_MARKER_TYPE_PAGE_SIZE, dr_page_size());
+#if defined(AARCH64)
+    // TRACE_MARKER_TYPE_VECTOR_LENGTH is emitted in the thread header to establish the
+    // initial vector length for the thread. In the future the marker will also be
+    // emitted again later if the app changes the vector length (TODO i#6625).
+    if (proc_has_feature(FEATURE_SVE)) {
+        new_buf += append_marker(new_buf, TRACE_MARKER_TYPE_VECTOR_LENGTH,
+                                 proc_get_vector_length_bytes());
+    }
+#endif
     return (int)(new_buf - buf_ptr);
 }
 
diff --git a/clients/drcachesim/tracer/raw2trace.cpp b/clients/drcachesim/tracer/raw2trace.cpp
index 736f7c8..dafcc25 100644
--- a/clients/drcachesim/tracer/raw2trace.cpp
+++ b/clients/drcachesim/tracer/raw2trace.cpp
@@ -883,6 +883,23 @@
         log(2, "Maybe-blocking syscall %zu\n", marker_val);
         buf += trace_metadata_writer_t::write_marker(
             buf, TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0);
+    } else if (marker_type == TRACE_MARKER_TYPE_VECTOR_LENGTH) {
+#ifdef AARCH64
+        log(4,
+            "Setting SVE vector length for thread " INT64_FORMAT_STRING " to %zu bytes\n",
+            tdata->tid, marker_val);
+
+        const int new_vl_bits = marker_val * 8;
+        if (dr_get_sve_vector_length() != new_vl_bits) {
+            dr_set_sve_vector_length(new_vl_bits);
+            // Some SVE load/store instructions have an offset which is scaled by a value
+            // that depends on the vector length. These instructions will need to be
+            // re-decoded after the vector length changes.
+            *flush_decode_cache = true;
+        }
+#else
+        log(2, "Ignoring unexpected dynamic vector length marker\n");
+#endif
     }
     return true;
 }
@@ -932,6 +949,7 @@
         header->cache_line_size = proc_get_cache_line_size();
         unread_last_entry(tdata);
     }
+
     return true;
 }
 
@@ -3783,17 +3801,6 @@
     decode_cache_.reserve(cache_count);
     for (int i = 0; i < cache_count; ++i)
         decode_cache_.emplace_back(cache_count);
-
-#if defined(AARCH64)
-    // TODO i#6556, i#1684: The decoder uses a global sve_veclen variable to store the
-    // vector length value it uses when decoding. drdecodelib ends up being linked into
-    // drcachesim twice: once into the drcachesim executable, and one into libdynamorio.
-    // When we call dr_standalone_init() above it will initialize the version of
-    // sve_veclen in libdynamorio, but not the one in drcachesim.
-    // Unfortunately it is the version of sve_veclen in drcachesim that gets used when
-    // decoding in raw2trace so we need to explicitly initialize its sve_veclen here.
-    dr_set_sve_vector_length(proc_get_vector_length_bytes() * 8);
-#endif
 }
 
 raw2trace_t::~raw2trace_t()
diff --git a/suite/tests/CMakeLists.txt b/suite/tests/CMakeLists.txt
index 102d147..bb16d43 100644
--- a/suite/tests/CMakeLists.txt
+++ b/suite/tests/CMakeLists.txt
@@ -4241,6 +4241,10 @@
 
       torunonly_drcacheoff(view ${ci_shared_app} ""
         "@-simulator_type@view@-sim_refs@16384" "")
+      unset(tool.drcacheoff.view_rawtemp) # Use preprocessor
+      if (AARCH64 AND proc_supports_sve)
+        set(tool.drcacheoff.view_runsve 1)
+      endif ()
 
       set(tool.drcacheoff.func_view_full_run ON) # Fails on Windows if truncated.
       torunonly_drcacheoff(func_view common.fib "-record_function fib|1"
@@ -4503,6 +4507,22 @@
             "allasm-scattergather-basic-counts-${ARCH_NAME}")
     endif ()
 
+    if (UNIX AND AARCH64 AND proc_supports_sve)
+      torunonly_drcacheoff(allasm-scattergather-vl-view allasm_scattergather
+        "" "@-simulator_type@view" "")
+      unset(tool.drcacheoff.allasm-scattergather-vl-view_rawtemp) # use preprocessor
+      set(tool.drcacheoff.allasm-scattergather-vl-view_runsve 1)
+      set(tool.drcacheoff.allasm-scattergather-vl-view_expectbase
+        "offline-allasm-scattergather-vl-view-${ARCH_NAME}")
+
+      torunonly_drcachesim(allasm-scattergather-vl-view allasm_scattergather
+        "-simulator_type view" "")
+      unset(tool.drcachesim.allasm-scattergather-vl-view_rawtemp) # use preprocessor
+      set(tool.drcachesim.allasm-scattergather-vl-view_runsve 1)
+      set(tool.drcachesim.allasm-scattergather-vl-view_expectbase
+            "allasm-scattergather-vl-view-${ARCH_NAME}")
+    endif ()
+
     if (UNIX AND X86 AND X64)
       torunonly_drcacheoff(allasm-repstr-basic-counts allasm_repstr
         "" "@-simulator_type@basic_counts" "")