baby steps for aarch64 support

So far this is just as easy as I had hoped.

Change-Id: I5f69a900b32d9bf70156b55e334233d7376b820f
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/223340
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index 9a66c70..99df6f6 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -704,6 +704,29 @@
         this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
     }
 
+    void Assembler::word(uint32_t w) {  // Emits w as 4 bytes via byte(const void*, int).
+        this->byte(&w, 4);  // NOTE(review): host byte order; the tests expect little-endian.
+    }
+
+    // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf
+
+    void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) {  // Pack 5 fields, emit 1 word.
+        this->word( (hi & 2047) << 21     // bits 31..21: high 11 bits of the opcode
+                  | (m  &   31) << 16     // bits 20..16: register m
+                  | (lo &   63) << 10     // bits 15..10: low 6 bits of the opcode
+                  | (n  &   31) <<  5     // bits  9..5 : register n
+                  | (d  &   31) <<  0 );  // bits  4..0 : register d; masks drop out-of-range bits
+    }
+    // Integer .4s vector ops, d = n op m.  hi reads 0 Q U 01110 size 1; lo reads opcode 1.
+    void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); }
+    void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); }
+    void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); }
+    // Float .4s vector ops, d = n op m, packed through op() like the integer ops above.
+    void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); }
+    void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); }
+    void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); }
+    void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); }
+
 #if defined(SKVM_JIT)
     static bool can_jit(int regs, int nargs) {
         return true
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index 7f651f8..f9e57b6 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -25,7 +25,7 @@
         // Order matters... GP64, Xmm, Ymm values match 4-bit register encoding for each.
         enum GP64 {
             rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi,
-            r8 , r9,  r10, r11, r12, r13, r14, r15,
+            r8 , r9 , r10, r11, r12, r13, r14, r15,
         };
         enum Xmm {
             xmm0, xmm1, xmm2 , xmm3 , xmm4 , xmm5 , xmm6 , xmm7 ,
@@ -36,10 +36,28 @@
             ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15,
         };
 
+        // X and V values match 5-bit encoding for each (nothing tricky).
+        enum X {  // Presumably the A64 general-purpose registers; no use in the visible hunks.
+            x0 , x1 , x2 , x3 , x4 , x5 , x6 , x7 ,
+            x8 , x9 , x10, x11, x12, x13, x14, x15,
+            x16, x17, x18, x19, x20, x21, x22, x23,
+            x24, x25, x26, x27, x28, x29, x30, x31,  // NOTE(review): A64 uses 31 for sp or xzr.
+        };
+        enum V {  // Vector register operands of op(); enumerators count up from 0 to 31.
+            v0 , v1 , v2 , v3 , v4 , v5 , v6 , v7 ,
+            v8 , v9 , v10, v11, v12, v13, v14, v15,
+            v16, v17, v18, v19, v20, v21, v22, v23,
+            v24, v25, v26, v27, v28, v29, v30, v31,
+        };
+
         void byte(const void*, int);
         void byte(uint8_t);
         template <typename... Rest> void byte(uint8_t, Rest...);
 
+        void word(uint32_t);  // Emits a 32-bit value; the aarch64 ops below go through it.
+
+        // x86-64
+
         void nop();
         void align(int mod);
 
@@ -80,6 +98,13 @@
         void vmovups(GP64 dst, Ymm src);
         void vmovq  (GP64 dst, Xmm src);
 
+        // aarch64
+
+        // d = op(n,m)
+        using DOpNM = void(V d, V n, V m);  // Function type: d is the result; n, m the inputs.
+        DOpNM  add4s,  sub4s,  mul4s,       // Member functions declared via the alias: integer,
+              fadd4s, fsub4s, fmul4s, fdiv4s;  // then float (each works on .4s vectors).
+
     private:
         // dst = op(dst, imm)
         void op(int opcode, int opcode_ext, GP64 dst, int imm);
@@ -104,6 +129,11 @@
         // *ptr = ymm or ymm = *ptr, depending on opcode.
         void load_store(int prefix, int map, int opcode, Ymm ymm, GP64 ptr);
 
+        // General instruction layout, from most- to least-significant bit, is:
+        //    [11 bits hi] [5 bits m] [6 bits lo] [5 bits n] [5 bits d]
+        // where the opcode is split between hi and lo, and m, n, d are register numbers.
+        void op(uint32_t hi, V m, uint32_t lo, V n, V d);
+
         uint8_t* fCode;
         size_t   fSize;
     };
diff --git a/tests/SkVMTest.cpp b/tests/SkVMTest.cpp
index b44cfc6..d74d0fa 100644
--- a/tests/SkVMTest.cpp
+++ b/tests/SkVMTest.cpp
@@ -369,4 +369,26 @@
     },{
         0xc5, 0x9d, 0xdf, 0xda,
     });
+
+    // The expected bytes below are presumably llvm-mc output, one instruction at a time, e.g.
+    //   echo "fmul v4.4s, v3.4s, v1.4s" | llvm-mc -show-encoding -arch arm64
+    test_asm(r, [&](A& a) {
+        a.fadd4s(A::v4, A::v3, A::v1);
+        a.fsub4s(A::v4, A::v3, A::v1);
+        a.fmul4s(A::v4, A::v3, A::v1);
+        a.fdiv4s(A::v4, A::v3, A::v1);
+
+        a.add4s(A::v4, A::v3, A::v1);
+        a.sub4s(A::v4, A::v3, A::v1);
+        a.mul4s(A::v4, A::v3, A::v1);
+    },{
+        0x64,0xd4,0x21,0x4e,  // fadd v4.4s, v3.4s, v1.4s  (each row is one word, low byte first)
+        0x64,0xd4,0xa1,0x4e,  // fsub v4.4s, v3.4s, v1.4s
+        0x64,0xdc,0x21,0x6e,  // fmul v4.4s, v3.4s, v1.4s
+        0x64,0xfc,0x21,0x6e,  // fdiv v4.4s, v3.4s, v1.4s
+
+        0x64,0x84,0xa1,0x4e,  // add  v4.4s, v3.4s, v1.4s
+        0x64,0x84,0xa1,0x6e,  // sub  v4.4s, v3.4s, v1.4s
+        0x64,0x9c,0xa1,0x4e,  // mul  v4.4s, v3.4s, v1.4s
+    });
 }