Always align the stack to the fixed alloca requirements.

Local variables that use a fixed alloca stack slot are assigned offsets
starting at 0, before the prolog is written. Therefore the stack pointer
needs to be aligned to the alloca's maximum alignment requirement. This
required the following changes:
- Add FixedAllocaSizeBytes to SpillAreaSizeBytes before aligning it.
- Compute the maximum alignment requirement from FixedAllocaAlignBytes
  and SpillAreaAlignmentBytes, and prior NeedsStackAlignment uses.
- Always align the stack pointer to this maximum.
- Affected lit tests have been rebased. Note that in some cases the
  frame size is now bigger than necessary. This is due to
  FixedAllocaSizeBytes being padding to be a multiple of the alignment.
  This isn't strictly necessary since the spill areas take care of their
  own alignment.

BUG=swiftshader:29

Change-Id: Ief30acda91c958d072528b8b59c2e933f68adbb1
Reviewed-on: https://chromium-review.googlesource.com/419816
Reviewed-by: Jim Stichnoth <stichnot@chromium.org>
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index 0b22701..db81c18 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -1061,7 +1061,7 @@
 
   InstructionSetEnum InstructionSet = Traits::InstructionSet::Begin;
   bool IsEbpBasedFrame = false;
-  bool NeedsStackAlignment = false;
+  size_t RequiredStackAlignment = sizeof(Traits::WordType);
   size_t SpillAreaSizeBytes = 0;
   size_t FixedAllocaSizeBytes = 0;
   size_t FixedAllocaAlignBytes = 0;
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index deaacc6..c368e5f 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -1124,40 +1124,47 @@
   const Type ReturnType = Func->getReturnType();
   if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
     if (isScalarFloatingType(ReturnType)) {
-      // Avoid misaligned double-precicion load/store.
-      NeedsStackAlignment = true;
+      // Avoid misaligned double-precision load/store.
+      RequiredStackAlignment = std::max<size_t>(
+          RequiredStackAlignment, Traits::X86_STACK_ALIGNMENT_BYTES);
       SpillAreaSizeBytes =
           std::max(typeWidthInBytesOnStack(ReturnType), SpillAreaSizeBytes);
     }
   }
 
-  // Align esp if necessary.
-  if (NeedsStackAlignment) {
-    uint32_t StackOffset =
-        Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
-    uint32_t StackSize =
-        Traits::applyStackAlignment(StackOffset + SpillAreaSizeBytes);
-    StackSize = Traits::applyStackAlignment(StackSize + maxOutArgsSizeBytes());
-    SpillAreaSizeBytes = StackSize - StackOffset;
-  } else {
-    SpillAreaSizeBytes += maxOutArgsSizeBytes();
+  RequiredStackAlignment =
+      std::max<size_t>(RequiredStackAlignment, SpillAreaAlignmentBytes);
+
+  if (PrologEmitsFixedAllocas) {
+    RequiredStackAlignment =
+        std::max(RequiredStackAlignment, FixedAllocaAlignBytes);
   }
 
   // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
   // fixed allocations in the prolog.
   if (PrologEmitsFixedAllocas)
     SpillAreaSizeBytes += FixedAllocaSizeBytes;
+
+  // Entering the function has made the stack pointer unaligned. Re-align it by
+  // adjusting the stack size.
+  uint32_t StackOffset = Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
+  uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes,
+                                             RequiredStackAlignment);
+  StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(),
+                                    RequiredStackAlignment);
+  SpillAreaSizeBytes = StackSize - StackOffset;
+
   if (SpillAreaSizeBytes) {
     // Generate "sub stackptr, SpillAreaSizeBytes"
     _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
-    // If the fixed allocas are aligned more than the stack frame, align the
-    // stack pointer accordingly.
-    if (PrologEmitsFixedAllocas &&
-        FixedAllocaAlignBytes > Traits::X86_STACK_ALIGNMENT_BYTES) {
-      assert(IsEbpBasedFrame);
-      _and(getPhysicalRegister(getStackReg(), Traits::WordType),
-           Ctx->getConstantInt32(-FixedAllocaAlignBytes));
-    }
+  }
+
+  // If the required alignment is greater than the stack pointer's guaranteed
+  // alignment, align the stack pointer accordingly.
+  if (RequiredStackAlignment > Traits::X86_STACK_ALIGNMENT_BYTES) {
+    assert(IsEbpBasedFrame);
+    _and(getPhysicalRegister(getStackReg(), Traits::WordType),
+         Ctx->getConstantInt32(-RequiredStackAlignment));
   }
 
   // Account for known-frame-offset alloca instructions that were not already
@@ -1449,7 +1456,8 @@
   // alloca. All the alloca code ensures that the stack alignment is preserved
   // after the alloca. The stack alignment restriction can be relaxed in some
   // cases.
-  NeedsStackAlignment = true;
+  RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
+                                            Traits::X86_STACK_ALIGNMENT_BYTES);
 
   // For default align=0, set it to the real value 1, to avoid any
   // bit-manipulation problems below.
@@ -2603,7 +2611,8 @@
   // * Stack arguments of vector type are aligned to start at the next highest
   // multiple of 16 bytes. Other stack arguments are aligned to the next word
   // size boundary (4 or 8 bytes, respectively).
-  NeedsStackAlignment = true;
+  RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
+                                            Traits::X86_STACK_ALIGNMENT_BYTES);
 
   using OperandList =
       llvm::SmallVector<Operand *, constexprMax(Traits::X86_MAX_XMM_ARGS,
diff --git a/tests_lit/asan_tests/alignment.ll b/tests_lit/asan_tests/alignment.ll
index 908f2d0..b28ee4b 100644
--- a/tests_lit/asan_tests/alignment.ll
+++ b/tests_lit/asan_tests/alignment.ll
@@ -13,7 +13,7 @@
 }
 
 ; CHECK: func
-; CHECK-NEXT: sub    esp,0xa0
+; CHECK-NEXT: sub    esp,0xac
 ; CHECK-NEXT: lea    eax,[esp]
 ; CHECK-NEXT: shr    eax,0x3
 ; CHECK-NEXT: mov    DWORD PTR [eax+0x20000000],0xffffffff
@@ -27,5 +27,5 @@
 ; CHECK-NEXT: mov    DWORD PTR [eax+0x2000000c],0x0
 ; CHECK-NEXT: mov    DWORD PTR [eax+0x20000010],0x0
 ; CHECK-NEXT: mov    eax,0x2a
-; CHECK-NEXT: add    esp,0xa0
+; CHECK-NEXT: add    esp,0xac
 ; CHECK-NEXT: ret
diff --git a/tests_lit/llvm2ice_tests/alloc.ll b/tests_lit/llvm2ice_tests/alloc.ll
index 04e5b96..64b3513 100644
--- a/tests_lit/llvm2ice_tests/alloc.ll
+++ b/tests_lit/llvm2ice_tests/alloc.ll
@@ -80,7 +80,7 @@
 ; CHECK-LABEL: fixed_416_align_32
 ; CHECK:      push    ebp
 ; CHECK-NEXT: mov     ebp,esp
-; CHECK:      sub     esp,0x1b8
+; CHECK:      sub     esp,0x1d8
 ; CHECK:      and     esp,0xffffffe0
 ; CHECK:      lea     eax,[esp+0x10]
 ; CHECK:      mov     DWORD PTR [esp],eax
@@ -145,7 +145,7 @@
 ; CHECK-LABEL: fixed_351_align_32
 ; CHECK:      push    ebp
 ; CHECK-NEXT: mov     ebp,esp
-; CHECK:      sub     esp,0x178
+; CHECK:      sub     esp,0x198
 ; CHECK:      and     esp,0xffffffe0
 ; CHECK:      lea     eax,[esp+0x10]
 ; CHECK:      mov     DWORD PTR [esp],eax
diff --git a/tests_lit/llvm2ice_tests/fused-alloca.ll b/tests_lit/llvm2ice_tests/fused-alloca.ll
index 4dcc2b7..2c6992a 100644
--- a/tests_lit/llvm2ice_tests/fused-alloca.ll
+++ b/tests_lit/llvm2ice_tests/fused-alloca.ll
@@ -25,12 +25,12 @@
   ret void
 }
 ; CHECK-LABEL: fused_small_align
-; CHECK-NEXT: sub    esp,0x30
-; CHECK-NEXT: mov    eax,DWORD PTR [esp+0x34]
+; CHECK-NEXT: sub    esp,0x3c
+; CHECK-NEXT: mov    eax,DWORD PTR [esp+0x40]
 ; CHECK-NEXT: mov    DWORD PTR [esp+0x10],eax
 ; CHECK-NEXT: mov    DWORD PTR [esp+0x18],eax
 ; CHECK-NEXT: mov    DWORD PTR [esp],eax
-; CHECK-NEXT: add    esp,0x30
+; CHECK-NEXT: add    esp,0x3c
 ; MIPS32-LABEL: fused_small_align
 ; MIPS32: 	addiu	sp,sp,{{.*}}
 ; MIPS32: 	move	v0,a0
@@ -57,7 +57,7 @@
 ; CHECK-LABEL: fused_large_align
 ; CHECK-NEXT: push   ebp
 ; CHECK-NEXT: mov    ebp,esp
-; CHECK-NEXT: sub    esp,0x80
+; CHECK-NEXT: sub    esp,0xb8
 ; CHECK-NEXT: and    esp,0xffffffc0
 ; CHECK-NEXT: mov    eax,DWORD PTR [ebp+0x8]
 ; CHECK-NEXT: mov    DWORD PTR [esp+0x40],eax
@@ -102,13 +102,13 @@
   br label %block1
 }
 ; CHECK-LABEL: fused_derived
-; CHECK-NEXT: sub    esp,0x180
-; CHECK-NEXT: mov    [[ARG:e..]],DWORD PTR [esp+0x184]
+; CHECK-NEXT: sub    esp,0x18c
+; CHECK-NEXT: mov    [[ARG:e..]],DWORD PTR [esp+0x190]
 ; CHECK-NEXT: jmp
 ; CHECK-NEXT: mov    DWORD PTR [esp+0x80],[[ARG]]
 ; CHECK-NEXT: mov    DWORD PTR [esp+0x8c],[[ARG]]
 ; CHECK-NEXT: lea    eax,[esp+0x81]
-; CHECK-NEXT: add    esp,0x180
+; CHECK-NEXT: add    esp,0x18c
 ; CHECK-NEXT: ret
 ; MIPS32-LABEL: fused_derived
 ; MIPS32: 	addiu	sp,sp,{{.*}}