[ARM] Implement TTI::isHardwareLoopProfitable
    
Implement the backend target hook to drive the HardwareLoops pass.
The low-overhead branch extension for Arm M-class cores is flexible
enough that we don't have to ensure correctness at this point, except
for checking that the loop counter variable can be stored in LR - a
32-bit register. For it to be profitable, we want to avoid loops that
contain function calls, or any other instruction that alters the PC.
    
This implementation uses TargetLoweringInfo to query type and
operation actions, looks at intrinsic calls, and also performs some
manual checks for remainder/division and FP operations.
    
I think this should be a good base to start and extra details can be
filled out later.

Differential Revision: https://reviews.llvm.org/D62907

llvm-svn: 363149
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index c626c41..128d14c 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -36,6 +36,10 @@
 
 #define DEBUG_TYPE "armtti"
 
+static cl::opt<bool> DisableLowOverheadLoops( // cl::init(true): off by default
+  "disable-arm-loloops", cl::Hidden, cl::init(true),
+  cl::desc("Disable the generation of low-overhead loops"));
+
 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                      const Function *Callee) const {
   const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -628,6 +632,196 @@
                                            UseMaskForCond, UseMaskForGaps);
 }
 
+bool ARMTTIImpl::isLoweredToCall(const Function *F) {
+  // Returns true if a call to F is expected to be lowered to a real call.
+  if (!F->isIntrinsic())
+    return BaseT::isLoweredToCall(F); // Defer to the generic heuristic.
+  // Assume all Arm-specific intrinsics map to an instruction.
+  if (F->getName().startswith("llvm.arm"))
+    return false;
+
+  switch (F->getIntrinsicID()) {
+  default: break;
+  case Intrinsic::powi:
+  case Intrinsic::sin:
+  case Intrinsic::cos:
+  case Intrinsic::pow:
+  case Intrinsic::log:
+  case Intrinsic::log10:
+  case Intrinsic::log2:
+  case Intrinsic::exp:
+  case Intrinsic::exp2:
+    return true; // Treated as always becoming a call.
+  case Intrinsic::sqrt:
+  case Intrinsic::fabs:
+  case Intrinsic::copysign:
+  case Intrinsic::floor:
+  case Intrinsic::ceil:
+  case Intrinsic::trunc:
+  case Intrinsic::rint:
+  case Intrinsic::nearbyint:
+  case Intrinsic::round:
+  case Intrinsic::canonicalize:
+  case Intrinsic::lround:
+  case Intrinsic::llround:
+  case Intrinsic::lrint:
+  case Intrinsic::llrint:
+    if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
+      return true;
+    if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
+      return true;
+    // Some operations can be handled by vector instructions and assume
+    // unsupported vectors will be expanded into supported scalar ones.
+    // TODO Handle scalar operations properly.
+    return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
+  case Intrinsic::masked_store:
+  case Intrinsic::masked_load:
+  case Intrinsic::masked_gather:
+  case Intrinsic::masked_scatter:
+    return !ST->hasMVEIntegerOps(); // A call unless MVE is available.
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::sadd_sat:
+  case Intrinsic::uadd_sat:
+  case Intrinsic::ssub_sat:
+  case Intrinsic::usub_sat:
+    return false; // Assumed to be handled without a call.
+  }
+
+  return BaseT::isLoweredToCall(F);
+}
+
+bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                          AssumptionCache &AC,
+                                          TargetLibraryInfo *LibInfo,
+                                          TTI::HardwareLoopInfo &HWLoopInfo) {
+  // Low-overhead branches are only supported in the 'low-overhead branch'
+  // extension of v8.1-m. (AC and LibInfo are not used by this hook.)
+  if (!ST->hasLOB() || DisableLowOverheadLoops)
+    return false;
+
+  // For now, for simplicity, only support loops with one exit block.
+  if (!L->getExitBlock())
+    return false;
+
+  if (!SE.hasLoopInvariantBackedgeTakenCount(L))
+    return false;
+
+  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    return false;
+
+  const SCEV *TripCountSCEV = // Backedge-taken count + 1.
+    SE.getAddExpr(BackedgeTakenCount,
+                  SE.getOne(BackedgeTakenCount->getType()));
+
+  // We need to store the trip count in LR, a 32-bit register.
+  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
+    return false;
+
+  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
+  // point in generating a hardware loop if that's going to happen.
+  auto MaybeCall = [this](Instruction &I) {
+    const ARMTargetLowering *TLI = getTLI();
+    unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
+    EVT VT = TLI->getValueType(DL, I.getType(), true);
+    if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
+      return true;
+
+    // Check if an intrinsic will be lowered to a call and assume that any
+    // other CallInst will generate a bl.
+    if (auto *Call = dyn_cast<CallInst>(&I)) {
+      if (isa<IntrinsicInst>(Call)) {
+        if (const Function *F = Call->getCalledFunction())
+          return isLoweredToCall(F);
+      }
+      return true;
+    }
+
+    // FPv5 provides conversions between integer, double-precision,
+    // single-precision, and half-precision formats.
+    switch (I.getOpcode()) {
+    default:
+      break;
+    case Instruction::FPToSI:
+    case Instruction::FPToUI:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+      return !ST->hasFPARMv8Base();
+    }
+
+    // FIXME: Unfortunately the approach of checking the Operation Action does
+    // not catch all cases of Legalization that use library calls. Our
+    // Legalization step categorizes some transformations into library calls as
+    // Custom, Expand or even Legal when doing type legalization. So for now
+    // we have to special case for instance the SDIV of 64bit integers and the
+    // use of floating point emulation.
+    if (VT.isInteger() && VT.getSizeInBits() >= 64) {
+      switch (ISD) {
+      default:
+        break;
+      case ISD::SDIV:
+      case ISD::UDIV:
+      case ISD::SREM:
+      case ISD::UREM:
+      case ISD::SDIVREM:
+      case ISD::UDIVREM:
+        return true;
+      }
+    }
+
+    // Assume all other non-float operations are supported.
+    if (!VT.isFloatingPoint())
+      return false;
+
+    // We'll need a library call to handle most floats when using soft.
+    if (TLI->useSoftFloat()) {
+      switch (I.getOpcode()) {
+      default:
+        return true;
+      case Instruction::Alloca:
+      case Instruction::Load:
+      case Instruction::Store:
+      case Instruction::Select:
+      case Instruction::PHI:
+        return false; // Data movement only; no FP arithmetic.
+      }
+    }
+
+    // We'll need a libcall to perform double precision operations on a single
+    // precision only FPU.
+    if (I.getType()->isDoubleTy() && !ST->hasFP64())
+      return true;
+
+    // Likewise for half precision arithmetic.
+    if (I.getType()->isHalfTy() && !ST->hasFullFP16())
+      return true;
+
+    return false; // No known reason to expect a call.
+  };
+
+  // Scan the instructions to see if there's any that we know will turn into a
+  // call.
+  for (auto *BB : L->getBlocks())
+    for (auto &I : *BB)
+      if (MaybeCall(I))
+        return false; // A likely call defeats the hardware loop.
+
+  // TODO: Check whether the trip count calculation is expensive. If L is the
+  // inner loop but we know it has a low trip count, calculating that trip
+  // count (in the parent loop) may be detrimental.
+
+  LLVMContext &C = L->getHeader()->getContext();
+  HWLoopInfo.CounterInReg = true;
+  HWLoopInfo.CountType = Type::getInt32Ty(C); // LR is 32 bits wide.
+  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
+  return true;
+}
+
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   // Only currently enable these preferences for M-Class cores.
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 882a63c..4f68a0b 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -180,6 +180,12 @@
                                  bool UseMaskForCond = false,
                                  bool UseMaskForGaps = false);
 
+  bool isLoweredToCall(const Function *F);
+  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                AssumptionCache &AC,
+                                TargetLibraryInfo *LibInfo,
+                                TTI::HardwareLoopInfo &HWLoopInfo);
+
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
 
diff --git a/llvm/test/Transforms/HardwareLoops/ARM/calls.ll b/llvm/test/Transforms/HardwareLoops/ARM/calls.ll
new file mode 100644
index 0000000..f2fb8f4
--- /dev/null
+++ b/llvm/test/Transforms/HardwareLoops/ARM/calls.ll
@@ -0,0 +1,404 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MAIN
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fullfp16 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fp-armv8,+fullfp16 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP64
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVEFP
+
+
+; CHECK-LABEL: skip_call
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: call i32 @llvm.loop.decrement
+
+define i32 @skip_call(i32 %n) {
+entry:
+  %cmp6 = icmp eq i32 %n, 0
+  br i1 %cmp6, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.08 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.07 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %call = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #2
+  %add = add nsw i32 %call, %res.07
+  %inc1 = add nuw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: test_target_specific
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 50)
+; CHECK: [[COUNT:%[^ ]+]] = phi i32 [ 50, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %loop, label %exit
+
+define i32 @test_target_specific(i32* %a, i32* %b) {
+entry:
+  br label %loop
+loop:
+  %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr i32, i32* %a, i32 %count
+  %addr.b = getelementptr i32, i32* %b, i32 %count
+  %load.a = load i32, i32* %addr.a
+  %load.b = load i32, i32* %addr.b
+  %res = call i32 @llvm.arm.smlad(i32 %load.a, i32 %load.b, i32 %acc)
+  %count.next = add nuw i32 %count, 2
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret i32 %res
+}
+
+; CHECK-LABEL: test_fabs_f16
+; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
+; CHECK-MVE-NOT:  call void @llvm.set.loop.iterations
+; CHECK-FP:       call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP:    call void @llvm.set.loop.iterations.i32(i32 100)
+define void @test_fabs_f16(half* %a, half* %b) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr half, half* %a, i32 %count
+  %load.a = load half, half* %addr.a
+  %abs = call half @llvm.fabs.f16(half %load.a)
+  %addr.b = getelementptr half, half* %b, i32 %count
+  store half %abs, half *%addr.b
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; CHECK-LABEL: test_fabs
+; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
+; CHECK-MVE-NOT:  call void @llvm.set.loop.iterations
+; CHECK-FP:       call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP:    call void @llvm.set.loop.iterations.i32(i32 100)
+define float @test_fabs(float* %a) {
+entry:
+  br label %loop
+loop:
+  %acc = phi float [ 0.0, %entry ], [ %res, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr float, float* %a, i32 %count
+  %load.a = load float, float* %addr.a
+  %abs = call float @llvm.fabs.f32(float %load.a)
+  %res = fadd float %abs, %acc
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret float %res
+}
+
+; CHECK-LABEL: test_fabs_64
+; CHECK-MAIN-NOT:   call void @llvm.set.loop.iterations
+; CHECK-MVE-NOT:    call void @llvm.set.loop.iterations
+; CHECK-FP-NOT:     call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-FP64:       call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP-NOT:  call void @llvm.set.loop.iterations.i32(i32 100)
+define void @test_fabs_64(double* %a, double* %b) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr double, double* %a, i32 %count
+  %load.a = load double, double* %addr.a
+  %abs = call double @llvm.fabs.f64(double %load.a)
+  %addr.b = getelementptr double, double* %b, i32 %count
+  store double %abs, double *%addr.b
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; CHECK-LABEL: test_fabs_vec
+; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
+; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
+define <4 x float> @test_fabs_vec(<4 x float>* %a) {
+entry:
+  br label %loop
+loop:
+  %acc = phi <4 x float> [ zeroinitializer, %entry ], [ %res, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
+  %load.a = load <4 x float>, <4 x float>* %addr.a
+  %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %load.a)
+  %res = fadd <4 x float> %abs, %acc
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: test_log
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: llvm.loop.decrement
+define float @test_log(float* %a) {
+entry:
+  br label %loop
+loop:
+  %acc = phi float [ 0.0, %entry ], [ %res, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr float, float* %a, i32 %count
+  %load.a = load float, float* %addr.a
+  %abs = call float @llvm.log.f32(float %load.a)
+  %res = fadd float %abs, %acc
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret float %res
+}
+
+; CHECK-LABEL: test_sqrt_16
+; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
+; CHECK-MVE-NOT:  call void @llvm.set.loop.iterations
+; CHECK-FP:       call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP:    call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-FP64:     call void @llvm.set.loop.iterations.i32(i32 100)
+define void @test_sqrt_16(half* %a, half* %b) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr half, half* %a, i32 %count
+  %load.a = load half, half* %addr.a
+  %sqrt = call half @llvm.sqrt.f16(half %load.a)
+  %addr.b = getelementptr half, half* %b, i32 %count
+  store half %sqrt, half *%addr.b
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+; CHECK-LABEL: test_sqrt
+; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
+; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
+; CHECK-FP: call void @llvm.set.loop.iterations
+; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
+; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
+define void @test_sqrt(float* %a, float* %b) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr float, float* %a, i32 %count
+  %load.a = load float, float* %addr.a
+  %sqrt = call float @llvm.sqrt.f32(float %load.a)
+  %addr.b = getelementptr float, float* %b, i32 %count
+  store float %sqrt, float* %addr.b
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; CHECK-LABEL: test_sqrt_64
+; CHECK-MAIN-NOT:   call void @llvm.set.loop.iterations 
+; CHECK-MVE-NOT:    call void @llvm.set.loop.iterations
+; CHECK-FP-NOT:     call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP-NOT:  call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-FP64:       call void @llvm.set.loop.iterations.i32(i32 100)
+define void @test_sqrt_64(double* %a, double* %b) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr double, double* %a, i32 %count
+  %load.a = load double, double* %addr.a
+  %sqrt = call double @llvm.sqrt.f64(double %load.a)
+  %addr.b = getelementptr double, double* %b, i32 %count
+  store double %sqrt, double *%addr.b
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; CHECK-LABEL: test_sqrt_vec
+; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
+; CHECK-MVE-NOT:  call void @llvm.set.loop.iterations
+; CHECK-FP:       call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVEFP:    call void @llvm.set.loop.iterations.i32(i32 100)
+define void @test_sqrt_vec(<4 x float>* %a, <4 x float>* %b) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
+  %load.a = load <4 x float>, <4 x float>* %addr.a
+  %sqrt = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %load.a)
+  %addr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %count
+  store <4 x float> %sqrt, <4 x float>* %addr.b
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; CHECK-LABEL: test_overflow
+; CHECK: call void @llvm.set.loop.iterations
+define i32 @test_overflow(i32* %a, i32* %b) {
+entry:
+  br label %loop
+loop:
+  %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr i32, i32* %a, i32 %count
+  %addr.b = getelementptr i32, i32* %b, i32 %count
+  %load.a = load i32, i32* %addr.a
+  %load.b = load i32, i32* %addr.b
+  %sadd = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %load.a, i32 %load.b)
+  %res = extractvalue {i32, i1} %sadd, 0
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret i32 %res
+}
+
+; TODO: We should be able to generate a qadd/sub
+; CHECK-LABEL: test_sat
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 100)
+define i32 @test_sat(i32* %a, i32* %b) {
+entry:
+  br label %loop
+loop:
+  %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr i32, i32* %a, i32 %count
+  %addr.b = getelementptr i32, i32* %b, i32 %count
+  %load.a = load i32, i32* %addr.a
+  %load.b = load i32, i32* %addr.b
+  %res = call i32 @llvm.sadd.sat.i32(i32 %load.a, i32 %load.b)
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret i32 %res
+}
+
+; CHECK-LABEL: test_masked_i32
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVEFP: call void @llvm.set.loop.iterations
+; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
+; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
+define void @test_masked_i32(<4 x i1> %mask, <4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, <4 x i32> %passthru) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr <4 x i32>, <4 x i32>* %a, i32 %count
+  %addr.b = getelementptr <4 x i32>, <4 x i32>* %b, i32 %count
+  %addr.c = getelementptr <4 x i32>, <4 x i32>* %c, i32 %count
+  %load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.a, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
+  %load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.b, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
+  %res = add <4 x i32> %load.a, %load.b
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %res, <4 x i32>* %addr.c, i32 4, <4 x i1> %mask)
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; CHECK-LABEL: test_masked_f32
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVEFP: call void @llvm.set.loop.iterations
+; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
+; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
+define void @test_masked_f32(<4 x i1> %mask, <4 x float>* %a, <4 x float>* %b, <4 x float>* %c, <4 x float> %passthru) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
+  %addr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %count
+  %addr.c = getelementptr <4 x float>, <4 x float>* %c, i32 %count
+  %load.a = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr.a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
+  %load.b = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr.b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
+  %res = fadd <4 x float> %load.a, %load.b
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %res, <4 x float>* %addr.c, i32 4, <4 x i1> %mask)
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; CHECK-LABEL: test_gather_scatter
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-MVEFP: call void @llvm.set.loop.iterations
+; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
+; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
+define void @test_gather_scatter(<4 x i1> %mask, <4 x float*> %a, <4 x float*> %b, <4 x float*> %c, <4 x float> %passthru) {
+entry:
+  br label %loop
+loop:
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %load.a = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
+  %load.b = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
+  %res = fadd <4 x float> %load.a, %load.b
+  call void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float> %res, <4 x float*> %c, i32 4, <4 x i1> %mask)
+  %count.next = add nuw i32 %count, 1
+  %cmp = icmp ne i32 %count.next, 100
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+declare i32 @bar(...) local_unnamed_addr #1
+declare i32 @llvm.arm.smlad(i32, i32, i32)
+declare half @llvm.fabs.f16(half)
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+declare float @llvm.log.f32(float)
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
+declare half @llvm.sqrt.f16(half)
+declare float @llvm.sqrt.f32(float)
+declare double @llvm.sqrt.f64(double)
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
+declare i32 @llvm.sadd.sat.i32(i32, i32)
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
+declare <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
+declare void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float>, <4 x float*>, i32, <4 x i1>)
diff --git a/llvm/test/Transforms/HardwareLoops/ARM/counter.ll b/llvm/test/Transforms/HardwareLoops/ARM/counter.ll
new file mode 100644
index 0000000..bdd83d1
--- /dev/null
+++ b/llvm/test/Transforms/HardwareLoops/ARM/counter.ll
@@ -0,0 +1,35 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s
+
+@g = common local_unnamed_addr global i32* null, align 4
+
+; CHECK-LABEL: counter_too_large
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: call i32 @llvm.loop.decrement
+
+define i32 @counter_too_large(i64 %n) {
+entry:
+  %cmp7 = icmp eq i64 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:
+  %0 = load i32*, i32** @g, align 4
+  br label %while.body
+
+while.body:
+  %i.09 = phi i64 [ 0, %while.body.lr.ph ], [ %inc1, %while.body ]
+  %res.08 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ]
+  %idxprom = trunc i64 %i.09 to i32
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 %idxprom
+  %1 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %1, %res.08
+  %inc1 = add nuw i64 %i.09, 1
+  %cmp = icmp ult i64 %inc1, %n
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
diff --git a/llvm/test/Transforms/HardwareLoops/ARM/do-rem.ll b/llvm/test/Transforms/HardwareLoops/ARM/do-rem.ll
new file mode 100644
index 0000000..074a1bb
--- /dev/null
+++ b/llvm/test/Transforms/HardwareLoops/ARM/do-rem.ll
@@ -0,0 +1,259 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s
+
+@g = common local_unnamed_addr global i32* null, align 4
+
+; CHECK-LABEL: do_with_i32_urem
+; CHECK: while.body.preheader:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+; Loop containing a 32-bit urem; the patterns above expect it to be converted, counting down from %n.
+define i32 @do_with_i32_urem(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %rem = urem i32 %i.09, 5 ; operation under test
+  %add = add i32 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: do_with_i32_srem
+; CHECK: while.body.preheader:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+; Loop containing a 32-bit srem; the patterns above expect it to be converted, counting down from %n.
+define i32 @do_with_i32_srem(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %rem = srem i32 %i.09, 5 ; operation under test
+  %add = sub i32 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: do_with_i32_udiv
+; CHECK: while.body.preheader:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+; Loop containing a 32-bit udiv; the patterns above expect it to be converted, counting down from %n.
+define i32 @do_with_i32_udiv(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %rem = udiv i32 %i.09, 5 ; operation under test; despite its name, %rem holds a quotient here
+  %add = add i32 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: do_with_i32_sdiv
+; CHECK: while.body.preheader:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+; Loop containing a 32-bit sdiv; the patterns above expect it to be converted, counting down from %n.
+define i32 @do_with_i32_sdiv(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %rem = sdiv i32 %i.09, 5 ; operation under test; despite its name, %rem holds a quotient here
+  %add = sub i32 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: do_with_i64_urem
+; CHECK-NOT: llvm.set.loop.iterations
+; CHECK-NOT: llvm.loop.decrement
+define i64 @do_with_i64_urem(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+; 32-bit counter but a 64-bit urem in the body: no hardware-loop intrinsics are expected (presumably the i64 urem is not a single instruction on this target - confirm).
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %conv = zext i32 %i.09 to i64
+  %rem = urem i64 %conv, 5 ; operation under test
+  %add = add i64 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i64 %res.0.lcssa
+}
+
+; CHECK-LABEL: do_with_i64_srem
+; CHECK-NOT: llvm.set.loop.iterations
+; CHECK-NOT: llvm.loop.decrement
+define i64 @do_with_i64_srem(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+; 32-bit counter but a 64-bit srem in the body: no hardware-loop intrinsics are expected (presumably the i64 srem is not a single instruction on this target - confirm).
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %conv = zext i32 %i.09 to i64
+  %rem = srem i64 %conv, 5 ; operation under test
+  %add = sub i64 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i64 %res.0.lcssa
+}
+
+; CHECK-LABEL: do_with_i64_udiv
+; CHECK-NOT: llvm.set.loop.iterations
+; CHECK-NOT: llvm.loop.decrement
+define i64 @do_with_i64_udiv(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+; 32-bit counter but a 64-bit udiv in the body: no hardware-loop intrinsics are expected (presumably the i64 udiv is not a single instruction on this target - confirm).
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %conv = zext i32 %i.09 to i64
+  %rem = udiv i64 %conv, 5 ; operation under test; despite its name, %rem holds a quotient here
+  %add = add i64 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i64 %res.0.lcssa
+}
+
+; CHECK-LABEL: do_with_i64_sdiv
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: call i32 @llvm.loop.decrement
+define i64 @do_with_i64_sdiv(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+; 32-bit counter but a 64-bit sdiv in the body: no hardware-loop intrinsics are expected (presumably the i64 sdiv is not a single instruction on this target - confirm).
+while.body:
+  %i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
+  %res.08 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %conv = zext i32 %i.09 to i64
+  %rem = sdiv i64 %conv, 5 ; operation under test; despite its name, %rem holds a quotient here
+  %add = sub i64 %rem, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i64 %res.0.lcssa
+}
diff --git a/llvm/test/Transforms/HardwareLoops/ARM/fp-emulation.ll b/llvm/test/Transforms/HardwareLoops/ARM/fp-emulation.ll
new file mode 100644
index 0000000..8336b98
--- /dev/null
+++ b/llvm/test/Transforms/HardwareLoops/ARM/fp-emulation.ll
@@ -0,0 +1,207 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fp-armv8 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+soft-float -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SOFT
+
+; CHECK-LABEL: test_fptosi
+; CHECK: while.body.lr.ph:
+; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
+; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
+; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-FP-NEXT: br label %while.body
+
+; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
+; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
+
+; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
+; fptosi on a double in a conditionally executed block. The +fp-armv8 run expects conversion; the +soft-float run expects none.
+define void @test_fptosi(i32 %n, i32** %g, double** %d) {
+entry:
+  %n.off = add i32 %n, -1
+  %0 = icmp ult i32 %n.off, 500
+  br i1 %0, label %while.body.lr.ph, label %cleanup
+
+while.body.lr.ph:
+  %1 = load double*, double** %d, align 4
+  %2 = load i32*, i32** %g, align 4
+  br label %while.body
+
+while.body:
+  %i.012 = phi i32 [ 0, %while.body.lr.ph ], [ %inc, %if.end4 ]
+  %rem = urem i32 %i.012, 10
+  %tobool = icmp eq i32 %rem, 0
+  br i1 %tobool, label %if.end4, label %if.then2
+
+if.then2:
+  %arrayidx = getelementptr inbounds double, double* %1, i32 %i.012
+  %3 = load double, double* %arrayidx, align 8
+  %conv = fptosi double %3 to i32 ; operation under test
+  %arrayidx3 = getelementptr inbounds i32, i32* %2, i32 %i.012
+  store i32 %conv, i32* %arrayidx3, align 4
+  br label %if.end4
+
+if.end4:
+  %inc = add nuw i32 %i.012, 1
+  %cmp1 = icmp ult i32 %inc, %n
+  br i1 %cmp1, label %while.body, label %cleanup.loopexit
+
+cleanup.loopexit:
+  br label %cleanup
+
+cleanup:
+  ret void
+}
+
+; CHECK-LABEL: test_fptoui
+; CHECK-FP: while.body.lr.ph:
+; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
+; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
+; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-FP-NEXT: br label %while.body
+
+; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
+; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
+
+; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
+; fptoui on a double in a conditionally executed block. The +fp-armv8 run expects conversion; the +soft-float run expects none.
+define void @test_fptoui(i32 %n, i32** %g, double** %d) {
+entry:
+  %n.off = add i32 %n, -1
+  %0 = icmp ult i32 %n.off, 500
+  br i1 %0, label %while.body.lr.ph, label %cleanup
+
+while.body.lr.ph:
+  %1 = load double*, double** %d, align 4
+  %2 = load i32*, i32** %g, align 4
+  br label %while.body
+
+while.body:
+  %i.012 = phi i32 [ 0, %while.body.lr.ph ], [ %inc, %if.end4 ]
+  %rem = urem i32 %i.012, 10
+  %tobool = icmp eq i32 %rem, 0
+  br i1 %tobool, label %if.end4, label %if.then2
+
+if.then2:
+  %arrayidx = getelementptr inbounds double, double* %1, i32 %i.012
+  %3 = load double, double* %arrayidx, align 8
+  %conv = fptoui double %3 to i32 ; operation under test
+  %arrayidx3 = getelementptr inbounds i32, i32* %2, i32 %i.012
+  store i32 %conv, i32* %arrayidx3, align 4
+  br label %if.end4
+
+if.end4:
+  %inc = add nuw i32 %i.012, 1
+  %cmp1 = icmp ult i32 %inc, %n
+  br i1 %cmp1, label %while.body, label %cleanup.loopexit
+
+cleanup.loopexit:
+  br label %cleanup
+
+cleanup:
+  ret void
+}
+
+; CHECK-LABEL: load_store_float
+; CHECK: while.body.lr.ph:
+; CHECK: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
+; CHECK: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
+; Only loads and stores of double, no FP arithmetic or conversion; both runs expect the loop to be converted.
+define void @load_store_float(i32 %n, double** %d, double** %g) {
+entry:
+  %n.off = add i32 %n, -1
+  %0 = icmp ult i32 %n.off, 500
+  br i1 %0, label %while.body.lr.ph, label %cleanup
+
+while.body.lr.ph:
+  %1 = load double*, double** %d, align 4
+  %2 = load double*, double** %g, align 4
+  br label %while.body
+
+while.body:
+  %i.012 = phi i32 [ 0, %while.body.lr.ph ], [ %inc, %if.end4 ]
+  %rem = urem i32 %i.012, 10
+  %tobool = icmp eq i32 %rem, 0
+  br i1 %tobool, label %if.end4, label %if.then2
+
+if.then2:
+  %arrayidx = getelementptr inbounds double, double* %1, i32 %i.012
+  %3 = load double, double* %arrayidx, align 8
+  %arrayidx3 = getelementptr inbounds double, double* %2, i32 %i.012
+  store double %3, double* %arrayidx3, align 8
+  br label %if.end4
+
+if.end4:
+  %inc = add nuw i32 %i.012, 1
+  %cmp1 = icmp ult i32 %inc, %n
+  br i1 %cmp1, label %while.body, label %cleanup.loopexit
+
+cleanup.loopexit:
+  br label %cleanup
+
+cleanup:
+  ret void
+}
+
+; CHECK-LABEL: fp_add
+; CHECK: while.body.lr.ph:
+
+; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
+
+; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
+; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
+; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK: br label %while.body
+
+; CHECK-SOFT-NOT: call i32 @llvm.loop.decrement
+
+; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
+; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
+; fadd on float in a conditionally executed block. The +fp-armv8 run expects conversion; the +soft-float run expects none.
+define void @fp_add(i32 %n, float** %d, float** %g) {
+entry:
+  %n.off = add i32 %n, -1
+  %0 = icmp ult i32 %n.off, 500
+  br i1 %0, label %while.body.lr.ph, label %cleanup
+
+while.body.lr.ph:
+  %1 = load float*, float** %d, align 4
+  %2 = load float*, float** %g, align 4
+  br label %while.body
+
+while.body:
+  %i.012 = phi i32 [ 0, %while.body.lr.ph ], [ %inc, %if.end4 ]
+  %rem = urem i32 %i.012, 10
+  %tobool = icmp eq i32 %rem, 0
+  br i1 %tobool, label %if.end4, label %if.then2
+
+if.then2:
+  %arrayidx = getelementptr inbounds float, float* %1, i32 %i.012
+  %3 = load float, float* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds float, float* %2, i32 %i.012
+  %4 = load float, float* %arrayidx3, align 4
+  %add = fadd float %3, %4 ; operation under test
+  store float %add, float* %arrayidx3, align 4
+  br label %if.end4
+
+if.end4:
+  %inc = add nuw i32 %i.012, 1
+  %cmp1 = icmp ult i32 %inc, %n
+  br i1 %cmp1, label %while.body, label %cleanup.loopexit
+
+cleanup.loopexit:
+  br label %cleanup
+
+cleanup:
+  ret void
+}
diff --git a/llvm/test/Transforms/HardwareLoops/ARM/simple-do.ll b/llvm/test/Transforms/HardwareLoops/ARM/simple-do.ll
new file mode 100644
index 0000000..41eb071
--- /dev/null
+++ b/llvm/test/Transforms/HardwareLoops/ARM/simple-do.ll
@@ -0,0 +1,155 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=true %s -S -o - | FileCheck %s --check-prefix=DISABLED
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=-lob -hardware-loops %s -S -o - | FileCheck %s --check-prefix=DISABLED
+
+; DISABLED-NOT: llvm.set.loop.iterations
+; DISABLED-NOT: llvm.loop.decrement
+
+@g = common local_unnamed_addr global i32* null, align 4
+
+; CHECK-LABEL: do_copy
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
+; CHECK: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %entry ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end
+define i32 @do_copy(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) {
+entry:
+  br label %while.body
+; Copy loop entered unconditionally from entry; it copies one i32 from %q to %p per iteration and counts %n down to zero.
+while.body:
+  %q.addr.05 = phi i32* [ %incdec.ptr, %while.body ], [ %q, %entry ]
+  %p.addr.04 = phi i32* [ %incdec.ptr1, %while.body ], [ %p, %entry ]
+  %x.addr.03 = phi i32 [ %dec, %while.body ], [ %n, %entry ]
+  %dec = add nsw i32 %x.addr.03, -1
+  %incdec.ptr = getelementptr inbounds i32, i32* %q.addr.05, i32 1
+  %0 = load i32, i32* %q.addr.05, align 4
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %p.addr.04, i32 1
+  store i32 %0, i32* %p.addr.04, align 4
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:
+  ret i32 0
+}
+
+; CHECK-LABEL: do_inc1
+; CHECK: while.body.lr.ph:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+; Guarded loop that counts up by 1 from 0 until equal to %n; the patterns above expect %n itself as the iteration count.
+define i32 @do_inc1(i32 %n) {
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:
+  %0 = load i32*, i32** @g, align 4
+  br label %while.body
+
+while.body:
+  %i.09 = phi i32 [ 0, %while.body.lr.ph ], [ %inc1, %while.body ]
+  %res.08 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 %i.09
+  %1 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %1, %res.08
+  %inc1 = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc1, %n
+  br i1 %exitcond, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: do_inc2
+; CHECK: while.body.lr.ph:
+; CHECK: [[ROUND:%[^ ]+]] = add i32 %n, -1
+; CHECK: [[HALVE:%[^ ]+]] = lshr i32 [[ROUND]], 1
+; CHECK: [[COUNT:%[^ ]+]] = add i32 [[HALVE]], 1
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+define i32 @do_inc2(i32 %n) {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph:
+  %0 = load i32*, i32** @g, align 4
+  br label %while.body
+; Steps the index by 2 while it is less than %n; the patterns above expect a count of ((%n - 1) >> 1) + 1.
+while.body:
+  %i.09 = phi i32 [ 0, %while.body.lr.ph ], [ %add1, %while.body ]
+  %res.08 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 %i.09
+  %1 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %1, %res.08
+  %add1 = add nuw nsw i32 %i.09, 2
+  %cmp = icmp slt i32 %add1, %n
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: do_dec2
+
+; CHECK: while.body.lr.ph:
+; CHECK: [[ROUND:%[^ ]+]] = add i32 %n, 1
+; CHECK: [[CMP:%[^ ]+]] = icmp slt i32 %n, 2
+; CHECK: [[SMIN:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 2
+; CHECK: [[SUB:%[^ ]+]] = sub i32 [[ROUND]], [[SMIN]]
+; CHECK: [[HALVE:%[^ ]+]] = lshr i32 [[SUB]], 1
+; CHECK: [[COUNT:%[^ ]+]] = add i32 [[HALVE]], 1
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-NEXT: br label %while.body
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+define i32 @do_dec2(i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph:
+  %0 = load i32*, i32** @g, align 4
+  br label %while.body
+; Counts %i.08 down from %n in steps of 2, continuing while it is greater than 2; the count is computed in the preheader.
+while.body:
+  %i.08 = phi i32 [ %n, %while.body.lr.ph ], [ %sub, %while.body ]
+  %res.07 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 %i.08
+  %1 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %1, %res.07
+  %sub = add nsw i32 %i.08, -2
+  %cmp = icmp sgt i32 %i.08, 2
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
diff --git a/llvm/test/Transforms/HardwareLoops/ARM/structure.ll b/llvm/test/Transforms/HardwareLoops/ARM/structure.ll
new file mode 100644
index 0000000..e3fe762
--- /dev/null
+++ b/llvm/test/Transforms/HardwareLoops/ARM/structure.ll
@@ -0,0 +1,72 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s
+
+; CHECK-LABEL: early_exit
+; CHECK-NOT: llvm.set.loop.iterations
+; CHECK-NOT: llvm.loop.decrement
+define i32 @early_exit(i32* nocapture readonly %a, i32 %max, i32 %n) {
+entry:
+  br label %do.body
+; The loop has two exits: do.body can branch straight to do.end, and if.end leaves when the counter reaches %n. No conversion is expected.
+do.body:
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.0
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp = icmp sgt i32 %0, %max
+  br i1 %cmp, label %do.end, label %if.end ; data-dependent early exit
+
+if.end:
+  %inc = add nuw i32 %i.0, 1
+  %cmp1 = icmp ult i32 %inc, %n
+  br i1 %cmp1, label %do.body, label %if.end.do.end_crit_edge
+
+if.end.do.end_crit_edge:
+  %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i32 %inc
+  %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4
+  br label %do.end
+
+do.end:
+  %1 = phi i32 [ %.pre, %if.end.do.end_crit_edge ], [ %0, %do.body ]
+  ret i32 %1
+}
+
+; CHECK-LABEL: nested
+; CHECK-NOT: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK: br i1 %cmp20, label %while.end7, label %while.cond1.preheader.us
+
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK: br label %while.body3.us
+
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ %N, %while.cond1.preheader.us ], [ [[LOOP_DEC:%[^ ]+]], %while.body3.us ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body3.us, label %while.cond1.while.end_crit_edge.us
+; NOTE(review): the two negative patterns below share a variable that is only bound if the first one matches - confirm this behaves as intended.
+; CHECK-NOT: [[LOOP_DEC1:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-NOT: br i1 [[LOOP_DEC1]], label %while.cond1.preheader.us, label %while.end7
+define void @nested(i32* nocapture %A, i32 %N) {
+entry:
+  %cmp20 = icmp eq i32 %N, 0
+  br i1 %cmp20, label %while.end7, label %while.cond1.preheader.us
+; Two-level loop nest: the patterns above expect only the inner loop (while.body3.us) to be converted; the outer loop keeps its compare and branch.
+while.cond1.preheader.us:
+  %i.021.us = phi i32 [ %inc6.us, %while.cond1.while.end_crit_edge.us ], [ 0, %entry ]
+  %mul.us = mul i32 %i.021.us, %N
+  br label %while.body3.us
+
+while.body3.us:
+  %j.019.us = phi i32 [ 0, %while.cond1.preheader.us ], [ %inc.us, %while.body3.us ]
+  %add.us = add i32 %j.019.us, %mul.us
+  %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us
+  store i32 %add.us, i32* %arrayidx.us, align 4
+  %inc.us = add nuw i32 %j.019.us, 1
+  %exitcond = icmp eq i32 %inc.us, %N
+  br i1 %exitcond, label %while.cond1.while.end_crit_edge.us, label %while.body3.us
+
+while.cond1.while.end_crit_edge.us:
+  %inc6.us = add nuw i32 %i.021.us, 1
+  %exitcond23 = icmp eq i32 %inc6.us, %N
+  br i1 %exitcond23, label %while.end7, label %while.cond1.preheader.us
+
+while.end7:
+  ret void
+}