Vulkan: revamp present semaphore management

See doc/PresentSemaphores.md for details.

Bug: angleproject:3450
Bug: angleproject:3670
Change-Id: I52d5bd13a4af25f224d386c9584525c182af6f17
Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/1776880
Reviewed-by: Tim Van Patten <timvp@google.com>
Commit-Queue: Shahbaz Youssefi <syoussefi@chromium.org>
diff --git a/src/libANGLE/renderer/vulkan/RendererVk.cpp b/src/libANGLE/renderer/vulkan/RendererVk.cpp
index c6b5b4c..e789575 100644
--- a/src/libANGLE/renderer/vulkan/RendererVk.cpp
+++ b/src/libANGLE/renderer/vulkan/RendererVk.cpp
@@ -1569,7 +1569,7 @@
     }
     else
     {
-        mFenceRecycler.fetch(mDevice, &fence);
+        mFenceRecycler.fetch(&fence);
         ANGLE_VK_TRY(context, fence.reset(mDevice));
     }
     sharedFenceOut->assign(mDevice, std::move(fence));
diff --git a/src/libANGLE/renderer/vulkan/SurfaceVk.cpp b/src/libANGLE/renderer/vulkan/SurfaceVk.cpp
index 1d8a4d6..c380f19 100644
--- a/src/libANGLE/renderer/vulkan/SurfaceVk.cpp
+++ b/src/libANGLE/renderer/vulkan/SurfaceVk.cpp
@@ -321,40 +321,77 @@
     return &mColorAttachment.image;
 }
 
-WindowSurfaceVk::SwapchainImage::SwapchainImage()  = default;
-WindowSurfaceVk::SwapchainImage::~SwapchainImage() = default;
-
-WindowSurfaceVk::SwapchainImage::SwapchainImage(SwapchainImage &&other)
-    : image(std::move(other.image)),
-      imageView(std::move(other.imageView)),
-      framebuffer(std::move(other.framebuffer))
-{}
-
-WindowSurfaceVk::SwapHistory::SwapHistory() = default;
-
-WindowSurfaceVk::SwapHistory::~SwapHistory() = default;
-
-void WindowSurfaceVk::SwapHistory::destroy(RendererVk *renderer)
+namespace impl
 {
-    if (swapchain != VK_NULL_HANDLE)
+SwapchainCleanupData::SwapchainCleanupData() = default;
+SwapchainCleanupData::~SwapchainCleanupData()
+{
+    ASSERT(swapchain == VK_NULL_HANDLE);
+    ASSERT(semaphores.empty());
+}
+
+SwapchainCleanupData::SwapchainCleanupData(SwapchainCleanupData &&other)
+    : swapchain(other.swapchain), semaphores(std::move(other.semaphores))
+{
+    other.swapchain = VK_NULL_HANDLE;
+}
+
+void SwapchainCleanupData::destroy(VkDevice device, vk::Recycler<vk::Semaphore> *semaphoreRecycler)
+{
+    if (swapchain)
     {
-        vkDestroySwapchainKHR(renderer->getDevice(), swapchain, nullptr);
+        vkDestroySwapchainKHR(device, swapchain, nullptr);
         swapchain = VK_NULL_HANDLE;
     }
 
-    renderer->resetSharedFence(&sharedFence);
-    presentImageSemaphore.destroy(renderer->getDevice());
+    for (vk::Semaphore &semaphore : semaphores)
+    {
+        semaphoreRecycler->recycle(std::move(semaphore));
+    }
+    semaphores.clear();
 }
 
-angle::Result WindowSurfaceVk::SwapHistory::waitFence(ContextVk *contextVk)
+ImagePresentHistory::ImagePresentHistory() = default;
+ImagePresentHistory::~ImagePresentHistory()
 {
-    if (sharedFence.isReferenced())
-    {
-        ANGLE_VK_TRY(contextVk, sharedFence.get().wait(contextVk->getDevice(),
-                                                       std::numeric_limits<uint64_t>::max()));
-    }
+    ASSERT(!semaphore.valid());
+    ASSERT(oldSwapchains.empty());
+}
+
+ImagePresentHistory::ImagePresentHistory(ImagePresentHistory &&other)
+    : semaphore(std::move(other.semaphore)), oldSwapchains(std::move(other.oldSwapchains))
+{}
+
+SwapchainImage::SwapchainImage()  = default;
+SwapchainImage::~SwapchainImage() = default;
+
+SwapchainImage::SwapchainImage(SwapchainImage &&other)
+    : image(std::move(other.image)),
+      imageView(std::move(other.imageView)),
+      framebuffer(std::move(other.framebuffer)),
+      presentHistory(std::move(other.presentHistory)),
+      currentPresentHistoryIndex(other.currentPresentHistoryIndex)
+{}
+
+SwapHistory::SwapHistory() = default;
+
+SwapHistory::~SwapHistory() = default;
+
+void SwapHistory::destroy(RendererVk *renderer)
+{
+    renderer->resetSharedFence(&sharedFence);
+}
+
+angle::Result SwapHistory::waitFence(ContextVk *contextVk)
+{
+    ASSERT(sharedFence.isReferenced());
+    ANGLE_VK_TRY(contextVk, sharedFence.get().wait(contextVk->getDevice(),
+                                                   std::numeric_limits<uint64_t>::max()));
     return angle::Result::Continue;
 }
+}  // namespace impl
+
+using namespace impl;
 
 WindowSurfaceVk::WindowSurfaceVk(const egl::SurfaceState &surfaceState,
                                  EGLNativeWindowType window,
@@ -370,8 +407,8 @@
       mMinImageCount(0),
       mPreTransform(VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR),
       mCompositeAlpha(VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR),
-      mCurrentSwapchainImageIndex(0),
-      mCurrentSwapHistoryIndex(0)
+      mCurrentSwapHistoryIndex(0),
+      mCurrentSwapchainImageIndex(0)
 {
     // Initialize the color render target with the multisampled targets.  If not multisampled, the
     // render target will be updated to refer to a swapchain image on every acquire.
@@ -408,6 +445,12 @@
         mSwapchain = VK_NULL_HANDLE;
     }
 
+    for (SwapchainCleanupData &oldSwapchain : mOldSwapchains)
+    {
+        oldSwapchain.destroy(device, &mPresentSemaphoreRecycler);
+    }
+    mOldSwapchains.clear();
+
     if (mSurface)
     {
         vkDestroySurfaceKHR(instance, mSurface, nullptr);
@@ -415,6 +458,7 @@
     }
 
     mAcquireImageSemaphore.destroy(device);
+    mPresentSemaphoreRecycler.destroy(device);
 }
 
 egl::Error WindowSurfaceVk::initialize(const egl::Display *display)
@@ -538,27 +582,131 @@
                                                  const gl::Extents &extents,
                                                  uint32_t swapHistoryIndex)
 {
-    VkSwapchainKHR oldSwapchain = mSwapchain;
-    mSwapchain                  = VK_NULL_HANDLE;
+    // If mOldSwapchains is not empty, it means that a new swapchain was created, but before
+    // any of its images were presented, it's asked to be recreated.  In this case, we can destroy
+    // the current swapchain immediately (although the old swapchains still need to be kept to be
+    // scheduled for destruction).  This can happen for example if vkQueuePresentKHR returns
+    // OUT_OF_DATE, the swapchain is recreated and the following vkAcquireNextImageKHR again
+    // returns OUT_OF_DATE.
+    //
+    // Otherwise, keep the current swapchain as the old swapchain to be scheduled for destruction
+    // and create a new one.
 
-    if (oldSwapchain)
+    VkSwapchainKHR swapchainToDestroy = VK_NULL_HANDLE;
+
+    if (!mOldSwapchains.empty())
     {
-        // Note: the old swapchain must be destroyed regardless of whether creating the new
-        // swapchain succeeds.  We can only destroy the swapchain once rendering to all its images
-        // have finished.  We therefore store the handle to the swapchain being destroyed in the
-        // swap history (alongside the serial of the last submission) so it can be destroyed once we
-        // wait on that serial as part of the CPU throttling.
-        mSwapHistory[swapHistoryIndex].swapchain = oldSwapchain;
+        // Keep the old swapchain, destroy the current (never-used) swapchain.
+        swapchainToDestroy = mSwapchain;
+
+        // Recycle present semaphores.
+        for (SwapchainImage &swapchainImage : mSwapchainImages)
+        {
+            for (ImagePresentHistory &presentHistory : swapchainImage.presentHistory)
+            {
+                ASSERT(presentHistory.semaphore.valid());
+                ASSERT(presentHistory.oldSwapchains.empty());
+
+                mPresentSemaphoreRecycler.recycle(std::move(presentHistory.semaphore));
+            }
+        }
     }
+    else
+    {
+        SwapchainCleanupData cleanupData;
+
+        // Remember the current swapchain to be scheduled for destruction later.
+        cleanupData.swapchain = mSwapchain;
+
+        // Accumulate the semaphores to be destroyed at the same time as the swapchain.
+        for (SwapchainImage &swapchainImage : mSwapchainImages)
+        {
+            for (ImagePresentHistory &presentHistory : swapchainImage.presentHistory)
+            {
+                ASSERT(presentHistory.semaphore.valid());
+                cleanupData.semaphores.emplace_back(std::move(presentHistory.semaphore));
+
+                // Accumulate any previous swapchains that are pending destruction too.
+                for (SwapchainCleanupData &oldSwapchain : presentHistory.oldSwapchains)
+                {
+                    mOldSwapchains.emplace_back(std::move(oldSwapchain));
+                }
+                presentHistory.oldSwapchains.clear();
+            }
+        }
+
+        // If too many old swapchains have accumulated, wait idle and destroy them.  This is to
+        // prevent failures due to too many swapchains allocated.
+        //
+        // Note: Nvidia has been observed to fail creation of swapchains after 20 are allocated on
+        // desktop, or less than 10 on Quadro P400.
+        static constexpr size_t kMaxOldSwapchains = 5;
+        if (mOldSwapchains.size() > kMaxOldSwapchains)
+        {
+            ANGLE_TRY(contextVk->getRenderer()->queueWaitIdle(contextVk));
+            for (SwapchainCleanupData &oldSwapchain : mOldSwapchains)
+            {
+                oldSwapchain.destroy(contextVk->getDevice(), &mPresentSemaphoreRecycler);
+            }
+            mOldSwapchains.clear();
+        }
+
+        mOldSwapchains.emplace_back(std::move(cleanupData));
+    }
+
+    // Recreate the swapchain based on the most recent one.
+    VkSwapchainKHR lastSwapchain = mSwapchain;
+    mSwapchain                   = VK_NULL_HANDLE;
 
     releaseSwapchainImages(contextVk);
 
-    return createSwapChain(contextVk, extents, oldSwapchain);
+    angle::Result result = createSwapChain(contextVk, extents, lastSwapchain);
+
+    // If the most recent swapchain was never used, destroy it right now.
+    if (swapchainToDestroy)
+    {
+        vkDestroySwapchainKHR(contextVk->getDevice(), swapchainToDestroy, nullptr);
+    }
+
+    return result;
+}
+
+angle::Result WindowSurfaceVk::newPresentSemaphore(vk::Context *context,
+                                                   vk::Semaphore *semaphoreOut)
+{
+    if (mPresentSemaphoreRecycler.empty())
+    {
+        ANGLE_VK_TRY(context, semaphoreOut->init(context->getDevice()));
+    }
+    else
+    {
+        mPresentSemaphoreRecycler.fetch(semaphoreOut);
+    }
+    return angle::Result::Continue;
+}
+
+angle::Result WindowSurfaceVk::resizeSwapchainImages(vk::Context *context, uint32_t imageCount)
+{
+    mSwapchainImages.resize(imageCount);
+
+    // At this point, if there was a previous swapchain, the previous present semaphores have all
+    // been moved to mOldSwapchains to be scheduled for destruction, so all semaphore handles in
+    // mSwapchainImages should be invalid.
+    for (SwapchainImage &swapchainImage : mSwapchainImages)
+    {
+        for (ImagePresentHistory &presentHistory : swapchainImage.presentHistory)
+        {
+            ASSERT(!presentHistory.semaphore.valid());
+            ANGLE_TRY(newPresentSemaphore(context, &presentHistory.semaphore));
+        }
+    }
+
+    return angle::Result::Continue;
 }
 
 angle::Result WindowSurfaceVk::createSwapChain(vk::Context *context,
                                                const gl::Extents &extents,
-                                               VkSwapchainKHR oldSwapchain)
+                                               VkSwapchainKHR lastSwapchain)
 {
     ANGLE_TRACE_EVENT0("gpu.angle", "WindowSurfaceVk::createSwapchain");
 
@@ -592,7 +740,7 @@
     swapchainInfo.compositeAlpha        = mCompositeAlpha;
     swapchainInfo.presentMode           = mDesiredSwapchainPresentMode;
     swapchainInfo.clipped               = VK_TRUE;
-    swapchainInfo.oldSwapchain          = oldSwapchain;
+    swapchainInfo.oldSwapchain          = lastSwapchain;
 
     // TODO(syoussefi): Once EGL_SWAP_BEHAVIOR_PRESERVED_BIT is supported, the contents of the old
     // swapchain need to carry over to the new one.  http://anglebug.com/2942
@@ -632,7 +780,7 @@
         mColorImageMS.stageClearIfEmulatedFormat(gl::ImageIndex::Make2D(0), format);
     }
 
-    mSwapchainImages.resize(imageCount);
+    ANGLE_TRY(resizeSwapchainImages(context, imageCount));
 
     for (uint32_t imageIndex = 0; imageIndex < imageCount; ++imageIndex)
     {
@@ -765,14 +913,14 @@
         swapchainImage.image.resetImageWeakReference();
         swapchainImage.image.destroy(contextVk->getDevice());
 
-        if (swapchainImage.imageView.valid())
-        {
-            contextVk->releaseObject(imageSerial, &swapchainImage.imageView);
-        }
+        contextVk->releaseObject(imageSerial, &swapchainImage.imageView);
+        contextVk->releaseObject(imageSerial, &swapchainImage.framebuffer);
 
-        if (swapchainImage.framebuffer.valid())
+        // present history must have already been taken care of.
+        for (ImagePresentHistory &presentHistory : swapchainImage.presentHistory)
         {
-            contextVk->releaseObject(imageSerial, &swapchainImage.framebuffer);
+            ASSERT(!presentHistory.semaphore.valid());
+            ASSERT(presentHistory.oldSwapchains.empty());
         }
     }
 
@@ -802,25 +950,39 @@
         mFramebufferMS.dumpResources(&garbageObjects);
     }
 
+    VkDevice device = displayVk->getDevice();
+
     for (vk::GarbageObjectBase &garbage : garbageObjects)
     {
-        garbage.destroy(displayVk->getDevice());
+        garbage.destroy(device);
     }
 
     for (SwapchainImage &swapchainImage : mSwapchainImages)
     {
         // We don't own the swapchain image handles, so we just remove our reference to it.
         swapchainImage.image.resetImageWeakReference();
-        swapchainImage.image.destroy(displayVk->getDevice());
+        swapchainImage.image.destroy(device);
 
         if (swapchainImage.imageView.valid())
         {
-            swapchainImage.imageView.destroy(displayVk->getDevice());
+            swapchainImage.imageView.destroy(device);
         }
 
         if (swapchainImage.framebuffer.valid())
         {
-            swapchainImage.framebuffer.destroy(displayVk->getDevice());
+            swapchainImage.framebuffer.destroy(device);
+        }
+
+        for (ImagePresentHistory &presentHistory : swapchainImage.presentHistory)
+        {
+            ASSERT(presentHistory.semaphore.valid());
+
+            mPresentSemaphoreRecycler.recycle(std::move(presentHistory.semaphore));
+            for (SwapchainCleanupData &oldSwapchain : presentHistory.oldSwapchains)
+            {
+                oldSwapchain.destroy(device, &mPresentSemaphoreRecycler);
+            }
+            presentHistory.oldSwapchains.clear();
         }
     }
 
@@ -861,8 +1023,11 @@
     SwapHistory &swap = mSwapHistory[mCurrentSwapHistoryIndex];
     {
         ANGLE_TRACE_EVENT0("gpu.angle", "WindowSurfaceVk::present: Throttle CPU");
-        ANGLE_TRY(swap.waitFence(contextVk));
-        swap.destroy(contextVk->getRenderer());
+        if (swap.sharedFence.isReferenced())
+        {
+            ANGLE_TRY(swap.waitFence(contextVk));
+            swap.destroy(contextVk->getRenderer());
+        }
     }
 
     SwapchainImage &image = mSwapchainImages[mCurrentSwapchainImageIndex];
@@ -899,14 +1064,34 @@
     }
     image.image.changeLayout(VK_IMAGE_ASPECT_COLOR_BIT, vk::ImageLayout::Present, swapCommands);
 
-    ANGLE_VK_TRY(contextVk, swap.presentImageSemaphore.init(contextVk->getDevice()));
+    // Knowing that the kSwapHistorySize'th submission ago has finished, we can know that the
+    // (kSwapHistorySize+1)'th present ago of this image is definitely finished and so its wait
+    // semaphore can be reused.  See doc/PresentSemaphores.md for details.
+    //
+    // This also means the swapchain(s) scheduled to be deleted at the same time can be deleted.
+    ImagePresentHistory &presentHistory = image.presentHistory[image.currentPresentHistoryIndex];
+    vk::Semaphore *presentSemaphore     = &presentHistory.semaphore;
+    ASSERT(presentSemaphore->valid());
 
-    ANGLE_TRY(contextVk->flushImpl(&swap.presentImageSemaphore));
+    for (SwapchainCleanupData &oldSwapchain : presentHistory.oldSwapchains)
+    {
+        oldSwapchain.destroy(contextVk->getDevice(), &mPresentSemaphoreRecycler);
+    }
+    presentHistory.oldSwapchains.clear();
+
+    // Schedule pending old swapchains to be destroyed at the same time the semaphore for this
+    // present can be destroyed.
+    presentHistory.oldSwapchains = std::move(mOldSwapchains);
+
+    image.currentPresentHistoryIndex =
+        (image.currentPresentHistoryIndex + 1) % image.presentHistory.size();
+
+    ANGLE_TRY(contextVk->flushImpl(presentSemaphore));
 
     VkPresentInfoKHR presentInfo   = {};
     presentInfo.sType              = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
     presentInfo.waitSemaphoreCount = 1;
-    presentInfo.pWaitSemaphores    = swap.presentImageSemaphore.ptr();
+    presentInfo.pWaitSemaphores    = presentSemaphore->ptr();
     presentInfo.swapchainCount     = 1;
     presentInfo.pSwapchains        = &mSwapchain;
     presentInfo.pImageIndices      = &mCurrentSwapchainImageIndex;
@@ -956,7 +1141,7 @@
 
     // If OUT_OF_DATE is returned, it's ok, we just need to recreate the swapchain before
     // continuing.
-    // If VK_SUBOPTIMAL_KHR is returned we it's because the device orientation changed and we should
+    // If VK_SUBOPTIMAL_KHR is returned it's because the device orientation changed and we should
     // recreate the swapchain with a new window orientation. We aren't quite ready for that so just
     // ignore for now.
     // TODO: Check for preRotation: http://anglebug.com/3502
diff --git a/src/libANGLE/renderer/vulkan/SurfaceVk.h b/src/libANGLE/renderer/vulkan/SurfaceVk.h
index 1198f5c..0949e3b 100644
--- a/src/libANGLE/renderer/vulkan/SurfaceVk.h
+++ b/src/libANGLE/renderer/vulkan/SurfaceVk.h
@@ -99,6 +99,78 @@
     AttachmentImage mDepthStencilAttachment;
 };
 
+// Data structures used in WindowSurfaceVk
+namespace impl
+{
+// The submission fence of the context used to throttle the CPU.
+struct SwapHistory : angle::NonCopyable
+{
+    SwapHistory();
+    SwapHistory(SwapHistory &&other) = delete;
+    SwapHistory &operator=(SwapHistory &&other) = delete;
+    ~SwapHistory();
+
+    void destroy(RendererVk *renderer);
+
+    angle::Result waitFence(ContextVk *contextVk);
+
+    // Fence associated with the last submitted work to render to this swapchain image.
+    vk::Shared<vk::Fence> sharedFence;
+};
+static constexpr size_t kSwapHistorySize = 2;
+
+// Old swapchain and associated present semaphores that need to be scheduled for destruction when
+// appropriate.
+struct SwapchainCleanupData : angle::NonCopyable
+{
+    SwapchainCleanupData();
+    SwapchainCleanupData(SwapchainCleanupData &&other);
+    ~SwapchainCleanupData();
+
+    void destroy(VkDevice device, vk::Recycler<vk::Semaphore> *semaphoreRecycler);
+
+    // The swapchain to be destroyed.
+    VkSwapchainKHR swapchain = VK_NULL_HANDLE;
+    // Any present semaphores that were pending destruction at the time the swapchain was
+    // recreated will be scheduled for destruction at the same time as the swapchain.
+    std::vector<vk::Semaphore> semaphores;
+};
+
+// A circular buffer per image stores the semaphores used for presenting that image.  Taking the
+// swap history into account, only the oldest semaphore is guaranteed to be no longer in use by the
+// presentation engine.  See doc/PresentSemaphores.md for details.
+//
+// Old swapchains are scheduled to be destroyed at the same time as the first semaphore used to
+// present an image of the new swapchain.  This is to ensure that the presentation engine is no
+// longer presenting an image from the old swapchain.
+struct ImagePresentHistory : angle::NonCopyable
+{
+    ImagePresentHistory();
+    ImagePresentHistory(ImagePresentHistory &&other);
+    ~ImagePresentHistory();
+
+    vk::Semaphore semaphore;
+    std::vector<SwapchainCleanupData> oldSwapchains;
+};
+
+// Swapchain images and their associated objects.
+struct SwapchainImage : angle::NonCopyable
+{
+    SwapchainImage();
+    SwapchainImage(SwapchainImage &&other);
+    ~SwapchainImage();
+
+    vk::ImageHelper image;
+    vk::ImageView imageView;
+    vk::Framebuffer framebuffer;
+
+    // A circular array of semaphores used for presenting this image.
+    static constexpr size_t kPresentHistorySize = kSwapHistorySize + 1;
+    std::array<ImagePresentHistory, kPresentHistorySize> presentHistory;
+    size_t currentPresentHistoryIndex = 0;
+};
+}  // namespace impl
+
 class WindowSurfaceVk : public SurfaceVk
 {
   public:
@@ -163,6 +235,7 @@
     angle::Result checkForOutOfDateSwapchain(ContextVk *contextVk,
                                              uint32_t swapHistoryIndex,
                                              bool presentOutOfDate);
+    angle::Result resizeSwapchainImages(vk::Context *context, uint32_t imageCount);
     void releaseSwapchainImages(ContextVk *contextVk);
     void destroySwapChainImages(DisplayVk *displayVk);
     VkResult nextSwapchainImage(vk::Context *context);
@@ -173,6 +246,8 @@
 
     angle::Result swapImpl(const gl::Context *context, EGLint *rects, EGLint n_rects);
 
+    angle::Result newPresentSemaphore(vk::Context *context, vk::Semaphore *semaphoreOut);
+
     bool isMultiSampled() const;
 
     VkSurfaceCapabilitiesKHR mSurfaceCaps;
@@ -186,46 +261,25 @@
     VkSurfaceTransformFlagBitsKHR mPreTransform;
     VkCompositeAlphaFlagBitsKHR mCompositeAlpha;
 
+    // A circular buffer that stores the submission fence of the context on every swap.  The CPU is
+    // throttled by waiting for the fence of the 2nd previous swap to be signaled.
+    std::array<impl::SwapHistory, impl::kSwapHistorySize> mSwapHistory;
+    size_t mCurrentSwapHistoryIndex;
+
+    // The previous swapchain which needs to be scheduled for destruction when appropriate.  This
+    // will be done when the first image of the current swapchain is presented.  If there were
+    // older swapchains pending destruction when the swapchain is recreated, they will accumulate
+    // and be destroyed with the previous swapchain.
+    //
+    // Note that if the user resizes the window such that the swapchain is recreated every frame,
+    // this array can grow indefinitely.
+    std::vector<impl::SwapchainCleanupData> mOldSwapchains;
+
+    std::vector<impl::SwapchainImage> mSwapchainImages;
+    vk::Semaphore mAcquireImageSemaphore;
     uint32_t mCurrentSwapchainImageIndex;
 
-    struct SwapchainImage : angle::NonCopyable
-    {
-        SwapchainImage();
-        SwapchainImage(SwapchainImage &&other);
-        ~SwapchainImage();
-
-        vk::ImageHelper image;
-        vk::ImageView imageView;
-        vk::Framebuffer framebuffer;
-    };
-
-    std::vector<SwapchainImage> mSwapchainImages;
-    vk::Semaphore mAcquireImageSemaphore;
-
-    // A circular buffer that stores the serial of the renderer on every swap.  The CPU is
-    // throttled by waiting for the 2nd previous serial to finish.  Old swapchains are scheduled to
-    // be destroyed at the same time.
-    struct SwapHistory : angle::NonCopyable
-    {
-        SwapHistory();
-        SwapHistory(SwapHistory &&other) = delete;
-        SwapHistory &operator=(SwapHistory &&other) = delete;
-        ~SwapHistory();
-
-        void destroy(RendererVk *renderer);
-
-        angle::Result waitFence(ContextVk *contextVk);
-
-        // Fence associated with the last submitted work to render to this swapchain image.
-        vk::Shared<vk::Fence> sharedFence;
-
-        vk::Semaphore presentImageSemaphore;
-
-        VkSwapchainKHR swapchain = VK_NULL_HANDLE;
-    };
-    static constexpr size_t kSwapHistorySize = 2;
-    std::array<SwapHistory, kSwapHistorySize> mSwapHistory;
-    size_t mCurrentSwapHistoryIndex;
+    vk::Recycler<vk::Semaphore> mPresentSemaphoreRecycler;
 
     // Depth/stencil image.  Possibly multisampled.
     vk::ImageHelper mDepthStencilImage;
diff --git a/src/libANGLE/renderer/vulkan/doc/PresentSemaphores.md b/src/libANGLE/renderer/vulkan/doc/PresentSemaphores.md
new file mode 100644
index 0000000..5adc2f7
--- /dev/null
+++ b/src/libANGLE/renderer/vulkan/doc/PresentSemaphores.md
@@ -0,0 +1,151 @@
+# Queue Present Wait Semaphore Management
+
+The following shorthand notations are used throughout this document:
+
+- PE: Presentation Engine
+- ANI: vkAcquireNextImageKHR
+- QS: vkQueueSubmit
+- QP: vkQueuePresentKHR
+- W: Wait
+- S: Signal
+- R: Render
+- P: Present
+- SN: Semaphore N
+- IN: Swapchain image N
+- FN: Fence N
+
+---
+
+## Introduction
+
+Vulkan requires the application (ANGLE in this case) to acquire swapchain images and queue them for
+presentation, synchronizing GPU submissions with semaphores.  A single frame looks like the
+following:
+
+    CPU: ANI  ... QS   ... QP
+         S:S1     W:S1     W:S2
+                  S:S2
+    GPU:          <------------ R ----------->
+     PE:                                      <-------- P ------>
+
+That is, the GPU starts rendering after submission, and the presentation is done when rendering is
+finished.  With multiple frames, the pipeline looks different based on present mode.  Let's focus on
+FIFO (the arguments in this document translate to all modes) with 3 images:
+
+    CPU: QS QP QS QP QS QP QS QP
+         I1 I1 I2 I2 I3 I3 I1 I1
+    GPU: <---- R I1 ----><---- R I2 ----><---- R I3 ----><---- R I1 ---->
+     PE:                 <----- P I1 -----><----- P I2 -----><----- P I3 -----><----- P I1 ----->
+
+First, an issue is evident here.  The CPU is submitting jobs and queuing images for presentation
+faster than the GPU can render them or the PE can view them.  This causes the length of the PE queue
+to grow indefinitely, resulting in larger and larger input lag.
+
+To address this issue, ANGLE paces the CPU such that the length of the PE queue is kept at a maximum
+of 1 image (i.e. one image is being presented, and another one is in queue):
+
+    CPU: QS   QS          W:F1 QS         W:F2 QS
+         I1   I2               I3              I1
+         S:F1 S:F2             S:F3            S:F4
+    GPU: <---- R I1 ----><---- R I2 ----><---- R I3 ----><---- R I1 ---->
+
+> Note: While this works in heavy applications (as the rendering time is almost as long as the frame
+> (i.e. present time), in which case pacing the submissions similarly paces the presentation), it's
+> not technically keeping the PE queue length 1, but rather below n+2 where n is the number of
+> swapchain images.
+>
+> To understand why, imagine a FIFO swapchain with 1000 images and submissions that are
+> infinitesimally short.  In this case, the CPU pacing is effectively a no-op (as the GPU instantly
+> finishes jobs) for the first 1002 submissions.  The 1003rd submission waits for F1001 (which uses
+> I1).  However, the 1001st submission will not start until the PE is finished presenting I1 (at the
+> next V-Sync).  The CPU then waits for V-Sync before the 1003rd submission.  The CPU waits for one
+> V-Sync for every subsequent submission, keeping the length of the queue 1002.
+> [`VK_GOOGLE_display_timing`][DisplayTimingGOOGLE] is likely a solution to this problem.
+
+Associated with each QP operation is a semaphore signaled by the preceding QS and waited on by the
+PE before the image can be presented.  Currently, there's no feedback from Vulkan (See [internal
+Khronos issue][VulkanIssue1060]) regarding _when_ the PE has actually finished waiting on the
+semaphore!  This means that the application cannot generally know when to destroy the corresponding
+semaphore.  However, taking ANGLE's CPU pacing into account, we are able to destroy (or rather
+reuse) semaphores when they are provably unused.
+
+The interested reader may follow the discussion in this abandoned [gerrit CL][CL1757018] for more
+background and ideas.
+
+[DisplayTimingGOOGLE]: https://www.khronos.org/registry/vulkan/specs/1.1-extensions/man/html/VK_GOOGLE_display_timing.html
+[VulkanIssue1060]: https://gitlab.khronos.org/vulkan/vulkan/issues/1060
+[CL1757018]: https://chromium-review.googlesource.com/c/angle/angle/+/1757018
+
+## Determining When a QP Semaphore is Waited On
+
+Let's combine the above diagrams with all the details:
+
+    CPU: ANI   | QS    | QP    | ANI   | QS    | QP    | ANI   | W:F1 | QS    | QP    | ANI   | W:F2 | QS    | QP
+         I1    | I1    | I1    | I2    | I2    | I2    | I3    |      | I3    | I3    | I1    |      | I1    | I1
+         S:SA1 | W:SA1 |       | S:SA2 | W:SA2 |       | S:SA3 |      | W:SA3 |       | S:SA4 |      | W:SA4 |
+               | S:SP1 | W:SP1 |       | S:SP2 | W:SP2 |       |      | S:SP3 | W:SP3 |       |      | S:SP4 | W:SP4
+               | S:F1  |       |       | S:F2  |       |       |      | S:F3  |       |       |      | S:F4  |
+
+Let's focus only on sequences that return the same image:
+
+    CPU: ANI   | W:F(X-2) | QS    | QP    | ... | ANI   | W:F(Y-2) | QS    | QP
+         I1    |          | I1    | I1    |     | I1    |          | I1    | I1
+         S:SAX |          | W:SAX |       |     | S:SAY |          | W:SAY |
+               |          | S:SPX | W:SPX |     |       |          | S:SPY | W:SPY
+               |          | S:FX  |       |     |       |          | S:FY  |
+
+Note that X and Y are arbitrarily distanced (including possibly being sequential).
+
+Say we are at frame Y+2.  There's therefore a wait on FY.  The following holds:
+
+    FY is signaled
+    => SAY is signaled
+    => Previous presentation of I1 (corresponding to SPX) is finished
+    => SPX is waited
+
+At this point, we can destroy SPX.  In other words, in frame Y+2, we can destroy SPX (note that 2 is
+the number of frames the CPU pacing code uses).  If frame Y+1 is not using I1, this means the
+history of present semaphores for I1 would be `{SPX, SPY}` and we can destroy the oldest semaphore
+in this list.  If frame Y+1 is also using I1, we should still destroy SPX in frame Y+2, but the
+history of the present semaphores for I1 would be `{SPX, SPY, SP(Y+1)}`.
+
+In the Vulkan backend, we simplify destruction of semaphores by always keeping a history of 3
+present semaphores for each image (again, 3 is H+1 where H is the swap history size used in CPU
+pacing) and always reusing (instead of destroying) the oldest semaphore of the image that is about
+to be presented.
+
+To summarize, we use the completion of a submission using an image to prove that the *previous*
+presentation of that image has finished.
+
+## Swapchain recreation
+
+When recreating the swapchain, all images are freed and new ones are created, possibly with a
+different count and present mode.  For the old swapchain, we can no longer rely on the completion of
+a future submission to know when a previous presentation is done, as there won't be any more
+submissions using images from the old swapchain.
+
+> For example, imagine the old swapchain was created in FIFO mode, and one image is being presented
+> until the next V-Sync.  Furthermore, imagine the new swapchain is created in MAILBOX mode.  Since
+> the old swapchain's image will remain presented until V-Sync, the new MAILBOX swapchain can
+> perform an arbitrarily large number of (throw-away) presentations.  The old swapchain (and its
+> associated present semaphores) cannot be destroyed until V-Sync; a signal that's not captured by
+> Vulkan.
+
+ANGLE resolves this issue by deferring the destruction of the old swapchain and its remaining
+present semaphores to the time when the semaphore corresponding to the first present of the new
+swapchain can be destroyed.  In the example in the previous section, if SPX is the present semaphore
+of the first QP done on the new swapchain, at frame Y+2, when we know SPX can be destroyed, we know
+that the first image of the new swapchain has already been presented.  This proves that all previous
+presentations of the old swapchain have finished.
+
+> Note: the swapchain can potentially be destroyed much earlier, but with no feedback from the
+> presentation engine, we cannot know that.  This delay means that the swapchain could be recreated
+> while there are pending old swapchains to be destroyed.  The destruction of both old swapchains
+> must now be deferred to when the first present of the new swapchain has finished.  If an
+> application resizes the window constantly and at a high rate, ANGLE would keep accumulating old
+> swapchains and not free them until it stops.  While a user will likely not be able to do this (as
+> the rate of window system events is lower than the framerate), this can be programmatically done
+> (as indeed done in EGL dEQP tests).  Nvidia for example fails creation of a new swapchain if there
+> are already 20 allocated (on desktop, or less than ten on Quadro).  If the backlog of old
+> swapchains gets larger than a threshold, ANGLE calls `vkQueueWaitIdle()` and destroys the
+> swapchains.
diff --git a/src/libANGLE/renderer/vulkan/vk_utils.h b/src/libANGLE/renderer/vulkan/vk_utils.h
index c9f8bef..f4f29b2 100644
--- a/src/libANGLE/renderer/vulkan/vk_utils.h
+++ b/src/libANGLE/renderer/vulkan/vk_utils.h
@@ -493,7 +493,7 @@
             if (!mRefCounted->isReferenced())
             {
                 ASSERT(mRefCounted->get().valid());
-                recycler->recyle(std::move(mRefCounted->get()));
+                recycler->recycle(std::move(mRefCounted->get()));
                 SafeDelete(mRefCounted);
             }
 
@@ -530,9 +530,9 @@
   public:
     Recycler() = default;
 
-    void recyle(T &&garbageObject) { mObjectFreeList.emplace_back(std::move(garbageObject)); }
+    void recycle(T &&garbageObject) { mObjectFreeList.emplace_back(std::move(garbageObject)); }
 
-    void fetch(VkDevice device, T *outObject)
+    void fetch(T *outObject)
     {
         ASSERT(!empty());
         *outObject = std::move(mObjectFreeList.back());