Merge tag 'pull-riscv-to-apply-20230207' of https://github.com/alistair23/qemu into staging
Third RISC-V PR for QEMU 8.0
* Update disas for xnor/orn/andn and slli.uw
* Update opentitan IRQs
* Fix rom code when Zicsr is disabled
* Update VS timer whenever htimedelta changes
* A collection of fixes for virtualisation
* Set tval for triggered watchpoints
* Cleanups for board and FDT creation
* Add support for the T-Head vendor extensions
* A fix for virtual instr exception
* Fix ctzw behavior
* Fix SBI getchar handler for KVM
# -----BEGIN PGP SIGNATURE-----
#
# iQEzBAABCAAdFiEE9sSsRtSTSGjTuM6PIeENKd+XcFQFAmPh+QQACgkQIeENKd+X
# cFSdHwf9HQkO8/zTnWUFCbLVQAV3RB32i6E26uNC4+fQBpcqRWAel2PMYGi6x0H/
# fU43B5YpS7Ddfcc1ql6pJlisqsjkIsQBYjOeUfiMozLIR0dvX14jgUMxc0A8e8sZ
# uv3iRXjkVz/7bEWIdflPcaXgXh74HcQCPgMDsteluZYaz4yRGP6SvI9UJHqe4tjI
# HbiYmP1pcWFGDsAlKx7KbHfH/v9CE03odN3MLzwdsGYekUaFkdLlj7hmyamnqbAh
# OR1y29i2od/8uJMeIu5R8rnGdWoWetCZWP0z2xv3rrZuUzMJ6VXHcdWsY9ycomJs
# rYA0NB/ezmnKX+QAf+8TzW2nybG+5Q==
# =XLpK
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 07 Feb 2023 07:08:52 GMT
# gpg: using RSA key F6C4AC46D4934868D3B8CE8F21E10D29DF977054
# gpg: Good signature from "Alistair Francis <alistair@alistair23.me>" [full]
# Primary key fingerprint: F6C4 AC46 D493 4868 D3B8 CE8F 21E1 0D29 DF97 7054
* tag 'pull-riscv-to-apply-20230207' of https://github.com/alistair23/qemu: (32 commits)
hw/riscv: virt: Simplify virt_{get,set}_aclint()
target/riscv: fix SBI getchar handler for KVM
target/riscv: fix ctzw behavior
target/riscv: fix for virtual instr exception
target/riscv: add a MAINTAINERS entry for XThead* extension support
RISC-V: Adding XTheadFmv ISA extension
RISC-V: Add initial support for T-Head C906
RISC-V: Set minimum priv version for Zfh to 1.11
RISC-V: Adding T-Head FMemIdx extension
RISC-V: Adding T-Head MemIdx extension
RISC-V: Adding T-Head MemPair extension
RISC-V: Adding T-Head multiply-accumulate instructions
RISC-V: Adding XTheadCondMov ISA extension
RISC-V: Adding XTheadBs ISA extension
RISC-V: Adding XTheadBb ISA extension
RISC-V: Adding XTheadBa ISA extension
RISC-V: Adding XTheadSync ISA extension
RISC-V: Adding XTheadCmo ISA extension
hw/riscv: change riscv_compute_fdt_addr() semantics
hw/riscv: split fdt address calculation from fdt load
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 29ffe50..c226543 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -283,11 +283,11 @@
if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) {
ret = qio_channel_readv_full(s->ioc, &iov, 1,
&msgfds, &msgfds_num,
- NULL);
+ 0, NULL);
} else {
ret = qio_channel_readv_full(s->ioc, &iov, 1,
NULL, NULL,
- NULL);
+ 0, NULL);
}
if (msgfds_num) {
diff --git a/configs/devices/x86_64-softmmu/x86_64-quintela-devices.mak b/configs/devices/x86_64-softmmu/x86_64-quintela-devices.mak
new file mode 100644
index 0000000..ee2bb8c
--- /dev/null
+++ b/configs/devices/x86_64-softmmu/x86_64-quintela-devices.mak
@@ -0,0 +1,7 @@
+# Boards:
+#
+CONFIG_ISAPC=n
+CONFIG_I440FX=n
+CONFIG_Q35=n
+CONFIG_MICROVM=y
+
diff --git a/configs/devices/x86_64-softmmu/x86_64-quintela2-devices.mak b/configs/devices/x86_64-softmmu/x86_64-quintela2-devices.mak
new file mode 100644
index 0000000..f7e4dae
--- /dev/null
+++ b/configs/devices/x86_64-softmmu/x86_64-quintela2-devices.mak
@@ -0,0 +1,6 @@
+# Boards:
+#
+CONFIG_ISAPC=y
+CONFIG_I440FX=y
+CONFIG_Q35=y
+CONFIG_MICROVM=y
diff --git a/docs/devel/migration.rst b/docs/devel/migration.rst
index 3e9656d..6f65c23 100644
--- a/docs/devel/migration.rst
+++ b/docs/devel/migration.rst
@@ -482,15 +482,17 @@
- A ``load_setup`` function that initialises the data structures on the
destination.
- - A ``save_live_pending`` function that is called repeatedly and must
- indicate how much more data the iterative data must save. The core
- migration code will use this to determine when to pause the CPUs
- and complete the migration.
+ - A ``state_pending_exact`` function that indicates how much more
+ data we must save. The core migration code will use this to
+ determine when to pause the CPUs and complete the migration.
- - A ``save_live_iterate`` function (called after ``save_live_pending``
- when there is significant data still to be sent). It should send
- a chunk of data until the point that stream bandwidth limits tell it
- to stop. Each call generates one section.
+ - A ``state_pending_estimate`` function that indicates how much more
+ data we must save. When the estimated amount is smaller than the
+ threshold, we call ``state_pending_exact``.
+
+ - A ``save_live_iterate`` function should send a chunk of data until
+ the point that stream bandwidth limits tell it to stop. Each call
+ generates one section.
- A ``save_live_complete_precopy`` function that must transmit the
last section for the device containing any remaining data.
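For illustration only (not part of this patch): a minimal sketch of how a device might wire up the split callbacks, modeled on the s390-stattrib and block-migration conversions further down in this series; the device and helper names are hypothetical.

    /* Sketch only: a device whose cheap estimate is already exact can point
     * both hooks at the same function, as several devices in this series do.
     * MyDevState and mydev_remaining_bytes() are hypothetical names. */
    static void mydev_state_pending(void *opaque,
                                    uint64_t *res_precopy_only,
                                    uint64_t *res_compatible,
                                    uint64_t *res_postcopy_only)
    {
        MyDevState *s = opaque;

        *res_precopy_only += mydev_remaining_bytes(s);
    }

    static SaveVMHandlers savevm_mydev_handlers = {
        .save_setup = mydev_save_setup,
        .save_live_iterate = mydev_save_iterate,
        .save_live_complete_precopy = mydev_save_complete,
        .state_pending_exact = mydev_state_pending,
        .state_pending_estimate = mydev_state_pending,
    };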
diff --git a/docs/devel/vfio-migration.rst b/docs/devel/vfio-migration.rst
index 9ff6163..673057c 100644
--- a/docs/devel/vfio-migration.rst
+++ b/docs/devel/vfio-migration.rst
@@ -28,7 +28,7 @@
* A ``load_setup`` function that sets up the migration region on the
destination and sets _RESUMING flag in the VFIO device state.
-* A ``save_live_pending`` function that reads pending_bytes from the vendor
+* A ``state_pending_exact`` function that reads pending_bytes from the vendor
driver, which indicates the amount of data that the vendor driver has yet to
save for the VFIO device.
@@ -114,7 +114,7 @@
(RUNNING, _SETUP, _RUNNING|_SAVING)
|
(RUNNING, _ACTIVE, _RUNNING|_SAVING)
- If device is active, get pending_bytes by .save_live_pending()
+ If device is active, get pending_bytes by .state_pending_exact()
If total pending_bytes >= threshold_size, call .save_live_iterate()
Data of VFIO device for pre-copy phase is copied
Iterate till total pending bytes converge and are less than threshold
diff --git a/hw/core/machine.c b/hw/core/machine.c
index f7761ba..b5cd42c 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -41,7 +41,9 @@
#include "hw/virtio/virtio-pci.h"
#include "qom/object_interfaces.h"
-GlobalProperty hw_compat_7_2[] = {};
+GlobalProperty hw_compat_7_2[] = {
+ { "virtio-mem", "x-early-migration", "false" },
+};
const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2);
GlobalProperty hw_compat_7_1[] = {
diff --git a/hw/misc/macio/cuda.c b/hw/misc/macio/cuda.c
index 853e88bf..7208b90 100644
--- a/hw/misc/macio/cuda.c
+++ b/hw/misc/macio/cuda.c
@@ -27,8 +27,6 @@
#include "hw/irq.h"
#include "hw/qdev-properties.h"
#include "migration/vmstate.h"
-#include "hw/input/adb.h"
-#include "hw/misc/mos6522.h"
#include "hw/misc/macio/cuda.h"
#include "qapi/error.h"
#include "qemu/timer.h"
diff --git a/hw/misc/macio/macio.c b/hw/misc/macio/macio.c
index 08dbdd7..265c0bb 100644
--- a/hw/misc/macio/macio.c
+++ b/hw/misc/macio/macio.c
@@ -53,10 +53,8 @@
*/
static void macio_escc_legacy_setup(MacIOState *s)
{
- ESCCState *escc = ESCC(&s->escc);
- SysBusDevice *sbd = SYS_BUS_DEVICE(escc);
+ SysBusDevice *sbd = SYS_BUS_DEVICE(&s->escc);
MemoryRegion *escc_legacy = g_new(MemoryRegion, 1);
- MemoryRegion *bar = &s->bar;
int i;
static const int maps[] = {
0x00, 0x00, /* Command B */
@@ -80,30 +78,29 @@
memory_region_add_subregion(escc_legacy, maps[i], port);
}
- memory_region_add_subregion(bar, 0x12000, escc_legacy);
+ memory_region_add_subregion(&s->bar, 0x12000, escc_legacy);
}
static void macio_bar_setup(MacIOState *s)
{
- ESCCState *escc = ESCC(&s->escc);
- SysBusDevice *sbd = SYS_BUS_DEVICE(escc);
- MemoryRegion *bar = &s->bar;
+ SysBusDevice *sbd = SYS_BUS_DEVICE(&s->escc);
+ MemoryRegion *bar = sysbus_mmio_get_region(sbd, 0);
- memory_region_add_subregion(bar, 0x13000, sysbus_mmio_get_region(sbd, 0));
+ memory_region_add_subregion(&s->bar, 0x13000, bar);
macio_escc_legacy_setup(s);
}
-static void macio_common_realize(PCIDevice *d, Error **errp)
+static bool macio_common_realize(PCIDevice *d, Error **errp)
{
MacIOState *s = MACIO(d);
- SysBusDevice *sysbus_dev;
+ SysBusDevice *sbd;
if (!qdev_realize(DEVICE(&s->dbdma), BUS(&s->macio_bus), errp)) {
- return;
+ return false;
}
- sysbus_dev = SYS_BUS_DEVICE(&s->dbdma);
+ sbd = SYS_BUS_DEVICE(&s->dbdma);
memory_region_add_subregion(&s->bar, 0x08000,
- sysbus_mmio_get_region(sysbus_dev, 0));
+ sysbus_mmio_get_region(sbd, 0));
qdev_prop_set_uint32(DEVICE(&s->escc), "disabled", 0);
qdev_prop_set_uint32(DEVICE(&s->escc), "frequency", ESCC_CLOCK);
@@ -111,28 +108,29 @@
qdev_prop_set_uint32(DEVICE(&s->escc), "chnBtype", escc_serial);
qdev_prop_set_uint32(DEVICE(&s->escc), "chnAtype", escc_serial);
if (!qdev_realize(DEVICE(&s->escc), BUS(&s->macio_bus), errp)) {
- return;
+ return false;
}
macio_bar_setup(s);
pci_register_bar(d, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &s->bar);
+
+ return true;
}
-static void macio_realize_ide(MacIOState *s, MACIOIDEState *ide,
+static bool macio_realize_ide(MacIOState *s, MACIOIDEState *ide,
qemu_irq irq0, qemu_irq irq1, int dmaid,
Error **errp)
{
- SysBusDevice *sysbus_dev;
+ SysBusDevice *sbd = SYS_BUS_DEVICE(ide);
- sysbus_dev = SYS_BUS_DEVICE(ide);
- sysbus_connect_irq(sysbus_dev, 0, irq0);
- sysbus_connect_irq(sysbus_dev, 1, irq1);
+ sysbus_connect_irq(sbd, 0, irq0);
+ sysbus_connect_irq(sbd, 1, irq1);
qdev_prop_set_uint32(DEVICE(ide), "channel", dmaid);
object_property_set_link(OBJECT(ide), "dbdma", OBJECT(&s->dbdma),
&error_abort);
macio_ide_register_dma(ide);
- qdev_realize(DEVICE(ide), BUS(&s->macio_bus), errp);
+ return qdev_realize(DEVICE(ide), BUS(&s->macio_bus), errp);
}
static void macio_oldworld_realize(PCIDevice *d, Error **errp)
@@ -140,12 +138,9 @@
MacIOState *s = MACIO(d);
OldWorldMacIOState *os = OLDWORLD_MACIO(d);
DeviceState *pic_dev = DEVICE(&os->pic);
- Error *err = NULL;
- SysBusDevice *sysbus_dev;
+ SysBusDevice *sbd;
- macio_common_realize(d, &err);
- if (err) {
- error_propagate(errp, err);
+ if (!macio_common_realize(d, errp)) {
return;
}
@@ -153,51 +148,44 @@
if (!qdev_realize(DEVICE(&os->pic), BUS(&s->macio_bus), errp)) {
return;
}
- sysbus_dev = SYS_BUS_DEVICE(&os->pic);
+ sbd = SYS_BUS_DEVICE(&os->pic);
memory_region_add_subregion(&s->bar, 0x0,
- sysbus_mmio_get_region(sysbus_dev, 0));
+ sysbus_mmio_get_region(sbd, 0));
qdev_prop_set_uint64(DEVICE(&s->cuda), "timebase-frequency",
s->frequency);
if (!qdev_realize(DEVICE(&s->cuda), BUS(&s->macio_bus), errp)) {
return;
}
- sysbus_dev = SYS_BUS_DEVICE(&s->cuda);
+ sbd = SYS_BUS_DEVICE(&s->cuda);
memory_region_add_subregion(&s->bar, 0x16000,
- sysbus_mmio_get_region(sysbus_dev, 0));
- sysbus_connect_irq(sysbus_dev, 0, qdev_get_gpio_in(pic_dev,
- OLDWORLD_CUDA_IRQ));
+ sysbus_mmio_get_region(sbd, 0));
+ sysbus_connect_irq(sbd, 0, qdev_get_gpio_in(pic_dev, OLDWORLD_CUDA_IRQ));
- sysbus_dev = SYS_BUS_DEVICE(&s->escc);
- sysbus_connect_irq(sysbus_dev, 0, qdev_get_gpio_in(pic_dev,
- OLDWORLD_ESCCB_IRQ));
- sysbus_connect_irq(sysbus_dev, 1, qdev_get_gpio_in(pic_dev,
- OLDWORLD_ESCCA_IRQ));
+ sbd = SYS_BUS_DEVICE(&s->escc);
+ sysbus_connect_irq(sbd, 0, qdev_get_gpio_in(pic_dev, OLDWORLD_ESCCB_IRQ));
+ sysbus_connect_irq(sbd, 1, qdev_get_gpio_in(pic_dev, OLDWORLD_ESCCA_IRQ));
if (!qdev_realize(DEVICE(&os->nvram), BUS(&s->macio_bus), errp)) {
return;
}
- sysbus_dev = SYS_BUS_DEVICE(&os->nvram);
+ sbd = SYS_BUS_DEVICE(&os->nvram);
memory_region_add_subregion(&s->bar, 0x60000,
- sysbus_mmio_get_region(sysbus_dev, 0));
+ sysbus_mmio_get_region(sbd, 0));
pmac_format_nvram_partition(&os->nvram, os->nvram.size);
/* IDE buses */
- macio_realize_ide(s, &os->ide[0],
- qdev_get_gpio_in(pic_dev, OLDWORLD_IDE0_IRQ),
- qdev_get_gpio_in(pic_dev, OLDWORLD_IDE0_DMA_IRQ),
- 0x16, &err);
- if (err) {
- error_propagate(errp, err);
+ if (!macio_realize_ide(s, &os->ide[0],
+ qdev_get_gpio_in(pic_dev, OLDWORLD_IDE0_IRQ),
+ qdev_get_gpio_in(pic_dev, OLDWORLD_IDE0_DMA_IRQ),
+ 0x16, errp)) {
return;
}
- macio_realize_ide(s, &os->ide[1],
- qdev_get_gpio_in(pic_dev, OLDWORLD_IDE1_IRQ),
- qdev_get_gpio_in(pic_dev, OLDWORLD_IDE1_DMA_IRQ),
- 0x1a, &err);
- if (err) {
- error_propagate(errp, err);
+ if (!macio_realize_ide(s, &os->ide[1],
+ qdev_get_gpio_in(pic_dev, OLDWORLD_IDE1_IRQ),
+ qdev_get_gpio_in(pic_dev, OLDWORLD_IDE1_DMA_IRQ),
+ 0x1a, errp)) {
return;
}
}
@@ -220,11 +208,11 @@
DeviceState *dev;
int i;
- object_initialize_child(OBJECT(s), "pic", &os->pic, TYPE_HEATHROW);
+ object_initialize_child(obj, "pic", &os->pic, TYPE_HEATHROW);
- object_initialize_child(OBJECT(s), "cuda", &s->cuda, TYPE_CUDA);
+ object_initialize_child(obj, "cuda", &s->cuda, TYPE_CUDA);
- object_initialize_child(OBJECT(s), "nvram", &os->nvram, TYPE_MACIO_NVRAM);
+ object_initialize_child(obj, "nvram", &os->nvram, TYPE_MACIO_NVRAM);
dev = DEVICE(&os->nvram);
qdev_prop_set_uint32(dev, "size", MACIO_NVRAM_SIZE);
qdev_prop_set_uint32(dev, "it_shift", 4);
@@ -273,45 +261,36 @@
MacIOState *s = MACIO(d);
NewWorldMacIOState *ns = NEWWORLD_MACIO(d);
DeviceState *pic_dev = DEVICE(&ns->pic);
- Error *err = NULL;
- SysBusDevice *sysbus_dev;
+ SysBusDevice *sbd;
MemoryRegion *timer_memory = NULL;
- macio_common_realize(d, &err);
- if (err) {
- error_propagate(errp, err);
+ if (!macio_common_realize(d, errp)) {
return;
}
/* OpenPIC */
qdev_prop_set_uint32(pic_dev, "model", OPENPIC_MODEL_KEYLARGO);
- sysbus_dev = SYS_BUS_DEVICE(&ns->pic);
- sysbus_realize_and_unref(sysbus_dev, &error_fatal);
+ sbd = SYS_BUS_DEVICE(&ns->pic);
+ sysbus_realize_and_unref(sbd, &error_fatal);
memory_region_add_subregion(&s->bar, 0x40000,
- sysbus_mmio_get_region(sysbus_dev, 0));
+ sysbus_mmio_get_region(sbd, 0));
- sysbus_dev = SYS_BUS_DEVICE(&s->escc);
- sysbus_connect_irq(sysbus_dev, 0, qdev_get_gpio_in(pic_dev,
- NEWWORLD_ESCCB_IRQ));
- sysbus_connect_irq(sysbus_dev, 1, qdev_get_gpio_in(pic_dev,
- NEWWORLD_ESCCA_IRQ));
+ sbd = SYS_BUS_DEVICE(&s->escc);
+ sysbus_connect_irq(sbd, 0, qdev_get_gpio_in(pic_dev, NEWWORLD_ESCCB_IRQ));
+ sysbus_connect_irq(sbd, 1, qdev_get_gpio_in(pic_dev, NEWWORLD_ESCCA_IRQ));
/* IDE buses */
- macio_realize_ide(s, &ns->ide[0],
- qdev_get_gpio_in(pic_dev, NEWWORLD_IDE0_IRQ),
- qdev_get_gpio_in(pic_dev, NEWWORLD_IDE0_DMA_IRQ),
- 0x16, &err);
- if (err) {
- error_propagate(errp, err);
+ if (!macio_realize_ide(s, &ns->ide[0],
+ qdev_get_gpio_in(pic_dev, NEWWORLD_IDE0_IRQ),
+ qdev_get_gpio_in(pic_dev, NEWWORLD_IDE0_DMA_IRQ),
+ 0x16, errp)) {
return;
}
- macio_realize_ide(s, &ns->ide[1],
- qdev_get_gpio_in(pic_dev, NEWWORLD_IDE1_IRQ),
- qdev_get_gpio_in(pic_dev, NEWWORLD_IDE1_DMA_IRQ),
- 0x1a, &err);
- if (err) {
- error_propagate(errp, err);
+ if (!macio_realize_ide(s, &ns->ide[1],
+ qdev_get_gpio_in(pic_dev, NEWWORLD_IDE1_IRQ),
+ qdev_get_gpio_in(pic_dev, NEWWORLD_IDE1_DMA_IRQ),
+ 0x1a, errp)) {
return;
}
@@ -326,27 +305,26 @@
if (!qdev_realize(DEVICE(&ns->gpio), BUS(&s->macio_bus), errp)) {
return;
}
- sysbus_dev = SYS_BUS_DEVICE(&ns->gpio);
- sysbus_connect_irq(sysbus_dev, 1, qdev_get_gpio_in(pic_dev,
+ sbd = SYS_BUS_DEVICE(&ns->gpio);
+ sysbus_connect_irq(sbd, 1, qdev_get_gpio_in(pic_dev,
NEWWORLD_EXTING_GPIO1));
- sysbus_connect_irq(sysbus_dev, 9, qdev_get_gpio_in(pic_dev,
+ sysbus_connect_irq(sbd, 9, qdev_get_gpio_in(pic_dev,
NEWWORLD_EXTING_GPIO9));
memory_region_add_subregion(&s->bar, 0x50,
- sysbus_mmio_get_region(sysbus_dev, 0));
+ sysbus_mmio_get_region(sbd, 0));
/* PMU */
object_initialize_child(OBJECT(s), "pmu", &s->pmu, TYPE_VIA_PMU);
- object_property_set_link(OBJECT(&s->pmu), "gpio", OBJECT(sysbus_dev),
+ object_property_set_link(OBJECT(&s->pmu), "gpio", OBJECT(sbd),
&error_abort);
qdev_prop_set_bit(DEVICE(&s->pmu), "has-adb", ns->has_adb);
if (!qdev_realize(DEVICE(&s->pmu), BUS(&s->macio_bus), errp)) {
return;
}
- sysbus_dev = SYS_BUS_DEVICE(&s->pmu);
- sysbus_connect_irq(sysbus_dev, 0, qdev_get_gpio_in(pic_dev,
- NEWWORLD_PMU_IRQ));
+ sbd = SYS_BUS_DEVICE(&s->pmu);
+ sysbus_connect_irq(sbd, 0, qdev_get_gpio_in(pic_dev, NEWWORLD_PMU_IRQ));
memory_region_add_subregion(&s->bar, 0x16000,
- sysbus_mmio_get_region(sysbus_dev, 0));
+ sysbus_mmio_get_region(sbd, 0));
} else {
object_unparent(OBJECT(&ns->gpio));
@@ -358,11 +336,10 @@
if (!qdev_realize(DEVICE(&s->cuda), BUS(&s->macio_bus), errp)) {
return;
}
- sysbus_dev = SYS_BUS_DEVICE(&s->cuda);
- sysbus_connect_irq(sysbus_dev, 0, qdev_get_gpio_in(pic_dev,
- NEWWORLD_CUDA_IRQ));
+ sbd = SYS_BUS_DEVICE(&s->cuda);
+ sysbus_connect_irq(sbd, 0, qdev_get_gpio_in(pic_dev, NEWWORLD_CUDA_IRQ));
memory_region_add_subregion(&s->bar, 0x16000,
- sysbus_mmio_get_region(sysbus_dev, 0));
+ sysbus_mmio_get_region(sbd, 0));
}
}
@@ -372,9 +349,9 @@
NewWorldMacIOState *ns = NEWWORLD_MACIO(obj);
int i;
- object_initialize_child(OBJECT(s), "pic", &ns->pic, TYPE_OPENPIC);
+ object_initialize_child(obj, "pic", &ns->pic, TYPE_OPENPIC);
- object_initialize_child(OBJECT(s), "gpio", &ns->gpio, TYPE_MACIO_GPIO);
+ object_initialize_child(obj, "gpio", &ns->gpio, TYPE_MACIO_GPIO);
for (i = 0; i < 2; i++) {
macio_init_ide(s, &ns->ide[i], i);
@@ -390,9 +367,9 @@
qbus_init(&s->macio_bus, sizeof(s->macio_bus), TYPE_MACIO_BUS,
DEVICE(obj), "macio.0");
- object_initialize_child(OBJECT(s), "dbdma", &s->dbdma, TYPE_MAC_DBDMA);
+ object_initialize_child(obj, "dbdma", &s->dbdma, TYPE_MAC_DBDMA);
- object_initialize_child(OBJECT(s), "escc", &s->escc, TYPE_ESCC);
+ object_initialize_child(obj, "escc", &s->escc, TYPE_ESCC);
}
static const VMStateDescription vmstate_macio_oldworld = {
diff --git a/hw/misc/macio/pmu.c b/hw/misc/macio/pmu.c
index 97ef8c7..8575bc1 100644
--- a/hw/misc/macio/pmu.c
+++ b/hw/misc/macio/pmu.c
@@ -31,10 +31,7 @@
#include "qemu/osdep.h"
#include "hw/qdev-properties.h"
#include "migration/vmstate.h"
-#include "hw/input/adb.h"
#include "hw/irq.h"
-#include "hw/misc/mos6522.h"
-#include "hw/misc/macio/gpio.h"
#include "hw/misc/macio/pmu.h"
#include "qapi/error.h"
#include "qemu/timer.h"
diff --git a/hw/misc/mos6522.c b/hw/misc/mos6522.c
index 0ed6311..d6ba47b 100644
--- a/hw/misc/mos6522.c
+++ b/hw/misc/mos6522.c
@@ -25,7 +25,6 @@
*/
#include "qemu/osdep.h"
-#include "hw/input/adb.h"
#include "hw/irq.h"
#include "hw/misc/mos6522.h"
#include "hw/qdev-properties.h"
diff --git a/hw/nvram/mac_nvram.c b/hw/nvram/mac_nvram.c
index 3d9ddda..810e84f 100644
--- a/hw/nvram/mac_nvram.c
+++ b/hw/nvram/mac_nvram.c
@@ -24,9 +24,12 @@
*/
#include "qemu/osdep.h"
+#include "qapi/error.h"
#include "hw/nvram/chrp_nvram.h"
#include "hw/nvram/mac_nvram.h"
#include "hw/qdev-properties.h"
+#include "hw/qdev-properties-system.h"
+#include "sysemu/block-backend.h"
#include "migration/vmstate.h"
#include "qemu/cutils.h"
#include "qemu/module.h"
@@ -44,6 +47,9 @@
addr = (addr >> s->it_shift) & (s->size - 1);
trace_macio_nvram_write(addr, value);
s->data[addr] = value;
+ if (s->blk) {
+ blk_pwrite(s->blk, addr, 1, &s->data[addr], 0);
+ }
}
static uint64_t macio_nvram_readb(void *opaque, hwaddr addr,
@@ -91,6 +97,27 @@
s->data = g_malloc0(s->size);
+ if (s->blk) {
+ int64_t len = blk_getlength(s->blk);
+ if (len < 0) {
+ error_setg_errno(errp, -len,
+ "could not get length of nvram backing image");
+ return;
+ } else if (len != s->size) {
+ error_setg_errno(errp, -len,
+ "invalid size nvram backing image");
+ return;
+ }
+ if (blk_set_perm(s->blk, BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE,
+ BLK_PERM_ALL, errp) < 0) {
+ return;
+ }
+ if (blk_pread(s->blk, 0, s->size, s->data, 0) < 0) {
+ error_setg(errp, "can't read-nvram contents");
+ return;
+ }
+ }
+
memory_region_init_io(&s->mem, OBJECT(s), &macio_nvram_ops, s,
"macio-nvram", s->size << s->it_shift);
sysbus_init_mmio(d, &s->mem);
@@ -106,6 +133,7 @@
static Property macio_nvram_properties[] = {
DEFINE_PROP_UINT32("size", MacIONVRAMState, size, 0),
DEFINE_PROP_UINT32("it_shift", MacIONVRAMState, it_shift, 0),
+ DEFINE_PROP_DRIVE("drive", MacIONVRAMState, blk),
DEFINE_PROP_END_OF_LIST()
};
diff --git a/hw/ppc/mac_newworld.c b/hw/ppc/mac_newworld.c
index 601ea51..460c14b 100644
--- a/hw/ppc/mac_newworld.c
+++ b/hw/ppc/mac_newworld.c
@@ -466,8 +466,7 @@
fw_cfg = FW_CFG(dev);
qdev_prop_set_uint32(dev, "data_width", 1);
qdev_prop_set_bit(dev, "dma_enabled", false);
- object_property_add_child(OBJECT(qdev_get_machine()), TYPE_FW_CFG,
- OBJECT(fw_cfg));
+ object_property_add_child(OBJECT(machine), TYPE_FW_CFG, OBJECT(fw_cfg));
s = SYS_BUS_DEVICE(dev);
sysbus_realize_and_unref(s, &error_fatal);
sysbus_mmio_map(s, 0, CFG_ADDR);
diff --git a/hw/ppc/mac_oldworld.c b/hw/ppc/mac_oldworld.c
index 558c639..2e4cc3f 100644
--- a/hw/ppc/mac_oldworld.c
+++ b/hw/ppc/mac_oldworld.c
@@ -102,7 +102,7 @@
DeviceState *dev, *pic_dev, *grackle_dev;
BusState *adb_bus;
uint16_t ppc_boot_device;
- DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS];
+ DriveInfo *dinfo, *hd[MAX_IDE_BUS * MAX_IDE_DEVS];
void *fw_cfg;
uint64_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq() : TBFREQ;
@@ -245,6 +245,12 @@
qdev_prop_set_chr(dev, "chrA", serial_hd(0));
qdev_prop_set_chr(dev, "chrB", serial_hd(1));
+ dinfo = drive_get(IF_MTD, 0, 0);
+ if (dinfo) {
+ dev = DEVICE(object_resolve_path_component(macio, "nvram"));
+ qdev_prop_set_drive(dev, "drive", blk_by_legacy_dinfo(dinfo));
+ }
+
pci_realize_and_unref(PCI_DEVICE(macio), pci_bus, &error_fatal);
pic_dev = DEVICE(object_resolve_path_component(macio, "pic"));
@@ -303,8 +309,7 @@
fw_cfg = FW_CFG(dev);
qdev_prop_set_uint32(dev, "data_width", 1);
qdev_prop_set_bit(dev, "dma_enabled", false);
- object_property_add_child(OBJECT(qdev_get_machine()), TYPE_FW_CFG,
- OBJECT(fw_cfg));
+ object_property_add_child(OBJECT(machine), TYPE_FW_CFG, OBJECT(fw_cfg));
s = SYS_BUS_DEVICE(dev);
sysbus_realize_and_unref(s, &error_fatal);
sysbus_mmio_map(s, 0, CFG_ADDR);
diff --git a/hw/s390x/s390-stattrib.c b/hw/s390x/s390-stattrib.c
index 9eda1c3..3e32002 100644
--- a/hw/s390x/s390-stattrib.c
+++ b/hw/s390x/s390-stattrib.c
@@ -182,10 +182,10 @@
return 0;
}
-static void cmma_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
- uint64_t *res_precopy_only,
- uint64_t *res_compatible,
- uint64_t *res_postcopy_only)
+static void cmma_state_pending(void *opaque,
+ uint64_t *res_precopy_only,
+ uint64_t *res_compatible,
+ uint64_t *res_postcopy_only)
{
S390StAttribState *sas = S390_STATTRIB(opaque);
S390StAttribClass *sac = S390_STATTRIB_GET_CLASS(sas);
@@ -371,7 +371,8 @@
.save_setup = cmma_save_setup,
.save_live_iterate = cmma_save_iterate,
.save_live_complete_precopy = cmma_save_complete,
- .save_live_pending = cmma_save_pending,
+ .state_pending_exact = cmma_state_pending,
+ .state_pending_estimate = cmma_state_pending,
.save_cleanup = cmma_save_cleanup,
.load_state = cmma_load,
.is_active = cmma_active,
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index c74453e..b3318f0 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -456,11 +456,10 @@
trace_vfio_save_cleanup(vbasedev->name);
}
-static void vfio_save_pending(QEMUFile *f, void *opaque,
- uint64_t threshold_size,
- uint64_t *res_precopy_only,
- uint64_t *res_compatible,
- uint64_t *res_postcopy_only)
+static void vfio_state_pending(void *opaque,
+ uint64_t *res_precopy_only,
+ uint64_t *res_compatible,
+ uint64_t *res_postcopy_only)
{
VFIODevice *vbasedev = opaque;
VFIOMigration *migration = vbasedev->migration;
@@ -473,7 +472,7 @@
*res_precopy_only += migration->pending_bytes;
- trace_vfio_save_pending(vbasedev->name, *res_precopy_only,
+ trace_vfio_state_pending(vbasedev->name, *res_precopy_only,
*res_postcopy_only, *res_compatible);
}
@@ -515,9 +514,9 @@
}
/*
- * Reset pending_bytes as .save_live_pending is not called during savevm or
- * snapshot case, in such case vfio_update_pending() at the start of this
- * function updates pending_bytes.
+ * Reset pending_bytes as state_pending* are not called during
+ * savevm or snapshot case, in such case vfio_update_pending() at
+ * the start of this function updates pending_bytes.
*/
migration->pending_bytes = 0;
trace_vfio_save_iterate(vbasedev->name, data_size);
@@ -685,7 +684,8 @@
static SaveVMHandlers savevm_vfio_handlers = {
.save_setup = vfio_save_setup,
.save_cleanup = vfio_save_cleanup,
- .save_live_pending = vfio_save_pending,
+ .state_pending_exact = vfio_state_pending,
+ .state_pending_estimate = vfio_state_pending,
.save_live_iterate = vfio_save_iterate,
.save_live_complete_precopy = vfio_save_complete_precopy,
.save_state = vfio_save_state,
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 73dffe9..52de1c8 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -157,7 +157,7 @@
vfio_save_buffer(const char *name, uint64_t data_offset, uint64_t data_size, uint64_t pending) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64" pending 0x%"PRIx64
vfio_update_pending(const char *name, uint64_t pending) " (%s) pending 0x%"PRIx64
vfio_save_device_config_state(const char *name) " (%s)"
-vfio_save_pending(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t compatible) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" compatible 0x%"PRIx64
+vfio_state_pending(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t compatible) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" compatible 0x%"PRIx64
vfio_save_iterate(const char *name, int data_size) " (%s) data_size %d"
vfio_save_complete_precopy(const char *name) " (%s)"
vfio_load_device_config_state(const char *name) " (%s)"
diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
index 1ed1f5a..957fe77 100644
--- a/hw/virtio/virtio-mem.c
+++ b/hw/virtio/virtio-mem.c
@@ -31,6 +31,8 @@
#include CONFIG_DEVICES
#include "trace.h"
+static const VMStateDescription vmstate_virtio_mem_device_early;
+
/*
* We only had legacy x86 guests that did not support
* VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE. Other targets don't have legacy guests.
@@ -202,6 +204,30 @@
return ret;
}
+static int virtio_mem_for_each_plugged_range(const VirtIOMEM *vmem, void *arg,
+ virtio_mem_range_cb cb)
+{
+ unsigned long first_bit, last_bit;
+ uint64_t offset, size;
+ int ret = 0;
+
+ first_bit = find_first_bit(vmem->bitmap, vmem->bitmap_size);
+ while (first_bit < vmem->bitmap_size) {
+ offset = first_bit * vmem->block_size;
+ last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
+ first_bit + 1) - 1;
+ size = (last_bit - first_bit + 1) * vmem->block_size;
+
+ ret = cb(vmem, arg, offset, size);
+ if (ret) {
+ break;
+ }
+ first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
+ last_bit + 2);
+ }
+ return ret;
+}
+
/*
* Adjust the memory section to cover the intersection with the given range.
*
@@ -772,6 +798,12 @@
error_setg(errp, "'%s' property specifies an unsupported memdev",
VIRTIO_MEM_MEMDEV_PROP);
return;
+ } else if (vmem->memdev->prealloc) {
+ error_setg(errp, "'%s' property specifies a memdev with preallocation"
+ " enabled: %s. Instead, specify 'prealloc=on' for the"
+ " virtio-mem device. ", VIRTIO_MEM_MEMDEV_PROP,
+ object_get_canonical_path_component(OBJECT(vmem->memdev)));
+ return;
}
if ((nb_numa_nodes && vmem->node >= nb_numa_nodes) ||
@@ -872,6 +904,10 @@
host_memory_backend_set_mapped(vmem->memdev, true);
vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem));
+ if (vmem->early_migration) {
+ vmstate_register(VMSTATE_IF(vmem), VMSTATE_INSTANCE_ID_ANY,
+ &vmstate_virtio_mem_device_early, vmem);
+ }
qemu_register_reset(virtio_mem_system_reset, vmem);
/*
@@ -893,6 +929,10 @@
*/
memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL);
qemu_unregister_reset(virtio_mem_system_reset, vmem);
+ if (vmem->early_migration) {
+ vmstate_unregister(VMSTATE_IF(vmem), &vmstate_virtio_mem_device_early,
+ vmem);
+ }
vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem));
host_memory_backend_set_mapped(vmem->memdev, false);
virtio_del_queue(vdev, 0);
@@ -922,6 +962,10 @@
RamDiscardListener *rdl;
int ret;
+ if (vmem->prealloc && !vmem->early_migration) {
+ warn_report("Proper preallocation with migration requires a newer QEMU machine");
+ }
+
/*
* We started out with all memory discarded and our memory region is mapped
* into an address space. Replay, now that we updated the bitmap.
@@ -941,6 +985,64 @@
return virtio_mem_restore_unplugged(vmem);
}
+static int virtio_mem_prealloc_range_cb(const VirtIOMEM *vmem, void *arg,
+ uint64_t offset, uint64_t size)
+{
+ void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset;
+ int fd = memory_region_get_fd(&vmem->memdev->mr);
+ Error *local_err = NULL;
+
+ qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err);
+ if (local_err) {
+ error_report_err(local_err);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static int virtio_mem_post_load_early(void *opaque, int version_id)
+{
+ VirtIOMEM *vmem = VIRTIO_MEM(opaque);
+ RAMBlock *rb = vmem->memdev->mr.ram_block;
+ int ret;
+
+ if (!vmem->prealloc) {
+ return 0;
+ }
+
+ /*
+ * We restored the bitmap and verified that the basic properties
+ * match on source and destination, so we can go ahead and preallocate
+ * memory for all plugged memory blocks, before actual RAM migration starts
+ * touching this memory.
+ */
+ ret = virtio_mem_for_each_plugged_range(vmem, NULL,
+ virtio_mem_prealloc_range_cb);
+ if (ret) {
+ return ret;
+ }
+
+ /*
+ * This is tricky: postcopy wants to start with a clean slate. On
+ * POSTCOPY_INCOMING_ADVISE, postcopy code discards all (ordinarily
+ * preallocated) RAM such that postcopy will work as expected later.
+ *
+ * However, we run after POSTCOPY_INCOMING_ADVISE -- but before actual
+ * RAM migration. So let's discard all memory again. This looks like an
+ * expensive NOP, but actually serves a purpose: we made sure that we
+ * were able to allocate all required backend memory once. We cannot
+ * guarantee that the backend memory we will free will remain free
+ * until we need it during postcopy, but at least we can catch the
+ * obvious setup issues this way.
+ */
+ if (migration_incoming_postcopy_advised()) {
+ if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) {
+ return -EBUSY;
+ }
+ }
+ return 0;
+}
+
typedef struct VirtIOMEMMigSanityChecks {
VirtIOMEM *parent;
uint64_t addr;
@@ -1009,6 +1111,14 @@
},
};
+static bool virtio_mem_vmstate_field_exists(void *opaque, int version_id)
+{
+ const VirtIOMEM *vmem = VIRTIO_MEM(opaque);
+
+ /* With early migration, these fields were already migrated. */
+ return !vmem->early_migration;
+}
+
static const VMStateDescription vmstate_virtio_mem_device = {
.name = "virtio-mem-device",
.minimum_version_id = 1,
@@ -1016,11 +1126,39 @@
.priority = MIG_PRI_VIRTIO_MEM,
.post_load = virtio_mem_post_load,
.fields = (VMStateField[]) {
+ VMSTATE_WITH_TMP_TEST(VirtIOMEM, virtio_mem_vmstate_field_exists,
+ VirtIOMEMMigSanityChecks,
+ vmstate_virtio_mem_sanity_checks),
+ VMSTATE_UINT64(usable_region_size, VirtIOMEM),
+ VMSTATE_UINT64_TEST(size, VirtIOMEM, virtio_mem_vmstate_field_exists),
+ VMSTATE_UINT64(requested_size, VirtIOMEM),
+ VMSTATE_BITMAP_TEST(bitmap, VirtIOMEM, virtio_mem_vmstate_field_exists,
+ 0, bitmap_size),
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+/*
+ * Transfer properties that are immutable while migration is active early,
+ * such that we have have this information around before migrating any RAM
+ * content.
+ *
+ * Note that virtio_mem_is_busy() makes sure these properties can no longer
+ * change on the migration source until migration completed.
+ *
+ * With QEMU compat machines, we transmit these properties later, via
+ * vmstate_virtio_mem_device instead -- see virtio_mem_vmstate_field_exists().
+ */
+static const VMStateDescription vmstate_virtio_mem_device_early = {
+ .name = "virtio-mem-device-early",
+ .minimum_version_id = 1,
+ .version_id = 1,
+ .early_setup = true,
+ .post_load = virtio_mem_post_load_early,
+ .fields = (VMStateField[]) {
VMSTATE_WITH_TMP(VirtIOMEM, VirtIOMEMMigSanityChecks,
vmstate_virtio_mem_sanity_checks),
- VMSTATE_UINT64(usable_region_size, VirtIOMEM),
VMSTATE_UINT64(size, VirtIOMEM),
- VMSTATE_UINT64(requested_size, VirtIOMEM),
VMSTATE_BITMAP(bitmap, VirtIOMEM, 0, bitmap_size),
VMSTATE_END_OF_LIST()
},
@@ -1205,6 +1343,8 @@
DEFINE_PROP_ON_OFF_AUTO(VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP, VirtIOMEM,
unplugged_inaccessible, ON_OFF_AUTO_AUTO),
#endif
+ DEFINE_PROP_BOOL(VIRTIO_MEM_EARLY_MIGRATION_PROP, VirtIOMEM,
+ early_migration, true),
DEFINE_PROP_END_OF_LIST(),
};
diff --git a/include/hw/misc/mac_via.h b/include/hw/misc/mac_via.h
index 5fe7a7f..422da43 100644
--- a/include/hw/misc/mac_via.h
+++ b/include/hw/misc/mac_via.h
@@ -12,6 +12,7 @@
#include "exec/memory.h"
#include "hw/sysbus.h"
#include "hw/misc/mos6522.h"
+#include "hw/input/adb.h"
#include "qom/object.h"
diff --git a/include/hw/misc/macio/cuda.h b/include/hw/misc/macio/cuda.h
index a71deec..8a6678c 100644
--- a/include/hw/misc/macio/cuda.h
+++ b/include/hw/misc/macio/cuda.h
@@ -26,6 +26,7 @@
#ifndef CUDA_H
#define CUDA_H
+#include "hw/input/adb.h"
#include "hw/misc/mos6522.h"
#include "qom/object.h"
diff --git a/include/hw/misc/macio/pmu.h b/include/hw/misc/macio/pmu.h
index 00fcdd2..ba76afb 100644
--- a/include/hw/misc/macio/pmu.h
+++ b/include/hw/misc/macio/pmu.h
@@ -10,6 +10,7 @@
#ifndef PMU_H
#define PMU_H
+#include "hw/input/adb.h"
#include "hw/misc/mos6522.h"
#include "hw/misc/macio/gpio.h"
#include "qom/object.h"
diff --git a/include/hw/misc/mos6522.h b/include/hw/misc/mos6522.h
index 05872ff..fba4566 100644
--- a/include/hw/misc/mos6522.h
+++ b/include/hw/misc/mos6522.h
@@ -27,9 +27,8 @@
#ifndef MOS6522_H
#define MOS6522_H
-#include "exec/memory.h"
+#include "exec/hwaddr.h"
#include "hw/sysbus.h"
-#include "hw/input/adb.h"
#include "qom/object.h"
#define MOS6522_NUM_REGS 16
diff --git a/include/hw/nvram/mac_nvram.h b/include/hw/nvram/mac_nvram.h
index b780aca..0c4dfae 100644
--- a/include/hw/nvram/mac_nvram.h
+++ b/include/hw/nvram/mac_nvram.h
@@ -44,6 +44,7 @@
MemoryRegion mem;
uint8_t *data;
+ BlockBackend *blk;
};
void pmac_format_nvram_partition(MacIONVRAMState *nvr, int len);
diff --git a/include/hw/virtio/virtio-mem.h b/include/hw/virtio/virtio-mem.h
index 7745cfc..f15e561 100644
--- a/include/hw/virtio/virtio-mem.h
+++ b/include/hw/virtio/virtio-mem.h
@@ -31,6 +31,7 @@
#define VIRTIO_MEM_BLOCK_SIZE_PROP "block-size"
#define VIRTIO_MEM_ADDR_PROP "memaddr"
#define VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP "unplugged-inaccessible"
+#define VIRTIO_MEM_EARLY_MIGRATION_PROP "x-early-migration"
#define VIRTIO_MEM_PREALLOC_PROP "prealloc"
struct VirtIOMEM {
@@ -74,6 +75,13 @@
/* whether to prealloc memory when plugging new blocks */
bool prealloc;
+ /*
+ * Whether we migrate properties that are immutable while migration is
+ * active early, before state of other devices and especially, before
+ * migrating any RAM content.
+ */
+ bool early_migration;
+
/* notifiers to notify when "size" changes */
NotifierList size_change_notifiers;
diff --git a/include/io/channel.h b/include/io/channel.h
index 78b15f7..153fbd2 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -34,6 +34,8 @@
#define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1
+#define QIO_CHANNEL_READ_FLAG_MSG_PEEK 0x1
+
typedef enum QIOChannelFeature QIOChannelFeature;
enum QIOChannelFeature {
@@ -41,6 +43,7 @@
QIO_CHANNEL_FEATURE_SHUTDOWN,
QIO_CHANNEL_FEATURE_LISTEN,
QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY,
+ QIO_CHANNEL_FEATURE_READ_MSG_PEEK,
};
@@ -114,6 +117,7 @@
size_t niov,
int **fds,
size_t *nfds,
+ int flags,
Error **errp);
int (*io_close)(QIOChannel *ioc,
Error **errp);
@@ -188,6 +192,7 @@
* @niov: the length of the @iov array
* @fds: pointer to an array that will received file handles
* @nfds: pointer filled with number of elements in @fds on return
+ * @flags: read flags (QIO_CHANNEL_READ_FLAG_*)
* @errp: pointer to a NULL-initialized error object
*
* Read data from the IO channel, storing it in the
@@ -224,6 +229,7 @@
size_t niov,
int **fds,
size_t *nfds,
+ int flags,
Error **errp);
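For illustration only (not part of this patch): a sketch of how a caller can use the new flag to peek at the head of a stream. Error and short-read handling is simplified compared to the migration_channel_read_peek() helper added later in this series, which retries until the full length is available; the helper name here is hypothetical.

    /* Sketch only: peek at the first bytes of a channel without consuming
     * them. peek_channel_magic() is a hypothetical helper. */
    static bool peek_channel_magic(QIOChannel *ioc, uint32_t *magic,
                                   Error **errp)
    {
        struct iovec iov = { .iov_base = magic, .iov_len = sizeof(*magic) };
        ssize_t len;

        if (!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
            return false;   /* e.g. pipes and files cannot MSG_PEEK */
        }
        len = qio_channel_readv_full(ioc, &iov, 1, NULL, NULL,
                                     QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
        return len == sizeof(*magic);
    }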
diff --git a/include/migration/misc.h b/include/migration/misc.h
index 4659067..8b49841 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -67,8 +67,10 @@
/* ...and after the device transmission */
bool migration_in_postcopy_after_devices(MigrationState *);
void migration_global_dump(Monitor *mon);
-/* True if incomming migration entered POSTCOPY_INCOMING_DISCARD */
+/* True if incoming migration entered POSTCOPY_INCOMING_DISCARD */
bool migration_in_incoming_postcopy(void);
+/* True if incoming migration entered POSTCOPY_INCOMING_ADVISE */
+bool migration_incoming_postcopy_advised(void);
/* True if background snapshot is active */
bool migration_in_bg_snapshot(void);
diff --git a/include/migration/register.h b/include/migration/register.h
index c1dcff0..b91a0cd 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -46,11 +46,6 @@
/* This runs outside the iothread lock! */
int (*save_setup)(QEMUFile *f, void *opaque);
- void (*save_live_pending)(QEMUFile *f, void *opaque,
- uint64_t threshold_size,
- uint64_t *res_precopy_only,
- uint64_t *res_compatible,
- uint64_t *res_postcopy_only);
/* Note for save_live_pending:
* - res_precopy_only is for data which must be migrated in precopy phase
* or in stopped state, in other words - before target vm start
@@ -61,8 +56,16 @@
* Sum of res_precopy_only, res_compatible and res_postcopy_only is the
* whole amount of pending data.
*/
-
-
+ /* This estimates the remaining data to transfer */
+ void (*state_pending_estimate)(void *opaque,
+ uint64_t *res_precopy_only,
+ uint64_t *res_compatible,
+ uint64_t *res_postcopy_only);
+ /* This calculates the exact remaining data to transfer */
+ void (*state_pending_exact)(void *opaque,
+ uint64_t *res_precopy_only,
+ uint64_t *res_compatible,
+ uint64_t *res_postcopy_only);
LoadStateHandler *load_state;
int (*load_setup)(QEMUFile *f, void *opaque);
int (*load_cleanup)(void *opaque);
diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index ad24aa1..084f5e7 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -147,6 +147,9 @@
* VMStateField.struct_version_id to tell which version of the
* structure we are referencing to use. */
VMS_VSTRUCT = 0x8000,
+
+ /* Marker for end of list */
+ VMS_END = 0x10000
};
typedef enum {
@@ -178,7 +181,21 @@
struct VMStateDescription {
const char *name;
- int unmigratable;
+ bool unmigratable;
+ /*
+ * This VMSD describes something that should be sent during setup phase
+ * of migration. It plays similar role as save_setup() for explicitly
+ * registered vmstate entries, so it can be seen as a way to describe
+ * save_setup() in VMSD structures.
+ *
+ * Note that for now, a SaveStateEntry cannot have a VMSD and
+ * operations (e.g., save_setup()) set at the same time. Consequently,
+ * save_setup() and a VMSD with early_setup set to true are mutually
+ * exclusive. For this reason, also early_setup VMSDs are migrated in a
+ * QEMU_VM_SECTION_FULL section, while save_setup() data is migrated in
+ * a QEMU_VM_SECTION_START section.
+ */
+ bool early_setup;
int version_id;
int minimum_version_id;
MigrationPriority priority;
@@ -705,8 +722,9 @@
* '_state' type
* That the pointer is right at the start of _tmp_type.
*/
-#define VMSTATE_WITH_TMP(_state, _tmp_type, _vmsd) { \
+#define VMSTATE_WITH_TMP_TEST(_state, _test, _tmp_type, _vmsd) { \
.name = "tmp", \
+ .field_exists = (_test), \
.size = sizeof(_tmp_type) + \
QEMU_BUILD_BUG_ON_ZERO(offsetof(_tmp_type, parent) != 0) + \
type_check_pointer(_state, \
@@ -715,6 +733,9 @@
.info = &vmstate_info_tmp, \
}
+#define VMSTATE_WITH_TMP(_state, _tmp_type, _vmsd) \
+ VMSTATE_WITH_TMP_TEST(_state, NULL, _tmp_type, _vmsd)
+
#define VMSTATE_UNUSED_BUFFER(_test, _version, _size) { \
.name = "unused", \
.field_exists = (_test), \
@@ -738,8 +759,9 @@
/* _field_size should be a int32_t field in the _state struct giving the
* size of the bitmap _field in bits.
*/
-#define VMSTATE_BITMAP(_field, _state, _version, _field_size) { \
+#define VMSTATE_BITMAP_TEST(_field, _state, _test, _version, _field_size) { \
.name = (stringify(_field)), \
+ .field_exists = (_test), \
.version_id = (_version), \
.size_offset = vmstate_offset_value(_state, _field_size, int32_t),\
.info = &vmstate_info_bitmap, \
@@ -747,6 +769,9 @@
.offset = offsetof(_state, _field), \
}
+#define VMSTATE_BITMAP(_field, _state, _version, _field_size) \
+ VMSTATE_BITMAP_TEST(_field, _state, NULL, _version, _field_size)
+
/* For migrating a QTAILQ.
* Target QTAILQ needs be properly initialized.
* _type: type of QTAILQ element
@@ -1161,7 +1186,9 @@
VMSTATE_UNUSED_BUFFER(_test, 0, _size)
#define VMSTATE_END_OF_LIST() \
- {}
+ { \
+ .flags = VMS_END, \
+ }
int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd,
void *opaque, int version_id);
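For illustration only (not part of this patch): a minimal sketch of how the new knobs combine, in the style of the virtio-mem conversion in this series; the device name and fields are hypothetical.

    /* Sketch only: an early_setup VMSD migrates the field during the setup
     * phase, and the _TEST variant skips it in the regular VMSD so it is not
     * sent twice. MyDevState and its members are hypothetical. */
    static bool mydev_late_fields_exist(void *opaque, int version_id)
    {
        MyDevState *s = opaque;

        return !s->early_migration;
    }

    static const VMStateDescription vmstate_mydev_early = {
        .name = "mydev-early",
        .version_id = 1,
        .minimum_version_id = 1,
        .early_setup = true,
        .fields = (VMStateField[]) {
            VMSTATE_BITMAP(bitmap, MyDevState, 0, bitmap_size),
            VMSTATE_END_OF_LIST()
        },
    };

    static const VMStateDescription vmstate_mydev = {
        .name = "mydev",
        .version_id = 1,
        .minimum_version_id = 1,
        .fields = (VMStateField[]) {
            VMSTATE_BITMAP_TEST(bitmap, MyDevState, mydev_late_fields_exist,
                                0, bitmap_size),
            VMSTATE_END_OF_LIST()
        },
    };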
diff --git a/include/qemu/userfaultfd.h b/include/qemu/userfaultfd.h
index 6b74f92..d764496 100644
--- a/include/qemu/userfaultfd.h
+++ b/include/qemu/userfaultfd.h
@@ -13,10 +13,20 @@
#ifndef USERFAULTFD_H
#define USERFAULTFD_H
+#ifdef CONFIG_LINUX
+
#include "qemu/osdep.h"
#include "exec/hwaddr.h"
#include <linux/userfaultfd.h>
+/**
+ * uffd_open(): Open an userfaultfd handle for current context.
+ *
+ * @flags: The flags we want to pass in when creating the handle.
+ *
+ * Returns: the uffd handle if >=0, or <0 if error happens.
+ */
+int uffd_open(int flags);
int uffd_query_features(uint64_t *features);
int uffd_create_fd(uint64_t features, bool non_blocking);
void uffd_close_fd(int uffd_fd);
@@ -32,4 +42,6 @@
int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count);
bool uffd_poll_events(int uffd_fd, int tmo);
+#endif /* CONFIG_LINUX */
+
#endif /* USERFAULTFD_H */
diff --git a/io/channel-buffer.c b/io/channel-buffer.c
index bf52011..8096180 100644
--- a/io/channel-buffer.c
+++ b/io/channel-buffer.c
@@ -54,6 +54,7 @@
size_t niov,
int **fds,
size_t *nfds,
+ int flags,
Error **errp)
{
QIOChannelBuffer *bioc = QIO_CHANNEL_BUFFER(ioc);
diff --git a/io/channel-command.c b/io/channel-command.c
index 7451625..e7edd09 100644
--- a/io/channel-command.c
+++ b/io/channel-command.c
@@ -203,6 +203,7 @@
size_t niov,
int **fds,
size_t *nfds,
+ int flags,
Error **errp)
{
QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc);
diff --git a/io/channel-file.c b/io/channel-file.c
index b67687c..d76663e 100644
--- a/io/channel-file.c
+++ b/io/channel-file.c
@@ -86,6 +86,7 @@
size_t niov,
int **fds,
size_t *nfds,
+ int flags,
Error **errp)
{
QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);
diff --git a/io/channel-null.c b/io/channel-null.c
index 75e3781..4fafdb7 100644
--- a/io/channel-null.c
+++ b/io/channel-null.c
@@ -60,6 +60,7 @@
size_t niov,
int **fds G_GNUC_UNUSED,
size_t *nfds G_GNUC_UNUSED,
+ int flags,
Error **errp)
{
QIOChannelNull *nioc = QIO_CHANNEL_NULL(ioc);
diff --git a/io/channel-socket.c b/io/channel-socket.c
index b76dca9..7aca84f 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -173,6 +173,9 @@
}
#endif
+ qio_channel_set_feature(QIO_CHANNEL(ioc),
+ QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
+
return 0;
}
@@ -406,6 +409,9 @@
}
#endif /* WIN32 */
+ qio_channel_set_feature(QIO_CHANNEL(cioc),
+ QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
+
trace_qio_channel_socket_accept_complete(ioc, cioc, cioc->fd);
return cioc;
@@ -496,6 +502,7 @@
size_t niov,
int **fds,
size_t *nfds,
+ int flags,
Error **errp)
{
QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
@@ -517,6 +524,10 @@
}
+ if (flags & QIO_CHANNEL_READ_FLAG_MSG_PEEK) {
+ sflags |= MSG_PEEK;
+ }
+
retry:
ret = recvmsg(sioc->fd, &msg, sflags);
if (ret < 0) {
@@ -624,11 +635,17 @@
size_t niov,
int **fds,
size_t *nfds,
+ int flags,
Error **errp)
{
QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
ssize_t done = 0;
ssize_t i;
+ int sflags = 0;
+
+ if (flags & QIO_CHANNEL_READ_FLAG_MSG_PEEK) {
+ sflags |= MSG_PEEK;
+ }
for (i = 0; i < niov; i++) {
ssize_t ret;
@@ -636,7 +653,7 @@
ret = recv(sioc->fd,
iov[i].iov_base,
iov[i].iov_len,
- 0);
+ sflags);
if (ret < 0) {
if (errno == EAGAIN) {
if (done) {
diff --git a/io/channel-tls.c b/io/channel-tls.c
index 4ce890a..c730cb8 100644
--- a/io/channel-tls.c
+++ b/io/channel-tls.c
@@ -260,6 +260,7 @@
size_t niov,
int **fds,
size_t *nfds,
+ int flags,
Error **errp)
{
QIOChannelTLS *tioc = QIO_CHANNEL_TLS(ioc);
diff --git a/io/channel-websock.c b/io/channel-websock.c
index fb4932a..a12acc2 100644
--- a/io/channel-websock.c
+++ b/io/channel-websock.c
@@ -1081,6 +1081,7 @@
size_t niov,
int **fds,
size_t *nfds,
+ int flags,
Error **errp)
{
QIOChannelWebsock *wioc = QIO_CHANNEL_WEBSOCK(ioc);
diff --git a/io/channel.c b/io/channel.c
index 0640941..a8c7f11 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -52,6 +52,7 @@
size_t niov,
int **fds,
size_t *nfds,
+ int flags,
Error **errp)
{
QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
@@ -63,7 +64,14 @@
return -1;
}
- return klass->io_readv(ioc, iov, niov, fds, nfds, errp);
+ if ((flags & QIO_CHANNEL_READ_FLAG_MSG_PEEK) &&
+ !qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
+ error_setg_errno(errp, EINVAL,
+ "Channel does not support peek read");
+ return -1;
+ }
+
+ return klass->io_readv(ioc, iov, niov, fds, nfds, flags, errp);
}
@@ -146,7 +154,7 @@
while ((nlocal_iov > 0) || local_fds) {
ssize_t len;
len = qio_channel_readv_full(ioc, local_iov, nlocal_iov, local_fds,
- local_nfds, errp);
+ local_nfds, 0, errp);
if (len == QIO_CHANNEL_ERR_BLOCK) {
if (qemu_in_coroutine()) {
qio_channel_yield(ioc, G_IO_IN);
@@ -284,7 +292,7 @@
size_t niov,
Error **errp)
{
- return qio_channel_readv_full(ioc, iov, niov, NULL, NULL, errp);
+ return qio_channel_readv_full(ioc, iov, niov, NULL, NULL, 0, errp);
}
@@ -303,7 +311,7 @@
Error **errp)
{
struct iovec iov = { .iov_base = buf, .iov_len = buflen };
- return qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, errp);
+ return qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, 0, errp);
}
diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index 15127d4..5a62141 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -762,11 +762,10 @@
return 0;
}
-static void dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
- uint64_t max_size,
- uint64_t *res_precopy_only,
- uint64_t *res_compatible,
- uint64_t *res_postcopy_only)
+static void dirty_bitmap_state_pending(void *opaque,
+ uint64_t *res_precopy_only,
+ uint64_t *res_compatible,
+ uint64_t *res_postcopy_only)
{
DBMSaveState *s = &((DBMState *)opaque)->save;
SaveBitmapState *dbms;
@@ -784,7 +783,7 @@
qemu_mutex_unlock_iothread();
- trace_dirty_bitmap_save_pending(pending, max_size);
+ trace_dirty_bitmap_state_pending(pending);
*res_postcopy_only += pending;
}
@@ -1253,7 +1252,8 @@
.save_live_complete_postcopy = dirty_bitmap_save_complete,
.save_live_complete_precopy = dirty_bitmap_save_complete,
.has_postcopy = dirty_bitmap_has_postcopy,
- .save_live_pending = dirty_bitmap_save_pending,
+ .state_pending_exact = dirty_bitmap_state_pending,
+ .state_pending_estimate = dirty_bitmap_state_pending,
.save_live_iterate = dirty_bitmap_save_iterate,
.is_active_iterate = dirty_bitmap_is_active_iterate,
.load_state = dirty_bitmap_load,
diff --git a/migration/block.c b/migration/block.c
index 5da15a6..29f6902 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -863,10 +863,10 @@
return 0;
}
-static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
- uint64_t *res_precopy_only,
- uint64_t *res_compatible,
- uint64_t *res_postcopy_only)
+static void block_state_pending(void *opaque,
+ uint64_t *res_precopy_only,
+ uint64_t *res_compatible,
+ uint64_t *res_postcopy_only)
{
/* Estimate pending number of bytes to send */
uint64_t pending;
@@ -885,7 +885,7 @@
pending = BLK_MIG_BLOCK_SIZE;
}
- trace_migration_block_save_pending(pending);
+ trace_migration_block_state_pending(pending);
/* We don't do postcopy */
*res_precopy_only += pending;
}
@@ -1020,7 +1020,8 @@
.save_setup = block_save_setup,
.save_live_iterate = block_save_iterate,
.save_live_complete_precopy = block_save_complete,
- .save_live_pending = block_save_pending,
+ .state_pending_exact = block_state_pending,
+ .state_pending_estimate = block_state_pending,
.load_state = block_load,
.save_cleanup = block_migration_cleanup,
.is_active = block_is_active,
diff --git a/migration/channel-block.c b/migration/channel-block.c
index f4ab53a..b737436 100644
--- a/migration/channel-block.c
+++ b/migration/channel-block.c
@@ -53,6 +53,7 @@
size_t niov,
int **fds,
size_t *nfds,
+ int flags,
Error **errp)
{
QIOChannelBlock *bioc = QIO_CHANNEL_BLOCK(ioc);
diff --git a/migration/channel.c b/migration/channel.c
index 1b08150..ca3319a 100644
--- a/migration/channel.c
+++ b/migration/channel.c
@@ -92,3 +92,48 @@
migrate_fd_connect(s, error);
error_free(error);
}
+
+
+/**
+ * @migration_channel_read_peek - Peek at the migration channel without
+ * actually removing data from the channel buffer.
+ *
+ * @ioc: the channel object
+ * @buf: the memory region to read data into
+ * @buflen: the number of bytes to read in @buf
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Returns 0 if successful, returns -1 and sets @errp if it fails.
+ */
+int migration_channel_read_peek(QIOChannel *ioc,
+ const char *buf,
+ const size_t buflen,
+ Error **errp)
+{
+ ssize_t len = 0;
+ struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen };
+
+ while (true) {
+ len = qio_channel_readv_full(ioc, &iov, 1, NULL, NULL,
+ QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
+
+ if (len <= 0 && len != QIO_CHANNEL_ERR_BLOCK) {
+ error_setg(errp,
+ "Failed to peek at channel");
+ return -1;
+ }
+
+ if (len == buflen) {
+ break;
+ }
+
+ /* 1ms sleep. */
+ if (qemu_in_coroutine()) {
+ qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
+ } else {
+ g_usleep(1000);
+ }
+ }
+
+ return 0;
+}
diff --git a/migration/channel.h b/migration/channel.h
index 67a461c..5bdb820 100644
--- a/migration/channel.h
+++ b/migration/channel.h
@@ -24,4 +24,9 @@
QIOChannel *ioc,
const char *hostname,
Error *error_in);
+
+int migration_channel_read_peek(QIOChannel *ioc,
+ const char *buf,
+ const size_t buflen,
+ Error **errp);
#endif
diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c
index 4bfb97f..575d48c 100644
--- a/migration/dirtyrate.c
+++ b/migration/dirtyrate.c
@@ -714,8 +714,8 @@
mode = DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING;
}
- if (has_sample_pages && mode == DIRTY_RATE_MEASURE_MODE_DIRTY_RING) {
- error_setg(errp, "either sample-pages or dirty-ring can be specified.");
+ if (has_sample_pages && mode != DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING) {
+ error_setg(errp, "sample-pages is used only in page-sampling mode");
return;
}
@@ -785,8 +785,10 @@
DirtyRateStatus_str(info->status));
monitor_printf(mon, "Start Time: %"PRIi64" (ms)\n",
info->start_time);
- monitor_printf(mon, "Sample Pages: %"PRIu64" (per GB)\n",
- info->sample_pages);
+ if (info->mode == DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING) {
+ monitor_printf(mon, "Sample Pages: %"PRIu64" (per GB)\n",
+ info->sample_pages);
+ }
monitor_printf(mon, "Period: %"PRIi64" (sec)\n",
info->calc_time);
monitor_printf(mon, "Mode: %s\n",
diff --git a/migration/meson.build b/migration/meson.build
index a9e7e18..0d1bb9f 100644
--- a/migration/meson.build
+++ b/migration/meson.build
@@ -26,6 +26,7 @@
'savevm.c',
'socket.c',
'tls.c',
+ 'threadinfo.c',
), gnutls)
softmmu_ss.add(when: rdma, if_true: files('rdma.c'))
diff --git a/migration/migration.c b/migration/migration.c
index 56859d5..7a14aa9 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -31,6 +31,7 @@
#include "migration.h"
#include "savevm.h"
#include "qemu-file.h"
+#include "channel.h"
#include "migration/vmstate.h"
#include "block/block.h"
#include "qapi/error.h"
@@ -57,6 +58,7 @@
#include "net/announce.h"
#include "qemu/queue.h"
#include "multifd.h"
+#include "threadinfo.h"
#include "qemu/yank.h"
#include "sysemu/cpus.h"
#include "yank_functions.h"
@@ -664,10 +666,6 @@
{
MigrationIncomingState *mis = migration_incoming_get_current();
- if (multifd_load_setup(errp) != 0) {
- return false;
- }
-
if (!mis->from_src_file) {
mis->from_src_file = f;
}
@@ -734,31 +732,56 @@
{
MigrationIncomingState *mis = migration_incoming_get_current();
Error *local_err = NULL;
- bool start_migration;
QEMUFile *f;
+ bool default_channel = true;
+ uint32_t channel_magic = 0;
+ int ret = 0;
- if (!mis->from_src_file) {
- /* The first connection (multifd may have multiple) */
+ if (migrate_use_multifd() && !migrate_postcopy_ram() &&
+ qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
+ /*
+ * With multiple channels, it is possible that we receive channels
+ * out of order on destination side, causing incorrect mapping of
+ * source channels on destination side. Check channel MAGIC to
+ * decide type of channel. Please note this is best effort, postcopy
+ * preempt channel does not send any magic number so avoid it for
+ * postcopy live migration. Also tls live migration already does
+ * tls handshake while initializing main channel so with tls this
+ * issue is not possible.
+ */
+ ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
+ sizeof(channel_magic), &local_err);
+
+ if (ret != 0) {
+ error_propagate(errp, local_err);
+ return;
+ }
+
+ default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
+ } else {
+ default_channel = !mis->from_src_file;
+ }
+
+ if (multifd_load_setup(errp) != 0) {
+ error_setg(errp, "Failed to setup multifd channels");
+ return;
+ }
+
+ if (default_channel) {
f = qemu_file_new_input(ioc);
if (!migration_incoming_setup(f, errp)) {
return;
}
-
- /*
- * Common migration only needs one channel, so we can start
- * right now. Some features need more than one channel, we wait.
- */
- start_migration = !migration_needs_multiple_sockets();
} else {
/* Multiple connections */
assert(migration_needs_multiple_sockets());
if (migrate_use_multifd()) {
- start_migration = multifd_recv_new_channel(ioc, &local_err);
+ multifd_recv_new_channel(ioc, &local_err);
} else {
assert(migrate_postcopy_preempt());
f = qemu_file_new_input(ioc);
- start_migration = postcopy_preempt_new_channel(mis, f);
+ postcopy_preempt_new_channel(mis, f);
}
if (local_err) {
error_propagate(errp, local_err);
@@ -766,7 +789,7 @@
}
}
- if (start_migration) {
+ if (migration_has_all_channels()) {
/* If it's a recovery, we're done */
if (postcopy_try_recover()) {
return;
@@ -1051,20 +1074,30 @@
}
}
+static bool migrate_show_downtime(MigrationState *s)
+{
+ return (s->state == MIGRATION_STATUS_COMPLETED) || migration_in_postcopy();
+}
+
static void populate_time_info(MigrationInfo *info, MigrationState *s)
{
info->has_status = true;
info->has_setup_time = true;
info->setup_time = s->setup_time;
+
if (s->state == MIGRATION_STATUS_COMPLETED) {
info->has_total_time = true;
info->total_time = s->total_time;
- info->has_downtime = true;
- info->downtime = s->downtime;
} else {
info->has_total_time = true;
info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
s->start_time;
+ }
+
+ if (migrate_show_downtime(s)) {
+ info->has_downtime = true;
+ info->downtime = s->downtime;
+ } else {
info->has_expected_downtime = true;
info->expected_downtime = s->expected_downtime;
}
@@ -1933,6 +1966,8 @@
g_free(s->hostname);
s->hostname = NULL;
+ json_writer_free(s->vmdesc);
+ s->vmdesc = NULL;
qemu_savevm_state_cleanup();
@@ -2124,6 +2159,13 @@
return ps >= POSTCOPY_INCOMING_DISCARD && ps < POSTCOPY_INCOMING_END;
}
+bool migration_incoming_postcopy_advised(void)
+{
+ PostcopyState ps = postcopy_state_get();
+
+ return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
+}
+
bool migration_in_bg_snapshot(void)
{
MigrationState *s = migrate_get_current();
@@ -3778,33 +3820,39 @@
*/
static MigIterateState migration_iteration_run(MigrationState *s)
{
- uint64_t pending_size, pend_pre, pend_compat, pend_post;
+ uint64_t pend_pre, pend_compat, pend_post;
bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
- qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
- &pend_compat, &pend_post);
- pending_size = pend_pre + pend_compat + pend_post;
+ qemu_savevm_state_pending_estimate(&pend_pre, &pend_compat, &pend_post);
+ uint64_t pending_size = pend_pre + pend_compat + pend_post;
- trace_migrate_pending(pending_size, s->threshold_size,
- pend_pre, pend_compat, pend_post);
+ trace_migrate_pending_estimate(pending_size,
+ pend_pre, pend_compat, pend_post);
- if (pending_size && pending_size >= s->threshold_size) {
- /* Still a significant amount to transfer */
- if (!in_postcopy && pend_pre <= s->threshold_size &&
- qatomic_read(&s->start_postcopy)) {
- if (postcopy_start(s)) {
- error_report("%s: postcopy failed to start", __func__);
- }
- return MIG_ITERATE_SKIP;
- }
- /* Just another iteration step */
- qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
- } else {
+ if (pend_pre + pend_compat <= s->threshold_size) {
+ qemu_savevm_state_pending_exact(&pend_pre, &pend_compat, &pend_post);
+ pending_size = pend_pre + pend_compat + pend_post;
+ trace_migrate_pending_exact(pending_size,
+ pend_pre, pend_compat, pend_post);
+ }
+
+ if (!pending_size || pending_size < s->threshold_size) {
trace_migration_thread_low_pending(pending_size);
migration_completion(s);
return MIG_ITERATE_BREAK;
}
+ /* Still a significant amount to transfer */
+ if (!in_postcopy && pend_pre <= s->threshold_size &&
+ qatomic_read(&s->start_postcopy)) {
+ if (postcopy_start(s)) {
+ error_report("%s: postcopy failed to start", __func__);
+ }
+ return MIG_ITERATE_SKIP;
+ }
+
+ /* Just another iteration step */
+ qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
return MIG_ITERATE_RESUME;
}
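
The hunk above restructures migration_iteration_run() so that the cheap estimate of remaining work is taken on every iteration, and the expensive exact query (which syncs the dirty bitmap) is only issued once the estimate falls at or below the threshold. A condensed, illustrative sketch of that control flow using the two new savevm entry points from this series; the helper name and its packaging as a separate function are invented for the example:

static bool demo_needs_more_iterations(MigrationState *s,
                                       uint64_t *pending_size)
{
    uint64_t pend_pre, pend_compat, pend_post;

    /* Cheap estimate first: no dirty-bitmap sync involved */
    qemu_savevm_state_pending_estimate(&pend_pre, &pend_compat, &pend_post);
    *pending_size = pend_pre + pend_compat + pend_post;

    if (pend_pre + pend_compat <= s->threshold_size) {
        /* Close to converging: pay for the exact, bitmap-syncing numbers */
        qemu_savevm_state_pending_exact(&pend_pre, &pend_compat, &pend_post);
        *pending_size = pend_pre + pend_compat + pend_post;
    }

    /* Keep iterating only while a significant amount remains */
    return *pending_size && *pending_size >= s->threshold_size;
}
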
@@ -3981,10 +4029,13 @@
static void *migration_thread(void *opaque)
{
MigrationState *s = opaque;
+ MigrationThread *thread = NULL;
int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
MigThrError thr_error;
bool urgent = false;
+ thread = MigrationThreadAdd("live_migration", qemu_get_thread_id());
+
rcu_register_thread();
object_ref(OBJECT(s));
@@ -4061,6 +4112,7 @@
migration_iteration_finish(s);
object_unref(OBJECT(s));
rcu_unregister_thread();
+ MigrationThreadDel(thread);
return NULL;
}
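
The two hunks above bracket the migration thread with the new MigrationThreadAdd()/MigrationThreadDel() bookkeeping; the same pattern is applied to the multifd send threads further down. A minimal sketch of that pairing, with demo_worker standing in for any migration-related thread function:

static void *demo_worker(void *opaque)
{
    MigrationThread *thread;

    /* Register this thread so QMP can report it while it is alive */
    thread = MigrationThreadAdd("demo_worker", qemu_get_thread_id());
    rcu_register_thread();

    /* ... thread body ... */

    rcu_unregister_thread();
    /* Unregister before exiting so the QMP list never holds stale entries */
    MigrationThreadDel(thread);
    return NULL;
}
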
diff --git a/migration/migration.h b/migration/migration.h
index ae4ffd3..66511ce 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -17,6 +17,7 @@
#include "exec/cpu-common.h"
#include "hw/qdev-core.h"
#include "qapi/qapi-types-migration.h"
+#include "qapi/qmp/json-writer.h"
#include "qemu/thread.h"
#include "qemu/coroutine_int.h"
#include "io/channel.h"
@@ -366,6 +367,9 @@
* This saves the hostname when the outgoing migration starts
*/
char *hostname;
+
+ /* QEMU_VM_VMDESCRIPTION content filled for all non-iterable devices. */
+ JSONWriter *vmdesc;
};
void migrate_set_state(int *state, int old_state, int new_state);
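
The new vmdesc member keeps the QEMU_VM_VMDESCRIPTION JSON writer alive across the whole outgoing migration: it is opened in qemu_savevm_state_setup() (so that early_setup devices can already be recorded there), finished in the precopy completion path, and freed defensively in migrate_fd_cleanup(). For illustration only, the blob it produces has roughly this shape; the device names below are placeholders and the real entries depend on the machine type:

{ "page_size": 4096,
  "devices": [
      { "name": "demo-early-dev", "instance_id": 0, ... },
      { "name": "demo-dev", "instance_id": 0, ... }
  ] }
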
diff --git a/migration/multifd.c b/migration/multifd.c
index 000ca4d..b7ad700 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -24,6 +24,7 @@
#include "qemu-file.h"
#include "trace.h"
#include "multifd.h"
+#include "threadinfo.h"
#include "qemu/yank.h"
#include "io/channel-socket.h"
@@ -442,6 +443,7 @@
int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
{
MultiFDPages_t *pages = multifd_send_state->pages;
+ bool changed = false;
if (!pages->block) {
pages->block = block;
@@ -454,14 +456,16 @@
if (pages->num < pages->allocated) {
return 1;
}
+ } else {
+ changed = true;
}
if (multifd_send_pages(f) < 0) {
return -1;
}
- if (pages->block != block) {
- return multifd_queue_page(f, block, offset);
+ if (changed) {
+ return multifd_queue_page(f, block, offset);
}
return 1;
@@ -627,16 +631,16 @@
stat64_add(&ram_atomic_counters.transferred, p->packet_len);
qemu_mutex_unlock(&p->mutex);
qemu_sem_post(&p->sem);
-
- if (flush_zero_copy && p->c && (multifd_zero_copy_flush(p->c) < 0)) {
- return -1;
- }
}
for (i = 0; i < migrate_multifd_channels(); i++) {
MultiFDSendParams *p = &multifd_send_state->params[i];
trace_multifd_send_sync_main_wait(p->id);
qemu_sem_wait(&p->sem_sync);
+
+ if (flush_zero_copy && p->c && (multifd_zero_copy_flush(p->c) < 0)) {
+ return -1;
+ }
}
trace_multifd_send_sync_main(multifd_send_state->packet_num);
@@ -646,10 +650,13 @@
static void *multifd_send_thread(void *opaque)
{
MultiFDSendParams *p = opaque;
+ MigrationThread *thread = NULL;
Error *local_err = NULL;
int ret = 0;
bool use_zero_copy_send = migrate_use_zero_copy_send();
+ thread = MigrationThreadAdd(p->name, qemu_get_thread_id());
+
trace_multifd_send_thread_start(p->id);
rcu_register_thread();
@@ -759,6 +766,7 @@
qemu_mutex_unlock(&p->mutex);
rcu_unregister_thread();
+ MigrationThreadDel(thread);
trace_multifd_send_thread_end(p->id, p->num_packets, p->total_normal_pages);
return NULL;
@@ -1164,9 +1172,14 @@
uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
uint8_t i;
- if (!migrate_use_multifd()) {
+ /*
+ * Return successfully if multiFD recv state is already initialised
+ * or multiFD is not enabled.
+ */
+ if (multifd_recv_state || !migrate_use_multifd()) {
return 0;
}
+
if (!migrate_multi_channels_is_allowed()) {
error_setg(errp, "multifd is not supported by current protocol");
return -1;
@@ -1227,11 +1240,9 @@
/*
* Try to receive all multifd channels to get ready for the migration.
- * - Return true and do not set @errp when correctly receiving all channels;
- * - Return false and do not set @errp when correctly receiving the current one;
- * - Return false and set @errp when failing to receive the current channel.
+ * Sets @errp when failing to receive the current channel.
*/
-bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
+void multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
{
MultiFDRecvParams *p;
Error *local_err = NULL;
@@ -1244,7 +1255,7 @@
"failed to receive packet"
" via multifd channel %d: ",
qatomic_read(&multifd_recv_state->count));
- return false;
+ return;
}
trace_multifd_recv_new_channel(id);
@@ -1254,7 +1265,7 @@
id);
multifd_recv_terminate_threads(local_err);
error_propagate(errp, local_err);
- return false;
+ return;
}
p->c = ioc;
object_ref(OBJECT(ioc));
@@ -1265,6 +1276,4 @@
qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
QEMU_THREAD_JOINABLE);
qatomic_inc(&multifd_recv_state->count);
- return qatomic_read(&multifd_recv_state->count) ==
- migrate_multifd_channels();
}
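
With multifd_recv_new_channel() now returning void, the caller can no longer learn from the return value whether the last channel has arrived; readiness is queried explicitly instead, as in the migration.c hunk near the top of this diff. A condensed illustration of the receive-side contract (the wrapper function is invented for the example):

static void demo_process_incoming_channel(QIOChannel *ioc, Error **errp)
{
    Error *local_err = NULL;

    multifd_recv_new_channel(ioc, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    if (migration_has_all_channels()) {
        /* Every expected channel is connected: start or resume now */
    }
}
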
diff --git a/migration/multifd.c.orig b/migration/multifd.c.orig
new file mode 100644
index 0000000..ad89293
--- /dev/null
+++ b/migration/multifd.c.orig
@@ -0,0 +1,1274 @@
+/*
+ * Multifd common code
+ *
+ * Copyright (c) 2019-2020 Red Hat Inc
+ *
+ * Authors:
+ * Juan Quintela <quintela@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/rcu.h"
+#include "exec/target_page.h"
+#include "sysemu/sysemu.h"
+#include "exec/ramblock.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "ram.h"
+#include "migration.h"
+#include "socket.h"
+#include "tls.h"
+#include "qemu-file.h"
+#include "trace.h"
+#include "multifd.h"
+
+#include "qemu/yank.h"
+#include "io/channel-socket.h"
+#include "yank_functions.h"
+
+/* Multiple fd's */
+
+#define MULTIFD_MAGIC 0x11223344U
+#define MULTIFD_VERSION 1
+
+typedef struct {
+ uint32_t magic;
+ uint32_t version;
+ unsigned char uuid[16]; /* QemuUUID */
+ uint8_t id;
+ uint8_t unused1[7]; /* Reserved for future use */
+ uint64_t unused2[4]; /* Reserved for future use */
+} __attribute__((packed)) MultiFDInit_t;
+
+/* Multifd without compression */
+
+/**
+ * nocomp_send_setup: setup send side
+ *
+ * For no compression this function does nothing.
+ *
+ * Returns 0 for success or -1 for error
+ *
+ * @p: Params for the channel that we are using
+ * @errp: pointer to an error
+ */
+static int nocomp_send_setup(MultiFDSendParams *p, Error **errp)
+{
+ return 0;
+}
+
+/**
+ * nocomp_send_cleanup: cleanup send side
+ *
+ * For no compression this function does nothing.
+ *
+ * @p: Params for the channel that we are using
+ * @errp: pointer to an error
+ */
+static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp)
+{
+ return;
+}
+
+/**
+ * nocomp_send_prepare: prepare data to be able to send
+ *
+ * For no compression we just have to calculate the size of the
+ * packet.
+ *
+ * Returns 0 for success or -1 for error
+ *
+ * @p: Params for the channel that we are using
+ * @errp: pointer to an error
+ */
+static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
+{
+ MultiFDPages_t *pages = p->pages;
+
+ for (int i = 0; i < p->normal_num; i++) {
+ p->iov[p->iovs_num].iov_base = pages->block->host + p->normal[i];
+ p->iov[p->iovs_num].iov_len = p->page_size;
+ p->iovs_num++;
+ }
+
+ p->next_packet_size = p->normal_num * p->page_size;
+ p->flags |= MULTIFD_FLAG_NOCOMP;
+ return 0;
+}
+
+/**
+ * nocomp_recv_setup: setup receive side
+ *
+ * For no compression this function does nothing.
+ *
+ * Returns 0 for success or -1 for error
+ *
+ * @p: Params for the channel that we are using
+ * @errp: pointer to an error
+ */
+static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp)
+{
+ return 0;
+}
+
+/**
+ * nocomp_recv_cleanup: cleanup receive side
+ *
+ * For no compression this function does nothing.
+ *
+ * @p: Params for the channel that we are using
+ */
+static void nocomp_recv_cleanup(MultiFDRecvParams *p)
+{
+}
+
+/**
+ * nocomp_recv_pages: read the data from the channel into actual pages
+ *
+ * For no compression we just need to read things into the correct place.
+ *
+ * Returns 0 for success or -1 for error
+ *
+ * @p: Params for the channel that we are using
+ * @errp: pointer to an error
+ */
+static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp)
+{
+ uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
+
+ if (flags != MULTIFD_FLAG_NOCOMP) {
+ error_setg(errp, "multifd %u: flags received %x flags expected %x",
+ p->id, flags, MULTIFD_FLAG_NOCOMP);
+ return -1;
+ }
+ for (int i = 0; i < p->normal_num; i++) {
+ p->iov[i].iov_base = p->host + p->normal[i];
+ p->iov[i].iov_len = p->page_size;
+ }
+ return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp);
+}
+
+static MultiFDMethods multifd_nocomp_ops = {
+ .send_setup = nocomp_send_setup,
+ .send_cleanup = nocomp_send_cleanup,
+ .send_prepare = nocomp_send_prepare,
+ .recv_setup = nocomp_recv_setup,
+ .recv_cleanup = nocomp_recv_cleanup,
+ .recv_pages = nocomp_recv_pages
+};
+
+static MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = {
+ [MULTIFD_COMPRESSION_NONE] = &multifd_nocomp_ops,
+};
+
+void multifd_register_ops(int method, MultiFDMethods *ops)
+{
+ assert(0 < method && method < MULTIFD_COMPRESSION__MAX);
+ multifd_ops[method] = ops;
+}
+
+static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
+{
+ MultiFDInit_t msg = {};
+ int ret;
+
+ msg.magic = cpu_to_be32(MULTIFD_MAGIC);
+ msg.version = cpu_to_be32(MULTIFD_VERSION);
+ msg.id = p->id;
+ memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
+
+ ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
+ if (ret != 0) {
+ return -1;
+ }
+ return 0;
+}
+
+static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
+{
+ MultiFDInit_t msg;
+ int ret;
+
+ ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
+ if (ret != 0) {
+ return -1;
+ }
+
+ msg.magic = be32_to_cpu(msg.magic);
+ msg.version = be32_to_cpu(msg.version);
+
+ if (msg.magic != MULTIFD_MAGIC) {
+ error_setg(errp, "multifd: received packet magic %x "
+ "expected %x", msg.magic, MULTIFD_MAGIC);
+ return -1;
+ }
+
+ if (msg.version != MULTIFD_VERSION) {
+ error_setg(errp, "multifd: received packet version %u "
+ "expected %u", msg.version, MULTIFD_VERSION);
+ return -1;
+ }
+
+ if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
+ char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
+ char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
+
+ error_setg(errp, "multifd: received uuid '%s' and expected "
+ "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
+ g_free(uuid);
+ g_free(msg_uuid);
+ return -1;
+ }
+
+ if (msg.id > migrate_multifd_channels()) {
+ error_setg(errp, "multifd: received channel version %u "
+ "expected %u", msg.version, MULTIFD_VERSION);
+ return -1;
+ }
+
+ return msg.id;
+}
+
+static MultiFDPages_t *multifd_pages_init(size_t size)
+{
+ MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
+
+ pages->allocated = size;
+ pages->offset = g_new0(ram_addr_t, size);
+
+ return pages;
+}
+
+static void multifd_pages_clear(MultiFDPages_t *pages)
+{
+ pages->num = 0;
+ pages->allocated = 0;
+ pages->packet_num = 0;
+ pages->block = NULL;
+ g_free(pages->offset);
+ pages->offset = NULL;
+ g_free(pages);
+}
+
+static void multifd_send_fill_packet(MultiFDSendParams *p)
+{
+ MultiFDPacket_t *packet = p->packet;
+ int i;
+
+ packet->flags = cpu_to_be32(p->flags);
+ packet->pages_alloc = cpu_to_be32(p->pages->allocated);
+ packet->normal_pages = cpu_to_be32(p->normal_num);
+ packet->next_packet_size = cpu_to_be32(p->next_packet_size);
+ packet->packet_num = cpu_to_be64(p->packet_num);
+
+ if (p->pages->block) {
+ strncpy(packet->ramblock, p->pages->block->idstr, 256);
+ }
+
+ for (i = 0; i < p->normal_num; i++) {
+ /* there are architectures where ram_addr_t is 32 bit */
+ uint64_t temp = p->normal[i];
+
+ packet->offset[i] = cpu_to_be64(temp);
+ }
+}
+
+static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
+{
+ MultiFDPacket_t *packet = p->packet;
+ RAMBlock *block;
+ int i;
+
+ packet->magic = be32_to_cpu(packet->magic);
+ if (packet->magic != MULTIFD_MAGIC) {
+ error_setg(errp, "multifd: received packet "
+ "magic %x and expected magic %x",
+ packet->magic, MULTIFD_MAGIC);
+ return -1;
+ }
+
+ packet->version = be32_to_cpu(packet->version);
+ if (packet->version != MULTIFD_VERSION) {
+ error_setg(errp, "multifd: received packet "
+ "version %u and expected version %u",
+ packet->version, MULTIFD_VERSION);
+ return -1;
+ }
+
+ p->flags = be32_to_cpu(packet->flags);
+
+ packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
+ /*
+ * If we received a packet that claims more pages than we can
+ * hold, just stop the migration.
+ */
+ if (packet->pages_alloc > p->page_count) {
+ error_setg(errp, "multifd: received packet "
+ "with size %u and expected a size of %u",
+ packet->pages_alloc, p->page_count) ;
+ return -1;
+ }
+
+ p->normal_num = be32_to_cpu(packet->normal_pages);
+ if (p->normal_num > packet->pages_alloc) {
+ error_setg(errp, "multifd: received packet "
+ "with %u pages and expected maximum pages are %u",
+ p->normal_num, packet->pages_alloc) ;
+ return -1;
+ }
+
+ p->next_packet_size = be32_to_cpu(packet->next_packet_size);
+ p->packet_num = be64_to_cpu(packet->packet_num);
+
+ if (p->normal_num == 0) {
+ return 0;
+ }
+
+ /* make sure that ramblock is 0 terminated */
+ packet->ramblock[255] = 0;
+ block = qemu_ram_block_by_name(packet->ramblock);
+ if (!block) {
+ error_setg(errp, "multifd: unknown ram block %s",
+ packet->ramblock);
+ return -1;
+ }
+
+ p->host = block->host;
+ for (i = 0; i < p->normal_num; i++) {
+ uint64_t offset = be64_to_cpu(packet->offset[i]);
+
+ if (offset > (block->used_length - p->page_size)) {
+ error_setg(errp, "multifd: offset too long %" PRIu64
+ " (max " RAM_ADDR_FMT ")",
+ offset, block->used_length);
+ return -1;
+ }
+ p->normal[i] = offset;
+ }
+
+ return 0;
+}
+
+struct {
+ MultiFDSendParams *params;
+ /* array of pages to send */
+ MultiFDPages_t *pages;
+ /* global number of generated multifd packets */
+ uint64_t packet_num;
+ /* send channels ready */
+ QemuSemaphore channels_ready;
+ /*
+ * Have we already run terminate threads. There is a race when it
+ * happens that we got one error while we are exiting.
+ * We will use atomic operations. Only valid values are 0 and 1.
+ */
+ int exiting;
+ /* multifd ops */
+ MultiFDMethods *ops;
+} *multifd_send_state;
+
+/*
+ * How do we use multifd_send_state->pages and channel->pages?
+ *
+ * We create a pages struct for each channel, plus a main one. Each time
+ * we need to send a batch of pages we swap the main one with the one
+ * belonging to the channel that is sending it. There are two reasons
+ * for that:
+ * - to avoid doing so many mallocs during migration
+ * - to make it easier to know what to free at the end of migration
+ *
+ * This way we always know who owns each "pages" struct, and we don't
+ * need any locking. It belongs either to the migration thread or to the
+ * channel thread. Switching is safe because the migration thread holds
+ * the channel mutex when changing it, and the channel must have finished
+ * with its own copy, otherwise pending_job can't be false.
+ */
+
+static int multifd_send_pages(QEMUFile *f)
+{
+ int i;
+ static int next_channel;
+ MultiFDSendParams *p = NULL; /* make gcc happy */
+ MultiFDPages_t *pages = multifd_send_state->pages;
+ uint64_t transferred;
+
+ if (qatomic_read(&multifd_send_state->exiting)) {
+ return -1;
+ }
+
+ qemu_sem_wait(&multifd_send_state->channels_ready);
+ /*
+ * next_channel can remain from a previous migration that was
+ * using more channels, so ensure it doesn't overflow if the
+ * limit is lower now.
+ */
+ next_channel %= migrate_multifd_channels();
+ for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
+ p = &multifd_send_state->params[i];
+
+ qemu_mutex_lock(&p->mutex);
+ if (p->quit) {
+ error_report("%s: channel %d has already quit!", __func__, i);
+ qemu_mutex_unlock(&p->mutex);
+ return -1;
+ }
+ if (!p->pending_job) {
+ p->pending_job++;
+ next_channel = (i + 1) % migrate_multifd_channels();
+ break;
+ }
+ qemu_mutex_unlock(&p->mutex);
+ }
+ assert(!p->pages->num);
+ assert(!p->pages->block);
+
+ p->packet_num = multifd_send_state->packet_num++;
+ multifd_send_state->pages = p->pages;
+ p->pages = pages;
+ transferred = ((uint64_t) pages->num) * p->page_size + p->packet_len;
+ qemu_file_acct_rate_limit(f, transferred);
+ ram_counters.multifd_bytes += transferred;
+ stat64_add(&ram_atomic_counters.transferred, transferred);
+ qemu_mutex_unlock(&p->mutex);
+ qemu_sem_post(&p->sem);
+
+ return 1;
+}
+
+int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
+{
+ MultiFDPages_t *pages = multifd_send_state->pages;
+ bool changed = false;
+
+ if (!pages->block) {
+ pages->block = block;
+ }
+
+ if (pages->block == block) {
+ pages->offset[pages->num] = offset;
+ pages->num++;
+
+ if (pages->num < pages->allocated) {
+ return 1;
+ }
+ } else {
+ changed = true;
+ }
+
+ if (multifd_send_pages(f) < 0) {
+ return -1;
+ }
+
+ if (changed) {
+ return multifd_queue_page(f, block, offset);
+ }
+
+ return 1;
+}
+
+static void multifd_send_terminate_threads(Error *err)
+{
+ int i;
+
+ trace_multifd_send_terminate_threads(err != NULL);
+
+ if (err) {
+ MigrationState *s = migrate_get_current();
+ migrate_set_error(s, err);
+ if (s->state == MIGRATION_STATUS_SETUP ||
+ s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
+ s->state == MIGRATION_STATUS_DEVICE ||
+ s->state == MIGRATION_STATUS_ACTIVE) {
+ migrate_set_state(&s->state, s->state,
+ MIGRATION_STATUS_FAILED);
+ }
+ }
+
+ /*
+ * We don't want to exit the threads twice. Depending on where we get
+ * the error, or if there are two independent errors in two threads at
+ * the same time, we can end up calling this function twice.
+ */
+ if (qatomic_xchg(&multifd_send_state->exiting, 1)) {
+ return;
+ }
+
+ for (i = 0; i < migrate_multifd_channels(); i++) {
+ MultiFDSendParams *p = &multifd_send_state->params[i];
+
+ qemu_mutex_lock(&p->mutex);
+ p->quit = true;
+ qemu_sem_post(&p->sem);
+ if (p->c) {
+ qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+ }
+ qemu_mutex_unlock(&p->mutex);
+ }
+}
+
+void multifd_save_cleanup(void)
+{
+ int i;
+
+ if (!migrate_use_multifd() || !migrate_multi_channels_is_allowed()) {
+ return;
+ }
+ multifd_send_terminate_threads(NULL);
+ for (i = 0; i < migrate_multifd_channels(); i++) {
+ MultiFDSendParams *p = &multifd_send_state->params[i];
+
+ if (p->running) {
+ qemu_thread_join(&p->thread);
+ }
+ }
+ for (i = 0; i < migrate_multifd_channels(); i++) {
+ MultiFDSendParams *p = &multifd_send_state->params[i];
+ Error *local_err = NULL;
+
+ if (p->registered_yank) {
+ migration_ioc_unregister_yank(p->c);
+ }
+ socket_send_channel_destroy(p->c);
+ p->c = NULL;
+ qemu_mutex_destroy(&p->mutex);
+ qemu_sem_destroy(&p->sem);
+ qemu_sem_destroy(&p->sem_sync);
+ g_free(p->name);
+ p->name = NULL;
+ multifd_pages_clear(p->pages);
+ p->pages = NULL;
+ p->packet_len = 0;
+ g_free(p->packet);
+ p->packet = NULL;
+ g_free(p->iov);
+ p->iov = NULL;
+ g_free(p->normal);
+ p->normal = NULL;
+ multifd_send_state->ops->send_cleanup(p, &local_err);
+ if (local_err) {
+ migrate_set_error(migrate_get_current(), local_err);
+ error_free(local_err);
+ }
+ }
+ qemu_sem_destroy(&multifd_send_state->channels_ready);
+ g_free(multifd_send_state->params);
+ multifd_send_state->params = NULL;
+ multifd_pages_clear(multifd_send_state->pages);
+ multifd_send_state->pages = NULL;
+ g_free(multifd_send_state);
+ multifd_send_state = NULL;
+}
+
+static int multifd_zero_copy_flush(QIOChannel *c)
+{
+ int ret;
+ Error *err = NULL;
+
+ ret = qio_channel_flush(c, &err);
+ if (ret < 0) {
+ error_report_err(err);
+ return -1;
+ }
+ if (ret == 1) {
+ dirty_sync_missed_zero_copy();
+ }
+
+ return ret;
+}
+
+int multifd_send_sync_main(QEMUFile *f)
+{
+ int i;
+ bool flush_zero_copy;
+
+ if (!migrate_use_multifd()) {
+ return 0;
+ }
+ if (multifd_send_state->pages->num) {
+ if (multifd_send_pages(f) < 0) {
+ error_report("%s: multifd_send_pages fail", __func__);
+ return -1;
+ }
+ }
+
+ /*
+ * When using zero-copy, it's necessary to flush the pages before any of
+ * the pages can be sent again, so we'll make sure the new version of the
+ * pages will always arrive _later_ than the old pages.
+ *
+ * Currently we achieve this by flushing the zero-page requested writes
+ * per ram iteration, but in the future we could potentially optimize it
+ * to be less frequent, e.g. only after we finished one whole scanning of
+ * all the dirty bitmaps.
+ */
+
+ flush_zero_copy = migrate_use_zero_copy_send();
+
+ for (i = 0; i < migrate_multifd_channels(); i++) {
+ MultiFDSendParams *p = &multifd_send_state->params[i];
+
+ trace_multifd_send_sync_main_signal(p->id);
+
+ qemu_mutex_lock(&p->mutex);
+
+ if (p->quit) {
+ error_report("%s: channel %d has already quit", __func__, i);
+ qemu_mutex_unlock(&p->mutex);
+ return -1;
+ }
+
+ p->packet_num = multifd_send_state->packet_num++;
+ p->flags |= MULTIFD_FLAG_SYNC;
+ p->pending_job++;
+ qemu_file_acct_rate_limit(f, p->packet_len);
+ ram_counters.multifd_bytes += p->packet_len;
+ stat64_add(&ram_atomic_counters.transferred, p->packet_len);
+ qemu_mutex_unlock(&p->mutex);
+ qemu_sem_post(&p->sem);
+
+ if (flush_zero_copy && p->c && (multifd_zero_copy_flush(p->c) < 0)) {
+ return -1;
+ }
+ }
+ for (i = 0; i < migrate_multifd_channels(); i++) {
+ MultiFDSendParams *p = &multifd_send_state->params[i];
+
+ trace_multifd_send_sync_main_wait(p->id);
+ qemu_sem_wait(&p->sem_sync);
+ }
+ trace_multifd_send_sync_main(multifd_send_state->packet_num);
+
+ return 0;
+}
+
+static void *multifd_send_thread(void *opaque)
+{
+ MultiFDSendParams *p = opaque;
+ Error *local_err = NULL;
+ int ret = 0;
+ bool use_zero_copy_send = migrate_use_zero_copy_send();
+
+ trace_multifd_send_thread_start(p->id);
+ rcu_register_thread();
+
+ if (multifd_send_initial_packet(p, &local_err) < 0) {
+ ret = -1;
+ goto out;
+ }
+ /* initial packet */
+ p->num_packets = 1;
+
+ while (true) {
+ qemu_sem_wait(&p->sem);
+
+ if (qatomic_read(&multifd_send_state->exiting)) {
+ break;
+ }
+ qemu_mutex_lock(&p->mutex);
+
+ if (p->pending_job) {
+ uint64_t packet_num = p->packet_num;
+ uint32_t flags = p->flags;
+ p->normal_num = 0;
+
+ if (use_zero_copy_send) {
+ p->iovs_num = 0;
+ } else {
+ p->iovs_num = 1;
+ }
+
+ for (int i = 0; i < p->pages->num; i++) {
+ p->normal[p->normal_num] = p->pages->offset[i];
+ p->normal_num++;
+ }
+
+ if (p->normal_num) {
+ ret = multifd_send_state->ops->send_prepare(p, &local_err);
+ if (ret != 0) {
+ qemu_mutex_unlock(&p->mutex);
+ break;
+ }
+ }
+ multifd_send_fill_packet(p);
+ p->flags = 0;
+ p->num_packets++;
+ p->total_normal_pages += p->normal_num;
+ p->pages->num = 0;
+ p->pages->block = NULL;
+ qemu_mutex_unlock(&p->mutex);
+
+ trace_multifd_send(p->id, packet_num, p->normal_num, flags,
+ p->next_packet_size);
+
+ if (use_zero_copy_send) {
+ /* Send header first, without zerocopy */
+ ret = qio_channel_write_all(p->c, (void *)p->packet,
+ p->packet_len, &local_err);
+ if (ret != 0) {
+ break;
+ }
+ } else {
+ /* Send header using the same writev call */
+ p->iov[0].iov_len = p->packet_len;
+ p->iov[0].iov_base = p->packet;
+ }
+
+ ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL,
+ 0, p->write_flags, &local_err);
+ if (ret != 0) {
+ break;
+ }
+
+ qemu_mutex_lock(&p->mutex);
+ p->pending_job--;
+ qemu_mutex_unlock(&p->mutex);
+
+ if (flags & MULTIFD_FLAG_SYNC) {
+ qemu_sem_post(&p->sem_sync);
+ }
+ qemu_sem_post(&multifd_send_state->channels_ready);
+ } else if (p->quit) {
+ qemu_mutex_unlock(&p->mutex);
+ break;
+ } else {
+ qemu_mutex_unlock(&p->mutex);
+ /* sometimes there are spurious wakeups */
+ }
+ }
+
+out:
+ if (local_err) {
+ trace_multifd_send_error(p->id);
+ multifd_send_terminate_threads(local_err);
+ error_free(local_err);
+ }
+
+ /*
+ * An error happened and this thread is about to exit, but it can't just
+ * leave: wake up anyone waiting on it first.
+ */
+ if (ret != 0) {
+ qemu_sem_post(&p->sem_sync);
+ qemu_sem_post(&multifd_send_state->channels_ready);
+ }
+
+ qemu_mutex_lock(&p->mutex);
+ p->running = false;
+ qemu_mutex_unlock(&p->mutex);
+
+ rcu_unregister_thread();
+ trace_multifd_send_thread_end(p->id, p->num_packets, p->total_normal_pages);
+
+ return NULL;
+}
+
+static bool multifd_channel_connect(MultiFDSendParams *p,
+ QIOChannel *ioc,
+ Error *error);
+
+static void multifd_tls_outgoing_handshake(QIOTask *task,
+ gpointer opaque)
+{
+ MultiFDSendParams *p = opaque;
+ QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task));
+ Error *err = NULL;
+
+ if (qio_task_propagate_error(task, &err)) {
+ trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err));
+ } else {
+ trace_multifd_tls_outgoing_handshake_complete(ioc);
+ }
+
+ if (!multifd_channel_connect(p, ioc, err)) {
+ /*
+ * An error happened: mark the multifd_send_thread status as 'quit' even
+ * though the thread was never created, then wake up anyone waiting on it.
+ */
+ p->quit = true;
+ qemu_sem_post(&multifd_send_state->channels_ready);
+ qemu_sem_post(&p->sem_sync);
+ }
+}
+
+static void *multifd_tls_handshake_thread(void *opaque)
+{
+ MultiFDSendParams *p = opaque;
+ QIOChannelTLS *tioc = QIO_CHANNEL_TLS(p->c);
+
+ qio_channel_tls_handshake(tioc,
+ multifd_tls_outgoing_handshake,
+ p,
+ NULL,
+ NULL);
+ return NULL;
+}
+
+static void multifd_tls_channel_connect(MultiFDSendParams *p,
+ QIOChannel *ioc,
+ Error **errp)
+{
+ MigrationState *s = migrate_get_current();
+ const char *hostname = s->hostname;
+ QIOChannelTLS *tioc;
+
+ tioc = migration_tls_client_create(s, ioc, hostname, errp);
+ if (!tioc) {
+ return;
+ }
+
+ object_unref(OBJECT(ioc));
+ trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname);
+ qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing");
+ p->c = QIO_CHANNEL(tioc);
+ qemu_thread_create(&p->thread, "multifd-tls-handshake-worker",
+ multifd_tls_handshake_thread, p,
+ QEMU_THREAD_JOINABLE);
+}
+
+static bool multifd_channel_connect(MultiFDSendParams *p,
+ QIOChannel *ioc,
+ Error *error)
+{
+ trace_multifd_set_outgoing_channel(
+ ioc, object_get_typename(OBJECT(ioc)),
+ migrate_get_current()->hostname, error);
+
+ if (!error) {
+ if (migrate_channel_requires_tls_upgrade(ioc)) {
+ multifd_tls_channel_connect(p, ioc, &error);
+ if (!error) {
+ /*
+ * tls_channel_connect will call back to this
+ * function after the TLS handshake,
+ * so we mustn't call multifd_send_thread until then
+ */
+ return true;
+ } else {
+ return false;
+ }
+ } else {
+ migration_ioc_register_yank(ioc);
+ p->registered_yank = true;
+ p->c = ioc;
+ qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
+ QEMU_THREAD_JOINABLE);
+ }
+ return true;
+ }
+
+ return false;
+}
+
+static void multifd_new_send_channel_cleanup(MultiFDSendParams *p,
+ QIOChannel *ioc, Error *err)
+{
+ migrate_set_error(migrate_get_current(), err);
+ /* An error happened; wake up anyone waiting on this channel */
+ qemu_sem_post(&multifd_send_state->channels_ready);
+ qemu_sem_post(&p->sem_sync);
+ /*
+ * Although multifd_send_thread was never created, the main migration
+ * thread still needs to judge whether it is running, so mark its
+ * status accordingly.
+ */
+ p->quit = true;
+ object_unref(OBJECT(ioc));
+ error_free(err);
+}
+
+static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
+{
+ MultiFDSendParams *p = opaque;
+ QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
+ Error *local_err = NULL;
+
+ trace_multifd_new_send_channel_async(p->id);
+ if (qio_task_propagate_error(task, &local_err)) {
+ goto cleanup;
+ } else {
+ p->c = QIO_CHANNEL(sioc);
+ qio_channel_set_delay(p->c, false);
+ p->running = true;
+ if (!multifd_channel_connect(p, sioc, local_err)) {
+ goto cleanup;
+ }
+ return;
+ }
+
+cleanup:
+ multifd_new_send_channel_cleanup(p, sioc, local_err);
+}
+
+int multifd_save_setup(Error **errp)
+{
+ int thread_count;
+ uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
+ uint8_t i;
+
+ if (!migrate_use_multifd()) {
+ return 0;
+ }
+ if (!migrate_multi_channels_is_allowed()) {
+ error_setg(errp, "multifd is not supported by current protocol");
+ return -1;
+ }
+
+ thread_count = migrate_multifd_channels();
+ multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
+ multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
+ multifd_send_state->pages = multifd_pages_init(page_count);
+ qemu_sem_init(&multifd_send_state->channels_ready, 0);
+ qatomic_set(&multifd_send_state->exiting, 0);
+ multifd_send_state->ops = multifd_ops[migrate_multifd_compression()];
+
+ for (i = 0; i < thread_count; i++) {
+ MultiFDSendParams *p = &multifd_send_state->params[i];
+
+ qemu_mutex_init(&p->mutex);
+ qemu_sem_init(&p->sem, 0);
+ qemu_sem_init(&p->sem_sync, 0);
+ p->quit = false;
+ p->pending_job = 0;
+ p->id = i;
+ p->pages = multifd_pages_init(page_count);
+ p->packet_len = sizeof(MultiFDPacket_t)
+ + sizeof(uint64_t) * page_count;
+ p->packet = g_malloc0(p->packet_len);
+ p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
+ p->packet->version = cpu_to_be32(MULTIFD_VERSION);
+ p->name = g_strdup_printf("multifdsend_%d", i);
+ /* We need one extra place for the packet header */
+ p->iov = g_new0(struct iovec, page_count + 1);
+ p->normal = g_new0(ram_addr_t, page_count);
+ p->page_size = qemu_target_page_size();
+ p->page_count = page_count;
+
+ if (migrate_use_zero_copy_send()) {
+ p->write_flags = QIO_CHANNEL_WRITE_FLAG_ZERO_COPY;
+ } else {
+ p->write_flags = 0;
+ }
+
+ socket_send_channel_create(multifd_new_send_channel_async, p);
+ }
+
+ for (i = 0; i < thread_count; i++) {
+ MultiFDSendParams *p = &multifd_send_state->params[i];
+ Error *local_err = NULL;
+ int ret;
+
+ ret = multifd_send_state->ops->send_setup(p, &local_err);
+ if (ret) {
+ error_propagate(errp, local_err);
+ return ret;
+ }
+ }
+ return 0;
+}
+
+struct {
+ MultiFDRecvParams *params;
+ /* number of created threads */
+ int count;
+ /* syncs main thread and channels */
+ QemuSemaphore sem_sync;
+ /* global number of generated multifd packets */
+ uint64_t packet_num;
+ /* multifd ops */
+ MultiFDMethods *ops;
+} *multifd_recv_state;
+
+static void multifd_recv_terminate_threads(Error *err)
+{
+ int i;
+
+ trace_multifd_recv_terminate_threads(err != NULL);
+
+ if (err) {
+ MigrationState *s = migrate_get_current();
+ migrate_set_error(s, err);
+ if (s->state == MIGRATION_STATUS_SETUP ||
+ s->state == MIGRATION_STATUS_ACTIVE) {
+ migrate_set_state(&s->state, s->state,
+ MIGRATION_STATUS_FAILED);
+ }
+ }
+
+ for (i = 0; i < migrate_multifd_channels(); i++) {
+ MultiFDRecvParams *p = &multifd_recv_state->params[i];
+
+ qemu_mutex_lock(&p->mutex);
+ p->quit = true;
+ /*
+ * We could arrive here for two reasons:
+ * - normal quit, i.e. everything went fine, just finished
+ * - error quit: We close the channels so the channel threads
+ * finish the qio_channel_read_all_eof()
+ */
+ if (p->c) {
+ qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+ }
+ qemu_mutex_unlock(&p->mutex);
+ }
+}
+
+int multifd_load_cleanup(Error **errp)
+{
+ int i;
+
+ if (!migrate_use_multifd() || !migrate_multi_channels_is_allowed()) {
+ return 0;
+ }
+ multifd_recv_terminate_threads(NULL);
+ for (i = 0; i < migrate_multifd_channels(); i++) {
+ MultiFDRecvParams *p = &multifd_recv_state->params[i];
+
+ if (p->running) {
+ p->quit = true;
+ /*
+ * multifd_recv_thread may be blocked in the MULTIFD_FLAG_SYNC handling
+ * code; waking it up here is harmless during the cleanup phase.
+ */
+ qemu_sem_post(&p->sem_sync);
+ qemu_thread_join(&p->thread);
+ }
+ }
+ for (i = 0; i < migrate_multifd_channels(); i++) {
+ MultiFDRecvParams *p = &multifd_recv_state->params[i];
+
+ migration_ioc_unregister_yank(p->c);
+ object_unref(OBJECT(p->c));
+ p->c = NULL;
+ qemu_mutex_destroy(&p->mutex);
+ qemu_sem_destroy(&p->sem_sync);
+ g_free(p->name);
+ p->name = NULL;
+ p->packet_len = 0;
+ g_free(p->packet);
+ p->packet = NULL;
+ g_free(p->iov);
+ p->iov = NULL;
+ g_free(p->normal);
+ p->normal = NULL;
+ multifd_recv_state->ops->recv_cleanup(p);
+ }
+ qemu_sem_destroy(&multifd_recv_state->sem_sync);
+ g_free(multifd_recv_state->params);
+ multifd_recv_state->params = NULL;
+ g_free(multifd_recv_state);
+ multifd_recv_state = NULL;
+
+ return 0;
+}
+
+void multifd_recv_sync_main(void)
+{
+ int i;
+
+ if (!migrate_use_multifd()) {
+ return;
+ }
+ for (i = 0; i < migrate_multifd_channels(); i++) {
+ MultiFDRecvParams *p = &multifd_recv_state->params[i];
+
+ trace_multifd_recv_sync_main_wait(p->id);
+ qemu_sem_wait(&multifd_recv_state->sem_sync);
+ }
+ for (i = 0; i < migrate_multifd_channels(); i++) {
+ MultiFDRecvParams *p = &multifd_recv_state->params[i];
+
+ WITH_QEMU_LOCK_GUARD(&p->mutex) {
+ if (multifd_recv_state->packet_num < p->packet_num) {
+ multifd_recv_state->packet_num = p->packet_num;
+ }
+ }
+ trace_multifd_recv_sync_main_signal(p->id);
+ qemu_sem_post(&p->sem_sync);
+ }
+ trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
+}
+
+static void *multifd_recv_thread(void *opaque)
+{
+ MultiFDRecvParams *p = opaque;
+ Error *local_err = NULL;
+ int ret;
+
+ trace_multifd_recv_thread_start(p->id);
+ rcu_register_thread();
+
+ while (true) {
+ uint32_t flags;
+
+ if (p->quit) {
+ break;
+ }
+
+ ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
+ p->packet_len, &local_err);
+ if (ret == 0) { /* EOF */
+ break;
+ }
+ if (ret == -1) { /* Error */
+ break;
+ }
+
+ qemu_mutex_lock(&p->mutex);
+ ret = multifd_recv_unfill_packet(p, &local_err);
+ if (ret) {
+ qemu_mutex_unlock(&p->mutex);
+ break;
+ }
+
+ flags = p->flags;
+ /* recv methods don't know how to handle the SYNC flag */
+ p->flags &= ~MULTIFD_FLAG_SYNC;
+ trace_multifd_recv(p->id, p->packet_num, p->normal_num, flags,
+ p->next_packet_size);
+ p->num_packets++;
+ p->total_normal_pages += p->normal_num;
+ qemu_mutex_unlock(&p->mutex);
+
+ if (p->normal_num) {
+ ret = multifd_recv_state->ops->recv_pages(p, &local_err);
+ if (ret != 0) {
+ break;
+ }
+ }
+
+ if (flags & MULTIFD_FLAG_SYNC) {
+ qemu_sem_post(&multifd_recv_state->sem_sync);
+ qemu_sem_wait(&p->sem_sync);
+ }
+ }
+
+ if (local_err) {
+ multifd_recv_terminate_threads(local_err);
+ error_free(local_err);
+ }
+ qemu_mutex_lock(&p->mutex);
+ p->running = false;
+ qemu_mutex_unlock(&p->mutex);
+
+ rcu_unregister_thread();
+ trace_multifd_recv_thread_end(p->id, p->num_packets, p->total_normal_pages);
+
+ return NULL;
+}
+
+int multifd_load_setup(Error **errp)
+{
+ int thread_count;
+ uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
+ uint8_t i;
+
+ /*
+ * Return successfully if multiFD recv state is already initialised
+ * or multiFD is not enabled.
+ */
+ if (multifd_recv_state || !migrate_use_multifd()) {
+ return 0;
+ }
+
+ if (!migrate_multi_channels_is_allowed()) {
+ error_setg(errp, "multifd is not supported by current protocol");
+ return -1;
+ }
+ thread_count = migrate_multifd_channels();
+ multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
+ multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
+ qatomic_set(&multifd_recv_state->count, 0);
+ qemu_sem_init(&multifd_recv_state->sem_sync, 0);
+ multifd_recv_state->ops = multifd_ops[migrate_multifd_compression()];
+
+ for (i = 0; i < thread_count; i++) {
+ MultiFDRecvParams *p = &multifd_recv_state->params[i];
+
+ qemu_mutex_init(&p->mutex);
+ qemu_sem_init(&p->sem_sync, 0);
+ p->quit = false;
+ p->id = i;
+ p->packet_len = sizeof(MultiFDPacket_t)
+ + sizeof(uint64_t) * page_count;
+ p->packet = g_malloc0(p->packet_len);
+ p->name = g_strdup_printf("multifdrecv_%d", i);
+ p->iov = g_new0(struct iovec, page_count);
+ p->normal = g_new0(ram_addr_t, page_count);
+ p->page_count = page_count;
+ p->page_size = qemu_target_page_size();
+ }
+
+ for (i = 0; i < thread_count; i++) {
+ MultiFDRecvParams *p = &multifd_recv_state->params[i];
+ Error *local_err = NULL;
+ int ret;
+
+ ret = multifd_recv_state->ops->recv_setup(p, &local_err);
+ if (ret) {
+ error_propagate(errp, local_err);
+ return ret;
+ }
+ }
+ return 0;
+}
+
+bool multifd_recv_all_channels_created(void)
+{
+ int thread_count = migrate_multifd_channels();
+
+ if (!migrate_use_multifd()) {
+ return true;
+ }
+
+ if (!multifd_recv_state) {
+ /* Called before any connections created */
+ return false;
+ }
+
+ return thread_count == qatomic_read(&multifd_recv_state->count);
+}
+
+/*
+ * Try to receive all multifd channels to get ready for the migration.
+ * Sets @errp when failing to receive the current channel.
+ */
+void multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
+{
+ MultiFDRecvParams *p;
+ Error *local_err = NULL;
+ int id;
+
+ id = multifd_recv_initial_packet(ioc, &local_err);
+ if (id < 0) {
+ multifd_recv_terminate_threads(local_err);
+ error_propagate_prepend(errp, local_err,
+ "failed to receive packet"
+ " via multifd channel %d: ",
+ qatomic_read(&multifd_recv_state->count));
+ return;
+ }
+ trace_multifd_recv_new_channel(id);
+
+ p = &multifd_recv_state->params[id];
+ if (p->c != NULL) {
+ error_setg(&local_err, "multifd: received id '%d' already setup'",
+ id);
+ multifd_recv_terminate_threads(local_err);
+ error_propagate(errp, local_err);
+ return;
+ }
+ p->c = ioc;
+ object_ref(OBJECT(ioc));
+ /* initial packet */
+ p->num_packets = 1;
+
+ p->running = true;
+ qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
+ QEMU_THREAD_JOINABLE);
+ qatomic_inc(&multifd_recv_state->count);
+}
diff --git a/migration/multifd.h b/migration/multifd.h
index e2802a9..ff3aa2e 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -18,7 +18,7 @@
int multifd_load_setup(Error **errp);
int multifd_load_cleanup(Error **errp);
bool multifd_recv_all_channels_created(void);
-bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp);
+void multifd_recv_new_channel(QIOChannel *ioc, Error **errp);
void multifd_recv_sync_main(void);
int multifd_send_sync_main(QEMUFile *f);
int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset);
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index b9a37ef..b98e95d 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -37,6 +37,7 @@
#include "qemu-file.h"
#include "yank_functions.h"
#include "tls.h"
+#include "qemu/userfaultfd.h"
/* Arbitrary limit on size of each discard command,
* keeps them around ~200 bytes
@@ -226,11 +227,9 @@
int ufd;
bool ret = true;
- /* if we are here __NR_userfaultfd should exists */
- ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
+ ufd = uffd_open(O_CLOEXEC);
if (ufd == -1) {
- error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
- strerror(errno));
+ error_report("%s: uffd_open() failed: %s", __func__, strerror(errno));
return false;
}
@@ -375,7 +374,7 @@
goto out;
}
- ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
+ ufd = uffd_open(O_CLOEXEC);
if (ufd == -1) {
error_report("%s: userfaultfd not available: %s", __func__,
strerror(errno));
@@ -1160,7 +1159,7 @@
int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
/* Open the fd for the kernel to give us userfaults */
- mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ mis->userfault_fd = uffd_open(O_CLOEXEC | O_NONBLOCK);
if (mis->userfault_fd == -1) {
error_report("%s: Failed to open userfault fd: %s", __func__,
strerror(errno));
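
All three call sites in this file switch from issuing the raw userfaultfd syscall to the uffd_open() helper declared in qemu/userfaultfd.h. Its implementation lives in util/userfaultfd.c and is not part of this diff; below is a minimal sketch of what such a wrapper plausibly looks like (an assumption for illustration, not the actual QEMU code):

#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>

int uffd_open(int flags)
{
#if defined(__NR_userfaultfd)
    /* Thin wrapper around the Linux userfaultfd(2) syscall */
    return syscall(__NR_userfaultfd, flags);
#else
    errno = ENOSYS;
    return -1;
#endif
}
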
@@ -1539,7 +1538,7 @@
}
}
-bool postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
+void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
{
/*
* The new loading channel has its own threads, so it needs to be
@@ -1548,9 +1547,6 @@
qemu_file_set_blocking(file, true);
mis->postcopy_qemufile_dst = file;
trace_postcopy_preempt_new_channel();
-
- /* Start the migration immediately */
- return true;
}
/*
diff --git a/migration/postcopy-ram.h b/migration/postcopy-ram.h
index 6147bf7..25881c4 100644
--- a/migration/postcopy-ram.h
+++ b/migration/postcopy-ram.h
@@ -190,7 +190,7 @@
RAM_CHANNEL_MAX,
};
-bool postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file);
+void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file);
int postcopy_preempt_setup(MigrationState *s, Error **errp);
int postcopy_preempt_wait_channel(MigrationState *s);
diff --git a/migration/ram.c b/migration/ram.c
index 334309f..b966e14 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1774,13 +1774,15 @@
static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
ram_addr_t size)
{
+ const ram_addr_t end = offset + size;
+
/*
* We read one byte of each page; this will preallocate page tables if
* required and populate the shared zeropage on MAP_PRIVATE anonymous memory
* where no page was populated yet. This might require adaption when
* supporting other mappings, like shmem.
*/
- for (; offset < size; offset += block->page_size) {
+ for (; offset < end; offset += block->page_size) {
char tmp = *((char *)block->host + offset);
/* Don't optimize the read out */
@@ -1863,6 +1865,39 @@
}
}
+static inline int uffd_protect_section(MemoryRegionSection *section,
+ void *opaque)
+{
+ const hwaddr size = int128_get64(section->size);
+ const hwaddr offset = section->offset_within_region;
+ RAMBlock *rb = section->mr->ram_block;
+ int uffd_fd = (uintptr_t)opaque;
+
+ return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
+ false);
+}
+
+static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
+{
+ assert(rb->flags & RAM_UF_WRITEPROTECT);
+
+ /* See ram_block_populate_read() */
+ if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
+ RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
+ MemoryRegionSection section = {
+ .mr = rb->mr,
+ .offset_within_region = 0,
+ .size = rb->mr->size,
+ };
+
+ return ram_discard_manager_replay_populated(rdm, &section,
+ uffd_protect_section,
+ (void *)(uintptr_t)uffd_fd);
+ }
+ return uffd_change_protection(uffd_fd, rb->host,
+ rb->used_length, true, false);
+}
+
/*
* ram_write_tracking_start: start UFFD-WP memory tracking
*
@@ -1894,14 +1929,14 @@
block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
goto fail;
}
- /* Apply UFFD write protection to the block memory range */
- if (uffd_change_protection(rs->uffdio_fd, block->host,
- block->max_length, true, false)) {
- goto fail;
- }
block->flags |= RAM_UF_WRITEPROTECT;
memory_region_ref(block->mr);
+ /* Apply UFFD write protection to the block memory range */
+ if (ram_block_uffd_protect(block, uffd_fd)) {
+ goto fail;
+ }
+
trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
block->host, block->max_length);
}
@@ -1915,12 +1950,6 @@
if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
continue;
}
- /*
- * In case some memory block failed to be write-protected
- * remove protection and unregister all succeeded RAM blocks
- */
- uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
- false, false);
uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
/* Cleanup flags and remove reference */
block->flags &= ~RAM_UF_WRITEPROTECT;
@@ -1946,9 +1975,6 @@
if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
continue;
}
- /* Remove protection and unregister all affected RAM blocks */
- uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
- false, false);
uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
@@ -2319,8 +2345,25 @@
size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
pss->host_page_sending = true;
- pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
- pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
+ if (guest_pfns <= 1) {
+ /*
+ * This covers both when guest psize == host psize, or when guest
+ * has larger psize than the host (guest_pfns==0).
+ *
+ * For the latter, we always send one whole guest page per
+ * iteration of the host page (example: an Alpha VM on x86 host
+ * will have guest psize 8K while host psize 4K).
+ */
+ pss->host_page_start = pss->page;
+ pss->host_page_end = pss->page + 1;
+ } else {
+ /*
+ * The host page spans over multiple guest pages, we send them
+ * within the same host page iteration.
+ */
+ pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
+ pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
+ }
}
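
A worked example of the guest_pfns arithmetic above (illustrative numbers): an Alpha guest uses 8 KiB target pages (TARGET_PAGE_BITS = 13), so on an x86 host whose RAM blocks use 4 KiB pages guest_pfns = 4096 >> 13 = 0 and the new guest_pfns <= 1 branch sends exactly one guest page per host-page iteration. Conversely, a 2 MiB hugepage-backed block with 4 KiB target pages gives guest_pfns = 2097152 >> 12 = 512, so a single host-page iteration spans [ROUND_DOWN(page, 512), ROUND_UP(page + 1, 512)).
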
/*
@@ -3392,19 +3435,35 @@
return 0;
}
-static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
- uint64_t *res_precopy_only,
- uint64_t *res_compatible,
- uint64_t *res_postcopy_only)
+static void ram_state_pending_estimate(void *opaque,
+ uint64_t *res_precopy_only,
+ uint64_t *res_compatible,
+ uint64_t *res_postcopy_only)
{
RAMState **temp = opaque;
RAMState *rs = *temp;
- uint64_t remaining_size;
- remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
+ uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
- if (!migration_in_postcopy() &&
- remaining_size < max_size) {
+ if (migrate_postcopy_ram()) {
+ /* We can do postcopy, and all the data is postcopiable */
+ *res_postcopy_only += remaining_size;
+ } else {
+ *res_precopy_only += remaining_size;
+ }
+}
+
+static void ram_state_pending_exact(void *opaque,
+ uint64_t *res_precopy_only,
+ uint64_t *res_compatible,
+ uint64_t *res_postcopy_only)
+{
+ RAMState **temp = opaque;
+ RAMState *rs = *temp;
+
+ uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
+
+ if (!migration_in_postcopy()) {
qemu_mutex_lock_iothread();
WITH_RCU_READ_LOCK_GUARD() {
migration_bitmap_sync_precopy(rs);
@@ -4091,12 +4150,6 @@
return ret;
}
-static bool postcopy_is_advised(void)
-{
- PostcopyState ps = postcopy_state_get();
- return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
-}
-
static bool postcopy_is_running(void)
{
PostcopyState ps = postcopy_state_get();
@@ -4167,7 +4220,7 @@
MigrationIncomingState *mis = migration_incoming_get_current();
int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
/* ADVISE is earlier, it shows the source has the postcopy capability on */
- bool postcopy_advised = postcopy_is_advised();
+ bool postcopy_advised = migration_incoming_postcopy_advised();
if (!migrate_use_compression()) {
invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
}
@@ -4560,7 +4613,8 @@
.save_live_complete_postcopy = ram_save_complete,
.save_live_complete_precopy = ram_save_complete,
.has_postcopy = ram_has_postcopy,
- .save_live_pending = ram_save_pending,
+ .state_pending_exact = ram_state_pending_exact,
+ .state_pending_estimate = ram_state_pending_estimate,
.load_state = ram_load,
.save_cleanup = ram_save_cleanup,
.load_setup = ram_load_setup,
diff --git a/migration/rdma.c b/migration/rdma.c
index 94a55dd..288eadc 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -2785,7 +2785,8 @@
rdma = qatomic_rcu_read(&rioc->rdmaout);
if (!rdma) {
- return -EIO;
+ error_setg(errp, "RDMA control channel output is not set");
+ return -1;
}
CHECK_ERROR_STATE();
@@ -2797,7 +2798,8 @@
ret = qemu_rdma_write_flush(f, rdma);
if (ret < 0) {
rdma->error_state = ret;
- return ret;
+ error_setg(errp, "qemu_rdma_write_flush returned %d", ret);
+ return -1;
}
for (i = 0; i < niov; i++) {
@@ -2816,7 +2818,8 @@
if (ret < 0) {
rdma->error_state = ret;
- return ret;
+ error_setg(errp, "qemu_rdma_exchange_send returned %d", ret);
+ return -1;
}
data += len;
@@ -2854,6 +2857,7 @@
size_t niov,
int **fds,
size_t *nfds,
+ int flags,
Error **errp)
{
QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
@@ -2867,7 +2871,8 @@
rdma = qatomic_rcu_read(&rioc->rdmain);
if (!rdma) {
- return -EIO;
+ error_setg(errp, "RDMA control channel input is not set");
+ return -1;
}
CHECK_ERROR_STATE();
@@ -2903,7 +2908,8 @@
if (ret < 0) {
rdma->error_state = ret;
- return ret;
+ error_setg(errp, "qemu_rdma_exchange_recv returned %d", ret);
+ return -1;
}
/*
diff --git a/migration/savevm.c b/migration/savevm.c
index a783789..e9cf499 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -42,7 +42,6 @@
#include "postcopy-ram.h"
#include "qapi/error.h"
#include "qapi/qapi-commands-migration.h"
-#include "qapi/qmp/json-writer.h"
#include "qapi/clone-visitor.h"
#include "qapi/qapi-builtin-visit.h"
#include "qapi/qmp/qerror.h"
@@ -67,6 +66,7 @@
#include "net/announce.h"
#include "qemu/yank.h"
#include "yank_functions.h"
+#include "sysemu/qtest.h"
const unsigned int postcopy_ram_discard_version;
@@ -586,6 +586,7 @@
field++;
first = false;
}
+ assert(field->flags == VMS_END);
fprintf(out_file, "\n%*s]", indent, "");
}
if (vmsd->subsections != NULL) {
@@ -804,6 +805,42 @@
}
}
+/*
+ * Perform some basic checks on vmsd's at registration
+ * time.
+ */
+static void vmstate_check(const VMStateDescription *vmsd)
+{
+ const VMStateField *field = vmsd->fields;
+ const VMStateDescription **subsection = vmsd->subsections;
+
+ if (field) {
+ while (field->name) {
+ if (field->flags & (VMS_STRUCT | VMS_VSTRUCT)) {
+ /* Recurse to sub structures */
+ vmstate_check(field->vmsd);
+ }
+ /* Carry on */
+ field++;
+ }
+ /* Check for the end of field list canary */
+ if (field->flags != VMS_END) {
+ error_report("VMSTATE not ending with VMS_END: %s", vmsd->name);
+ g_assert_not_reached();
+ }
+ }
+
+ while (subsection && *subsection) {
+ /*
+ * The name of a subsection should start with the name of the
+ * current object.
+ */
+ assert(!strncmp(vmsd->name, (*subsection)->name, strlen(vmsd->name)));
+ vmstate_check(*subsection);
+ subsection++;
+ }
+}
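
For a concrete picture of the canary that vmstate_check() enforces, here is a minimal, hypothetical device description whose field list ends with the VMS_END terminator; DemoDevState and its field are invented for the example:

typedef struct DemoDevState {
    uint32_t counter;
} DemoDevState;

static const VMStateDescription vmstate_demo_dev = {
    .name = "demo-dev",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(counter, DemoDevState),
        VMSTATE_END_OF_LIST()  /* expands to a field with flags == VMS_END */
    },
};
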
+
int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id,
const VMStateDescription *vmsd,
void *opaque, int alias_id,
@@ -849,6 +886,11 @@
} else {
se->instance_id = instance_id;
}
+
+ /* Perform a recursive sanity check during the test runs */
+ if (qtest_enabled()) {
+ vmstate_check(vmsd);
+ }
assert(!se->compat || se->instance_id == 0);
savevm_state_handler_insert(se);
return 0;
@@ -898,17 +940,6 @@
}
}
-static int vmstate_save(QEMUFile *f, SaveStateEntry *se,
- JSONWriter *vmdesc)
-{
- trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
- if (!se->vmsd) {
- vmstate_save_old_style(f, se, vmdesc);
- return 0;
- }
- return vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
-}
-
/*
* Write the header for device section (QEMU_VM_SECTION START/END/PART/FULL)
*/
@@ -942,6 +973,43 @@
}
}
+static int vmstate_save(QEMUFile *f, SaveStateEntry *se, JSONWriter *vmdesc)
+{
+ int ret;
+
+ if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
+ return 0;
+ }
+ if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
+ trace_savevm_section_skip(se->idstr, se->section_id);
+ return 0;
+ }
+
+ trace_savevm_section_start(se->idstr, se->section_id);
+ save_section_header(f, se, QEMU_VM_SECTION_FULL);
+ if (vmdesc) {
+ json_writer_start_object(vmdesc, NULL);
+ json_writer_str(vmdesc, "name", se->idstr);
+ json_writer_int64(vmdesc, "instance_id", se->instance_id);
+ }
+
+ trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
+ if (!se->vmsd) {
+ vmstate_save_old_style(f, se, vmdesc);
+ } else {
+ ret = vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
+ if (ret) {
+ return ret;
+ }
+ }
+
+ trace_savevm_section_end(se->idstr, se->section_id, 0);
+ save_section_footer(f, se);
+ if (vmdesc) {
+ json_writer_end_object(vmdesc);
+ }
+ return 0;
+}
/**
* qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
* command and associated data.
@@ -1164,12 +1232,27 @@
void qemu_savevm_state_setup(QEMUFile *f)
{
+ MigrationState *ms = migrate_get_current();
SaveStateEntry *se;
Error *local_err = NULL;
int ret;
+ ms->vmdesc = json_writer_new(false);
+ json_writer_start_object(ms->vmdesc, NULL);
+ json_writer_int64(ms->vmdesc, "page_size", qemu_target_page_size());
+ json_writer_start_array(ms->vmdesc, "devices");
+
trace_savevm_state_setup();
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+ if (se->vmsd && se->vmsd->early_setup) {
+ ret = vmstate_save(f, se, ms->vmdesc);
+ if (ret) {
+ qemu_file_set_error(f, ret);
+ break;
+ }
+ continue;
+ }
+
if (!se->ops || !se->ops->save_setup) {
continue;
}
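
The loop above writes out, already during qemu_savevm_state_setup(), any device whose VMStateDescription sets the new early_setup flag (its declaration in the vmstate headers is not part of this excerpt); such devices are then skipped in the completion path below. A minimal, hypothetical description opting in (the device name and state type are invented):

typedef struct DemoEarlyState {
    uint8_t mode;
} DemoEarlyState;

static const VMStateDescription vmstate_demo_early = {
    .name = "demo-early-dev",
    .version_id = 1,
    .minimum_version_id = 1,
    .early_setup = true,
    .fields = (VMStateField[]) {
        VMSTATE_UINT8(mode, DemoEarlyState),
        VMSTATE_END_OF_LIST()
    },
};
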
@@ -1365,41 +1448,23 @@
bool in_postcopy,
bool inactivate_disks)
{
- g_autoptr(JSONWriter) vmdesc = NULL;
+ MigrationState *ms = migrate_get_current();
+ JSONWriter *vmdesc = ms->vmdesc;
int vmdesc_len;
SaveStateEntry *se;
int ret;
- vmdesc = json_writer_new(false);
- json_writer_start_object(vmdesc, NULL);
- json_writer_int64(vmdesc, "page_size", qemu_target_page_size());
- json_writer_start_array(vmdesc, "devices");
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
-
- if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
- continue;
- }
- if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
- trace_savevm_section_skip(se->idstr, se->section_id);
+ if (se->vmsd && se->vmsd->early_setup) {
+ /* Already saved during qemu_savevm_state_setup(). */
continue;
}
- trace_savevm_section_start(se->idstr, se->section_id);
-
- json_writer_start_object(vmdesc, NULL);
- json_writer_str(vmdesc, "name", se->idstr);
- json_writer_int64(vmdesc, "instance_id", se->instance_id);
-
- save_section_header(f, se, QEMU_VM_SECTION_FULL);
ret = vmstate_save(f, se, vmdesc);
if (ret) {
qemu_file_set_error(f, ret);
return ret;
}
- trace_savevm_section_end(se->idstr, se->section_id, 0);
- save_section_footer(f, se);
-
- json_writer_end_object(vmdesc);
}
if (inactivate_disks) {
@@ -1428,6 +1493,10 @@
qemu_put_buffer(f, (uint8_t *)json_writer_get(vmdesc), vmdesc_len);
}
+ /* Free it now to detect any inconsistencies. */
+ json_writer_free(vmdesc);
+ ms->vmdesc = NULL;
+
return 0;
}
@@ -1472,10 +1541,9 @@
* the result is split into the amount for units that can and
* for units that can't do postcopy.
*/
-void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
- uint64_t *res_precopy_only,
- uint64_t *res_compatible,
- uint64_t *res_postcopy_only)
+void qemu_savevm_state_pending_estimate(uint64_t *res_precopy_only,
+ uint64_t *res_compatible,
+ uint64_t *res_postcopy_only)
{
SaveStateEntry *se;
@@ -1483,9 +1551,8 @@
*res_compatible = 0;
*res_postcopy_only = 0;
-
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
- if (!se->ops || !se->ops->save_live_pending) {
+ if (!se->ops || !se->ops->state_pending_estimate) {
continue;
}
if (se->ops->is_active) {
@@ -1493,9 +1560,34 @@
continue;
}
}
- se->ops->save_live_pending(f, se->opaque, threshold_size,
- res_precopy_only, res_compatible,
- res_postcopy_only);
+ se->ops->state_pending_estimate(se->opaque,
+ res_precopy_only, res_compatible,
+ res_postcopy_only);
+ }
+}
+
+void qemu_savevm_state_pending_exact(uint64_t *res_precopy_only,
+ uint64_t *res_compatible,
+ uint64_t *res_postcopy_only)
+{
+ SaveStateEntry *se;
+
+ *res_precopy_only = 0;
+ *res_compatible = 0;
+ *res_postcopy_only = 0;
+
+ QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+ if (!se->ops || !se->ops->state_pending_exact) {
+ continue;
+ }
+ if (se->ops->is_active) {
+ if (!se->ops->is_active(se->opaque)) {
+ continue;
+ }
+ }
+ se->ops->state_pending_exact(se->opaque,
+ res_precopy_only, res_compatible,
+ res_postcopy_only);
}
}
@@ -1595,21 +1687,10 @@
if (se->is_ram) {
continue;
}
- if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
- continue;
- }
- if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
- continue;
- }
-
- save_section_header(f, se, QEMU_VM_SECTION_FULL);
-
ret = vmstate_save(f, se, NULL);
if (ret) {
return ret;
}
-
- save_section_footer(f, se);
}
qemu_put_byte(f, QEMU_VM_EOF);
diff --git a/migration/savevm.h b/migration/savevm.h
index 6461342..b1901e6 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -40,10 +40,12 @@
void qemu_savevm_state_complete_postcopy(QEMUFile *f);
int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
bool inactivate_disks);
-void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size,
- uint64_t *res_precopy_only,
- uint64_t *res_compatible,
- uint64_t *res_postcopy_only);
+void qemu_savevm_state_pending_exact(uint64_t *res_precopy_only,
+ uint64_t *res_compatible,
+ uint64_t *res_postcopy_only);
+void qemu_savevm_state_pending_estimate(uint64_t *res_precopy_only,
+ uint64_t *res_compatible,
+ uint64_t *res_postcopy_only);
void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
void qemu_savevm_send_open_return_path(QEMUFile *f);
int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len);
diff --git a/migration/threadinfo.c b/migration/threadinfo.c
new file mode 100644
index 0000000..1de8b31
--- /dev/null
+++ b/migration/threadinfo.c
@@ -0,0 +1,51 @@
+/*
+ * Migration Threads info
+ *
+ * Copyright (c) 2022 HUAWEI TECHNOLOGIES CO., LTD.
+ *
+ * Authors:
+ * Jiang Jiacheng <jiangjiacheng@huawei.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "threadinfo.h"
+
+static QLIST_HEAD(, MigrationThread) migration_threads;
+
+MigrationThread *MigrationThreadAdd(const char *name, int thread_id)
+{
+ MigrationThread *thread = g_new0(MigrationThread, 1);
+ thread->name = name;
+ thread->thread_id = thread_id;
+
+ QLIST_INSERT_HEAD(&migration_threads, thread, node);
+
+ return thread;
+}
+
+void MigrationThreadDel(MigrationThread *thread)
+{
+ if (thread) {
+ QLIST_REMOVE(thread, node);
+ g_free(thread);
+ }
+}
+
+MigrationThreadInfoList *qmp_query_migrationthreads(Error **errp)
+{
+ MigrationThreadInfoList *head = NULL;
+ MigrationThreadInfoList **tail = &head;
+ MigrationThread *thread = NULL;
+
+ QLIST_FOREACH(thread, &migration_threads, node) {
+ MigrationThreadInfo *info = g_new0(MigrationThreadInfo, 1);
+ info->name = g_strdup(thread->name);
+ info->thread_id = thread->thread_id;
+
+ QAPI_LIST_APPEND(tail, info);
+ }
+
+ return head;
+}
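
The registry is only useful if the worker threads enrol themselves. A minimal sketch of how a migration thread might do that, assuming the existing qemu_get_thread_id() helper; the worker function and the thread name are illustrative:

/* Illustrative worker; the function and thread name are made up. */
static void *example_migration_worker(void *opaque)
{
    MigrationThread *self;

    /* Register with the host TID so QMP can report it. */
    self = MigrationThreadAdd("live_migration", qemu_get_thread_id());

    /* ... perform the actual migration work ... */

    MigrationThreadDel(self);
    return NULL;
}
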
diff --git a/migration/threadinfo.h b/migration/threadinfo.h
new file mode 100644
index 0000000..4d69423
--- /dev/null
+++ b/migration/threadinfo.h
@@ -0,0 +1,28 @@
+/*
+ * Migration Threads info
+ *
+ * Copyright (c) 2022 HUAWEI TECHNOLOGIES CO., LTD.
+ *
+ * Authors:
+ * Jiang Jiacheng <jiangjiacheng@huawei.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/queue.h"
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-migration.h"
+
+typedef struct MigrationThread MigrationThread;
+
+struct MigrationThread {
+ const char *name; /* the name of the migration thread */
+ int thread_id; /* ID of the underlying host thread */
+ QLIST_ENTRY(MigrationThread) node;
+};
+
+MigrationThread *MigrationThreadAdd(const char *name, int thread_id);
+
+void MigrationThreadDel(MigrationThread *thread);
diff --git a/migration/trace-events b/migration/trace-events
index 57003ed..67b65a7 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -150,7 +150,8 @@
migrate_fd_error(const char *error_desc) "error=%s"
migrate_fd_cancel(void) ""
migrate_handle_rp_req_pages(const char *rbname, size_t start, size_t len) "in %s at 0x%zx len 0x%zx"
-migrate_pending(uint64_t size, uint64_t max, uint64_t pre, uint64_t compat, uint64_t post) "pending size %" PRIu64 " max %" PRIu64 " (pre = %" PRIu64 " compat=%" PRIu64 " post=%" PRIu64 ")"
+migrate_pending_exact(uint64_t size, uint64_t pre, uint64_t compat, uint64_t post) "exact pending size %" PRIu64 " (pre = %" PRIu64 " compat=%" PRIu64 " post=%" PRIu64 ")"
+migrate_pending_estimate(uint64_t size, uint64_t pre, uint64_t compat, uint64_t post) "estimate pending size %" PRIu64 " (pre = %" PRIu64 " compat=%" PRIu64 " post=%" PRIu64 ")"
migrate_send_rp_message(int msg_type, uint16_t len) "%d: len %d"
migrate_send_rp_recv_bitmap(char *name, int64_t size) "block '%s' size 0x%"PRIi64
migration_completion_file_err(void) ""
@@ -330,7 +331,7 @@
dirty_bitmap_save_iterate(int in_postcopy) "in postcopy: %d"
dirty_bitmap_save_complete_enter(void) ""
dirty_bitmap_save_complete_finish(void) ""
-dirty_bitmap_save_pending(uint64_t pending, uint64_t max_size) "pending %" PRIu64 " max: %" PRIu64
+dirty_bitmap_state_pending(uint64_t pending) "pending %" PRIu64
dirty_bitmap_load_complete(void) ""
dirty_bitmap_load_bits_enter(uint64_t first_sector, uint32_t nr_sectors) "chunk: %" PRIu64 " %" PRIu32
dirty_bitmap_load_bits_zeroes(void) ""
@@ -355,7 +356,7 @@
migration_block_flush_blks(const char *action, int submitted, int read_done, int transferred) "%s submitted %d read_done %d transferred %d"
migration_block_save(const char *mig_stage, int submitted, int transferred) "Enter save live %s submitted %d transferred %d"
migration_block_save_complete(void) "Block migration completed"
-migration_block_save_pending(uint64_t pending) "Enter save live pending %" PRIu64
+migration_block_state_pending(uint64_t pending) "Enter save live pending %" PRIu64
# page_cache.c
migration_pagecache_init(int64_t max_num_items) "Setting cache buckets to %" PRId64
diff --git a/migration/vmstate.c b/migration/vmstate.c
index 924494b..83ca4c7 100644
--- a/migration/vmstate.c
+++ b/migration/vmstate.c
@@ -154,6 +154,7 @@
}
field++;
}
+ assert(field->flags == VMS_END);
ret = vmstate_subsection_load(f, vmsd, opaque);
if (ret != 0) {
return ret;
@@ -408,6 +409,7 @@
}
field++;
}
+ assert(field->flags == VMS_END);
if (vmdesc) {
json_writer_end_array(vmdesc);
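
Both new asserts check the same invariant: when the field walk stops, it must have stopped on the VMS_END terminator rather than by running past the end of the array. A sketch of a correctly terminated field array; the device state type and its member are made up for illustration:

/* Illustrative device; ExampleDevState and its 'reg' member are made up. */
static const VMStateDescription vmstate_example_dev = {
    .name = "example-dev",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(reg, ExampleDevState),
        VMSTATE_END_OF_LIST()   /* expands to a field with .flags == VMS_END */
    }
};
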
diff --git a/qapi/migration.json b/qapi/migration.json
index 88ecf86..c84fa10 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -1959,6 +1959,35 @@
'returns': [ 'DirtyLimitInfo' ] }
##
+# @MigrationThreadInfo:
+#
+# Information about a migration thread
+#
+# @name: the name of the migration thread
+#
+# @thread-id: ID of the underlying host thread
+#
+# Since: 8.0
+##
+{ 'struct': 'MigrationThreadInfo',
+ 'data': {'name': 'str',
+ 'thread-id': 'int'} }
+
+##
+# @query-migrationthreads:
+#
+# Returns information about the migration threads that are
+# currently running
+#
+# Returns: a list of @MigrationThreadInfo, one entry per
+# migration thread
+#
+# Since: 8.0
+##
+{ 'command': 'query-migrationthreads',
+ 'returns': ['MigrationThreadInfo'] }
+
+##
# @snapshot-save:
#
# Save a VM snapshot
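
The command takes no arguments and simply returns a list of name/thread-id pairs. A sketch of an in-process consumer of the generated C binding, assuming the usual QAPI-generated list type and free helper; the wrapper function name is made up:

static void example_dump_migration_threads(Error **errp)
{
    MigrationThreadInfoList *head = qmp_query_migrationthreads(errp);

    for (MigrationThreadInfoList *item = head; item; item = item->next) {
        /* QAPI 'int' maps to int64_t in the generated C type. */
        printf("%s: tid %" PRId64 "\n",
               item->value->name, item->value->thread_id);
    }
    qapi_free_MigrationThreadInfoList(head);
}
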
diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c
index 196b78c..199227a 100644
--- a/scsi/qemu-pr-helper.c
+++ b/scsi/qemu-pr-helper.c
@@ -614,7 +614,7 @@
iov.iov_base = buf;
iov.iov_len = sz;
n_read = qio_channel_readv_full(QIO_CHANNEL(client->ioc), &iov, 1,
- &fds, &nfds, errp);
+ &fds, &nfds, 0, errp);
if (n_read == QIO_CHANNEL_ERR_BLOCK) {
qio_channel_yield(QIO_CHANNEL(client->ioc), G_IO_IN);
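
Each of the touched call sites gains a new integer argument just before errp; passing 0 keeps the previous behaviour. A sketch of the assumed updated prototype, based only on these call sites (the comment about a peek-style read flag is an assumption about what the new parameter carries):

ssize_t qio_channel_readv_full(QIOChannel *ioc,
                               const struct iovec *iov,
                               size_t niov,
                               int **fds,
                               size_t *nfds,
                               int flags,    /* read flags, 0 = default behaviour */
                               Error **errp);
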
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 1dd32c9..109bc8e 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -61,14 +61,14 @@
#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
#include <sys/eventfd.h>
#include <sys/ioctl.h>
-#include <linux/userfaultfd.h>
+#include "qemu/userfaultfd.h"
static bool ufd_version_check(void)
{
struct uffdio_api api_struct;
uint64_t ioctl_mask;
- int ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
+ int ufd = uffd_open(O_CLOEXEC);
if (ufd == -1) {
g_test_message("Skipping test: userfaultfd not available");
diff --git a/tests/qtest/tpm-emu.c b/tests/qtest/tpm-emu.c
index 73e0000..f05fe12 100644
--- a/tests/qtest/tpm-emu.c
+++ b/tests/qtest/tpm-emu.c
@@ -115,7 +115,7 @@
int *pfd = NULL;
size_t nfd = 0;
- qio_channel_readv_full(ioc, &iov, 1, &pfd, &nfd, &error_abort);
+ qio_channel_readv_full(ioc, &iov, 1, &pfd, &nfd, 0, &error_abort);
cmd = be32_to_cpu(cmd);
g_assert_cmpint(cmd, ==, CMD_SET_DATAFD);
g_assert_cmpint(nfd, ==, 1);
diff --git a/tests/unit/test-io-channel-socket.c b/tests/unit/test-io-channel-socket.c
index b36a5d9..b964bb2 100644
--- a/tests/unit/test-io-channel-socket.c
+++ b/tests/unit/test-io-channel-socket.c
@@ -460,6 +460,7 @@
G_N_ELEMENTS(iorecv),
&fdrecv,
&nfdrecv,
+ 0,
&error_abort);
g_assert(nfdrecv == G_N_ELEMENTS(fdsend));
diff --git a/util/userfaultfd.c b/util/userfaultfd.c
index f1cd6af..4953b31 100644
--- a/util/userfaultfd.c
+++ b/util/userfaultfd.c
@@ -19,6 +19,15 @@
#include <sys/syscall.h>
#include <sys/ioctl.h>
+int uffd_open(int flags)
+{
+#if defined(__NR_userfaultfd)
+ return syscall(__NR_userfaultfd, flags);
+#else
+ return -EINVAL;
+#endif
+}
+
/**
* uffd_query_features: query UFFD features
*
@@ -32,7 +41,7 @@
struct uffdio_api api_struct = { 0 };
int ret = -1;
- uffd_fd = syscall(__NR_userfaultfd, O_CLOEXEC);
+ uffd_fd = uffd_open(O_CLOEXEC);
if (uffd_fd < 0) {
trace_uffd_query_features_nosys(errno);
return -1;
@@ -69,7 +78,7 @@
uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0);
- uffd_fd = syscall(__NR_userfaultfd, flags);
+ uffd_fd = uffd_open(flags);
if (uffd_fd < 0) {
trace_uffd_create_fd_nosys(errno);
return -1;
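
With the syscall wrapped, callers only need to check for a negative descriptor, whether that comes from the kernel rejecting the syscall or from a build without __NR_userfaultfd. A minimal sketch of a caller; the function name is illustrative:

static int example_open_userfaultfd(void)
{
    int fd = uffd_open(O_CLOEXEC | O_NONBLOCK);

    if (fd < 0) {
        /* -1 with errno set, or -EINVAL when the syscall is not compiled in */
        return -1;
    }
    return fd;
}
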
diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
index 232984a..145eb17 100644
--- a/util/vhost-user-server.c
+++ b/util/vhost-user-server.c
@@ -116,7 +116,7 @@
* qio_channel_readv_full may have short reads, keeping calling it
* until getting VHOST_USER_HDR_SIZE or 0 bytes in total
*/
- rc = qio_channel_readv_full(ioc, &iov, 1, &fds, &nfds, &local_err);
+ rc = qio_channel_readv_full(ioc, &iov, 1, &fds, &nfds, 0, &local_err);
if (rc < 0) {
if (rc == QIO_CHANNEL_ERR_BLOCK) {
assert(local_err == NULL);