[RFC KERNEL PATCH 0/2] Add Dom0 NVDIMM support for Xen
Haozhong Zhang
2016-10-10 00:35:42 UTC
Overview
========
This RFC kernel patch series along with corresponding patch series of
Xen, QEMU and ndctl implements Xen vNVDIMM, which can map the host
NVDIMM devices to Xen HVM domU as vNVDIMM devices.

The Xen hypervisor does not include an NVDIMM driver, so it needs
assistance from the driver in the Dom0 Linux kernel to manage NVDIMM
devices. We currently support only NVDIMM devices in pmem mode.

Design and Implementation
=========================
The complete design can be found at
https://lists.xenproject.org/archives/html/xen-devel/2016-07/msg01921.html.

All patch series can be found at
Xen: https://github.com/hzzhan9/xen.git nvdimm-rfc-v1
QEMU: https://github.com/hzzhan9/qemu.git xen-nvdimm-rfc-v1
Linux kernel: https://github.com/hzzhan9/nvdimm.git xen-nvdimm-rfc-v1
ndctl: https://github.com/hzzhan9/ndctl.git pfn-xen-rfc-v1

The Xen hypervisor needs assistance from the Dom0 Linux kernel for the following tasks:
1) Reserve an area on NVDIMM devices for Xen hypervisor to place
memory management data structures, i.e. frame table and M2P table.
2) Report SPA ranges of NVDIMM devices and the reserved area to Xen
hypervisor.

For 1), Patch 1 adds a new mode, PFN_MODE_XEN, to pfn devices, which
makes the reservation for the Xen hypervisor.
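
As a rough, illustrative calculation (example numbers, not from the
patches), using the per-page sizes noted in Patch 1 (64 bytes per Xen
frame table entry, 8 bytes per M2P entry) and 4 KiB pages, a 128 GiB
pmem namespace would reserve approximately:

  pages:       128 GiB / 4 KiB    = 33,554,432
  frame table: 33,554,432 * 64 B  = 2 GiB
  M2P table:   33,554,432 *  8 B  = 256 MiB
  total:       ~2.25 GiB, i.e. ~1.8% of the namespace
  (each table is rounded up to HPAGE_SIZE in the actual code)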

For 2), Patch 2 uses a new Xen hypercall to report the address
information of pfn devices in PFN_MODE_XEN.

How to test
===========
Please refer to the cover letter of Xen patch series
"[RFC XEN PATCH 00/16] Add vNVDIMM support to HVM domains".


Haozhong Zhang (2):
nvdimm: add PFN_MODE_XEN to pfn device for Xen usage
xen, nvdimm: report pfn devices in PFN_MODE_XEN to Xen hypervisor

drivers/nvdimm/namespace_devs.c | 2 ++
drivers/nvdimm/nd.h | 7 +++++
drivers/nvdimm/pfn_devs.c | 37 +++++++++++++++++++++---
drivers/nvdimm/pmem.c | 61 ++++++++++++++++++++++++++++++++++++++--
drivers/xen/Makefile | 2 +-
drivers/xen/pmem.c | 53 ++++++++++++++++++++++++++++++++++
include/linux/pfn_t.h | 2 ++
include/xen/interface/platform.h | 13 +++++++++
include/xen/pmem.h | 32 +++++++++++++++++++++
9 files changed, 201 insertions(+), 8 deletions(-)
create mode 100644 drivers/xen/pmem.c
create mode 100644 include/xen/pmem.h
--
2.10.1
Haozhong Zhang
2016-10-10 00:35:50 UTC
A pfn device in PFN_MODE_XEN reserves an area for the Xen hypervisor to
place its own pmem management data structures (i.e. the frame table and
M2P table). The reserved area is not used or mapped by the Linux kernel;
only the data area is mapped.
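
Roughly, the resulting on-device layout is (simplified sketch; alignment
padding omitted):

  +----------------+----------------------------+------------------+
  | pfn info block | reserved area for Xen      | data area        |
  | (+ padding)    | (frame table + M2P table,  | (the only part   |
  |                |  not mapped by Linux)      |  mapped by Dom0) |
  +----------------+----------------------------+------------------+
  |<-------------------- data offset ---------->|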

Signed-off-by: Haozhong Zhang <***@intel.com>
---
Cc: Dan Williams <***@intel.com>
Cc: Ross Zwisler <***@linux.intel.com>
Cc: Andrew Morton <***@linux-foundation.org>
Cc: Johannes Thumshirn <***@suse.de>
Cc: linux-***@vger.kernel.org
---
drivers/nvdimm/namespace_devs.c | 2 ++
drivers/nvdimm/nd.h | 7 +++++++
drivers/nvdimm/pfn_devs.c | 37 +++++++++++++++++++++++++++++++++----
drivers/nvdimm/pmem.c | 36 +++++++++++++++++++++++++++++++++---
include/linux/pfn_t.h | 2 ++
5 files changed, 77 insertions(+), 7 deletions(-)

diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 3509cff..b1df653 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1394,6 +1394,8 @@ static ssize_t mode_show(struct device *dev,
claim = ndns->claim;
if (claim && is_nd_btt(claim))
mode = "safe";
+ else if (claim && is_nd_pfn_xen(claim))
+ mode = "xen";
else if (claim && is_nd_pfn(claim))
mode = "memory";
else if (claim && is_nd_dax(claim))
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index d3b2fca..6af3a78 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -192,6 +192,7 @@ enum nd_pfn_mode {
PFN_MODE_NONE,
PFN_MODE_RAM,
PFN_MODE_PMEM,
+ PFN_MODE_XEN,
};

struct nd_pfn {
@@ -272,6 +273,7 @@ struct nd_pfn *to_nd_pfn(struct device *dev);
#if IS_ENABLED(CONFIG_NVDIMM_PFN)
int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns);
bool is_nd_pfn(struct device *dev);
+bool is_nd_pfn_xen(struct device *dev);
struct device *nd_pfn_create(struct nd_region *nd_region);
struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn,
struct nd_namespace_common *ndns);
@@ -289,6 +291,11 @@ static inline bool is_nd_pfn(struct device *dev)
return false;
}

+static inline bool is_nd_pfn_xen(struct device *dev)
+{
+ return false;
+}
+
static inline struct device *nd_pfn_create(struct nd_region *nd_region)
{
return NULL;
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index cea8350..6624f72 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -45,6 +45,12 @@ bool is_nd_pfn(struct device *dev)
}
EXPORT_SYMBOL(is_nd_pfn);

+bool is_nd_pfn_xen(struct device *dev)
+{
+ return is_nd_pfn(dev) ? to_nd_pfn(dev)->mode == PFN_MODE_XEN : false;
+}
+EXPORT_SYMBOL(is_nd_pfn_xen);
+
struct nd_pfn *to_nd_pfn(struct device *dev)
{
struct nd_pfn *nd_pfn = container_of(dev, struct nd_pfn, dev);
@@ -64,6 +70,8 @@ static ssize_t mode_show(struct device *dev,
return sprintf(buf, "ram\n");
case PFN_MODE_PMEM:
return sprintf(buf, "pmem\n");
+ case PFN_MODE_XEN:
+ return sprintf(buf, "xen\n");
default:
return sprintf(buf, "none\n");
}
@@ -88,6 +96,9 @@ static ssize_t mode_store(struct device *dev,
} else if (strncmp(buf, "ram\n", n) == 0
|| strncmp(buf, "ram", n) == 0)
nd_pfn->mode = PFN_MODE_RAM;
+ else if (strncmp(buf, "xen\n", n) == 0
+ || strncmp(buf, "xen", n) == 0)
+ nd_pfn->mode = PFN_MODE_XEN;
else if (strncmp(buf, "none\n", n) == 0
|| strncmp(buf, "none", n) == 0)
nd_pfn->mode = PFN_MODE_NONE;
@@ -383,6 +394,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
switch (le32_to_cpu(pfn_sb->mode)) {
case PFN_MODE_RAM:
case PFN_MODE_PMEM:
+ case PFN_MODE_XEN:
break;
default:
return -ENXIO;
@@ -532,11 +544,10 @@ static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
res->start += start_pad;
res->end -= end_trunc;

- if (nd_pfn->mode == PFN_MODE_RAM) {
+ if (nd_pfn->mode == PFN_MODE_RAM || nd_pfn->mode == PFN_MODE_XEN) {
if (offset < SZ_8K)
return ERR_PTR(-EINVAL);
nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
- altmap = NULL;
} else if (nd_pfn->mode == PFN_MODE_PMEM) {
nd_pfn->npfns = (resource_size(res) - offset) / PAGE_SIZE;
if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
@@ -544,11 +555,15 @@ static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
"number of pfns truncated from %lld to %ld\n",
le64_to_cpu(nd_pfn->pfn_sb->npfns),
nd_pfn->npfns);
+ } else
+ return ERR_PTR(-ENXIO);
+
+ if (nd_pfn->mode == PFN_MODE_PMEM || nd_pfn->mode == PFN_MODE_XEN) {
memcpy(altmap, &__altmap, sizeof(*altmap));
altmap->free = PHYS_PFN(offset - SZ_8K);
altmap->alloc = 0;
} else
- return ERR_PTR(-ENXIO);
+ altmap = NULL;

return altmap;
}
@@ -639,7 +654,21 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
} else if (nd_pfn->mode == PFN_MODE_RAM)
offset = ALIGN(start + SZ_8K + dax_label_reserve,
nd_pfn->align) - start;
- else
+ else if (nd_pfn->mode == PFN_MODE_XEN) {
+ /*
+ * Reserve 64 bytes for each entry of Xen frame table
+ * and 8 bytes for each entry of Xen M2P table. The
+ * frame table and M2P table are used by Xen for its
+ * memory management.
+ */
+ unsigned long reserved_size;
+ unsigned long nr_pfns = ALIGN(size, SZ_4K) / SZ_4K;
+
+ reserved_size = ALIGN(64 * nr_pfns, HPAGE_SIZE);
+ reserved_size += ALIGN(8 * nr_pfns, HPAGE_SIZE);
+ offset = ALIGN(start + SZ_8K + reserved_size + dax_label_reserve,
+ nd_pfn->align) - start;
+ } else
return -ENXIO;

if (offset + start_pad + end_trunc >= size) {
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 42b3a82..d2c9ead 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -92,7 +92,12 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
{
int rc = 0;
bool bad_pmem = false;
- phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
+ /*
+ * Only the data area of pfn_xen is mapped, so its offset
+ * should be calculated from the beginning of the data area.
+ */
+ phys_addr_t pmem_off = sector * 512 +
+ ((pmem->pfn_flags & PFN_XEN) ? 0 : pmem->data_offset);
void *pmem_addr = pmem->virt_addr + pmem_off;

if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
@@ -194,7 +199,12 @@ __weak long pmem_direct_access(struct block_device *bdev, sector_t sector,
void **kaddr, pfn_t *pfn, long size)
{
struct pmem_device *pmem = bdev->bd_queue->queuedata;
- resource_size_t offset = sector * 512 + pmem->data_offset;
+ /*
+ * Only the data area of pfn_xen is mapped, so its offset
+ * should be calculated from the beginning of the data area.
+ */
+ resource_size_t offset = sector * 512 +
+ ((pmem->pfn_flags & PFN_XEN) ? 0 : pmem->data_offset);

if (unlikely(is_bad_pmem(&pmem->bb, sector, size)))
return -EIO;
@@ -276,7 +286,27 @@ static int pmem_attach_disk(struct device *dev,
return -ENOMEM;

pmem->pfn_flags = PFN_DEV;
- if (is_nd_pfn(dev)) {
+ if (is_nd_pfn_xen(dev)) {
+ /*
+ * The reserved area on nd_pfn_xen is used by Xen
+ * hypervisor other than Linux kernel, so it is not
+ * necessary and should not be mapped here. We only
+ * create the memory map for the data area.
+ */
+ resource_size_t dataoff;
+ size_t datasize;
+
+ pfn_sb = nd_pfn->pfn_sb;
+ dataoff = pmem->phys_addr + le32_to_cpu(pfn_sb->start_pad) +
+ le64_to_cpu(pfn_sb->dataoff);
+ datasize = resource_size(&pfn_res) - le64_to_cpu(pfn_sb->dataoff);
+ addr = devm_memremap(dev, dataoff, datasize, ARCH_MEMREMAP_PMEM);
+ pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
+ pmem->pfn_pad = resource_size(res) - resource_size(&pfn_res);
+ pmem->pfn_flags |= PFN_XEN;
+ res = &pfn_res; /* for badblocks populate */
+ res->start += pmem->data_offset;
+ } else if (is_nd_pfn(dev)) {
addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
altmap);
pfn_sb = nd_pfn->pfn_sb;
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index a3d90b9..65f90f8 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -8,12 +8,14 @@
* PFN_SG_LAST - pfn references a page and is the last scatterlist entry
* PFN_DEV - pfn is not covered by system memmap by default
* PFN_MAP - pfn has a dynamic page mapping established by a device driver
+ * PFN_XEN - pfn has an area reserved for Xen hypervisor
*/
#define PFN_FLAGS_MASK (((u64) ~PAGE_MASK) << (BITS_PER_LONG_LONG - PAGE_SHIFT))
#define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1))
#define PFN_SG_LAST (1ULL << (BITS_PER_LONG_LONG - 2))
#define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3))
#define PFN_MAP (1ULL << (BITS_PER_LONG_LONG - 4))
+#define PFN_XEN (1ULL << (BITS_PER_LONG_LONG - 5))

static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, u64 flags)
{
--
2.10.1
Haozhong Zhang
2016-10-10 00:36:00 UTC
The Xen hypervisor does not include an NVDIMM driver and relies on the
driver in Dom0 Linux to probe pfn devices in PFN_MODE_XEN. Whenever such
a pfn device is probed, Dom0 Linux reports the pages of the entire
device, its reserved area, and its data area to the Xen hypervisor.
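
As an illustrative example (all addresses invented for this sketch), a
16 GiB pmem region at SPA 0x100000000 with a 256 MiB reserved area
starting 2 MiB into the region would be reported to Xen in 4 KiB frames
roughly as:

  spfn      = 0x100000000 >> 12                      = 0x100000
  epfn      = (0x100000000 + 16 GiB) >> 12           = 0x500000
  rsv_spfn  = (0x100000000 + 2 MiB) >> 12            = 0x100200
  rsv_epfn  = (0x100000000 + 2 MiB + 256 MiB) >> 12  = 0x110200
  data_spfn = rsv_epfn (data directly follows here)  = 0x110200
  data_epfn = epfn                                   = 0x500000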

Signed-off-by: Haozhong Zhang <***@intel.com>
---
Cc: Ross Zwisler <***@linux.intel.com>
Cc: Dan Williams <***@intel.com>
Cc: Boris Ostrovsky <***@oracle.com>
Cc: David Vrabel <***@citrix.com>
Cc: Juergen Gross <***@suse.com>
Cc: Stefano Stabellini <***@aporeto.com>
Cc: Arnd Bergmann <***@arndb.de>
Cc: linux-***@vger.kernel.org
Cc: xen-***@lists.xenproject.org
---
drivers/nvdimm/pmem.c | 25 +++++++++++++++++++
drivers/xen/Makefile | 2 +-
drivers/xen/pmem.c | 53 ++++++++++++++++++++++++++++++++++++++++
include/xen/interface/platform.h | 13 ++++++++++
include/xen/pmem.h | 32 ++++++++++++++++++++++++
5 files changed, 124 insertions(+), 1 deletion(-)
create mode 100644 drivers/xen/pmem.c
create mode 100644 include/xen/pmem.h

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index d2c9ead..eab1ee4 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -33,6 +33,11 @@
#include "pfn.h"
#include "nd.h"

+#ifdef CONFIG_XEN
+#include <xen/xen.h>
+#include <xen/pmem.h>
+#endif
+
static struct device *to_dev(struct pmem_device *pmem)
{
/*
@@ -364,6 +369,26 @@ static int pmem_attach_disk(struct device *dev,

revalidate_disk(disk);

+#ifdef CONFIG_XEN
+ if (xen_initial_domain() && is_nd_pfn_xen(dev)) {
+ uint64_t rsv_off, rsv_size, data_off, data_size;
+ int err;
+
+ rsv_off = le32_to_cpu(pfn_sb->start_pad) +
+ PFN_PHYS(altmap->reserve);
+ rsv_size = PFN_PHYS(altmap->free);
+ data_off = le32_to_cpu(pfn_sb->start_pad) + pmem->data_offset;
+ data_size = pmem->size - pmem->pfn_pad - pmem->data_offset;
+
+ err = xen_pmem_add(pmem->phys_addr, pmem->size,
+ rsv_off, rsv_size, data_off, data_size);
+ if (err) {
+ dev_err(dev, "failed to register to Xen\n");
+ return err;
+ }
+ }
+#endif
+
return 0;
}

diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 8feab810..7f95156 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,6 +1,6 @@
obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
obj-$(CONFIG_X86) += fallback.o
-obj-y += grant-table.o features.o balloon.o manage.o preempt.o time.o
+obj-y += grant-table.o features.o balloon.o manage.o preempt.o time.o pmem.o
obj-y += events/
obj-y += xenbus/

diff --git a/drivers/xen/pmem.c b/drivers/xen/pmem.c
new file mode 100644
index 0000000..bb027a5
--- /dev/null
+++ b/drivers/xen/pmem.c
@@ -0,0 +1,53 @@
+/******************************************************************************
+ * pmem.c
+ * pmem file for domain 0 kernel
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Haozhong Zhang <***@intel.com>
+ */
+
+#include <linux/types.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypercall.h>
+
+int xen_pmem_add(uint64_t spa, size_t size,
+ uint64_t rsv_off, size_t rsv_size,
+ uint64_t data_off, size_t data_size)
+{
+ int rc;
+ struct xen_platform_op op;
+
+ if ((spa | size | rsv_off | rsv_size | data_off | data_size) &
+ (PAGE_SIZE - 1))
+ return -EINVAL;
+
+ op.cmd = XENPF_pmem_add;
+ op.u.pmem_add.spfn = PHYS_PFN(spa);
+ op.u.pmem_add.epfn = PHYS_PFN(spa) + PHYS_PFN(size);
+ op.u.pmem_add.rsv_spfn = PHYS_PFN(spa + rsv_off);
+ op.u.pmem_add.rsv_epfn = PHYS_PFN(spa + rsv_off + rsv_size);
+ op.u.pmem_add.data_spfn = PHYS_PFN(spa + data_off);
+ op.u.pmem_add.data_epfn = PHYS_PFN(spa + data_off + data_size);
+
+ rc = HYPERVISOR_platform_op(&op);
+ if (rc)
+ pr_err("Xen pmem add failed on 0x%llx ~ 0x%llx, error: %d\n",
+ spa, spa + size, rc);
+
+ return rc;
+}
+EXPORT_SYMBOL(xen_pmem_add);
diff --git a/include/xen/interface/platform.h b/include/xen/interface/platform.h
index 732efb0..6c51f0c 100644
--- a/include/xen/interface/platform.h
+++ b/include/xen/interface/platform.h
@@ -500,6 +500,18 @@ struct xenpf_symdata {
};
DEFINE_GUEST_HANDLE_STRUCT(xenpf_symdata);

+#define XENPF_pmem_add 64
+struct xenpf_pmem_add {
+ /* IN variables */
+ uint64_t spfn; /* start PFN of the whole pmem region */
+ uint64_t epfn; /* end PFN of the whole pmem region */
+ uint64_t rsv_spfn; /* start PFN of the reserved area */
+ uint64_t rsv_epfn; /* end PFN of the reserved area */
+ uint64_t data_spfn; /* start PFN of the data area */
+ uint64_t data_epfn; /* end PFN of the data area */
+};
+DEFINE_GUEST_HANDLE_STRUCT(xenpf_pmem_add);
+
struct xen_platform_op {
uint32_t cmd;
uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
@@ -523,6 +535,7 @@ struct xen_platform_op {
struct xenpf_mem_hotadd mem_add;
struct xenpf_core_parking core_parking;
struct xenpf_symdata symdata;
+ struct xenpf_pmem_add pmem_add;
uint8_t pad[128];
} u;
};
diff --git a/include/xen/pmem.h b/include/xen/pmem.h
new file mode 100644
index 0000000..896422a
--- /dev/null
+++ b/include/xen/pmem.h
@@ -0,0 +1,32 @@
+/******************************************************************************
+ * pmem.h
+ * pmem file for domain 0 kernel
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Haozhong Zhang <***@intel.com>
+ */
+
+#ifndef __XEN_PMEM_H__
+#define __XEN_PMEM_H__
+
+#include <linux/types.h>
+
+int xen_pmem_add(uint64_t spa, size_t size,
+ uint64_t rsv_off, size_t rsv_size,
+ uint64_t data_off, size_t data_size);
+
+#endif /* __XEN_PMEM_H__ */
--
2.10.1
Dan Williams
2016-10-10 03:45:46 UTC
Post by Haozhong Zhang
Overview
========
This RFC kernel patch series along with corresponding patch series of
Xen, QEMU and ndctl implements Xen vNVDIMM, which can map the host
NVDIMM devices to Xen HVM domU as vNVDIMM devices.
Xen hypervisor does not include an NVDIMM driver, so it needs the
assistance from the driver in Dom0 Linux kernel to manage NVDIMM
devices. We currently only supports NVDIMM devices in pmem mode.
Design and Implementation
=========================
The complete design can be found at
https://lists.xenproject.org/archives/html/xen-devel/2016-07/msg01921.html.
The KVM enabling for persistent memory does not need this support from
the kernel, and as far as I can see neither does Xen. If the
hypervisor needs to reserve some space it can simply trim the amount
that it hands to the guest.

The usage of fiemap and the sysfs resource for the pmem device, as
mentioned in the design document, does not seem to comprehend that
file block allocations may be discontiguous and may change over time
depending on the file.
Haozhong Zhang
2016-10-10 06:32:26 UTC
Post by Dan Williams
Post by Haozhong Zhang
Overview
========
This RFC kernel patch series along with corresponding patch series of
Xen, QEMU and ndctl implements Xen vNVDIMM, which can map the host
NVDIMM devices to Xen HVM domU as vNVDIMM devices.
Xen hypervisor does not include an NVDIMM driver, so it needs the
assistance from the driver in Dom0 Linux kernel to manage NVDIMM
devices. We currently only supports NVDIMM devices in pmem mode.
Design and Implementation
=========================
The complete design can be found at
https://lists.xenproject.org/archives/html/xen-devel/2016-07/msg01921.html.
The KVM enabling for persistent memory does not need this support from
the kernel, and as far as I can see neither does Xen. If the
hypervisor needs to reserve some space it can simply trim the amount
that it hands to the guest.
Xen does not have the NVDIMM driver, so it cannot operate on NVDIMM
devices by itself. Instead it relies on the driver in Dom0 Linux to
probe NVDIMM and make the reservation.
Post by Dan Williams
The usage of fiemap and the sysfs resource for the pmem device, as
mentioned in the design document, does not seem to comprehend that
file block allocations may be discontiguous and may change over time
depending on the file.
True. I may need to find a way to notify Xen of the underlying
changes, so that Xen can then adjust the address mapping.

Thanks,
Haozhong
Dan Williams
2016-10-10 16:24:34 UTC
On Sun, Oct 9, 2016 at 11:32 PM, Haozhong Zhang
Post by Haozhong Zhang
Post by Dan Williams
Post by Haozhong Zhang
Overview
========
This RFC kernel patch series along with corresponding patch series of
Xen, QEMU and ndctl implements Xen vNVDIMM, which can map the host
NVDIMM devices to Xen HVM domU as vNVDIMM devices.
Xen hypervisor does not include an NVDIMM driver, so it needs the
assistance from the driver in Dom0 Linux kernel to manage NVDIMM
devices. We currently only supports NVDIMM devices in pmem mode.
Design and Implementation
=========================
The complete design can be found at
https://lists.xenproject.org/archives/html/xen-devel/2016-07/msg01921.html.
The KVM enabling for persistent memory does not need this support from
the kernel, and as far as I can see neither does Xen. If the
hypervisor needs to reserve some space it can simply trim the amount
that it hands to the guest.
Xen does not have the NVDIMM driver, so it cannot operate on NVDIMM
devices by itself. Instead it relies on the driver in Dom0 Linux to
probe NVDIMM and make the reservation.
I'm missing something because the design document talks about mmap'ing
files on a DAX filesystem. So, I'm assuming it is similar to the KVM
NVDIMM virtualization case where an mmap range in dom0 is translated
into a guest physical range. The suggestion is to reserve some memory
out of that mapping rather than introduce a new info block /
reservation type to the sub-system.
Haozhong Zhang
2016-10-11 07:12:03 UTC
Post by Dan Williams
On Sun, Oct 9, 2016 at 11:32 PM, Haozhong Zhang
Post by Haozhong Zhang
Post by Dan Williams
Post by Haozhong Zhang
Overview
========
This RFC kernel patch series along with corresponding patch series of
Xen, QEMU and ndctl implements Xen vNVDIMM, which can map the host
NVDIMM devices to Xen HVM domU as vNVDIMM devices.
Xen hypervisor does not include an NVDIMM driver, so it needs the
assistance from the driver in Dom0 Linux kernel to manage NVDIMM
devices. We currently only supports NVDIMM devices in pmem mode.
Design and Implementation
=========================
The complete design can be found at
https://lists.xenproject.org/archives/html/xen-devel/2016-07/msg01921.html.
The KVM enabling for persistent memory does not need this support from
the kernel, and as far as I can see neither does Xen. If the
hypervisor needs to reserve some space it can simply trim the amount
that it hands to the guest.
Xen does not have the NVDIMM driver, so it cannot operate on NVDIMM
devices by itself. Instead it relies on the driver in Dom0 Linux to
probe NVDIMM and make the reservation.
I'm missing something because the design document talks about mmap'ing
files on a DAX filesystem. So, I'm assuming it is similar to the KVM
NVDIMM virtualization case where an mmap range in dom0 is translated
into a guest physical range. The suggestion is to reserve some memory
out of that mapping rather than introduce a new info block /
reservation type to the sub-system.
Just as Linux uses struct page, the Xen hypervisor uses struct page_info
for its memory management. We face the same problem as the Linux kernel:
where to store those structs for pmem. We decided to put them in a
reserved area on pmem, similar to what the pfn device in the kernel does.

Reserving at the moment of mmap and out of what is mapped does not
work. It's a bootstrap problem: Xen needs the information of those
pages, which are stored in struct page_info, at the moment of
mapping. That is, page_info structs for pmem pages should be prepared
before those pages are actually used.

However, per the ongoing discussion in another thread with Andrew
Cooper, if the Xen hypervisor ends up treating pmem pages as MMIO, then
the reservation may not be needed. Let's see what conclusion is reached
there.

Thanks,
Haozhong
Andrew Cooper
2016-10-10 16:47:34 UTC
Post by Haozhong Zhang
Overview
========
This RFC kernel patch series along with corresponding patch series of
Xen, QEMU and ndctl implements Xen vNVDIMM, which can map the host
NVDIMM devices to Xen HVM domU as vNVDIMM devices.
Xen hypervisor does not include an NVDIMM driver, so it needs the
assistance from the driver in Dom0 Linux kernel to manage NVDIMM
devices. We currently only supports NVDIMM devices in pmem mode.
Design and Implementation
=========================
The complete design can be found at
https://lists.xenproject.org/archives/html/xen-devel/2016-07/msg01921.html.
All patch series can be found at
Xen: https://github.com/hzzhan9/xen.git nvdimm-rfc-v1
QEMU: https://github.com/hzzhan9/qemu.git xen-nvdimm-rfc-v1
Linux kernel: https://github.com/hzzhan9/nvdimm.git xen-nvdimm-rfc-v1
ndctl: https://github.com/hzzhan9/ndctl.git pfn-xen-rfc-v1
1) Reserve an area on NVDIMM devices for Xen hypervisor to place
memory management data structures, i.e. frame table and M2P table.
2) Report SPA ranges of NVDIMM devices and the reserved area to Xen
hypervisor.
Please can we take a step back here before diving down a rabbit hole.


How do pblk/pmem regions appear in the E820 map at boot? At the very
least, I would expect a large reserved region.

Is the MFN information (SPA in your terminology, so far as I can tell)
available in any static ACPI tables, or are they only available as a
result of executing AML methods?


If the MFN information is only available via AML, then point 2) is
needed, although the reporting back to Xen should be restricted to a xen
component, rather than polluting the main device driver.

However, I can't see any justification for 1). Dom0 should not be
involved in Xen's management of its own frame table and m2p. The mfns
making up the pmem/pblk regions should be treated just like any other
MMIO regions, and be handed wholesale to dom0 by default.

~Andrew
Haozhong Zhang
2016-10-11 05:53:08 UTC
Post by Andrew Cooper
Post by Haozhong Zhang
Overview
========
This RFC kernel patch series along with corresponding patch series of
Xen, QEMU and ndctl implements Xen vNVDIMM, which can map the host
NVDIMM devices to Xen HVM domU as vNVDIMM devices.
Xen hypervisor does not include an NVDIMM driver, so it needs the
assistance from the driver in Dom0 Linux kernel to manage NVDIMM
devices. We currently only supports NVDIMM devices in pmem mode.
Design and Implementation
=========================
The complete design can be found at
https://lists.xenproject.org/archives/html/xen-devel/2016-07/msg01921.html.
All patch series can be found at
Xen: https://github.com/hzzhan9/xen.git nvdimm-rfc-v1
QEMU: https://github.com/hzzhan9/qemu.git xen-nvdimm-rfc-v1
Linux kernel: https://github.com/hzzhan9/nvdimm.git xen-nvdimm-rfc-v1
ndctl: https://github.com/hzzhan9/ndctl.git pfn-xen-rfc-v1
1) Reserve an area on NVDIMM devices for Xen hypervisor to place
memory management data structures, i.e. frame table and M2P table.
2) Report SPA ranges of NVDIMM devices and the reserved area to Xen
hypervisor.
Please can we take a step back here before diving down a rabbit hole.
How do pblk/pmem regions appear in the E820 map at boot? At the very
least, I would expect at least a large reserved region.
ACPI specification does not require them to appear in E820, though
it defines E820 type-7 for persistent memory.
Post by Andrew Cooper
Is the MFN information (SPA in your terminology, so far as I can tell)
available in any static ACPI tables, or are they only available as a
result of executing AML methods?
For NVDIMM devices already plugged in at power-on, their MFN information
can be obtained from the NFIT table. However, MFN information for
hotplugged NVDIMM devices must be obtained via the AML _FIT method, so
point 2) is needed.
Post by Andrew Cooper
If the MFN information is only available via AML, then point 2) is
needed, although the reporting back to Xen should be restricted to a xen
component, rather than polluting the main device driver.
However, I can't see any justification for 1). Dom0 should not be
involved in Xen's management of its own frame table and m2p. The mfns
making up the pmem/pblk regions should be treated just like any other
MMIO regions, and be handed wholesale to dom0 by default.
Do you mean to treat them as mmio pages of type p2m_mmio_direct and
map them to guest by map_mmio_regions()?

Thanks,
Haozhong
Konrad Rzeszutek Wilk
2016-10-11 18:46:14 UTC
Post by Haozhong Zhang
Post by Andrew Cooper
Post by Haozhong Zhang
Overview
========
This RFC kernel patch series along with corresponding patch series of
Xen, QEMU and ndctl implements Xen vNVDIMM, which can map the host
NVDIMM devices to Xen HVM domU as vNVDIMM devices.
Xen hypervisor does not include an NVDIMM driver, so it needs the
assistance from the driver in Dom0 Linux kernel to manage NVDIMM
devices. We currently only supports NVDIMM devices in pmem mode.
Design and Implementation
=========================
The complete design can be found at
https://lists.xenproject.org/archives/html/xen-devel/2016-07/msg01921.html.
All patch series can be found at
Xen: https://github.com/hzzhan9/xen.git nvdimm-rfc-v1
QEMU: https://github.com/hzzhan9/qemu.git xen-nvdimm-rfc-v1
Linux kernel: https://github.com/hzzhan9/nvdimm.git xen-nvdimm-rfc-v1
ndctl: https://github.com/hzzhan9/ndctl.git pfn-xen-rfc-v1
1) Reserve an area on NVDIMM devices for Xen hypervisor to place
memory management data structures, i.e. frame table and M2P table.
2) Report SPA ranges of NVDIMM devices and the reserved area to Xen
hypervisor.
Please can we take a step back here before diving down a rabbit hole.
How do pblk/pmem regions appear in the E820 map at boot? At the very
least, I would expect at least a large reserved region.
ACPI specification does not require them to appear in E820, though
it defines E820 type-7 for persistent memory.
Ok, so we might get some E820 type-7 ranges, or some holes.
Post by Haozhong Zhang
Post by Andrew Cooper
Is the MFN information (SPA in your terminology, so far as I can tell)
available in any static ACPI tables, or are they only available as a
result of executing AML methods?
For NVDIMM devices already plugged at power on, their MFN information
can be got from NFIT table. However, MFN information for hotplugged
NVDIMM devices should be got via AML _FIT method, so point 2) is needed.
How does NVDIMM hotplug compare to RAM hotplug? Are the hotplug regions
described at boot and marked as initially not present, or do you only
know the hotplugged SPA at the point that it is hotplugged?
I certainly agree that there needs to be a propagation of the hotplug
notification from OSPM to Xen, which will involve some glue in the Xen
subsystem in Linux, but I would expect that this would be similar to the
existing plain RAM hotplug mechanism.
Post by Haozhong Zhang
Post by Andrew Cooper
If the MFN information is only available via AML, then point 2) is
needed, although the reporting back to Xen should be restricted to a xen
component, rather than polluting the main device driver.
However, I can't see any justification for 1). Dom0 should not be
involved in Xen's management of its own frame table and m2p. The mfns
making up the pmem/pblk regions should be treated just like any other
MMIO regions, and be handed wholesale to dom0 by default.
Do you mean to treat them as mmio pages of type p2m_mmio_direct and
map them to guest by map_mmio_regions()?
I don't see any reason why it shouldn't be treated like this. Xen
shouldn't be treating it as anything other than an opaque block of MFNs.
The concept of trying to map a DAX file into the guest physical address
space of a VM is indeed new and doesn't fit into Xen's current model,
but all that fixing this requires is a new privileged mapping hypercall
which takes a source domid and gfn scatter list, and a destination domid
and scatter list. (I see from a quick look at your Xen series that your
XENMEM_populate_pmemmap looks roughly like this)
That can be quite big. Say you want to map a DAX file that has a size
of 1TB and this GFN scatter list has 1073741824 entries?

How do you envision handling this in Xen and populating the P2M entries
with this information?
~Andrew
Konrad Rzeszutek Wilk
2016-10-11 18:50:06 UTC
Post by Haozhong Zhang
Post by Andrew Cooper
Post by Haozhong Zhang
Overview
========
This RFC kernel patch series along with corresponding patch series of
Xen, QEMU and ndctl implements Xen vNVDIMM, which can map the host
NVDIMM devices to Xen HVM domU as vNVDIMM devices.
Xen hypervisor does not include an NVDIMM driver, so it needs the
assistance from the driver in Dom0 Linux kernel to manage NVDIMM
devices. We currently only supports NVDIMM devices in pmem mode.
Design and Implementation
=========================
The complete design can be found at
https://lists.xenproject.org/archives/html/xen-devel/2016-07/msg01921.html.
All patch series can be found at
Xen: https://github.com/hzzhan9/xen.git nvdimm-rfc-v1
QEMU: https://github.com/hzzhan9/qemu.git xen-nvdimm-rfc-v1
Linux kernel: https://github.com/hzzhan9/nvdimm.git xen-nvdimm-rfc-v1
ndctl: https://github.com/hzzhan9/ndctl.git pfn-xen-rfc-v1
1) Reserve an area on NVDIMM devices for Xen hypervisor to place
memory management data structures, i.e. frame table and M2P table.
2) Report SPA ranges of NVDIMM devices and the reserved area to Xen
hypervisor.
Please can we take a step back here before diving down a rabbit hole.
How do pblk/pmem regions appear in the E820 map at boot? At the very
least, I would expect at least a large reserved region.
ACPI specification does not require them to appear in E820, though
it defines E820 type-7 for persistent memory.
Ok, so we might get some E820 type-7 ranges, or some holes.
Post by Haozhong Zhang
Post by Andrew Cooper
Is the MFN information (SPA in your terminology, so far as I can tell)
available in any static ACPI tables, or are they only available as a
result of executing AML methods?
For NVDIMM devices already plugged at power on, their MFN information
can be got from NFIT table. However, MFN information for hotplugged
NVDIMM devices should be got via AML _FIT method, so point 2) is needed.
How does NVDIMM hotplug compare to RAM hotplug? Are the hotplug regions
described at boot and marked as initially not present, or do you only
know the hotplugged SPA at the point that it is hotplugged?
The latter. You have no idea of the size until you get an ACPI hotplug.
The ACPI hotplug contains the updated NFIT table so based on that you
can populate the machine.
I certainly agree that there needs to be a propagation of the hotplug
notification from OSPM to Xen, which will involve some glue in the Xen
subsystem in Linux, but I would expect that this would be similar to the
existing plain RAM hotplug mechanism.
I am actually not sure how the ACPI RAM hotplug mechanism is supposed to
work in practice. I thought that the regions (E820) are marked as
reserved and the 'RAM' slots nicely in there.
Andrew Cooper
2016-10-11 18:50:34 UTC
Post by Haozhong Zhang
Post by Andrew Cooper
Post by Haozhong Zhang
Overview
========
This RFC kernel patch series along with corresponding patch series of
Xen, QEMU and ndctl implements Xen vNVDIMM, which can map the host
NVDIMM devices to Xen HVM domU as vNVDIMM devices.
Xen hypervisor does not include an NVDIMM driver, so it needs the
assistance from the driver in Dom0 Linux kernel to manage NVDIMM
devices. We currently only supports NVDIMM devices in pmem mode.
Design and Implementation
=========================
The complete design can be found at
https://lists.xenproject.org/archives/html/xen-devel/2016-07/msg01921.html.
All patch series can be found at
Xen: https://github.com/hzzhan9/xen.git nvdimm-rfc-v1
QEMU: https://github.com/hzzhan9/qemu.git xen-nvdimm-rfc-v1
Linux kernel: https://github.com/hzzhan9/nvdimm.git xen-nvdimm-rfc-v1
ndctl: https://github.com/hzzhan9/ndctl.git pfn-xen-rfc-v1
1) Reserve an area on NVDIMM devices for Xen hypervisor to place
memory management data structures, i.e. frame table and M2P table.
2) Report SPA ranges of NVDIMM devices and the reserved area to Xen
hypervisor.
Please can we take a step back here before diving down a rabbit hole.
How do pblk/pmem regions appear in the E820 map at boot? At the very
least, I would expect at least a large reserved region.
ACPI specification does not require them to appear in E820, though
it defines E820 type-7 for persistent memory.
Ok, so we might get some E820 type-7 ranges, or some holes.
Post by Haozhong Zhang
Post by Andrew Cooper
Is the MFN information (SPA in your terminology, so far as I can tell)
available in any static ACPI tables, or are they only available as a
result of executing AML methods?
For NVDIMM devices already plugged at power on, their MFN information
can be got from NFIT table. However, MFN information for hotplugged
NVDIMM devices should be got via AML _FIT method, so point 2) is needed.
How does NVDIMM hotplug compare to RAM hotplug? Are the hotplug regions
described at boot and marked as initially not present, or do you only
know the hotplugged SPA at the point that it is hotplugged?

I certainly agree that there needs to be a propagation of the hotplug
notification from OSPM to Xen, which will involve some glue in the Xen
subsystem in Linux, but I would expect that this would be similar to the
existing plain RAM hotplug mechanism.
Post by Haozhong Zhang
Post by Andrew Cooper
If the MFN information is only available via AML, then point 2) is
needed, although the reporting back to Xen should be restricted to a xen
component, rather than polluting the main device driver.
However, I can't see any justification for 1). Dom0 should not be
involved in Xen's management of its own frame table and m2p. The mfns
making up the pmem/pblk regions should be treated just like any other
MMIO regions, and be handed wholesale to dom0 by default.
Do you mean to treat them as mmio pages of type p2m_mmio_direct and
map them to guest by map_mmio_regions()?
I don't see any reason why it shouldn't be treated like this. Xen
shouldn't be treating it as anything other than an opaque block of MFNs.

The concept of trying to map a DAX file into the guest physical address
space of a VM is indeed new and doesn't fit into Xen's current model,
but all that fixing this requires is a new privileged mapping hypercall
which takes a source domid and gfn scatter list, and a destination domid
and scatter list. (I see from a quick look at your Xen series that your
XENMEM_populate_pmemmap looks roughly like this)

~Andrew
Jan Beulich
2016-10-11 13:08:50 UTC
Post by Andrew Cooper
Post by Haozhong Zhang
1) Reserve an area on NVDIMM devices for Xen hypervisor to place
memory management data structures, i.e. frame table and M2P table.
2) Report SPA ranges of NVDIMM devices and the reserved area to Xen
hypervisor.
However, I can't see any justification for 1). Dom0 should not be
involved in Xen's management of its own frame table and m2p. The mfns
making up the pmem/pblk regions should be treated just like any other
MMIO regions, and be handed wholesale to dom0 by default.
That precludes the use as RAM extension, and I thought earlier rounds of
discussion had got everyone in agreement that at least for the pmem case
we will need some control data in Xen.

Jan
Dan Williams
2016-10-11 15:59:14 UTC
Post by Jan Beulich
Post by Andrew Cooper
Post by Haozhong Zhang
1) Reserve an area on NVDIMM devices for Xen hypervisor to place
memory management data structures, i.e. frame table and M2P table.
2) Report SPA ranges of NVDIMM devices and the reserved area to Xen
hypervisor.
However, I can't see any justification for 1). Dom0 should not be
involved in Xen's management of its own frame table and m2p. The mfns
making up the pmem/pblk regions should be treated just like any other
MMIO regions, and be handed wholesale to dom0 by default.
That precludes the use as RAM extension, and I thought earlier rounds of
discussion had got everyone in agreement that at least for the pmem case
we will need some control data in Xen.
The missing piece for me is why this reservation for control data
needs to be done in the libnvdimm core? I would expect that any dax
capable file could be mapped and made available to a guest. This
includes /dev/ramX devices that are dax capable, but are external to
the libnvdimm sub-system.
Konrad Rzeszutek Wilk
2016-10-11 16:59:46 UTC
Post by Dan Williams
Post by Jan Beulich
Post by Andrew Cooper
Post by Haozhong Zhang
1) Reserve an area on NVDIMM devices for Xen hypervisor to place
memory management data structures, i.e. frame table and M2P table.
2) Report SPA ranges of NVDIMM devices and the reserved area to Xen
hypervisor.
However, I can't see any justification for 1). Dom0 should not be
involved in Xen's management of its own frame table and m2p. The mfns
making up the pmem/pblk regions should be treated just like any other
MMIO regions, and be handed wholesale to dom0 by default.
That precludes the use as RAM extension, and I thought earlier rounds of
discussion had got everyone in agreement that at least for the pmem case
we will need some control data in Xen.
The missing piece for me is why this reservation for control data
needs to be done in the libnvdimm core? I would expect that any dax
Isn't it done this way with Linux? That is say if the machine has
4GB of RAM and the NVDIMM is in TB range. You want to put the 'struct page'
for the NVDIMM ranges somewhere. That place can be in regions on the
NVDIMM that ndctl can reserve.
Post by Dan Williams
capable file could be mapped and made available to a guest. This
includes /dev/ramX devices that are dax capable, but are external to
the libnvdimm sub-system.
This is more about keeping track of the ranges if, say, the DAX file is
extremely fragmented and requires a lot of 'struct pages' to keep track
of when stitching up the VMA.
Dan Williams
2016-10-11 17:57:36 UTC
On Tue, Oct 11, 2016 at 9:58 AM, Konrad Rzeszutek Wilk
Post by Konrad Rzeszutek Wilk
Post by Dan Williams
Post by Jan Beulich
Post by Andrew Cooper
Post by Haozhong Zhang
1) Reserve an area on NVDIMM devices for Xen hypervisor to place
memory management data structures, i.e. frame table and M2P table.
2) Report SPA ranges of NVDIMM devices and the reserved area to Xen
hypervisor.
However, I can't see any justification for 1). Dom0 should not be
involved in Xen's management of its own frame table and m2p. The mfns
making up the pmem/pblk regions should be treated just like any other
MMIO regions, and be handed wholesale to dom0 by default.
That precludes the use as RAM extension, and I thought earlier rounds of
discussion had got everyone in agreement that at least for the pmem case
we will need some control data in Xen.
The missing piece for me is why this reservation for control data
needs to be done in the libnvdimm core? I would expect that any dax
Isn't it done this way with Linux? That is say if the machine has
4GB of RAM and the NVDIMM is in TB range. You want to put the 'struct page'
for the NVDIMM ranges somewhere. That place can be in regions on the
NVDIMM that ndctl can reserve.
Yes.
Post by Konrad Rzeszutek Wilk
Post by Dan Williams
capable file could be mapped and made available to a guest. This
includes /dev/ramX devices that are dax capable, but are external to
the libnvdimm sub-system.
This is more of just keeping track of the ranges if say the DAX file is
extremely fragmented and requires a lot of 'struct pages' to keep track of
when stiching up the VMA.
Right, but why does the libnvdimm core need to know about this
specific Xen reservation? For example, if Xen wants some in-kernel
driver to own a pmem region and place its own metadata on the device I
would recommend something like:

bdev = blkdev_get_by_path("/dev/pmemX", FMODE_EXCL...);
bdev_direct_access(bdev, ...);

..in other words, I don't think we want libnvdimm to grow new device
types for every possible in-kernel user, Xen, MD, DM, etc. Instead,
just claim the resulting device.
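
A minimal sketch of what that claim-then-access flow could look like
with the block/DAX API of this era (blkdev_get_by_path() plus
bdev_direct_access() on a struct blk_dax_ctl); the helper name
xen_claim_pmem() and the details are illustrative assumptions, not code
from any posted patch:

#include <linux/blkdev.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/pfn_t.h>

static int xen_claim_pmem(const char *path, void *holder)
{
	const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
	struct block_device *bdev;
	struct blk_dax_ctl dax = { .sector = 0, .size = PAGE_SIZE };
	long avail;

	/* Exclusively claim /dev/pmemX so nothing else binds to it. */
	bdev = blkdev_get_by_path(path, mode, holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	/* Translate a device offset into a kernel address and pfn. */
	avail = bdev_direct_access(bdev, &dax);
	if (avail < 0) {
		blkdev_put(bdev, mode);
		return avail;
	}

	/*
	 * dax.addr / dax.pfn now describe the start of the claimed
	 * device; the claiming driver can lay out its own metadata
	 * there instead of teaching libnvdimm a new device type.
	 */
	return 0;
}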
Andrew Cooper
2016-10-11 18:27:18 UTC
Post by Dan Williams
On Tue, Oct 11, 2016 at 9:58 AM, Konrad Rzeszutek Wilk
Post by Konrad Rzeszutek Wilk
Post by Dan Williams
Post by Jan Beulich
Post by Andrew Cooper
Post by Haozhong Zhang
1) Reserve an area on NVDIMM devices for Xen hypervisor to place
memory management data structures, i.e. frame table and M2P table.
2) Report SPA ranges of NVDIMM devices and the reserved area to Xen
hypervisor.
However, I can't see any justification for 1). Dom0 should not be
involved in Xen's management of its own frame table and m2p. The mfns
making up the pmem/pblk regions should be treated just like any other
MMIO regions, and be handed wholesale to dom0 by default.
That precludes the use as RAM extension, and I thought earlier rounds of
discussion had got everyone in agreement that at least for the pmem case
we will need some control data in Xen.
The missing piece for me is why this reservation for control data
needs to be done in the libnvdimm core? I would expect that any dax
Isn't it done this way with Linux? That is say if the machine has
4GB of RAM and the NVDIMM is in TB range. You want to put the 'struct page'
for the NVDIMM ranges somewhere. That place can be in regions on the
NVDIMM that ndctl can reserve.
Yes.
I do not see any sensible usecase for Xen to use NVDIMMs as plain RAM;
NVDIMMs are far more valuable for higher level management in dom0.

I certainly think that such a usecase should be out-of-scope for initial
Xen/NVDIMM support, even if only to reduce the complexity to start with.

A repeated complaint I have about large feature submissions like this is
that, by trying to solve all potential usecases at once, they end up
being overly complicated to develop, understand and review.
Post by Dan Williams
Post by Konrad Rzeszutek Wilk
Post by Dan Williams
capable file could be mapped and made available to a guest. This
includes /dev/ramX devices that are dax capable, but are external to
the libnvdimm sub-system.
This is more of just keeping track of the ranges if say the DAX file is
extremely fragmented and requires a lot of 'struct pages' to keep track of
when stiching up the VMA.
Right, but why does the libnvdimm core need to know about this
specific Xen reservation? For example, if Xen wants some in-kernel
driver to own a pmem region and place its own metadata on the device I
bdev = blkdev_get_by_path("/dev/pmemX", FMODE_EXCL...);
bdev_direct_access(bdev, ...);
...in other words, I don't think we want libnvdimm to grow new device
types for every possible in-kernel user, Xen, MD, DM, etc. Instead,
just claim the resulting device.
I completely agree.

Whatever ends up happening between Xen and dom0, there should be no
modifications like this to the nvdimm driver. I will go so far as to
say that there shouldn't be any modifications to the nvdimm driver
(other than perhaps new query hooks so the Xen subsystem in Linux can
query information to then pass up to Xen, if the existing queryability
is insufficient).

~Andrew
Konrad Rzeszutek Wilk
2016-10-11 18:43:46 UTC
Post by Andrew Cooper
Post by Dan Williams
On Tue, Oct 11, 2016 at 9:58 AM, Konrad Rzeszutek Wilk
Post by Konrad Rzeszutek Wilk
Post by Dan Williams
Post by Jan Beulich
Post by Andrew Cooper
Post by Haozhong Zhang
1) Reserve an area on NVDIMM devices for Xen hypervisor to place
memory management data structures, i.e. frame table and M2P table.
2) Report SPA ranges of NVDIMM devices and the reserved area to Xen
hypervisor.
However, I can't see any justification for 1). Dom0 should not be
involved in Xen's management of its own frame table and m2p. The mfns
making up the pmem/pblk regions should be treated just like any other
MMIO regions, and be handed wholesale to dom0 by default.
That precludes the use as RAM extension, and I thought earlier rounds of
discussion had got everyone in agreement that at least for the pmem case
we will need some control data in Xen.
The missing piece for me is why this reservation for control data
needs to be done in the libnvdimm core? I would expect that any dax
Isn't it done this way with Linux? That is say if the machine has
4GB of RAM and the NVDIMM is in TB range. You want to put the 'struct page'
for the NVDIMM ranges somewhere. That place can be in regions on the
NVDIMM that ndctl can reserve.
Yes.
I do not see any sensible usecase for Xen to use NVDIMMs as plain RAM;
I just gave you one. This is the 'usecase' that Linux has to deal with
now that the core kernel folks have pointed out that they don't want
'struct page' for the MMIO regions. This mechanism came about from
having to find a place _somewhere_ to put the 'struct page' for the SPA
ranges of the NVDIMM.
Post by Andrew Cooper
NVDIMMs are far more valuable for higher level management in dom0.
Andrew, why are you providing input to this so late?

Haozhong provided a nice design document outlining the problem and
the solution he suggested.
Post by Andrew Cooper
I certainly think that such a usecase should be out-of-scope for initial
Xen/NVDIMM support, even if only to reduce the complexity to start with.
A repeated complain I have of large feature submissions like this is
that, by trying to solve all potential usecases at one, end up being
overly complicated to develop, understand and review.
On the other hand - if you don't take these complicated issues into
account from the start, then you may have to redesign and redevelop this
after the first version has been set in stone and committed.
Post by Andrew Cooper
Post by Dan Williams
Post by Konrad Rzeszutek Wilk
Post by Dan Williams
capable file could be mapped and made available to a guest. This
includes /dev/ramX devices that are dax capable, but are external to
the libnvdimm sub-system.
This is more of just keeping track of the ranges if say the DAX file is
extremely fragmented and requires a lot of 'struct pages' to keep track of
when stiching up the VMA.
Right, but why does the libnvdimm core need to know about this
specific Xen reservation? For example, if Xen wants some in-kernel
driver to own a pmem region and place its own metadata on the device I
bdev = blkdev_get_by_path("/dev/pmemX", FMODE_EXCL...);
bdev_direct_access(bdev, ...);
...in other words, I don't think we want libnvdimm to grow new device
types for every possible in-kernel user, Xen, MD, DM, etc. Instead,
just claim the resulting device.
I completely agree.
Whatever ends up happening between Xen and dom0, there should be no
modifications like this to the nvdimm driver. I will go so far as to
say that there shouldn't be any modifications to the nvdimm driver
(other than perhaps new query hooks so the Xen subsystem in Linux can
query information to then pass up to Xen, if the existing queryability
is insufficient).
Haozhong and Jan had been chatting about this in terms of how to keep
track of non-contiguous SPAs of NVDIMM stitched into a guest.

The initial idea was to treat it as MMIO, but of course if you have
1-page ranges over, say, 1TB you end up consuming tons of memory to keep
track of this (the same way Linux would if you wanted to mmap a file
from a DAX fs).

Other solutions were a bitmap, but that can also be cumbersome to deal
with. In the end the suggestion that was proposed was the one that Linux
chose - stash the 'struct page' in the NVDIMM.
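
To put a rough, illustrative number on that tracking cost (assuming
64 bytes per 'struct page' / frame-table entry and 4 KiB pages):

  1 TiB / 4 KiB       = 268,435,456 pages
  268,435,456 * 64 B  = 16 GiB of per-page metadata

which is why both Linux and the proposed Xen scheme stash that metadata
on the NVDIMM itself rather than holding it all in RAM.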
Konrad Rzeszutek Wilk
2016-10-11 19:47:26 UTC
Post by Konrad Rzeszutek Wilk
Andrew, why are you providing input to this so late?
First of all, sorry for this outburst. It was quite uncalled for and
quite unprofessional. You of all people have so much on your plate that
I am astonished that you are able to operate with so many pokers in the
fire.

Again, my sincere apology.
Konrad Rzeszutek Wilk
2016-10-11 18:33:37 UTC
On Tue, Oct 11, 2016 at 9:58 AM, Konrad Rzeszutek Wilk
Post by Konrad Rzeszutek Wilk
Post by Dan Williams
Post by Jan Beulich
Post by Andrew Cooper
Post by Haozhong Zhang
1) Reserve an area on NVDIMM devices for Xen hypervisor to place
memory management data structures, i.e. frame table and M2P table.
2) Report SPA ranges of NVDIMM devices and the reserved area to Xen
hypervisor.
However, I can't see any justification for 1). Dom0 should not be
involved in Xen's management of its own frame table and m2p. The mfns
making up the pmem/pblk regions should be treated just like any other
MMIO regions, and be handed wholesale to dom0 by default.
That precludes the use as RAM extension, and I thought earlier rounds of
discussion had got everyone in agreement that at least for the pmem case
we will need some control data in Xen.
The missing piece for me is why this reservation for control data
needs to be done in the libnvdimm core? I would expect that any dax
Isn't it done this way with Linux? That is say if the machine has
4GB of RAM and the NVDIMM is in TB range. You want to put the 'struct page'
for the NVDIMM ranges somewhere. That place can be in regions on the
NVDIMM that ndctl can reserve.
Yes.
Post by Konrad Rzeszutek Wilk
Post by Dan Williams
capable file could be mapped and made available to a guest. This
includes /dev/ramX devices that are dax capable, but are external to
the libnvdimm sub-system.
This is more of just keeping track of the ranges if say the DAX file is
extremely fragmented and requires a lot of 'struct pages' to keep track of
when stiching up the VMA.
Right, but why does the libnvdimm core need to know about this
specific Xen reservation? For example, if Xen wants some in-kernel
Let me turn this around - why does the libnvdimm core need to know about
Linux specific parts? Shouldn't this be OS agnostic, so that FreeBSD
for example can also poke a hole in this and fill it with its
OS-management meta-data?
driver to own a pmem region and place its own metadata on the device I
bdev = blkdev_get_by_path("/dev/pmemX", FMODE_EXCL...);
bdev_direct_access(bdev, ...);
...in other words, I don't think we want libnvdimm to grow new device
types for every possible in-kernel user, Xen, MD, DM, etc. Instead,
just claim the resulting device.
Dan Williams
2016-10-11 19:29:15 UTC
On Tue, Oct 11, 2016 at 11:33 AM, Konrad Rzeszutek Wilk
[..]
Post by Konrad Rzeszutek Wilk
Post by Dan Williams
Right, but why does the libnvdimm core need to know about this
specific Xen reservation? For example, if Xen wants some in-kernel
Let me turn this around - why does the libnvdimm core need to know about
Linux specific parts? Shouldn't this be OS agnostic, so that FreeBSD
for example can also poke a hole in this and fill it with its
OS-management meta-data?
Specifically the core needs to know so that it can answer the Linux
specific question of whether the pfn returned by ->direct_access() has
a corresponding struct page or not. It's tied to the lifetime of the
device and the usage of the reservation needs to be coordinated
against the references of those pages. If FreeBSD decides it needs to
reserve "struct page" capacity at the start of the device, I would
hope that it reuses the same on-device info block that Linux is using
and not create a new "FreeBSD-mode" device type.
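As a concrete illustration of that Linux-specific question (a sketch only,
not code from the series; the helper is hypothetical), callers of
->direct_access() get a pfn_t and can ask whether a struct page backs it:

#include <linux/pfn_t.h>

/* Illustrative only: a PFN_MODE_PMEM namespace's on-device reservation is
 * what allows this check to return a struct page for device pfns. */
static struct page *pfn_to_backing_page(pfn_t pfn)
{
        if (!pfn_t_has_page(pfn))
                return NULL;    /* raw device pfn, no struct page */
        return pfn_t_to_page(pfn);
}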

To be honest I do not yet understand what metadata Xen wants to store
in the device, but it seems the producer and consumer of that metadata
is Xen itself and not the wider Linux kernel as is the case with
struct page. Can you fill me in on what problem Xen solves with this
reservation?
Konrad Rzeszutek Wilk
2016-10-11 19:50:11 UTC
Permalink
Post by Dan Williams
On Tue, Oct 11, 2016 at 11:33 AM, Konrad Rzeszutek Wilk
[..]
Post by Konrad Rzeszutek Wilk
Post by Dan Williams
Right, but why does the libnvdimm core need to know about this
specific Xen reservation? For example, if Xen wants some in-kernel
Let me turn this around - why does the libnvdimm core need to know about
Linux specific parts? Shouldn't this be OS agnostic, so that FreeBSD
for example can also poke a hole in this and fill it with its
OS-management meta-data?
Specifically the core needs to know so that it can answer the Linux
specific question of whether the pfn returned by ->direct_access() has
a corresponding struct page or not. It's tied to the lifetime of the
device and the usage of the reservation needs to be coordinated
against the references of those pages. If FreeBSD decides it needs to
reserve "struct page" capacity at the start of the device, I would
hope that it reuses the same on-device info block that Linux is using
and not create a new "FreeBSD-mode" device type.
The issue here (as I understand it; I may be missing something new)
is that the size of this special namespace may be different. That is,
the 'struct page' on FreeBSD could be 256 bytes while on Linux it is
64 bytes (numbers pulled out of the sky).

Hence one would have to expand the reservation (or some such) to re-use this.
Post by Dan Williams
To be honest I do not yet understand what metadata Xen wants to store
in the device, but it seems the producer and consumer of that metadata
is Xen itself and not the wider Linux kernel as is the case with
struct page. Can you fill me in on what problem Xen solves with this
Exactly!
Post by Dan Williams
reservation?
The same as Linux - its variant of 'struct page'. Which I think is
smaller than the Linux one, but perhaps it is not?
Dan Williams
2016-10-11 20:18:08 UTC
Permalink
On Tue, Oct 11, 2016 at 12:48 PM, Konrad Rzeszutek Wilk
Post by Konrad Rzeszutek Wilk
Post by Dan Williams
On Tue, Oct 11, 2016 at 11:33 AM, Konrad Rzeszutek Wilk
[..]
Post by Konrad Rzeszutek Wilk
Post by Dan Williams
Right, but why does the libnvdimm core need to know about this
specific Xen reservation? For example, if Xen wants some in-kernel
Let me turn this around - why does the libnvdimm core need to know about
Linux specific parts? Shouldn't this be OS agnostic, so that FreeBSD
for example can also poke a hole in this and fill it with its
OS-management meta-data?
Specifically the core needs to know so that it can answer the Linux
specific question of whether the pfn returned by ->direct_access() has
a corresponding struct page or not. It's tied to the lifetime of the
device and the usage of the reservation needs to be coordinated
against the references of those pages. If FreeBSD decides it needs to
reserve "struct page" capacity at the start of the device, I would
hope that it reuses the same on-device info block that Linux is using
and not create a new "FreeBSD-mode" device type.
The issue here (as I understand, I may be missing something new)
is that the size of this special namespace may be different. That is
the 'struct page' on FreeBSD could be 256 bytes while on Linux it is
64 bytes (numbers pulled out of the sky).
Hence one would have to expand or such to re-use this.
Sure, but we could support that today. If FreeBSD lays down the info
block it is free to make a bigger reservation and Linux would be happy
to use a smaller subset. If we, as an industry, want this "struct
page" reservation to be common we can take it to a standards body to
make it a cross-OS guarantee... but I think this is separate from the
Xen reservation.
Post by Konrad Rzeszutek Wilk
Post by Dan Williams
To be honest I do not yet understand what metadata Xen wants to store
in the device, but it seems the producer and consumer of that metadata
is Xen itself and not the wider Linux kernel as is the case with
struct page. Can you fill me in on what problem Xen solves with this
Exactly!
Post by Dan Williams
reservation?
The same as Linux - its variant of 'struct page'. Which I think is
smaller than the Linux one, but perhaps it is not?
If the hypervisor needs to know where it can store some metadata, can
that be satisfied with userspace tooling in Dom0? Something like,
"/dev/pmem0p1 == Xen metadata" and "/dev/pmem0p2 == DAX filesystem
with files to hand to guests". So my question is not about the
rationale for having metadata, it's why does the Linux kernel need to
know about the Xen reservation? As far as I can see it is independent
/ opaque to the kernel.
Haozhong Zhang
2016-10-12 10:33:33 UTC
Permalink
Post by Dan Williams
On Tue, Oct 11, 2016 at 12:48 PM, Konrad Rzeszutek Wilk
Post by Konrad Rzeszutek Wilk
Post by Dan Williams
On Tue, Oct 11, 2016 at 11:33 AM, Konrad Rzeszutek Wilk
[..]
Post by Konrad Rzeszutek Wilk
Post by Dan Williams
Right, but why does the libnvdimm core need to know about this
specific Xen reservation? For example, if Xen wants some in-kernel
Let me turn this around - why does the libnvdimm core need to know about
Linux specific parts? Shouldn't this be OS agnostic, so that FreeBSD
for example can also poke a hole in this and fill it with its
OS-management meta-data?
Specifically the core needs to know so that it can answer the Linux
specific question of whether the pfn returned by ->direct_access() has
a corresponding struct page or not. It's tied to the lifetime of the
device and the usage of the reservation needs to be coordinated
against the references of those pages. If FreeBSD decides it needs to
reserve "struct page" capacity at the start of the device, I would
hope that it reuses the same on-device info block that Linux is using
and not create a new "FreeBSD-mode" device type.
The issue here (as I understand, I may be missing something new)
is that the size of this special namespace may be different. That is
the 'struct page' on FreeBSD could be 256 bytes while on Linux it is
64 bytes (numbers pulled out of the sky).
Hence one would have to expand or such to re-use this.
Sure, but we could support that today. If FreeBSD lays down the info
block it is free to make a bigger reservation and Linux would be happy
to use a smaller subset. If we, as an industry, want this "struct
page" reservation to be common we can take it to a standards body to
make as a cross-OS guarantee... but I think this is separate from the
Xen reservation.
Post by Konrad Rzeszutek Wilk
Post by Dan Williams
To be honest I do not yet understand what metadata Xen wants to store
in the device, but it seems the producer and consumer of that metadata
is Xen itself and not the wider Linux kernel as is the case with
struct page. Can you fill me in on what problem Xen solves with this
Exactly!
Post by Dan Williams
reservation?
The same as Linux - its variant of 'struct page'. Which I think is
smaller than the Linux one, but perhaps it is not?
If the hypervisor needs to know where it can store some metadata, can
that be satisfied with userspace tooling in Dom0? Something like,
"/dev/pmem0p1 == Xen metadata" and "/dev/pmem0p2 == DAX filesystem
with files to hand to guests". So my question is not about the
rationale for having metadata, it's why does the Linux kernel need to
know about the Xen reservation? As far as I can see it is independent
/ opaque to the kernel.
Thanks everyone for all these comments!

How about doing the reservation in the following way:

1. Create partition(s) on /dev/pmemX and make sure the space between the
   partition table (plus any padding) and the first partition is large
   enough to hold Xen's management structures and a super block
   introduced in step 2. The space left after the partition table, the
   padding and the super block will be used as the reserved area.

2. Write a super block just before the above reserved area. The super
   block records the base address and the size of the reserved area. It
   also contains a signature and a checksum to identify itself.

The layout is shown in the following diagram.

   +---------------+-----------+-------+----------+--------------+
   | whatever used | Partition | Super | Reserved | /dev/pmem0p1 |
   | by kernel     | Table     | Block | for Xen  |              |
   +---------------+-----------+-------+----------+--------------+
    \____________________________  ___________________________/
                                  V
                             /dev/pmem0

The above two steps can be done by a userspace program and do not
require the Xen hypervisor to be running. The partitions on the device
can be used regardless of the existence of the Xen hypervisor.

3. When Xen is running, implement a function in the Dom0 Linux Xen
   driver (drivers/xen/) to respond to udevd events that report the
   detection of pmem regions.

   This function searches the pmem region for the super block created
   in step 2. If one is found, it knows this pmem region has been
   prepared for Xen usage.

   Then it gets the base address and size of the reserved area (from
   the super block) and the entire address ranges of the pmem region
   (from the pmem driver), and reports them to the Xen hypervisor.

   The implementation of this step can be entirely contained in the
   kernel Xen driver. (It may also be implemented as a udevd service
   in userspace, if that is not considered unsafe.)
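A minimal sketch of what such a super block might look like (the field
names, sizes and signature below are purely illustrative, not a proposed
on-disk format):

#include <stdint.h>
#include <string.h>

#define XEN_PMEM_SB_SIG "XENRSVD"              /* hypothetical signature */

struct xen_pmem_sb {
        char     signature[8];                 /* XEN_PMEM_SB_SIG */
        uint32_t version;
        uint32_t flags;
        uint64_t reserved_off;                 /* byte offset of the reserved
                                                  area from the start of
                                                  /dev/pmem0 */
        uint64_t reserved_len;                 /* size of the reserved area */
        uint64_t checksum;                     /* over this struct with the
                                                  checksum field zeroed */
};

/* Step 3 would scan the pmem region for this block and, on a match,
 * report [reserved_off, reserved_off + reserved_len) plus the SPA range
 * of the whole region to the hypervisor. */
static int xen_pmem_sb_valid(const struct xen_pmem_sb *sb)
{
        /* a real implementation would also verify version and checksum */
        return memcmp(sb->signature, XEN_PMEM_SB_SIG,
                      sizeof(sb->signature)) == 0;
}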

Thanks,
Haozhong
Jan Beulich
2016-10-12 11:33:19 UTC
Permalink
Post by Haozhong Zhang
The layout is shown as the following diagram.
+---------------+-----------+-------+----------+--------------+
| whatever used | Partition | Super | Reserved | /dev/pmem0p1 |
| by kernel | Table | Block | for Xen | |
+---------------+-----------+-------+----------+--------------+
\_____________________ _______________________/
V
/dev/pmem0
I have to admit that I dislike this, for not being OS-agnostic.
Neither should there be any Xen-specific region, nor should the
"whatever used by kernel" one be restricted to just Linux. What
I could see is an OS-reserved area ahead of the partition table,
the exact usage of which depends on which OS is currently
running (and in the Xen case this might be both Xen _and_ the
Dom0 kernel, arbitrated by a TBD protocol). After all, when
running under Xen, the Dom0 may not have a need for as much
control data as it has when running on bare hardware, since it
controls less (if any) of the actual memory ranges when Xen
is present.

The assumption of course is that the reserved area holds no
persistent data. If that assumption didn't hold, you'd have to
have per-OS reserved areas anyway (as many of them as
there might be OSes [planned to get] installed on a particular
system).

Jan
Haozhong Zhang
2016-10-12 15:00:52 UTC
Permalink
Post by Jan Beulich
Post by Haozhong Zhang
The layout is shown as the following diagram.
+---------------+-----------+-------+----------+--------------+
| whatever used | Partition | Super | Reserved | /dev/pmem0p1 |
| by kernel | Table | Block | for Xen | |
+---------------+-----------+-------+----------+--------------+
\_____________________ _______________________/
V
/dev/pmem0
I have to admit that I dislike this, for not being OS-agnostic.
Neither should there be any Xen-specific region, nor should the
"whatever used by kernel" one be restricted to just Linux. What
I could see is an OS-reserved area ahead of the partition table,
the exact usage of which depends on which OS is currently
running (and in the Xen case this might be both Xen _and_ the
Dom0 kernel, arbitrated by a tbd protocol). After all, when
running under Xen, the Dom0 may not have a need for as much
control data as it has when running on bare hardware, for it
controlling less (if any) of the actual memory ranges when Xen
is present.
Isn't this OS-reserved area still not OS-agnostic, as it requires the OS
to know where the reserved area is? Or do you mean it is OS-agnostic as
long as it's defined by a protocol that is accepted by all OSes?

Let me list another two methods that have just come to my mind.

1. The first method extends the usage of the super block used by the
   current Linux kernel to reserve space on pmem.

   The current Linux kernel places a super block with the following
   structure near the beginning of a pmem namespace.

struct nd_pfn_sb {
u8 signature[PFN_SIG_LEN];
u8 uuid[16];
u8 parent_uuid[16];
__le32 flags;
__le16 version_major;
__le16 version_minor;
__le64 dataoff; /* relative to namespace_base + start_pad */
__le64 npfns;
__le32 mode;
/* minor-version-1 additions for section alignment */
__le32 start_pad;
__le32 end_trunc;
/* minor-version-2 record the base alignment of the mapping */
__le32 align;
u8 padding[4000];
__le64 checksum;
};

   Two interesting fields here are 'dataoff' and 'mode':
   - 'dataoff' indicates the offset where the data area starts,
     i.e., IIUC, the part that can be accessed via /dev/pmemN or
     /dev/daxN.
   - 'mode' indicates whether Linux puts the struct page for this
     namespace in RAM (= PFN_MODE_RAM) or on the device (=
     PFN_MODE_PMEM).

   Currently for Linux, only 'mode' is customizable, while 'dataoff'
   is not. If mode == PFN_MODE_RAM, no reservation for struct page is
   made on the device, and dataoff starts almost immediately after the
   super block, except for a small reserved area in between for other
   structures and alignment. If mode == PFN_MODE_PMEM, the size of the
   reservation is decided by the kernel, i.e. 64 bytes per struct page.

   I propose to make the size of the reserved area customizable,
   e.g. via ioctl and ndctl.
   - If mode == PFN_MODE_PMEM and
     * if the given reserved size is large enough to hold what an OS
       (not limited to Linux) wants to put in it, then the OS just
       starts using it as desired;
     * if the given reserved size is not enough, then the OS reports
       an error and may take other fallback actions.
   - If mode == PFN_MODE_RAM and
     * if the reserved size is zero, then it's the current way that
       Linux uses the device;
     * if the reserved size is non-zero, I would like to reserve this
       case for hypervisor (right now, namely Xen) usage. That is,
       the OS should not use the reserved area. For Xen, we could add
       a function to the kernel Xen driver to report the reserved
       area to the hypervisor.

   I guess this might be the OS-agnostic way Jan expects, but Dan may
   object to it.
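   For illustration only (a sketch under the assumptions above, not code
   from the patches): with a customizable reservation, a Dom0 Xen
   component could derive the reserved range from the nd_pfn_sb fields
   shown above and hand it to the hypervisor. The 8K offset for the info
   block area is an assumption.

#include <linux/kernel.h>
#include <linux/sizes.h>

/* struct nd_pfn_sb as shown above (drivers/nvdimm/pfn.h) */
static void xen_pmem_reserved_range(struct nd_pfn_sb *pfn_sb, u64 ns_base,
                                    u64 *start, u64 *end)
{
        u64 base = ns_base + le32_to_cpu(pfn_sb->start_pad);

        *start = base + SZ_8K;                        /* assumed info block area */
        *end   = base + le64_to_cpu(pfn_sb->dataoff); /* dataoff is relative to
                                                         namespace_base + start_pad */
}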


2. Lay another pseudo device on top of the block device (e.g.
   /dev/pmemN) provided by the NVDIMM driver.

   This pseudo device can reserve space according to the user's
   requirement. The reservation information can be persistently
   recorded in a super block before the reserved area.

   This pseudo device also implements another pseudo block device to
   allow the non-reserved area to be accessed as a block device (we
   can even implement it as DAX-capable).

                                                pseudo block device
                                              /---------^-----------\
   +------------------+-------+---------------+-----------------------+
   | whatever used    | Super | reserved by   |                       |
   | by NVDIMM driver | Block | pseudo device |                       |
   +------------------+-------+---------------+-----------------------+
    \________________________________  ______________________________/
                                      V
                                 /dev/pmem0
                        (provided by NVDIMM driver)

   In order to make it work across different OSes, it requires that
   other OSes recognize the same types of pmem block devices made by
   Linux and implement the driver for the pseudo device.

   This is inspired by Dan's reply at
   https://lists.xenproject.org/archives/html/xen-devel/2016-10/msg00651.html.

   However, it's essentially the same as my partition solution, so I
   guess Jan will still dislike it.


Any comments?
Post by Jan Beulich
The assumption of course is that the reserved area holds no
persistent data. If that assumption didn't hold, you'd have to
have per-OS reserved areas anyway (as many of them as
there might be OSes [planned to get] installed on a particular
system).
No persistent data should be placed in the reserved area.

Thanks,
Haozhong
Jan Beulich
2016-10-12 15:39:58 UTC
Permalink
Post by Haozhong Zhang
Post by Jan Beulich
Post by Haozhong Zhang
The layout is shown as the following diagram.
+---------------+-----------+-------+----------+--------------+
| whatever used | Partition | Super | Reserved | /dev/pmem0p1 |
| by kernel | Table | Block | for Xen | |
+---------------+-----------+-------+----------+--------------+
\_____________________ _______________________/
V
/dev/pmem0
I have to admit that I dislike this, for not being OS-agnostic.
Neither should there be any Xen-specific region, nor should the
"whatever used by kernel" one be restricted to just Linux. What
I could see is an OS-reserved area ahead of the partition table,
the exact usage of which depends on which OS is currently
running (and in the Xen case this might be both Xen _and_ the
Dom0 kernel, arbitrated by a tbd protocol). After all, when
running under Xen, the Dom0 may not have a need for as much
control data as it has when running on bare hardware, for it
controlling less (if any) of the actual memory ranges when Xen
is present.
Isn't this OS-reserved area still not OS-agnostic, as it requires OS
to know where the reserved area is? Or do you mean it's not if it's
defined by a protocol that is accepted by all OSes?
The latter - we clearly won't get away without some agreement on
where to retrieve position and size of this area. I was simply
assuming that such a protocol already exists.

Jan
Dan Williams
2016-10-12 15:43:01 UTC
Permalink
Post by Jan Beulich
Post by Haozhong Zhang
Post by Jan Beulich
Post by Haozhong Zhang
The layout is shown as the following diagram.
+---------------+-----------+-------+----------+--------------+
| whatever used | Partition | Super | Reserved | /dev/pmem0p1 |
| by kernel | Table | Block | for Xen | |
+---------------+-----------+-------+----------+--------------+
\_____________________ _______________________/
V
/dev/pmem0
I have to admit that I dislike this, for not being OS-agnostic.
Neither should there be any Xen-specific region, nor should the
"whatever used by kernel" one be restricted to just Linux. What
I could see is an OS-reserved area ahead of the partition table,
the exact usage of which depends on which OS is currently
running (and in the Xen case this might be both Xen _and_ the
Dom0 kernel, arbitrated by a tbd protocol). After all, when
running under Xen, the Dom0 may not have a need for as much
control data as it has when running on bare hardware, for it
controlling less (if any) of the actual memory ranges when Xen
is present.
Isn't this OS-reserved area still not OS-agnostic, as it requires OS
to know where the reserved area is? Or do you mean it's not if it's
defined by a protocol that is accepted by all OSes?
The latter - we clearly won't get away without some agreement on
where to retrieve position and size of this area. I was simply
assuming that such a protocol already exists.
No, we should not mix the struct page reservation that the Dom0 kernel
may actively use with the Xen reservation that the Dom0 kernel does
not consume. Explain again what is wrong with the partition approach?
Dan Williams
2016-10-12 16:20:21 UTC
Permalink
Post by Dan Williams
Post by Jan Beulich
Post by Haozhong Zhang
Post by Jan Beulich
Post by Haozhong Zhang
The layout is shown as the following diagram.
+---------------+-----------+-------+----------+--------------+
| whatever used | Partition | Super | Reserved | /dev/pmem0p1 |
| by kernel | Table | Block | for Xen | |
+---------------+-----------+-------+----------+--------------+
\_____________________ _______________________/
V
/dev/pmem0
I have to admit that I dislike this, for not being OS-agnostic.
Neither should there be any Xen-specific region, nor should the
"whatever used by kernel" one be restricted to just Linux. What
I could see is an OS-reserved area ahead of the partition table,
the exact usage of which depends on which OS is currently
running (and in the Xen case this might be both Xen _and_ the
Dom0 kernel, arbitrated by a tbd protocol). After all, when
running under Xen, the Dom0 may not have a need for as much
control data as it has when running on bare hardware, for it
controlling less (if any) of the actual memory ranges when Xen
is present.
Isn't this OS-reserved area still not OS-agnostic, as it requires OS
to know where the reserved area is? Or do you mean it's not if it's
defined by a protocol that is accepted by all OSes?
The latter - we clearly won't get away without some agreement on
where to retrieve position and size of this area. I was simply
assuming that such a protocol already exists.
No, we should not mix the struct page reservation that the Dom0 kernel
may actively use with the Xen reservation that the Dom0 kernel does
not consume. Explain again what is wrong with the partition approach?
Not sure what was unclear in my previous reply. I don't think there
should be a priori knowledge of whether Xen is (going to be) used on
a system, and even if it gets used, but just occasionally, it would
(apart from the abstract considerations already given) be a waste
of resources to set something aside that could be used for other
purposes while Xen is not running. Static partitioning should only be
needed for persistent data.
The reservation needs to be persistent / static even if the data is
volatile, as is the case with struct page, because we can't have the
size of the device change depending on use. So, from the aspect of
wasting space while Xen is not in use, both partitions and the
intrinsic reservation approach suffer the same problem. Setting that
aside I don't want to mix 2 different use cases into the same
reservation.

The kernel needs to know about the struct page reservation because it
needs to manage the lifetime of page references vs the lifetime of the
device. It does not have the same relationship with a Xen reservation
which is why I'm proposing they be managed separately.

Note that Toshi and Mike added DM for DAX. This enabling ends up
writing DM metadata on the device without adding new reservation
mechanisms to the nvdimm core. I'm struggling to see how the Xen use
case is materially different from DM. In the end it's an
application-specific metadata space.
Jan Beulich
2016-10-13 08:35:02 UTC
Permalink
Post by Dan Williams
Post by Dan Williams
Post by Jan Beulich
Post by Haozhong Zhang
Post by Jan Beulich
Post by Haozhong Zhang
The layout is shown as the following diagram.
+---------------+-----------+-------+----------+--------------+
| whatever used | Partition | Super | Reserved | /dev/pmem0p1 |
| by kernel | Table | Block | for Xen | |
+---------------+-----------+-------+----------+--------------+
\_____________________ _______________________/
V
/dev/pmem0
I have to admit that I dislike this, for not being OS-agnostic.
Neither should there be any Xen-specific region, nor should the
"whatever used by kernel" one be restricted to just Linux. What
I could see is an OS-reserved area ahead of the partition table,
the exact usage of which depends on which OS is currently
running (and in the Xen case this might be both Xen _and_ the
Dom0 kernel, arbitrated by a tbd protocol). After all, when
running under Xen, the Dom0 may not have a need for as much
control data as it has when running on bare hardware, for it
controlling less (if any) of the actual memory ranges when Xen
is present.
Isn't this OS-reserved area still not OS-agnostic, as it requires OS
to know where the reserved area is? Or do you mean it's not if it's
defined by a protocol that is accepted by all OSes?
The latter - we clearly won't get away without some agreement on
where to retrieve position and size of this area. I was simply
assuming that such a protocol already exists.
No, we should not mix the struct page reservation that the Dom0 kernel
may actively use with the Xen reservation that the Dom0 kernel does
not consume. Explain again what is wrong with the partition approach?
Not sure what was unclear in my previous reply. I don't think there
should be apriori knowledge of whether Xen is (going to be) used on
a system, and even if it gets used, but just occasionally, it would
(apart from the abstract considerations already given) be a waste
of resources to set something aside that could be used for other
purposes while Xen is not running. Static partitioning should only be
needed for persistent data.
The reservation needs to be persistent / static even if the data is
volatile, as is the case with struct page, because we can't have the
size of the device change depending on use. So, from the aspect of
wasting space while Xen is not in use, both partitions and the
intrinsic reservation approach suffer the same problem. Setting that
aside I don't want to mix 2 different use cases into the same
reservation.
Then you didn't understand what I've said: I certainly didn't mean
the reservation to vary from a device perspective. However, when
Xen is in use I don't see why part of that static reservation couldn't
be used by Xen, and another part by the Dom0 kernel. The kernel
obviously would need to ask the hypervisor how much of the space
is left, and where that area starts.
Post by Dan Williams
The kernel needs to know about the struct page reservation because it
needs to manage the lifetime of page references vs the lifetime of the
device. It does not have the same relationship with a Xen reservation
which is why I'm proposing they be managed separately.
I don't think I understand the difference you are trying to point out
here. Linux's struct page and Xen's struct page_info serve the same
fundamental purpose.

Jan
Haozhong Zhang
2016-10-13 08:55:07 UTC
Permalink
Post by Jan Beulich
Post by Dan Williams
Post by Dan Williams
Post by Jan Beulich
Post by Haozhong Zhang
Post by Jan Beulich
Post by Haozhong Zhang
The layout is shown as the following diagram.
+---------------+-----------+-------+----------+--------------+
| whatever used | Partition | Super | Reserved | /dev/pmem0p1 |
| by kernel | Table | Block | for Xen | |
+---------------+-----------+-------+----------+--------------+
\_____________________ _______________________/
V
/dev/pmem0
I have to admit that I dislike this, for not being OS-agnostic.
Neither should there be any Xen-specific region, nor should the
"whatever used by kernel" one be restricted to just Linux. What
I could see is an OS-reserved area ahead of the partition table,
the exact usage of which depends on which OS is currently
running (and in the Xen case this might be both Xen _and_ the
Dom0 kernel, arbitrated by a tbd protocol). After all, when
running under Xen, the Dom0 may not have a need for as much
control data as it has when running on bare hardware, for it
controlling less (if any) of the actual memory ranges when Xen
is present.
Isn't this OS-reserved area still not OS-agnostic, as it requires OS
to know where the reserved area is? Or do you mean it's not if it's
defined by a protocol that is accepted by all OSes?
The latter - we clearly won't get away without some agreement on
where to retrieve position and size of this area. I was simply
assuming that such a protocol already exists.
No, we should not mix the struct page reservation that the Dom0 kernel
may actively use with the Xen reservation that the Dom0 kernel does
not consume. Explain again what is wrong with the partition approach?
Not sure what was unclear in my previous reply. I don't think there
should be apriori knowledge of whether Xen is (going to be) used on
a system, and even if it gets used, but just occasionally, it would
(apart from the abstract considerations already given) be a waste
of resources to set something aside that could be used for other
purposes while Xen is not running. Static partitioning should only be
needed for persistent data.
The reservation needs to be persistent / static even if the data is
volatile, as is the case with struct page, because we can't have the
size of the device change depending on use. So, from the aspect of
wasting space while Xen is not in use, both partitions and the
intrinsic reservation approach suffer the same problem. Setting that
aside I don't want to mix 2 different use cases into the same
reservation.
Then you didn't understand what I've said: I certainly didn't mean
the reservation to vary from a device perspective. However, when
Xen is in use I don't see why part of that static reservation couldn't
be used by Xen, and another part by the Dom0 kernel. The kernel
obviously would need to ask the hypervisor how much of the space
is left, and where that area starts.
I think Dan means that there should be a clear separation between
reservations for different usages (kernel/xen/...). The libnvdimm
driver is for the Linux kernel and only needs to maintain the
reservation for kernel functionality. Others, including xen/dm/...,
that want a reservation for their own purposes should maintain their
own reservations outside the libnvdimm driver and avoid bothering the
libnvdimm driver (e.g. by adding specific handling to it).

IIUC, one existing example is the device-mapper (dm) device, which
needs to reserve an on-device area for its own metadata. Its choice is
to store the metadata on the block device (/dev/pmemN) provided by the
libnvdimm driver.

I think we can do something similar for Xen, i.e. lay another pseudo
device on /dev/pmem and do the reservation there, like option 2 in my
previous reply.

Thanks,
Haozhong
Post by Jan Beulich
Post by Dan Williams
The kernel needs to know about the struct page reservation because it
needs to manage the lifetime of page references vs the lifetime of the
device. It does not have the same relationship with a Xen reservation
which is why I'm proposing they be managed separately.
I don't think I understand the difference you try to point out here.
Linux'es struct page and Xen's struct page_info serve the same
fundamental purpose.
Jan
Jan Beulich
2016-10-13 09:08:17 UTC
Permalink
Post by Haozhong Zhang
Post by Jan Beulich
Post by Dan Williams
Post by Dan Williams
Post by Jan Beulich
Post by Haozhong Zhang
Post by Jan Beulich
Post by Haozhong Zhang
The layout is shown as the following diagram.
+---------------+-----------+-------+----------+--------------+
| whatever used | Partition | Super | Reserved | /dev/pmem0p1 |
| by kernel | Table | Block | for Xen | |
+---------------+-----------+-------+----------+--------------+
\_____________________ _______________________/
V
/dev/pmem0
I have to admit that I dislike this, for not being OS-agnostic.
Neither should there be any Xen-specific region, nor should the
"whatever used by kernel" one be restricted to just Linux. What
I could see is an OS-reserved area ahead of the partition table,
the exact usage of which depends on which OS is currently
running (and in the Xen case this might be both Xen _and_ the
Dom0 kernel, arbitrated by a tbd protocol). After all, when
running under Xen, the Dom0 may not have a need for as much
control data as it has when running on bare hardware, for it
controlling less (if any) of the actual memory ranges when Xen
is present.
Isn't this OS-reserved area still not OS-agnostic, as it requires OS
to know where the reserved area is? Or do you mean it's not if it's
defined by a protocol that is accepted by all OSes?
The latter - we clearly won't get away without some agreement on
where to retrieve position and size of this area. I was simply
assuming that such a protocol already exists.
No, we should not mix the struct page reservation that the Dom0 kernel
may actively use with the Xen reservation that the Dom0 kernel does
not consume. Explain again what is wrong with the partition approach?
Not sure what was unclear in my previous reply. I don't think there
should be apriori knowledge of whether Xen is (going to be) used on
a system, and even if it gets used, but just occasionally, it would
(apart from the abstract considerations already given) be a waste
of resources to set something aside that could be used for other
purposes while Xen is not running. Static partitioning should only be
needed for persistent data.
The reservation needs to be persistent / static even if the data is
volatile, as is the case with struct page, because we can't have the
size of the device change depending on use. So, from the aspect of
wasting space while Xen is not in use, both partitions and the
intrinsic reservation approach suffer the same problem. Setting that
aside I don't want to mix 2 different use cases into the same
reservation.
Then you didn't understand what I've said: I certainly didn't mean
the reservation to vary from a device perspective. However, when
Xen is in use I don't see why part of that static reservation couldn't
be used by Xen, and another part by the Dom0 kernel. The kernel
obviously would need to ask the hypervisor how much of the space
is left, and where that area starts.
I think Dan means that there should be a clear separation between
reservations for different usages (kernel/xen/...). The libnvdimm
driver is for the linux kernel and only needs to maintain the
reservation for kernel functionality. For others including xen/dm/...,
if they want reservation for their own purpose, they should maintain
their own reservations out of libnvdimm driver and avoid bothering the
libnvdimm driver (e.g. add specific handling in libnvdimm driver).
IIUC, one existing example is device-mapper device (dm) which needs to
reserve on-device area for its own meta-data. Its choice is to store
the meta-data on the block device (/dev/pmemN) provided by the
libnvdimm driver.
I think we can do the similar for Xen, like to lay another pseudo
device on /dev/pmem and do the reservation, like 2. in my previous
reply.
Well, my opinion certainly doesn't count much here, but I continue to
consider this a bad idea. For entities like drivers it may well be
appropriate, but I think there ought to be an independent concept
of "OS reserved", and in the Xen case this could then be shared
between hypervisor and Dom0 kernel. Or if we were to consider Dom0
"just a guest", things should even be the other way around: Xen gets
all of the OS reserved space, and Dom0 needs something custom.

Jan
Dan Williams
2016-10-13 15:41:11 UTC
Permalink
On Thu, Oct 13, 2016 at 2:08 AM, Jan Beulich <***@suse.com> wrote:
[..]
Post by Jan Beulich
Post by Haozhong Zhang
I think we can do the similar for Xen, like to lay another pseudo
device on /dev/pmem and do the reservation, like 2. in my previous
reply.
Well, my opinion certainly doesn't count much here, but I continue to
consider this a bad idea. For entities like drivers it may well be
appropriate, but I think there ought to be an independent concept
of "OS reserved", and in the Xen case this could then be shared
between hypervisor and Dom0 kernel. Or if we were to consider Dom0
"just a guest", things should even be the other way around: Xen gets
all of the OS reserved space, and Dom0 needs something custom.
You haven't made the case why Xen is special and other applications of
persistent memory are not. The current struct page reservation
supports fundamental addressability of persistent memory namespaces
for the rest of the kernel. The Xen reservation is application
specific. XFS, EXT4, and DM also have application-specific usages of
persistent memory and consume metadata space out of a block device. If
we don't need an XFS-mode nvdimm device, why do we need Xen-mode?
Andrew Cooper
2016-10-13 16:03:42 UTC
Permalink
Post by Dan Williams
[..]
Post by Jan Beulich
Post by Haozhong Zhang
I think we can do the similar for Xen, like to lay another pseudo
device on /dev/pmem and do the reservation, like 2. in my previous
reply.
Well, my opinion certainly doesn't count much here, but I continue to
consider this a bad idea. For entities like drivers it may well be
appropriate, but I think there ought to be an independent concept
of "OS reserved", and in the Xen case this could then be shared
between hypervisor and Dom0 kernel. Or if we were to consider Dom0
"just a guest", things should even be the other way around: Xen gets
all of the OS reserved space, and Dom0 needs something custom.
You haven't made the case why Xen is special and other applications of
persistent memory are not.
In a Xen system, Xen runs in the baremetal root-mode ring0, and dom0 is
a VM running in ring1/3 with the nvdimm driver. This is the opposite
way around to the KVM model.

Dom0, being the hardware domain, has default ownership of all the
hardware, but to gain access in the first place, it must request a
mapping from Xen. Xen therefore needs to know and cope with being able
to give dom0 a mapping to the nvdimms, without touching the content of
the nvdimm itself (so as to avoid corrupting data).

Once dom0 has a mapping of the nvdimm, the nvdimm driver can go to work
and figure out what is on the DIMM, and which areas are safe to use.

At this point, a Xen subsystem in Linux could choose one or more areas
to hand back to the hypervisor to use as RAM/other.

~Andrew
Dan Williams
2016-10-13 19:08:21 UTC
Permalink
On Thu, Oct 13, 2016 at 9:01 AM, Andrew Cooper
Post by Andrew Cooper
Post by Dan Williams
[..]
Post by Jan Beulich
Post by Haozhong Zhang
I think we can do the similar for Xen, like to lay another pseudo
device on /dev/pmem and do the reservation, like 2. in my previous
reply.
Well, my opinion certainly doesn't count much here, but I continue to
consider this a bad idea. For entities like drivers it may well be
appropriate, but I think there ought to be an independent concept
of "OS reserved", and in the Xen case this could then be shared
between hypervisor and Dom0 kernel. Or if we were to consider Dom0
"just a guest", things should even be the other way around: Xen gets
all of the OS reserved space, and Dom0 needs something custom.
You haven't made the case why Xen is special and other applications of
persistent memory are not.
In a Xen system, Xen runs in the baremetal root-mode ring0, and dom0 is
a VM running in ring1/3 with the nvdimm driver. This is the opposite
way around to the KVM model.
Dom0, being the hardware domain, has default ownership of all the
hardware, but to gain access in the first place, it must request a
mapping from Xen.
This is where my understanding of the Xen model breaks down. Are you
saying dom0 can't access the persistent memory range unless the ring0
agent has metadata storage space for tracking what it maps into dom0?
That can't be true, because then PCI memory ranges would not work
without metadata reserve space. Dom0 still needs to map and write the
DIMMs to even set up the struct page reservation; it isn't established
by default.
Post by Andrew Cooper
Xen therefore needs to know and cope with being able
to give dom0 a mapping to the nvdimms, without touching the content of
the nvidmm itself (so as to avoid corrupting data).
Is it true that this metadata only comes into use when remapping the
dom0 discovered range(s) into a guest VM?
Post by Andrew Cooper
Once dom0 has a mapping of the nvdimm, the nvdimm driver can go to work
and figure out what is on the DIMM, and which areas are safe to use.
I don't understand this ordering of events. Dom0 needs to have a
mapping to even write the on-media structure to indicate a
reservation. So, initial dom0 access can't depend on metadata
reservation already being present.
Post by Andrew Cooper
At this point, a Xen subsystem in Linux could choose one or more areas
to hand back to the hypervisor to use as RAM/other.
To me all this configuration seems to come after the fact. After dom0
sees /dev/pmemX devices, it can go to work carving them up and
writing Xen-specific metadata to the range(s). The struct page
reservation never comes into the picture. In fact, a raw mode
namespace (one without a reservation) could be used in this model; the
nvdimm core never needs to know what is happening.
Andrew Cooper
2016-10-13 19:34:40 UTC
Permalink
Post by Dan Williams
On Thu, Oct 13, 2016 at 9:01 AM, Andrew Cooper
Post by Andrew Cooper
Post by Dan Williams
[..]
Post by Jan Beulich
Post by Haozhong Zhang
I think we can do the similar for Xen, like to lay another pseudo
device on /dev/pmem and do the reservation, like 2. in my previous
reply.
Well, my opinion certainly doesn't count much here, but I continue to
consider this a bad idea. For entities like drivers it may well be
appropriate, but I think there ought to be an independent concept
of "OS reserved", and in the Xen case this could then be shared
between hypervisor and Dom0 kernel. Or if we were to consider Dom0
"just a guest", things should even be the other way around: Xen gets
all of the OS reserved space, and Dom0 needs something custom.
You haven't made the case why Xen is special and other applications of
persistent memory are not.
In a Xen system, Xen runs in the baremetal root-mode ring0, and dom0 is
a VM running in ring1/3 with the nvdimm driver. This is the opposite
way around to the KVM model.
Dom0, being the hardware domain, has default ownership of all the
hardware, but to gain access in the first place, it must request a
mapping from Xen.
This is where my understanding the Xen model breaks down. Are you
saying dom0 can't access the persistent memory range unless the ring0
agent has metadata storage space for tracking what it maps into dom0?
No. I am trying to point out that the current suggestion won't work, and
needs re-designing.

Xen *must* be able to properly configure mappings of the NVDIMM for
dom0, *without* modifying any content on the NVDIMM. Otherwise, data
corruption will occur.

Whether this means no Xen metadata, or the metadata living elsewhere in
regular ram, such as the main frametable, is an implementation detail.
Post by Dan Williams
Post by Andrew Cooper
Once dom0 has a mapping of the nvdimm, the nvdimm driver can go to work
and figure out what is on the DIMM, and which areas are safe to use.
I don't understand this ordering of events. Dom0 needs to have a
mapping to even write the on-media structure to indicate a
reservation. So, initial dom0 access can't depend on metadata
reservation already being present.
I agree.

Overall, I think the following is needed.

* Xen starts up.
** Xen might find some NVDIMM SPA/MFN ranges in the NFIT table, and
needs to note this information somehow.
** Xen might find some Type 7 E820 regions, and needs to note this
information somehow.
* Xen starts dom0.
* Once OSPM is running, a Xen component in Linux needs to collect and
report all NVDIMM SPA/MFN regions it knowns about.
** This covers the AML-only case, and the hotplug case.
* Dom0 requests a mapping of the NVDIMMs via the usual mechanism.
** This should work, as Xen is aware that there is something there to be
mapped (rather than just empty physical address space).
* Dom0 finds that some NVDIMM ranges are now available for use (probably
modelled as hotplug events).
* /dev/pmem $STUFF starts happening as normal.

At some point later, after dom0 policy decisions are made (ultimately,
by the host administrator):
* If an area of NVDIMM is chosen for Xen to use, Dom0 needs to inform
Xen of the SPA/MFN regions which are safe to use.
* Xen then incorporates these regions into its idea of RAM, and starts
using them for whatever.
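A minimal sketch of that last step from the Dom0 side, assuming a new
platform hypercall (the op name and structure fields below are
hypothetical, not the interface from the RFC patches):

#include <xen/interface/platform.h>
#include <asm/xen/hypercall.h>

/* Hypothetical: tell Xen which SPA range backs the pmem region and which
 * sub-range it may use for its frame table / M2P structures. */
static int xen_report_pmem_region(u64 spa_start, u64 spa_end,
                                  u64 rsv_start, u64 rsv_end)
{
        struct xen_platform_op op = {
                .cmd = XENPF_pmem_register,            /* hypothetical op */
        };

        op.u.pmem_register.spa_start = spa_start;      /* hypothetical fields */
        op.u.pmem_register.spa_end   = spa_end;
        op.u.pmem_register.rsv_start = rsv_start;
        op.u.pmem_register.rsv_end   = rsv_end;

        return HYPERVISOR_platform_op(&op);
}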

~Andrew
Haozhong Zhang
2016-10-14 07:09:13 UTC
Permalink
Post by Andrew Cooper
Post by Dan Williams
On Thu, Oct 13, 2016 at 9:01 AM, Andrew Cooper
Post by Andrew Cooper
Post by Dan Williams
[..]
Post by Jan Beulich
Post by Haozhong Zhang
I think we can do the similar for Xen, like to lay another pseudo
device on /dev/pmem and do the reservation, like 2. in my previous
reply.
Well, my opinion certainly doesn't count much here, but I continue to
consider this a bad idea. For entities like drivers it may well be
appropriate, but I think there ought to be an independent concept
of "OS reserved", and in the Xen case this could then be shared
between hypervisor and Dom0 kernel. Or if we were to consider Dom0
"just a guest", things should even be the other way around: Xen gets
all of the OS reserved space, and Dom0 needs something custom.
You haven't made the case why Xen is special and other applications of
persistent memory are not.
In a Xen system, Xen runs in the baremetal root-mode ring0, and dom0 is
a VM running in ring1/3 with the nvdimm driver. This is the opposite
way around to the KVM model.
Dom0, being the hardware domain, has default ownership of all the
hardware, but to gain access in the first place, it must request a
mapping from Xen.
This is where my understanding the Xen model breaks down. Are you
saying dom0 can't access the persistent memory range unless the ring0
agent has metadata storage space for tracking what it maps into dom0?
No. I am trying to point out that the current suggestion wont work, and
needs re-designing.
Xen *must* be able to properly configure mappings of the NVDIMM for
dom0, *without* modifying any content on the NVDIMM. Otherwise, data
corruption will occur.
Whether this means no Xen metadata, or the metadata living elsewhere in
regular ram, such as the main frametable, is an implementation detail.
Post by Dan Williams
Post by Andrew Cooper
Once dom0 has a mapping of the nvdimm, the nvdimm driver can go to work
and figure out what is on the DIMM, and which areas are safe to use.
I don't understand this ordering of events. Dom0 needs to have a
mapping to even write the on-media structure to indicate a
reservation. So, initial dom0 access can't depend on metadata
reservation already being present.
I agree.
Overall, I think the following is needed.
* Xen starts up.
** Xen might find some NVDIMM SPA/MFN ranges in the NFIT table, and
needs to note this information somehow.
** Xen might find some Type 7 E820 regions, and needs to note this
information somehow.
IIUC, this is to collect MFNs, and there is no need to create the frame
table and M2P at this stage. If so, what is different from ...
Post by Andrew Cooper
* Xen starts dom0.
* Once OSPM is running, a Xen component in Linux needs to collect and
report all NVDIMM SPA/MFN regions it knowns about.
** This covers the AML-only case, and the hotplug case.
... the MFNs reported here, especially since the former is a subset of
the latter (hotplugged ones are not included in the former).

(There are no E820 holes or SRAT entries to tell which address range is
reserved for hotplugged NVDIMMs.)
Post by Andrew Cooper
* Dom0 requests a mapping of the NVDIMMs via the usual mechanism.
Two questions:
1. Why is this request necessary? Even without such requests, as in my
   current implementation, Dom0 can still access the NVDIMM.

   Or do you mean the Xen hypervisor should by default disallow Dom0
   from accessing the MFNs reported in the previous step until they
   are requested?

2. Who initiates the requests? If it's the libnvdimm driver, that
   means we still need to introduce Xen-specific code into the driver.

   Or are the requests issued by OSPM (or the Xen component you
   mentioned above) when it probes new DIMMs?

   For the latter, Dan, do you think it's acceptable for the NFIT code
   to call the Xen component to request access permission for the pmem
   regions, e.g. in acpi_nfit_insert_resource()? Of course, it would
   only be used in the Dom0 case.
Post by Andrew Cooper
** This should work, as Xen is aware that there is something there to be
mapped (rather than just empty physical address space).
* Dom0 finds that some NVDIMM ranges are now available for use (probably
modelled as hotplug events).
* /dev/pmem $STUFF starts happening as normal.
At some pointer later after dom0 policy decisions are made (ultimately,
* If an area of NVDIMM is chosen for Xen to use, Dom0 needs to inform
Xen of the SPA/MFN regions which are safe to use.
* Xen then incorporates these regions into its idea of RAM, and starts
using them for whatever.
Agreed. I think we may not need to fix the way/format/... of making the
reservation, and can instead let the users (host administrators), who
have a better understanding of their data, make the proper decision.

In the worst case, where no reservation is made, the Xen hypervisor
could turn to using RAM for the NVDIMM management structures, at the
cost of less RAM for guests.

Thanks,
Haozhong
Andrew Cooper
2016-10-14 12:19:09 UTC
Permalink
Post by Haozhong Zhang
Post by Andrew Cooper
Post by Dan Williams
On Thu, Oct 13, 2016 at 9:01 AM, Andrew Cooper
Post by Andrew Cooper
Post by Dan Williams
[..]
Post by Jan Beulich
Post by Haozhong Zhang
I think we can do the similar for Xen, like to lay another pseudo
device on /dev/pmem and do the reservation, like 2. in my previous
reply.
Well, my opinion certainly doesn't count much here, but I
continue to
consider this a bad idea. For entities like drivers it may well be
appropriate, but I think there ought to be an independent concept
of "OS reserved", and in the Xen case this could then be shared
between hypervisor and Dom0 kernel. Or if we were to consider Dom0
"just a guest", things should even be the other way around: Xen gets
all of the OS reserved space, and Dom0 needs something custom.
You haven't made the case why Xen is special and other
applications of
persistent memory are not.
In a Xen system, Xen runs in the baremetal root-mode ring0, and dom0 is
a VM running in ring1/3 with the nvdimm driver. This is the opposite
way around to the KVM model.
Dom0, being the hardware domain, has default ownership of all the
hardware, but to gain access in the first place, it must request a
mapping from Xen.
This is where my understanding the Xen model breaks down. Are you
saying dom0 can't access the persistent memory range unless the ring0
agent has metadata storage space for tracking what it maps into dom0?
No. I am trying to point out that the current suggestion wont work, and
needs re-designing.
Xen *must* be able to properly configure mappings of the NVDIMM for
dom0, *without* modifying any content on the NVDIMM. Otherwise, data
corruption will occur.
Whether this means no Xen metadata, or the metadata living elsewhere in
regular ram, such as the main frametable, is an implementation detail.
Post by Dan Williams
Post by Andrew Cooper
Once dom0 has a mapping of the nvdimm, the nvdimm driver can go to work
and figure out what is on the DIMM, and which areas are safe to use.
I don't understand this ordering of events. Dom0 needs to have a
mapping to even write the on-media structure to indicate a
reservation. So, initial dom0 access can't depend on metadata
reservation already being present.
I agree.
Overall, I think the following is needed.
* Xen starts up.
** Xen might find some NVDIMM SPA/MFN ranges in the NFIT table, and
needs to note this information somehow.
** Xen might find some Type 7 E820 regions, and needs to note this
information somehow.
IIUC, this is to collect MFNs and no need to create frame table and
M2P at this stage. If so, what is different from ...
Post by Andrew Cooper
* Xen starts dom0.
* Once OSPM is running, a Xen component in Linux needs to collect and
report all NVDIMM SPA/MFN regions it knowns about.
** This covers the AML-only case, and the hotplug case.
... the MFNs reported here, especially that the former is a subset
(hotplug ones not included in the former) of latter.
Hopefully nothing. However, Xen shouldn't exclusively rely on dom0
when it is capable of working things out itself (which can aid with
debugging one half of this arrangement). Also, the MFNs found by Xen
alone can be present in the default memory map for dom0.
Post by Haozhong Zhang
(There is no E820 hole or SRAT entries to tell which address range is
reserved for hotplugged NVDIMM)
Post by Andrew Cooper
* Dom0 requests a mapping of the NVDIMMs via the usual mechanism.
1. Why is this request necessary? Even without such requests like what
my current implementation, Dom0 can still access NVDIMM.
Can it? (if so, great, but I don't think this holds in the general
case.) Is that a side effect of the NVDIMM being covered by a hole in
the E820? The current logic for what dom0 may access by default is
somewhat ad-hoc, and I have a gut feeling that it won't work with E820
type 7 regions.
Post by Haozhong Zhang
Or do you mean Xen hypervisor should by default disallow Dom0 to
access MFNs reported in previous step until they are requested?
No - I am not suggesting this.
Post by Haozhong Zhang
2. Who initiates the requests? If it's the libnvdimm driver, that
means we still need to introduce Xen specific code to the driver.
Or the requests are issued by OSPM (or the Xen component you
mentioned above) when they probe new dimms?
For the latter, Dan, do you think it's acceptable in NFIT code to
call the Xen component to request the access permission of the pmem
regions, e.g. in apic_nfit_insert_resource(). Of course, it's only
used for Dom0 case.
The libnvdimm driver should continue to use ioremap() or whatever it
currently does. There shouldn't be Xen modifications like that.

The one issue will come if libnvdimm tries to ioremap() (or similar) an
area which Xen is unaware is an NVDIMM, and Xen rejects the mapping
request. Somehow, a Xen component will need to find the MFN/SPA layout
and register this information with Xen before the ioremap() call made by
the libnvdimm driver. Perhaps a notifier mechanism out of the ACPI
subsystem would be the cleanest way to make this work.
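A rough sketch of that notifier idea (everything here is hypothetical:
the notifier chain, the event payload and the reporting helper are
illustrative names only, not an existing kernel interface):

#include <linux/notifier.h>
#include <linux/types.h>

struct nfit_spa_event {                 /* hypothetical payload */
        u64 spa_base;
        u64 spa_length;
};

/* Hypothetical helper that issues the platform hypercall to Xen. */
extern int xen_report_pmem_region(u64 spa_start, u64 spa_end,
                                  u64 rsv_start, u64 rsv_end);

static int xen_nfit_spa_notify(struct notifier_block *nb,
                               unsigned long action, void *data)
{
        struct nfit_spa_event *ev = data;

        /* Register the SPA/MFN range with Xen before libnvdimm maps it. */
        xen_report_pmem_region(ev->spa_base, ev->spa_base + ev->spa_length,
                               0, 0);
        return NOTIFY_OK;
}

/* Registration with the (to-be-defined) ACPI/NFIT notifier chain omitted. */
static struct notifier_block xen_nfit_nb = {
        .notifier_call = xen_nfit_spa_notify,
};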
Post by Haozhong Zhang
Post by Andrew Cooper
** This should work, as Xen is aware that there is something there to be
mapped (rather than just empty physical address space).
* Dom0 finds that some NVDIMM ranges are now available for use (probably
modelled as hotplug events).
* /dev/pmem $STUFF starts happening as normal.
At some pointer later after dom0 policy decisions are made (ultimately,
* If an area of NVDIMM is chosen for Xen to use, Dom0 needs to inform
Xen of the SPA/MFN regions which are safe to use.
* Xen then incorporates these regions into its idea of RAM, and starts
using them for whatever.
Agree. I think we may not need to fix the way/format/... to make the
reservation, and instead let the users (host administrators), who have
better understanding of their data, make the proper decision.
Yes. This is the best course of action.
Post by Haozhong Zhang
In a worse case that no reservation is made, Xen hypervisor could turn
to use RAM for management structures for NVDIMM, with the cost of less
RAM for guests.
Or simply not manage the NVDIMM at all.

OTOH, a different use case might be to register a small area for Xen to
use for logging crashes into.

~Andrew

Jan Beulich
2016-10-14 10:03:21 UTC
Permalink
Post by Dan Williams
[..]
Post by Jan Beulich
Post by Haozhong Zhang
I think we can do the similar for Xen, like to lay another pseudo
device on /dev/pmem and do the reservation, like 2. in my previous
reply.
Well, my opinion certainly doesn't count much here, but I continue to
consider this a bad idea. For entities like drivers it may well be
appropriate, but I think there ought to be an independent concept
of "OS reserved", and in the Xen case this could then be shared
between hypervisor and Dom0 kernel. Or if we were to consider Dom0
"just a guest", things should even be the other way around: Xen gets
all of the OS reserved space, and Dom0 needs something custom.
You haven't made the case why Xen is special and other applications of
persistent memory are not.
Well, I'm implying this from there being a special Linux reservation.
Xen (as explained by Andrew) sitting underneath the Dom0 kernel
(other than ...
Post by Dan Williams
The current struct page reservation
supports fundamental address-ability of persistent memory namespaces
for the rest of the kernel. The Xen reservation is application
specific. XFS, EXT4, and DM also have application specific usages of
persistent memory and consume metadata space out of a block device. If
we don't need an XFS-mode nvdimm device, why do we need Xen-mode?
... all the examples you give) by implication is special then too. If
you made the kernel no different from the other examples you give, Xen
probably shouldn't be any different anymore either.

Jan
Haozhong Zhang
2016-10-13 15:47:25 UTC
Permalink
Post by Jan Beulich
Post by Haozhong Zhang
Post by Jan Beulich
Post by Dan Williams
Post by Dan Williams
Post by Jan Beulich
Post by Haozhong Zhang
Post by Jan Beulich
Post by Haozhong Zhang
The layout is shown as the following diagram.
+---------------+-----------+-------+----------+--------------+
| whatever used | Partition | Super | Reserved | /dev/pmem0p1 |
| by kernel | Table | Block | for Xen | |
+---------------+-----------+-------+----------+--------------+
\_____________________ _______________________/
V
/dev/pmem0
I have to admit that I dislike this, for not being OS-agnostic.
Neither should there be any Xen-specific region, nor should the
"whatever used by kernel" one be restricted to just Linux. What
I could see is an OS-reserved area ahead of the partition table,
the exact usage of which depends on which OS is currently
running (and in the Xen case this might be both Xen _and_ the
Dom0 kernel, arbitrated by a tbd protocol). After all, when
running under Xen, the Dom0 may not have a need for as much
control data as it has when running on bare hardware, for it
controlling less (if any) of the actual memory ranges when Xen
is present.
Isn't this OS-reserved area still not OS-agnostic, as it requires OS
to know where the reserved area is? Or do you mean it's not if it's
defined by a protocol that is accepted by all OSes?
The latter - we clearly won't get away without some agreement on
where to retrieve position and size of this area. I was simply
assuming that such a protocol already exists.
No, we should not mix the struct page reservation that the Dom0 kernel
may actively use with the Xen reservation that the Dom0 kernel does
not consume. Explain again what is wrong with the partition approach?
Not sure what was unclear in my previous reply. I don't think there
should be a priori knowledge of whether Xen is (going to be) used on
a system, and even if it gets used, but just occasionally, it would
(apart from the abstract considerations already given) be a waste
of resources to set something aside that could be used for other
purposes while Xen is not running. Static partitioning should only be
needed for persistent data.
The reservation needs to be persistent / static even if the data is
volatile, as is the case with struct page, because we can't have the
size of the device change depending on use. So, from the aspect of
wasting space while Xen is not in use, both partitions and the
intrinsic reservation approach suffer the same problem. Setting that
aside I don't want to mix 2 different use cases into the same
reservation.
Then you didn't understand what I've said: I certainly didn't mean
the reservation to vary from a device perspective. However, when
Xen is in use I don't see why part of that static reservation couldn't
be used by Xen, and another part by the Dom0 kernel. The kernel
obviously would need to ask the hypervisor how much of the space
is left, and where that area starts.
I think Dan means that there should be a clear separation between
reservations for different usages (kernel/xen/...). The libnvdimm
driver is for the linux kernel and only needs to maintain the
reservation for kernel functionality. For others including xen/dm/...,
if they want reservation for their own purpose, they should maintain
their own reservations out of libnvdimm driver and avoid bothering the
libnvdimm driver (e.g. add specific handling in libnvdimm driver).
IIUC, one existing example is device-mapper device (dm) which needs to
reserve on-device area for its own meta-data. Its choice is to store
the meta-data on the block device (/dev/pmemN) provided by the
libnvdimm driver.
I think we can do the similar for Xen, like to lay another pseudo
device on /dev/pmem and do the reservation, like 2. in my previous
reply.
Well, my opinion certainly doesn't count much here, but I continue to
consider this a bad idea. For entities like drivers it may well be
appropriate, but I think there ought to be an independent concept
of "OS reserved", and in the Xen case this could then be shared
between hypervisor and Dom0 kernel.
No such independent concept seems to exist right now. It may be hard to
define such a concept, because it's hard to know the common requirements
(e.g. size/alignment/...) of ALL OSes. Letting each component maintain
its own reservation in its own way seems more flexible.
Post by Jan Beulich
Or if we were to consider Dom0
"just a guest", things should even be the other way around: Xen gets
all of the OS reserved space, and Dom0 needs something custom.
Sure, it's possible to implement the driver so that, if it finds it is
running on Xen, it leaves the OS reserved area to Xen and uses a
different reservation itself. Are there any practical differences from
the approach where Xen uses the other reservation that would force us
to do so? If not, and it's possible not to touch the existing libnvdimm
driver, why don't we just use the existing libnvdimm driver and let the
xen driver make its reservation on top of what the libnvdimm driver
provides?

In addition (not sure it's related), my Xen patch series (especially
patch 3) does not place many requirements on the location of the
reserved area, as long as it is on the NVDIMM. I mean that if we find a
better way to make the reservation in future, no changes to Xen should
be needed. For now, I think we could just choose the approach that does
not touch the libnvdimm driver.

Thanks,
Haozhong
Jan Beulich
2016-10-14 10:16:54 UTC
Permalink
Post by Haozhong Zhang
No such independent concept seems to exist right now. It may be hard to
define such a concept, because it's hard to know the common requirements
(e.g. size/alignment/...) of ALL OSes. Letting each component maintain
its own reservation in its own way seems more flexible.
Post by Jan Beulich
Or if we were to consider Dom0
"just a guest", things should even be the other way around: Xen gets
all of the OS reserved space, and Dom0 needs something custom.
Sure, it's possible to implement the driver so that, if it finds it is
running on Xen, it leaves the OS reserved area to Xen and uses a
different reservation itself. Are there any practical differences from
the approach where Xen uses the other reservation that would force us
to do so? If not, and it's possible not to touch the existing libnvdimm
driver, why don't we just use the existing libnvdimm driver and let the
xen driver make its reservation on top of what the libnvdimm driver
provides?
It continues to feel like you're trying to take the second step before
the first: you talk about implementation, whereas I talk about the
concept that should underlie your implementation. One could view it
this way: how said driver works with Xen in the picture should have
been decided before it was implemented, and then the question of
whether it is okay to leave the existing implementation alone would not
have arisen in the first place.

With that, I can't reasonably answer your questions.

Jan
Haozhong Zhang
2016-10-13 09:08:48 UTC
Permalink
+Dan Williams

I accidentally dropped him in my last reply. Add him back.
Post by Haozhong Zhang
I think Dan means that there should be a clear separation between
reservations for different usages (kernel/xen/...). The libnvdimm
driver is for the linux kernel and only needs to maintain the
reservation for kernel functionality. For others including xen/dm/...,
if they want reservation for their own purpose, they should maintain
their own reservations out of libnvdimm driver and avoid bothering the
libnvdimm driver (e.g. add specific handling in libnvdimm driver).
IIUC, one existing example is device-mapper device (dm) which needs to
reserve on-device area for its own meta-data. Its choice is to store
the meta-data on the block device (/dev/pmemN) provided by the
libnvdimm driver.
I think we can do the similar for Xen, like to lay another pseudo
device on /dev/pmem and do the reservation, like 2. in my previous
reply.
Thanks,
Haozhong
Post by Jan Beulich
Post by Dan Williams
The kernel needs to know about the struct page reservation because it
needs to manage the lifetime of page references vs the lifetime of the
device. It does not have the same relationship with a Xen reservation
which is why I'm proposing they be managed separately.
I don't think I understand the difference you are trying to point out
here. Linux's struct page and Xen's struct page_info serve the same
fundamental purpose.
Jan
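
For a feel of how large such a reservation gets, here is a
back-of-the-envelope sketch; the 32-byte struct page_info and 8-byte
M2P entry sizes are assumptions for illustration only, not figures
taken from the patches.

#include <stdio.h>
#include <stdint.h>

/* Assumed sizes, for illustration only. */
#define PAGE_SIZE        4096ULL
#define PAGE_INFO_SIZE     32ULL   /* Xen's struct page_info (assumed)    */
#define M2P_ENTRY_SIZE      8ULL   /* one machine-to-phys entry (assumed) */

int main(void)
{
        uint64_t pmem_bytes = 256ULL << 30;            /* a 256 GB pmem region */
        uint64_t pages      = pmem_bytes / PAGE_SIZE;
        uint64_t frame_tbl  = pages * PAGE_INFO_SIZE;  /* frame table */
        uint64_t m2p        = pages * M2P_ENTRY_SIZE;  /* M2P table   */

        printf("pages:       %llu\n", (unsigned long long)pages);
        printf("frame table: %llu MB\n", (unsigned long long)(frame_tbl >> 20));
        printf("M2P table:   %llu MB\n", (unsigned long long)(m2p >> 20));
        return 0;
}

Under those assumptions, 256 GB of pmem needs roughly 2.5 GB of control
data (about 1% of the device), which is the space that would otherwise
have to come out of RAM usable by guests.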
Jan Beulich
2016-10-12 16:22:42 UTC
Permalink
Post by Dan Williams
No, we should not mix the struct page reservation that the Dom0 kernel
may actively use with the Xen reservation that the Dom0 kernel does
not consume. Explain again what is wrong with the partition approach?
Not sure what was unclear in my previous reply. I don't think there
should be a priori knowledge of whether Xen is (going to be) used on
a system, and even if it gets used, but just occasionally, it would
(apart from the abstract considerations already given) be a waste
of resources to set something aside that could be used for other
purposes while Xen is not running. Static partitioning should only be
needed for persistent data.

Jan
Andrew Cooper
2016-10-11 20:19:16 UTC
Permalink
Post by Konrad Rzeszutek Wilk
Post by Dan Williams
On Tue, Oct 11, 2016 at 11:33 AM, Konrad Rzeszutek Wilk
[..]
Post by Konrad Rzeszutek Wilk
Post by Dan Williams
Right, but why does the libnvdimm core need to know about this
specific Xen reservation? For example, if Xen wants some in-kernel
Let me turn this around - why does the libnvdimm core need to know about
Linux specific parts? Shouldn't this be OS agnostic, so that FreeBSD
for example can also poke a hole in this and fill it with its
OS-management meta-data?
Specifically the core needs to know so that it can answer the Linux
specific question of whether the pfn returned by ->direct_access() has
a corresponding struct page or not. It's tied to the lifetime of the
device and the usage of the reservation needs to be coordinated
against the references of those pages. If FreeBSD decides it needs to
reserve "struct page" capacity at the start of the device, I would
hope that it reuses the same on-device info block that Linux is using
and not create a new "FreeBSD-mode" device type.
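
The Linux-specific question Dan refers to is exactly what the
PFN_MAP/PFN_DEV flags carried in pfn_t encode. A minimal sketch of a
consumer making that check (the helper itself is illustrative, but
pfn_t_has_page(), pfn_t_to_page() and get_page() are the real
interfaces):

#include <linux/pfn_t.h>
#include <linux/mm.h>

/*
 * Given the pfn_t returned by a ->direct_access() call, decide whether
 * it is backed by struct page (so a reference may be taken) or is a
 * page-less device pfn that must not be treated as ordinary memory.
 */
static bool can_take_page_reference(pfn_t pfn)
{
        if (!pfn_t_has_page(pfn))       /* PFN_DEV without PFN_MAP */
                return false;

        /* Safe only because the on-device reservation (or RAM) provides
         * the backing struct page entries. */
        get_page(pfn_t_to_page(pfn));
        return true;
}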
The issue here (as I understand it; I may be missing something new)
is that the size of this special namespace may be different. That is,
the 'struct page' on FreeBSD could be 256 bytes while on Linux it is
64 bytes (numbers pulled out of the sky).
Hence one would have to expand it (or something similar) to re-use this.
Post by Dan Williams
To be honest I do not yet understand what metadata Xen wants to store
in the device, but it seems the producer and consumer of that metadata
is Xen itself and not the wider Linux kernel as is the case with
struct page. Can you fill me in on what problem Xen solves with this
Exactly!
Post by Dan Williams
reservation?
The same as Linux - its variant of 'struct page'. Which I think is
smaller than the Linux one, but perhaps it is not?
There is still a bootstrapping issue though, which (in its current
form) looks like it would cause data corruption.

I hope I am mistaken, and apologies if I am, but clearly we cannot build
a solution that has data corruption in anything other than an
exceptional circumstance.

So far, the sequence of boot operations appears to look like this:

Xen boots, and may find some NVDIMM SPA/MFN ranges via the NFIT table.
Any ranges available only from AML need dynamically reporting back to
Xen at a later point, once OSPM is up and running.
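
For reference, the SPA ranges mentioned here are the NFIT "System
Physical Address Range" structures; a minimal sketch of walking them
with the ACPICA table definitions (Dom0/Linux view only; error handling
and the persistent-memory GUID check are elided):

#include <linux/acpi.h>
#include <linux/printk.h>

/* Walk the NFIT and print every System Physical Address range. */
static void walk_nfit_spa_ranges(void)
{
        struct acpi_table_header *tbl;
        struct acpi_nfit_header *hdr;
        void *p, *end;

        if (ACPI_FAILURE(acpi_get_table(ACPI_SIG_NFIT, 0, &tbl)))
                return;                 /* no NFIT on this system */

        p   = (void *)tbl + sizeof(struct acpi_table_nfit);
        end = (void *)tbl + tbl->length;

        while (p + sizeof(*hdr) <= end) {
                hdr = p;
                if (hdr->length < sizeof(*hdr))
                        break;          /* malformed sub-table */
                if (hdr->type == ACPI_NFIT_TYPE_SYSTEM_ADDRESS) {
                        struct acpi_nfit_system_address *spa = p;

                        pr_info("NFIT SPA range %u: %#llx + %#llx\n",
                                spa->range_index,
                                (unsigned long long)spa->address,
                                (unsigned long long)spa->length);
                }
                p += hdr->length;
        }
}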

The NVDIMMs must be mappable by dom0 so the contents can be inspected
and deemed to be safe by the nvdimm driver/host admin, before Xen starts
writing to any of it (for whatever reason).

If this isn't the case, then simply booting a Xen/dom0 combo will end up
corrupting a region before working out that it is safe to do so.

~Andrew
Jan Beulich
2016-10-12 07:25:50 UTC
Permalink
Post by Dan Williams
Post by Jan Beulich
Post by Andrew Cooper
Post by Haozhong Zhang
1) Reserve an area on NVDIMM devices for Xen hypervisor to place
memory management data structures, i.e. frame table and M2P table.
2) Report SPA ranges of NVDIMM devices and the reserved area to Xen
hypervisor.
However, I can't see any justification for 1). Dom0 should not be
involved in Xen's management of its own frame table and m2p. The mfns
making up the pmem/pblk regions should be treated just like any other
MMIO regions, and be handed wholesale to dom0 by default.
That precludes the use as RAM extension, and I thought earlier rounds of
discussion had got everyone in agreement that at least for the pmem case
we will need some control data in Xen.
The missing piece for me is why this reservation for control data
needs to be done in the libnvdimm core? I would expect that any dax
capable file could be mapped and made available to a guest. This
includes /dev/ramX devices that are dax capable, but are external to
the libnvdimm sub-system.
Despite me being the only one on the To list, I don't think the question
was really meant to be directed to me.

Jan