On Sun, Sep 12, 2021 at 07:53:09PM +0300, Oded Gabbay wrote:
> From: Tomer Tayar <ttayar@habana.ai>
>
> Implement the calls to the dma-buf kernel api to create a dma-buf
> object backed by FD.
>
> We block the option to mmap the DMA-BUF object because we don't support
> DIRECT_IO and implicit P2P.
This statement doesn't make sense, you can mmap your dmabuf if you like. All dmabuf mmaps are supposed to set the special bit/etc to exclude them from get_user_pages() anyhow - and since this is BAR memory not struct page memory this driver would be doing it anyhow.
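Eg the usual pattern for BAR backed dmabufs is something like this (rough
sketch, untested; it assumes the wrapper keeps a device_address and that
size checks are done elsewhere):

static int hl_mmap_dmabuf(struct dma_buf *dmabuf, struct vm_area_struct *vma)
{
	struct hl_dmabuf_wrapper *hl_dmabuf = dmabuf->priv;
	struct hl_device *hdev = hl_dmabuf->ctx->hdev;
	u64 bar_address = hdev->dram_pci_bar_start +
			  (hl_dmabuf->device_address -
			   hdev->asic_prop.dram_base_address);

	/* VM_IO/VM_PFNMAP VMAs are automatically rejected by
	 * get_user_pages()
	 */
	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;

	return remap_pfn_range(vma, vma->vm_start,
			       bar_address >> PAGE_SHIFT,
			       vma->vm_end - vma->vm_start,
			       pgprot_noncached(vma->vm_page_prot));
}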
> We check the p2p distance using pci_p2pdma_distance_many() and refusing
> to map dmabuf in case the distance doesn't allow p2p.
Does this actually allow the p2p transfer for your intended use cases?
> diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
> index 33986933aa9e..8cf5437c0390 100644
> --- a/drivers/misc/habanalabs/common/memory.c
> +++ b/drivers/misc/habanalabs/common/memory.c
> @@ -1,7 +1,7 @@
>  // SPDX-License-Identifier: GPL-2.0
>  
>  /*
> - * Copyright 2016-2019 HabanaLabs, Ltd.
> + * Copyright 2016-2021 HabanaLabs, Ltd.
>   * All Rights Reserved.
>   */
> @@ -11,11 +11,13 @@
>  #include <linux/uaccess.h>
>  #include <linux/slab.h>
> +#include <linux/pci-p2pdma.h>
>  
>  #define HL_MMU_DEBUG 0
>  
>  /* use small pages for supporting non-pow2 (32M/40M/48M) DRAM phys page sizes */
> -#define DRAM_POOL_PAGE_SIZE SZ_8M
> +#define DRAM_POOL_PAGE_SIZE SZ_8M
??
>  /*
>   * The va ranges in context object contain a list with the available chunks of
> @@ -347,6 +349,13 @@ static int free_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args)
>  		return -EINVAL;
>  	}
> +	if (phys_pg_pack->exporting_cnt) {
> +		dev_err(hdev->dev,
> +			"handle %u is exported, cannot free\n", handle);
> +		spin_unlock(&vm->idr_lock);
Don't write to the kernel log from user space triggered actions
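If userspace can hit this at will it should be dev_dbg() at most, eg
(sketch, assuming the block ends with -EINVAL as in the patch):

	if (phys_pg_pack->exporting_cnt) {
		/* Normal userspace-triggerable failure, keep it out of
		 * the kernel log by default
		 */
		dev_dbg(hdev->dev, "handle %u is exported, cannot free\n",
			handle);
		spin_unlock(&vm->idr_lock);
		return -EINVAL;
	}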
> +static int alloc_sgt_from_device_pages(struct hl_device *hdev,
> +						struct sg_table **sgt, u64 *pages,
> +						u64 npages, u64 page_size,
> +						struct device *dev,
> +						enum dma_data_direction dir)
Why doesn't this return a sg_table * and an ERR_PTR?
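Ie something like (sketch, body elided):

static struct sg_table *alloc_sgt_from_device_pages(struct hl_device *hdev,
			u64 *pages, u64 npages, u64 page_size,
			struct device *dev, enum dma_data_direction dir)
{
	struct sg_table *sgt;

	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
	if (!sgt)
		return ERR_PTR(-ENOMEM);

	/* ... build the table as below, return ERR_PTR(rc) on failure ... */

	return sgt;
}

Then the caller just checks IS_ERR(sgt) and the extra output pointer
argument goes away.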
> +{
> +	u64 chunk_size, bar_address, dma_max_seg_size;
> +	struct asic_fixed_properties *prop;
> +	int rc, i, j, nents, cur_page;
> +	struct scatterlist *sg;
> +
> +	prop = &hdev->asic_prop;
> +	dma_max_seg_size = dma_get_max_seg_size(dev);
> +
> +	/* We would like to align the max segment size to PAGE_SIZE, so the
> +	 * SGL will contain aligned addresses that can be easily mapped to
> +	 * an MMU
> +	 */
> +	dma_max_seg_size = ALIGN_DOWN(dma_max_seg_size, PAGE_SIZE);
> +	if (dma_max_seg_size < PAGE_SIZE) {
> +		dev_err_ratelimited(hdev->dev,
> +				"dma_max_seg_size %llu can't be smaller than PAGE_SIZE\n",
> +				dma_max_seg_size);
> +		return -EINVAL;
> +	}
> +
> +	*sgt = kzalloc(sizeof(**sgt), GFP_KERNEL);
> +	if (!*sgt)
> +		return -ENOMEM;
> +
> +	/* If the size of each page is larger than the dma max segment size,
> +	 * then we can't combine pages and the number of entries in the SGL
> +	 * will just be the
> +	 * <number of pages> * <chunks of max segment size in each page>
> +	 */
> +	if (page_size > dma_max_seg_size)
> +		nents = npages * DIV_ROUND_UP_ULL(page_size, dma_max_seg_size);
> +	else
> +		/* Get number of non-contiguous chunks */
> +		for (i = 1, nents = 1, chunk_size = page_size ; i < npages ; i++) {
> +			if (pages[i - 1] + page_size != pages[i] ||
> +					chunk_size + page_size > dma_max_seg_size) {
> +				nents++;
> +				chunk_size = page_size;
> +				continue;
> +			}
> +
> +			chunk_size += page_size;
> +		}
> +
> +	rc = sg_alloc_table(*sgt, nents, GFP_KERNEL | __GFP_ZERO);
> +	if (rc)
> +		goto error_free;
> +
> +	/* Because we are not going to include a CPU list we want to have some
> +	 * chance that other users will detect this by setting the orig_nents
> +	 * to 0 and using only nents (length of DMA list) when going over the
> +	 * sgl
> +	 */
> +	(*sgt)->orig_nents = 0;
Maybe do this at the end so you don't have to undo it on the error path?
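Ie build the table first and only zero orig_nents right before returning
success, eg (sketch, assuming the error paths free via sg_free_table()):

	/* No CPU list in this table; zero orig_nents only once the sgl
	 * is fully built so sg_free_table() still works on the error
	 * paths above.
	 */
	(*sgt)->orig_nents = 0;

	return 0;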
> +	cur_page = 0;
> +
> +	if (page_size > dma_max_seg_size) {
> +		u64 size_left, cur_device_address = 0;
> +
> +		size_left = page_size;
> +
> +		/* Need to split each page into the number of chunks of
> +		 * dma_max_seg_size
> +		 */
> +		for_each_sgtable_dma_sg((*sgt), sg, i) {
> +			if (size_left == page_size)
> +				cur_device_address =
> +					pages[cur_page] - prop->dram_base_address;
> +			else
> +				cur_device_address += dma_max_seg_size;
> +
> +			chunk_size = min(size_left, dma_max_seg_size);
> +
> +			bar_address = hdev->dram_pci_bar_start + cur_device_address;
> +
> +			rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);
> +			if (rc)
> +				goto error_unmap;
> +
> +			if (size_left > dma_max_seg_size) {
> +				size_left -= dma_max_seg_size;
> +			} else {
> +				cur_page++;
> +				size_left = page_size;
> +			}
> +		}
> +	} else {
> +		/* Merge pages and put them into the scatterlist */
> +		for_each_sgtable_dma_sg((*sgt), sg, i) {
> +			chunk_size = page_size;
> +			for (j = cur_page + 1 ; j < npages ; j++) {
> +				if (pages[j - 1] + page_size != pages[j] ||
> +						chunk_size + page_size > dma_max_seg_size)
> +					break;
> +
> +				chunk_size += page_size;
> +			}
> +
> +			bar_address = hdev->dram_pci_bar_start +
> +					(pages[cur_page] - prop->dram_base_address);
> +
> +			rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);
> +			if (rc)
> +				goto error_unmap;
> +
> +			cur_page = j;
> +		}
> +	}
We have this sg_append stuff now that is intended to help building these things. It can only build CPU page lists, not these DMA lists, but I do wonder if open coding in drivers is slipping back a bit. Especially since AMD seems to be doing something different.
Could the DMABUF layer gain some helpers styled after the sg_append to simplify building these things? and convert the AMD driver of course.
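Eg a helper along these lines (purely hypothetical API, the name and
signature are invented here just to illustrate):

/* Build a DMA-only (orig_nents == 0) sg_table covering npages aperture
 * pages of page_size bytes each, splitting/merging chunks against
 * dma_get_max_seg_size(dev).
 */
struct sg_table *dma_buf_sgt_from_phys(struct device *dev,
				       const u64 *phys_addrs, u64 npages,
				       u64 page_size,
				       enum dma_data_direction dir);

Then this driver and AMD's could share one implementation of the
split/merge logic instead of each open coding it.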
> +static int hl_dmabuf_attach(struct dma_buf *dmabuf,
> +				struct dma_buf_attachment *attachment)
> +{
> +	struct hl_dmabuf_wrapper *hl_dmabuf;
> +	struct hl_device *hdev;
> +	int rc;
> +
> +	hl_dmabuf = dmabuf->priv;
> +	hdev = hl_dmabuf->ctx->hdev;
> +
> +	rc = pci_p2pdma_distance_many(hdev->pdev, &attachment->dev, 1, true);
> +	if (rc < 0)
> +		attachment->peer2peer = false;
> +
> +
Extra blank line
> +	return 0;
> +}
> +static struct sg_table *hl_map_dmabuf(struct dma_buf_attachment *attachment,
> +						enum dma_data_direction dir)
> +{
> +	struct dma_buf *dma_buf = attachment->dmabuf;
> +	struct hl_vm_phys_pg_pack *phys_pg_pack;
> +	struct hl_dmabuf_wrapper *hl_dmabuf;
> +	struct hl_device *hdev;
> +	struct sg_table *sgt;
> +	int rc;
> +
> +	hl_dmabuf = dma_buf->priv;
> +	hdev = hl_dmabuf->ctx->hdev;
> +	phys_pg_pack = hl_dmabuf->phys_pg_pack;
> +
> +	if (!attachment->peer2peer) {
> +		dev_err(hdev->dev,
> +			"Failed to map dmabuf because p2p is disabled\n");
> +		return ERR_PTR(-EPERM);
User triggered print again?
> +static void hl_unmap_dmabuf(struct dma_buf_attachment *attachment,
> +				struct sg_table *sgt,
> +				enum dma_data_direction dir)
> +{
> +	struct scatterlist *sg;
> +	int i;
> +
> +	for_each_sgtable_dma_sg(sgt, sg, i)
> +		dma_unmap_resource(attachment->dev, sg_dma_address(sg),
> +					sg_dma_len(sg), dir,
> +					DMA_ATTR_SKIP_CPU_SYNC);
Why can we skip the CPU_SYNC? Seems like a comment is needed
Something has to do a CPU_SYNC before recycling this memory for another purpose, where is it?
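If the justification is that this is BAR memory with no cacheable CPU
mapping then say so, eg (sketch, assuming that is actually the rationale):

	for_each_sgtable_dma_sg(sgt, sg, i)
		/* This sgl points at PCI BAR memory that is never
		 * mapped cacheable by the CPU, so there is nothing to
		 * sync here.
		 */
		dma_unmap_resource(attachment->dev, sg_dma_address(sg),
				   sg_dma_len(sg), dir,
				   DMA_ATTR_SKIP_CPU_SYNC);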
> +static void hl_release_dmabuf(struct dma_buf *dmabuf)
> +{
> +	struct hl_dmabuf_wrapper *hl_dmabuf = dmabuf->priv;
Maybe hl_dmabuf_wrapper should be hl_dmabuf_priv
> +/**
> + * export_dmabuf_from_addr() - export a dma-buf object for the given memory
> + *                             address and size.
> + * @ctx: pointer to the context structure.
> + * @device_addr: device memory physical address.
> + * @size: size of device memory.
> + * @flags: DMA-BUF file/FD flags.
> + * @dmabuf_fd: pointer to result FD that represents the dma-buf object.
> + *
> + * Create and export a dma-buf object for an existing memory allocation inside
> + * the device memory, and return a FD which is associated with the dma-buf
> + * object.
> + *
> + * Return: 0 on success, non-zero for failure.
> + */
> +static int export_dmabuf_from_addr(struct hl_ctx *ctx, u64 device_addr,
> +					u64 size, int flags, int *dmabuf_fd)
> +{
> +	struct hl_dmabuf_wrapper *hl_dmabuf;
> +	struct hl_device *hdev = ctx->hdev;
> +	struct asic_fixed_properties *prop;
> +	u64 bar_address;
> +	int rc;
> +
> +	prop = &hdev->asic_prop;
> +
> +	if (!IS_ALIGNED(device_addr, PAGE_SIZE)) {
> +		dev_err_ratelimited(hdev->dev,
> +			"address of exported device memory should be aligned to 0x%lx, address 0x%llx\n",
> +			PAGE_SIZE, device_addr);
> +		return -EINVAL;
> +	}
> +
> +	if (size < PAGE_SIZE) {
> +		dev_err_ratelimited(hdev->dev,
> +			"size %llu of exported device memory should be equal to or greater than %lu\n",
> +			size, PAGE_SIZE);
> +		return -EINVAL;
> +	}
> +
> +	if (device_addr < prop->dram_user_base_address ||
> +			device_addr + size > prop->dram_end_address ||
> +			device_addr + size < device_addr) {
> +		dev_err_ratelimited(hdev->dev,
> +			"DRAM memory range is outside of DRAM boundaries, address 0x%llx, size 0x%llx\n",
> +			device_addr, size);
> +		return -EINVAL;
> +	}
> +
> +	bar_address = hdev->dram_pci_bar_start +
> +			(device_addr - prop->dram_base_address);
> +
> +	if (bar_address + size >
> +			hdev->dram_pci_bar_start + prop->dram_pci_bar_size ||
> +			bar_address + size < bar_address) {
> +		dev_err_ratelimited(hdev->dev,
> +			"DRAM memory range is outside of PCI BAR boundaries, address 0x%llx, size 0x%llx\n",
> +			device_addr, size);
> +		return -EINVAL;
> +	}
More prints from userspace
> +static int export_dmabuf_from_handle(struct hl_ctx *ctx, u64 handle, int flags,
> +					int *dmabuf_fd)
> +{
> +	struct hl_vm_phys_pg_pack *phys_pg_pack;
> +	struct hl_dmabuf_wrapper *hl_dmabuf;
> +	struct hl_device *hdev = ctx->hdev;
> +	struct asic_fixed_properties *prop;
> +	struct hl_vm *vm = &hdev->vm;
> +	u64 bar_address;
> +	u32 idr_handle;
> +	int rc, i;
> +
> +	prop = &hdev->asic_prop;
> +
> +	idr_handle = lower_32_bits(handle);
Why silent truncation? Shouldn't setting the upper 32 bits be an error?
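Ie something like (sketch):

	if (upper_32_bits(handle))
		return -EINVAL;

	idr_handle = handle;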
> +	case HL_MEM_OP_EXPORT_DMABUF_FD:
> +		rc = export_dmabuf_from_addr(ctx,
> +				args->in.export_dmabuf_fd.handle,
> +				args->in.export_dmabuf_fd.mem_size,
> +				args->in.flags,
> +				&dmabuf_fd);
> +		memset(args, 0, sizeof(*args));
> +		args->out.fd = dmabuf_fd;
Would expect the installed fd to be the positive return, not a pointer
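Ie have export_dmabuf_from_addr() return the installed fd directly, eg
(sketch):

	case HL_MEM_OP_EXPORT_DMABUF_FD:
		rc = export_dmabuf_from_addr(ctx,
				args->in.export_dmabuf_fd.handle,
				args->in.export_dmabuf_fd.mem_size,
				args->in.flags);
		if (rc < 0)
			break;

		memset(args, 0, sizeof(*args));
		args->out.fd = rc;
		rc = 0;
		break;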
Jason