Hello!
This is yet another quick update on the patchset which replaces custom consistent dma regions usage in dma-mapping framework in favour of generic vmalloc areas created on demand for each allocation. The main purpose for this patchset is to remove 2MiB limit of dma coherent/writecombine allocations.
This version addresses a few more cleanups pointed by Minchan Kim.
This patch is based on vanilla v3.5 release.
Best regards Marek Szyprowski Samsung Poland R&D Center
Changelog:
v6: - more cleanups of minor issues pointed by Minchan Kim, moved arm_dma_mmap() changes into separate patch
v5: http://thread.gmane.org/gmane.linux.kernel.mm/83096 - fixed another minor issues pointed by Minchan Kim: added more comments here and there, changed pr_err() + stack_dump() to WARN(), added a fix for no-MMU systems
v4: http://thread.gmane.org/gmane.linux.kernel.mm/80906 - replaced arch-independent VM_DMA flag with ARM-specific VM_ARM_DMA_CONSISTENT flag
v3: http://thread.gmane.org/gmane.linux.kernel.mm/80028 - rebased onto v3.4-rc2: added support for IOMMU-aware implementation of dma-mapping calls, unified with CMA coherent dma pool - implemented changes requested by Minchan Kim: added more checks for vmarea->flags & VM_DMA, renamed some variables, removed obsole locks, squashed find_vm_area() exporting patch into the main redesign patch
v2: http://thread.gmane.org/gmane.linux.kernel.mm/78563 - added support for atomic allocations (served from preallocated pool) - minor cleanup here and there - rebased onto v3.4-rc7
v1: http://thread.gmane.org/gmane.linux.kernel.mm/76703 - initial version
Patch summary:
Marek Szyprowski (2): mm: vmalloc: use const void * for caller argument ARM: dma-mapping: remove custom consistent dma region
Documentation/kernel-parameters.txt | 2 +- arch/arm/include/asm/dma-mapping.h | 2 +- arch/arm/mm/dma-mapping.c | 486 ++++++++++++----------------------- arch/arm/mm/mm.h | 3 + include/linux/vmalloc.h | 9 +- mm/vmalloc.c | 28 ++- 6 files changed, 194 insertions(+), 336 deletions(-)
'const void *' is a safer type for caller function type. This patch updates all references to caller function type.
Signed-off-by: Marek Szyprowski m.szyprowski@samsung.com Reviewed-by: Kyungmin Park kyungmin.park@samsung.com Reviewed-by: Minchan Kim minchan@kernel.org --- include/linux/vmalloc.h | 8 ++++---- mm/vmalloc.c | 18 +++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index dcdfc2b..2e28f4d 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -32,7 +32,7 @@ struct vm_struct { struct page **pages; unsigned int nr_pages; phys_addr_t phys_addr; - void *caller; + const void *caller; };
/* @@ -62,7 +62,7 @@ extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, int node, void *caller); + pgprot_t prot, int node, const void *caller); extern void vfree(const void *addr);
extern void *vmap(struct page **pages, unsigned int count, @@ -85,13 +85,13 @@ static inline size_t get_vm_area_size(const struct vm_struct *area)
extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); extern struct vm_struct *get_vm_area_caller(unsigned long size, - unsigned long flags, void *caller); + unsigned long flags, const void *caller); extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end); extern struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, - void *caller); + const void *caller); extern struct vm_struct *remove_vm_area(const void *addr);
extern int map_vm_area(struct vm_struct *area, pgprot_t prot, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2aad499..11308f0 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1280,7 +1280,7 @@ DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist;
static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, - unsigned long flags, void *caller) + unsigned long flags, const void *caller) { vm->flags = flags; vm->addr = (void *)va->va_start; @@ -1306,7 +1306,7 @@ static void insert_vmalloc_vmlist(struct vm_struct *vm) }
static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, - unsigned long flags, void *caller) + unsigned long flags, const void *caller) { setup_vmalloc_vm(vm, va, flags, caller); insert_vmalloc_vmlist(vm); @@ -1314,7 +1314,7 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long flags, unsigned long start, - unsigned long end, int node, gfp_t gfp_mask, void *caller) + unsigned long end, int node, gfp_t gfp_mask, const void *caller) { struct vmap_area *va; struct vm_struct *area; @@ -1375,7 +1375,7 @@ EXPORT_SYMBOL_GPL(__get_vm_area);
struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, - void *caller) + const void *caller) { return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, caller); @@ -1397,7 +1397,7 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) }
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, - void *caller) + const void *caller) { return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, -1, GFP_KERNEL, caller); @@ -1568,9 +1568,9 @@ EXPORT_SYMBOL(vmap);
static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, - int node, void *caller); + int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node, void *caller) + pgprot_t prot, int node, const void *caller) { const int order = 0; struct page **pages; @@ -1643,7 +1643,7 @@ fail: */ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, int node, void *caller) + pgprot_t prot, int node, const void *caller) { struct vm_struct *area; void *addr; @@ -1699,7 +1699,7 @@ fail: */ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, - int node, void *caller) + int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, prot, node, caller);
This patch changes dma-mapping subsystem to use generic vmalloc areas for all consistent dma allocations. This increases the total size limit of the consistent allocations and removes platform hacks and a lot of duplicated code.
Atomic allocations are served from special pool preallocated on boot, because vmalloc areas cannot be reliably created in atomic context.
Signed-off-by: Marek Szyprowski m.szyprowski@samsung.com Reviewed-by: Kyungmin Park kyungmin.park@samsung.com --- Documentation/kernel-parameters.txt | 2 +- arch/arm/include/asm/dma-mapping.h | 2 +- arch/arm/mm/dma-mapping.c | 486 ++++++++++++----------------------- arch/arm/mm/mm.h | 3 + include/linux/vmalloc.h | 1 + mm/vmalloc.c | 10 +- 6 files changed, 181 insertions(+), 323 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a92c5eb..4ee28f3 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -526,7 +526,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
coherent_pool=nn[KMG] [ARM,KNL] Sets the size of memory pool for coherent, atomic dma - allocations if Contiguous Memory Allocator (CMA) is used. + allocations, by default set to 256K.
code_bytes [X86] How many bytes of object code to print in an oops report. diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index bbef15d..80777d87 100644 --- a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -226,7 +226,7 @@ static inline int dma_mmap_writecombine(struct device *dev, struct vm_area_struc * DMA region above it's default value of 2MB. It must be called before the * memory allocator is initialised, i.e. before any core_initcall. */ -extern void __init init_consistent_dma_size(unsigned long size); +static inline void init_consistent_dma_size(unsigned long size) { }
/* * For SA-1111, IXP425, and ADI systems the dma-mapping functions are "magic" diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 655878b..f906d5f 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -22,6 +22,7 @@ #include <linux/memblock.h> #include <linux/slab.h> #include <linux/iommu.h> +#include <linux/io.h> #include <linux/vmalloc.h>
#include <asm/memory.h> @@ -217,115 +218,70 @@ static void __dma_free_buffer(struct page *page, size_t size) }
#ifdef CONFIG_MMU +#ifdef CONFIG_HUGETLB_PAGE +#error ARM Coherent DMA allocator does not (yet) support huge TLB +#endif
-#define CONSISTENT_OFFSET(x) (((unsigned long)(x) - consistent_base) >> PAGE_SHIFT) -#define CONSISTENT_PTE_INDEX(x) (((unsigned long)(x) - consistent_base) >> PMD_SHIFT) - -/* - * These are the page tables (2MB each) covering uncached, DMA consistent allocations - */ -static pte_t **consistent_pte; - -#define DEFAULT_CONSISTENT_DMA_SIZE SZ_2M +static void *__alloc_from_contiguous(struct device *dev, size_t size, + pgprot_t prot, struct page **ret_page);
-static unsigned long consistent_base = CONSISTENT_END - DEFAULT_CONSISTENT_DMA_SIZE; +static void *__alloc_remap_buffer(struct device *dev, size_t size, gfp_t gfp, + pgprot_t prot, struct page **ret_page, + const void *caller);
-void __init init_consistent_dma_size(unsigned long size) +static void * +__dma_alloc_remap(struct page *page, size_t size, gfp_t gfp, pgprot_t prot, + const void *caller) { - unsigned long base = CONSISTENT_END - ALIGN(size, SZ_2M); + struct vm_struct *area; + unsigned long addr;
- BUG_ON(consistent_pte); /* Check we're called before DMA region init */ - BUG_ON(base < VMALLOC_END); + /* + * DMA allocation can be mapped to user space, so lets + * set VM_USERMAP flags too. + */ + area = get_vm_area_caller(size, VM_ARM_DMA_CONSISTENT | VM_USERMAP, + caller); + if (!area) + return NULL; + addr = (unsigned long)area->addr; + area->phys_addr = __pfn_to_phys(page_to_pfn(page));
- /* Grow region to accommodate specified size */ - if (base < consistent_base) - consistent_base = base; + if (ioremap_page_range(addr, addr + size, area->phys_addr, prot)) { + vunmap((void *)addr); + return NULL; + } + return (void *)addr; }
-#include "vmregion.h" - -static struct arm_vmregion_head consistent_head = { - .vm_lock = __SPIN_LOCK_UNLOCKED(&consistent_head.vm_lock), - .vm_list = LIST_HEAD_INIT(consistent_head.vm_list), - .vm_end = CONSISTENT_END, -}; - -#ifdef CONFIG_HUGETLB_PAGE -#error ARM Coherent DMA allocator does not (yet) support huge TLB -#endif - -/* - * Initialise the consistent memory allocation. - */ -static int __init consistent_init(void) +static void __dma_free_remap(void *cpu_addr, size_t size) { - int ret = 0; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int i = 0; - unsigned long base = consistent_base; - unsigned long num_ptes = (CONSISTENT_END - base) >> PMD_SHIFT; - - if (IS_ENABLED(CONFIG_CMA) && !IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU)) - return 0; - - consistent_pte = kmalloc(num_ptes * sizeof(pte_t), GFP_KERNEL); - if (!consistent_pte) { - pr_err("%s: no memory\n", __func__); - return -ENOMEM; + unsigned int flags = VM_ARM_DMA_CONSISTENT | VM_USERMAP; + struct vm_struct *area = find_vm_area(cpu_addr); + if (!area || (area->flags & flags) != flags) { + WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr); + return; } - - pr_debug("DMA memory: 0x%08lx - 0x%08lx:\n", base, CONSISTENT_END); - consistent_head.vm_start = base; - - do { - pgd = pgd_offset(&init_mm, base); - - pud = pud_alloc(&init_mm, pgd, base); - if (!pud) { - pr_err("%s: no pud tables\n", __func__); - ret = -ENOMEM; - break; - } - - pmd = pmd_alloc(&init_mm, pud, base); - if (!pmd) { - pr_err("%s: no pmd tables\n", __func__); - ret = -ENOMEM; - break; - } - WARN_ON(!pmd_none(*pmd)); - - pte = pte_alloc_kernel(pmd, base); - if (!pte) { - pr_err("%s: no pte tables\n", __func__); - ret = -ENOMEM; - break; - } - - consistent_pte[i++] = pte; - base += PMD_SIZE; - } while (base < CONSISTENT_END); - - return ret; + unmap_kernel_range((unsigned long)cpu_addr, size); + vunmap(cpu_addr); } -core_initcall(consistent_init); - -static void *__alloc_from_contiguous(struct device *dev, size_t size, - pgprot_t prot, struct page **ret_page);
-static struct arm_vmregion_head coherent_head = { - .vm_lock = __SPIN_LOCK_UNLOCKED(&coherent_head.vm_lock), - .vm_list = LIST_HEAD_INIT(coherent_head.vm_list), +struct dma_pool { + size_t size; + spinlock_t lock; + unsigned long *bitmap; + unsigned long nr_pages; + void *vaddr; + struct page *page; };
-static size_t coherent_pool_size = DEFAULT_CONSISTENT_DMA_SIZE / 8; +static struct dma_pool atomic_pool = { + .size = SZ_256K, +};
static int __init early_coherent_pool(char *p) { - coherent_pool_size = memparse(p, &p); + atomic_pool.size = memparse(p, &p); return 0; } early_param("coherent_pool", early_coherent_pool); @@ -333,32 +289,45 @@ early_param("coherent_pool", early_coherent_pool); /* * Initialise the coherent pool for atomic allocations. */ -static int __init coherent_init(void) +static int __init atomic_pool_init(void) { + struct dma_pool *pool = &atomic_pool; pgprot_t prot = pgprot_dmacoherent(pgprot_kernel); - size_t size = coherent_pool_size; + unsigned long nr_pages = pool->size >> PAGE_SHIFT; + unsigned long *bitmap; struct page *page; void *ptr; + int bitmap_size = BITS_TO_LONGS(nr_pages) * sizeof(long);
- if (!IS_ENABLED(CONFIG_CMA)) - return 0; + bitmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!bitmap) + goto no_bitmap;
- ptr = __alloc_from_contiguous(NULL, size, prot, &page); + if (IS_ENABLED(CONFIG_CMA)) + ptr = __alloc_from_contiguous(NULL, pool->size, prot, &page); + else + ptr = __alloc_remap_buffer(NULL, pool->size, GFP_KERNEL, prot, + &page, NULL); if (ptr) { - coherent_head.vm_start = (unsigned long) ptr; - coherent_head.vm_end = (unsigned long) ptr + size; - printk(KERN_INFO "DMA: preallocated %u KiB pool for atomic coherent allocations\n", - (unsigned)size / 1024); + spin_lock_init(&pool->lock); + pool->vaddr = ptr; + pool->page = page; + pool->bitmap = bitmap; + pool->nr_pages = nr_pages; + pr_info("DMA: preallocated %u KiB pool for atomic coherent allocations\n", + (unsigned)pool->size / 1024); return 0; } - printk(KERN_ERR "DMA: failed to allocate %u KiB pool for atomic coherent allocation\n", - (unsigned)size / 1024); + kfree(bitmap); +no_bitmap: + pr_err("DMA: failed to allocate %u KiB pool for atomic coherent allocation\n", + (unsigned)pool->size / 1024); return -ENOMEM; } /* * CMA is activated by core_initcall, so we must be called after it. */ -postcore_initcall(coherent_init); +postcore_initcall(atomic_pool_init);
struct dma_contig_early_reserve { phys_addr_t base; @@ -406,112 +375,6 @@ void __init dma_contiguous_remap(void) } }
-static void * -__dma_alloc_remap(struct page *page, size_t size, gfp_t gfp, pgprot_t prot, - const void *caller) -{ - struct arm_vmregion *c; - size_t align; - int bit; - - if (!consistent_pte) { - pr_err("%s: not initialised\n", __func__); - dump_stack(); - return NULL; - } - - /* - * Align the virtual region allocation - maximum alignment is - * a section size, minimum is a page size. This helps reduce - * fragmentation of the DMA space, and also prevents allocations - * smaller than a section from crossing a section boundary. - */ - bit = fls(size - 1); - if (bit > SECTION_SHIFT) - bit = SECTION_SHIFT; - align = 1 << bit; - - /* - * Allocate a virtual address in the consistent mapping region. - */ - c = arm_vmregion_alloc(&consistent_head, align, size, - gfp & ~(__GFP_DMA | __GFP_HIGHMEM), caller); - if (c) { - pte_t *pte; - int idx = CONSISTENT_PTE_INDEX(c->vm_start); - u32 off = CONSISTENT_OFFSET(c->vm_start) & (PTRS_PER_PTE-1); - - pte = consistent_pte[idx] + off; - c->priv = page; - - do { - BUG_ON(!pte_none(*pte)); - - set_pte_ext(pte, mk_pte(page, prot), 0); - page++; - pte++; - off++; - if (off >= PTRS_PER_PTE) { - off = 0; - pte = consistent_pte[++idx]; - } - } while (size -= PAGE_SIZE); - - dsb(); - - return (void *)c->vm_start; - } - return NULL; -} - -static void __dma_free_remap(void *cpu_addr, size_t size) -{ - struct arm_vmregion *c; - unsigned long addr; - pte_t *ptep; - int idx; - u32 off; - - c = arm_vmregion_find_remove(&consistent_head, (unsigned long)cpu_addr); - if (!c) { - pr_err("%s: trying to free invalid coherent area: %p\n", - __func__, cpu_addr); - dump_stack(); - return; - } - - if ((c->vm_end - c->vm_start) != size) { - pr_err("%s: freeing wrong coherent size (%ld != %d)\n", - __func__, c->vm_end - c->vm_start, size); - dump_stack(); - size = c->vm_end - c->vm_start; - } - - idx = CONSISTENT_PTE_INDEX(c->vm_start); - off = CONSISTENT_OFFSET(c->vm_start) & (PTRS_PER_PTE-1); - ptep = consistent_pte[idx] + off; - addr = c->vm_start; - do { - pte_t pte = ptep_get_and_clear(&init_mm, addr, ptep); - - ptep++; - addr += PAGE_SIZE; - off++; - if (off >= PTRS_PER_PTE) { - off = 0; - ptep = consistent_pte[++idx]; - } - - if (pte_none(pte) || !pte_present(pte)) - pr_crit("%s: bad page in kernel page table\n", - __func__); - } while (size -= PAGE_SIZE); - - flush_tlb_kernel_range(c->vm_start, c->vm_end); - - arm_vmregion_free(&consistent_head, c); -} - static int __dma_update_pte(pte_t *pte, pgtable_t token, unsigned long addr, void *data) { @@ -552,16 +415,17 @@ static void *__alloc_remap_buffer(struct device *dev, size_t size, gfp_t gfp, return ptr; }
-static void *__alloc_from_pool(struct device *dev, size_t size, - struct page **ret_page, const void *caller) +static void *__alloc_from_pool(size_t size, struct page **ret_page) { - struct arm_vmregion *c; + struct dma_pool *pool = &atomic_pool; + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned int pageno; + unsigned long flags; + void *ptr = NULL; size_t align;
- if (!coherent_head.vm_start) { - printk(KERN_ERR "%s: coherent pool not initialised!\n", - __func__); - dump_stack(); + if (!pool->vaddr) { + WARN(1, "coherent pool not initialised!\n"); return NULL; }
@@ -571,35 +435,41 @@ static void *__alloc_from_pool(struct device *dev, size_t size, * size. This helps reduce fragmentation of the DMA space. */ align = PAGE_SIZE << get_order(size); - c = arm_vmregion_alloc(&coherent_head, align, size, 0, caller); - if (c) { - void *ptr = (void *)c->vm_start; - struct page *page = virt_to_page(ptr); - *ret_page = page; - return ptr; + + spin_lock_irqsave(&pool->lock, flags); + pageno = bitmap_find_next_zero_area(pool->bitmap, pool->nr_pages, + 0, count, (1 << align) - 1); + if (pageno < pool->nr_pages) { + bitmap_set(pool->bitmap, pageno, count); + ptr = pool->vaddr + PAGE_SIZE * pageno; + *ret_page = pool->page + pageno; } - return NULL; + spin_unlock_irqrestore(&pool->lock, flags); + + return ptr; }
-static int __free_from_pool(void *cpu_addr, size_t size) +static int __free_from_pool(void *start, size_t size) { - unsigned long start = (unsigned long)cpu_addr; - unsigned long end = start + size; - struct arm_vmregion *c; + struct dma_pool *pool = &atomic_pool; + unsigned long pageno, count; + unsigned long flags;
- if (start < coherent_head.vm_start || end > coherent_head.vm_end) + if (start < pool->vaddr || start > pool->vaddr + pool->size) return 0;
- c = arm_vmregion_find_remove(&coherent_head, (unsigned long)start); - - if ((c->vm_end - c->vm_start) != size) { - printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n", - __func__, c->vm_end - c->vm_start, size); - dump_stack(); - size = c->vm_end - c->vm_start; + if (start + size > pool->vaddr + pool->size) { + WARN(1, "freeing wrong coherent size from pool\n"); + return 0; }
- arm_vmregion_free(&coherent_head, c); + pageno = (start - pool->vaddr) >> PAGE_SHIFT; + count = size >> PAGE_SHIFT; + + spin_lock_irqsave(&pool->lock, flags); + bitmap_clear(pool->bitmap, pageno, count); + spin_unlock_irqrestore(&pool->lock, flags); + return 1; }
@@ -644,7 +514,7 @@ static inline pgprot_t __get_dma_pgprot(struct dma_attrs *attrs, pgprot_t prot)
#define __get_dma_pgprot(attrs, prot) __pgprot(0) #define __alloc_remap_buffer(dev, size, gfp, prot, ret, c) NULL -#define __alloc_from_pool(dev, size, ret_page, c) NULL +#define __alloc_from_pool(size, ret_page) NULL #define __alloc_from_contiguous(dev, size, prot, ret) NULL #define __free_from_pool(cpu_addr, size) 0 #define __free_from_contiguous(dev, page, size) do { } while (0) @@ -702,10 +572,10 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
if (arch_is_coherent() || nommu()) addr = __alloc_simple_buffer(dev, size, gfp, &page); + else if (gfp & GFP_ATOMIC) + addr = __alloc_from_pool(size, &page); else if (!IS_ENABLED(CONFIG_CMA)) addr = __alloc_remap_buffer(dev, size, gfp, prot, &page, caller); - else if (gfp & GFP_ATOMIC) - addr = __alloc_from_pool(dev, size, &page, caller); else addr = __alloc_from_contiguous(dev, size, prot, &page);
@@ -998,9 +868,6 @@ static int arm_dma_set_mask(struct device *dev, u64 dma_mask)
static int __init dma_debug_do_init(void) { -#ifdef CONFIG_MMU - arm_vmregion_create_proc("dma-mappings", &consistent_head); -#endif dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); return 0; } @@ -1117,61 +984,32 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages, size_t s * Create a CPU mapping for a specified pages */ static void * -__iommu_alloc_remap(struct page **pages, size_t size, gfp_t gfp, pgprot_t prot) +__iommu_alloc_remap(struct page **pages, size_t size, gfp_t gfp, pgprot_t prot, + const void *caller) { - struct arm_vmregion *c; - size_t align; - size_t count = size >> PAGE_SHIFT; - int bit; + unsigned int i, nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct vm_struct *area; + unsigned long p;
- if (!consistent_pte[0]) { - pr_err("%s: not initialised\n", __func__); - dump_stack(); + area = get_vm_area_caller(size, VM_ARM_DMA_CONSISTENT | VM_USERMAP, + caller); + if (!area) return NULL; - }
- /* - * Align the virtual region allocation - maximum alignment is - * a section size, minimum is a page size. This helps reduce - * fragmentation of the DMA space, and also prevents allocations - * smaller than a section from crossing a section boundary. - */ - bit = fls(size - 1); - if (bit > SECTION_SHIFT) - bit = SECTION_SHIFT; - align = 1 << bit; - - /* - * Allocate a virtual address in the consistent mapping region. - */ - c = arm_vmregion_alloc(&consistent_head, align, size, - gfp & ~(__GFP_DMA | __GFP_HIGHMEM), NULL); - if (c) { - pte_t *pte; - int idx = CONSISTENT_PTE_INDEX(c->vm_start); - int i = 0; - u32 off = CONSISTENT_OFFSET(c->vm_start) & (PTRS_PER_PTE-1); - - pte = consistent_pte[idx] + off; - c->priv = pages; - - do { - BUG_ON(!pte_none(*pte)); - - set_pte_ext(pte, mk_pte(pages[i], prot), 0); - pte++; - off++; - i++; - if (off >= PTRS_PER_PTE) { - off = 0; - pte = consistent_pte[++idx]; - } - } while (i < count); - - dsb(); + area->pages = pages; + area->nr_pages = nr_pages; + p = (unsigned long)area->addr;
- return (void *)c->vm_start; + for (i = 0; i < nr_pages; i++) { + phys_addr_t phys = __pfn_to_phys(page_to_pfn(pages[i])); + if (ioremap_page_range(p, p + PAGE_SIZE, phys, prot)) + goto err; + p += PAGE_SIZE; } + return area->addr; +err: + unmap_kernel_range((unsigned long)area->addr, size); + vunmap(area->addr); return NULL; }
@@ -1230,6 +1068,16 @@ static int __iommu_remove_mapping(struct device *dev, dma_addr_t iova, size_t si return 0; }
+static struct page **__iommu_get_pages(void *cpu_addr) +{ + struct vm_struct *area; + + area = find_vm_area(cpu_addr); + if (area && (area->flags & VM_ARM_DMA_CONSISTENT)) + return area->pages; + return NULL; +} + static void *arm_iommu_alloc_attrs(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs) { @@ -1248,7 +1096,8 @@ static void *arm_iommu_alloc_attrs(struct device *dev, size_t size, if (*handle == DMA_ERROR_CODE) goto err_buffer;
- addr = __iommu_alloc_remap(pages, size, gfp, prot); + addr = __iommu_alloc_remap(pages, size, gfp, prot, + __builtin_return_address(0)); if (!addr) goto err_mapping;
@@ -1265,31 +1114,25 @@ static int arm_iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, struct dma_attrs *attrs) { - struct arm_vmregion *c; + unsigned long uaddr = vma->vm_start; + unsigned long usize = vma->vm_end - vma->vm_start; + struct page **pages = __iommu_get_pages(cpu_addr);
vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot); - c = arm_vmregion_find(&consistent_head, (unsigned long)cpu_addr);
- if (c) { - struct page **pages = c->priv; - - unsigned long uaddr = vma->vm_start; - unsigned long usize = vma->vm_end - vma->vm_start; - int i = 0; - - do { - int ret; + if (!pages) + return -ENXIO;
- ret = vm_insert_page(vma, uaddr, pages[i++]); - if (ret) { - pr_err("Remapping memory, error: %d\n", ret); - return ret; - } + do { + int ret = vm_insert_page(vma, uaddr, *pages++); + if (ret) { + pr_err("Remapping memory failed: %d\n", ret); + return ret; + } + uaddr += PAGE_SIZE; + usize -= PAGE_SIZE; + } while (usize > 0);
- uaddr += PAGE_SIZE; - usize -= PAGE_SIZE; - } while (usize > 0); - } return 0; }
@@ -1300,16 +1143,19 @@ static int arm_iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void arm_iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle, struct dma_attrs *attrs) { - struct arm_vmregion *c; + struct page **pages = __iommu_get_pages(cpu_addr); size = PAGE_ALIGN(size);
- c = arm_vmregion_find(&consistent_head, (unsigned long)cpu_addr); - if (c) { - struct page **pages = c->priv; - __dma_free_remap(cpu_addr, size); - __iommu_remove_mapping(dev, handle, size); - __iommu_free_buffer(dev, pages, size); + if (!pages) { + WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr); + return; } + + unmap_kernel_range((unsigned long)cpu_addr, size); + vunmap(cpu_addr); + + __iommu_remove_mapping(dev, handle, size); + __iommu_free_buffer(dev, pages, size); }
/* diff --git a/arch/arm/mm/mm.h b/arch/arm/mm/mm.h index 2e8a1ef..6776160 100644 --- a/arch/arm/mm/mm.h +++ b/arch/arm/mm/mm.h @@ -59,6 +59,9 @@ extern void __flush_dcache_page(struct address_space *mapping, struct page *page #define VM_ARM_MTYPE(mt) ((mt) << 20) #define VM_ARM_MTYPE_MASK (0x1f << 20)
+/* consistent regions used by dma_alloc_attrs() */ +#define VM_ARM_DMA_CONSISTENT 0x20000000 + #endif
#ifdef CONFIG_ZONE_DMA diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 2e28f4d..6071e91 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -93,6 +93,7 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long start, unsigned long end, const void *caller); extern struct vm_struct *remove_vm_area(const void *addr); +extern struct vm_struct *find_vm_area(const void *addr);
extern int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 11308f0..65fc4dc 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1403,7 +1403,15 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, -1, GFP_KERNEL, caller); }
-static struct vm_struct *find_vm_area(const void *addr) +/** + * find_vm_area - find a continuous kernel virtual area + * @addr: base address + * + * Search for the kernel VM area starting at @addr, and return it. + * It is up to the caller to do all required locking to keep the returned + * pointer valid. + */ +struct vm_struct *find_vm_area(const void *addr) { struct vmap_area *va;
Hi,
On Mon, 30 Jul 2012 10:28:19 +0200 Marek Szyprowski m.szyprowski@samsung.com wrote:
This patch changes dma-mapping subsystem to use generic vmalloc areas for all consistent dma allocations. This increases the total size limit of the consistent allocations and removes platform hacks and a lot of duplicated code.
Atomic allocations are served from special pool preallocated on boot, because vmalloc areas cannot be reliably created in atomic context.
Signed-off-by: Marek Szyprowski m.szyprowski@samsung.com Reviewed-by: Kyungmin Park kyungmin.park@samsung.com
Documentation/kernel-parameters.txt | 2 +- arch/arm/include/asm/dma-mapping.h | 2 +- arch/arm/mm/dma-mapping.c | 486 ++++++++++++----------------------- arch/arm/mm/mm.h | 3 + include/linux/vmalloc.h | 1 + mm/vmalloc.c | 10 +- 6 files changed, 181 insertions(+), 323 deletions(-)
...
@@ -1117,61 +984,32 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages, size_t s
- Create a CPU mapping for a specified pages
*/ static void * -__iommu_alloc_remap(struct page **pages, size_t size, gfp_t gfp, pgprot_t prot) +__iommu_alloc_remap(struct page **pages, size_t size, gfp_t gfp, pgprot_t prot,
const void *caller)
{
struct arm_vmregion *c;
size_t align;
size_t count = size >> PAGE_SHIFT;
int bit;
unsigned int i, nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
struct vm_struct *area;
unsigned long p;
if (!consistent_pte[0]) {
pr_err("%s: not initialised\n", __func__);
dump_stack();
area = get_vm_area_caller(size, VM_ARM_DMA_CONSISTENT | VM_USERMAP,
caller);
if (!area)
This patch replaced the custom "consistent_pte" with get_vm_area_caller()", which breaks the compatibility with the existing driver. This causes the following kernel oops(*1). That driver has called dma_pool_alloc() to allocate memory from the interrupt context, and it hits BUG_ON(in_interrpt()) in "get_vm_area_caller()"(*2). Regardless of the badness of allocation from interrupt handler in the driver, I have the following question.
The following "__get_vm_area_node()" can take gfp_mask, it means that this function is expected to be called from atomic context, but why it's _NOT_ allowed _ONLY_ from interrupt context?
According to the following definitions, "in_interrupt()" is in "in_atomic()".
#define in_interrupt() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK | NMI_MASK)) #define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0)
Does anyone know why BUG_ON(in_interrupt()) is set in __get_vm_area_node(*3)?
*2: static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller) { struct vmap_area *va; struct vm_struct *area;
BUG_ON(in_interrupt()); ^^^^^^^^^^^^^^^^^^^^^^^^^^^^*3: *1: [ 8.321343] ------------[ cut here ]------------ [ 8.325971] kernel BUG at /home/hdoyu/mydroid-k340-cardhu/kernel/mm/vmalloc.c:1322! [ 8.333615] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP ARM [ 8.339436] Modules linked in: [ 8.342496] CPU: 0 Tainted: G W (3.4.6-00067-g5d485f7 #67) [ 8.349192] PC is at __get_vm_area_node.isra.29+0x164/0x16c [ 8.354758] LR is at get_vm_area_caller+0x4c/0x54 [ 8.359454] pc : [<c011297c>] lr : [<c011318c>] psr: 20000193 [ 8.359458] sp : c09edca0 ip : c09ec000 fp : ae278000 [ 8.370922] r10: f0000000 r9 : c011aa54 r8 : c0a26cb8 [ 8.376136] r7 : 00000001 r6 : 000000d0 r5 : 20000008 r4 : c09edca0 [ 8.382651] r3 : 00010000 r2 : 20000008 r1 : 00000001 r0 : 00001000 [ 8.389166] Flags: nzCv IRQs off FIQs on Mode SVC_32 ISA ARM Segment kernel [ 8.396549] Control: 10c5387d Table: ad98c04a DAC: 00000015 .... [ 9.169162] dfa0: 412fc099 c09ec000 00000000 c000fdd8 c06df1e4 c0a1b080 00000000 00000000 [ 9.177329] dfc0: c0a235cc 8000406a 00000000 c0986818 ffffffff ffffffff c0986404 00000000 [ 9.185497] dfe0: 00000000 c09bb070 10c5387d c0a19c58 c09bb064 80008044 00000000 00000000 [ 9.193673] [<c011297c>] (__get_vm_area_node.isra.29+0x164/0x16c) from [<c011318c>] (get_vm_area_caller+0x4c/0x54) [ 9.204022] [<c011318c>] (get_vm_area_caller+0x4c/0x54) from [<c001aed8>] (__iommu_alloc_remap.isra.14+0x2c/0xfc) [ 9.214276] [<c001aed8>] (__iommu_alloc_remap.isra.14+0x2c/0xfc) from [<c001b06c>] (arm_iommu_alloc_attrs+0xc4/0xf8) [ 9.224795] [<c001b06c>] (arm_iommu_alloc_attrs+0xc4/0xf8) from [<c011aa54>] (pool_alloc_page.constprop.5+0x6c/0xf8) [ 9.235309] [<c011aa54>] (pool_alloc_page.constprop.5+0x6c/0xf8) from [<c011ab60>] (dma_pool_alloc+0x80/0x170) [ 9.245304] [<c011ab60>] (dma_pool_alloc+0x80/0x170) from [<c03cbbcc>] (tegra_build_dtd+0x48/0x14c) [ 9.254344] [<c03cbbcc>] (tegra_build_dtd+0x48/0x14c) from [<c03cbd4c>] (tegra_req_to_dtd+0x7c/0xa8) [ 9.263467] [<c03cbd4c>] (tegra_req_to_dtd+0x7c/0xa8) from [<c03cc140>] (tegra_ep_queue+0x154/0x33c) [ 9.272592] [<c03cc140>] (tegra_ep_queue+0x154/0x33c) from [<c03dd5b4>] (composite_setup+0x364/0x6d4) [ 9.281804] [<c03dd5b4>] (composite_setup+0x364/0x6d4) from [<c03dd9dc>] (android_setup+0xb8/0x14c) [ 9.290843] [<c03dd9dc>] (android_setup+0xb8/0x14c) from [<c03cd144>] (setup_received_irq+0xbc/0x270) [ 9.300053] [<c03cd144>] (setup_received_irq+0xbc/0x270) from [<c03cda64>] (tegra_udc_irq+0x2ac/0x2c4) [ 9.309353] [<c03cda64>] (tegra_udc_irq+0x2ac/0x2c4) from [<c00b5708>] (handle_irq_event_percpu+0x78/0x2e0) [ 9.319087] [<c00b5708>] (handle_irq_event_percpu+0x78/0x2e0) from [<c00b59b4>] (handle_irq_event+0x44/0x64) [ 9.328907] [<c00b59b4>] (handle_irq_event+0x44/0x64) from [<c00b8688>] (handle_fasteoi_irq+0xc4/0x16c) [ 9.338294] [<c00b8688>] (handle_fasteoi_irq+0xc4/0x16c) from [<c00b4f14>] (generic_handle_irq+0x34/0x48) [ 9.347858] [<c00b4f14>] (generic_handle_irq+0x34/0x48) from [<c000f6f4>] (handle_IRQ+0x54/0xb4) [ 9.356637] [<c000f6f4>] (handle_IRQ+0x54/0xb4) from [<c00084b0>] (gic_handle_irq+0x2c/0x60) [ 9.365068] [<c00084b0>] (gic_handle_irq+0x2c/0x60) from [<c000e900>] (__irq_svc+0x40/0x70) [ 9.373405] Exception stack(0xc09edf10 to 0xc09edf58) [ 9.378447] df00: 00000000 000f4240 00000003 00000000 [ 9.386615] df20: 00000000 e55bbc00 ef66f3ca 00000001 00000000 412fc099 c0abb9c8 00000000 [ 9.394781] df40: 3b9ac9ff c09edf58 c027a9bc c0042880 20000113 ffffffff [ 9.401396] [<c000e900>] (__irq_svc+0x40/0x70) from [<c0042880>] (tegra_idle_enter_lp3+0x68/0x78) [ 9.410272] [<c0042880>] (tegra_idle_enter_lp3+0x68/0x78) from [<c04701d4>] (cpuidle_idle_call+0xdc/0x3a4) [ 9.419922] [<c04701d4>] (cpuidle_idle_call+0xdc/0x3a4) from [<c000fdd8>] (cpu_idle+0xd8/0x134) [ 9.428612] [<c000fdd8>] (cpu_idle+0xd8/0x134) from [<c0986818>] (start_kernel+0x27c/0x2cc) [ 9.436952] Code: e1a00004 e3a04000 eb002265 eaffffe0 (e7f001f2) [ 9.443038] ---[ end trace 1b75b31a2719ed24 ]--- [ 9.447645] Kernel panic - not syncing: Fatal exception in interrupt
Hiroshi Doyu hdoyu@nvidia.com wrote @ Tue, 21 Aug 2012 13:22:35 +0200:
Hi,
On Mon, 30 Jul 2012 10:28:19 +0200 Marek Szyprowski m.szyprowski@samsung.com wrote:
This patch changes dma-mapping subsystem to use generic vmalloc areas for all consistent dma allocations. This increases the total size limit of the consistent allocations and removes platform hacks and a lot of duplicated code.
Atomic allocations are served from special pool preallocated on boot, because vmalloc areas cannot be reliably created in atomic context.
Signed-off-by: Marek Szyprowski m.szyprowski@samsung.com Reviewed-by: Kyungmin Park kyungmin.park@samsung.com
Documentation/kernel-parameters.txt | 2 +- arch/arm/include/asm/dma-mapping.h | 2 +- arch/arm/mm/dma-mapping.c | 486 ++++++++++++----------------------- arch/arm/mm/mm.h | 3 + include/linux/vmalloc.h | 1 + mm/vmalloc.c | 10 +- 6 files changed, 181 insertions(+), 323 deletions(-)
...
@@ -1117,61 +984,32 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages, size_t s
- Create a CPU mapping for a specified pages
*/ static void * -__iommu_alloc_remap(struct page **pages, size_t size, gfp_t gfp, pgprot_t prot) +__iommu_alloc_remap(struct page **pages, size_t size, gfp_t gfp, pgprot_t prot,
const void *caller)
{
struct arm_vmregion *c;
size_t align;
size_t count = size >> PAGE_SHIFT;
int bit;
unsigned int i, nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
struct vm_struct *area;
unsigned long p;
if (!consistent_pte[0]) {
pr_err("%s: not initialised\n", __func__);
dump_stack();
area = get_vm_area_caller(size, VM_ARM_DMA_CONSISTENT | VM_USERMAP,
caller);
if (!area)
This patch replaced the custom "consistent_pte" with get_vm_area_caller()", which breaks the compatibility with the existing driver. This causes the following kernel oops(*1). That driver has called dma_pool_alloc() to allocate memory from the interrupt context, and it hits BUG_ON(in_interrpt()) in "get_vm_area_caller()"(*2). Regardless of the badness of allocation from interrupt handler in the driver, I have the following question.
The following "__get_vm_area_node()" can take gfp_mask, it means that this function is expected to be called from atomic context, but why it's _NOT_ allowed _ONLY_ from interrupt context?
According to the following definitions, "in_interrupt()" is in "in_atomic()".
#define in_interrupt() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK | NMI_MASK)) #define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0)
Does anyone know why BUG_ON(in_interrupt()) is set in __get_vm_area_node(*3)?
For arm_dma_alloc(), it allocates from the pool if GFP_ATOMIC, but for arm_iommu_alloc_attrs() doesn't have pre-allocate pool at all, and it always call "get_vm_area_caller()". That's why it hits BUG(). But still I don't understand why it's not BUG_ON(in_atomic) as Russell already pointed out(*1).
Hi Hiroshi,
On Tuesday, August 21, 2012 2:15 PM Hiroshi Doyu wrote:
Hiroshi Doyu hdoyu@nvidia.com wrote @ Tue, 21 Aug 2012 13:22:35 +0200:
Hi,
On Mon, 30 Jul 2012 10:28:19 +0200 Marek Szyprowski m.szyprowski@samsung.com wrote:
This patch changes dma-mapping subsystem to use generic vmalloc areas for all consistent dma allocations. This increases the total size limit of the consistent allocations and removes platform hacks and a lot of duplicated code.
Atomic allocations are served from special pool preallocated on boot, because vmalloc areas cannot be reliably created in atomic context.
Signed-off-by: Marek Szyprowski m.szyprowski@samsung.com Reviewed-by: Kyungmin Park kyungmin.park@samsung.com
Documentation/kernel-parameters.txt | 2 +- arch/arm/include/asm/dma-mapping.h | 2 +- arch/arm/mm/dma-mapping.c | 486 ++++++++++++----------------------- arch/arm/mm/mm.h | 3 + include/linux/vmalloc.h | 1 + mm/vmalloc.c | 10 +- 6 files changed, 181 insertions(+), 323 deletions(-)
...
@@ -1117,61 +984,32 @@ static int __iommu_free_buffer(struct device *dev, struct page
**pages, size_t s
- Create a CPU mapping for a specified pages
*/ static void * -__iommu_alloc_remap(struct page **pages, size_t size, gfp_t gfp, pgprot_t prot) +__iommu_alloc_remap(struct page **pages, size_t size, gfp_t gfp, pgprot_t prot,
const void *caller)
{
struct arm_vmregion *c;
size_t align;
size_t count = size >> PAGE_SHIFT;
int bit;
unsigned int i, nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
struct vm_struct *area;
unsigned long p;
if (!consistent_pte[0]) {
pr_err("%s: not initialised\n", __func__);
dump_stack();
area = get_vm_area_caller(size, VM_ARM_DMA_CONSISTENT | VM_USERMAP,
caller);
if (!area)
This patch replaced the custom "consistent_pte" with get_vm_area_caller()", which breaks the compatibility with the existing driver. This causes the following kernel oops(*1). That driver has called dma_pool_alloc() to allocate memory from the interrupt context, and it hits BUG_ON(in_interrpt()) in "get_vm_area_caller()"(*2). Regardless of the badness of allocation from interrupt handler in the driver, I have the following question.
The following "__get_vm_area_node()" can take gfp_mask, it means that this function is expected to be called from atomic context, but why it's _NOT_ allowed _ONLY_ from interrupt context?
According to the following definitions, "in_interrupt()" is in "in_atomic()".
#define in_interrupt() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK | NMI_MASK)) #define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0)
Does anyone know why BUG_ON(in_interrupt()) is set in __get_vm_area_node(*3)?
For arm_dma_alloc(), it allocates from the pool if GFP_ATOMIC, but for arm_iommu_alloc_attrs() doesn't have pre-allocate pool at all, and it always call "get_vm_area_caller()". That's why it hits BUG(). But still I don't understand why it's not BUG_ON(in_atomic) as Russell already pointed out(*1).
Ok, now I see the problem. I will try to find out a solution for your issue.
Best regards
Hi Marek,
On Tue, 21 Aug 2012 17:01:08 +0200 Marek Szyprowski m.szyprowski@samsung.com wrote:
-__iommu_alloc_remap(struct page **pages, size_t size, gfp_t gfp, pgprot_t prot) +__iommu_alloc_remap(struct page **pages, size_t size, gfp_t gfp, pgprot_t prot,
const void *caller)
{
struct arm_vmregion *c;
size_t align;
size_t count = size >> PAGE_SHIFT;
int bit;
unsigned int i, nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
struct vm_struct *area;
unsigned long p;
if (!consistent_pte[0]) {
pr_err("%s: not initialised\n", __func__);
dump_stack();
area = get_vm_area_caller(size, VM_ARM_DMA_CONSISTENT | VM_USERMAP,
caller);
if (!area)
This patch replaced the custom "consistent_pte" with get_vm_area_caller()", which breaks the compatibility with the existing driver. This causes the following kernel oops(*1). That driver has called dma_pool_alloc() to allocate memory from the interrupt context, and it hits BUG_ON(in_interrpt()) in "get_vm_area_caller()"(*2). Regardless of the badness of allocation from interrupt handler in the driver, I have the following question.
The following "__get_vm_area_node()" can take gfp_mask, it means that this function is expected to be called from atomic context, but why it's _NOT_ allowed _ONLY_ from interrupt context?
According to the following definitions, "in_interrupt()" is in "in_atomic()".
#define in_interrupt() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK | NMI_MASK)) #define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0)
Does anyone know why BUG_ON(in_interrupt()) is set in __get_vm_area_node(*3)?
For arm_dma_alloc(), it allocates from the pool if GFP_ATOMIC, but for arm_iommu_alloc_attrs() doesn't have pre-allocate pool at all, and it always call "get_vm_area_caller()". That's why it hits BUG(). But still I don't understand why it's not BUG_ON(in_atomic) as Russell already pointed out(*1).
Ok, now I see the problem. I will try to find out a solution for your issue.
My explanation wasn't so good.
For a solution, I thought that, in order to allow IOMMU'able device drivers to allocate memory from atomic context/ISR, there were the following 2 solutions:
(1) To provide the pre-allocate area like arm_dma_alloc() does, or (2) __get_vm_area_node() can be called from ISR.
But (2) doesn't work because PGALLOC_GFP(GFP_KERNEL) is used to allocate a page table. This is called from:
arm_iommu_alloc_attrs() -> __iommu_alloc_remap() -> ioremap_page_range() -> ..... -> pte_alloc_one_kernel() -> pte = (pte_t *)__get_free_page(PGALLOC_GFP);
We always have to avoid changing a page table for atomic allocation. So for me, the only remaining solution is (1) pre-allocation. We can make use of the same atomic pool both for DMA and IOMMU. I'll send the patch.
On Tue, Aug 21, 2012 at 02:22:35PM +0300, Hiroshi Doyu wrote:
The following "__get_vm_area_node()" can take gfp_mask, it means that this function is expected to be called from atomic context, but why it's _NOT_ allowed _ONLY_ from interrupt context?
One reason is it takes read/write locks without using the IRQ safe versions for starters (vmap_area_lock and vmlist_lock). I don't see any other reasons in that bit of code though.
Russell King - ARM Linux linux@arm.linux.org.uk wrote @ Tue, 21 Aug 2012 14:34:51 +0200:
On Tue, Aug 21, 2012 at 02:22:35PM +0300, Hiroshi Doyu wrote:
The following "__get_vm_area_node()" can take gfp_mask, it means that this function is expected to be called from atomic context, but why it's _NOT_ allowed _ONLY_ from interrupt context?
One reason is it takes read/write locks without using the IRQ safe versions for starters (vmap_area_lock and vmlist_lock). I don't see any other reasons in that bit of code though.
IIRC, if *_{irqsave,irqrestore} versions were introduced to protect from IRQ context, could we remove this BUG_ON(in_interrupt()) in __get_vm_area_node() at least? Or is it not encouraged from performance POV?
It seems that the solution to allow IOMMU'able device driver to allocate from ISR are: (1) To provide the pre-allocate area like arm_dma_alloc() does, or (2) __get_vm_area_node() can be called from ISR.
linaro-mm-sig@lists.linaro.org