With some architectures like ppc64, set_pmd_at() cannot cope with a situation where there is already some (different) valid entry present.
Use pmdp_set_access_flags() instead to modify the pfn; that helper is built to deal with modifying existing PMD entries.
This is similar to commit cae85cb8add3 ("mm/memory.c: fix modifying of page protection by insert_pfn()").
We also make a similar update for insert_pfn_pud(), even though ppc64 does not support PUD pfn entries yet.
Without this patch we also see the following message in the kernel log: "BUG: non-zero pgtables_bytes on freeing mm:"
CC: stable@vger.kernel.org
Reported-by: Chandan Rajendra <chandan@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
Changes from v1:
* Fix the pgtable leak

 mm/huge_memory.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 404acdcd0455..165ea46bf149 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -755,6 +755,21 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 	spinlock_t *ptl;
 
 	ptl = pmd_lock(mm, pmd);
+	if (!pmd_none(*pmd)) {
+		if (write) {
+			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
+				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
+				goto out_unlock;
+			}
+			entry = pmd_mkyoung(*pmd);
+			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+			if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
+				update_mmu_cache_pmd(vma, addr, pmd);
+		}
+
+		goto out_unlock;
+	}
+
 	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
 	if (pfn_t_devmap(pfn))
 		entry = pmd_mkdevmap(entry);
@@ -766,11 +781,16 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 	if (pgtable) {
 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 		mm_inc_nr_ptes(mm);
+		pgtable = NULL;
 	}
 
 	set_pmd_at(mm, addr, pmd, entry);
 	update_mmu_cache_pmd(vma, addr, pmd);
+
+out_unlock:
 	spin_unlock(ptl);
+	if (pgtable)
+		pte_free(mm, pgtable);
 }
 
 vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -821,6 +841,20 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
 	spinlock_t *ptl;
 
 	ptl = pud_lock(mm, pud);
+	if (!pud_none(*pud)) {
+		if (write) {
+			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
+				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
+				goto out_unlock;
+			}
+			entry = pud_mkyoung(*pud);
+			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
+			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
+				update_mmu_cache_pud(vma, addr, pud);
+		}
+		goto out_unlock;
+	}
+
 	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
 	if (pfn_t_devmap(pfn))
 		entry = pud_mkdevmap(entry);
@@ -830,6 +864,8 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
 	}
 	set_pud_at(mm, addr, pud, entry);
 	update_mmu_cache_pud(vma, addr, pud);
+
+out_unlock:
 	spin_unlock(ptl);
 }
On Tue 02-04-19 17:21:25, Aneesh Kumar K.V wrote:
With some architectures like ppc64, set_pmd_at() cannot cope with a situation where there is already some (different) valid entry present.
Use pmdp_set_access_flags() instead to modify the pfn; that helper is built to deal with modifying existing PMD entries.
This is similar to commit cae85cb8add3 ("mm/memory.c: fix modifying of page protection by insert_pfn()").
We also make a similar update for insert_pfn_pud(), even though ppc64 does not support PUD pfn entries yet.
Without this patch we also see the following message in the kernel log: "BUG: non-zero pgtables_bytes on freeing mm:"
CC: stable@vger.kernel.org
Reported-by: Chandan Rajendra <chandan@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Looks good to me. You can add:
Reviewed-by: Jan Kara <jack@suse.cz>
Honza
Changes from v1:
- Fix the pgtable leak
 mm/huge_memory.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 404acdcd0455..165ea46bf149 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -755,6 +755,21 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 	spinlock_t *ptl;
 
 	ptl = pmd_lock(mm, pmd);
+	if (!pmd_none(*pmd)) {
+		if (write) {
+			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
+				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
+				goto out_unlock;
+			}
+			entry = pmd_mkyoung(*pmd);
+			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+			if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
+				update_mmu_cache_pmd(vma, addr, pmd);
+		}
+
+		goto out_unlock;
+	}
+
 	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
 	if (pfn_t_devmap(pfn))
 		entry = pmd_mkdevmap(entry);
@@ -766,11 +781,16 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 	if (pgtable) {
 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 		mm_inc_nr_ptes(mm);
+		pgtable = NULL;
 	}
 
 	set_pmd_at(mm, addr, pmd, entry);
 	update_mmu_cache_pmd(vma, addr, pmd);
+
+out_unlock:
 	spin_unlock(ptl);
+	if (pgtable)
+		pte_free(mm, pgtable);
 }
 
 vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -821,6 +841,20 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
 	spinlock_t *ptl;
 
 	ptl = pud_lock(mm, pud);
+	if (!pud_none(*pud)) {
+		if (write) {
+			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
+				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
+				goto out_unlock;
+			}
+			entry = pud_mkyoung(*pud);
+			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
+			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
+				update_mmu_cache_pud(vma, addr, pud);
+		}
+		goto out_unlock;
+	}
+
 	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
 	if (pfn_t_devmap(pfn))
 		entry = pud_mkdevmap(entry);
@@ -830,6 +864,8 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
 	}
 	set_pud_at(mm, addr, pud, entry);
 	update_mmu_cache_pud(vma, addr, pud);
+
+out_unlock:
 	spin_unlock(ptl);
 }
--
2.20.1
On Tue, Apr 2, 2019 at 4:51 AM Aneesh Kumar K.V aneesh.kumar@linux.ibm.com wrote:
With some architectures like ppc64, set_pmd_at() cannot cope with a situation where there is already some (different) valid entry present.
Use pmdp_set_access_flags() instead to modify the pfn; that helper is built to deal with modifying existing PMD entries.
This is similar to commit cae85cb8add3 ("mm/memory.c: fix modifying of page protection by insert_pfn()").
We also make a similar update for insert_pfn_pud(), even though ppc64 does not support PUD pfn entries yet.
Without this patch we also see the following message in the kernel log: "BUG: non-zero pgtables_bytes on freeing mm:"
CC: stable@vger.kernel.org
Reported-by: Chandan Rajendra <chandan@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Changes from v1:
- Fix the pgtable leak
 mm/huge_memory.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
This patch is triggering the following bug in v4.19.35.
kernel BUG at arch/x86/mm/pgtable.c:515!
invalid opcode: 0000 [#1] SMP NOPTI
CPU: 51 PID: 43713 Comm: java Tainted: G OE 4.19.35 #1
RIP: 0010:pmdp_set_access_flags+0x48/0x50
[..]
Call Trace:
 vmf_insert_pfn_pmd+0x198/0x350
 dax_iomap_fault+0xe82/0x1190
 ext4_dax_huge_fault+0x103/0x1f0
 ? __switch_to_asm+0x40/0x70
 __handle_mm_fault+0x3f6/0x1370
 ? __switch_to_asm+0x34/0x70
 ? __switch_to_asm+0x40/0x70
 handle_mm_fault+0xda/0x200
 __do_page_fault+0x249/0x4f0
 do_page_fault+0x32/0x110
 ? page_fault+0x8/0x30
 page_fault+0x1e/0x30
I asked the reporter to try a kernel with commit c6f3c5ee40c1 "mm/huge_memory.c: fix modifying of page protection by insert_pfn_pmd()" reverted and the failing test passed.
I think unaligned addresses have always been passed to vmf_insert_pfn_pmd(), but nothing cared until this patch. I *think* the only change needed is the following, thoughts?
diff --git a/fs/dax.c b/fs/dax.c
index ca0671d55aa6..82aee9a87efa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1560,7 +1560,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		}
 
 		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
-		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+		result = vmf_insert_pfn_pmd(vma, pmd_addr, vmf->pmd, pfn,
 					    write);
 		break;
 	case IOMAP_UNWRITTEN:
I'll ask the reporter to try this fix as well.
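For context, the assertion being hit here is the alignment check at the top of the x86 pmdp_set_access_flags() helper. The sketch below is a paraphrase of arch/x86/mm/pgtable.c rather than a verbatim quote (and the exact line number varies between kernel versions); it shows why a merely page-aligned vmf->address blows up once insert_pfn_pmd() starts calling this helper for an existing entry:

int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        int changed = !pmd_same(*pmdp, entry);

        /*
         * The PMD maps a 2 MiB region, so the address handed in must be
         * 2 MiB aligned; a page-aligned-only vmf->address trips this.
         */
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        if (changed && dirty)
                set_pmd(pmdp, entry);   /* TLB-flush details elided */

        return changed;
}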
On Wed, Apr 24, 2019 at 10:13:15AM -0700, Dan Williams wrote:
I think unaligned addresses have always been passed to vmf_insert_pfn_pmd(), but nothing cared until this patch. I *think* the only change needed is the following, thoughts?
diff --git a/fs/dax.c b/fs/dax.c
index ca0671d55aa6..82aee9a87efa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1560,7 +1560,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		}
 
 		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
-		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+		result = vmf_insert_pfn_pmd(vma, pmd_addr, vmf->pmd, pfn,
 					    write);
We also call vmf_insert_pfn_pmd() in dax_insert_pfn_mkwrite() -- does that need to change too?
On Wed, Apr 24, 2019 at 10:38 AM Matthew Wilcox willy@infradead.org wrote:
On Wed, Apr 24, 2019 at 10:13:15AM -0700, Dan Williams wrote:
I think unaligned addresses have always been passed to vmf_insert_pfn_pmd(), but nothing cared until this patch. I *think* the only change needed is the following, thoughts?
diff --git a/fs/dax.c b/fs/dax.c
index ca0671d55aa6..82aee9a87efa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1560,7 +1560,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		}
 
 		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
-		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+		result = vmf_insert_pfn_pmd(vma, pmd_addr, vmf->pmd, pfn,
 					    write);
We also call vmf_insert_pfn_pmd() in dax_insert_pfn_mkwrite() -- does that need to change too?
It wasn't clear to me that it was a problem. I think that one already happens to be pmd-aligned.
On 4/24/19 11:43 PM, Dan Williams wrote:
On Wed, Apr 24, 2019 at 10:38 AM Matthew Wilcox willy@infradead.org wrote:
On Wed, Apr 24, 2019 at 10:13:15AM -0700, Dan Williams wrote:
I think unaligned addresses have always been passed to vmf_insert_pfn_pmd(), but nothing cared until this patch. I *think* the only change needed is the following, thoughts?
diff --git a/fs/dax.c b/fs/dax.c
index ca0671d55aa6..82aee9a87efa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1560,7 +1560,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		}
 
 		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
-		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+		result = vmf_insert_pfn_pmd(vma, pmd_addr, vmf->pmd, pfn,
 					    write);
We also call vmf_insert_pfn_pmd() in dax_insert_pfn_mkwrite() -- does that need to change too?
It wasn't clear to me that it was a problem. I think that one already happens to be pmd-aligned.
How about vmf_insert_pfn_pud()?
-aneesh
On Wed, Apr 24, 2019 at 6:37 PM Aneesh Kumar K.V aneesh.kumar@linux.ibm.com wrote:
On 4/24/19 11:43 PM, Dan Williams wrote:
On Wed, Apr 24, 2019 at 10:38 AM Matthew Wilcox willy@infradead.org wrote:
On Wed, Apr 24, 2019 at 10:13:15AM -0700, Dan Williams wrote:
I think unaligned addresses have always been passed to vmf_insert_pfn_pmd(), but nothing cared until this patch. I *think* the only change needed is the following, thoughts?
diff --git a/fs/dax.c b/fs/dax.c
index ca0671d55aa6..82aee9a87efa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1560,7 +1560,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		}
 
 		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
-		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+		result = vmf_insert_pfn_pmd(vma, pmd_addr, vmf->pmd, pfn,
 					    write);
We also call vmf_insert_pfn_pmd() in dax_insert_pfn_mkwrite() -- does that need to change too?
It wasn't clear to me that it was a problem. I think that one already happens to be pmd-aligned.
How about vmf_insert_pfn_pud()?
That is currently not used by fsdax, only devdax, but it does seem that it passes the unaligned fault address rather than the PUD-aligned address. I'll add that to the proposed fix.
On Wed 24-04-19 11:13:48, Dan Williams wrote:
On Wed, Apr 24, 2019 at 10:38 AM Matthew Wilcox willy@infradead.org wrote:
On Wed, Apr 24, 2019 at 10:13:15AM -0700, Dan Williams wrote:
I think unaligned addresses have always been passed to vmf_insert_pfn_pmd(), but nothing cared until this patch. I *think* the only change needed is the following, thoughts?
diff --git a/fs/dax.c b/fs/dax.c
index ca0671d55aa6..82aee9a87efa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1560,7 +1560,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		}
 
 		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
-		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+		result = vmf_insert_pfn_pmd(vma, pmd_addr, vmf->pmd, pfn,
 					    write);
We also call vmf_insert_pfn_pmd() in dax_insert_pfn_mkwrite() -- does that need to change too?
It wasn't clear to me that it was a problem. I think that one already happens to be pmd-aligned.
Why would it need to be? The address is taken from vmf->address and that's set up in __handle_mm_fault() like .address = address & PAGE_MASK. So I don't see anything forcing PMD alignment of the virtual address...
Honza
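To make the alignment point concrete, here is a minimal standalone userspace sketch, assuming the usual x86-64 values of a 4 KiB base page and a 2 MiB PMD (the constants are illustrative, not taken from kernel headers): a page-aligned vmf->address is in general not PMD-aligned.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096UL                 /* 4 KiB base page (illustrative) */
#define PMD_SIZE  (2UL * 1024 * 1024)    /* 2 MiB huge page (illustrative) */
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define PMD_MASK  (~(PMD_SIZE - 1))

int main(void)
{
        /* An arbitrary faulting address somewhere inside a 2 MiB mapping. */
        uint64_t fault_addr = 0x7f0012345678UL;

        uint64_t vmf_address = fault_addr & PAGE_MASK;  /* what __handle_mm_fault() stores */
        uint64_t pmd_addr    = fault_addr & PMD_MASK;   /* what insert_pfn_pmd() needs */

        printf("vmf->address        = 0x%llx\n", (unsigned long long)vmf_address);
        printf("pmd_addr            = 0x%llx\n", (unsigned long long)pmd_addr);
        printf("offset into the PMD = 0x%llx\n",
               (unsigned long long)(vmf_address & ~PMD_MASK));
        return 0;
}

The non-zero offset in the last line is exactly what the architecture's alignment assertion objects to.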
On Thu, Apr 25, 2019 at 12:32 AM Jan Kara jack@suse.cz wrote:
On Wed 24-04-19 11:13:48, Dan Williams wrote:
On Wed, Apr 24, 2019 at 10:38 AM Matthew Wilcox willy@infradead.org wrote:
On Wed, Apr 24, 2019 at 10:13:15AM -0700, Dan Williams wrote:
I think unaligned addresses have always been passed to vmf_insert_pfn_pmd(), but nothing cared until this patch. I *think* the only change needed is the following, thoughts?
diff --git a/fs/dax.c b/fs/dax.c
index ca0671d55aa6..82aee9a87efa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1560,7 +1560,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		}
 
 		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
-		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+		result = vmf_insert_pfn_pmd(vma, pmd_addr, vmf->pmd, pfn,
 					    write);
We also call vmf_insert_pfn_pmd() in dax_insert_pfn_mkwrite() -- does that need to change too?
It wasn't clear to me that it was a problem. I think that one already happens to be pmd-aligned.
Why would it need to be? The address is taken from vmf->address and that's set up in __handle_mm_fault() like .address = address & PAGE_MASK. So I don't see anything forcing PMD alignment of the virtual address...
True. So now I'm wondering if the masking should be done internal to the routine. Given it's prefixed vmf_ it seems to imply the api is prepared to take raw 'struct vm_fault' parameters. I think I'll go that route unless someone sees a reason to require the caller to handle this responsibility.
On Thu, Apr 25, 2019 at 05:33:04PM -0700, Dan Williams wrote:
On Thu, Apr 25, 2019 at 12:32 AM Jan Kara jack@suse.cz wrote:
We also call vmf_insert_pfn_pmd() in dax_insert_pfn_mkwrite() -- does that need to change too?
It wasn't clear to me that it was a problem. I think that one already happens to be pmd-aligned.
Why would it need to be? The address is taken from vmf->address and that's set up in __handle_mm_fault() like .address = address & PAGE_MASK. So I don't see anything forcing PMD alignment of the virtual address...
True. So now I'm wondering if the masking should be done internal to the routine. Given it's prefixed vmf_ it seems to imply the api is prepared to take raw 'struct vm_fault' parameters. I think I'll go that route unless someone sees a reason to require the caller to handle this responsibility.
The vmf_ prefix was originally used to indicate 'returns a vm_fault_t' instead of 'returns an errno'. That said, I like the interpretation you're coming up with here, and it makes me wonder if we shouldn't change vmf_insert_pfn_pmd() to take (vmf, pfn, write) as arguments instead of separate vma, address & pmd arguments.
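A rough, hypothetical sketch of what such an interface could look like, eliding the pgtable pre-allocation, pfn sanity checks and tracking that the real wrapper performs (the NULL pgtable below is part of that simplification); the point is only that the helper would derive the PMD-aligned address from vmf itself, so no caller can pass a misaligned one:

vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{
        struct vm_area_struct *vma = vmf->vma;
        /* Round down to the start of the 2 MiB region covered by vmf->pmd. */
        unsigned long addr = vmf->address & PMD_MASK;

        insert_pfn_pmd(vma, addr, vmf->pmd, pfn, vma->vm_page_prot,
                       write, NULL);
        return VM_FAULT_NOPAGE;
}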
On Thu 25-04-19 17:33:04, Dan Williams wrote:
On Thu, Apr 25, 2019 at 12:32 AM Jan Kara jack@suse.cz wrote:
On Wed 24-04-19 11:13:48, Dan Williams wrote:
On Wed, Apr 24, 2019 at 10:38 AM Matthew Wilcox willy@infradead.org wrote:
On Wed, Apr 24, 2019 at 10:13:15AM -0700, Dan Williams wrote:
I think unaligned addresses have always been passed to vmf_insert_pfn_pmd(), but nothing cared until this patch. I *think* the only change needed is the following, thoughts?
diff --git a/fs/dax.c b/fs/dax.c
index ca0671d55aa6..82aee9a87efa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1560,7 +1560,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		}
 
 		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
-		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+		result = vmf_insert_pfn_pmd(vma, pmd_addr, vmf->pmd, pfn,
 					    write);
We also call vmf_insert_pfn_pmd() in dax_insert_pfn_mkwrite() -- does that need to change too?
It wasn't clear to me that it was a problem. I think that one already happens to be pmd-aligned.
Why would it need to be? The address is taken from vmf->address and that's set up in __handle_mm_fault() like .address = address & PAGE_MASK. So I don't see anything forcing PMD alignment of the virtual address...
True. So now I'm wondering if the masking should be done internal to the routine. Given it's prefixed vmf_ it seems to imply the api is prepared to take raw 'struct vm_fault' parameters. I think I'll go that route unless someone sees a reason to require the caller to handle this responsibility.
Yeah, that sounds good to me. Thanks for fixing this.
Honza