From: Fuad Tabba tabba@google.com
When a page is shared, the exclusive pin is dropped, but one normal pin is maintained. In order to be able to unshare a page, add the ability to reaquire the exclusive pin, but only if there is only one normal pin on the page, and only if the page is marked as AnonExclusive.
Co-Developed-by: Elliot Berman quic_eberman@quicinc.com Signed-off-by: Elliot Berman quic_eberman@quicinc.com Signed-off-by: Fuad Tabba tabba@google.com Signed-off-by: Elliot Berman quic_eberman@quicinc.com --- include/linux/mm.h | 1 + include/linux/page_ref.h | 18 ++++++++++++------ mm/gup.c | 48 +++++++++++++++++++++++++++++++++++++----------- 3 files changed, 50 insertions(+), 17 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h index d03d62bceba0..628ab936dd2b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1590,6 +1590,7 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, void unpin_user_pages(struct page **pages, unsigned long npages); void unpin_exc_pages(struct page **pages, unsigned long npages); void unexc_user_page(struct page *page); +int reexc_user_page(struct page *page);
static inline bool is_cow_mapping(vm_flags_t flags) { diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 9d16e1f4db09..e66130fe995d 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -92,7 +92,8 @@ static inline void __page_ref_unfreeze(struct page *page, int v) * provides safe operation for get_user_pages(), page_mkclean() and * other calls that race to set up page table entries. */ -#define GUP_PIN_COUNTING_BIAS (1U << 10) +#define GUP_PIN_COUNTING_SHIFT (10) +#define GUP_PIN_COUNTING_BIAS (1U << GUP_PIN_COUNTING_SHIFT)
/* * GUP_PIN_EXCLUSIVE_BIAS is used to grab an exclusive pin over a page. @@ -100,7 +101,8 @@ static inline void __page_ref_unfreeze(struct page *page, int v) * exist for the page. * After it's taken, no other gup pins can be taken. */ -#define GUP_PIN_EXCLUSIVE_BIAS (1U << 30) +#define GUP_PIN_EXCLUSIVE_SHIFT (30) +#define GUP_PIN_EXCLUSIVE_BIAS (1U << GUP_PIN_EXCLUSIVE_SHIFT)
static inline int page_ref_count(const struct page *page) { @@ -155,7 +157,9 @@ static inline void init_page_count(struct page *page) set_page_count(page, 1); }
-static __must_check inline bool page_ref_setexc(struct page *page, unsigned int refs) +static __must_check inline bool page_ref_setexc(struct page *page, + unsigned int expected_pins, + unsigned int refs) { unsigned int old_count, new_count;
@@ -165,7 +169,7 @@ static __must_check inline bool page_ref_setexc(struct page *page, unsigned int do { old_count = atomic_read(&page->_refcount);
- if (old_count >= GUP_PIN_COUNTING_BIAS) + if ((old_count >> GUP_PIN_COUNTING_SHIFT) != expected_pins) return false;
if (check_add_overflow(old_count, refs + GUP_PIN_EXCLUSIVE_BIAS, &new_count)) @@ -178,9 +182,11 @@ static __must_check inline bool page_ref_setexc(struct page *page, unsigned int return true; }
-static __must_check inline bool folio_ref_setexc(struct folio *folio, unsigned int refs) +static __must_check inline bool folio_ref_setexc(struct folio *folio, + unsigned int expected_pins, + unsigned int refs) { - return page_ref_setexc(&folio->page, refs); + return page_ref_setexc(&folio->page, expected_pins, refs); }
static inline void page_ref_add(struct page *page, int nr) diff --git a/mm/gup.c b/mm/gup.c index 7f20de33221d..663030d03d95 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -97,7 +97,9 @@ static inline struct folio *try_get_folio(struct page *page, int refs) return folio; }
-static bool large_folio_pin_setexc(struct folio *folio, unsigned int pins) +static bool large_folio_pin_setexc(struct folio *folio, + unsigned int expected_pins, + unsigned int pins) { unsigned int old_pincount, new_pincount;
@@ -107,7 +109,7 @@ static bool large_folio_pin_setexc(struct folio *folio, unsigned int pins) do { old_pincount = atomic_read(&folio->_pincount);
- if (old_pincount > 0) + if (old_pincount != expected_pins) return false;
if (check_add_overflow(old_pincount, pins + GUP_PIN_EXCLUSIVE_BIAS, &new_pincount)) @@ -117,15 +119,18 @@ static bool large_folio_pin_setexc(struct folio *folio, unsigned int pins) return true; }
-static bool __try_grab_folio_excl(struct folio *folio, int pincount, int refcount) +static bool __try_grab_folio_excl(struct folio *folio, + unsigned int expected_pins, + int pincount, + int refcount) { if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_EXCLUSIVE_PIN))) return false;
if (folio_test_large(folio)) { - if (!large_folio_pin_setexc(folio, pincount)) + if (!large_folio_pin_setexc(folio, expected_pins, pincount)) return false; - } else if (!folio_ref_setexc(folio, refcount)) { + } else if (!folio_ref_setexc(folio, expected_pins, refcount)) { return false; }
@@ -135,7 +140,9 @@ static bool __try_grab_folio_excl(struct folio *folio, int pincount, int refcoun return true; }
-static bool try_grab_folio_excl(struct folio *folio, int refs) +static bool try_grab_folio_excl(struct folio *folio, + unsigned int expected_pins, + int refs) { /* * When pinning a large folio, use an exact count to track it. @@ -145,15 +152,17 @@ static bool try_grab_folio_excl(struct folio *folio, int refs) * is pinned. That's why the refcount from the earlier * try_get_folio() is left intact. */ - return __try_grab_folio_excl(folio, refs, + return __try_grab_folio_excl(folio, expected_pins, refs, refs * (GUP_PIN_COUNTING_BIAS - 1)); }
-static bool try_grab_page_excl(struct page *page) +static bool try_grab_page_excl(struct page *page, + unsigned int expected_pins) { struct folio *folio = page_folio(page);
- return __try_grab_folio_excl(folio, 1, GUP_PIN_COUNTING_BIAS); + return __try_grab_folio_excl(folio, expected_pins, 1, + GUP_PIN_COUNTING_BIAS); }
/** @@ -227,7 +236,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) }
if (unlikely(flags & FOLL_EXCLUSIVE)) { - if (!try_grab_folio_excl(folio, refs)) + if (!try_grab_folio_excl(folio, 0, refs)) return NULL; } else { /* @@ -347,7 +356,7 @@ int __must_check try_grab_page(struct page *page, unsigned int flags) return -EBUSY;
if (unlikely(flags & FOLL_EXCLUSIVE)) { - if (!try_grab_page_excl(page)) + if (!try_grab_page_excl(page, 0)) return -EBUSY; } else { /* @@ -661,6 +670,23 @@ void unexc_user_page(struct page *page) } EXPORT_SYMBOL(unexc_user_page);
+int reexc_user_page(struct page *page) +{ + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_EXCLUSIVE_PIN))) + return -EINVAL; + + sanity_check_pinned_pages(&page, 1); + + if (!PageAnonExclusive(page)) + return -EINVAL; + + if (!try_grab_page_excl(page, 1)) + return -EBUSY; + + return 0; +} +EXPORT_SYMBOL(reexc_user_page); + /* * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's * lifecycle. Avoid setting the bit unless necessary, or it might cause write