Under heavy load, the SGX EPC reclaimers (current ksgxd or future EPC cgroup worker) may reclaim the SECS EPC page for an enclave and set encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in the SGX #PF handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and load it if it was reclaimed.
Fixes: 5a90d2c3f5ef8 ("x86/sgx: Support adding of pages to an initialized enclave") Cc: stable@vger.kernel.org Signed-off-by: Haitao Huang haitao.huang@linux.intel.com --- arch/x86/kernel/cpu/sgx/encl.c | 25 ++++++++++++++++++++----- arch/x86/kernel/cpu/sgx/main.c | 4 ++++ 2 files changed, 24 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 2a0e90fe2abc..d39e502bb7b0 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -235,6 +235,16 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; }
+static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl) +{ + struct sgx_epc_page *epc_page = encl->secs.epc_page; + + if (!epc_page) { + epc_page = sgx_encl_eldu(&encl->secs, NULL); + } + return epc_page; +} + static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, struct sgx_encl_page *entry) { @@ -248,11 +258,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, return entry; }
- if (!(encl->secs.epc_page)) { - epc_page = sgx_encl_eldu(&encl->secs, NULL); - if (IS_ERR(epc_page)) - return ERR_CAST(epc_page); - } + epc_page = sgx_encl_load_secs(encl); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page);
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); if (IS_ERR(epc_page)) @@ -339,6 +347,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
mutex_lock(&encl->lock);
+ epc_page = sgx_encl_load_secs(encl); + if (IS_ERR(epc_page)) { + if (PTR_ERR(epc_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_unlock; + } + epc_page = sgx_alloc_epc_page(encl_page, false); if (IS_ERR(epc_page)) { if (PTR_ERR(epc_page) == -EBUSY) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 166692f2d501..4662a364ce62 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -257,6 +257,10 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
mutex_lock(&encl->lock);
+ /* Should not be possible */ + if (WARN_ON(!(encl->secs.epc_page))) + goto out; + sgx_encl_ewb(epc_page, backing); encl_page->epc_page = NULL; encl->secs_child_cnt--;
On Mon Jul 17, 2023 at 6:17 PM UTC, Haitao Huang wrote:
Under heavy load, the SGX EPC reclaimers (current ksgxd or future EPC cgroup worker) may reclaim the SECS EPC page for an enclave and set encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in the SGX #PF handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and load it if it was reclaimed.
Fixes: 5a90d2c3f5ef8 ("x86/sgx: Support adding of pages to an initialized enclave") Cc: stable@vger.kernel.org Signed-off-by: Haitao Huang haitao.huang@linux.intel.com
arch/x86/kernel/cpu/sgx/encl.c | 25 ++++++++++++++++++++----- arch/x86/kernel/cpu/sgx/main.c | 4 ++++ 2 files changed, 24 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 2a0e90fe2abc..d39e502bb7b0 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -235,6 +235,16 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; } +static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl) +{
- struct sgx_epc_page *epc_page = encl->secs.epc_page;
- if (!epc_page) {
epc_page = sgx_encl_eldu(&encl->secs, NULL);
- }
remove curly braces
- return epc_page;
+}
static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, struct sgx_encl_page *entry) { @@ -248,11 +258,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, return entry; }
- if (!(encl->secs.epc_page)) {
epc_page = sgx_encl_eldu(&encl->secs, NULL);
if (IS_ERR(epc_page))
return ERR_CAST(epc_page);
- }
- epc_page = sgx_encl_load_secs(encl);
- if (IS_ERR(epc_page))
return ERR_CAST(epc_page);
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); if (IS_ERR(epc_page)) @@ -339,6 +347,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, mutex_lock(&encl->lock);
- epc_page = sgx_encl_load_secs(encl);
- if (IS_ERR(epc_page)) {
if (PTR_ERR(epc_page) == -EBUSY)
vmret = VM_FAULT_NOPAGE;
goto err_out_unlock;
- }
- epc_page = sgx_alloc_epc_page(encl_page, false); if (IS_ERR(epc_page)) { if (PTR_ERR(epc_page) == -EBUSY)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 166692f2d501..4662a364ce62 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -257,6 +257,10 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, mutex_lock(&encl->lock);
- /* Should not be possible */
- if (WARN_ON(!(encl->secs.epc_page)))
goto out;
- sgx_encl_ewb(epc_page, backing); encl_page->epc_page = NULL; encl->secs_child_cnt--;
-- 2.25.1
BR, Jarkko
On Mon Jul 17, 2023 at 6:53 PM UTC, Jarkko Sakkinen wrote:
On Mon Jul 17, 2023 at 6:17 PM UTC, Haitao Huang wrote:
+static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl) +{
- struct sgx_epc_page *epc_page = encl->secs.epc_page;
- if (!epc_page) {
epc_page = sgx_encl_eldu(&encl->secs, NULL);
- }
remove curly braces
And add an empty line before the return statement.
BR, Jarkko
Under heavy load, the SGX EPC reclaimers (current ksgxd or future EPC cgroup worker) may reclaim the SECS EPC page for an enclave and set encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in the SGX #PF handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and load it if it was reclaimed.
Fixes: 5a90d2c3f5ef8 ("x86/sgx: Support adding of pages to an initialized enclave") Cc: stable@vger.kernel.org Signed-off-by: Haitao Huang haitao.huang@linux.intel.com --- arch/x86/kernel/cpu/sgx/encl.c | 25 ++++++++++++++++++++----- arch/x86/kernel/cpu/sgx/main.c | 4 ++++ 2 files changed, 24 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 2a0e90fe2abc..2ab544da1664 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -235,6 +235,16 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; }
+static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl) +{ + struct sgx_epc_page *epc_page = encl->secs.epc_page; + + if (!epc_page) + epc_page = sgx_encl_eldu(&encl->secs, NULL); + + return epc_page; +} + static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, struct sgx_encl_page *entry) { @@ -248,11 +258,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, return entry; }
- if (!(encl->secs.epc_page)) { - epc_page = sgx_encl_eldu(&encl->secs, NULL); - if (IS_ERR(epc_page)) - return ERR_CAST(epc_page); - } + epc_page = sgx_encl_load_secs(encl); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page);
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); if (IS_ERR(epc_page)) @@ -339,6 +347,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
mutex_lock(&encl->lock);
+ epc_page = sgx_encl_load_secs(encl); + if (IS_ERR(epc_page)) { + if (PTR_ERR(epc_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_unlock; + } + epc_page = sgx_alloc_epc_page(encl_page, false); if (IS_ERR(epc_page)) { if (PTR_ERR(epc_page) == -EBUSY) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 166692f2d501..4662a364ce62 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -257,6 +257,10 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
mutex_lock(&encl->lock);
+ /* Should not be possible */ + if (WARN_ON(!(encl->secs.epc_page))) + goto out; + sgx_encl_ewb(epc_page, backing); encl_page->epc_page = NULL; encl->secs_child_cnt--;
On Mon, 2023-07-17 at 13:29 -0700, Haitao Huang wrote:
Under heavy load, the SGX EPC reclaimers (current ksgxd or future EPC cgroup worker) may reclaim the SECS EPC page for an enclave and set encl->secs.epc_page to NULL.
As a bug fix, I don't think you need to mention "future EPC cgroup worker".
But the SECS EPC page is used for EAUG in the SGX #PF handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and load it if it was
^ loading
reclaimed.
Fixes: 5a90d2c3f5ef8 ("x86/sgx: Support adding of pages to an initialized enclave") Cc: stable@vger.kernel.org Signed-off-by: Haitao Huang haitao.huang@linux.intel.com
arch/x86/kernel/cpu/sgx/encl.c | 25 ++++++++++++++++++++----- arch/x86/kernel/cpu/sgx/main.c | 4 ++++ 2 files changed, 24 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 2a0e90fe2abc..2ab544da1664 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -235,6 +235,16 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; } +static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl) +{
- struct sgx_epc_page *epc_page = encl->secs.epc_page;
- if (!epc_page)
epc_page = sgx_encl_eldu(&encl->secs, NULL);
- return epc_page;
+}
static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, struct sgx_encl_page *entry) { @@ -248,11 +258,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, return entry; }
- if (!(encl->secs.epc_page)) {
epc_page = sgx_encl_eldu(&encl->secs, NULL);
if (IS_ERR(epc_page))
return ERR_CAST(epc_page);
- }
- epc_page = sgx_encl_load_secs(encl);
- if (IS_ERR(epc_page))
return ERR_CAST(epc_page);
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); if (IS_ERR(epc_page)) @@ -339,6 +347,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, mutex_lock(&encl->lock);
- epc_page = sgx_encl_load_secs(encl);
- if (IS_ERR(epc_page)) {
if (PTR_ERR(epc_page) == -EBUSY)
vmret = VM_FAULT_NOPAGE;
goto err_out_unlock;
- }
- epc_page = sgx_alloc_epc_page(encl_page, false); if (IS_ERR(epc_page)) { if (PTR_ERR(epc_page) == -EBUSY)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 166692f2d501..4662a364ce62 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -257,6 +257,10 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, mutex_lock(&encl->lock);
- /* Should not be possible */
- if (WARN_ON(!(encl->secs.epc_page)))
goto out;
This shouldn't be a mandatory part of this fix, no?
If there's good reason to do, then probably you should describe the reason in the changelog.
sgx_encl_ewb(epc_page, backing); encl_page->epc_page = NULL; encl->secs_child_cnt--;
Under heavy load, the SGX EPC reclaimer (ksgxd) may reclaim the SECS EPC page for an enclave and set encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in the SGX #PF handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and loading it if it was reclaimed.
Fixes: 5a90d2c3f5ef ("x86/sgx: Support adding of pages to an initialized enclave") Cc: stable@vger.kernel.org Signed-off-by: Haitao Huang haitao.huang@linux.intel.com --- arch/x86/kernel/cpu/sgx/encl.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 2a0e90fe2abc..2ab544da1664 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -235,6 +235,16 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; }
+static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl) +{ + struct sgx_epc_page *epc_page = encl->secs.epc_page; + + if (!epc_page) + epc_page = sgx_encl_eldu(&encl->secs, NULL); + + return epc_page; +} + static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, struct sgx_encl_page *entry) { @@ -248,11 +258,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, return entry; }
- if (!(encl->secs.epc_page)) { - epc_page = sgx_encl_eldu(&encl->secs, NULL); - if (IS_ERR(epc_page)) - return ERR_CAST(epc_page); - } + epc_page = sgx_encl_load_secs(encl); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page);
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); if (IS_ERR(epc_page)) @@ -339,6 +347,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
mutex_lock(&encl->lock);
+ epc_page = sgx_encl_load_secs(encl); + if (IS_ERR(epc_page)) { + if (PTR_ERR(epc_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_unlock; + } + epc_page = sgx_alloc_epc_page(encl_page, false); if (IS_ERR(epc_page)) { if (PTR_ERR(epc_page) == -EBUSY)
On Mon, 2023-07-17 at 17:45 -0700, Haitao Huang wrote:
Under heavy load, the SGX EPC reclaimer (ksgxd) may reclaim the SECS EPC page for an enclave and set encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in the SGX #PF handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and loading it if it was reclaimed.
Nit:
Looks the sentence break of the second paragraph isn't consistent with the first paragraph, i.e., looks the first line is too long and the "was" should be broken to the second line.
And I think even for this simple patch, you are sending too frequently. The patch subject should contain the version number too so people can distinguish it from the previous one. Even better, please use "--base=auto" when generating the patch so people can know the base commit to apply to.
Fixes: 5a90d2c3f5ef ("x86/sgx: Support adding of pages to an initialized enclave") Cc: stable@vger.kernel.org Signed-off-by: Haitao Huang haitao.huang@linux.intel.com
arch/x86/kernel/cpu/sgx/encl.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 2a0e90fe2abc..2ab544da1664 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -235,6 +235,16 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; } +static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl) +{
- struct sgx_epc_page *epc_page = encl->secs.epc_page;
- if (!epc_page)
epc_page = sgx_encl_eldu(&encl->secs, NULL);
- return epc_page;
+}
static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, struct sgx_encl_page *entry) { @@ -248,11 +258,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, return entry; }
- if (!(encl->secs.epc_page)) {
epc_page = sgx_encl_eldu(&encl->secs, NULL);
if (IS_ERR(epc_page))
return ERR_CAST(epc_page);
- }
- epc_page = sgx_encl_load_secs(encl);
- if (IS_ERR(epc_page))
return ERR_CAST(epc_page);
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); if (IS_ERR(epc_page)) @@ -339,6 +347,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, mutex_lock(&encl->lock);
- epc_page = sgx_encl_load_secs(encl);
- if (IS_ERR(epc_page)) {
if (PTR_ERR(epc_page) == -EBUSY)
vmret = VM_FAULT_NOPAGE;
^ Nit: two spaces here (yeah you copied from the existing code, and sorry forgot to point out in the previous version).
goto err_out_unlock;
- }
- epc_page = sgx_alloc_epc_page(encl_page, false); if (IS_ERR(epc_page)) { if (PTR_ERR(epc_page) == -EBUSY)
Hi On Mon, 17 Jul 2023 20:39:31 -0500, Huang, Kai kai.huang@intel.com wrote:
On Mon, 2023-07-17 at 17:45 -0700, Haitao Huang wrote:
Under heavy load, the SGX EPC reclaimer (ksgxd) may reclaim the SECS EPC page for an enclave and set encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in the SGX #PF handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and loading it if it was reclaimed.
Nit:
Looks the sentence break of the second paragraph isn't consistent with the first paragraph, i.e., looks the first line is too long and the "was" should be broken to the second line.
Yeah, I think I forgot to reformat this line after revising.
And I think even for this simple patch, you are sending too frequently. The patch subject should contain the version number too so people can distinguish it from the previous one. Even better, please use "--base=auto" when generating the patch so people can know the base commit to apply to.
I'll send the next one as V2 and start a separate email thread.
Fixes: 5a90d2c3f5ef ("x86/sgx: Support adding of pages to an initialized enclave") Cc: stable@vger.kernel.org Signed-off-by: Haitao Huang haitao.huang@linux.intel.com
arch/x86/kernel/cpu/sgx/encl.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 2a0e90fe2abc..2ab544da1664 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -235,6 +235,16 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; }
+static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl) +{
- struct sgx_epc_page *epc_page = encl->secs.epc_page;
- if (!epc_page)
epc_page = sgx_encl_eldu(&encl->secs, NULL);
- return epc_page;
+}
static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, struct sgx_encl_page *entry) { @@ -248,11 +258,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, return entry; }
- if (!(encl->secs.epc_page)) {
epc_page = sgx_encl_eldu(&encl->secs, NULL);
if (IS_ERR(epc_page))
return ERR_CAST(epc_page);
- }
epc_page = sgx_encl_load_secs(encl);
if (IS_ERR(epc_page))
return ERR_CAST(epc_page);
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); if (IS_ERR(epc_page))
@@ -339,6 +347,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
mutex_lock(&encl->lock);
- epc_page = sgx_encl_load_secs(encl);
- if (IS_ERR(epc_page)) {
if (PTR_ERR(epc_page) == -EBUSY)
vmret = VM_FAULT_NOPAGE;
^
Nit: two spaces here (yeah you copied from the existing code, and sorry forgot to point out in the previous version).
Sure. will fix in V2.
Thanks Haitao
On 7/17/23 13:29, Haitao Huang wrote:
Under heavy load, the SGX EPC reclaimers (current ksgxd or future EPC cgroup worker) may reclaim the SECS EPC page for an enclave and set encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in the SGX #PF handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and load it if it was reclaimed.
It would be nice to see a _bit_ more theory of the bug in here.
What is an SECS page and why is it special in a reclaim context? Why is this so hard to hit? What led you to discover this issue now? What is EAUG?
On Tue, 18 Jul 2023 09:27:49 -0500, Dave Hansen dave.hansen@intel.com wrote:
On 7/17/23 13:29, Haitao Huang wrote:
Under heavy load, the SGX EPC reclaimers (current ksgxd or future EPC cgroup worker) may reclaim the SECS EPC page for an enclave and set encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in the SGX #PF handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and load it if it was reclaimed.
It would be nice to see a _bit_ more theory of the bug in here.
What is an SECS page and why is it special in a reclaim context? Why is this so hard to hit? What led you to discover this issue now? What is EAUG?
Let me know if this clarify things.
The SECS page holds global states of an enclave, and all reclaimable pages tracked by the SGX EPC reclaimer (ksgxd) are considered 'child' pages of the SECS page corresponding to that enclave. The reclaimer only reclaims the SECS page when all its children are reclaimed. That can happen on system under high EPC pressure where multiple large enclaves demanding much more EPC page than physically available. In a rare case, the reclaimer may reclaim all EPC pages of an enclave and it SECS page, setting encl->secs.epc_page to NULL, right before the #PF handler get the chance to handle a #PF for that enclave. In that case, if that #PF happens to require kernel to invoke the EAUG instruction to add a new EPC page for the enclave, then a NULL pointer results as current code does not check if encl->secs.epc_page is NULL before using it.
The bug is easier to reproduce with the EPC cgroup implementation when a low EPC limit is set for a group of enclave hosting processes. Without the EPC cgroup it's hard to trigger the reclaimer to reclaim all child pages of an SECS page. And it'd also require a machine configured with large RAM relative to EPC so no OOM killer triggered before this happens.
Thanks Haitao
On 7/18/23 11:11, Haitao Huang wrote:
On Tue, 18 Jul 2023 09:27:49 -0500, Dave Hansen dave.hansen@intel.com wrote:
On 7/17/23 13:29, Haitao Huang wrote:
Under heavy load, the SGX EPC reclaimers (current ksgxd or future EPC cgroup worker) may reclaim the SECS EPC page for an enclave and set encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in the SGX #PF handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and load it if it was reclaimed.
It would be nice to see a _bit_ more theory of the bug in here.
What is an SECS page and why is it special in a reclaim context? Why is this so hard to hit? What led you to discover this issue now? What is EAUG?
Let me know if this clarify things.
The SECS page holds global states of an enclave, and all reclaimable pages tracked by the SGX EPC reclaimer (ksgxd) are considered 'child' pages of the SECS page corresponding to that enclave. The reclaimer only reclaims the SECS page when all its children are reclaimed. That can happen on system under high EPC pressure where multiple large enclaves demanding much more EPC page than physically available. In a rare case, the reclaimer may reclaim all EPC pages of an enclave and it SECS page, setting encl->secs.epc_page to NULL, right before the #PF handler get the chance to handle a #PF for that enclave. In that case, if that #PF happens to require kernel to invoke the EAUG instruction to add a new EPC page for the enclave, then a NULL pointer results as current code does not check if encl->secs.epc_page is NULL before using it.
Better, but that's *REALLY* verbose and really imprecise. It doesn't _require_ "high pressure". It could literally happen at very, very low pressures over a long period of time. Please stick to the facts and it'll actually simplify the description.
The SECS page holds global enclave metadata. It can only be reclaimed when there are no other enclave pages remaining. At that point, virtually nothing can be done with the enclave until the SECS page is paged back in.
An enclave can not run nor generate page faults without without a resident SECS page. But it is still possible for a #PF for a non-SECS page to race with paging out the SECS page.
Hitting this bug requires triggering that race.
The bug is easier to reproduce with the EPC cgroup implementation when a low EPC limit is set for a group of enclave hosting processes. Without the EPC cgroup it's hard to trigger the reclaimer to reclaim all child pages of an SECS page. And it'd also require a machine configured with large RAM relative to EPC so no OOM killer triggered before this happens.
Isn't this the _normal_ case? EPC is relatively tiny compared to RAM normally.
On Tue, 18 Jul 2023 13:53:47 -0500, Dave Hansen dave.hansen@intel.com wrote:
On 7/18/23 11:11, Haitao Huang wrote:
On Tue, 18 Jul 2023 09:27:49 -0500, Dave Hansen dave.hansen@intel.com wrote:
On 7/17/23 13:29, Haitao Huang wrote:
Under heavy load, the SGX EPC reclaimers (current ksgxd or future EPC cgroup worker) may reclaim the SECS EPC page for an enclave and set encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in the SGX #PF handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and load it if it was reclaimed.
It would be nice to see a _bit_ more theory of the bug in here.
What is an SECS page and why is it special in a reclaim context? Why is this so hard to hit? What led you to discover this issue now? What is EAUG?
Let me know if this clarify things.
The SECS page holds global states of an enclave, and all reclaimable pages tracked by the SGX EPC reclaimer (ksgxd) are considered 'child' pages of the SECS page corresponding to that enclave. The reclaimer only reclaims the SECS page when all its children are reclaimed. That can happen on system under high EPC pressure where multiple large enclaves demanding much more EPC page than physically available. In a rare case, the reclaimer may reclaim all EPC pages of an enclave and it SECS page, setting encl->secs.epc_page to NULL, right before the #PF handler get the chance to handle a #PF for that enclave. In that case, if that #PF happens to require kernel to invoke the EAUG instruction to add a new EPC page for the enclave, then a NULL pointer results as current code does not check if encl->secs.epc_page is NULL before using it.
Better, but that's *REALLY* verbose and really imprecise. It doesn't _require_ "high pressure". It could literally happen at very, very low pressures over a long period of time.
I don't quite get this part. In low pressure scenario, the reclaimer never need to reclaim all children of SECs. So it would not reclaim SECS no matter how long you run?
Ignore VA pages for now. Say for a system with 10 page EPC, 2 enclaves, each needs 5 pages non-SECS so total demand would be 12 pages. The ksgxd would only need to swap out 2 pages at the most to get one enclave fully loaded with 6 pages, and the other one with 4 pages. There is no chance the ksgxd would swap any one of two SECS pages.
We would need at least one enclave A of 10 pages total to squeeze out the other B completely. For that to happen B pretty much has to be sleeping all the time so the LRU based reclaiming would hit it but not pages of A. So no chance to hit #PF on pages of B still.
So some minimal pressure is needed to ensure SECS swapped. The higher the pressure the higher the chance to hit #PF while SECS is swapped.
Please stick to the facts and it'll actually simplify the description.
The SECS page holds global enclave metadata. It can only be reclaimed when there are no other enclave pages remaining. At that point, virtually nothing can be done with the enclave until the SECS page is paged back in.
An enclave can not run nor generate page faults without without a resident SECS page. But it is still possible for a #PF for a non-SECS page to race with paging out the SECS page.
Hitting this bug requires triggering that race.
Thanks for the suggestion. I agree on those.
The bug is easier to reproduce with the EPC cgroup implementation when a low EPC limit is set for a group of enclave hosting processes. Without the EPC cgroup it's hard to trigger the reclaimer to reclaim all child pages of an SECS page. And it'd also require a machine configured with large RAM relative to EPC so no OOM killer triggered before this happens.
Isn't this the _normal_ case? EPC is relatively tiny compared to RAM normally.
I don't know what is perceived as normal here. But for this to happen, the swapping space should be able to hold content much bigger than EPC, if my reasoning above for high pressure required is correct. I tried 70 concurrent sgx selftests instances on a server with 4G EPC, 512G RAM and no disk swapping, and encountered OOM first. Those selftests instance each demand about 8G EPC.
Thanks Haitao
On 7/18/23 13:32, Haitao Huang wrote: ...
Ignore VA pages for now. Say for a system with 10 page EPC, 2 enclaves, each needs 5 pages non-SECS so total demand would be 12 pages. The ksgxd would only need to swap out 2 pages at the most to get one enclave fully loaded with 6 pages, and the other one with 4 pages. There is no chance the ksgxd would swap any one of two SECS pages.
We would need at least one enclave A of 10 pages total to squeeze out the other B completely. For that to happen B pretty much has to be sleeping all the time so the LRU based reclaiming would hit it but not pages of A. So no chance to hit #PF on pages of B still.
So some minimal pressure is needed to ensure SECS swapped. The higher the pressure the higher the chance to hit #PF while SECS is swapped.
What would the second-to-last non-SECS page be? A thread control page? VA page?
As long as *that* page can generate a page fault, then you only need two pages for this scenario to happen:
1. Reclaimer takes encl->lock 2. #PF occurs from another thread, blocks on encl->lock 3. SECS is reclaimed 4. encl->lock released 5. #PF sees reclaimed SECS
On Tue, 18 Jul 2023 15:56:27 -0500, Dave Hansen dave.hansen@intel.com wrote:
On 7/18/23 13:32, Haitao Huang wrote: ...
Ignore VA pages for now. Say for a system with 10 page EPC, 2 enclaves, each needs 5 pages non-SECS so total demand would be 12 pages. The ksgxd would only need to swap out 2 pages at the most to get one enclave fully loaded with 6 pages, and the other one with 4 pages. There is no chance the ksgxd would swap any one of two SECS pages.
We would need at least one enclave A of 10 pages total to squeeze out the other B completely. For that to happen B pretty much has to be sleeping all the time so the LRU based reclaiming would hit it but not pages of A. So no chance to hit #PF on pages of B still.
So some minimal pressure is needed to ensure SECS swapped. The higher the pressure the higher the chance to hit #PF while SECS is swapped.
What would the second-to-last non-SECS page be? A thread control page? VA page?
As long as *that* page can generate a page fault, then you only need two pages for this scenario to happen:
- Reclaimer takes encl->lock
- #PF occurs from another thread, blocks on encl->lock
- SECS is reclaimed
- encl->lock released
- #PF sees reclaimed SECS
I agree this is the race. But for this to happen, that is at #1 you have only one non-SECS page left so #3 can happen. That means it is already high pressure because reclaimer has swapped all other non-SECS. In my example of two enclaves of 5 non-EPC pages. #3 won't happen because you don't reach #1 with only one non-SECS left.
Thanks Haitao
On 7/18/23 14:22, Haitao Huang wrote:
I agree this is the race. But for this to happen, that is at #1 you have only one non-SECS page left so #3 can happen. That means it is already high pressure
I think our definitions of memory pressure differ.
Pressure is raised by allocations and dropped by reclaim. This raise->drop cycle is (or should be) time-limited and can't take forever. The reclaim either works in a short period of time or something dies. If allocations are transient, pressure is transient.
Let's say a pressure blip (a one-time event) comes along and pages out that second-to-last page. That's pretty low pressure. Years pass. The enclave never gets run. Nothing pages the second-to-last page back in. A second pressure blip comes along. The SECS page gets paged out.
That's two pressure blips in, say 10 years. Is that "high pressure"?
because reclaimer has swapped all other non-SECS. In my example of two enclaves of 5 non-EPC pages. #3 won't happen because you don't reach #1 with only one non-SECS left.
On Tue, 18 Jul 2023 16:36:53 -0500, Dave Hansen dave.hansen@intel.com wrote:
On 7/18/23 14:22, Haitao Huang wrote:
I agree this is the race. But for this to happen, that is at #1 you have only one non-SECS page left so #3 can happen. That means it is already high pressure
I think our definitions of memory pressure differ.
Pressure is raised by allocations and dropped by reclaim. This raise->drop cycle is (or should be) time-limited and can't take forever. The reclaim either works in a short period of time or something dies. If allocations are transient, pressure is transient.
Let's say a pressure blip (a one-time event) comes along and pages out that second-to-last page. That's pretty low pressure. Years pass. The enclave never gets run. Nothing pages the second-to-last page back in. A second pressure blip comes along. The SECS page gets paged out.
That's two pressure blips in, say 10 years. Is that "high pressure"?
Okay, that explains. I would consider it still triggered by high pressure blips :-)
But I agree we can drop the mentioning of pressure altogether and just state the race so no confusions. Thanks Haitao
On 7/18/23 14:57, Haitao Huang wrote:
Okay, that explains. I would consider it still triggered by high pressure blips 😄
I'm talking about a "blip" being a single allocation. It's *LITERALLY* the smallest (aka. lowest) possible quantum of memory pressure.
So go ahead and try to write the changelog without "high" or "low". But, sheesh, if you and are somehow using "high" and "low" to describe the exact same condition, I think that's a rather large communication or understanding problem somewhere. It doesn't bode well for this simple patch.
On Tue, 18 Jul 2023 17:05:59 -0500, Dave Hansen dave.hansen@intel.com wrote:
On 7/18/23 14:57, Haitao Huang wrote:
Okay, that explains. I would consider it still triggered by high pressure blips 😄
I'm talking about a "blip" being a single allocation. It's *LITERALLY* the smallest (aka. lowest) possible quantum of memory pressure.
So go ahead and try to write the changelog without "high" or "low". But, sheesh, if you and are somehow using "high" and "low" to describe the exact same condition, I think that's a rather large communication or understanding problem somewhere. It doesn't bode well for this simple patch.
Sorry that did not come across right. I'll send v3 (please skip v2) with the comments added as you suggested.
Thanks a lot for your time and the review. BR Haitao
On Tue, 2023-07-18 at 16:57 -0500, Haitao Huang wrote:
On Tue, 18 Jul 2023 16:36:53 -0500, Dave Hansen dave.hansen@intel.com wrote:
On 7/18/23 14:22, Haitao Huang wrote:
I agree this is the race. But for this to happen, that is at #1 you have only one non-SECS page left so #3 can happen. That means it is already high pressure
I think our definitions of memory pressure differ.
Pressure is raised by allocations and dropped by reclaim. This raise->drop cycle is (or should be) time-limited and can't take forever. The reclaim either works in a short period of time or something dies. If allocations are transient, pressure is transient.
Let's say a pressure blip (a one-time event) comes along and pages out that second-to-last page. That's pretty low pressure. Years pass. The enclave never gets run. Nothing pages the second-to-last page back in. A second pressure blip comes along. The SECS page gets paged out.
That's two pressure blips in, say 10 years. Is that "high pressure"?
Okay, that explains. I would consider it still triggered by high pressure blips :-)
But I agree we can drop the mentioning of pressure altogether and just state the race so no confusions.
Also perhaps the patch title is too vague. Adding more information doesn't hurt I think, e.g., mentioning it is a fix for NULL pointer dereference in the EAUG flow.
On 7/18/23 17:14, Huang, Kai wrote:
Also perhaps the patch title is too vague. Adding more information doesn't hurt I think, e.g., mentioning it is a fix for NULL pointer dereference in the EAUG flow.
Yeah, let's say something like:
x86/sgx: Resolve SECS reclaim vs. page fault race
please
Hi Dave and Kai On Tue, 18 Jul 2023 19:21:54 -0500, Dave Hansen dave.hansen@intel.com wrote:
On 7/18/23 17:14, Huang, Kai wrote:
Also perhaps the patch title is too vague. Adding more information doesn't hurt I think, e.g., mentioning it is a fix for NULL pointer dereference in the EAUG flow.
Yeah, let's say something like:
x86/sgx: Resolve SECS reclaim vs. page fault race
The patch is not to resolve SECS vs #PF race though the race is a necessary condition to cause the NULL pointer. The same condition does not cause NULL pointer in the ELDU path of #PF, only in EAUG path of #PF.
And the issue really is the NULL pointer not checked and fix was to reuse the same code to reload SECS in ELDU code path for EAUG code path
How about this:
x86/sgx: Reload reclaimed SECS for EAUG on #PF
or
x86/sgx: Fix a NULL pointer to SECS used for EAUG on #PF
BR Haitao
On Wed, 2023-07-19 at 08:53 -0500, Haitao Huang wrote:
Hi Dave and Kai On Tue, 18 Jul 2023 19:21:54 -0500, Dave Hansen dave.hansen@intel.com wrote:
On 7/18/23 17:14, Huang, Kai wrote:
Also perhaps the patch title is too vague. Adding more information doesn't hurt I think, e.g., mentioning it is a fix for NULL pointer dereference in the EAUG flow.
Yeah, let's say something like:
x86/sgx: Resolve SECS reclaim vs. page fault race
The patch is not to resolve SECS vs #PF race though the race is a necessary condition to cause the NULL pointer. The same condition does not cause NULL pointer in the ELDU path of #PF, only in EAUG path of #PF.
And the issue really is the NULL pointer not checked and fix was to reuse the same code to reload SECS in ELDU code path for EAUG code path
How about this:
x86/sgx: Reload reclaimed SECS for EAUG on #PF
or
x86/sgx: Fix a NULL pointer to SECS used for EAUG on #PF
Perhaps you can add "EAUG" part to what Dave suggested?
x86/sgx: Resolves SECS reclaim vs. page fault race on EAUG
(assuming Dave is fine with this :-))
On Fri, 2023-07-21 at 00:32 +0000, Huang, Kai wrote:
On Wed, 2023-07-19 at 08:53 -0500, Haitao Huang wrote:
Hi Dave and Kai On Tue, 18 Jul 2023 19:21:54 -0500, Dave Hansen dave.hansen@intel.com wrote:
On 7/18/23 17:14, Huang, Kai wrote:
Also perhaps the patch title is too vague. Adding more information doesn't hurt I think, e.g., mentioning it is a fix for NULL pointer dereference in the EAUG flow.
Yeah, let's say something like:
x86/sgx: Resolve SECS reclaim vs. page fault race
The patch is not to resolve SECS vs #PF race though the race is a necessary condition to cause the NULL pointer. The same condition does not cause NULL pointer in the ELDU path of #PF, only in EAUG path of #PF.
And the issue really is the NULL pointer not checked and fix was to reuse the same code to reload SECS in ELDU code path for EAUG code path
How about this:
x86/sgx: Reload reclaimed SECS for EAUG on #PF
or
x86/sgx: Fix a NULL pointer to SECS used for EAUG on #PF
Perhaps you can add "EAUG" part to what Dave suggested?
x86/sgx: Resolves SECS reclaim vs. page fault race on EAUG
(assuming Dave is fine with this :-))
Btw, do you have a real call trace? If you have, I think you can add that to the changelog too because that catches people's eye immediately.
On Thu, 20 Jul 2023 19:52:22 -0500, Huang, Kai kai.huang@intel.com wrote:
On Fri, 2023-07-21 at 00:32 +0000, Huang, Kai wrote:
On Wed, 2023-07-19 at 08:53 -0500, Haitao Huang wrote:
Hi Dave and Kai On Tue, 18 Jul 2023 19:21:54 -0500, Dave Hansen
wrote:
On 7/18/23 17:14, Huang, Kai wrote:
Also perhaps the patch title is too vague. Adding more
information
doesn't hurt I think, e.g., mentioning it is a fix for NULL pointer
dereference in
the EAUG flow.
Yeah, let's say something like:
x86/sgx: Resolve SECS reclaim vs. page fault race
The patch is not to resolve SECS vs #PF race though the race is a necessary condition to cause the NULL pointer. The same condition
does not
cause NULL pointer in the ELDU path of #PF, only in EAUG path of #PF.
And the issue really is the NULL pointer not checked and fix was to
reuse
the same code to reload SECS in ELDU code path for EAUG code path
How about this:
x86/sgx: Reload reclaimed SECS for EAUG on #PF
or
x86/sgx: Fix a NULL pointer to SECS used for EAUG on #PF
Perhaps you can add "EAUG" part to what Dave suggested?
x86/sgx: Resolves SECS reclaim vs. page fault race on EAUG
(assuming Dave is fine with this :-))
Sure, I can use this too.
Btw, do you have a real call trace? If you have, I think you can add that to the changelog too because that catches people's eye immediately.
Previously I was not able to reproduce without SGX cgroup patches. Now I managed to get a trace with a QEMU setup with small EPC (8M), large RAM (128G) and 128 vCPUs:
[ 1682.914263] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 1682.922966] #PF: supervisor read access in kernel mode [ 1682.929115] #PF: error_code(0x0000) - not-present page [ 1682.935264] PGD 0 P4D 0 [ 1682.938383] Oops: 0000 [#1] PREEMPT SMP NOPTI [ 1682.943620] CPU: 43 PID: 2681 Comm: test_sgx Not tainted 6.3.0-rc4sgxcet #12 [ 1682.951989] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014 [ 1682.965504] RIP: 0010:sgx_encl_eaug_page+0xc7/0x210 [ 1682.971359] Code: 25 49 8b 96 98 04 00 00 48 8d 40 48 48 89 42 08 48 89 56 48 49 8d 96 98 04 00 00 48 89 56 50 49 89 86 98 04 00 00 49 8b 46 60 <8b> 10 48 c1 e2 05 488 [ 1682.993330] RSP: 0000:ffffb2e64725bc00 EFLAGS: 00010246 [ 1682.999585] RAX: 0000000000000000 RBX: ffff987e5abac428 RCX: 0000000000000000 [ 1683.008059] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff987e61aee000 [ 1683.016533] RBP: ffffb2e64725bcf0 R08: 0000000000000000 R09: ffffb2e64725bb58 [ 1683.025008] R10: 0000000000000000 R11: 00007f3f5c418fff R12: ffff987e61aee020 [ 1683.033479] R13: ffff987e505bc080 R14: ffff987e61aee000 R15: ffffb2e6420fcb20 [ 1683.041949] FS: 00007f3f5cb48740(0000) GS:ffff989cfe8c0000(0000) knlGS:0000000000000000 [ 1683.051540] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1683.058478] CR2: 0000000000000000 CR3: 0000000115896002 CR4: 0000000000770ee0 [ 1683.067018] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1683.075539] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1683.084085] PKRU: 55555554 [ 1683.087465] Call Trace: [ 1683.090547] <TASK> [ 1683.093220] ? __kmem_cache_alloc_node+0x16a/0x440 [ 1683.099034] ? xa_load+0x6e/0xa0 [ 1683.103038] sgx_vma_fault+0x119/0x230 [ 1683.107630] __do_fault+0x36/0x140 [ 1683.111828] do_fault+0x12f/0x400 [ 1683.115928] __handle_mm_fault+0x728/0x1110 [ 1683.121050] handle_mm_fault+0x105/0x310 [ 1683.125850] do_user_addr_fault+0x1ee/0x750 [ 1683.130957] ? __this_cpu_preempt_check+0x13/0x20 [ 1683.136667] exc_page_fault+0x76/0x180 [ 1683.141265] asm_exc_page_fault+0x27/0x30 [ 1683.146160] RIP: 0033:0x7ffc6496beea [ 1683.150563] Code: 43 48 8b 4d 10 48 c7 c3 28 00 00 00 48 83 3c 19 00 75 31 48 83 c3 08 48 81 fb 00 01 00 00 75 ec 48 8b 19 48 8d 0d 00 00 00 00 <0f> 01 d7 48 8b 5d 101 [ 1683.172773] RSP: 002b:00007ffc64935b68 EFLAGS: 00000202 [ 1683.179138] RAX: 0000000000000003 RBX: 00007f3800000000 RCX: 00007ffc6496beea [ 1683.187675] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [ 1683.196200] RBP: 00007ffc64935b70 R08: 0000000000000000 R09: 0000000000000000 [ 1683.204724] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 1683.213310] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 1683.221850] </TASK> [ 1683.224636] Modules linked in: isofs intel_rapl_msr intel_rapl_common binfmt_misc kvm_intel nls_iso8859_1 kvm ppdev irqbypass input_leds parport_pc joydev parport rapi [ 1683.291173] CR2: 0000000000000000 [ 1683.295271] ---[ end trace 0000000000000000 ]---
I'll add this to the commit as well.
Thanks Haitao
Under heavy load, the SGX EPC reclaimer (ksgxd) may reclaim the SECS EPC page for an enclave and set encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in the SGX page fault handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and loading it if it was reclaimed.
The SECS page holds global enclave metadata. It can only be reclaimed when there are no other enclave pages remaining. At that point, virtually nothing can be done with the enclave until the SECS page is paged back in.
An enclave can not run nor generate page faults without a resident SECS page. But it is still possible for a #PF for a non-SECS page to race with paging out the SECS page: when the last resident non-SECS page A triggers a #PF in a non-resident page B, and then page A and the SECS both are paged out before the #PF on B is handled.
Hitting this bug requires that race triggered with a #PF for EAUG. Following is a trace when it happens.
[ 1682.914263] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 1682.922966] #PF: supervisor read access in kernel mode [ 1682.929115] #PF: error_code(0x0000) - not-present page [ 1682.935264] PGD 0 P4D 0 [ 1682.938383] Oops: 0000 [#1] PREEMPT SMP NOPTI [ 1682.943620] CPU: 43 PID: 2681 Comm: test_sgx Not tainted 6.3.0-rc4sgxcet #12 [ 1682.951989] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014 [ 1682.965504] RIP: 0010:sgx_encl_eaug_page+0xc7/0x210 [ 1682.971359] Code: 25 49 8b 96 98 04 00 00 48 8d 40 48 48 89 42 08 48 89 56 48 49 8d 96 98 04 00 00 48 89 56 50 49 89 86 98 04 00 00 49 8b 46 60 <8b> 10 48 c1 e2 05 488 [ 1682.993330] RSP: 0000:ffffb2e64725bc00 EFLAGS: 00010246 [ 1682.999585] RAX: 0000000000000000 RBX: ffff987e5abac428 RCX: 0000000000000000 [ 1683.008059] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff987e61aee000 [ 1683.016533] RBP: ffffb2e64725bcf0 R08: 0000000000000000 R09: ffffb2e64725bb58 [ 1683.025008] R10: 0000000000000000 R11: 00007f3f5c418fff R12: ffff987e61aee020 [ 1683.033479] R13: ffff987e505bc080 R14: ffff987e61aee000 R15: ffffb2e6420fcb20 [ 1683.041949] FS: 00007f3f5cb48740(0000) GS:ffff989cfe8c0000(0000) knlGS:0000000000000000 [ 1683.051540] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1683.058478] CR2: 0000000000000000 CR3: 0000000115896002 CR4: 0000000000770ee0 [ 1683.067018] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1683.075539] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1683.084085] PKRU: 55555554 [ 1683.087465] Call Trace: [ 1683.090547] <TASK> [ 1683.093220] ? __kmem_cache_alloc_node+0x16a/0x440 [ 1683.099034] ? xa_load+0x6e/0xa0 [ 1683.103038] sgx_vma_fault+0x119/0x230 [ 1683.107630] __do_fault+0x36/0x140 [ 1683.111828] do_fault+0x12f/0x400 [ 1683.115928] __handle_mm_fault+0x728/0x1110 [ 1683.121050] handle_mm_fault+0x105/0x310 [ 1683.125850] do_user_addr_fault+0x1ee/0x750 [ 1683.130957] ? __this_cpu_preempt_check+0x13/0x20 [ 1683.136667] exc_page_fault+0x76/0x180 [ 1683.141265] asm_exc_page_fault+0x27/0x30 [ 1683.146160] RIP: 0033:0x7ffc6496beea [ 1683.150563] Code: 43 48 8b 4d 10 48 c7 c3 28 00 00 00 48 83 3c 19 00 75 31 48 83 c3 08 48 81 fb 00 01 00 00 75 ec 48 8b 19 48 8d 0d 00 00 00 00 <0f> 01 d7 48 8b 5d 101 [ 1683.172773] RSP: 002b:00007ffc64935b68 EFLAGS: 00000202 [ 1683.179138] RAX: 0000000000000003 RBX: 00007f3800000000 RCX: 00007ffc6496beea [ 1683.187675] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [ 1683.196200] RBP: 00007ffc64935b70 R08: 0000000000000000 R09: 0000000000000000 [ 1683.204724] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 1683.213310] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 1683.221850] </TASK> [ 1683.224636] Modules linked in: isofs intel_rapl_msr intel_rapl_common binfmt_misc kvm_intel nls_iso8859_1 kvm ppdev irqbypass input_leds parport_pc joydev parport rapi [ 1683.291173] CR2: 0000000000000000 [ 1683.295271] ---[ end trace 0000000000000000 ]---
Fixes: 5a90d2c3f5ef ("x86/sgx: Support adding of pages to an initialized enclave") Cc: stable@vger.kernel.org # v6.0+ Signed-off-by: Haitao Huang haitao.huang@linux.intel.com Reviewed-by: Jarkko Sakkinen jarkko@kernel.org --- v4: - Refined the title (Kai, Dave) - Added a trace to commit meesage (Kai) - Added a few details for the race.
v3: - Added comments on sgx_encl_load_secs(). (Dave) - Added theory of the race condition to hit the bug. (Dave) - Added Reviewed-by, and applicable stable release. (Jarkko)
v2: - Fixes for style, commit message (Jarkko, Kai) - Removed unneeded WARN_ON (Kai) --- arch/x86/kernel/cpu/sgx/encl.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 91fa70e51004..279148e72459 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -235,6 +235,21 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; }
+/* + * Ensure the SECS page is not swapped out. Must be called with encl->lock + * to protect the enclave states including SECS and ensure the SECS page is + * not swapped out again while being used. + */ +static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl) +{ + struct sgx_epc_page *epc_page = encl->secs.epc_page; + + if (!epc_page) + epc_page = sgx_encl_eldu(&encl->secs, NULL); + + return epc_page; +} + static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, struct sgx_encl_page *entry) { @@ -248,11 +263,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, return entry; }
- if (!(encl->secs.epc_page)) { - epc_page = sgx_encl_eldu(&encl->secs, NULL); - if (IS_ERR(epc_page)) - return ERR_CAST(epc_page); - } + epc_page = sgx_encl_load_secs(encl); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page);
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); if (IS_ERR(epc_page)) @@ -339,6 +352,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
mutex_lock(&encl->lock);
+ epc_page = sgx_encl_load_secs(encl); + if (IS_ERR(epc_page)) { + if (PTR_ERR(epc_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_unlock; + } + epc_page = sgx_alloc_epc_page(encl_page, false); if (IS_ERR(epc_page)) { if (PTR_ERR(epc_page) == -EBUSY)
base-commit: 6eaae198076080886b9e7d57f4ae06fa782f90ef
Hi Haitao,
On 7/26/2023 1:50 PM, Haitao Huang wrote:
Under heavy load, the SGX EPC reclaimer (ksgxd) may reclaim the SECS EPC page for an enclave and set encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in the SGX page fault handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and loading it if it was reclaimed.
The SECS page holds global enclave metadata. It can only be reclaimed when there are no other enclave pages remaining. At that point, virtually nothing can be done with the enclave until the SECS page is paged back in.
An enclave can not run nor generate page faults without a resident SECS page. But it is still possible for a #PF for a non-SECS page to race with paging out the SECS page: when the last resident non-SECS page A triggers a #PF in a non-resident page B, and then page A and the SECS both are paged out before the #PF on B is handled.
Hitting this bug requires that race triggered with a #PF for EAUG. Following is a trace when it happens.
Thank you very much for finding this issue as well as providing a fix.
[ 1682.914263] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 1682.922966] #PF: supervisor read access in kernel mode [ 1682.929115] #PF: error_code(0x0000) - not-present page [ 1682.935264] PGD 0 P4D 0 [ 1682.938383] Oops: 0000 [#1] PREEMPT SMP NOPTI [ 1682.943620] CPU: 43 PID: 2681 Comm: test_sgx Not tainted 6.3.0-rc4sgxcet #12 [ 1682.951989] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014 [ 1682.965504] RIP: 0010:sgx_encl_eaug_page+0xc7/0x210 [ 1682.971359] Code: 25 49 8b 96 98 04 00 00 48 8d 40 48 48 89 42 08 48 89 56 48 49 8d 96 98 04 00 00 48 89 56 50 49 89 86 98 04 00 00 49 8b 46 60 <8b> 10 48 c1 e2 05 488 [ 1682.993330] RSP: 0000:ffffb2e64725bc00 EFLAGS: 00010246 [ 1682.999585] RAX: 0000000000000000 RBX: ffff987e5abac428 RCX: 0000000000000000 [ 1683.008059] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff987e61aee000 [ 1683.016533] RBP: ffffb2e64725bcf0 R08: 0000000000000000 R09: ffffb2e64725bb58 [ 1683.025008] R10: 0000000000000000 R11: 00007f3f5c418fff R12: ffff987e61aee020 [ 1683.033479] R13: ffff987e505bc080 R14: ffff987e61aee000 R15: ffffb2e6420fcb20 [ 1683.041949] FS: 00007f3f5cb48740(0000) GS:ffff989cfe8c0000(0000) knlGS:0000000000000000 [ 1683.051540] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1683.058478] CR2: 0000000000000000 CR3: 0000000115896002 CR4: 0000000000770ee0 [ 1683.067018] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1683.075539] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1683.084085] PKRU: 55555554 [ 1683.087465] Call Trace: [ 1683.090547] <TASK> [ 1683.093220] ? __kmem_cache_alloc_node+0x16a/0x440 [ 1683.099034] ? xa_load+0x6e/0xa0 [ 1683.103038] sgx_vma_fault+0x119/0x230 [ 1683.107630] __do_fault+0x36/0x140 [ 1683.111828] do_fault+0x12f/0x400 [ 1683.115928] __handle_mm_fault+0x728/0x1110 [ 1683.121050] handle_mm_fault+0x105/0x310 [ 1683.125850] do_user_addr_fault+0x1ee/0x750 [ 1683.130957] ? __this_cpu_preempt_check+0x13/0x20 [ 1683.136667] exc_page_fault+0x76/0x180 [ 1683.141265] asm_exc_page_fault+0x27/0x30 [ 1683.146160] RIP: 0033:0x7ffc6496beea [ 1683.150563] Code: 43 48 8b 4d 10 48 c7 c3 28 00 00 00 48 83 3c 19 00 75 31 48 83 c3 08 48 81 fb 00 01 00 00 75 ec 48 8b 19 48 8d 0d 00 00 00 00 <0f> 01 d7 48 8b 5d 101 [ 1683.172773] RSP: 002b:00007ffc64935b68 EFLAGS: 00000202 [ 1683.179138] RAX: 0000000000000003 RBX: 00007f3800000000 RCX: 00007ffc6496beea [ 1683.187675] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [ 1683.196200] RBP: 00007ffc64935b70 R08: 0000000000000000 R09: 0000000000000000 [ 1683.204724] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 1683.213310] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 1683.221850] </TASK> [ 1683.224636] Modules linked in: isofs intel_rapl_msr intel_rapl_common binfmt_misc kvm_intel nls_iso8859_1 kvm ppdev irqbypass input_leds parport_pc joydev parport rapi [ 1683.291173] CR2: 0000000000000000 [ 1683.295271] ---[ end trace 0000000000000000 ]---
Could you please trim this trace? There is more detail in Documentation/process/submitting-patches.rst (search for "Backtraces in commit messages"), but the ideal trace should have just the information needed to describe the issue (no timestamps, register dumps, etc.).
With that addressed, feel free to add:
Acked-by: Reinette Chatre reinette.chatre@intel.com
Reinette
Under heavy load, the SGX EPC reclaimer (ksgxd) may reclaim the SECS EPC page for an enclave and set encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in the SGX page fault handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and loading it if it was reclaimed.
The SECS page holds global enclave metadata. It can only be reclaimed when there are no other enclave pages remaining. At that point, virtually nothing can be done with the enclave until the SECS page is paged back in.
An enclave can not run nor generate page faults without a resident SECS page. But it is still possible for a #PF for a non-SECS page to race with paging out the SECS page: when the last resident non-SECS page A triggers a #PF in a non-resident page B, and then page A and the SECS both are paged out before the #PF on B is handled.
Hitting this bug requires that race triggered with a #PF for EAUG. Following is a trace when it happens.
BUG: kernel NULL pointer dereference, address: 0000000000000000 RIP: 0010:sgx_encl_eaug_page+0xc7/0x210 Call Trace: ? __kmem_cache_alloc_node+0x16a/0x440 ? xa_load+0x6e/0xa0 sgx_vma_fault+0x119/0x230 __do_fault+0x36/0x140 do_fault+0x12f/0x400 __handle_mm_fault+0x728/0x1110 handle_mm_fault+0x105/0x310 do_user_addr_fault+0x1ee/0x750 ? __this_cpu_preempt_check+0x13/0x20 exc_page_fault+0x76/0x180 asm_exc_page_fault+0x27/0x30
Fixes: 5a90d2c3f5ef ("x86/sgx: Support adding of pages to an initialized enclave") Cc: stable@vger.kernel.org # v6.0+ Signed-off-by: Haitao Huang haitao.huang@linux.intel.com Reviewed-by: Jarkko Sakkinen jarkko@kernel.org Acked-by: Reinette Chatre reinette.chatre@intel.com --- v5: - Trimmed trace and added Acked-by (Reinette)
v4: - Refined the title (Kai, Dave) - Added a trace to commit meesage (Kai) - Added a few details for the race.
v3: - Added comments on sgx_encl_load_secs(). (Dave) - Added theory of the race condition to hit the bug. (Dave) - Added Reviewed-by, and applicable stable release. (Jarkko)
v2: - Fixes for style, commit message (Jarkko, Kai) - Removed unneeded WARN_ON (Kai) --- arch/x86/kernel/cpu/sgx/encl.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 91fa70e51004..279148e72459 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -235,6 +235,21 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; }
+/* + * Ensure the SECS page is not swapped out. Must be called with encl->lock + * to protect the enclave states including SECS and ensure the SECS page is + * not swapped out again while being used. + */ +static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl) +{ + struct sgx_epc_page *epc_page = encl->secs.epc_page; + + if (!epc_page) + epc_page = sgx_encl_eldu(&encl->secs, NULL); + + return epc_page; +} + static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, struct sgx_encl_page *entry) { @@ -248,11 +263,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, return entry; }
- if (!(encl->secs.epc_page)) { - epc_page = sgx_encl_eldu(&encl->secs, NULL); - if (IS_ERR(epc_page)) - return ERR_CAST(epc_page); - } + epc_page = sgx_encl_load_secs(encl); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page);
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); if (IS_ERR(epc_page)) @@ -339,6 +352,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
mutex_lock(&encl->lock);
+ epc_page = sgx_encl_load_secs(encl); + if (IS_ERR(epc_page)) { + if (PTR_ERR(epc_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_unlock; + } + epc_page = sgx_alloc_epc_page(encl_page, false); if (IS_ERR(epc_page)) { if (PTR_ERR(epc_page) == -EBUSY)
base-commit: 6eaae198076080886b9e7d57f4ae06fa782f90ef
On Wed, 2023-07-26 at 18:02 -0700, Haitao Huang wrote:
Under heavy load, the SGX EPC reclaimer (ksgxd) may reclaim the SECS EPC
If I read correctly, Dave suggested to not use "high" (heavy in this sentence) or "low" pressure:
https://lore.kernel.org/lkml/op.179a4xs0wjvjmi@hhuan26-mobl.amr.corp.intel.c...
And I agree. For instance, consider this happens to one extremely "small" enclave, while there's a new "big" enclave starts to run. I don't think we should say this is "under heavy load". Just stick to the fact that the reclaimer may reclaim the SECS page.
page for an enclave and set encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in the SGX page fault handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and loading it if it was reclaimed.
The SECS page holds global enclave metadata. It can only be reclaimed when there are no other enclave pages remaining. At that point, virtually nothing can be done with the enclave until the SECS page is paged back in.
An enclave can not run nor generate page faults without a resident SECS page.
I am not sure whether "nor generate page faults without a resident SECS page" is accurate? When SECS is swapped out, I suppose the first EENTER should trigger a #PF on the TSC page and in the #PF handler the SECS will be swapped in first.
I guess you can just remove this sentence?
But it is still possible for a #PF for a non-SECS page to race with paging out the SECS page: when the last resident non-SECS page A triggers a #PF in a non-resident page B, and then page A and the SECS both are paged out before the #PF on B is handled.
Hitting this bug requires that race triggered with a #PF for EAUG.
The above race can happen for the normal ELDU path too, thus I suppose it will be better to mention why the normal ELDU path doesn't have this issue: it already does what this fix does.
Following is a trace when it happens.
BUG: kernel NULL pointer dereference, address: 0000000000000000 RIP: 0010:sgx_encl_eaug_page+0xc7/0x210 Call Trace: ? __kmem_cache_alloc_node+0x16a/0x440 ? xa_load+0x6e/0xa0 sgx_vma_fault+0x119/0x230 __do_fault+0x36/0x140 do_fault+0x12f/0x400 __handle_mm_fault+0x728/0x1110 handle_mm_fault+0x105/0x310 do_user_addr_fault+0x1ee/0x750 ? __this_cpu_preempt_check+0x13/0x20 exc_page_fault+0x76/0x180 asm_exc_page_fault+0x27/0x30
Fixes: 5a90d2c3f5ef ("x86/sgx: Support adding of pages to an initialized enclave") Cc: stable@vger.kernel.org # v6.0+ Signed-off-by: Haitao Huang haitao.huang@linux.intel.com Reviewed-by: Jarkko Sakkinen jarkko@kernel.org Acked-by: Reinette Chatre reinette.chatre@intel.com
With above addressed, feel free to add:
Reviewed-by: Kai Huang kai.huang@intel.com
On Thu, 2023-07-27 at 02:50 +0000, Huang, Kai wrote:
An enclave can not run nor generate page faults without a resident SECS page.
I am not sure whether "nor generate page faults without a resident SECS page" is accurate? When SECS is swapped out, I suppose the first EENTER should trigger a #PF on the TSC page and in the #PF handler the SECS will be swapped in first.
I guess you can just remove this sentence?
Hmm.. Probably I should interpret this sentence as the enclave "code" itself cannot generate page faults without a resident SECS. This is true. So feel free to ignore this comment.
On Wed, 26 Jul 2023 21:50:02 -0500, Huang, Kai kai.huang@intel.com wrote:
On Wed, 2023-07-26 at 18:02 -0700, Haitao Huang wrote:
Under heavy load, the SGX EPC reclaimer (ksgxd) may reclaim the SECS EPC
If I read correctly, Dave suggested to not use "high" (heavy in this sentence) or "low" pressure:
https://lore.kernel.org/lkml/op.179a4xs0wjvjmi@hhuan26-mobl.amr.corp.intel.c...
And I agree. For instance, consider this happens to one extremely "small" enclave, while there's a new "big" enclave starts to run. I don't think we should say this is "under heavy load". Just stick to the fact that the reclaimer may reclaim the SECS page.
Mybe I have some confusion here but I did not think Dave had issues with 'heavy load'. When this happens, the last page causing #PF (page A below) should be the the "youngest" in PTE and it got paged out together with the SECS before the #PF is even handled. Based on that the ksgxd moves 'young' pages to the back of the queue for reclaiming, for that to happen, almost all EPC pages must be paged out for all enclaves at that time, so it means heavy load to me. And that's also consistent with my tests.
page for an enclave and set encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in the SGX page fault handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and loading it if it was reclaimed.
The SECS page holds global enclave metadata. It can only be reclaimed when there are no other enclave pages remaining. At that point, virtually nothing can be done with the enclave until the SECS page is paged back in.
...
But it is still possible for a #PF for a non-SECS page to race with paging out the SECS page: when the last resident non-SECS page A triggers a #PF in a non-resident page B, and then page A and the SECS both are paged out before the #PF on B is handled.
Hitting this bug requires that race triggered with a #PF for EAUG.
The above race can happen for the normal ELDU path too, thus I suppose it will be better to mention why the normal ELDU path doesn't have this issue: it already does what this fix does.
Should we focus on the bug and fix itself instead of explaining a non-bug case? And the simple changes in this patch clearly show that too if people look for that.
Thanks Haitao
On Thu, 2023-07-27 at 09:16 -0500, Haitao Huang wrote:
On Wed, 26 Jul 2023 21:50:02 -0500, Huang, Kai kai.huang@intel.com wrote:
On Wed, 2023-07-26 at 18:02 -0700, Haitao Huang wrote:
Under heavy load, the SGX EPC reclaimer (ksgxd) may reclaim the SECS EPC
If I read correctly, Dave suggested to not use "high" (heavy in this sentence) or "low" pressure:
https://lore.kernel.org/lkml/op.179a4xs0wjvjmi@hhuan26-mobl.amr.corp.intel.c...
And I agree. For instance, consider this happens to one extremely "small" enclave, while there's a new "big" enclave starts to run. I don't think we should say this is "under heavy load". Just stick to the fact that the reclaimer may reclaim the SECS page.
Mybe I have some confusion here but I did not think Dave had issues with 'heavy load'. When this happens, the last page causing #PF (page A below) should be the the "youngest" in PTE and it got paged out together with the SECS before the #PF is even handled. Based on that the ksgxd moves 'young' pages to the back of the queue for reclaiming, for that to happen, almost all EPC pages must be paged out for all enclaves at that time, so it means heavy load to me. And that's also consistent with my tests.
I already provided an example: swapping out an "extreme small" enclave.
Anyway, no big deal to me.
page for an enclave and set encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in the SGX page fault handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and loading it if it was reclaimed.
The SECS page holds global enclave metadata. It can only be reclaimed when there are no other enclave pages remaining. At that point, virtually nothing can be done with the enclave until the SECS page is paged back in.
...
But it is still possible for a #PF for a non-SECS page to race with paging out the SECS page: when the last resident non-SECS page A triggers a #PF in a non-resident page B, and then page A and the SECS both are paged out before the #PF on B is handled.
Hitting this bug requires that race triggered with a #PF for EAUG.
The above race can happen for the normal ELDU path too, thus I suppose it will be better to mention why the normal ELDU path doesn't have this issue: it already does what this fix does.
Should we focus on the bug and fix itself instead of explaining a non-bug case? And the simple changes in this patch clearly show that too if people look for that.
So you spent a lot of text explaining the race condition, but such race condition applies to both ELDU and EAUG. I personally went to see the code whether ELDU has such issue too, and it turned out only EAUG has issue. If you mention this in the changelog perhaps I wouldn't need to go to read the code.
Anyway, just my 2cents.
And I don't want to let those block this patch, so feel free to add my tag:
Reviewed-by: Kai Huang kai.huang@intel.com
The SGX EPC reclaimer (ksgxd) may reclaim the SECS EPC page for an enclave and set secs.epc_page to NULL. The SECS page is used for EAUG and ELDU in the SGX page fault handler. However, the NULL check for secs.epc_page is only done for ELDU, not EAUG before being used.
Fix this by doing the same NULL check and reloading of the SECS page as needed for both EAUG and ELDU.
The SECS page holds global enclave metadata. It can only be reclaimed when there are no other enclave pages remaining. At that point, virtually nothing can be done with the enclave until the SECS page is paged back in.
An enclave can not run nor generate page faults without a resident SECS page. But it is still possible for a #PF for a non-SECS page to race with paging out the SECS page: when the last resident non-SECS page A triggers a #PF in a non-resident page B, and then page A and the SECS both are paged out before the #PF on B is handled.
Hitting this bug requires that race triggered with a #PF for EAUG. Following is a trace when it happens.
BUG: kernel NULL pointer dereference, address: 0000000000000000 RIP: 0010:sgx_encl_eaug_page+0xc7/0x210 Call Trace: ? __kmem_cache_alloc_node+0x16a/0x440 ? xa_load+0x6e/0xa0 sgx_vma_fault+0x119/0x230 __do_fault+0x36/0x140 do_fault+0x12f/0x400 __handle_mm_fault+0x728/0x1110 handle_mm_fault+0x105/0x310 do_user_addr_fault+0x1ee/0x750 ? __this_cpu_preempt_check+0x13/0x20 exc_page_fault+0x76/0x180 asm_exc_page_fault+0x27/0x30
Fixes: 5a90d2c3f5ef ("x86/sgx: Support adding of pages to an initialized enclave") Cc: stable@vger.kernel.org # v6.0+ Signed-off-by: Haitao Huang haitao.huang@linux.intel.com Reviewed-by: Jarkko Sakkinen jarkko@kernel.org Reviewed-by: Kai Huang kai.huang@intel.com Acked-by: Reinette Chatre reinette.chatre@intel.com --- v6: - Removed 'Under heavy load' as it is not the required condition though it makes the bug more likely happen. (Kai) - Added mentioning of the NULL check and reloading already done for ELDU (Kai) - Added Reviewed-by (Kai)
v5: - Trimmed trace and added Acked-by (Reinette)
v4: - Refined the title (Kai, Dave) - Added a trace to commit meesage (Kai) - Added a few details for the race.
v3: - Added comments on sgx_encl_load_secs(). (Dave) - Added theory of the race condition to hit the bug. (Dave) - Added Reviewed-by, and applicable stable release. (Jarkko)
v2: - Fixes for style, commit message (Jarkko, Kai) - Removed unneeded WARN_ON (Kai) --- arch/x86/kernel/cpu/sgx/encl.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 91fa70e51004..279148e72459 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -235,6 +235,21 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; }
+/* + * Ensure the SECS page is not swapped out. Must be called with encl->lock + * to protect the enclave states including SECS and ensure the SECS page is + * not swapped out again while being used. + */ +static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl) +{ + struct sgx_epc_page *epc_page = encl->secs.epc_page; + + if (!epc_page) + epc_page = sgx_encl_eldu(&encl->secs, NULL); + + return epc_page; +} + static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, struct sgx_encl_page *entry) { @@ -248,11 +263,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, return entry; }
- if (!(encl->secs.epc_page)) { - epc_page = sgx_encl_eldu(&encl->secs, NULL); - if (IS_ERR(epc_page)) - return ERR_CAST(epc_page); - } + epc_page = sgx_encl_load_secs(encl); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page);
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); if (IS_ERR(epc_page)) @@ -339,6 +352,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
mutex_lock(&encl->lock);
+ epc_page = sgx_encl_load_secs(encl); + if (IS_ERR(epc_page)) { + if (PTR_ERR(epc_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_unlock; + } + epc_page = sgx_alloc_epc_page(encl_page, false); if (IS_ERR(epc_page)) { if (PTR_ERR(epc_page) == -EBUSY)
base-commit: 6eaae198076080886b9e7d57f4ae06fa782f90ef
Hi Everybody,
I'd like to check in on the status of this patch. This two month old patch looks to be a needed fix and has Jarkko and Kai's review tags, but I am not able to find it queued or merged in tip or upstream. Apologies if I did not look in the right spot, I just want to make sure it did not fall through the cracks if deemed needed.
Reinette
On 7/27/2023 10:10 PM, Haitao Huang wrote:
The SGX EPC reclaimer (ksgxd) may reclaim the SECS EPC page for an enclave and set secs.epc_page to NULL. The SECS page is used for EAUG and ELDU in the SGX page fault handler. However, the NULL check for secs.epc_page is only done for ELDU, not EAUG before being used.
Fix this by doing the same NULL check and reloading of the SECS page as needed for both EAUG and ELDU.
The SECS page holds global enclave metadata. It can only be reclaimed when there are no other enclave pages remaining. At that point, virtually nothing can be done with the enclave until the SECS page is paged back in.
An enclave can not run nor generate page faults without a resident SECS page. But it is still possible for a #PF for a non-SECS page to race with paging out the SECS page: when the last resident non-SECS page A triggers a #PF in a non-resident page B, and then page A and the SECS both are paged out before the #PF on B is handled.
Hitting this bug requires that race triggered with a #PF for EAUG. Following is a trace when it happens.
BUG: kernel NULL pointer dereference, address: 0000000000000000 RIP: 0010:sgx_encl_eaug_page+0xc7/0x210 Call Trace: ? __kmem_cache_alloc_node+0x16a/0x440 ? xa_load+0x6e/0xa0 sgx_vma_fault+0x119/0x230 __do_fault+0x36/0x140 do_fault+0x12f/0x400 __handle_mm_fault+0x728/0x1110 handle_mm_fault+0x105/0x310 do_user_addr_fault+0x1ee/0x750 ? __this_cpu_preempt_check+0x13/0x20 exc_page_fault+0x76/0x180 asm_exc_page_fault+0x27/0x30
Fixes: 5a90d2c3f5ef ("x86/sgx: Support adding of pages to an initialized enclave") Cc: stable@vger.kernel.org # v6.0+ Signed-off-by: Haitao Huang haitao.huang@linux.intel.com Reviewed-by: Jarkko Sakkinen jarkko@kernel.org Reviewed-by: Kai Huang kai.huang@intel.com Acked-by: Reinette Chatre reinette.chatre@intel.com
v6:
- Removed 'Under heavy load' as it is not the required condition though
it makes the bug more likely happen. (Kai)
- Added mentioning of the NULL check and reloading already done for ELDU (Kai)
- Added Reviewed-by (Kai)
v5:
- Trimmed trace and added Acked-by (Reinette)
v4:
- Refined the title (Kai, Dave)
- Added a trace to commit meesage (Kai)
- Added a few details for the race.
v3:
- Added comments on sgx_encl_load_secs(). (Dave)
- Added theory of the race condition to hit the bug. (Dave)
- Added Reviewed-by, and applicable stable release. (Jarkko)
v2:
- Fixes for style, commit message (Jarkko, Kai)
- Removed unneeded WARN_ON (Kai)
arch/x86/kernel/cpu/sgx/encl.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 91fa70e51004..279148e72459 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -235,6 +235,21 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; } +/*
- Ensure the SECS page is not swapped out. Must be called with encl->lock
- to protect the enclave states including SECS and ensure the SECS page is
- not swapped out again while being used.
- */
+static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl) +{
- struct sgx_epc_page *epc_page = encl->secs.epc_page;
- if (!epc_page)
epc_page = sgx_encl_eldu(&encl->secs, NULL);
- return epc_page;
+}
static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, struct sgx_encl_page *entry) { @@ -248,11 +263,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, return entry; }
- if (!(encl->secs.epc_page)) {
epc_page = sgx_encl_eldu(&encl->secs, NULL);
if (IS_ERR(epc_page))
return ERR_CAST(epc_page);
- }
- epc_page = sgx_encl_load_secs(encl);
- if (IS_ERR(epc_page))
return ERR_CAST(epc_page);
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); if (IS_ERR(epc_page)) @@ -339,6 +352,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, mutex_lock(&encl->lock);
- epc_page = sgx_encl_load_secs(encl);
- if (IS_ERR(epc_page)) {
if (PTR_ERR(epc_page) == -EBUSY)
vmret = VM_FAULT_NOPAGE;
goto err_out_unlock;
- }
- epc_page = sgx_alloc_epc_page(encl_page, false); if (IS_ERR(epc_page)) { if (PTR_ERR(epc_page) == -EBUSY)
base-commit: 6eaae198076080886b9e7d57f4ae06fa782f90ef
On 9/28/23 16:08, Reinette Chatre wrote:
I'd like to check in on the status of this patch. This two month old patch looks to be a needed fix and has Jarkko and Kai's review tags, but I am not able to find it queued or merged in tip or upstream. Apologies if I did not look in the right spot, I just want to make sure it did not fall through the cracks if deemed needed.
It fell through the cracks. Sorry about that. It's in x86/urgent now.
Hi Dave,
On 9/28/2023 5:11 PM, Dave Hansen wrote:
On 9/28/23 16:08, Reinette Chatre wrote:
I'd like to check in on the status of this patch. This two month old patch looks to be a needed fix and has Jarkko and Kai's review tags, but I am not able to find it queued or merged in tip or upstream. Apologies if I did not look in the right spot, I just want to make sure it did not fall through the cracks if deemed needed.
It fell through the cracks. Sorry about that. It's in x86/urgent now.
No problem. Thank you very much for including it.
Reinette
On 7/17/23 13:29, Haitao Huang wrote: ...
@@ -248,11 +258,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, return entry; }
- if (!(encl->secs.epc_page)) {
epc_page = sgx_encl_eldu(&encl->secs, NULL);
if (IS_ERR(epc_page))
return ERR_CAST(epc_page);
- }
- epc_page = sgx_encl_load_secs(encl);
- if (IS_ERR(epc_page))
return ERR_CAST(epc_page);
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); if (IS_ERR(epc_page)) @@ -339,6 +347,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, mutex_lock(&encl->lock);
- epc_page = sgx_encl_load_secs(encl);
- if (IS_ERR(epc_page)) {
if (PTR_ERR(epc_page) == -EBUSY)
vmret = VM_FAULT_NOPAGE;
goto err_out_unlock;
- }
Whenever I see one of these "make sure it isn't NULL", I always jump to asking what *keeps* it from becoming NULL again. In both cases here, I think that's encl->lock.
A comment would be really nice here, maybe on sgx_encl_load_secs(). Maybe:
/* * Ensure the SECS page is not swapped out. Must be called with * encl->lock to protect _____ and ensure the SECS page is not * swapped out again. */
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 166692f2d501..4662a364ce62 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -257,6 +257,10 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, mutex_lock(&encl->lock);
- /* Should not be possible */
- if (WARN_ON(!(encl->secs.epc_page)))
goto out;
That comment isn't super helpful. We generally don't WARN_ON() things that should happen. *Why* is it not possible?
On Tue, 18 Jul 2023 09:30:11 -0500, Dave Hansen dave.hansen@intel.com wrote:
On 7/17/23 13:29, Haitao Huang wrote: ...
@@ -248,11 +258,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, return entry; }
- if (!(encl->secs.epc_page)) {
epc_page = sgx_encl_eldu(&encl->secs, NULL);
if (IS_ERR(epc_page))
return ERR_CAST(epc_page);
- }
epc_page = sgx_encl_load_secs(encl);
if (IS_ERR(epc_page))
return ERR_CAST(epc_page);
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); if (IS_ERR(epc_page))
@@ -339,6 +347,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
mutex_lock(&encl->lock);
- epc_page = sgx_encl_load_secs(encl);
- if (IS_ERR(epc_page)) {
if (PTR_ERR(epc_page) == -EBUSY)
vmret = VM_FAULT_NOPAGE;
goto err_out_unlock;
- }
Whenever I see one of these "make sure it isn't NULL", I always jump to asking what *keeps* it from becoming NULL again. In both cases here, I think that's encl->lock.
Yes, encl->lock protects all enclave states, the xarray holding encl_pages, SECS, VAs, etc.
A comment would be really nice here, maybe on sgx_encl_load_secs(). Maybe:
/*
- Ensure the SECS page is not swapped out. Must be called with
- encl->lock to protect _____ and ensure the SECS page is not
- swapped out again.
*/
Thanks for the suggestion. Lock should be held for the duration of SECS usage. So something like this? /* * Ensure the SECS page is not swapped out. Must be called with * encl->lock to protect the enclave states including SECS and * ensure the SECS page is not swapped out again while being used. */
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 166692f2d501..4662a364ce62 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -257,6 +257,10 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
mutex_lock(&encl->lock);
- /* Should not be possible */
- if (WARN_ON(!(encl->secs.epc_page)))
goto out;
That comment isn't super helpful. We generally don't WARN_ON() things that should happen. *Why* is it not possible?
When this part of code is reached, the reclaimer is holding at least one reclaimable EPC page to reclaim for the enclave and the code below only reclaims SECS when no reclaimable EPCs (number of SECS children being zero) of the enclave left. So it should not be possible. I'll remove this change because this is really not needed for fixing the bug as Kai pointed out.
I added this for sanity check when implementing multiple EPC tracking lists for cgroups. At one point there were list corruption issues if moving EPCs between lists not managed well. With those straightened out, and clear definitions of EPC states for moving them from one list to another, I no longer see much value to keep this even in later cgroup patches.
Thanks Haitao
On Mon Jul 17, 2023 at 11:29 PM EEST, Haitao Huang wrote:
Under heavy load, the SGX EPC reclaimers (current ksgxd or future EPC cgroup worker) may reclaim the SECS EPC page for an enclave and set encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in the SGX #PF handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and load it if it was reclaimed.
Fixes: 5a90d2c3f5ef8 ("x86/sgx: Support adding of pages to an initialized enclave") Cc: stable@vger.kernel.org
Given that
$ git describe --contains 5a90d2c3f5ef8 v6.0-rc1~102^2~16
You could also describe this as:
Cc: stable@vger.kernel.org # v6.0+
Signed-off-by: Haitao Huang haitao.huang@linux.intel.com
arch/x86/kernel/cpu/sgx/encl.c | 25 ++++++++++++++++++++----- arch/x86/kernel/cpu/sgx/main.c | 4 ++++ 2 files changed, 24 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 2a0e90fe2abc..2ab544da1664 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -235,6 +235,16 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; } +static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl) +{
- struct sgx_epc_page *epc_page = encl->secs.epc_page;
- if (!epc_page)
epc_page = sgx_encl_eldu(&encl->secs, NULL);
- return epc_page;
+}
static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, struct sgx_encl_page *entry) { @@ -248,11 +258,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, return entry; }
- if (!(encl->secs.epc_page)) {
epc_page = sgx_encl_eldu(&encl->secs, NULL);
if (IS_ERR(epc_page))
return ERR_CAST(epc_page);
- }
- epc_page = sgx_encl_load_secs(encl);
- if (IS_ERR(epc_page))
return ERR_CAST(epc_page);
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); if (IS_ERR(epc_page)) @@ -339,6 +347,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, mutex_lock(&encl->lock);
- epc_page = sgx_encl_load_secs(encl);
- if (IS_ERR(epc_page)) {
if (PTR_ERR(epc_page) == -EBUSY)
vmret = VM_FAULT_NOPAGE;
goto err_out_unlock;
- }
- epc_page = sgx_alloc_epc_page(encl_page, false); if (IS_ERR(epc_page)) { if (PTR_ERR(epc_page) == -EBUSY)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 166692f2d501..4662a364ce62 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -257,6 +257,10 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, mutex_lock(&encl->lock);
- /* Should not be possible */
- if (WARN_ON(!(encl->secs.epc_page)))
goto out;
- sgx_encl_ewb(epc_page, backing); encl_page->epc_page = NULL; encl->secs_child_cnt--;
-- 2.25.1
Reviewed-by: Jarkko Sakkinen jarkko@kernel.org
BR, Jarkko
On Tue, 18 Jul 2023 10:37:45 -0500, Jarkko Sakkinen jarkko@kernel.org wrote:
On Mon Jul 17, 2023 at 11:29 PM EEST, Haitao Huang wrote:
Under heavy load, the SGX EPC reclaimers (current ksgxd or future EPC cgroup worker) may reclaim the SECS EPC page for an enclave and set encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in the SGX #PF handler without checking for NULL and reloading.
Fix this by checking if SECS is loaded before EAUG and load it if it was reclaimed.
Fixes: 5a90d2c3f5ef8 ("x86/sgx: Support adding of pages to an initialized enclave") Cc: stable@vger.kernel.org
Given that
$ git describe --contains 5a90d2c3f5ef8 v6.0-rc1~102^2~16
You could also describe this as:
Cc: stable@vger.kernel.org # v6.0+
Will add
...
Reviewed-by: Jarkko Sakkinen jarkko@kernel.org
Thank you. Haitao
linux-stable-mirror@lists.linaro.org