This patchset extends the KVM IOCTL interface to retrieve KVM statistics data in an aggregated binary format. It is meant to provide a lightweight, flexible, scalable and efficient lock-free solution for userspace telemetry applications to pull the statistics data periodically for large-scale systems. The capability is indicated by KVM_CAP_STATS_BINARY_FORM. The ioctl KVM_STATS_GET_INFO is used to get information about the VM or vCPU statistics data (the number of supported statistics entries, which is used for buffer allocation). The ioctl KVM_STATS_GET_NAMES is used to get the list of name strings of all supported statistics entries. The ioctl KVM_STATS_GET_DATA is used to get the aggregated statistics data per VM or vCPU, in the same order as the list of name strings. This is the ioctl that would be called periodically to retrieve statistics data per VM or vCPU.
Jing Zhang (4): KVM: stats: Separate statistics name strings from debugfs code KVM: stats: Define APIs for aggregated stats retrieval in binary format KVM: stats: Add ioctl commands to pull statistics in binary format KVM: selftests: Add selftest for KVM binary form statistics interface
Documentation/virt/kvm/api.rst | 79 +++++ arch/arm64/kvm/guest.c | 47 ++- arch/mips/kvm/mips.c | 114 +++++-- arch/powerpc/kvm/book3s.c | 107 ++++-- arch/powerpc/kvm/booke.c | 84 +++-- arch/s390/kvm/kvm-s390.c | 320 ++++++++++++------ arch/x86/kvm/x86.c | 127 ++++--- include/linux/kvm_host.h | 30 +- include/uapi/linux/kvm.h | 60 ++++ tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 3 + .../selftests/kvm/kvm_bin_form_stats.c | 89 +++++ virt/kvm/kvm_main.c | 115 +++++++ 13 files changed, 935 insertions(+), 241 deletions(-) create mode 100644 tools/testing/selftests/kvm/kvm_bin_form_stats.c
base-commit: 357ad203d45c0f9d76a8feadbd5a1c5d460c638b
Prepare the statistics name strings to support retrieval of aggregated statistics data in binary format.
No functional change intended.
Signed-off-by: Jing Zhang <jingzhangos@google.com> --- arch/arm64/kvm/guest.c | 47 ++++-- arch/mips/kvm/mips.c | 114 ++++++++++---- arch/powerpc/kvm/book3s.c | 107 +++++++++---- arch/powerpc/kvm/booke.c | 84 +++++++--- arch/s390/kvm/kvm-s390.c | 320 ++++++++++++++++++++++++++------------ arch/x86/kvm/x86.c | 127 ++++++++++----- include/linux/kvm_host.h | 31 +++- 7 files changed, 589 insertions(+), 241 deletions(-)
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 9bbd30e62799..fb3aafe76b52 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -28,19 +28,42 @@
#include "trace.h"
+const char kvm_vm_stat_strings[][KVM_STATS_NAME_LEN] = { + "remote_tlb_flush", +}; +static_assert(sizeof(kvm_vm_stat_strings) == + VM_STAT_COUNT * KVM_STATS_NAME_LEN); + +const char kvm_vcpu_stat_strings[][KVM_STATS_NAME_LEN] = { + "halt_successful_poll", + "halt_attempted_poll", + "halt_poll_success_ns", + "halt_poll_fail_ns", + "halt_poll_invalid", + "halt_wakeup", + "hvc_exit_stat", + "wfe_exit_stat", + "wfi_exit_stat", + "mmio_exit_user", + "mmio_exit_kernel", + "exits", +}; +static_assert(sizeof(kvm_vcpu_stat_strings) == + VCPU_STAT_COUNT * KVM_STATS_NAME_LEN); + struct kvm_stats_debugfs_item debugfs_entries[] = { - VCPU_STAT("halt_successful_poll", halt_successful_poll), - VCPU_STAT("halt_attempted_poll", halt_attempted_poll), - VCPU_STAT("halt_poll_invalid", halt_poll_invalid), - VCPU_STAT("halt_wakeup", halt_wakeup), - VCPU_STAT("hvc_exit_stat", hvc_exit_stat), - VCPU_STAT("wfe_exit_stat", wfe_exit_stat), - VCPU_STAT("wfi_exit_stat", wfi_exit_stat), - VCPU_STAT("mmio_exit_user", mmio_exit_user), - VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel), - VCPU_STAT("exits", exits), - VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), + VCPU_STAT(halt_successful_poll), + VCPU_STAT(halt_attempted_poll), + VCPU_STAT(halt_poll_invalid), + VCPU_STAT(halt_wakeup), + VCPU_STAT(hvc_exit_stat), + VCPU_STAT(wfe_exit_stat), + VCPU_STAT(wfi_exit_stat), + VCPU_STAT(mmio_exit_user), + VCPU_STAT(mmio_exit_kernel), + VCPU_STAT(exits), + VCPU_STAT(halt_poll_success_ns), + VCPU_STAT(halt_poll_fail_ns), { NULL } };
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 3d6a7f5827b1..8b9cbd9d6205 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -39,44 +39,92 @@ #define VECTORSPACING 0x100 /* for EI/VI mode */ #endif
+const char kvm_vm_stat_strings[][KVM_STATS_NAME_LEN] = { + "remote_tlb_flush", +}; +static_assert(sizeof(kvm_vm_stat_strings) == + VM_STAT_COUNT * KVM_STATS_NAME_LEN); + +const char kvm_vcpu_stat_strings[][KVM_STATS_NAME_LEN] = { + "wait", + "cache", + "signal", + "interrupt", + "cop_unusable", + "tlbmod", + "tlbmiss_ld", + "tlbmiss_st", + "addrerr_st", + "addrerr_ld", + "syscall", + "resvd_inst", + "break_inst", + "trap_inst", + "msa_fpe", + "fpe", + "msa_disabled", + "flush_dcache", +#ifdef CONFIG_KVM_MIPS_VZ + "vz_gpsi", + "vz_gsfc", + "vz_hc", + "vz_grr", + "vz_gva", + "vz_ghfc", + "vz_gpa", + "vz_resvd", +#ifdef CONFIG_CPU_LOONGSON64 + "vz_cpucfg", +#endif +#endif + "halt_successful_poll", + "halt_attempted_poll", + "halt_poll_success_ns", + "halt_poll_fail_ns", + "halt_poll_invalid", + "halt_wakeup", +}; +static_assert(sizeof(kvm_vcpu_stat_strings) == + VCPU_STAT_COUNT * KVM_STATS_NAME_LEN); + struct kvm_stats_debugfs_item debugfs_entries[] = { - VCPU_STAT("wait", wait_exits), - VCPU_STAT("cache", cache_exits), - VCPU_STAT("signal", signal_exits), - VCPU_STAT("interrupt", int_exits), - VCPU_STAT("cop_unusable", cop_unusable_exits), - VCPU_STAT("tlbmod", tlbmod_exits), - VCPU_STAT("tlbmiss_ld", tlbmiss_ld_exits), - VCPU_STAT("tlbmiss_st", tlbmiss_st_exits), - VCPU_STAT("addrerr_st", addrerr_st_exits), - VCPU_STAT("addrerr_ld", addrerr_ld_exits), - VCPU_STAT("syscall", syscall_exits), - VCPU_STAT("resvd_inst", resvd_inst_exits), - VCPU_STAT("break_inst", break_inst_exits), - VCPU_STAT("trap_inst", trap_inst_exits), - VCPU_STAT("msa_fpe", msa_fpe_exits), - VCPU_STAT("fpe", fpe_exits), - VCPU_STAT("msa_disabled", msa_disabled_exits), - VCPU_STAT("flush_dcache", flush_dcache_exits), + VCPU_STAT(wait_exits), + VCPU_STAT(cache_exits), + VCPU_STAT(signal_exits), + VCPU_STAT(int_exits), + VCPU_STAT(cop_unusable_exits), + VCPU_STAT(tlbmod_exits), + VCPU_STAT(tlbmiss_ld_exits), + VCPU_STAT(tlbmiss_st_exits), + VCPU_STAT(addrerr_st_exits), + VCPU_STAT(addrerr_ld_exits), 
+ VCPU_STAT(syscall_exits), + VCPU_STAT(resvd_inst_exits), + VCPU_STAT(break_inst_exits), + VCPU_STAT(trap_inst_exits), + VCPU_STAT(msa_fpe_exits), + VCPU_STAT(fpe_exits), + VCPU_STAT(msa_disabled_exits), + VCPU_STAT(flush_dcache_exits), #ifdef CONFIG_KVM_MIPS_VZ - VCPU_STAT("vz_gpsi", vz_gpsi_exits), - VCPU_STAT("vz_gsfc", vz_gsfc_exits), - VCPU_STAT("vz_hc", vz_hc_exits), - VCPU_STAT("vz_grr", vz_grr_exits), - VCPU_STAT("vz_gva", vz_gva_exits), - VCPU_STAT("vz_ghfc", vz_ghfc_exits), - VCPU_STAT("vz_gpa", vz_gpa_exits), - VCPU_STAT("vz_resvd", vz_resvd_exits), + VCPU_STAT(vz_gpsi_exits), + VCPU_STAT(vz_gsfc_exits), + VCPU_STAT(vz_hc_exits), + VCPU_STAT(vz_grr_exits), + VCPU_STAT(vz_gva_exits), + VCPU_STAT(vz_ghfc_exits), + VCPU_STAT(vz_gpa_exits), + VCPU_STAT(vz_resvd_exits), #ifdef CONFIG_CPU_LOONGSON64 - VCPU_STAT("vz_cpucfg", vz_cpucfg_exits), + VCPU_STAT(vz_cpucfg_exits), #endif #endif - VCPU_STAT("halt_successful_poll", halt_successful_poll), - VCPU_STAT("halt_attempted_poll", halt_attempted_poll), - VCPU_STAT("halt_poll_invalid", halt_poll_invalid), - VCPU_STAT("halt_wakeup", halt_wakeup), - VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), + VCPU_STAT(halt_successful_poll), + VCPU_STAT(halt_attempted_poll), + VCPU_STAT(halt_poll_invalid), + VCPU_STAT(halt_wakeup), + VCPU_STAT(halt_poll_success_ns), + VCPU_STAT(halt_poll_fail_ns), {NULL} };
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 44bf567b6589..22dd5a804468 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -36,38 +36,87 @@ #include "book3s.h" #include "trace.h"
+const char kvm_vm_stat_strings[][KVM_STATS_NAME_LEN] = { + "remote_tlb_flush", + "largepages_2M", + "largepages_1G", +}; +static_assert(sizeof(kvm_vm_stat_strings) == + VM_STAT_COUNT * KVM_STATS_NAME_LEN); + +const char kvm_vcpu_stat_strings[][KVM_STATS_NAME_LEN] = { + "exits", + "mmio", + "sig", + "light", + "itlb_real_miss", + "itlb_virt_miss", + "dtlb_real_miss", + "dtlb_virt_miss", + "sysc", + "isi", + "dsi", + "inst_emu", + "dec", + "ext_intr", + "halt_poll_success_ns", + "halt_poll_fail_ns", + "halt_wait_ns", + "halt_successful_poll", + "halt_attempted_poll", + "halt_successful_wait", + "halt_poll_invalid", + "halt_wakeup", + "dbell", + "gdbell", + "ld", + "st", + "pf_storage", + "pf_instruc", + "sp_storage", + "sp_instruc", + "queue_intr", + "ld_slow", + "st_slow", + "pthru_all", + "pthru_host", + "pthru_bad_aff", +}; +static_assert(sizeof(kvm_vcpu_stat_strings) == + VCPU_STAT_COUNT * KVM_STATS_NAME_LEN); + /* #define EXIT_DEBUG */
struct kvm_stats_debugfs_item debugfs_entries[] = { - VCPU_STAT("exits", sum_exits), - VCPU_STAT("mmio", mmio_exits), - VCPU_STAT("sig", signal_exits), - VCPU_STAT("sysc", syscall_exits), - VCPU_STAT("inst_emu", emulated_inst_exits), - VCPU_STAT("dec", dec_exits), - VCPU_STAT("ext_intr", ext_intr_exits), - VCPU_STAT("queue_intr", queue_intr), - VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), - VCPU_STAT("halt_wait_ns", halt_wait_ns), - VCPU_STAT("halt_successful_poll", halt_successful_poll), - VCPU_STAT("halt_attempted_poll", halt_attempted_poll), - VCPU_STAT("halt_successful_wait", halt_successful_wait), - VCPU_STAT("halt_poll_invalid", halt_poll_invalid), - VCPU_STAT("halt_wakeup", halt_wakeup), - VCPU_STAT("pf_storage", pf_storage), - VCPU_STAT("sp_storage", sp_storage), - VCPU_STAT("pf_instruc", pf_instruc), - VCPU_STAT("sp_instruc", sp_instruc), - VCPU_STAT("ld", ld), - VCPU_STAT("ld_slow", ld_slow), - VCPU_STAT("st", st), - VCPU_STAT("st_slow", st_slow), - VCPU_STAT("pthru_all", pthru_all), - VCPU_STAT("pthru_host", pthru_host), - VCPU_STAT("pthru_bad_aff", pthru_bad_aff), - VM_STAT("largepages_2M", num_2M_pages, .mode = 0444), - VM_STAT("largepages_1G", num_1G_pages, .mode = 0444), + VCPU_STAT(sum_exits), + VCPU_STAT(mmio_exits), + VCPU_STAT(signal_exits), + VCPU_STAT(syscall_exits), + VCPU_STAT(emulated_inst_exits), + VCPU_STAT(dec_exits), + VCPU_STAT(ext_intr_exits), + VCPU_STAT(queue_intr), + VCPU_STAT(halt_poll_success_ns), + VCPU_STAT(halt_poll_fail_ns), + VCPU_STAT(halt_wait_ns), + VCPU_STAT(halt_successful_poll), + VCPU_STAT(halt_attempted_poll), + VCPU_STAT(halt_successful_wait), + VCPU_STAT(halt_poll_invalid), + VCPU_STAT(halt_wakeup), + VCPU_STAT(pf_storage), + VCPU_STAT(sp_storage), + VCPU_STAT(pf_instruc), + VCPU_STAT(sp_instruc), + VCPU_STAT(ld), + VCPU_STAT(ld_slow), + VCPU_STAT(st), + VCPU_STAT(st_slow), + VCPU_STAT(pthru_all), + VCPU_STAT(pthru_host), + VCPU_STAT(pthru_bad_aff), + 
VM_STAT(num_2M_pages, .mode = 0444), + VM_STAT(num_1G_pages, .mode = 0444), { NULL } };
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index f38ae3e54b37..faa830f8178b 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -35,28 +35,70 @@
unsigned long kvmppc_booke_handlers;
+const char kvm_vm_stat_strings[][KVM_STATS_NAME_LEN] = { + "remote_tlb_flush", + "largepages_2M", + "largepages_1G", +}; +static_assert(sizeof(kvm_vm_stat_strings) == + VM_STAT_COUNT * KVM_STATS_NAME_LEN); + +const char kvm_vcpu_stat_strings[][KVM_STATS_NAME_LEN] = { + "exits", + "mmio", + "sig", + "light", + "itlb_real_miss", + "itlb_virt_miss", + "dtlb_real_miss", + "dtlb_virt_miss", + "sysc", + "isi", + "dsi", + "inst_emu", + "dec", + "ext_intr", + "halt_poll_success_ns", + "halt_poll_fail_ns", + "halt_wait_ns", + "halt_successful_poll", + "halt_attempted_poll", + "halt_successful_wait", + "halt_poll_invalid", + "halt_wakeup", + "dbell", + "gdbell", + "ld", + "st", + "pthru_all", + "pthru_host", + "pthru_bad_aff", +}; +static_assert(sizeof(kvm_vcpu_stat_strings) == + VCPU_STAT_COUNT * KVM_STATS_NAME_LEN); + struct kvm_stats_debugfs_item debugfs_entries[] = { - VCPU_STAT("mmio", mmio_exits), - VCPU_STAT("sig", signal_exits), - VCPU_STAT("itlb_r", itlb_real_miss_exits), - VCPU_STAT("itlb_v", itlb_virt_miss_exits), - VCPU_STAT("dtlb_r", dtlb_real_miss_exits), - VCPU_STAT("dtlb_v", dtlb_virt_miss_exits), - VCPU_STAT("sysc", syscall_exits), - VCPU_STAT("isi", isi_exits), - VCPU_STAT("dsi", dsi_exits), - VCPU_STAT("inst_emu", emulated_inst_exits), - VCPU_STAT("dec", dec_exits), - VCPU_STAT("ext_intr", ext_intr_exits), - VCPU_STAT("halt_successful_poll", halt_successful_poll), - VCPU_STAT("halt_attempted_poll", halt_attempted_poll), - VCPU_STAT("halt_poll_invalid", halt_poll_invalid), - VCPU_STAT("halt_wakeup", halt_wakeup), - VCPU_STAT("doorbell", dbell_exits), - VCPU_STAT("guest doorbell", gdbell_exits), - VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), - VM_STAT("remote_tlb_flush", remote_tlb_flush), + VCPU_STAT(mmio_exits), + VCPU_STAT(signal_exits), + VCPU_STAT(itlb_real_miss_exits), + VCPU_STAT(itlb_virt_miss_exits), + VCPU_STAT(dtlb_real_miss_exits), + VCPU_STAT(dtlb_virt_miss_exits), + 
VCPU_STAT(syscall_exits), + VCPU_STAT(isi_exits), + VCPU_STAT(dsi_exits), + VCPU_STAT(emulated_inst_exits), + VCPU_STAT(dec_exits), + VCPU_STAT(ext_intr_exits), + VCPU_STAT(halt_successful_poll), + VCPU_STAT(halt_attempted_poll), + VCPU_STAT(halt_poll_invalid), + VCPU_STAT(halt_wakeup), + VCPU_STAT(dbell_exits), + VCPU_STAT(gdbell_exits), + VCPU_STAT(halt_poll_success_ns), + VCPU_STAT(halt_poll_fail_ns), + VM_STAT(remote_tlb_flush), { NULL } };
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index dbafd057ca6a..cefec8c1e87a 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -57,110 +57,224 @@ #define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \ (KVM_MAX_VCPUS + LOCAL_IRQS))
+const char kvm_vm_stat_strings[][KVM_STATS_NAME_LEN] = { + "inject_io", + "inject_float_mchk", + "inject_pfault_done", + "inject_service_signal", + "inject_virtio", + "remote_tlb_flush", +}; +static_assert(sizeof(kvm_vm_stat_strings) == + VM_STAT_COUNT * KVM_STATS_NAME_LEN); + +const char kvm_vcpu_stat_strings[][KVM_STATS_NAME_LEN] = { + "userspace_handled", + "exit_null", + "exit_external_request", + "exit_io_request", + "exit_external_interrupt", + "exit_stop_request", + "exit_validity", + "exit_instruction", + "exit_pei", + "halt_successful_poll", + "halt_attempted_poll", + "halt_poll_invalid", + "halt_no_poll_steal", + "halt_wakeup", + "halt_poll_success_ns", + "halt_poll_fail_ns", + "instruction_lctlg", + "instruction_lctl", + "instruction_stctl", + "instruction_stctg", + "exit_program_interruption", + "exit_instr_and_program_int", + "exit_operation_exception", + "deliver_ckc", + "deliver_cputm", + "deliver_external_call", + "deliver_emergency_signal", + "deliver_service_signal", + "deliver_virtio", + "deliver_stop_signal", + "deliver_prefix_signal", + "deliver_restart_signal", + "deliver_program", + "deliver_io", + "deliver_machine_check", + "exit_wait_state", + "inject_ckc", + "inject_cputm", + "inject_external_call", + "inject_emergency_signal", + "inject_mchk", + "inject_pfault_init", + "inject_program", + "inject_restart", + "inject_set_prefix", + "inject_stop_signal", + "instruction_epsw", + "instruction_gs", + "instruction_io_other", + "instruction_lpsw", + "instruction_lpswe", + "instruction_pfmf", + "instruction_ptff", + "instruction_sck", + "instruction_sckpf", + "instruction_stidp", + "instruction_spx", + "instruction_stpx", + "instruction_stap", + "instruction_iske", + "instruction_ri", + "instruction_rrbe", + "instruction_sske", + "instruction_ipte_interlock", + "instruction_stsi", + "instruction_stfl", + "instruction_tb", + "instruction_tpi", + "instruction_tprot", + "instruction_tsch", + "instruction_sie", + "instruction_essa", + 
"instruction_sthyi", + "instruction_sigp_sense", + "instruction_sigp_sense_running", + "instruction_sigp_external_call", + "instruction_sigp_emergency", + "instruction_sigp_cond_emergency", + "instruction_sigp_start", + "instruction_sigp_stop", + "instruction_sigp_stop_store_status", + "instruction_sigp_store_status", + "instruction_sigp_store_adtl_status", + "instruction_sigp_set_arch", + "instruction_sigp_set_prefix", + "instruction_sigp_restart", + "instruction_sigp_init_cpu_reset", + "instruction_sigp_cpu_reset", + "instruction_sigp_unknown", + "instruction_diag_10", + "instruction_diag_44", + "instruction_diag_9c", + "diag_9c_ignored", + "instruction_diag_258", + "instruction_diag_308", + "instruction_diag_500", + "instruction_diag_other", + "pfault_sync", +}; +static_assert(sizeof(kvm_vcpu_stat_strings) == + VCPU_STAT_COUNT * KVM_STATS_NAME_LEN); + struct kvm_stats_debugfs_item debugfs_entries[] = { - VCPU_STAT("userspace_handled", exit_userspace), - VCPU_STAT("exit_null", exit_null), - VCPU_STAT("pfault_sync", pfault_sync), - VCPU_STAT("exit_validity", exit_validity), - VCPU_STAT("exit_stop_request", exit_stop_request), - VCPU_STAT("exit_external_request", exit_external_request), - VCPU_STAT("exit_io_request", exit_io_request), - VCPU_STAT("exit_external_interrupt", exit_external_interrupt), - VCPU_STAT("exit_instruction", exit_instruction), - VCPU_STAT("exit_pei", exit_pei), - VCPU_STAT("exit_program_interruption", exit_program_interruption), - VCPU_STAT("exit_instr_and_program_int", exit_instr_and_program), - VCPU_STAT("exit_operation_exception", exit_operation_exception), - VCPU_STAT("halt_successful_poll", halt_successful_poll), - VCPU_STAT("halt_attempted_poll", halt_attempted_poll), - VCPU_STAT("halt_poll_invalid", halt_poll_invalid), - VCPU_STAT("halt_no_poll_steal", halt_no_poll_steal), - VCPU_STAT("halt_wakeup", halt_wakeup), - VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), - 
VCPU_STAT("instruction_lctlg", instruction_lctlg), - VCPU_STAT("instruction_lctl", instruction_lctl), - VCPU_STAT("instruction_stctl", instruction_stctl), - VCPU_STAT("instruction_stctg", instruction_stctg), - VCPU_STAT("deliver_ckc", deliver_ckc), - VCPU_STAT("deliver_cputm", deliver_cputm), - VCPU_STAT("deliver_emergency_signal", deliver_emergency_signal), - VCPU_STAT("deliver_external_call", deliver_external_call), - VCPU_STAT("deliver_service_signal", deliver_service_signal), - VCPU_STAT("deliver_virtio", deliver_virtio), - VCPU_STAT("deliver_stop_signal", deliver_stop_signal), - VCPU_STAT("deliver_prefix_signal", deliver_prefix_signal), - VCPU_STAT("deliver_restart_signal", deliver_restart_signal), - VCPU_STAT("deliver_program", deliver_program), - VCPU_STAT("deliver_io", deliver_io), - VCPU_STAT("deliver_machine_check", deliver_machine_check), - VCPU_STAT("exit_wait_state", exit_wait_state), - VCPU_STAT("inject_ckc", inject_ckc), - VCPU_STAT("inject_cputm", inject_cputm), - VCPU_STAT("inject_external_call", inject_external_call), - VM_STAT("inject_float_mchk", inject_float_mchk), - VCPU_STAT("inject_emergency_signal", inject_emergency_signal), - VM_STAT("inject_io", inject_io), - VCPU_STAT("inject_mchk", inject_mchk), - VM_STAT("inject_pfault_done", inject_pfault_done), - VCPU_STAT("inject_program", inject_program), - VCPU_STAT("inject_restart", inject_restart), - VM_STAT("inject_service_signal", inject_service_signal), - VCPU_STAT("inject_set_prefix", inject_set_prefix), - VCPU_STAT("inject_stop_signal", inject_stop_signal), - VCPU_STAT("inject_pfault_init", inject_pfault_init), - VM_STAT("inject_virtio", inject_virtio), - VCPU_STAT("instruction_epsw", instruction_epsw), - VCPU_STAT("instruction_gs", instruction_gs), - VCPU_STAT("instruction_io_other", instruction_io_other), - VCPU_STAT("instruction_lpsw", instruction_lpsw), - VCPU_STAT("instruction_lpswe", instruction_lpswe), - VCPU_STAT("instruction_pfmf", instruction_pfmf), - VCPU_STAT("instruction_ptff", 
instruction_ptff), - VCPU_STAT("instruction_stidp", instruction_stidp), - VCPU_STAT("instruction_sck", instruction_sck), - VCPU_STAT("instruction_sckpf", instruction_sckpf), - VCPU_STAT("instruction_spx", instruction_spx), - VCPU_STAT("instruction_stpx", instruction_stpx), - VCPU_STAT("instruction_stap", instruction_stap), - VCPU_STAT("instruction_iske", instruction_iske), - VCPU_STAT("instruction_ri", instruction_ri), - VCPU_STAT("instruction_rrbe", instruction_rrbe), - VCPU_STAT("instruction_sske", instruction_sske), - VCPU_STAT("instruction_ipte_interlock", instruction_ipte_interlock), - VCPU_STAT("instruction_essa", instruction_essa), - VCPU_STAT("instruction_stsi", instruction_stsi), - VCPU_STAT("instruction_stfl", instruction_stfl), - VCPU_STAT("instruction_tb", instruction_tb), - VCPU_STAT("instruction_tpi", instruction_tpi), - VCPU_STAT("instruction_tprot", instruction_tprot), - VCPU_STAT("instruction_tsch", instruction_tsch), - VCPU_STAT("instruction_sthyi", instruction_sthyi), - VCPU_STAT("instruction_sie", instruction_sie), - VCPU_STAT("instruction_sigp_sense", instruction_sigp_sense), - VCPU_STAT("instruction_sigp_sense_running", instruction_sigp_sense_running), - VCPU_STAT("instruction_sigp_external_call", instruction_sigp_external_call), - VCPU_STAT("instruction_sigp_emergency", instruction_sigp_emergency), - VCPU_STAT("instruction_sigp_cond_emergency", instruction_sigp_cond_emergency), - VCPU_STAT("instruction_sigp_start", instruction_sigp_start), - VCPU_STAT("instruction_sigp_stop", instruction_sigp_stop), - VCPU_STAT("instruction_sigp_stop_store_status", instruction_sigp_stop_store_status), - VCPU_STAT("instruction_sigp_store_status", instruction_sigp_store_status), - VCPU_STAT("instruction_sigp_store_adtl_status", instruction_sigp_store_adtl_status), - VCPU_STAT("instruction_sigp_set_arch", instruction_sigp_arch), - VCPU_STAT("instruction_sigp_set_prefix", instruction_sigp_prefix), - VCPU_STAT("instruction_sigp_restart", instruction_sigp_restart), 
- VCPU_STAT("instruction_sigp_cpu_reset", instruction_sigp_cpu_reset), - VCPU_STAT("instruction_sigp_init_cpu_reset", instruction_sigp_init_cpu_reset), - VCPU_STAT("instruction_sigp_unknown", instruction_sigp_unknown), - VCPU_STAT("instruction_diag_10", diagnose_10), - VCPU_STAT("instruction_diag_44", diagnose_44), - VCPU_STAT("instruction_diag_9c", diagnose_9c), - VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored), - VCPU_STAT("instruction_diag_258", diagnose_258), - VCPU_STAT("instruction_diag_308", diagnose_308), - VCPU_STAT("instruction_diag_500", diagnose_500), - VCPU_STAT("instruction_diag_other", diagnose_other), + VCPU_STAT(exit_userspace), + VCPU_STAT(exit_null), + VCPU_STAT(pfault_sync), + VCPU_STAT(exit_validity), + VCPU_STAT(exit_stop_request), + VCPU_STAT(exit_external_request), + VCPU_STAT(exit_io_request), + VCPU_STAT(exit_external_interrupt), + VCPU_STAT(exit_instruction), + VCPU_STAT(exit_pei), + VCPU_STAT(exit_program_interruption), + VCPU_STAT(exit_instr_and_program), + VCPU_STAT(exit_operation_exception), + VCPU_STAT(halt_successful_poll), + VCPU_STAT(halt_attempted_poll), + VCPU_STAT(halt_poll_invalid), + VCPU_STAT(halt_no_poll_steal), + VCPU_STAT(halt_wakeup), + VCPU_STAT(halt_poll_success_ns), + VCPU_STAT(halt_poll_fail_ns), + VCPU_STAT(instruction_lctlg), + VCPU_STAT(instruction_lctl), + VCPU_STAT(instruction_stctl), + VCPU_STAT(instruction_stctg), + VCPU_STAT(deliver_ckc), + VCPU_STAT(deliver_cputm), + VCPU_STAT(deliver_emergency_signal), + VCPU_STAT(deliver_external_call), + VCPU_STAT(deliver_service_signal), + VCPU_STAT(deliver_virtio), + VCPU_STAT(deliver_stop_signal), + VCPU_STAT(deliver_prefix_signal), + VCPU_STAT(deliver_restart_signal), + VCPU_STAT(deliver_program), + VCPU_STAT(deliver_io), + VCPU_STAT(deliver_machine_check), + VCPU_STAT(exit_wait_state), + VCPU_STAT(inject_ckc), + VCPU_STAT(inject_cputm), + VCPU_STAT(inject_external_call), + VM_STAT(inject_float_mchk), + VCPU_STAT(inject_emergency_signal), + VM_STAT(inject_io), + 
VCPU_STAT(inject_mchk), + VM_STAT(inject_pfault_done), + VCPU_STAT(inject_program), + VCPU_STAT(inject_restart), + VM_STAT(inject_service_signal), + VCPU_STAT(inject_set_prefix), + VCPU_STAT(inject_stop_signal), + VCPU_STAT(inject_pfault_init), + VM_STAT(inject_virtio), + VCPU_STAT(instruction_epsw), + VCPU_STAT(instruction_gs), + VCPU_STAT(instruction_io_other), + VCPU_STAT(instruction_lpsw), + VCPU_STAT(instruction_lpswe), + VCPU_STAT(instruction_pfmf), + VCPU_STAT(instruction_ptff), + VCPU_STAT(instruction_stidp), + VCPU_STAT(instruction_sck), + VCPU_STAT(instruction_sckpf), + VCPU_STAT(instruction_spx), + VCPU_STAT(instruction_stpx), + VCPU_STAT(instruction_stap), + VCPU_STAT(instruction_iske), + VCPU_STAT(instruction_ri), + VCPU_STAT(instruction_rrbe), + VCPU_STAT(instruction_sske), + VCPU_STAT(instruction_ipte_interlock), + VCPU_STAT(instruction_essa), + VCPU_STAT(instruction_stsi), + VCPU_STAT(instruction_stfl), + VCPU_STAT(instruction_tb), + VCPU_STAT(instruction_tpi), + VCPU_STAT(instruction_tprot), + VCPU_STAT(instruction_tsch), + VCPU_STAT(instruction_sthyi), + VCPU_STAT(instruction_sie), + VCPU_STAT(instruction_sigp_sense), + VCPU_STAT(instruction_sigp_sense_running), + VCPU_STAT(instruction_sigp_external_call), + VCPU_STAT(instruction_sigp_emergency), + VCPU_STAT(instruction_sigp_cond_emergency), + VCPU_STAT(instruction_sigp_start), + VCPU_STAT(instruction_sigp_stop), + VCPU_STAT(instruction_sigp_stop_store_status), + VCPU_STAT(instruction_sigp_store_status), + VCPU_STAT(instruction_sigp_store_adtl_status), + VCPU_STAT(instruction_sigp_arch), + VCPU_STAT(instruction_sigp_prefix), + VCPU_STAT(instruction_sigp_restart), + VCPU_STAT(instruction_sigp_cpu_reset), + VCPU_STAT(instruction_sigp_init_cpu_reset), + VCPU_STAT(instruction_sigp_unknown), + VCPU_STAT(diagnose_10), + VCPU_STAT(diagnose_44), + VCPU_STAT(diagnose_9c), + VCPU_STAT(diagnose_9c_ignored), + VCPU_STAT(diagnose_258), + VCPU_STAT(diagnose_308), + VCPU_STAT(diagnose_500), + 
VCPU_STAT(diagnose_other), { NULL } };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 46b0e52671bb..e0b02a29c760 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -216,46 +216,95 @@ EXPORT_SYMBOL_GPL(host_xss); u64 __read_mostly supported_xss; EXPORT_SYMBOL_GPL(supported_xss);
+const char kvm_vm_stat_strings[][KVM_STATS_NAME_LEN] = { + "mmu_shadow_zapped", + "mmu_pte_write", + "mmu_pde_zapped", + "mmu_flooded", + "mmu_recycled", + "mmu_cache_miss", + "mmu_unsync", + "remote_tlb_flush", + "largepages", + "nx_largepages_splitted", + "max_mmu_page_hash_collisions", +}; +static_assert(sizeof(kvm_vm_stat_strings) == + VM_STAT_COUNT * KVM_STATS_NAME_LEN); + +const char kvm_vcpu_stat_strings[][KVM_STATS_NAME_LEN] = { + "pf_fixed", + "pf_guest", + "tlb_flush", + "invlpg", + "exits", + "io_exits", + "mmio_exits", + "signal_exits", + "irq_window", + "nmi_window", + "l1d_flush", + "halt_exits", + "halt_successful_poll", + "halt_attempted_poll", + "halt_poll_invalid", + "halt_wakeup", + "request_irq", + "irq_exits", + "host_state_reload", + "fpu_reload", + "insn_emulation", + "insn_emulation_fail", + "hypercalls", + "irq_injections", + "nmi_injections", + "req_event", + "halt_poll_success_ns", + "halt_poll_fail_ns", +}; +static_assert(sizeof(kvm_vcpu_stat_strings) == + VCPU_STAT_COUNT * KVM_STATS_NAME_LEN); + struct kvm_stats_debugfs_item debugfs_entries[] = { - VCPU_STAT("pf_fixed", pf_fixed), - VCPU_STAT("pf_guest", pf_guest), - VCPU_STAT("tlb_flush", tlb_flush), - VCPU_STAT("invlpg", invlpg), - VCPU_STAT("exits", exits), - VCPU_STAT("io_exits", io_exits), - VCPU_STAT("mmio_exits", mmio_exits), - VCPU_STAT("signal_exits", signal_exits), - VCPU_STAT("irq_window", irq_window_exits), - VCPU_STAT("nmi_window", nmi_window_exits), - VCPU_STAT("halt_exits", halt_exits), - VCPU_STAT("halt_successful_poll", halt_successful_poll), - VCPU_STAT("halt_attempted_poll", halt_attempted_poll), - VCPU_STAT("halt_poll_invalid", halt_poll_invalid), - VCPU_STAT("halt_wakeup", halt_wakeup), - VCPU_STAT("hypercalls", hypercalls), - VCPU_STAT("request_irq", request_irq_exits), - VCPU_STAT("irq_exits", irq_exits), - VCPU_STAT("host_state_reload", host_state_reload), - VCPU_STAT("fpu_reload", fpu_reload), - VCPU_STAT("insn_emulation", insn_emulation), - 
VCPU_STAT("insn_emulation_fail", insn_emulation_fail), - VCPU_STAT("irq_injections", irq_injections), - VCPU_STAT("nmi_injections", nmi_injections), - VCPU_STAT("req_event", req_event), - VCPU_STAT("l1d_flush", l1d_flush), - VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), - VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), - VM_STAT("mmu_pte_write", mmu_pte_write), - VM_STAT("mmu_pde_zapped", mmu_pde_zapped), - VM_STAT("mmu_flooded", mmu_flooded), - VM_STAT("mmu_recycled", mmu_recycled), - VM_STAT("mmu_cache_miss", mmu_cache_miss), - VM_STAT("mmu_unsync", mmu_unsync), - VM_STAT("remote_tlb_flush", remote_tlb_flush), - VM_STAT("largepages", lpages, .mode = 0444), - VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444), - VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions), + VCPU_STAT(pf_fixed), + VCPU_STAT(pf_guest), + VCPU_STAT(tlb_flush), + VCPU_STAT(invlpg), + VCPU_STAT(exits), + VCPU_STAT(io_exits), + VCPU_STAT(mmio_exits), + VCPU_STAT(signal_exits), + VCPU_STAT(irq_window_exits), + VCPU_STAT(nmi_window_exits), + VCPU_STAT(halt_exits), + VCPU_STAT(halt_successful_poll), + VCPU_STAT(halt_attempted_poll), + VCPU_STAT(halt_poll_invalid), + VCPU_STAT(halt_wakeup), + VCPU_STAT(hypercalls), + VCPU_STAT(request_irq_exits), + VCPU_STAT(irq_exits), + VCPU_STAT(host_state_reload), + VCPU_STAT(fpu_reload), + VCPU_STAT(insn_emulation), + VCPU_STAT(insn_emulation_fail), + VCPU_STAT(irq_injections), + VCPU_STAT(nmi_injections), + VCPU_STAT(req_event), + VCPU_STAT(l1d_flush), + VCPU_STAT(halt_poll_success_ns), + VCPU_STAT(halt_poll_fail_ns), + VM_STAT(mmu_shadow_zapped), + VM_STAT(mmu_pte_write), + VM_STAT(mmu_pde_zapped), + VM_STAT(mmu_flooded), + VM_STAT(mmu_recycled), + VM_STAT(mmu_cache_miss), + VM_STAT(mmu_unsync), + VM_STAT(remote_tlb_flush), + VM_STAT(lpages, .mode = 0444), + VM_STAT(nx_lpage_splits, .mode = 0444), + VM_STAT(max_mmu_page_hash_collisions), { NULL } };
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1b65e7204344..1ea297458306 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1162,6 +1162,18 @@ static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa) return kvm_is_error_hva(hva); }
+#define VM_STAT_COUNT (sizeof(struct kvm_vm_stat)/sizeof(ulong)) +#define VCPU_STAT_COUNT (sizeof(struct kvm_vcpu_stat)/sizeof(u64)) +#define KVM_STATS_NAME_LEN 32 + +/* Make sure it is synced with fields in struct kvm_vm_stat. */ +extern const char kvm_vm_stat_strings[][KVM_STATS_NAME_LEN]; +/* Make sure it is synced with fields in struct kvm_vcpu_stat. */ +extern const char kvm_vcpu_stat_strings[][KVM_STATS_NAME_LEN]; + +#define VM_STAT_NAME(id) (kvm_vm_stat_strings[id]) +#define VCPU_STAT_NAME(id) (kvm_vcpu_stat_strings[id]) + enum kvm_stat_kind { KVM_STAT_VM, KVM_STAT_VCPU, @@ -1182,10 +1194,21 @@ struct kvm_stats_debugfs_item { #define KVM_DBGFS_GET_MODE(dbgfs_item) \ ((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644)
-#define VM_STAT(n, x, ...) \ - { n, offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ } -#define VCPU_STAT(n, x, ...) \ - { n, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ } +#define VM_STAT(x, ...) \ + { \ + VM_STAT_NAME(offsetof(struct kvm_vm_stat, x)/sizeof(ulong)), \ + offsetof(struct kvm, stat.x), \ + KVM_STAT_VM, \ + ## __VA_ARGS__ \ + } + +#define VCPU_STAT(x, ...) \ + { \ + VCPU_STAT_NAME(offsetof(struct kvm_vcpu_stat, x)/sizeof(u64)), \ + offsetof(struct kvm_vcpu, stat.x), \ + KVM_STAT_VCPU, \ + ## __VA_ARGS__ \ + }
extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir;
Hi Jing,
On Wed, 10 Mar 2021 00:30:21 +0000, Jing Zhang jingzhangos@google.com wrote:
Prepare the statistics name strings for supporting binary format aggregated statistics data retrieval.
No functional change intended.
Signed-off-by: Jing Zhang jingzhangos@google.com
arch/arm64/kvm/guest.c | 47 ++++-- arch/mips/kvm/mips.c | 114 ++++++++++---- arch/powerpc/kvm/book3s.c | 107 +++++++++---- arch/powerpc/kvm/booke.c | 84 +++++++--- arch/s390/kvm/kvm-s390.c | 320 ++++++++++++++++++++++++++------------ arch/x86/kvm/x86.c | 127 ++++++++++----- include/linux/kvm_host.h | 31 +++- 7 files changed, 589 insertions(+), 241 deletions(-)
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 9bbd30e62799..fb3aafe76b52 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -28,19 +28,42 @@ #include "trace.h" +const char kvm_vm_stat_strings[][KVM_STATS_NAME_LEN] = {
- "remote_tlb_flush",
+}; +static_assert(sizeof(kvm_vm_stat_strings) ==
VM_STAT_COUNT * KVM_STATS_NAME_LEN);
+const char kvm_vcpu_stat_strings[][KVM_STATS_NAME_LEN] = {
- "halt_successful_poll",
- "halt_attempted_poll",
- "halt_poll_success_ns",
- "halt_poll_fail_ns",
- "halt_poll_invalid",
- "halt_wakeup",
- "hvc_exit_stat",
- "wfe_exit_stat",
- "wfi_exit_stat",
- "mmio_exit_user",
- "mmio_exit_kernel",
- "exits",
+}; +static_assert(sizeof(kvm_vcpu_stat_strings) ==
VCPU_STAT_COUNT * KVM_STATS_NAME_LEN);
struct kvm_stats_debugfs_item debugfs_entries[] = {
- VCPU_STAT("halt_successful_poll", halt_successful_poll),
- VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
- VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
- VCPU_STAT("halt_wakeup", halt_wakeup),
- VCPU_STAT("hvc_exit_stat", hvc_exit_stat),
- VCPU_STAT("wfe_exit_stat", wfe_exit_stat),
- VCPU_STAT("wfi_exit_stat", wfi_exit_stat),
- VCPU_STAT("mmio_exit_user", mmio_exit_user),
- VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel),
- VCPU_STAT("exits", exits),
- VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
- VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
- VCPU_STAT(halt_successful_poll),
- VCPU_STAT(halt_attempted_poll),
- VCPU_STAT(halt_poll_invalid),
- VCPU_STAT(halt_wakeup),
- VCPU_STAT(hvc_exit_stat),
- VCPU_STAT(wfe_exit_stat),
- VCPU_STAT(wfi_exit_stat),
- VCPU_STAT(mmio_exit_user),
- VCPU_STAT(mmio_exit_kernel),
- VCPU_STAT(exits),
- VCPU_STAT(halt_poll_success_ns),
- VCPU_STAT(halt_poll_fail_ns),
So we now have two arrays that can easily deviate in their order, whilst we didn't have that risk before. What is the advantage of doing this? The commit message doesn't really say...
[...]
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1b65e7204344..1ea297458306 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1162,6 +1162,18 @@ static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa) return kvm_is_error_hva(hva); } +#define VM_STAT_COUNT (sizeof(struct kvm_vm_stat)/sizeof(ulong)) +#define VCPU_STAT_COUNT (sizeof(struct kvm_vcpu_stat)/sizeof(u64)) +#define KVM_STATS_NAME_LEN 32
+/* Make sure it is synced with fields in struct kvm_vm_stat. */ +extern const char kvm_vm_stat_strings[][KVM_STATS_NAME_LEN]; +/* Make sure it is synced with fields in struct kvm_vcpu_stat. */ +extern const char kvm_vcpu_stat_strings[][KVM_STATS_NAME_LEN];
+#define VM_STAT_NAME(id) (kvm_vm_stat_strings[id]) +#define VCPU_STAT_NAME(id) (kvm_vcpu_stat_strings[id])
enum kvm_stat_kind { KVM_STAT_VM, KVM_STAT_VCPU, @@ -1182,10 +1194,21 @@ struct kvm_stats_debugfs_item { #define KVM_DBGFS_GET_MODE(dbgfs_item) \ ((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644) -#define VM_STAT(n, x, ...) \
- { n, offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ }
-#define VCPU_STAT(n, x, ...) \
- { n, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ }
+#define VM_STAT(x, ...) \
- { \
VM_STAT_NAME(offsetof(struct kvm_vm_stat, x)/sizeof(ulong)), \
offsetof(struct kvm, stat.x), \
KVM_STAT_VM, \
## __VA_ARGS__ \
- }
+#define VCPU_STAT(x, ...) \
- { \
VCPU_STAT_NAME(offsetof(struct kvm_vcpu_stat, x)/sizeof(u64)), \
offsetof(struct kvm_vcpu, stat.x), \
KVM_STAT_VCPU, \
## __VA_ARGS__ \
- }
Is there any reason why we want to keep kvm_vm_stat populated with ulong, while kvm_vcpu_stat is populated with u64? I have the feeling that this is a fairly pointless difference, and that some of the macros could be unified.
Also, using names initialisers would help with the readability of the macros.
Thanks,
M.
Hi Marc,
On Wed, Mar 10, 2021 at 8:19 AM Marc Zyngier maz@kernel.org wrote:
Hi Jing,
On Wed, 10 Mar 2021 00:30:21 +0000, Jing Zhang jingzhangos@google.com wrote:
Prepare the statistics name strings for supporting binary format aggregated statistics data retrieval.
No functional change intended.
Signed-off-by: Jing Zhang jingzhangos@google.com
arch/arm64/kvm/guest.c | 47 ++++-- arch/mips/kvm/mips.c | 114 ++++++++++---- arch/powerpc/kvm/book3s.c | 107 +++++++++---- arch/powerpc/kvm/booke.c | 84 +++++++--- arch/s390/kvm/kvm-s390.c | 320 ++++++++++++++++++++++++++------------ arch/x86/kvm/x86.c | 127 ++++++++++----- include/linux/kvm_host.h | 31 +++- 7 files changed, 589 insertions(+), 241 deletions(-)
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 9bbd30e62799..fb3aafe76b52 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -28,19 +28,42 @@
#include "trace.h"
+const char kvm_vm_stat_strings[][KVM_STATS_NAME_LEN] = {
"remote_tlb_flush",
+}; +static_assert(sizeof(kvm_vm_stat_strings) ==
VM_STAT_COUNT * KVM_STATS_NAME_LEN);
+const char kvm_vcpu_stat_strings[][KVM_STATS_NAME_LEN] = {
"halt_successful_poll",
"halt_attempted_poll",
"halt_poll_success_ns",
"halt_poll_fail_ns",
"halt_poll_invalid",
"halt_wakeup",
"hvc_exit_stat",
"wfe_exit_stat",
"wfi_exit_stat",
"mmio_exit_user",
"mmio_exit_kernel",
"exits",
+}; +static_assert(sizeof(kvm_vcpu_stat_strings) ==
VCPU_STAT_COUNT * KVM_STATS_NAME_LEN);
struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("halt_successful_poll", halt_successful_poll),
VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
VCPU_STAT("halt_wakeup", halt_wakeup),
VCPU_STAT("hvc_exit_stat", hvc_exit_stat),
VCPU_STAT("wfe_exit_stat", wfe_exit_stat),
VCPU_STAT("wfi_exit_stat", wfi_exit_stat),
VCPU_STAT("mmio_exit_user", mmio_exit_user),
VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel),
VCPU_STAT("exits", exits),
VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
VCPU_STAT(halt_successful_poll),
VCPU_STAT(halt_attempted_poll),
VCPU_STAT(halt_poll_invalid),
VCPU_STAT(halt_wakeup),
VCPU_STAT(hvc_exit_stat),
VCPU_STAT(wfe_exit_stat),
VCPU_STAT(wfi_exit_stat),
VCPU_STAT(mmio_exit_user),
VCPU_STAT(mmio_exit_kernel),
VCPU_STAT(exits),
VCPU_STAT(halt_poll_success_ns),
VCPU_STAT(halt_poll_fail_ns),
So we now have two arrays that can easily deviate in their order, whilst we didn't have that risk before. What is the advantage of doing this? The commit message doesn't really say...
You are right about the risk here. The new string array would be returned to userspace by the new Ioctl API. I didn't figure out any other good way for this. Will add this into commit message.
[...]
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1b65e7204344..1ea297458306 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1162,6 +1162,18 @@ static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa) return kvm_is_error_hva(hva); }
+#define VM_STAT_COUNT (sizeof(struct kvm_vm_stat)/sizeof(ulong)) +#define VCPU_STAT_COUNT (sizeof(struct kvm_vcpu_stat)/sizeof(u64)) +#define KVM_STATS_NAME_LEN 32
+/* Make sure it is synced with fields in struct kvm_vm_stat. */ +extern const char kvm_vm_stat_strings[][KVM_STATS_NAME_LEN]; +/* Make sure it is synced with fields in struct kvm_vcpu_stat. */ +extern const char kvm_vcpu_stat_strings[][KVM_STATS_NAME_LEN];
+#define VM_STAT_NAME(id) (kvm_vm_stat_strings[id]) +#define VCPU_STAT_NAME(id) (kvm_vcpu_stat_strings[id])
enum kvm_stat_kind { KVM_STAT_VM, KVM_STAT_VCPU, @@ -1182,10 +1194,21 @@ struct kvm_stats_debugfs_item { #define KVM_DBGFS_GET_MODE(dbgfs_item) \ ((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644)
-#define VM_STAT(n, x, ...) \
{ n, offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ }
-#define VCPU_STAT(n, x, ...) \
{ n, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ }
+#define VM_STAT(x, ...) \
{ \
VM_STAT_NAME(offsetof(struct kvm_vm_stat, x)/sizeof(ulong)), \
offsetof(struct kvm, stat.x), \
KVM_STAT_VM, \
## __VA_ARGS__ \
}
+#define VCPU_STAT(x, ...) \
{ \
VCPU_STAT_NAME(offsetof(struct kvm_vcpu_stat, x)/sizeof(u64)), \
offsetof(struct kvm_vcpu, stat.x), \
KVM_STAT_VCPU, \
## __VA_ARGS__ \
}
Is there any reason why we want to keep kvm_vm_stat populated with ulong, while kvm_vcpu_stat is populated with u64? I have the feeling that this is a fairly pointless difference, and that some of the macros could be unified.
The use of ulong for vm stats is to avoid the overhead of atomics, since vm stats could potentially be updated by multiple vcpus from that vm at a time. Check commit 8a7e75d47b68193339f8727cf4503271d0a0b1d0 for details.
Also, using names initialisers would help with the readability of the macros.
Sure, will do.
Thanks,
M.
-- Without deviation from the norm, progress is not possible.
Thanks, Jing
Define ioctl commands for VM/vCPU aggregated statistics data retrieval in binary format and update corresponding API documentation.
The capability and ioctl are not enabled for now. No functional change intended.
Signed-off-by: Jing Zhang jingzhangos@google.com --- Documentation/virt/kvm/api.rst | 79 ++++++++++++++++++++++++++++++++++ include/linux/kvm_host.h | 1 - include/uapi/linux/kvm.h | 60 ++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 1 deletion(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 1a2b5210cdbf..aa4b5270c966 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4938,6 +4938,76 @@ see KVM_XEN_VCPU_SET_ATTR above. The KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST type may not be used with the KVM_XEN_VCPU_GET_ATTR ioctl.
+4.131 KVM_STATS_GET_INFO +------------------------ + +:Capability: KVM_CAP_STATS_BINARY_FORM +:Architectures: all +:Type: vm ioctl, vcpu ioctl +:Parameters: struct kvm_stats_info (out) +:Returns: 0 on success, < 0 on error + +:: + + struct kvm_stats_info { + __u32 num_stats; + }; + +This ioctl is used to get the information about VM or vCPU statistics data. +The number of statistics data would be returned in field num_stats in +struct kvm_stats_info. This ioctl only needs to be called once on running +VMs on the same architecture. + +4.132 KVM_STATS_GET_NAMES +------------------------- + +:Capability: KVM_CAP_STATS_BINARY_FORM +:Architectures: all +:Type: vm ioctl, vcpu ioctl +:Parameters: struct kvm_stats_names (in/out) +:Returns: 0 on success, < 0 on error + +:: + + #define KVM_STATS_NAME_LEN 32 + struct kvm_stats_names { + __u32 size; + __u8 names[0]; + }; + +This ioctl is used to get the string names of all the statistics data for VM +or vCPU. Users must use KVM_STATS_GET_INFO to find the number of statistics. +They must allocate a buffer of the size num_stats * KVM_STATS_NAME_LEN +immediately following struct kvm_stats_names. The size field of kvm_stats_names +must contain the buffer size as an input. +The buffer can be treated like a string array; each name string is null-padded +to a size of KVM_STATS_NAME_LEN. +This ioctl only needs to be called once on running VMs on the same architecture. + +4.133 KVM_STATS_GET_DATA +------------------------- + +:Capability: KVM_CAP_STATS_BINARY_FORM +:Architectures: all +:Type: vm ioctl, vcpu ioctl +:Parameters: struct kvm_stats_data (in/out) +:Returns: 0 on success, < 0 on error + +:: + + struct kvm_stats_data { + __u32 size; + __u64 data[0]; + }; + +This ioctl is used to get the aggregated statistics data for VM or vCPU. +Users must use KVM_STATS_GET_INFO to find the number of statistics. +They must allocate a buffer of the appropriate size num_stats * sizeof(data[0]) +immediately following struct kvm_stats_data. 
The size field of kvm_stats_data +must contain the buffer size as an input. +The data buffer 1-1 maps to name strings buffer in sequential order. +This ioctl is usually called periodically to pull statistics data. + 5. The kvm_run structure ========================
@@ -6721,3 +6791,12 @@ vcpu_info is set. The KVM_XEN_HVM_CONFIG_RUNSTATE flag indicates that the runstate-related features KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR/_CURRENT/_DATA/_ADJUST are supported by the KVM_XEN_VCPU_SET_ATTR/KVM_XEN_VCPU_GET_ATTR ioctls. + +8.31 KVM_CAP_STATS_BINARY_FORM +------------------------------ + +:Architectures: all + +This capability indicates that KVM supports retrieving aggregated statistics +data in binary format with corresponding VM/VCPU ioctl KVM_STATS_GET_INFO, +KVM_STATS_GET_NAMES and KVM_STATS_GET_DATA. diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1ea297458306..f2dabf457717 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1164,7 +1164,6 @@ static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa)
#define VM_STAT_COUNT (sizeof(struct kvm_vm_stat)/sizeof(ulong)) #define VCPU_STAT_COUNT (sizeof(struct kvm_vcpu_stat)/sizeof(u64)) -#define KVM_STATS_NAME_LEN 32
/* Make sure it is synced with fields in struct kvm_vm_stat. */ extern const char kvm_vm_stat_strings[][KVM_STATS_NAME_LEN]; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f6afee209620..ad185d4c5e42 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1078,6 +1078,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_DIRTY_LOG_RING 192 #define KVM_CAP_X86_BUS_LOCK_EXIT 193 #define KVM_CAP_PPC_DAWR1 194 +#define KVM_CAP_STATS_BINARY_FORM 195
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1853,4 +1854,63 @@ struct kvm_dirty_gfn { #define KVM_BUS_LOCK_DETECTION_OFF (1 << 0) #define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1)
+/* Available with KVM_CAP_STATS_BINARY_FORM */ + +#define KVM_STATS_NAME_LEN 32 + +/** + * struct kvm_stats_info - statistics information + * + * Used as parameter for ioctl %KVM_STATS_GET_INFO. + * + * @num_stats: On return, the number of statistics data of vm or vcpu. + * + */ +struct kvm_stats_info { + __u32 num_stats; +}; + +/** + * struct kvm_stats_names - string list of statistics names + * + * Used as parameter for ioctl %KVM_STATS_GET_NAMES. + * + * @size: Input from user, indicating the size of buffer after the structure. + * @names: Buffer of name string list for vm or vcpu. Each string is + * null-padded to a size of %KVM_STATS_NAME_LEN. + * + * Users must use %KVM_STATS_GET_INFO to find the number of + * statistics. They must allocate a buffer of the appropriate + * size (>= &struct kvm_stats_info @num_stats * %KVM_STATS_NAME_LEN) + * immediately following this structure. + */ +struct kvm_stats_names { + __u32 size; + __u8 names[0]; +}; + +/** + * struct kvm_stats_data - statistics data array + * + * Used as parameter for ioctl %KVM_STATS_GET_DATA. + * + * @size: Input from user, indicating the size of buffer after the structure. + * @data: Buffer of statistics data for vm or vcpu. + * + * Users must use %KVM_STATS_GET_INFO to find the number of + * statistics. They must allocate a buffer of the appropriate + * size (>= &struct kvm_stats_info @num_stats * sizeof(@data[0])) + * immediately following this structure. + * &struct kvm_stats_names @names 1-1 maps to &struct kvm_stats_data + * @data in sequential order. + */ +struct kvm_stats_data { + __u32 size; + __u64 data[0]; +}; + +#define KVM_STATS_GET_INFO _IOR(KVMIO, 0xcc, struct kvm_stats_info) +#define KVM_STATS_GET_NAMES _IOR(KVMIO, 0xcd, struct kvm_stats_names) +#define KVM_STATS_GET_DATA _IOR(KVMIO, 0xce, struct kvm_stats_data) + #endif /* __LINUX_KVM_H */
On Wed, 10 Mar 2021 00:30:22 +0000, Jing Zhang jingzhangos@google.com wrote:
Define ioctl commands for VM/vCPU aggregated statistics data retrieval in binary format and update corresponding API documentation.
The capability and ioctl are not enabled for now. No functional change intended.
Signed-off-by: Jing Zhang jingzhangos@google.com
Documentation/virt/kvm/api.rst | 79 ++++++++++++++++++++++++++++++++++ include/linux/kvm_host.h | 1 - include/uapi/linux/kvm.h | 60 ++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 1 deletion(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 1a2b5210cdbf..aa4b5270c966 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4938,6 +4938,76 @@ see KVM_XEN_VCPU_SET_ATTR above. The KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST type may not be used with the KVM_XEN_VCPU_GET_ATTR ioctl. +4.131 KVM_STATS_GET_INFO +------------------------
+:Capability: KVM_CAP_STATS_BINARY_FORM +:Architectures: all +:Type: vm ioctl, vcpu ioctl +:Parameters: struct kvm_stats_info (out) +:Returns: 0 on success, < 0 on error
Missing description of the errors (this is throughout the document).
+::
- struct kvm_stats_info {
__u32 num_stats;
- };
+This ioctl is used to get the information about VM or vCPU statistics data. +The number of statistics data would be returned in field num_stats in +struct kvm_stats_info. This ioctl only needs to be called once on running +VMs on the same architecture.
Is this allowed to be variable across VMs? Or is that a constant for a given host system boot?
Given that this returns a single field, is there any value in copying this structure across? Could it be returned by the ioctl itself instead, at the expense of only being a 31bit value?
+4.132 KVM_STATS_GET_NAMES +-------------------------
+:Capability: KVM_CAP_STATS_BINARY_FORM +:Architectures: all +:Type: vm ioctl, vcpu ioctl +:Parameters: struct kvm_stats_names (in/out) +:Returns: 0 on success, < 0 on error
+::
- #define KVM_STATS_NAME_LEN 32
- struct kvm_stats_names {
- __u32 size;
- __u8 names[0];
- };
+This ioctl is used to get the string names of all the statistics data for VM +or vCPU. Users must use KVM_STATS_GET_INFO to find the number of statistics. +They must allocate a buffer of the size num_stats * KVM_STATS_NAME_LEN +immediately following struct kvm_stats_names. The size field of kvm_stats_names +must contain the buffer size as an input.
What is the unit for the buffer size? bytes? or number of "names"?
+The buffer can be treated like a string array; each name string is null-padded +to a size of KVM_STATS_NAME_LEN.
Is this allowed to query fewer strings than described by kvm_stats_info? If it isn't, I question the need for the "size" field. Either there is enough space in the buffer passed by userspace, or -EFAULT is returned.
+This ioctl only needs to be called once on running VMs on the same architecture.
Same question about the immutability of these names.
+4.133 KVM_STATS_GET_DATA +-------------------------
+:Capability: KVM_CAP_STATS_BINARY_FORM +:Architectures: all +:Type: vm ioctl, vcpu ioctl +:Parameters: struct kvm_stats_data (in/out) +:Returns: 0 on success, < 0 on error
+::
- struct kvm_stats_data {
- __u32 size;
Same question about the actual need for this field.
- __u64 data[0];
So userspace always sees a 64bit quantify per stat. My earlier comment about the ulong/u64 discrepancy stands! ;-)
- };
+This ioctl is used to get the aggregated statistics data for VM or vCPU. +Users must use KVM_STATS_GET_INFO to find the number of statistics. +They must allocate a buffer of the appropriate size num_stats * sizeof(data[0]) +immediately following struct kvm_stats_data. The size field of kvm_stats_data +must contain the buffer size as an input. +The data buffer 1-1 maps to name strings buffer in sequential order. +This ioctl is usually called periodically to pull statistics data.
Is there any provision to reset the counters on read?
- The kvm_run structure
======================== @@ -6721,3 +6791,12 @@ vcpu_info is set. The KVM_XEN_HVM_CONFIG_RUNSTATE flag indicates that the runstate-related features KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR/_CURRENT/_DATA/_ADJUST are supported by the KVM_XEN_VCPU_SET_ATTR/KVM_XEN_VCPU_GET_ATTR ioctls.
+8.31 KVM_CAP_STATS_BINARY_FORM +------------------------------
+:Architectures: all
+This capability indicates that KVM supports retrieving aggregated statistics +data in binary format with corresponding VM/VCPU ioctl KVM_STATS_GET_INFO, +KVM_STATS_GET_NAMES and KVM_STATS_GET_DATA.
nit: for ease of reviewing, consider splitting the documentation in a separate patch.
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1ea297458306..f2dabf457717 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1164,7 +1164,6 @@ static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa) #define VM_STAT_COUNT (sizeof(struct kvm_vm_stat)/sizeof(ulong)) #define VCPU_STAT_COUNT (sizeof(struct kvm_vcpu_stat)/sizeof(u64)) -#define KVM_STATS_NAME_LEN 32 /* Make sure it is synced with fields in struct kvm_vm_stat. */ extern const char kvm_vm_stat_strings[][KVM_STATS_NAME_LEN]; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f6afee209620..ad185d4c5e42 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1078,6 +1078,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_DIRTY_LOG_RING 192 #define KVM_CAP_X86_BUS_LOCK_EXIT 193 #define KVM_CAP_PPC_DAWR1 194 +#define KVM_CAP_STATS_BINARY_FORM 195 #ifdef KVM_CAP_IRQ_ROUTING @@ -1853,4 +1854,63 @@ struct kvm_dirty_gfn { #define KVM_BUS_LOCK_DETECTION_OFF (1 << 0) #define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1) +/* Available with KVM_CAP_STATS_BINARY_FORM */
+#define KVM_STATS_NAME_LEN 32
+/**
- struct kvm_stats_info - statistics information
- Used as parameter for ioctl %KVM_STATS_GET_INFO.
- @num_stats: On return, the number of statistics data of vm or vcpu.
- */
+struct kvm_stats_info {
- __u32 num_stats;
+};
+/**
- struct kvm_stats_names - string list of statistics names
- Used as parameter for ioctl %KVM_STATS_GET_NAMES.
- @size: Input from user, indicating the size of buffer after the structure.
- @names: Buffer of name string list for vm or vcpu. Each string is
- null-padded to a size of %KVM_STATS_NAME_LEN.
- Users must use %KVM_STATS_GET_INFO to find the number of
- statistics. They must allocate a buffer of the appropriate
- size (>= &struct kvm_stats_info @num_stats * %KVM_STATS_NAME_LEN)
- immediately following this structure.
- */
+struct kvm_stats_names {
- __u32 size;
- __u8 names[0];
+};
+/**
- struct kvm_stats_data - statistics data array
- Used as parameter for ioctl %KVM_STATS_GET_DATA.
- @size: Input from user, indicating the size of buffer after the structure.
- @data: Buffer of statistics data for vm or vcpu.
- Users must use %KVM_STATS_GET_INFO to find the number of
- statistics. They must allocate a buffer of the appropriate
- size (>= &struct kvm_stats_info @num_stats * sizeof(@data[0]))
- immediately following this structure.
- &struct kvm_stats_names @names 1-1 maps to &struct kvm_stats_data
- @data in sequential order.
- */
+struct kvm_stats_data {
- __u32 size;
- __u64 data[0];
+};
+#define KVM_STATS_GET_INFO _IOR(KVMIO, 0xcc, struct kvm_stats_info) +#define KVM_STATS_GET_NAMES _IOR(KVMIO, 0xcd, struct kvm_stats_names) +#define KVM_STATS_GET_DATA _IOR(KVMIO, 0xce, struct kvm_stats_data)
#endif /* __LINUX_KVM_H */
2.30.1.766.gb4fecdf3b7-goog
Thanks,
M.
Hi Marc,
On Wed, Mar 10, 2021 at 8:58 AM Marc Zyngier maz@kernel.org wrote:
On Wed, 10 Mar 2021 00:30:22 +0000, Jing Zhang jingzhangos@google.com wrote:
Define ioctl commands for VM/vCPU aggregated statistics data retrieval in binary format and update corresponding API documentation.
The capability and ioctl are not enabled for now. No functional change intended.
Signed-off-by: Jing Zhang jingzhangos@google.com
Documentation/virt/kvm/api.rst | 79 ++++++++++++++++++++++++++++++++++ include/linux/kvm_host.h | 1 - include/uapi/linux/kvm.h | 60 ++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 1 deletion(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 1a2b5210cdbf..aa4b5270c966 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4938,6 +4938,76 @@ see KVM_XEN_VCPU_SET_ATTR above. The KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST type may not be used with the KVM_XEN_VCPU_GET_ATTR ioctl.
+4.131 KVM_STATS_GET_INFO +------------------------
+:Capability: KVM_CAP_STATS_BINARY_FORM +:Architectures: all +:Type: vm ioctl, vcpu ioctl +:Parameters: struct kvm_stats_info (out) +:Returns: 0 on success, < 0 on error
Missing description of the errors (this is throughout the document).
Will add errors description.
+::
- struct kvm_stats_info {
__u32 num_stats;
- };
+This ioctl is used to get the information about VM or vCPU statistics data. +The number of statistics data would be returned in field num_stats in +struct kvm_stats_info. This ioctl only needs to be called once on running +VMs on the same architecture.
Is this allowed to be variable across VMs? Or is that a constant for a given host system boot?
It is a constant for a given host system boot.
Given that this returns a single field, is there any value in copying this structure across? Could it be returned by the ioctl itself instead, at the expense of only being a 31bit value?
It is done in this way for potential information we need in the future. One candidate is the length of stats names. I am considering returning the length of the name string in this info structure instead of as a constant exported by uapi, which would be more flexible and put no limit on the length of stats names.
+4.132 KVM_STATS_GET_NAMES +-------------------------
+:Capability: KVM_CAP_STATS_BINARY_FORM +:Architectures: all +:Type: vm ioctl, vcpu ioctl +:Parameters: struct kvm_stats_names (in/out) +:Returns: 0 on success, < 0 on error
+::
- #define KVM_STATS_NAME_LEN 32
- struct kvm_stats_names {
__u32 size;
__u8 names[0];
- };
+This ioctl is used to get the string names of all the statistics data for VM +or vCPU. Users must use KVM_STATS_GET_INFO to find the number of statistics. +They must allocate a buffer of the size num_stats * KVM_STATS_NAME_LEN +immediately following struct kvm_stats_names. The size field of kvm_stats_name +must contain the buffer size as an input.
What is the unit for the buffer size? bytes? or number of "names"?
The unit for the buffer size is bytes. Will clearly indicate the unit.
+The buffer can be treated like a string array, each name string is null-padded +to a size of KVM_STATS_NAME_LEN;
Is this allowed to query fewer strings than described by kvm_stats_info? If it isn't, I question the need for the "size" field. Either there is enough space in the buffer passed by userspace, or -EFAULT is returned.
It isn't. Will remove the size field.
+This ioctl only needs to be called once on running VMs on the same architecture.
Same question about the immutability of these names.
The names are immutable for a given system boot.
+4.133 KVM_STATS_GET_DATA +-------------------------
+:Capability: KVM_CAP_STATS_BINARY_FORM +:Architectures: all +:Type: vm ioctl, vcpu ioctl +:Parameters: struct kvm_stats_data (in/out) +:Returns: 0 on success, < 0 on error
+::
- struct kvm_stats_data {
__u32 size;
Same question about the actual need for this field.
Will remove size field.
__u64 data[0];
So userspace always sees a 64bit quantify per stat. My earlier comment about the ulong/u64 discrepancy stands! ;-)
Yes, userspace always sees a 64 bit, but the ulong in KVM would be 64bit or 32bit.
- };
+This ioctl is used to get the aggregated statistics data for VM or vCPU. +Users must use KVM_STATS_GET_INFO to find the number of statistics. +They must allocate a buffer of the appropriate size num_stats * sizeof(data[0]) +immediately following struct kvm_stats_data. The size field of kvm_stats_data +must contain the buffer size as an input. +The data buffer 1-1 maps to name strings buffer in sequential order. +This ioctl is usually called periodically to pull statistics data.
Is there any provision to reset the counters on read?
Nope.
- The kvm_run structure
========================
@@ -6721,3 +6791,12 @@ vcpu_info is set. The KVM_XEN_HVM_CONFIG_RUNSTATE flag indicates that the runstate-related features KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR/_CURRENT/_DATA/_ADJUST are supported by the KVM_XEN_VCPU_SET_ATTR/KVM_XEN_VCPU_GET_ATTR ioctls.
+8.31 KVM_CAP_STATS_BINARY_FORM +------------------------------
+:Architectures: all
+This capability indicates that KVM supports retrieving aggregated statistics +data in binary format with corresponding VM/VCPU ioctl KVM_STATS_GET_INFO, +KVM_STATS_GET_NAMES and KVM_STATS_GET_DATA.
nit: for ease of reviewing, consider splitting the documentation in a separate patch.
Will do.
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1ea297458306..f2dabf457717 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1164,7 +1164,6 @@ static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa)
#define VM_STAT_COUNT (sizeof(struct kvm_vm_stat)/sizeof(ulong)) #define VCPU_STAT_COUNT (sizeof(struct kvm_vcpu_stat)/sizeof(u64)) -#define KVM_STATS_NAME_LEN 32
/* Make sure it is synced with fields in struct kvm_vm_stat. */ extern const char kvm_vm_stat_strings[][KVM_STATS_NAME_LEN]; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f6afee209620..ad185d4c5e42 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1078,6 +1078,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_DIRTY_LOG_RING 192 #define KVM_CAP_X86_BUS_LOCK_EXIT 193 #define KVM_CAP_PPC_DAWR1 194 +#define KVM_CAP_STATS_BINARY_FORM 195
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1853,4 +1854,63 @@ struct kvm_dirty_gfn { #define KVM_BUS_LOCK_DETECTION_OFF (1 << 0) #define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1)
+/* Available with KVM_CAP_STATS_BINARY_FORM */
+#define KVM_STATS_NAME_LEN 32
+/**
- struct kvm_stats_info - statistics information
- Used as parameter for ioctl %KVM_STATS_GET_INFO.
- @num_stats: On return, the number of statistics data of vm or vcpu.
- */
+struct kvm_stats_info {
__u32 num_stats;
+};
+/**
- struct kvm_stats_names - string list of statistics names
- Used as parameter for ioctl %KVM_STATS_GET_NAMES.
- @size: Input from user, indicating the size of buffer after the structure.
- @names: Buffer of name string list for vm or vcpu. Each string is
- null-padded to a size of %KVM_STATS_NAME_LEN.
- Users must use %KVM_STATS_GET_INFO to find the number of
- statistics. They must allocate a buffer of the appropriate
- size (>= &struct kvm_stats_info @num_stats * %KVM_STATS_NAME_LEN)
- immediately following this structure.
- */
+struct kvm_stats_names {
__u32 size;
__u8 names[0];
+};
+/**
- struct kvm_stats_data - statistics data array
- Used as parameter for ioctl %KVM_STATS_GET_DATA.
- @size: Input from user, indicating the size of buffer after the structure.
- @data: Buffer of statistics data for vm or vcpu.
- Users must use %KVM_STATS_GET_INFO to find the number of
- statistics. They must allocate a buffer of the appropriate
- size (>= &struct kvm_stats_info @num_stats * sizeof(@data[0]))
- immediately following this structure.
- &struct kvm_stats_names @names 1-1 maps to &struct kvm_stats_data
- @data in sequential order.
- */
+struct kvm_stats_data {
__u32 size;
__u64 data[0];
+};
+#define KVM_STATS_GET_INFO _IOR(KVMIO, 0xcc, struct kvm_stats_info) +#define KVM_STATS_GET_NAMES _IOR(KVMIO, 0xcd, struct kvm_stats_names) +#define KVM_STATS_GET_DATA _IOR(KVMIO, 0xce, struct kvm_stats_data)
#endif /* __LINUX_KVM_H */
2.30.1.766.gb4fecdf3b7-goog
Thanks,
M.
-- Without deviation from the norm, progress is not possible.
Thanks, Jing
Three ioctl commands are added to support binary form statistics data retrieval. KVM_STATS_GET_INFO, KVM_STATS_GET_NAMES, KVM_STATS_GET_DATA. KVM_CAP_STATS_BINARY_FORM indicates the capability.
Signed-off-by: Jing Zhang jingzhangos@google.com --- virt/kvm/kvm_main.c | 115 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 383df23514b9..87dd62516c8b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3464,6 +3464,51 @@ static long kvm_vcpu_ioctl(struct file *filp, r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); break; } + case KVM_STATS_GET_INFO: { + struct kvm_stats_info stats_info; + + r = -EFAULT; + stats_info.num_stats = VCPU_STAT_COUNT; + if (copy_to_user(argp, &stats_info, sizeof(stats_info))) + goto out; + r = 0; + break; + } + case KVM_STATS_GET_NAMES: { + struct kvm_stats_names stats_names; + + r = -EFAULT; + if (copy_from_user(&stats_names, argp, sizeof(stats_names))) + goto out; + r = -EINVAL; + if (stats_names.size < VCPU_STAT_COUNT * KVM_STATS_NAME_LEN) + goto out; + + r = -EFAULT; + if (copy_to_user(argp + sizeof(stats_names), + kvm_vcpu_stat_strings, + VCPU_STAT_COUNT * KVM_STATS_NAME_LEN)) + goto out; + r = 0; + break; + } + case KVM_STATS_GET_DATA: { + struct kvm_stats_data stats_data; + + r = -EFAULT; + if (copy_from_user(&stats_data, argp, sizeof(stats_data))) + goto out; + r = -EINVAL; + if (stats_data.size < sizeof(vcpu->stat)) + goto out; + + r = -EFAULT; + argp += sizeof(stats_data); + if (copy_to_user(argp, &vcpu->stat, sizeof(vcpu->stat))) + goto out; + r = 0; + break; + } default: r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); } @@ -3695,6 +3740,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_CHECK_EXTENSION_VM: case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_HALT_POLL: + case KVM_CAP_STATS_BINARY_FORM: return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: @@ -3825,6 +3871,40 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, } }
+static long kvm_vm_ioctl_stats_get_data(struct kvm *kvm, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct kvm_vcpu *vcpu; + struct kvm_stats_data stats_data; + u64 *data = NULL, *pdata; + int i, j, ret = 0; + size_t dsize = (VM_STAT_COUNT + VCPU_STAT_COUNT) * sizeof(*data); + + + if (copy_from_user(&stats_data, argp, sizeof(stats_data))) + return -EFAULT; + if (stats_data.size < dsize) + return -EINVAL; + data = kzalloc(dsize, GFP_KERNEL_ACCOUNT); + if (!data) + return -ENOMEM; + + for (i = 0; i < VM_STAT_COUNT; i++) + *(data + i) = *((ulong *)&kvm->stat + i); + + kvm_for_each_vcpu(j, vcpu, kvm) { + pdata = data + VM_STAT_COUNT; + for (i = 0; i < VCPU_STAT_COUNT; i++, pdata++) + *pdata += *((u64 *)&vcpu->stat + i); + } + + if (copy_to_user(argp + sizeof(stats_data), data, dsize)) + ret = -EFAULT; + + kfree(data); + return ret; +} + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4001,6 +4081,41 @@ static long kvm_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_STATS_GET_INFO: { + struct kvm_stats_info stats_info; + + r = -EFAULT; + stats_info.num_stats = VM_STAT_COUNT + VCPU_STAT_COUNT; + if (copy_to_user(argp, &stats_info, sizeof(stats_info))) + goto out; + r = 0; + break; + } + case KVM_STATS_GET_NAMES: { + struct kvm_stats_names stats_names; + + r = -EFAULT; + if (copy_from_user(&stats_names, argp, sizeof(stats_names))) + goto out; + r = -EINVAL; + if (stats_names.size < + (VM_STAT_COUNT + VCPU_STAT_COUNT) * KVM_STATS_NAME_LEN) + goto out; + r = -EFAULT; + argp += sizeof(stats_names); + if (copy_to_user(argp, kvm_vm_stat_strings, + VM_STAT_COUNT * KVM_STATS_NAME_LEN)) + goto out; + argp += VM_STAT_COUNT * KVM_STATS_NAME_LEN; + if (copy_to_user(argp, kvm_vcpu_stat_strings, + VCPU_STAT_COUNT * KVM_STATS_NAME_LEN)) + goto out; + r = 0; + break; + } + case KVM_STATS_GET_DATA: + r = kvm_vm_ioctl_stats_get_data(kvm, arg); + break; case KVM_CHECK_EXTENSION: r = 
kvm_vm_ioctl_check_extension_generic(kvm, arg); break;
On 10/03/21 01:30, Jing Zhang wrote:
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 383df23514b9..87dd62516c8b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3464,6 +3464,51 @@ static long kvm_vcpu_ioctl(struct file *filp, r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); break; }
- case KVM_STATS_GET_INFO: {
struct kvm_stats_info stats_info;
r = -EFAULT;
stats_info.num_stats = VCPU_STAT_COUNT;
if (copy_to_user(argp, &stats_info, sizeof(stats_info)))
goto out;
r = 0;
break;
- }
- case KVM_STATS_GET_NAMES: {
struct kvm_stats_names stats_names;
r = -EFAULT;
if (copy_from_user(&stats_names, argp, sizeof(stats_names)))
goto out;
r = -EINVAL;
if (stats_names.size < VCPU_STAT_COUNT * KVM_STATS_NAME_LEN)
goto out;
r = -EFAULT;
if (copy_to_user(argp + sizeof(stats_names),
kvm_vcpu_stat_strings,
VCPU_STAT_COUNT * KVM_STATS_NAME_LEN))
The only reason to separate the strings in patch 1 is to pass them here. But this is a poor API because it imposes a limit on the length of the statistics, and makes that length part of the binary interface.
I would prefer a completely different interface, where you have a file descriptor that can be created and associated to a vCPU or VM (or even to /dev/kvm). Having a file descriptor is important because the fd can be passed to a less-privileged process that takes care of gathering the metrics
The result of reading the file descriptor could be either ASCII or binary. IMO the real cost lies in opening and reading a multitude of files rather than in the ASCII<->binary conversion.
The format could be one of the following:
* binary:
4 bytes flags (always zero) 4 bytes number of statistics 4 bytes offset of the first stat description 4 bytes offset of the first stat value stat descriptions: - 4 bytes for the type (for now always zero: uint64_t) - 4 bytes for the flags (for now always zero) - length of name - name statistics in 64-bit format
* text:
stat1_name uint64 123 stat2_name uint64 456 ...
What do you think?
Paolo
Hi Paolo,
On Wed, Mar 10, 2021 at 8:55 AM Paolo Bonzini pbonzini@redhat.com wrote:
On 10/03/21 01:30, Jing Zhang wrote:
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 383df23514b9..87dd62516c8b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3464,6 +3464,51 @@ static long kvm_vcpu_ioctl(struct file *filp, r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); break; }
case KVM_STATS_GET_INFO: {
struct kvm_stats_info stats_info;
r = -EFAULT;
stats_info.num_stats = VCPU_STAT_COUNT;
if (copy_to_user(argp, &stats_info, sizeof(stats_info)))
goto out;
r = 0;
break;
}
case KVM_STATS_GET_NAMES: {
struct kvm_stats_names stats_names;
r = -EFAULT;
if (copy_from_user(&stats_names, argp, sizeof(stats_names)))
goto out;
r = -EINVAL;
if (stats_names.size < VCPU_STAT_COUNT * KVM_STATS_NAME_LEN)
goto out;
r = -EFAULT;
if (copy_to_user(argp + sizeof(stats_names),
kvm_vcpu_stat_strings,
VCPU_STAT_COUNT * KVM_STATS_NAME_LEN))
The only reason to separate the strings in patch 1 is to pass them here. But this is a poor API because it imposes a limit on the length of the statistics, and makes that length part of the binary interface.
Agreed. I am considering returning the length of stats name strings in the kvm_stats_info structure instead of exporting it as a constant in uapi, which would put no limit on the length of the stats name strings.
I would prefer a completely different interface, where you have a file descriptor that can be created and associated to a vCPU or VM (or even to /dev/kvm). Having a file descriptor is important because the fd can be passed to a less-privileged process that takes care of gathering the metrics
Separate file descriptor solution is very tempting. We are still considering it seriously. Our biggest concern is that the metrics gathering/handling process is not necessarily running on the same node as the one the file descriptor belongs to. It scales better to pass metrics data directly than to pass file descriptors.
The result of reading the file descriptor could be either ASCII or binary. IMO the real cost lies in opening and reading a multitude of files rather than in the ASCII<->binary conversion.
Agreed.
The format could be one of the following:
- binary:
4 bytes flags (always zero) 4 bytes number of statistics 4 bytes offset of the first stat description 4 bytes offset of the first stat value stat descriptions:
- 4 bytes for the type (for now always zero: uint64_t)
- 4 bytes for the flags (for now always zero)
- length of name
- name
statistics in 64-bit format
- text:
stat1_name uint64 123 stat2_name uint64 456 ...
What do you think?
The binary format presented above is very flexible. I understand why it is organized this way. In our situation, the metrics data could be pulled periodically, at intervals as short as half a second. They are used by different kinds of monitors/triggers/alerts. To enhance efficiency and reduce traffic caused by metrics passing, we treat all metrics info/data as two kinds. One is immutable information, which doesn't change in a given system boot. The other is mutable data (statistics data), which is pulled/transferred periodically at a high frequency.
Paolo
Thanks, Jing
On 10/03/21 22:41, Jing Zhang wrote:
I would prefer a completely different interface, where you have a file descriptor that can be created and associated to a vCPU or VM (or even to /dev/kvm). Having a file descriptor is important because the fd can be passed to a less-privileged process that takes care of gathering the metrics
Separate file descriptor solution is very tempting. We are still considering it seriously. Our biggest concern is that the metrics gathering/handling process is not necessary running on the same node as the one file descriptor belongs to. It scales better to pass metrics data directly than to pass file descriptors.
If you want to pass metrics data directly, you can just read the file descriptor from your VMM, just like you're using the ioctls now. However the file descriptor also allows a privilege-separated same-host interface.
4 bytes flags (always zero) 4 bytes number of statistics 4 bytes offset of the first stat description 4 bytes offset of the first stat value stat descriptions:
- 4 bytes for the type (for now always zero: uint64_t)
- 4 bytes for the flags (for now always zero)
- length of name
- name
statistics in 64-bit format
The binary format presented above is very flexible. I understand why it is organized this way. In our situation, the metrics data could be pulled periodically as short as half second. They are used by different kinds of monitors/triggers/alerts. To enhance efficiency and reduce traffic caused by metrics passing, we treat all metrics info/data as two kinds. One is immutable information, which doesn't change in a given system boot. The other is mutable data (statistics data), which is pulled/transferred periodically at a high frequency.
The format allows to place the values before the descriptions. So you could use pread to only read the first part of the file descriptor, and the file_operations implementation would then skip the work of building the immutable data. It doesn't have to be implemented from the beginning like that, but the above format supports it.
Paolo
Hi Paolo,
On Fri, Mar 12, 2021 at 12:11 PM Paolo Bonzini pbonzini@redhat.com wrote:
On 10/03/21 22:41, Jing Zhang wrote:
I would prefer a completely different interface, where you have a file descriptor that can be created and associated to a vCPU or VM (or even to /dev/kvm). Having a file descriptor is important because the fd can be passed to a less-privileged process that takes care of gathering the metrics
Separate file descriptor solution is very tempting. We are still considering it seriously. Our biggest concern is that the metrics gathering/handling process is not necessary running on the same node as the one file descriptor belongs to. It scales better to pass metrics data directly than to pass file descriptors.
If you want to pass metrics data directly, you can just read the file descriptor from your VMM, just like you're using the ioctls now. However the file descriptor also allows a privilege-separated same-host interface.
It makes sense.
4 bytes flags (always zero)
Could you give some potential use for this flag?
4 bytes number of statistics 4 bytes offset of the first stat description 4 bytes offset of the first stat value stat descriptions:
- 4 bytes for the type (for now always zero: uint64_t)
Potential use for this type? Should we move this outside descriptor? Since all stats probably have the same size.
- 4 bytes for the flags (for now always zero)
Potential use for this flag?
- length of name
- name
statistics in 64-bit format
The binary format presented above is very flexible. I understand why it is organized this way. In our situation, the metrics data could be pulled periodically as short as half second. They are used by different kinds of monitors/triggers/alerts. To enhance efficiency and reduce traffic caused by metrics passing, we treat all metrics info/data as two kinds. One is immutable information, which doesn't change in a given system boot. The other is mutable data (statistics data), which is pulled/transferred periodically at a high frequency.
The format allows to place the values before the descriptions. So you could use pread to only read the first part of the file descriptor, and the file_operations implementation would then skip the work of building the immutable data. It doesn't have to be implemented from the beginning like that, but the above format supports it.
Good point! I'll be working on the new fd-based interface and come back with new patchset.
Paolo
Thanks, Jing
On 12/03/21 23:27, Jing Zhang wrote:
4 bytes flags (always zero)
Could you give some potential use for this flag?
No idea, honestly. It probably would signal the presence of more fields after "offset of the first stat value". In general it's better to leave some room for extension.
4 bytes number of statistics 4 bytes offset of the first stat description 4 bytes offset of the first stat value stat descriptions: - 4 bytes for the type (for now always zero: uint64_t)
Potential use for this type? Should we move this outside descriptor? Since all stats probably have the same size.
Yes, all stats should be 8 bytes. But for example:
- 0 = uint64_t
- 1 = int64_t
- 0x80000000 | n: enum with n different values, which are stored after the name
- 4 bytes for the flags (for now always zero)
Potential use for this flag?
Looking back at Emanuele's statsfs, it could be:
- bit 0: can be cleared (by writing eight zero bytes in the statistics' offset)
- bit 1: cumulative value (count of events, can only grow) vs. instantaneous value (can go up or down)
This is currently stored in the debugfs mode, so we can already use these flags.
Paolo
Hi Paolo,
On Wed, Mar 10, 2021 at 8:55 AM Paolo Bonzini pbonzini@redhat.com wrote:
On 10/03/21 01:30, Jing Zhang wrote:
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 383df23514b9..87dd62516c8b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3464,6 +3464,51 @@ static long kvm_vcpu_ioctl(struct file *filp, r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); break; }
case KVM_STATS_GET_INFO: {
struct kvm_stats_info stats_info;
r = -EFAULT;
stats_info.num_stats = VCPU_STAT_COUNT;
if (copy_to_user(argp, &stats_info, sizeof(stats_info)))
goto out;
r = 0;
break;
}
case KVM_STATS_GET_NAMES: {
struct kvm_stats_names stats_names;
r = -EFAULT;
if (copy_from_user(&stats_names, argp, sizeof(stats_names)))
goto out;
r = -EINVAL;
if (stats_names.size < VCPU_STAT_COUNT * KVM_STATS_NAME_LEN)
goto out;
r = -EFAULT;
if (copy_to_user(argp + sizeof(stats_names),
kvm_vcpu_stat_strings,
VCPU_STAT_COUNT * KVM_STATS_NAME_LEN))
The only reason to separate the strings in patch 1 is to pass them here. But this is a poor API because it imposes a limit on the length of the statistics, and makes that length part of the binary interface.
I would prefer a completely different interface, where you have a file descriptor that can be created and associated to a vCPU or VM (or even to /dev/kvm). Having a file descriptor is important because the fd can
We are considering how to create the file descriptor. It might be risky to create an extra fd for every vCPU. It will easily hit the fd limit for the process or the system for machines running a ton of small VMs. Looks like creating an extra file descriptor for every VM is a better option. And then we can check per-vCPU stats through an ioctl on this VM fd by passing the vCPU index. What do you think?
be passed to a less-privileged process that takes care of gathering the metrics
The result of reading the file descriptor could be either ASCII or binary. IMO the real cost lies in opening and reading a multitude of files rather than in the ASCII<->binary conversion.
The format could be one of the following:
- binary:
4 bytes flags (always zero) 4 bytes number of statistics 4 bytes offset of the first stat description 4 bytes offset of the first stat value stat descriptions:
- 4 bytes for the type (for now always zero: uint64_t)
- 4 bytes for the flags (for now always zero)
- length of name
- name
statistics in 64-bit format
- text:
stat1_name uint64 123 stat2_name uint64 456 ...
What do you think?
Paolo
On 15/03/21 23:31, Jing Zhang wrote:
We are considering how to create the file descriptor. It might be risky to create an extra fd for every vCPU. It will easily hit the fd limit for the process or the system for machines running a ton of small VMs.
You already have a file descriptor for every vCPU, but I agree that having twice as many is not very good.
Looks like creating an extra file descriptor for every VM is a better option. And then we can check per vCPU stats through Ioctl of this VM fd by passing the vCPU index.
The file descriptor idea is not really infeasible I think (not just because the # of file descriptors is "only" doubled, but also because most of the time I think you'd only care of per-VM stats).
If you really believe it's not usable for you, you can use two ioctls to fill the description and the data respectively (i.e. ioctl(fd, KVM_GET_STATS_{DESCRIPTION,VALUES}, pdata) using the same layout as below. If called with NULL argument, the ioctl returns how much data they will fill in.
The (always zero) global flags can be replaced by the value returned by KVM_CHECK_EXTENSION.
The number of statistics can be obtained by ioctl(fd, KVM_GET_STATS_VALUES, NULL), just divide the returned value by 8.
Paolo
4 bytes flags (always zero) 4 bytes number of statistics 4 bytes offset of the first stat description 4 bytes offset of the first stat value stat descriptions: - 4 bytes for the type (for now always zero: uint64_t) - 4 bytes for the flags (for now always zero) - length of name - name statistics in 64-bit format
On Wed, 10 Mar 2021 00:30:23 +0000, Jing Zhang jingzhangos@google.com wrote:
Three ioctl commands are added to support binary form statistics data retrieval: KVM_STATS_GET_INFO, KVM_STATS_GET_NAMES and KVM_STATS_GET_DATA. KVM_CAP_STATS_BINARY_FORM indicates the capability.
Signed-off-by: Jing Zhang jingzhangos@google.com
virt/kvm/kvm_main.c | 115 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 383df23514b9..87dd62516c8b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3464,6 +3464,51 @@ static long kvm_vcpu_ioctl(struct file *filp, r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); break; }
- case KVM_STATS_GET_INFO: {
struct kvm_stats_info stats_info;
r = -EFAULT;
stats_info.num_stats = VCPU_STAT_COUNT;
if (copy_to_user(argp, &stats_info, sizeof(stats_info)))
goto out;
r = 0;
break;
- }
- case KVM_STATS_GET_NAMES: {
struct kvm_stats_names stats_names;
r = -EFAULT;
if (copy_from_user(&stats_names, argp, sizeof(stats_names)))
goto out;
r = -EINVAL;
if (stats_names.size < VCPU_STAT_COUNT * KVM_STATS_NAME_LEN)
goto out;
r = -EFAULT;
if (copy_to_user(argp + sizeof(stats_names),
kvm_vcpu_stat_strings,
VCPU_STAT_COUNT * KVM_STATS_NAME_LEN))
goto out;
r = 0;
break;
- }
- case KVM_STATS_GET_DATA: {
struct kvm_stats_data stats_data;
r = -EFAULT;
if (copy_from_user(&stats_data, argp, sizeof(stats_data)))
goto out;
r = -EINVAL;
if (stats_data.size < sizeof(vcpu->stat))
goto out;
r = -EFAULT;
argp += sizeof(stats_data);
if (copy_to_user(argp, &vcpu->stat, sizeof(vcpu->stat)))
goto out;
r = 0;
break;
- } default: r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); }
@@ -3695,6 +3740,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_CHECK_EXTENSION_VM: case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_HALT_POLL:
- case KVM_CAP_STATS_BINARY_FORM: return 1;
#ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: @@ -3825,6 +3871,40 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, } } +static long kvm_vm_ioctl_stats_get_data(struct kvm *kvm, unsigned long arg) +{
- void __user *argp = (void __user *)arg;
- struct kvm_vcpu *vcpu;
- struct kvm_stats_data stats_data;
- u64 *data = NULL, *pdata;
- int i, j, ret = 0;
- size_t dsize = (VM_STAT_COUNT + VCPU_STAT_COUNT) * sizeof(*data);
- if (copy_from_user(&stats_data, argp, sizeof(stats_data)))
return -EFAULT;
- if (stats_data.size < dsize)
return -EINVAL;
- data = kzalloc(dsize, GFP_KERNEL_ACCOUNT);
- if (!data)
return -ENOMEM;
- for (i = 0; i < VM_STAT_COUNT; i++)
*(data + i) = *((ulong *)&kvm->stat + i);
This kind of dance could be avoided if your stats were just an array, or a union of the current data structure and an array.
- kvm_for_each_vcpu(j, vcpu, kvm) {
pdata = data + VM_STAT_COUNT;
for (i = 0; i < VCPU_STAT_COUNT; i++, pdata++)
*pdata += *((u64 *)&vcpu->stat + i);
Do you really need the in-kernel copy? Why not directly organise the data structures in a way that would allow a bulk copy using copy_to_user()?
Another thing is the atomicity of what you are reporting. Don't you care about the consistency of the counters?
Thanks,
M.
On 10/03/21 16:51, Marc Zyngier wrote:
- kvm_for_each_vcpu(j, vcpu, kvm) {
pdata = data + VM_STAT_COUNT;
for (i = 0; i < VCPU_STAT_COUNT; i++, pdata++)
*pdata += *((u64 *)&vcpu->stat + i);
Do you really need the in-kernel copy? Why not directly organise the data structures in a way that would allow a bulk copy using copy_to_user()?
The result is built by summing per-vCPU counters, so that the counter updates are fast and do not require a lock. So consistency basically cannot be guaranteed.
Paolo
On Wed, 10 Mar 2021 16:03:42 +0000, Paolo Bonzini pbonzini@redhat.com wrote:
On 10/03/21 16:51, Marc Zyngier wrote:
- kvm_for_each_vcpu(j, vcpu, kvm) {
pdata = data + VM_STAT_COUNT;
for (i = 0; i < VCPU_STAT_COUNT; i++, pdata++)
*pdata += *((u64 *)&vcpu->stat + i);
Do you really need the in-kernel copy? Why not directly organise the data structures in a way that would allow a bulk copy using copy_to_user()?
The result is built by summing per-vCPU counters, so that the counter updates are fast and do not require a lock. So consistency basically cannot be guaranteed.
Sure, but I wonder whether there is scope for VM-global counters to be maintained in parallel with per-vCPU counters if speed/efficiency is of the essence (and this seems to be how it is sold in the cover letter).
Thanks,
M.
On 10/03/21 18:05, Marc Zyngier wrote:
On Wed, 10 Mar 2021 16:03:42 +0000, Paolo Bonzini pbonzini@redhat.com wrote:
On 10/03/21 16:51, Marc Zyngier wrote:
- kvm_for_each_vcpu(j, vcpu, kvm) {
pdata = data + VM_STAT_COUNT;
for (i = 0; i < VCPU_STAT_COUNT; i++, pdata++)
*pdata += *((u64 *)&vcpu->stat + i);
Do you really need the in-kernel copy? Why not directly organise the data structures in a way that would allow a bulk copy using copy_to_user()?
The result is built by summing per-vCPU counters, so that the counter updates are fast and do not require a lock. So consistency basically cannot be guaranteed.
Sure, but I wonder whether there is scope for VM-global counters to be maintained in parallel with per-vCPU counters if speed/efficiency is of the essence (and this seems to be how it is sold in the cover letter).
Maintaining VM-global counters would require an atomic instruction and would suffer lots of cacheline bouncing even on architectures that have relaxed atomic memory operations.
Speed/efficiency of retrieving statistics is important, but let's keep in mind that the baseline for comparison is hundreds of syscalls and filesystem lookups.
Paolo
On Wed, 10 Mar 2021 17:11:47 +0000, Paolo Bonzini pbonzini@redhat.com wrote:
On 10/03/21 18:05, Marc Zyngier wrote:
On Wed, 10 Mar 2021 16:03:42 +0000, Paolo Bonzini pbonzini@redhat.com wrote:
On 10/03/21 16:51, Marc Zyngier wrote:
- kvm_for_each_vcpu(j, vcpu, kvm) {
pdata = data + VM_STAT_COUNT;
for (i = 0; i < VCPU_STAT_COUNT; i++, pdata++)
*pdata += *((u64 *)&vcpu->stat + i);
Do you really need the in-kernel copy? Why not directly organise the data structures in a way that would allow a bulk copy using copy_to_user()?
The result is built by summing per-vCPU counters, so that the counter updates are fast and do not require a lock. So consistency basically cannot be guaranteed.
Sure, but I wonder whether there is scope for VM-global counters to be maintained in parallel with per-vCPU counters if speed/efficiency is of the essence (and this seems to be how it is sold in the cover letter).
Maintaining VM-global counters would require an atomic instruction and would suffer lots of cacheline bouncing even on architectures that have relaxed atomic memory operations.
Which is why we have per-cpu counters already. Making use of them doesn't seem that outlandish.
Speed/efficiency of retrieving statistics is important, but let's keep in mind that the baseline for comparison is hundreds of syscalls and filesystem lookups.
Having that baseline in the cover letter would be a good start, as well as an indication of the frequency this is used at.
M.
On 10/03/21 18:31, Marc Zyngier wrote:
Maintaining VM-global counters would require an atomic instruction and would suffer lots of cacheline bouncing even on architectures that have relaxed atomic memory operations.
Which is why we have per-cpu counters already. Making use of them doesn't seem that outlandish.
But you wouldn't be able to guarantee consistency anyway, would you? You *could* copy N*M counters to userspace, but there's no guarantee that they are consistent, neither within a single vCPU nor within a single counter.
Speed/efficiency of retrieving statistics is important, but let's keep in mind that the baseline for comparison is hundreds of syscalls and filesystem lookups.
Having that baseline in the cover letter would be a good start, as well as an indication of the frequency this is used at.
Can't disagree, especially on the latter which I have no idea about.
Paolo
On Wed, Mar 10, 2021 at 11:44 AM Paolo Bonzini pbonzini@redhat.com wrote:
On 10/03/21 18:31, Marc Zyngier wrote:
Maintaining VM-global counters would require an atomic instruction and would suffer lots of cacheline bouncing even on architectures that have relaxed atomic memory operations.
Which is why we have per-cpu counters already. Making use of them doesn't seem that outlandish.
But you wouldn't be able to guarantee consistency anyway, would you? You *could* copy N*M counters to userspace, but there's no guarantee that they are consistent, neither within a single vCPU nor within a single counter.
Speed/efficiency of retrieving statistics is important, but let's keep in mind that the baseline for comparison is hundreds of syscalls and filesystem lookups.
Having that baseline in the cover letter would be a good start, as well as an indication of the frequency this is used at.
Can't disagree, especially on the latter which I have no idea about.
Paolo
Marc, Paolo, thanks for the comments. I will add some more information in the cover letter.
Thanks, Jing
Check whether the KVM binary form statistics interface works correctly and whether the statistics name strings are synced correctly with the KVM internal stats data.
Signed-off-by: Jing Zhang jingzhangos@google.com --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 3 + .../selftests/kvm/kvm_bin_form_stats.c | 89 +++++++++++++++++++ 3 files changed, 93 insertions(+) create mode 100644 tools/testing/selftests/kvm/kvm_bin_form_stats.c
diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 32b87cc77c8e..0c8241bd9a17 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -38,3 +38,4 @@ /memslot_modification_stress_test /set_memory_region_test /steal_time +/kvm_bin_form_stats diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index a6d61f451f88..5cdd52ccedf2 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -72,6 +72,7 @@ TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test TEST_GEN_PROGS_x86_64 += set_memory_region_test TEST_GEN_PROGS_x86_64 += steal_time +TEST_GEN_PROGS_x86_64 += kvm_bin_form_stats
TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve @@ -81,6 +82,7 @@ TEST_GEN_PROGS_aarch64 += dirty_log_perf_test TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus TEST_GEN_PROGS_aarch64 += set_memory_region_test TEST_GEN_PROGS_aarch64 += steal_time +TEST_GEN_PROGS_aarch64 += kvm_bin_form_stats
TEST_GEN_PROGS_s390x = s390x/memop TEST_GEN_PROGS_s390x += s390x/resets @@ -89,6 +91,7 @@ TEST_GEN_PROGS_s390x += demand_paging_test TEST_GEN_PROGS_s390x += dirty_log_test TEST_GEN_PROGS_s390x += kvm_create_max_vcpus TEST_GEN_PROGS_s390x += set_memory_region_test +TEST_GEN_PROGS_s390x += kvm_bin_form_stats
TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) LIBKVM += $(LIBKVM_$(UNAME_M)) diff --git a/tools/testing/selftests/kvm/kvm_bin_form_stats.c b/tools/testing/selftests/kvm/kvm_bin_form_stats.c new file mode 100644 index 000000000000..36cf206470b1 --- /dev/null +++ b/tools/testing/selftests/kvm/kvm_bin_form_stats.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kvm_bin_form_stats + * + * Copyright (C) 2021, Google LLC. + * + * Test for fd-based IOCTL commands for retrieving KVM statistics data in + * binary form. KVM_CAP_STATS_BINARY_FORM, KVM_STATS_GET_INFO, + * KVM_STATS_GET_NAMES and KVM_STATS_GET_DATA are checked. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "test_util.h" + +#include "kvm_util.h" +#include "asm/kvm.h" +#include "linux/kvm.h" + +int main(int argc, char *argv[]) +{ + struct kvm_stats_info stats_info = {0}; + struct kvm_stats_names *stats_names; + struct kvm_stats_data *stats_data; + struct kvm_vm *kvm; + int i, ret; + + kvm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + + ret = kvm_check_cap(KVM_CAP_STATS_BINARY_FORM); + if (ret < 0) { + pr_info("Binary form statistics interface is not supported!\n"); + goto out_free_kvm; + } + + ret = -1; + vm_ioctl(kvm, KVM_STATS_GET_INFO, &stats_info); + if (stats_info.num_stats == 0) { + pr_info("KVM_STATS_GET_INFO failed!\n"); + pr_info("Or the number of stistics data is zero.\n"); + goto out_free_kvm; + } + + /* Allocate memory for stats name strings and value */ + stats_names = malloc(sizeof(*stats_names) + + stats_info.num_stats * KVM_STATS_NAME_LEN); + if (!stats_names) { + pr_info("Memory allocation failed!\n"); + goto out_free_kvm; + } + + stats_data = malloc(sizeof(*stats_data) + + stats_info.num_stats * sizeof(__u64)); + if (!stats_data) { + pr_info("Memory allocation failed!\n"); + goto out_free_names; + } + + /* Retrieve the name strings and data */ 
+ stats_names->size = stats_info.num_stats * KVM_STATS_NAME_LEN; + vm_ioctl(kvm, KVM_STATS_GET_NAMES, stats_names); + + stats_data->size = stats_info.num_stats * sizeof(__u64); + vm_ioctl(kvm, KVM_STATS_GET_DATA, stats_data); + + /* Display supported statistics names */ + for (i = 0; i < (int)stats_info.num_stats; i++) { + char *name = (char *)stats_names->names + i * KVM_STATS_NAME_LEN; + + if (strnlen(name, KVM_STATS_NAME_LEN) == 0) { + pr_info("Empty stats name at offset %d!\n", i); + goto out_free_data; + } + pr_info("%s\n", name); + } + + ret = 0; +out_free_data: + free(stats_data); +out_free_names: + free(stats_names); +out_free_kvm: + kvm_vm_free(kvm); + return ret; +}
linux-kselftest-mirror@lists.linaro.org