From 1a38584bb794095bc1ac875898ec33ac4ac086a3 Mon Sep 17 00:00:00 2001
From: Adam Sakareassen
Date: Wed, 14 Nov 2012 08:15:47 +1100
Subject: [PATCH] AVX Instruction Set Support

This patch enables operating system support for the AVX instruction set
in the 64-bit version of DragonFly BSD.  AVX adds new floating point
instructions and the 256-bit YMM CPU registers.

Specifically:

1. A new kernel configuration option, CPU_ENABLE_AVX, is added.  It is
   disabled by default; while it is disabled this patch has practically
   no effect.
2. Enabling the option requires building the kernel with gcc 4.7 or
   later.
3. The CPU is checked for XSAVE and AVX support at boot.  If both are
   found, they are enabled.
4. When enabled, the FPU registers, including the new 256-bit YMM
   registers, are saved with the new XSAVE instruction.  (Otherwise the
   existing FXSAVE-based code is used.)

Known issues:

1. The kernel does not ask the CPU for the size of the XSAVE area.  I
   did not attempt this because I did not want to disturb the existing
   FPU code too much.  It only matters if someone later adds support
   for new CPU features that use the XSAVE interface; they will need to
   allocate space for the larger structure manually.  The FreeBSD AVX
   patch does the same thing.
2. Compilation is restricted to gcc 4.7 or later.  (gcc 4.6 probably
   works too, but I haven't checked, as it's not part of our system.)
   The FreeBSD patch embeds raw machine code to support older
   compilers; I thought it more elegant to use readable assembler.
   Besides, if users don't have a new compiler, they can't compile AVX
   support for their own programs.  The kernel still compiles fine with
   gcc 4.4 when the configuration option is disabled.

I suggest making AVX support the default once gcc 4.7 becomes
DragonFly's default compiler.
---
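Notes for reviewers (not part of the commit message):

* Where the 832-byte saveymm structure comes from: the XSAVE area for
  x87 + SSE + YMM state is the 512-byte legacy FXSAVE region, followed
  by the 64-byte XSAVE header, followed by 256 bytes for the upper
  halves of the sixteen YMM registers (512 + 64 + 256 = 832).  The
  64-byte alignment matches the alignment the xsave/xrstor instructions
  require.  This size only covers the x87/SSE/YMM mask set in the
  patch; a CPU reporting additional XSAVE state components (CPUID leaf
  0x0D) would need a larger area, which is known issue 1 above.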
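* Checking from userland that the OS really has AVX enabled: the usual
  sequence is to test the OSXSAVE and AVX bits in CPUID.1:ECX and then
  read XCR0 with XGETBV.  The sketch below is illustrative only and is
  not part of the patch; os_supports_avx() is just a made-up name, the
  hex constants mirror the CPUID2_* defines added by the patch, and the
  xgetbv mnemonic needs a reasonably recent assembler (the same class
  of toolchain that AVX itself needs).

#include <stdio.h>

static int
os_supports_avx(void)
{
        unsigned int eax, ebx, ecx, edx;
        unsigned int xcr0_lo, xcr0_hi;

        /* CPUID leaf 1: feature flags are returned in ECX/EDX. */
        __asm __volatile("cpuid"
            : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
            : "a" (1), "c" (0));

        if ((ecx & 0x08000000) == 0)    /* CPUID2_OSXSAVE: OS uses XSAVE */
                return (0);
        if ((ecx & 0x10000000) == 0)    /* CPUID2_AVX: CPU supports AVX */
                return (0);

        /* XGETBV with ECX = 0 reads XCR0 (the XFEATURE_ENABLED_MASK). */
        __asm __volatile("xgetbv"
            : "=a" (xcr0_lo), "=d" (xcr0_hi)
            : "c" (0));

        /* x87 (bit 0), SSE (bit 1) and YMM (bit 2) state must all be on. */
        return ((xcr0_lo & 0x7) == 0x7);
}

int
main(void)
{
        printf("AVX is %susable\n", os_supports_avx() ? "" : "not ");
        return (0);
}
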
 sys/config/LINT64                   |  4 +++
 sys/config/X86_64_GENERIC           |  1 +
 sys/cpu/x86_64/include/cpufunc.h    |  8 ++++++
 sys/cpu/x86_64/include/npx.h        | 12 +++++++++
 sys/cpu/x86_64/include/specialreg.h | 51 ++++++++++++++++++++++---------------
 sys/platform/pc64/conf/options      |  1 +
 sys/platform/pc64/include/md_var.h  |  1 +
 sys/platform/pc64/x86_64/initcpu.c  | 14 ++++++++++
 sys/platform/pc64/x86_64/npx.c      | 19 +++++++++++++-
 10 files changed, 95 insertions(+), 22 deletions(-)

diff --git a/sys/config/LINT64 b/sys/config/LINT64
index aa1975b..79d0bca 100644
--- a/sys/config/LINT64
+++ b/sys/config/LINT64
@@ -117,8 +117,12 @@ cpu             HAMMER_CPU
 # CPU_ENABLE_EST enables support for Enhanced SpeedStep technology
 # found in Pentium(tm) M processors.
 #
+# CPU_ENABLE_AVX enables AVX instruction set.
+# This option requires gcc version 4.7 or later.
+#
 #options        CPU_DISABLE_SSE
 options         CPU_ENABLE_EST
+options         CPU_ENABLE_AVX          #Requires gcc 4.7 or later.

 #####################################################################
 # COMPATIBILITY OPTIONS

diff --git a/sys/config/X86_64_GENERIC b/sys/config/X86_64_GENERIC
index f132db1..5ec58ac 100644
--- a/sys/config/X86_64_GENERIC
+++ b/sys/config/X86_64_GENERIC
@@ -10,6 +10,7 @@ machine_arch    x86_64
 cpu             HAMMER_CPU
 ident           X86_64_GENERIC
 maxusers        0
+#options        CPU_ENABLE_AVX          #Support AVX Instructions (GCC4.7 Req)

 makeoptions     DEBUG=-g                #Build kernel with gdb(1) debug symbols

diff --git a/sys/cpu/x86_64/include/cpufunc.h b/sys/cpu/x86_64/include/cpufunc.h
index 7a8fbcf..33e789c 100644
--- a/sys/cpu/x86_64/include/cpufunc.h
+++ b/sys/cpu/x86_64/include/cpufunc.h
@@ -579,6 +579,14 @@ wrmsr(u_int msr, u_int64_t newval)
 }

 static __inline void
+xsetbv(u_int ecx, u_int eax, u_int edx)
+{
+       __asm __volatile("xsetbv"
+           :
+           : "a" (eax), "c" (ecx), "d" (edx));
+}
+
+static __inline void
 load_cr0(u_long data)
 {

diff --git a/sys/cpu/x86_64/include/npx.h b/sys/cpu/x86_64/include/npx.h
index 2c36be7..cb67ee7 100644
--- a/sys/cpu/x86_64/include/npx.h
+++ b/sys/cpu/x86_64/include/npx.h
@@ -46,6 +46,8 @@
 #ifndef _CPU_NPX_H_
 #define _CPU_NPX_H_

+#include "opt_cpu.h"
+
 #ifndef _SYS_TYPES_H_
 #include <sys/types.h>
 #endif
@@ -124,9 +126,19 @@ struct savexmm {
        u_char  sv_pad[220];
 } __attribute__((aligned(16)));

+#ifdef CPU_ENABLE_AVX
+struct saveymm {
+       u_char  xsavedata[832];
+} __attribute__((aligned(64)));
+
+#endif
+
 union savefpu {
        struct  save87  sv_87;
        struct  savexmm sv_xmm;
+#ifdef CPU_ENABLE_AVX
+       struct  saveymm sv_ymm;
+#endif
 };

 /*
diff --git a/sys/cpu/x86_64/include/specialreg.h b/sys/cpu/x86_64/include/specialreg.h
index b039cc9..f5660d8 100644
--- a/sys/cpu/x86_64/include/specialreg.h
+++ b/sys/cpu/x86_64/include/specialreg.h
@@ -41,40 +41,40 @@
 #define CR0_MP  0x00000002      /* "Math" (fpu) Present */
 #define CR0_EM  0x00000004      /* EMulate FPU instructions. (trap ESC only) */
 #define CR0_TS  0x00000008      /* Task Switched (if MP, trap ESC and WAIT) */
-#define CR0_PG  0x80000000      /* PaGing enable */
+#define CR0_PG  0x80000000      /* Paging enable */

 /*
  * Bits in 486 special registers:
  */
 #define CR0_NE  0x00000020      /* Numeric Error enable (EX16 vs IRQ13) */
-#define CR0_WP  0x00010000      /* Write Protect (honor page protect in
-                                   all modes) */
+#define CR0_WP  0x00010000      /* Write Protect (honor page protect in all modes) */
 #define CR0_AM  0x00040000      /* Alignment Mask (set to enable AC flag) */
 #define CR0_NW  0x20000000      /* Not Write-through */
 #define CR0_CD  0x40000000      /* Cache Disable */

 /*
- * Bits in PPro special registers
+ * Bits in CR4 special register
  */
-#define CR4_VME 0x00000001      /* Virtual 8086 mode extensions */
-#define CR4_PVI 0x00000002      /* Protected-mode virtual interrupts */
-#define CR4_TSD 0x00000004      /* Time stamp disable */
-#define CR4_DE  0x00000008      /* Debugging extensions */
-#define CR4_PSE 0x00000010      /* Page size extensions */
-#define CR4_PAE 0x00000020      /* Physical address extension */
-#define CR4_MCE 0x00000040      /* Machine check enable */
-#define CR4_PGE 0x00000080      /* Page global enable */
-#define CR4_PCE 0x00000100      /* Performance monitoring counter enable */
-#define CR4_FXSR 0x00000200     /* Fast FPU save/restore used by OS */
-#define CR4_XMM 0x00000400      /* enable SIMD/MMX2 to use except 16 */
+#define CR4_VME         0x00000001      /* Virtual 8086 mode extensions */
+#define CR4_PVI         0x00000002      /* Protected-mode virtual interrupts */
+#define CR4_TSD         0x00000004      /* Time stamp disable */
+#define CR4_DE          0x00000008      /* Debugging extensions */
+#define CR4_PSE         0x00000010      /* Page size extensions */
+#define CR4_PAE         0x00000020      /* Physical address extension */
+#define CR4_MCE         0x00000040      /* Machine check enable */
+#define CR4_PGE         0x00000080      /* Page global enable */
+#define CR4_PCE         0x00000100      /* Performance monitoring counter enable */
+#define CR4_FXSR        0x00000200      /* Fast FPU save/restore used by OS */
+#define CR4_XMM         0x00000400      /* Enable SIMD/MMX2 to use except 16 */
+#define CR4_XSAVE       0x00040000      /* Enable XSave (for AVX Instructions)*/

 /*
  * Bits in x86_64 special registers.  EFER is 64 bits wide.
  */
-#define EFER_SCE 0x000000001    /* System Call Extensions (R/W) */
-#define EFER_LME 0x000000100    /* Long mode enable (R/W) */
-#define EFER_LMA 0x000000400    /* Long mode active (R) */
-#define EFER_NXE 0x000000800    /* PTE No-Execute bit enable (R/W) */
+#define EFER_SCE        0x000000001     /* System Call Extensions (R/W) */
+#define EFER_LME        0x000000100     /* Long mode enable (R/W) */
+#define EFER_LMA        0x000000400     /* Long mode active (R) */
+#define EFER_NXE        0x000000800     /* PTE No-Execute bit enable (R/W) */

 /*
  * CPUID instruction features register
@@ -132,10 +132,19 @@
 #define CPUID2_SSE42    0x00100000
 #define CPUID2_X2APIC   0x00200000
 #define CPUID2_POPCNT   0x00800000
-#define CPUID2_AESNI    0x02000000
-#define CPUID2_RDRAND   0x40000000
+#define CPUID2_AESNI    0x02000000      /* AES Instruction Set */
+#define CPUID2_XSAVE    0x04000000      /* XSave supported by CPU */
+#define CPUID2_OSXSAVE  0x08000000      /* XSave and AVX supported by OS */
+#define CPUID2_AVX      0x10000000      /* AVX instruction set support */
+#define CPUID2_F16C     0x20000000      /* CVT16 instruction set support */
+#define CPUID2_RDRAND   0x40000000      /* RdRand. On chip random numbers */
 #define CPUID2_VMM      0x80000000      /* AMD 25481 2.34 page 11 */

+/*Bits related to the XFEATURE_ENABLED_MASK control register*/
+#define CPU_XFEATURE_X87        0x00000001
+#define CPU_XFEATURE_SSE        0x00000002
+#define CPU_XFEATURE_YMM        0x00000004
+
 /*
  * Important bits in the AMD extended cpuid flags
  */
diff --git a/sys/platform/pc64/conf/options b/sys/platform/pc64/conf/options
index b149a4e..bdee1df 100644
--- a/sys/platform/pc64/conf/options
+++ b/sys/platform/pc64/conf/options
@@ -18,6 +18,7 @@ NDISAPI         opt_dontuse.h

 # x86_64 SMP options
 CPU_ENABLE_EST          opt_cpu.h
+CPU_ENABLE_AVX          opt_cpu.h

 # The cpu type
 #
diff --git a/sys/platform/pc64/include/md_var.h b/sys/platform/pc64/include/md_var.h
index f929264..7dd5da9 100644
--- a/sys/platform/pc64/include/md_var.h
+++ b/sys/platform/pc64/include/md_var.h
@@ -51,6 +51,7 @@ extern  u_int   amd_feature;
 extern  u_int   amd_feature2;
 extern  u_int   cpu_clflush_line_size;
 extern  u_int   cpu_fxsr;
+extern  u_int   cpu_xsave;
 extern  u_int   cpu_high;
 extern  u_int   cpu_id;
 extern  u_int   cpu_procinfo;
diff --git a/sys/platform/pc64/x86_64/initcpu.c b/sys/platform/pc64/x86_64/initcpu.c
index 7a49bdd..1e9900b 100644
--- a/sys/platform/pc64/x86_64/initcpu.c
+++ b/sys/platform/pc64/x86_64/initcpu.c
@@ -62,6 +62,7 @@ u_int  cpu_procinfo2;          /* Multicore info */
 char   cpu_vendor[20];         /* CPU Origin code */
 u_int  cpu_vendor_id;          /* CPU vendor ID */
 u_int  cpu_fxsr;               /* SSE enabled */
+u_int  cpu_xsave;              /* AVX enabled by OS*/
 u_int  cpu_mxcsr_mask;         /* Valid bits in mxcsr */
 u_int  cpu_clflush_line_size = 32;     /* Default CLFLUSH line size */

@@ -152,11 +153,24 @@ initializecpu(void)
 {
        uint64_t msr;

+       /*Check for FXSR and SSE support and enable if available.*/
        if ((cpu_feature & CPUID_XMM) && (cpu_feature & CPUID_FXSR)) {
                load_cr4(rcr4() | CR4_FXSR | CR4_XMM);
                cpu_fxsr = hw_instruction_sse = 1;
        }

+#if defined(CPU_ENABLE_AVX)
+       /*Check for XSAVE and AVX support and enable if available.*/
+       if ((cpu_feature2 & CPUID2_AVX) && (cpu_feature2 & CPUID2_XSAVE)
+           && (cpu_feature & CPUID_SSE)){
+               load_cr4(rcr4() | CR4_XSAVE);
+
+               /* Adjust size of savefpu in npx.h before adding to mask.*/
+               xsetbv(0,CPU_XFEATURE_X87 | CPU_XFEATURE_SSE | CPU_XFEATURE_YMM,0);
+               cpu_xsave = 1;
+       }
+#endif
+
        if (cpu_vendor_id == CPU_VENDOR_AMD) {
                switch((cpu_id & 0xFF0000)) {
                case 0x100000:
diff --git a/sys/platform/pc64/x86_64/npx.c b/sys/platform/pc64/x86_64/npx.c
index 20e4baf..fc72d66 100644
--- a/sys/platform/pc64/x86_64/npx.c
+++ b/sys/platform/pc64/x86_64/npx.c
@@ -37,6 +37,7 @@
  */

 #include "opt_debug_npx.h"
+#include "opt_cpu.h"

 #include <sys/param.h>
 #include <sys/systm.h>
@@ -76,6 +77,10 @@
 #define fxrstor(addr)          __asm("fxrstor %0" : : "m" (*(addr)))
 #define fxsave(addr)           __asm __volatile("fxsave %0" : "=m" (*(addr)))
 #endif
+#ifdef CPU_ENABLE_AVX
+#define xsave(eax,edx,addr)    __asm __volatile("xsave %0" : "=m" (*(addr)) : "a" (eax),"d" (edx) )
+#define xrstor(eax,edx,addr)   __asm __volatile("xrstor %0" : : "m" (*(addr)), "a" (eax), "d" (edx))
+#endif
 #define start_emulating()      __asm("smsw %%ax; orb %0,%%al; lmsw %%ax" \
                                      : : "n" (CR0_TS) : "ax")
 #define stop_emulating()       __asm("clts")
@@ -96,7 +101,8 @@ static void fpurstor (union savefpu *);
 void
 npxinit(u_short control)
 {
-       static union savefpu dummy __aligned(16);
+       /*64-Byte alignment required for xsave*/
+       static union savefpu dummy __aligned(64);

        /*
         * fninit has the same h/w bugs as fnsave.  Use the detoxified
@@ -405,6 +411,12 @@ npxsave(union savefpu *addr)
 static void
 fpusave(union savefpu *addr)
 {
+#ifdef CPU_ENABLE_AVX
+       if (cpu_xsave)
+
+               xsave(CPU_XFEATURE_X87 | CPU_XFEATURE_SSE | CPU_XFEATURE_YMM ,0,addr);
+       else
+#endif
 #ifndef CPU_DISABLE_SSE
        if (cpu_fxsr)
                fxsave(addr);
@@ -545,6 +557,11 @@ fpu_clean_state(void)
 static void
 fpurstor(union savefpu *addr)
 {
+#ifdef CPU_ENABLE_AVX
+       if (cpu_xsave){
+               xrstor(CPU_XFEATURE_X87 | CPU_XFEATURE_SSE | CPU_XFEATURE_YMM ,0,addr);
+       }else
+#endif
 #ifndef CPU_DISABLE_SSE
        if (cpu_fxsr) {
                fpu_clean_state();
-- 
1.7.12
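
P.S. A trivial way to exercise the new save/restore path once the
option is enabled: keep YMM state live long enough that the scheduler
preempts the process many times, then check that the arithmetic still
comes out right.  This is only a sketch of how one might test it, not
something shipped with the patch; the file name and "cc -mavx
avxtest.c" build line are just examples, and any AVX-capable compiler
(gcc 4.7 or later) will do.

#include <immintrin.h>
#include <stdio.h>

int
main(void)
{
        /* Element order is (highest..lowest): a = {1,2,3,4}, b = {10,20,30,40}. */
        __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
        __m256d b = _mm256_set_pd(10.0, 20.0, 30.0, 40.0);
        double out[4];
        int i;

        /*
         * Run long enough that the scheduler preempts us repeatedly
         * while the YMM registers hold live values; if the upper
         * halves are not saved and restored correctly the sums can
         * come out wrong.
         */
        for (i = 0; i < 100000000; i++)
                a = _mm256_add_pd(a, b);

        _mm256_storeu_pd(out, a);
        printf("%.1f %.1f %.1f %.1f\n", out[0], out[1], out[2], out[3]);
        return (0);
}

Running a few copies in parallel makes any corruption show up more
quickly, since each process keeps different values in the same
registers.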