From: "Rémi Bernon" Subject: [PATCH v4 2/3] msvcrt: Check for ERMS support and use rep stosb for large memset calls. Message-Id: <20210914142816.1226702-2-rbernon@codeweavers.com> Date: Tue, 14 Sep 2021 16:28:15 +0200 In-Reply-To: <20210914142816.1226702-1-rbernon@codeweavers.com> References: <20210914142816.1226702-1-rbernon@codeweavers.com> Signed-off-by: Rémi Bernon --- dlls/msvcrt/math.c | 13 +++++++++ dlls/msvcrt/msvcrt.h | 1 + dlls/msvcrt/string.c | 64 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+) diff --git a/dlls/msvcrt/math.c b/dlls/msvcrt/math.c index 7f59a4d20d4..9974e72d78f 100644 --- a/dlls/msvcrt/math.c +++ b/dlls/msvcrt/math.c @@ -43,6 +43,7 @@ #include #include #include +#include #include "msvcrt.h" #include "winternl.h" @@ -64,11 +65,23 @@ typedef int (CDECL *MSVCRT_matherr_func)(struct _exception *); static MSVCRT_matherr_func MSVCRT_default_matherr_func = NULL; +BOOL erms_supported; BOOL sse2_supported; static BOOL sse2_enabled; void msvcrt_init_math( void *module ) { +#if defined(__i386__) || defined(__x86_64__) + int regs[4]; + + __cpuid(regs, 0); + if (regs[0] >= 7) + { + __cpuidex(regs, 7, 0); + erms_supported = ((regs[1] >> 9) & 1); + } +#endif + sse2_supported = IsProcessorFeaturePresent( PF_XMMI64_INSTRUCTIONS_AVAILABLE ); #if _MSVCR_VER <=71 sse2_enabled = FALSE; diff --git a/dlls/msvcrt/msvcrt.h b/dlls/msvcrt/msvcrt.h index 60f8c2f5ef2..022eced35d9 100644 --- a/dlls/msvcrt/msvcrt.h +++ b/dlls/msvcrt/msvcrt.h @@ -33,6 +33,7 @@ #undef strncpy #undef wcsncpy +extern BOOL erms_supported DECLSPEC_HIDDEN; extern BOOL sse2_supported DECLSPEC_HIDDEN; #define DBL80_MAX_10_EXP 4932 diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c index f2b1b4a5b11..32291f06001 100644 --- a/dlls/msvcrt/string.c +++ b/dlls/msvcrt/string.c @@ -2732,6 +2732,13 @@ __ASM_GLOBAL_FUNC( sse2_memmove, MEMMOVE_CLEANUP "ret" ) +#undef MEMMOVE_INIT +#undef MEMMOVE_CLEANUP +#undef DEST_REG +#undef SRC_REG +#undef LEN_REG +#undef TMP_REG + #endif /********************************************************************* @@ -2855,6 +2862,56 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n) return memmove(dst, src, n); } +#if defined(__i386__) || defined(__x86_64__) + +#ifdef __i386__ +#define DEST_REG "%edi" +#define LEN_REG "%ecx" +#define VAL_REG "%eax" + +#define MEMSET_INIT \ + "movl " DEST_REG ", %edx\n\t" \ + "movl 4(%esp), " DEST_REG "\n\t" \ + "movl 8(%esp), " VAL_REG "\n\t" \ + "movl 12(%esp), " LEN_REG "\n\t" + +#define MEMSET_RET \ + "movl %edx, " DEST_REG "\n\t" \ + "ret" + +#else + +#define DEST_REG "%rdi" +#define LEN_REG "%rcx" +#define VAL_REG "%eax" + +#define MEMSET_INIT \ + "movq " DEST_REG ", %r9\n\t" \ + "movq %rcx, " DEST_REG "\n\t" \ + "movl %edx, " VAL_REG "\n\t" \ + "movq %r8, " LEN_REG "\n\t" + +#define MEMSET_RET \ + "movq %r9, " DEST_REG "\n\t" \ + "ret" + +#endif + +void __cdecl erms_memset_aligned_32(unsigned char *d, unsigned int c, size_t n); +__ASM_GLOBAL_FUNC( erms_memset_aligned_32, + MEMSET_INIT + "rep\n\t" + "stosb\n\t" + MEMSET_RET ) + +#undef MEMSET_INIT +#undef MEMSET_RET +#undef DEST_REG +#undef LEN_REG +#undef VAL_REG + +#endif + static inline void memset_aligned_32(unsigned char *d, uint64_t v, size_t n) { while (n >= 32) @@ -2890,6 +2947,13 @@ void *__cdecl memset(void *dst, int c, size_t n) if (n <= 64) return dst; n = (n - a) & ~0x1f; +#if defined(__i386__) || defined(__x86_64__) + if (n >= 2048 && erms_supported) + { + erms_memset_aligned_32(d + a, v, n); + return dst; + } +#endif memset_aligned_32(d + a, v, 
n); return dst; } -- 2.33.0