From: "Rémi Bernon" Subject: Re: [PATCH] msvcrt: Avoid disallowed unaligned writes in memset on ARM Message-Id: <298e7603-e625-e218-b101-d35f62a2e2cf@codeweavers.com> Date: Wed, 15 Sep 2021 23:55:21 +0200 In-Reply-To: <20210915202745.3661089-1-martin@martin.st> References: <20210915202745.3661089-1-martin@martin.st> On 9/15/21 10:27 PM, Martin Storsjo wrote: > From: Martin Storsjö > > This fixes a regression in memset on ARM since > 7b17d7081512db52ef852705445762ac4016c29f. > > ARM can do 64 bit writes with the STRD instruction, but that > instruction requires a 32 bit aligned address - while these stores > are unaligned. > > Two consecutive stores to uint32_t* pointers can also be fused > into one single STRD, as a uint32_t* is supposed to be properly > aligned - therefore, do these stores as stores to volatile uint32_t* > to avoid fusing them. > > Signed-off-by: Martin Storsjö > --- > dlls/msvcrt/string.c | 29 +++++++++++++++++++++++++++++ > 1 file changed, 29 insertions(+) > > diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c > index f2b1b4a5b11..bf491a91f40 100644 > --- a/dlls/msvcrt/string.c > +++ b/dlls/msvcrt/string.c > @@ -2878,15 +2878,37 @@ void *__cdecl memset(void *dst, int c, size_t n) > > if (n >= 16) > { > +#ifdef __arm__ > + *(volatile uint32_t *)(d + 0) = v; > + *(volatile uint32_t *)(d + 4) = v; > + *(volatile uint32_t *)(d + 8) = v; > + *(volatile uint32_t *)(d + 12) = v; > + *(volatile uint32_t *)(d + n - 16) = v; > + *(volatile uint32_t *)(d + n - 12) = v; > + *(volatile uint32_t *)(d + n - 8) = v; > + *(volatile uint32_t *)(d + n - 4) = v; > +#else > *(uint64_t *)(d + 0) = v; > *(uint64_t *)(d + 8) = v; > *(uint64_t *)(d + n - 16) = v; > *(uint64_t *)(d + n - 8) = v; > +#endif > if (n <= 32) return dst; > +#ifdef __arm__ > + *(volatile uint32_t *)(d + 16) = v; > + *(volatile uint32_t *)(d + 20) = v; > + *(volatile uint32_t *)(d + 24) = v; > + *(volatile uint32_t *)(d + 28) = v; > + *(volatile uint32_t *)(d + n - 32) = v; > + *(volatile uint32_t *)(d + n - 28) = v; > + *(volatile uint32_t *)(d + n - 24) = v; > + *(volatile uint32_t *)(d + n - 20) = v; > +#else > *(uint64_t *)(d + 16) = v; > *(uint64_t *)(d + 24) = v; > *(uint64_t *)(d + n - 32) = v; > *(uint64_t *)(d + n - 24) = v; > +#endif > if (n <= 64) return dst; > > n = (n - a) & ~0x1f; > @@ -2895,8 +2917,15 @@ void *__cdecl memset(void *dst, int c, size_t n) > } > if (n >= 8) > { > +#ifdef __arm__ > + *(volatile uint32_t *)d = v; > + *(volatile uint32_t *)(d + 4) = v; > + *(volatile uint32_t *)(d + n - 4) = v; > + *(volatile uint32_t *)(d + n - 8) = v; > +#else > *(uint64_t *)d = v; > *(uint64_t *)(d + n - 8) = v; > +#endif > return dst; > } > if (n >= 4) > I'm confused that it causes trouble when I thought it could benefit more than just Intel architectures... Maybe it could be made not too ugly with some macro to wrap the 64bit stores, defined differently for arm? -- Rémi Bernon