Why is it necessary to use edi constraint in this inline assembly?

centos 6.5 64bit vps, 500MB ram gcc 4.8.2

I have the following function that works only if I use edi as the constraint to hold the string pointer. If I try to use any other register or constraintg or q etc, it segfaults.

BUT this problem only occurs when both link time optimization and o3 are used together. If o2 it’s fine. If I don’t use -flto, it’s fine. But both together then the only register I can use that doesn’t crash is edi

gcc -flto
CFLAGS=-I. -flto -std=gnu11 -msse4.2 -fno-builtin-printf -Wall -Winline -Wstrict-aliasing -g -pg -O3 -lrt -lpthread

It seems like there might be some sort of register clobbering going on or something else. I’m really at a loss to understand why and how to fix this. Another interesting aspect is the generated assembly puts rdi into rdx before using the pointer but if I try to use either register as the input constraint… it segfaults! If it fails under aggressive compiling options it suggests to me either the compiler is stuffing up somehow, or more likely I’m doing something wrong.

char *sse4_strCRLF(char *str)
  __m128i M = _mm_set1_epi8(13);
  char *res;
  __asm__ __volatile__(
     "xor %0,%0nt"
     "sub $1, %1nt"
"1:" "sub $15,%1nt"
    ".align 16nt"
"2:" "add $16, %1nt"
     "pcmpistri  $0x08,(%1),%2nt"
     "ja 2bnt"
     "jnc 2fnt"

     "cmpb $10,1(%1,%%rcx)nt"
     "jne 1bnt"
     "add %%rcx,%1nt"
     "mov %1,%0nt"
     :"edi"(str),"x"(M)  //<-- if use anything except edi, it segfaults
return (char*) res;

Disassembled output:

00000000000002e0 <sse4_strCRLF>:
2e0:   55                      push   rbp
2e1:   48 89 e5                mov    rbp,rsp
2e4:   e8 00 00 00 00          call   2e9 <sse4_strCRLF+0x9>
2e9:   66 0f 6f 05 00 00 00 00 movdqa xmm0,[rip+0x0]          # 2f1 <sse4_strCRLF+0x11>
2f1:   48 89 fa                mov    rdx,rdi   //<--- puts rdi into rdx!
2f4:   48 31 c0                xor    rax,rax
2f7:   48 83 ea 01             sub    rdx,0x1
2fb:   48 83 ea 0f             sub    rdx,0xf
2ff:   90                      nop
300:   48 83 c2 10             add    rdx,0x10
304:   66 0f 3a 63 02 08       pcmpistri xmm0,[rdx],0x8
30a:   77 f4                   ja     300 <sse4_strCRLF+0x20>
30c:   73 0d                   jae    31b <sse4_strCRLF+0x3b>
30e:   80 7c 0a 01 0a          cmp    byte[rdx+rcx*1+0x1],0xa
313:   75 e6                   jne    2fb <sse4_strCRLF+0x1b>
315:   48 01 ca                add    rdx,rcx
318:   48 89 d0                mov    rax,rdx
31b:   5d                      pop    rbp
31c:   c3                      ret

Source: gcc

Leave a Reply