[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

Previous Topic Next Topic
 
classic Classic list List threaded Threaded
9 messages Options
Reply | Threaded
Open this post in threaded view
|

[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

Alberto Barbaro via llvm-dev
Hi @ll,

while clang/LLVM recognizes common bit-twiddling idioms/expressions
like

unsigned int rotate(unsigned int x, unsigned int n)
{
    return (x << n) | (x >> (32 - n));
}

and typically generates "rotate" machine instructions for this
expression, it fails to recognize other also common bit-twiddling
idioms/expressions.

The standard IEEE CRC-32 for "big endian" alias "network" byte order
(see <https://tools.ietf.org/html/rfc1952#section-8> for example):

unsigned int crc32be(unsigned char const *octets, unsigned int count)
{
    unsigned int crc = 0L;
    unsigned int i;

    while (count--) {
        crc ^= *octets++ << 24;
        for (i = 8; i > 0; i--)
            if (crc & 0x80000000L)             // the code generated
                crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from
            else                               // these 4 lines is
                crc <<= 1;                     // rather poor!
    }
    return crc;
}

The same function for "little endian" byte order, using the "inverse"
or "mirrored" polynomial:

unsigned int crc32le(unsigned char const *octets, unsigned int count)
{
    unsigned int crc = ~0L;
    unsigned int i;

    while (count--) {
        crc ^= *octets++;
        for (i = 8; i > 0; i--)
            if (crc & 1L)                      // the code generated
                crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from
            else                               // these 4 lines is
                crc >>= 1;                     // rather poor!
    }
    return ~crc;
}

See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm> (-O2)

crc32be: # @crc32be
        xor    eax, eax
        test   esi, esi
        jne    .LBB0_2
        jmp    .LBB0_5
.LBB0_4: # in Loop: Header=BB0_2 Depth=1
        add    rdi, 1
        test   esi, esi
        je    .LBB0_5
.LBB0_2: # =>This Loop Header: Depth=1
        add    esi, -1
        movzx  edx, byte ptr [rdi]
        shl    edx, 24
        xor    edx, eax
        mov    ecx, -8
        mov    eax, edx
.LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead of 6, r8 not clobbered!
        lea    r8d, [rax + rax]        |     add   eax, eax
        mov    edx, r8d                | # CF is set from the MSB of EAX
        xor    edx, -306674912         |     sbb   edx, edx
        test   eax, eax                | # EDX is 0xFFFFFFFF if CF set, else 0
        mov    eax, edx                |     and   edx, -306674912
        cmovns eax, r8d                |     xor   eax, edx
        add    ecx, 1
        jne    .LBB0_3
        jmp    .LBB0_4
.LBB0_5:
        ret
crc32le: # @crc32le
        test   esi, esi
        je    .LBB1_1
        mov    eax, -1
.LBB1_4: # =>This Loop Header: Depth=1
        add    esi, -1
        movzx  ecx, byte ptr [rdi]
        xor    eax, ecx
        mov    r8d, -8
.LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead of 7, and
        mov    edx, eax                | #  neither r8 nor rcx clobbered!
        shr    edx                     |     shr   eax, 1
        mov    ecx, edx                | # CF is set from the LSB of EAX
        xor    ecx, 79764919           |     sbb   edx, edx
        test   al, 1                   | # EDX is 0xFFFFFFFF if CF set, else 0
        mov    eax, ecx                |     and   edx, 79764919
        cmove  eax, edx                |     xor   eax, edx
        add    r8d, 1
        jne    .LBB1_5
        add    rdi, 1
        test   esi, esi
        jne    .LBB1_4
        not    eax
        ret
.LBB1_1:
        xor    eax, eax
        ret

JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal
      code sequence with 6 and 7 instructions; this accounts for a total
      of 16 and 24 superfluous instructions respectively.
_______________________________________________
LLVM Developers mailing list
[hidden email]
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
Reply | Threaded
Open this post in threaded view
|

Re: [llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

Alberto Barbaro via llvm-dev
IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like this:
unsigned int foo(unsigned int crc) {
    if (crc & 0x80000000)
      crc <<= 1, crc ^= 0xEDB88320;
    else
      crc <<= 1;
    return crc;
}

Which is this in LLVM IR:
define i32 @foo(i32 %x) {
  %t2 = icmp slt i32 %x, 0
  %t3 = shl i32 %x, 1
  %t4 = xor i32 %t3, -306674912
  %t5 = select i1 %t2, i32 %t4, i32 %t3
  ret i32 %t5
}

Please file a bug report for the x86 backend (including performance numbers if you have that data).

On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <[hidden email]> wrote:
Hi @ll,

while clang/LLVM recognizes common bit-twiddling idioms/expressions
like

unsigned int rotate(unsigned int x, unsigned int n)
{
    return (x << n) | (x >> (32 - n));
}

and typically generates "rotate" machine instructions for this
expression, it fails to recognize other also common bit-twiddling
idioms/expressions.

The standard IEEE CRC-32 for "big endian" alias "network" byte order
(see <https://tools.ietf.org/html/rfc1952#section-8> for example):

unsigned int crc32be(unsigned char const *octets, unsigned int count)
{
    unsigned int crc = 0L;
    unsigned int i;

    while (count--) {
        crc ^= *octets++ << 24;
        for (i = 8; i > 0; i--)
            if (crc & 0x80000000L)             // the code generated
                crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from
            else                               // these 4 lines is
                crc <<= 1;                     // rather poor!
    }
    return crc;
}

The same function for "little endian" byte order, using the "inverse"
or "mirrored" polynomial:

unsigned int crc32le(unsigned char const *octets, unsigned int count)
{
    unsigned int crc = ~0L;
    unsigned int i;

    while (count--) {
        crc ^= *octets++;
        for (i = 8; i > 0; i--)
            if (crc & 1L)                      // the code generated
                crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from
            else                               // these 4 lines is
                crc >>= 1;                     // rather poor!
    }
    return ~crc;
}

See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm> (-O2)

crc32be: # @crc32be
        xor    eax, eax
        test   esi, esi
        jne    .LBB0_2
        jmp    .LBB0_5
.LBB0_4: # in Loop: Header=BB0_2 Depth=1
        add    rdi, 1
        test   esi, esi
        je    .LBB0_5
.LBB0_2: # =>This Loop Header: Depth=1
        add    esi, -1
        movzx  edx, byte ptr [rdi]
        shl    edx, 24
        xor    edx, eax
        mov    ecx, -8
        mov    eax, edx
.LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead of 6, r8 not clobbered!
        lea    r8d, [rax + rax]        |     add   eax, eax
        mov    edx, r8d                | # CF is set from the MSB of EAX
        xor    edx, -306674912         |     sbb   edx, edx
        test   eax, eax                | # EDX is 0xFFFFFFFF if CF set, else 0
        mov    eax, edx                |     and   edx, -306674912
        cmovns eax, r8d                |     xor   eax, edx
        add    ecx, 1
        jne    .LBB0_3
        jmp    .LBB0_4
.LBB0_5:
        ret
crc32le: # @crc32le
        test   esi, esi
        je    .LBB1_1
        mov    eax, -1
.LBB1_4: # =>This Loop Header: Depth=1
        add    esi, -1
        movzx  ecx, byte ptr [rdi]
        xor    eax, ecx
        mov    r8d, -8
.LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead of 7, and
        mov    edx, eax                | #  neither r8 nor rcx clobbered!
        shr    edx                     |     shr   eax, 1
        mov    ecx, edx                | # CF is set from the LSB of EAX
        xor    ecx, 79764919           |     sbb   edx, edx
        test   al, 1                   | # EDX is 0xFFFFFFFF if CF set, else 0
        mov    eax, ecx                |     and   edx, 79764919
        cmove  eax, edx                |     xor   eax, edx
        add    r8d, 1
        jne    .LBB1_5
        add    rdi, 1
        test   esi, esi
        jne    .LBB1_4
        not    eax
        ret
.LBB1_1:
        xor    eax, eax
        ret

JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal
      code sequence with 6 and 7 instructions; this accounts for a total
      of 16 and 24 superfluous instructions respectively.
_______________________________________________
LLVM Developers mailing list
[hidden email]
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev

_______________________________________________
LLVM Developers mailing list
[hidden email]
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
Reply | Threaded
Open this post in threaded view
|

Re: [llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

Alberto Barbaro via llvm-dev
"Sanjay Patel" <[hidden email]> wrote:

> IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like
> this:
> unsigned int foo(unsigned int crc) {
>    if (crc & 0x80000000)
>      crc <<= 1, crc ^= 0xEDB88320;
>    else
>      crc <<= 1;
>    return crc;
> }

Generalize this a little bit: the optimizer "knows" that
(crc & 0x80000000) is equivalent to testing the sign-bit,
which sets SF on x86.

On x86, both (crc <<= 1) and (crc += crc) shift the sign-bit
into CF, so there is no need for an explicit "test crc, crc"
in the above case: testing SF before the shift is equivalent
to testing CF after the shift.
I expect that the optimizer "knows" about this equivalence.
If it doesn't take "sbb masking" into account, the above code
might also be translated to

    lea    edx, [eax+eax]
    xor    edx, 0EDB88320h
    add    eax, eax
    cmovc  eax, edx

The same holds for the opposite case: for (crc & 1) followed
by (crc >>= 1) there is no need for an explicit "test crc, 1"
since the right shift "moves" the LSB into CF.

regards
Stefan

> Which is this in LLVM IR:
> define i32 @foo(i32 %x) {
>  %t2 = icmp slt i32 %x, 0
>  %t3 = shl i32 %x, 1
>  %t4 = xor i32 %t3, -306674912
>  %t5 = select i1 %t2, i32 %t4, i32 %t3
>  ret i32 %t5
> }
>
> Please a file a bug report for the x86 backend (including performance
> numbers if you have that data).
>
> On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <
> [hidden email]> wrote:
>
>> Hi @ll,
>>
>> while clang/LLVM recognizes common bit-twiddling idioms/expressions
>> like
>>
>> unsigned int rotate(unsigned int x, unsigned int n)
>> {
>>     return (x << n) | (x >> (32 - n));
>> }
>>
>> and typically generates "rotate" machine instructions for this
>> expression, it fails to recognize other also common bit-twiddling
>> idioms/expressions.
>>
>> The standard IEEE CRC-32 for "big endian" alias "network" byte order
>> (see <https://tools.ietf.org/html/rfc1952#section-8> for example):
>>
>> unsigned int crc32be(unsigned char const *octets, unsigned int count)
>> {
>>     unsigned int crc = 0L;
>>     unsigned int i;
>>
>>     while (count--) {
>>         crc ^= *octets++ << 24;
>>         for (i = 8; i > 0; i--)
>>             if (crc & 0x80000000L)             // the code generated
>>                 crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from
>>             else                               // these 4 lines is
>>                 crc <<= 1;                     // rather poor!
>>     }
>>     return crc;
>> }
>>
>> The same function for "little endian" byte order, using the "inverse"
>> or "mirrored" polynom:
>>
>> unsigned int crc32le(unsigned char const *octets, unsigned int count)
>> {
>>     unsigned int crc = ~0L;
>>     unsigned int i;
>>
>>     while (count--) {
>>         crc ^= *octets++;
>>         for (i = 8; i > 0; i--)
>>             if (crc & 1L)                      // the code generated
>>                 crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from
>>             else                               // these 4 lines is
>>                 crc >>= 1;                     // rather poor!
>>     }
>>     return ~crc;
>> }
>>
>> See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm>
>> (-O2)
>>
>> crc32be: # @crc32be
>>         xor    eax, eax
>>         test   esi, esi
>>         jne    .LBB0_2
>>         jmp    .LBB0_5
>> .LBB0_4: # in Loop: Header=BB0_2 Depth=1
>>         add    rdi, 1
>>         test   esi, esi
>>         je    .LBB0_5
>> .LBB0_2: # =>This Loop Header: Depth=1
>>         add    esi, -1
>>         movzx  edx, byte ptr [rdi]
>>         shl    edx, 24
>>         xor    edx, eax
>>         mov    ecx, -8
>>         mov    eax, edx
>> .LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead of 6, r8
>> not clobbered!
>>         lea    r8d, [rax + rax]        |     add   eax, eax
>>         mov    edx, r8d                | # CF is set from the MSB of EAX
>>         xor    edx, -306674912         |     sbb   edx, edx
>>         test   eax, eax                | # EDX is 0xFFFFFFFF if CF set,
>> else 0
>>         mov    eax, edx                |     and   edx, -306674912
>>         cmovns eax, r8d                |     xor   eax, edx
>>         add    ecx, 1
>>         jne    .LBB0_3
>>         jmp    .LBB0_4
>> .LBB0_5:
>>         ret
>> crc32le: # @crc32le
>>         test   esi, esi
>>         je    .LBB1_1
>>         mov    eax, -1
>> .LBB1_4: # =>This Loop Header: Depth=1
>>         add    esi, -1
>>         movzx  ecx, byte ptr [rdi]
>>         xor    eax, ecx
>>         mov    r8d, -8
>> .LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead of 7, and
>>         mov    edx, eax                | #  neither r8 nor rcx clobbered!
>>         shr    edx                     |     shr   eax, 1
>>         mov    ecx, edx                | # CF is set from the LSB of EAX
>>         xor    ecx, 79764919           |     sbb   edx, edx
>>         test   al, 1                   | # EDX is 0xFFFFFFFF if CF set,
>> else 0
>>         mov    eax, ecx                |     and   edx, 79764919
>>         cmove  eax, edx                |     xor   eax, edx
>>         add    r8d, 1
>>         jne    .LBB1_5
>>         add    rdi, 1
>>         test   esi, esi
>>         jne    .LBB1_4
>>         not    eax
>>         ret
>> .LBB1_1:
>>         xor    eax, eax
>>         ret
>>
>> JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal
>>       code sequence with 6 and 7 instructions; this accounts for a total
>>       of 16 and 24 superfluous instructions respectively.
>> _______________________________________________
>> LLVM Developers mailing list
>> [hidden email]
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>>
_______________________________________________
LLVM Developers mailing list
[hidden email]
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
Reply | Threaded
Open this post in threaded view
|

Re: [llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

Alberto Barbaro via llvm-dev
In reply to this post by Alberto Barbaro via llvm-dev
"Sanjay Patel" <[hidden email]> wrote:

> IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like
> this:
> unsigned int foo(unsigned int crc) {
>    if (crc & 0x80000000)
>      crc <<= 1, crc ^= 0xEDB88320;
>    else
>      crc <<= 1;
>    return crc;
> }
>
> Which is this in LLVM IR:
> define i32 @foo(i32 %x) {
>  %t2 = icmp slt i32 %x, 0
>  %t3 = shl i32 %x, 1
>  %t4 = xor i32 %t3, -306674912
>  %t5 = select i1 %t2, i32 %t4, i32 %t3
>  ret i32 %t5
> }
>
> Please a file a bug report for the x86 backend (including performance
> numbers if you have that data).

JFTR: as soon as the ternary operator is moved into a function,
      LLVM/clang performs an EQUIVALENT optimisation for the left-
      shifting CRC/LFSR, for both x86 and x86-64: see
      <https://godbolt.org/z/J1KY2d>

      The right-shifting CRC/LFSR is, however, still NOT optimal!

--- test.c ---

unsigned long long lfsr64right(unsigned long long argument, unsigned long long polynomial)
{
    return argument & 1 ? polynomial ^ (argument >> 1) : argument >> 1;
}

...

unsigned long long lfsr64left(unsigned long long argument, unsigned long long polynomial)
{
    return (long long) argument < 0 ? polynomial ^ (argument << 1) : argument << 1;
}

...

--- EOF ---

lfsr64right: # @lfsr64right
;;; remove these 3 instructions
;;; mov    eax, edi
;;; and    eax, 1
;;; neg    rax
    shr    rdi
;;; add the next instruction
    sbb    rax, rax
    and    rax, rsi
    xor    rax, rdi
    ret

...

lfsr64left: # @lfsr64left
    lea    rax, [rdi + rdi]
    sar    rdi, 63
    and    rdi, rsi
    xor    rax, rdi
    ret

These 5 instructions are perfect!
If now LLVM/clang would use this sequence without moving the
ternary operator into its own function (which is inlined anyway)...

regards
Stefan Kanthak

> On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <
> [hidden email]> wrote:
>
>> Hi @ll,
>>
>> while clang/LLVM recognizes common bit-twiddling idioms/expressions
>> like
>>
>> unsigned int rotate(unsigned int x, unsigned int n)
>> {
>>     return (x << n) | (x >> (32 - n));
>> }
>>
>> and typically generates "rotate" machine instructions for this
>> expression, it fails to recognize other also common bit-twiddling
>> idioms/expressions.
>>
>> The standard IEEE CRC-32 for "big endian" alias "network" byte order
>> (see <https://tools.ietf.org/html/rfc1952#section-8> for example):
>>
>> unsigned int crc32be(unsigned char const *octets, unsigned int count)
>> {
>>     unsigned int crc = 0L;
>>     unsigned int i;
>>
>>     while (count--) {
>>         crc ^= *octets++ << 24;
>>         for (i = 8; i > 0; i--)
>>             if (crc & 0x80000000L)             // the code generated
>>                 crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from
>>             else                               // these 4 lines is
>>                 crc <<= 1;                     // rather poor!
>>     }
>>     return crc;
>> }
>>
>> The same function for "little endian" byte order, using the "inverse"
>> or "mirrored" polynom:
>>
>> unsigned int crc32le(unsigned char const *octets, unsigned int count)
>> {
>>     unsigned int crc = ~0L;
>>     unsigned int i;
>>
>>     while (count--) {
>>         crc ^= *octets++;
>>         for (i = 8; i > 0; i--)
>>             if (crc & 1L)                      // the code generated
>>                 crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from
>>             else                               // these 4 lines is
>>                 crc >>= 1;                     // rather poor!
>>     }
>>     return ~crc;
>> }
>>
>> See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm>
>> (-O2)
>>
>> crc32be: # @crc32be
>>         xor    eax, eax
>>         test   esi, esi
>>         jne    .LBB0_2
>>         jmp    .LBB0_5
>> .LBB0_4: # in Loop: Header=BB0_2 Depth=1
>>         add    rdi, 1
>>         test   esi, esi
>>         je    .LBB0_5
>> .LBB0_2: # =>This Loop Header: Depth=1
>>         add    esi, -1
>>         movzx  edx, byte ptr [rdi]
>>         shl    edx, 24
>>         xor    edx, eax
>>         mov    ecx, -8
>>         mov    eax, edx
>> .LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead of 6, r8
>> not clobbered!
>>         lea    r8d, [rax + rax]        |     add   eax, eax
>>         mov    edx, r8d                | # CF is set from the MSB of EAX
>>         xor    edx, -306674912         |     sbb   edx, edx
>>         test   eax, eax                | # EDX is 0xFFFFFFFF if CF set,
>> else 0
>>         mov    eax, edx                |     and   edx, -306674912
>>         cmovns eax, r8d                |     xor   eax, edx
>>         add    ecx, 1
>>         jne    .LBB0_3
>>         jmp    .LBB0_4
>> .LBB0_5:
>>         ret
>> crc32le: # @crc32le
>>         test   esi, esi
>>         je    .LBB1_1
>>         mov    eax, -1
>> .LBB1_4: # =>This Loop Header: Depth=1
>>         add    esi, -1
>>         movzx  ecx, byte ptr [rdi]
>>         xor    eax, ecx
>>         mov    r8d, -8
>> .LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead of 7, and
>>         mov    edx, eax                | #  neither r8 nor rcx clobbered!
>>         shr    edx                     |     shr   eax, 1
>>         mov    ecx, edx                | # CF is set from the LSB of EAX
>>         xor    ecx, 79764919           |     sbb   edx, edx
>>         test   al, 1                   | # EDX is 0xFFFFFFFF if CF set,
>> else 0
>>         mov    eax, ecx                |     and   edx, 79764919
>>         cmove  eax, edx                |     xor   eax, edx
>>         add    r8d, 1
>>         jne    .LBB1_5
>>         add    rdi, 1
>>         test   esi, esi
>>         jne    .LBB1_4
>>         not    eax
>>         ret
>> .LBB1_1:
>>         xor    eax, eax
>>         ret
>>
>> JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal
>>       code sequence with 6 and 7 instructions; this accounts for a total
>>       of 16 and 24 superfluous instructions respectively.
>> _______________________________________________
>> LLVM Developers mailing list
>> [hidden email]
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>>
>
_______________________________________________
LLVM Developers mailing list
[hidden email]
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
Reply | Threaded
Open this post in threaded view
|

Re: [llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

Alberto Barbaro via llvm-dev
In reply to this post by Alberto Barbaro via llvm-dev
"Sanjay Patel" <[hidden email]> wrote:

> IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like
> this:
> unsigned int foo(unsigned int crc) {
>    if (crc & 0x80000000)
>      crc <<= 1, crc ^= 0xEDB88320;
>    else
>      crc <<= 1;
>    return crc;
> }

To document this for x86 too: rewrite the function slightly

unsigned int foo(unsigned int crc, unsigned int poly) {
    if (crc & 0x80000000)
        crc <<= 1, crc ^= poly;
    else
        crc <<= 1;
    return crc;
}

unsigned int bar(unsigned int crc, unsigned int poly) {
    if (crc & 1)
        crc >>= 1, crc ^= poly;
    else
        crc >>= 1;
    return crc;
}

and you get the perfect code for the left-shifting case!

foo: # @foo
    lea eax, [rdi + rdi]
    sar edi, 31
    and edi, esi
    xor eax, edi
    ret

The right-shifting case, however, still leaves room for improvement!

bar: # @bar                | bar: # @bar
    mov eax, edi           |
    and eax, 1             |
    neg eax                |
    shr edi                |     shr edi
                           |     sbb eax, eax
    and eax, esi           |     and eax, esi
    xor eax, edi           |     xor eax, edi
    ret                    |     ret

See <https://godbolt.org/z/aPKweG>

regards
Stefan Kanthak

> Which is this in LLVM IR:
> define i32 @foo(i32 %x) {
>  %t2 = icmp slt i32 %x, 0
>  %t3 = shl i32 %x, 1
>  %t4 = xor i32 %t3, -306674912
>  %t5 = select i1 %t2, i32 %t4, i32 %t3
>  ret i32 %t5
> }
>
> Please a file a bug report for the x86 backend (including performance
> numbers if you have that data).
>
> On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <
> [hidden email]> wrote:
>
>> Hi @ll,
>>
>> while clang/LLVM recognizes common bit-twiddling idioms/expressions
>> like
>>
>> unsigned int rotate(unsigned int x, unsigned int n)
>> {
>>     return (x << n) | (x >> (32 - n));
>> }
>>
>> and typically generates "rotate" machine instructions for this
>> expression, it fails to recognize other also common bit-twiddling
>> idioms/expressions.
>>
>> The standard IEEE CRC-32 for "big endian" alias "network" byte order
>> (see <https://tools.ietf.org/html/rfc1952#section-8> for example):
>>
>> unsigned int crc32be(unsigned char const *octets, unsigned int count)
>> {
>>     unsigned int crc = 0L;
>>     unsigned int i;
>>
>>     while (count--) {
>>         crc ^= *octets++ << 24;
>>         for (i = 8; i > 0; i--)
>>             if (crc & 0x80000000L)             // the code generated
>>                 crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from
>>             else                               // these 4 lines is
>>                 crc <<= 1;                     // rather poor!
>>     }
>>     return crc;
>> }
>>
>> The same function for "little endian" byte order, using the "inverse"
>> or "mirrored" polynom:
>>
>> unsigned int crc32le(unsigned char const *octets, unsigned int count)
>> {
>>     unsigned int crc = ~0L;
>>     unsigned int i;
>>
>>     while (count--) {
>>         crc ^= *octets++;
>>         for (i = 8; i > 0; i--)
>>             if (crc & 1L)                      // the code generated
>>                 crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from
>>             else                               // these 4 lines is
>>                 crc >>= 1;                     // rather poor!
>>     }
>>     return ~crc;
>> }
>>
>> See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm>
>> (-O2)
>>
>> crc32be: # @crc32be
>>         xor    eax, eax
>>         test   esi, esi
>>         jne    .LBB0_2
>>         jmp    .LBB0_5
>> .LBB0_4: # in Loop: Header=BB0_2 Depth=1
>>         add    rdi, 1
>>         test   esi, esi
>>         je    .LBB0_5
>> .LBB0_2: # =>This Loop Header: Depth=1
>>         add    esi, -1
>>         movzx  edx, byte ptr [rdi]
>>         shl    edx, 24
>>         xor    edx, eax
>>         mov    ecx, -8
>>         mov    eax, edx
>> .LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead of 6, r8
>> not clobbered!
>>         lea    r8d, [rax + rax]        |     add   eax, eax
>>         mov    edx, r8d                | # CF is set from the MSB of EAX
>>         xor    edx, -306674912         |     sbb   edx, edx
>>         test   eax, eax                | # EDX is 0xFFFFFFFF if CF set,
>> else 0
>>         mov    eax, edx                |     and   edx, -306674912
>>         cmovns eax, r8d                |     xor   eax, edx
>>         add    ecx, 1
>>         jne    .LBB0_3
>>         jmp    .LBB0_4
>> .LBB0_5:
>>         ret
>> crc32le: # @crc32le
>>         test   esi, esi
>>         je    .LBB1_1
>>         mov    eax, -1
>> .LBB1_4: # =>This Loop Header: Depth=1
>>         add    esi, -1
>>         movzx  ecx, byte ptr [rdi]
>>         xor    eax, ecx
>>         mov    r8d, -8
>> .LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead of 7, and
>>         mov    edx, eax                | #  neither r8 nor rcx clobbered!
>>         shr    edx                     |     shr   eax, 1
>>         mov    ecx, edx                | # CF is set from the LSB of EAX
>>         xor    ecx, 79764919           |     sbb   edx, edx
>>         test   al, 1                   | # EDX is 0xFFFFFFFF if CF set,
>> else 0
>>         mov    eax, ecx                |     and   edx, 79764919
>>         cmove  eax, edx                |     xor   eax, edx
>>         add    r8d, 1
>>         jne    .LBB1_5
>>         add    rdi, 1
>>         test   esi, esi
>>         jne    .LBB1_4
>>         not    eax
>>         ret
>> .LBB1_1:
>>         xor    eax, eax
>>         ret
>>
>> JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal
>>       code sequence with 6 and 7 instructions; this accounts for a total
>>       of 16 and 24 superfluous instructions respectively.
>> _______________________________________________
>> LLVM Developers mailing list
>> [hidden email]
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>>
>
_______________________________________________
LLVM Developers mailing list
[hidden email]
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
Reply | Threaded
Open this post in threaded view
|

Re: [llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

Alberto Barbaro via llvm-dev
Thanks for reporting this and other perf opportunities. As I mentioned before, if you could file bug reports for these, that's probably the only way they're ever going to get fixed (unless you're planning to fix them yourself). It's not an ideal situation, but that's how this project works in my experience.

Someone filed 1 of your examples on your behalf here:

Please do consider following that model for your other problems. Without that kind of report, your code examples are likely to be forgotten.

Finally, sending mail to llvm-bugs is practically useless. I don't know of anyone that is tracking mails to that list rather than actual bug reports.

On Tue, Nov 27, 2018 at 4:37 PM Stefan Kanthak <[hidden email]> wrote:
"Sanjay Patel" <[hidden email]> wrote:

> IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like
> this:
> unsigned int foo(unsigned int crc) {
>    if (crc & 0x80000000)
>      crc <<= 1, crc ^= 0xEDB88320;
>    else
>      crc <<= 1;
>    return crc;
> }

To document this for x86 too: rewrite the function slightly

unsigned int foo(unsigned int crc, unsigned int poly) {
    if (crc & 0x80000000)
        crc <<= 1, crc ^= poly;
    else
        crc <<= 1;
    return crc;
}

unsigned int bar(unsigned int crc, unsigned int poly) {
    if (crc & 1)
        crc >>= 1, crc ^= poly;
    else
        crc >>= 1;
    return crc;
}

and you get the perfect code for the left-shifting case!

foo: # @foo
    lea eax, [rdi + rdi]
    sar edi, 31
    and edi, esi
    xor eax, edi
    ret

The right-shifting case, however, still leaves room for improvement!

bar: # @bar                | bar: # @bar
    mov eax, edi           |
    and eax, 1             |
    neg eax                |
    shr edi                |     shr edi
                           |     sbb eax, eax
    and eax, esi           |     and eax, esi
    xor eax, edi           |     xor eax, edi
    ret                    |     ret

See <https://godbolt.org/z/aPKweG>

regards
Stefan Kanthak

> Which is this in LLVM IR:
> define i32 @foo(i32 %x) {
>  %t2 = icmp slt i32 %x, 0
>  %t3 = shl i32 %x, 1
>  %t4 = xor i32 %t3, -306674912
>  %t5 = select i1 %t2, i32 %t4, i32 %t3
>  ret i32 %t5
> }
>
> Please file a bug report for the x86 backend (including performance
> numbers if you have that data).
>
> On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <
> [hidden email]> wrote:
>
>> Hi @ll,
>>
>> while clang/LLVM recognizes common bit-twiddling idioms/expressions
>> like
>>
>> unsigned int rotate(unsigned int x, unsigned int n)
>> {
>>     return (x << n) | (x >> (32 - n));
>> }
>>
>> and typically generates "rotate" machine instructions for this
>> expression, it fails to recognize other also common bit-twiddling
>> idioms/expressions.
>>
>> The standard IEEE CRC-32 for "big endian" alias "network" byte order
>> (see <https://tools.ietf.org/html/rfc1952#section-8> for example):
>>
>> unsigned int crc32be(unsigned char const *octets, unsigned int count)
>> {
>>     unsigned int crc = 0L;
>>     unsigned int i;
>>
>>     while (count--) {
>>         crc ^= *octets++ << 24;
>>         for (i = 8; i > 0; i--)
>>             if (crc & 0x80000000L)             // the code generated
>>                 crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from
>>             else                               // these 4 lines is
>>                 crc <<= 1;                     // rather poor!
>>     }
>>     return crc;
>> }
>>
>> The same function for "little endian" byte order, using the "inverse"
>> or "mirrored" polynomial:
>>
>> unsigned int crc32le(unsigned char const *octets, unsigned int count)
>> {
>>     unsigned int crc = ~0L;
>>     unsigned int i;
>>
>>     while (count--) {
>>         crc ^= *octets++;
>>         for (i = 8; i > 0; i--)
>>             if (crc & 1L)                      // the code generated
>>                 crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from
>>             else                               // these 4 lines is
>>                 crc >>= 1;                     // rather poor!
>>     }
>>     return ~crc;
>> }
>>
>> See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm>
>> (-O2)
>>
>> crc32be: # @crc32be
>>         xor    eax, eax
>>         test   esi, esi
>>         jne    .LBB0_2
>>         jmp    .LBB0_5
>> .LBB0_4: # in Loop: Header=BB0_2 Depth=1
>>         add    rdi, 1
>>         test   esi, esi
>>         je    .LBB0_5
>> .LBB0_2: # =>This Loop Header: Depth=1
>>         add    esi, -1
>>         movzx  edx, byte ptr [rdi]
>>         shl    edx, 24
>>         xor    edx, eax
>>         mov    ecx, -8
>>         mov    eax, edx
>> .LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead of 6, r8
>> not clobbered!
>>         lea    r8d, [rax + rax]        |     add   eax, eax
>>         mov    edx, r8d                | # CF is set from the MSB of EAX
>>         xor    edx, -306674912         |     sbb   edx, edx
>>         test   eax, eax                | # EDX is 0xFFFFFFFF if CF set,
>> else 0
>>         mov    eax, edx                |     and   edx, -306674912
>>         cmovns eax, r8d                |     xor   eax, edx
>>         add    ecx, 1
>>         jne    .LBB0_3
>>         jmp    .LBB0_4
>> .LBB0_5:
>>         ret
>> crc32le: # @crc32le
>>         test   esi, esi
>>         je    .LBB1_1
>>         mov    eax, -1
>> .LBB1_4: # =>This Loop Header: Depth=1
>>         add    esi, -1
>>         movzx  ecx, byte ptr [rdi]
>>         xor    eax, ecx
>>         mov    r8d, -8
>> .LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead of 7, and
>>         mov    edx, eax                | #  neither r8 nor rcx clobbered!
>>         shr    edx                     |     shr   eax, 1
>>         mov    ecx, edx                | # CF is set from the LSB of EAX
>>         xor    ecx, 79764919           |     sbb   edx, edx
>>         test   al, 1                   | # EDX is 0xFFFFFFFF if CF set,
>> else 0
>>         mov    eax, ecx                |     and   edx, 79764919
>>         cmove  eax, edx                |     xor   eax, edx
>>         add    r8d, 1
>>         jne    .LBB1_5
>>         add    rdi, 1
>>         test   esi, esi
>>         jne    .LBB1_4
>>         not    eax
>>         ret
>> .LBB1_1:
>>         xor    eax, eax
>>         ret
>>
>> JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal
>>       code sequence with 6 and 7 instructions; this accounts for a total
>>       of 16 and 24 superfluous instructions respectively.
>> _______________________________________________
>> LLVM Developers mailing list
>> [hidden email]
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>>
>

_______________________________________________
LLVM Developers mailing list
[hidden email]
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
Reply | Threaded
Open this post in threaded view
|

Re: [llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

Alberto Barbaro via llvm-dev


On Wed, Nov 28, 2018 at 7:11 AM Sanjay Patel via llvm-dev <[hidden email]> wrote:
Thanks for reporting this and other perf opportunities. As I mentioned before, if you could file bug reports for these, that's probably the only way they're ever going to get fixed (unless you're planning to fix them yourself). It's not an ideal situation, but that's how this project works in my experience.

Someone filed 1 of your examples on your behalf here:

Please do consider following that model for your other problems. Without that kind of report, your code examples are likely to be forgotten.

Finally, sending mail to llvm-bugs is practically useless. I don't know of anyone that is tracking mails to that list rather than actual bug reports.

The list is setup to not accept email, basically - I'm one of the moderators there & any email sent there that doesn't come from the bug database will get automatically or manually rejected with the suggestion/request that the bug database is used instead.
 

On Tue, Nov 27, 2018 at 4:37 PM Stefan Kanthak <[hidden email]> wrote:
"Sanjay Patel" <[hidden email]> wrote:

> IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like
> this:
> unsigned int foo(unsigned int crc) {
>    if (crc & 0x80000000)
>      crc <<= 1, crc ^= 0xEDB88320;
>    else
>      crc <<= 1;
>    return crc;
> }

To document this for x86 too: rewrite the function slightly

unsigned int foo(unsigned int crc, unsigned int poly) {
    if (crc & 0x80000000)
        crc <<= 1, crc ^= poly;
    else
        crc <<= 1;
    return crc;
}

unsigned int bar(unsigned int crc, unsigned int poly) {
    if (crc & 1)
        crc >>= 1, crc ^= poly;
    else
        crc >>= 1;
    return crc;
}

and you get the perfect code for the left-shifting case!

foo: # @foo
    lea eax, [rdi + rdi]
    sar edi, 31
    and edi, esi
    xor eax, edi
    ret

The right-shifting case, however, still leaves room for improvement!

bar: # @bar                | bar: # @bar
    mov eax, edi           |
    and eax, 1             |
    neg eax                |
    shr edi                |     shr edi
                           |     sbb eax, eax
    and eax, esi           |     and eax, esi
    xor eax, edi           |     xor eax, edi
    ret                    |     ret

See <https://godbolt.org/z/aPKweG>

regards
Stefan Kanthak

> Which is this in LLVM IR:
> define i32 @foo(i32 %x) {
>  %t2 = icmp slt i32 %x, 0
>  %t3 = shl i32 %x, 1
>  %t4 = xor i32 %t3, -306674912
>  %t5 = select i1 %t2, i32 %t4, i32 %t3
>  ret i32 %t5
> }
>
> Please file a bug report for the x86 backend (including performance
> numbers if you have that data).
>
> On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <
> [hidden email]> wrote:
>
>> Hi @ll,
>>
>> while clang/LLVM recognizes common bit-twiddling idioms/expressions
>> like
>>
>> unsigned int rotate(unsigned int x, unsigned int n)
>> {
>>     return (x << n) | (x >> (32 - n));
>> }
>>
>> and typically generates "rotate" machine instructions for this
>> expression, it fails to recognize other also common bit-twiddling
>> idioms/expressions.
>>
>> The standard IEEE CRC-32 for "big endian" alias "network" byte order
>> (see <https://tools.ietf.org/html/rfc1952#section-8> for example):
>>
>> unsigned int crc32be(unsigned char const *octets, unsigned int count)
>> {
>>     unsigned int crc = 0L;
>>     unsigned int i;
>>
>>     while (count--) {
>>         crc ^= *octets++ << 24;
>>         for (i = 8; i > 0; i--)
>>             if (crc & 0x80000000L)             // the code generated
>>                 crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from
>>             else                               // these 4 lines is
>>                 crc <<= 1;                     // rather poor!
>>     }
>>     return crc;
>> }
>>
>> The same function for "little endian" byte order, using the "inverse"
>> or "mirrored" polynomial:
>>
>> unsigned int crc32le(unsigned char const *octets, unsigned int count)
>> {
>>     unsigned int crc = ~0L;
>>     unsigned int i;
>>
>>     while (count--) {
>>         crc ^= *octets++;
>>         for (i = 8; i > 0; i--)
>>             if (crc & 1L)                      // the code generated
>>                 crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from
>>             else                               // these 4 lines is
>>                 crc >>= 1;                     // rather poor!
>>     }
>>     return ~crc;
>> }
>>
>> See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm>
>> (-O2)
>>
>> crc32be: # @crc32be
>>         xor    eax, eax
>>         test   esi, esi
>>         jne    .LBB0_2
>>         jmp    .LBB0_5
>> .LBB0_4: # in Loop: Header=BB0_2 Depth=1
>>         add    rdi, 1
>>         test   esi, esi
>>         je    .LBB0_5
>> .LBB0_2: # =>This Loop Header: Depth=1
>>         add    esi, -1
>>         movzx  edx, byte ptr [rdi]
>>         shl    edx, 24
>>         xor    edx, eax
>>         mov    ecx, -8
>>         mov    eax, edx
>> .LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead of 6, r8
>> not clobbered!
>>         lea    r8d, [rax + rax]        |     add   eax, eax
>>         mov    edx, r8d                | # CF is set from the MSB of EAX
>>         xor    edx, -306674912         |     sbb   edx, edx
>>         test   eax, eax                | # EDX is 0xFFFFFFFF if CF set,
>> else 0
>>         mov    eax, edx                |     and   edx, -306674912
>>         cmovns eax, r8d                |     xor   eax, edx
>>         add    ecx, 1
>>         jne    .LBB0_3
>>         jmp    .LBB0_4
>> .LBB0_5:
>>         ret
>> crc32le: # @crc32le
>>         test   esi, esi
>>         je    .LBB1_1
>>         mov    eax, -1
>> .LBB1_4: # =>This Loop Header: Depth=1
>>         add    esi, -1
>>         movzx  ecx, byte ptr [rdi]
>>         xor    eax, ecx
>>         mov    r8d, -8
>> .LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead of 7, and
>>         mov    edx, eax                | #  neither r8 nor rcx clobbered!
>>         shr    edx                     |     shr   eax, 1
>>         mov    ecx, edx                | # CF is set from the LSB of EAX
>>         xor    ecx, 79764919           |     sbb   edx, edx
>>         test   al, 1                   | # EDX is 0xFFFFFFFF if CF set,
>> else 0
>>         mov    eax, ecx                |     and   edx, 79764919
>>         cmove  eax, edx                |     xor   eax, edx
>>         add    r8d, 1
>>         jne    .LBB1_5
>>         add    rdi, 1
>>         test   esi, esi
>>         jne    .LBB1_4
>>         not    eax
>>         ret
>> .LBB1_1:
>>         xor    eax, eax
>>         ret
>>
>> JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal
>>       code sequence with 6 and 7 instructions; this accounts for a total
>>       of 16 and 24 superfluous instructions respectively.
>> _______________________________________________
>> LLVM Developers mailing list
>> [hidden email]
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>>
>
_______________________________________________
LLVM Developers mailing list
[hidden email]
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev

_______________________________________________
LLVM Developers mailing list
[hidden email]
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
Reply | Threaded
Open this post in threaded view
|

Re: [llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

Alberto Barbaro via llvm-dev
"David Blaikie" <[hidden email]> wrote:

> On Wed, Nov 28, 2018 at 7:11 AM Sanjay Patel via llvm-dev <
> [hidden email]> wrote:

[...]

>> Finally, sending mail to llvm-bugs is practically useless. I don't know of
>> anyone that is tracking mails to that list rather than actual bug reports.
>>
>
> The list is setup to not accept email, basically

That's VERY nice for a mailing list. NOT!

> - I'm one of the moderators there & any email sent there that doesn't
> come from the bug database will get automatically or manually rejected
> with the suggestion/request that the bug database is used instead.

NO!
The following text is sent back:

| You are not allowed to post to this mailing list, and your message has
| been automatically rejected.  If you think that your messages are
| being rejected in error, contact the mailing list owner at
| [hidden email].

I recommend/request that this list EITHER be opened for bug reports,
OR not announced on <http://lists.llvm.org/mailman/listinfo> any more!

Stefan

[ fullquote removed ]
_______________________________________________
LLVM Developers mailing list
[hidden email]
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
Reply | Threaded
Open this post in threaded view
|

Re: [llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

Alberto Barbaro via llvm-dev


On Mon, Dec 3, 2018 at 11:18 AM Stefan Kanthak <[hidden email]> wrote:
"David Blaikie" <[hidden email]> wrote:

> On Wed, Nov 28, 2018 at 7:11 AM Sanjay Patel via llvm-dev <
> [hidden email]> wrote:

[...]

>> Finally, sending mail to llvm-bugs is practically useless. I don't know of
>> anyone that is tracking mails to that list rather than actual bug reports.
>>
>
> The list is setup to not accept email, basically

That's VERY nice for a mailing list. NOT!

This sort of antagonistic/sarcastic language isn't really conducive to a productive discussion - please avoid it in future discussions with the LLVM community (others have mentioned this already/previously).

> - I'm one of the moderators there & any email sent there that doesn't
> come from the bug database will get automatically or manually rejected
> with the suggestion/request that the bug database is used instead.

NO!
The following text is sent back:

| You are not allowed to post to this mailing list, and your message has
| been automatically rejected.  If you think that your messages are
| being rejected in error, contact the mailing list owner at
| [hidden email].

Yeah, I just went through the mail filtering this morning - and since other people (& myself here) had already explained why llvm-bugs wasn't an appropriate mailing list to send to - I didn't write a more comprehensive response & used the auto-response when rejecting all those emails.

I recommend/request that this list EITHER be opened for bug reports,
OR not announced on <http://lists.llvm.org/mailman/listinfo> any more!

It's useful to have it listed there - people subscribe to it to read the bug reports that are filed through the bugzilla instance.

- Dave
 

Stefan

[ fullquote removed ]

_______________________________________________
LLVM Developers mailing list
[hidden email]
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev