[llvm-dev] windows ABI problem with i128?

Previous Topic Next Topic
 
classic Classic list List threaded Threaded
4 messages Options
Reply | Threaded
Open this post in threaded view
|

[llvm-dev] windows ABI problem with i128?

Andrew Kelley via llvm-dev
I'm trying to use LLVM to create compiler-rt.o on Windows. I use this command from the compiler-rt project:

[nix-shell:~/downloads/llvm-project/compiler-rt]$ clang -nostdlib  -S -emit-llvm lib/builtins/udivti3.c  -g -target x86_64-windows -DCRT_HAS_128BIT

The resulting LLVM IR is:
=================================================================

; ModuleID = 'lib/builtins/udivti3.c'
source_filename = "lib/builtins/udivti3.c"
target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64--windows-msvc19.11.0"

; Function Attrs: noinline nounwind optnone uwtable
define i128 @__udivti3(i128, i128) #0 {
  %3 = alloca i128, align 16
  %4 = alloca i128, align 16
  store i128 %1, i128* %3, align 16
  store i128 %0, i128* %4, align 16
  %5 = load i128, i128* %3, align 16
  %6 = load i128, i128* %4, align 16
  %7 = call i128 @__udivmodti4(i128 %6, i128 %5, i128* null)
  ret i128 %7
}

declare i128 @__udivmodti4(i128, i128, i128*) #1

attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 1, !"wchar_size", i32 2}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 6.0.0 (tags/RELEASE_600/final)"}


=================================================================
However I think this results in a different ABI than LLVM will use when you do i128 division. For example, here is my test case (in zig code):
=================================================================

pub extern "kernel32" stdcallcc fn ExitProcess(exit_code: c_uint) noreturn;

export fn WinMainCRTStartup() noreturn {
    @setAlignStack(16);
    @setRuntimeSafety(false);

    var a: u128 = 152313999999999991610955792383;
    var b: u128 = 10000000000000000000;
    var c = a / b; // this generates a call to __udivti3

    if (c != b) {
        @breakpoint();
    }
    ExitProcess(0);
}

export fn __udivti3(a: u128, b: u128) u128 {
    @setRuntimeSafety(false);
    return b;
}


=================================================================
This results in this LLVM IR:
=================================================================

; ModuleID = 'test'
source_filename = "test"
target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-windows-msvc"

%"[]u8" = type { i8*, i64 }
%StackTrace = type { i64, %"[]usize" }
%"[]usize" = type { i64*, i64 }

; Function Attrs: nounwind readnone speculatable
declare void @llvm.dbg.declare(metadata, metadata, metadata) #1

; Function Attrs: nobuiltin noinline noreturn nounwind uwtable alignstack(16)
define void @WinMainCRTStartup() #2 !dbg !41 {
Entry:
  %a = alloca i128, align 8
  %b = alloca i128, align 8
  %c = alloca i128, align 8
  store i128 152313999999999991610955792383, i128* %a, align 8, !dbg !52
  call void @llvm.dbg.declare(metadata i128* %a, metadata !45, metadata !DIExpression()), !dbg !52
  store i128 10000000000000000000, i128* %b, align 8, !dbg !53
  call void @llvm.dbg.declare(metadata i128* %b, metadata !48, metadata !DIExpression()), !dbg !53
  %0 = load i128, i128* %a, align 8, !dbg !54
  %1 = load i128, i128* %b, align 8, !dbg !55
  %2 = udiv i128 %0, %1, !dbg !56
  store i128 %2, i128* %c, align 8, !dbg !57
  call void @llvm.dbg.declare(metadata i128* %c, metadata !50, metadata !DIExpression()), !dbg !57
  %3 = load i128, i128* %c, align 8, !dbg !58
  %4 = load i128, i128* %b, align 8, !dbg !60
  %5 = icmp ne i128 %3, %4, !dbg !61
  br i1 %5, label %Then, label %Else, !dbg !61

Then:                                             ; preds = %Entry
  call void @llvm.debugtrap(), !dbg !62
  br label %EndIf, !dbg !64

Else:                                             ; preds = %Entry
  br label %EndIf, !dbg !64

EndIf:                                            ; preds = %Else, %Then
  call void @ExitProcess(i32 0), !dbg !65
  unreachable, !dbg !65
}

; Function Attrs: nounwind
declare void @llvm.debugtrap() #3

; Function Attrs: nobuiltin noreturn nounwind uwtable
declare void @ExitProcess(i32) #0

; Function Attrs: nobuiltin nounwind uwtable
define i128 @__udivti3(i128, i128) #4 !dbg !66 {
Entry:
  %a = alloca i128, align 8
  %b = alloca i128, align 8
  store i128 %0, i128* %a, align 8
  call void @llvm.dbg.declare(metadata i128* %a, metadata !70, metadata !DIExpression()), !dbg !73
  store i128 %1, i128* %b, align 8
  call void @llvm.dbg.declare(metadata i128* %b, metadata !71, metadata !DIExpression()), !dbg !74
  %2 = load i128, i128* %b, align 8, !dbg !75
  ret i128 %2, !dbg !78
}

; Function Attrs: nounwind
declare void @llvm.stackprotector(i8*, i8**) #3

attributes #0 = { nobuiltin noreturn nounwind uwtable "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nobuiltin noinline noreturn nounwind uwtable alignstack=16 "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
attributes #3 = { nounwind }
attributes #4 = { nobuiltin nounwind uwtable "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }

!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}

=================================================================

When I link this (with link.exe or LLD, it does not matter):
link.exe /OUT:test.exe /ENTRY:WinMainCRTStartup test.obj /subsystem:console kernel32.lib /nologo

And run it, it triggers the breakpoint.

Meanwhile on linux, this test passes.

I suspect it may be a calling convention issue. Here is the assembly for the linux x86_64 version:


=================================================================
0000000000000010 <_start>:
  10:    55                       push   %rbp
  11:    48 89 e5                 mov    %rsp,%rbp
  14:    48 83 ec 40              sub    $0x40,%rsp
  18:    48 b8 14 30 27 ec 01     movabs $0x1ec273014,%rax
  1f:    00 00 00
  22:    48 89 45 f8              mov    %rax,-0x8(%rbp)
  26:    48 b8 ff ff ff ff ff     movabs $0xff7377ffffffffff,%rax
  2d:    77 73 ff
  30:    48 89 45 f0              mov    %rax,-0x10(%rbp)
  34:    48 b8 00 00 e8 89 04     movabs $0x8ac7230489e80000,%rax
  3b:    23 c7 8a
  3e:    48 89 45 e0              mov    %rax,-0x20(%rbp)
  42:    48 c7 45 e8 00 00 00     movq   $0x0,-0x18(%rbp)
  49:    00
  4a:    48 8b 7d f0              mov    -0x10(%rbp),%rdi
  4e:    48 8b 75 f8              mov    -0x8(%rbp),%rsi
  52:    48 8b 55 e0              mov    -0x20(%rbp),%rdx
  56:    48 8b 4d e8              mov    -0x18(%rbp),%rcx
  5a:    e8 00 00 00 00           callq  5f <_start+0x4f>
  5f:    48 89 55 d8              mov    %rdx,-0x28(%rbp)
  63:    48 89 45 d0              mov    %rax,-0x30(%rbp)
  67:    c5 fa 6f 45 d0           vmovdqu -0x30(%rbp),%xmm0
  6c:    c5 fa 6f 4d e0           vmovdqu -0x20(%rbp),%xmm1
  71:    c5 f9 74 c1              vpcmpeqb %xmm1,%xmm0,%xmm0
  75:    c5 79 d7 c0              vpmovmskb %xmm0,%r8d
  79:    41 81 e8 ff ff 00 00     sub    $0xffff,%r8d
  80:    44 89 45 cc              mov    %r8d,-0x34(%rbp)
  84:    74 06                    je     8c <_start+0x7c>
  86:    eb 00                    jmp    88 <_start+0x78>
  88:    eb 00                    jmp    8a <_start+0x7a>
  8a:    eb fe                    jmp    8a <_start+0x7a>
  8c:    eb 00                    jmp    8e <_start+0x7e>
  8e:    48 83 c4 40              add    $0x40,%rsp
  92:    5d                       pop    %rbp
  93:    c3                       retq  
  94:    66 66 66 2e 0f 1f 84     data16 data16 nopw %cs:0x0(%rax,%rax,1)
  9b:    00 00 00 00 00

00000000000000a0 <__udivti3>:
  a0:    55                       push   %rbp
  a1:    48 89 e5                 mov    %rsp,%rbp
  a4:    48 89 7d f0              mov    %rdi,-0x10(%rbp)
  a8:    48 89 75 f8              mov    %rsi,-0x8(%rbp)
  ac:    48 89 4d e8              mov    %rcx,-0x18(%rbp)
  b0:    48 89 55 e0              mov    %rdx,-0x20(%rbp)
  b4:    48 8b 45 e0              mov    -0x20(%rbp),%rax
  b8:    48 8b 55 e8              mov    -0x18(%rbp),%rdx
  bc:    5d                       pop    %rbp
  bd:    c3                       retq  


=================================================================

And here is the assembly for the windows x86_64 version:


=================================================================
0000000000000010 <_start>:
  10:    55                       push   %rbp
  11:    48 81 ec 80 00 00 00     sub    $0x80,%rsp
  18:    48 8d ac 24 80 00 00     lea    0x80(%rsp),%rbp
  1f:    00
  20:    48 b8 14 30 27 ec 01     movabs $0x1ec273014,%rax
  27:    00 00 00
  2a:    48 89 45 f8              mov    %rax,-0x8(%rbp)
  2e:    48 b8 ff ff ff ff ff     movabs $0xff7377ffffffffff,%rax
  35:    77 73 ff
  38:    48 89 45 f0              mov    %rax,-0x10(%rbp)
  3c:    48 b8 00 00 e8 89 04     movabs $0x8ac7230489e80000,%rax
  43:    23 c7 8a
  46:    48 89 45 e0              mov    %rax,-0x20(%rbp)
  4a:    48 c7 45 e8 00 00 00     movq   $0x0,-0x18(%rbp)
  51:    00
  52:    48 8b 45 f0              mov    -0x10(%rbp),%rax
  56:    48 8b 4d f8              mov    -0x8(%rbp),%rcx
  5a:    48 8b 55 e0              mov    -0x20(%rbp),%rdx
  5e:    4c 8b 45 e8              mov    -0x18(%rbp),%r8
  62:    48 89 4d c8              mov    %rcx,-0x38(%rbp)
  66:    48 89 45 c0              mov    %rax,-0x40(%rbp)
  6a:    4c 89 45 b8              mov    %r8,-0x48(%rbp)
  6e:    48 89 55 b0              mov    %rdx,-0x50(%rbp)
  72:    48 8d 4d c0              lea    -0x40(%rbp),%rcx
  76:    48 8d 55 b0              lea    -0x50(%rbp),%rdx
  7a:    e8 41 00 00 00           callq  c0 <__udivti3>
  7f:    66 0f 70 c8 4e           pshufd $0x4e,%xmm0,%xmm1
  84:    66 0f d6 45 d0           movq   %xmm0,-0x30(%rbp)
  89:    66 0f d6 4d d8           movq   %xmm1,-0x28(%rbp)
  8e:    0f 10 45 d0              movups -0x30(%rbp),%xmm0
  92:    0f 10 4d e0              movups -0x20(%rbp),%xmm1
  96:    66 0f 74 c1              pcmpeqb %xmm1,%xmm0
  9a:    66 44 0f d7 c8           pmovmskb %xmm0,%r9d
  9f:    41 81 e9 ff ff 00 00     sub    $0xffff,%r9d
  a6:    44 89 4d ac              mov    %r9d,-0x54(%rbp)
  aa:    74 06                    je     b2 <_start+0xa2>
  ac:    eb 00                    jmp    ae <_start+0x9e>
  ae:    eb 00                    jmp    b0 <_start+0xa0>
  b0:    eb fe                    jmp    b0 <_start+0xa0>
  b2:    eb 00                    jmp    b4 <_start+0xa4>
  b4:    48 81 c4 80 00 00 00     add    $0x80,%rsp
  bb:    5d                       pop    %rbp
  bc:    c3                       retq  
  bd:    90                       nop
  be:    90                       nop
  bf:    90                       nop

00000000000000c0 <__udivti3>:
  c0:    55                       push   %rbp
  c1:    48 83 ec 20              sub    $0x20,%rsp
  c5:    48 8d 6c 24 20           lea    0x20(%rsp),%rbp
  ca:    48 89 4d f0              mov    %rcx,-0x10(%rbp)
  ce:    48 89 55 f8              mov    %rdx,-0x8(%rbp)
  d2:    4c 89 4d e8              mov    %r9,-0x18(%rbp)
  d6:    4c 89 45 e0              mov    %r8,-0x20(%rbp)
  da:    48 8b 45 e0              mov    -0x20(%rbp),%rax
  de:    48 8b 55 e8              mov    -0x18(%rbp),%rdx
  e2:    48 83 c4 20              add    $0x20,%rsp
  e6:    5d                       pop    %rbp
  e7:    c3                       retq  

=================================================================


Finally, my question:

What is the correct LLVM IR to represent i128 values so that it will be compatible with the compiler-rt calls that LLVM generates? For example, what should be the LLVM IR definition of  __udivti3?

I ask because, even though the clang/compiler-rt project generates `define i128 @__udivti3(i128, i128) #0 {`, this appears to be incorrect when run on Windows.

Thanks,
Andrew

_______________________________________________
LLVM Developers mailing list
[hidden email]
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
Reply | Threaded
Open this post in threaded view
|

Re: [llvm-dev] windows ABI problem with i128?

Anton Korobeynikov via llvm-dev
Most probably you need to properly specify the calling convention the
backend is using for calling the runtime functions. Alternatively,
implement a stub for udivti3 that performs the necessary argument lifting.

I guess there is no standard ABI document describing the intended
calling convention here, so I'd just do what mingw64 does here and
make everything here compatible.

On Thu, Apr 26, 2018 at 4:44 AM, Andrew Kelley via llvm-dev
<[hidden email]> wrote:

> I'm trying to use LLVM to create compiler-rt.o on Windows. I use this
> command from the compiler-rt project:
>
> [nix-shell:~/downloads/llvm-project/compiler-rt]$ clang -nostdlib  -S
> -emit-llvm lib/builtins/udivti3.c  -g -target x86_64-windows
> -DCRT_HAS_128BIT
>
> The resulting LLVM IR is:
> =================================================================
>
> ; ModuleID = 'lib/builtins/udivti3.c'
> source_filename = "lib/builtins/udivti3.c"
> target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64--windows-msvc19.11.0"
>
> ; Function Attrs: noinline nounwind optnone uwtable
> define i128 @__udivti3(i128, i128) #0 {
>   %3 = alloca i128, align 16
>   %4 = alloca i128, align 16
>   store i128 %1, i128* %3, align 16
>   store i128 %0, i128* %4, align 16
>   %5 = load i128, i128* %3, align 16
>   %6 = load i128, i128* %4, align 16
>   %7 = call i128 @__udivmodti4(i128 %6, i128 %5, i128* null)
>   ret i128 %7
> }
>
> declare i128 @__udivmodti4(i128, i128, i128*) #1
>
> attributes #0 = { noinline nounwind optnone uwtable
> "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false"
> "less-precise-fpmad"="false" "no-frame-pointer-elim"="false"
> "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false"
> "no-signed-zeros-fp-math"="false" "no-trapping-math"="false"
> "stack-protector-buffer-size"="8" "target-cpu"="x86-64"
> "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false"
> "use-soft-float"="false" }
> attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false"
> "disable-tail-calls"="false" "less-precise-fpmad"="false"
> "no-frame-pointer-elim"="false" "no-infs-fp-math"="false"
> "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false"
> "no-trapping-math"="false" "stack-protector-buffer-size"="8"
> "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87"
> "unsafe-fp-math"="false" "use-soft-float"="false" }
>
> !llvm.module.flags = !{!0, !1}
> !llvm.ident = !{!2}
>
> !0 = !{i32 1, !"wchar_size", i32 2}
> !1 = !{i32 7, !"PIC Level", i32 2}
> !2 = !{!"clang version 6.0.0 (tags/RELEASE_600/final)"}
>
>
> =================================================================
> However I think this results in a different ABI than LLVM will use when you
> do i128 division. For example, here is my test case (in zig code):
> =================================================================
>
> pub extern "kernel32" stdcallcc fn ExitProcess(exit_code: c_uint) noreturn;
>
> export fn WinMainCRTStartup() noreturn {
>     @setAlignStack(16);
>     @setRuntimeSafety(false);
>
>     var a: u128 = 152313999999999991610955792383;
>     var b: u128 = 10000000000000000000;
>     var c = a / b; // this generates a call to __udivti3
>
>     if (c != b) {
>         @breakpoint();
>     }
>     ExitProcess(0);
> }
>
> export fn __udivti3(a: u128, b: u128) u128 {
>     @setRuntimeSafety(false);
>     return b;
> }
>
>
> =================================================================
> This results in this LLVM IR:
> =================================================================
>
> ; ModuleID = 'test'
> source_filename = "test"
> target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-pc-windows-msvc"
>
> %"[]u8" = type { i8*, i64 }
> %StackTrace = type { i64, %"[]usize" }
> %"[]usize" = type { i64*, i64 }
>
> ; Function Attrs: nounwind readnone speculatable
> declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
>
> ; Function Attrs: nobuiltin noinline noreturn nounwind uwtable
> alignstack(16)
> define void @WinMainCRTStartup() #2 !dbg !41 {
> Entry:
>   %a = alloca i128, align 8
>   %b = alloca i128, align 8
>   %c = alloca i128, align 8
>   store i128 152313999999999991610955792383, i128* %a, align 8, !dbg !52
>   call void @llvm.dbg.declare(metadata i128* %a, metadata !45, metadata
> !DIExpression()), !dbg !52
>   store i128 10000000000000000000, i128* %b, align 8, !dbg !53
>   call void @llvm.dbg.declare(metadata i128* %b, metadata !48, metadata
> !DIExpression()), !dbg !53
>   %0 = load i128, i128* %a, align 8, !dbg !54
>   %1 = load i128, i128* %b, align 8, !dbg !55
>   %2 = udiv i128 %0, %1, !dbg !56
>   store i128 %2, i128* %c, align 8, !dbg !57
>   call void @llvm.dbg.declare(metadata i128* %c, metadata !50, metadata
> !DIExpression()), !dbg !57
>   %3 = load i128, i128* %c, align 8, !dbg !58
>   %4 = load i128, i128* %b, align 8, !dbg !60
>   %5 = icmp ne i128 %3, %4, !dbg !61
>   br i1 %5, label %Then, label %Else, !dbg !61
>
> Then:                                             ; preds = %Entry
>   call void @llvm.debugtrap(), !dbg !62
>   br label %EndIf, !dbg !64
>
> Else:                                             ; preds = %Entry
>   br label %EndIf, !dbg !64
>
> EndIf:                                            ; preds = %Else, %Then
>   call void @ExitProcess(i32 0), !dbg !65
>   unreachable, !dbg !65
> }
>
> ; Function Attrs: nounwind
> declare void @llvm.debugtrap() #3
>
> ; Function Attrs: nobuiltin noreturn nounwind uwtable
> declare void @ExitProcess(i32) #0
>
> ; Function Attrs: nobuiltin nounwind uwtable
> define i128 @__udivti3(i128, i128) #4 !dbg !66 {
> Entry:
>   %a = alloca i128, align 8
>   %b = alloca i128, align 8
>   store i128 %0, i128* %a, align 8
>   call void @llvm.dbg.declare(metadata i128* %a, metadata !70, metadata
> !DIExpression()), !dbg !73
>   store i128 %1, i128* %b, align 8
>   call void @llvm.dbg.declare(metadata i128* %b, metadata !71, metadata
> !DIExpression()), !dbg !74
>   %2 = load i128, i128* %b, align 8, !dbg !75
>   ret i128 %2, !dbg !78
> }
>
> ; Function Attrs: nounwind
> declare void @llvm.stackprotector(i8*, i8**) #3
>
> attributes #0 = { nobuiltin noreturn nounwind uwtable
> "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
> attributes #1 = { nounwind readnone speculatable }
> attributes #2 = { nobuiltin noinline noreturn nounwind uwtable alignstack=16
> "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
> attributes #3 = { nounwind }
> attributes #4 = { nobuiltin nounwind uwtable "no-frame-pointer-elim"="true"
> "no-frame-pointer-elim-non-leaf" }
>
> !llvm.module.flags = !{!0}
> !llvm.dbg.cu = !{!1}
>
> =================================================================
>
> When I link this with (link.exe or LLD, it does not matter):
> link.exe /OUT:test.exe /ENTRY:WinMainCRTStartup test.obj /subsystem:console
> kernel32.lib /nologo
>
> And run it, it triggers the breakpoint.
>
> Meanwhile on linux, this test passes.
>
> I suspect it may be a calling convention issue. Here is the assembly for the
> linux x86_64 version:
>
>
> =================================================================
> 0000000000000010 <_start>:
>   10:    55                       push   %rbp
>   11:    48 89 e5                 mov    %rsp,%rbp
>   14:    48 83 ec 40              sub    $0x40,%rsp
>   18:    48 b8 14 30 27 ec 01     movabs $0x1ec273014,%rax
>   1f:    00 00 00
>   22:    48 89 45 f8              mov    %rax,-0x8(%rbp)
>   26:    48 b8 ff ff ff ff ff     movabs $0xff7377ffffffffff,%rax
>   2d:    77 73 ff
>   30:    48 89 45 f0              mov    %rax,-0x10(%rbp)
>   34:    48 b8 00 00 e8 89 04     movabs $0x8ac7230489e80000,%rax
>   3b:    23 c7 8a
>   3e:    48 89 45 e0              mov    %rax,-0x20(%rbp)
>   42:    48 c7 45 e8 00 00 00     movq   $0x0,-0x18(%rbp)
>   49:    00
>   4a:    48 8b 7d f0              mov    -0x10(%rbp),%rdi
>   4e:    48 8b 75 f8              mov    -0x8(%rbp),%rsi
>   52:    48 8b 55 e0              mov    -0x20(%rbp),%rdx
>   56:    48 8b 4d e8              mov    -0x18(%rbp),%rcx
>   5a:    e8 00 00 00 00           callq  5f <_start+0x4f>
>   5f:    48 89 55 d8              mov    %rdx,-0x28(%rbp)
>   63:    48 89 45 d0              mov    %rax,-0x30(%rbp)
>   67:    c5 fa 6f 45 d0           vmovdqu -0x30(%rbp),%xmm0
>   6c:    c5 fa 6f 4d e0           vmovdqu -0x20(%rbp),%xmm1
>   71:    c5 f9 74 c1              vpcmpeqb %xmm1,%xmm0,%xmm0
>   75:    c5 79 d7 c0              vpmovmskb %xmm0,%r8d
>   79:    41 81 e8 ff ff 00 00     sub    $0xffff,%r8d
>   80:    44 89 45 cc              mov    %r8d,-0x34(%rbp)
>   84:    74 06                    je     8c <_start+0x7c>
>   86:    eb 00                    jmp    88 <_start+0x78>
>   88:    eb 00                    jmp    8a <_start+0x7a>
>   8a:    eb fe                    jmp    8a <_start+0x7a>
>   8c:    eb 00                    jmp    8e <_start+0x7e>
>   8e:    48 83 c4 40              add    $0x40,%rsp
>   92:    5d                       pop    %rbp
>   93:    c3                       retq
>   94:    66 66 66 2e 0f 1f 84     data16 data16 nopw %cs:0x0(%rax,%rax,1)
>   9b:    00 00 00 00 00
>
> 00000000000000a0 <__udivti3>:
>   a0:    55                       push   %rbp
>   a1:    48 89 e5                 mov    %rsp,%rbp
>   a4:    48 89 7d f0              mov    %rdi,-0x10(%rbp)
>   a8:    48 89 75 f8              mov    %rsi,-0x8(%rbp)
>   ac:    48 89 4d e8              mov    %rcx,-0x18(%rbp)
>   b0:    48 89 55 e0              mov    %rdx,-0x20(%rbp)
>   b4:    48 8b 45 e0              mov    -0x20(%rbp),%rax
>   b8:    48 8b 55 e8              mov    -0x18(%rbp),%rdx
>   bc:    5d                       pop    %rbp
>   bd:    c3                       retq
>
>
> =================================================================
>
> And here is the assembly for the windows x86_64 version:
>
>
> =================================================================
> 0000000000000010 <_start>:
>   10:    55                       push   %rbp
>   11:    48 81 ec 80 00 00 00     sub    $0x80,%rsp
>   18:    48 8d ac 24 80 00 00     lea    0x80(%rsp),%rbp
>   1f:    00
>   20:    48 b8 14 30 27 ec 01     movabs $0x1ec273014,%rax
>   27:    00 00 00
>   2a:    48 89 45 f8              mov    %rax,-0x8(%rbp)
>   2e:    48 b8 ff ff ff ff ff     movabs $0xff7377ffffffffff,%rax
>   35:    77 73 ff
>   38:    48 89 45 f0              mov    %rax,-0x10(%rbp)
>   3c:    48 b8 00 00 e8 89 04     movabs $0x8ac7230489e80000,%rax
>   43:    23 c7 8a
>   46:    48 89 45 e0              mov    %rax,-0x20(%rbp)
>   4a:    48 c7 45 e8 00 00 00     movq   $0x0,-0x18(%rbp)
>   51:    00
>   52:    48 8b 45 f0              mov    -0x10(%rbp),%rax
>   56:    48 8b 4d f8              mov    -0x8(%rbp),%rcx
>   5a:    48 8b 55 e0              mov    -0x20(%rbp),%rdx
>   5e:    4c 8b 45 e8              mov    -0x18(%rbp),%r8
>   62:    48 89 4d c8              mov    %rcx,-0x38(%rbp)
>   66:    48 89 45 c0              mov    %rax,-0x40(%rbp)
>   6a:    4c 89 45 b8              mov    %r8,-0x48(%rbp)
>   6e:    48 89 55 b0              mov    %rdx,-0x50(%rbp)
>   72:    48 8d 4d c0              lea    -0x40(%rbp),%rcx
>   76:    48 8d 55 b0              lea    -0x50(%rbp),%rdx
>   7a:    e8 41 00 00 00           callq  c0 <__udivti3>
>   7f:    66 0f 70 c8 4e           pshufd $0x4e,%xmm0,%xmm1
>   84:    66 0f d6 45 d0           movq   %xmm0,-0x30(%rbp)
>   89:    66 0f d6 4d d8           movq   %xmm1,-0x28(%rbp)
>   8e:    0f 10 45 d0              movups -0x30(%rbp),%xmm0
>   92:    0f 10 4d e0              movups -0x20(%rbp),%xmm1
>   96:    66 0f 74 c1              pcmpeqb %xmm1,%xmm0
>   9a:    66 44 0f d7 c8           pmovmskb %xmm0,%r9d
>   9f:    41 81 e9 ff ff 00 00     sub    $0xffff,%r9d
>   a6:    44 89 4d ac              mov    %r9d,-0x54(%rbp)
>   aa:    74 06                    je     b2 <_start+0xa2>
>   ac:    eb 00                    jmp    ae <_start+0x9e>
>   ae:    eb 00                    jmp    b0 <_start+0xa0>
>   b0:    eb fe                    jmp    b0 <_start+0xa0>
>   b2:    eb 00                    jmp    b4 <_start+0xa4>
>   b4:    48 81 c4 80 00 00 00     add    $0x80,%rsp
>   bb:    5d                       pop    %rbp
>   bc:    c3                       retq
>   bd:    90                       nop
>   be:    90                       nop
>   bf:    90                       nop
>
> 00000000000000c0 <__udivti3>:
>   c0:    55                       push   %rbp
>   c1:    48 83 ec 20              sub    $0x20,%rsp
>   c5:    48 8d 6c 24 20           lea    0x20(%rsp),%rbp
>   ca:    48 89 4d f0              mov    %rcx,-0x10(%rbp)
>   ce:    48 89 55 f8              mov    %rdx,-0x8(%rbp)
>   d2:    4c 89 4d e8              mov    %r9,-0x18(%rbp)
>   d6:    4c 89 45 e0              mov    %r8,-0x20(%rbp)
>   da:    48 8b 45 e0              mov    -0x20(%rbp),%rax
>   de:    48 8b 55 e8              mov    -0x18(%rbp),%rdx
>   e2:    48 83 c4 20              add    $0x20,%rsp
>   e6:    5d                       pop    %rbp
>   e7:    c3                       retq
>
> =================================================================
>
>
> Finally, my question:
>
> What is the correct LLVM IR to represent i128 values so that it will be
> compatible with the compiler-rt calls that LLVM generates? For example, what
> should be the LLVM IR definition of  __udivti3?
>
> Because even though clang/compiler-rt project generates `define i128
> @__udivti3(i128, i128) #0 {`, this appears to be incorrect when run on
> windows.
>
> Thanks,
> Andrew
>
> _______________________________________________
> LLVM Developers mailing list
> [hidden email]
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>



--
With best regards, Anton Korobeynikov
Department of Statistical Modelling, Saint Petersburg State University
_______________________________________________
LLVM Developers mailing list
[hidden email]
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
Reply | Threaded
Open this post in threaded view
|

Re: [llvm-dev] windows ABI problem with i128?

Andrew Kelley via llvm-dev
On Thu, Apr 26, 2018 at 3:44 AM, Anton Korobeynikov <[hidden email]> wrote:
> Most probably you need to properly specify the calling convention the
> backend is using for calling the runtime functions.

Thanks for the tip. Can you be more specific? Are you suggesting there is some config parameter I can set before running TargetMachineEmitToFile?

Do you know what calling convention it is trying to use at the callsite? Perhaps I can simply select a different convention from this list for the implementation of udivti3? http://llvm.org/docs/LangRef.html#calling-conventions

> Or implement the
> stub for udivti3 that performs the necessary argument lifting.
>
> I guess there is no standard ABI document describing the intended
> calling convention here, so I'd just do what mingw64 does here and
> make everything here compatible.

On Thu, Apr 26, 2018 at 4:44 AM, Andrew Kelley via llvm-dev
<[hidden email]> wrote:
> I'm trying to use LLVM to create compiler-rt.o on Windows. I use this
> command from the compiler-rt project:
>
> [nix-shell:~/downloads/llvm-project/compiler-rt]$ clang -nostdlib  -S
> -emit-llvm lib/builtins/udivti3.c  -g -target x86_64-windows
> -DCRT_HAS_128BIT
>
> The resulting LLVM IR is:
> =================================================================
>
> ; ModuleID = 'lib/builtins/udivti3.c'
> source_filename = "lib/builtins/udivti3.c"
> target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64--windows-msvc19.11.0"
>
> ; Function Attrs: noinline nounwind optnone uwtable
> define i128 @__udivti3(i128, i128) #0 {
>   %3 = alloca i128, align 16
>   %4 = alloca i128, align 16
>   store i128 %1, i128* %3, align 16
>   store i128 %0, i128* %4, align 16
>   %5 = load i128, i128* %3, align 16
>   %6 = load i128, i128* %4, align 16
>   %7 = call i128 @__udivmodti4(i128 %6, i128 %5, i128* null)
>   ret i128 %7
> }
>
> declare i128 @__udivmodti4(i128, i128, i128*) #1
>
> attributes #0 = { noinline nounwind optnone uwtable
> "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false"
> "less-precise-fpmad"="false" "no-frame-pointer-elim"="false"
> "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false"
> "no-signed-zeros-fp-math"="false" "no-trapping-math"="false"
> "stack-protector-buffer-size"="8" "target-cpu"="x86-64"
> "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false"
> "use-soft-float"="false" }
> attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false"
> "disable-tail-calls"="false" "less-precise-fpmad"="false"
> "no-frame-pointer-elim"="false" "no-infs-fp-math"="false"
> "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false"
> "no-trapping-math"="false" "stack-protector-buffer-size"="8"
> "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87"
> "unsafe-fp-math"="false" "use-soft-float"="false" }
>
> !llvm.module.flags = !{!0, !1}
> !llvm.ident = !{!2}
>
> !0 = !{i32 1, !"wchar_size", i32 2}
> !1 = !{i32 7, !"PIC Level", i32 2}
> !2 = !{!"clang version 6.0.0 (tags/RELEASE_600/final)"}
>
>
> =================================================================
> However I think this results in a different ABI than LLVM will use when you
> do i128 division. For example, here is my test case (in zig code):
> =================================================================
>
> pub extern "kernel32" stdcallcc fn ExitProcess(exit_code: c_uint) noreturn;
>
> export fn WinMainCRTStartup() noreturn {
>     @setAlignStack(16);
>     @setRuntimeSafety(false);
>
>     var a: u128 = 152313999999999991610955792383;
>     var b: u128 = 10000000000000000000;
>     var c = a / b; // this generates a call to __udivti3
>
>     if (c != b) {
>         @breakpoint();
>     }
>     ExitProcess(0);
> }
>
> export fn __udivti3(a: u128, b: u128) u128 {
>     @setRuntimeSafety(false);
>     return b;
> }
>
>
> =================================================================
> This results in this LLVM IR:
> =================================================================
>
> ; ModuleID = 'test'
> source_filename = "test"
> target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-pc-windows-msvc"
>
> %"[]u8" = type { i8*, i64 }
> %StackTrace = type { i64, %"[]usize" }
> %"[]usize" = type { i64*, i64 }
>
> ; Function Attrs: nounwind readnone speculatable
> declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
>
> ; Function Attrs: nobuiltin noinline noreturn nounwind uwtable
> alignstack(16)
> define void @WinMainCRTStartup() #2 !dbg !41 {
> Entry:
>   %a = alloca i128, align 8
>   %b = alloca i128, align 8
>   %c = alloca i128, align 8
>   store i128 152313999999999991610955792383, i128* %a, align 8, !dbg !52
>   call void @llvm.dbg.declare(metadata i128* %a, metadata !45, metadata
> !DIExpression()), !dbg !52
>   store i128 10000000000000000000, i128* %b, align 8, !dbg !53
>   call void @llvm.dbg.declare(metadata i128* %b, metadata !48, metadata
> !DIExpression()), !dbg !53
>   %0 = load i128, i128* %a, align 8, !dbg !54
>   %1 = load i128, i128* %b, align 8, !dbg !55
>   %2 = udiv i128 %0, %1, !dbg !56
>   store i128 %2, i128* %c, align 8, !dbg !57
>   call void @llvm.dbg.declare(metadata i128* %c, metadata !50, metadata
> !DIExpression()), !dbg !57
>   %3 = load i128, i128* %c, align 8, !dbg !58
>   %4 = load i128, i128* %b, align 8, !dbg !60
>   %5 = icmp ne i128 %3, %4, !dbg !61
>   br i1 %5, label %Then, label %Else, !dbg !61
>
> Then:                                             ; preds = %Entry
>   call void @llvm.debugtrap(), !dbg !62
>   br label %EndIf, !dbg !64
>
> Else:                                             ; preds = %Entry
>   br label %EndIf, !dbg !64
>
> EndIf:                                            ; preds = %Else, %Then
>   call void @ExitProcess(i32 0), !dbg !65
>   unreachable, !dbg !65
> }
>
> ; Function Attrs: nounwind
> declare void @llvm.debugtrap() #3
>
> ; Function Attrs: nobuiltin noreturn nounwind uwtable
> declare void @ExitProcess(i32) #0
>
> ; Function Attrs: nobuiltin nounwind uwtable
> define i128 @__udivti3(i128, i128) #4 !dbg !66 {
> Entry:
>   %a = alloca i128, align 8
>   %b = alloca i128, align 8
>   store i128 %0, i128* %a, align 8
>   call void @llvm.dbg.declare(metadata i128* %a, metadata !70, metadata
> !DIExpression()), !dbg !73
>   store i128 %1, i128* %b, align 8
>   call void @llvm.dbg.declare(metadata i128* %b, metadata !71, metadata
> !DIExpression()), !dbg !74
>   %2 = load i128, i128* %b, align 8, !dbg !75
>   ret i128 %2, !dbg !78
> }
>
> ; Function Attrs: nounwind
> declare void @llvm.stackprotector(i8*, i8**) #3
>
> attributes #0 = { nobuiltin noreturn nounwind uwtable
> "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
> attributes #1 = { nounwind readnone speculatable }
> attributes #2 = { nobuiltin noinline noreturn nounwind uwtable alignstack=16
> "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
> attributes #3 = { nounwind }
> attributes #4 = { nobuiltin nounwind uwtable "no-frame-pointer-elim"="true"
> "no-frame-pointer-elim-non-leaf" }
>
> !llvm.module.flags = !{!0}
> !llvm.dbg.cu = !{!1}
>
> =================================================================
>
> When I link this with (link.exe or LLD, it does not matter):
> link.exe /OUT:test.exe /ENTRY:WinMainCRTStartup test.obj /subsystem:console
> kernel32.lib /nologo
>
> And run it, it triggers the breakpoint.
>
> Meanwhile on linux, this test passes.
>
> I suspect it may be a calling convention issue. Here is the assembly for the
> linux x86_64 version:
>
>
> =================================================================
> 0000000000000010 <_start>:
>   10:    55                       push   %rbp
>   11:    48 89 e5                 mov    %rsp,%rbp
>   14:    48 83 ec 40              sub    $0x40,%rsp
>   18:    48 b8 14 30 27 ec 01     movabs $0x1ec273014,%rax
>   1f:    00 00 00
>   22:    48 89 45 f8              mov    %rax,-0x8(%rbp)
>   26:    48 b8 ff ff ff ff ff     movabs $0xff7377ffffffffff,%rax
>   2d:    77 73 ff
>   30:    48 89 45 f0              mov    %rax,-0x10(%rbp)
>   34:    48 b8 00 00 e8 89 04     movabs $0x8ac7230489e80000,%rax
>   3b:    23 c7 8a
>   3e:    48 89 45 e0              mov    %rax,-0x20(%rbp)
>   42:    48 c7 45 e8 00 00 00     movq   $0x0,-0x18(%rbp)
>   49:    00
>   4a:    48 8b 7d f0              mov    -0x10(%rbp),%rdi
>   4e:    48 8b 75 f8              mov    -0x8(%rbp),%rsi
>   52:    48 8b 55 e0              mov    -0x20(%rbp),%rdx
>   56:    48 8b 4d e8              mov    -0x18(%rbp),%rcx
>   5a:    e8 00 00 00 00           callq  5f <_start+0x4f>
>   5f:    48 89 55 d8              mov    %rdx,-0x28(%rbp)
>   63:    48 89 45 d0              mov    %rax,-0x30(%rbp)
>   67:    c5 fa 6f 45 d0           vmovdqu -0x30(%rbp),%xmm0
>   6c:    c5 fa 6f 4d e0           vmovdqu -0x20(%rbp),%xmm1
>   71:    c5 f9 74 c1              vpcmpeqb %xmm1,%xmm0,%xmm0
>   75:    c5 79 d7 c0              vpmovmskb %xmm0,%r8d
>   79:    41 81 e8 ff ff 00 00     sub    $0xffff,%r8d
>   80:    44 89 45 cc              mov    %r8d,-0x34(%rbp)
>   84:    74 06                    je     8c <_start+0x7c>
>   86:    eb 00                    jmp    88 <_start+0x78>
>   88:    eb 00                    jmp    8a <_start+0x7a>
>   8a:    eb fe                    jmp    8a <_start+0x7a>
>   8c:    eb 00                    jmp    8e <_start+0x7e>
>   8e:    48 83 c4 40              add    $0x40,%rsp
>   92:    5d                       pop    %rbp
>   93:    c3                       retq
>   94:    66 66 66 2e 0f 1f 84     data16 data16 nopw %cs:0x0(%rax,%rax,1)
>   9b:    00 00 00 00 00
>
> 00000000000000a0 <__udivti3>:
>   a0:    55                       push   %rbp
>   a1:    48 89 e5                 mov    %rsp,%rbp
>   a4:    48 89 7d f0              mov    %rdi,-0x10(%rbp)
>   a8:    48 89 75 f8              mov    %rsi,-0x8(%rbp)
>   ac:    48 89 4d e8              mov    %rcx,-0x18(%rbp)
>   b0:    48 89 55 e0              mov    %rdx,-0x20(%rbp)
>   b4:    48 8b 45 e0              mov    -0x20(%rbp),%rax
>   b8:    48 8b 55 e8              mov    -0x18(%rbp),%rdx
>   bc:    5d                       pop    %rbp
>   bd:    c3                       retq
>
>
> =================================================================
>
> And here is the assembly for the windows x86_64 version:
>
>
> =================================================================
> 0000000000000010 <_start>:
>   10:    55                       push   %rbp
>   11:    48 81 ec 80 00 00 00     sub    $0x80,%rsp
>   18:    48 8d ac 24 80 00 00     lea    0x80(%rsp),%rbp
>   1f:    00
>   20:    48 b8 14 30 27 ec 01     movabs $0x1ec273014,%rax
>   27:    00 00 00
>   2a:    48 89 45 f8              mov    %rax,-0x8(%rbp)
>   2e:    48 b8 ff ff ff ff ff     movabs $0xff7377ffffffffff,%rax
>   35:    77 73 ff
>   38:    48 89 45 f0              mov    %rax,-0x10(%rbp)
>   3c:    48 b8 00 00 e8 89 04     movabs $0x8ac7230489e80000,%rax
>   43:    23 c7 8a
>   46:    48 89 45 e0              mov    %rax,-0x20(%rbp)
>   4a:    48 c7 45 e8 00 00 00     movq   $0x0,-0x18(%rbp)
>   51:    00
>   52:    48 8b 45 f0              mov    -0x10(%rbp),%rax
>   56:    48 8b 4d f8              mov    -0x8(%rbp),%rcx
>   5a:    48 8b 55 e0              mov    -0x20(%rbp),%rdx
>   5e:    4c 8b 45 e8              mov    -0x18(%rbp),%r8
>   62:    48 89 4d c8              mov    %rcx,-0x38(%rbp)
>   66:    48 89 45 c0              mov    %rax,-0x40(%rbp)
>   6a:    4c 89 45 b8              mov    %r8,-0x48(%rbp)
>   6e:    48 89 55 b0              mov    %rdx,-0x50(%rbp)
>   72:    48 8d 4d c0              lea    -0x40(%rbp),%rcx
>   76:    48 8d 55 b0              lea    -0x50(%rbp),%rdx
>   7a:    e8 41 00 00 00           callq  c0 <__udivti3>
>   7f:    66 0f 70 c8 4e           pshufd $0x4e,%xmm0,%xmm1
>   84:    66 0f d6 45 d0           movq   %xmm0,-0x30(%rbp)
>   89:    66 0f d6 4d d8           movq   %xmm1,-0x28(%rbp)
>   8e:    0f 10 45 d0              movups -0x30(%rbp),%xmm0
>   92:    0f 10 4d e0              movups -0x20(%rbp),%xmm1
>   96:    66 0f 74 c1              pcmpeqb %xmm1,%xmm0
>   9a:    66 44 0f d7 c8           pmovmskb %xmm0,%r9d
>   9f:    41 81 e9 ff ff 00 00     sub    $0xffff,%r9d
>   a6:    44 89 4d ac              mov    %r9d,-0x54(%rbp)
>   aa:    74 06                    je     b2 <_start+0xa2>
>   ac:    eb 00                    jmp    ae <_start+0x9e>
>   ae:    eb 00                    jmp    b0 <_start+0xa0>
>   b0:    eb fe                    jmp    b0 <_start+0xa0>
>   b2:    eb 00                    jmp    b4 <_start+0xa4>
>   b4:    48 81 c4 80 00 00 00     add    $0x80,%rsp
>   bb:    5d                       pop    %rbp
>   bc:    c3                       retq
>   bd:    90                       nop
>   be:    90                       nop
>   bf:    90                       nop
>
> 00000000000000c0 <__udivti3>:
>   c0:    55                       push   %rbp
>   c1:    48 83 ec 20              sub    $0x20,%rsp
>   c5:    48 8d 6c 24 20           lea    0x20(%rsp),%rbp
>   ca:    48 89 4d f0              mov    %rcx,-0x10(%rbp)
>   ce:    48 89 55 f8              mov    %rdx,-0x8(%rbp)
>   d2:    4c 89 4d e8              mov    %r9,-0x18(%rbp)
>   d6:    4c 89 45 e0              mov    %r8,-0x20(%rbp)
>   da:    48 8b 45 e0              mov    -0x20(%rbp),%rax
>   de:    48 8b 55 e8              mov    -0x18(%rbp),%rdx
>   e2:    48 83 c4 20              add    $0x20,%rsp
>   e6:    5d                       pop    %rbp
>   e7:    c3                       retq
>
> =================================================================
>
>
> Finally, my question:
>
> What is the correct LLVM IR to represent i128 values so that it will be
> compatible with the compiler-rt calls that LLVM generates? For example, what
> should be the LLVM IR definition of  __udivti3?
>
> Because even though clang/compiler-rt project generates `define i128
> @__udivti3(i128, i128) #0 {`, this appears to be incorrect when run on
> windows.
>
> Thanks,
> Andrew
>
> _______________________________________________
> LLVM Developers mailing list
> [hidden email]
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>



--
With best regards, Anton Korobeynikov
Department of Statistical Modelling, Saint Petersburg State University


_______________________________________________
LLVM Developers mailing list
[hidden email]
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
Reply | Threaded
Open this post in threaded view
|

Re: [llvm-dev] windows ABI problem with i128?

Dean Michael Berris via llvm-dev
I figured out the problem:

The definition of __udivti3 is returning the value in %rax and %rdx (loaded from memory), like this:
 106:    48 8b 01                 mov    (%rcx),%rax
 109:    48 8b 51 08              mov    0x8(%rcx),%rdx

However, the callsite expects the result to be in %xmm0, which matches [Microsoft's ABI](https://msdn.microsoft.com/en-us/library/7572ztz4.aspx):
  a8:    e8 43 00 00 00           callq  f0 <__udivti3>
  ad:    66 0f 70 c8 4e           pshufd $0x4e,%xmm0,%xmm1
  b2:    66 0f d6 45 00           movq   %xmm0,0x0(%rbp)
  b7:    66 0f d6 4d 08           movq   %xmm1,0x8(%rbp)

So I tried using inline assembly to set xmm0:
call void asm sideeffect "", "{xmm0}"(i128 %3)

However, this gives the error "couldn't allocate input reg for constraint '{xmm0}'".

Then I tried @llvm.write_register:
call void @llvm.write_register.i128(metadata !83, i128 %3)
!83 = !{!"xmm0\00"}

This crashed llc. Here's a bug report: https://bugs.llvm.org/show_bug.cgi?id=37285

What's the best way to put an i128 in xmm0?

Regards,
Andrew


On Thu, Apr 26, 2018 at 11:30 AM, Andrew Kelley <[hidden email]> wrote:
On Thu, Apr 26, 2018 at 3:44 AM, Anton Korobeynikov <[hidden email]> wrote:
Most probably you need to properly specify the calling convention the
backend is using for calling the runtime functions.

Thanks for the tip. Can you be more specific? Are you suggesting there is some config parameter I can set before running TargetMachineEmitToFile?

Do you know what calling convention it is trying to use at the callsite? Perhaps I can simply select a different convention from this list for the implementation of udivti3? http://llvm.org/docs/LangRef.html#calling-conventions

Or implement a
stub for udivti3 that performs the necessary argument lifting.

I guess there is no standard ABI document describing the intended
calling convention for this case, so I'd just do what mingw64 does and
make everything compatible with it.

On Thu, Apr 26, 2018 at 4:44 AM, Andrew Kelley via llvm-dev
<[hidden email]> wrote:
> I'm trying to use LLVM to create compiler-rt.o on Windows. I use this
> command from the compiler-rt project:
>
> [nix-shell:~/downloads/llvm-project/compiler-rt]$ clang -nostdlib  -S
> -emit-llvm lib/builtins/udivti3.c  -g -target x86_64-windows
> -DCRT_HAS_128BIT
>
> The resulting LLVM IR is:
> =================================================================
>
> ; ModuleID = 'lib/builtins/udivti3.c'
> source_filename = "lib/builtins/udivti3.c"
> target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64--windows-msvc19.11.0"
>
> ; Function Attrs: noinline nounwind optnone uwtable
> define i128 @__udivti3(i128, i128) #0 {
>   %3 = alloca i128, align 16
>   %4 = alloca i128, align 16
>   store i128 %1, i128* %3, align 16
>   store i128 %0, i128* %4, align 16
>   %5 = load i128, i128* %3, align 16
>   %6 = load i128, i128* %4, align 16
>   %7 = call i128 @__udivmodti4(i128 %6, i128 %5, i128* null)
>   ret i128 %7
> }
>
> declare i128 @__udivmodti4(i128, i128, i128*) #1
>
> attributes #0 = { noinline nounwind optnone uwtable
> "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false"
> "less-precise-fpmad"="false" "no-frame-pointer-elim"="false"
> "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false"
> "no-signed-zeros-fp-math"="false" "no-trapping-math"="false"
> "stack-protector-buffer-size"="8" "target-cpu"="x86-64"
> "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false"
> "use-soft-float"="false" }
> attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false"
> "disable-tail-calls"="false" "less-precise-fpmad"="false"
> "no-frame-pointer-elim"="false" "no-infs-fp-math"="false"
> "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false"
> "no-trapping-math"="false" "stack-protector-buffer-size"="8"
> "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87"
> "unsafe-fp-math"="false" "use-soft-float"="false" }
>
> !llvm.module.flags = !{!0, !1}
> !llvm.ident = !{!2}
>
> !0 = !{i32 1, !"wchar_size", i32 2}
> !1 = !{i32 7, !"PIC Level", i32 2}
> !2 = !{!"clang version 6.0.0 (tags/RELEASE_600/final)"}
>
>
> =================================================================
> However I think this results in a different ABI than LLVM will use when you
> do i128 division. For example, here is my test case (in zig code):
> =================================================================
>
> pub extern "kernel32" stdcallcc fn ExitProcess(exit_code: c_uint) noreturn;
>
> export fn WinMainCRTStartup() noreturn {
>     @setAlignStack(16);
>     @setRuntimeSafety(false);
>
>     var a: u128 = 152313999999999991610955792383;
>     var b: u128 = 10000000000000000000;
>     var c = a / b; // this generates a call to __udivti3
>
>     if (c != b) {
>         @breakpoint();
>     }
>     ExitProcess(0);
> }
>
> export fn __udivti3(a: u128, b: u128) u128 {
>     @setRuntimeSafety(false);
>     return b;
> }
>
>
> =================================================================
> This results in this LLVM IR:
> =================================================================
>
> ; ModuleID = 'test'
> source_filename = "test"
> target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-pc-windows-msvc"
>
> %"[]u8" = type { i8*, i64 }
> %StackTrace = type { i64, %"[]usize" }
> %"[]usize" = type { i64*, i64 }
>
> ; Function Attrs: nounwind readnone speculatable
> declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
>
> ; Function Attrs: nobuiltin noinline noreturn nounwind uwtable
> alignstack(16)
> define void @WinMainCRTStartup() #2 !dbg !41 {
> Entry:
>   %a = alloca i128, align 8
>   %b = alloca i128, align 8
>   %c = alloca i128, align 8
>   store i128 152313999999999991610955792383, i128* %a, align 8, !dbg !52
>   call void @llvm.dbg.declare(metadata i128* %a, metadata !45, metadata
> !DIExpression()), !dbg !52
>   store i128 10000000000000000000, i128* %b, align 8, !dbg !53
>   call void @llvm.dbg.declare(metadata i128* %b, metadata !48, metadata
> !DIExpression()), !dbg !53
>   %0 = load i128, i128* %a, align 8, !dbg !54
>   %1 = load i128, i128* %b, align 8, !dbg !55
>   %2 = udiv i128 %0, %1, !dbg !56
>   store i128 %2, i128* %c, align 8, !dbg !57
>   call void @llvm.dbg.declare(metadata i128* %c, metadata !50, metadata
> !DIExpression()), !dbg !57
>   %3 = load i128, i128* %c, align 8, !dbg !58
>   %4 = load i128, i128* %b, align 8, !dbg !60
>   %5 = icmp ne i128 %3, %4, !dbg !61
>   br i1 %5, label %Then, label %Else, !dbg !61
>
> Then:                                             ; preds = %Entry
>   call void @llvm.debugtrap(), !dbg !62
>   br label %EndIf, !dbg !64
>
> Else:                                             ; preds = %Entry
>   br label %EndIf, !dbg !64
>
> EndIf:                                            ; preds = %Else, %Then
>   call void @ExitProcess(i32 0), !dbg !65
>   unreachable, !dbg !65
> }
>
> ; Function Attrs: nounwind
> declare void @llvm.debugtrap() #3
>
> ; Function Attrs: nobuiltin noreturn nounwind uwtable
> declare void @ExitProcess(i32) #0
>
> ; Function Attrs: nobuiltin nounwind uwtable
> define i128 @__udivti3(i128, i128) #4 !dbg !66 {
> Entry:
>   %a = alloca i128, align 8
>   %b = alloca i128, align 8
>   store i128 %0, i128* %a, align 8
>   call void @llvm.dbg.declare(metadata i128* %a, metadata !70, metadata
> !DIExpression()), !dbg !73
>   store i128 %1, i128* %b, align 8
>   call void @llvm.dbg.declare(metadata i128* %b, metadata !71, metadata
> !DIExpression()), !dbg !74
>   %2 = load i128, i128* %b, align 8, !dbg !75
>   ret i128 %2, !dbg !78
> }
>
> ; Function Attrs: nounwind
> declare void @llvm.stackprotector(i8*, i8**) #3
>
> attributes #0 = { nobuiltin noreturn nounwind uwtable
> "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
> attributes #1 = { nounwind readnone speculatable }
> attributes #2 = { nobuiltin noinline noreturn nounwind uwtable alignstack=16
> "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
> attributes #3 = { nounwind }
> attributes #4 = { nobuiltin nounwind uwtable "no-frame-pointer-elim"="true"
> "no-frame-pointer-elim-non-leaf" }
>
> !llvm.module.flags = !{!0}
> !llvm.dbg.cu = !{!1}
>
> =================================================================
>
> When I link this with (link.exe or LLD, it does not matter):
> link.exe /OUT:test.exe /ENTRY:WinMainCRTStartup test.obj /subsystem:console
> kernel32.lib /nologo
>
> And run it, it triggers the breakpoint.
>
> Meanwhile on linux, this test passes.
>
> I suspect it may be a calling convention issue. Here is the assembly for the
> linux x86_64 version:
>
>
> =================================================================
> 0000000000000010 <_start>:
>   10:    55                       push   %rbp
>   11:    48 89 e5                 mov    %rsp,%rbp
>   14:    48 83 ec 40              sub    $0x40,%rsp
>   18:    48 b8 14 30 27 ec 01     movabs $0x1ec273014,%rax
>   1f:    00 00 00
>   22:    48 89 45 f8              mov    %rax,-0x8(%rbp)
>   26:    48 b8 ff ff ff ff ff     movabs $0xff7377ffffffffff,%rax
>   2d:    77 73 ff
>   30:    48 89 45 f0              mov    %rax,-0x10(%rbp)
>   34:    48 b8 00 00 e8 89 04     movabs $0x8ac7230489e80000,%rax
>   3b:    23 c7 8a
>   3e:    48 89 45 e0              mov    %rax,-0x20(%rbp)
>   42:    48 c7 45 e8 00 00 00     movq   $0x0,-0x18(%rbp)
>   49:    00
>   4a:    48 8b 7d f0              mov    -0x10(%rbp),%rdi
>   4e:    48 8b 75 f8              mov    -0x8(%rbp),%rsi
>   52:    48 8b 55 e0              mov    -0x20(%rbp),%rdx
>   56:    48 8b 4d e8              mov    -0x18(%rbp),%rcx
>   5a:    e8 00 00 00 00           callq  5f <_start+0x4f>
>   5f:    48 89 55 d8              mov    %rdx,-0x28(%rbp)
>   63:    48 89 45 d0              mov    %rax,-0x30(%rbp)
>   67:    c5 fa 6f 45 d0           vmovdqu -0x30(%rbp),%xmm0
>   6c:    c5 fa 6f 4d e0           vmovdqu -0x20(%rbp),%xmm1
>   71:    c5 f9 74 c1              vpcmpeqb %xmm1,%xmm0,%xmm0
>   75:    c5 79 d7 c0              vpmovmskb %xmm0,%r8d
>   79:    41 81 e8 ff ff 00 00     sub    $0xffff,%r8d
>   80:    44 89 45 cc              mov    %r8d,-0x34(%rbp)
>   84:    74 06                    je     8c <_start+0x7c>
>   86:    eb 00                    jmp    88 <_start+0x78>
>   88:    eb 00                    jmp    8a <_start+0x7a>
>   8a:    eb fe                    jmp    8a <_start+0x7a>
>   8c:    eb 00                    jmp    8e <_start+0x7e>
>   8e:    48 83 c4 40              add    $0x40,%rsp
>   92:    5d                       pop    %rbp
>   93:    c3                       retq
>   94:    66 66 66 2e 0f 1f 84     data16 data16 nopw %cs:0x0(%rax,%rax,1)
>   9b:    00 00 00 00 00
>
> 00000000000000a0 <__udivti3>:
>   a0:    55                       push   %rbp
>   a1:    48 89 e5                 mov    %rsp,%rbp
>   a4:    48 89 7d f0              mov    %rdi,-0x10(%rbp)
>   a8:    48 89 75 f8              mov    %rsi,-0x8(%rbp)
>   ac:    48 89 4d e8              mov    %rcx,-0x18(%rbp)
>   b0:    48 89 55 e0              mov    %rdx,-0x20(%rbp)
>   b4:    48 8b 45 e0              mov    -0x20(%rbp),%rax
>   b8:    48 8b 55 e8              mov    -0x18(%rbp),%rdx
>   bc:    5d                       pop    %rbp
>   bd:    c3                       retq
>
>
> =================================================================
>
> And here is the assembly for the windows x86_64 version:
>
>
> =================================================================
> 0000000000000010 <_start>:
>   10:    55                       push   %rbp
>   11:    48 81 ec 80 00 00 00     sub    $0x80,%rsp
>   18:    48 8d ac 24 80 00 00     lea    0x80(%rsp),%rbp
>   1f:    00
>   20:    48 b8 14 30 27 ec 01     movabs $0x1ec273014,%rax
>   27:    00 00 00
>   2a:    48 89 45 f8              mov    %rax,-0x8(%rbp)
>   2e:    48 b8 ff ff ff ff ff     movabs $0xff7377ffffffffff,%rax
>   35:    77 73 ff
>   38:    48 89 45 f0              mov    %rax,-0x10(%rbp)
>   3c:    48 b8 00 00 e8 89 04     movabs $0x8ac7230489e80000,%rax
>   43:    23 c7 8a
>   46:    48 89 45 e0              mov    %rax,-0x20(%rbp)
>   4a:    48 c7 45 e8 00 00 00     movq   $0x0,-0x18(%rbp)
>   51:    00
>   52:    48 8b 45 f0              mov    -0x10(%rbp),%rax
>   56:    48 8b 4d f8              mov    -0x8(%rbp),%rcx
>   5a:    48 8b 55 e0              mov    -0x20(%rbp),%rdx
>   5e:    4c 8b 45 e8              mov    -0x18(%rbp),%r8
>   62:    48 89 4d c8              mov    %rcx,-0x38(%rbp)
>   66:    48 89 45 c0              mov    %rax,-0x40(%rbp)
>   6a:    4c 89 45 b8              mov    %r8,-0x48(%rbp)
>   6e:    48 89 55 b0              mov    %rdx,-0x50(%rbp)
>   72:    48 8d 4d c0              lea    -0x40(%rbp),%rcx
>   76:    48 8d 55 b0              lea    -0x50(%rbp),%rdx
>   7a:    e8 41 00 00 00           callq  c0 <__udivti3>
>   7f:    66 0f 70 c8 4e           pshufd $0x4e,%xmm0,%xmm1
>   84:    66 0f d6 45 d0           movq   %xmm0,-0x30(%rbp)
>   89:    66 0f d6 4d d8           movq   %xmm1,-0x28(%rbp)
>   8e:    0f 10 45 d0              movups -0x30(%rbp),%xmm0
>   92:    0f 10 4d e0              movups -0x20(%rbp),%xmm1
>   96:    66 0f 74 c1              pcmpeqb %xmm1,%xmm0
>   9a:    66 44 0f d7 c8           pmovmskb %xmm0,%r9d
>   9f:    41 81 e9 ff ff 00 00     sub    $0xffff,%r9d
>   a6:    44 89 4d ac              mov    %r9d,-0x54(%rbp)
>   aa:    74 06                    je     b2 <_start+0xa2>
>   ac:    eb 00                    jmp    ae <_start+0x9e>
>   ae:    eb 00                    jmp    b0 <_start+0xa0>
>   b0:    eb fe                    jmp    b0 <_start+0xa0>
>   b2:    eb 00                    jmp    b4 <_start+0xa4>
>   b4:    48 81 c4 80 00 00 00     add    $0x80,%rsp
>   bb:    5d                       pop    %rbp
>   bc:    c3                       retq
>   bd:    90                       nop
>   be:    90                       nop
>   bf:    90                       nop
>
> 00000000000000c0 <__udivti3>:
>   c0:    55                       push   %rbp
>   c1:    48 83 ec 20              sub    $0x20,%rsp
>   c5:    48 8d 6c 24 20           lea    0x20(%rsp),%rbp
>   ca:    48 89 4d f0              mov    %rcx,-0x10(%rbp)
>   ce:    48 89 55 f8              mov    %rdx,-0x8(%rbp)
>   d2:    4c 89 4d e8              mov    %r9,-0x18(%rbp)
>   d6:    4c 89 45 e0              mov    %r8,-0x20(%rbp)
>   da:    48 8b 45 e0              mov    -0x20(%rbp),%rax
>   de:    48 8b 55 e8              mov    -0x18(%rbp),%rdx
>   e2:    48 83 c4 20              add    $0x20,%rsp
>   e6:    5d                       pop    %rbp
>   e7:    c3                       retq
>
> =================================================================
>
>
> Finally, my question:
>
> What is the correct LLVM IR to represent i128 values so that it will be
> compatible with the compiler-rt calls that LLVM generates? For example, what
> should be the LLVM IR definition of  __udivti3?
>
> Because even though clang/compiler-rt project generates `define i128
> @__udivti3(i128, i128) #0 {`, this appears to be incorrect when run on
> windows.
>
> Thanks,
> Andrew
>
> _______________________________________________
> LLVM Developers mailing list
> [hidden email]
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>



--
With best regards, Anton Korobeynikov
Department of Statistical Modelling, Saint Petersburg State University



_______________________________________________
LLVM Developers mailing list
[hidden email]
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev