diff --git a/pkg/vectorize/add/avx2.s b/pkg/vectorize/add/avx2.s new file mode 100644 index 0000000000000000000000000000000000000000..4aab38668d93b3d82f085226ff6244e4415a21ce --- /dev/null +++ b/pkg/vectorize/add/avx2.s @@ -0,0 +1,1001 @@ +// Code generated by command: go run avx2.go -out add/avx2.s -stubs add/avx2_stubs.go. DO NOT EDIT. + +#include "textflag.h" + +// func int8AddAvx2Asm(x []int8, y []int8, r []int8) +// Requires: AVX, AVX2 +TEXT ·int8AddAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +int8AddBlockLoop: + CMPQ BX, $0x000000c0 + JL int8AddTailLoop + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VMOVDQU 128(AX), Y4 + VMOVDQU 160(AX), Y5 + VPADDB (CX), Y0, Y0 + VPADDB 32(CX), Y1, Y1 + VPADDB 64(CX), Y2, Y2 + VPADDB 96(CX), Y3, Y3 + VPADDB 128(CX), Y4, Y4 + VPADDB 160(CX), Y5, Y5 + VMOVDQU Y0, (DX) + VMOVDQU Y1, 32(DX) + VMOVDQU Y2, 64(DX) + VMOVDQU Y3, 96(DX) + VMOVDQU Y4, 128(DX) + VMOVDQU Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x000000c0, BX + JMP int8AddBlockLoop + +int8AddTailLoop: + CMPQ BX, $0x00000020 + JL int8AddDone + VMOVDQU (AX), Y0 + VPADDB (CX), Y0, Y0 + VMOVDQU Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000020, BX + JMP int8AddTailLoop + +int8AddDone: + RET + +// func int8AddScalarAvx2Asm(x int8, y []int8, r []int8) +// Requires: AVX, AVX2, SSE2 +TEXT ·int8AddScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVBLSX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTB X0, Y0 + +int8AddScalarBlockLoop: + CMPQ BX, $0x000000c0 + JL int8AddScalarTailLoop + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPADDB Y0, Y1, Y1 + VPADDB Y0, Y2, Y2 + VPADDB Y0, Y3, Y3 + VPADDB Y0, Y4, Y4 + VPADDB Y0, Y5, Y5 + VPADDB Y0, Y6, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x000000c0, BX + JMP int8AddScalarBlockLoop + +int8AddScalarTailLoop: + CMPQ BX, $0x00000020 + JL int8AddScalarDone + VMOVDQU (CX), Y1 + VPADDB Y0, Y1, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000020, BX + JMP int8AddScalarTailLoop + +int8AddScalarDone: + RET + +// func int16AddAvx2Asm(x []int16, y []int16, r []int16) +// Requires: AVX, AVX2 +TEXT ·int16AddAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +int16AddBlockLoop: + CMPQ BX, $0x00000060 + JL int16AddTailLoop + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VMOVDQU 128(AX), Y4 + VMOVDQU 160(AX), Y5 + VPADDW (CX), Y0, Y0 + VPADDW 32(CX), Y1, Y1 + VPADDW 64(CX), Y2, Y2 + VPADDW 96(CX), Y3, Y3 + VPADDW 128(CX), Y4, Y4 + VPADDW 160(CX), Y5, Y5 + VMOVDQU Y0, (DX) + VMOVDQU Y1, 32(DX) + VMOVDQU Y2, 64(DX) + VMOVDQU Y3, 96(DX) + VMOVDQU Y4, 128(DX) + VMOVDQU Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000060, BX + JMP int16AddBlockLoop + +int16AddTailLoop: + CMPQ BX, $0x00000010 + JL int16AddDone + VMOVDQU (AX), Y0 + VPADDW (CX), Y0, Y0 + VMOVDQU Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000010, BX + JMP int16AddTailLoop + +int16AddDone: + RET + 
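A note on tails: the kernels in this file only ever touch full YMM vectors. int16AddAvx2Asm above, for instance, stops as soon as fewer than 16 elements remain, so the last len(x) % 16 elements are never written by the assembly. A minimal caller-side sketch of the scalar tail handling this implies follows; the wrapper int16Add is hypothetical and not part of this diff, and it assumes it lives in the same package as the generated stubs.

package vectorize

// int16Add adds x and y element-wise into r, using the AVX2 kernel for the
// vectorizable prefix and plain Go for the remainder. It requires
// len(y) >= len(x) and len(r) >= len(x), as the assembly reads y and r
// without its own bounds checks.
func int16Add(x, y, r []int16) []int16 {
	// The assembly covers the first len(x) &^ 15 elements (whole 32-byte
	// YMM registers) and leaves the rest untouched.
	int16AddAvx2Asm(x, y, r)
	// Finish the trailing len(x) % 16 elements in Go.
	for i := len(x) &^ 15; i < len(x); i++ {
		r[i] = x[i] + y[i]
	}
	return r
}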
+// func int16AddScalarAvx2Asm(x int16, y []int16, r []int16) +// Requires: AVX, AVX2, SSE2 +TEXT ·int16AddScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVWLSX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTW X0, Y0 + +int16AddScalarBlockLoop: + CMPQ BX, $0x00000060 + JL int16AddScalarTailLoop + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPADDW Y0, Y1, Y1 + VPADDW Y0, Y2, Y2 + VPADDW Y0, Y3, Y3 + VPADDW Y0, Y4, Y4 + VPADDW Y0, Y5, Y5 + VPADDW Y0, Y6, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000060, BX + JMP int16AddScalarBlockLoop + +int16AddScalarTailLoop: + CMPQ BX, $0x00000010 + JL int16AddScalarDone + VMOVDQU (CX), Y1 + VPADDW Y0, Y1, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000010, BX + JMP int16AddScalarTailLoop + +int16AddScalarDone: + RET + +// func int32AddAvx2Asm(x []int32, y []int32, r []int32) +// Requires: AVX, AVX2 +TEXT ·int32AddAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +int32AddBlockLoop: + CMPQ BX, $0x00000030 + JL int32AddTailLoop + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VMOVDQU 128(AX), Y4 + VMOVDQU 160(AX), Y5 + VPADDD (CX), Y0, Y0 + VPADDD 32(CX), Y1, Y1 + VPADDD 64(CX), Y2, Y2 + VPADDD 96(CX), Y3, Y3 + VPADDD 128(CX), Y4, Y4 + VPADDD 160(CX), Y5, Y5 + VMOVDQU Y0, (DX) + VMOVDQU Y1, 32(DX) + VMOVDQU Y2, 64(DX) + VMOVDQU Y3, 96(DX) + VMOVDQU Y4, 128(DX) + VMOVDQU Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000030, BX + JMP int32AddBlockLoop + +int32AddTailLoop: + CMPQ BX, $0x00000008 + JL int32AddDone + VMOVDQU (AX), Y0 + VPADDD (CX), Y0, Y0 + VMOVDQU Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000008, BX + JMP int32AddTailLoop + +int32AddDone: + RET + +// func int32AddScalarAvx2Asm(x int32, y []int32, r []int32) +// Requires: AVX, AVX2, SSE2 +TEXT ·int32AddScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVL x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTD X0, Y0 + +int32AddScalarBlockLoop: + CMPQ BX, $0x00000030 + JL int32AddScalarTailLoop + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPADDD Y0, Y1, Y1 + VPADDD Y0, Y2, Y2 + VPADDD Y0, Y3, Y3 + VPADDD Y0, Y4, Y4 + VPADDD Y0, Y5, Y5 + VPADDD Y0, Y6, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000030, BX + JMP int32AddScalarBlockLoop + +int32AddScalarTailLoop: + CMPQ BX, $0x00000008 + JL int32AddScalarDone + VMOVDQU (CX), Y1 + VPADDD Y0, Y1, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000008, BX + JMP int32AddScalarTailLoop + +int32AddScalarDone: + RET + +// func int64AddAvx2Asm(x []int64, y []int64, r []int64) +// Requires: AVX, AVX2 +TEXT ·int64AddAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +int64AddBlockLoop: + CMPQ BX, $0x00000018 + JL int64AddTailLoop + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 
+ VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VMOVDQU 128(AX), Y4 + VMOVDQU 160(AX), Y5 + VPADDQ (CX), Y0, Y0 + VPADDQ 32(CX), Y1, Y1 + VPADDQ 64(CX), Y2, Y2 + VPADDQ 96(CX), Y3, Y3 + VPADDQ 128(CX), Y4, Y4 + VPADDQ 160(CX), Y5, Y5 + VMOVDQU Y0, (DX) + VMOVDQU Y1, 32(DX) + VMOVDQU Y2, 64(DX) + VMOVDQU Y3, 96(DX) + VMOVDQU Y4, 128(DX) + VMOVDQU Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000018, BX + JMP int64AddBlockLoop + +int64AddTailLoop: + CMPQ BX, $0x00000004 + JL int64AddDone + VMOVDQU (AX), Y0 + VPADDQ (CX), Y0, Y0 + VMOVDQU Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000004, BX + JMP int64AddTailLoop + +int64AddDone: + RET + +// func int64AddScalarAvx2Asm(x int64, y []int64, r []int64) +// Requires: AVX, AVX2, SSE2 +TEXT ·int64AddScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVQ x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVQ AX, X0 + VPBROADCASTQ X0, Y0 + +int64AddScalarBlockLoop: + CMPQ BX, $0x00000018 + JL int64AddScalarTailLoop + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPADDQ Y0, Y1, Y1 + VPADDQ Y0, Y2, Y2 + VPADDQ Y0, Y3, Y3 + VPADDQ Y0, Y4, Y4 + VPADDQ Y0, Y5, Y5 + VPADDQ Y0, Y6, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000018, BX + JMP int64AddScalarBlockLoop + +int64AddScalarTailLoop: + CMPQ BX, $0x00000004 + JL int64AddScalarDone + VMOVDQU (CX), Y1 + VPADDQ Y0, Y1, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000004, BX + JMP int64AddScalarTailLoop + +int64AddScalarDone: + RET + +// func uint8AddAvx2Asm(x []uint8, y []uint8, r []uint8) +// Requires: AVX, AVX2 +TEXT ·uint8AddAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +uint8AddBlockLoop: + CMPQ BX, $0x000000c0 + JL uint8AddTailLoop + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VMOVDQU 128(AX), Y4 + VMOVDQU 160(AX), Y5 + VPADDB (CX), Y0, Y0 + VPADDB 32(CX), Y1, Y1 + VPADDB 64(CX), Y2, Y2 + VPADDB 96(CX), Y3, Y3 + VPADDB 128(CX), Y4, Y4 + VPADDB 160(CX), Y5, Y5 + VMOVDQU Y0, (DX) + VMOVDQU Y1, 32(DX) + VMOVDQU Y2, 64(DX) + VMOVDQU Y3, 96(DX) + VMOVDQU Y4, 128(DX) + VMOVDQU Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x000000c0, BX + JMP uint8AddBlockLoop + +uint8AddTailLoop: + CMPQ BX, $0x00000020 + JL uint8AddDone + VMOVDQU (AX), Y0 + VPADDB (CX), Y0, Y0 + VMOVDQU Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000020, BX + JMP uint8AddTailLoop + +uint8AddDone: + RET + +// func uint8AddScalarAvx2Asm(x uint8, y []uint8, r []uint8) +// Requires: AVX, AVX2, SSE2 +TEXT ·uint8AddScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVBLZX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTB X0, Y0 + +uint8AddScalarBlockLoop: + CMPQ BX, $0x000000c0 + JL uint8AddScalarTailLoop + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPADDB Y0, Y1, Y1 + VPADDB Y0, Y2, Y2 + VPADDB Y0, Y3, Y3 + VPADDB Y0, Y4, Y4 + VPADDB Y0, Y5, Y5 + VPADDB Y0, Y6, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 
96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x000000c0, BX + JMP uint8AddScalarBlockLoop + +uint8AddScalarTailLoop: + CMPQ BX, $0x00000020 + JL uint8AddScalarDone + VMOVDQU (CX), Y1 + VPADDB Y0, Y1, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000020, BX + JMP uint8AddScalarTailLoop + +uint8AddScalarDone: + RET + +// func uint16AddAvx2Asm(x []uint16, y []uint16, r []uint16) +// Requires: AVX, AVX2 +TEXT ·uint16AddAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +uint16AddBlockLoop: + CMPQ BX, $0x00000060 + JL uint16AddTailLoop + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VMOVDQU 128(AX), Y4 + VMOVDQU 160(AX), Y5 + VPADDW (CX), Y0, Y0 + VPADDW 32(CX), Y1, Y1 + VPADDW 64(CX), Y2, Y2 + VPADDW 96(CX), Y3, Y3 + VPADDW 128(CX), Y4, Y4 + VPADDW 160(CX), Y5, Y5 + VMOVDQU Y0, (DX) + VMOVDQU Y1, 32(DX) + VMOVDQU Y2, 64(DX) + VMOVDQU Y3, 96(DX) + VMOVDQU Y4, 128(DX) + VMOVDQU Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000060, BX + JMP uint16AddBlockLoop + +uint16AddTailLoop: + CMPQ BX, $0x00000010 + JL uint16AddDone + VMOVDQU (AX), Y0 + VPADDW (CX), Y0, Y0 + VMOVDQU Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000010, BX + JMP uint16AddTailLoop + +uint16AddDone: + RET + +// func uint16AddScalarAvx2Asm(x uint16, y []uint16, r []uint16) +// Requires: AVX, AVX2, SSE2 +TEXT ·uint16AddScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVWLZX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTW X0, Y0 + +uint16AddScalarBlockLoop: + CMPQ BX, $0x00000060 + JL uint16AddScalarTailLoop + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPADDW Y0, Y1, Y1 + VPADDW Y0, Y2, Y2 + VPADDW Y0, Y3, Y3 + VPADDW Y0, Y4, Y4 + VPADDW Y0, Y5, Y5 + VPADDW Y0, Y6, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000060, BX + JMP uint16AddScalarBlockLoop + +uint16AddScalarTailLoop: + CMPQ BX, $0x00000010 + JL uint16AddScalarDone + VMOVDQU (CX), Y1 + VPADDW Y0, Y1, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000010, BX + JMP uint16AddScalarTailLoop + +uint16AddScalarDone: + RET + +// func uint32AddAvx2Asm(x []uint32, y []uint32, r []uint32) +// Requires: AVX, AVX2 +TEXT ·uint32AddAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +uint32AddBlockLoop: + CMPQ BX, $0x00000030 + JL uint32AddTailLoop + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VMOVDQU 128(AX), Y4 + VMOVDQU 160(AX), Y5 + VPADDD (CX), Y0, Y0 + VPADDD 32(CX), Y1, Y1 + VPADDD 64(CX), Y2, Y2 + VPADDD 96(CX), Y3, Y3 + VPADDD 128(CX), Y4, Y4 + VPADDD 160(CX), Y5, Y5 + VMOVDQU Y0, (DX) + VMOVDQU Y1, 32(DX) + VMOVDQU Y2, 64(DX) + VMOVDQU Y3, 96(DX) + VMOVDQU Y4, 128(DX) + VMOVDQU Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000030, BX + JMP uint32AddBlockLoop + +uint32AddTailLoop: + CMPQ BX, $0x00000008 + JL uint32AddDone + VMOVDQU (AX), Y0 + VPADDD (CX), Y0, Y0 + VMOVDQU Y0, (DX) + ADDQ $0x00000020, AX + ADDQ 
$0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000008, BX + JMP uint32AddTailLoop + +uint32AddDone: + RET + +// func uint32AddScalarAvx2Asm(x uint32, y []uint32, r []uint32) +// Requires: AVX, AVX2, SSE2 +TEXT ·uint32AddScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVL x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTD X0, Y0 + +uint32AddScalarBlockLoop: + CMPQ BX, $0x00000030 + JL uint32AddScalarTailLoop + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPADDD Y0, Y1, Y1 + VPADDD Y0, Y2, Y2 + VPADDD Y0, Y3, Y3 + VPADDD Y0, Y4, Y4 + VPADDD Y0, Y5, Y5 + VPADDD Y0, Y6, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000030, BX + JMP uint32AddScalarBlockLoop + +uint32AddScalarTailLoop: + CMPQ BX, $0x00000008 + JL uint32AddScalarDone + VMOVDQU (CX), Y1 + VPADDD Y0, Y1, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000008, BX + JMP uint32AddScalarTailLoop + +uint32AddScalarDone: + RET + +// func uint64AddAvx2Asm(x []uint64, y []uint64, r []uint64) +// Requires: AVX, AVX2 +TEXT ·uint64AddAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +uint64AddBlockLoop: + CMPQ BX, $0x00000018 + JL uint64AddTailLoop + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VMOVDQU 128(AX), Y4 + VMOVDQU 160(AX), Y5 + VPADDQ (CX), Y0, Y0 + VPADDQ 32(CX), Y1, Y1 + VPADDQ 64(CX), Y2, Y2 + VPADDQ 96(CX), Y3, Y3 + VPADDQ 128(CX), Y4, Y4 + VPADDQ 160(CX), Y5, Y5 + VMOVDQU Y0, (DX) + VMOVDQU Y1, 32(DX) + VMOVDQU Y2, 64(DX) + VMOVDQU Y3, 96(DX) + VMOVDQU Y4, 128(DX) + VMOVDQU Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000018, BX + JMP uint64AddBlockLoop + +uint64AddTailLoop: + CMPQ BX, $0x00000004 + JL uint64AddDone + VMOVDQU (AX), Y0 + VPADDQ (CX), Y0, Y0 + VMOVDQU Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000004, BX + JMP uint64AddTailLoop + +uint64AddDone: + RET + +// func uint64AddScalarAvx2Asm(x uint64, y []uint64, r []uint64) +// Requires: AVX, AVX2, SSE2 +TEXT ·uint64AddScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVQ x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVQ AX, X0 + VPBROADCASTQ X0, Y0 + +uint64AddScalarBlockLoop: + CMPQ BX, $0x00000018 + JL uint64AddScalarTailLoop + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPADDQ Y0, Y1, Y1 + VPADDQ Y0, Y2, Y2 + VPADDQ Y0, Y3, Y3 + VPADDQ Y0, Y4, Y4 + VPADDQ Y0, Y5, Y5 + VPADDQ Y0, Y6, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000018, BX + JMP uint64AddScalarBlockLoop + +uint64AddScalarTailLoop: + CMPQ BX, $0x00000004 + JL uint64AddScalarDone + VMOVDQU (CX), Y1 + VPADDQ Y0, Y1, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000004, BX + JMP uint64AddScalarTailLoop + +uint64AddScalarDone: + RET + +// func float32AddAvx2Asm(x []float32, y []float32, r []float32) +// Requires: AVX +TEXT ·float32AddAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + 
MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +float32AddBlockLoop: + CMPQ BX, $0x00000030 + JL float32AddTailLoop + VMOVUPS (AX), Y0 + VMOVUPS 32(AX), Y1 + VMOVUPS 64(AX), Y2 + VMOVUPS 96(AX), Y3 + VMOVUPS 128(AX), Y4 + VMOVUPS 160(AX), Y5 + VADDPS (CX), Y0, Y0 + VADDPS 32(CX), Y1, Y1 + VADDPS 64(CX), Y2, Y2 + VADDPS 96(CX), Y3, Y3 + VADDPS 128(CX), Y4, Y4 + VADDPS 160(CX), Y5, Y5 + VMOVUPS Y0, (DX) + VMOVUPS Y1, 32(DX) + VMOVUPS Y2, 64(DX) + VMOVUPS Y3, 96(DX) + VMOVUPS Y4, 128(DX) + VMOVUPS Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000030, BX + JMP float32AddBlockLoop + +float32AddTailLoop: + CMPQ BX, $0x00000008 + JL float32AddDone + VMOVUPS (AX), Y0 + VADDPS (CX), Y0, Y0 + VMOVUPS Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000008, BX + JMP float32AddTailLoop + +float32AddDone: + RET + +// func float32AddScalarAvx2Asm(x float32, y []float32, r []float32) +// Requires: AVX, AVX2, SSE +TEXT ·float32AddScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVSS x+0(FP), X0 + MOVQ y_base+8(FP), AX + MOVQ r_base+32(FP), CX + MOVQ y_len+16(FP), DX + VBROADCASTSS X0, Y0 + +float32AddScalarBlockLoop: + CMPQ DX, $0x00000030 + JL float32AddScalarTailLoop + VMOVUPS (AX), Y1 + VMOVUPS 32(AX), Y2 + VMOVUPS 64(AX), Y3 + VMOVUPS 96(AX), Y4 + VMOVUPS 128(AX), Y5 + VMOVUPS 160(AX), Y6 + VADDPS Y0, Y1, Y1 + VADDPS Y0, Y2, Y2 + VADDPS Y0, Y3, Y3 + VADDPS Y0, Y4, Y4 + VADDPS Y0, Y5, Y5 + VADDPS Y0, Y6, Y6 + VMOVUPS Y1, (CX) + VMOVUPS Y2, 32(CX) + VMOVUPS Y3, 64(CX) + VMOVUPS Y4, 96(CX) + VMOVUPS Y5, 128(CX) + VMOVUPS Y6, 160(CX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + SUBQ $0x00000030, DX + JMP float32AddScalarBlockLoop + +float32AddScalarTailLoop: + CMPQ DX, $0x00000008 + JL float32AddScalarDone + VMOVUPS (AX), Y1 + VADDPS Y0, Y1, Y1 + VMOVUPS Y1, (CX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + SUBQ $0x00000008, DX + JMP float32AddScalarTailLoop + +float32AddScalarDone: + RET + +// func float64AddAvx2Asm(x []float64, y []float64, r []float64) +// Requires: AVX +TEXT ·float64AddAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +float64AddBlockLoop: + CMPQ BX, $0x00000018 + JL float64AddTailLoop + VMOVUPD (AX), Y0 + VMOVUPD 32(AX), Y1 + VMOVUPD 64(AX), Y2 + VMOVUPD 96(AX), Y3 + VMOVUPD 128(AX), Y4 + VMOVUPD 160(AX), Y5 + VADDPD (CX), Y0, Y0 + VADDPD 32(CX), Y1, Y1 + VADDPD 64(CX), Y2, Y2 + VADDPD 96(CX), Y3, Y3 + VADDPD 128(CX), Y4, Y4 + VADDPD 160(CX), Y5, Y5 + VMOVUPD Y0, (DX) + VMOVUPD Y1, 32(DX) + VMOVUPD Y2, 64(DX) + VMOVUPD Y3, 96(DX) + VMOVUPD Y4, 128(DX) + VMOVUPD Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000018, BX + JMP float64AddBlockLoop + +float64AddTailLoop: + CMPQ BX, $0x00000004 + JL float64AddDone + VMOVUPD (AX), Y0 + VADDPD (CX), Y0, Y0 + VMOVUPD Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000004, BX + JMP float64AddTailLoop + +float64AddDone: + RET + +// func float64AddScalarAvx2Asm(x float64, y []float64, r []float64) +// Requires: AVX, AVX2, SSE2 +TEXT ·float64AddScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVSD x+0(FP), X0 + MOVQ y_base+8(FP), AX + MOVQ r_base+32(FP), CX + MOVQ y_len+16(FP), DX + VBROADCASTSD X0, Y0 + +float64AddScalarBlockLoop: + CMPQ DX, $0x00000018 + JL float64AddScalarTailLoop + VMOVUPD (AX), Y1 + VMOVUPD 32(AX), Y2 + VMOVUPD 64(AX), Y3 + VMOVUPD 96(AX), Y4 + VMOVUPD 128(AX), Y5 + VMOVUPD 160(AX), Y6 + 
VADDPD Y0, Y1, Y1 + VADDPD Y0, Y2, Y2 + VADDPD Y0, Y3, Y3 + VADDPD Y0, Y4, Y4 + VADDPD Y0, Y5, Y5 + VADDPD Y0, Y6, Y6 + VMOVUPD Y1, (CX) + VMOVUPD Y2, 32(CX) + VMOVUPD Y3, 64(CX) + VMOVUPD Y4, 96(CX) + VMOVUPD Y5, 128(CX) + VMOVUPD Y6, 160(CX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + SUBQ $0x00000018, DX + JMP float64AddScalarBlockLoop + +float64AddScalarTailLoop: + CMPQ DX, $0x00000004 + JL float64AddScalarDone + VMOVUPD (AX), Y1 + VADDPD Y0, Y1, Y1 + VMOVUPD Y1, (CX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + SUBQ $0x00000004, DX + JMP float64AddScalarTailLoop + +float64AddScalarDone: + RET diff --git a/pkg/vectorize/add/avx2_stubs.go b/pkg/vectorize/add/avx2_stubs.go new file mode 100644 index 0000000000000000000000000000000000000000..daaae3813295f1080b1002b1d18b6bfb05c0c511 --- /dev/null +++ b/pkg/vectorize/add/avx2_stubs.go @@ -0,0 +1,43 @@ +// Code generated by command: go run avx2.go -out add/avx2.s -stubs add/avx2_stubs.go. DO NOT EDIT. + +package vectorize + +func int8AddAvx2Asm(x []int8, y []int8, r []int8) + +func int8AddScalarAvx2Asm(x int8, y []int8, r []int8) + +func int16AddAvx2Asm(x []int16, y []int16, r []int16) + +func int16AddScalarAvx2Asm(x int16, y []int16, r []int16) + +func int32AddAvx2Asm(x []int32, y []int32, r []int32) + +func int32AddScalarAvx2Asm(x int32, y []int32, r []int32) + +func int64AddAvx2Asm(x []int64, y []int64, r []int64) + +func int64AddScalarAvx2Asm(x int64, y []int64, r []int64) + +func uint8AddAvx2Asm(x []uint8, y []uint8, r []uint8) + +func uint8AddScalarAvx2Asm(x uint8, y []uint8, r []uint8) + +func uint16AddAvx2Asm(x []uint16, y []uint16, r []uint16) + +func uint16AddScalarAvx2Asm(x uint16, y []uint16, r []uint16) + +func uint32AddAvx2Asm(x []uint32, y []uint32, r []uint32) + +func uint32AddScalarAvx2Asm(x uint32, y []uint32, r []uint32) + +func uint64AddAvx2Asm(x []uint64, y []uint64, r []uint64) + +func uint64AddScalarAvx2Asm(x uint64, y []uint64, r []uint64) + +func float32AddAvx2Asm(x []float32, y []float32, r []float32) + +func float32AddScalarAvx2Asm(x float32, y []float32, r []float32) + +func float64AddAvx2Asm(x []float64, y []float64, r []float64) + +func float64AddScalarAvx2Asm(x float64, y []float64, r []float64) diff --git a/pkg/vectorize/add/avx512.s b/pkg/vectorize/add/avx512.s new file mode 100644 index 0000000000000000000000000000000000000000..577ea214882ef729b9842febd2e9dacd5ff18c5e --- /dev/null +++ b/pkg/vectorize/add/avx512.s @@ -0,0 +1,1361 @@ +// Code generated by command: go run avx512.go -out plus/avx512.s -stubs plus/avx512_stubs.go. DO NOT EDIT. 
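The stubs above expose the AVX2 entry points; the AVX-512 variants follow. A minimal sketch of how a caller might pick between them at run time is shown below. It is not part of this diff: it assumes both stub sets are reachable from one package (the generated files currently declare package vectorize and package plus respectively), uses golang.org/x/sys/cpu for feature detection, and the scalar fallback int8AddPure is hypothetical.

package vectorize

import "golang.org/x/sys/cpu"

// int8Add points at the widest add kernel the CPU supports. The assembly
// kernels only process full vectors, so callers still finish the tail in Go,
// as in the int16Add sketch earlier.
var int8Add func(x, y, r []int8)

func init() {
	switch {
	case cpu.X86.HasAVX512F && cpu.X86.HasAVX512BW:
		int8Add = int8PlusAvx512Asm // 64-byte ZMM vectors (VPADDB needs AVX512BW)
	case cpu.X86.HasAVX2:
		int8Add = int8AddAvx2Asm // 32-byte YMM vectors
	default:
		int8Add = int8AddPure
	}
}

// int8AddPure is a hypothetical pure-Go fallback for CPUs without AVX2.
func int8AddPure(x, y, r []int8) {
	for i, v := range x {
		r[i] = v + y[i]
	}
}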
+ +#include "textflag.h" + +// func int8PlusAvx512Asm(x []int8, y []int8, r []int8) +// Requires: AVX512BW, AVX512F +TEXT ·int8PlusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +int8PlusBlockLoop: + CMPQ BX, $0x00000300 + JL int8PlusTailLoop + VMOVDQU32 (AX), Z0 + VMOVDQU32 64(AX), Z1 + VMOVDQU32 128(AX), Z2 + VMOVDQU32 192(AX), Z3 + VMOVDQU32 256(AX), Z4 + VMOVDQU32 320(AX), Z5 + VMOVDQU32 384(AX), Z6 + VMOVDQU32 448(AX), Z7 + VMOVDQU32 512(AX), Z8 + VMOVDQU32 576(AX), Z9 + VMOVDQU32 640(AX), Z10 + VMOVDQU32 704(AX), Z11 + VPADDB (CX), Z0, Z0 + VPADDB 64(CX), Z1, Z1 + VPADDB 128(CX), Z2, Z2 + VPADDB 192(CX), Z3, Z3 + VPADDB 256(CX), Z4, Z4 + VPADDB 320(CX), Z5, Z5 + VPADDB 384(CX), Z6, Z6 + VPADDB 448(CX), Z7, Z7 + VPADDB 512(CX), Z8, Z8 + VPADDB 576(CX), Z9, Z9 + VPADDB 640(CX), Z10, Z10 + VPADDB 704(CX), Z11, Z11 + VMOVDQU32 Z0, (DX) + VMOVDQU32 Z1, 64(DX) + VMOVDQU32 Z2, 128(DX) + VMOVDQU32 Z3, 192(DX) + VMOVDQU32 Z4, 256(DX) + VMOVDQU32 Z5, 320(DX) + VMOVDQU32 Z6, 384(DX) + VMOVDQU32 Z7, 448(DX) + VMOVDQU32 Z8, 512(DX) + VMOVDQU32 Z9, 576(DX) + VMOVDQU32 Z10, 640(DX) + VMOVDQU32 Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000300, BX + JMP int8PlusBlockLoop + +int8PlusTailLoop: + CMPQ BX, $0x00000040 + JL int8PlusDone + VMOVDQU32 (AX), Z0 + VPADDB (CX), Z0, Z0 + VMOVDQU32 Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000040, BX + JMP int8PlusTailLoop + +int8PlusDone: + RET + +// func int8PlusScalarAvx512Asm(x int8, y []int8, r []int8) +// Requires: AVX512BW, AVX512F, SSE2 +TEXT ·int8PlusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVBLSX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTB X0, Z0 + +int8PlusScalarBlockLoop: + CMPQ BX, $0x00000300 + JL int8PlusScalarTailLoop + VMOVDQU32 (CX), Z1 + VMOVDQU32 64(CX), Z2 + VMOVDQU32 128(CX), Z3 + VMOVDQU32 192(CX), Z4 + VMOVDQU32 256(CX), Z5 + VMOVDQU32 320(CX), Z6 + VMOVDQU32 384(CX), Z7 + VMOVDQU32 448(CX), Z8 + VMOVDQU32 512(CX), Z9 + VMOVDQU32 576(CX), Z10 + VMOVDQU32 640(CX), Z11 + VMOVDQU32 704(CX), Z12 + VPADDB Z0, Z1, Z1 + VPADDB Z0, Z2, Z2 + VPADDB Z0, Z3, Z3 + VPADDB Z0, Z4, Z4 + VPADDB Z0, Z5, Z5 + VPADDB Z0, Z6, Z6 + VPADDB Z0, Z7, Z7 + VPADDB Z0, Z8, Z8 + VPADDB Z0, Z9, Z9 + VPADDB Z0, Z10, Z10 + VPADDB Z0, Z11, Z11 + VPADDB Z0, Z12, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000300, BX + JMP int8PlusScalarBlockLoop + +int8PlusScalarTailLoop: + CMPQ BX, $0x00000040 + JL int8PlusScalarDone + VMOVDQU32 (CX), Z1 + VPADDB Z0, Z1, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000040, BX + JMP int8PlusScalarTailLoop + +int8PlusScalarDone: + RET + +// func int16PlusAvx512Asm(x []int16, y []int16, r []int16) +// Requires: AVX512BW, AVX512F +TEXT ·int16PlusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +int16PlusBlockLoop: + CMPQ BX, $0x00000180 + JL int16PlusTailLoop + VMOVDQU32 (AX), Z0 + VMOVDQU32 64(AX), Z1 + VMOVDQU32 128(AX), Z2 + VMOVDQU32 192(AX), Z3 + VMOVDQU32 256(AX), Z4 + 
VMOVDQU32 320(AX), Z5 + VMOVDQU32 384(AX), Z6 + VMOVDQU32 448(AX), Z7 + VMOVDQU32 512(AX), Z8 + VMOVDQU32 576(AX), Z9 + VMOVDQU32 640(AX), Z10 + VMOVDQU32 704(AX), Z11 + VPADDW (CX), Z0, Z0 + VPADDW 64(CX), Z1, Z1 + VPADDW 128(CX), Z2, Z2 + VPADDW 192(CX), Z3, Z3 + VPADDW 256(CX), Z4, Z4 + VPADDW 320(CX), Z5, Z5 + VPADDW 384(CX), Z6, Z6 + VPADDW 448(CX), Z7, Z7 + VPADDW 512(CX), Z8, Z8 + VPADDW 576(CX), Z9, Z9 + VPADDW 640(CX), Z10, Z10 + VPADDW 704(CX), Z11, Z11 + VMOVDQU32 Z0, (DX) + VMOVDQU32 Z1, 64(DX) + VMOVDQU32 Z2, 128(DX) + VMOVDQU32 Z3, 192(DX) + VMOVDQU32 Z4, 256(DX) + VMOVDQU32 Z5, 320(DX) + VMOVDQU32 Z6, 384(DX) + VMOVDQU32 Z7, 448(DX) + VMOVDQU32 Z8, 512(DX) + VMOVDQU32 Z9, 576(DX) + VMOVDQU32 Z10, 640(DX) + VMOVDQU32 Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000180, BX + JMP int16PlusBlockLoop + +int16PlusTailLoop: + CMPQ BX, $0x00000020 + JL int16PlusDone + VMOVDQU32 (AX), Z0 + VPADDW (CX), Z0, Z0 + VMOVDQU32 Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000020, BX + JMP int16PlusTailLoop + +int16PlusDone: + RET + +// func int16PlusScalarAvx512Asm(x int16, y []int16, r []int16) +// Requires: AVX512BW, AVX512F, SSE2 +TEXT ·int16PlusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVWLSX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTW X0, Z0 + +int16PlusScalarBlockLoop: + CMPQ BX, $0x00000180 + JL int16PlusScalarTailLoop + VMOVDQU32 (CX), Z1 + VMOVDQU32 64(CX), Z2 + VMOVDQU32 128(CX), Z3 + VMOVDQU32 192(CX), Z4 + VMOVDQU32 256(CX), Z5 + VMOVDQU32 320(CX), Z6 + VMOVDQU32 384(CX), Z7 + VMOVDQU32 448(CX), Z8 + VMOVDQU32 512(CX), Z9 + VMOVDQU32 576(CX), Z10 + VMOVDQU32 640(CX), Z11 + VMOVDQU32 704(CX), Z12 + VPADDW Z0, Z1, Z1 + VPADDW Z0, Z2, Z2 + VPADDW Z0, Z3, Z3 + VPADDW Z0, Z4, Z4 + VPADDW Z0, Z5, Z5 + VPADDW Z0, Z6, Z6 + VPADDW Z0, Z7, Z7 + VPADDW Z0, Z8, Z8 + VPADDW Z0, Z9, Z9 + VPADDW Z0, Z10, Z10 + VPADDW Z0, Z11, Z11 + VPADDW Z0, Z12, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000180, BX + JMP int16PlusScalarBlockLoop + +int16PlusScalarTailLoop: + CMPQ BX, $0x00000020 + JL int16PlusScalarDone + VMOVDQU32 (CX), Z1 + VPADDW Z0, Z1, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000020, BX + JMP int16PlusScalarTailLoop + +int16PlusScalarDone: + RET + +// func int32PlusAvx512Asm(x []int32, y []int32, r []int32) +// Requires: AVX512F +TEXT ·int32PlusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +int32PlusBlockLoop: + CMPQ BX, $0x000000c0 + JL int32PlusTailLoop + VMOVDQU32 (AX), Z0 + VMOVDQU32 64(AX), Z1 + VMOVDQU32 128(AX), Z2 + VMOVDQU32 192(AX), Z3 + VMOVDQU32 256(AX), Z4 + VMOVDQU32 320(AX), Z5 + VMOVDQU32 384(AX), Z6 + VMOVDQU32 448(AX), Z7 + VMOVDQU32 512(AX), Z8 + VMOVDQU32 576(AX), Z9 + VMOVDQU32 640(AX), Z10 + VMOVDQU32 704(AX), Z11 + VPADDD (CX), Z0, Z0 + VPADDD 64(CX), Z1, Z1 + VPADDD 128(CX), Z2, Z2 + VPADDD 192(CX), Z3, Z3 + VPADDD 256(CX), Z4, Z4 + VPADDD 320(CX), Z5, Z5 + VPADDD 384(CX), Z6, Z6 + VPADDD 448(CX), Z7, Z7 + VPADDD 512(CX), Z8, Z8 + VPADDD 576(CX), Z9, Z9 + VPADDD 640(CX), Z10, Z10 + 
VPADDD 704(CX), Z11, Z11 + VMOVDQU32 Z0, (DX) + VMOVDQU32 Z1, 64(DX) + VMOVDQU32 Z2, 128(DX) + VMOVDQU32 Z3, 192(DX) + VMOVDQU32 Z4, 256(DX) + VMOVDQU32 Z5, 320(DX) + VMOVDQU32 Z6, 384(DX) + VMOVDQU32 Z7, 448(DX) + VMOVDQU32 Z8, 512(DX) + VMOVDQU32 Z9, 576(DX) + VMOVDQU32 Z10, 640(DX) + VMOVDQU32 Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x000000c0, BX + JMP int32PlusBlockLoop + +int32PlusTailLoop: + CMPQ BX, $0x00000010 + JL int32PlusDone + VMOVDQU32 (AX), Z0 + VPADDD (CX), Z0, Z0 + VMOVDQU32 Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000010, BX + JMP int32PlusTailLoop + +int32PlusDone: + RET + +// func int32PlusScalarAvx512Asm(x int32, y []int32, r []int32) +// Requires: AVX512F, SSE2 +TEXT ·int32PlusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVL x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTD X0, Z0 + +int32PlusScalarBlockLoop: + CMPQ BX, $0x000000c0 + JL int32PlusScalarTailLoop + VMOVDQU32 (CX), Z1 + VMOVDQU32 64(CX), Z2 + VMOVDQU32 128(CX), Z3 + VMOVDQU32 192(CX), Z4 + VMOVDQU32 256(CX), Z5 + VMOVDQU32 320(CX), Z6 + VMOVDQU32 384(CX), Z7 + VMOVDQU32 448(CX), Z8 + VMOVDQU32 512(CX), Z9 + VMOVDQU32 576(CX), Z10 + VMOVDQU32 640(CX), Z11 + VMOVDQU32 704(CX), Z12 + VPADDD Z0, Z1, Z1 + VPADDD Z0, Z2, Z2 + VPADDD Z0, Z3, Z3 + VPADDD Z0, Z4, Z4 + VPADDD Z0, Z5, Z5 + VPADDD Z0, Z6, Z6 + VPADDD Z0, Z7, Z7 + VPADDD Z0, Z8, Z8 + VPADDD Z0, Z9, Z9 + VPADDD Z0, Z10, Z10 + VPADDD Z0, Z11, Z11 + VPADDD Z0, Z12, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x000000c0, BX + JMP int32PlusScalarBlockLoop + +int32PlusScalarTailLoop: + CMPQ BX, $0x00000010 + JL int32PlusScalarDone + VMOVDQU32 (CX), Z1 + VPADDD Z0, Z1, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000010, BX + JMP int32PlusScalarTailLoop + +int32PlusScalarDone: + RET + +// func int64PlusAvx512Asm(x []int64, y []int64, r []int64) +// Requires: AVX512F +TEXT ·int64PlusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +int64PlusBlockLoop: + CMPQ BX, $0x00000060 + JL int64PlusTailLoop + VMOVDQU32 (AX), Z0 + VMOVDQU32 64(AX), Z1 + VMOVDQU32 128(AX), Z2 + VMOVDQU32 192(AX), Z3 + VMOVDQU32 256(AX), Z4 + VMOVDQU32 320(AX), Z5 + VMOVDQU32 384(AX), Z6 + VMOVDQU32 448(AX), Z7 + VMOVDQU32 512(AX), Z8 + VMOVDQU32 576(AX), Z9 + VMOVDQU32 640(AX), Z10 + VMOVDQU32 704(AX), Z11 + VPADDQ (CX), Z0, Z0 + VPADDQ 64(CX), Z1, Z1 + VPADDQ 128(CX), Z2, Z2 + VPADDQ 192(CX), Z3, Z3 + VPADDQ 256(CX), Z4, Z4 + VPADDQ 320(CX), Z5, Z5 + VPADDQ 384(CX), Z6, Z6 + VPADDQ 448(CX), Z7, Z7 + VPADDQ 512(CX), Z8, Z8 + VPADDQ 576(CX), Z9, Z9 + VPADDQ 640(CX), Z10, Z10 + VPADDQ 704(CX), Z11, Z11 + VMOVDQU32 Z0, (DX) + VMOVDQU32 Z1, 64(DX) + VMOVDQU32 Z2, 128(DX) + VMOVDQU32 Z3, 192(DX) + VMOVDQU32 Z4, 256(DX) + VMOVDQU32 Z5, 320(DX) + VMOVDQU32 Z6, 384(DX) + VMOVDQU32 Z7, 448(DX) + VMOVDQU32 Z8, 512(DX) + VMOVDQU32 Z9, 576(DX) + VMOVDQU32 Z10, 640(DX) + VMOVDQU32 Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000060, BX + JMP int64PlusBlockLoop + +int64PlusTailLoop: + CMPQ 
BX, $0x00000008 + JL int64PlusDone + VMOVDQU32 (AX), Z0 + VPADDQ (CX), Z0, Z0 + VMOVDQU32 Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000008, BX + JMP int64PlusTailLoop + +int64PlusDone: + RET + +// func int64PlusScalarAvx512Asm(x int64, y []int64, r []int64) +// Requires: AVX512F, SSE2 +TEXT ·int64PlusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVQ x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVQ AX, X0 + VPBROADCASTQ X0, Z0 + +int64PlusScalarBlockLoop: + CMPQ BX, $0x00000060 + JL int64PlusScalarTailLoop + VMOVDQU32 (CX), Z1 + VMOVDQU32 64(CX), Z2 + VMOVDQU32 128(CX), Z3 + VMOVDQU32 192(CX), Z4 + VMOVDQU32 256(CX), Z5 + VMOVDQU32 320(CX), Z6 + VMOVDQU32 384(CX), Z7 + VMOVDQU32 448(CX), Z8 + VMOVDQU32 512(CX), Z9 + VMOVDQU32 576(CX), Z10 + VMOVDQU32 640(CX), Z11 + VMOVDQU32 704(CX), Z12 + VPADDQ Z0, Z1, Z1 + VPADDQ Z0, Z2, Z2 + VPADDQ Z0, Z3, Z3 + VPADDQ Z0, Z4, Z4 + VPADDQ Z0, Z5, Z5 + VPADDQ Z0, Z6, Z6 + VPADDQ Z0, Z7, Z7 + VPADDQ Z0, Z8, Z8 + VPADDQ Z0, Z9, Z9 + VPADDQ Z0, Z10, Z10 + VPADDQ Z0, Z11, Z11 + VPADDQ Z0, Z12, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000060, BX + JMP int64PlusScalarBlockLoop + +int64PlusScalarTailLoop: + CMPQ BX, $0x00000008 + JL int64PlusScalarDone + VMOVDQU32 (CX), Z1 + VPADDQ Z0, Z1, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000008, BX + JMP int64PlusScalarTailLoop + +int64PlusScalarDone: + RET + +// func uint8PlusAvx512Asm(x []uint8, y []uint8, r []uint8) +// Requires: AVX512BW, AVX512F +TEXT ·uint8PlusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +uint8PlusBlockLoop: + CMPQ BX, $0x00000300 + JL uint8PlusTailLoop + VMOVDQU32 (AX), Z0 + VMOVDQU32 64(AX), Z1 + VMOVDQU32 128(AX), Z2 + VMOVDQU32 192(AX), Z3 + VMOVDQU32 256(AX), Z4 + VMOVDQU32 320(AX), Z5 + VMOVDQU32 384(AX), Z6 + VMOVDQU32 448(AX), Z7 + VMOVDQU32 512(AX), Z8 + VMOVDQU32 576(AX), Z9 + VMOVDQU32 640(AX), Z10 + VMOVDQU32 704(AX), Z11 + VPADDB (CX), Z0, Z0 + VPADDB 64(CX), Z1, Z1 + VPADDB 128(CX), Z2, Z2 + VPADDB 192(CX), Z3, Z3 + VPADDB 256(CX), Z4, Z4 + VPADDB 320(CX), Z5, Z5 + VPADDB 384(CX), Z6, Z6 + VPADDB 448(CX), Z7, Z7 + VPADDB 512(CX), Z8, Z8 + VPADDB 576(CX), Z9, Z9 + VPADDB 640(CX), Z10, Z10 + VPADDB 704(CX), Z11, Z11 + VMOVDQU32 Z0, (DX) + VMOVDQU32 Z1, 64(DX) + VMOVDQU32 Z2, 128(DX) + VMOVDQU32 Z3, 192(DX) + VMOVDQU32 Z4, 256(DX) + VMOVDQU32 Z5, 320(DX) + VMOVDQU32 Z6, 384(DX) + VMOVDQU32 Z7, 448(DX) + VMOVDQU32 Z8, 512(DX) + VMOVDQU32 Z9, 576(DX) + VMOVDQU32 Z10, 640(DX) + VMOVDQU32 Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000300, BX + JMP uint8PlusBlockLoop + +uint8PlusTailLoop: + CMPQ BX, $0x00000040 + JL uint8PlusDone + VMOVDQU32 (AX), Z0 + VPADDB (CX), Z0, Z0 + VMOVDQU32 Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000040, BX + JMP uint8PlusTailLoop + +uint8PlusDone: + RET + +// func uint8PlusScalarAvx512Asm(x uint8, y []uint8, r []uint8) +// Requires: AVX512BW, AVX512F, SSE2 +TEXT ·uint8PlusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVBLZX x+0(FP), AX + MOVQ y_base+8(FP), CX + 
MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTB X0, Z0 + +uint8PlusScalarBlockLoop: + CMPQ BX, $0x00000300 + JL uint8PlusScalarTailLoop + VMOVDQU32 (CX), Z1 + VMOVDQU32 64(CX), Z2 + VMOVDQU32 128(CX), Z3 + VMOVDQU32 192(CX), Z4 + VMOVDQU32 256(CX), Z5 + VMOVDQU32 320(CX), Z6 + VMOVDQU32 384(CX), Z7 + VMOVDQU32 448(CX), Z8 + VMOVDQU32 512(CX), Z9 + VMOVDQU32 576(CX), Z10 + VMOVDQU32 640(CX), Z11 + VMOVDQU32 704(CX), Z12 + VPADDB Z0, Z1, Z1 + VPADDB Z0, Z2, Z2 + VPADDB Z0, Z3, Z3 + VPADDB Z0, Z4, Z4 + VPADDB Z0, Z5, Z5 + VPADDB Z0, Z6, Z6 + VPADDB Z0, Z7, Z7 + VPADDB Z0, Z8, Z8 + VPADDB Z0, Z9, Z9 + VPADDB Z0, Z10, Z10 + VPADDB Z0, Z11, Z11 + VPADDB Z0, Z12, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000300, BX + JMP uint8PlusScalarBlockLoop + +uint8PlusScalarTailLoop: + CMPQ BX, $0x00000040 + JL uint8PlusScalarDone + VMOVDQU32 (CX), Z1 + VPADDB Z0, Z1, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000040, BX + JMP uint8PlusScalarTailLoop + +uint8PlusScalarDone: + RET + +// func uint16PlusAvx512Asm(x []uint16, y []uint16, r []uint16) +// Requires: AVX512BW, AVX512F +TEXT ·uint16PlusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +uint16PlusBlockLoop: + CMPQ BX, $0x00000180 + JL uint16PlusTailLoop + VMOVDQU32 (AX), Z0 + VMOVDQU32 64(AX), Z1 + VMOVDQU32 128(AX), Z2 + VMOVDQU32 192(AX), Z3 + VMOVDQU32 256(AX), Z4 + VMOVDQU32 320(AX), Z5 + VMOVDQU32 384(AX), Z6 + VMOVDQU32 448(AX), Z7 + VMOVDQU32 512(AX), Z8 + VMOVDQU32 576(AX), Z9 + VMOVDQU32 640(AX), Z10 + VMOVDQU32 704(AX), Z11 + VPADDW (CX), Z0, Z0 + VPADDW 64(CX), Z1, Z1 + VPADDW 128(CX), Z2, Z2 + VPADDW 192(CX), Z3, Z3 + VPADDW 256(CX), Z4, Z4 + VPADDW 320(CX), Z5, Z5 + VPADDW 384(CX), Z6, Z6 + VPADDW 448(CX), Z7, Z7 + VPADDW 512(CX), Z8, Z8 + VPADDW 576(CX), Z9, Z9 + VPADDW 640(CX), Z10, Z10 + VPADDW 704(CX), Z11, Z11 + VMOVDQU32 Z0, (DX) + VMOVDQU32 Z1, 64(DX) + VMOVDQU32 Z2, 128(DX) + VMOVDQU32 Z3, 192(DX) + VMOVDQU32 Z4, 256(DX) + VMOVDQU32 Z5, 320(DX) + VMOVDQU32 Z6, 384(DX) + VMOVDQU32 Z7, 448(DX) + VMOVDQU32 Z8, 512(DX) + VMOVDQU32 Z9, 576(DX) + VMOVDQU32 Z10, 640(DX) + VMOVDQU32 Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000180, BX + JMP uint16PlusBlockLoop + +uint16PlusTailLoop: + CMPQ BX, $0x00000020 + JL uint16PlusDone + VMOVDQU32 (AX), Z0 + VPADDW (CX), Z0, Z0 + VMOVDQU32 Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000020, BX + JMP uint16PlusTailLoop + +uint16PlusDone: + RET + +// func uint16PlusScalarAvx512Asm(x uint16, y []uint16, r []uint16) +// Requires: AVX512BW, AVX512F, SSE2 +TEXT ·uint16PlusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVWLZX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTW X0, Z0 + +uint16PlusScalarBlockLoop: + CMPQ BX, $0x00000180 + JL uint16PlusScalarTailLoop + VMOVDQU32 (CX), Z1 + VMOVDQU32 64(CX), Z2 + VMOVDQU32 128(CX), Z3 + VMOVDQU32 192(CX), Z4 + VMOVDQU32 256(CX), Z5 + VMOVDQU32 320(CX), Z6 + VMOVDQU32 384(CX), Z7 + VMOVDQU32 448(CX), Z8 + VMOVDQU32 512(CX), Z9 + VMOVDQU32 576(CX), Z10 + VMOVDQU32 
640(CX), Z11 + VMOVDQU32 704(CX), Z12 + VPADDW Z0, Z1, Z1 + VPADDW Z0, Z2, Z2 + VPADDW Z0, Z3, Z3 + VPADDW Z0, Z4, Z4 + VPADDW Z0, Z5, Z5 + VPADDW Z0, Z6, Z6 + VPADDW Z0, Z7, Z7 + VPADDW Z0, Z8, Z8 + VPADDW Z0, Z9, Z9 + VPADDW Z0, Z10, Z10 + VPADDW Z0, Z11, Z11 + VPADDW Z0, Z12, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000180, BX + JMP uint16PlusScalarBlockLoop + +uint16PlusScalarTailLoop: + CMPQ BX, $0x00000020 + JL uint16PlusScalarDone + VMOVDQU32 (CX), Z1 + VPADDW Z0, Z1, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000020, BX + JMP uint16PlusScalarTailLoop + +uint16PlusScalarDone: + RET + +// func uint32PlusAvx512Asm(x []uint32, y []uint32, r []uint32) +// Requires: AVX512F +TEXT ·uint32PlusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +uint32PlusBlockLoop: + CMPQ BX, $0x000000c0 + JL uint32PlusTailLoop + VMOVDQU32 (AX), Z0 + VMOVDQU32 64(AX), Z1 + VMOVDQU32 128(AX), Z2 + VMOVDQU32 192(AX), Z3 + VMOVDQU32 256(AX), Z4 + VMOVDQU32 320(AX), Z5 + VMOVDQU32 384(AX), Z6 + VMOVDQU32 448(AX), Z7 + VMOVDQU32 512(AX), Z8 + VMOVDQU32 576(AX), Z9 + VMOVDQU32 640(AX), Z10 + VMOVDQU32 704(AX), Z11 + VPADDD (CX), Z0, Z0 + VPADDD 64(CX), Z1, Z1 + VPADDD 128(CX), Z2, Z2 + VPADDD 192(CX), Z3, Z3 + VPADDD 256(CX), Z4, Z4 + VPADDD 320(CX), Z5, Z5 + VPADDD 384(CX), Z6, Z6 + VPADDD 448(CX), Z7, Z7 + VPADDD 512(CX), Z8, Z8 + VPADDD 576(CX), Z9, Z9 + VPADDD 640(CX), Z10, Z10 + VPADDD 704(CX), Z11, Z11 + VMOVDQU32 Z0, (DX) + VMOVDQU32 Z1, 64(DX) + VMOVDQU32 Z2, 128(DX) + VMOVDQU32 Z3, 192(DX) + VMOVDQU32 Z4, 256(DX) + VMOVDQU32 Z5, 320(DX) + VMOVDQU32 Z6, 384(DX) + VMOVDQU32 Z7, 448(DX) + VMOVDQU32 Z8, 512(DX) + VMOVDQU32 Z9, 576(DX) + VMOVDQU32 Z10, 640(DX) + VMOVDQU32 Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x000000c0, BX + JMP uint32PlusBlockLoop + +uint32PlusTailLoop: + CMPQ BX, $0x00000010 + JL uint32PlusDone + VMOVDQU32 (AX), Z0 + VPADDD (CX), Z0, Z0 + VMOVDQU32 Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000010, BX + JMP uint32PlusTailLoop + +uint32PlusDone: + RET + +// func uint32PlusScalarAvx512Asm(x uint32, y []uint32, r []uint32) +// Requires: AVX512F, SSE2 +TEXT ·uint32PlusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVL x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTD X0, Z0 + +uint32PlusScalarBlockLoop: + CMPQ BX, $0x000000c0 + JL uint32PlusScalarTailLoop + VMOVDQU32 (CX), Z1 + VMOVDQU32 64(CX), Z2 + VMOVDQU32 128(CX), Z3 + VMOVDQU32 192(CX), Z4 + VMOVDQU32 256(CX), Z5 + VMOVDQU32 320(CX), Z6 + VMOVDQU32 384(CX), Z7 + VMOVDQU32 448(CX), Z8 + VMOVDQU32 512(CX), Z9 + VMOVDQU32 576(CX), Z10 + VMOVDQU32 640(CX), Z11 + VMOVDQU32 704(CX), Z12 + VPADDD Z0, Z1, Z1 + VPADDD Z0, Z2, Z2 + VPADDD Z0, Z3, Z3 + VPADDD Z0, Z4, Z4 + VPADDD Z0, Z5, Z5 + VPADDD Z0, Z6, Z6 + VPADDD Z0, Z7, Z7 + VPADDD Z0, Z8, Z8 + VPADDD Z0, Z9, Z9 + VPADDD Z0, Z10, Z10 + VPADDD Z0, Z11, Z11 + VPADDD Z0, Z12, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + 
VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x000000c0, BX + JMP uint32PlusScalarBlockLoop + +uint32PlusScalarTailLoop: + CMPQ BX, $0x00000010 + JL uint32PlusScalarDone + VMOVDQU32 (CX), Z1 + VPADDD Z0, Z1, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000010, BX + JMP uint32PlusScalarTailLoop + +uint32PlusScalarDone: + RET + +// func uint64PlusAvx512Asm(x []uint64, y []uint64, r []uint64) +// Requires: AVX512F +TEXT ·uint64PlusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +uint64PlusBlockLoop: + CMPQ BX, $0x00000060 + JL uint64PlusTailLoop + VMOVDQU32 (AX), Z0 + VMOVDQU32 64(AX), Z1 + VMOVDQU32 128(AX), Z2 + VMOVDQU32 192(AX), Z3 + VMOVDQU32 256(AX), Z4 + VMOVDQU32 320(AX), Z5 + VMOVDQU32 384(AX), Z6 + VMOVDQU32 448(AX), Z7 + VMOVDQU32 512(AX), Z8 + VMOVDQU32 576(AX), Z9 + VMOVDQU32 640(AX), Z10 + VMOVDQU32 704(AX), Z11 + VPADDQ (CX), Z0, Z0 + VPADDQ 64(CX), Z1, Z1 + VPADDQ 128(CX), Z2, Z2 + VPADDQ 192(CX), Z3, Z3 + VPADDQ 256(CX), Z4, Z4 + VPADDQ 320(CX), Z5, Z5 + VPADDQ 384(CX), Z6, Z6 + VPADDQ 448(CX), Z7, Z7 + VPADDQ 512(CX), Z8, Z8 + VPADDQ 576(CX), Z9, Z9 + VPADDQ 640(CX), Z10, Z10 + VPADDQ 704(CX), Z11, Z11 + VMOVDQU32 Z0, (DX) + VMOVDQU32 Z1, 64(DX) + VMOVDQU32 Z2, 128(DX) + VMOVDQU32 Z3, 192(DX) + VMOVDQU32 Z4, 256(DX) + VMOVDQU32 Z5, 320(DX) + VMOVDQU32 Z6, 384(DX) + VMOVDQU32 Z7, 448(DX) + VMOVDQU32 Z8, 512(DX) + VMOVDQU32 Z9, 576(DX) + VMOVDQU32 Z10, 640(DX) + VMOVDQU32 Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000060, BX + JMP uint64PlusBlockLoop + +uint64PlusTailLoop: + CMPQ BX, $0x00000008 + JL uint64PlusDone + VMOVDQU32 (AX), Z0 + VPADDQ (CX), Z0, Z0 + VMOVDQU32 Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000008, BX + JMP uint64PlusTailLoop + +uint64PlusDone: + RET + +// func uint64PlusScalarAvx512Asm(x uint64, y []uint64, r []uint64) +// Requires: AVX512F, SSE2 +TEXT ·uint64PlusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVQ x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVQ AX, X0 + VPBROADCASTQ X0, Z0 + +uint64PlusScalarBlockLoop: + CMPQ BX, $0x00000060 + JL uint64PlusScalarTailLoop + VMOVDQU32 (CX), Z1 + VMOVDQU32 64(CX), Z2 + VMOVDQU32 128(CX), Z3 + VMOVDQU32 192(CX), Z4 + VMOVDQU32 256(CX), Z5 + VMOVDQU32 320(CX), Z6 + VMOVDQU32 384(CX), Z7 + VMOVDQU32 448(CX), Z8 + VMOVDQU32 512(CX), Z9 + VMOVDQU32 576(CX), Z10 + VMOVDQU32 640(CX), Z11 + VMOVDQU32 704(CX), Z12 + VPADDQ Z0, Z1, Z1 + VPADDQ Z0, Z2, Z2 + VPADDQ Z0, Z3, Z3 + VPADDQ Z0, Z4, Z4 + VPADDQ Z0, Z5, Z5 + VPADDQ Z0, Z6, Z6 + VPADDQ Z0, Z7, Z7 + VPADDQ Z0, Z8, Z8 + VPADDQ Z0, Z9, Z9 + VPADDQ Z0, Z10, Z10 + VPADDQ Z0, Z11, Z11 + VPADDQ Z0, Z12, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000060, BX + JMP uint64PlusScalarBlockLoop + +uint64PlusScalarTailLoop: + CMPQ BX, $0x00000008 + JL uint64PlusScalarDone + VMOVDQU32 (CX), Z1 + VPADDQ Z0, Z1, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + 
SUBQ $0x00000008, BX + JMP uint64PlusScalarTailLoop + +uint64PlusScalarDone: + RET + +// func float32PlusAvx512Asm(x []float32, y []float32, r []float32) +// Requires: AVX512F +TEXT ·float32PlusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +float32PlusBlockLoop: + CMPQ BX, $0x000000c0 + JL float32PlusTailLoop + VMOVUPS (AX), Z0 + VMOVUPS 64(AX), Z1 + VMOVUPS 128(AX), Z2 + VMOVUPS 192(AX), Z3 + VMOVUPS 256(AX), Z4 + VMOVUPS 320(AX), Z5 + VMOVUPS 384(AX), Z6 + VMOVUPS 448(AX), Z7 + VMOVUPS 512(AX), Z8 + VMOVUPS 576(AX), Z9 + VMOVUPS 640(AX), Z10 + VMOVUPS 704(AX), Z11 + VADDPS (CX), Z0, Z0 + VADDPS 64(CX), Z1, Z1 + VADDPS 128(CX), Z2, Z2 + VADDPS 192(CX), Z3, Z3 + VADDPS 256(CX), Z4, Z4 + VADDPS 320(CX), Z5, Z5 + VADDPS 384(CX), Z6, Z6 + VADDPS 448(CX), Z7, Z7 + VADDPS 512(CX), Z8, Z8 + VADDPS 576(CX), Z9, Z9 + VADDPS 640(CX), Z10, Z10 + VADDPS 704(CX), Z11, Z11 + VMOVUPS Z0, (DX) + VMOVUPS Z1, 64(DX) + VMOVUPS Z2, 128(DX) + VMOVUPS Z3, 192(DX) + VMOVUPS Z4, 256(DX) + VMOVUPS Z5, 320(DX) + VMOVUPS Z6, 384(DX) + VMOVUPS Z7, 448(DX) + VMOVUPS Z8, 512(DX) + VMOVUPS Z9, 576(DX) + VMOVUPS Z10, 640(DX) + VMOVUPS Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x000000c0, BX + JMP float32PlusBlockLoop + +float32PlusTailLoop: + CMPQ BX, $0x00000010 + JL float32PlusDone + VMOVUPS (AX), Z0 + VADDPS (CX), Z0, Z0 + VMOVUPS Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000010, BX + JMP float32PlusTailLoop + +float32PlusDone: + RET + +// func float32PlusScalarAvx512Asm(x float32, y []float32, r []float32) +// Requires: AVX512F, SSE +TEXT ·float32PlusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVSS x+0(FP), X0 + MOVQ y_base+8(FP), AX + MOVQ r_base+32(FP), CX + MOVQ y_len+16(FP), DX + VBROADCASTSS X0, Z0 + +float32PlusScalarBlockLoop: + CMPQ DX, $0x000000c0 + JL float32PlusScalarTailLoop + VMOVUPS (AX), Z1 + VMOVUPS 64(AX), Z2 + VMOVUPS 128(AX), Z3 + VMOVUPS 192(AX), Z4 + VMOVUPS 256(AX), Z5 + VMOVUPS 320(AX), Z6 + VMOVUPS 384(AX), Z7 + VMOVUPS 448(AX), Z8 + VMOVUPS 512(AX), Z9 + VMOVUPS 576(AX), Z10 + VMOVUPS 640(AX), Z11 + VMOVUPS 704(AX), Z12 + VADDPS Z0, Z1, Z1 + VADDPS Z0, Z2, Z2 + VADDPS Z0, Z3, Z3 + VADDPS Z0, Z4, Z4 + VADDPS Z0, Z5, Z5 + VADDPS Z0, Z6, Z6 + VADDPS Z0, Z7, Z7 + VADDPS Z0, Z8, Z8 + VADDPS Z0, Z9, Z9 + VADDPS Z0, Z10, Z10 + VADDPS Z0, Z11, Z11 + VADDPS Z0, Z12, Z12 + VMOVUPS Z1, (CX) + VMOVUPS Z2, 64(CX) + VMOVUPS Z3, 128(CX) + VMOVUPS Z4, 192(CX) + VMOVUPS Z5, 256(CX) + VMOVUPS Z6, 320(CX) + VMOVUPS Z7, 384(CX) + VMOVUPS Z8, 448(CX) + VMOVUPS Z9, 512(CX) + VMOVUPS Z10, 576(CX) + VMOVUPS Z11, 640(CX) + VMOVUPS Z12, 704(CX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + SUBQ $0x000000c0, DX + JMP float32PlusScalarBlockLoop + +float32PlusScalarTailLoop: + CMPQ DX, $0x00000010 + JL float32PlusScalarDone + VMOVUPS (AX), Z1 + VADDPS Z0, Z1, Z1 + VMOVUPS Z1, (CX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + SUBQ $0x00000010, DX + JMP float32PlusScalarTailLoop + +float32PlusScalarDone: + RET + +// func float64PlusAvx512Asm(x []float64, y []float64, r []float64) +// Requires: AVX512F +TEXT ·float64PlusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +float64PlusBlockLoop: + CMPQ BX, $0x00000060 + JL float64PlusTailLoop + VMOVUPD (AX), Z0 + VMOVUPD 64(AX), Z1 + VMOVUPD 128(AX), Z2 + VMOVUPD 192(AX), Z3 + VMOVUPD 256(AX), Z4 + VMOVUPD 320(AX), Z5 + 
VMOVUPD 384(AX), Z6 + VMOVUPD 448(AX), Z7 + VMOVUPD 512(AX), Z8 + VMOVUPD 576(AX), Z9 + VMOVUPD 640(AX), Z10 + VMOVUPD 704(AX), Z11 + VADDPD (CX), Z0, Z0 + VADDPD 64(CX), Z1, Z1 + VADDPD 128(CX), Z2, Z2 + VADDPD 192(CX), Z3, Z3 + VADDPD 256(CX), Z4, Z4 + VADDPD 320(CX), Z5, Z5 + VADDPD 384(CX), Z6, Z6 + VADDPD 448(CX), Z7, Z7 + VADDPD 512(CX), Z8, Z8 + VADDPD 576(CX), Z9, Z9 + VADDPD 640(CX), Z10, Z10 + VADDPD 704(CX), Z11, Z11 + VMOVUPD Z0, (DX) + VMOVUPD Z1, 64(DX) + VMOVUPD Z2, 128(DX) + VMOVUPD Z3, 192(DX) + VMOVUPD Z4, 256(DX) + VMOVUPD Z5, 320(DX) + VMOVUPD Z6, 384(DX) + VMOVUPD Z7, 448(DX) + VMOVUPD Z8, 512(DX) + VMOVUPD Z9, 576(DX) + VMOVUPD Z10, 640(DX) + VMOVUPD Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000060, BX + JMP float64PlusBlockLoop + +float64PlusTailLoop: + CMPQ BX, $0x00000008 + JL float64PlusDone + VMOVUPD (AX), Z0 + VADDPD (CX), Z0, Z0 + VMOVUPD Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000008, BX + JMP float64PlusTailLoop + +float64PlusDone: + RET + +// func float64PlusScalarAvx512Asm(x float64, y []float64, r []float64) +// Requires: AVX512F, SSE2 +TEXT ·float64PlusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVSD x+0(FP), X0 + MOVQ y_base+8(FP), AX + MOVQ r_base+32(FP), CX + MOVQ y_len+16(FP), DX + VBROADCASTSD X0, Z0 + +float64PlusScalarBlockLoop: + CMPQ DX, $0x00000060 + JL float64PlusScalarTailLoop + VMOVUPD (AX), Z1 + VMOVUPD 64(AX), Z2 + VMOVUPD 128(AX), Z3 + VMOVUPD 192(AX), Z4 + VMOVUPD 256(AX), Z5 + VMOVUPD 320(AX), Z6 + VMOVUPD 384(AX), Z7 + VMOVUPD 448(AX), Z8 + VMOVUPD 512(AX), Z9 + VMOVUPD 576(AX), Z10 + VMOVUPD 640(AX), Z11 + VMOVUPD 704(AX), Z12 + VADDPD Z0, Z1, Z1 + VADDPD Z0, Z2, Z2 + VADDPD Z0, Z3, Z3 + VADDPD Z0, Z4, Z4 + VADDPD Z0, Z5, Z5 + VADDPD Z0, Z6, Z6 + VADDPD Z0, Z7, Z7 + VADDPD Z0, Z8, Z8 + VADDPD Z0, Z9, Z9 + VADDPD Z0, Z10, Z10 + VADDPD Z0, Z11, Z11 + VADDPD Z0, Z12, Z12 + VMOVUPD Z1, (CX) + VMOVUPD Z2, 64(CX) + VMOVUPD Z3, 128(CX) + VMOVUPD Z4, 192(CX) + VMOVUPD Z5, 256(CX) + VMOVUPD Z6, 320(CX) + VMOVUPD Z7, 384(CX) + VMOVUPD Z8, 448(CX) + VMOVUPD Z9, 512(CX) + VMOVUPD Z10, 576(CX) + VMOVUPD Z11, 640(CX) + VMOVUPD Z12, 704(CX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + SUBQ $0x00000060, DX + JMP float64PlusScalarBlockLoop + +float64PlusScalarTailLoop: + CMPQ DX, $0x00000008 + JL float64PlusScalarDone + VMOVUPD (AX), Z1 + VADDPD Z0, Z1, Z1 + VMOVUPD Z1, (CX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + SUBQ $0x00000008, DX + JMP float64PlusScalarTailLoop + +float64PlusScalarDone: + RET diff --git a/pkg/vectorize/add/avx512_stubs.go b/pkg/vectorize/add/avx512_stubs.go new file mode 100644 index 0000000000000000000000000000000000000000..16dc2b9b269c258e8f769409d07a36830e245fa8 --- /dev/null +++ b/pkg/vectorize/add/avx512_stubs.go @@ -0,0 +1,43 @@ +// Code generated by command: go run avx512.go -out plus/avx512.s -stubs plus/avx512_stubs.go. DO NOT EDIT. 
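A hedged test sketch for the float64 AVX-512 kernel above, not included in this diff: it compares float64PlusAvx512Asm against a scalar reference, skips on machines without AVX-512F, and uses a slice length that is a multiple of 8 so the kernel writes every element (no tail handling needed here).

package plus

import (
	"testing"

	"golang.org/x/sys/cpu"
)

func TestFloat64PlusAvx512Asm(t *testing.T) {
	if !cpu.X86.HasAVX512F {
		t.Skip("AVX-512F not available on this machine")
	}
	const n = 128 // multiple of the 8-element tail step, so every slot is written
	x := make([]float64, n)
	y := make([]float64, n)
	r := make([]float64, n)
	for i := range x {
		x[i] = float64(i)
		y[i] = float64(2 * i)
	}
	float64PlusAvx512Asm(x, y, r)
	for i := range r {
		if want := x[i] + y[i]; r[i] != want {
			t.Fatalf("r[%d] = %v, want %v", i, r[i], want)
		}
	}
}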
+ +package plus + +func int8PlusAvx512Asm(x []int8, y []int8, r []int8) + +func int8PlusScalarAvx512Asm(x int8, y []int8, r []int8) + +func int16PlusAvx512Asm(x []int16, y []int16, r []int16) + +func int16PlusScalarAvx512Asm(x int16, y []int16, r []int16) + +func int32PlusAvx512Asm(x []int32, y []int32, r []int32) + +func int32PlusScalarAvx512Asm(x int32, y []int32, r []int32) + +func int64PlusAvx512Asm(x []int64, y []int64, r []int64) + +func int64PlusScalarAvx512Asm(x int64, y []int64, r []int64) + +func uint8PlusAvx512Asm(x []uint8, y []uint8, r []uint8) + +func uint8PlusScalarAvx512Asm(x uint8, y []uint8, r []uint8) + +func uint16PlusAvx512Asm(x []uint16, y []uint16, r []uint16) + +func uint16PlusScalarAvx512Asm(x uint16, y []uint16, r []uint16) + +func uint32PlusAvx512Asm(x []uint32, y []uint32, r []uint32) + +func uint32PlusScalarAvx512Asm(x uint32, y []uint32, r []uint32) + +func uint64PlusAvx512Asm(x []uint64, y []uint64, r []uint64) + +func uint64PlusScalarAvx512Asm(x uint64, y []uint64, r []uint64) + +func float32PlusAvx512Asm(x []float32, y []float32, r []float32) + +func float32PlusScalarAvx512Asm(x float32, y []float32, r []float32) + +func float64PlusAvx512Asm(x []float64, y []float64, r []float64) + +func float64PlusScalarAvx512Asm(x float64, y []float64, r []float64) diff --git a/pkg/vectorize/minus/minus.go b/pkg/vectorize/minus/minus.go deleted file mode 100644 index 7008f2715fd50989a40886e523822bed27826e04..0000000000000000000000000000000000000000 --- a/pkg/vectorize/minus/minus.go +++ /dev/null @@ -1,87 +0,0 @@ -package minus - -var ( - i64MinusOne func(int64, []int64, []int64) []int64 - i64MinusOneBy func(int64, []int64, []int64) []int64 - i64Minus func([]int64, []int64, []int64) []int64 - - f64MinusOne func(float64, []float64, []float64) []float64 - f64MinusOneBy func(float64, []float64, []float64) []float64 - f64Minus func([]float64, []float64, []float64) []float64 -) - -func init() { - i64Minus = i64MinusPure - i64MinusOne = i64MinusOnePure - i64MinusOneBy = i64MinusOneByPure - - f64Minus = f64MinusPure - f64MinusOne = f64MinusOnePure - i64MinusOneBy = i64MinusOneByPure -} - -func I64Minus(xs, ys, rs []int64) []int64 { - return i64Minus(xs, ys, rs) -} - -func I64MinusOne(x int64, ys, rs []int64) []int64 { - return i64MinusOnePure(x, ys, rs) -} - -func I64MinusOneBy(x int64, ys, rs []int64) []int64 { - return i64MinusOneByPure(x, ys, rs) -} - -func F64Minus(xs, ys, rs []float64) []float64 { - return f64Minus(xs, ys, rs) -} - -func F64MinusOne(x float64, ys, rs []float64) []float64 { - return f64MinusOnePure(x, ys, rs) -} - -func F64MinusOneBy(x float64, ys, rs []float64) []float64 { - return f64MinusOneByPure(x, ys, rs) -} - -func i64MinusPure(xs, ys, rs []int64) []int64 { - for i, x := range xs { - rs[i] = x - ys[i] - } - return rs -} - -func i64MinusOnePure(x int64, ys, rs []int64) []int64 { - for i, y := range ys { - rs[i] = x - y - } - return rs -} - -func i64MinusOneByPure(x int64, ys, rs []int64) []int64 { - for i, y := range rs { - rs[i] = y - x - } - return rs -} - -func f64MinusPure(xs, ys, rs []float64) []float64 { - for i, x := range xs { - rs[i] = x - ys[i] - } - return rs -} - -func f64MinusOnePure(x float64, ys, rs []float64) []float64 { - for i, y := range ys { - rs[i] = x - y - } - return rs -} - -func f64MinusOneByPure(x float64, ys, rs []float64) []float64 { - for i, y := range ys { - rs[i] = y - x - } - return rs -} diff --git a/pkg/vectorize/plus/fplus_amd64.go b/pkg/vectorize/plus/fplus_amd64.go deleted file mode 100644 index 
ecbe40ef8878b7109f941061a89159d2d863e444..0000000000000000000000000000000000000000 --- a/pkg/vectorize/plus/fplus_amd64.go +++ /dev/null @@ -1,69 +0,0 @@ -// +build ignore -//go:generate go run fplus_amd64.go -out fplus_amd64.s - -package main - -import ( - . "github.com/mmcloughlin/avo/build" - . "github.com/mmcloughlin/avo/operand" - . "github.com/mmcloughlin/avo/reg" -) - -var unroll = 6 - -func main() { - TEXT("fPlusAvx", NOSPLIT, "func(x, y, r []float64)") - x := Mem{Base: Load(Param("x").Base(), GP64())} - y := Mem{Base: Load(Param("y").Base(), GP64())} - r := Mem{Base: Load(Param("r").Base(), GP64())} - n := Load(Param("x").Len(), GP64()) - - blockitems := 4 * unroll - blocksize := 8 * blockitems - Label("blockloop") - CMPQ(n, U32(blockitems)) - JL(LabelRef("tailloop")) - - // Load x. - xs := make([]VecVirtual, unroll) - for i := 0; i < unroll; i++ { - xs[i] = YMM() - } - - for i := 0; i < unroll; i++ { - VMOVUPD(x.Offset(32*i), xs[i]) - } - - for i := 0; i < unroll; i++ { - VADDPD(y.Offset(32*i), xs[i], xs[i]) - } - - for i := 0; i < unroll; i++ { - VMOVUPD(xs[i], r.Offset(32*i)) - } - - ADDQ(U32(blocksize), x.Base) - ADDQ(U32(blocksize), y.Base) - ADDQ(U32(blocksize), r.Base) - SUBQ(U32(blockitems), n) - JMP(LabelRef("blockloop")) - - Label("tailloop") - CMPQ(n, U32(4)) - JL(LabelRef("done")) - - VMOVUPD(x.Offset(0), xs[0]) - VADDPD(y.Offset(0), xs[0], xs[0]) - VMOVUPD(xs[0], r.Offset(0)) - - ADDQ(U32(32), x.Base) - ADDQ(U32(32), y.Base) - ADDQ(U32(32), r.Base) - SUBQ(U32(4), n) - JMP(LabelRef("tailloop")) - - Label("done") - RET() - - Generate() -} diff --git a/pkg/vectorize/plus/fplus_amd64.s b/pkg/vectorize/plus/fplus_amd64.s deleted file mode 100644 index 617cd2a0253924e30e5531c865cf3f613ab028d0..0000000000000000000000000000000000000000 --- a/pkg/vectorize/plus/fplus_amd64.s +++ /dev/null @@ -1,53 +0,0 @@ -// Code generated by command: go run fplus_amd64.go -out fplus_amd64.s. DO NOT EDIT. - -#include "textflag.h" - -// func fPlusAvx(x []float64, y []float64, r []float64) -// Requires: AVX -TEXT ·fPlusAvx(SB), NOSPLIT, $0-72 - MOVQ x_base+0(FP), AX - MOVQ y_base+24(FP), CX - MOVQ r_base+48(FP), DX - MOVQ x_len+8(FP), BX - -blockloop: - CMPQ BX, $0x00000018 - JL tailloop - VMOVUPD (AX), Y0 - VMOVUPD 32(AX), Y1 - VMOVUPD 64(AX), Y2 - VMOVUPD 96(AX), Y3 - VMOVUPD 128(AX), Y4 - VMOVUPD 160(AX), Y5 - VADDPD (CX), Y0, Y0 - VADDPD 32(CX), Y1, Y1 - VADDPD 64(CX), Y2, Y2 - VADDPD 96(CX), Y3, Y3 - VADDPD 128(CX), Y4, Y4 - VADDPD 160(CX), Y5, Y5 - VMOVUPD Y0, (DX) - VMOVUPD Y1, 32(DX) - VMOVUPD Y2, 64(DX) - VMOVUPD Y3, 96(DX) - VMOVUPD Y4, 128(DX) - VMOVUPD Y5, 160(DX) - ADDQ $0x000000c0, AX - ADDQ $0x000000c0, CX - ADDQ $0x000000c0, DX - SUBQ $0x00000018, BX - JMP blockloop - -tailloop: - CMPQ BX, $0x00000004 - JL done - VMOVUPD (AX), Y0 - VADDPD (CX), Y0, Y0 - VMOVUPD Y0, (DX) - ADDQ $0x00000020, AX - ADDQ $0x00000020, CX - ADDQ $0x00000020, DX - SUBQ $0x00000004, BX - JMP tailloop - -done: - RET diff --git a/pkg/vectorize/plus/fplusone_amd64.go b/pkg/vectorize/plus/fplusone_amd64.go deleted file mode 100644 index 8bbad967ea0a3aa00e48d9b8dd1ae41135e5e753..0000000000000000000000000000000000000000 --- a/pkg/vectorize/plus/fplusone_amd64.go +++ /dev/null @@ -1,67 +0,0 @@ -// +build ignore -//go:generate go run fplusone_amd64.go -out fplusone_amd64.s - -package main - -import ( - . "github.com/mmcloughlin/avo/build" - . "github.com/mmcloughlin/avo/operand" - . 
"github.com/mmcloughlin/avo/reg" -) - -var unroll = 6 - -func main() { - TEXT("fPlusOneAvx", NOSPLIT, "func(x float64, y, r []float64)") - x := Load(Param("x"), XMM()) - y := Mem{Base: Load(Param("y").Base(), GP64())} - r := Mem{Base: Load(Param("r").Base(), GP64())} - n := Load(Param("y").Len(), GP64()) - - xs := YMM() - VBROADCASTSD(x, xs) - - // Loop over blocks and process them with vector instructions. - blockitems := 4 * unroll - blocksize := 8 * blockitems - Label("blockloop") - CMPQ(n, U32(blockitems)) - JL(LabelRef("tailloop")) - - // Load x. - rs := make([]VecVirtual, unroll) - for i := 0; i < unroll; i++ { - rs[i] = YMM() - } - - for i := 0; i < unroll; i++ { - VADDPD(y.Offset(32*i), xs, rs[i]) - } - - for i := 0; i < unroll; i++ { - VMOVUPD(rs[i], r.Offset(32*i)) - } - - ADDQ(U32(blocksize), y.Base) - ADDQ(U32(blocksize), r.Base) - SUBQ(U32(blockitems), n) - JMP(LabelRef("blockloop")) - - // Process any trailing entries. - Label("tailloop") - CMPQ(n, U32(4)) - JL(LabelRef("done")) - - VADDPD(y.Offset(0), xs, rs[0]) - VMOVUPD(rs[0], r.Offset(0)) - - ADDQ(U32(32), y.Base) - ADDQ(U32(32), r.Base) - SUBQ(U32(4), n) - JMP(LabelRef("tailloop")) - - Label("done") - RET() - - Generate() -} diff --git a/pkg/vectorize/plus/fplusone_amd64.s b/pkg/vectorize/plus/fplusone_amd64.s deleted file mode 100644 index 564c863e09bcf4013ac38772edb91144a56a1820..0000000000000000000000000000000000000000 --- a/pkg/vectorize/plus/fplusone_amd64.s +++ /dev/null @@ -1,45 +0,0 @@ -// Code generated by command: go run fplusone_amd64.go -out fplusone_amd64.s. DO NOT EDIT. - -#include "textflag.h" - -// func fPlusOneAvx(x float64, y []float64, r []float64) -// Requires: AVX, AVX2, SSE2 -TEXT ·fPlusOneAvx(SB), NOSPLIT, $0-56 - MOVSD x+0(FP), X0 - MOVQ y_base+8(FP), AX - MOVQ r_base+32(FP), CX - MOVQ y_len+16(FP), DX - VBROADCASTSD X0, Y0 - -blockloop: - CMPQ DX, $0x00000018 - JL tailloop - VADDPD (AX), Y0, Y1 - VADDPD 32(AX), Y0, Y2 - VADDPD 64(AX), Y0, Y3 - VADDPD 96(AX), Y0, Y4 - VADDPD 128(AX), Y0, Y5 - VADDPD 160(AX), Y0, Y6 - VMOVUPD Y1, (CX) - VMOVUPD Y2, 32(CX) - VMOVUPD Y3, 64(CX) - VMOVUPD Y4, 96(CX) - VMOVUPD Y5, 128(CX) - VMOVUPD Y6, 160(CX) - ADDQ $0x000000c0, AX - ADDQ $0x000000c0, CX - SUBQ $0x00000018, DX - JMP blockloop - -tailloop: - CMPQ DX, $0x00000004 - JL done - VADDPD (AX), Y0, Y1 - VMOVUPD Y1, (CX) - ADDQ $0x00000020, AX - ADDQ $0x00000020, CX - SUBQ $0x00000004, DX - JMP tailloop - -done: - RET diff --git a/pkg/vectorize/plus/iplus_amd64.go b/pkg/vectorize/plus/iplus_amd64.go deleted file mode 100644 index 9a477d9ed73ea2182c2f541545272f063d61d764..0000000000000000000000000000000000000000 --- a/pkg/vectorize/plus/iplus_amd64.go +++ /dev/null @@ -1,69 +0,0 @@ -// +build ignore -//go:generate go run iplus_amd64.go -out iplus_amd64.s - -package main - -import ( - . "github.com/mmcloughlin/avo/build" - . "github.com/mmcloughlin/avo/operand" - . "github.com/mmcloughlin/avo/reg" -) - -var unroll = 6 - -func main() { - TEXT("iPlusAvx", NOSPLIT, "func(x, y, r []int64)") - x := Mem{Base: Load(Param("x").Base(), GP64())} - y := Mem{Base: Load(Param("y").Base(), GP64())} - r := Mem{Base: Load(Param("r").Base(), GP64())} - n := Load(Param("x").Len(), GP64()) - - blockitems := 4 * unroll - blocksize := 8 * blockitems - Label("blockloop") - CMPQ(n, U32(blockitems)) - JL(LabelRef("tailloop")) - - // Load x. 
- xs := make([]VecVirtual, unroll) - for i := 0; i < unroll; i++ { - xs[i] = YMM() - } - - for i := 0; i < unroll; i++ { - VMOVDQU(x.Offset(32*i), xs[i]) - } - - for i := 0; i < unroll; i++ { - VPADDQ(y.Offset(32*i), xs[i], xs[i]) - } - - for i := 0; i < unroll; i++ { - VMOVDQU(xs[i], r.Offset(32*i)) - } - - ADDQ(U32(blocksize), x.Base) - ADDQ(U32(blocksize), y.Base) - ADDQ(U32(blocksize), r.Base) - SUBQ(U32(blockitems), n) - JMP(LabelRef("blockloop")) - - Label("tailloop") - CMPQ(n, U32(4)) - JL(LabelRef("done")) - - VMOVDQU(x.Offset(0), xs[0]) - VPADDQ(y.Offset(0), xs[0], xs[0]) - VMOVDQU(xs[0], r.Offset(0)) - - ADDQ(U32(32), x.Base) - ADDQ(U32(32), y.Base) - ADDQ(U32(32), r.Base) - SUBQ(U32(4), n) - JMP(LabelRef("tailloop")) - - Label("done") - RET() - - Generate() -} diff --git a/pkg/vectorize/plus/iplus_amd64.s b/pkg/vectorize/plus/iplus_amd64.s deleted file mode 100644 index a5a6f8c680634e4bf53e4f449d7552879bce687d..0000000000000000000000000000000000000000 --- a/pkg/vectorize/plus/iplus_amd64.s +++ /dev/null @@ -1,53 +0,0 @@ -// Code generated by command: go run iplus_amd64.go -out iplus_amd64.s. DO NOT EDIT. - -#include "textflag.h" - -// func iPlusAvx(x []int64, y []int64, r []int64) -// Requires: AVX, AVX2 -TEXT ·iPlusAvx(SB), NOSPLIT, $0-72 - MOVQ x_base+0(FP), AX - MOVQ y_base+24(FP), CX - MOVQ r_base+48(FP), DX - MOVQ x_len+8(FP), BX - -blockloop: - CMPQ BX, $0x00000018 - JL tailloop - VMOVDQU (AX), Y0 - VMOVDQU 32(AX), Y1 - VMOVDQU 64(AX), Y2 - VMOVDQU 96(AX), Y3 - VMOVDQU 128(AX), Y4 - VMOVDQU 160(AX), Y5 - VPADDQ (CX), Y0, Y0 - VPADDQ 32(CX), Y1, Y1 - VPADDQ 64(CX), Y2, Y2 - VPADDQ 96(CX), Y3, Y3 - VPADDQ 128(CX), Y4, Y4 - VPADDQ 160(CX), Y5, Y5 - VMOVDQU Y0, (DX) - VMOVDQU Y1, 32(DX) - VMOVDQU Y2, 64(DX) - VMOVDQU Y3, 96(DX) - VMOVDQU Y4, 128(DX) - VMOVDQU Y5, 160(DX) - ADDQ $0x000000c0, AX - ADDQ $0x000000c0, CX - ADDQ $0x000000c0, DX - SUBQ $0x00000018, BX - JMP blockloop - -tailloop: - CMPQ BX, $0x00000004 - JL done - VMOVDQU (AX), Y0 - VPADDQ (CX), Y0, Y0 - VMOVDQU Y0, (DX) - ADDQ $0x00000020, AX - ADDQ $0x00000020, CX - ADDQ $0x00000020, DX - SUBQ $0x00000004, BX - JMP tailloop - -done: - RET diff --git a/pkg/vectorize/plus/iplusone_amd64.go b/pkg/vectorize/plus/iplusone_amd64.go deleted file mode 100644 index 3c19371df71fee66896cc00e466b9f4cb8a9a437..0000000000000000000000000000000000000000 --- a/pkg/vectorize/plus/iplusone_amd64.go +++ /dev/null @@ -1,66 +0,0 @@ -// +build ignore -//go:generate go run iplusone_amd64.go -out iplusone_amd64.s - -package main - -import ( - . "github.com/mmcloughlin/avo/build" - . "github.com/mmcloughlin/avo/operand" - . 
"github.com/mmcloughlin/avo/reg" -) - -var unroll = 6 - -func main() { - TEXT("iPlusOneAvx", NOSPLIT, "func(x int64, y, r []int64)") - x := Load(Param("x"), GP64()) - y := Mem{Base: Load(Param("y").Base(), GP64())} - r := Mem{Base: Load(Param("r").Base(), GP64())} - n := Load(Param("y").Len(), GP64()) - - xs1 := XMM() - MOVQ(x, xs1) - xs := YMM() - VPBROADCASTQ(xs1, xs) - - blockitems := 4 * unroll - blocksize := 8 * blockitems - Label("blockloop") - CMPQ(n, U32(blockitems)) - JL(LabelRef("tailloop")) - - rs := make([]VecVirtual, unroll) - for i := 0; i < unroll; i++ { - rs[i] = YMM() - } - - for i := 0; i < unroll; i++ { - VPADDQ(y.Offset(32*i), xs, rs[i]) - } - - for i := 0; i < unroll; i++ { - VMOVDQU(rs[i], r.Offset(32*i)) - } - - ADDQ(U32(blocksize), y.Base) - ADDQ(U32(blocksize), r.Base) - SUBQ(U32(blockitems), n) - JMP(LabelRef("blockloop")) - - Label("tailloop") - CMPQ(n, U32(4)) - JL(LabelRef("done")) - - VPADDQ(y.Offset(0), xs, rs[0]) - VMOVDQU(rs[0], r.Offset(0)) - - ADDQ(U32(32), y.Base) - ADDQ(U32(32), r.Base) - SUBQ(U32(4), n) - JMP(LabelRef("tailloop")) - - Label("done") - RET() - - Generate() -} diff --git a/pkg/vectorize/plus/iplusone_amd64.s b/pkg/vectorize/plus/iplusone_amd64.s deleted file mode 100644 index f2b5ab1e222504fc54b341e86386248a4a4c75da..0000000000000000000000000000000000000000 --- a/pkg/vectorize/plus/iplusone_amd64.s +++ /dev/null @@ -1,46 +0,0 @@ -// Code generated by command: go run iplusone_amd64.go -out iplusone_amd64.s. DO NOT EDIT. - -#include "textflag.h" - -// func iPlusOneAvx(x int64, y []int64, r []int64) -// Requires: AVX, AVX2, SSE2 -TEXT ·iPlusOneAvx(SB), NOSPLIT, $0-56 - MOVQ x+0(FP), AX - MOVQ y_base+8(FP), CX - MOVQ r_base+32(FP), DX - MOVQ y_len+16(FP), BX - MOVQ AX, X0 - VPBROADCASTQ X0, Y0 - -blockloop: - CMPQ BX, $0x00000018 - JL tailloop - VPADDQ (CX), Y0, Y1 - VPADDQ 32(CX), Y0, Y2 - VPADDQ 64(CX), Y0, Y3 - VPADDQ 96(CX), Y0, Y4 - VPADDQ 128(CX), Y0, Y5 - VPADDQ 160(CX), Y0, Y6 - VMOVDQU Y1, (DX) - VMOVDQU Y2, 32(DX) - VMOVDQU Y3, 64(DX) - VMOVDQU Y4, 96(DX) - VMOVDQU Y5, 128(DX) - VMOVDQU Y6, 160(DX) - ADDQ $0x000000c0, CX - ADDQ $0x000000c0, DX - SUBQ $0x00000018, BX - JMP blockloop - -tailloop: - CMPQ BX, $0x00000004 - JL done - VPADDQ (CX), Y0, Y1 - VMOVDQU Y1, (DX) - ADDQ $0x00000020, CX - ADDQ $0x00000020, DX - SUBQ $0x00000004, BX - JMP tailloop - -done: - RET diff --git a/pkg/vectorize/plus/plus.go b/pkg/vectorize/plus/plus.go deleted file mode 100644 index e8421f0118e305a9f82f932a6f773f660ccb931d..0000000000000000000000000000000000000000 --- a/pkg/vectorize/plus/plus.go +++ /dev/null @@ -1,112 +0,0 @@ -package plus - -import "matrixbase/pkg/internal/cpu" - -var ( - i64PlusOne func(int64, []int64, []int64) []int64 - i64Plus func([]int64, []int64, []int64) []int64 - - f64PlusOne func(float64, []float64, []float64) []float64 - f64Plus func([]float64, []float64, []float64) []float64 -) - -func init() { - if cpu.X86.HasAVX2 { - i64Plus = i64PlusAvx - i64PlusOne = i64PlusOneAvx - - f64Plus = f64PlusAvx - f64PlusOne = f64PlusOneAvx - } else { - i64Plus = i64PlusPure - i64PlusOne = i64PlusOnePure - - f64Plus = f64PlusPure - f64PlusOne = f64PlusOnePure - } -} - -func iPlusAvx([]int64, []int64, []int64) -func iPlusOneAvx(int64, []int64, []int64) -func fPlusAvx([]float64, []float64, []float64) -func fPlusOneAvx(float64, []float64, []float64) - -func I64Plus(xs, ys, rs []int64) []int64 { - return i64Plus(xs, ys, rs) -} - -func I64PlusOne(x int64, ys, rs []int64) []int64 { - return i64PlusOnePure(x, ys, rs) -} - -func F64Plus(xs, ys, rs 
[]float64) []float64 { - return f64Plus(xs, ys, rs) -} - -func F64PlusOne(x float64, ys, rs []float64) []float64 { - return f64PlusOnePure(x, ys, rs) -} - -func i64PlusAvx(xs, ys, rs []int64) []int64 { - n := len(xs) / 4 - iPlusAvx(xs[:n*4], ys[:n*4], rs[:n*4]) - for i, j := n*4, len(xs); i < j; i++ { - rs[i] = xs[i] + ys[i] - } - return rs -} - -func i64PlusOneAvx(x int64, ys, rs []int64) []int64 { - n := len(ys) / 4 - iPlusOneAvx(x, ys[:n*4], rs[:n*4]) - for i, j := n*4, len(ys); i < j; i++ { - rs[i] = x + ys[i] - } - return rs -} - -func f64PlusAvx(xs, ys, rs []float64) []float64 { - n := len(xs) / 4 - fPlusAvx(xs[:n*4], ys[:n*4], rs[:n*4]) - for i, j := n*4, len(xs); i < j; i++ { - rs[i] = xs[i] + ys[i] - } - return rs -} - -func f64PlusOneAvx(x float64, ys, rs []float64) []float64 { - n := len(ys) / 4 - fPlusOneAvx(x, ys[:n*4], rs[:n*4]) - for i, j := n*4, len(ys); i < j; i++ { - rs[i] = x + ys[i] - } - return rs -} - -func i64PlusPure(xs, ys, rs []int64) []int64 { - for i, x := range xs { - rs[i] = x + ys[i] - } - return rs -} - -func i64PlusOnePure(x int64, ys, rs []int64) []int64 { - for i, y := range ys { - rs[i] = x + y - } - return rs -} - -func f64PlusPure(xs, ys, rs []float64) []float64 { - for i, x := range xs { - rs[i] = x + ys[i] - } - return rs -} - -func f64PlusOnePure(x float64, ys, rs []float64) []float64 { - for i, y := range ys { - rs[i] = x + y - } - return rs -} diff --git a/pkg/vectorize/plus/plus_test.go b/pkg/vectorize/plus/plus_test.go deleted file mode 100644 index 2b71ee2057d36c17c4b29bc095011c1b5c70d1f2..0000000000000000000000000000000000000000 --- a/pkg/vectorize/plus/plus_test.go +++ /dev/null @@ -1,36 +0,0 @@ -package plus - -import ( - "fmt" - "testing" -) - -func makeIbuffer(l int) []int64 { - buf := make([]int64, l) - for i := range buf { - buf[i] = int64(i) - } - return buf -} - -func makeFbuffer(l int) []float64 { - buf := make([]float64, l) - for i := range buf { - buf[i] = float64(i) - } - return buf -} - -func TestF64Plus(t *testing.T) { - xs := makeFbuffer(13) - res := make([]float64, 13) - fmt.Printf("sum:\n\t%v\n", f64PlusAvx(xs, xs, res)) - fmt.Printf("pure sum:\n\t%v\n", f64PlusPure(xs, xs, res)) -} - -func TestI64Plus(t *testing.T) { - xs := makeIbuffer(100) - res := make([]int64, 50) - fmt.Printf("sum: %v\n", i64PlusAvx(xs[:50], xs[50:], res)) - fmt.Printf("pure sum: %v\n", i64PlusPure(xs[:50], xs[50:], res)) -} diff --git a/pkg/vectorize/sub/avx2.s b/pkg/vectorize/sub/avx2.s new file mode 100644 index 0000000000000000000000000000000000000000..924ed136f8ba7edf2c14dda31e4071c96071e0db --- /dev/null +++ b/pkg/vectorize/sub/avx2.s @@ -0,0 +1,1429 @@ +// Code generated by command: go run avx2.go -out sub/avx2.s -stubs sub/avx2_stubs.go. DO NOT EDIT. 
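With plus.go and minus.go removed, the CPU-feature dispatch they carried (cpu.X86.HasAVX2 choosing between the AVX kernel and a pure-Go loop) presumably reappears in the new per-type packages; that selection code is not part of this hunk. A sketch of what it could look like for int64 subtraction, using the int64SubAvx2Asm stub declared further down in this diff; the Int64Sub entry point, int64SubAvx2, and int64SubPure are hypothetical names:

package vectorize // package name as declared in the generated sub stubs

import "matrixbase/pkg/internal/cpu"

var int64Sub func(x, y, r []int64) []int64

func init() {
	if cpu.X86.HasAVX2 {
		int64Sub = int64SubAvx2
	} else {
		int64Sub = int64SubPure
	}
}

// Int64Sub is the assumed exported entry point: r[i] = x[i] - y[i].
func Int64Sub(x, y, r []int64) []int64 { return int64Sub(x, y, r) }

// int64SubAvx2 hands the 4-element-aligned prefix to the AVX2 kernel and
// finishes the remainder in Go, mirroring the deleted i64PlusAvx wrapper.
func int64SubAvx2(x, y, r []int64) []int64 {
	n := len(x) / 4 * 4
	int64SubAvx2Asm(x[:n], y[:n], r[:n])
	for i := n; i < len(x); i++ {
		r[i] = x[i] - y[i]
	}
	return r
}

// int64SubPure is the scalar fallback, equivalent to the deleted i64MinusPure.
func int64SubPure(x, y, r []int64) []int64 {
	for i := range x {
		r[i] = x[i] - y[i]
	}
	return r
}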
+ +#include "textflag.h" + +// func int8SubAvx2Asm(x []int8, y []int8, r []int8) +// Requires: AVX, AVX2 +TEXT ·int8SubAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +int8SubBlockLoop: + CMPQ BX, $0x000000c0 + JL int8SubTailLoop + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VMOVDQU 128(AX), Y4 + VMOVDQU 160(AX), Y5 + VPSUBB (CX), Y0, Y0 + VPSUBB 32(CX), Y1, Y1 + VPSUBB 64(CX), Y2, Y2 + VPSUBB 96(CX), Y3, Y3 + VPSUBB 128(CX), Y4, Y4 + VPSUBB 160(CX), Y5, Y5 + VMOVDQU Y0, (DX) + VMOVDQU Y1, 32(DX) + VMOVDQU Y2, 64(DX) + VMOVDQU Y3, 96(DX) + VMOVDQU Y4, 128(DX) + VMOVDQU Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x000000c0, BX + JMP int8SubBlockLoop + +int8SubTailLoop: + CMPQ BX, $0x00000020 + JL int8SubDone + VMOVDQU (AX), Y0 + VPSUBB (CX), Y0, Y0 + VMOVDQU Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000020, BX + JMP int8SubTailLoop + +int8SubDone: + RET + +// func int8SubScalarAvx2Asm(x int8, y []int8, r []int8) +// Requires: AVX, AVX2, SSE2 +TEXT ·int8SubScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVBLSX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTB X0, Y0 + +int8SubScalarBlockLoop: + CMPQ BX, $0x000000c0 + JL int8SubScalarTailLoop + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSUBB Y0, Y1, Y1 + VPSUBB Y0, Y2, Y2 + VPSUBB Y0, Y3, Y3 + VPSUBB Y0, Y4, Y4 + VPSUBB Y0, Y5, Y5 + VPSUBB Y0, Y6, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x000000c0, BX + JMP int8SubScalarBlockLoop + +int8SubScalarTailLoop: + CMPQ BX, $0x00000020 + JL int8SubScalarDone + VMOVDQU (CX), Y1 + VPSUBB Y0, Y1, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000020, BX + JMP int8SubScalarTailLoop + +int8SubScalarDone: + RET + +// func int8SubByScalarAvx2Asm(x int8, y []int8, r []int8) +// Requires: AVX, AVX2, SSE2 +TEXT ·int8SubByScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVBLSX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTB X0, Y0 + +int8SubByScalarBlockLoop: + CMPQ BX, $0x000000c0 + JL int8SubByScalarTailLoop + VPSUBB (CX), Y0, Y1 + VPSUBB 32(CX), Y0, Y2 + VPSUBB 64(CX), Y0, Y3 + VPSUBB 96(CX), Y0, Y4 + VPSUBB 128(CX), Y0, Y5 + VPSUBB 160(CX), Y0, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x000000c0, BX + JMP int8SubByScalarBlockLoop + +int8SubByScalarTailLoop: + CMPQ BX, $0x00000020 + JL int8SubByScalarDone + VPSUBB (CX), Y0, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000020, BX + JMP int8SubByScalarTailLoop + +int8SubByScalarDone: + RET + +// func int16SubAvx2Asm(x []int16, y []int16, r []int16) +// Requires: AVX, AVX2 +TEXT ·int16SubAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +int16SubBlockLoop: + CMPQ BX, $0x00000060 + JL int16SubTailLoop + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VMOVDQU 128(AX), Y4 + VMOVDQU 160(AX), Y5 + VPSUBW (CX), Y0, 
Y0 + VPSUBW 32(CX), Y1, Y1 + VPSUBW 64(CX), Y2, Y2 + VPSUBW 96(CX), Y3, Y3 + VPSUBW 128(CX), Y4, Y4 + VPSUBW 160(CX), Y5, Y5 + VMOVDQU Y0, (DX) + VMOVDQU Y1, 32(DX) + VMOVDQU Y2, 64(DX) + VMOVDQU Y3, 96(DX) + VMOVDQU Y4, 128(DX) + VMOVDQU Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000060, BX + JMP int16SubBlockLoop + +int16SubTailLoop: + CMPQ BX, $0x00000010 + JL int16SubDone + VMOVDQU (AX), Y0 + VPSUBW (CX), Y0, Y0 + VMOVDQU Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000010, BX + JMP int16SubTailLoop + +int16SubDone: + RET + +// func int16SubScalarAvx2Asm(x int16, y []int16, r []int16) +// Requires: AVX, AVX2, SSE2 +TEXT ·int16SubScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVWLSX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTW X0, Y0 + +int16SubScalarBlockLoop: + CMPQ BX, $0x00000060 + JL int16SubScalarTailLoop + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSUBW Y0, Y1, Y1 + VPSUBW Y0, Y2, Y2 + VPSUBW Y0, Y3, Y3 + VPSUBW Y0, Y4, Y4 + VPSUBW Y0, Y5, Y5 + VPSUBW Y0, Y6, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000060, BX + JMP int16SubScalarBlockLoop + +int16SubScalarTailLoop: + CMPQ BX, $0x00000010 + JL int16SubScalarDone + VMOVDQU (CX), Y1 + VPSUBW Y0, Y1, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000010, BX + JMP int16SubScalarTailLoop + +int16SubScalarDone: + RET + +// func int16SubByScalarAvx2Asm(x int16, y []int16, r []int16) +// Requires: AVX, AVX2, SSE2 +TEXT ·int16SubByScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVWLSX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTW X0, Y0 + +int16SubByScalarBlockLoop: + CMPQ BX, $0x00000060 + JL int16SubByScalarTailLoop + VPSUBW (CX), Y0, Y1 + VPSUBW 32(CX), Y0, Y2 + VPSUBW 64(CX), Y0, Y3 + VPSUBW 96(CX), Y0, Y4 + VPSUBW 128(CX), Y0, Y5 + VPSUBW 160(CX), Y0, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000060, BX + JMP int16SubByScalarBlockLoop + +int16SubByScalarTailLoop: + CMPQ BX, $0x00000010 + JL int16SubByScalarDone + VPSUBW (CX), Y0, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000010, BX + JMP int16SubByScalarTailLoop + +int16SubByScalarDone: + RET + +// func int32SubAvx2Asm(x []int32, y []int32, r []int32) +// Requires: AVX, AVX2 +TEXT ·int32SubAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +int32SubBlockLoop: + CMPQ BX, $0x00000030 + JL int32SubTailLoop + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VMOVDQU 128(AX), Y4 + VMOVDQU 160(AX), Y5 + VPSUBD (CX), Y0, Y0 + VPSUBD 32(CX), Y1, Y1 + VPSUBD 64(CX), Y2, Y2 + VPSUBD 96(CX), Y3, Y3 + VPSUBD 128(CX), Y4, Y4 + VPSUBD 160(CX), Y5, Y5 + VMOVDQU Y0, (DX) + VMOVDQU Y1, 32(DX) + VMOVDQU Y2, 64(DX) + VMOVDQU Y3, 96(DX) + VMOVDQU Y4, 128(DX) + VMOVDQU Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000030, BX + JMP int32SubBlockLoop + +int32SubTailLoop: + CMPQ BX, $0x00000008 + JL 
int32SubDone + VMOVDQU (AX), Y0 + VPSUBD (CX), Y0, Y0 + VMOVDQU Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000008, BX + JMP int32SubTailLoop + +int32SubDone: + RET + +// func int32SubScalarAvx2Asm(x int32, y []int32, r []int32) +// Requires: AVX, AVX2, SSE2 +TEXT ·int32SubScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVL x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTD X0, Y0 + +int32SubScalarBlockLoop: + CMPQ BX, $0x00000030 + JL int32SubScalarTailLoop + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSUBD Y0, Y1, Y1 + VPSUBD Y0, Y2, Y2 + VPSUBD Y0, Y3, Y3 + VPSUBD Y0, Y4, Y4 + VPSUBD Y0, Y5, Y5 + VPSUBD Y0, Y6, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000030, BX + JMP int32SubScalarBlockLoop + +int32SubScalarTailLoop: + CMPQ BX, $0x00000008 + JL int32SubScalarDone + VMOVDQU (CX), Y1 + VPSUBD Y0, Y1, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000008, BX + JMP int32SubScalarTailLoop + +int32SubScalarDone: + RET + +// func int32SubByScalarAvx2Asm(x int32, y []int32, r []int32) +// Requires: AVX, AVX2, SSE2 +TEXT ·int32SubByScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVL x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTD X0, Y0 + +int32SubByScalarBlockLoop: + CMPQ BX, $0x00000030 + JL int32SubByScalarTailLoop + VPSUBD (CX), Y0, Y1 + VPSUBD 32(CX), Y0, Y2 + VPSUBD 64(CX), Y0, Y3 + VPSUBD 96(CX), Y0, Y4 + VPSUBD 128(CX), Y0, Y5 + VPSUBD 160(CX), Y0, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000030, BX + JMP int32SubByScalarBlockLoop + +int32SubByScalarTailLoop: + CMPQ BX, $0x00000008 + JL int32SubByScalarDone + VPSUBD (CX), Y0, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000008, BX + JMP int32SubByScalarTailLoop + +int32SubByScalarDone: + RET + +// func int64SubAvx2Asm(x []int64, y []int64, r []int64) +// Requires: AVX, AVX2 +TEXT ·int64SubAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +int64SubBlockLoop: + CMPQ BX, $0x00000018 + JL int64SubTailLoop + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VMOVDQU 128(AX), Y4 + VMOVDQU 160(AX), Y5 + VPSUBQ (CX), Y0, Y0 + VPSUBQ 32(CX), Y1, Y1 + VPSUBQ 64(CX), Y2, Y2 + VPSUBQ 96(CX), Y3, Y3 + VPSUBQ 128(CX), Y4, Y4 + VPSUBQ 160(CX), Y5, Y5 + VMOVDQU Y0, (DX) + VMOVDQU Y1, 32(DX) + VMOVDQU Y2, 64(DX) + VMOVDQU Y3, 96(DX) + VMOVDQU Y4, 128(DX) + VMOVDQU Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000018, BX + JMP int64SubBlockLoop + +int64SubTailLoop: + CMPQ BX, $0x00000004 + JL int64SubDone + VMOVDQU (AX), Y0 + VPSUBQ (CX), Y0, Y0 + VMOVDQU Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000004, BX + JMP int64SubTailLoop + +int64SubDone: + RET + +// func int64SubScalarAvx2Asm(x int64, y []int64, r []int64) +// Requires: AVX, AVX2, SSE2 +TEXT ·int64SubScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVQ x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ 
y_len+16(FP), BX + MOVQ AX, X0 + VPBROADCASTQ X0, Y0 + +int64SubScalarBlockLoop: + CMPQ BX, $0x00000018 + JL int64SubScalarTailLoop + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSUBQ Y0, Y1, Y1 + VPSUBQ Y0, Y2, Y2 + VPSUBQ Y0, Y3, Y3 + VPSUBQ Y0, Y4, Y4 + VPSUBQ Y0, Y5, Y5 + VPSUBQ Y0, Y6, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000018, BX + JMP int64SubScalarBlockLoop + +int64SubScalarTailLoop: + CMPQ BX, $0x00000004 + JL int64SubScalarDone + VMOVDQU (CX), Y1 + VPSUBQ Y0, Y1, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000004, BX + JMP int64SubScalarTailLoop + +int64SubScalarDone: + RET + +// func int64SubByScalarAvx2Asm(x int64, y []int64, r []int64) +// Requires: AVX, AVX2, SSE2 +TEXT ·int64SubByScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVQ x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVQ AX, X0 + VPBROADCASTQ X0, Y0 + +int64SubByScalarBlockLoop: + CMPQ BX, $0x00000018 + JL int64SubByScalarTailLoop + VPSUBQ (CX), Y0, Y1 + VPSUBQ 32(CX), Y0, Y2 + VPSUBQ 64(CX), Y0, Y3 + VPSUBQ 96(CX), Y0, Y4 + VPSUBQ 128(CX), Y0, Y5 + VPSUBQ 160(CX), Y0, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000018, BX + JMP int64SubByScalarBlockLoop + +int64SubByScalarTailLoop: + CMPQ BX, $0x00000004 + JL int64SubByScalarDone + VPSUBQ (CX), Y0, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000004, BX + JMP int64SubByScalarTailLoop + +int64SubByScalarDone: + RET + +// func uint8SubAvx2Asm(x []uint8, y []uint8, r []uint8) +// Requires: AVX, AVX2 +TEXT ·uint8SubAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +uint8SubBlockLoop: + CMPQ BX, $0x000000c0 + JL uint8SubTailLoop + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VMOVDQU 128(AX), Y4 + VMOVDQU 160(AX), Y5 + VPSUBB (CX), Y0, Y0 + VPSUBB 32(CX), Y1, Y1 + VPSUBB 64(CX), Y2, Y2 + VPSUBB 96(CX), Y3, Y3 + VPSUBB 128(CX), Y4, Y4 + VPSUBB 160(CX), Y5, Y5 + VMOVDQU Y0, (DX) + VMOVDQU Y1, 32(DX) + VMOVDQU Y2, 64(DX) + VMOVDQU Y3, 96(DX) + VMOVDQU Y4, 128(DX) + VMOVDQU Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x000000c0, BX + JMP uint8SubBlockLoop + +uint8SubTailLoop: + CMPQ BX, $0x00000020 + JL uint8SubDone + VMOVDQU (AX), Y0 + VPSUBB (CX), Y0, Y0 + VMOVDQU Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000020, BX + JMP uint8SubTailLoop + +uint8SubDone: + RET + +// func uint8SubScalarAvx2Asm(x uint8, y []uint8, r []uint8) +// Requires: AVX, AVX2, SSE2 +TEXT ·uint8SubScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVBLZX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTB X0, Y0 + +uint8SubScalarBlockLoop: + CMPQ BX, $0x000000c0 + JL uint8SubScalarTailLoop + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSUBB Y0, Y1, Y1 + VPSUBB Y0, Y2, Y2 + VPSUBB Y0, Y3, Y3 + VPSUBB Y0, Y4, Y4 + VPSUBB Y0, Y5, Y5 + VPSUBB Y0, Y6, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU 
Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x000000c0, BX + JMP uint8SubScalarBlockLoop + +uint8SubScalarTailLoop: + CMPQ BX, $0x00000020 + JL uint8SubScalarDone + VMOVDQU (CX), Y1 + VPSUBB Y0, Y1, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000020, BX + JMP uint8SubScalarTailLoop + +uint8SubScalarDone: + RET + +// func uint8SubByScalarAvx2Asm(x uint8, y []uint8, r []uint8) +// Requires: AVX, AVX2, SSE2 +TEXT ·uint8SubByScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVBLZX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTB X0, Y0 + +uint8SubByScalarBlockLoop: + CMPQ BX, $0x000000c0 + JL uint8SubByScalarTailLoop + VPSUBB (CX), Y0, Y1 + VPSUBB 32(CX), Y0, Y2 + VPSUBB 64(CX), Y0, Y3 + VPSUBB 96(CX), Y0, Y4 + VPSUBB 128(CX), Y0, Y5 + VPSUBB 160(CX), Y0, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x000000c0, BX + JMP uint8SubByScalarBlockLoop + +uint8SubByScalarTailLoop: + CMPQ BX, $0x00000020 + JL uint8SubByScalarDone + VPSUBB (CX), Y0, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000020, BX + JMP uint8SubByScalarTailLoop + +uint8SubByScalarDone: + RET + +// func uint16SubAvx2Asm(x []uint16, y []uint16, r []uint16) +// Requires: AVX, AVX2 +TEXT ·uint16SubAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +uint16SubBlockLoop: + CMPQ BX, $0x00000060 + JL uint16SubTailLoop + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VMOVDQU 128(AX), Y4 + VMOVDQU 160(AX), Y5 + VPSUBW (CX), Y0, Y0 + VPSUBW 32(CX), Y1, Y1 + VPSUBW 64(CX), Y2, Y2 + VPSUBW 96(CX), Y3, Y3 + VPSUBW 128(CX), Y4, Y4 + VPSUBW 160(CX), Y5, Y5 + VMOVDQU Y0, (DX) + VMOVDQU Y1, 32(DX) + VMOVDQU Y2, 64(DX) + VMOVDQU Y3, 96(DX) + VMOVDQU Y4, 128(DX) + VMOVDQU Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000060, BX + JMP uint16SubBlockLoop + +uint16SubTailLoop: + CMPQ BX, $0x00000010 + JL uint16SubDone + VMOVDQU (AX), Y0 + VPSUBW (CX), Y0, Y0 + VMOVDQU Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000010, BX + JMP uint16SubTailLoop + +uint16SubDone: + RET + +// func uint16SubScalarAvx2Asm(x uint16, y []uint16, r []uint16) +// Requires: AVX, AVX2, SSE2 +TEXT ·uint16SubScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVWLZX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTW X0, Y0 + +uint16SubScalarBlockLoop: + CMPQ BX, $0x00000060 + JL uint16SubScalarTailLoop + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSUBW Y0, Y1, Y1 + VPSUBW Y0, Y2, Y2 + VPSUBW Y0, Y3, Y3 + VPSUBW Y0, Y4, Y4 + VPSUBW Y0, Y5, Y5 + VPSUBW Y0, Y6, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000060, BX + JMP uint16SubScalarBlockLoop + +uint16SubScalarTailLoop: + CMPQ BX, $0x00000010 + JL uint16SubScalarDone + VMOVDQU (CX), Y1 + VPSUBW Y0, Y1, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000010, BX + JMP 
uint16SubScalarTailLoop + +uint16SubScalarDone: + RET + +// func uint16SubByScalarAvx2Asm(x uint16, y []uint16, r []uint16) +// Requires: AVX, AVX2, SSE2 +TEXT ·uint16SubByScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVWLZX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTW X0, Y0 + +uint16SubByScalarBlockLoop: + CMPQ BX, $0x00000060 + JL uint16SubByScalarTailLoop + VPSUBW (CX), Y0, Y1 + VPSUBW 32(CX), Y0, Y2 + VPSUBW 64(CX), Y0, Y3 + VPSUBW 96(CX), Y0, Y4 + VPSUBW 128(CX), Y0, Y5 + VPSUBW 160(CX), Y0, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000060, BX + JMP uint16SubByScalarBlockLoop + +uint16SubByScalarTailLoop: + CMPQ BX, $0x00000010 + JL uint16SubByScalarDone + VPSUBW (CX), Y0, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000010, BX + JMP uint16SubByScalarTailLoop + +uint16SubByScalarDone: + RET + +// func uint32SubAvx2Asm(x []uint32, y []uint32, r []uint32) +// Requires: AVX, AVX2 +TEXT ·uint32SubAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +uint32SubBlockLoop: + CMPQ BX, $0x00000030 + JL uint32SubTailLoop + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VMOVDQU 128(AX), Y4 + VMOVDQU 160(AX), Y5 + VPSUBD (CX), Y0, Y0 + VPSUBD 32(CX), Y1, Y1 + VPSUBD 64(CX), Y2, Y2 + VPSUBD 96(CX), Y3, Y3 + VPSUBD 128(CX), Y4, Y4 + VPSUBD 160(CX), Y5, Y5 + VMOVDQU Y0, (DX) + VMOVDQU Y1, 32(DX) + VMOVDQU Y2, 64(DX) + VMOVDQU Y3, 96(DX) + VMOVDQU Y4, 128(DX) + VMOVDQU Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000030, BX + JMP uint32SubBlockLoop + +uint32SubTailLoop: + CMPQ BX, $0x00000008 + JL uint32SubDone + VMOVDQU (AX), Y0 + VPSUBD (CX), Y0, Y0 + VMOVDQU Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000008, BX + JMP uint32SubTailLoop + +uint32SubDone: + RET + +// func uint32SubScalarAvx2Asm(x uint32, y []uint32, r []uint32) +// Requires: AVX, AVX2, SSE2 +TEXT ·uint32SubScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVL x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTD X0, Y0 + +uint32SubScalarBlockLoop: + CMPQ BX, $0x00000030 + JL uint32SubScalarTailLoop + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSUBD Y0, Y1, Y1 + VPSUBD Y0, Y2, Y2 + VPSUBD Y0, Y3, Y3 + VPSUBD Y0, Y4, Y4 + VPSUBD Y0, Y5, Y5 + VPSUBD Y0, Y6, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000030, BX + JMP uint32SubScalarBlockLoop + +uint32SubScalarTailLoop: + CMPQ BX, $0x00000008 + JL uint32SubScalarDone + VMOVDQU (CX), Y1 + VPSUBD Y0, Y1, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000008, BX + JMP uint32SubScalarTailLoop + +uint32SubScalarDone: + RET + +// func uint32SubByScalarAvx2Asm(x uint32, y []uint32, r []uint32) +// Requires: AVX, AVX2, SSE2 +TEXT ·uint32SubByScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVL x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTD X0, Y0 + +uint32SubByScalarBlockLoop: + CMPQ BX, $0x00000030 + JL 
uint32SubByScalarTailLoop + VPSUBD (CX), Y0, Y1 + VPSUBD 32(CX), Y0, Y2 + VPSUBD 64(CX), Y0, Y3 + VPSUBD 96(CX), Y0, Y4 + VPSUBD 128(CX), Y0, Y5 + VPSUBD 160(CX), Y0, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000030, BX + JMP uint32SubByScalarBlockLoop + +uint32SubByScalarTailLoop: + CMPQ BX, $0x00000008 + JL uint32SubByScalarDone + VPSUBD (CX), Y0, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000008, BX + JMP uint32SubByScalarTailLoop + +uint32SubByScalarDone: + RET + +// func uint64SubAvx2Asm(x []uint64, y []uint64, r []uint64) +// Requires: AVX, AVX2 +TEXT ·uint64SubAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +uint64SubBlockLoop: + CMPQ BX, $0x00000018 + JL uint64SubTailLoop + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VMOVDQU 128(AX), Y4 + VMOVDQU 160(AX), Y5 + VPSUBQ (CX), Y0, Y0 + VPSUBQ 32(CX), Y1, Y1 + VPSUBQ 64(CX), Y2, Y2 + VPSUBQ 96(CX), Y3, Y3 + VPSUBQ 128(CX), Y4, Y4 + VPSUBQ 160(CX), Y5, Y5 + VMOVDQU Y0, (DX) + VMOVDQU Y1, 32(DX) + VMOVDQU Y2, 64(DX) + VMOVDQU Y3, 96(DX) + VMOVDQU Y4, 128(DX) + VMOVDQU Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000018, BX + JMP uint64SubBlockLoop + +uint64SubTailLoop: + CMPQ BX, $0x00000004 + JL uint64SubDone + VMOVDQU (AX), Y0 + VPSUBQ (CX), Y0, Y0 + VMOVDQU Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000004, BX + JMP uint64SubTailLoop + +uint64SubDone: + RET + +// func uint64SubScalarAvx2Asm(x uint64, y []uint64, r []uint64) +// Requires: AVX, AVX2, SSE2 +TEXT ·uint64SubScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVQ x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVQ AX, X0 + VPBROADCASTQ X0, Y0 + +uint64SubScalarBlockLoop: + CMPQ BX, $0x00000018 + JL uint64SubScalarTailLoop + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSUBQ Y0, Y1, Y1 + VPSUBQ Y0, Y2, Y2 + VPSUBQ Y0, Y3, Y3 + VPSUBQ Y0, Y4, Y4 + VPSUBQ Y0, Y5, Y5 + VPSUBQ Y0, Y6, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000018, BX + JMP uint64SubScalarBlockLoop + +uint64SubScalarTailLoop: + CMPQ BX, $0x00000004 + JL uint64SubScalarDone + VMOVDQU (CX), Y1 + VPSUBQ Y0, Y1, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000004, BX + JMP uint64SubScalarTailLoop + +uint64SubScalarDone: + RET + +// func uint64SubByScalarAvx2Asm(x uint64, y []uint64, r []uint64) +// Requires: AVX, AVX2, SSE2 +TEXT ·uint64SubByScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVQ x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVQ AX, X0 + VPBROADCASTQ X0, Y0 + +uint64SubByScalarBlockLoop: + CMPQ BX, $0x00000018 + JL uint64SubByScalarTailLoop + VPSUBQ (CX), Y0, Y1 + VPSUBQ 32(CX), Y0, Y2 + VPSUBQ 64(CX), Y0, Y3 + VPSUBQ 96(CX), Y0, Y4 + VPSUBQ 128(CX), Y0, Y5 + VPSUBQ 160(CX), Y0, Y6 + VMOVDQU Y1, (DX) + VMOVDQU Y2, 32(DX) + VMOVDQU Y3, 64(DX) + VMOVDQU Y4, 96(DX) + VMOVDQU Y5, 128(DX) + VMOVDQU Y6, 160(DX) + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000018, BX + JMP 
uint64SubByScalarBlockLoop + +uint64SubByScalarTailLoop: + CMPQ BX, $0x00000004 + JL uint64SubByScalarDone + VPSUBQ (CX), Y0, Y1 + VMOVDQU Y1, (DX) + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000004, BX + JMP uint64SubByScalarTailLoop + +uint64SubByScalarDone: + RET + +// func float32SubAvx2Asm(x []float32, y []float32, r []float32) +// Requires: AVX +TEXT ·float32SubAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +float32SubBlockLoop: + CMPQ BX, $0x00000030 + JL float32SubTailLoop + VMOVUPS (AX), Y0 + VMOVUPS 32(AX), Y1 + VMOVUPS 64(AX), Y2 + VMOVUPS 96(AX), Y3 + VMOVUPS 128(AX), Y4 + VMOVUPS 160(AX), Y5 + VSUBPS (CX), Y0, Y0 + VSUBPS 32(CX), Y1, Y1 + VSUBPS 64(CX), Y2, Y2 + VSUBPS 96(CX), Y3, Y3 + VSUBPS 128(CX), Y4, Y4 + VSUBPS 160(CX), Y5, Y5 + VMOVUPS Y0, (DX) + VMOVUPS Y1, 32(DX) + VMOVUPS Y2, 64(DX) + VMOVUPS Y3, 96(DX) + VMOVUPS Y4, 128(DX) + VMOVUPS Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000030, BX + JMP float32SubBlockLoop + +float32SubTailLoop: + CMPQ BX, $0x00000008 + JL float32SubDone + VMOVUPS (AX), Y0 + VSUBPS (CX), Y0, Y0 + VMOVUPS Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000008, BX + JMP float32SubTailLoop + +float32SubDone: + RET + +// func float32SubScalarAvx2Asm(x float32, y []float32, r []float32) +// Requires: AVX, AVX2, SSE +TEXT ·float32SubScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVSS x+0(FP), X0 + MOVQ y_base+8(FP), AX + MOVQ r_base+32(FP), CX + MOVQ y_len+16(FP), DX + VBROADCASTSS X0, Y0 + +float32SubScalarBlockLoop: + CMPQ DX, $0x00000030 + JL float32SubScalarTailLoop + VMOVUPS (AX), Y1 + VMOVUPS 32(AX), Y2 + VMOVUPS 64(AX), Y3 + VMOVUPS 96(AX), Y4 + VMOVUPS 128(AX), Y5 + VMOVUPS 160(AX), Y6 + VSUBPS Y0, Y1, Y1 + VSUBPS Y0, Y2, Y2 + VSUBPS Y0, Y3, Y3 + VSUBPS Y0, Y4, Y4 + VSUBPS Y0, Y5, Y5 + VSUBPS Y0, Y6, Y6 + VMOVUPS Y1, (CX) + VMOVUPS Y2, 32(CX) + VMOVUPS Y3, 64(CX) + VMOVUPS Y4, 96(CX) + VMOVUPS Y5, 128(CX) + VMOVUPS Y6, 160(CX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + SUBQ $0x00000030, DX + JMP float32SubScalarBlockLoop + +float32SubScalarTailLoop: + CMPQ DX, $0x00000008 + JL float32SubScalarDone + VMOVUPS (AX), Y1 + VSUBPS Y0, Y1, Y1 + VMOVUPS Y1, (CX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + SUBQ $0x00000008, DX + JMP float32SubScalarTailLoop + +float32SubScalarDone: + RET + +// func float32SubByScalarAvx2Asm(x float32, y []float32, r []float32) +// Requires: AVX, AVX2, SSE +TEXT ·float32SubByScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVSS x+0(FP), X0 + MOVQ y_base+8(FP), AX + MOVQ r_base+32(FP), CX + MOVQ y_len+16(FP), DX + VBROADCASTSS X0, Y0 + +float32SubByScalarBlockLoop: + CMPQ DX, $0x00000030 + JL float32SubByScalarTailLoop + VSUBPS (AX), Y0, Y1 + VSUBPS 32(AX), Y0, Y2 + VSUBPS 64(AX), Y0, Y3 + VSUBPS 96(AX), Y0, Y4 + VSUBPS 128(AX), Y0, Y5 + VSUBPS 160(AX), Y0, Y6 + VMOVUPS Y1, (CX) + VMOVUPS Y2, 32(CX) + VMOVUPS Y3, 64(CX) + VMOVUPS Y4, 96(CX) + VMOVUPS Y5, 128(CX) + VMOVUPS Y6, 160(CX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + SUBQ $0x00000030, DX + JMP float32SubByScalarBlockLoop + +float32SubByScalarTailLoop: + CMPQ DX, $0x00000008 + JL float32SubByScalarDone + VSUBPS (AX), Y0, Y1 + VMOVUPS Y1, (CX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + SUBQ $0x00000008, DX + JMP float32SubByScalarTailLoop + +float32SubByScalarDone: + RET + +// func float64SubAvx2Asm(x []float64, y []float64, r []float64) +// Requires: AVX +TEXT 
·float64SubAvx2Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +float64SubBlockLoop: + CMPQ BX, $0x00000018 + JL float64SubTailLoop + VMOVUPD (AX), Y0 + VMOVUPD 32(AX), Y1 + VMOVUPD 64(AX), Y2 + VMOVUPD 96(AX), Y3 + VMOVUPD 128(AX), Y4 + VMOVUPD 160(AX), Y5 + VSUBPD (CX), Y0, Y0 + VSUBPD 32(CX), Y1, Y1 + VSUBPD 64(CX), Y2, Y2 + VSUBPD 96(CX), Y3, Y3 + VSUBPD 128(CX), Y4, Y4 + VSUBPD 160(CX), Y5, Y5 + VMOVUPD Y0, (DX) + VMOVUPD Y1, 32(DX) + VMOVUPD Y2, 64(DX) + VMOVUPD Y3, 96(DX) + VMOVUPD Y4, 128(DX) + VMOVUPD Y5, 160(DX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + ADDQ $0x000000c0, DX + SUBQ $0x00000018, BX + JMP float64SubBlockLoop + +float64SubTailLoop: + CMPQ BX, $0x00000004 + JL float64SubDone + VMOVUPD (AX), Y0 + VSUBPD (CX), Y0, Y0 + VMOVUPD Y0, (DX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + ADDQ $0x00000020, DX + SUBQ $0x00000004, BX + JMP float64SubTailLoop + +float64SubDone: + RET + +// func float64SubScalarAvx2Asm(x float64, y []float64, r []float64) +// Requires: AVX, AVX2, SSE2 +TEXT ·float64SubScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVSD x+0(FP), X0 + MOVQ y_base+8(FP), AX + MOVQ r_base+32(FP), CX + MOVQ y_len+16(FP), DX + VBROADCASTSD X0, Y0 + +float64SubScalarBlockLoop: + CMPQ DX, $0x00000018 + JL float64SubScalarTailLoop + VMOVUPD (AX), Y1 + VMOVUPD 32(AX), Y2 + VMOVUPD 64(AX), Y3 + VMOVUPD 96(AX), Y4 + VMOVUPD 128(AX), Y5 + VMOVUPD 160(AX), Y6 + VSUBPD Y0, Y1, Y1 + VSUBPD Y0, Y2, Y2 + VSUBPD Y0, Y3, Y3 + VSUBPD Y0, Y4, Y4 + VSUBPD Y0, Y5, Y5 + VSUBPD Y0, Y6, Y6 + VMOVUPD Y1, (CX) + VMOVUPD Y2, 32(CX) + VMOVUPD Y3, 64(CX) + VMOVUPD Y4, 96(CX) + VMOVUPD Y5, 128(CX) + VMOVUPD Y6, 160(CX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + SUBQ $0x00000018, DX + JMP float64SubScalarBlockLoop + +float64SubScalarTailLoop: + CMPQ DX, $0x00000004 + JL float64SubScalarDone + VMOVUPD (AX), Y1 + VSUBPD Y0, Y1, Y1 + VMOVUPD Y1, (CX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + SUBQ $0x00000004, DX + JMP float64SubScalarTailLoop + +float64SubScalarDone: + RET + +// func float64SubByScalarAvx2Asm(x float64, y []float64, r []float64) +// Requires: AVX, AVX2, SSE2 +TEXT ·float64SubByScalarAvx2Asm(SB), NOSPLIT, $0-56 + MOVSD x+0(FP), X0 + MOVQ y_base+8(FP), AX + MOVQ r_base+32(FP), CX + MOVQ y_len+16(FP), DX + VBROADCASTSD X0, Y0 + +float64SubByScalarBlockLoop: + CMPQ DX, $0x00000018 + JL float64SubByScalarTailLoop + VSUBPD (AX), Y0, Y1 + VSUBPD 32(AX), Y0, Y2 + VSUBPD 64(AX), Y0, Y3 + VSUBPD 96(AX), Y0, Y4 + VSUBPD 128(AX), Y0, Y5 + VSUBPD 160(AX), Y0, Y6 + VMOVUPD Y1, (CX) + VMOVUPD Y2, 32(CX) + VMOVUPD Y3, 64(CX) + VMOVUPD Y4, 96(CX) + VMOVUPD Y5, 128(CX) + VMOVUPD Y6, 160(CX) + ADDQ $0x000000c0, AX + ADDQ $0x000000c0, CX + SUBQ $0x00000018, DX + JMP float64SubByScalarBlockLoop + +float64SubByScalarTailLoop: + CMPQ DX, $0x00000004 + JL float64SubByScalarDone + VSUBPD (AX), Y0, Y1 + VMOVUPD Y1, (CX) + ADDQ $0x00000020, AX + ADDQ $0x00000020, CX + SUBQ $0x00000004, DX + JMP float64SubByScalarTailLoop + +float64SubByScalarDone: + RET diff --git a/pkg/vectorize/sub/avx2_stubs.go b/pkg/vectorize/sub/avx2_stubs.go new file mode 100644 index 0000000000000000000000000000000000000000..4d4a729737ec20d4a67ef439ec411760aca7a752 --- /dev/null +++ b/pkg/vectorize/sub/avx2_stubs.go @@ -0,0 +1,63 @@ +// Code generated by command: go run avx2.go -out sub/avx2.s -stubs sub/avx2_stubs.go. DO NOT EDIT. 
+ +package vectorize + +func int8SubAvx2Asm(x []int8, y []int8, r []int8) + +func int8SubScalarAvx2Asm(x int8, y []int8, r []int8) + +func int8SubByScalarAvx2Asm(x int8, y []int8, r []int8) + +func int16SubAvx2Asm(x []int16, y []int16, r []int16) + +func int16SubScalarAvx2Asm(x int16, y []int16, r []int16) + +func int16SubByScalarAvx2Asm(x int16, y []int16, r []int16) + +func int32SubAvx2Asm(x []int32, y []int32, r []int32) + +func int32SubScalarAvx2Asm(x int32, y []int32, r []int32) + +func int32SubByScalarAvx2Asm(x int32, y []int32, r []int32) + +func int64SubAvx2Asm(x []int64, y []int64, r []int64) + +func int64SubScalarAvx2Asm(x int64, y []int64, r []int64) + +func int64SubByScalarAvx2Asm(x int64, y []int64, r []int64) + +func uint8SubAvx2Asm(x []uint8, y []uint8, r []uint8) + +func uint8SubScalarAvx2Asm(x uint8, y []uint8, r []uint8) + +func uint8SubByScalarAvx2Asm(x uint8, y []uint8, r []uint8) + +func uint16SubAvx2Asm(x []uint16, y []uint16, r []uint16) + +func uint16SubScalarAvx2Asm(x uint16, y []uint16, r []uint16) + +func uint16SubByScalarAvx2Asm(x uint16, y []uint16, r []uint16) + +func uint32SubAvx2Asm(x []uint32, y []uint32, r []uint32) + +func uint32SubScalarAvx2Asm(x uint32, y []uint32, r []uint32) + +func uint32SubByScalarAvx2Asm(x uint32, y []uint32, r []uint32) + +func uint64SubAvx2Asm(x []uint64, y []uint64, r []uint64) + +func uint64SubScalarAvx2Asm(x uint64, y []uint64, r []uint64) + +func uint64SubByScalarAvx2Asm(x uint64, y []uint64, r []uint64) + +func float32SubAvx2Asm(x []float32, y []float32, r []float32) + +func float32SubScalarAvx2Asm(x float32, y []float32, r []float32) + +func float32SubByScalarAvx2Asm(x float32, y []float32, r []float32) + +func float64SubAvx2Asm(x []float64, y []float64, r []float64) + +func float64SubScalarAvx2Asm(x float64, y []float64, r []float64) + +func float64SubByScalarAvx2Asm(x float64, y []float64, r []float64) diff --git a/pkg/vectorize/sub/avx512.s b/pkg/vectorize/sub/avx512.s new file mode 100644 index 0000000000000000000000000000000000000000..2185675bb1c6b164adf75ea516483cc034380cd7 --- /dev/null +++ b/pkg/vectorize/sub/avx512.s @@ -0,0 +1,1909 @@ +// Code generated by command: go run avx512.go -out minus/avx512.s -stubs minus/avx512_stubs.go. DO NOT EDIT. 
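The stub listing above is the full AVX2 subtraction API: for each element type there is a slice-slice kernel plus two scalar forms. Reading the Plan 9 operand order in the assembly (destination last, so VPSUBQ a, b, c computes c = b - a), the three variants appear to compute r = x - y, r = y[i] - x, and r = x - y[i] respectively. A pure-Go reference for the int64 trio, written only to pin down that reading; the function names are hypothetical:

package vectorize

// int64SubRef mirrors int64SubAvx2Asm: r[i] = x[i] - y[i].
func int64SubRef(x, y, r []int64) {
	for i := range x {
		r[i] = x[i] - y[i]
	}
}

// int64SubScalarRef mirrors int64SubScalarAvx2Asm: the broadcast scalar is the
// subtrahend, so r[i] = y[i] - x.
func int64SubScalarRef(x int64, y, r []int64) {
	for i := range y {
		r[i] = y[i] - x
	}
}

// int64SubByScalarRef mirrors int64SubByScalarAvx2Asm: the broadcast scalar is
// the minuend, so r[i] = x - y[i].
func int64SubByScalarRef(x int64, y, r []int64) {
	for i := range y {
		r[i] = x - y[i]
	}
}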
+ +#include "textflag.h" + +// func int8MinusAvx512Asm(x []int8, y []int8, r []int8) +// Requires: AVX512BW, AVX512F +TEXT ·int8MinusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +int8MinusBlockLoop: + CMPQ BX, $0x00000300 + JL int8MinusTailLoop + VMOVDQU32 (AX), Z0 + VMOVDQU32 64(AX), Z1 + VMOVDQU32 128(AX), Z2 + VMOVDQU32 192(AX), Z3 + VMOVDQU32 256(AX), Z4 + VMOVDQU32 320(AX), Z5 + VMOVDQU32 384(AX), Z6 + VMOVDQU32 448(AX), Z7 + VMOVDQU32 512(AX), Z8 + VMOVDQU32 576(AX), Z9 + VMOVDQU32 640(AX), Z10 + VMOVDQU32 704(AX), Z11 + VPSUBB (CX), Z0, Z0 + VPSUBB 64(CX), Z1, Z1 + VPSUBB 128(CX), Z2, Z2 + VPSUBB 192(CX), Z3, Z3 + VPSUBB 256(CX), Z4, Z4 + VPSUBB 320(CX), Z5, Z5 + VPSUBB 384(CX), Z6, Z6 + VPSUBB 448(CX), Z7, Z7 + VPSUBB 512(CX), Z8, Z8 + VPSUBB 576(CX), Z9, Z9 + VPSUBB 640(CX), Z10, Z10 + VPSUBB 704(CX), Z11, Z11 + VMOVDQU32 Z0, (DX) + VMOVDQU32 Z1, 64(DX) + VMOVDQU32 Z2, 128(DX) + VMOVDQU32 Z3, 192(DX) + VMOVDQU32 Z4, 256(DX) + VMOVDQU32 Z5, 320(DX) + VMOVDQU32 Z6, 384(DX) + VMOVDQU32 Z7, 448(DX) + VMOVDQU32 Z8, 512(DX) + VMOVDQU32 Z9, 576(DX) + VMOVDQU32 Z10, 640(DX) + VMOVDQU32 Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000300, BX + JMP int8MinusBlockLoop + +int8MinusTailLoop: + CMPQ BX, $0x00000040 + JL int8MinusDone + VMOVDQU32 (AX), Z0 + VPSUBB (CX), Z0, Z0 + VMOVDQU32 Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000040, BX + JMP int8MinusTailLoop + +int8MinusDone: + RET + +// func int8MinusScalarAvx512Asm(x int8, y []int8, r []int8) +// Requires: AVX512BW, AVX512F, SSE2 +TEXT ·int8MinusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVBLSX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTB X0, Z0 + +int8MinusScalarBlockLoop: + CMPQ BX, $0x00000300 + JL int8MinusScalarTailLoop + VMOVDQU32 (CX), Z1 + VMOVDQU32 64(CX), Z2 + VMOVDQU32 128(CX), Z3 + VMOVDQU32 192(CX), Z4 + VMOVDQU32 256(CX), Z5 + VMOVDQU32 320(CX), Z6 + VMOVDQU32 384(CX), Z7 + VMOVDQU32 448(CX), Z8 + VMOVDQU32 512(CX), Z9 + VMOVDQU32 576(CX), Z10 + VMOVDQU32 640(CX), Z11 + VMOVDQU32 704(CX), Z12 + VPSUBB Z0, Z1, Z1 + VPSUBB Z0, Z2, Z2 + VPSUBB Z0, Z3, Z3 + VPSUBB Z0, Z4, Z4 + VPSUBB Z0, Z5, Z5 + VPSUBB Z0, Z6, Z6 + VPSUBB Z0, Z7, Z7 + VPSUBB Z0, Z8, Z8 + VPSUBB Z0, Z9, Z9 + VPSUBB Z0, Z10, Z10 + VPSUBB Z0, Z11, Z11 + VPSUBB Z0, Z12, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000300, BX + JMP int8MinusScalarBlockLoop + +int8MinusScalarTailLoop: + CMPQ BX, $0x00000040 + JL int8MinusScalarDone + VMOVDQU32 (CX), Z1 + VPSUBB Z0, Z1, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000040, BX + JMP int8MinusScalarTailLoop + +int8MinusScalarDone: + RET + +// func int8MinusByScalarAvx512Asm(x int8, y []int8, r []int8) +// Requires: AVX512BW, AVX512F, SSE2 +TEXT ·int8MinusByScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVBLSX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTB X0, Z0 + +int8MinusByScalarBlockLoop: + CMPQ BX, $0x00000300 + JL int8MinusByScalarTailLoop + VPSUBB (CX), Z0, Z1 + VPSUBB 64(CX), 
Z0, Z2 + VPSUBB 128(CX), Z0, Z3 + VPSUBB 192(CX), Z0, Z4 + VPSUBB 256(CX), Z0, Z5 + VPSUBB 320(CX), Z0, Z6 + VPSUBB 384(CX), Z0, Z7 + VPSUBB 448(CX), Z0, Z8 + VPSUBB 512(CX), Z0, Z9 + VPSUBB 576(CX), Z0, Z10 + VPSUBB 640(CX), Z0, Z11 + VPSUBB 704(CX), Z0, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000300, BX + JMP int8MinusByScalarBlockLoop + +int8MinusByScalarTailLoop: + CMPQ BX, $0x00000040 + JL int8MinusByScalarDone + VPSUBB (CX), Z0, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000040, BX + JMP int8MinusByScalarTailLoop + +int8MinusByScalarDone: + RET + +// func int16MinusAvx512Asm(x []int16, y []int16, r []int16) +// Requires: AVX512BW, AVX512F +TEXT ·int16MinusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +int16MinusBlockLoop: + CMPQ BX, $0x00000180 + JL int16MinusTailLoop + VMOVDQU32 (AX), Z0 + VMOVDQU32 64(AX), Z1 + VMOVDQU32 128(AX), Z2 + VMOVDQU32 192(AX), Z3 + VMOVDQU32 256(AX), Z4 + VMOVDQU32 320(AX), Z5 + VMOVDQU32 384(AX), Z6 + VMOVDQU32 448(AX), Z7 + VMOVDQU32 512(AX), Z8 + VMOVDQU32 576(AX), Z9 + VMOVDQU32 640(AX), Z10 + VMOVDQU32 704(AX), Z11 + VPSUBW (CX), Z0, Z0 + VPSUBW 64(CX), Z1, Z1 + VPSUBW 128(CX), Z2, Z2 + VPSUBW 192(CX), Z3, Z3 + VPSUBW 256(CX), Z4, Z4 + VPSUBW 320(CX), Z5, Z5 + VPSUBW 384(CX), Z6, Z6 + VPSUBW 448(CX), Z7, Z7 + VPSUBW 512(CX), Z8, Z8 + VPSUBW 576(CX), Z9, Z9 + VPSUBW 640(CX), Z10, Z10 + VPSUBW 704(CX), Z11, Z11 + VMOVDQU32 Z0, (DX) + VMOVDQU32 Z1, 64(DX) + VMOVDQU32 Z2, 128(DX) + VMOVDQU32 Z3, 192(DX) + VMOVDQU32 Z4, 256(DX) + VMOVDQU32 Z5, 320(DX) + VMOVDQU32 Z6, 384(DX) + VMOVDQU32 Z7, 448(DX) + VMOVDQU32 Z8, 512(DX) + VMOVDQU32 Z9, 576(DX) + VMOVDQU32 Z10, 640(DX) + VMOVDQU32 Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000180, BX + JMP int16MinusBlockLoop + +int16MinusTailLoop: + CMPQ BX, $0x00000020 + JL int16MinusDone + VMOVDQU32 (AX), Z0 + VPSUBW (CX), Z0, Z0 + VMOVDQU32 Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000020, BX + JMP int16MinusTailLoop + +int16MinusDone: + RET + +// func int16MinusScalarAvx512Asm(x int16, y []int16, r []int16) +// Requires: AVX512BW, AVX512F, SSE2 +TEXT ·int16MinusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVWLSX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTW X0, Z0 + +int16MinusScalarBlockLoop: + CMPQ BX, $0x00000180 + JL int16MinusScalarTailLoop + VMOVDQU32 (CX), Z1 + VMOVDQU32 64(CX), Z2 + VMOVDQU32 128(CX), Z3 + VMOVDQU32 192(CX), Z4 + VMOVDQU32 256(CX), Z5 + VMOVDQU32 320(CX), Z6 + VMOVDQU32 384(CX), Z7 + VMOVDQU32 448(CX), Z8 + VMOVDQU32 512(CX), Z9 + VMOVDQU32 576(CX), Z10 + VMOVDQU32 640(CX), Z11 + VMOVDQU32 704(CX), Z12 + VPSUBW Z0, Z1, Z1 + VPSUBW Z0, Z2, Z2 + VPSUBW Z0, Z3, Z3 + VPSUBW Z0, Z4, Z4 + VPSUBW Z0, Z5, Z5 + VPSUBW Z0, Z6, Z6 + VPSUBW Z0, Z7, Z7 + VPSUBW Z0, Z8, Z8 + VPSUBW Z0, Z9, Z9 + VPSUBW Z0, Z10, Z10 + VPSUBW Z0, Z11, Z11 + VPSUBW Z0, Z12, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + 
VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000180, BX + JMP int16MinusScalarBlockLoop + +int16MinusScalarTailLoop: + CMPQ BX, $0x00000020 + JL int16MinusScalarDone + VMOVDQU32 (CX), Z1 + VPSUBW Z0, Z1, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000020, BX + JMP int16MinusScalarTailLoop + +int16MinusScalarDone: + RET + +// func int16MinusByScalarAvx512Asm(x int16, y []int16, r []int16) +// Requires: AVX512BW, AVX512F, SSE2 +TEXT ·int16MinusByScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVWLSX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTW X0, Z0 + +int16MinusByScalarBlockLoop: + CMPQ BX, $0x00000180 + JL int16MinusByScalarTailLoop + VPSUBW (CX), Z0, Z1 + VPSUBW 64(CX), Z0, Z2 + VPSUBW 128(CX), Z0, Z3 + VPSUBW 192(CX), Z0, Z4 + VPSUBW 256(CX), Z0, Z5 + VPSUBW 320(CX), Z0, Z6 + VPSUBW 384(CX), Z0, Z7 + VPSUBW 448(CX), Z0, Z8 + VPSUBW 512(CX), Z0, Z9 + VPSUBW 576(CX), Z0, Z10 + VPSUBW 640(CX), Z0, Z11 + VPSUBW 704(CX), Z0, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000180, BX + JMP int16MinusByScalarBlockLoop + +int16MinusByScalarTailLoop: + CMPQ BX, $0x00000020 + JL int16MinusByScalarDone + VPSUBW (CX), Z0, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000020, BX + JMP int16MinusByScalarTailLoop + +int16MinusByScalarDone: + RET + +// func int32MinusAvx512Asm(x []int32, y []int32, r []int32) +// Requires: AVX512F +TEXT ·int32MinusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +int32MinusBlockLoop: + CMPQ BX, $0x000000c0 + JL int32MinusTailLoop + VMOVDQU32 (AX), Z0 + VMOVDQU32 64(AX), Z1 + VMOVDQU32 128(AX), Z2 + VMOVDQU32 192(AX), Z3 + VMOVDQU32 256(AX), Z4 + VMOVDQU32 320(AX), Z5 + VMOVDQU32 384(AX), Z6 + VMOVDQU32 448(AX), Z7 + VMOVDQU32 512(AX), Z8 + VMOVDQU32 576(AX), Z9 + VMOVDQU32 640(AX), Z10 + VMOVDQU32 704(AX), Z11 + VPSUBD (CX), Z0, Z0 + VPSUBD 64(CX), Z1, Z1 + VPSUBD 128(CX), Z2, Z2 + VPSUBD 192(CX), Z3, Z3 + VPSUBD 256(CX), Z4, Z4 + VPSUBD 320(CX), Z5, Z5 + VPSUBD 384(CX), Z6, Z6 + VPSUBD 448(CX), Z7, Z7 + VPSUBD 512(CX), Z8, Z8 + VPSUBD 576(CX), Z9, Z9 + VPSUBD 640(CX), Z10, Z10 + VPSUBD 704(CX), Z11, Z11 + VMOVDQU32 Z0, (DX) + VMOVDQU32 Z1, 64(DX) + VMOVDQU32 Z2, 128(DX) + VMOVDQU32 Z3, 192(DX) + VMOVDQU32 Z4, 256(DX) + VMOVDQU32 Z5, 320(DX) + VMOVDQU32 Z6, 384(DX) + VMOVDQU32 Z7, 448(DX) + VMOVDQU32 Z8, 512(DX) + VMOVDQU32 Z9, 576(DX) + VMOVDQU32 Z10, 640(DX) + VMOVDQU32 Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x000000c0, BX + JMP int32MinusBlockLoop + +int32MinusTailLoop: + CMPQ BX, $0x00000010 + JL int32MinusDone + VMOVDQU32 (AX), Z0 + VPSUBD (CX), Z0, Z0 + VMOVDQU32 Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000010, BX + JMP int32MinusTailLoop + +int32MinusDone: + RET + +// func int32MinusScalarAvx512Asm(x int32, y []int32, r []int32) +// Requires: AVX512F, SSE2 +TEXT ·int32MinusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVL x+0(FP), AX 
+ MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTD X0, Z0 + +int32MinusScalarBlockLoop: + CMPQ BX, $0x000000c0 + JL int32MinusScalarTailLoop + VMOVDQU32 (CX), Z1 + VMOVDQU32 64(CX), Z2 + VMOVDQU32 128(CX), Z3 + VMOVDQU32 192(CX), Z4 + VMOVDQU32 256(CX), Z5 + VMOVDQU32 320(CX), Z6 + VMOVDQU32 384(CX), Z7 + VMOVDQU32 448(CX), Z8 + VMOVDQU32 512(CX), Z9 + VMOVDQU32 576(CX), Z10 + VMOVDQU32 640(CX), Z11 + VMOVDQU32 704(CX), Z12 + VPSUBD Z0, Z1, Z1 + VPSUBD Z0, Z2, Z2 + VPSUBD Z0, Z3, Z3 + VPSUBD Z0, Z4, Z4 + VPSUBD Z0, Z5, Z5 + VPSUBD Z0, Z6, Z6 + VPSUBD Z0, Z7, Z7 + VPSUBD Z0, Z8, Z8 + VPSUBD Z0, Z9, Z9 + VPSUBD Z0, Z10, Z10 + VPSUBD Z0, Z11, Z11 + VPSUBD Z0, Z12, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x000000c0, BX + JMP int32MinusScalarBlockLoop + +int32MinusScalarTailLoop: + CMPQ BX, $0x00000010 + JL int32MinusScalarDone + VMOVDQU32 (CX), Z1 + VPSUBD Z0, Z1, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000010, BX + JMP int32MinusScalarTailLoop + +int32MinusScalarDone: + RET + +// func int32MinusByScalarAvx512Asm(x int32, y []int32, r []int32) +// Requires: AVX512F, SSE2 +TEXT ·int32MinusByScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVL x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTD X0, Z0 + +int32MinusByScalarBlockLoop: + CMPQ BX, $0x000000c0 + JL int32MinusByScalarTailLoop + VPSUBD (CX), Z0, Z1 + VPSUBD 64(CX), Z0, Z2 + VPSUBD 128(CX), Z0, Z3 + VPSUBD 192(CX), Z0, Z4 + VPSUBD 256(CX), Z0, Z5 + VPSUBD 320(CX), Z0, Z6 + VPSUBD 384(CX), Z0, Z7 + VPSUBD 448(CX), Z0, Z8 + VPSUBD 512(CX), Z0, Z9 + VPSUBD 576(CX), Z0, Z10 + VPSUBD 640(CX), Z0, Z11 + VPSUBD 704(CX), Z0, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x000000c0, BX + JMP int32MinusByScalarBlockLoop + +int32MinusByScalarTailLoop: + CMPQ BX, $0x00000010 + JL int32MinusByScalarDone + VPSUBD (CX), Z0, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000010, BX + JMP int32MinusByScalarTailLoop + +int32MinusByScalarDone: + RET + +// func int64MinusAvx512Asm(x []int64, y []int64, r []int64) +// Requires: AVX512F +TEXT ·int64MinusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +int64MinusBlockLoop: + CMPQ BX, $0x00000060 + JL int64MinusTailLoop + VMOVDQU32 (AX), Z0 + VMOVDQU32 64(AX), Z1 + VMOVDQU32 128(AX), Z2 + VMOVDQU32 192(AX), Z3 + VMOVDQU32 256(AX), Z4 + VMOVDQU32 320(AX), Z5 + VMOVDQU32 384(AX), Z6 + VMOVDQU32 448(AX), Z7 + VMOVDQU32 512(AX), Z8 + VMOVDQU32 576(AX), Z9 + VMOVDQU32 640(AX), Z10 + VMOVDQU32 704(AX), Z11 + VPSUBQ (CX), Z0, Z0 + VPSUBQ 64(CX), Z1, Z1 + VPSUBQ 128(CX), Z2, Z2 + VPSUBQ 192(CX), Z3, Z3 + VPSUBQ 256(CX), Z4, Z4 + VPSUBQ 320(CX), Z5, Z5 + VPSUBQ 384(CX), Z6, Z6 + VPSUBQ 448(CX), Z7, Z7 + VPSUBQ 512(CX), Z8, Z8 + VPSUBQ 576(CX), Z9, Z9 + VPSUBQ 640(CX), 
Z10, Z10 + VPSUBQ 704(CX), Z11, Z11 + VMOVDQU32 Z0, (DX) + VMOVDQU32 Z1, 64(DX) + VMOVDQU32 Z2, 128(DX) + VMOVDQU32 Z3, 192(DX) + VMOVDQU32 Z4, 256(DX) + VMOVDQU32 Z5, 320(DX) + VMOVDQU32 Z6, 384(DX) + VMOVDQU32 Z7, 448(DX) + VMOVDQU32 Z8, 512(DX) + VMOVDQU32 Z9, 576(DX) + VMOVDQU32 Z10, 640(DX) + VMOVDQU32 Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000060, BX + JMP int64MinusBlockLoop + +int64MinusTailLoop: + CMPQ BX, $0x00000008 + JL int64MinusDone + VMOVDQU32 (AX), Z0 + VPSUBQ (CX), Z0, Z0 + VMOVDQU32 Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000008, BX + JMP int64MinusTailLoop + +int64MinusDone: + RET + +// func int64MinusScalarAvx512Asm(x int64, y []int64, r []int64) +// Requires: AVX512F, SSE2 +TEXT ·int64MinusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVQ x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVQ AX, X0 + VPBROADCASTQ X0, Z0 + +int64MinusScalarBlockLoop: + CMPQ BX, $0x00000060 + JL int64MinusScalarTailLoop + VMOVDQU32 (CX), Z1 + VMOVDQU32 64(CX), Z2 + VMOVDQU32 128(CX), Z3 + VMOVDQU32 192(CX), Z4 + VMOVDQU32 256(CX), Z5 + VMOVDQU32 320(CX), Z6 + VMOVDQU32 384(CX), Z7 + VMOVDQU32 448(CX), Z8 + VMOVDQU32 512(CX), Z9 + VMOVDQU32 576(CX), Z10 + VMOVDQU32 640(CX), Z11 + VMOVDQU32 704(CX), Z12 + VPSUBQ Z0, Z1, Z1 + VPSUBQ Z0, Z2, Z2 + VPSUBQ Z0, Z3, Z3 + VPSUBQ Z0, Z4, Z4 + VPSUBQ Z0, Z5, Z5 + VPSUBQ Z0, Z6, Z6 + VPSUBQ Z0, Z7, Z7 + VPSUBQ Z0, Z8, Z8 + VPSUBQ Z0, Z9, Z9 + VPSUBQ Z0, Z10, Z10 + VPSUBQ Z0, Z11, Z11 + VPSUBQ Z0, Z12, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000060, BX + JMP int64MinusScalarBlockLoop + +int64MinusScalarTailLoop: + CMPQ BX, $0x00000008 + JL int64MinusScalarDone + VMOVDQU32 (CX), Z1 + VPSUBQ Z0, Z1, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000008, BX + JMP int64MinusScalarTailLoop + +int64MinusScalarDone: + RET + +// func int64MinusByScalarAvx512Asm(x int64, y []int64, r []int64) +// Requires: AVX512F, SSE2 +TEXT ·int64MinusByScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVQ x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVQ AX, X0 + VPBROADCASTQ X0, Z0 + +int64MinusByScalarBlockLoop: + CMPQ BX, $0x00000060 + JL int64MinusByScalarTailLoop + VPSUBQ (CX), Z0, Z1 + VPSUBQ 64(CX), Z0, Z2 + VPSUBQ 128(CX), Z0, Z3 + VPSUBQ 192(CX), Z0, Z4 + VPSUBQ 256(CX), Z0, Z5 + VPSUBQ 320(CX), Z0, Z6 + VPSUBQ 384(CX), Z0, Z7 + VPSUBQ 448(CX), Z0, Z8 + VPSUBQ 512(CX), Z0, Z9 + VPSUBQ 576(CX), Z0, Z10 + VPSUBQ 640(CX), Z0, Z11 + VPSUBQ 704(CX), Z0, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000060, BX + JMP int64MinusByScalarBlockLoop + +int64MinusByScalarTailLoop: + CMPQ BX, $0x00000008 + JL int64MinusByScalarDone + VPSUBQ (CX), Z0, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000008, BX + JMP int64MinusByScalarTailLoop + 
+int64MinusByScalarDone: + RET + +// func uint8MinusAvx512Asm(x []uint8, y []uint8, r []uint8) +// Requires: AVX512BW, AVX512F +TEXT ·uint8MinusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +uint8MinusBlockLoop: + CMPQ BX, $0x00000300 + JL uint8MinusTailLoop + VMOVDQU32 (AX), Z0 + VMOVDQU32 64(AX), Z1 + VMOVDQU32 128(AX), Z2 + VMOVDQU32 192(AX), Z3 + VMOVDQU32 256(AX), Z4 + VMOVDQU32 320(AX), Z5 + VMOVDQU32 384(AX), Z6 + VMOVDQU32 448(AX), Z7 + VMOVDQU32 512(AX), Z8 + VMOVDQU32 576(AX), Z9 + VMOVDQU32 640(AX), Z10 + VMOVDQU32 704(AX), Z11 + VPSUBB (CX), Z0, Z0 + VPSUBB 64(CX), Z1, Z1 + VPSUBB 128(CX), Z2, Z2 + VPSUBB 192(CX), Z3, Z3 + VPSUBB 256(CX), Z4, Z4 + VPSUBB 320(CX), Z5, Z5 + VPSUBB 384(CX), Z6, Z6 + VPSUBB 448(CX), Z7, Z7 + VPSUBB 512(CX), Z8, Z8 + VPSUBB 576(CX), Z9, Z9 + VPSUBB 640(CX), Z10, Z10 + VPSUBB 704(CX), Z11, Z11 + VMOVDQU32 Z0, (DX) + VMOVDQU32 Z1, 64(DX) + VMOVDQU32 Z2, 128(DX) + VMOVDQU32 Z3, 192(DX) + VMOVDQU32 Z4, 256(DX) + VMOVDQU32 Z5, 320(DX) + VMOVDQU32 Z6, 384(DX) + VMOVDQU32 Z7, 448(DX) + VMOVDQU32 Z8, 512(DX) + VMOVDQU32 Z9, 576(DX) + VMOVDQU32 Z10, 640(DX) + VMOVDQU32 Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000300, BX + JMP uint8MinusBlockLoop + +uint8MinusTailLoop: + CMPQ BX, $0x00000040 + JL uint8MinusDone + VMOVDQU32 (AX), Z0 + VPSUBB (CX), Z0, Z0 + VMOVDQU32 Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000040, BX + JMP uint8MinusTailLoop + +uint8MinusDone: + RET + +// func uint8MinusScalarAvx512Asm(x uint8, y []uint8, r []uint8) +// Requires: AVX512BW, AVX512F, SSE2 +TEXT ·uint8MinusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVBLZX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTB X0, Z0 + +uint8MinusScalarBlockLoop: + CMPQ BX, $0x00000300 + JL uint8MinusScalarTailLoop + VMOVDQU32 (CX), Z1 + VMOVDQU32 64(CX), Z2 + VMOVDQU32 128(CX), Z3 + VMOVDQU32 192(CX), Z4 + VMOVDQU32 256(CX), Z5 + VMOVDQU32 320(CX), Z6 + VMOVDQU32 384(CX), Z7 + VMOVDQU32 448(CX), Z8 + VMOVDQU32 512(CX), Z9 + VMOVDQU32 576(CX), Z10 + VMOVDQU32 640(CX), Z11 + VMOVDQU32 704(CX), Z12 + VPSUBB Z0, Z1, Z1 + VPSUBB Z0, Z2, Z2 + VPSUBB Z0, Z3, Z3 + VPSUBB Z0, Z4, Z4 + VPSUBB Z0, Z5, Z5 + VPSUBB Z0, Z6, Z6 + VPSUBB Z0, Z7, Z7 + VPSUBB Z0, Z8, Z8 + VPSUBB Z0, Z9, Z9 + VPSUBB Z0, Z10, Z10 + VPSUBB Z0, Z11, Z11 + VPSUBB Z0, Z12, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000300, BX + JMP uint8MinusScalarBlockLoop + +uint8MinusScalarTailLoop: + CMPQ BX, $0x00000040 + JL uint8MinusScalarDone + VMOVDQU32 (CX), Z1 + VPSUBB Z0, Z1, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000040, BX + JMP uint8MinusScalarTailLoop + +uint8MinusScalarDone: + RET + +// func uint8MinusByScalarAvx512Asm(x uint8, y []uint8, r []uint8) +// Requires: AVX512BW, AVX512F, SSE2 +TEXT ·uint8MinusByScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVBLZX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTB X0, Z0 + +uint8MinusByScalarBlockLoop: + CMPQ BX, $0x00000300 + JL uint8MinusByScalarTailLoop + 
VPSUBB (CX), Z0, Z1 + VPSUBB 64(CX), Z0, Z2 + VPSUBB 128(CX), Z0, Z3 + VPSUBB 192(CX), Z0, Z4 + VPSUBB 256(CX), Z0, Z5 + VPSUBB 320(CX), Z0, Z6 + VPSUBB 384(CX), Z0, Z7 + VPSUBB 448(CX), Z0, Z8 + VPSUBB 512(CX), Z0, Z9 + VPSUBB 576(CX), Z0, Z10 + VPSUBB 640(CX), Z0, Z11 + VPSUBB 704(CX), Z0, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000300, BX + JMP uint8MinusByScalarBlockLoop + +uint8MinusByScalarTailLoop: + CMPQ BX, $0x00000040 + JL uint8MinusByScalarDone + VPSUBB (CX), Z0, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000040, BX + JMP uint8MinusByScalarTailLoop + +uint8MinusByScalarDone: + RET + +// func uint16MinusAvx512Asm(x []uint16, y []uint16, r []uint16) +// Requires: AVX512BW, AVX512F +TEXT ·uint16MinusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +uint16MinusBlockLoop: + CMPQ BX, $0x00000180 + JL uint16MinusTailLoop + VMOVDQU32 (AX), Z0 + VMOVDQU32 64(AX), Z1 + VMOVDQU32 128(AX), Z2 + VMOVDQU32 192(AX), Z3 + VMOVDQU32 256(AX), Z4 + VMOVDQU32 320(AX), Z5 + VMOVDQU32 384(AX), Z6 + VMOVDQU32 448(AX), Z7 + VMOVDQU32 512(AX), Z8 + VMOVDQU32 576(AX), Z9 + VMOVDQU32 640(AX), Z10 + VMOVDQU32 704(AX), Z11 + VPSUBW (CX), Z0, Z0 + VPSUBW 64(CX), Z1, Z1 + VPSUBW 128(CX), Z2, Z2 + VPSUBW 192(CX), Z3, Z3 + VPSUBW 256(CX), Z4, Z4 + VPSUBW 320(CX), Z5, Z5 + VPSUBW 384(CX), Z6, Z6 + VPSUBW 448(CX), Z7, Z7 + VPSUBW 512(CX), Z8, Z8 + VPSUBW 576(CX), Z9, Z9 + VPSUBW 640(CX), Z10, Z10 + VPSUBW 704(CX), Z11, Z11 + VMOVDQU32 Z0, (DX) + VMOVDQU32 Z1, 64(DX) + VMOVDQU32 Z2, 128(DX) + VMOVDQU32 Z3, 192(DX) + VMOVDQU32 Z4, 256(DX) + VMOVDQU32 Z5, 320(DX) + VMOVDQU32 Z6, 384(DX) + VMOVDQU32 Z7, 448(DX) + VMOVDQU32 Z8, 512(DX) + VMOVDQU32 Z9, 576(DX) + VMOVDQU32 Z10, 640(DX) + VMOVDQU32 Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000180, BX + JMP uint16MinusBlockLoop + +uint16MinusTailLoop: + CMPQ BX, $0x00000020 + JL uint16MinusDone + VMOVDQU32 (AX), Z0 + VPSUBW (CX), Z0, Z0 + VMOVDQU32 Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000020, BX + JMP uint16MinusTailLoop + +uint16MinusDone: + RET + +// func uint16MinusScalarAvx512Asm(x uint16, y []uint16, r []uint16) +// Requires: AVX512BW, AVX512F, SSE2 +TEXT ·uint16MinusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVWLZX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTW X0, Z0 + +uint16MinusScalarBlockLoop: + CMPQ BX, $0x00000180 + JL uint16MinusScalarTailLoop + VMOVDQU32 (CX), Z1 + VMOVDQU32 64(CX), Z2 + VMOVDQU32 128(CX), Z3 + VMOVDQU32 192(CX), Z4 + VMOVDQU32 256(CX), Z5 + VMOVDQU32 320(CX), Z6 + VMOVDQU32 384(CX), Z7 + VMOVDQU32 448(CX), Z8 + VMOVDQU32 512(CX), Z9 + VMOVDQU32 576(CX), Z10 + VMOVDQU32 640(CX), Z11 + VMOVDQU32 704(CX), Z12 + VPSUBW Z0, Z1, Z1 + VPSUBW Z0, Z2, Z2 + VPSUBW Z0, Z3, Z3 + VPSUBW Z0, Z4, Z4 + VPSUBW Z0, Z5, Z5 + VPSUBW Z0, Z6, Z6 + VPSUBW Z0, Z7, Z7 + VPSUBW Z0, Z8, Z8 + VPSUBW Z0, Z9, Z9 + VPSUBW Z0, Z10, Z10 + VPSUBW Z0, Z11, Z11 + VPSUBW Z0, Z12, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 
256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000180, BX + JMP uint16MinusScalarBlockLoop + +uint16MinusScalarTailLoop: + CMPQ BX, $0x00000020 + JL uint16MinusScalarDone + VMOVDQU32 (CX), Z1 + VPSUBW Z0, Z1, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000020, BX + JMP uint16MinusScalarTailLoop + +uint16MinusScalarDone: + RET + +// func uint16MinusByScalarAvx512Asm(x uint16, y []uint16, r []uint16) +// Requires: AVX512BW, AVX512F, SSE2 +TEXT ·uint16MinusByScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVWLZX x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTW X0, Z0 + +uint16MinusByScalarBlockLoop: + CMPQ BX, $0x00000180 + JL uint16MinusByScalarTailLoop + VPSUBW (CX), Z0, Z1 + VPSUBW 64(CX), Z0, Z2 + VPSUBW 128(CX), Z0, Z3 + VPSUBW 192(CX), Z0, Z4 + VPSUBW 256(CX), Z0, Z5 + VPSUBW 320(CX), Z0, Z6 + VPSUBW 384(CX), Z0, Z7 + VPSUBW 448(CX), Z0, Z8 + VPSUBW 512(CX), Z0, Z9 + VPSUBW 576(CX), Z0, Z10 + VPSUBW 640(CX), Z0, Z11 + VPSUBW 704(CX), Z0, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000180, BX + JMP uint16MinusByScalarBlockLoop + +uint16MinusByScalarTailLoop: + CMPQ BX, $0x00000020 + JL uint16MinusByScalarDone + VPSUBW (CX), Z0, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000020, BX + JMP uint16MinusByScalarTailLoop + +uint16MinusByScalarDone: + RET + +// func uint32MinusAvx512Asm(x []uint32, y []uint32, r []uint32) +// Requires: AVX512F +TEXT ·uint32MinusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +uint32MinusBlockLoop: + CMPQ BX, $0x000000c0 + JL uint32MinusTailLoop + VMOVDQU32 (AX), Z0 + VMOVDQU32 64(AX), Z1 + VMOVDQU32 128(AX), Z2 + VMOVDQU32 192(AX), Z3 + VMOVDQU32 256(AX), Z4 + VMOVDQU32 320(AX), Z5 + VMOVDQU32 384(AX), Z6 + VMOVDQU32 448(AX), Z7 + VMOVDQU32 512(AX), Z8 + VMOVDQU32 576(AX), Z9 + VMOVDQU32 640(AX), Z10 + VMOVDQU32 704(AX), Z11 + VPSUBD (CX), Z0, Z0 + VPSUBD 64(CX), Z1, Z1 + VPSUBD 128(CX), Z2, Z2 + VPSUBD 192(CX), Z3, Z3 + VPSUBD 256(CX), Z4, Z4 + VPSUBD 320(CX), Z5, Z5 + VPSUBD 384(CX), Z6, Z6 + VPSUBD 448(CX), Z7, Z7 + VPSUBD 512(CX), Z8, Z8 + VPSUBD 576(CX), Z9, Z9 + VPSUBD 640(CX), Z10, Z10 + VPSUBD 704(CX), Z11, Z11 + VMOVDQU32 Z0, (DX) + VMOVDQU32 Z1, 64(DX) + VMOVDQU32 Z2, 128(DX) + VMOVDQU32 Z3, 192(DX) + VMOVDQU32 Z4, 256(DX) + VMOVDQU32 Z5, 320(DX) + VMOVDQU32 Z6, 384(DX) + VMOVDQU32 Z7, 448(DX) + VMOVDQU32 Z8, 512(DX) + VMOVDQU32 Z9, 576(DX) + VMOVDQU32 Z10, 640(DX) + VMOVDQU32 Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x000000c0, BX + JMP uint32MinusBlockLoop + +uint32MinusTailLoop: + CMPQ BX, $0x00000010 + JL uint32MinusDone + VMOVDQU32 (AX), Z0 + VPSUBD (CX), Z0, Z0 + VMOVDQU32 Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000010, BX + JMP uint32MinusTailLoop + +uint32MinusDone: + RET + +// func uint32MinusScalarAvx512Asm(x uint32, y []uint32, r []uint32) +// 
Requires: AVX512F, SSE2 +TEXT ·uint32MinusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVL x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTD X0, Z0 + +uint32MinusScalarBlockLoop: + CMPQ BX, $0x000000c0 + JL uint32MinusScalarTailLoop + VMOVDQU32 (CX), Z1 + VMOVDQU32 64(CX), Z2 + VMOVDQU32 128(CX), Z3 + VMOVDQU32 192(CX), Z4 + VMOVDQU32 256(CX), Z5 + VMOVDQU32 320(CX), Z6 + VMOVDQU32 384(CX), Z7 + VMOVDQU32 448(CX), Z8 + VMOVDQU32 512(CX), Z9 + VMOVDQU32 576(CX), Z10 + VMOVDQU32 640(CX), Z11 + VMOVDQU32 704(CX), Z12 + VPSUBD Z0, Z1, Z1 + VPSUBD Z0, Z2, Z2 + VPSUBD Z0, Z3, Z3 + VPSUBD Z0, Z4, Z4 + VPSUBD Z0, Z5, Z5 + VPSUBD Z0, Z6, Z6 + VPSUBD Z0, Z7, Z7 + VPSUBD Z0, Z8, Z8 + VPSUBD Z0, Z9, Z9 + VPSUBD Z0, Z10, Z10 + VPSUBD Z0, Z11, Z11 + VPSUBD Z0, Z12, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x000000c0, BX + JMP uint32MinusScalarBlockLoop + +uint32MinusScalarTailLoop: + CMPQ BX, $0x00000010 + JL uint32MinusScalarDone + VMOVDQU32 (CX), Z1 + VPSUBD Z0, Z1, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000010, BX + JMP uint32MinusScalarTailLoop + +uint32MinusScalarDone: + RET + +// func uint32MinusByScalarAvx512Asm(x uint32, y []uint32, r []uint32) +// Requires: AVX512F, SSE2 +TEXT ·uint32MinusByScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVL x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVD AX, X0 + VPBROADCASTD X0, Z0 + +uint32MinusByScalarBlockLoop: + CMPQ BX, $0x000000c0 + JL uint32MinusByScalarTailLoop + VPSUBD (CX), Z0, Z1 + VPSUBD 64(CX), Z0, Z2 + VPSUBD 128(CX), Z0, Z3 + VPSUBD 192(CX), Z0, Z4 + VPSUBD 256(CX), Z0, Z5 + VPSUBD 320(CX), Z0, Z6 + VPSUBD 384(CX), Z0, Z7 + VPSUBD 448(CX), Z0, Z8 + VPSUBD 512(CX), Z0, Z9 + VPSUBD 576(CX), Z0, Z10 + VPSUBD 640(CX), Z0, Z11 + VPSUBD 704(CX), Z0, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x000000c0, BX + JMP uint32MinusByScalarBlockLoop + +uint32MinusByScalarTailLoop: + CMPQ BX, $0x00000010 + JL uint32MinusByScalarDone + VPSUBD (CX), Z0, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000010, BX + JMP uint32MinusByScalarTailLoop + +uint32MinusByScalarDone: + RET + +// func uint64MinusAvx512Asm(x []uint64, y []uint64, r []uint64) +// Requires: AVX512F +TEXT ·uint64MinusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +uint64MinusBlockLoop: + CMPQ BX, $0x00000060 + JL uint64MinusTailLoop + VMOVDQU32 (AX), Z0 + VMOVDQU32 64(AX), Z1 + VMOVDQU32 128(AX), Z2 + VMOVDQU32 192(AX), Z3 + VMOVDQU32 256(AX), Z4 + VMOVDQU32 320(AX), Z5 + VMOVDQU32 384(AX), Z6 + VMOVDQU32 448(AX), Z7 + VMOVDQU32 512(AX), Z8 + VMOVDQU32 576(AX), Z9 + VMOVDQU32 640(AX), Z10 + VMOVDQU32 704(AX), Z11 + VPSUBQ (CX), Z0, Z0 + VPSUBQ 64(CX), Z1, Z1 + VPSUBQ 128(CX), Z2, Z2 + VPSUBQ 192(CX), Z3, Z3 + VPSUBQ 256(CX), Z4, Z4 + VPSUBQ 320(CX), Z5, 
Z5 + VPSUBQ 384(CX), Z6, Z6 + VPSUBQ 448(CX), Z7, Z7 + VPSUBQ 512(CX), Z8, Z8 + VPSUBQ 576(CX), Z9, Z9 + VPSUBQ 640(CX), Z10, Z10 + VPSUBQ 704(CX), Z11, Z11 + VMOVDQU32 Z0, (DX) + VMOVDQU32 Z1, 64(DX) + VMOVDQU32 Z2, 128(DX) + VMOVDQU32 Z3, 192(DX) + VMOVDQU32 Z4, 256(DX) + VMOVDQU32 Z5, 320(DX) + VMOVDQU32 Z6, 384(DX) + VMOVDQU32 Z7, 448(DX) + VMOVDQU32 Z8, 512(DX) + VMOVDQU32 Z9, 576(DX) + VMOVDQU32 Z10, 640(DX) + VMOVDQU32 Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000060, BX + JMP uint64MinusBlockLoop + +uint64MinusTailLoop: + CMPQ BX, $0x00000008 + JL uint64MinusDone + VMOVDQU32 (AX), Z0 + VPSUBQ (CX), Z0, Z0 + VMOVDQU32 Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000008, BX + JMP uint64MinusTailLoop + +uint64MinusDone: + RET + +// func uint64MinusScalarAvx512Asm(x uint64, y []uint64, r []uint64) +// Requires: AVX512F, SSE2 +TEXT ·uint64MinusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVQ x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVQ AX, X0 + VPBROADCASTQ X0, Z0 + +uint64MinusScalarBlockLoop: + CMPQ BX, $0x00000060 + JL uint64MinusScalarTailLoop + VMOVDQU32 (CX), Z1 + VMOVDQU32 64(CX), Z2 + VMOVDQU32 128(CX), Z3 + VMOVDQU32 192(CX), Z4 + VMOVDQU32 256(CX), Z5 + VMOVDQU32 320(CX), Z6 + VMOVDQU32 384(CX), Z7 + VMOVDQU32 448(CX), Z8 + VMOVDQU32 512(CX), Z9 + VMOVDQU32 576(CX), Z10 + VMOVDQU32 640(CX), Z11 + VMOVDQU32 704(CX), Z12 + VPSUBQ Z0, Z1, Z1 + VPSUBQ Z0, Z2, Z2 + VPSUBQ Z0, Z3, Z3 + VPSUBQ Z0, Z4, Z4 + VPSUBQ Z0, Z5, Z5 + VPSUBQ Z0, Z6, Z6 + VPSUBQ Z0, Z7, Z7 + VPSUBQ Z0, Z8, Z8 + VPSUBQ Z0, Z9, Z9 + VPSUBQ Z0, Z10, Z10 + VPSUBQ Z0, Z11, Z11 + VPSUBQ Z0, Z12, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000060, BX + JMP uint64MinusScalarBlockLoop + +uint64MinusScalarTailLoop: + CMPQ BX, $0x00000008 + JL uint64MinusScalarDone + VMOVDQU32 (CX), Z1 + VPSUBQ Z0, Z1, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000008, BX + JMP uint64MinusScalarTailLoop + +uint64MinusScalarDone: + RET + +// func uint64MinusByScalarAvx512Asm(x uint64, y []uint64, r []uint64) +// Requires: AVX512F, SSE2 +TEXT ·uint64MinusByScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVQ x+0(FP), AX + MOVQ y_base+8(FP), CX + MOVQ r_base+32(FP), DX + MOVQ y_len+16(FP), BX + MOVQ AX, X0 + VPBROADCASTQ X0, Z0 + +uint64MinusByScalarBlockLoop: + CMPQ BX, $0x00000060 + JL uint64MinusByScalarTailLoop + VPSUBQ (CX), Z0, Z1 + VPSUBQ 64(CX), Z0, Z2 + VPSUBQ 128(CX), Z0, Z3 + VPSUBQ 192(CX), Z0, Z4 + VPSUBQ 256(CX), Z0, Z5 + VPSUBQ 320(CX), Z0, Z6 + VPSUBQ 384(CX), Z0, Z7 + VPSUBQ 448(CX), Z0, Z8 + VPSUBQ 512(CX), Z0, Z9 + VPSUBQ 576(CX), Z0, Z10 + VPSUBQ 640(CX), Z0, Z11 + VPSUBQ 704(CX), Z0, Z12 + VMOVDQU32 Z1, (DX) + VMOVDQU32 Z2, 64(DX) + VMOVDQU32 Z3, 128(DX) + VMOVDQU32 Z4, 192(DX) + VMOVDQU32 Z5, 256(DX) + VMOVDQU32 Z6, 320(DX) + VMOVDQU32 Z7, 384(DX) + VMOVDQU32 Z8, 448(DX) + VMOVDQU32 Z9, 512(DX) + VMOVDQU32 Z10, 576(DX) + VMOVDQU32 Z11, 640(DX) + VMOVDQU32 Z12, 704(DX) + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000060, BX + JMP uint64MinusByScalarBlockLoop + +uint64MinusByScalarTailLoop: + CMPQ BX, $0x00000008 + JL uint64MinusByScalarDone + 
VPSUBQ (CX), Z0, Z1 + VMOVDQU32 Z1, (DX) + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000008, BX + JMP uint64MinusByScalarTailLoop + +uint64MinusByScalarDone: + RET + +// func float32MinusAvx512Asm(x []float32, y []float32, r []float32) +// Requires: AVX512F +TEXT ·float32MinusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +float32MinusBlockLoop: + CMPQ BX, $0x000000c0 + JL float32MinusTailLoop + VMOVUPS (AX), Z0 + VMOVUPS 64(AX), Z1 + VMOVUPS 128(AX), Z2 + VMOVUPS 192(AX), Z3 + VMOVUPS 256(AX), Z4 + VMOVUPS 320(AX), Z5 + VMOVUPS 384(AX), Z6 + VMOVUPS 448(AX), Z7 + VMOVUPS 512(AX), Z8 + VMOVUPS 576(AX), Z9 + VMOVUPS 640(AX), Z10 + VMOVUPS 704(AX), Z11 + VSUBPS (CX), Z0, Z0 + VSUBPS 64(CX), Z1, Z1 + VSUBPS 128(CX), Z2, Z2 + VSUBPS 192(CX), Z3, Z3 + VSUBPS 256(CX), Z4, Z4 + VSUBPS 320(CX), Z5, Z5 + VSUBPS 384(CX), Z6, Z6 + VSUBPS 448(CX), Z7, Z7 + VSUBPS 512(CX), Z8, Z8 + VSUBPS 576(CX), Z9, Z9 + VSUBPS 640(CX), Z10, Z10 + VSUBPS 704(CX), Z11, Z11 + VMOVUPS Z0, (DX) + VMOVUPS Z1, 64(DX) + VMOVUPS Z2, 128(DX) + VMOVUPS Z3, 192(DX) + VMOVUPS Z4, 256(DX) + VMOVUPS Z5, 320(DX) + VMOVUPS Z6, 384(DX) + VMOVUPS Z7, 448(DX) + VMOVUPS Z8, 512(DX) + VMOVUPS Z9, 576(DX) + VMOVUPS Z10, 640(DX) + VMOVUPS Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x000000c0, BX + JMP float32MinusBlockLoop + +float32MinusTailLoop: + CMPQ BX, $0x00000010 + JL float32MinusDone + VMOVUPS (AX), Z0 + VSUBPS (CX), Z0, Z0 + VMOVUPS Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000010, BX + JMP float32MinusTailLoop + +float32MinusDone: + RET + +// func float32MinusScalarAvx512Asm(x float32, y []float32, r []float32) +// Requires: AVX512F, SSE +TEXT ·float32MinusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVSS x+0(FP), X0 + MOVQ y_base+8(FP), AX + MOVQ r_base+32(FP), CX + MOVQ y_len+16(FP), DX + VBROADCASTSS X0, Z0 + +float32MinusScalarBlockLoop: + CMPQ DX, $0x000000c0 + JL float32MinusScalarTailLoop + VMOVUPS (AX), Z1 + VMOVUPS 64(AX), Z2 + VMOVUPS 128(AX), Z3 + VMOVUPS 192(AX), Z4 + VMOVUPS 256(AX), Z5 + VMOVUPS 320(AX), Z6 + VMOVUPS 384(AX), Z7 + VMOVUPS 448(AX), Z8 + VMOVUPS 512(AX), Z9 + VMOVUPS 576(AX), Z10 + VMOVUPS 640(AX), Z11 + VMOVUPS 704(AX), Z12 + VSUBPS Z0, Z1, Z1 + VSUBPS Z0, Z2, Z2 + VSUBPS Z0, Z3, Z3 + VSUBPS Z0, Z4, Z4 + VSUBPS Z0, Z5, Z5 + VSUBPS Z0, Z6, Z6 + VSUBPS Z0, Z7, Z7 + VSUBPS Z0, Z8, Z8 + VSUBPS Z0, Z9, Z9 + VSUBPS Z0, Z10, Z10 + VSUBPS Z0, Z11, Z11 + VSUBPS Z0, Z12, Z12 + VMOVUPS Z1, (CX) + VMOVUPS Z2, 64(CX) + VMOVUPS Z3, 128(CX) + VMOVUPS Z4, 192(CX) + VMOVUPS Z5, 256(CX) + VMOVUPS Z6, 320(CX) + VMOVUPS Z7, 384(CX) + VMOVUPS Z8, 448(CX) + VMOVUPS Z9, 512(CX) + VMOVUPS Z10, 576(CX) + VMOVUPS Z11, 640(CX) + VMOVUPS Z12, 704(CX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + SUBQ $0x000000c0, DX + JMP float32MinusScalarBlockLoop + +float32MinusScalarTailLoop: + CMPQ DX, $0x00000010 + JL float32MinusScalarDone + VMOVUPS (AX), Z1 + VSUBPS Z0, Z1, Z1 + VMOVUPS Z1, (CX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + SUBQ $0x00000010, DX + JMP float32MinusScalarTailLoop + +float32MinusScalarDone: + RET + +// func float32MinusByScalarAvx512Asm(x float32, y []float32, r []float32) +// Requires: AVX512F, SSE +TEXT ·float32MinusByScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVSS x+0(FP), X0 + MOVQ y_base+8(FP), AX + MOVQ r_base+32(FP), CX + MOVQ y_len+16(FP), DX + VBROADCASTSS X0, Z0 + +float32MinusByScalarBlockLoop: + CMPQ DX, 
$0x000000c0 + JL float32MinusByScalarTailLoop + VSUBPS (AX), Z0, Z1 + VSUBPS 64(AX), Z0, Z2 + VSUBPS 128(AX), Z0, Z3 + VSUBPS 192(AX), Z0, Z4 + VSUBPS 256(AX), Z0, Z5 + VSUBPS 320(AX), Z0, Z6 + VSUBPS 384(AX), Z0, Z7 + VSUBPS 448(AX), Z0, Z8 + VSUBPS 512(AX), Z0, Z9 + VSUBPS 576(AX), Z0, Z10 + VSUBPS 640(AX), Z0, Z11 + VSUBPS 704(AX), Z0, Z12 + VMOVUPS Z1, (CX) + VMOVUPS Z2, 64(CX) + VMOVUPS Z3, 128(CX) + VMOVUPS Z4, 192(CX) + VMOVUPS Z5, 256(CX) + VMOVUPS Z6, 320(CX) + VMOVUPS Z7, 384(CX) + VMOVUPS Z8, 448(CX) + VMOVUPS Z9, 512(CX) + VMOVUPS Z10, 576(CX) + VMOVUPS Z11, 640(CX) + VMOVUPS Z12, 704(CX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + SUBQ $0x000000c0, DX + JMP float32MinusByScalarBlockLoop + +float32MinusByScalarTailLoop: + CMPQ DX, $0x00000010 + JL float32MinusByScalarDone + VSUBPS (AX), Z0, Z1 + VMOVUPS Z1, (CX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + SUBQ $0x00000010, DX + JMP float32MinusByScalarTailLoop + +float32MinusByScalarDone: + RET + +// func float64MinusAvx512Asm(x []float64, y []float64, r []float64) +// Requires: AVX512F +TEXT ·float64MinusAvx512Asm(SB), NOSPLIT, $0-72 + MOVQ x_base+0(FP), AX + MOVQ y_base+24(FP), CX + MOVQ r_base+48(FP), DX + MOVQ x_len+8(FP), BX + +float64MinusBlockLoop: + CMPQ BX, $0x00000060 + JL float64MinusTailLoop + VMOVUPD (AX), Z0 + VMOVUPD 64(AX), Z1 + VMOVUPD 128(AX), Z2 + VMOVUPD 192(AX), Z3 + VMOVUPD 256(AX), Z4 + VMOVUPD 320(AX), Z5 + VMOVUPD 384(AX), Z6 + VMOVUPD 448(AX), Z7 + VMOVUPD 512(AX), Z8 + VMOVUPD 576(AX), Z9 + VMOVUPD 640(AX), Z10 + VMOVUPD 704(AX), Z11 + VSUBPD (CX), Z0, Z0 + VSUBPD 64(CX), Z1, Z1 + VSUBPD 128(CX), Z2, Z2 + VSUBPD 192(CX), Z3, Z3 + VSUBPD 256(CX), Z4, Z4 + VSUBPD 320(CX), Z5, Z5 + VSUBPD 384(CX), Z6, Z6 + VSUBPD 448(CX), Z7, Z7 + VSUBPD 512(CX), Z8, Z8 + VSUBPD 576(CX), Z9, Z9 + VSUBPD 640(CX), Z10, Z10 + VSUBPD 704(CX), Z11, Z11 + VMOVUPD Z0, (DX) + VMOVUPD Z1, 64(DX) + VMOVUPD Z2, 128(DX) + VMOVUPD Z3, 192(DX) + VMOVUPD Z4, 256(DX) + VMOVUPD Z5, 320(DX) + VMOVUPD Z6, 384(DX) + VMOVUPD Z7, 448(DX) + VMOVUPD Z8, 512(DX) + VMOVUPD Z9, 576(DX) + VMOVUPD Z10, 640(DX) + VMOVUPD Z11, 704(DX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + ADDQ $0x00000300, DX + SUBQ $0x00000060, BX + JMP float64MinusBlockLoop + +float64MinusTailLoop: + CMPQ BX, $0x00000008 + JL float64MinusDone + VMOVUPD (AX), Z0 + VSUBPD (CX), Z0, Z0 + VMOVUPD Z0, (DX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + ADDQ $0x00000040, DX + SUBQ $0x00000008, BX + JMP float64MinusTailLoop + +float64MinusDone: + RET + +// func float64MinusScalarAvx512Asm(x float64, y []float64, r []float64) +// Requires: AVX512F, SSE2 +TEXT ·float64MinusScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVSD x+0(FP), X0 + MOVQ y_base+8(FP), AX + MOVQ r_base+32(FP), CX + MOVQ y_len+16(FP), DX + VBROADCASTSD X0, Z0 + +float64MinusScalarBlockLoop: + CMPQ DX, $0x00000060 + JL float64MinusScalarTailLoop + VMOVUPD (AX), Z1 + VMOVUPD 64(AX), Z2 + VMOVUPD 128(AX), Z3 + VMOVUPD 192(AX), Z4 + VMOVUPD 256(AX), Z5 + VMOVUPD 320(AX), Z6 + VMOVUPD 384(AX), Z7 + VMOVUPD 448(AX), Z8 + VMOVUPD 512(AX), Z9 + VMOVUPD 576(AX), Z10 + VMOVUPD 640(AX), Z11 + VMOVUPD 704(AX), Z12 + VSUBPD Z0, Z1, Z1 + VSUBPD Z0, Z2, Z2 + VSUBPD Z0, Z3, Z3 + VSUBPD Z0, Z4, Z4 + VSUBPD Z0, Z5, Z5 + VSUBPD Z0, Z6, Z6 + VSUBPD Z0, Z7, Z7 + VSUBPD Z0, Z8, Z8 + VSUBPD Z0, Z9, Z9 + VSUBPD Z0, Z10, Z10 + VSUBPD Z0, Z11, Z11 + VSUBPD Z0, Z12, Z12 + VMOVUPD Z1, (CX) + VMOVUPD Z2, 64(CX) + VMOVUPD Z3, 128(CX) + VMOVUPD Z4, 192(CX) + VMOVUPD Z5, 256(CX) + VMOVUPD Z6, 320(CX) + VMOVUPD Z7, 384(CX) + VMOVUPD Z8, 
448(CX) + VMOVUPD Z9, 512(CX) + VMOVUPD Z10, 576(CX) + VMOVUPD Z11, 640(CX) + VMOVUPD Z12, 704(CX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + SUBQ $0x00000060, DX + JMP float64MinusScalarBlockLoop + +float64MinusScalarTailLoop: + CMPQ DX, $0x00000008 + JL float64MinusScalarDone + VMOVUPD (AX), Z1 + VSUBPD Z0, Z1, Z1 + VMOVUPD Z1, (CX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + SUBQ $0x00000008, DX + JMP float64MinusScalarTailLoop + +float64MinusScalarDone: + RET + +// func float64MinusByScalarAvx512Asm(x float64, y []float64, r []float64) +// Requires: AVX512F, SSE2 +TEXT ·float64MinusByScalarAvx512Asm(SB), NOSPLIT, $0-56 + MOVSD x+0(FP), X0 + MOVQ y_base+8(FP), AX + MOVQ r_base+32(FP), CX + MOVQ y_len+16(FP), DX + VBROADCASTSD X0, Z0 + +float64MinusByScalarBlockLoop: + CMPQ DX, $0x00000060 + JL float64MinusByScalarTailLoop + VSUBPD (AX), Z0, Z1 + VSUBPD 64(AX), Z0, Z2 + VSUBPD 128(AX), Z0, Z3 + VSUBPD 192(AX), Z0, Z4 + VSUBPD 256(AX), Z0, Z5 + VSUBPD 320(AX), Z0, Z6 + VSUBPD 384(AX), Z0, Z7 + VSUBPD 448(AX), Z0, Z8 + VSUBPD 512(AX), Z0, Z9 + VSUBPD 576(AX), Z0, Z10 + VSUBPD 640(AX), Z0, Z11 + VSUBPD 704(AX), Z0, Z12 + VMOVUPD Z1, (CX) + VMOVUPD Z2, 64(CX) + VMOVUPD Z3, 128(CX) + VMOVUPD Z4, 192(CX) + VMOVUPD Z5, 256(CX) + VMOVUPD Z6, 320(CX) + VMOVUPD Z7, 384(CX) + VMOVUPD Z8, 448(CX) + VMOVUPD Z9, 512(CX) + VMOVUPD Z10, 576(CX) + VMOVUPD Z11, 640(CX) + VMOVUPD Z12, 704(CX) + ADDQ $0x00000300, AX + ADDQ $0x00000300, CX + SUBQ $0x00000060, DX + JMP float64MinusByScalarBlockLoop + +float64MinusByScalarTailLoop: + CMPQ DX, $0x00000008 + JL float64MinusByScalarDone + VSUBPD (AX), Z0, Z1 + VMOVUPD Z1, (CX) + ADDQ $0x00000040, AX + ADDQ $0x00000040, CX + SUBQ $0x00000008, DX + JMP float64MinusByScalarTailLoop + +float64MinusByScalarDone: + RET diff --git a/pkg/vectorize/sub/avx512_stubs.go b/pkg/vectorize/sub/avx512_stubs.go new file mode 100644 index 0000000000000000000000000000000000000000..4f9ab4b231419952b2e760a3fd11730a84affcb3 --- /dev/null +++ b/pkg/vectorize/sub/avx512_stubs.go @@ -0,0 +1,63 @@ +// Code generated by command: go run avx512.go -out minus/avx512.s -stubs minus/avx512_stubs.go. DO NOT EDIT. 
+ +package vectorize + +func int8MinusAvx512Asm(x []int8, y []int8, r []int8) + +func int8MinusScalarAvx512Asm(x int8, y []int8, r []int8) + +func int8MinusByScalarAvx512Asm(x int8, y []int8, r []int8) + +func int16MinusAvx512Asm(x []int16, y []int16, r []int16) + +func int16MinusScalarAvx512Asm(x int16, y []int16, r []int16) + +func int16MinusByScalarAvx512Asm(x int16, y []int16, r []int16) + +func int32MinusAvx512Asm(x []int32, y []int32, r []int32) + +func int32MinusScalarAvx512Asm(x int32, y []int32, r []int32) + +func int32MinusByScalarAvx512Asm(x int32, y []int32, r []int32) + +func int64MinusAvx512Asm(x []int64, y []int64, r []int64) + +func int64MinusScalarAvx512Asm(x int64, y []int64, r []int64) + +func int64MinusByScalarAvx512Asm(x int64, y []int64, r []int64) + +func uint8MinusAvx512Asm(x []uint8, y []uint8, r []uint8) + +func uint8MinusScalarAvx512Asm(x uint8, y []uint8, r []uint8) + +func uint8MinusByScalarAvx512Asm(x uint8, y []uint8, r []uint8) + +func uint16MinusAvx512Asm(x []uint16, y []uint16, r []uint16) + +func uint16MinusScalarAvx512Asm(x uint16, y []uint16, r []uint16) + +func uint16MinusByScalarAvx512Asm(x uint16, y []uint16, r []uint16) + +func uint32MinusAvx512Asm(x []uint32, y []uint32, r []uint32) + +func uint32MinusScalarAvx512Asm(x uint32, y []uint32, r []uint32) + +func uint32MinusByScalarAvx512Asm(x uint32, y []uint32, r []uint32) + +func uint64MinusAvx512Asm(x []uint64, y []uint64, r []uint64) + +func uint64MinusScalarAvx512Asm(x uint64, y []uint64, r []uint64) + +func uint64MinusByScalarAvx512Asm(x uint64, y []uint64, r []uint64) + +func float32MinusAvx512Asm(x []float32, y []float32, r []float32) + +func float32MinusScalarAvx512Asm(x float32, y []float32, r []float32) + +func float32MinusByScalarAvx512Asm(x float32, y []float32, r []float32) + +func float64MinusAvx512Asm(x []float64, y []float64, r []float64) + +func float64MinusScalarAvx512Asm(x float64, y []float64, r []float64) + +func float64MinusByScalarAvx512Asm(x float64, y []float64, r []float64) diff --git a/pkg/vectorize/sub/sub.go b/pkg/vectorize/sub/sub.go new file mode 100644 index 0000000000000000000000000000000000000000..51f49b1fd3f44ca11033077666a675cbd3636dac --- /dev/null +++ b/pkg/vectorize/sub/sub.go @@ -0,0 +1,2143 @@ +package minus + +import "golang.org/x/sys/cpu" + +var ( + int8Minus func([]int8, []int8, []int8) []int8 + int8MinusSels func([]int8, []int8, []int8, []int64) []int8 + int8MinusScalar func(int8, []int8, []int8) []int8 + int8MinusScalarSels func(int8, []int8, []int8, []int64) []int8 + int8MinusByScalar func(int8, []int8, []int8) []int8 + int8MinusByScalarSels func(int8, []int8, []int8, []int64) []int8 + int16Minus func([]int16, []int16, []int16) []int16 + int16MinusSels func([]int16, []int16, []int16, []int64) []int16 + int16MinusScalar func(int16, []int16, []int16) []int16 + int16MinusScalarSels func(int16, []int16, []int16, []int64) []int16 + int16MinusByScalar func(int16, []int16, []int16) []int16 + int16MinusByScalarSels func(int16, []int16, []int16, []int64) []int16 + int32Minus func([]int32, []int32, []int32) []int32 + int32MinusSels func([]int32, []int32, []int32, []int64) []int32 + int32MinusScalar func(int32, []int32, []int32) []int32 + int32MinusScalarSels func(int32, []int32, []int32, []int64) []int32 + int32MinusByScalar func(int32, []int32, []int32) []int32 + int32MinusByScalarSels func(int32, []int32, []int32, []int64) []int32 + int64Minus func([]int64, []int64, []int64) []int64 + int64MinusSels func([]int64, []int64, []int64, []int64) []int64 + 
int64MinusScalar func(int64, []int64, []int64) []int64 + int64MinusScalarSels func(int64, []int64, []int64, []int64) []int64 + int64MinusByScalar func(int64, []int64, []int64) []int64 + int64MinusByScalarSels func(int64, []int64, []int64, []int64) []int64 + uint8Minus func([]uint8, []uint8, []uint8) []uint8 + uint8MinusSels func([]uint8, []uint8, []uint8, []int64) []uint8 + uint8MinusScalar func(uint8, []uint8, []uint8) []uint8 + uint8MinusScalarSels func(uint8, []uint8, []uint8, []int64) []uint8 + uint8MinusByScalar func(uint8, []uint8, []uint8) []uint8 + uint8MinusByScalarSels func(uint8, []uint8, []uint8, []int64) []uint8 + uint16Minus func([]uint16, []uint16, []uint16) []uint16 + uint16MinusSels func([]uint16, []uint16, []uint16, []int64) []uint16 + uint16MinusScalar func(uint16, []uint16, []uint16) []uint16 + uint16MinusScalarSels func(uint16, []uint16, []uint16, []int64) []uint16 + uint16MinusByScalar func(uint16, []uint16, []uint16) []uint16 + uint16MinusByScalarSels func(uint16, []uint16, []uint16, []int64) []uint16 + uint32Minus func([]uint32, []uint32, []uint32) []uint32 + uint32MinusSels func([]uint32, []uint32, []uint32, []int64) []uint32 + uint32MinusScalar func(uint32, []uint32, []uint32) []uint32 + uint32MinusScalarSels func(uint32, []uint32, []uint32, []int64) []uint32 + uint32MinusByScalar func(uint32, []uint32, []uint32) []uint32 + uint32MinusByScalarSels func(uint32, []uint32, []uint32, []int64) []uint32 + uint64Minus func([]uint64, []uint64, []uint64) []uint64 + uint64MinusSels func([]uint64, []uint64, []uint64, []int64) []uint64 + uint64MinusScalar func(uint64, []uint64, []uint64) []uint64 + uint64MinusScalarSels func(uint64, []uint64, []uint64, []int64) []uint64 + uint64MinusByScalar func(uint64, []uint64, []uint64) []uint64 + uint64MinusByScalarSels func(uint64, []uint64, []uint64, []int64) []uint64 + float32Minus func([]float32, []float32, []float32) []float32 + float32MinusSels func([]float32, []float32, []float32, []int64) []float32 + float32MinusScalar func(float32, []float32, []float32) []float32 + float32MinusScalarSels func(float32, []float32, []float32, []int64) []float32 + float32MinusByScalar func(float32, []float32, []float32) []float32 + float32MinusByScalarSels func(float32, []float32, []float32, []int64) []float32 + float64Minus func([]float64, []float64, []float64) []float64 + float64MinusSels func([]float64, []float64, []float64, []int64) []float64 + float64MinusScalar func(float64, []float64, []float64) []float64 + float64MinusScalarSels func(float64, []float64, []float64, []int64) []float64 + float64MinusByScalar func(float64, []float64, []float64) []float64 + float64MinusByScalarSels func(float64, []float64, []float64, []int64) []float64 +) + +func init() { + if cpu.X86.HasAVX512 { + int8Minus = int8MinusAvx512 + //int8MinusSels = int8MinusSelsAvx512 + int8MinusScalar = int8MinusScalarAvx512 + //int8MinusScalarSels = int8MinusScalarSelsAvx512 + int8MinusByScalar = int8MinusByScalarAvx512 + //int8MinusByScalarSels = int8MinusByScalarSelsAvx512 + int16Minus = int16MinusAvx512 + //int16MinusSels = int16MinusSelsAvx512 + int16MinusScalar = int16MinusScalarAvx512 + //int16MinusScalarSels = int16MinusScalarSelsAvx512 + int16MinusByScalar = int16MinusByScalarAvx512 + //int16MinusByScalarSels = int16MinusByScalarSelsAvx512 + int32Minus = int32MinusAvx512 + //int32MinusSels = int32MinusSelsAvx512 + int32MinusScalar = int32MinusScalarAvx512 + //int32MinusScalarSels = int32MinusScalarSelsAvx512 + int32MinusByScalar = int32MinusByScalarAvx512 + 
//int32MinusByScalarSels = int32MinusByScalarSelsAvx512 + int64Minus = int64MinusAvx512 + //int64MinusSels = int64MinusSelsAvx512 + int64MinusScalar = int64MinusScalarAvx512 + //int64MinusScalarSels = int64MinusScalarSelsAvx512 + int64MinusByScalar = int64MinusByScalarAvx512 + //int64MinusByScalarSels = int64MinusByScalarSelsAvx512 + uint8Minus = uint8MinusAvx512 + //uint8MinusSels = uint8MinusSelsAvx512 + uint8MinusScalar = uint8MinusScalarAvx512 + //uint8MinusScalarSels = uint8MinusScalarSelsAvx512 + uint8MinusByScalar = uint8MinusByScalarAvx512 + //uint8MinusByScalarSels = uint8MinusByScalarSelsAvx512 + uint16Minus = uint16MinusAvx512 + //uint16MinusSels = uint16MinusSelsAvx512 + uint16MinusScalar = uint16MinusScalarAvx512 + //uint16MinusScalarSels = uint16MinusScalarSelsAvx512 + uint16MinusByScalar = uint16MinusByScalarAvx512 + //uint16MinusByScalarSels = uint16MinusByScalarSelsAvx512 + uint32Minus = uint32MinusAvx512 + //uint32MinusSels = uint32MinusSelsAvx512 + uint32MinusScalar = uint32MinusScalarAvx512 + //uint32MinusScalarSels = uint32MinusScalarSelsAvx512 + uint32MinusByScalar = uint32MinusByScalarAvx512 + //uint32MinusByScalarSels = uint32MinusByScalarSelsAvx512 + uint64Minus = uint64MinusAvx512 + //uint64MinusSels = uint64MinusSelsAvx512 + uint64MinusScalar = uint64MinusScalarAvx512 + //uint64MinusScalarSels = uint64MinusScalarSelsAvx512 + uint64MinusByScalar = uint64MinusByScalarAvx512 + //uint64MinusByScalarSels = uint64MinusByScalarSelsAvx512 + float32Minus = float32MinusAvx512 + //float32MinusSels = float32MinusSelsAvx512 + float32MinusScalar = float32MinusScalarAvx512 + //float32MinusScalarSels = float32MinusScalarSelsAvx512 + float32MinusByScalar = float32MinusByScalarAvx512 + //float32MinusByScalarSels = float32MinusByScalarSelsAvx512 + float64Minus = float64MinusAvx512 + //float64MinusSels = float64MinusSelsAvx512 + float64MinusScalar = float64MinusScalarAvx512 + //float64MinusScalarSels = float64MinusScalarSelsAvx512 + float64MinusByScalar = float64MinusByScalarAvx512 + //float64MinusByScalarSels = float64MinusByScalarSelsAvx512 + } else if cpu.X86.HasAVX2 { + int8Minus = int8MinusAvx2 + //int8MinusSels = int8MinusSelsAvx2 + int8MinusScalar = int8MinusScalarAvx2 + //int8MinusScalarSels = int8MinusScalarSelsAvx2 + int8MinusByScalar = int8MinusByScalarAvx2 + //int8MinusByScalarSels = int8MinusByScalarSelsAvx2 + int16Minus = int16MinusAvx2 + //int16MinusSels = int16MinusSelsAvx2 + int16MinusScalar = int16MinusScalarAvx2 + //int16MinusScalarSels = int16MinusScalarSelsAvx2 + int16MinusByScalar = int16MinusByScalarAvx2 + //int16MinusByScalarSels = int16MinusByScalarSelsAvx2 + int32Minus = int32MinusAvx2 + //int32MinusSels = int32MinusSelsAvx2 + int32MinusScalar = int32MinusScalarAvx2 + //int32MinusScalarSels = int32MinusScalarSelsAvx2 + int32MinusByScalar = int32MinusByScalarAvx2 + //int32MinusByScalarSels = int32MinusByScalarSelsAvx2 + int64Minus = int64MinusAvx2 + //int64MinusSels = int64MinusSelsAvx2 + int64MinusScalar = int64MinusScalarAvx2 + //int64MinusScalarSels = int64MinusScalarSelsAvx2 + int64MinusByScalar = int64MinusByScalarAvx2 + //int64MinusByScalarSels = int64MinusByScalarSelsAvx2 + uint8Minus = uint8MinusAvx2 + //uint8MinusSels = uint8MinusSelsAvx2 + uint8MinusScalar = uint8MinusScalarAvx2 + //uint8MinusScalarSels = uint8MinusScalarSelsAvx2 + uint8MinusByScalar = uint8MinusByScalarAvx2 + //uint8MinusByScalarSels = uint8MinusByScalarSelsAvx2 + uint16Minus = uint16MinusAvx2 + //uint16MinusSels = uint16MinusSelsAvx2 + uint16MinusScalar = uint16MinusScalarAvx2 + 
//uint16MinusScalarSels = uint16MinusScalarSelsAvx2 + uint16MinusByScalar = uint16MinusByScalarAvx2 + //uint16MinusByScalarSels = uint16MinusByScalarSelsAvx2 + uint32Minus = uint32MinusAvx2 + //uint32MinusSels = uint32MinusSelsAvx2 + uint32MinusScalar = uint32MinusScalarAvx2 + //uint32MinusScalarSels = uint32MinusScalarSelsAvx2 + uint32MinusByScalar = uint32MinusByScalarAvx2 + //uint32MinusByScalarSels = uint32MinusByScalarSelsAvx2 + uint64Minus = uint64MinusAvx2 + //uint64MinusSels = uint64MinusSelsAvx2 + uint64MinusScalar = uint64MinusScalarAvx2 + //uint64MinusScalarSels = uint64MinusScalarSelsAvx2 + uint64MinusByScalar = uint64MinusByScalarAvx2 + //uint64MinusByScalarSels = uint64MinusByScalarSelsAvx2 + float32Minus = float32MinusAvx2 + //float32MinusSels = float32MinusSelsAvx2 + float32MinusScalar = float32MinusScalarAvx2 + //float32MinusScalarSels = float32MinusScalarSelsAvx2 + float32MinusByScalar = float32MinusByScalarAvx2 + //float32MinusByScalarSels = float32MinusByScalarSelsAvx2 + float64Minus = float64MinusAvx2 + //float64MinusSels = float64MinusSelsAvx2 + float64MinusScalar = float64MinusScalarAvx2 + //float64MinusScalarSels = float64MinusScalarSelsAvx2 + float64MinusByScalar = float64MinusByScalarAvx2 + //float64MinusByScalarSels = float64MinusByScalarSelsAvx2 + } else { + int8Minus = int8MinusPure + //int8MinusSels = int8MinusSelsPure + int8MinusScalar = int8MinusScalarPure + //int8MinusScalarSels = int8MinusScalarSelsPure + int8MinusByScalar = int8MinusByScalarPure + //int8MinusByScalarSels = int8MinusByScalarSelsPure + int16Minus = int16MinusPure + //int16MinusSels = int16MinusSelsPure + int16MinusScalar = int16MinusScalarPure + //int16MinusScalarSels = int16MinusScalarSelsPure + int16MinusByScalar = int16MinusByScalarPure + //int16MinusByScalarSels = int16MinusByScalarSelsPure + int32Minus = int32MinusPure + //int32MinusSels = int32MinusSelsPure + int32MinusScalar = int32MinusScalarPure + //int32MinusScalarSels = int32MinusScalarSelsPure + int32MinusByScalar = int32MinusByScalarPure + //int32MinusByScalarSels = int32MinusByScalarSelsPure + int64Minus = int64MinusPure + //int64MinusSels = int64MinusSelsPure + int64MinusScalar = int64MinusScalarPure + //int64MinusScalarSels = int64MinusScalarSelsPure + int64MinusByScalar = int64MinusByScalarPure + //int64MinusByScalarSels = int64MinusByScalarSelsPure + uint8Minus = uint8MinusPure + //uint8MinusSels = uint8MinusSelsPure + uint8MinusScalar = uint8MinusScalarPure + //uint8MinusScalarSels = uint8MinusScalarSelsPure + uint8MinusByScalar = uint8MinusByScalarPure + //uint8MinusByScalarSels = uint8MinusByScalarSelsPure + uint16Minus = uint16MinusPure + //uint16MinusSels = uint16MinusSelsPure + uint16MinusScalar = uint16MinusScalarPure + //uint16MinusScalarSels = uint16MinusScalarSelsPure + uint16MinusByScalar = uint16MinusByScalarPure + //uint16MinusByScalarSels = uint16MinusByScalarSelsPure + uint32Minus = uint32MinusPure + //uint32MinusSels = uint32MinusSelsPure + uint32MinusScalar = uint32MinusScalarPure + //uint32MinusScalarSels = uint32MinusScalarSelsPure + uint32MinusByScalar = uint32MinusByScalarPure + //uint32MinusByScalarSels = uint32MinusByScalarSelsPure + uint64Minus = uint64MinusPure + //uint64MinusSels = uint64MinusSelsPure + uint64MinusScalar = uint64MinusScalarPure + //uint64MinusScalarSels = uint64MinusScalarSelsPure + uint64MinusByScalar = uint64MinusByScalarPure + //uint64MinusByScalarSels = uint64MinusByScalarSelsPure + float32Minus = float32MinusPure + //float32MinusSels = float32MinusSelsPure + 
float32MinusScalar = float32MinusScalarPure + //float32MinusScalarSels = float32MinusScalarSelsPure + float32MinusByScalar = float32MinusByScalarPure + //float32MinusByScalarSels = float32MinusByScalarSelsPure + float64Minus = float64MinusPure + //float64MinusSels = float64MinusSelsPure + float64MinusScalar = float64MinusScalarPure + //float64MinusScalarSels = float64MinusScalarSelsPure + float64MinusByScalar = float64MinusByScalarPure + //float64MinusByScalarSels = float64MinusByScalarSelsPure + } + int8MinusSels = int8MinusSelsPure + int8MinusScalarSels = int8MinusScalarSelsPure + int8MinusByScalarSels = int8MinusByScalarSelsPure + int16MinusSels = int16MinusSelsPure + int16MinusScalarSels = int16MinusScalarSelsPure + int16MinusByScalarSels = int16MinusByScalarSelsPure + int32MinusSels = int32MinusSelsPure + int32MinusScalarSels = int32MinusScalarSelsPure + int32MinusByScalarSels = int32MinusByScalarSelsPure + int64MinusSels = int64MinusSelsPure + int64MinusScalarSels = int64MinusScalarSelsPure + int64MinusByScalarSels = int64MinusByScalarSelsPure + uint8MinusSels = uint8MinusSelsPure + uint8MinusScalarSels = uint8MinusScalarSelsPure + uint8MinusByScalarSels = uint8MinusByScalarSelsPure + uint16MinusSels = uint16MinusSelsPure + uint16MinusScalarSels = uint16MinusScalarSelsPure + uint16MinusByScalarSels = uint16MinusByScalarSelsPure + uint32MinusSels = uint32MinusSelsPure + uint32MinusScalarSels = uint32MinusScalarSelsPure + uint32MinusByScalarSels = uint32MinusByScalarSelsPure + uint64MinusSels = uint64MinusSelsPure + uint64MinusScalarSels = uint64MinusScalarSelsPure + uint64MinusByScalarSels = uint64MinusByScalarSelsPure + float32MinusSels = float32MinusSelsPure + float32MinusScalarSels = float32MinusScalarSelsPure + float32MinusByScalarSels = float32MinusByScalarSelsPure + float64MinusSels = float64MinusSelsPure + float64MinusScalarSels = float64MinusScalarSelsPure + float64MinusByScalarSels = float64MinusByScalarSelsPure +} + +func Int8Minus(xs, ys, rs []int8) []int8 { + return int8Minus(xs, ys, rs) +} + +func int8MinusPure(xs, ys, rs []int8) []int8 { + for i, x := range xs { + rs[i] = x - ys[i] + } + return rs +} + +func int8MinusAvx2(xs, ys, rs []int8) []int8 { + const regItems = 32 / 1 + n := len(xs) / regItems + int8MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = xs[i] - ys[i] + } + return rs +} + +func int8MinusAvx512(xs, ys, rs []int8) []int8 { + const regItems = 64 / 1 + n := len(xs) / regItems + int8MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = xs[i] - ys[i] + } + return rs +} + +func Int8MinusSels(xs, ys, rs []int8, sels []int64) []int8 { + return int8MinusSels(xs, ys, rs, sels) +} + +func int8MinusSelsPure(xs, ys, rs []int8, sels []int64) []int8 { + for i, sel := range sels { + rs[i] = xs[sel] - ys[sel] + } + return rs +} + +//func int8MinusSelsAvx2(xs, ys, rs []int8, sels []int64) []int8 { +// const regItems = 32 / 1 +// n := len(sels) / regItems +// int8MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = xs[sels[i]] - ys[sels[i]] +// } +// return rs +//} + +//func int8MinusSelsAvx512(xs, ys, rs []int8, sels []int64) []int8 { +// const regItems = 64 / 1 +// n := len(sels) / regItems +// int8MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = xs[sels[i]] - ys[sels[i]] +// } +// return rs +//} + 
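+// Illustrative usage sketch (editorial addition, not part of the generated kernels;
+// the function name exampleInt8MinusUsage and the literal values are hypothetical).
+// The exported wrappers assume the caller supplies a result slice with the same
+// length as the inputs; whichever implementation init() selected (Pure, AVX2 or
+// AVX512) fills rs element-wise, and the vectorized variants only hand the
+// register-aligned prefix to assembly before finishing the remainder in plain Go.
+func exampleInt8MinusUsage() []int8 {
+	xs := []int8{10, 20, 30, 40, 50}
+	ys := []int8{1, 2, 3, 4, 5}
+	rs := make([]int8, len(xs))
+	// rs becomes {9, 18, 27, 36, 45}; with only 5 elements the AVX paths pass
+	// empty prefixes to assembly and the Go tail loop does all of the work.
+	return Int8Minus(xs, ys, rs)
+}
+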
+func Int8MinusScalar(x int8, ys, rs []int8) []int8 {
+	return int8MinusScalar(x, ys, rs)
+}
+
+func int8MinusScalarPure(x int8, ys, rs []int8) []int8 {
+	for i, y := range ys {
+		rs[i] = x - y
+	}
+	return rs
+}
+
+func int8MinusScalarAvx2(x int8, ys, rs []int8) []int8 {
+	const regItems = 32 / 1
+	n := len(ys) / regItems
+	int8MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	for i, j := n * regItems, len(ys); i < j; i++ {
+		rs[i] = x - ys[i]
+	}
+	return rs
+}
+
+func int8MinusScalarAvx512(x int8, ys, rs []int8) []int8 {
+	const regItems = 64 / 1
+	n := len(ys) / regItems
+	int8MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	for i, j := n * regItems, len(ys); i < j; i++ {
+		rs[i] = x - ys[i]
+	}
+	return rs
+}
+
+func Int8MinusScalarSels(x int8, ys, rs []int8, sels []int64) []int8 {
+	return int8MinusScalarSels(x, ys, rs, sels)
+}
+
+func int8MinusScalarSelsPure(x int8, ys, rs []int8, sels []int64) []int8 {
+	for i, sel := range sels {
+		rs[i] = x - ys[sel]
+	}
+	return rs
+}
+
+//func int8MinusScalarSelsAvx2(x int8, ys, rs []int8, sels []int64) []int8 {
+//	const regItems = 32 / 1
+//	n := len(sels) / regItems
+//	int8MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	for i, j := n * regItems, len(sels); i < j; i++ {
+//		rs[i] = x - ys[sels[i]]
+//	}
+//	return rs
+//}
+
+//func int8MinusScalarSelsAvx512(x int8, ys, rs []int8, sels []int64) []int8 {
+//	const regItems = 64 / 1
+//	n := len(sels) / regItems
+//	int8MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	for i, j := n * regItems, len(sels); i < j; i++ {
+//		rs[i] = x - ys[sels[i]]
+//	}
+//	return rs
+//}
+
+func Int8MinusByScalar(x int8, ys, rs []int8) []int8 {
+	return int8MinusByScalar(x, ys, rs)
+}
+
+func int8MinusByScalarPure(x int8, ys, rs []int8) []int8 {
+	for i, y := range ys {
+		rs[i] = y - x
+	}
+	return rs
+}
+
+func int8MinusByScalarAvx2(x int8, ys, rs []int8) []int8 {
+	const regItems = 32 / 1
+	n := len(ys) / regItems
+	int8MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	for i, j := n * regItems, len(ys); i < j; i++ {
+		rs[i] = ys[i] - x
+	}
+	return rs
+}
+
+func int8MinusByScalarAvx512(x int8, ys, rs []int8) []int8 {
+	const regItems = 64 / 1
+	n := len(ys) / regItems
+	int8MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	for i, j := n * regItems, len(ys); i < j; i++ {
+		rs[i] = ys[i] - x
+	}
+	return rs
+}
+
+func Int8MinusByScalarSels(x int8, ys, rs []int8, sels []int64) []int8 {
+	return int8MinusByScalarSels(x, ys, rs, sels)
+}
+
+func int8MinusByScalarSelsPure(x int8, ys, rs []int8, sels []int64) []int8 {
+	for i, sel := range sels {
+		rs[i] = ys[sel] - x
+	}
+	return rs
+}
+
+//func int8MinusByScalarSelsAvx2(x int8, ys, rs []int8, sels []int64) []int8 {
+//	const regItems = 32 / 1
+//	n := len(sels) / regItems
+//	int8MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	for i, j := n * regItems, len(sels); i < j; i++ {
+//		rs[i] = ys[sels[i]] - x
+//	}
+//	return rs
+//}
+
+//func int8MinusByScalarSelsAvx512(x int8, ys, rs []int8, sels []int64) []int8 {
+//	const regItems = 64 / 1
+//	n := len(sels) / regItems
+//	int8MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	for i, j := n * regItems, len(sels); i < j; i++ {
+//		rs[i] = ys[sels[i]] - x
+//	}
+//	return rs
+//}
+
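+// Illustrative sketch (editorial addition, not generated code; the function name
+// exampleInt8ScalarDirections and the literal values are hypothetical). Per the
+// Pure implementations above, MinusScalar computes x - ys[i] while MinusByScalar
+// computes ys[i] - x. With inputs shorter than one vector register the AVX
+// wrappers fall through to the same Go tail loops, so this tiny example exercises
+// exactly those semantics.
+func exampleInt8ScalarDirections() (fromScalar, byScalar []int8) {
+	ys := []int8{1, 2, 3}
+	fromScalar = Int8MinusScalar(10, ys, make([]int8, len(ys)))  // {9, 8, 7}
+	byScalar = Int8MinusByScalar(10, ys, make([]int8, len(ys))) // {-9, -8, -7}
+	return fromScalar, byScalar
+}
+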
+func Int16Minus(xs, ys, rs []int16) []int16 {
+	return int16Minus(xs, ys, rs)
+}
+
+func int16MinusPure(xs, ys, rs []int16) []int16 {
+	for i, x := range xs {
+		rs[i] = x - ys[i]
+	}
+	return rs
+}
+
+func int16MinusAvx2(xs, ys, rs []int16) []int16 {
+	const regItems = 32 / 2
+	n := len(xs) / regItems
+	int16MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	for i, j := n * regItems, len(xs); i < j; i++ {
+		rs[i] = xs[i] - ys[i]
+	}
+	return rs
+}
+
+func int16MinusAvx512(xs, ys, rs []int16) []int16 {
+	const regItems = 64 / 2
+	n := len(xs) / regItems
+	int16MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	for i, j := n * regItems, len(xs); i < j; i++ {
+		rs[i] = xs[i] - ys[i]
+	}
+	return rs
+}
+
+func Int16MinusSels(xs, ys, rs []int16, sels []int64) []int16 {
+	return int16MinusSels(xs, ys, rs, sels)
+}
+
+func int16MinusSelsPure(xs, ys, rs []int16, sels []int64) []int16 {
+	for i, sel := range sels {
+		rs[i] = xs[sel] - ys[sel]
+	}
+	return rs
+}
+
+//func int16MinusSelsAvx2(xs, ys, rs []int16, sels []int64) []int16 {
+//	const regItems = 32 / 2
+//	n := len(sels) / regItems
+//	int16MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
+//	for i, j := n * regItems, len(sels); i < j; i++ {
+//		rs[i] = xs[sels[i]] - ys[sels[i]]
+//	}
+//	return rs
+//}
+
+//func int16MinusSelsAvx512(xs, ys, rs []int16, sels []int64) []int16 {
+//	const regItems = 64 / 2
+//	n := len(sels) / regItems
+//	int16MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
+//	for i, j := n * regItems, len(sels); i < j; i++ {
+//		rs[i] = xs[sels[i]] - ys[sels[i]]
+//	}
+//	return rs
+//}
+
+func Int16MinusScalar(x int16, ys, rs []int16) []int16 {
+	return int16MinusScalar(x, ys, rs)
+}
+
+func int16MinusScalarPure(x int16, ys, rs []int16) []int16 {
+	for i, y := range ys {
+		rs[i] = x - y
+	}
+	return rs
+}
+
+func int16MinusScalarAvx2(x int16, ys, rs []int16) []int16 {
+	const regItems = 32 / 2
+	n := len(ys) / regItems
+	int16MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	for i, j := n * regItems, len(ys); i < j; i++ {
+		rs[i] = x - ys[i]
+	}
+	return rs
+}
+
+func int16MinusScalarAvx512(x int16, ys, rs []int16) []int16 {
+	const regItems = 64 / 2
+	n := len(ys) / regItems
+	int16MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	for i, j := n * regItems, len(ys); i < j; i++ {
+		rs[i] = x - ys[i]
+	}
+	return rs
+}
+
+func Int16MinusScalarSels(x int16, ys, rs []int16, sels []int64) []int16 {
+	return int16MinusScalarSels(x, ys, rs, sels)
+}
+
+func int16MinusScalarSelsPure(x int16, ys, rs []int16, sels []int64) []int16 {
+	for i, sel := range sels {
+		rs[i] = x - ys[sel]
+	}
+	return rs
+}
+
+//func int16MinusScalarSelsAvx2(x int16, ys, rs []int16, sels []int64) []int16 {
+//	const regItems = 32 / 2
+//	n := len(sels) / regItems
+//	int16MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	for i, j := n * regItems, len(sels); i < j; i++ {
+//		rs[i] = x - ys[sels[i]]
+//	}
+//	return rs
+//}
+
+//func int16MinusScalarSelsAvx512(x int16, ys, rs []int16, sels []int64) []int16 {
+//	const regItems = 64 / 2
+//	n := len(sels) / regItems
+//	int16MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	for i, j := n * regItems, len(sels); i < j; i++ {
+//		rs[i] = x - ys[sels[i]]
+//	}
+//	return rs
+//}
+
+func Int16MinusByScalar(x int16, ys, rs []int16) []int16 {
+	return int16MinusByScalar(x, ys, rs)
+}
+
+func int16MinusByScalarPure(x int16, ys, rs []int16) []int16 {
+	for i, y := range ys {
+		rs[i] = y - x
+	}
+	return rs
+}
+
+func int16MinusByScalarAvx2(x int16, ys, rs []int16) []int16 {
+	const regItems = 32 / 2
+	n := len(ys) / regItems
+	int16MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	for i, j := n * regItems, len(ys); i < j; i++ {
+		rs[i] = ys[i] - x
+	}
+	return rs
+}
+
+func int16MinusByScalarAvx512(x int16, ys, rs []int16) []int16 {
+	const regItems = 64 / 2
+	n := len(ys) / regItems
+	int16MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	for i, j := n * regItems, len(ys); i < j; i++ {
+		rs[i] = ys[i] - x
+	}
+	return rs
+}
+
+func Int16MinusByScalarSels(x int16, ys, rs []int16, sels []int64) []int16 {
+	return int16MinusByScalarSels(x, ys, rs, sels)
+}
+
+func int16MinusByScalarSelsPure(x int16, ys, rs []int16, sels []int64) []int16 {
+	for i, sel := range sels {
+		rs[i] = ys[sel] - x
+	}
+	return rs
+}
+
+//func int16MinusByScalarSelsAvx2(x int16, ys, rs []int16, sels []int64) []int16 {
+//	const regItems = 32 / 2
+//	n := len(sels) / regItems
+//	int16MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	for i, j := n * regItems, len(sels); i < j; i++ {
+//		rs[i] = ys[sels[i]] - x
+//	}
+//	return rs
+//}
+
+//func int16MinusByScalarSelsAvx512(x int16, ys, rs []int16, sels []int64) []int16 {
+//	const regItems = 64 / 2
+//	n := len(sels) / regItems
+//	int16MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	for i, j := n * regItems, len(sels); i < j; i++ {
+//		rs[i] = ys[sels[i]] - x
+//	}
+//	return rs
+//}
+
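+// Worked example (editorial addition, illustrative only; exampleInt16PrefixSplit
+// and the length 100 are hypothetical). regItems is the number of elements per
+// vector register, i.e. register width in bytes divided by element size: an AVX2
+// YMM register holds 32/2 = 16 int16 values and an AVX512 ZMM register 64/2 = 32.
+// For len(xs) == 100 the AVX512 wrapper above hands the first (100/32)*32 = 96
+// elements to assembly and the Go loop finishes elements 96..99.
+func exampleInt16PrefixSplit(totalLen int) (asmPrefix, goTail int) {
+	const regItems = 64 / 2 // assumed AVX512 path, mirroring int16MinusAvx512
+	asmPrefix = (totalLen / regItems) * regItems
+	goTail = totalLen - asmPrefix
+	return asmPrefix, goTail
+}
+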
int16MinusByScalarAvx512(x int16, ys, rs []int16) []int16 { + const regItems = 64 / 2 + n := len(ys) / regItems + int16MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = ys[i] - x + } + return rs +} + +func Int16MinusByScalarSels(x int16, ys, rs []int16, sels []int64) []int16 { + return int16MinusByScalarSels(x, ys, rs, sels) +} + +func int16MinusByScalarSelsPure(x int16, ys, rs []int16, sels []int64) []int16 { + for i, sel := range sels { + rs[i] = ys[sel] - x + } + return rs +} + +//func int16MinusByScalarSelsAvx2(x int16, ys, rs []int16, sels []int64) []int16 { +// const regItems = 32 / 2 +// n := len(sels) / regItems +// int16MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = ys[sels[i]] - x +// } +// return rs +//} + +//func int16MinusByScalarSelsAvx512(x int16, ys, rs []int16, sels []int64) []int16 { +// const regItems = 64 / 2 +// n := len(sels) / regItems +// int16MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = ys[sels[i]] - x +// } +// return rs +//} + +func Int32Minus(xs, ys, rs []int32) []int32 { + return int32Minus(xs, ys, rs) +} + +func int32MinusPure(xs, ys, rs []int32) []int32 { + for i, x := range xs { + rs[i] = x - ys[i] + } + return rs +} + +func int32MinusAvx2(xs, ys, rs []int32) []int32 { + const regItems = 32 / 4 + n := len(xs) / regItems + int32MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = xs[i] - ys[i] + } + return rs +} + +func int32MinusAvx512(xs, ys, rs []int32) []int32 { + const regItems = 64 / 4 + n := len(xs) / regItems + int32MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = xs[i] - ys[i] + } + return rs +} + +func Int32MinusSels(xs, ys, rs []int32, sels []int64) []int32 { + return int32MinusSels(xs, ys, rs, sels) +} + +func int32MinusSelsPure(xs, ys, rs []int32, sels []int64) []int32 { + for i, sel := range sels { + rs[i] = xs[sel] - ys[sel] + } + return rs +} + +//func int32MinusSelsAvx2(xs, ys, rs []int32, sels []int64) []int32 { +// const regItems = 32 / 4 +// n := len(sels) / regItems +// int32MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = xs[sels[i]] - ys[sels[i]] +// } +// return rs +//} + +//func int32MinusSelsAvx512(xs, ys, rs []int32, sels []int64) []int32 { +// const regItems = 64 / 4 +// n := len(sels) / regItems +// int32MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = xs[sels[i]] - ys[sels[i]] +// } +// return rs +//} + +func Int32MinusScalar(x int32, ys, rs []int32) []int32 { + return int32MinusScalar(x, ys, rs) +} + +func int32MinusScalarPure(x int32, ys, rs []int32) []int32 { + for i, y := range ys { + rs[i] = x - y + } + return rs +} + +func int32MinusScalarAvx2(x int32, ys, rs []int32) []int32 { + const regItems = 32 / 4 + n := len(ys) / regItems + int32MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = x - ys[i] + } + return rs +} + +func int32MinusScalarAvx512(x int32, ys, rs []int32) []int32 { + const regItems = 64 / 4 + n := len(ys) / regItems + int32MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = x - ys[i] + } + 
return rs +} + +func Int32MinusScalarSels(x int32, ys, rs []int32, sels []int64) []int32 { + return int32MinusScalarSels(x, ys, rs, sels) +} + +func int32MinusScalarSelsPure(x int32, ys, rs []int32, sels []int64) []int32 { + for i, sel := range sels { + rs[i] = x - ys[sel] + } + return rs +} + +//func int32MinusScalarSelsAvx2(x int32, ys, rs []int32, sels []int64) []int32 { +// const regItems = 32 / 4 +// n := len(sels) / regItems +// int32MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = x - ys[sels[i]] +// } +// return rs +//} + +//func int32MinusScalarSelsAvx512(x int32, ys, rs []int32, sels []int64) []int32 { +// const regItems = 64 / 4 +// n := len(sels) / regItems +// int32MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = x - ys[sels[i]] +// } +// return rs +//} + +func Int32MinusByScalar(x int32, ys, rs []int32) []int32 { + return int32MinusByScalar(x, ys, rs) +} + +func int32MinusByScalarPure(x int32, ys, rs []int32) []int32 { + for i, y := range ys { + rs[i] = y - x + } + return rs +} + +func int32MinusByScalarAvx2(x int32, ys, rs []int32) []int32 { + const regItems = 32 / 4 + n := len(ys) / regItems + int32MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = ys[i] - x + } + return rs +} + +func int32MinusByScalarAvx512(x int32, ys, rs []int32) []int32 { + const regItems = 64 / 4 + n := len(ys) / regItems + int32MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = ys[i] - x + } + return rs +} + +func Int32MinusByScalarSels(x int32, ys, rs []int32, sels []int64) []int32 { + return int32MinusByScalarSels(x, ys, rs, sels) +} + +func int32MinusByScalarSelsPure(x int32, ys, rs []int32, sels []int64) []int32 { + for i, sel := range sels { + rs[i] = ys[sel] - x + } + return rs +} + +//func int32MinusByScalarSelsAvx2(x int32, ys, rs []int32, sels []int64) []int32 { +// const regItems = 32 / 4 +// n := len(sels) / regItems +// int32MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = ys[sels[i]] - x +// } +// return rs +//} + +//func int32MinusByScalarSelsAvx512(x int32, ys, rs []int32, sels []int64) []int32 { +// const regItems = 64 / 4 +// n := len(sels) / regItems +// int32MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = ys[sels[i]] - x +// } +// return rs +//} + +func Int64Minus(xs, ys, rs []int64) []int64 { + return int64Minus(xs, ys, rs) +} + +func int64MinusPure(xs, ys, rs []int64) []int64 { + for i, x := range xs { + rs[i] = x - ys[i] + } + return rs +} + +func int64MinusAvx2(xs, ys, rs []int64) []int64 { + const regItems = 32 / 8 + n := len(xs) / regItems + int64MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = xs[i] - ys[i] + } + return rs +} + +func int64MinusAvx512(xs, ys, rs []int64) []int64 { + const regItems = 64 / 8 + n := len(xs) / regItems + int64MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = xs[i] - ys[i] + } + return rs +} + +func Int64MinusSels(xs, ys, rs []int64, sels []int64) []int64 { + return int64MinusSels(xs, ys, rs, sels) +} + +func int64MinusSelsPure(xs, ys, rs []int64, sels []int64) []int64 { + for i, 
sel := range sels { + rs[i] = xs[sel] - ys[sel] + } + return rs +} + +//func int64MinusSelsAvx2(xs, ys, rs []int64, sels []int64) []int64 { +// const regItems = 32 / 8 +// n := len(sels) / regItems +// int64MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = xs[sels[i]] - ys[sels[i]] +// } +// return rs +//} + +//func int64MinusSelsAvx512(xs, ys, rs []int64, sels []int64) []int64 { +// const regItems = 64 / 8 +// n := len(sels) / regItems +// int64MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = xs[sels[i]] - ys[sels[i]] +// } +// return rs +//} + +func Int64MinusScalar(x int64, ys, rs []int64) []int64 { + return int64MinusScalar(x, ys, rs) +} + +func int64MinusScalarPure(x int64, ys, rs []int64) []int64 { + for i, y := range ys { + rs[i] = x - y + } + return rs +} + +func int64MinusScalarAvx2(x int64, ys, rs []int64) []int64 { + const regItems = 32 / 8 + n := len(ys) / regItems + int64MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = x - ys[i] + } + return rs +} + +func int64MinusScalarAvx512(x int64, ys, rs []int64) []int64 { + const regItems = 64 / 8 + n := len(ys) / regItems + int64MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = x - ys[i] + } + return rs +} + +func Int64MinusScalarSels(x int64, ys, rs []int64, sels []int64) []int64 { + return int64MinusScalarSels(x, ys, rs, sels) +} + +func int64MinusScalarSelsPure(x int64, ys, rs []int64, sels []int64) []int64 { + for i, sel := range sels { + rs[i] = x - ys[sel] + } + return rs +} + +//func int64MinusScalarSelsAvx2(x int64, ys, rs []int64, sels []int64) []int64 { +// const regItems = 32 / 8 +// n := len(sels) / regItems +// int64MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = x - ys[sels[i]] +// } +// return rs +//} + +//func int64MinusScalarSelsAvx512(x int64, ys, rs []int64, sels []int64) []int64 { +// const regItems = 64 / 8 +// n := len(sels) / regItems +// int64MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = x - ys[sels[i]] +// } +// return rs +//} + +func Int64MinusByScalar(x int64, ys, rs []int64) []int64 { + return int64MinusByScalar(x, ys, rs) +} + +func int64MinusByScalarPure(x int64, ys, rs []int64) []int64 { + for i, y := range ys { + rs[i] = y - x + } + return rs +} + +func int64MinusByScalarAvx2(x int64, ys, rs []int64) []int64 { + const regItems = 32 / 8 + n := len(ys) / regItems + int64MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = ys[i] - x + } + return rs +} + +func int64MinusByScalarAvx512(x int64, ys, rs []int64) []int64 { + const regItems = 64 / 8 + n := len(ys) / regItems + int64MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = ys[i] - x + } + return rs +} + +func Int64MinusByScalarSels(x int64, ys, rs []int64, sels []int64) []int64 { + return int64MinusByScalarSels(x, ys, rs, sels) +} + +func int64MinusByScalarSelsPure(x int64, ys, rs []int64, sels []int64) []int64 { + for i, sel := range sels { + rs[i] = ys[sel] - x + } + return rs +} + +//func int64MinusByScalarSelsAvx2(x int64, ys, rs []int64, sels []int64) []int64 { +// const regItems = 32 / 8 +// n := len(sels) / 
regItems +// int64MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = ys[sels[i]] - x +// } +// return rs +//} + +//func int64MinusByScalarSelsAvx512(x int64, ys, rs []int64, sels []int64) []int64 { +// const regItems = 64 / 8 +// n := len(sels) / regItems +// int64MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = ys[sels[i]] - x +// } +// return rs +//} + +func Uint8Minus(xs, ys, rs []uint8) []uint8 { + return uint8Minus(xs, ys, rs) +} + +func uint8MinusPure(xs, ys, rs []uint8) []uint8 { + for i, x := range xs { + rs[i] = x - ys[i] + } + return rs +} + +func uint8MinusAvx2(xs, ys, rs []uint8) []uint8 { + const regItems = 32 / 1 + n := len(xs) / regItems + uint8MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = xs[i] - ys[i] + } + return rs +} + +func uint8MinusAvx512(xs, ys, rs []uint8) []uint8 { + const regItems = 64 / 1 + n := len(xs) / regItems + uint8MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = xs[i] - ys[i] + } + return rs +} + +func Uint8MinusSels(xs, ys, rs []uint8, sels []int64) []uint8 { + return uint8MinusSels(xs, ys, rs, sels) +} + +func uint8MinusSelsPure(xs, ys, rs []uint8, sels []int64) []uint8 { + for i, sel := range sels { + rs[i] = xs[sel] - ys[sel] + } + return rs +} + +//func uint8MinusSelsAvx2(xs, ys, rs []uint8, sels []int64) []uint8 { +// const regItems = 32 / 1 +// n := len(sels) / regItems +// uint8MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = xs[sels[i]] - ys[sels[i]] +// } +// return rs +//} + +//func uint8MinusSelsAvx512(xs, ys, rs []uint8, sels []int64) []uint8 { +// const regItems = 64 / 1 +// n := len(sels) / regItems +// uint8MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = xs[sels[i]] - ys[sels[i]] +// } +// return rs +//} + +func Uint8MinusScalar(x uint8, ys, rs []uint8) []uint8 { + return uint8MinusScalar(x, ys, rs) +} + +func uint8MinusScalarPure(x uint8, ys, rs []uint8) []uint8 { + for i, y := range ys { + rs[i] = x - y + } + return rs +} + +func uint8MinusScalarAvx2(x uint8, ys, rs []uint8) []uint8 { + const regItems = 32 / 1 + n := len(ys) / regItems + uint8MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = x - ys[i] + } + return rs +} + +func uint8MinusScalarAvx512(x uint8, ys, rs []uint8) []uint8 { + const regItems = 64 / 1 + n := len(ys) / regItems + uint8MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = x - ys[i] + } + return rs +} + +func Uint8MinusScalarSels(x uint8, ys, rs []uint8, sels []int64) []uint8 { + return uint8MinusScalarSels(x, ys, rs, sels) +} + +func uint8MinusScalarSelsPure(x uint8, ys, rs []uint8, sels []int64) []uint8 { + for i, sel := range sels { + rs[i] = x - ys[sel] + } + return rs +} + +//func uint8MinusScalarSelsAvx2(x uint8, ys, rs []uint8, sels []int64) []uint8 { +// const regItems = 32 / 1 +// n := len(sels) / regItems +// uint8MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = x - ys[sels[i]] +// } +// return rs +//} + +//func uint8MinusScalarSelsAvx512(x uint8, ys, rs []uint8, sels []int64) []uint8 
{ +// const regItems = 64 / 1 +// n := len(sels) / regItems +// uint8MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = x - ys[sels[i]] +// } +// return rs +//} + +func Uint8MinusByScalar(x uint8, ys, rs []uint8) []uint8 { + return uint8MinusByScalar(x, ys, rs) +} + +func uint8MinusByScalarPure(x uint8, ys, rs []uint8) []uint8 { + for i, y := range ys { + rs[i] = y - x + } + return rs +} + +func uint8MinusByScalarAvx2(x uint8, ys, rs []uint8) []uint8 { + const regItems = 32 / 1 + n := len(ys) / regItems + uint8MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = ys[i] - x + } + return rs +} + +func uint8MinusByScalarAvx512(x uint8, ys, rs []uint8) []uint8 { + const regItems = 64 / 1 + n := len(ys) / regItems + uint8MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = ys[i] - x + } + return rs +} + +func Uint8MinusByScalarSels(x uint8, ys, rs []uint8, sels []int64) []uint8 { + return uint8MinusByScalarSels(x, ys, rs, sels) +} + +func uint8MinusByScalarSelsPure(x uint8, ys, rs []uint8, sels []int64) []uint8 { + for i, sel := range sels { + rs[i] = ys[sel] - x + } + return rs +} + +//func uint8MinusByScalarSelsAvx2(x uint8, ys, rs []uint8, sels []int64) []uint8 { +// const regItems = 32 / 1 +// n := len(sels) / regItems +// uint8MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = ys[sels[i]] - x +// } +// return rs +//} + +//func uint8MinusByScalarSelsAvx512(x uint8, ys, rs []uint8, sels []int64) []uint8 { +// const regItems = 64 / 1 +// n := len(sels) / regItems +// uint8MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = ys[sels[i]] - x +// } +// return rs +//} + +func Uint16Minus(xs, ys, rs []uint16) []uint16 { + return uint16Minus(xs, ys, rs) +} + +func uint16MinusPure(xs, ys, rs []uint16) []uint16 { + for i, x := range xs { + rs[i] = x - ys[i] + } + return rs +} + +func uint16MinusAvx2(xs, ys, rs []uint16) []uint16 { + const regItems = 32 / 2 + n := len(xs) / regItems + uint16MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = xs[i] - ys[i] + } + return rs +} + +func uint16MinusAvx512(xs, ys, rs []uint16) []uint16 { + const regItems = 64 / 2 + n := len(xs) / regItems + uint16MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = xs[i] - ys[i] + } + return rs +} + +func Uint16MinusSels(xs, ys, rs []uint16, sels []int64) []uint16 { + return uint16MinusSels(xs, ys, rs, sels) +} + +func uint16MinusSelsPure(xs, ys, rs []uint16, sels []int64) []uint16 { + for i, sel := range sels { + rs[i] = xs[sel] - ys[sel] + } + return rs +} + +//func uint16MinusSelsAvx2(xs, ys, rs []uint16, sels []int64) []uint16 { +// const regItems = 32 / 2 +// n := len(sels) / regItems +// uint16MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = xs[sels[i]] - ys[sels[i]] +// } +// return rs +//} + +//func uint16MinusSelsAvx512(xs, ys, rs []uint16, sels []int64) []uint16 { +// const regItems = 64 / 2 +// n := len(sels) / regItems +// uint16MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = xs[sels[i]] - ys[sels[i]] +// 
} +// return rs +//} + +func Uint16MinusScalar(x uint16, ys, rs []uint16) []uint16 { + return uint16MinusScalar(x, ys, rs) +} + +func uint16MinusScalarPure(x uint16, ys, rs []uint16) []uint16 { + for i, y := range ys { + rs[i] = x - y + } + return rs +} + +func uint16MinusScalarAvx2(x uint16, ys, rs []uint16) []uint16 { + const regItems = 32 / 2 + n := len(ys) / regItems + uint16MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = x - ys[i] + } + return rs +} + +func uint16MinusScalarAvx512(x uint16, ys, rs []uint16) []uint16 { + const regItems = 64 / 2 + n := len(ys) / regItems + uint16MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = x - ys[i] + } + return rs +} + +func Uint16MinusScalarSels(x uint16, ys, rs []uint16, sels []int64) []uint16 { + return uint16MinusScalarSels(x, ys, rs, sels) +} + +func uint16MinusScalarSelsPure(x uint16, ys, rs []uint16, sels []int64) []uint16 { + for i, sel := range sels { + rs[i] = x - ys[sel] + } + return rs +} + +//func uint16MinusScalarSelsAvx2(x uint16, ys, rs []uint16, sels []int64) []uint16 { +// const regItems = 32 / 2 +// n := len(sels) / regItems +// uint16MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = x - ys[sels[i]] +// } +// return rs +//} + +//func uint16MinusScalarSelsAvx512(x uint16, ys, rs []uint16, sels []int64) []uint16 { +// const regItems = 64 / 2 +// n := len(sels) / regItems +// uint16MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = x - ys[sels[i]] +// } +// return rs +//} + +func Uint16MinusByScalar(x uint16, ys, rs []uint16) []uint16 { + return uint16MinusByScalar(x, ys, rs) +} + +func uint16MinusByScalarPure(x uint16, ys, rs []uint16) []uint16 { + for i, y := range ys { + rs[i] = y - x + } + return rs +} + +func uint16MinusByScalarAvx2(x uint16, ys, rs []uint16) []uint16 { + const regItems = 32 / 2 + n := len(ys) / regItems + uint16MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = ys[i] - x + } + return rs +} + +func uint16MinusByScalarAvx512(x uint16, ys, rs []uint16) []uint16 { + const regItems = 64 / 2 + n := len(ys) / regItems + uint16MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = ys[i] - x + } + return rs +} + +func Uint16MinusByScalarSels(x uint16, ys, rs []uint16, sels []int64) []uint16 { + return uint16MinusByScalarSels(x, ys, rs, sels) +} + +func uint16MinusByScalarSelsPure(x uint16, ys, rs []uint16, sels []int64) []uint16 { + for i, sel := range sels { + rs[i] = ys[sel] - x + } + return rs +} + +//func uint16MinusByScalarSelsAvx2(x uint16, ys, rs []uint16, sels []int64) []uint16 { +// const regItems = 32 / 2 +// n := len(sels) / regItems +// uint16MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = ys[sels[i]] - x +// } +// return rs +//} + +//func uint16MinusByScalarSelsAvx512(x uint16, ys, rs []uint16, sels []int64) []uint16 { +// const regItems = 64 / 2 +// n := len(sels) / regItems +// uint16MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = ys[sels[i]] - x +// } +// return rs +//} + +func Uint32Minus(xs, ys, rs []uint32) []uint32 { + return uint32Minus(xs, ys, rs) +} + 
+func uint32MinusPure(xs, ys, rs []uint32) []uint32 { + for i, x := range xs { + rs[i] = x - ys[i] + } + return rs +} + +func uint32MinusAvx2(xs, ys, rs []uint32) []uint32 { + const regItems = 32 / 4 + n := len(xs) / regItems + uint32MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = xs[i] - ys[i] + } + return rs +} + +func uint32MinusAvx512(xs, ys, rs []uint32) []uint32 { + const regItems = 64 / 4 + n := len(xs) / regItems + uint32MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = xs[i] - ys[i] + } + return rs +} + +func Uint32MinusSels(xs, ys, rs []uint32, sels []int64) []uint32 { + return uint32MinusSels(xs, ys, rs, sels) +} + +func uint32MinusSelsPure(xs, ys, rs []uint32, sels []int64) []uint32 { + for i, sel := range sels { + rs[i] = xs[sel] - ys[sel] + } + return rs +} + +//func uint32MinusSelsAvx2(xs, ys, rs []uint32, sels []int64) []uint32 { +// const regItems = 32 / 4 +// n := len(sels) / regItems +// uint32MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = xs[sels[i]] - ys[sels[i]] +// } +// return rs +//} + +//func uint32MinusSelsAvx512(xs, ys, rs []uint32, sels []int64) []uint32 { +// const regItems = 64 / 4 +// n := len(sels) / regItems +// uint32MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = xs[sels[i]] - ys[sels[i]] +// } +// return rs +//} + +func Uint32MinusScalar(x uint32, ys, rs []uint32) []uint32 { + return uint32MinusScalar(x, ys, rs) +} + +func uint32MinusScalarPure(x uint32, ys, rs []uint32) []uint32 { + for i, y := range ys { + rs[i] = x - y + } + return rs +} + +func uint32MinusScalarAvx2(x uint32, ys, rs []uint32) []uint32 { + const regItems = 32 / 4 + n := len(ys) / regItems + uint32MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = x - ys[i] + } + return rs +} + +func uint32MinusScalarAvx512(x uint32, ys, rs []uint32) []uint32 { + const regItems = 64 / 4 + n := len(ys) / regItems + uint32MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = x - ys[i] + } + return rs +} + +func Uint32MinusScalarSels(x uint32, ys, rs []uint32, sels []int64) []uint32 { + return uint32MinusScalarSels(x, ys, rs, sels) +} + +func uint32MinusScalarSelsPure(x uint32, ys, rs []uint32, sels []int64) []uint32 { + for i, sel := range sels { + rs[i] = x - ys[sel] + } + return rs +} + +//func uint32MinusScalarSelsAvx2(x uint32, ys, rs []uint32, sels []int64) []uint32 { +// const regItems = 32 / 4 +// n := len(sels) / regItems +// uint32MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = x - ys[sels[i]] +// } +// return rs +//} + +//func uint32MinusScalarSelsAvx512(x uint32, ys, rs []uint32, sels []int64) []uint32 { +// const regItems = 64 / 4 +// n := len(sels) / regItems +// uint32MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = x - ys[sels[i]] +// } +// return rs +//} + +func Uint32MinusByScalar(x uint32, ys, rs []uint32) []uint32 { + return uint32MinusByScalar(x, ys, rs) +} + +func uint32MinusByScalarPure(x uint32, ys, rs []uint32) []uint32 { + for i, y := range ys { + rs[i] = y - x + } + return rs +} + +func uint32MinusByScalarAvx2(x uint32, ys, rs 
[]uint32) []uint32 { + const regItems = 32 / 4 + n := len(ys) / regItems + uint32MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = ys[i] - x + } + return rs +} + +func uint32MinusByScalarAvx512(x uint32, ys, rs []uint32) []uint32 { + const regItems = 64 / 4 + n := len(ys) / regItems + uint32MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = ys[i] - x + } + return rs +} + +func Uint32MinusByScalarSels(x uint32, ys, rs []uint32, sels []int64) []uint32 { + return uint32MinusByScalarSels(x, ys, rs, sels) +} + +func uint32MinusByScalarSelsPure(x uint32, ys, rs []uint32, sels []int64) []uint32 { + for i, sel := range sels { + rs[i] = ys[sel] - x + } + return rs +} + +//func uint32MinusByScalarSelsAvx2(x uint32, ys, rs []uint32, sels []int64) []uint32 { +// const regItems = 32 / 4 +// n := len(sels) / regItems +// uint32MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = ys[sels[i]] - x +// } +// return rs +//} + +//func uint32MinusByScalarSelsAvx512(x uint32, ys, rs []uint32, sels []int64) []uint32 { +// const regItems = 64 / 4 +// n := len(sels) / regItems +// uint32MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = ys[sels[i]] - x +// } +// return rs +//} + +func Uint64Minus(xs, ys, rs []uint64) []uint64 { + return uint64Minus(xs, ys, rs) +} + +func uint64MinusPure(xs, ys, rs []uint64) []uint64 { + for i, x := range xs { + rs[i] = x - ys[i] + } + return rs +} + +func uint64MinusAvx2(xs, ys, rs []uint64) []uint64 { + const regItems = 32 / 8 + n := len(xs) / regItems + uint64MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = xs[i] - ys[i] + } + return rs +} + +func uint64MinusAvx512(xs, ys, rs []uint64) []uint64 { + const regItems = 64 / 8 + n := len(xs) / regItems + uint64MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = xs[i] - ys[i] + } + return rs +} + +func Uint64MinusSels(xs, ys, rs []uint64, sels []int64) []uint64 { + return uint64MinusSels(xs, ys, rs, sels) +} + +func uint64MinusSelsPure(xs, ys, rs []uint64, sels []int64) []uint64 { + for i, sel := range sels { + rs[i] = xs[sel] - ys[sel] + } + return rs +} + +//func uint64MinusSelsAvx2(xs, ys, rs []uint64, sels []int64) []uint64 { +// const regItems = 32 / 8 +// n := len(sels) / regItems +// uint64MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = xs[sels[i]] - ys[sels[i]] +// } +// return rs +//} + +//func uint64MinusSelsAvx512(xs, ys, rs []uint64, sels []int64) []uint64 { +// const regItems = 64 / 8 +// n := len(sels) / regItems +// uint64MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = xs[sels[i]] - ys[sels[i]] +// } +// return rs +//} + +func Uint64MinusScalar(x uint64, ys, rs []uint64) []uint64 { + return uint64MinusScalar(x, ys, rs) +} + +func uint64MinusScalarPure(x uint64, ys, rs []uint64) []uint64 { + for i, y := range ys { + rs[i] = x - y + } + return rs +} + +func uint64MinusScalarAvx2(x uint64, ys, rs []uint64) []uint64 { + const regItems = 32 / 8 + n := len(ys) / regItems + uint64MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; 
i++ { + rs[i] = x - ys[i] + } + return rs +} + +func uint64MinusScalarAvx512(x uint64, ys, rs []uint64) []uint64 { + const regItems = 64 / 8 + n := len(ys) / regItems + uint64MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = x - ys[i] + } + return rs +} + +func Uint64MinusScalarSels(x uint64, ys, rs []uint64, sels []int64) []uint64 { + return uint64MinusScalarSels(x, ys, rs, sels) +} + +func uint64MinusScalarSelsPure(x uint64, ys, rs []uint64, sels []int64) []uint64 { + for i, sel := range sels { + rs[i] = x - ys[sel] + } + return rs +} + +//func uint64MinusScalarSelsAvx2(x uint64, ys, rs []uint64, sels []int64) []uint64 { +// const regItems = 32 / 8 +// n := len(sels) / regItems +// uint64MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = x - ys[sels[i]] +// } +// return rs +//} + +//func uint64MinusScalarSelsAvx512(x uint64, ys, rs []uint64, sels []int64) []uint64 { +// const regItems = 64 / 8 +// n := len(sels) / regItems +// uint64MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = x - ys[sels[i]] +// } +// return rs +//} + +func Uint64MinusByScalar(x uint64, ys, rs []uint64) []uint64 { + return uint64MinusByScalar(x, ys, rs) +} + +func uint64MinusByScalarPure(x uint64, ys, rs []uint64) []uint64 { + for i, y := range ys { + rs[i] = y - x + } + return rs +} + +func uint64MinusByScalarAvx2(x uint64, ys, rs []uint64) []uint64 { + const regItems = 32 / 8 + n := len(ys) / regItems + uint64MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = ys[i] - x + } + return rs +} + +func uint64MinusByScalarAvx512(x uint64, ys, rs []uint64) []uint64 { + const regItems = 64 / 8 + n := len(ys) / regItems + uint64MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = ys[i] - x + } + return rs +} + +func Uint64MinusByScalarSels(x uint64, ys, rs []uint64, sels []int64) []uint64 { + return uint64MinusByScalarSels(x, ys, rs, sels) +} + +func uint64MinusByScalarSelsPure(x uint64, ys, rs []uint64, sels []int64) []uint64 { + for i, sel := range sels { + rs[i] = ys[sel] - x + } + return rs +} + +//func uint64MinusByScalarSelsAvx2(x uint64, ys, rs []uint64, sels []int64) []uint64 { +// const regItems = 32 / 8 +// n := len(sels) / regItems +// uint64MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = ys[sels[i]] - x +// } +// return rs +//} + +//func uint64MinusByScalarSelsAvx512(x uint64, ys, rs []uint64, sels []int64) []uint64 { +// const regItems = 64 / 8 +// n := len(sels) / regItems +// uint64MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = ys[sels[i]] - x +// } +// return rs +//} + +func Float32Minus(xs, ys, rs []float32) []float32 { + return float32Minus(xs, ys, rs) +} + +func float32MinusPure(xs, ys, rs []float32) []float32 { + for i, x := range xs { + rs[i] = x - ys[i] + } + return rs +} + +func float32MinusAvx2(xs, ys, rs []float32) []float32 { + const regItems = 32 / 4 + n := len(xs) / regItems + float32MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = xs[i] - ys[i] + } + return rs +} + +func float32MinusAvx512(xs, ys, rs []float32) []float32 { + const regItems = 
64 / 4 + n := len(xs) / regItems + float32MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = xs[i] - ys[i] + } + return rs +} + +func Float32MinusSels(xs, ys, rs []float32, sels []int64) []float32 { + return float32MinusSels(xs, ys, rs, sels) +} + +func float32MinusSelsPure(xs, ys, rs []float32, sels []int64) []float32 { + for i, sel := range sels { + rs[i] = xs[sel] - ys[sel] + } + return rs +} + +//func float32MinusSelsAvx2(xs, ys, rs []float32, sels []int64) []float32 { +// const regItems = 32 / 4 +// n := len(sels) / regItems +// float32MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = xs[sels[i]] - ys[sels[i]] +// } +// return rs +//} + +//func float32MinusSelsAvx512(xs, ys, rs []float32, sels []int64) []float32 { +// const regItems = 64 / 4 +// n := len(sels) / regItems +// float32MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = xs[sels[i]] - ys[sels[i]] +// } +// return rs +//} + +func Float32MinusScalar(x float32, ys, rs []float32) []float32 { + return float32MinusScalar(x, ys, rs) +} + +func float32MinusScalarPure(x float32, ys, rs []float32) []float32 { + for i, y := range ys { + rs[i] = x - y + } + return rs +} + +func float32MinusScalarAvx2(x float32, ys, rs []float32) []float32 { + const regItems = 32 / 4 + n := len(ys) / regItems + float32MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = x - ys[i] + } + return rs +} + +func float32MinusScalarAvx512(x float32, ys, rs []float32) []float32 { + const regItems = 64 / 4 + n := len(ys) / regItems + float32MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = x - ys[i] + } + return rs +} + +func Float32MinusScalarSels(x float32, ys, rs []float32, sels []int64) []float32 { + return float32MinusScalarSels(x, ys, rs, sels) +} + +func float32MinusScalarSelsPure(x float32, ys, rs []float32, sels []int64) []float32 { + for i, sel := range sels { + rs[i] = x - ys[sel] + } + return rs +} + +//func float32MinusScalarSelsAvx2(x float32, ys, rs []float32, sels []int64) []float32 { +// const regItems = 32 / 4 +// n := len(sels) / regItems +// float32MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = x - ys[sels[i]] +// } +// return rs +//} + +//func float32MinusScalarSelsAvx512(x float32, ys, rs []float32, sels []int64) []float32 { +// const regItems = 64 / 4 +// n := len(sels) / regItems +// float32MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = x - ys[sels[i]] +// } +// return rs +//} + +func Float32MinusByScalar(x float32, ys, rs []float32) []float32 { + return float32MinusByScalar(x, ys, rs) +} + +func float32MinusByScalarPure(x float32, ys, rs []float32) []float32 { + for i, y := range ys { + rs[i] = y - x + } + return rs +} + +func float32MinusByScalarAvx2(x float32, ys, rs []float32) []float32 { + const regItems = 32 / 4 + n := len(ys) / regItems + float32MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = ys[i] - x + } + return rs +} + +func float32MinusByScalarAvx512(x float32, ys, rs []float32) []float32 { + const regItems = 64 / 4 + n := len(ys) / regItems + float32MinusByScalarAvx512Asm(x, 
ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = ys[i] - x + } + return rs +} + +func Float32MinusByScalarSels(x float32, ys, rs []float32, sels []int64) []float32 { + return float32MinusByScalarSels(x, ys, rs, sels) +} + +func float32MinusByScalarSelsPure(x float32, ys, rs []float32, sels []int64) []float32 { + for i, sel := range sels { + rs[i] = ys[sel] - x + } + return rs +} + +//func float32MinusByScalarSelsAvx2(x float32, ys, rs []float32, sels []int64) []float32 { +// const regItems = 32 / 4 +// n := len(sels) / regItems +// float32MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = ys[sels[i]] - x +// } +// return rs +//} + +//func float32MinusByScalarSelsAvx512(x float32, ys, rs []float32, sels []int64) []float32 { +// const regItems = 64 / 4 +// n := len(sels) / regItems +// float32MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = ys[sels[i]] - x +// } +// return rs +//} + +func Float64Minus(xs, ys, rs []float64) []float64 { + return float64Minus(xs, ys, rs) +} + +func float64MinusPure(xs, ys, rs []float64) []float64 { + for i, x := range xs { + rs[i] = x - ys[i] + } + return rs +} + +func float64MinusAvx2(xs, ys, rs []float64) []float64 { + const regItems = 32 / 8 + n := len(xs) / regItems + float64MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = xs[i] - ys[i] + } + return rs +} + +func float64MinusAvx512(xs, ys, rs []float64) []float64 { + const regItems = 64 / 8 + n := len(xs) / regItems + float64MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = xs[i] - ys[i] + } + return rs +} + +func Float64MinusSels(xs, ys, rs []float64, sels []int64) []float64 { + return float64MinusSels(xs, ys, rs, sels) +} + +func float64MinusSelsPure(xs, ys, rs []float64, sels []int64) []float64 { + for i, sel := range sels { + rs[i] = xs[sel] - ys[sel] + } + return rs +} + +//func float64MinusSelsAvx2(xs, ys, rs []float64, sels []int64) []float64 { +// const regItems = 32 / 8 +// n := len(sels) / regItems +// float64MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = xs[sels[i]] - ys[sels[i]] +// } +// return rs +//} + +//func float64MinusSelsAvx512(xs, ys, rs []float64, sels []int64) []float64 { +// const regItems = 64 / 8 +// n := len(sels) / regItems +// float64MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = xs[sels[i]] - ys[sels[i]] +// } +// return rs +//} + +func Float64MinusScalar(x float64, ys, rs []float64) []float64 { + return float64MinusScalar(x, ys, rs) +} + +func float64MinusScalarPure(x float64, ys, rs []float64) []float64 { + for i, y := range ys { + rs[i] = x - y + } + return rs +} + +func float64MinusScalarAvx2(x float64, ys, rs []float64) []float64 { + const regItems = 32 / 8 + n := len(ys) / regItems + float64MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = x - ys[i] + } + return rs +} + +func float64MinusScalarAvx512(x float64, ys, rs []float64) []float64 { + const regItems = 64 / 8 + n := len(ys) / regItems + float64MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = x - ys[i] + } + return 
rs +} + +func Float64MinusScalarSels(x float64, ys, rs []float64, sels []int64) []float64 { + return float64MinusScalarSels(x, ys, rs, sels) +} + +func float64MinusScalarSelsPure(x float64, ys, rs []float64, sels []int64) []float64 { + for i, sel := range sels { + rs[i] = x - ys[sel] + } + return rs +} + +//func float64MinusScalarSelsAvx2(x float64, ys, rs []float64, sels []int64) []float64 { +// const regItems = 32 / 8 +// n := len(sels) / regItems +// float64MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = x - ys[sels[i]] +// } +// return rs +//} + +//func float64MinusScalarSelsAvx512(x float64, ys, rs []float64, sels []int64) []float64 { +// const regItems = 64 / 8 +// n := len(sels) / regItems +// float64MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = x - ys[sels[i]] +// } +// return rs +//} + +func Float64MinusByScalar(x float64, ys, rs []float64) []float64 { + return float64MinusByScalar(x, ys, rs) +} + +func float64MinusByScalarPure(x float64, ys, rs []float64) []float64 { + for i, y := range ys { + rs[i] = y - x + } + return rs +} + +func float64MinusByScalarAvx2(x float64, ys, rs []float64) []float64 { + const regItems = 32 / 8 + n := len(ys) / regItems + float64MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = ys[i] - x + } + return rs +} + +func float64MinusByScalarAvx512(x float64, ys, rs []float64) []float64 { + const regItems = 64 / 8 + n := len(ys) / regItems + float64MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems]) + for i, j := n * regItems, len(xs); i < j; i++ { + rs[i] = ys[i] - x + } + return rs +} + +func Float64MinusByScalarSels(x float64, ys, rs []float64, sels []int64) []float64 { + return float64MinusByScalarSels(x, ys, rs, sels) +} + +func float64MinusByScalarSelsPure(x float64, ys, rs []float64, sels []int64) []float64 { + for i, sel := range sels { + rs[i] = ys[sel] - x + } + return rs +} + +//func float64MinusByScalarSelsAvx2(x float64, ys, rs []float64, sels []int64) []float64 { +// const regItems = 32 / 8 +// n := len(sels) / regItems +// float64MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = ys[sels[i]] - x +// } +// return rs +//} + +//func float64MinusByScalarSelsAvx512(x float64, ys, rs []float64, sels []int64) []float64 { +// const regItems = 64 / 8 +// n := len(sels) / regItems +// float64MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems]) +// for i, j := n * regItems, len(sels); i < j; i++ { +// rs[i] = ys[sels[i]] - x +// } +// return rs +//}