diff --git a/pkg/vectorize/max/avx2.s b/pkg/vectorize/max/avx2.s
index 6a552397978c26fbae5d19379730c4c8ca4e0659..82ff9c9ed3a615327b975ad98c112cebea8558fc 100644
--- a/pkg/vectorize/max/avx2.s
+++ b/pkg/vectorize/max/avx2.s
@@ -149,18 +149,15 @@ int32MaxDone:
 // func uint8MaxAvx2Asm(x []uint8, r []uint8)
 // Requires: AVX, AVX2, SSE2
 TEXT ·uint8MaxAvx2Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0x0000000000000000, BX
-    MOVQ BX, X0
-    VPBROADCASTB X0, Y0
-    VMOVDQU Y0, Y1
-    VMOVDQU Y0, Y2
-    VMOVDQU Y0, Y3
-    VMOVDQU Y0, Y4
-    VMOVDQU Y0, Y5
-    VMOVDQU Y0, Y0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPXOR Y0, Y0, Y0
+    VPXOR Y1, Y1, Y1
+    VPXOR Y2, Y2, Y2
+    VPXOR Y3, Y3, Y3
+    VPXOR Y4, Y4, Y4
+    VPXOR Y5, Y5, Y5

 uint8MaxBlockLoop:
     CMPQ DX, $0x000000c0
@@ -197,18 +194,15 @@ uint8MaxDone:
 // func uint16MaxAvx2Asm(x []uint16, r []uint16)
 // Requires: AVX, AVX2, SSE2, SSE4.1
 TEXT ·uint16MaxAvx2Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0x0000000000000000, BX
-    MOVQ BX, X0
-    VPBROADCASTW X0, Y0
-    VMOVDQU Y0, Y1
-    VMOVDQU Y0, Y2
-    VMOVDQU Y0, Y3
-    VMOVDQU Y0, Y4
-    VMOVDQU Y0, Y5
-    VMOVDQU Y0, Y0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPXOR Y0, Y0, Y0
+    VPXOR Y1, Y1, Y1
+    VPXOR Y2, Y2, Y2
+    VPXOR Y3, Y3, Y3
+    VPXOR Y4, Y4, Y4
+    VPXOR Y5, Y5, Y5

 uint16MaxBlockLoop:
     CMPQ DX, $0x00000060
@@ -245,18 +239,15 @@ uint16MaxDone:
 // func uint32MaxAvx2Asm(x []uint32, r []uint32)
 // Requires: AVX, AVX2, SSE2, SSE4.1
 TEXT ·uint32MaxAvx2Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0x0000000000000000, BX
-    MOVQ BX, X0
-    VPBROADCASTD X0, Y0
-    VMOVDQU Y0, Y1
-    VMOVDQU Y0, Y2
-    VMOVDQU Y0, Y3
-    VMOVDQU Y0, Y4
-    VMOVDQU Y0, Y5
-    VMOVDQU Y0, Y0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPXOR Y0, Y0, Y0
+    VPXOR Y1, Y1, Y1
+    VPXOR Y2, Y2, Y2
+    VPXOR Y3, Y3, Y3
+    VPXOR Y4, Y4, Y4
+    VPXOR Y5, Y5, Y5

 uint32MaxBlockLoop:
     CMPQ DX, $0x00000030
diff --git a/pkg/vectorize/max/avx512.s b/pkg/vectorize/max/avx512.s
index 38218bcb7a0f96942bc11d2ea7f823da1c2f60ff..bc09e01e1e6940b6740de7de0b6ff6a4d1fd5538 100644
--- a/pkg/vectorize/max/avx512.s
+++ b/pkg/vectorize/max/avx512.s
@@ -277,24 +277,21 @@ int64MaxDone:
 // func uint8MaxAvx512Asm(x []uint8, r []uint8)
 // Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2
 TEXT ·uint8MaxAvx512Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0x0000000000000000, BX
-    MOVQ BX, X0
-    VPBROADCASTB X0, Z0
-    VMOVDQU64 Z0, Z1
-    VMOVDQU64 Z0, Z2
-    VMOVDQU64 Z0, Z3
-    VMOVDQU64 Z0, Z4
-    VMOVDQU64 Z0, Z5
-    VMOVDQU64 Z0, Z6
-    VMOVDQU64 Z0, Z7
-    VMOVDQU64 Z0, Z8
-    VMOVDQU64 Z0, Z9
-    VMOVDQU64 Z0, Z10
-    VMOVDQU64 Z0, Z11
-    VMOVDQU64 Z0, Z0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPXORQ Z0, Z0, Z0
+    VPXORQ Z1, Z1, Z1
+    VPXORQ Z2, Z2, Z2
+    VPXORQ Z3, Z3, Z3
+    VPXORQ Z4, Z4, Z4
+    VPXORQ Z5, Z5, Z5
+    VPXORQ Z6, Z6, Z6
+    VPXORQ Z7, Z7, Z7
+    VPXORQ Z8, Z8, Z8
+    VPXORQ Z9, Z9, Z9
+    VPXORQ Z10, Z10, Z10
+    VPXORQ Z11, Z11, Z11

 uint8MaxBlockLoop:
     CMPQ DX, $0x00000300
@@ -345,24 +342,21 @@ uint8MaxDone:
 // func uint16MaxAvx512Asm(x []uint16, r []uint16)
 // Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2, SSE4.1
 TEXT ·uint16MaxAvx512Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0x0000000000000000, BX
-    MOVQ BX, X0
-    VPBROADCASTW X0, Z0
-    VMOVDQU64 Z0, Z1
-    VMOVDQU64 Z0, Z2
-    VMOVDQU64 Z0, Z3
-    VMOVDQU64 Z0, Z4
-    VMOVDQU64 Z0, Z5
-    VMOVDQU64 Z0, Z6
-    VMOVDQU64 Z0, Z7
-    VMOVDQU64 Z0, Z8
-    VMOVDQU64 Z0, Z9
-    VMOVDQU64 Z0, Z10
-    VMOVDQU64 Z0, Z11
-    VMOVDQU64 Z0, Z0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPXORQ Z0, Z0, Z0
+    VPXORQ Z1, Z1, Z1
+    VPXORQ Z2, Z2, Z2
+    VPXORQ Z3, Z3, Z3
+    VPXORQ Z4, Z4, Z4
+    VPXORQ Z5, Z5, Z5
+    VPXORQ Z6, Z6, Z6
+    VPXORQ Z7, Z7, Z7
+    VPXORQ Z8, Z8, Z8
+    VPXORQ Z9, Z9, Z9
+    VPXORQ Z10, Z10, Z10
+    VPXORQ Z11, Z11, Z11

 uint16MaxBlockLoop:
     CMPQ DX, $0x00000180
@@ -413,24 +407,21 @@ uint16MaxDone:
 // func uint32MaxAvx512Asm(x []uint32, r []uint32)
 // Requires: AVX, AVX2, AVX512F, SSE2, SSE4.1
 TEXT ·uint32MaxAvx512Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0x0000000000000000, BX
-    MOVQ BX, X0
-    VPBROADCASTD X0, Z0
-    VMOVDQU64 Z0, Z1
-    VMOVDQU64 Z0, Z2
-    VMOVDQU64 Z0, Z3
-    VMOVDQU64 Z0, Z4
-    VMOVDQU64 Z0, Z5
-    VMOVDQU64 Z0, Z6
-    VMOVDQU64 Z0, Z7
-    VMOVDQU64 Z0, Z8
-    VMOVDQU64 Z0, Z9
-    VMOVDQU64 Z0, Z10
-    VMOVDQU64 Z0, Z11
-    VMOVDQU64 Z0, Z0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPXORQ Z0, Z0, Z0
+    VPXORQ Z1, Z1, Z1
+    VPXORQ Z2, Z2, Z2
+    VPXORQ Z3, Z3, Z3
+    VPXORQ Z4, Z4, Z4
+    VPXORQ Z5, Z5, Z5
+    VPXORQ Z6, Z6, Z6
+    VPXORQ Z7, Z7, Z7
+    VPXORQ Z8, Z8, Z8
+    VPXORQ Z9, Z9, Z9
+    VPXORQ Z10, Z10, Z10
+    VPXORQ Z11, Z11, Z11

 uint32MaxBlockLoop:
     CMPQ DX, $0x000000c0
@@ -481,24 +472,21 @@ uint32MaxDone:
 // func uint64MaxAvx512Asm(x []uint64, r []uint64)
 // Requires: AVX, AVX512F, AVX512VL, SSE2
 TEXT ·uint64MaxAvx512Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0x0000000000000000, BX
-    MOVQ BX, X0
-    VPBROADCASTQ X0, Z0
-    VMOVDQU64 Z0, Z1
-    VMOVDQU64 Z0, Z2
-    VMOVDQU64 Z0, Z3
-    VMOVDQU64 Z0, Z4
-    VMOVDQU64 Z0, Z5
-    VMOVDQU64 Z0, Z6
-    VMOVDQU64 Z0, Z7
-    VMOVDQU64 Z0, Z8
-    VMOVDQU64 Z0, Z9
-    VMOVDQU64 Z0, Z10
-    VMOVDQU64 Z0, Z11
-    VMOVDQU64 Z0, Z0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPXORQ Z0, Z0, Z0
+    VPXORQ Z1, Z1, Z1
+    VPXORQ Z2, Z2, Z2
+    VPXORQ Z3, Z3, Z3
+    VPXORQ Z4, Z4, Z4
+    VPXORQ Z5, Z5, Z5
+    VPXORQ Z6, Z6, Z6
+    VPXORQ Z7, Z7, Z7
+    VPXORQ Z8, Z8, Z8
+    VPXORQ Z9, Z9, Z9
+    VPXORQ Z10, Z10, Z10
+    VPXORQ Z11, Z11, Z11

 uint64MaxBlockLoop:
     CMPQ DX, $0x00000060
diff --git a/pkg/vectorize/min/avx2.s b/pkg/vectorize/min/avx2.s
index 1b2bdd87369fe32d9da7a586fef653b84e38fcf2..de70b9c85d465d48949cc1ea57597ecec9beebf8 100644
--- a/pkg/vectorize/min/avx2.s
+++ b/pkg/vectorize/min/avx2.s
@@ -149,18 +149,15 @@ int32MinDone:
 // func uint8MinAvx2Asm(x []uint8, r []uint8)
 // Requires: AVX, AVX2, SSE2
 TEXT ·uint8MinAvx2Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0xffffffffffffffff, BX
-    MOVQ BX, X0
-    VPBROADCASTB X0, Y0
-    VMOVDQU Y0, Y1
-    VMOVDQU Y0, Y2
-    VMOVDQU Y0, Y3
-    VMOVDQU Y0, Y4
-    VMOVDQU Y0, Y5
-    VMOVDQU Y0, Y0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPCMPEQQ Y0, Y0, Y0
+    VPCMPEQQ Y1, Y1, Y1
+    VPCMPEQQ Y2, Y2, Y2
+    VPCMPEQQ Y3, Y3, Y3
+    VPCMPEQQ Y4, Y4, Y4
+    VPCMPEQQ Y5, Y5, Y5

 uint8MinBlockLoop:
     CMPQ DX, $0x000000c0
@@ -197,18 +194,15 @@ uint8MinDone:
 // func uint16MinAvx2Asm(x []uint16, r []uint16)
 // Requires: AVX, AVX2, SSE2, SSE4.1
 TEXT ·uint16MinAvx2Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0xffffffffffffffff, BX
-    MOVQ BX, X0
-    VPBROADCASTW X0, Y0
-    VMOVDQU Y0, Y1
-    VMOVDQU Y0, Y2
-    VMOVDQU Y0, Y3
-    VMOVDQU Y0, Y4
-    VMOVDQU Y0, Y5
-    VMOVDQU Y0, Y0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPCMPEQQ Y0, Y0, Y0
+    VPCMPEQQ Y1, Y1, Y1
+    VPCMPEQQ Y2, Y2, Y2
+    VPCMPEQQ Y3, Y3, Y3
+    VPCMPEQQ Y4, Y4, Y4
+    VPCMPEQQ Y5, Y5, Y5

 uint16MinBlockLoop:
     CMPQ DX, $0x00000060
@@ -245,18 +239,15 @@ uint16MinDone:
 // func uint32MinAvx2Asm(x []uint32, r []uint32)
 // Requires: AVX, AVX2, SSE2, SSE4.1
 TEXT ·uint32MinAvx2Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0xffffffffffffffff, BX
-    MOVQ BX, X0
-    VPBROADCASTD X0, Y0
-    VMOVDQU Y0, Y1
-    VMOVDQU Y0, Y2
-    VMOVDQU Y0, Y3
-    VMOVDQU Y0, Y4
-    VMOVDQU Y0, Y5
-    VMOVDQU Y0, Y0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPCMPEQQ Y0, Y0, Y0
+    VPCMPEQQ Y1, Y1, Y1
+    VPCMPEQQ Y2, Y2, Y2
+    VPCMPEQQ Y3, Y3, Y3
+    VPCMPEQQ Y4, Y4, Y4
+    VPCMPEQQ Y5, Y5, Y5

 uint32MinBlockLoop:
     CMPQ DX, $0x00000030
diff --git a/pkg/vectorize/min/avx512.s b/pkg/vectorize/min/avx512.s
index 0175db4ed5e95a61fac4379b0a09ee4855ac77ab..0a75b602ec688307d45bf666ab01f47fc35c6a3f 100644
--- a/pkg/vectorize/min/avx512.s
+++ b/pkg/vectorize/min/avx512.s
@@ -277,24 +277,21 @@ int64MinDone:
 // func uint8MinAvx512Asm(x []uint8, r []uint8)
 // Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2
 TEXT ·uint8MinAvx512Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0xffffffffffffffff, BX
-    MOVQ BX, X0
-    VPBROADCASTB X0, Z0
-    VMOVDQU64 Z0, Z1
-    VMOVDQU64 Z0, Z2
-    VMOVDQU64 Z0, Z3
-    VMOVDQU64 Z0, Z4
-    VMOVDQU64 Z0, Z5
-    VMOVDQU64 Z0, Z6
-    VMOVDQU64 Z0, Z7
-    VMOVDQU64 Z0, Z8
-    VMOVDQU64 Z0, Z9
-    VMOVDQU64 Z0, Z10
-    VMOVDQU64 Z0, Z11
-    VMOVDQU64 Z0, Z0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPCMPEQQ Z0, Z0, Z0
+    VPCMPEQQ Z1, Z1, Z1
+    VPCMPEQQ Z2, Z2, Z2
+    VPCMPEQQ Z3, Z3, Z3
+    VPCMPEQQ Z4, Z4, Z4
+    VPCMPEQQ Z5, Z5, Z5
+    VPCMPEQQ Z6, Z6, Z6
+    VPCMPEQQ Z7, Z7, Z7
+    VPCMPEQQ Z8, Z8, Z8
+    VPCMPEQQ Z9, Z9, Z9
+    VPCMPEQQ Z10, Z10, Z10
+    VPCMPEQQ Z11, Z11, Z11

 uint8MinBlockLoop:
     CMPQ DX, $0x00000300
@@ -345,24 +342,21 @@ uint8MinDone:
 // func uint16MinAvx512Asm(x []uint16, r []uint16)
 // Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2, SSE4.1
 TEXT ·uint16MinAvx512Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0xffffffffffffffff, BX
-    MOVQ BX, X0
-    VPBROADCASTW X0, Z0
-    VMOVDQU64 Z0, Z1
-    VMOVDQU64 Z0, Z2
-    VMOVDQU64 Z0, Z3
-    VMOVDQU64 Z0, Z4
-    VMOVDQU64 Z0, Z5
-    VMOVDQU64 Z0, Z6
-    VMOVDQU64 Z0, Z7
-    VMOVDQU64 Z0, Z8
-    VMOVDQU64 Z0, Z9
-    VMOVDQU64 Z0, Z10
-    VMOVDQU64 Z0, Z11
-    VMOVDQU64 Z0, Z0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPCMPEQQ Z0, Z0, Z0
+    VPCMPEQQ Z1, Z1, Z1
+    VPCMPEQQ Z2, Z2, Z2
+    VPCMPEQQ Z3, Z3, Z3
+    VPCMPEQQ Z4, Z4, Z4
+    VPCMPEQQ Z5, Z5, Z5
+    VPCMPEQQ Z6, Z6, Z6
+    VPCMPEQQ Z7, Z7, Z7
+    VPCMPEQQ Z8, Z8, Z8
+    VPCMPEQQ Z9, Z9, Z9
+    VPCMPEQQ Z10, Z10, Z10
+    VPCMPEQQ Z11, Z11, Z11

 uint16MinBlockLoop:
     CMPQ DX, $0x00000180
@@ -413,24 +407,21 @@ uint16MinDone:
 // func uint32MinAvx512Asm(x []uint32, r []uint32)
 // Requires: AVX, AVX2, AVX512F, SSE2, SSE4.1
 TEXT ·uint32MinAvx512Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0xffffffffffffffff, BX
-    MOVQ BX, X0
-    VPBROADCASTD X0, Z0
-    VMOVDQU64 Z0, Z1
-    VMOVDQU64 Z0, Z2
-    VMOVDQU64 Z0, Z3
-    VMOVDQU64 Z0, Z4
-    VMOVDQU64 Z0, Z5
-    VMOVDQU64 Z0, Z6
-    VMOVDQU64 Z0, Z7
-    VMOVDQU64 Z0, Z8
-    VMOVDQU64 Z0, Z9
-    VMOVDQU64 Z0, Z10
-    VMOVDQU64 Z0, Z11
-    VMOVDQU64 Z0, Z0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPCMPEQQ Z0, Z0, Z0
+    VPCMPEQQ Z1, Z1, Z1
+    VPCMPEQQ Z2, Z2, Z2
+    VPCMPEQQ Z3, Z3, Z3
+    VPCMPEQQ Z4, Z4, Z4
+    VPCMPEQQ Z5, Z5, Z5
+    VPCMPEQQ Z6, Z6, Z6
+    VPCMPEQQ Z7, Z7, Z7
+    VPCMPEQQ Z8, Z8, Z8
+    VPCMPEQQ Z9, Z9, Z9
+    VPCMPEQQ Z10, Z10, Z10
+    VPCMPEQQ Z11, Z11, Z11

 uint32MinBlockLoop:
     CMPQ DX, $0x000000c0
@@ -481,24 +472,21 @@ uint32MinDone:
 // func uint64MinAvx512Asm(x []uint64, r []uint64)
 // Requires: AVX, AVX512F, AVX512VL, SSE2
 TEXT ·uint64MinAvx512Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0xffffffffffffffff, BX
-    MOVQ BX, X0
-    VPBROADCASTQ X0, Z0
-    VMOVDQU64 Z0, Z1
-    VMOVDQU64 Z0, Z2
-    VMOVDQU64 Z0, Z3
-    VMOVDQU64 Z0, Z4
-    VMOVDQU64 Z0, Z5
-    VMOVDQU64 Z0, Z6
-    VMOVDQU64 Z0, Z7
-    VMOVDQU64 Z0, Z8
-    VMOVDQU64 Z0, Z9
-    VMOVDQU64 Z0, Z10
-    VMOVDQU64 Z0, Z11
-    VMOVDQU64 Z0, Z0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPCMPEQQ Z0, Z0, Z0
+    VPCMPEQQ Z1, Z1, Z1
+    VPCMPEQQ Z2, Z2, Z2
+    VPCMPEQQ Z3, Z3, Z3
+    VPCMPEQQ Z4, Z4, Z4
+    VPCMPEQQ Z5, Z5, Z5
+    VPCMPEQQ Z6, Z6, Z6
+    VPCMPEQQ Z7, Z7, Z7
+    VPCMPEQQ Z8, Z8, Z8
+    VPCMPEQQ Z9, Z9, Z9
+    VPCMPEQQ Z10, Z10, Z10
+    VPCMPEQQ Z11, Z11, Z11

 uint64MinBlockLoop:
     CMPQ DX, $0x00000060
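
Why this works: every max kernel above now seeds its accumulator registers with the zeroing idiom (VPXOR/VPXORQ of a register with itself), and every min kernel with the all-ones idiom (VPCMPEQQ of a register with itself, which compares equal in every lane and so fills it with ones). Zero is the identity for an unsigned max and all-ones, the largest unsigned value, is the identity for an unsigned min, so the seeds match the constants the old MOVQ/VPBROADCAST*/VMOVDQU sequences materialized, while freeing BX and X0 and dropping the redundant self-copy into Y0/Z0.

A minimal pure-Go sketch of the same seeding, for illustration only. uint64MaxRef and uint64MinRef are hypothetical reference helpers, not functions from this package; like the kernel seeds, they return the identity element for empty input.

package main

import (
	"fmt"
	"math"
)

// uint64MaxRef folds max over x starting from 0, the value that
// VPXORQ Zn, Zn, Zn leaves in every accumulator lane.
func uint64MaxRef(x []uint64) uint64 {
	acc := uint64(0)
	for _, v := range x {
		if v > acc {
			acc = v
		}
	}
	return acc
}

// uint64MinRef folds min over x starting from all ones
// (math.MaxUint64), the pattern the min kernels seed.
func uint64MinRef(x []uint64) uint64 {
	acc := uint64(math.MaxUint64)
	for _, v := range x {
		if v < acc {
			acc = v
		}
	}
	return acc
}

func main() {
	x := []uint64{3, 9, 4, 1, 5}
	fmt.Println(uint64MaxRef(x), uint64MinRef(x)) // prints: 9 1
}

Since 0 <= v and v <= math.MaxUint64 for every unsigned v, neither seed can displace a real element, so the rewritten prologues leave the kernels computing the same results as the broadcast-based ones.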