From 0081575d05a026b65da89cd1e2135bd896e5309d Mon Sep 17 00:00:00 2001
From: bRong Njam <longran1989@gmail.com>
Date: Fri, 19 Mar 2021 14:09:08 +0800
Subject: [PATCH] Improve SIMD implementations of MAX/MIN

---
 pkg/vectorize/max/avx2.s   |  63 ++++++++----------
 pkg/vectorize/max/avx512.s | 132 +++++++++++++++++--------------------
 pkg/vectorize/min/avx2.s   |  63 ++++++++----------
 pkg/vectorize/min/avx512.s | 132 +++++++++++++++++--------------------
 4 files changed, 174 insertions(+), 216 deletions(-)

diff --git a/pkg/vectorize/max/avx2.s b/pkg/vectorize/max/avx2.s
index 6a5523979..82ff9c9ed 100644
--- a/pkg/vectorize/max/avx2.s
+++ b/pkg/vectorize/max/avx2.s
@@ -149,18 +149,15 @@ int32MaxDone:
 // func uint8MaxAvx2Asm(x []uint8, r []uint8)
 // Requires: AVX, AVX2, SSE2
 TEXT ·uint8MaxAvx2Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0x0000000000000000, BX
-    MOVQ BX, X0
-    VPBROADCASTB X0, Y0
-    VMOVDQU Y0, Y1
-    VMOVDQU Y0, Y2
-    VMOVDQU Y0, Y3
-    VMOVDQU Y0, Y4
-    VMOVDQU Y0, Y5
-    VMOVDQU Y0, Y0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPXOR Y0, Y0, Y0
+    VPXOR Y1, Y1, Y1
+    VPXOR Y2, Y2, Y2
+    VPXOR Y3, Y3, Y3
+    VPXOR Y4, Y4, Y4
+    VPXOR Y5, Y5, Y5

 uint8MaxBlockLoop:
     CMPQ DX, $0x000000c0
@@ -197,18 +194,15 @@ uint8MaxDone:
 // func uint16MaxAvx2Asm(x []uint16, r []uint16)
 // Requires: AVX, AVX2, SSE2, SSE4.1
 TEXT ·uint16MaxAvx2Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0x0000000000000000, BX
-    MOVQ BX, X0
-    VPBROADCASTW X0, Y0
-    VMOVDQU Y0, Y1
-    VMOVDQU Y0, Y2
-    VMOVDQU Y0, Y3
-    VMOVDQU Y0, Y4
-    VMOVDQU Y0, Y5
-    VMOVDQU Y0, Y0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPXOR Y0, Y0, Y0
+    VPXOR Y1, Y1, Y1
+    VPXOR Y2, Y2, Y2
+    VPXOR Y3, Y3, Y3
+    VPXOR Y4, Y4, Y4
+    VPXOR Y5, Y5, Y5

 uint16MaxBlockLoop:
     CMPQ DX, $0x00000060
@@ -245,18 +239,15 @@ uint16MaxDone:
 // func uint32MaxAvx2Asm(x []uint32, r []uint32)
 // Requires: AVX, AVX2, SSE2, SSE4.1
 TEXT ·uint32MaxAvx2Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0x0000000000000000, BX
-    MOVQ BX, X0
-    VPBROADCASTD X0, Y0
-    VMOVDQU Y0, Y1
-    VMOVDQU Y0, Y2
-    VMOVDQU Y0, Y3
-    VMOVDQU Y0, Y4
-    VMOVDQU Y0, Y5
-    VMOVDQU Y0, Y0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPXOR Y0, Y0, Y0
+    VPXOR Y1, Y1, Y1
+    VPXOR Y2, Y2, Y2
+    VPXOR Y3, Y3, Y3
+    VPXOR Y4, Y4, Y4
+    VPXOR Y5, Y5, Y5

 uint32MaxBlockLoop:
     CMPQ DX, $0x00000030
diff --git a/pkg/vectorize/max/avx512.s b/pkg/vectorize/max/avx512.s
index 38218bcb7..bc09e01e1 100644
--- a/pkg/vectorize/max/avx512.s
+++ b/pkg/vectorize/max/avx512.s
@@ -277,24 +277,21 @@ int64MaxDone:
 // func uint8MaxAvx512Asm(x []uint8, r []uint8)
 // Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2
 TEXT ·uint8MaxAvx512Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0x0000000000000000, BX
-    MOVQ BX, X0
-    VPBROADCASTB X0, Z0
-    VMOVDQU64 Z0, Z1
-    VMOVDQU64 Z0, Z2
-    VMOVDQU64 Z0, Z3
-    VMOVDQU64 Z0, Z4
-    VMOVDQU64 Z0, Z5
-    VMOVDQU64 Z0, Z6
-    VMOVDQU64 Z0, Z7
-    VMOVDQU64 Z0, Z8
-    VMOVDQU64 Z0, Z9
-    VMOVDQU64 Z0, Z10
-    VMOVDQU64 Z0, Z11
-    VMOVDQU64 Z0, Z0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPXORQ Z0, Z0, Z0
+    VPXORQ Z1, Z1, Z1
+    VPXORQ Z2, Z2, Z2
+    VPXORQ Z3, Z3, Z3
+    VPXORQ Z4, Z4, Z4
+    VPXORQ Z5, Z5, Z5
+    VPXORQ Z6, Z6, Z6
+    VPXORQ Z7, Z7, Z7
+    VPXORQ Z8, Z8, Z8
+    VPXORQ Z9, Z9, Z9
+    VPXORQ Z10, Z10, Z10
+    VPXORQ Z11, Z11, Z11

 uint8MaxBlockLoop:
     CMPQ DX, $0x00000300
@@ -345,24 +342,21 @@ uint8MaxDone:
 // func uint16MaxAvx512Asm(x []uint16, r []uint16)
 // Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2, SSE4.1
 TEXT ·uint16MaxAvx512Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0x0000000000000000, BX
-    MOVQ BX, X0
-    VPBROADCASTW X0, Z0
-    VMOVDQU64 Z0, Z1
-    VMOVDQU64 Z0, Z2
-    VMOVDQU64 Z0, Z3
-    VMOVDQU64 Z0, Z4
-    VMOVDQU64 Z0, Z5
-    VMOVDQU64 Z0, Z6
-    VMOVDQU64 Z0, Z7
-    VMOVDQU64 Z0, Z8
-    VMOVDQU64 Z0, Z9
-    VMOVDQU64 Z0, Z10
-    VMOVDQU64 Z0, Z11
-    VMOVDQU64 Z0, Z0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPXORQ Z0, Z0, Z0
+    VPXORQ Z1, Z1, Z1
+    VPXORQ Z2, Z2, Z2
+    VPXORQ Z3, Z3, Z3
+    VPXORQ Z4, Z4, Z4
+    VPXORQ Z5, Z5, Z5
+    VPXORQ Z6, Z6, Z6
+    VPXORQ Z7, Z7, Z7
+    VPXORQ Z8, Z8, Z8
+    VPXORQ Z9, Z9, Z9
+    VPXORQ Z10, Z10, Z10
+    VPXORQ Z11, Z11, Z11

 uint16MaxBlockLoop:
     CMPQ DX, $0x00000180
@@ -413,24 +407,21 @@ uint16MaxDone:
 // func uint32MaxAvx512Asm(x []uint32, r []uint32)
 // Requires: AVX, AVX2, AVX512F, SSE2, SSE4.1
 TEXT ·uint32MaxAvx512Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0x0000000000000000, BX
-    MOVQ BX, X0
-    VPBROADCASTD X0, Z0
-    VMOVDQU64 Z0, Z1
-    VMOVDQU64 Z0, Z2
-    VMOVDQU64 Z0, Z3
-    VMOVDQU64 Z0, Z4
-    VMOVDQU64 Z0, Z5
-    VMOVDQU64 Z0, Z6
-    VMOVDQU64 Z0, Z7
-    VMOVDQU64 Z0, Z8
-    VMOVDQU64 Z0, Z9
-    VMOVDQU64 Z0, Z10
-    VMOVDQU64 Z0, Z11
-    VMOVDQU64 Z0, Z0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPXORQ Z0, Z0, Z0
+    VPXORQ Z1, Z1, Z1
+    VPXORQ Z2, Z2, Z2
+    VPXORQ Z3, Z3, Z3
+    VPXORQ Z4, Z4, Z4
+    VPXORQ Z5, Z5, Z5
+    VPXORQ Z6, Z6, Z6
+    VPXORQ Z7, Z7, Z7
+    VPXORQ Z8, Z8, Z8
+    VPXORQ Z9, Z9, Z9
+    VPXORQ Z10, Z10, Z10
+    VPXORQ Z11, Z11, Z11

 uint32MaxBlockLoop:
     CMPQ DX, $0x000000c0
@@ -481,24 +472,21 @@ uint32MaxDone:
 // func uint64MaxAvx512Asm(x []uint64, r []uint64)
 // Requires: AVX, AVX512F, AVX512VL, SSE2
 TEXT ·uint64MaxAvx512Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0x0000000000000000, BX
-    MOVQ BX, X0
-    VPBROADCASTQ X0, Z0
-    VMOVDQU64 Z0, Z1
-    VMOVDQU64 Z0, Z2
-    VMOVDQU64 Z0, Z3
-    VMOVDQU64 Z0, Z4
-    VMOVDQU64 Z0, Z5
-    VMOVDQU64 Z0, Z6
-    VMOVDQU64 Z0, Z7
-    VMOVDQU64 Z0, Z8
-    VMOVDQU64 Z0, Z9
-    VMOVDQU64 Z0, Z10
-    VMOVDQU64 Z0, Z11
-    VMOVDQU64 Z0, Z0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPXORQ Z0, Z0, Z0
+    VPXORQ Z1, Z1, Z1
+    VPXORQ Z2, Z2, Z2
+    VPXORQ Z3, Z3, Z3
+    VPXORQ Z4, Z4, Z4
+    VPXORQ Z5, Z5, Z5
+    VPXORQ Z6, Z6, Z6
+    VPXORQ Z7, Z7, Z7
+    VPXORQ Z8, Z8, Z8
+    VPXORQ Z9, Z9, Z9
+    VPXORQ Z10, Z10, Z10
+    VPXORQ Z11, Z11, Z11

 uint64MaxBlockLoop:
     CMPQ DX, $0x00000060
diff --git a/pkg/vectorize/min/avx2.s b/pkg/vectorize/min/avx2.s
index 1b2bdd873..de70b9c85 100644
--- a/pkg/vectorize/min/avx2.s
+++ b/pkg/vectorize/min/avx2.s
@@ -149,18 +149,15 @@ int32MinDone:
 // func uint8MinAvx2Asm(x []uint8, r []uint8)
 // Requires: AVX, AVX2, SSE2
 TEXT ·uint8MinAvx2Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0xffffffffffffffff, BX
-    MOVQ BX, X0
-    VPBROADCASTB X0, Y0
-    VMOVDQU Y0, Y1
-    VMOVDQU Y0, Y2
-    VMOVDQU Y0, Y3
-    VMOVDQU Y0, Y4
-    VMOVDQU Y0, Y5
-    VMOVDQU Y0, Y0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPCMPEQQ Y0, Y0, Y0
+    VPCMPEQQ Y1, Y1, Y1
+    VPCMPEQQ Y2, Y2, Y2
+    VPCMPEQQ Y3, Y3, Y3
+    VPCMPEQQ Y4, Y4, Y4
+    VPCMPEQQ Y5, Y5, Y5

 uint8MinBlockLoop:
     CMPQ DX, $0x000000c0
@@ -197,18 +194,15 @@ uint8MinDone:
 // func uint16MinAvx2Asm(x []uint16, r []uint16)
 // Requires: AVX, AVX2, SSE2, SSE4.1
 TEXT ·uint16MinAvx2Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0xffffffffffffffff, BX
-    MOVQ BX, X0
-    VPBROADCASTW X0, Y0
-    VMOVDQU Y0, Y1
-    VMOVDQU Y0, Y2
-    VMOVDQU Y0, Y3
-    VMOVDQU Y0, Y4
-    VMOVDQU Y0, Y5
-    VMOVDQU Y0, Y0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPCMPEQQ Y0, Y0, Y0
+    VPCMPEQQ Y1, Y1, Y1
+    VPCMPEQQ Y2, Y2, Y2
+    VPCMPEQQ Y3, Y3, Y3
+    VPCMPEQQ Y4, Y4, Y4
+    VPCMPEQQ Y5, Y5, Y5

 uint16MinBlockLoop:
     CMPQ DX, $0x00000060
@@ -245,18 +239,15 @@ uint16MinDone:
 // func uint32MinAvx2Asm(x []uint32, r []uint32)
 // Requires: AVX, AVX2, SSE2, SSE4.1
 TEXT ·uint32MinAvx2Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0xffffffffffffffff, BX
-    MOVQ BX, X0
-    VPBROADCASTD X0, Y0
-    VMOVDQU Y0, Y1
-    VMOVDQU Y0, Y2
-    VMOVDQU Y0, Y3
-    VMOVDQU Y0, Y4
-    VMOVDQU Y0, Y5
-    VMOVDQU Y0, Y0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPCMPEQQ Y0, Y0, Y0
+    VPCMPEQQ Y1, Y1, Y1
+    VPCMPEQQ Y2, Y2, Y2
+    VPCMPEQQ Y3, Y3, Y3
+    VPCMPEQQ Y4, Y4, Y4
+    VPCMPEQQ Y5, Y5, Y5

 uint32MinBlockLoop:
     CMPQ DX, $0x00000030
diff --git a/pkg/vectorize/min/avx512.s b/pkg/vectorize/min/avx512.s
index 0175db4ed..0a75b602e 100644
--- a/pkg/vectorize/min/avx512.s
+++ b/pkg/vectorize/min/avx512.s
@@ -277,24 +277,21 @@ int64MinDone:
 // func uint8MinAvx512Asm(x []uint8, r []uint8)
 // Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2
 TEXT ·uint8MinAvx512Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0xffffffffffffffff, BX
-    MOVQ BX, X0
-    VPBROADCASTB X0, Z0
-    VMOVDQU64 Z0, Z1
-    VMOVDQU64 Z0, Z2
-    VMOVDQU64 Z0, Z3
-    VMOVDQU64 Z0, Z4
-    VMOVDQU64 Z0, Z5
-    VMOVDQU64 Z0, Z6
-    VMOVDQU64 Z0, Z7
-    VMOVDQU64 Z0, Z8
-    VMOVDQU64 Z0, Z9
-    VMOVDQU64 Z0, Z10
-    VMOVDQU64 Z0, Z11
-    VMOVDQU64 Z0, Z0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPCMPEQQ Z0, Z0, Z0
+    VPCMPEQQ Z1, Z1, Z1
+    VPCMPEQQ Z2, Z2, Z2
+    VPCMPEQQ Z3, Z3, Z3
+    VPCMPEQQ Z4, Z4, Z4
+    VPCMPEQQ Z5, Z5, Z5
+    VPCMPEQQ Z6, Z6, Z6
+    VPCMPEQQ Z7, Z7, Z7
+    VPCMPEQQ Z8, Z8, Z8
+    VPCMPEQQ Z9, Z9, Z9
+    VPCMPEQQ Z10, Z10, Z10
+    VPCMPEQQ Z11, Z11, Z11

 uint8MinBlockLoop:
     CMPQ DX, $0x00000300
@@ -345,24 +342,21 @@ uint8MinDone:
 // func uint16MinAvx512Asm(x []uint16, r []uint16)
 // Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2, SSE4.1
 TEXT ·uint16MinAvx512Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0xffffffffffffffff, BX
-    MOVQ BX, X0
-    VPBROADCASTW X0, Z0
-    VMOVDQU64 Z0, Z1
-    VMOVDQU64 Z0, Z2
-    VMOVDQU64 Z0, Z3
-    VMOVDQU64 Z0, Z4
-    VMOVDQU64 Z0, Z5
-    VMOVDQU64 Z0, Z6
-    VMOVDQU64 Z0, Z7
-    VMOVDQU64 Z0, Z8
-    VMOVDQU64 Z0, Z9
-    VMOVDQU64 Z0, Z10
-    VMOVDQU64 Z0, Z11
-    VMOVDQU64 Z0, Z0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPCMPEQQ Z0, Z0, Z0
+    VPCMPEQQ Z1, Z1, Z1
+    VPCMPEQQ Z2, Z2, Z2
+    VPCMPEQQ Z3, Z3, Z3
+    VPCMPEQQ Z4, Z4, Z4
+    VPCMPEQQ Z5, Z5, Z5
+    VPCMPEQQ Z6, Z6, Z6
+    VPCMPEQQ Z7, Z7, Z7
+    VPCMPEQQ Z8, Z8, Z8
+    VPCMPEQQ Z9, Z9, Z9
+    VPCMPEQQ Z10, Z10, Z10
+    VPCMPEQQ Z11, Z11, Z11

 uint16MinBlockLoop:
     CMPQ DX, $0x00000180
@@ -413,24 +407,21 @@ uint16MinDone:
 // func uint32MinAvx512Asm(x []uint32, r []uint32)
 // Requires: AVX, AVX2, AVX512F, SSE2, SSE4.1
 TEXT ·uint32MinAvx512Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0xffffffffffffffff, BX
-    MOVQ BX, X0
-    VPBROADCASTD X0, Z0
-    VMOVDQU64 Z0, Z1
-    VMOVDQU64 Z0, Z2
-    VMOVDQU64 Z0, Z3
-    VMOVDQU64 Z0, Z4
-    VMOVDQU64 Z0, Z5
-    VMOVDQU64 Z0, Z6
-    VMOVDQU64 Z0, Z7
-    VMOVDQU64 Z0, Z8
-    VMOVDQU64 Z0, Z9
-    VMOVDQU64 Z0, Z10
-    VMOVDQU64 Z0, Z11
-    VMOVDQU64 Z0, Z0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPCMPEQQ Z0, Z0, Z0
+    VPCMPEQQ Z1, Z1, Z1
+    VPCMPEQQ Z2, Z2, Z2
+    VPCMPEQQ Z3, Z3, Z3
+    VPCMPEQQ Z4, Z4, Z4
+    VPCMPEQQ Z5, Z5, Z5
+    VPCMPEQQ Z6, Z6, Z6
+    VPCMPEQQ Z7, Z7, Z7
+    VPCMPEQQ Z8, Z8, Z8
+    VPCMPEQQ Z9, Z9, Z9
+    VPCMPEQQ Z10, Z10, Z10
+    VPCMPEQQ Z11, Z11, Z11

 uint32MinBlockLoop:
     CMPQ DX, $0x000000c0
@@ -481,24 +472,21 @@ uint32MinDone:
 // func uint64MinAvx512Asm(x []uint64, r []uint64)
 // Requires: AVX, AVX512F, AVX512VL, SSE2
 TEXT ·uint64MinAvx512Asm(SB), NOSPLIT, $0-48
-    MOVQ x_base+0(FP), AX
-    MOVQ r_base+24(FP), CX
-    MOVQ x_len+8(FP), DX
-    MOVQ $0xffffffffffffffff, BX
-    MOVQ BX, X0
-    VPBROADCASTQ X0, Z0
-    VMOVDQU64 Z0, Z1
-    VMOVDQU64 Z0, Z2
-    VMOVDQU64 Z0, Z3
-    VMOVDQU64 Z0, Z4
-    VMOVDQU64 Z0, Z5
-    VMOVDQU64 Z0, Z6
-    VMOVDQU64 Z0, Z7
-    VMOVDQU64 Z0, Z8
-    VMOVDQU64 Z0, Z9
-    VMOVDQU64 Z0, Z10
-    VMOVDQU64 Z0, Z11
-    VMOVDQU64 Z0, Z0
+    MOVQ x_base+0(FP), AX
+    MOVQ r_base+24(FP), CX
+    MOVQ x_len+8(FP), DX
+    VPCMPEQQ Z0, Z0, Z0
+    VPCMPEQQ Z1, Z1, Z1
+    VPCMPEQQ Z2, Z2, Z2
+    VPCMPEQQ Z3, Z3, Z3
+    VPCMPEQQ Z4, Z4, Z4
+    VPCMPEQQ Z5, Z5, Z5
+    VPCMPEQQ Z6, Z6, Z6
+    VPCMPEQQ Z7, Z7, Z7
+    VPCMPEQQ Z8, Z8, Z8
+    VPCMPEQQ Z9, Z9, Z9
+    VPCMPEQQ Z10, Z10, Z10
+    VPCMPEQQ Z11, Z11, Z11

 uint64MinBlockLoop:
     CMPQ DX, $0x00000060
--
GitLab
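Note on the initialization idioms above (an illustrative aside, not part of the patch): XOR-ing a vector register with itself (VPXOR/VPXORQ) is the standard x86 zeroing idiom, and an equality compare of a register with itself sets every lane to all-ones, so the MAX accumulators start at 0 and the MIN accumulators start at the largest unsigned value, the identity elements of the two reductions, without routing a scalar constant through BX/X0 and a broadcast. The short Go sketch below shows the scalar analogue of those seeds for uint8; the slice contents and variable names are made up for illustration and are not taken from the repository.

package main

import (
	"fmt"
	"math"
)

func main() {
	// Scalar analogue of the SIMD seeds:
	//   VPXOR   Yn, Yn, Yn -> every byte lane is 0x00 (identity for unsigned MAX)
	//   VPCMPEQ Yn, Yn, Yn -> every byte lane is 0xFF (identity for unsigned MIN)
	var maxAcc uint8 = 0
	var minAcc uint8 = math.MaxUint8

	x := []uint8{17, 3, 250, 42} // hypothetical input
	for _, v := range x {
		if v > maxAcc {
			maxAcc = v
		}
		if v < minAcc {
			minAcc = v
		}
	}
	fmt.Println("max:", maxAcc, "min:", minAcc) // prints: max: 250 min: 3
}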