Commit 0081575d authored by bRong Njam

Improve SIMD implementations of MAX/MIN

parent 0f034373
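The unsigned MAX kernels in this diff keep several vector accumulators whose starting value must be the identity element of unsigned maximum, which is zero. The old sequences built that value by loading 0 into BX, broadcasting it into one Y/Z register, and copying it to the remaining registers with VMOVDQU/VMOVDQU64; the new sequences zero each accumulator directly with VPXOR/VPXORQ, without the general-purpose-register round trip. A minimal scalar Go sketch of the same identity-element idea (maxUint8 is a hypothetical reference helper, not part of this commit):

package main

import "fmt"

// maxUint8 is a hypothetical scalar reference for the vectorized kernels:
// the accumulator starts at 0, the identity for unsigned max, just as the
// kernels now start their Y/Z accumulators at zero via VPXOR/VPXORQ.
func maxUint8(x []uint8) uint8 {
	var acc uint8 // zero value = identity element for unsigned max
	for _, v := range x {
		if v > acc {
			acc = v
		}
	}
	return acc
}

func main() {
	fmt.Println(maxUint8([]uint8{3, 250, 17})) // prints 250
}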
@@ -149,18 +149,15 @@ int32MaxDone:
// func uint8MaxAvx2Asm(x []uint8, r []uint8)
// Requires: AVX, AVX2, SSE2
TEXT ·uint8MaxAvx2Asm(SB), NOSPLIT, $0-48
- MOVQ x_base+0(FP), AX
- MOVQ r_base+24(FP), CX
- MOVQ x_len+8(FP), DX
- MOVQ $0x0000000000000000, BX
- MOVQ BX, X0
- VPBROADCASTB X0, Y0
- VMOVDQU Y0, Y1
- VMOVDQU Y0, Y2
- VMOVDQU Y0, Y3
- VMOVDQU Y0, Y4
- VMOVDQU Y0, Y5
- VMOVDQU Y0, Y0
+ MOVQ x_base+0(FP), AX
+ MOVQ r_base+24(FP), CX
+ MOVQ x_len+8(FP), DX
+ VPXOR Y0, Y0, Y0
+ VPXOR Y1, Y1, Y1
+ VPXOR Y2, Y2, Y2
+ VPXOR Y3, Y3, Y3
+ VPXOR Y4, Y4, Y4
+ VPXOR Y5, Y5, Y5
uint8MaxBlockLoop:
CMPQ DX, $0x000000c0
@@ -197,18 +194,15 @@ uint8MaxDone:
// func uint16MaxAvx2Asm(x []uint16, r []uint16)
// Requires: AVX, AVX2, SSE2, SSE4.1
TEXT ·uint16MaxAvx2Asm(SB), NOSPLIT, $0-48
- MOVQ x_base+0(FP), AX
- MOVQ r_base+24(FP), CX
- MOVQ x_len+8(FP), DX
- MOVQ $0x0000000000000000, BX
- MOVQ BX, X0
- VPBROADCASTW X0, Y0
- VMOVDQU Y0, Y1
- VMOVDQU Y0, Y2
- VMOVDQU Y0, Y3
- VMOVDQU Y0, Y4
- VMOVDQU Y0, Y5
- VMOVDQU Y0, Y0
+ MOVQ x_base+0(FP), AX
+ MOVQ r_base+24(FP), CX
+ MOVQ x_len+8(FP), DX
+ VPXOR Y0, Y0, Y0
+ VPXOR Y1, Y1, Y1
+ VPXOR Y2, Y2, Y2
+ VPXOR Y3, Y3, Y3
+ VPXOR Y4, Y4, Y4
+ VPXOR Y5, Y5, Y5
uint16MaxBlockLoop:
CMPQ DX, $0x00000060
@@ -245,18 +239,15 @@ uint16MaxDone:
// func uint32MaxAvx2Asm(x []uint32, r []uint32)
// Requires: AVX, AVX2, SSE2, SSE4.1
TEXT ·uint32MaxAvx2Asm(SB), NOSPLIT, $0-48
- MOVQ x_base+0(FP), AX
- MOVQ r_base+24(FP), CX
- MOVQ x_len+8(FP), DX
- MOVQ $0x0000000000000000, BX
- MOVQ BX, X0
- VPBROADCASTD X0, Y0
- VMOVDQU Y0, Y1
- VMOVDQU Y0, Y2
- VMOVDQU Y0, Y3
- VMOVDQU Y0, Y4
- VMOVDQU Y0, Y5
- VMOVDQU Y0, Y0
+ MOVQ x_base+0(FP), AX
+ MOVQ r_base+24(FP), CX
+ MOVQ x_len+8(FP), DX
+ VPXOR Y0, Y0, Y0
+ VPXOR Y1, Y1, Y1
+ VPXOR Y2, Y2, Y2
+ VPXOR Y3, Y3, Y3
+ VPXOR Y4, Y4, Y4
+ VPXOR Y5, Y5, Y5
uint32MaxBlockLoop:
CMPQ DX, $0x00000030
......
@@ -277,24 +277,21 @@ int64MaxDone:
// func uint8MaxAvx512Asm(x []uint8, r []uint8)
// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2
TEXT ·uint8MaxAvx512Asm(SB), NOSPLIT, $0-48
- MOVQ x_base+0(FP), AX
- MOVQ r_base+24(FP), CX
- MOVQ x_len+8(FP), DX
- MOVQ $0x0000000000000000, BX
- MOVQ BX, X0
- VPBROADCASTB X0, Z0
- VMOVDQU64 Z0, Z1
- VMOVDQU64 Z0, Z2
- VMOVDQU64 Z0, Z3
- VMOVDQU64 Z0, Z4
- VMOVDQU64 Z0, Z5
- VMOVDQU64 Z0, Z6
- VMOVDQU64 Z0, Z7
- VMOVDQU64 Z0, Z8
- VMOVDQU64 Z0, Z9
- VMOVDQU64 Z0, Z10
- VMOVDQU64 Z0, Z11
- VMOVDQU64 Z0, Z0
+ MOVQ x_base+0(FP), AX
+ MOVQ r_base+24(FP), CX
+ MOVQ x_len+8(FP), DX
+ VPXORQ Z0, Z0, Z0
+ VPXORQ Z1, Z1, Z1
+ VPXORQ Z2, Z2, Z2
+ VPXORQ Z3, Z3, Z3
+ VPXORQ Z4, Z4, Z4
+ VPXORQ Z5, Z5, Z5
+ VPXORQ Z6, Z6, Z6
+ VPXORQ Z7, Z7, Z7
+ VPXORQ Z8, Z8, Z8
+ VPXORQ Z9, Z9, Z9
+ VPXORQ Z10, Z10, Z10
+ VPXORQ Z11, Z11, Z11
uint8MaxBlockLoop:
CMPQ DX, $0x00000300
@@ -345,24 +342,21 @@ uint8MaxDone:
// func uint16MaxAvx512Asm(x []uint16, r []uint16)
// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2, SSE4.1
TEXT ·uint16MaxAvx512Asm(SB), NOSPLIT, $0-48
- MOVQ x_base+0(FP), AX
- MOVQ r_base+24(FP), CX
- MOVQ x_len+8(FP), DX
- MOVQ $0x0000000000000000, BX
- MOVQ BX, X0
- VPBROADCASTW X0, Z0
- VMOVDQU64 Z0, Z1
- VMOVDQU64 Z0, Z2
- VMOVDQU64 Z0, Z3
- VMOVDQU64 Z0, Z4
- VMOVDQU64 Z0, Z5
- VMOVDQU64 Z0, Z6
- VMOVDQU64 Z0, Z7
- VMOVDQU64 Z0, Z8
- VMOVDQU64 Z0, Z9
- VMOVDQU64 Z0, Z10
- VMOVDQU64 Z0, Z11
- VMOVDQU64 Z0, Z0
+ MOVQ x_base+0(FP), AX
+ MOVQ r_base+24(FP), CX
+ MOVQ x_len+8(FP), DX
+ VPXORQ Z0, Z0, Z0
+ VPXORQ Z1, Z1, Z1
+ VPXORQ Z2, Z2, Z2
+ VPXORQ Z3, Z3, Z3
+ VPXORQ Z4, Z4, Z4
+ VPXORQ Z5, Z5, Z5
+ VPXORQ Z6, Z6, Z6
+ VPXORQ Z7, Z7, Z7
+ VPXORQ Z8, Z8, Z8
+ VPXORQ Z9, Z9, Z9
+ VPXORQ Z10, Z10, Z10
+ VPXORQ Z11, Z11, Z11
uint16MaxBlockLoop:
CMPQ DX, $0x00000180
@@ -413,24 +407,21 @@ uint16MaxDone:
// func uint32MaxAvx512Asm(x []uint32, r []uint32)
// Requires: AVX, AVX2, AVX512F, SSE2, SSE4.1
TEXT ·uint32MaxAvx512Asm(SB), NOSPLIT, $0-48
- MOVQ x_base+0(FP), AX
- MOVQ r_base+24(FP), CX
- MOVQ x_len+8(FP), DX
- MOVQ $0x0000000000000000, BX
- MOVQ BX, X0
- VPBROADCASTD X0, Z0
- VMOVDQU64 Z0, Z1
- VMOVDQU64 Z0, Z2
- VMOVDQU64 Z0, Z3
- VMOVDQU64 Z0, Z4
- VMOVDQU64 Z0, Z5
- VMOVDQU64 Z0, Z6
- VMOVDQU64 Z0, Z7
- VMOVDQU64 Z0, Z8
- VMOVDQU64 Z0, Z9
- VMOVDQU64 Z0, Z10
- VMOVDQU64 Z0, Z11
- VMOVDQU64 Z0, Z0
+ MOVQ x_base+0(FP), AX
+ MOVQ r_base+24(FP), CX
+ MOVQ x_len+8(FP), DX
+ VPXORQ Z0, Z0, Z0
+ VPXORQ Z1, Z1, Z1
+ VPXORQ Z2, Z2, Z2
+ VPXORQ Z3, Z3, Z3
+ VPXORQ Z4, Z4, Z4
+ VPXORQ Z5, Z5, Z5
+ VPXORQ Z6, Z6, Z6
+ VPXORQ Z7, Z7, Z7
+ VPXORQ Z8, Z8, Z8
+ VPXORQ Z9, Z9, Z9
+ VPXORQ Z10, Z10, Z10
+ VPXORQ Z11, Z11, Z11
uint32MaxBlockLoop:
CMPQ DX, $0x000000c0
@@ -481,24 +472,21 @@ uint32MaxDone:
// func uint64MaxAvx512Asm(x []uint64, r []uint64)
// Requires: AVX, AVX512F, AVX512VL, SSE2
TEXT ·uint64MaxAvx512Asm(SB), NOSPLIT, $0-48
- MOVQ x_base+0(FP), AX
- MOVQ r_base+24(FP), CX
- MOVQ x_len+8(FP), DX
- MOVQ $0x0000000000000000, BX
- MOVQ BX, X0
- VPBROADCASTQ X0, Z0
- VMOVDQU64 Z0, Z1
- VMOVDQU64 Z0, Z2
- VMOVDQU64 Z0, Z3
- VMOVDQU64 Z0, Z4
- VMOVDQU64 Z0, Z5
- VMOVDQU64 Z0, Z6
- VMOVDQU64 Z0, Z7
- VMOVDQU64 Z0, Z8
- VMOVDQU64 Z0, Z9
- VMOVDQU64 Z0, Z10
- VMOVDQU64 Z0, Z11
- VMOVDQU64 Z0, Z0
+ MOVQ x_base+0(FP), AX
+ MOVQ r_base+24(FP), CX
+ MOVQ x_len+8(FP), DX
+ VPXORQ Z0, Z0, Z0
+ VPXORQ Z1, Z1, Z1
+ VPXORQ Z2, Z2, Z2
+ VPXORQ Z3, Z3, Z3
+ VPXORQ Z4, Z4, Z4
+ VPXORQ Z5, Z5, Z5
+ VPXORQ Z6, Z6, Z6
+ VPXORQ Z7, Z7, Z7
+ VPXORQ Z8, Z8, Z8
+ VPXORQ Z9, Z9, Z9
+ VPXORQ Z10, Z10, Z10
+ VPXORQ Z11, Z11, Z11
uint64MaxBlockLoop:
CMPQ DX, $0x00000060
......
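The unsigned MIN kernels that follow make the symmetric change. Their accumulators must start at the identity element of unsigned minimum, all-ones (the largest unsigned value). Instead of broadcasting $0xffffffffffffffff from BX and copying it around, the new sequences set each register to all-ones by comparing it with itself (VPCMPEQQ), since an equal comparison fills every lane with ones. A minimal scalar Go sketch of the same idea (minUint8 is a hypothetical reference helper, not part of this commit):

package main

import "fmt"

// minUint8 is a hypothetical scalar reference for the vectorized kernels:
// the accumulator starts at ^uint8(0) (0xFF), the identity for unsigned min,
// just as the kernels now start their Y/Z accumulators at all-ones via
// VPCMPEQQ reg, reg, reg.
func minUint8(x []uint8) uint8 {
	acc := ^uint8(0) // all ones = identity element for unsigned min
	for _, v := range x {
		if v < acc {
			acc = v
		}
	}
	return acc
}

func main() {
	fmt.Println(minUint8([]uint8{3, 250, 17})) // prints 3
}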
@@ -149,18 +149,15 @@ int32MinDone:
// func uint8MinAvx2Asm(x []uint8, r []uint8)
// Requires: AVX, AVX2, SSE2
TEXT ·uint8MinAvx2Asm(SB), NOSPLIT, $0-48
- MOVQ x_base+0(FP), AX
- MOVQ r_base+24(FP), CX
- MOVQ x_len+8(FP), DX
- MOVQ $0xffffffffffffffff, BX
- MOVQ BX, X0
- VPBROADCASTB X0, Y0
- VMOVDQU Y0, Y1
- VMOVDQU Y0, Y2
- VMOVDQU Y0, Y3
- VMOVDQU Y0, Y4
- VMOVDQU Y0, Y5
- VMOVDQU Y0, Y0
+ MOVQ x_base+0(FP), AX
+ MOVQ r_base+24(FP), CX
+ MOVQ x_len+8(FP), DX
+ VPCMPEQQ Y0, Y0, Y0
+ VPCMPEQQ Y1, Y1, Y1
+ VPCMPEQQ Y2, Y2, Y2
+ VPCMPEQQ Y3, Y3, Y3
+ VPCMPEQQ Y4, Y4, Y4
+ VPCMPEQQ Y5, Y5, Y5
uint8MinBlockLoop:
CMPQ DX, $0x000000c0
@@ -197,18 +194,15 @@ uint8MinDone:
// func uint16MinAvx2Asm(x []uint16, r []uint16)
// Requires: AVX, AVX2, SSE2, SSE4.1
TEXT ·uint16MinAvx2Asm(SB), NOSPLIT, $0-48
- MOVQ x_base+0(FP), AX
- MOVQ r_base+24(FP), CX
- MOVQ x_len+8(FP), DX
- MOVQ $0xffffffffffffffff, BX
- MOVQ BX, X0
- VPBROADCASTW X0, Y0
- VMOVDQU Y0, Y1
- VMOVDQU Y0, Y2
- VMOVDQU Y0, Y3
- VMOVDQU Y0, Y4
- VMOVDQU Y0, Y5
- VMOVDQU Y0, Y0
+ MOVQ x_base+0(FP), AX
+ MOVQ r_base+24(FP), CX
+ MOVQ x_len+8(FP), DX
+ VPCMPEQQ Y0, Y0, Y0
+ VPCMPEQQ Y1, Y1, Y1
+ VPCMPEQQ Y2, Y2, Y2
+ VPCMPEQQ Y3, Y3, Y3
+ VPCMPEQQ Y4, Y4, Y4
+ VPCMPEQQ Y5, Y5, Y5
uint16MinBlockLoop:
CMPQ DX, $0x00000060
@@ -245,18 +239,15 @@ uint16MinDone:
// func uint32MinAvx2Asm(x []uint32, r []uint32)
// Requires: AVX, AVX2, SSE2, SSE4.1
TEXT ·uint32MinAvx2Asm(SB), NOSPLIT, $0-48
- MOVQ x_base+0(FP), AX
- MOVQ r_base+24(FP), CX
- MOVQ x_len+8(FP), DX
- MOVQ $0xffffffffffffffff, BX
- MOVQ BX, X0
- VPBROADCASTD X0, Y0
- VMOVDQU Y0, Y1
- VMOVDQU Y0, Y2
- VMOVDQU Y0, Y3
- VMOVDQU Y0, Y4
- VMOVDQU Y0, Y5
- VMOVDQU Y0, Y0
+ MOVQ x_base+0(FP), AX
+ MOVQ r_base+24(FP), CX
+ MOVQ x_len+8(FP), DX
+ VPCMPEQQ Y0, Y0, Y0
+ VPCMPEQQ Y1, Y1, Y1
+ VPCMPEQQ Y2, Y2, Y2
+ VPCMPEQQ Y3, Y3, Y3
+ VPCMPEQQ Y4, Y4, Y4
+ VPCMPEQQ Y5, Y5, Y5
uint32MinBlockLoop:
CMPQ DX, $0x00000030
......
@@ -277,24 +277,21 @@ int64MinDone:
// func uint8MinAvx512Asm(x []uint8, r []uint8)
// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2
TEXT ·uint8MinAvx512Asm(SB), NOSPLIT, $0-48
- MOVQ x_base+0(FP), AX
- MOVQ r_base+24(FP), CX
- MOVQ x_len+8(FP), DX
- MOVQ $0xffffffffffffffff, BX
- MOVQ BX, X0
- VPBROADCASTB X0, Z0
- VMOVDQU64 Z0, Z1
- VMOVDQU64 Z0, Z2
- VMOVDQU64 Z0, Z3
- VMOVDQU64 Z0, Z4
- VMOVDQU64 Z0, Z5
- VMOVDQU64 Z0, Z6
- VMOVDQU64 Z0, Z7
- VMOVDQU64 Z0, Z8
- VMOVDQU64 Z0, Z9
- VMOVDQU64 Z0, Z10
- VMOVDQU64 Z0, Z11
- VMOVDQU64 Z0, Z0
+ MOVQ x_base+0(FP), AX
+ MOVQ r_base+24(FP), CX
+ MOVQ x_len+8(FP), DX
+ VPCMPEQQ Z0, Z0, Z0
+ VPCMPEQQ Z1, Z1, Z1
+ VPCMPEQQ Z2, Z2, Z2
+ VPCMPEQQ Z3, Z3, Z3
+ VPCMPEQQ Z4, Z4, Z4
+ VPCMPEQQ Z5, Z5, Z5
+ VPCMPEQQ Z6, Z6, Z6
+ VPCMPEQQ Z7, Z7, Z7
+ VPCMPEQQ Z8, Z8, Z8
+ VPCMPEQQ Z9, Z9, Z9
+ VPCMPEQQ Z10, Z10, Z10
+ VPCMPEQQ Z11, Z11, Z11
uint8MinBlockLoop:
CMPQ DX, $0x00000300
@@ -345,24 +342,21 @@ uint8MinDone:
// func uint16MinAvx512Asm(x []uint16, r []uint16)
// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2, SSE4.1
TEXT ·uint16MinAvx512Asm(SB), NOSPLIT, $0-48
- MOVQ x_base+0(FP), AX
- MOVQ r_base+24(FP), CX
- MOVQ x_len+8(FP), DX
- MOVQ $0xffffffffffffffff, BX
- MOVQ BX, X0
- VPBROADCASTW X0, Z0
- VMOVDQU64 Z0, Z1
- VMOVDQU64 Z0, Z2
- VMOVDQU64 Z0, Z3
- VMOVDQU64 Z0, Z4
- VMOVDQU64 Z0, Z5
- VMOVDQU64 Z0, Z6
- VMOVDQU64 Z0, Z7
- VMOVDQU64 Z0, Z8
- VMOVDQU64 Z0, Z9
- VMOVDQU64 Z0, Z10
- VMOVDQU64 Z0, Z11
- VMOVDQU64 Z0, Z0
+ MOVQ x_base+0(FP), AX
+ MOVQ r_base+24(FP), CX
+ MOVQ x_len+8(FP), DX
+ VPCMPEQQ Z0, Z0, Z0
+ VPCMPEQQ Z1, Z1, Z1
+ VPCMPEQQ Z2, Z2, Z2
+ VPCMPEQQ Z3, Z3, Z3
+ VPCMPEQQ Z4, Z4, Z4
+ VPCMPEQQ Z5, Z5, Z5
+ VPCMPEQQ Z6, Z6, Z6
+ VPCMPEQQ Z7, Z7, Z7
+ VPCMPEQQ Z8, Z8, Z8
+ VPCMPEQQ Z9, Z9, Z9
+ VPCMPEQQ Z10, Z10, Z10
+ VPCMPEQQ Z11, Z11, Z11
uint16MinBlockLoop:
CMPQ DX, $0x00000180
@@ -413,24 +407,21 @@ uint16MinDone:
// func uint32MinAvx512Asm(x []uint32, r []uint32)
// Requires: AVX, AVX2, AVX512F, SSE2, SSE4.1
TEXT ·uint32MinAvx512Asm(SB), NOSPLIT, $0-48
- MOVQ x_base+0(FP), AX
- MOVQ r_base+24(FP), CX
- MOVQ x_len+8(FP), DX
- MOVQ $0xffffffffffffffff, BX
- MOVQ BX, X0
- VPBROADCASTD X0, Z0
- VMOVDQU64 Z0, Z1
- VMOVDQU64 Z0, Z2
- VMOVDQU64 Z0, Z3
- VMOVDQU64 Z0, Z4
- VMOVDQU64 Z0, Z5
- VMOVDQU64 Z0, Z6
- VMOVDQU64 Z0, Z7
- VMOVDQU64 Z0, Z8
- VMOVDQU64 Z0, Z9
- VMOVDQU64 Z0, Z10
- VMOVDQU64 Z0, Z11
- VMOVDQU64 Z0, Z0
+ MOVQ x_base+0(FP), AX
+ MOVQ r_base+24(FP), CX
+ MOVQ x_len+8(FP), DX
+ VPCMPEQQ Z0, Z0, Z0
+ VPCMPEQQ Z1, Z1, Z1
+ VPCMPEQQ Z2, Z2, Z2
+ VPCMPEQQ Z3, Z3, Z3
+ VPCMPEQQ Z4, Z4, Z4
+ VPCMPEQQ Z5, Z5, Z5
+ VPCMPEQQ Z6, Z6, Z6
+ VPCMPEQQ Z7, Z7, Z7
+ VPCMPEQQ Z8, Z8, Z8
+ VPCMPEQQ Z9, Z9, Z9
+ VPCMPEQQ Z10, Z10, Z10
+ VPCMPEQQ Z11, Z11, Z11
uint32MinBlockLoop:
CMPQ DX, $0x000000c0
@@ -481,24 +472,21 @@ uint32MinDone:
// func uint64MinAvx512Asm(x []uint64, r []uint64)
// Requires: AVX, AVX512F, AVX512VL, SSE2
TEXT ·uint64MinAvx512Asm(SB), NOSPLIT, $0-48
- MOVQ x_base+0(FP), AX
- MOVQ r_base+24(FP), CX
- MOVQ x_len+8(FP), DX
- MOVQ $0xffffffffffffffff, BX
- MOVQ BX, X0
- VPBROADCASTQ X0, Z0
- VMOVDQU64 Z0, Z1
- VMOVDQU64 Z0, Z2
- VMOVDQU64 Z0, Z3
- VMOVDQU64 Z0, Z4
- VMOVDQU64 Z0, Z5
- VMOVDQU64 Z0, Z6
- VMOVDQU64 Z0, Z7
- VMOVDQU64 Z0, Z8
- VMOVDQU64 Z0, Z9
- VMOVDQU64 Z0, Z10
- VMOVDQU64 Z0, Z11
- VMOVDQU64 Z0, Z0
+ MOVQ x_base+0(FP), AX
+ MOVQ r_base+24(FP), CX
+ MOVQ x_len+8(FP), DX
+ VPCMPEQQ Z0, Z0, Z0
+ VPCMPEQQ Z1, Z1, Z1
+ VPCMPEQQ Z2, Z2, Z2
+ VPCMPEQQ Z3, Z3, Z3
+ VPCMPEQQ Z4, Z4, Z4
+ VPCMPEQQ Z5, Z5, Z5
+ VPCMPEQQ Z6, Z6, Z6
+ VPCMPEQQ Z7, Z7, Z7
+ VPCMPEQQ Z8, Z8, Z8
+ VPCMPEQQ Z9, Z9, Z9
+ VPCMPEQQ Z10, Z10, Z10
+ VPCMPEQQ Z11, Z11, Z11
uint64MinBlockLoop:
CMPQ DX, $0x00000060
......