diff --git a/pkg/vectorize/max/avx2.s b/pkg/vectorize/max/avx2.s new file mode 100644 index 0000000000000000000000000000000000000000..6a552397978c26fbae5d19379730c4c8ca4e0659 --- /dev/null +++ b/pkg/vectorize/max/avx2.s @@ -0,0 +1,387 @@ +// Code generated by command: go run avx2.go -out max/avx2.s -stubs max/avx2_stubs.go. DO NOT EDIT. + +#include "textflag.h" + +// func int8MaxAvx2Asm(x []int8, r []int8) +// Requires: AVX, AVX2, SSE2, SSE4.1 +TEXT ·int8MaxAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x0000000000000080, BX + MOVQ BX, X0 + VPBROADCASTB X0, Y0 + VMOVDQU Y0, Y1 + VMOVDQU Y0, Y2 + VMOVDQU Y0, Y3 + VMOVDQU Y0, Y4 + VMOVDQU Y0, Y5 + VMOVDQU Y0, Y0 + +int8MaxBlockLoop: + CMPQ DX, $0x000000c0 + JL int8MaxTailLoop + VPMAXSB (AX), Y1, Y1 + VPMAXSB 32(AX), Y2, Y2 + VPMAXSB 64(AX), Y3, Y3 + VPMAXSB 96(AX), Y4, Y4 + VPMAXSB 128(AX), Y5, Y5 + VPMAXSB 160(AX), Y0, Y0 + ADDQ $0x000000c0, AX + SUBQ $0x000000c0, DX + JMP int8MaxBlockLoop + +int8MaxTailLoop: + CMPQ DX, $0x00000004 + JL int8MaxDone + VPMAXSB (AX), Y1, Y1 + ADDQ $0x00000020, AX + SUBQ $0x00000020, DX + JMP int8MaxTailLoop + +int8MaxDone: + VPMAXSB Y1, Y2, Y1 + VPMAXSB Y1, Y3, Y1 + VPMAXSB Y1, Y4, Y1 + VPMAXSB Y1, Y5, Y1 + VPMAXSB Y1, Y0, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMAXSB X0, X1 + MOVOU X1, (CX) + RET + +// func int16MaxAvx2Asm(x []int16, r []int16) +// Requires: AVX, AVX2, SSE2 +TEXT ·int16MaxAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x0000000000008000, BX + MOVQ BX, X0 + VPBROADCASTW X0, Y0 + VMOVDQU Y0, Y1 + VMOVDQU Y0, Y2 + VMOVDQU Y0, Y3 + VMOVDQU Y0, Y4 + VMOVDQU Y0, Y5 + VMOVDQU Y0, Y0 + +int16MaxBlockLoop: + CMPQ DX, $0x00000060 + JL int16MaxTailLoop + VPMAXSW (AX), Y1, Y1 + VPMAXSW 32(AX), Y2, Y2 + VPMAXSW 64(AX), Y3, Y3 + VPMAXSW 96(AX), Y4, Y4 + VPMAXSW 128(AX), Y5, Y5 + VPMAXSW 160(AX), Y0, Y0 + ADDQ $0x000000c0, AX + SUBQ $0x00000060, DX + JMP int16MaxBlockLoop + +int16MaxTailLoop: + CMPQ DX, $0x00000004 + JL int16MaxDone + VPMAXSW (AX), Y1, Y1 + ADDQ $0x00000020, AX + SUBQ $0x00000010, DX + JMP int16MaxTailLoop + +int16MaxDone: + VPMAXSW Y1, Y2, Y1 + VPMAXSW Y1, Y3, Y1 + VPMAXSW Y1, Y4, Y1 + VPMAXSW Y1, Y5, Y1 + VPMAXSW Y1, Y0, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMAXSW X0, X1 + MOVOU X1, (CX) + RET + +// func int32MaxAvx2Asm(x []int32, r []int32) +// Requires: AVX, AVX2, SSE2, SSE4.1 +TEXT ·int32MaxAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x0000000080000000, BX + MOVQ BX, X0 + VPBROADCASTD X0, Y0 + VMOVDQU Y0, Y1 + VMOVDQU Y0, Y2 + VMOVDQU Y0, Y3 + VMOVDQU Y0, Y4 + VMOVDQU Y0, Y5 + VMOVDQU Y0, Y0 + +int32MaxBlockLoop: + CMPQ DX, $0x00000030 + JL int32MaxTailLoop + VPMAXSD (AX), Y1, Y1 + VPMAXSD 32(AX), Y2, Y2 + VPMAXSD 64(AX), Y3, Y3 + VPMAXSD 96(AX), Y4, Y4 + VPMAXSD 128(AX), Y5, Y5 + VPMAXSD 160(AX), Y0, Y0 + ADDQ $0x000000c0, AX + SUBQ $0x00000030, DX + JMP int32MaxBlockLoop + +int32MaxTailLoop: + CMPQ DX, $0x00000004 + JL int32MaxDone + VPMAXSD (AX), Y1, Y1 + ADDQ $0x00000020, AX + SUBQ $0x00000008, DX + JMP int32MaxTailLoop + +int32MaxDone: + VPMAXSD Y1, Y2, Y1 + VPMAXSD Y1, Y3, Y1 + VPMAXSD Y1, Y4, Y1 + VPMAXSD Y1, Y5, Y1 + VPMAXSD Y1, Y0, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMAXSD X0, X1 + MOVOU X1, (CX) + RET + +// func uint8MaxAvx2Asm(x []uint8, r []uint8) +// Requires: AVX, AVX2, SSE2 +TEXT ·uint8MaxAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ 
$0x0000000000000000, BX + MOVQ BX, X0 + VPBROADCASTB X0, Y0 + VMOVDQU Y0, Y1 + VMOVDQU Y0, Y2 + VMOVDQU Y0, Y3 + VMOVDQU Y0, Y4 + VMOVDQU Y0, Y5 + VMOVDQU Y0, Y0 + +uint8MaxBlockLoop: + CMPQ DX, $0x000000c0 + JL uint8MaxTailLoop + VPMAXUB (AX), Y1, Y1 + VPMAXUB 32(AX), Y2, Y2 + VPMAXUB 64(AX), Y3, Y3 + VPMAXUB 96(AX), Y4, Y4 + VPMAXUB 128(AX), Y5, Y5 + VPMAXUB 160(AX), Y0, Y0 + ADDQ $0x000000c0, AX + SUBQ $0x000000c0, DX + JMP uint8MaxBlockLoop + +uint8MaxTailLoop: + CMPQ DX, $0x00000004 + JL uint8MaxDone + VPMAXUB (AX), Y1, Y1 + ADDQ $0x00000020, AX + SUBQ $0x00000020, DX + JMP uint8MaxTailLoop + +uint8MaxDone: + VPMAXUB Y1, Y2, Y1 + VPMAXUB Y1, Y3, Y1 + VPMAXUB Y1, Y4, Y1 + VPMAXUB Y1, Y5, Y1 + VPMAXUB Y1, Y0, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMAXUB X0, X1 + MOVOU X1, (CX) + RET + +// func uint16MaxAvx2Asm(x []uint16, r []uint16) +// Requires: AVX, AVX2, SSE2, SSE4.1 +TEXT ·uint16MaxAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x0000000000000000, BX + MOVQ BX, X0 + VPBROADCASTW X0, Y0 + VMOVDQU Y0, Y1 + VMOVDQU Y0, Y2 + VMOVDQU Y0, Y3 + VMOVDQU Y0, Y4 + VMOVDQU Y0, Y5 + VMOVDQU Y0, Y0 + +uint16MaxBlockLoop: + CMPQ DX, $0x00000060 + JL uint16MaxTailLoop + VPMAXUW (AX), Y1, Y1 + VPMAXUW 32(AX), Y2, Y2 + VPMAXUW 64(AX), Y3, Y3 + VPMAXUW 96(AX), Y4, Y4 + VPMAXUW 128(AX), Y5, Y5 + VPMAXUW 160(AX), Y0, Y0 + ADDQ $0x000000c0, AX + SUBQ $0x00000060, DX + JMP uint16MaxBlockLoop + +uint16MaxTailLoop: + CMPQ DX, $0x00000004 + JL uint16MaxDone + VPMAXUW (AX), Y1, Y1 + ADDQ $0x00000020, AX + SUBQ $0x00000010, DX + JMP uint16MaxTailLoop + +uint16MaxDone: + VPMAXUW Y1, Y2, Y1 + VPMAXUW Y1, Y3, Y1 + VPMAXUW Y1, Y4, Y1 + VPMAXUW Y1, Y5, Y1 + VPMAXUW Y1, Y0, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMAXUW X0, X1 + MOVOU X1, (CX) + RET + +// func uint32MaxAvx2Asm(x []uint32, r []uint32) +// Requires: AVX, AVX2, SSE2, SSE4.1 +TEXT ·uint32MaxAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x0000000000000000, BX + MOVQ BX, X0 + VPBROADCASTD X0, Y0 + VMOVDQU Y0, Y1 + VMOVDQU Y0, Y2 + VMOVDQU Y0, Y3 + VMOVDQU Y0, Y4 + VMOVDQU Y0, Y5 + VMOVDQU Y0, Y0 + +uint32MaxBlockLoop: + CMPQ DX, $0x00000030 + JL uint32MaxTailLoop + VPMAXUD (AX), Y1, Y1 + VPMAXUD 32(AX), Y2, Y2 + VPMAXUD 64(AX), Y3, Y3 + VPMAXUD 96(AX), Y4, Y4 + VPMAXUD 128(AX), Y5, Y5 + VPMAXUD 160(AX), Y0, Y0 + ADDQ $0x000000c0, AX + SUBQ $0x00000030, DX + JMP uint32MaxBlockLoop + +uint32MaxTailLoop: + CMPQ DX, $0x00000004 + JL uint32MaxDone + VPMAXUD (AX), Y1, Y1 + ADDQ $0x00000020, AX + SUBQ $0x00000008, DX + JMP uint32MaxTailLoop + +uint32MaxDone: + VPMAXUD Y1, Y2, Y1 + VPMAXUD Y1, Y3, Y1 + VPMAXUD Y1, Y4, Y1 + VPMAXUD Y1, Y5, Y1 + VPMAXUD Y1, Y0, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMAXUD X0, X1 + MOVOU X1, (CX) + RET + +// func float32MaxAvx2Asm(x []float32, r []float32) +// Requires: AVX, AVX2, SSE, SSE2 +TEXT ·float32MaxAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x00000000ff7fffff, BX + MOVQ BX, X0 + VBROADCASTSS X0, Y0 + VMOVUPS Y0, Y1 + VMOVUPS Y0, Y2 + VMOVUPS Y0, Y3 + VMOVUPS Y0, Y4 + VMOVUPS Y0, Y5 + VMOVUPS Y0, Y0 + +float32MaxBlockLoop: + CMPQ DX, $0x00000030 + JL float32MaxTailLoop + VMAXPS (AX), Y1, Y1 + VMAXPS 32(AX), Y2, Y2 + VMAXPS 64(AX), Y3, Y3 + VMAXPS 96(AX), Y4, Y4 + VMAXPS 128(AX), Y5, Y5 + VMAXPS 160(AX), Y0, Y0 + ADDQ $0x000000c0, AX + SUBQ $0x00000030, DX + JMP float32MaxBlockLoop + +float32MaxTailLoop: + CMPQ DX, $0x00000004 + JL float32MaxDone + 
VMAXPS (AX), Y1, Y1 + ADDQ $0x00000020, AX + SUBQ $0x00000008, DX + JMP float32MaxTailLoop + +float32MaxDone: + VMAXPS Y1, Y2, Y1 + VMAXPS Y1, Y3, Y1 + VMAXPS Y1, Y4, Y1 + VMAXPS Y1, Y5, Y1 + VMAXPS Y1, Y0, Y1 + VEXTRACTF128 $0x01, Y1, X0 + MAXPS X0, X1 + MOVOU X1, (CX) + RET + +// func float64MaxAvx2Asm(x []float64, r []float64) +// Requires: AVX, AVX2, SSE2 +TEXT ·float64MaxAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0xffefffffffffffff, BX + MOVQ BX, X0 + VBROADCASTSD X0, Y0 + VMOVUPD Y0, Y1 + VMOVUPD Y0, Y2 + VMOVUPD Y0, Y3 + VMOVUPD Y0, Y4 + VMOVUPD Y0, Y5 + VMOVUPD Y0, Y0 + +float64MaxBlockLoop: + CMPQ DX, $0x00000018 + JL float64MaxTailLoop + VMAXPD (AX), Y1, Y1 + VMAXPD 32(AX), Y2, Y2 + VMAXPD 64(AX), Y3, Y3 + VMAXPD 96(AX), Y4, Y4 + VMAXPD 128(AX), Y5, Y5 + VMAXPD 160(AX), Y0, Y0 + ADDQ $0x000000c0, AX + SUBQ $0x00000018, DX + JMP float64MaxBlockLoop + +float64MaxTailLoop: + CMPQ DX, $0x00000004 + JL float64MaxDone + VMAXPD (AX), Y1, Y1 + ADDQ $0x00000020, AX + SUBQ $0x00000004, DX + JMP float64MaxTailLoop + +float64MaxDone: + VMAXPD Y1, Y2, Y1 + VMAXPD Y1, Y3, Y1 + VMAXPD Y1, Y4, Y1 + VMAXPD Y1, Y5, Y1 + VMAXPD Y1, Y0, Y1 + VEXTRACTF128 $0x01, Y1, X0 + MAXPD X0, X1 + MOVOU X1, (CX) + RET diff --git a/pkg/vectorize/max/avx2_stubs.go b/pkg/vectorize/max/avx2_stubs.go new file mode 100644 index 0000000000000000000000000000000000000000..3ac14bbcc51da5adfd1ded0b3bd0772d5a02f3fc --- /dev/null +++ b/pkg/vectorize/max/avx2_stubs.go @@ -0,0 +1,19 @@ +// Code generated by command: go run avx2.go -out max/avx2.s -stubs max/avx2_stubs.go. DO NOT EDIT. + +package max + +func int8MaxAvx2Asm(x []int8, r []int8) + +func int16MaxAvx2Asm(x []int16, r []int16) + +func int32MaxAvx2Asm(x []int32, r []int32) + +func uint8MaxAvx2Asm(x []uint8, r []uint8) + +func uint16MaxAvx2Asm(x []uint16, r []uint16) + +func uint32MaxAvx2Asm(x []uint32, r []uint32) + +func float32MaxAvx2Asm(x []float32, r []float32) + +func float64MaxAvx2Asm(x []float64, r []float64) diff --git a/pkg/vectorize/max/avx512.s b/pkg/vectorize/max/avx512.s new file mode 100644 index 0000000000000000000000000000000000000000..38218bcb7a0f96942bc11d2ea7f823da1c2f60ff --- /dev/null +++ b/pkg/vectorize/max/avx512.s @@ -0,0 +1,683 @@ +// Code generated by command: go run avx512.go -out avx512.s -stubs avx512_stubs.go. DO NOT EDIT. 
+ +#include "textflag.h" + +// func int8MaxAvx512Asm(x []int8, r []int8) +// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2, SSE4.1 +TEXT ·int8MaxAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x0000000000000080, BX + MOVQ BX, X0 + VPBROADCASTB X0, Z0 + VMOVDQU64 Z0, Z1 + VMOVDQU64 Z0, Z2 + VMOVDQU64 Z0, Z3 + VMOVDQU64 Z0, Z4 + VMOVDQU64 Z0, Z5 + VMOVDQU64 Z0, Z6 + VMOVDQU64 Z0, Z7 + VMOVDQU64 Z0, Z8 + VMOVDQU64 Z0, Z9 + VMOVDQU64 Z0, Z10 + VMOVDQU64 Z0, Z11 + VMOVDQU64 Z0, Z0 + +int8MaxBlockLoop: + CMPQ DX, $0x00000300 + JL int8MaxTailLoop + VPMAXSB (AX), Z1, Z1 + VPMAXSB 64(AX), Z2, Z2 + VPMAXSB 128(AX), Z3, Z3 + VPMAXSB 192(AX), Z4, Z4 + VPMAXSB 256(AX), Z5, Z5 + VPMAXSB 320(AX), Z6, Z6 + VPMAXSB 384(AX), Z7, Z7 + VPMAXSB 448(AX), Z8, Z8 + VPMAXSB 512(AX), Z9, Z9 + VPMAXSB 576(AX), Z10, Z10 + VPMAXSB 640(AX), Z11, Z11 + VPMAXSB 704(AX), Z0, Z0 + ADDQ $0x00000300, AX + SUBQ $0x00000300, DX + JMP int8MaxBlockLoop + +int8MaxTailLoop: + CMPQ DX, $0x00000004 + JL int8MaxDone + VPMAXSB (AX), Z1, Z1 + ADDQ $0x00000040, AX + SUBQ $0x00000040, DX + JMP int8MaxTailLoop + +int8MaxDone: + VPMAXSB Z1, Z2, Z1 + VPMAXSB Z1, Z3, Z1 + VPMAXSB Z1, Z4, Z1 + VPMAXSB Z1, Z5, Z1 + VPMAXSB Z1, Z6, Z1 + VPMAXSB Z1, Z7, Z1 + VPMAXSB Z1, Z8, Z1 + VPMAXSB Z1, Z9, Z1 + VPMAXSB Z1, Z10, Z1 + VPMAXSB Z1, Z11, Z1 + VPMAXSB Z1, Z0, Z1 + VEXTRACTI64X4 $0x01, Z1, Y0 + VPMAXSB Y0, Y1, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMAXSB X0, X1 + MOVOU X1, (CX) + RET + +// func int16MaxAvx512Asm(x []int16, r []int16) +// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2 +TEXT ·int16MaxAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x0000000000008000, BX + MOVQ BX, X0 + VPBROADCASTW X0, Z0 + VMOVDQU64 Z0, Z1 + VMOVDQU64 Z0, Z2 + VMOVDQU64 Z0, Z3 + VMOVDQU64 Z0, Z4 + VMOVDQU64 Z0, Z5 + VMOVDQU64 Z0, Z6 + VMOVDQU64 Z0, Z7 + VMOVDQU64 Z0, Z8 + VMOVDQU64 Z0, Z9 + VMOVDQU64 Z0, Z10 + VMOVDQU64 Z0, Z11 + VMOVDQU64 Z0, Z0 + +int16MaxBlockLoop: + CMPQ DX, $0x00000180 + JL int16MaxTailLoop + VPMAXSW (AX), Z1, Z1 + VPMAXSW 64(AX), Z2, Z2 + VPMAXSW 128(AX), Z3, Z3 + VPMAXSW 192(AX), Z4, Z4 + VPMAXSW 256(AX), Z5, Z5 + VPMAXSW 320(AX), Z6, Z6 + VPMAXSW 384(AX), Z7, Z7 + VPMAXSW 448(AX), Z8, Z8 + VPMAXSW 512(AX), Z9, Z9 + VPMAXSW 576(AX), Z10, Z10 + VPMAXSW 640(AX), Z11, Z11 + VPMAXSW 704(AX), Z0, Z0 + ADDQ $0x00000300, AX + SUBQ $0x00000180, DX + JMP int16MaxBlockLoop + +int16MaxTailLoop: + CMPQ DX, $0x00000004 + JL int16MaxDone + VPMAXSW (AX), Z1, Z1 + ADDQ $0x00000040, AX + SUBQ $0x00000020, DX + JMP int16MaxTailLoop + +int16MaxDone: + VPMAXSW Z1, Z2, Z1 + VPMAXSW Z1, Z3, Z1 + VPMAXSW Z1, Z4, Z1 + VPMAXSW Z1, Z5, Z1 + VPMAXSW Z1, Z6, Z1 + VPMAXSW Z1, Z7, Z1 + VPMAXSW Z1, Z8, Z1 + VPMAXSW Z1, Z9, Z1 + VPMAXSW Z1, Z10, Z1 + VPMAXSW Z1, Z11, Z1 + VPMAXSW Z1, Z0, Z1 + VEXTRACTI64X4 $0x01, Z1, Y0 + VPMAXSW Y0, Y1, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMAXSW X0, X1 + MOVOU X1, (CX) + RET + +// func int32MaxAvx512Asm(x []int32, r []int32) +// Requires: AVX, AVX2, AVX512F, SSE2, SSE4.1 +TEXT ·int32MaxAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x0000000080000000, BX + MOVQ BX, X0 + VPBROADCASTD X0, Z0 + VMOVDQU64 Z0, Z1 + VMOVDQU64 Z0, Z2 + VMOVDQU64 Z0, Z3 + VMOVDQU64 Z0, Z4 + VMOVDQU64 Z0, Z5 + VMOVDQU64 Z0, Z6 + VMOVDQU64 Z0, Z7 + VMOVDQU64 Z0, Z8 + VMOVDQU64 Z0, Z9 + VMOVDQU64 Z0, Z10 + VMOVDQU64 Z0, Z11 + VMOVDQU64 Z0, Z0 + +int32MaxBlockLoop: + CMPQ DX, $0x000000c0 + 
JL int32MaxTailLoop + VPMAXSD (AX), Z1, Z1 + VPMAXSD 64(AX), Z2, Z2 + VPMAXSD 128(AX), Z3, Z3 + VPMAXSD 192(AX), Z4, Z4 + VPMAXSD 256(AX), Z5, Z5 + VPMAXSD 320(AX), Z6, Z6 + VPMAXSD 384(AX), Z7, Z7 + VPMAXSD 448(AX), Z8, Z8 + VPMAXSD 512(AX), Z9, Z9 + VPMAXSD 576(AX), Z10, Z10 + VPMAXSD 640(AX), Z11, Z11 + VPMAXSD 704(AX), Z0, Z0 + ADDQ $0x00000300, AX + SUBQ $0x000000c0, DX + JMP int32MaxBlockLoop + +int32MaxTailLoop: + CMPQ DX, $0x00000004 + JL int32MaxDone + VPMAXSD (AX), Z1, Z1 + ADDQ $0x00000040, AX + SUBQ $0x00000010, DX + JMP int32MaxTailLoop + +int32MaxDone: + VPMAXSD Z1, Z2, Z1 + VPMAXSD Z1, Z3, Z1 + VPMAXSD Z1, Z4, Z1 + VPMAXSD Z1, Z5, Z1 + VPMAXSD Z1, Z6, Z1 + VPMAXSD Z1, Z7, Z1 + VPMAXSD Z1, Z8, Z1 + VPMAXSD Z1, Z9, Z1 + VPMAXSD Z1, Z10, Z1 + VPMAXSD Z1, Z11, Z1 + VPMAXSD Z1, Z0, Z1 + VEXTRACTI64X4 $0x01, Z1, Y0 + VPMAXSD Y0, Y1, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMAXSD X0, X1 + MOVOU X1, (CX) + RET + +// func int64MaxAvx512Asm(x []int64, r []int64) +// Requires: AVX, AVX512F, AVX512VL, SSE2 +TEXT ·int64MaxAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x8000000000000000, BX + MOVQ BX, X0 + VPBROADCASTQ X0, Z0 + VMOVDQU64 Z0, Z1 + VMOVDQU64 Z0, Z2 + VMOVDQU64 Z0, Z3 + VMOVDQU64 Z0, Z4 + VMOVDQU64 Z0, Z5 + VMOVDQU64 Z0, Z6 + VMOVDQU64 Z0, Z7 + VMOVDQU64 Z0, Z8 + VMOVDQU64 Z0, Z9 + VMOVDQU64 Z0, Z10 + VMOVDQU64 Z0, Z11 + VMOVDQU64 Z0, Z0 + +int64MaxBlockLoop: + CMPQ DX, $0x00000060 + JL int64MaxTailLoop + VPMAXSQ (AX), Z1, Z1 + VPMAXSQ 64(AX), Z2, Z2 + VPMAXSQ 128(AX), Z3, Z3 + VPMAXSQ 192(AX), Z4, Z4 + VPMAXSQ 256(AX), Z5, Z5 + VPMAXSQ 320(AX), Z6, Z6 + VPMAXSQ 384(AX), Z7, Z7 + VPMAXSQ 448(AX), Z8, Z8 + VPMAXSQ 512(AX), Z9, Z9 + VPMAXSQ 576(AX), Z10, Z10 + VPMAXSQ 640(AX), Z11, Z11 + VPMAXSQ 704(AX), Z0, Z0 + ADDQ $0x00000300, AX + SUBQ $0x00000060, DX + JMP int64MaxBlockLoop + +int64MaxTailLoop: + CMPQ DX, $0x00000004 + JL int64MaxDone + VPMAXSQ (AX), Z1, Z1 + ADDQ $0x00000040, AX + SUBQ $0x00000008, DX + JMP int64MaxTailLoop + +int64MaxDone: + VPMAXSQ Z1, Z2, Z1 + VPMAXSQ Z1, Z3, Z1 + VPMAXSQ Z1, Z4, Z1 + VPMAXSQ Z1, Z5, Z1 + VPMAXSQ Z1, Z6, Z1 + VPMAXSQ Z1, Z7, Z1 + VPMAXSQ Z1, Z8, Z1 + VPMAXSQ Z1, Z9, Z1 + VPMAXSQ Z1, Z10, Z1 + VPMAXSQ Z1, Z11, Z1 + VPMAXSQ Z1, Z0, Z1 + VEXTRACTI64X4 $0x01, Z1, Y0 + VPMAXSQ Y0, Y1, Y1 + VEXTRACTF128 $0x01, Y1, X0 + VPMAXSQ X0, X1, X1 + MOVOU X1, (CX) + RET + +// func uint8MaxAvx512Asm(x []uint8, r []uint8) +// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2 +TEXT ·uint8MaxAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x0000000000000000, BX + MOVQ BX, X0 + VPBROADCASTB X0, Z0 + VMOVDQU64 Z0, Z1 + VMOVDQU64 Z0, Z2 + VMOVDQU64 Z0, Z3 + VMOVDQU64 Z0, Z4 + VMOVDQU64 Z0, Z5 + VMOVDQU64 Z0, Z6 + VMOVDQU64 Z0, Z7 + VMOVDQU64 Z0, Z8 + VMOVDQU64 Z0, Z9 + VMOVDQU64 Z0, Z10 + VMOVDQU64 Z0, Z11 + VMOVDQU64 Z0, Z0 + +uint8MaxBlockLoop: + CMPQ DX, $0x00000300 + JL uint8MaxTailLoop + VPMAXUB (AX), Z1, Z1 + VPMAXUB 64(AX), Z2, Z2 + VPMAXUB 128(AX), Z3, Z3 + VPMAXUB 192(AX), Z4, Z4 + VPMAXUB 256(AX), Z5, Z5 + VPMAXUB 320(AX), Z6, Z6 + VPMAXUB 384(AX), Z7, Z7 + VPMAXUB 448(AX), Z8, Z8 + VPMAXUB 512(AX), Z9, Z9 + VPMAXUB 576(AX), Z10, Z10 + VPMAXUB 640(AX), Z11, Z11 + VPMAXUB 704(AX), Z0, Z0 + ADDQ $0x00000300, AX + SUBQ $0x00000300, DX + JMP uint8MaxBlockLoop + +uint8MaxTailLoop: + CMPQ DX, $0x00000004 + JL uint8MaxDone + VPMAXUB (AX), Z1, Z1 + ADDQ $0x00000040, AX + SUBQ $0x00000040, DX + JMP uint8MaxTailLoop + +uint8MaxDone: + VPMAXUB 
Z1, Z2, Z1 + VPMAXUB Z1, Z3, Z1 + VPMAXUB Z1, Z4, Z1 + VPMAXUB Z1, Z5, Z1 + VPMAXUB Z1, Z6, Z1 + VPMAXUB Z1, Z7, Z1 + VPMAXUB Z1, Z8, Z1 + VPMAXUB Z1, Z9, Z1 + VPMAXUB Z1, Z10, Z1 + VPMAXUB Z1, Z11, Z1 + VPMAXUB Z1, Z0, Z1 + VEXTRACTI64X4 $0x01, Z1, Y0 + VPMAXUB Y0, Y1, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMAXUB X0, X1 + MOVOU X1, (CX) + RET + +// func uint16MaxAvx512Asm(x []uint16, r []uint16) +// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2, SSE4.1 +TEXT ·uint16MaxAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x0000000000000000, BX + MOVQ BX, X0 + VPBROADCASTW X0, Z0 + VMOVDQU64 Z0, Z1 + VMOVDQU64 Z0, Z2 + VMOVDQU64 Z0, Z3 + VMOVDQU64 Z0, Z4 + VMOVDQU64 Z0, Z5 + VMOVDQU64 Z0, Z6 + VMOVDQU64 Z0, Z7 + VMOVDQU64 Z0, Z8 + VMOVDQU64 Z0, Z9 + VMOVDQU64 Z0, Z10 + VMOVDQU64 Z0, Z11 + VMOVDQU64 Z0, Z0 + +uint16MaxBlockLoop: + CMPQ DX, $0x00000180 + JL uint16MaxTailLoop + VPMAXUW (AX), Z1, Z1 + VPMAXUW 64(AX), Z2, Z2 + VPMAXUW 128(AX), Z3, Z3 + VPMAXUW 192(AX), Z4, Z4 + VPMAXUW 256(AX), Z5, Z5 + VPMAXUW 320(AX), Z6, Z6 + VPMAXUW 384(AX), Z7, Z7 + VPMAXUW 448(AX), Z8, Z8 + VPMAXUW 512(AX), Z9, Z9 + VPMAXUW 576(AX), Z10, Z10 + VPMAXUW 640(AX), Z11, Z11 + VPMAXUW 704(AX), Z0, Z0 + ADDQ $0x00000300, AX + SUBQ $0x00000180, DX + JMP uint16MaxBlockLoop + +uint16MaxTailLoop: + CMPQ DX, $0x00000004 + JL uint16MaxDone + VPMAXUW (AX), Z1, Z1 + ADDQ $0x00000040, AX + SUBQ $0x00000020, DX + JMP uint16MaxTailLoop + +uint16MaxDone: + VPMAXUW Z1, Z2, Z1 + VPMAXUW Z1, Z3, Z1 + VPMAXUW Z1, Z4, Z1 + VPMAXUW Z1, Z5, Z1 + VPMAXUW Z1, Z6, Z1 + VPMAXUW Z1, Z7, Z1 + VPMAXUW Z1, Z8, Z1 + VPMAXUW Z1, Z9, Z1 + VPMAXUW Z1, Z10, Z1 + VPMAXUW Z1, Z11, Z1 + VPMAXUW Z1, Z0, Z1 + VEXTRACTI64X4 $0x01, Z1, Y0 + VPMAXUW Y0, Y1, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMAXUW X0, X1 + MOVOU X1, (CX) + RET + +// func uint32MaxAvx512Asm(x []uint32, r []uint32) +// Requires: AVX, AVX2, AVX512F, SSE2, SSE4.1 +TEXT ·uint32MaxAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x0000000000000000, BX + MOVQ BX, X0 + VPBROADCASTD X0, Z0 + VMOVDQU64 Z0, Z1 + VMOVDQU64 Z0, Z2 + VMOVDQU64 Z0, Z3 + VMOVDQU64 Z0, Z4 + VMOVDQU64 Z0, Z5 + VMOVDQU64 Z0, Z6 + VMOVDQU64 Z0, Z7 + VMOVDQU64 Z0, Z8 + VMOVDQU64 Z0, Z9 + VMOVDQU64 Z0, Z10 + VMOVDQU64 Z0, Z11 + VMOVDQU64 Z0, Z0 + +uint32MaxBlockLoop: + CMPQ DX, $0x000000c0 + JL uint32MaxTailLoop + VPMAXUD (AX), Z1, Z1 + VPMAXUD 64(AX), Z2, Z2 + VPMAXUD 128(AX), Z3, Z3 + VPMAXUD 192(AX), Z4, Z4 + VPMAXUD 256(AX), Z5, Z5 + VPMAXUD 320(AX), Z6, Z6 + VPMAXUD 384(AX), Z7, Z7 + VPMAXUD 448(AX), Z8, Z8 + VPMAXUD 512(AX), Z9, Z9 + VPMAXUD 576(AX), Z10, Z10 + VPMAXUD 640(AX), Z11, Z11 + VPMAXUD 704(AX), Z0, Z0 + ADDQ $0x00000300, AX + SUBQ $0x000000c0, DX + JMP uint32MaxBlockLoop + +uint32MaxTailLoop: + CMPQ DX, $0x00000004 + JL uint32MaxDone + VPMAXUD (AX), Z1, Z1 + ADDQ $0x00000040, AX + SUBQ $0x00000010, DX + JMP uint32MaxTailLoop + +uint32MaxDone: + VPMAXUD Z1, Z2, Z1 + VPMAXUD Z1, Z3, Z1 + VPMAXUD Z1, Z4, Z1 + VPMAXUD Z1, Z5, Z1 + VPMAXUD Z1, Z6, Z1 + VPMAXUD Z1, Z7, Z1 + VPMAXUD Z1, Z8, Z1 + VPMAXUD Z1, Z9, Z1 + VPMAXUD Z1, Z10, Z1 + VPMAXUD Z1, Z11, Z1 + VPMAXUD Z1, Z0, Z1 + VEXTRACTI64X4 $0x01, Z1, Y0 + VPMAXUD Y0, Y1, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMAXUD X0, X1 + MOVOU X1, (CX) + RET + +// func uint64MaxAvx512Asm(x []uint64, r []uint64) +// Requires: AVX, AVX512F, AVX512VL, SSE2 +TEXT ·uint64MaxAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), 
DX + MOVQ $0x0000000000000000, BX + MOVQ BX, X0 + VPBROADCASTQ X0, Z0 + VMOVDQU64 Z0, Z1 + VMOVDQU64 Z0, Z2 + VMOVDQU64 Z0, Z3 + VMOVDQU64 Z0, Z4 + VMOVDQU64 Z0, Z5 + VMOVDQU64 Z0, Z6 + VMOVDQU64 Z0, Z7 + VMOVDQU64 Z0, Z8 + VMOVDQU64 Z0, Z9 + VMOVDQU64 Z0, Z10 + VMOVDQU64 Z0, Z11 + VMOVDQU64 Z0, Z0 + +uint64MaxBlockLoop: + CMPQ DX, $0x00000060 + JL uint64MaxTailLoop + VPMAXUQ (AX), Z1, Z1 + VPMAXUQ 64(AX), Z2, Z2 + VPMAXUQ 128(AX), Z3, Z3 + VPMAXUQ 192(AX), Z4, Z4 + VPMAXUQ 256(AX), Z5, Z5 + VPMAXUQ 320(AX), Z6, Z6 + VPMAXUQ 384(AX), Z7, Z7 + VPMAXUQ 448(AX), Z8, Z8 + VPMAXUQ 512(AX), Z9, Z9 + VPMAXUQ 576(AX), Z10, Z10 + VPMAXUQ 640(AX), Z11, Z11 + VPMAXUQ 704(AX), Z0, Z0 + ADDQ $0x00000300, AX + SUBQ $0x00000060, DX + JMP uint64MaxBlockLoop + +uint64MaxTailLoop: + CMPQ DX, $0x00000004 + JL uint64MaxDone + VPMAXUQ (AX), Z1, Z1 + ADDQ $0x00000040, AX + SUBQ $0x00000008, DX + JMP uint64MaxTailLoop + +uint64MaxDone: + VPMAXUQ Z1, Z2, Z1 + VPMAXUQ Z1, Z3, Z1 + VPMAXUQ Z1, Z4, Z1 + VPMAXUQ Z1, Z5, Z1 + VPMAXUQ Z1, Z6, Z1 + VPMAXUQ Z1, Z7, Z1 + VPMAXUQ Z1, Z8, Z1 + VPMAXUQ Z1, Z9, Z1 + VPMAXUQ Z1, Z10, Z1 + VPMAXUQ Z1, Z11, Z1 + VPMAXUQ Z1, Z0, Z1 + VEXTRACTI64X4 $0x01, Z1, Y0 + VPMAXUQ Y0, Y1, Y1 + VEXTRACTF128 $0x01, Y1, X0 + VPMAXUQ X0, X1, X1 + MOVOU X1, (CX) + RET + +// func float32MaxAvx512Asm(x []float32, r []float32) +// Requires: AVX, AVX512F, SSE, SSE2 +TEXT ·float32MaxAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x00000000ff7fffff, BX + MOVQ BX, X0 + VBROADCASTSS X0, Z0 + VMOVUPS Z0, Z1 + VMOVUPS Z0, Z2 + VMOVUPS Z0, Z3 + VMOVUPS Z0, Z4 + VMOVUPS Z0, Z5 + VMOVUPS Z0, Z6 + VMOVUPS Z0, Z7 + VMOVUPS Z0, Z8 + VMOVUPS Z0, Z9 + VMOVUPS Z0, Z10 + VMOVUPS Z0, Z11 + VMOVUPS Z0, Z0 + +float32MaxBlockLoop: + CMPQ DX, $0x000000c0 + JL float32MaxTailLoop + VMAXPS (AX), Z1, Z1 + VMAXPS 64(AX), Z2, Z2 + VMAXPS 128(AX), Z3, Z3 + VMAXPS 192(AX), Z4, Z4 + VMAXPS 256(AX), Z5, Z5 + VMAXPS 320(AX), Z6, Z6 + VMAXPS 384(AX), Z7, Z7 + VMAXPS 448(AX), Z8, Z8 + VMAXPS 512(AX), Z9, Z9 + VMAXPS 576(AX), Z10, Z10 + VMAXPS 640(AX), Z11, Z11 + VMAXPS 704(AX), Z0, Z0 + ADDQ $0x00000300, AX + SUBQ $0x000000c0, DX + JMP float32MaxBlockLoop + +float32MaxTailLoop: + CMPQ DX, $0x00000004 + JL float32MaxDone + VMAXPS (AX), Z1, Z1 + ADDQ $0x00000040, AX + SUBQ $0x00000010, DX + JMP float32MaxTailLoop + +float32MaxDone: + VMAXPS Z1, Z2, Z1 + VMAXPS Z1, Z3, Z1 + VMAXPS Z1, Z4, Z1 + VMAXPS Z1, Z5, Z1 + VMAXPS Z1, Z6, Z1 + VMAXPS Z1, Z7, Z1 + VMAXPS Z1, Z8, Z1 + VMAXPS Z1, Z9, Z1 + VMAXPS Z1, Z10, Z1 + VMAXPS Z1, Z11, Z1 + VMAXPS Z1, Z0, Z1 + VEXTRACTI64X4 $0x01, Z1, Y0 + VMAXPS Y0, Y1, Y1 + VEXTRACTF128 $0x01, Y1, X0 + MAXPS X0, X1 + MOVOU X1, (CX) + RET + +// func float64MaxAvx512Asm(x []float64, r []float64) +// Requires: AVX, AVX512F, SSE2 +TEXT ·float64MaxAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0xffefffffffffffff, BX + MOVQ BX, X0 + VBROADCASTSD X0, Z0 + VMOVUPD Z0, Z1 + VMOVUPD Z0, Z2 + VMOVUPD Z0, Z3 + VMOVUPD Z0, Z4 + VMOVUPD Z0, Z5 + VMOVUPD Z0, Z6 + VMOVUPD Z0, Z7 + VMOVUPD Z0, Z8 + VMOVUPD Z0, Z9 + VMOVUPD Z0, Z10 + VMOVUPD Z0, Z11 + VMOVUPD Z0, Z0 + +float64MaxBlockLoop: + CMPQ DX, $0x00000060 + JL float64MaxTailLoop + VMAXPD (AX), Z1, Z1 + VMAXPD 64(AX), Z2, Z2 + VMAXPD 128(AX), Z3, Z3 + VMAXPD 192(AX), Z4, Z4 + VMAXPD 256(AX), Z5, Z5 + VMAXPD 320(AX), Z6, Z6 + VMAXPD 384(AX), Z7, Z7 + VMAXPD 448(AX), Z8, Z8 + VMAXPD 512(AX), Z9, Z9 + VMAXPD 576(AX), Z10, Z10 + VMAXPD 640(AX), Z11, 
Z11 + VMAXPD 704(AX), Z0, Z0 + ADDQ $0x00000300, AX + SUBQ $0x00000060, DX + JMP float64MaxBlockLoop + +float64MaxTailLoop: + CMPQ DX, $0x00000004 + JL float64MaxDone + VMAXPD (AX), Z1, Z1 + ADDQ $0x00000040, AX + SUBQ $0x00000008, DX + JMP float64MaxTailLoop + +float64MaxDone: + VMAXPD Z1, Z2, Z1 + VMAXPD Z1, Z3, Z1 + VMAXPD Z1, Z4, Z1 + VMAXPD Z1, Z5, Z1 + VMAXPD Z1, Z6, Z1 + VMAXPD Z1, Z7, Z1 + VMAXPD Z1, Z8, Z1 + VMAXPD Z1, Z9, Z1 + VMAXPD Z1, Z10, Z1 + VMAXPD Z1, Z11, Z1 + VMAXPD Z1, Z0, Z1 + VEXTRACTI64X4 $0x01, Z1, Y0 + VMAXPD Y0, Y1, Y1 + VEXTRACTF128 $0x01, Y1, X0 + MAXPD X0, X1 + MOVOU X1, (CX) + RET diff --git a/pkg/vectorize/max/avx512_stubs.go b/pkg/vectorize/max/avx512_stubs.go new file mode 100644 index 0000000000000000000000000000000000000000..315c3e27499f285e7e22e44bc6bccaa6a2fc22c3 --- /dev/null +++ b/pkg/vectorize/max/avx512_stubs.go @@ -0,0 +1,23 @@ +// Code generated by command: go run avx512.go -out avx512.s -stubs avx512_stubs.go. DO NOT EDIT. + +package max + +func int8MaxAvx512Asm(x []int8, r []int8) + +func int16MaxAvx512Asm(x []int16, r []int16) + +func int32MaxAvx512Asm(x []int32, r []int32) + +func int64MaxAvx512Asm(x []int64, r []int64) + +func uint8MaxAvx512Asm(x []uint8, r []uint8) + +func uint16MaxAvx512Asm(x []uint16, r []uint16) + +func uint32MaxAvx512Asm(x []uint32, r []uint32) + +func uint64MaxAvx512Asm(x []uint64, r []uint64) + +func float32MaxAvx512Asm(x []float32, r []float32) + +func float64MaxAvx512Asm(x []float64, r []float64) diff --git a/pkg/vectorize/max/max.go b/pkg/vectorize/max/max.go index de2a37a5d3ed7b6287a185c28e0c17fdf0bb5af2..aef042ecf949ff75a9c6defa41c7624cdf09551a 100644 --- a/pkg/vectorize/max/max.go +++ b/pkg/vectorize/max/max.go @@ -2,141 +2,775 @@ package max import ( "bytes" - "matrixbase/pkg/container/vector" + "matrixbase/pkg/container/types" + + "golang.org/x/sys/cpu" ) var ( - boolMax func([]bool) bool - i64Max func([]int64) int64 - f64Max func([]float64) float64 - bytesMax func(*vector.Bytes) []byte - boolMaxSels func([]bool, []int64) bool - i64MaxSels func([]int64, []int64) int64 - f64MaxSels func([]float64, []int64) float64 - bytesMaxSels func(*vector.Bytes, []int64) []byte + boolMax func([]bool) bool + boolMaxSels func([]bool, []int64) bool + int8Max func([]int8) int8 + int8MaxSels func([]int8, []int64) int8 + int16Max func([]int16) int16 + int16MaxSels func([]int16, []int64) int16 + int32Max func([]int32) int32 + int32MaxSels func([]int32, []int64) int32 + int64Max func([]int64) int64 + int64MaxSels func([]int64, []int64) int64 + uint8Max func([]uint8) uint8 + uint8MaxSels func([]uint8, []int64) uint8 + uint16Max func([]uint16) uint16 + uint16MaxSels func([]uint16, []int64) uint16 + uint32Max func([]uint32) uint32 + uint32MaxSels func([]uint32, []int64) uint32 + uint64Max func([]uint64) uint64 + uint64MaxSels func([]uint64, []int64) uint64 + float32Max func([]float32) float32 + float32MaxSels func([]float32, []int64) float32 + float64Max func([]float64) float64 + float64MaxSels func([]float64, []int64) float64 + strMax func(*types.Bytes) []byte + strMaxSels func(*types.Bytes, []int64) []byte ) func init() { - i64Max = i64MaxPure - f64Max = f64MaxPure + if cpu.X86.HasAVX512 { + int8Max = int8MaxAvx512 + int16Max = int16MaxAvx512 + int32Max = int32MaxAvx512 + int64Max = int64MaxAvx512 + uint8Max = uint8MaxAvx512 + uint16Max = uint16MaxAvx512 + uint32Max = uint32MaxAvx512 + uint64Max = uint64MaxAvx512 + float32Max = float32MaxAvx512 + float64Max = float64MaxAvx512 + } else if cpu.X86.HasAVX2 { + int8Max = int8MaxAvx2 + 
int16Max = int16MaxAvx2
+		int32Max = int32MaxAvx2
+		int64Max = int64MaxPure
+		uint8Max = uint8MaxAvx2
+		uint16Max = uint16MaxAvx2
+		uint32Max = uint32MaxAvx2
+		uint64Max = uint64MaxPure
+		float32Max = float32MaxAvx2
+		float64Max = float64MaxAvx2
+	} else {
+		int8Max = int8MaxPure
+		int16Max = int16MaxPure
+		int32Max = int32MaxPure
+		int64Max = int64MaxPure
+		uint8Max = uint8MaxPure
+		uint16Max = uint16MaxPure
+		uint32Max = uint32MaxPure
+		uint64Max = uint64MaxPure
+		float32Max = float32MaxPure
+		float64Max = float64MaxPure
+	}
+
 	boolMax = boolMaxPure
-	bytesMax = bytesMaxPure
-	i64MaxSels = i64MaxSelsPure
-	f64MaxSels = f64MaxSelsPure
+	strMax = strMaxPure
+
 	boolMaxSels = boolMaxSelsPure
-	bytesMaxSels = bytesMaxSelsPure
+	int8MaxSels = int8MaxSelsPure
+	int16MaxSels = int16MaxSelsPure
+	int32MaxSels = int32MaxSelsPure
+	int64MaxSels = int64MaxSelsPure
+	uint8MaxSels = uint8MaxSelsPure
+	uint16MaxSels = uint16MaxSelsPure
+	uint32MaxSels = uint32MaxSelsPure
+	uint64MaxSels = uint64MaxSelsPure
+	float32MaxSels = float32MaxSelsPure
+	float64MaxSels = float64MaxSelsPure
+	strMaxSels = strMaxSelsPure
 }
 
 func BoolMax(xs []bool) bool {
-	return boolMaxPure(xs)
+	return boolMax(xs)
 }
 
-func I64Max(xs []int64) int64 {
-	return i64Max(xs)
+func boolMaxPure(xs []bool) bool {
+	for _, x := range xs {
+		if x {
+			return true
+		}
+	}
+	return false
 }
 
-func F64Max(xs []float64) float64 {
-	return f64Max(xs)
+func BoolMaxSels(xs []bool, sels []int64) bool {
+	return boolMaxSels(xs, sels)
 }
 
-func BytesMax(xs *vector.Bytes) []byte {
-	return bytesMax(xs)
+func boolMaxSelsPure(xs []bool, sels []int64) bool {
+	for _, sel := range sels {
+		if xs[sel] {
+			return true
+		}
+	}
+	return false
 }
 
-func BoolMaxSels(xs []bool, sels []int64) bool {
-	return boolMaxSelsPure(xs, sels)
+func Int8Max(xs []int8) int8 {
+	return int8Max(xs)
 }
 
-func I64MaxSels(xs []int64, sels []int64) int64 {
-	return i64MaxSels(xs, sels)
+func int8MaxPure(xs []int8) int8 {
+	res := xs[0]
+	for _, x := range xs {
+		if x > res {
+			res = x
+		}
+	}
+	return res
 }
 
-func F64MaxSels(xs []float64, sels []int64) float64 {
-	return f64MaxSels(xs, sels)
+func int8MaxAvx2(xs []int8) int8 {
+	const regItems int = 32 / 1
+	n := len(xs) / regItems
+	var rs [16]int8
+	int8MaxAvx2Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 16; i++ {
+		if rs[i] > res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] > res {
+			res = xs[i]
+		}
+	}
+	return res
 }
 
-func BytesMaxSels(xs *vector.Bytes, sels []int64) []byte {
-	return bytesMaxSels(xs, sels)
+func int8MaxAvx512(xs []int8) int8 {
+	const regItems int = 64 / 1
+	n := len(xs) / regItems
+	var rs [16]int8
+	int8MaxAvx512Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 16; i++ {
+		if rs[i] > res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] > res {
+			res = xs[i]
+		}
+	}
+	return res
 }
 
-func boolMaxPure(xs []bool) bool {
+func Int8MaxSels(xs []int8, sels []int64) int8 {
+	return int8MaxSels(xs, sels)
+}
+
+func int8MaxSelsPure(xs []int8, sels []int64) int8 {
+	res := xs[sels[0]]
+	for _, sel := range sels {
+		x := xs[sel]
+		if x > res {
+			res = x
+		}
+	}
+	return res
+}
+
+func Int16Max(xs []int16) int16 {
+	return int16Max(xs)
+}
+
+func int16MaxPure(xs []int16) int16 {
+	res := xs[0]
 	for _, x := range xs {
-		if x {
-			return true
+		if x > res {
+			res = x
 		}
 	}
-	return false
+	return res
+}
+
+func int16MaxAvx2(xs []int16) int16 {
+	const regItems int = 32 / 2
+	n := len(xs) / regItems
+	var rs [8]int16
+	
int16MaxAvx2Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 8; i++ { + if rs[i] > res { + res = rs[i] + } + } + for i, j := n*regItems, len(xs); i < j; i++ { + if xs[i] > res { + res = xs[i] + } + } + return res +} + +func int16MaxAvx512(xs []int16) int16 { + const regItems int = 64 / 2 + n := len(xs) / regItems + var rs [8]int16 + int16MaxAvx512Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 8; i++ { + if rs[i] > res { + res = rs[i] + } + } + for i, j := n*regItems, len(xs); i < j; i++ { + if xs[i] > res { + res = xs[i] + } + } + return res +} + +func Int16MaxSels(xs []int16, sels []int64) int16 { + return int16MaxSels(xs, sels) +} + +func int16MaxSelsPure(xs []int16, sels []int64) int16 { + res := xs[sels[0]] + for _, sel := range sels { + x := xs[sel] + if x > res { + res = x + } + } + return res +} + +func Int32Max(xs []int32) int32 { + return int32Max(xs) } -func i64MaxPure(xs []int64) int64 { - max := xs[0] +func int32MaxPure(xs []int32) int32 { + res := xs[0] for _, x := range xs { - if x > max { - max = x + if x > res { + res = x + } + } + return res +} + +func int32MaxAvx2(xs []int32) int32 { + const regItems int = 32 / 4 + n := len(xs) / regItems + var rs [4]int32 + int32MaxAvx2Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 4; i++ { + if rs[i] > res { + res = rs[i] + } + } + for i, j := n*regItems, len(xs); i < j; i++ { + if xs[i] > res { + res = xs[i] + } + } + return res +} + +func int32MaxAvx512(xs []int32) int32 { + const regItems int = 64 / 4 + n := len(xs) / regItems + var rs [4]int32 + int32MaxAvx512Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 4; i++ { + if rs[i] > res { + res = rs[i] + } + } + for i, j := n*regItems, len(xs); i < j; i++ { + if xs[i] > res { + res = xs[i] } } - return max + return res } -func f64MaxPure(xs []float64) float64 { - max := xs[0] +func Int32MaxSels(xs []int32, sels []int64) int32 { + return int32MaxSels(xs, sels) +} + +func int32MaxSelsPure(xs []int32, sels []int64) int32 { + res := xs[sels[0]] + for _, sel := range sels { + x := xs[sel] + if x > res { + res = x + } + } + return res +} + +func Int64Max(xs []int64) int64 { + return int64Max(xs) +} + +func int64MaxPure(xs []int64) int64 { + res := xs[0] for _, x := range xs { - if x > max { - max = x + if x > res { + res = x } } - return max + return res } -func bytesMaxPure(xs *vector.Bytes) []byte { - var tm []byte - var max []byte +func int64MaxAvx512(xs []int64) int64 { + const regItems int = 64 / 8 + n := len(xs) / regItems + var rs [2]int64 + int64MaxAvx512Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 2; i++ { + if rs[i] > res { + res = rs[i] + } + } + for i, j := n*regItems, len(xs); i < j; i++ { + if xs[i] > res { + res = xs[i] + } + } + return res +} - for i, o := range xs.Os { - if tm = xs.Data[o : o+xs.Ns[i]]; bytes.Compare(tm, max) > 0 { - max = tm +func Int64MaxSels(xs []int64, sels []int64) int64 { + return int64MaxSels(xs, sels) +} + +func int64MaxSelsPure(xs []int64, sels []int64) int64 { + res := xs[sels[0]] + for _, sel := range sels { + x := xs[sel] + if x > res { + res = x } } - return max + return res } -func boolMaxSelsPure(xs []bool, sels []int64) bool { +func Uint8Max(xs []uint8) uint8 { + return uint8Max(xs) +} + +func uint8MaxPure(xs []uint8) uint8 { + res := xs[0] + for _, x := range xs { + if x > res { + res = x + } + } + return res +} + +func uint8MaxAvx2(xs []uint8) uint8 { + const regItems int = 32 / 1 + n := len(xs) / regItems + var rs [16]uint8 + uint8MaxAvx2Asm(xs[:n*regItems], rs[:]) + res := 
rs[0] + for i := 1; i < 16; i++ { + if rs[i] > res { + res = rs[i] + } + } + for i, j := n*regItems, len(xs); i < j; i++ { + if xs[i] > res { + res = xs[i] + } + } + return res +} + +func uint8MaxAvx512(xs []uint8) uint8 { + const regItems int = 64 / 1 + n := len(xs) / regItems + var rs [16]uint8 + uint8MaxAvx512Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 16; i++ { + if rs[i] > res { + res = rs[i] + } + } + for i, j := n*regItems, len(xs); i < j; i++ { + if xs[i] > res { + res = xs[i] + } + } + return res +} + +func Uint8MaxSels(xs []uint8, sels []int64) uint8 { + return uint8MaxSels(xs, sels) +} + +func uint8MaxSelsPure(xs []uint8, sels []int64) uint8 { + res := xs[sels[0]] for _, sel := range sels { - if xs[sel] { - return true + x := xs[sel] + if x > res { + res = x } } - return false + return res +} + +func Uint16Max(xs []uint16) uint16 { + return uint16Max(xs) +} + +func uint16MaxPure(xs []uint16) uint16 { + res := xs[0] + for _, x := range xs { + if x > res { + res = x + } + } + return res +} + +func uint16MaxAvx2(xs []uint16) uint16 { + const regItems int = 32 / 2 + n := len(xs) / regItems + var rs [8]uint16 + uint16MaxAvx2Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 8; i++ { + if rs[i] > res { + res = rs[i] + } + } + for i, j := n*regItems, len(xs); i < j; i++ { + if xs[i] > res { + res = xs[i] + } + } + return res +} + +func uint16MaxAvx512(xs []uint16) uint16 { + const regItems int = 64 / 2 + n := len(xs) / regItems + var rs [8]uint16 + uint16MaxAvx512Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 8; i++ { + if rs[i] > res { + res = rs[i] + } + } + for i, j := n*regItems, len(xs); i < j; i++ { + if xs[i] > res { + res = xs[i] + } + } + return res +} + +func Uint16MaxSels(xs []uint16, sels []int64) uint16 { + return uint16MaxSels(xs, sels) +} + +func uint16MaxSelsPure(xs []uint16, sels []int64) uint16 { + res := xs[sels[0]] + for _, sel := range sels { + x := xs[sel] + if x > res { + res = x + } + } + return res +} + +func Uint32Max(xs []uint32) uint32 { + return uint32Max(xs) +} + +func uint32MaxPure(xs []uint32) uint32 { + res := xs[0] + for _, x := range xs { + if x > res { + res = x + } + } + return res +} + +func uint32MaxAvx2(xs []uint32) uint32 { + const regItems int = 32 / 4 + n := len(xs) / regItems + var rs [4]uint32 + uint32MaxAvx2Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 4; i++ { + if rs[i] > res { + res = rs[i] + } + } + for i, j := n*regItems, len(xs); i < j; i++ { + if xs[i] > res { + res = xs[i] + } + } + return res +} + +func uint32MaxAvx512(xs []uint32) uint32 { + const regItems int = 64 / 4 + n := len(xs) / regItems + var rs [4]uint32 + uint32MaxAvx512Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 4; i++ { + if rs[i] > res { + res = rs[i] + } + } + for i, j := n*regItems, len(xs); i < j; i++ { + if xs[i] > res { + res = xs[i] + } + } + return res +} + +func Uint32MaxSels(xs []uint32, sels []int64) uint32 { + return uint32MaxSels(xs, sels) +} + +func uint32MaxSelsPure(xs []uint32, sels []int64) uint32 { + res := xs[sels[0]] + for _, sel := range sels { + x := xs[sel] + if x > res { + res = x + } + } + return res +} + +func Uint64Max(xs []uint64) uint64 { + return uint64Max(xs) +} + +func uint64MaxPure(xs []uint64) uint64 { + res := xs[0] + for _, x := range xs { + if x > res { + res = x + } + } + return res +} + +func uint64MaxAvx512(xs []uint64) uint64 { + const regItems int = 64 / 8 + n := len(xs) / regItems + var rs [2]uint64 + uint64MaxAvx512Asm(xs[:n*regItems], rs[:]) + res := 
rs[0] + for i := 1; i < 2; i++ { + if rs[i] > res { + res = rs[i] + } + } + for i, j := n*regItems, len(xs); i < j; i++ { + if xs[i] > res { + res = xs[i] + } + } + return res +} + +func Uint64MaxSels(xs []uint64, sels []int64) uint64 { + return uint64MaxSels(xs, sels) } -func i64MaxSelsPure(xs []int64, sels []int64) int64 { - max := xs[sels[0]] +func uint64MaxSelsPure(xs []uint64, sels []int64) uint64 { + res := xs[sels[0]] for _, sel := range sels { - if x := xs[sel]; x > max { - max = x + x := xs[sel] + if x > res { + res = x + } + } + return res +} + +func Float32Max(xs []float32) float32 { + return float32Max(xs) +} + +func float32MaxPure(xs []float32) float32 { + res := xs[0] + for _, x := range xs { + if x > res { + res = x + } + } + return res +} + +func float32MaxAvx2(xs []float32) float32 { + const regItems int = 32 / 4 + n := len(xs) / regItems + var rs [4]float32 + float32MaxAvx2Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 4; i++ { + if rs[i] > res { + res = rs[i] + } + } + for i, j := n*regItems, len(xs); i < j; i++ { + if xs[i] > res { + res = xs[i] } } - return max + return res } -func f64MaxSelsPure(xs []float64, sels []int64) float64 { - max := xs[sels[0]] +func float32MaxAvx512(xs []float32) float32 { + const regItems int = 64 / 4 + n := len(xs) / regItems + var rs [4]float32 + float32MaxAvx512Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 4; i++ { + if rs[i] > res { + res = rs[i] + } + } + for i, j := n*regItems, len(xs); i < j; i++ { + if xs[i] > res { + res = xs[i] + } + } + return res +} + +func Float32MaxSels(xs []float32, sels []int64) float32 { + return float32MaxSels(xs, sels) +} + +func float32MaxSelsPure(xs []float32, sels []int64) float32 { + res := xs[sels[0]] for _, sel := range sels { - if x := xs[sel]; x > max { - max = x + x := xs[sel] + if x > res { + res = x } } - return max + return res } -func bytesMaxSelsPure(xs *vector.Bytes, sels []int64) []byte { - var tm []byte - var max []byte +func Float64Max(xs []float64) float64 { + return float64Max(xs) +} + +func float64MaxPure(xs []float64) float64 { + res := xs[0] + for _, x := range xs { + if x > res { + res = x + } + } + return res +} + +func float64MaxAvx2(xs []float64) float64 { + const regItems int = 32 / 8 + n := len(xs) / regItems + var rs [2]float64 + float64MaxAvx2Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 2; i++ { + if rs[i] > res { + res = rs[i] + } + } + for i, j := n*regItems, len(xs); i < j; i++ { + if xs[i] > res { + res = xs[i] + } + } + return res +} + +func float64MaxAvx512(xs []float64) float64 { + const regItems int = 64 / 8 + n := len(xs) / regItems + var rs [2]float64 + float64MaxAvx512Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 2; i++ { + if rs[i] > res { + res = rs[i] + } + } + for i, j := n*regItems, len(xs); i < j; i++ { + if xs[i] > res { + res = xs[i] + } + } + return res +} + +func Float64MaxSels(xs []float64, sels []int64) float64 { + return float64MaxSels(xs, sels) +} + +func float64MaxSelsPure(xs []float64, sels []int64) float64 { + res := xs[sels[0]] + for _, sel := range sels { + x := xs[sel] + if x > res { + res = x + } + } + return res +} + +func StrMax(xs *types.Bytes) []byte { + return strMax(xs) +} + +func strMaxPure(xs *types.Bytes) []byte { + res := xs.Get(0) + for i, n := 0, len(xs.Offsets); i < n; i++ { + x := xs.Get(i) + if bytes.Compare(x, res) > 0 { + res = x + } + } + return res +} + +func StrMaxSels(xs *types.Bytes, sels []int64) []byte { + return strMaxSels(xs, sels) +} +func strMaxSelsPure(xs 
*types.Bytes, sels []int64) []byte { + res := xs.Get(int(sels[0])) for _, sel := range sels { - if tm = xs.Data[xs.Os[sel] : xs.Os[sel]+xs.Ns[sel]]; bytes.Compare(tm, max) > 0 { - max = tm + x := xs.Get(int(sel)) + if bytes.Compare(x, res) > 0 { + res = x } } - return max + return res } diff --git a/pkg/vectorize/min/avx2.s b/pkg/vectorize/min/avx2.s new file mode 100644 index 0000000000000000000000000000000000000000..1b2bdd87369fe32d9da7a586fef653b84e38fcf2 --- /dev/null +++ b/pkg/vectorize/min/avx2.s @@ -0,0 +1,387 @@ +// Code generated by command: go run avx2.go -out min/avx2.s -stubs min/avx2_stubs.go. DO NOT EDIT. + +#include "textflag.h" + +// func int8MinAvx2Asm(x []int8, r []int8) +// Requires: AVX, AVX2, SSE2, SSE4.1 +TEXT ·int8MinAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x000000000000007f, BX + MOVQ BX, X0 + VPBROADCASTB X0, Y0 + VMOVDQU Y0, Y1 + VMOVDQU Y0, Y2 + VMOVDQU Y0, Y3 + VMOVDQU Y0, Y4 + VMOVDQU Y0, Y5 + VMOVDQU Y0, Y0 + +int8MinBlockLoop: + CMPQ DX, $0x000000c0 + JL int8MinTailLoop + VPMINSB (AX), Y1, Y1 + VPMINSB 32(AX), Y2, Y2 + VPMINSB 64(AX), Y3, Y3 + VPMINSB 96(AX), Y4, Y4 + VPMINSB 128(AX), Y5, Y5 + VPMINSB 160(AX), Y0, Y0 + ADDQ $0x000000c0, AX + SUBQ $0x000000c0, DX + JMP int8MinBlockLoop + +int8MinTailLoop: + CMPQ DX, $0x00000004 + JL int8MinDone + VPMINSB (AX), Y1, Y1 + ADDQ $0x00000020, AX + SUBQ $0x00000020, DX + JMP int8MinTailLoop + +int8MinDone: + VPMINSB Y1, Y2, Y1 + VPMINSB Y1, Y3, Y1 + VPMINSB Y1, Y4, Y1 + VPMINSB Y1, Y5, Y1 + VPMINSB Y1, Y0, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMINSB X0, X1 + MOVOU X1, (CX) + RET + +// func int16MinAvx2Asm(x []int16, r []int16) +// Requires: AVX, AVX2, SSE2 +TEXT ·int16MinAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x0000000000007fff, BX + MOVQ BX, X0 + VPBROADCASTW X0, Y0 + VMOVDQU Y0, Y1 + VMOVDQU Y0, Y2 + VMOVDQU Y0, Y3 + VMOVDQU Y0, Y4 + VMOVDQU Y0, Y5 + VMOVDQU Y0, Y0 + +int16MinBlockLoop: + CMPQ DX, $0x00000060 + JL int16MinTailLoop + VPMINSW (AX), Y1, Y1 + VPMINSW 32(AX), Y2, Y2 + VPMINSW 64(AX), Y3, Y3 + VPMINSW 96(AX), Y4, Y4 + VPMINSW 128(AX), Y5, Y5 + VPMINSW 160(AX), Y0, Y0 + ADDQ $0x000000c0, AX + SUBQ $0x00000060, DX + JMP int16MinBlockLoop + +int16MinTailLoop: + CMPQ DX, $0x00000004 + JL int16MinDone + VPMINSW (AX), Y1, Y1 + ADDQ $0x00000020, AX + SUBQ $0x00000010, DX + JMP int16MinTailLoop + +int16MinDone: + VPMINSW Y1, Y2, Y1 + VPMINSW Y1, Y3, Y1 + VPMINSW Y1, Y4, Y1 + VPMINSW Y1, Y5, Y1 + VPMINSW Y1, Y0, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMINSW X0, X1 + MOVOU X1, (CX) + RET + +// func int32MinAvx2Asm(x []int32, r []int32) +// Requires: AVX, AVX2, SSE2, SSE4.1 +TEXT ·int32MinAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x000000007fffffff, BX + MOVQ BX, X0 + VPBROADCASTD X0, Y0 + VMOVDQU Y0, Y1 + VMOVDQU Y0, Y2 + VMOVDQU Y0, Y3 + VMOVDQU Y0, Y4 + VMOVDQU Y0, Y5 + VMOVDQU Y0, Y0 + +int32MinBlockLoop: + CMPQ DX, $0x00000030 + JL int32MinTailLoop + VPMINSD (AX), Y1, Y1 + VPMINSD 32(AX), Y2, Y2 + VPMINSD 64(AX), Y3, Y3 + VPMINSD 96(AX), Y4, Y4 + VPMINSD 128(AX), Y5, Y5 + VPMINSD 160(AX), Y0, Y0 + ADDQ $0x000000c0, AX + SUBQ $0x00000030, DX + JMP int32MinBlockLoop + +int32MinTailLoop: + CMPQ DX, $0x00000004 + JL int32MinDone + VPMINSD (AX), Y1, Y1 + ADDQ $0x00000020, AX + SUBQ $0x00000008, DX + JMP int32MinTailLoop + +int32MinDone: + VPMINSD Y1, Y2, Y1 + VPMINSD Y1, Y3, Y1 + VPMINSD Y1, Y4, Y1 + VPMINSD Y1, Y5, Y1 + 
VPMINSD Y1, Y0, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMINSD X0, X1 + MOVOU X1, (CX) + RET + +// func uint8MinAvx2Asm(x []uint8, r []uint8) +// Requires: AVX, AVX2, SSE2 +TEXT ·uint8MinAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0xffffffffffffffff, BX + MOVQ BX, X0 + VPBROADCASTB X0, Y0 + VMOVDQU Y0, Y1 + VMOVDQU Y0, Y2 + VMOVDQU Y0, Y3 + VMOVDQU Y0, Y4 + VMOVDQU Y0, Y5 + VMOVDQU Y0, Y0 + +uint8MinBlockLoop: + CMPQ DX, $0x000000c0 + JL uint8MinTailLoop + VPMINUB (AX), Y1, Y1 + VPMINUB 32(AX), Y2, Y2 + VPMINUB 64(AX), Y3, Y3 + VPMINUB 96(AX), Y4, Y4 + VPMINUB 128(AX), Y5, Y5 + VPMINUB 160(AX), Y0, Y0 + ADDQ $0x000000c0, AX + SUBQ $0x000000c0, DX + JMP uint8MinBlockLoop + +uint8MinTailLoop: + CMPQ DX, $0x00000004 + JL uint8MinDone + VPMINUB (AX), Y1, Y1 + ADDQ $0x00000020, AX + SUBQ $0x00000020, DX + JMP uint8MinTailLoop + +uint8MinDone: + VPMINUB Y1, Y2, Y1 + VPMINUB Y1, Y3, Y1 + VPMINUB Y1, Y4, Y1 + VPMINUB Y1, Y5, Y1 + VPMINUB Y1, Y0, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMINUB X0, X1 + MOVOU X1, (CX) + RET + +// func uint16MinAvx2Asm(x []uint16, r []uint16) +// Requires: AVX, AVX2, SSE2, SSE4.1 +TEXT ·uint16MinAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0xffffffffffffffff, BX + MOVQ BX, X0 + VPBROADCASTW X0, Y0 + VMOVDQU Y0, Y1 + VMOVDQU Y0, Y2 + VMOVDQU Y0, Y3 + VMOVDQU Y0, Y4 + VMOVDQU Y0, Y5 + VMOVDQU Y0, Y0 + +uint16MinBlockLoop: + CMPQ DX, $0x00000060 + JL uint16MinTailLoop + VPMINUW (AX), Y1, Y1 + VPMINUW 32(AX), Y2, Y2 + VPMINUW 64(AX), Y3, Y3 + VPMINUW 96(AX), Y4, Y4 + VPMINUW 128(AX), Y5, Y5 + VPMINUW 160(AX), Y0, Y0 + ADDQ $0x000000c0, AX + SUBQ $0x00000060, DX + JMP uint16MinBlockLoop + +uint16MinTailLoop: + CMPQ DX, $0x00000004 + JL uint16MinDone + VPMINUW (AX), Y1, Y1 + ADDQ $0x00000020, AX + SUBQ $0x00000010, DX + JMP uint16MinTailLoop + +uint16MinDone: + VPMINUW Y1, Y2, Y1 + VPMINUW Y1, Y3, Y1 + VPMINUW Y1, Y4, Y1 + VPMINUW Y1, Y5, Y1 + VPMINUW Y1, Y0, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMINUW X0, X1 + MOVOU X1, (CX) + RET + +// func uint32MinAvx2Asm(x []uint32, r []uint32) +// Requires: AVX, AVX2, SSE2, SSE4.1 +TEXT ·uint32MinAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0xffffffffffffffff, BX + MOVQ BX, X0 + VPBROADCASTD X0, Y0 + VMOVDQU Y0, Y1 + VMOVDQU Y0, Y2 + VMOVDQU Y0, Y3 + VMOVDQU Y0, Y4 + VMOVDQU Y0, Y5 + VMOVDQU Y0, Y0 + +uint32MinBlockLoop: + CMPQ DX, $0x00000030 + JL uint32MinTailLoop + VPMINUD (AX), Y1, Y1 + VPMINUD 32(AX), Y2, Y2 + VPMINUD 64(AX), Y3, Y3 + VPMINUD 96(AX), Y4, Y4 + VPMINUD 128(AX), Y5, Y5 + VPMINUD 160(AX), Y0, Y0 + ADDQ $0x000000c0, AX + SUBQ $0x00000030, DX + JMP uint32MinBlockLoop + +uint32MinTailLoop: + CMPQ DX, $0x00000004 + JL uint32MinDone + VPMINUD (AX), Y1, Y1 + ADDQ $0x00000020, AX + SUBQ $0x00000008, DX + JMP uint32MinTailLoop + +uint32MinDone: + VPMINUD Y1, Y2, Y1 + VPMINUD Y1, Y3, Y1 + VPMINUD Y1, Y4, Y1 + VPMINUD Y1, Y5, Y1 + VPMINUD Y1, Y0, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMINUD X0, X1 + MOVOU X1, (CX) + RET + +// func float32MinAvx2Asm(x []float32, r []float32) +// Requires: AVX, AVX2, SSE, SSE2 +TEXT ·float32MinAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x000000007f7fffff, BX + MOVQ BX, X0 + VBROADCASTSS X0, Y0 + VMOVUPS Y0, Y1 + VMOVUPS Y0, Y2 + VMOVUPS Y0, Y3 + VMOVUPS Y0, Y4 + VMOVUPS Y0, Y5 + VMOVUPS Y0, Y0 + +float32MinBlockLoop: + CMPQ DX, $0x00000030 + JL float32MinTailLoop 
+ VMINPS (AX), Y1, Y1 + VMINPS 32(AX), Y2, Y2 + VMINPS 64(AX), Y3, Y3 + VMINPS 96(AX), Y4, Y4 + VMINPS 128(AX), Y5, Y5 + VMINPS 160(AX), Y0, Y0 + ADDQ $0x000000c0, AX + SUBQ $0x00000030, DX + JMP float32MinBlockLoop + +float32MinTailLoop: + CMPQ DX, $0x00000004 + JL float32MinDone + VMINPS (AX), Y1, Y1 + ADDQ $0x00000020, AX + SUBQ $0x00000008, DX + JMP float32MinTailLoop + +float32MinDone: + VMINPS Y1, Y2, Y1 + VMINPS Y1, Y3, Y1 + VMINPS Y1, Y4, Y1 + VMINPS Y1, Y5, Y1 + VMINPS Y1, Y0, Y1 + VEXTRACTF128 $0x01, Y1, X0 + MINPS X0, X1 + MOVOU X1, (CX) + RET + +// func float64MinAvx2Asm(x []float64, r []float64) +// Requires: AVX, AVX2, SSE2 +TEXT ·float64MinAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x7fefffffffffffff, BX + MOVQ BX, X0 + VBROADCASTSD X0, Y0 + VMOVUPD Y0, Y1 + VMOVUPD Y0, Y2 + VMOVUPD Y0, Y3 + VMOVUPD Y0, Y4 + VMOVUPD Y0, Y5 + VMOVUPD Y0, Y0 + +float64MinBlockLoop: + CMPQ DX, $0x00000018 + JL float64MinTailLoop + VMINPD (AX), Y1, Y1 + VMINPD 32(AX), Y2, Y2 + VMINPD 64(AX), Y3, Y3 + VMINPD 96(AX), Y4, Y4 + VMINPD 128(AX), Y5, Y5 + VMINPD 160(AX), Y0, Y0 + ADDQ $0x000000c0, AX + SUBQ $0x00000018, DX + JMP float64MinBlockLoop + +float64MinTailLoop: + CMPQ DX, $0x00000004 + JL float64MinDone + VMINPD (AX), Y1, Y1 + ADDQ $0x00000020, AX + SUBQ $0x00000004, DX + JMP float64MinTailLoop + +float64MinDone: + VMINPD Y1, Y2, Y1 + VMINPD Y1, Y3, Y1 + VMINPD Y1, Y4, Y1 + VMINPD Y1, Y5, Y1 + VMINPD Y1, Y0, Y1 + VEXTRACTF128 $0x01, Y1, X0 + MINPD X0, X1 + MOVOU X1, (CX) + RET diff --git a/pkg/vectorize/min/avx2_stubs.go b/pkg/vectorize/min/avx2_stubs.go new file mode 100644 index 0000000000000000000000000000000000000000..627dab0d89b52486e144da21df2c38b50c2234c3 --- /dev/null +++ b/pkg/vectorize/min/avx2_stubs.go @@ -0,0 +1,19 @@ +// Code generated by command: go run avx2.go -out min/avx2.s -stubs min/avx2_stubs.go. DO NOT EDIT. + +package min + +func int8MinAvx2Asm(x []int8, r []int8) + +func int16MinAvx2Asm(x []int16, r []int16) + +func int32MinAvx2Asm(x []int32, r []int32) + +func uint8MinAvx2Asm(x []uint8, r []uint8) + +func uint16MinAvx2Asm(x []uint16, r []uint16) + +func uint32MinAvx2Asm(x []uint32, r []uint32) + +func float32MinAvx2Asm(x []float32, r []float32) + +func float64MinAvx2Asm(x []float64, r []float64) diff --git a/pkg/vectorize/min/avx512.s b/pkg/vectorize/min/avx512.s new file mode 100644 index 0000000000000000000000000000000000000000..0175db4ed5e95a61fac4379b0a09ee4855ac77ab --- /dev/null +++ b/pkg/vectorize/min/avx512.s @@ -0,0 +1,683 @@ +// Code generated by command: go run avx512.go -out avx512.s -stubs avx512_stubs.go. DO NOT EDIT. 
+ +#include "textflag.h" + +// func int8MinAvx512Asm(x []int8, r []int8) +// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2, SSE4.1 +TEXT ·int8MinAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x000000000000007f, BX + MOVQ BX, X0 + VPBROADCASTB X0, Z0 + VMOVDQU64 Z0, Z1 + VMOVDQU64 Z0, Z2 + VMOVDQU64 Z0, Z3 + VMOVDQU64 Z0, Z4 + VMOVDQU64 Z0, Z5 + VMOVDQU64 Z0, Z6 + VMOVDQU64 Z0, Z7 + VMOVDQU64 Z0, Z8 + VMOVDQU64 Z0, Z9 + VMOVDQU64 Z0, Z10 + VMOVDQU64 Z0, Z11 + VMOVDQU64 Z0, Z0 + +int8MinBlockLoop: + CMPQ DX, $0x00000300 + JL int8MinTailLoop + VPMINSB (AX), Z1, Z1 + VPMINSB 64(AX), Z2, Z2 + VPMINSB 128(AX), Z3, Z3 + VPMINSB 192(AX), Z4, Z4 + VPMINSB 256(AX), Z5, Z5 + VPMINSB 320(AX), Z6, Z6 + VPMINSB 384(AX), Z7, Z7 + VPMINSB 448(AX), Z8, Z8 + VPMINSB 512(AX), Z9, Z9 + VPMINSB 576(AX), Z10, Z10 + VPMINSB 640(AX), Z11, Z11 + VPMINSB 704(AX), Z0, Z0 + ADDQ $0x00000300, AX + SUBQ $0x00000300, DX + JMP int8MinBlockLoop + +int8MinTailLoop: + CMPQ DX, $0x00000004 + JL int8MinDone + VPMINSB (AX), Z1, Z1 + ADDQ $0x00000040, AX + SUBQ $0x00000040, DX + JMP int8MinTailLoop + +int8MinDone: + VPMINSB Z1, Z2, Z1 + VPMINSB Z1, Z3, Z1 + VPMINSB Z1, Z4, Z1 + VPMINSB Z1, Z5, Z1 + VPMINSB Z1, Z6, Z1 + VPMINSB Z1, Z7, Z1 + VPMINSB Z1, Z8, Z1 + VPMINSB Z1, Z9, Z1 + VPMINSB Z1, Z10, Z1 + VPMINSB Z1, Z11, Z1 + VPMINSB Z1, Z0, Z1 + VEXTRACTI64X4 $0x01, Z1, Y0 + VPMINSB Y0, Y1, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMINSB X0, X1 + MOVOU X1, (CX) + RET + +// func int16MinAvx512Asm(x []int16, r []int16) +// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2 +TEXT ·int16MinAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x0000000000007fff, BX + MOVQ BX, X0 + VPBROADCASTW X0, Z0 + VMOVDQU64 Z0, Z1 + VMOVDQU64 Z0, Z2 + VMOVDQU64 Z0, Z3 + VMOVDQU64 Z0, Z4 + VMOVDQU64 Z0, Z5 + VMOVDQU64 Z0, Z6 + VMOVDQU64 Z0, Z7 + VMOVDQU64 Z0, Z8 + VMOVDQU64 Z0, Z9 + VMOVDQU64 Z0, Z10 + VMOVDQU64 Z0, Z11 + VMOVDQU64 Z0, Z0 + +int16MinBlockLoop: + CMPQ DX, $0x00000180 + JL int16MinTailLoop + VPMINSW (AX), Z1, Z1 + VPMINSW 64(AX), Z2, Z2 + VPMINSW 128(AX), Z3, Z3 + VPMINSW 192(AX), Z4, Z4 + VPMINSW 256(AX), Z5, Z5 + VPMINSW 320(AX), Z6, Z6 + VPMINSW 384(AX), Z7, Z7 + VPMINSW 448(AX), Z8, Z8 + VPMINSW 512(AX), Z9, Z9 + VPMINSW 576(AX), Z10, Z10 + VPMINSW 640(AX), Z11, Z11 + VPMINSW 704(AX), Z0, Z0 + ADDQ $0x00000300, AX + SUBQ $0x00000180, DX + JMP int16MinBlockLoop + +int16MinTailLoop: + CMPQ DX, $0x00000004 + JL int16MinDone + VPMINSW (AX), Z1, Z1 + ADDQ $0x00000040, AX + SUBQ $0x00000020, DX + JMP int16MinTailLoop + +int16MinDone: + VPMINSW Z1, Z2, Z1 + VPMINSW Z1, Z3, Z1 + VPMINSW Z1, Z4, Z1 + VPMINSW Z1, Z5, Z1 + VPMINSW Z1, Z6, Z1 + VPMINSW Z1, Z7, Z1 + VPMINSW Z1, Z8, Z1 + VPMINSW Z1, Z9, Z1 + VPMINSW Z1, Z10, Z1 + VPMINSW Z1, Z11, Z1 + VPMINSW Z1, Z0, Z1 + VEXTRACTI64X4 $0x01, Z1, Y0 + VPMINSW Y0, Y1, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMINSW X0, X1 + MOVOU X1, (CX) + RET + +// func int32MinAvx512Asm(x []int32, r []int32) +// Requires: AVX, AVX2, AVX512F, SSE2, SSE4.1 +TEXT ·int32MinAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x000000007fffffff, BX + MOVQ BX, X0 + VPBROADCASTD X0, Z0 + VMOVDQU64 Z0, Z1 + VMOVDQU64 Z0, Z2 + VMOVDQU64 Z0, Z3 + VMOVDQU64 Z0, Z4 + VMOVDQU64 Z0, Z5 + VMOVDQU64 Z0, Z6 + VMOVDQU64 Z0, Z7 + VMOVDQU64 Z0, Z8 + VMOVDQU64 Z0, Z9 + VMOVDQU64 Z0, Z10 + VMOVDQU64 Z0, Z11 + VMOVDQU64 Z0, Z0 + +int32MinBlockLoop: + CMPQ DX, $0x000000c0 + 
JL int32MinTailLoop + VPMINSD (AX), Z1, Z1 + VPMINSD 64(AX), Z2, Z2 + VPMINSD 128(AX), Z3, Z3 + VPMINSD 192(AX), Z4, Z4 + VPMINSD 256(AX), Z5, Z5 + VPMINSD 320(AX), Z6, Z6 + VPMINSD 384(AX), Z7, Z7 + VPMINSD 448(AX), Z8, Z8 + VPMINSD 512(AX), Z9, Z9 + VPMINSD 576(AX), Z10, Z10 + VPMINSD 640(AX), Z11, Z11 + VPMINSD 704(AX), Z0, Z0 + ADDQ $0x00000300, AX + SUBQ $0x000000c0, DX + JMP int32MinBlockLoop + +int32MinTailLoop: + CMPQ DX, $0x00000004 + JL int32MinDone + VPMINSD (AX), Z1, Z1 + ADDQ $0x00000040, AX + SUBQ $0x00000010, DX + JMP int32MinTailLoop + +int32MinDone: + VPMINSD Z1, Z2, Z1 + VPMINSD Z1, Z3, Z1 + VPMINSD Z1, Z4, Z1 + VPMINSD Z1, Z5, Z1 + VPMINSD Z1, Z6, Z1 + VPMINSD Z1, Z7, Z1 + VPMINSD Z1, Z8, Z1 + VPMINSD Z1, Z9, Z1 + VPMINSD Z1, Z10, Z1 + VPMINSD Z1, Z11, Z1 + VPMINSD Z1, Z0, Z1 + VEXTRACTI64X4 $0x01, Z1, Y0 + VPMINSD Y0, Y1, Y1 + VEXTRACTF128 $0x01, Y1, X0 + PMINSD X0, X1 + MOVOU X1, (CX) + RET + +// func int64MinAvx512Asm(x []int64, r []int64) +// Requires: AVX, AVX512F, AVX512VL, SSE2 +TEXT ·int64MinAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0x7fffffffffffffff, BX + MOVQ BX, X0 + VPBROADCASTQ X0, Z0 + VMOVDQU64 Z0, Z1 + VMOVDQU64 Z0, Z2 + VMOVDQU64 Z0, Z3 + VMOVDQU64 Z0, Z4 + VMOVDQU64 Z0, Z5 + VMOVDQU64 Z0, Z6 + VMOVDQU64 Z0, Z7 + VMOVDQU64 Z0, Z8 + VMOVDQU64 Z0, Z9 + VMOVDQU64 Z0, Z10 + VMOVDQU64 Z0, Z11 + VMOVDQU64 Z0, Z0 + +int64MinBlockLoop: + CMPQ DX, $0x00000060 + JL int64MinTailLoop + VPMINSQ (AX), Z1, Z1 + VPMINSQ 64(AX), Z2, Z2 + VPMINSQ 128(AX), Z3, Z3 + VPMINSQ 192(AX), Z4, Z4 + VPMINSQ 256(AX), Z5, Z5 + VPMINSQ 320(AX), Z6, Z6 + VPMINSQ 384(AX), Z7, Z7 + VPMINSQ 448(AX), Z8, Z8 + VPMINSQ 512(AX), Z9, Z9 + VPMINSQ 576(AX), Z10, Z10 + VPMINSQ 640(AX), Z11, Z11 + VPMINSQ 704(AX), Z0, Z0 + ADDQ $0x00000300, AX + SUBQ $0x00000060, DX + JMP int64MinBlockLoop + +int64MinTailLoop: + CMPQ DX, $0x00000004 + JL int64MinDone + VPMINSQ (AX), Z1, Z1 + ADDQ $0x00000040, AX + SUBQ $0x00000008, DX + JMP int64MinTailLoop + +int64MinDone: + VPMINSQ Z1, Z2, Z1 + VPMINSQ Z1, Z3, Z1 + VPMINSQ Z1, Z4, Z1 + VPMINSQ Z1, Z5, Z1 + VPMINSQ Z1, Z6, Z1 + VPMINSQ Z1, Z7, Z1 + VPMINSQ Z1, Z8, Z1 + VPMINSQ Z1, Z9, Z1 + VPMINSQ Z1, Z10, Z1 + VPMINSQ Z1, Z11, Z1 + VPMINSQ Z1, Z0, Z1 + VEXTRACTI64X4 $0x01, Z1, Y0 + VPMINSQ Y0, Y1, Y1 + VEXTRACTF128 $0x01, Y1, X0 + VPMINSQ X0, X1, X1 + MOVOU X1, (CX) + RET + +// func uint8MinAvx512Asm(x []uint8, r []uint8) +// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2 +TEXT ·uint8MinAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + MOVQ $0xffffffffffffffff, BX + MOVQ BX, X0 + VPBROADCASTB X0, Z0 + VMOVDQU64 Z0, Z1 + VMOVDQU64 Z0, Z2 + VMOVDQU64 Z0, Z3 + VMOVDQU64 Z0, Z4 + VMOVDQU64 Z0, Z5 + VMOVDQU64 Z0, Z6 + VMOVDQU64 Z0, Z7 + VMOVDQU64 Z0, Z8 + VMOVDQU64 Z0, Z9 + VMOVDQU64 Z0, Z10 + VMOVDQU64 Z0, Z11 + VMOVDQU64 Z0, Z0 + +uint8MinBlockLoop: + CMPQ DX, $0x00000300 + JL uint8MinTailLoop + VPMINUB (AX), Z1, Z1 + VPMINUB 64(AX), Z2, Z2 + VPMINUB 128(AX), Z3, Z3 + VPMINUB 192(AX), Z4, Z4 + VPMINUB 256(AX), Z5, Z5 + VPMINUB 320(AX), Z6, Z6 + VPMINUB 384(AX), Z7, Z7 + VPMINUB 448(AX), Z8, Z8 + VPMINUB 512(AX), Z9, Z9 + VPMINUB 576(AX), Z10, Z10 + VPMINUB 640(AX), Z11, Z11 + VPMINUB 704(AX), Z0, Z0 + ADDQ $0x00000300, AX + SUBQ $0x00000300, DX + JMP uint8MinBlockLoop + +uint8MinTailLoop: + CMPQ DX, $0x00000004 + JL uint8MinDone + VPMINUB (AX), Z1, Z1 + ADDQ $0x00000040, AX + SUBQ $0x00000040, DX + JMP uint8MinTailLoop + +uint8MinDone: + VPMINUB 
+	VPMINUB Z1, Z2, Z1
+	VPMINUB Z1, Z3, Z1
+	VPMINUB Z1, Z4, Z1
+	VPMINUB Z1, Z5, Z1
+	VPMINUB Z1, Z6, Z1
+	VPMINUB Z1, Z7, Z1
+	VPMINUB Z1, Z8, Z1
+	VPMINUB Z1, Z9, Z1
+	VPMINUB Z1, Z10, Z1
+	VPMINUB Z1, Z11, Z1
+	VPMINUB Z1, Z0, Z1
+	VEXTRACTI64X4 $0x01, Z1, Y0
+	VPMINUB Y0, Y1, Y1
+	VEXTRACTF128 $0x01, Y1, X0
+	PMINUB X0, X1
+	MOVOU X1, (CX)
+	RET
+
+// func uint16MinAvx512Asm(x []uint16, r []uint16)
+// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2, SSE4.1
+TEXT ·uint16MinAvx512Asm(SB), NOSPLIT, $0-48
+	MOVQ x_base+0(FP), AX
+	MOVQ r_base+24(FP), CX
+	MOVQ x_len+8(FP), DX
+	MOVQ $0xffffffffffffffff, BX
+	MOVQ BX, X0
+	VPBROADCASTW X0, Z0
+	VMOVDQU64 Z0, Z1
+	VMOVDQU64 Z0, Z2
+	VMOVDQU64 Z0, Z3
+	VMOVDQU64 Z0, Z4
+	VMOVDQU64 Z0, Z5
+	VMOVDQU64 Z0, Z6
+	VMOVDQU64 Z0, Z7
+	VMOVDQU64 Z0, Z8
+	VMOVDQU64 Z0, Z9
+	VMOVDQU64 Z0, Z10
+	VMOVDQU64 Z0, Z11
+	VMOVDQU64 Z0, Z0
+
+uint16MinBlockLoop:
+	CMPQ DX, $0x00000180
+	JL uint16MinTailLoop
+	VPMINUW (AX), Z1, Z1
+	VPMINUW 64(AX), Z2, Z2
+	VPMINUW 128(AX), Z3, Z3
+	VPMINUW 192(AX), Z4, Z4
+	VPMINUW 256(AX), Z5, Z5
+	VPMINUW 320(AX), Z6, Z6
+	VPMINUW 384(AX), Z7, Z7
+	VPMINUW 448(AX), Z8, Z8
+	VPMINUW 512(AX), Z9, Z9
+	VPMINUW 576(AX), Z10, Z10
+	VPMINUW 640(AX), Z11, Z11
+	VPMINUW 704(AX), Z0, Z0
+	ADDQ $0x00000300, AX
+	SUBQ $0x00000180, DX
+	JMP uint16MinBlockLoop
+
+uint16MinTailLoop:
+	CMPQ DX, $0x00000004
+	JL uint16MinDone
+	VPMINUW (AX), Z1, Z1
+	ADDQ $0x00000040, AX
+	SUBQ $0x00000020, DX
+	JMP uint16MinTailLoop
+
+uint16MinDone:
+	VPMINUW Z1, Z2, Z1
+	VPMINUW Z1, Z3, Z1
+	VPMINUW Z1, Z4, Z1
+	VPMINUW Z1, Z5, Z1
+	VPMINUW Z1, Z6, Z1
+	VPMINUW Z1, Z7, Z1
+	VPMINUW Z1, Z8, Z1
+	VPMINUW Z1, Z9, Z1
+	VPMINUW Z1, Z10, Z1
+	VPMINUW Z1, Z11, Z1
+	VPMINUW Z1, Z0, Z1
+	VEXTRACTI64X4 $0x01, Z1, Y0
+	VPMINUW Y0, Y1, Y1
+	VEXTRACTF128 $0x01, Y1, X0
+	PMINUW X0, X1
+	MOVOU X1, (CX)
+	RET
+
+// func uint32MinAvx512Asm(x []uint32, r []uint32)
+// Requires: AVX, AVX2, AVX512F, SSE2, SSE4.1
+TEXT ·uint32MinAvx512Asm(SB), NOSPLIT, $0-48
+	MOVQ x_base+0(FP), AX
+	MOVQ r_base+24(FP), CX
+	MOVQ x_len+8(FP), DX
+	MOVQ $0xffffffffffffffff, BX
+	MOVQ BX, X0
+	VPBROADCASTD X0, Z0
+	VMOVDQU64 Z0, Z1
+	VMOVDQU64 Z0, Z2
+	VMOVDQU64 Z0, Z3
+	VMOVDQU64 Z0, Z4
+	VMOVDQU64 Z0, Z5
+	VMOVDQU64 Z0, Z6
+	VMOVDQU64 Z0, Z7
+	VMOVDQU64 Z0, Z8
+	VMOVDQU64 Z0, Z9
+	VMOVDQU64 Z0, Z10
+	VMOVDQU64 Z0, Z11
+	VMOVDQU64 Z0, Z0
+
+uint32MinBlockLoop:
+	CMPQ DX, $0x000000c0
+	JL uint32MinTailLoop
+	VPMINUD (AX), Z1, Z1
+	VPMINUD 64(AX), Z2, Z2
+	VPMINUD 128(AX), Z3, Z3
+	VPMINUD 192(AX), Z4, Z4
+	VPMINUD 256(AX), Z5, Z5
+	VPMINUD 320(AX), Z6, Z6
+	VPMINUD 384(AX), Z7, Z7
+	VPMINUD 448(AX), Z8, Z8
+	VPMINUD 512(AX), Z9, Z9
+	VPMINUD 576(AX), Z10, Z10
+	VPMINUD 640(AX), Z11, Z11
+	VPMINUD 704(AX), Z0, Z0
+	ADDQ $0x00000300, AX
+	SUBQ $0x000000c0, DX
+	JMP uint32MinBlockLoop
+
+uint32MinTailLoop:
+	CMPQ DX, $0x00000004
+	JL uint32MinDone
+	VPMINUD (AX), Z1, Z1
+	ADDQ $0x00000040, AX
+	SUBQ $0x00000010, DX
+	JMP uint32MinTailLoop
+
+uint32MinDone:
+	VPMINUD Z1, Z2, Z1
+	VPMINUD Z1, Z3, Z1
+	VPMINUD Z1, Z4, Z1
+	VPMINUD Z1, Z5, Z1
+	VPMINUD Z1, Z6, Z1
+	VPMINUD Z1, Z7, Z1
+	VPMINUD Z1, Z8, Z1
+	VPMINUD Z1, Z9, Z1
+	VPMINUD Z1, Z10, Z1
+	VPMINUD Z1, Z11, Z1
+	VPMINUD Z1, Z0, Z1
+	VEXTRACTI64X4 $0x01, Z1, Y0
+	VPMINUD Y0, Y1, Y1
+	VEXTRACTF128 $0x01, Y1, X0
+	PMINUD X0, X1
+	MOVOU X1, (CX)
+	RET
+
+// func uint64MinAvx512Asm(x []uint64, r []uint64)
+// Requires: AVX, AVX512F, AVX512VL, SSE2
+TEXT ·uint64MinAvx512Asm(SB), NOSPLIT, $0-48
+	MOVQ x_base+0(FP), AX
+	MOVQ r_base+24(FP), CX
+	MOVQ x_len+8(FP), DX
+	MOVQ $0xffffffffffffffff, BX
+	MOVQ BX, X0
+	VPBROADCASTQ X0, Z0
+	VMOVDQU64 Z0, Z1
+	VMOVDQU64 Z0, Z2
+	VMOVDQU64 Z0, Z3
+	VMOVDQU64 Z0, Z4
+	VMOVDQU64 Z0, Z5
+	VMOVDQU64 Z0, Z6
+	VMOVDQU64 Z0, Z7
+	VMOVDQU64 Z0, Z8
+	VMOVDQU64 Z0, Z9
+	VMOVDQU64 Z0, Z10
+	VMOVDQU64 Z0, Z11
+	VMOVDQU64 Z0, Z0
+
+uint64MinBlockLoop:
+	CMPQ DX, $0x00000060
+	JL uint64MinTailLoop
+	VPMINUQ (AX), Z1, Z1
+	VPMINUQ 64(AX), Z2, Z2
+	VPMINUQ 128(AX), Z3, Z3
+	VPMINUQ 192(AX), Z4, Z4
+	VPMINUQ 256(AX), Z5, Z5
+	VPMINUQ 320(AX), Z6, Z6
+	VPMINUQ 384(AX), Z7, Z7
+	VPMINUQ 448(AX), Z8, Z8
+	VPMINUQ 512(AX), Z9, Z9
+	VPMINUQ 576(AX), Z10, Z10
+	VPMINUQ 640(AX), Z11, Z11
+	VPMINUQ 704(AX), Z0, Z0
+	ADDQ $0x00000300, AX
+	SUBQ $0x00000060, DX
+	JMP uint64MinBlockLoop
+
+uint64MinTailLoop:
+	CMPQ DX, $0x00000004
+	JL uint64MinDone
+	VPMINUQ (AX), Z1, Z1
+	ADDQ $0x00000040, AX
+	SUBQ $0x00000008, DX
+	JMP uint64MinTailLoop
+
+uint64MinDone:
+	VPMINUQ Z1, Z2, Z1
+	VPMINUQ Z1, Z3, Z1
+	VPMINUQ Z1, Z4, Z1
+	VPMINUQ Z1, Z5, Z1
+	VPMINUQ Z1, Z6, Z1
+	VPMINUQ Z1, Z7, Z1
+	VPMINUQ Z1, Z8, Z1
+	VPMINUQ Z1, Z9, Z1
+	VPMINUQ Z1, Z10, Z1
+	VPMINUQ Z1, Z11, Z1
+	VPMINUQ Z1, Z0, Z1
+	VEXTRACTI64X4 $0x01, Z1, Y0
+	VPMINUQ Y0, Y1, Y1
+	VEXTRACTF128 $0x01, Y1, X0
+	VPMINUQ X0, X1, X1
+	MOVOU X1, (CX)
+	RET
+
+// func float32MinAvx512Asm(x []float32, r []float32)
+// Requires: AVX, AVX512F, SSE, SSE2
+TEXT ·float32MinAvx512Asm(SB), NOSPLIT, $0-48
+	MOVQ x_base+0(FP), AX
+	MOVQ r_base+24(FP), CX
+	MOVQ x_len+8(FP), DX
+	MOVQ $0x000000007f7fffff, BX
+	MOVQ BX, X0
+	VBROADCASTSS X0, Z0
+	VMOVUPS Z0, Z1
+	VMOVUPS Z0, Z2
+	VMOVUPS Z0, Z3
+	VMOVUPS Z0, Z4
+	VMOVUPS Z0, Z5
+	VMOVUPS Z0, Z6
+	VMOVUPS Z0, Z7
+	VMOVUPS Z0, Z8
+	VMOVUPS Z0, Z9
+	VMOVUPS Z0, Z10
+	VMOVUPS Z0, Z11
+	VMOVUPS Z0, Z0
+
+float32MinBlockLoop:
+	CMPQ DX, $0x000000c0
+	JL float32MinTailLoop
+	VMINPS (AX), Z1, Z1
+	VMINPS 64(AX), Z2, Z2
+	VMINPS 128(AX), Z3, Z3
+	VMINPS 192(AX), Z4, Z4
+	VMINPS 256(AX), Z5, Z5
+	VMINPS 320(AX), Z6, Z6
+	VMINPS 384(AX), Z7, Z7
+	VMINPS 448(AX), Z8, Z8
+	VMINPS 512(AX), Z9, Z9
+	VMINPS 576(AX), Z10, Z10
+	VMINPS 640(AX), Z11, Z11
+	VMINPS 704(AX), Z0, Z0
+	ADDQ $0x00000300, AX
+	SUBQ $0x000000c0, DX
+	JMP float32MinBlockLoop
+
+float32MinTailLoop:
+	CMPQ DX, $0x00000004
+	JL float32MinDone
+	VMINPS (AX), Z1, Z1
+	ADDQ $0x00000040, AX
+	SUBQ $0x00000010, DX
+	JMP float32MinTailLoop
+
+float32MinDone:
+	VMINPS Z1, Z2, Z1
+	VMINPS Z1, Z3, Z1
+	VMINPS Z1, Z4, Z1
+	VMINPS Z1, Z5, Z1
+	VMINPS Z1, Z6, Z1
+	VMINPS Z1, Z7, Z1
+	VMINPS Z1, Z8, Z1
+	VMINPS Z1, Z9, Z1
+	VMINPS Z1, Z10, Z1
+	VMINPS Z1, Z11, Z1
+	VMINPS Z1, Z0, Z1
+	VEXTRACTI64X4 $0x01, Z1, Y0
+	VMINPS Y0, Y1, Y1
+	VEXTRACTF128 $0x01, Y1, X0
+	MINPS X0, X1
+	MOVOU X1, (CX)
+	RET
+
+// func float64MinAvx512Asm(x []float64, r []float64)
+// Requires: AVX, AVX512F, SSE2
+TEXT ·float64MinAvx512Asm(SB), NOSPLIT, $0-48
+	MOVQ x_base+0(FP), AX
+	MOVQ r_base+24(FP), CX
+	MOVQ x_len+8(FP), DX
+	MOVQ $0x7fefffffffffffff, BX
+	MOVQ BX, X0
+	VBROADCASTSD X0, Z0
+	VMOVUPD Z0, Z1
+	VMOVUPD Z0, Z2
+	VMOVUPD Z0, Z3
+	VMOVUPD Z0, Z4
+	VMOVUPD Z0, Z5
+	VMOVUPD Z0, Z6
+	VMOVUPD Z0, Z7
+	VMOVUPD Z0, Z8
+	VMOVUPD Z0, Z9
+	VMOVUPD Z0, Z10
+	VMOVUPD Z0, Z11
+	VMOVUPD Z0, Z0
+
+float64MinBlockLoop:
+	CMPQ DX, $0x00000060
+	JL float64MinTailLoop
+	VMINPD (AX), Z1, Z1
+	VMINPD 64(AX), Z2, Z2
+	VMINPD 128(AX), Z3, Z3
+	VMINPD 192(AX), Z4, Z4
+	VMINPD 256(AX), Z5, Z5
+	VMINPD 320(AX), Z6, Z6
+	VMINPD 384(AX), Z7, Z7
+	VMINPD 448(AX), Z8, Z8
+	VMINPD 512(AX), Z9, Z9
+	VMINPD 576(AX), Z10, Z10
+	VMINPD 640(AX), Z11, Z11
+	VMINPD 704(AX), Z0, Z0
+	ADDQ $0x00000300, AX
+	SUBQ $0x00000060, DX
+	JMP float64MinBlockLoop
+
+float64MinTailLoop:
+	CMPQ DX, $0x00000004
+	JL float64MinDone
+	VMINPD (AX), Z1, Z1
+	ADDQ $0x00000040, AX
+	SUBQ $0x00000008, DX
+	JMP float64MinTailLoop
+
+float64MinDone:
+	VMINPD Z1, Z2, Z1
+	VMINPD Z1, Z3, Z1
+	VMINPD Z1, Z4, Z1
+	VMINPD Z1, Z5, Z1
+	VMINPD Z1, Z6, Z1
+	VMINPD Z1, Z7, Z1
+	VMINPD Z1, Z8, Z1
+	VMINPD Z1, Z9, Z1
+	VMINPD Z1, Z10, Z1
+	VMINPD Z1, Z11, Z1
+	VMINPD Z1, Z0, Z1
+	VEXTRACTI64X4 $0x01, Z1, Y0
+	VMINPD Y0, Y1, Y1
+	VEXTRACTF128 $0x01, Y1, X0
+	MINPD X0, X1
+	MOVOU X1, (CX)
+	RET
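Every kernel in this generated file follows the same template: twelve Z accumulators seeded with the type's identity element for min (MaxInt for signed integers, all-ones for unsigned, the largest finite float for the VMINPS/VMINPD kernels), a block loop that folds 768 bytes (0x300) per iteration, a tail loop that folds one 64-byte vector at a time, and an epilogue that reduces the twelve accumulators into one and narrows Z to Y to X before storing a single 16-byte result into r. The loop counters track element counts rather than bytes, which is why the block threshold differs per type: 0xc0 int32 elements, 0x60 int64 elements, and 0x300 uint8 elements all name the same 768-byte block. A scalar sketch of the contract the Go wrappers rely on, with hypothetical names and assuming the caller has already trimmed x to a whole number of vectors (the "math" import supplies math.MaxInt64):

	// minAvx512Model mirrors what int64MinAvx512Asm computes: r receives
	// per-lane partial minima, and the caller finishes the reduction plus
	// any scalar tail.
	func minAvx512Model(x []int64, r []int64) {
		acc := [2]int64{math.MaxInt64, math.MaxInt64} // the 16-byte store keeps two int64 lanes
		for i := 0; i+2 <= len(x); i += 2 {
			for l := 0; l < 2; l++ {
				if x[i+l] < acc[l] {
					acc[l] = x[i+l]
				}
			}
		}
		copy(r, acc[:])
	}

The lane assignment here differs from the real kernel's, but min is associative and commutative, so any partition of the input into per-lane partial minima satisfies the same contract.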
diff --git a/pkg/vectorize/min/avx512_stubs.go b/pkg/vectorize/min/avx512_stubs.go
new file mode 100644
index 0000000000000000000000000000000000000000..e09023bd59e250be15411bbb75acf47fc6602fa4
--- /dev/null
+++ b/pkg/vectorize/min/avx512_stubs.go
@@ -0,0 +1,23 @@
+// Code generated by command: go run avx512.go -out avx512.s -stubs avx512_stubs.go. DO NOT EDIT.
+
+package min
+
+func int8MinAvx512Asm(x []int8, r []int8)
+
+func int16MinAvx512Asm(x []int16, r []int16)
+
+func int32MinAvx512Asm(x []int32, r []int32)
+
+func int64MinAvx512Asm(x []int64, r []int64)
+
+func uint8MinAvx512Asm(x []uint8, r []uint8)
+
+func uint16MinAvx512Asm(x []uint16, r []uint16)
+
+func uint32MinAvx512Asm(x []uint32, r []uint32)
+
+func uint64MinAvx512Asm(x []uint64, r []uint64)
+
+func float32MinAvx512Asm(x []float32, r []float32)
+
+func float64MinAvx512Asm(x []float64, r []float64)
diff --git a/pkg/vectorize/min/min.go b/pkg/vectorize/min/min.go
index f97616d60fdc2afec5b75a6cf34092dcada2b0a9..9e2e01c8b5ffe9086c611b6c769cdfa6718f8375 100644
--- a/pkg/vectorize/min/min.go
+++ b/pkg/vectorize/min/min.go
@@ -2,141 +2,775 @@ package min
 
 import (
 	"bytes"
-	"matrixbase/pkg/container/vector"
+	"matrixbase/pkg/container/types"
+
+	"golang.org/x/sys/cpu"
 )
 
 var (
-	boolMin      func([]bool) bool
-	i64Min       func([]int64) int64
-	f64Min       func([]float64) float64
-	bytesMin     func(*vector.Bytes) []byte
-	boolMinSels  func([]bool, []int64) bool
-	i64MinSels   func([]int64, []int64) int64
-	f64MinSels   func([]float64, []int64) float64
-	bytesMinSels func(*vector.Bytes, []int64) []byte
+	boolMin        func([]bool) bool
+	boolMinSels    func([]bool, []int64) bool
+	int8Min        func([]int8) int8
+	int8MinSels    func([]int8, []int64) int8
+	int16Min       func([]int16) int16
+	int16MinSels   func([]int16, []int64) int16
+	int32Min       func([]int32) int32
+	int32MinSels   func([]int32, []int64) int32
+	int64Min       func([]int64) int64
+	int64MinSels   func([]int64, []int64) int64
+	uint8Min       func([]uint8) uint8
+	uint8MinSels   func([]uint8, []int64) uint8
+	uint16Min      func([]uint16) uint16
+	uint16MinSels  func([]uint16, []int64) uint16
+	uint32Min      func([]uint32) uint32
+	uint32MinSels  func([]uint32, []int64) uint32
+	uint64Min      func([]uint64) uint64
+	uint64MinSels  func([]uint64, []int64) uint64
+	float32Min     func([]float32) float32
+	float32MinSels func([]float32, []int64) float32
+	float64Min     func([]float64) float64
+	float64MinSels func([]float64, []int64) float64
+	strMin         func(*types.Bytes) []byte
+	strMinSels     func(*types.Bytes, []int64) []byte
 )
 
 func init() {
-	i64Min = i64MinPure
-	f64Min = f64MinPure
+	if cpu.X86.HasAVX512 {
+		int8Min = int8MinAvx512
+		int16Min = int16MinAvx512
+		int32Min = int32MinAvx512
+		int64Min = int64MinAvx512
+		uint8Min = uint8MinAvx512
+		uint16Min = uint16MinAvx512
+		uint32Min = uint32MinAvx512
+		uint64Min = uint64MinAvx512
+		float32Min = float32MinAvx512
+		float64Min = float64MinAvx512
+	} else if cpu.X86.HasAVX2 {
+		int8Min = int8MinAvx2
+		int16Min = int16MinAvx2
+		int32Min = int32MinAvx2
+		int64Min = int64MinPure
+		uint8Min = uint8MinAvx2
+		uint16Min = uint16MinAvx2
+		uint32Min = uint32MinAvx2
+		uint64Min = uint64MinPure
+		float32Min = float32MinAvx2
+		float64Min = float64MinAvx2
+	} else {
+		int8Min = int8MinPure
+		int16Min = int16MinPure
+		int32Min = int32MinPure
+		int64Min = int64MinPure
+		uint8Min = uint8MinPure
+		uint16Min = uint16MinPure
+		uint32Min = uint32MinPure
+		uint64Min = uint64MinPure
+		float32Min = float32MinPure
+		float64Min = float64MinPure
+	}
+
 	boolMin = boolMinPure
-	bytesMin = bytesMinPure
-	i64MinSels = i64MinSelsPure
-	f64MinSels = f64MinSelsPure
+	strMin = strMinPure
+
 	boolMinSels = boolMinSelsPure
-	bytesMinSels = bytesMinSelsPure
+	int8MinSels = int8MinSelsPure
+	int16MinSels = int16MinSelsPure
+	int32MinSels = int32MinSelsPure
+	int64MinSels = int64MinSelsPure
+	uint8MinSels = uint8MinSelsPure
+	uint16MinSels = uint16MinSelsPure
+	uint32MinSels = uint32MinSelsPure
+	uint64MinSels = uint64MinSelsPure
+	float32MinSels = float32MinSelsPure
+	float64MinSels = float64MinSelsPure
+	strMinSels = strMinSelsPure
 }
 
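One detail of the dispatch above is worth calling out: int64Min and uint64Min stay on the pure-Go path in the AVX2 branch because AVX2 has no packed 64-bit integer min instruction; VPMINSQ and VPMINUQ only arrive with AVX-512F. float64 still gets an AVX2 kernel, since VMINPD has been available since the original AVX.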
 func BoolMin(xs []bool) bool {
-	return boolMinPure(xs)
+	return boolMin(xs)
 }
 
-func I64Min(xs []int64) int64 {
-	return i64Min(xs)
-}
-
-func F64Min(xs []float64) float64 {
-	return f64Min(xs)
-}
-
-func BytesMin(xs *vector.Bytes) []byte {
-	return bytesMin(xs)
-}
-
-func BoolMinSels(xs []bool, sels []int64) bool {
-	return boolMinSelsPure(xs, sels)
-}
-
-func I64MinSels(xs []int64, sels []int64) int64 {
-	return i64MinSels(xs, sels)
-}
-
-func F64MinSels(xs []float64, sels []int64) float64 {
-	return f64MinSels(xs, sels)
-}
-
+func boolMinPure(xs []bool) bool {
+	for _, x := range xs {
+		if !x {
+			return false
+		}
+	}
+	return true
+}
+
+func BoolMinSels(xs []bool, sels []int64) bool {
+	return boolMinSels(xs, sels)
+}
+
+func boolMinSelsPure(xs []bool, sels []int64) bool {
+	for _, sel := range sels {
+		if !xs[sel] {
+			return false
+		}
+	}
+	return true
+}
+
+func Int8Min(xs []int8) int8 {
+	return int8Min(xs)
+}
+
+func int8MinPure(xs []int8) int8 {
+	res := xs[0]
+	for _, x := range xs {
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func int8MinAvx2(xs []int8) int8 {
+	const regItems int = 32 / 1
+	n := len(xs) / regItems
+	var rs [16]int8
+	int8MinAvx2Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 16; i++ {
+		if rs[i] < res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] < res {
+			res = xs[i]
+		}
+	}
+	return res
+}
+
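The wrapper shape above repeats for every element type. regItems is the vector width in bytes over the element size (32/1 int8 elements per YMM register on the AVX2 path, 64/1 per ZMM on the AVX-512 path), so xs[:n*regItems] hands the assembly only whole vectors. rs is sized to the single 16-byte X register the kernel stores at the end: 16 int8, 8 int16, 4 int32, or 2 int64 lanes. The wrapper then reduces rs and sweeps the scalar tail. For example, with len(xs) = 100 on the AVX2 path, regItems = 32 and n = 3, so the kernel sees xs[:96] and the final loop compares xs[96] through xs[99] against the reduced result.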
-func BytesMinSels(xs *vector.Bytes, sels []int64) []byte {
-	return bytesMinSels(xs, sels)
-}
-
+func int8MinAvx512(xs []int8) int8 {
+	const regItems int = 64 / 1
+	n := len(xs) / regItems
+	var rs [16]int8
+	int8MinAvx512Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 16; i++ {
+		if rs[i] < res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] < res {
+			res = xs[i]
+		}
+	}
+	return res
+}
+
-func boolMinPure(xs []bool) bool {
-	for _, x := range xs {
-		if !x {
-			return false
-		}
-	}
-	return true
-}
-
+func Int8MinSels(xs []int8, sels []int64) int8 {
+	return int8MinSels(xs, sels)
+}
+
+func int8MinSelsPure(xs []int8, sels []int64) int8 {
+	res := xs[sels[0]]
+	for _, sel := range sels {
+		x := xs[sel]
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func Int16Min(xs []int16) int16 {
+	return int16Min(xs)
+}
+
+func int16MinPure(xs []int16) int16 {
+	res := xs[0]
+	for _, x := range xs {
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func int16MinAvx2(xs []int16) int16 {
+	const regItems int = 32 / 2
+	n := len(xs) / regItems
+	var rs [8]int16
+	int16MinAvx2Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 8; i++ {
+		if rs[i] < res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] < res {
+			res = xs[i]
+		}
+	}
+	return res
+}
+
+func int16MinAvx512(xs []int16) int16 {
+	const regItems int = 64 / 2
+	n := len(xs) / regItems
+	var rs [8]int16
+	int16MinAvx512Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 8; i++ {
+		if rs[i] < res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] < res {
+			res = xs[i]
+		}
+	}
+	return res
+}
+
+func Int16MinSels(xs []int16, sels []int64) int16 {
+	return int16MinSels(xs, sels)
+}
+
+func int16MinSelsPure(xs []int16, sels []int64) int16 {
+	res := xs[sels[0]]
+	for _, sel := range sels {
+		x := xs[sel]
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func Int32Min(xs []int32) int32 {
+	return int32Min(xs)
+}
+
-func i64MinPure(xs []int64) int64 {
-	min := xs[0]
-	for _, x := range xs {
-		if x < min {
-			min = x
-		}
-	}
-	return min
-}
-
+func int32MinPure(xs []int32) int32 {
+	res := xs[0]
+	for _, x := range xs {
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func int32MinAvx2(xs []int32) int32 {
+	const regItems int = 32 / 4
+	n := len(xs) / regItems
+	var rs [4]int32
+	int32MinAvx2Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 4; i++ {
+		if rs[i] < res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] < res {
+			res = xs[i]
+		}
+	}
+	return res
+}
+
+func int32MinAvx512(xs []int32) int32 {
+	const regItems int = 64 / 4
+	n := len(xs) / regItems
+	var rs [4]int32
+	int32MinAvx512Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 4; i++ {
+		if rs[i] < res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] < res {
+			res = xs[i]
+		}
+	}
+	return res
+}
+
+func Int32MinSels(xs []int32, sels []int64) int32 {
+	return int32MinSels(xs, sels)
+}
+
+func int32MinSelsPure(xs []int32, sels []int64) int32 {
+	res := xs[sels[0]]
+	for _, sel := range sels {
+		x := xs[sel]
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func Int64Min(xs []int64) int64 {
+	return int64Min(xs)
+}
+
-func f64MinPure(xs []float64) float64 {
-	min := xs[0]
-	for _, x := range xs {
-		if x < min {
-			min = x
-		}
-	}
-	return min
-}
-
+func int64MinPure(xs []int64) int64 {
+	res := xs[0]
+	for _, x := range xs {
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
-func bytesMinPure(xs *vector.Bytes) []byte {
-	var tm []byte
-
-	min := xs.Data[xs.Os[0] : xs.Os[0]+xs.Ns[0]]
-	for i, o := range xs.Os {
-		if tm = xs.Data[o : o+xs.Ns[i]]; bytes.Compare(tm, min) < 0 {
-			min = tm
-		}
-	}
-	return min
-}
-
+func int64MinAvx512(xs []int64) int64 {
+	const regItems int = 64 / 8
+	n := len(xs) / regItems
+	var rs [2]int64
+	int64MinAvx512Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 2; i++ {
+		if rs[i] < res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] < res {
+			res = xs[i]
+		}
+	}
+	return res
+}
+
+func Int64MinSels(xs []int64, sels []int64) int64 {
+	return int64MinSels(xs, sels)
+}
+
+func int64MinSelsPure(xs []int64, sels []int64) int64 {
+	res := xs[sels[0]]
+	for _, sel := range sels {
+		x := xs[sel]
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func Uint8Min(xs []uint8) uint8 {
+	return uint8Min(xs)
+}
+
+func uint8MinPure(xs []uint8) uint8 {
+	res := xs[0]
+	for _, x := range xs {
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func uint8MinAvx2(xs []uint8) uint8 {
+	const regItems int = 32 / 1
+	n := len(xs) / regItems
+	var rs [16]uint8
+	uint8MinAvx2Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 16; i++ {
+		if rs[i] < res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] < res {
+			res = xs[i]
+		}
+	}
+	return res
+}
+
+func uint8MinAvx512(xs []uint8) uint8 {
+	const regItems int = 64 / 1
+	n := len(xs) / regItems
+	var rs [16]uint8
+	uint8MinAvx512Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 16; i++ {
+		if rs[i] < res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] < res {
+			res = xs[i]
+		}
+	}
+	return res
+}
+
+func Uint8MinSels(xs []uint8, sels []int64) uint8 {
+	return uint8MinSels(xs, sels)
+}
+
-func boolMinSelsPure(xs []bool, sels []int64) bool {
-	for _, sel := range sels {
-		if !xs[sel] {
-			return false
-		}
-	}
-	return true
-}
-
+func uint8MinSelsPure(xs []uint8, sels []int64) uint8 {
+	res := xs[sels[0]]
+	for _, sel := range sels {
+		x := xs[sel]
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func Uint16Min(xs []uint16) uint16 {
+	return uint16Min(xs)
+}
+
+func uint16MinPure(xs []uint16) uint16 {
+	res := xs[0]
+	for _, x := range xs {
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func uint16MinAvx2(xs []uint16) uint16 {
+	const regItems int = 32 / 2
+	n := len(xs) / regItems
+	var rs [8]uint16
+	uint16MinAvx2Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 8; i++ {
+		if rs[i] < res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] < res {
+			res = xs[i]
+		}
+	}
+	return res
+}
+
+func uint16MinAvx512(xs []uint16) uint16 {
+	const regItems int = 64 / 2
+	n := len(xs) / regItems
+	var rs [8]uint16
+	uint16MinAvx512Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 8; i++ {
+		if rs[i] < res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] < res {
+			res = xs[i]
+		}
+	}
+	return res
+}
+
+func Uint16MinSels(xs []uint16, sels []int64) uint16 {
+	return uint16MinSels(xs, sels)
+}
+
+func uint16MinSelsPure(xs []uint16, sels []int64) uint16 {
+	res := xs[sels[0]]
+	for _, sel := range sels {
+		x := xs[sel]
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func Uint32Min(xs []uint32) uint32 {
+	return uint32Min(xs)
+}
+
+func uint32MinPure(xs []uint32) uint32 {
+	res := xs[0]
+	for _, x := range xs {
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func uint32MinAvx2(xs []uint32) uint32 {
+	const regItems int = 32 / 4
+	n := len(xs) / regItems
+	var rs [4]uint32
+	uint32MinAvx2Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 4; i++ {
+		if rs[i] < res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] < res {
+			res = xs[i]
+		}
+	}
+	return res
+}
+
+func uint32MinAvx512(xs []uint32) uint32 {
+	const regItems int = 64 / 4
+	n := len(xs) / regItems
+	var rs [4]uint32
+	uint32MinAvx512Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 4; i++ {
+		if rs[i] < res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] < res {
+			res = xs[i]
+		}
+	}
+	return res
+}
+
+func Uint32MinSels(xs []uint32, sels []int64) uint32 {
+	return uint32MinSels(xs, sels)
+}
+
+func uint32MinSelsPure(xs []uint32, sels []int64) uint32 {
+	res := xs[sels[0]]
+	for _, sel := range sels {
+		x := xs[sel]
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func Uint64Min(xs []uint64) uint64 {
+	return uint64Min(xs)
+}
+
+func uint64MinPure(xs []uint64) uint64 {
+	res := xs[0]
+	for _, x := range xs {
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func uint64MinAvx512(xs []uint64) uint64 {
+	const regItems int = 64 / 8
+	n := len(xs) / regItems
+	var rs [2]uint64
+	uint64MinAvx512Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 2; i++ {
+		if rs[i] < res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] < res {
+			res = xs[i]
+		}
+	}
+	return res
+}
+
+func Uint64MinSels(xs []uint64, sels []int64) uint64 {
+	return uint64MinSels(xs, sels)
+}
+
-func i64MinSelsPure(xs []int64, sels []int64) int64 {
-	min := xs[sels[0]]
-	for _, sel := range sels {
-		if x := xs[sel]; x < min {
-			min = x
-		}
-	}
-	return min
-}
-
+func uint64MinSelsPure(xs []uint64, sels []int64) uint64 {
+	res := xs[sels[0]]
+	for _, sel := range sels {
+		x := xs[sel]
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func Float32Min(xs []float32) float32 {
+	return float32Min(xs)
+}
+
+func float32MinPure(xs []float32) float32 {
+	res := xs[0]
+	for _, x := range xs {
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func float32MinAvx2(xs []float32) float32 {
+	const regItems int = 32 / 4
+	n := len(xs) / regItems
+	var rs [4]float32
+	float32MinAvx2Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 4; i++ {
+		if rs[i] < res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] < res {
+			res = xs[i]
+		}
+	}
+	return res
+}
+
+func float32MinAvx512(xs []float32) float32 {
+	const regItems int = 64 / 4
+	n := len(xs) / regItems
+	var rs [4]float32
+	float32MinAvx512Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 4; i++ {
+		if rs[i] < res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] < res {
+			res = xs[i]
+		}
+	}
+	return res
+}
+
+func Float32MinSels(xs []float32, sels []int64) float32 {
+	return float32MinSels(xs, sels)
+}
+
-func f64MinSelsPure(xs []float64, sels []int64) float64 {
-	min := xs[sels[0]]
-	for _, sel := range sels {
-		if x := xs[sel]; x < min {
-			min = x
-		}
-	}
-	return min
-}
-
+func float32MinSelsPure(xs []float32, sels []int64) float32 {
+	res := xs[sels[0]]
+	for _, sel := range sels {
+		x := xs[sel]
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func Float64Min(xs []float64) float64 {
+	return float64Min(xs)
+}
+
+func float64MinPure(xs []float64) float64 {
+	res := xs[0]
+	for _, x := range xs {
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func float64MinAvx2(xs []float64) float64 {
+	const regItems int = 32 / 8
+	n := len(xs) / regItems
+	var rs [2]float64
+	float64MinAvx2Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 2; i++ {
+		if rs[i] < res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] < res {
+			res = xs[i]
+		}
+	}
+	return res
+}
+
+func float64MinAvx512(xs []float64) float64 {
+	const regItems int = 64 / 8
+	n := len(xs) / regItems
+	var rs [2]float64
+	float64MinAvx512Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 2; i++ {
+		if rs[i] < res {
+			res = rs[i]
+		}
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		if xs[i] < res {
+			res = xs[i]
+		}
+	}
+	return res
+}
+
+func Float64MinSels(xs []float64, sels []int64) float64 {
+	return float64MinSels(xs, sels)
+}
+
+func float64MinSelsPure(xs []float64, sels []int64) float64 {
+	res := xs[sels[0]]
+	for _, sel := range sels {
+		x := xs[sel]
+		if x < res {
+			res = x
+		}
+	}
+	return res
+}
+
+func StrMin(xs *types.Bytes) []byte {
+	return strMin(xs)
+}
+
+func strMinPure(xs *types.Bytes) []byte {
+	res := xs.Get(0)
+	for i, n := 0, len(xs.Offsets); i < n; i++ {
+		x := xs.Get(i)
+		if bytes.Compare(x, res) < 0 {
+			res = x
+		}
+	}
+	return res
+}
+
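strMinPure leans on the new types.Bytes container exposing the i-th string as a byte slice through Get(i), where the removed code sliced the flat Data buffer by hand with the Os/Ns offset and length arrays; bytes.Compare then makes the result the lexicographically smallest string.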
-func bytesMinSelsPure(xs *vector.Bytes, sels []int64) []byte {
-	var tm []byte
-
+func StrMinSels(xs *types.Bytes, sels []int64) []byte {
+	return strMinSels(xs, sels)
+}
+
-	min := xs.Data[xs.Os[sels[0]] : xs.Os[sels[0]]+xs.Ns[sels[0]]]
+func strMinSelsPure(xs *types.Bytes, sels []int64) []byte {
+	res := xs.Get(int(sels[0]))
 	for _, sel := range sels {
-		if tm = xs.Data[xs.Os[sel] : xs.Os[sel]+xs.Ns[sel]]; bytes.Compare(tm, min) < 0 {
-			min = tm
+		x := xs.Get(int(sel))
+		if bytes.Compare(x, res) < 0 {
+			res = x
 		}
 	}
-	return min
+	return res
 }
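Finally, a short usage sketch of the exported entry points (illustrative only; the import path is inferred from the file paths in this diff):

	package main

	import (
		"fmt"

		"matrixbase/pkg/vectorize/min"
	)

	func main() {
		xs := []int32{7, -3, 12, 5}
		fmt.Println(min.Int32Min(xs))                    // -3, computed by the widest kernel the CPU supports
		fmt.Println(min.Int32MinSels(xs, []int64{0, 3})) // 5, the minimum over the selected rows only
	}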