Commit 42c3d8bb authored by Njam bRong

Add avx2/avx512 implementation for min/max

parent 18fffc30
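The new kernels are selected per CPU at package init time: each package binds its exported entry points to the AVX-512 kernel, the AVX2 kernel, or the pure-Go loop, in that order of preference (under plain AVX2 the 64-bit integer variants stay on the pure-Go path, since packed 64-bit min/max only exists in AVX-512). Below is a minimal, runnable sketch of that dispatch pattern; the switch bodies stand in for the real kernel assignments so the sketch works on any machine, and only float64 is shown.

package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

// float64Max is bound once, at init time, to the widest kernel the CPU
// supports; callers never check CPU features themselves.
var float64Max func([]float64) float64

// float64MaxPure is the scalar fallback, the same loop this commit uses.
func float64MaxPure(xs []float64) float64 {
	res := xs[0]
	for _, x := range xs {
		if x > res {
			res = x
		}
	}
	return res
}

func init() {
	switch {
	case cpu.X86.HasAVX512:
		float64Max = float64MaxPure // the real package binds float64MaxAvx512 here
	case cpu.X86.HasAVX2:
		float64Max = float64MaxPure // the real package binds float64MaxAvx2 here
	default:
		float64Max = float64MaxPure
	}
}

func main() {
	fmt.Println(float64Max([]float64{3.5, -1, 7.25, 0.5})) // 7.25
}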
// Code generated by command: go run avx2.go -out max/avx2.s -stubs max/avx2_stubs.go. DO NOT EDIT.
#include "textflag.h"
// func int8MaxAvx2Asm(x []int8, r []int8)
// Requires: AVX, AVX2, SSE2, SSE4.1
TEXT ·int8MaxAvx2Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x0000000000000080, BX
MOVQ BX, X0
VPBROADCASTB X0, Y0
VMOVDQU Y0, Y1
VMOVDQU Y0, Y2
VMOVDQU Y0, Y3
VMOVDQU Y0, Y4
VMOVDQU Y0, Y5
VMOVDQU Y0, Y0
int8MaxBlockLoop:
CMPQ DX, $0x000000c0
JL int8MaxTailLoop
VPMAXSB (AX), Y1, Y1
VPMAXSB 32(AX), Y2, Y2
VPMAXSB 64(AX), Y3, Y3
VPMAXSB 96(AX), Y4, Y4
VPMAXSB 128(AX), Y5, Y5
VPMAXSB 160(AX), Y0, Y0
ADDQ $0x000000c0, AX
SUBQ $0x000000c0, DX
JMP int8MaxBlockLoop
int8MaxTailLoop:
CMPQ DX, $0x00000004
JL int8MaxDone
VPMAXSB (AX), Y1, Y1
ADDQ $0x00000020, AX
SUBQ $0x00000020, DX
JMP int8MaxTailLoop
int8MaxDone:
VPMAXSB Y1, Y2, Y1
VPMAXSB Y1, Y3, Y1
VPMAXSB Y1, Y4, Y1
VPMAXSB Y1, Y5, Y1
VPMAXSB Y1, Y0, Y1
VEXTRACTF128 $0x01, Y1, X0
PMAXSB X0, X1
MOVOU X1, (CX)
RET
// func int16MaxAvx2Asm(x []int16, r []int16)
// Requires: AVX, AVX2, SSE2
TEXT ·int16MaxAvx2Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x0000000000008000, BX
MOVQ BX, X0
VPBROADCASTW X0, Y0
VMOVDQU Y0, Y1
VMOVDQU Y0, Y2
VMOVDQU Y0, Y3
VMOVDQU Y0, Y4
VMOVDQU Y0, Y5
VMOVDQU Y0, Y0
int16MaxBlockLoop:
CMPQ DX, $0x00000060
JL int16MaxTailLoop
VPMAXSW (AX), Y1, Y1
VPMAXSW 32(AX), Y2, Y2
VPMAXSW 64(AX), Y3, Y3
VPMAXSW 96(AX), Y4, Y4
VPMAXSW 128(AX), Y5, Y5
VPMAXSW 160(AX), Y0, Y0
ADDQ $0x000000c0, AX
SUBQ $0x00000060, DX
JMP int16MaxBlockLoop
int16MaxTailLoop:
CMPQ DX, $0x00000004
JL int16MaxDone
VPMAXSW (AX), Y1, Y1
ADDQ $0x00000020, AX
SUBQ $0x00000010, DX
JMP int16MaxTailLoop
int16MaxDone:
VPMAXSW Y1, Y2, Y1
VPMAXSW Y1, Y3, Y1
VPMAXSW Y1, Y4, Y1
VPMAXSW Y1, Y5, Y1
VPMAXSW Y1, Y0, Y1
VEXTRACTF128 $0x01, Y1, X0
PMAXSW X0, X1
MOVOU X1, (CX)
RET
// func int32MaxAvx2Asm(x []int32, r []int32)
// Requires: AVX, AVX2, SSE2, SSE4.1
TEXT ·int32MaxAvx2Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x0000000080000000, BX
MOVQ BX, X0
VPBROADCASTD X0, Y0
VMOVDQU Y0, Y1
VMOVDQU Y0, Y2
VMOVDQU Y0, Y3
VMOVDQU Y0, Y4
VMOVDQU Y0, Y5
VMOVDQU Y0, Y0
int32MaxBlockLoop:
CMPQ DX, $0x00000030
JL int32MaxTailLoop
VPMAXSD (AX), Y1, Y1
VPMAXSD 32(AX), Y2, Y2
VPMAXSD 64(AX), Y3, Y3
VPMAXSD 96(AX), Y4, Y4
VPMAXSD 128(AX), Y5, Y5
VPMAXSD 160(AX), Y0, Y0
ADDQ $0x000000c0, AX
SUBQ $0x00000030, DX
JMP int32MaxBlockLoop
int32MaxTailLoop:
CMPQ DX, $0x00000004
JL int32MaxDone
VPMAXSD (AX), Y1, Y1
ADDQ $0x00000020, AX
SUBQ $0x00000008, DX
JMP int32MaxTailLoop
int32MaxDone:
VPMAXSD Y1, Y2, Y1
VPMAXSD Y1, Y3, Y1
VPMAXSD Y1, Y4, Y1
VPMAXSD Y1, Y5, Y1
VPMAXSD Y1, Y0, Y1
VEXTRACTF128 $0x01, Y1, X0
PMAXSD X0, X1
MOVOU X1, (CX)
RET
// func uint8MaxAvx2Asm(x []uint8, r []uint8)
// Requires: AVX, AVX2, SSE2
TEXT ·uint8MaxAvx2Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x0000000000000000, BX
MOVQ BX, X0
VPBROADCASTB X0, Y0
VMOVDQU Y0, Y1
VMOVDQU Y0, Y2
VMOVDQU Y0, Y3
VMOVDQU Y0, Y4
VMOVDQU Y0, Y5
VMOVDQU Y0, Y0
uint8MaxBlockLoop:
CMPQ DX, $0x000000c0
JL uint8MaxTailLoop
VPMAXUB (AX), Y1, Y1
VPMAXUB 32(AX), Y2, Y2
VPMAXUB 64(AX), Y3, Y3
VPMAXUB 96(AX), Y4, Y4
VPMAXUB 128(AX), Y5, Y5
VPMAXUB 160(AX), Y0, Y0
ADDQ $0x000000c0, AX
SUBQ $0x000000c0, DX
JMP uint8MaxBlockLoop
uint8MaxTailLoop:
CMPQ DX, $0x00000004
JL uint8MaxDone
VPMAXUB (AX), Y1, Y1
ADDQ $0x00000020, AX
SUBQ $0x00000020, DX
JMP uint8MaxTailLoop
uint8MaxDone:
VPMAXUB Y1, Y2, Y1
VPMAXUB Y1, Y3, Y1
VPMAXUB Y1, Y4, Y1
VPMAXUB Y1, Y5, Y1
VPMAXUB Y1, Y0, Y1
VEXTRACTF128 $0x01, Y1, X0
PMAXUB X0, X1
MOVOU X1, (CX)
RET
// func uint16MaxAvx2Asm(x []uint16, r []uint16)
// Requires: AVX, AVX2, SSE2, SSE4.1
TEXT ·uint16MaxAvx2Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x0000000000000000, BX
MOVQ BX, X0
VPBROADCASTW X0, Y0
VMOVDQU Y0, Y1
VMOVDQU Y0, Y2
VMOVDQU Y0, Y3
VMOVDQU Y0, Y4
VMOVDQU Y0, Y5
VMOVDQU Y0, Y0
uint16MaxBlockLoop:
CMPQ DX, $0x00000060
JL uint16MaxTailLoop
VPMAXUW (AX), Y1, Y1
VPMAXUW 32(AX), Y2, Y2
VPMAXUW 64(AX), Y3, Y3
VPMAXUW 96(AX), Y4, Y4
VPMAXUW 128(AX), Y5, Y5
VPMAXUW 160(AX), Y0, Y0
ADDQ $0x000000c0, AX
SUBQ $0x00000060, DX
JMP uint16MaxBlockLoop
uint16MaxTailLoop:
CMPQ DX, $0x00000004
JL uint16MaxDone
VPMAXUW (AX), Y1, Y1
ADDQ $0x00000020, AX
SUBQ $0x00000010, DX
JMP uint16MaxTailLoop
uint16MaxDone:
VPMAXUW Y1, Y2, Y1
VPMAXUW Y1, Y3, Y1
VPMAXUW Y1, Y4, Y1
VPMAXUW Y1, Y5, Y1
VPMAXUW Y1, Y0, Y1
VEXTRACTF128 $0x01, Y1, X0
PMAXUW X0, X1
MOVOU X1, (CX)
RET
// func uint32MaxAvx2Asm(x []uint32, r []uint32)
// Requires: AVX, AVX2, SSE2, SSE4.1
TEXT ·uint32MaxAvx2Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x0000000000000000, BX
MOVQ BX, X0
VPBROADCASTD X0, Y0
VMOVDQU Y0, Y1
VMOVDQU Y0, Y2
VMOVDQU Y0, Y3
VMOVDQU Y0, Y4
VMOVDQU Y0, Y5
VMOVDQU Y0, Y0
uint32MaxBlockLoop:
CMPQ DX, $0x00000030
JL uint32MaxTailLoop
VPMAXUD (AX), Y1, Y1
VPMAXUD 32(AX), Y2, Y2
VPMAXUD 64(AX), Y3, Y3
VPMAXUD 96(AX), Y4, Y4
VPMAXUD 128(AX), Y5, Y5
VPMAXUD 160(AX), Y0, Y0
ADDQ $0x000000c0, AX
SUBQ $0x00000030, DX
JMP uint32MaxBlockLoop
uint32MaxTailLoop:
CMPQ DX, $0x00000004
JL uint32MaxDone
VPMAXUD (AX), Y1, Y1
ADDQ $0x00000020, AX
SUBQ $0x00000008, DX
JMP uint32MaxTailLoop
uint32MaxDone:
VPMAXUD Y1, Y2, Y1
VPMAXUD Y1, Y3, Y1
VPMAXUD Y1, Y4, Y1
VPMAXUD Y1, Y5, Y1
VPMAXUD Y1, Y0, Y1
VEXTRACTF128 $0x01, Y1, X0
PMAXUD X0, X1
MOVOU X1, (CX)
RET
// func float32MaxAvx2Asm(x []float32, r []float32)
// Requires: AVX, AVX2, SSE, SSE2
TEXT ·float32MaxAvx2Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x00000000ff7fffff, BX
MOVQ BX, X0
VBROADCASTSS X0, Y0
VMOVUPS Y0, Y1
VMOVUPS Y0, Y2
VMOVUPS Y0, Y3
VMOVUPS Y0, Y4
VMOVUPS Y0, Y5
VMOVUPS Y0, Y0
float32MaxBlockLoop:
CMPQ DX, $0x00000030
JL float32MaxTailLoop
VMAXPS (AX), Y1, Y1
VMAXPS 32(AX), Y2, Y2
VMAXPS 64(AX), Y3, Y3
VMAXPS 96(AX), Y4, Y4
VMAXPS 128(AX), Y5, Y5
VMAXPS 160(AX), Y0, Y0
ADDQ $0x000000c0, AX
SUBQ $0x00000030, DX
JMP float32MaxBlockLoop
float32MaxTailLoop:
CMPQ DX, $0x00000004
JL float32MaxDone
VMAXPS (AX), Y1, Y1
ADDQ $0x00000020, AX
SUBQ $0x00000008, DX
JMP float32MaxTailLoop
float32MaxDone:
VMAXPS Y1, Y2, Y1
VMAXPS Y1, Y3, Y1
VMAXPS Y1, Y4, Y1
VMAXPS Y1, Y5, Y1
VMAXPS Y1, Y0, Y1
VEXTRACTF128 $0x01, Y1, X0
MAXPS X0, X1
MOVOU X1, (CX)
RET
// func float64MaxAvx2Asm(x []float64, r []float64)
// Requires: AVX, AVX2, SSE2
TEXT ·float64MaxAvx2Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0xffefffffffffffff, BX
MOVQ BX, X0
VBROADCASTSD X0, Y0
VMOVUPD Y0, Y1
VMOVUPD Y0, Y2
VMOVUPD Y0, Y3
VMOVUPD Y0, Y4
VMOVUPD Y0, Y5
VMOVUPD Y0, Y0
float64MaxBlockLoop:
CMPQ DX, $0x00000018
JL float64MaxTailLoop
VMAXPD (AX), Y1, Y1
VMAXPD 32(AX), Y2, Y2
VMAXPD 64(AX), Y3, Y3
VMAXPD 96(AX), Y4, Y4
VMAXPD 128(AX), Y5, Y5
VMAXPD 160(AX), Y0, Y0
ADDQ $0x000000c0, AX
SUBQ $0x00000018, DX
JMP float64MaxBlockLoop
float64MaxTailLoop:
CMPQ DX, $0x00000004
JL float64MaxDone
VMAXPD (AX), Y1, Y1
ADDQ $0x00000020, AX
SUBQ $0x00000004, DX
JMP float64MaxTailLoop
float64MaxDone:
VMAXPD Y1, Y2, Y1
VMAXPD Y1, Y3, Y1
VMAXPD Y1, Y4, Y1
VMAXPD Y1, Y5, Y1
VMAXPD Y1, Y0, Y1
VEXTRACTF128 $0x01, Y1, X0
MAXPD X0, X1
MOVOU X1, (CX)
RET
// Code generated by command: go run avx2.go -out max/avx2.s -stubs max/avx2_stubs.go. DO NOT EDIT.
package max
func int8MaxAvx2Asm(x []int8, r []int8)
func int16MaxAvx2Asm(x []int16, r []int16)
func int32MaxAvx2Asm(x []int32, r []int32)
func uint8MaxAvx2Asm(x []uint8, r []uint8)
func uint16MaxAvx2Asm(x []uint16, r []uint16)
func uint32MaxAvx2Asm(x []uint32, r []uint32)
func float32MaxAvx2Asm(x []float32, r []float32)
func float64MaxAvx2Asm(x []float64, r []float64)
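These stubs expose raw kernels rather than drop-in replacements: each kernel seeds its vector registers with the type's identity value (for example 0x80 = -128 for signed bytes), runs a six-register block loop and then a one-register tail loop over x, and finally folds everything into a single 128-bit lane that it stores to r. Callers must therefore pass a length that is a multiple of one register's element count and finish the reduction themselves. A hedged caller sketch, mirroring the int8MaxAvx2 wrapper that appears in max.go below (the function name here is illustrative):

// int8MaxViaAvx2 shows the calling contract for int8MaxAvx2Asm.
func int8MaxViaAvx2(xs []int8) int8 {
	const regItems = 32 // int8 elements per 256-bit register
	n := (len(xs) / regItems) * regItems
	var rs [16]int8 // one 128-bit lane of partial maxima, written by the kernel
	int8MaxAvx2Asm(xs[:n], rs[:])
	res := rs[0]
	for _, v := range rs[1:] { // fold the 16 partial maxima
		if v > res {
			res = v
		}
	}
	for _, v := range xs[n:] { // scalar tail for the leftover elements
		if v > res {
			res = v
		}
	}
	return res
}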
// Code generated by command: go run avx512.go -out avx512.s -stubs avx512_stubs.go. DO NOT EDIT.
#include "textflag.h"
// func int8MaxAvx512Asm(x []int8, r []int8)
// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2, SSE4.1
TEXT ·int8MaxAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x0000000000000080, BX
MOVQ BX, X0
VPBROADCASTB X0, Z0
VMOVDQU64 Z0, Z1
VMOVDQU64 Z0, Z2
VMOVDQU64 Z0, Z3
VMOVDQU64 Z0, Z4
VMOVDQU64 Z0, Z5
VMOVDQU64 Z0, Z6
VMOVDQU64 Z0, Z7
VMOVDQU64 Z0, Z8
VMOVDQU64 Z0, Z9
VMOVDQU64 Z0, Z10
VMOVDQU64 Z0, Z11
VMOVDQU64 Z0, Z0
int8MaxBlockLoop:
CMPQ DX, $0x00000300
JL int8MaxTailLoop
VPMAXSB (AX), Z1, Z1
VPMAXSB 64(AX), Z2, Z2
VPMAXSB 128(AX), Z3, Z3
VPMAXSB 192(AX), Z4, Z4
VPMAXSB 256(AX), Z5, Z5
VPMAXSB 320(AX), Z6, Z6
VPMAXSB 384(AX), Z7, Z7
VPMAXSB 448(AX), Z8, Z8
VPMAXSB 512(AX), Z9, Z9
VPMAXSB 576(AX), Z10, Z10
VPMAXSB 640(AX), Z11, Z11
VPMAXSB 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x00000300, DX
JMP int8MaxBlockLoop
int8MaxTailLoop:
CMPQ DX, $0x00000004
JL int8MaxDone
VPMAXSB (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000040, DX
JMP int8MaxTailLoop
int8MaxDone:
VPMAXSB Z1, Z2, Z1
VPMAXSB Z1, Z3, Z1
VPMAXSB Z1, Z4, Z1
VPMAXSB Z1, Z5, Z1
VPMAXSB Z1, Z6, Z1
VPMAXSB Z1, Z7, Z1
VPMAXSB Z1, Z8, Z1
VPMAXSB Z1, Z9, Z1
VPMAXSB Z1, Z10, Z1
VPMAXSB Z1, Z11, Z1
VPMAXSB Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VPMAXSB Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
PMAXSB X0, X1
MOVOU X1, (CX)
RET
// func int16MaxAvx512Asm(x []int16, r []int16)
// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2
TEXT ·int16MaxAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x0000000000008000, BX
MOVQ BX, X0
VPBROADCASTW X0, Z0
VMOVDQU64 Z0, Z1
VMOVDQU64 Z0, Z2
VMOVDQU64 Z0, Z3
VMOVDQU64 Z0, Z4
VMOVDQU64 Z0, Z5
VMOVDQU64 Z0, Z6
VMOVDQU64 Z0, Z7
VMOVDQU64 Z0, Z8
VMOVDQU64 Z0, Z9
VMOVDQU64 Z0, Z10
VMOVDQU64 Z0, Z11
VMOVDQU64 Z0, Z0
int16MaxBlockLoop:
CMPQ DX, $0x00000180
JL int16MaxTailLoop
VPMAXSW (AX), Z1, Z1
VPMAXSW 64(AX), Z2, Z2
VPMAXSW 128(AX), Z3, Z3
VPMAXSW 192(AX), Z4, Z4
VPMAXSW 256(AX), Z5, Z5
VPMAXSW 320(AX), Z6, Z6
VPMAXSW 384(AX), Z7, Z7
VPMAXSW 448(AX), Z8, Z8
VPMAXSW 512(AX), Z9, Z9
VPMAXSW 576(AX), Z10, Z10
VPMAXSW 640(AX), Z11, Z11
VPMAXSW 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x00000180, DX
JMP int16MaxBlockLoop
int16MaxTailLoop:
CMPQ DX, $0x00000004
JL int16MaxDone
VPMAXSW (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000020, DX
JMP int16MaxTailLoop
int16MaxDone:
VPMAXSW Z1, Z2, Z1
VPMAXSW Z1, Z3, Z1
VPMAXSW Z1, Z4, Z1
VPMAXSW Z1, Z5, Z1
VPMAXSW Z1, Z6, Z1
VPMAXSW Z1, Z7, Z1
VPMAXSW Z1, Z8, Z1
VPMAXSW Z1, Z9, Z1
VPMAXSW Z1, Z10, Z1
VPMAXSW Z1, Z11, Z1
VPMAXSW Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VPMAXSW Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
PMAXSW X0, X1
MOVOU X1, (CX)
RET
// func int32MaxAvx512Asm(x []int32, r []int32)
// Requires: AVX, AVX2, AVX512F, SSE2, SSE4.1
TEXT ·int32MaxAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x0000000080000000, BX
MOVQ BX, X0
VPBROADCASTD X0, Z0
VMOVDQU64 Z0, Z1
VMOVDQU64 Z0, Z2
VMOVDQU64 Z0, Z3
VMOVDQU64 Z0, Z4
VMOVDQU64 Z0, Z5
VMOVDQU64 Z0, Z6
VMOVDQU64 Z0, Z7
VMOVDQU64 Z0, Z8
VMOVDQU64 Z0, Z9
VMOVDQU64 Z0, Z10
VMOVDQU64 Z0, Z11
VMOVDQU64 Z0, Z0
int32MaxBlockLoop:
CMPQ DX, $0x000000c0
JL int32MaxTailLoop
VPMAXSD (AX), Z1, Z1
VPMAXSD 64(AX), Z2, Z2
VPMAXSD 128(AX), Z3, Z3
VPMAXSD 192(AX), Z4, Z4
VPMAXSD 256(AX), Z5, Z5
VPMAXSD 320(AX), Z6, Z6
VPMAXSD 384(AX), Z7, Z7
VPMAXSD 448(AX), Z8, Z8
VPMAXSD 512(AX), Z9, Z9
VPMAXSD 576(AX), Z10, Z10
VPMAXSD 640(AX), Z11, Z11
VPMAXSD 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x000000c0, DX
JMP int32MaxBlockLoop
int32MaxTailLoop:
CMPQ DX, $0x00000004
JL int32MaxDone
VPMAXSD (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000010, DX
JMP int32MaxTailLoop
int32MaxDone:
VPMAXSD Z1, Z2, Z1
VPMAXSD Z1, Z3, Z1
VPMAXSD Z1, Z4, Z1
VPMAXSD Z1, Z5, Z1
VPMAXSD Z1, Z6, Z1
VPMAXSD Z1, Z7, Z1
VPMAXSD Z1, Z8, Z1
VPMAXSD Z1, Z9, Z1
VPMAXSD Z1, Z10, Z1
VPMAXSD Z1, Z11, Z1
VPMAXSD Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VPMAXSD Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
PMAXSD X0, X1
MOVOU X1, (CX)
RET
// func int64MaxAvx512Asm(x []int64, r []int64)
// Requires: AVX, AVX512F, AVX512VL, SSE2
TEXT ·int64MaxAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x8000000000000000, BX
MOVQ BX, X0
VPBROADCASTQ X0, Z0
VMOVDQU64 Z0, Z1
VMOVDQU64 Z0, Z2
VMOVDQU64 Z0, Z3
VMOVDQU64 Z0, Z4
VMOVDQU64 Z0, Z5
VMOVDQU64 Z0, Z6
VMOVDQU64 Z0, Z7
VMOVDQU64 Z0, Z8
VMOVDQU64 Z0, Z9
VMOVDQU64 Z0, Z10
VMOVDQU64 Z0, Z11
VMOVDQU64 Z0, Z0
int64MaxBlockLoop:
CMPQ DX, $0x00000060
JL int64MaxTailLoop
VPMAXSQ (AX), Z1, Z1
VPMAXSQ 64(AX), Z2, Z2
VPMAXSQ 128(AX), Z3, Z3
VPMAXSQ 192(AX), Z4, Z4
VPMAXSQ 256(AX), Z5, Z5
VPMAXSQ 320(AX), Z6, Z6
VPMAXSQ 384(AX), Z7, Z7
VPMAXSQ 448(AX), Z8, Z8
VPMAXSQ 512(AX), Z9, Z9
VPMAXSQ 576(AX), Z10, Z10
VPMAXSQ 640(AX), Z11, Z11
VPMAXSQ 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x00000060, DX
JMP int64MaxBlockLoop
int64MaxTailLoop:
CMPQ DX, $0x00000004
JL int64MaxDone
VPMAXSQ (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000008, DX
JMP int64MaxTailLoop
int64MaxDone:
VPMAXSQ Z1, Z2, Z1
VPMAXSQ Z1, Z3, Z1
VPMAXSQ Z1, Z4, Z1
VPMAXSQ Z1, Z5, Z1
VPMAXSQ Z1, Z6, Z1
VPMAXSQ Z1, Z7, Z1
VPMAXSQ Z1, Z8, Z1
VPMAXSQ Z1, Z9, Z1
VPMAXSQ Z1, Z10, Z1
VPMAXSQ Z1, Z11, Z1
VPMAXSQ Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VPMAXSQ Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
VPMAXSQ X0, X1, X1
MOVOU X1, (CX)
RET
// func uint8MaxAvx512Asm(x []uint8, r []uint8)
// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2
TEXT ·uint8MaxAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x0000000000000000, BX
MOVQ BX, X0
VPBROADCASTB X0, Z0
VMOVDQU64 Z0, Z1
VMOVDQU64 Z0, Z2
VMOVDQU64 Z0, Z3
VMOVDQU64 Z0, Z4
VMOVDQU64 Z0, Z5
VMOVDQU64 Z0, Z6
VMOVDQU64 Z0, Z7
VMOVDQU64 Z0, Z8
VMOVDQU64 Z0, Z9
VMOVDQU64 Z0, Z10
VMOVDQU64 Z0, Z11
VMOVDQU64 Z0, Z0
uint8MaxBlockLoop:
CMPQ DX, $0x00000300
JL uint8MaxTailLoop
VPMAXUB (AX), Z1, Z1
VPMAXUB 64(AX), Z2, Z2
VPMAXUB 128(AX), Z3, Z3
VPMAXUB 192(AX), Z4, Z4
VPMAXUB 256(AX), Z5, Z5
VPMAXUB 320(AX), Z6, Z6
VPMAXUB 384(AX), Z7, Z7
VPMAXUB 448(AX), Z8, Z8
VPMAXUB 512(AX), Z9, Z9
VPMAXUB 576(AX), Z10, Z10
VPMAXUB 640(AX), Z11, Z11
VPMAXUB 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x00000300, DX
JMP uint8MaxBlockLoop
uint8MaxTailLoop:
CMPQ DX, $0x00000004
JL uint8MaxDone
VPMAXUB (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000040, DX
JMP uint8MaxTailLoop
uint8MaxDone:
VPMAXUB Z1, Z2, Z1
VPMAXUB Z1, Z3, Z1
VPMAXUB Z1, Z4, Z1
VPMAXUB Z1, Z5, Z1
VPMAXUB Z1, Z6, Z1
VPMAXUB Z1, Z7, Z1
VPMAXUB Z1, Z8, Z1
VPMAXUB Z1, Z9, Z1
VPMAXUB Z1, Z10, Z1
VPMAXUB Z1, Z11, Z1
VPMAXUB Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VPMAXUB Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
PMAXUB X0, X1
MOVOU X1, (CX)
RET
// func uint16MaxAvx512Asm(x []uint16, r []uint16)
// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2, SSE4.1
TEXT ·uint16MaxAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x0000000000000000, BX
MOVQ BX, X0
VPBROADCASTW X0, Z0
VMOVDQU64 Z0, Z1
VMOVDQU64 Z0, Z2
VMOVDQU64 Z0, Z3
VMOVDQU64 Z0, Z4
VMOVDQU64 Z0, Z5
VMOVDQU64 Z0, Z6
VMOVDQU64 Z0, Z7
VMOVDQU64 Z0, Z8
VMOVDQU64 Z0, Z9
VMOVDQU64 Z0, Z10
VMOVDQU64 Z0, Z11
VMOVDQU64 Z0, Z0
uint16MaxBlockLoop:
CMPQ DX, $0x00000180
JL uint16MaxTailLoop
VPMAXUW (AX), Z1, Z1
VPMAXUW 64(AX), Z2, Z2
VPMAXUW 128(AX), Z3, Z3
VPMAXUW 192(AX), Z4, Z4
VPMAXUW 256(AX), Z5, Z5
VPMAXUW 320(AX), Z6, Z6
VPMAXUW 384(AX), Z7, Z7
VPMAXUW 448(AX), Z8, Z8
VPMAXUW 512(AX), Z9, Z9
VPMAXUW 576(AX), Z10, Z10
VPMAXUW 640(AX), Z11, Z11
VPMAXUW 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x00000180, DX
JMP uint16MaxBlockLoop
uint16MaxTailLoop:
CMPQ DX, $0x00000004
JL uint16MaxDone
VPMAXUW (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000020, DX
JMP uint16MaxTailLoop
uint16MaxDone:
VPMAXUW Z1, Z2, Z1
VPMAXUW Z1, Z3, Z1
VPMAXUW Z1, Z4, Z1
VPMAXUW Z1, Z5, Z1
VPMAXUW Z1, Z6, Z1
VPMAXUW Z1, Z7, Z1
VPMAXUW Z1, Z8, Z1
VPMAXUW Z1, Z9, Z1
VPMAXUW Z1, Z10, Z1
VPMAXUW Z1, Z11, Z1
VPMAXUW Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VPMAXUW Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
PMAXUW X0, X1
MOVOU X1, (CX)
RET
// func uint32MaxAvx512Asm(x []uint32, r []uint32)
// Requires: AVX, AVX2, AVX512F, SSE2, SSE4.1
TEXT ·uint32MaxAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x0000000000000000, BX
MOVQ BX, X0
VPBROADCASTD X0, Z0
VMOVDQU64 Z0, Z1
VMOVDQU64 Z0, Z2
VMOVDQU64 Z0, Z3
VMOVDQU64 Z0, Z4
VMOVDQU64 Z0, Z5
VMOVDQU64 Z0, Z6
VMOVDQU64 Z0, Z7
VMOVDQU64 Z0, Z8
VMOVDQU64 Z0, Z9
VMOVDQU64 Z0, Z10
VMOVDQU64 Z0, Z11
VMOVDQU64 Z0, Z0
uint32MaxBlockLoop:
CMPQ DX, $0x000000c0
JL uint32MaxTailLoop
VPMAXUD (AX), Z1, Z1
VPMAXUD 64(AX), Z2, Z2
VPMAXUD 128(AX), Z3, Z3
VPMAXUD 192(AX), Z4, Z4
VPMAXUD 256(AX), Z5, Z5
VPMAXUD 320(AX), Z6, Z6
VPMAXUD 384(AX), Z7, Z7
VPMAXUD 448(AX), Z8, Z8
VPMAXUD 512(AX), Z9, Z9
VPMAXUD 576(AX), Z10, Z10
VPMAXUD 640(AX), Z11, Z11
VPMAXUD 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x000000c0, DX
JMP uint32MaxBlockLoop
uint32MaxTailLoop:
CMPQ DX, $0x00000004
JL uint32MaxDone
VPMAXUD (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000010, DX
JMP uint32MaxTailLoop
uint32MaxDone:
VPMAXUD Z1, Z2, Z1
VPMAXUD Z1, Z3, Z1
VPMAXUD Z1, Z4, Z1
VPMAXUD Z1, Z5, Z1
VPMAXUD Z1, Z6, Z1
VPMAXUD Z1, Z7, Z1
VPMAXUD Z1, Z8, Z1
VPMAXUD Z1, Z9, Z1
VPMAXUD Z1, Z10, Z1
VPMAXUD Z1, Z11, Z1
VPMAXUD Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VPMAXUD Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
PMAXUD X0, X1
MOVOU X1, (CX)
RET
// func uint64MaxAvx512Asm(x []uint64, r []uint64)
// Requires: AVX, AVX512F, AVX512VL, SSE2
TEXT ·uint64MaxAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x0000000000000000, BX
MOVQ BX, X0
VPBROADCASTQ X0, Z0
VMOVDQU64 Z0, Z1
VMOVDQU64 Z0, Z2
VMOVDQU64 Z0, Z3
VMOVDQU64 Z0, Z4
VMOVDQU64 Z0, Z5
VMOVDQU64 Z0, Z6
VMOVDQU64 Z0, Z7
VMOVDQU64 Z0, Z8
VMOVDQU64 Z0, Z9
VMOVDQU64 Z0, Z10
VMOVDQU64 Z0, Z11
VMOVDQU64 Z0, Z0
uint64MaxBlockLoop:
CMPQ DX, $0x00000060
JL uint64MaxTailLoop
VPMAXUQ (AX), Z1, Z1
VPMAXUQ 64(AX), Z2, Z2
VPMAXUQ 128(AX), Z3, Z3
VPMAXUQ 192(AX), Z4, Z4
VPMAXUQ 256(AX), Z5, Z5
VPMAXUQ 320(AX), Z6, Z6
VPMAXUQ 384(AX), Z7, Z7
VPMAXUQ 448(AX), Z8, Z8
VPMAXUQ 512(AX), Z9, Z9
VPMAXUQ 576(AX), Z10, Z10
VPMAXUQ 640(AX), Z11, Z11
VPMAXUQ 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x00000060, DX
JMP uint64MaxBlockLoop
uint64MaxTailLoop:
CMPQ DX, $0x00000004
JL uint64MaxDone
VPMAXUQ (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000008, DX
JMP uint64MaxTailLoop
uint64MaxDone:
VPMAXUQ Z1, Z2, Z1
VPMAXUQ Z1, Z3, Z1
VPMAXUQ Z1, Z4, Z1
VPMAXUQ Z1, Z5, Z1
VPMAXUQ Z1, Z6, Z1
VPMAXUQ Z1, Z7, Z1
VPMAXUQ Z1, Z8, Z1
VPMAXUQ Z1, Z9, Z1
VPMAXUQ Z1, Z10, Z1
VPMAXUQ Z1, Z11, Z1
VPMAXUQ Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VPMAXUQ Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
VPMAXUQ X0, X1, X1
MOVOU X1, (CX)
RET
// func float32MaxAvx512Asm(x []float32, r []float32)
// Requires: AVX, AVX512F, SSE, SSE2
TEXT ·float32MaxAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x00000000ff7fffff, BX
MOVQ BX, X0
VBROADCASTSS X0, Z0
VMOVUPS Z0, Z1
VMOVUPS Z0, Z2
VMOVUPS Z0, Z3
VMOVUPS Z0, Z4
VMOVUPS Z0, Z5
VMOVUPS Z0, Z6
VMOVUPS Z0, Z7
VMOVUPS Z0, Z8
VMOVUPS Z0, Z9
VMOVUPS Z0, Z10
VMOVUPS Z0, Z11
VMOVUPS Z0, Z0
float32MaxBlockLoop:
CMPQ DX, $0x000000c0
JL float32MaxTailLoop
VMAXPS (AX), Z1, Z1
VMAXPS 64(AX), Z2, Z2
VMAXPS 128(AX), Z3, Z3
VMAXPS 192(AX), Z4, Z4
VMAXPS 256(AX), Z5, Z5
VMAXPS 320(AX), Z6, Z6
VMAXPS 384(AX), Z7, Z7
VMAXPS 448(AX), Z8, Z8
VMAXPS 512(AX), Z9, Z9
VMAXPS 576(AX), Z10, Z10
VMAXPS 640(AX), Z11, Z11
VMAXPS 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x000000c0, DX
JMP float32MaxBlockLoop
float32MaxTailLoop:
CMPQ DX, $0x00000004
JL float32MaxDone
VMAXPS (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000010, DX
JMP float32MaxTailLoop
float32MaxDone:
VMAXPS Z1, Z2, Z1
VMAXPS Z1, Z3, Z1
VMAXPS Z1, Z4, Z1
VMAXPS Z1, Z5, Z1
VMAXPS Z1, Z6, Z1
VMAXPS Z1, Z7, Z1
VMAXPS Z1, Z8, Z1
VMAXPS Z1, Z9, Z1
VMAXPS Z1, Z10, Z1
VMAXPS Z1, Z11, Z1
VMAXPS Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VMAXPS Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
MAXPS X0, X1
MOVOU X1, (CX)
RET
// func float64MaxAvx512Asm(x []float64, r []float64)
// Requires: AVX, AVX512F, SSE2
TEXT ·float64MaxAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0xffefffffffffffff, BX
MOVQ BX, X0
VBROADCASTSD X0, Z0
VMOVUPD Z0, Z1
VMOVUPD Z0, Z2
VMOVUPD Z0, Z3
VMOVUPD Z0, Z4
VMOVUPD Z0, Z5
VMOVUPD Z0, Z6
VMOVUPD Z0, Z7
VMOVUPD Z0, Z8
VMOVUPD Z0, Z9
VMOVUPD Z0, Z10
VMOVUPD Z0, Z11
VMOVUPD Z0, Z0
float64MaxBlockLoop:
CMPQ DX, $0x00000060
JL float64MaxTailLoop
VMAXPD (AX), Z1, Z1
VMAXPD 64(AX), Z2, Z2
VMAXPD 128(AX), Z3, Z3
VMAXPD 192(AX), Z4, Z4
VMAXPD 256(AX), Z5, Z5
VMAXPD 320(AX), Z6, Z6
VMAXPD 384(AX), Z7, Z7
VMAXPD 448(AX), Z8, Z8
VMAXPD 512(AX), Z9, Z9
VMAXPD 576(AX), Z10, Z10
VMAXPD 640(AX), Z11, Z11
VMAXPD 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x00000060, DX
JMP float64MaxBlockLoop
float64MaxTailLoop:
CMPQ DX, $0x00000004
JL float64MaxDone
VMAXPD (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000008, DX
JMP float64MaxTailLoop
float64MaxDone:
VMAXPD Z1, Z2, Z1
VMAXPD Z1, Z3, Z1
VMAXPD Z1, Z4, Z1
VMAXPD Z1, Z5, Z1
VMAXPD Z1, Z6, Z1
VMAXPD Z1, Z7, Z1
VMAXPD Z1, Z8, Z1
VMAXPD Z1, Z9, Z1
VMAXPD Z1, Z10, Z1
VMAXPD Z1, Z11, Z1
VMAXPD Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VMAXPD Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
MAXPD X0, X1
MOVOU X1, (CX)
RET
// Code generated by command: go run avx512.go -out avx512.s -stubs avx512_stubs.go. DO NOT EDIT.
package max
func int8MaxAvx512Asm(x []int8, r []int8)
func int16MaxAvx512Asm(x []int16, r []int16)
func int32MaxAvx512Asm(x []int32, r []int32)
func int64MaxAvx512Asm(x []int64, r []int64)
func uint8MaxAvx512Asm(x []uint8, r []uint8)
func uint16MaxAvx512Asm(x []uint16, r []uint16)
func uint32MaxAvx512Asm(x []uint32, r []uint32)
func uint64MaxAvx512Asm(x []uint64, r []uint64)
func float32MaxAvx512Asm(x []float32, r []float32)
func float64MaxAvx512Asm(x []float64, r []float64)
package max
import (
"bytes"
"matrixbase/pkg/container/types"
"golang.org/x/sys/cpu"
)
var (
boolMax func([]bool) bool
boolMaxSels func([]bool, []int64) bool
int8Max func([]int8) int8
int8MaxSels func([]int8, []int64) int8
int16Max func([]int16) int16
int16MaxSels func([]int16, []int64) int16
int32Max func([]int32) int32
int32MaxSels func([]int32, []int64) int32
int64Max func([]int64) int64
int64MaxSels func([]int64, []int64) int64
uint8Max func([]uint8) uint8
uint8MaxSels func([]uint8, []int64) uint8
uint16Max func([]uint16) uint16
uint16MaxSels func([]uint16, []int64) uint16
uint32Max func([]uint32) uint32
uint32MaxSels func([]uint32, []int64) uint32
uint64Max func([]uint64) uint64
uint64MaxSels func([]uint64, []int64) uint64
float32Max func([]float32) float32
float32MaxSels func([]float32, []int64) float32
float64Max func([]float64) float64
float64MaxSels func([]float64, []int64) float64
strMax func(*types.Bytes) []byte
strMaxSels func(*types.Bytes, []int64) []byte
)
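// init binds each exported Max entry point to the widest kernel the running CPU
// supports (AVX-512, then AVX2, then the pure-Go loop); the Sels variants and the
// bool/string reductions always use the pure-Go implementations.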
func init() {
if cpu.X86.HasAVX512 {
int8Max = int8MaxAvx512
int16Max = int16MaxAvx512
int32Max = int32MaxAvx512
int64Max = int64MaxAvx512
uint8Max = uint8MaxAvx512
uint16Max = uint16MaxAvx512
uint32Max = uint32MaxAvx512
uint64Max = uint64MaxAvx512
float32Max = float32MaxAvx512
float64Max = float64MaxAvx512
} else if cpu.X86.HasAVX2 {
int8Max = int8MaxAvx2
int16Max = int16MaxAvx2
int32Max = int32MaxAvx2
int64Max = int64MaxPure
uint8Max = uint8MaxAvx2
uint16Max = uint16MaxAvx2
uint32Max = uint32MaxAvx2
uint64Max = uint64MaxPure
float32Max = float32MaxAvx2
float64Max = float64MaxAvx2
} else {
int8Max = int8MaxPure
int16Max = int16MaxPure
int32Max = int32MaxPure
int64Max = int64MaxPure
uint8Max = uint8MaxPure
uint16Max = uint16MaxPure
uint32Max = uint32MaxPure
uint64Max = uint64MaxPure
float32Max = float32MaxPure
float64Max = float64MaxPure
}
boolMax = boolMaxPure
strMax = strMaxPure
boolMaxSels = boolMaxSelsPure
int8MaxSels = int8MaxSelsPure
int16MaxSels = int16MaxSelsPure
int32MaxSels = int32MaxSelsPure
int64MaxSels = int64MaxSelsPure
uint8MaxSels = uint8MaxSelsPure
uint16MaxSels = uint16MaxSelsPure
uint32MaxSels = uint32MaxSelsPure
uint64MaxSels = uint64MaxSelsPure
float32MaxSels = float32MaxSelsPure
float64MaxSels = float64MaxSelsPure
strMaxSels = strMaxSelsPure
}
func BoolMax(xs []bool) bool {
return boolMax(xs)
}
func boolMaxPure(xs []bool) bool {
for _, x := range xs {
if x == true {
return true
}
}
return false
}
func BoolMaxSels(xs []bool, sels []int64) bool {
return boolMaxSels(xs, sels)
}
func boolMaxSelsPure(xs []bool, sels []int64) bool {
for _, sel := range sels {
if xs[sel] == true {
return true
}
}
return false
}
func Int8Max(xs []int8) int8 {
return int8Max(xs)
}
func int8MaxPure(xs []int8) int8 {
res := xs[0]
for _, x := range xs {
if x > res {
res = x
}
}
return res
}
func int8MaxAvx2(xs []int8) int8 {
const regItems int = 32 / 1
n := len(xs) / regItems
var rs [16]int8
int8MaxAvx2Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 16; i++ {
if rs[i] > res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] > res {
res = xs[i]
}
}
return res
}
func int8MaxAvx512(xs []int8) int8 {
const regItems int = 64 / 1
n := len(xs) / regItems
var rs [16]int8
int8MaxAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 16; i++ {
if rs[i] > res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] > res {
res = xs[i]
}
}
return res
}
func Int8MaxSels(xs []int8, sels []int64) int8 {
return int8MaxSels(xs, sels)
}
func int8MaxSelsPure(xs []int8, sels []int64) int8 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x > res {
res = x
}
}
return res
}
func Int16Max(xs []int16) int16 {
return int16Max(xs)
}
func int16MaxPure(xs []int16) int16 {
res := xs[0]
for _, x := range xs {
if x > res {
res = x
}
}
return res
}
func int16MaxAvx2(xs []int16) int16 {
const regItems int = 32 / 2
n := len(xs) / regItems
var rs [8]int16
int16MaxAvx2Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 8; i++ {
if rs[i] > res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] > res {
res = xs[i]
}
}
return res
}
func int16MaxAvx512(xs []int16) int16 {
const regItems int = 64 / 2
n := len(xs) / regItems
var rs [8]int16
int16MaxAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 8; i++ {
if rs[i] > res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] > res {
res = xs[i]
}
}
return res
}
func Int16MaxSels(xs []int16, sels []int64) int16 {
return int16MaxSels(xs, sels)
}
func int16MaxSelsPure(xs []int16, sels []int64) int16 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x > res {
res = x
}
}
return res
}
func Int32Max(xs []int32) int32 {
return int32Max(xs)
}
func int32MaxPure(xs []int32) int32 {
res := xs[0]
for _, x := range xs {
if x > res {
res = x
}
}
return res
}
func int32MaxAvx2(xs []int32) int32 {
const regItems int = 32 / 4
n := len(xs) / regItems
var rs [4]int32
int32MaxAvx2Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 4; i++ {
if rs[i] > res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] > res {
res = xs[i]
}
}
return res
}
func int32MaxAvx512(xs []int32) int32 {
const regItems int = 64 / 4
n := len(xs) / regItems
var rs [4]int32
int32MaxAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 4; i++ {
if rs[i] > res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] > res {
res = xs[i]
}
}
return res
}
func Int32MaxSels(xs []int32, sels []int64) int32 {
return int32MaxSels(xs, sels)
}
func int32MaxSelsPure(xs []int32, sels []int64) int32 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x > res {
res = x
}
}
return res
}
func Int64Max(xs []int64) int64 {
return int64Max(xs)
}
func int64MaxPure(xs []int64) int64 {
res := xs[0]
for _, x := range xs {
if x > res {
res = x
}
}
return res
}
func int64MaxAvx512(xs []int64) int64 {
const regItems int = 64 / 8
n := len(xs) / regItems
var rs [2]int64
int64MaxAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 2; i++ {
if rs[i] > res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] > res {
res = xs[i]
}
}
return res
}
func Int64MaxSels(xs []int64, sels []int64) int64 {
return int64MaxSels(xs, sels)
}
func int64MaxSelsPure(xs []int64, sels []int64) int64 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x > res {
res = x
}
}
return res
}
func Uint8Max(xs []uint8) uint8 {
return uint8Max(xs)
}
func uint8MaxPure(xs []uint8) uint8 {
res := xs[0]
for _, x := range xs {
if x > res {
res = x
}
}
return res
}
func uint8MaxAvx2(xs []uint8) uint8 {
const regItems int = 32 / 1
n := len(xs) / regItems
var rs [16]uint8
uint8MaxAvx2Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 16; i++ {
if rs[i] > res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] > res {
res = xs[i]
}
}
return res
}
func uint8MaxAvx512(xs []uint8) uint8 {
const regItems int = 64 / 1
n := len(xs) / regItems
var rs [16]uint8
uint8MaxAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 16; i++ {
if rs[i] > res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] > res {
res = xs[i]
}
}
return res
}
func Uint8MaxSels(xs []uint8, sels []int64) uint8 {
return uint8MaxSels(xs, sels)
}
func uint8MaxSelsPure(xs []uint8, sels []int64) uint8 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x > res {
res = x
}
}
return res
}
func Uint16Max(xs []uint16) uint16 {
return uint16Max(xs)
}
func uint16MaxPure(xs []uint16) uint16 {
res := xs[0]
for _, x := range xs {
if x > res {
res = x
}
}
return res
}
func uint16MaxAvx2(xs []uint16) uint16 {
const regItems int = 32 / 2
n := len(xs) / regItems
var rs [8]uint16
uint16MaxAvx2Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 8; i++ {
if rs[i] > res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] > res {
res = xs[i]
}
}
return res
}
func uint16MaxAvx512(xs []uint16) uint16 {
const regItems int = 64 / 2
n := len(xs) / regItems
var rs [8]uint16
uint16MaxAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 8; i++ {
if rs[i] > res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] > res {
res = xs[i]
}
}
return res
}
func Uint16MaxSels(xs []uint16, sels []int64) uint16 {
return uint16MaxSels(xs, sels)
}
func uint16MaxSelsPure(xs []uint16, sels []int64) uint16 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x > res {
res = x
}
}
return res
}
func Uint32Max(xs []uint32) uint32 {
return uint32Max(xs)
}
func uint32MaxPure(xs []uint32) uint32 {
res := xs[0]
for _, x := range xs {
if x > res {
res = x
}
}
return res
}
func uint32MaxAvx2(xs []uint32) uint32 {
const regItems int = 32 / 4
n := len(xs) / regItems
var rs [4]uint32
uint32MaxAvx2Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 4; i++ {
if rs[i] > res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] > res {
res = xs[i]
}
}
return res
}
func uint32MaxAvx512(xs []uint32) uint32 {
const regItems int = 64 / 4
n := len(xs) / regItems
var rs [4]uint32
uint32MaxAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 4; i++ {
if rs[i] > res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] > res {
res = xs[i]
}
}
return res
}
func Uint32MaxSels(xs []uint32, sels []int64) uint32 {
return uint32MaxSels(xs, sels)
}
func uint32MaxSelsPure(xs []uint32, sels []int64) uint32 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x > res {
res = x
}
}
return res
}
func Uint64Max(xs []uint64) uint64 {
return uint64Max(xs)
}
func uint64MaxPure(xs []uint64) uint64 {
res := xs[0]
for _, x := range xs {
if x > res {
res = x
}
}
return res
}
func uint64MaxAvx512(xs []uint64) uint64 {
const regItems int = 64 / 8
n := len(xs) / regItems
var rs [2]uint64
uint64MaxAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 2; i++ {
if rs[i] > res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] > res {
res = xs[i]
}
}
return res
}
func Uint64MaxSels(xs []uint64, sels []int64) uint64 {
return uint64MaxSels(xs, sels)
}
func uint64MaxSelsPure(xs []uint64, sels []int64) uint64 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x > res {
res = x
}
}
return res
}
func Float32Max(xs []float32) float32 {
return float32Max(xs)
}
func float32MaxPure(xs []float32) float32 {
res := xs[0]
for _, x := range xs {
if x > res {
res = x
}
}
return res
}
func float32MaxAvx2(xs []float32) float32 {
const regItems int = 32 / 4
n := len(xs) / regItems
var rs [4]float32
float32MaxAvx2Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 4; i++ {
if rs[i] > res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] > res {
res = xs[i]
}
}
return res
}
func float32MaxAvx512(xs []float32) float32 {
const regItems int = 64 / 4
n := len(xs) / regItems
var rs [4]float32
float32MaxAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 4; i++ {
if rs[i] > res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] > res {
res = xs[i]
}
}
return res
}
func Float32MaxSels(xs []float32, sels []int64) float32 {
return float32MaxSels(xs, sels)
}
func float32MaxSelsPure(xs []float32, sels []int64) float32 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x > res {
res = x
}
}
return res
}
func Float64Max(xs []float64) float64 {
return float64Max(xs)
}
func float64MaxPure(xs []float64) float64 {
res := xs[0]
for _, x := range xs {
if x > res {
res = x
}
}
return res
}
func float64MaxAvx2(xs []float64) float64 {
const regItems int = 32 / 8
n := len(xs) / regItems
var rs [2]float64
float64MaxAvx2Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 2; i++ {
if rs[i] > res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] > res {
res = xs[i]
}
}
return res
}
func float64MaxAvx512(xs []float64) float64 {
const regItems int = 64 / 8
n := len(xs) / regItems
var rs [2]float64
float64MaxAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 2; i++ {
if rs[i] > res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] > res {
res = xs[i]
}
}
return res
}
func Float64MaxSels(xs []float64, sels []int64) float64 {
return float64MaxSels(xs, sels)
}
func float64MaxSelsPure(xs []float64, sels []int64) float64 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x > res {
res = x
}
}
return res
}
func StrMax(xs *types.Bytes) []byte {
return strMax(xs)
}
func strMaxPure(xs *types.Bytes) []byte {
res := xs.Get(0)
for i, n := 0, len(xs.Offsets); i < n; i++ {
x := xs.Get(i)
if bytes.Compare(x, res) > 0 {
res = x
}
}
return res
}
func StrMaxSels(xs *types.Bytes, sels []int64) []byte {
return strMaxSels(xs, sels)
}
func strMaxSelsPure(xs *types.Bytes, sels []int64) []byte {
res := xs.Get(int(sels[0]))
for _, sel := range sels {
x := xs.Get(int(sel))
if bytes.Compare(x, res) > 0 {
res = x
}
}
return res
}
// Code generated by command: go run avx2.go -out min/avx2.s -stubs min/avx2_stubs.go. DO NOT EDIT.
#include "textflag.h"
// func int8MinAvx2Asm(x []int8, r []int8)
// Requires: AVX, AVX2, SSE2, SSE4.1
TEXT ·int8MinAvx2Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x000000000000007f, BX
MOVQ BX, X0
VPBROADCASTB X0, Y0
VMOVDQU Y0, Y1
VMOVDQU Y0, Y2
VMOVDQU Y0, Y3
VMOVDQU Y0, Y4
VMOVDQU Y0, Y5
VMOVDQU Y0, Y0
int8MinBlockLoop:
CMPQ DX, $0x000000c0
JL int8MinTailLoop
VPMINSB (AX), Y1, Y1
VPMINSB 32(AX), Y2, Y2
VPMINSB 64(AX), Y3, Y3
VPMINSB 96(AX), Y4, Y4
VPMINSB 128(AX), Y5, Y5
VPMINSB 160(AX), Y0, Y0
ADDQ $0x000000c0, AX
SUBQ $0x000000c0, DX
JMP int8MinBlockLoop
int8MinTailLoop:
CMPQ DX, $0x00000004
JL int8MinDone
VPMINSB (AX), Y1, Y1
ADDQ $0x00000020, AX
SUBQ $0x00000020, DX
JMP int8MinTailLoop
int8MinDone:
VPMINSB Y1, Y2, Y1
VPMINSB Y1, Y3, Y1
VPMINSB Y1, Y4, Y1
VPMINSB Y1, Y5, Y1
VPMINSB Y1, Y0, Y1
VEXTRACTF128 $0x01, Y1, X0
PMINSB X0, X1
MOVOU X1, (CX)
RET
// func int16MinAvx2Asm(x []int16, r []int16)
// Requires: AVX, AVX2, SSE2
TEXT ·int16MinAvx2Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x0000000000007fff, BX
MOVQ BX, X0
VPBROADCASTW X0, Y0
VMOVDQU Y0, Y1
VMOVDQU Y0, Y2
VMOVDQU Y0, Y3
VMOVDQU Y0, Y4
VMOVDQU Y0, Y5
VMOVDQU Y0, Y0
int16MinBlockLoop:
CMPQ DX, $0x00000060
JL int16MinTailLoop
VPMINSW (AX), Y1, Y1
VPMINSW 32(AX), Y2, Y2
VPMINSW 64(AX), Y3, Y3
VPMINSW 96(AX), Y4, Y4
VPMINSW 128(AX), Y5, Y5
VPMINSW 160(AX), Y0, Y0
ADDQ $0x000000c0, AX
SUBQ $0x00000060, DX
JMP int16MinBlockLoop
int16MinTailLoop:
CMPQ DX, $0x00000004
JL int16MinDone
VPMINSW (AX), Y1, Y1
ADDQ $0x00000020, AX
SUBQ $0x00000010, DX
JMP int16MinTailLoop
int16MinDone:
VPMINSW Y1, Y2, Y1
VPMINSW Y1, Y3, Y1
VPMINSW Y1, Y4, Y1
VPMINSW Y1, Y5, Y1
VPMINSW Y1, Y0, Y1
VEXTRACTF128 $0x01, Y1, X0
PMINSW X0, X1
MOVOU X1, (CX)
RET
// func int32MinAvx2Asm(x []int32, r []int32)
// Requires: AVX, AVX2, SSE2, SSE4.1
TEXT ·int32MinAvx2Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x000000007fffffff, BX
MOVQ BX, X0
VPBROADCASTD X0, Y0
VMOVDQU Y0, Y1
VMOVDQU Y0, Y2
VMOVDQU Y0, Y3
VMOVDQU Y0, Y4
VMOVDQU Y0, Y5
VMOVDQU Y0, Y0
int32MinBlockLoop:
CMPQ DX, $0x00000030
JL int32MinTailLoop
VPMINSD (AX), Y1, Y1
VPMINSD 32(AX), Y2, Y2
VPMINSD 64(AX), Y3, Y3
VPMINSD 96(AX), Y4, Y4
VPMINSD 128(AX), Y5, Y5
VPMINSD 160(AX), Y0, Y0
ADDQ $0x000000c0, AX
SUBQ $0x00000030, DX
JMP int32MinBlockLoop
int32MinTailLoop:
CMPQ DX, $0x00000004
JL int32MinDone
VPMINSD (AX), Y1, Y1
ADDQ $0x00000020, AX
SUBQ $0x00000008, DX
JMP int32MinTailLoop
int32MinDone:
VPMINSD Y1, Y2, Y1
VPMINSD Y1, Y3, Y1
VPMINSD Y1, Y4, Y1
VPMINSD Y1, Y5, Y1
VPMINSD Y1, Y0, Y1
VEXTRACTF128 $0x01, Y1, X0
PMINSD X0, X1
MOVOU X1, (CX)
RET
// func uint8MinAvx2Asm(x []uint8, r []uint8)
// Requires: AVX, AVX2, SSE2
TEXT ·uint8MinAvx2Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0xffffffffffffffff, BX
MOVQ BX, X0
VPBROADCASTB X0, Y0
VMOVDQU Y0, Y1
VMOVDQU Y0, Y2
VMOVDQU Y0, Y3
VMOVDQU Y0, Y4
VMOVDQU Y0, Y5
VMOVDQU Y0, Y0
uint8MinBlockLoop:
CMPQ DX, $0x000000c0
JL uint8MinTailLoop
VPMINUB (AX), Y1, Y1
VPMINUB 32(AX), Y2, Y2
VPMINUB 64(AX), Y3, Y3
VPMINUB 96(AX), Y4, Y4
VPMINUB 128(AX), Y5, Y5
VPMINUB 160(AX), Y0, Y0
ADDQ $0x000000c0, AX
SUBQ $0x000000c0, DX
JMP uint8MinBlockLoop
uint8MinTailLoop:
CMPQ DX, $0x00000004
JL uint8MinDone
VPMINUB (AX), Y1, Y1
ADDQ $0x00000020, AX
SUBQ $0x00000020, DX
JMP uint8MinTailLoop
uint8MinDone:
VPMINUB Y1, Y2, Y1
VPMINUB Y1, Y3, Y1
VPMINUB Y1, Y4, Y1
VPMINUB Y1, Y5, Y1
VPMINUB Y1, Y0, Y1
VEXTRACTF128 $0x01, Y1, X0
PMINUB X0, X1
MOVOU X1, (CX)
RET
// func uint16MinAvx2Asm(x []uint16, r []uint16)
// Requires: AVX, AVX2, SSE2, SSE4.1
TEXT ·uint16MinAvx2Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0xffffffffffffffff, BX
MOVQ BX, X0
VPBROADCASTW X0, Y0
VMOVDQU Y0, Y1
VMOVDQU Y0, Y2
VMOVDQU Y0, Y3
VMOVDQU Y0, Y4
VMOVDQU Y0, Y5
VMOVDQU Y0, Y0
uint16MinBlockLoop:
CMPQ DX, $0x00000060
JL uint16MinTailLoop
VPMINUW (AX), Y1, Y1
VPMINUW 32(AX), Y2, Y2
VPMINUW 64(AX), Y3, Y3
VPMINUW 96(AX), Y4, Y4
VPMINUW 128(AX), Y5, Y5
VPMINUW 160(AX), Y0, Y0
ADDQ $0x000000c0, AX
SUBQ $0x00000060, DX
JMP uint16MinBlockLoop
uint16MinTailLoop:
CMPQ DX, $0x00000004
JL uint16MinDone
VPMINUW (AX), Y1, Y1
ADDQ $0x00000020, AX
SUBQ $0x00000010, DX
JMP uint16MinTailLoop
uint16MinDone:
VPMINUW Y1, Y2, Y1
VPMINUW Y1, Y3, Y1
VPMINUW Y1, Y4, Y1
VPMINUW Y1, Y5, Y1
VPMINUW Y1, Y0, Y1
VEXTRACTF128 $0x01, Y1, X0
PMINUW X0, X1
MOVOU X1, (CX)
RET
// func uint32MinAvx2Asm(x []uint32, r []uint32)
// Requires: AVX, AVX2, SSE2, SSE4.1
TEXT ·uint32MinAvx2Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0xffffffffffffffff, BX
MOVQ BX, X0
VPBROADCASTD X0, Y0
VMOVDQU Y0, Y1
VMOVDQU Y0, Y2
VMOVDQU Y0, Y3
VMOVDQU Y0, Y4
VMOVDQU Y0, Y5
VMOVDQU Y0, Y0
uint32MinBlockLoop:
CMPQ DX, $0x00000030
JL uint32MinTailLoop
VPMINUD (AX), Y1, Y1
VPMINUD 32(AX), Y2, Y2
VPMINUD 64(AX), Y3, Y3
VPMINUD 96(AX), Y4, Y4
VPMINUD 128(AX), Y5, Y5
VPMINUD 160(AX), Y0, Y0
ADDQ $0x000000c0, AX
SUBQ $0x00000030, DX
JMP uint32MinBlockLoop
uint32MinTailLoop:
CMPQ DX, $0x00000004
JL uint32MinDone
VPMINUD (AX), Y1, Y1
ADDQ $0x00000020, AX
SUBQ $0x00000008, DX
JMP uint32MinTailLoop
uint32MinDone:
VPMINUD Y1, Y2, Y1
VPMINUD Y1, Y3, Y1
VPMINUD Y1, Y4, Y1
VPMINUD Y1, Y5, Y1
VPMINUD Y1, Y0, Y1
VEXTRACTF128 $0x01, Y1, X0
PMINUD X0, X1
MOVOU X1, (CX)
RET
// func float32MinAvx2Asm(x []float32, r []float32)
// Requires: AVX, AVX2, SSE, SSE2
TEXT ·float32MinAvx2Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x000000007f7fffff, BX
MOVQ BX, X0
VBROADCASTSS X0, Y0
VMOVUPS Y0, Y1
VMOVUPS Y0, Y2
VMOVUPS Y0, Y3
VMOVUPS Y0, Y4
VMOVUPS Y0, Y5
VMOVUPS Y0, Y0
float32MinBlockLoop:
CMPQ DX, $0x00000030
JL float32MinTailLoop
VMINPS (AX), Y1, Y1
VMINPS 32(AX), Y2, Y2
VMINPS 64(AX), Y3, Y3
VMINPS 96(AX), Y4, Y4
VMINPS 128(AX), Y5, Y5
VMINPS 160(AX), Y0, Y0
ADDQ $0x000000c0, AX
SUBQ $0x00000030, DX
JMP float32MinBlockLoop
float32MinTailLoop:
CMPQ DX, $0x00000004
JL float32MinDone
VMINPS (AX), Y1, Y1
ADDQ $0x00000020, AX
SUBQ $0x00000008, DX
JMP float32MinTailLoop
float32MinDone:
VMINPS Y1, Y2, Y1
VMINPS Y1, Y3, Y1
VMINPS Y1, Y4, Y1
VMINPS Y1, Y5, Y1
VMINPS Y1, Y0, Y1
VEXTRACTF128 $0x01, Y1, X0
MINPS X0, X1
MOVOU X1, (CX)
RET
// func float64MinAvx2Asm(x []float64, r []float64)
// Requires: AVX, AVX2, SSE2
TEXT ·float64MinAvx2Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x7fefffffffffffff, BX
MOVQ BX, X0
VBROADCASTSD X0, Y0
VMOVUPD Y0, Y1
VMOVUPD Y0, Y2
VMOVUPD Y0, Y3
VMOVUPD Y0, Y4
VMOVUPD Y0, Y5
VMOVUPD Y0, Y0
float64MinBlockLoop:
CMPQ DX, $0x00000018
JL float64MinTailLoop
VMINPD (AX), Y1, Y1
VMINPD 32(AX), Y2, Y2
VMINPD 64(AX), Y3, Y3
VMINPD 96(AX), Y4, Y4
VMINPD 128(AX), Y5, Y5
VMINPD 160(AX), Y0, Y0
ADDQ $0x000000c0, AX
SUBQ $0x00000018, DX
JMP float64MinBlockLoop
float64MinTailLoop:
CMPQ DX, $0x00000004
JL float64MinDone
VMINPD (AX), Y1, Y1
ADDQ $0x00000020, AX
SUBQ $0x00000004, DX
JMP float64MinTailLoop
float64MinDone:
VMINPD Y1, Y2, Y1
VMINPD Y1, Y3, Y1
VMINPD Y1, Y4, Y1
VMINPD Y1, Y5, Y1
VMINPD Y1, Y0, Y1
VEXTRACTF128 $0x01, Y1, X0
MINPD X0, X1
MOVOU X1, (CX)
RET
// Code generated by command: go run avx2.go -out min/avx2.s -stubs min/avx2_stubs.go. DO NOT EDIT.
package min
func int8MinAvx2Asm(x []int8, r []int8)
func int16MinAvx2Asm(x []int16, r []int16)
func int32MinAvx2Asm(x []int32, r []int32)
func uint8MinAvx2Asm(x []uint8, r []uint8)
func uint16MinAvx2Asm(x []uint16, r []uint16)
func uint32MinAvx2Asm(x []uint32, r []uint32)
func float32MinAvx2Asm(x []float32, r []float32)
func float64MinAvx2Asm(x []float64, r []float64)
// Code generated by command: go run avx512.go -out avx512.s -stubs avx512_stubs.go. DO NOT EDIT.
#include "textflag.h"
// func int8MinAvx512Asm(x []int8, r []int8)
// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2, SSE4.1
TEXT ·int8MinAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x000000000000007f, BX
MOVQ BX, X0
VPBROADCASTB X0, Z0
VMOVDQU64 Z0, Z1
VMOVDQU64 Z0, Z2
VMOVDQU64 Z0, Z3
VMOVDQU64 Z0, Z4
VMOVDQU64 Z0, Z5
VMOVDQU64 Z0, Z6
VMOVDQU64 Z0, Z7
VMOVDQU64 Z0, Z8
VMOVDQU64 Z0, Z9
VMOVDQU64 Z0, Z10
VMOVDQU64 Z0, Z11
VMOVDQU64 Z0, Z0
int8MinBlockLoop:
CMPQ DX, $0x00000300
JL int8MinTailLoop
VPMINSB (AX), Z1, Z1
VPMINSB 64(AX), Z2, Z2
VPMINSB 128(AX), Z3, Z3
VPMINSB 192(AX), Z4, Z4
VPMINSB 256(AX), Z5, Z5
VPMINSB 320(AX), Z6, Z6
VPMINSB 384(AX), Z7, Z7
VPMINSB 448(AX), Z8, Z8
VPMINSB 512(AX), Z9, Z9
VPMINSB 576(AX), Z10, Z10
VPMINSB 640(AX), Z11, Z11
VPMINSB 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x00000300, DX
JMP int8MinBlockLoop
int8MinTailLoop:
CMPQ DX, $0x00000004
JL int8MinDone
VPMINSB (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000040, DX
JMP int8MinTailLoop
int8MinDone:
VPMINSB Z1, Z2, Z1
VPMINSB Z1, Z3, Z1
VPMINSB Z1, Z4, Z1
VPMINSB Z1, Z5, Z1
VPMINSB Z1, Z6, Z1
VPMINSB Z1, Z7, Z1
VPMINSB Z1, Z8, Z1
VPMINSB Z1, Z9, Z1
VPMINSB Z1, Z10, Z1
VPMINSB Z1, Z11, Z1
VPMINSB Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VPMINSB Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
PMINSB X0, X1
MOVOU X1, (CX)
RET
// func int16MinAvx512Asm(x []int16, r []int16)
// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2
TEXT ·int16MinAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x0000000000007fff, BX
MOVQ BX, X0
VPBROADCASTW X0, Z0
VMOVDQU64 Z0, Z1
VMOVDQU64 Z0, Z2
VMOVDQU64 Z0, Z3
VMOVDQU64 Z0, Z4
VMOVDQU64 Z0, Z5
VMOVDQU64 Z0, Z6
VMOVDQU64 Z0, Z7
VMOVDQU64 Z0, Z8
VMOVDQU64 Z0, Z9
VMOVDQU64 Z0, Z10
VMOVDQU64 Z0, Z11
VMOVDQU64 Z0, Z0
int16MinBlockLoop:
CMPQ DX, $0x00000180
JL int16MinTailLoop
VPMINSW (AX), Z1, Z1
VPMINSW 64(AX), Z2, Z2
VPMINSW 128(AX), Z3, Z3
VPMINSW 192(AX), Z4, Z4
VPMINSW 256(AX), Z5, Z5
VPMINSW 320(AX), Z6, Z6
VPMINSW 384(AX), Z7, Z7
VPMINSW 448(AX), Z8, Z8
VPMINSW 512(AX), Z9, Z9
VPMINSW 576(AX), Z10, Z10
VPMINSW 640(AX), Z11, Z11
VPMINSW 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x00000180, DX
JMP int16MinBlockLoop
int16MinTailLoop:
CMPQ DX, $0x00000004
JL int16MinDone
VPMINSW (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000020, DX
JMP int16MinTailLoop
int16MinDone:
VPMINSW Z1, Z2, Z1
VPMINSW Z1, Z3, Z1
VPMINSW Z1, Z4, Z1
VPMINSW Z1, Z5, Z1
VPMINSW Z1, Z6, Z1
VPMINSW Z1, Z7, Z1
VPMINSW Z1, Z8, Z1
VPMINSW Z1, Z9, Z1
VPMINSW Z1, Z10, Z1
VPMINSW Z1, Z11, Z1
VPMINSW Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VPMINSW Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
PMINSW X0, X1
MOVOU X1, (CX)
RET
// func int32MinAvx512Asm(x []int32, r []int32)
// Requires: AVX, AVX2, AVX512F, SSE2, SSE4.1
TEXT ·int32MinAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x000000007fffffff, BX
MOVQ BX, X0
VPBROADCASTD X0, Z0
VMOVDQU64 Z0, Z1
VMOVDQU64 Z0, Z2
VMOVDQU64 Z0, Z3
VMOVDQU64 Z0, Z4
VMOVDQU64 Z0, Z5
VMOVDQU64 Z0, Z6
VMOVDQU64 Z0, Z7
VMOVDQU64 Z0, Z8
VMOVDQU64 Z0, Z9
VMOVDQU64 Z0, Z10
VMOVDQU64 Z0, Z11
VMOVDQU64 Z0, Z0
int32MinBlockLoop:
CMPQ DX, $0x000000c0
JL int32MinTailLoop
VPMINSD (AX), Z1, Z1
VPMINSD 64(AX), Z2, Z2
VPMINSD 128(AX), Z3, Z3
VPMINSD 192(AX), Z4, Z4
VPMINSD 256(AX), Z5, Z5
VPMINSD 320(AX), Z6, Z6
VPMINSD 384(AX), Z7, Z7
VPMINSD 448(AX), Z8, Z8
VPMINSD 512(AX), Z9, Z9
VPMINSD 576(AX), Z10, Z10
VPMINSD 640(AX), Z11, Z11
VPMINSD 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x000000c0, DX
JMP int32MinBlockLoop
int32MinTailLoop:
CMPQ DX, $0x00000004
JL int32MinDone
VPMINSD (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000010, DX
JMP int32MinTailLoop
int32MinDone:
VPMINSD Z1, Z2, Z1
VPMINSD Z1, Z3, Z1
VPMINSD Z1, Z4, Z1
VPMINSD Z1, Z5, Z1
VPMINSD Z1, Z6, Z1
VPMINSD Z1, Z7, Z1
VPMINSD Z1, Z8, Z1
VPMINSD Z1, Z9, Z1
VPMINSD Z1, Z10, Z1
VPMINSD Z1, Z11, Z1
VPMINSD Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VPMINSD Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
PMINSD X0, X1
MOVOU X1, (CX)
RET
// func int64MinAvx512Asm(x []int64, r []int64)
// Requires: AVX, AVX512F, AVX512VL, SSE2
TEXT ·int64MinAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x7fffffffffffffff, BX
MOVQ BX, X0
VPBROADCASTQ X0, Z0
VMOVDQU64 Z0, Z1
VMOVDQU64 Z0, Z2
VMOVDQU64 Z0, Z3
VMOVDQU64 Z0, Z4
VMOVDQU64 Z0, Z5
VMOVDQU64 Z0, Z6
VMOVDQU64 Z0, Z7
VMOVDQU64 Z0, Z8
VMOVDQU64 Z0, Z9
VMOVDQU64 Z0, Z10
VMOVDQU64 Z0, Z11
VMOVDQU64 Z0, Z0
int64MinBlockLoop:
CMPQ DX, $0x00000060
JL int64MinTailLoop
VPMINSQ (AX), Z1, Z1
VPMINSQ 64(AX), Z2, Z2
VPMINSQ 128(AX), Z3, Z3
VPMINSQ 192(AX), Z4, Z4
VPMINSQ 256(AX), Z5, Z5
VPMINSQ 320(AX), Z6, Z6
VPMINSQ 384(AX), Z7, Z7
VPMINSQ 448(AX), Z8, Z8
VPMINSQ 512(AX), Z9, Z9
VPMINSQ 576(AX), Z10, Z10
VPMINSQ 640(AX), Z11, Z11
VPMINSQ 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x00000060, DX
JMP int64MinBlockLoop
int64MinTailLoop:
CMPQ DX, $0x00000004
JL int64MinDone
VPMINSQ (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000008, DX
JMP int64MinTailLoop
int64MinDone:
VPMINSQ Z1, Z2, Z1
VPMINSQ Z1, Z3, Z1
VPMINSQ Z1, Z4, Z1
VPMINSQ Z1, Z5, Z1
VPMINSQ Z1, Z6, Z1
VPMINSQ Z1, Z7, Z1
VPMINSQ Z1, Z8, Z1
VPMINSQ Z1, Z9, Z1
VPMINSQ Z1, Z10, Z1
VPMINSQ Z1, Z11, Z1
VPMINSQ Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VPMINSQ Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
VPMINSQ X0, X1, X1
MOVOU X1, (CX)
RET
// func uint8MinAvx512Asm(x []uint8, r []uint8)
// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2
TEXT ·uint8MinAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0xffffffffffffffff, BX
MOVQ BX, X0
VPBROADCASTB X0, Z0
VMOVDQU64 Z0, Z1
VMOVDQU64 Z0, Z2
VMOVDQU64 Z0, Z3
VMOVDQU64 Z0, Z4
VMOVDQU64 Z0, Z5
VMOVDQU64 Z0, Z6
VMOVDQU64 Z0, Z7
VMOVDQU64 Z0, Z8
VMOVDQU64 Z0, Z9
VMOVDQU64 Z0, Z10
VMOVDQU64 Z0, Z11
VMOVDQU64 Z0, Z0
uint8MinBlockLoop:
CMPQ DX, $0x00000300
JL uint8MinTailLoop
VPMINUB (AX), Z1, Z1
VPMINUB 64(AX), Z2, Z2
VPMINUB 128(AX), Z3, Z3
VPMINUB 192(AX), Z4, Z4
VPMINUB 256(AX), Z5, Z5
VPMINUB 320(AX), Z6, Z6
VPMINUB 384(AX), Z7, Z7
VPMINUB 448(AX), Z8, Z8
VPMINUB 512(AX), Z9, Z9
VPMINUB 576(AX), Z10, Z10
VPMINUB 640(AX), Z11, Z11
VPMINUB 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x00000300, DX
JMP uint8MinBlockLoop
uint8MinTailLoop:
CMPQ DX, $0x00000004
JL uint8MinDone
VPMINUB (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000040, DX
JMP uint8MinTailLoop
uint8MinDone:
VPMINUB Z1, Z2, Z1
VPMINUB Z1, Z3, Z1
VPMINUB Z1, Z4, Z1
VPMINUB Z1, Z5, Z1
VPMINUB Z1, Z6, Z1
VPMINUB Z1, Z7, Z1
VPMINUB Z1, Z8, Z1
VPMINUB Z1, Z9, Z1
VPMINUB Z1, Z10, Z1
VPMINUB Z1, Z11, Z1
VPMINUB Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VPMINUB Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
PMINUB X0, X1
MOVOU X1, (CX)
RET
// func uint16MinAvx512Asm(x []uint16, r []uint16)
// Requires: AVX, AVX2, AVX512BW, AVX512F, SSE2, SSE4.1
TEXT ·uint16MinAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0xffffffffffffffff, BX
MOVQ BX, X0
VPBROADCASTW X0, Z0
VMOVDQU64 Z0, Z1
VMOVDQU64 Z0, Z2
VMOVDQU64 Z0, Z3
VMOVDQU64 Z0, Z4
VMOVDQU64 Z0, Z5
VMOVDQU64 Z0, Z6
VMOVDQU64 Z0, Z7
VMOVDQU64 Z0, Z8
VMOVDQU64 Z0, Z9
VMOVDQU64 Z0, Z10
VMOVDQU64 Z0, Z11
VMOVDQU64 Z0, Z0
uint16MinBlockLoop:
CMPQ DX, $0x00000180
JL uint16MinTailLoop
VPMINUW (AX), Z1, Z1
VPMINUW 64(AX), Z2, Z2
VPMINUW 128(AX), Z3, Z3
VPMINUW 192(AX), Z4, Z4
VPMINUW 256(AX), Z5, Z5
VPMINUW 320(AX), Z6, Z6
VPMINUW 384(AX), Z7, Z7
VPMINUW 448(AX), Z8, Z8
VPMINUW 512(AX), Z9, Z9
VPMINUW 576(AX), Z10, Z10
VPMINUW 640(AX), Z11, Z11
VPMINUW 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x00000180, DX
JMP uint16MinBlockLoop
uint16MinTailLoop:
CMPQ DX, $0x00000004
JL uint16MinDone
VPMINUW (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000020, DX
JMP uint16MinTailLoop
uint16MinDone:
VPMINUW Z1, Z2, Z1
VPMINUW Z1, Z3, Z1
VPMINUW Z1, Z4, Z1
VPMINUW Z1, Z5, Z1
VPMINUW Z1, Z6, Z1
VPMINUW Z1, Z7, Z1
VPMINUW Z1, Z8, Z1
VPMINUW Z1, Z9, Z1
VPMINUW Z1, Z10, Z1
VPMINUW Z1, Z11, Z1
VPMINUW Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VPMINUW Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
PMINUW X0, X1
MOVOU X1, (CX)
RET
// func uint32MinAvx512Asm(x []uint32, r []uint32)
// Requires: AVX, AVX2, AVX512F, SSE2, SSE4.1
TEXT ·uint32MinAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0xffffffffffffffff, BX
MOVQ BX, X0
VPBROADCASTD X0, Z0
VMOVDQU64 Z0, Z1
VMOVDQU64 Z0, Z2
VMOVDQU64 Z0, Z3
VMOVDQU64 Z0, Z4
VMOVDQU64 Z0, Z5
VMOVDQU64 Z0, Z6
VMOVDQU64 Z0, Z7
VMOVDQU64 Z0, Z8
VMOVDQU64 Z0, Z9
VMOVDQU64 Z0, Z10
VMOVDQU64 Z0, Z11
VMOVDQU64 Z0, Z0
uint32MinBlockLoop:
CMPQ DX, $0x000000c0
JL uint32MinTailLoop
VPMINUD (AX), Z1, Z1
VPMINUD 64(AX), Z2, Z2
VPMINUD 128(AX), Z3, Z3
VPMINUD 192(AX), Z4, Z4
VPMINUD 256(AX), Z5, Z5
VPMINUD 320(AX), Z6, Z6
VPMINUD 384(AX), Z7, Z7
VPMINUD 448(AX), Z8, Z8
VPMINUD 512(AX), Z9, Z9
VPMINUD 576(AX), Z10, Z10
VPMINUD 640(AX), Z11, Z11
VPMINUD 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x000000c0, DX
JMP uint32MinBlockLoop
uint32MinTailLoop:
CMPQ DX, $0x00000004
JL uint32MinDone
VPMINUD (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000010, DX
JMP uint32MinTailLoop
uint32MinDone:
VPMINUD Z1, Z2, Z1
VPMINUD Z1, Z3, Z1
VPMINUD Z1, Z4, Z1
VPMINUD Z1, Z5, Z1
VPMINUD Z1, Z6, Z1
VPMINUD Z1, Z7, Z1
VPMINUD Z1, Z8, Z1
VPMINUD Z1, Z9, Z1
VPMINUD Z1, Z10, Z1
VPMINUD Z1, Z11, Z1
VPMINUD Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VPMINUD Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
PMINUD X0, X1
MOVOU X1, (CX)
RET
// func uint64MinAvx512Asm(x []uint64, r []uint64)
// Requires: AVX, AVX512F, AVX512VL, SSE2
TEXT ·uint64MinAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0xffffffffffffffff, BX
MOVQ BX, X0
VPBROADCASTQ X0, Z0
VMOVDQU64 Z0, Z1
VMOVDQU64 Z0, Z2
VMOVDQU64 Z0, Z3
VMOVDQU64 Z0, Z4
VMOVDQU64 Z0, Z5
VMOVDQU64 Z0, Z6
VMOVDQU64 Z0, Z7
VMOVDQU64 Z0, Z8
VMOVDQU64 Z0, Z9
VMOVDQU64 Z0, Z10
VMOVDQU64 Z0, Z11
VMOVDQU64 Z0, Z0
uint64MinBlockLoop:
CMPQ DX, $0x00000060
JL uint64MinTailLoop
VPMINUQ (AX), Z1, Z1
VPMINUQ 64(AX), Z2, Z2
VPMINUQ 128(AX), Z3, Z3
VPMINUQ 192(AX), Z4, Z4
VPMINUQ 256(AX), Z5, Z5
VPMINUQ 320(AX), Z6, Z6
VPMINUQ 384(AX), Z7, Z7
VPMINUQ 448(AX), Z8, Z8
VPMINUQ 512(AX), Z9, Z9
VPMINUQ 576(AX), Z10, Z10
VPMINUQ 640(AX), Z11, Z11
VPMINUQ 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x00000060, DX
JMP uint64MinBlockLoop
uint64MinTailLoop:
CMPQ DX, $0x00000004
JL uint64MinDone
VPMINUQ (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000008, DX
JMP uint64MinTailLoop
uint64MinDone:
VPMINUQ Z1, Z2, Z1
VPMINUQ Z1, Z3, Z1
VPMINUQ Z1, Z4, Z1
VPMINUQ Z1, Z5, Z1
VPMINUQ Z1, Z6, Z1
VPMINUQ Z1, Z7, Z1
VPMINUQ Z1, Z8, Z1
VPMINUQ Z1, Z9, Z1
VPMINUQ Z1, Z10, Z1
VPMINUQ Z1, Z11, Z1
VPMINUQ Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VPMINUQ Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
VPMINUQ X0, X1, X1
MOVOU X1, (CX)
RET
// func float32MinAvx512Asm(x []float32, r []float32)
// Requires: AVX, AVX512F, SSE, SSE2
TEXT ·float32MinAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x000000007f7fffff, BX
MOVQ BX, X0
VBROADCASTSS X0, Z0
VMOVUPS Z0, Z1
VMOVUPS Z0, Z2
VMOVUPS Z0, Z3
VMOVUPS Z0, Z4
VMOVUPS Z0, Z5
VMOVUPS Z0, Z6
VMOVUPS Z0, Z7
VMOVUPS Z0, Z8
VMOVUPS Z0, Z9
VMOVUPS Z0, Z10
VMOVUPS Z0, Z11
VMOVUPS Z0, Z0
float32MinBlockLoop:
CMPQ DX, $0x000000c0
JL float32MinTailLoop
VMINPS (AX), Z1, Z1
VMINPS 64(AX), Z2, Z2
VMINPS 128(AX), Z3, Z3
VMINPS 192(AX), Z4, Z4
VMINPS 256(AX), Z5, Z5
VMINPS 320(AX), Z6, Z6
VMINPS 384(AX), Z7, Z7
VMINPS 448(AX), Z8, Z8
VMINPS 512(AX), Z9, Z9
VMINPS 576(AX), Z10, Z10
VMINPS 640(AX), Z11, Z11
VMINPS 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x000000c0, DX
JMP float32MinBlockLoop
float32MinTailLoop:
CMPQ DX, $0x00000004
JL float32MinDone
VMINPS (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000010, DX
JMP float32MinTailLoop
float32MinDone:
VMINPS Z1, Z2, Z1
VMINPS Z1, Z3, Z1
VMINPS Z1, Z4, Z1
VMINPS Z1, Z5, Z1
VMINPS Z1, Z6, Z1
VMINPS Z1, Z7, Z1
VMINPS Z1, Z8, Z1
VMINPS Z1, Z9, Z1
VMINPS Z1, Z10, Z1
VMINPS Z1, Z11, Z1
VMINPS Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VMINPS Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
MINPS X0, X1
MOVOU X1, (CX)
RET
// func float64MinAvx512Asm(x []float64, r []float64)
// Requires: AVX, AVX512F, SSE2
TEXT ·float64MinAvx512Asm(SB), NOSPLIT, $0-48
MOVQ x_base+0(FP), AX
MOVQ r_base+24(FP), CX
MOVQ x_len+8(FP), DX
MOVQ $0x7fefffffffffffff, BX
MOVQ BX, X0
VBROADCASTSD X0, Z0
VMOVUPD Z0, Z1
VMOVUPD Z0, Z2
VMOVUPD Z0, Z3
VMOVUPD Z0, Z4
VMOVUPD Z0, Z5
VMOVUPD Z0, Z6
VMOVUPD Z0, Z7
VMOVUPD Z0, Z8
VMOVUPD Z0, Z9
VMOVUPD Z0, Z10
VMOVUPD Z0, Z11
VMOVUPD Z0, Z0
float64MinBlockLoop:
CMPQ DX, $0x00000060
JL float64MinTailLoop
VMINPD (AX), Z1, Z1
VMINPD 64(AX), Z2, Z2
VMINPD 128(AX), Z3, Z3
VMINPD 192(AX), Z4, Z4
VMINPD 256(AX), Z5, Z5
VMINPD 320(AX), Z6, Z6
VMINPD 384(AX), Z7, Z7
VMINPD 448(AX), Z8, Z8
VMINPD 512(AX), Z9, Z9
VMINPD 576(AX), Z10, Z10
VMINPD 640(AX), Z11, Z11
VMINPD 704(AX), Z0, Z0
ADDQ $0x00000300, AX
SUBQ $0x00000060, DX
JMP float64MinBlockLoop
float64MinTailLoop:
CMPQ DX, $0x00000004
JL float64MinDone
VMINPD (AX), Z1, Z1
ADDQ $0x00000040, AX
SUBQ $0x00000008, DX
JMP float64MinTailLoop
float64MinDone:
VMINPD Z1, Z2, Z1
VMINPD Z1, Z3, Z1
VMINPD Z1, Z4, Z1
VMINPD Z1, Z5, Z1
VMINPD Z1, Z6, Z1
VMINPD Z1, Z7, Z1
VMINPD Z1, Z8, Z1
VMINPD Z1, Z9, Z1
VMINPD Z1, Z10, Z1
VMINPD Z1, Z11, Z1
VMINPD Z1, Z0, Z1
VEXTRACTI64X4 $0x01, Z1, Y0
VMINPD Y0, Y1, Y1
VEXTRACTF128 $0x01, Y1, X0
MINPD X0, X1
MOVOU X1, (CX)
RET
// Code generated by command: go run avx512.go -out avx512.s -stubs avx512_stubs.go. DO NOT EDIT.
package min
func int8MinAvx512Asm(x []int8, r []int8)
func int16MinAvx512Asm(x []int16, r []int16)
func int32MinAvx512Asm(x []int32, r []int32)
func int64MinAvx512Asm(x []int64, r []int64)
func uint8MinAvx512Asm(x []uint8, r []uint8)
func uint16MinAvx512Asm(x []uint16, r []uint16)
func uint32MinAvx512Asm(x []uint32, r []uint32)
func uint64MinAvx512Asm(x []uint64, r []uint64)
func float32MinAvx512Asm(x []float32, r []float32)
func float64MinAvx512Asm(x []float64, r []float64)
package min
import (
"bytes"
"matrixbase/pkg/container/vector"
"matrixbase/pkg/container/types"
"golang.org/x/sys/cpu"
)
var (
boolMin func([]bool) bool
boolMinSels func([]bool, []int64) bool
int8Min func([]int8) int8
int8MinSels func([]int8, []int64) int8
int16Min func([]int16) int16
int16MinSels func([]int16, []int64) int16
int32Min func([]int32) int32
int32MinSels func([]int32, []int64) int32
int64Min func([]int64) int64
int64MinSels func([]int64, []int64) int64
uint8Min func([]uint8) uint8
uint8MinSels func([]uint8, []int64) uint8
uint16Min func([]uint16) uint16
uint16MinSels func([]uint16, []int64) uint16
uint32Min func([]uint32) uint32
uint32MinSels func([]uint32, []int64) uint32
uint64Min func([]uint64) uint64
uint64MinSels func([]uint64, []int64) uint64
float32Min func([]float32) float32
float32MinSels func([]float32, []int64) float32
float64Min func([]float64) float64
float64MinSels func([]float64, []int64) float64
strMin func(*types.Bytes) []byte
strMinSels func(*types.Bytes, []int64) []byte
)
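// Dispatch note: each exported Min entry point calls through one of the
// function variables above; init binds them to a pure-Go, AVX2, or AVX512
// backend depending on the CPU features reported by golang.org/x/sys/cpu.
// The *Sels (selection-vector) variants are always bound to the scalar
// implementations.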
func init() {
if cpu.X86.HasAVX512 {
int8Min = int8MinAvx512
int16Min = int16MinAvx512
int32Min = int32MinAvx512
int64Min = int64MinAvx512
uint8Min = uint8MinAvx512
uint16Min = uint16MinAvx512
uint32Min = uint32MinAvx512
uint64Min = uint64MinAvx512
float32Min = float32MinAvx512
float64Min = float64MinAvx512
} else if cpu.X86.HasAVX2 {
int8Min = int8MinAvx2
int16Min = int16MinAvx2
int32Min = int32MinAvx2
int64Min = int64MinPure
uint8Min = uint8MinAvx2
uint16Min = uint16MinAvx2
uint32Min = uint32MinAvx2
uint64Min = uint64MinPure
float32Min = float32MinAvx2
float64Min = float64MinAvx2
} else {
int8Min = int8MinPure
int16Min = int16MinPure
int32Min = int32MinPure
int64Min = int64MinPure
uint8Min = uint8MinPure
uint16Min = uint16MinPure
uint32Min = uint32MinPure
uint64Min = uint64MinPure
float32Min = float32MinPure
float64Min = float64MinPure
}
boolMin = boolMinPure
strMin = strMinPure
boolMinSels = boolMinSelsPure
int8MinSels = int8MinSelsPure
int16MinSels = int16MinSelsPure
int32MinSels = int32MinSelsPure
int64MinSels = int64MinSelsPure
uint8MinSels = uint8MinSelsPure
uint16MinSels = uint16MinSelsPure
uint32MinSels = uint32MinSelsPure
uint64MinSels = uint64MinSelsPure
float32MinSels = float32MinSelsPure
float64MinSels = float64MinSelsPure
strMinSels = strMinSelsPure
}
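// minExampleUsage is an illustrative sketch added for documentation purposes
// only; it is not part of the original file and the slice contents are
// hypothetical. It shows that callers use the exported entry points and never
// select a backend themselves: whichever implementation init bound is used.
func minExampleUsage() int32 {
	xs := []int32{7, -3, 42, 0, -3}
	return Int32Min(xs) // -3, via the pure, AVX2, or AVX512 path picked at init
}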
func BoolMin(xs []bool) bool {
return boolMin(xs)
}
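// boolMinPure treats false < true, so the minimum of a bool slice is false as
// soon as any element is false, and true only when every element is true.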
func boolMinPure(xs []bool) bool {
for _, x := range xs {
if !x {
return false
}
}
return true
}
func BoolMinSels(xs []bool, sels []int64) bool {
return boolMinSels(xs, sels)
}
func boolMinSelsPure(xs []bool, sels []int64) bool {
for _, sel := range sels {
if !xs[sel] {
return false
}
}
return true
}
func Int8Min(xs []int8) int8 {
return int8Min(xs)
}
func int8MinPure(xs []int8) int8 {
res := xs[0]
for _, x := range xs {
if x < res {
res = x
}
}
return res
}
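// The Avx2/Avx512 wrappers below all follow the same pattern: regItems is the
// number of elements per vector register (32-byte YMM or 64-byte ZMM divided
// by the element size), the assembly routine folds the first n*regItems
// elements into 16 bytes of per-lane partial minima written to rs, and the Go
// code then reduces those lanes together with the remaining tail elements.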
func int8MinAvx2(xs []int8) int8 {
const regItems int = 32 / 1
n := len(xs) / regItems
var rs [16]int8
int8MinAvx2Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 16; i++ {
if rs[i] < res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] < res {
res = xs[i]
}
}
return res
}
func int8MinAvx512(xs []int8) int8 {
const regItems int = 64 / 1
n := len(xs) / regItems
var rs [16]int8
int8MinAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 16; i++ {
if rs[i] < res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] < res {
res = xs[i]
}
}
return res
}
func Int8MinSels(xs []int8, sels []int64) int8 {
return int8MinSels(xs, sels)
}
func int8MinSelsPure(xs []int8, sels []int64) int8 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x < res {
res = x
}
}
return res
}
func Int16Min(xs []int16) int16 {
return int16Min(xs)
}
func int16MinPure(xs []int16) int16 {
res := xs[0]
for _, x := range xs {
if x < res {
res = x
}
}
return res
}
func int16MinAvx2(xs []int16) int16 {
const regItems int = 32 / 2
n := len(xs) / regItems
var rs [8]int16
int16MinAvx2Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 8; i++ {
if rs[i] < res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] < res {
res = xs[i]
}
}
return res
}
func int16MinAvx512(xs []int16) int16 {
const regItems int = 64 / 2
n := len(xs) / regItems
var rs [8]int16
int16MinAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 8; i++ {
if rs[i] < res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] < res {
res = xs[i]
}
}
return res
}
func Int16MinSels(xs []int16, sels []int64) int16 {
return int16MinSels(xs, sels)
}
func int16MinSelsPure(xs []int16, sels []int64) int16 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x < res {
res = x
}
}
return res
}
func Int32Min(xs []int32) int32 {
return int32Min(xs)
}
func int32MinPure(xs []int32) int32 {
res := xs[0]
for _, x := range xs {
if x < res {
res = x
}
}
return res
}
func int32MinAvx2(xs []int32) int32 {
const regItems int = 32 / 4
n := len(xs) / regItems
var rs [4]int32
int32MinAvx2Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 4; i++ {
if rs[i] < res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] < res {
res = xs[i]
}
}
return res
}
func int32MinAvx512(xs []int32) int32 {
const regItems int = 64 / 4
n := len(xs) / regItems
var rs [4]int32
int32MinAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 4; i++ {
if rs[i] < res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] < res {
res = xs[i]
}
}
return res
}
func Int32MinSels(xs []int32, sels []int64) int32 {
return int32MinSels(xs, sels)
}
func int32MinSelsPure(xs []int32, sels []int64) int32 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x < res {
res = x
}
}
return res
}
func Int64Min(xs []int64) int64 {
return int64Min(xs)
}
func int64MinPure(xs []int64) int64 {
res := xs[0]
for _, x := range xs {
if x < res {
res = x
}
}
return res
}
func int64MinAvx512(xs []int64) int64 {
const regItems int = 64 / 8
n := len(xs) / regItems
var rs [2]int64
int64MinAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 2; i++ {
if rs[i] < res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] < res {
res = xs[i]
}
}
return res
}
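// Only an AVX512 variant exists for 64-bit integers: AVX2 has no packed
// 64-bit min instruction (VPMINSQ/VPMINUQ require AVX-512), so init falls
// back to int64MinPure and uint64MinPure on AVX2-only machines.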
func Int64MinSels(xs []int64, sels []int64) int64 {
return int64MinSels(xs, sels)
}
func int64MinSelsPure(xs []int64, sels []int64) int64 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x < res {
res = x
}
}
return res
}
func Uint8Min(xs []uint8) uint8 {
return uint8Min(xs)
}
func uint8MinPure(xs []uint8) uint8 {
res := xs[0]
for _, x := range xs {
if x < res {
res = x
}
}
return res
}
func uint8MinAvx2(xs []uint8) uint8 {
const regItems int = 32 / 1
n := len(xs) / regItems
var rs [16]uint8
uint8MinAvx2Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 16; i++ {
if rs[i] < res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] < res {
res = xs[i]
}
}
return res
}
func uint8MinAvx512(xs []uint8) uint8 {
const regItems int = 64 / 1
n := len(xs) / regItems
var rs [16]uint8
uint8MinAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 16; i++ {
if rs[i] < res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] < res {
res = xs[i]
}
}
return res
}
func Uint8MinSels(xs []uint8, sels []int64) uint8 {
return uint8MinSels(xs, sels)
}
func uint8MinSelsPure(xs []uint8, sels []int64) uint8 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x < res {
res = x
}
}
return res
}
func Uint16Min(xs []uint16) uint16 {
return uint16Min(xs)
}
func uint16MinPure(xs []uint16) uint16 {
res := xs[0]
for _, x := range xs {
if x < res {
res = x
}
}
return res
}
func uint16MinAvx2(xs []uint16) uint16 {
const regItems int = 32 / 2
n := len(xs) / regItems
var rs [8]uint16
uint16MinAvx2Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 8; i++ {
if rs[i] < res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] < res {
res = xs[i]
}
}
return res
}
func uint16MinAvx512(xs []uint16) uint16 {
const regItems int = 64 / 2
n := len(xs) / regItems
var rs [8]uint16
uint16MinAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 8; i++ {
if rs[i] < res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] < res {
res = xs[i]
}
}
return res
}
func Uint16MinSels(xs []uint16, sels []int64) uint16 {
return uint16MinSels(xs, sels)
}
func uint16MinSelsPure(xs []uint16, sels []int64) uint16 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x < res {
res = x
}
}
return res
}
func Uint32Min(xs []uint32) uint32 {
return uint32Min(xs)
}
func uint32MinPure(xs []uint32) uint32 {
res := xs[0]
for _, x := range xs {
if x < res {
res = x
}
}
return res
}
func uint32MinAvx2(xs []uint32) uint32 {
const regItems int = 32 / 4
n := len(xs) / regItems
var rs [4]uint32
uint32MinAvx2Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 4; i++ {
if rs[i] < res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] < res {
res = xs[i]
}
}
return res
}
func uint32MinAvx512(xs []uint32) uint32 {
const regItems int = 64 / 4
n := len(xs) / regItems
var rs [4]uint32
uint32MinAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 4; i++ {
if rs[i] < res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] < res {
res = xs[i]
}
}
return res
}
func Uint32MinSels(xs []uint32, sels []int64) uint32 {
return uint32MinSels(xs, sels)
}
func uint32MinSelsPure(xs []uint32, sels []int64) uint32 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x < res {
res = x
}
}
return res
}
func Uint64Min(xs []uint64) uint64 {
return uint64Min(xs)
}
func uint64MinPure(xs []uint64) uint64 {
res := xs[0]
for _, x := range xs {
if x < res {
res = x
}
}
return res
}
func uint64MinAvx512(xs []uint64) uint64 {
const regItems int = 64 / 8
n := len(xs) / regItems
var rs [2]uint64
uint64MinAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 2; i++ {
if rs[i] < res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] < res {
res = xs[i]
}
}
return res
}
func Uint64MinSels(xs []uint64, sels []int64) uint64 {
return uint64MinSels(xs, sels)
}
func uint64MinSelsPure(xs []uint64, sels []int64) uint64 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x < res {
res = x
}
}
return res
}
func Float32Min(xs []float32) float32 {
return float32Min(xs)
}
func float32MinPure(xs []float32) float32 {
res := xs[0]
for _, x := range xs {
if x < res {
res = x
}
}
return res
}
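// Caveat (not stated in the original source): the float paths use the x86
// MINPS/MINPD family, which returns its second operand when either input is
// NaN, so results on NaN-containing input can differ from the plain `<`
// comparison in the pure-Go fallback. Callers that must handle NaNs
// deterministically may prefer the pure implementations.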
func float32MinAvx2(xs []float32) float32 {
const regItems int = 32 / 4
n := len(xs) / regItems
var rs [4]float32
float32MinAvx2Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 4; i++ {
if rs[i] < res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] < res {
res = xs[i]
}
}
return res
}
func float32MinAvx512(xs []float32) float32 {
const regItems int = 64 / 4
n := len(xs) / regItems
var rs [4]float32
float32MinAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 4; i++ {
if rs[i] < res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] < res {
res = xs[i]
}
}
return res
}
func Float32MinSels(xs []float32, sels []int64) float32 {
return float32MinSels(xs, sels)
}
func float32MinSelsPure(xs []float32, sels []int64) float32 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x < res {
res = x
}
}
return res
}
func Float64Min(xs []float64) float64 {
return float64Min(xs)
}
func float64MinPure(xs []float64) float64 {
res := xs[0]
for _, x := range xs {
if x < res {
res = x
}
}
return res
}
func float64MinAvx2(xs []float64) float64 {
const regItems int = 32 / 8
n := len(xs) / regItems
var rs [2]float64
float64MinAvx2Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 2; i++ {
if rs[i] < res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] < res {
res = xs[i]
}
}
return res
}
func float64MinAvx512(xs []float64) float64 {
const regItems int = 64 / 8
n := len(xs) / regItems
var rs [2]float64
float64MinAvx512Asm(xs[:n*regItems], rs[:])
res := rs[0]
for i := 1; i < 2; i++ {
if rs[i] < res {
res = rs[i]
}
}
for i, j := n*regItems, len(xs); i < j; i++ {
if xs[i] < res {
res = xs[i]
}
}
return res
}
func Float64MinSels(xs []float64, sels []int64) float64 {
return float64MinSels(xs, sels)
}
func float64MinSelsPure(xs []float64, sels []int64) float64 {
res := xs[sels[0]]
for _, sel := range sels {
x := xs[sel]
if x < res {
res = x
}
}
return res
}
func StrMin(xs *types.Bytes) []byte {
return strMin(xs)
}
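// strMinPure and strMinSelsPure operate on variable-length byte slices via
// types.Bytes.Get and bytes.Compare and keep the lexicographically smallest
// value; init always binds strMin and strMinSels to these scalar versions, as
// there is no vectorized variant for variable-length data.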
func strMinPure(xs *types.Bytes) []byte {
res := xs.Get(0)
for i, n := 0, len(xs.Offsets); i < n; i++ {
x := xs.Get(i)
if bytes.Compare(x, res) < 0 {
res = x
}
}
return res
}
func StrMinSels(xs *types.Bytes, sels []int64) []byte {
return strMinSels(xs, sels)
}
func strMinSelsPure(xs *types.Bytes, sels []int64) []byte {
res := xs.Get(int(sels[0]))
for _, sel := range sels {
x := xs.Get(int(sel))
if bytes.Compare(x, res) < 0 {
res = x
}
}
return res
}