diff --git a/pkg/vectorize/sub/sub.go b/pkg/vectorize/sub/sub.go
index 51f49b1fd3f44ca11033077666a675cbd3636dac..9a095e3f427a4dd549a79d0755f7c513fcf6175a 100644
--- a/pkg/vectorize/sub/sub.go
+++ b/pkg/vectorize/sub/sub.go
@@ -1,2141 +1,2141 @@
-package minus
+package sub
 
 import "golang.org/x/sys/cpu"
 
 var (
-    int8Minus func([]int8, []int8, []int8) []int8
-    int8MinusSels func([]int8, []int8, []int8, []int64) []int8
-    int8MinusScalar func(int8, []int8, []int8) []int8
-    int8MinusScalarSels func(int8, []int8, []int8, []int64) []int8
-    int8MinusByScalar func(int8, []int8, []int8) []int8
-    int8MinusByScalarSels func(int8, []int8, []int8, []int64) []int8
-    int16Minus func([]int16, []int16, []int16) []int16
-    int16MinusSels func([]int16, []int16, []int16, []int64) []int16
-    int16MinusScalar func(int16, []int16, []int16) []int16
-    int16MinusScalarSels func(int16, []int16, []int16, []int64) []int16
-    int16MinusByScalar func(int16, []int16, []int16) []int16
-    int16MinusByScalarSels func(int16, []int16, []int16, []int64) []int16
-    int32Minus func([]int32, []int32, []int32) []int32
-    int32MinusSels func([]int32, []int32, []int32, []int64) []int32
-    int32MinusScalar func(int32, []int32, []int32) []int32
-    int32MinusScalarSels func(int32, []int32, []int32, []int64) []int32
-    int32MinusByScalar func(int32, []int32, []int32) []int32
-    int32MinusByScalarSels func(int32, []int32, []int32, []int64) []int32
-    int64Minus func([]int64, []int64, []int64) []int64
-    int64MinusSels func([]int64, []int64, []int64, []int64) []int64
-    int64MinusScalar func(int64, []int64, []int64) []int64
-    int64MinusScalarSels func(int64, []int64, []int64, []int64) []int64
-    int64MinusByScalar func(int64, []int64, []int64) []int64
-    int64MinusByScalarSels func(int64, []int64, []int64, []int64) []int64
-    uint8Minus func([]uint8, []uint8, []uint8) []uint8
-    uint8MinusSels func([]uint8, []uint8, []uint8, []int64) []uint8
-    uint8MinusScalar func(uint8, []uint8, []uint8) []uint8
-    uint8MinusScalarSels func(uint8, []uint8, []uint8, []int64) []uint8
-    uint8MinusByScalar func(uint8, []uint8, []uint8) []uint8
-    uint8MinusByScalarSels func(uint8, []uint8, []uint8, []int64) []uint8
-    uint16Minus func([]uint16, []uint16, []uint16) []uint16
-    uint16MinusSels func([]uint16, []uint16, []uint16, []int64) []uint16
-    uint16MinusScalar func(uint16, []uint16, []uint16) []uint16
-    uint16MinusScalarSels func(uint16, []uint16, []uint16, []int64) []uint16
-    uint16MinusByScalar func(uint16, []uint16, []uint16) []uint16
-    uint16MinusByScalarSels func(uint16, []uint16, []uint16, []int64) []uint16
-    uint32Minus func([]uint32, []uint32, []uint32) []uint32
-    uint32MinusSels func([]uint32, []uint32, []uint32, []int64) []uint32
-    uint32MinusScalar func(uint32, []uint32, []uint32) []uint32
-    uint32MinusScalarSels func(uint32, []uint32, []uint32, []int64) []uint32
-    uint32MinusByScalar func(uint32, []uint32, []uint32) []uint32
-    uint32MinusByScalarSels func(uint32, []uint32, []uint32, []int64) []uint32
-    uint64Minus func([]uint64, []uint64, []uint64) []uint64
-    uint64MinusSels func([]uint64, []uint64, []uint64, []int64) []uint64
-    uint64MinusScalar func(uint64, []uint64, []uint64) []uint64
-    uint64MinusScalarSels func(uint64, []uint64, []uint64, []int64) []uint64
-    uint64MinusByScalar func(uint64, []uint64, []uint64) []uint64
-    uint64MinusByScalarSels func(uint64, []uint64, []uint64, []int64) []uint64
-    float32Minus func([]float32, []float32, []float32) []float32
-    float32MinusSels func([]float32, []float32, []float32, []int64) []float32
-    float32MinusScalar func(float32, []float32, []float32) []float32
-    float32MinusScalarSels func(float32, []float32, []float32, []int64) []float32
-    float32MinusByScalar func(float32, []float32, []float32) []float32
-    float32MinusByScalarSels func(float32, []float32, []float32, []int64) []float32
-    float64Minus func([]float64, []float64, []float64) []float64
-    float64MinusSels func([]float64, []float64, []float64, []int64) []float64
-    float64MinusScalar func(float64, []float64, []float64) []float64
-    float64MinusScalarSels func(float64, []float64, []float64, []int64) []float64
-    float64MinusByScalar func(float64, []float64, []float64) []float64
-    float64MinusByScalarSels func(float64, []float64, []float64, []int64) []float64
+    int8Sub func([]int8, []int8, []int8) []int8
+    int8SubSels func([]int8, []int8, []int8, []int64) []int8
+    int8SubScalar func(int8, []int8, []int8) []int8
+    int8SubScalarSels func(int8, []int8, []int8, []int64) []int8
+    int8SubByScalar func(int8, []int8, []int8) []int8
+    int8SubByScalarSels func(int8, []int8, []int8, []int64) []int8
+    int16Sub func([]int16, []int16, []int16) []int16
+    int16SubSels func([]int16, []int16, []int16, []int64) []int16
+    int16SubScalar func(int16, []int16, []int16) []int16
+    int16SubScalarSels func(int16, []int16, []int16, []int64) []int16
+    int16SubByScalar func(int16, []int16, []int16) []int16
+    int16SubByScalarSels func(int16, []int16, []int16, []int64) []int16
+    int32Sub func([]int32, []int32, []int32) []int32
+    int32SubSels func([]int32, []int32, []int32, []int64) []int32
+    int32SubScalar func(int32, []int32, []int32) []int32
+    int32SubScalarSels func(int32, []int32, []int32, []int64) []int32
+    int32SubByScalar func(int32, []int32, []int32) []int32
+    int32SubByScalarSels func(int32, []int32, []int32, []int64) []int32
+    int64Sub func([]int64, []int64, []int64) []int64
+    int64SubSels func([]int64, []int64, []int64, []int64) []int64
+    int64SubScalar func(int64, []int64, []int64) []int64
+    int64SubScalarSels func(int64, []int64, []int64, []int64) []int64
+    int64SubByScalar func(int64, []int64, []int64) []int64
+    int64SubByScalarSels func(int64, []int64, []int64, []int64) []int64
+    uint8Sub func([]uint8, []uint8, []uint8) []uint8
+    uint8SubSels func([]uint8, []uint8, []uint8, []int64) []uint8
+    uint8SubScalar func(uint8, []uint8, []uint8) []uint8
+    uint8SubScalarSels func(uint8, []uint8, []uint8, []int64) []uint8
+    uint8SubByScalar func(uint8, []uint8, []uint8) []uint8
+    uint8SubByScalarSels func(uint8, []uint8, []uint8, []int64) []uint8
+    uint16Sub func([]uint16, []uint16, []uint16) []uint16
+    uint16SubSels func([]uint16, []uint16, []uint16, []int64) []uint16
+    uint16SubScalar func(uint16, []uint16, []uint16) []uint16
+    uint16SubScalarSels func(uint16, []uint16, []uint16, []int64) []uint16
+    uint16SubByScalar func(uint16, []uint16, []uint16) []uint16
+    uint16SubByScalarSels func(uint16, []uint16, []uint16, []int64) []uint16
+    uint32Sub func([]uint32, []uint32, []uint32) []uint32
+    uint32SubSels func([]uint32, []uint32, []uint32, []int64) []uint32
+    uint32SubScalar func(uint32, []uint32, []uint32) []uint32
+    uint32SubScalarSels func(uint32, []uint32, []uint32, []int64) []uint32
+    uint32SubByScalar func(uint32, []uint32, []uint32) []uint32
+    uint32SubByScalarSels func(uint32, []uint32, []uint32, []int64) []uint32
+    uint64Sub func([]uint64, []uint64, []uint64) []uint64
+    uint64SubSels func([]uint64, []uint64, []uint64, []int64) []uint64
+    uint64SubScalar func(uint64, []uint64, []uint64) []uint64
+    uint64SubScalarSels func(uint64, []uint64, []uint64, []int64) []uint64
+    uint64SubByScalar func(uint64, []uint64, []uint64) []uint64
+    uint64SubByScalarSels func(uint64, []uint64, []uint64, []int64) []uint64
+    float32Sub func([]float32, []float32, []float32) []float32
+    float32SubSels func([]float32, []float32, []float32, []int64) []float32
+    float32SubScalar func(float32, []float32, []float32) []float32
+    float32SubScalarSels func(float32, []float32, []float32, []int64) []float32
+    float32SubByScalar func(float32, []float32, []float32) []float32
+    float32SubByScalarSels func(float32, []float32, []float32, []int64) []float32
+    float64Sub func([]float64, []float64, []float64) []float64
+    float64SubSels func([]float64, []float64, []float64, []int64) []float64
+    float64SubScalar func(float64, []float64, []float64) []float64
+    float64SubScalarSels func(float64, []float64, []float64, []int64) []float64
+    float64SubByScalar func(float64, []float64, []float64) []float64
+    float64SubByScalarSels func(float64, []float64, []float64, []int64) []float64
 )
 
 func init() {
 	if cpu.X86.HasAVX512 {
-        int8Minus = int8MinusAvx512
-        //int8MinusSels = int8MinusSelsAvx512
-        int8MinusScalar = int8MinusScalarAvx512
-        //int8MinusScalarSels = int8MinusScalarSelsAvx512
-        int8MinusByScalar = int8MinusByScalarAvx512
-        //int8MinusByScalarSels = int8MinusByScalarSelsAvx512
-        int16Minus = int16MinusAvx512
-        //int16MinusSels = int16MinusSelsAvx512
-        int16MinusScalar = int16MinusScalarAvx512
-        //int16MinusScalarSels = int16MinusScalarSelsAvx512
-        int16MinusByScalar = int16MinusByScalarAvx512
-        //int16MinusByScalarSels = int16MinusByScalarSelsAvx512
-        int32Minus = int32MinusAvx512
-        //int32MinusSels = int32MinusSelsAvx512
-        int32MinusScalar = int32MinusScalarAvx512
-        //int32MinusScalarSels = int32MinusScalarSelsAvx512
-        int32MinusByScalar = int32MinusByScalarAvx512
-        //int32MinusByScalarSels = int32MinusByScalarSelsAvx512
-        int64Minus = int64MinusAvx512
-        //int64MinusSels = int64MinusSelsAvx512
-        int64MinusScalar = int64MinusScalarAvx512
-        //int64MinusScalarSels = int64MinusScalarSelsAvx512
-        int64MinusByScalar = int64MinusByScalarAvx512
-        //int64MinusByScalarSels = int64MinusByScalarSelsAvx512
-        uint8Minus = uint8MinusAvx512
-        //uint8MinusSels = uint8MinusSelsAvx512
-        uint8MinusScalar = uint8MinusScalarAvx512
-        //uint8MinusScalarSels = uint8MinusScalarSelsAvx512
-        uint8MinusByScalar = uint8MinusByScalarAvx512
-        //uint8MinusByScalarSels = uint8MinusByScalarSelsAvx512
-        uint16Minus = uint16MinusAvx512
-        //uint16MinusSels = uint16MinusSelsAvx512
-        uint16MinusScalar = uint16MinusScalarAvx512
-        //uint16MinusScalarSels = uint16MinusScalarSelsAvx512
-        uint16MinusByScalar = uint16MinusByScalarAvx512
-        //uint16MinusByScalarSels = uint16MinusByScalarSelsAvx512
-        uint32Minus = uint32MinusAvx512
-        //uint32MinusSels = uint32MinusSelsAvx512
-        uint32MinusScalar = uint32MinusScalarAvx512
-        //uint32MinusScalarSels = uint32MinusScalarSelsAvx512
-        uint32MinusByScalar = uint32MinusByScalarAvx512
-        //uint32MinusByScalarSels = uint32MinusByScalarSelsAvx512
-        uint64Minus = uint64MinusAvx512
-        //uint64MinusSels = uint64MinusSelsAvx512
-        uint64MinusScalar = uint64MinusScalarAvx512
-        //uint64MinusScalarSels = uint64MinusScalarSelsAvx512
-        uint64MinusByScalar = uint64MinusByScalarAvx512
-        //uint64MinusByScalarSels = uint64MinusByScalarSelsAvx512
-        float32Minus = float32MinusAvx512
-        //float32MinusSels = float32MinusSelsAvx512
-        float32MinusScalar = float32MinusScalarAvx512
-        //float32MinusScalarSels = float32MinusScalarSelsAvx512
-        float32MinusByScalar = float32MinusByScalarAvx512
-        //float32MinusByScalarSels = float32MinusByScalarSelsAvx512
-        float64Minus = float64MinusAvx512
-        //float64MinusSels = float64MinusSelsAvx512
-        float64MinusScalar = float64MinusScalarAvx512
-        //float64MinusScalarSels = float64MinusScalarSelsAvx512
-        float64MinusByScalar = float64MinusByScalarAvx512
-        //float64MinusByScalarSels = float64MinusByScalarSelsAvx512
+        int8Sub = int8SubAvx512
+        //int8SubSels = int8SubSelsAvx512
+        int8SubScalar = int8SubScalarAvx512
+        //int8SubScalarSels = int8SubScalarSelsAvx512
+        int8SubByScalar = int8SubByScalarAvx512
+        //int8SubByScalarSels = int8SubByScalarSelsAvx512
+        int16Sub = int16SubAvx512
+        //int16SubSels = int16SubSelsAvx512
+        int16SubScalar = int16SubScalarAvx512
+        //int16SubScalarSels = int16SubScalarSelsAvx512
+        int16SubByScalar = int16SubByScalarAvx512
+        //int16SubByScalarSels = int16SubByScalarSelsAvx512
+        int32Sub = int32SubAvx512
+        //int32SubSels = int32SubSelsAvx512
+        int32SubScalar = int32SubScalarAvx512
+        //int32SubScalarSels = int32SubScalarSelsAvx512
+        int32SubByScalar = int32SubByScalarAvx512
+        //int32SubByScalarSels = int32SubByScalarSelsAvx512
+        int64Sub = int64SubAvx512
+        //int64SubSels = int64SubSelsAvx512
+        int64SubScalar = int64SubScalarAvx512
+        //int64SubScalarSels = int64SubScalarSelsAvx512
+        int64SubByScalar = int64SubByScalarAvx512
+        //int64SubByScalarSels = int64SubByScalarSelsAvx512
+        uint8Sub = uint8SubAvx512
+        //uint8SubSels = uint8SubSelsAvx512
+        uint8SubScalar = uint8SubScalarAvx512
+        //uint8SubScalarSels = uint8SubScalarSelsAvx512
+        uint8SubByScalar = uint8SubByScalarAvx512
+        //uint8SubByScalarSels = uint8SubByScalarSelsAvx512
+        uint16Sub = uint16SubAvx512
+        //uint16SubSels = uint16SubSelsAvx512
+        uint16SubScalar = uint16SubScalarAvx512
+        //uint16SubScalarSels = uint16SubScalarSelsAvx512
+        uint16SubByScalar = uint16SubByScalarAvx512
+        //uint16SubByScalarSels = uint16SubByScalarSelsAvx512
+        uint32Sub = uint32SubAvx512
+        //uint32SubSels = uint32SubSelsAvx512
+        uint32SubScalar = uint32SubScalarAvx512
+        //uint32SubScalarSels = uint32SubScalarSelsAvx512
+        uint32SubByScalar = uint32SubByScalarAvx512
+        //uint32SubByScalarSels = uint32SubByScalarSelsAvx512
+        uint64Sub = uint64SubAvx512
+        //uint64SubSels = uint64SubSelsAvx512
+        uint64SubScalar = uint64SubScalarAvx512
+        //uint64SubScalarSels = uint64SubScalarSelsAvx512
+        uint64SubByScalar = uint64SubByScalarAvx512
+        //uint64SubByScalarSels = uint64SubByScalarSelsAvx512
+        float32Sub = float32SubAvx512
+        //float32SubSels = float32SubSelsAvx512
+        float32SubScalar = float32SubScalarAvx512
+        //float32SubScalarSels = float32SubScalarSelsAvx512
+        float32SubByScalar = float32SubByScalarAvx512
+        //float32SubByScalarSels = float32SubByScalarSelsAvx512
+        float64Sub = float64SubAvx512
+        //float64SubSels = float64SubSelsAvx512
+        float64SubScalar = float64SubScalarAvx512
+        //float64SubScalarSels = float64SubScalarSelsAvx512
+        float64SubByScalar = float64SubByScalarAvx512
+        //float64SubByScalarSels = float64SubByScalarSelsAvx512
 	} else if cpu.X86.HasAVX2 {
-        int8Minus = int8MinusAvx2
-        //int8MinusSels = int8MinusSelsAvx2
-        int8MinusScalar = int8MinusScalarAvx2
-        //int8MinusScalarSels = int8MinusScalarSelsAvx2
-        int8MinusByScalar = int8MinusByScalarAvx2
-        //int8MinusByScalarSels = int8MinusByScalarSelsAvx2
-        int16Minus = int16MinusAvx2
-        //int16MinusSels = int16MinusSelsAvx2
-        int16MinusScalar = int16MinusScalarAvx2
-        //int16MinusScalarSels = int16MinusScalarSelsAvx2
-        int16MinusByScalar = int16MinusByScalarAvx2
-        //int16MinusByScalarSels = int16MinusByScalarSelsAvx2
-        int32Minus = int32MinusAvx2
-        //int32MinusSels = int32MinusSelsAvx2
-        int32MinusScalar = int32MinusScalarAvx2
-        //int32MinusScalarSels = int32MinusScalarSelsAvx2
-        int32MinusByScalar = int32MinusByScalarAvx2
-        //int32MinusByScalarSels = int32MinusByScalarSelsAvx2
-        int64Minus = int64MinusAvx2
-        //int64MinusSels = int64MinusSelsAvx2
-        int64MinusScalar = int64MinusScalarAvx2
-        //int64MinusScalarSels = int64MinusScalarSelsAvx2
-        int64MinusByScalar = int64MinusByScalarAvx2
-        //int64MinusByScalarSels = int64MinusByScalarSelsAvx2
-        uint8Minus = uint8MinusAvx2
-        //uint8MinusSels = uint8MinusSelsAvx2
-        uint8MinusScalar = uint8MinusScalarAvx2
-        //uint8MinusScalarSels = uint8MinusScalarSelsAvx2
-        uint8MinusByScalar = uint8MinusByScalarAvx2
-        //uint8MinusByScalarSels = uint8MinusByScalarSelsAvx2
-        uint16Minus = uint16MinusAvx2
-        //uint16MinusSels = uint16MinusSelsAvx2
-        uint16MinusScalar = uint16MinusScalarAvx2
-        //uint16MinusScalarSels = uint16MinusScalarSelsAvx2
-        uint16MinusByScalar = uint16MinusByScalarAvx2
-        //uint16MinusByScalarSels = uint16MinusByScalarSelsAvx2
-        uint32Minus = uint32MinusAvx2
-        //uint32MinusSels = uint32MinusSelsAvx2
-        uint32MinusScalar = uint32MinusScalarAvx2
-        //uint32MinusScalarSels = uint32MinusScalarSelsAvx2
-        uint32MinusByScalar = uint32MinusByScalarAvx2
-        //uint32MinusByScalarSels = uint32MinusByScalarSelsAvx2
-        uint64Minus = uint64MinusAvx2
-        //uint64MinusSels = uint64MinusSelsAvx2
-        uint64MinusScalar = uint64MinusScalarAvx2
-        //uint64MinusScalarSels = uint64MinusScalarSelsAvx2
-        uint64MinusByScalar = uint64MinusByScalarAvx2
-        //uint64MinusByScalarSels = uint64MinusByScalarSelsAvx2
-        float32Minus = float32MinusAvx2
-        //float32MinusSels = float32MinusSelsAvx2
-        float32MinusScalar = float32MinusScalarAvx2
-        //float32MinusScalarSels = float32MinusScalarSelsAvx2
-        float32MinusByScalar = float32MinusByScalarAvx2
-        //float32MinusByScalarSels = float32MinusByScalarSelsAvx2
-        float64Minus = float64MinusAvx2
-        //float64MinusSels = float64MinusSelsAvx2
-        float64MinusScalar = float64MinusScalarAvx2
-        //float64MinusScalarSels = float64MinusScalarSelsAvx2
-        float64MinusByScalar = float64MinusByScalarAvx2
-        //float64MinusByScalarSels = float64MinusByScalarSelsAvx2
+        int8Sub = int8SubAvx2
+        //int8SubSels = int8SubSelsAvx2
+        int8SubScalar = int8SubScalarAvx2
+        //int8SubScalarSels = int8SubScalarSelsAvx2
+        int8SubByScalar = int8SubByScalarAvx2
+        //int8SubByScalarSels = int8SubByScalarSelsAvx2
+        int16Sub = int16SubAvx2
+        //int16SubSels = int16SubSelsAvx2
+        int16SubScalar = int16SubScalarAvx2
+        //int16SubScalarSels = int16SubScalarSelsAvx2
+        int16SubByScalar = int16SubByScalarAvx2
+        //int16SubByScalarSels = int16SubByScalarSelsAvx2
+        int32Sub = int32SubAvx2
+        //int32SubSels = int32SubSelsAvx2
+        int32SubScalar = int32SubScalarAvx2
+        //int32SubScalarSels = int32SubScalarSelsAvx2
+        int32SubByScalar = int32SubByScalarAvx2
+        //int32SubByScalarSels = int32SubByScalarSelsAvx2
+        int64Sub = int64SubAvx2
+        //int64SubSels = int64SubSelsAvx2
+        int64SubScalar = int64SubScalarAvx2
+        //int64SubScalarSels = int64SubScalarSelsAvx2
+        int64SubByScalar = int64SubByScalarAvx2
+        //int64SubByScalarSels = int64SubByScalarSelsAvx2
+        uint8Sub = uint8SubAvx2
+        //uint8SubSels = uint8SubSelsAvx2
+        uint8SubScalar = uint8SubScalarAvx2
+        //uint8SubScalarSels = uint8SubScalarSelsAvx2
+        uint8SubByScalar = uint8SubByScalarAvx2
+        //uint8SubByScalarSels = uint8SubByScalarSelsAvx2
+        uint16Sub = uint16SubAvx2
+        //uint16SubSels = uint16SubSelsAvx2
+        uint16SubScalar = uint16SubScalarAvx2
+        //uint16SubScalarSels = uint16SubScalarSelsAvx2
+        uint16SubByScalar = uint16SubByScalarAvx2
+        //uint16SubByScalarSels = uint16SubByScalarSelsAvx2
+        uint32Sub = uint32SubAvx2
+        //uint32SubSels = uint32SubSelsAvx2
+        uint32SubScalar = uint32SubScalarAvx2
+        //uint32SubScalarSels = uint32SubScalarSelsAvx2
+        uint32SubByScalar = uint32SubByScalarAvx2
+        //uint32SubByScalarSels = uint32SubByScalarSelsAvx2
+        uint64Sub = uint64SubAvx2
+        //uint64SubSels = uint64SubSelsAvx2
+        uint64SubScalar = uint64SubScalarAvx2
+        //uint64SubScalarSels = uint64SubScalarSelsAvx2
+        uint64SubByScalar = uint64SubByScalarAvx2
+        //uint64SubByScalarSels = uint64SubByScalarSelsAvx2
+        float32Sub = float32SubAvx2
+        //float32SubSels = float32SubSelsAvx2
+        float32SubScalar = float32SubScalarAvx2
+        //float32SubScalarSels = float32SubScalarSelsAvx2
+        float32SubByScalar = float32SubByScalarAvx2
+        //float32SubByScalarSels = float32SubByScalarSelsAvx2
+        float64Sub = float64SubAvx2
+        //float64SubSels = float64SubSelsAvx2
+        float64SubScalar = float64SubScalarAvx2
+        //float64SubScalarSels = float64SubScalarSelsAvx2
+        float64SubByScalar = float64SubByScalarAvx2
+        //float64SubByScalarSels = float64SubByScalarSelsAvx2
 	} else {
-        int8Minus = int8MinusPure
-        //int8MinusSels = int8MinusSelsPure
-        int8MinusScalar = int8MinusScalarPure
-        //int8MinusScalarSels = int8MinusScalarSelsPure
-        int8MinusByScalar = int8MinusByScalarPure
-        //int8MinusByScalarSels = int8MinusByScalarSelsPure
-        int16Minus = int16MinusPure
-        //int16MinusSels = int16MinusSelsPure
-        int16MinusScalar = int16MinusScalarPure
-        //int16MinusScalarSels = int16MinusScalarSelsPure
-        int16MinusByScalar = int16MinusByScalarPure
-        //int16MinusByScalarSels = int16MinusByScalarSelsPure
-        int32Minus = int32MinusPure
-        //int32MinusSels = int32MinusSelsPure
-        int32MinusScalar = int32MinusScalarPure
-        //int32MinusScalarSels = int32MinusScalarSelsPure
-        int32MinusByScalar = int32MinusByScalarPure
-        //int32MinusByScalarSels = int32MinusByScalarSelsPure
-        int64Minus = int64MinusPure
-        //int64MinusSels = int64MinusSelsPure
-        int64MinusScalar = int64MinusScalarPure
-        //int64MinusScalarSels = int64MinusScalarSelsPure
-        int64MinusByScalar = int64MinusByScalarPure
-        //int64MinusByScalarSels = int64MinusByScalarSelsPure
-        uint8Minus = uint8MinusPure
-        //uint8MinusSels = uint8MinusSelsPure
-        uint8MinusScalar = uint8MinusScalarPure
-        //uint8MinusScalarSels = uint8MinusScalarSelsPure
-        uint8MinusByScalar = uint8MinusByScalarPure
-        //uint8MinusByScalarSels = uint8MinusByScalarSelsPure
-        uint16Minus = uint16MinusPure
-        //uint16MinusSels = uint16MinusSelsPure
-        uint16MinusScalar = uint16MinusScalarPure
-        //uint16MinusScalarSels = uint16MinusScalarSelsPure
-        uint16MinusByScalar = uint16MinusByScalarPure
-        //uint16MinusByScalarSels = uint16MinusByScalarSelsPure
-        uint32Minus = uint32MinusPure
-        //uint32MinusSels = uint32MinusSelsPure
-        uint32MinusScalar = uint32MinusScalarPure
-        //uint32MinusScalarSels = uint32MinusScalarSelsPure
-        uint32MinusByScalar = uint32MinusByScalarPure
-        //uint32MinusByScalarSels = uint32MinusByScalarSelsPure
-        uint64Minus = uint64MinusPure
-        //uint64MinusSels = uint64MinusSelsPure
-        uint64MinusScalar = uint64MinusScalarPure
-        //uint64MinusScalarSels = uint64MinusScalarSelsPure
-        uint64MinusByScalar = uint64MinusByScalarPure
-        //uint64MinusByScalarSels = uint64MinusByScalarSelsPure
-        float32Minus = float32MinusPure
-        //float32MinusSels = float32MinusSelsPure
-        float32MinusScalar = float32MinusScalarPure
-        //float32MinusScalarSels = float32MinusScalarSelsPure
-        float32MinusByScalar = float32MinusByScalarPure
-        //float32MinusByScalarSels = float32MinusByScalarSelsPure
-        float64Minus = float64MinusPure
-        //float64MinusSels = float64MinusSelsPure
-        float64MinusScalar = float64MinusScalarPure
-        //float64MinusScalarSels = float64MinusScalarSelsPure
-        float64MinusByScalar = float64MinusByScalarPure
-        //float64MinusByScalarSels = float64MinusByScalarSelsPure
-	}
-	int8MinusSels = int8MinusSelsPure
-	int8MinusScalarSels = int8MinusScalarSelsPure
-	int8MinusByScalarSels = int8MinusByScalarSelsPure
-	int16MinusSels = int16MinusSelsPure
-	int16MinusScalarSels = int16MinusScalarSelsPure
-	int16MinusByScalarSels = int16MinusByScalarSelsPure
-	int32MinusSels = int32MinusSelsPure
-	int32MinusScalarSels = int32MinusScalarSelsPure
-	int32MinusByScalarSels = int32MinusByScalarSelsPure
-	int64MinusSels = int64MinusSelsPure
-	int64MinusScalarSels = int64MinusScalarSelsPure
-	int64MinusByScalarSels = int64MinusByScalarSelsPure
-	uint8MinusSels = uint8MinusSelsPure
-	uint8MinusScalarSels = uint8MinusScalarSelsPure
-	uint8MinusByScalarSels = uint8MinusByScalarSelsPure
-	uint16MinusSels = uint16MinusSelsPure
-	uint16MinusScalarSels = uint16MinusScalarSelsPure
-	uint16MinusByScalarSels = uint16MinusByScalarSelsPure
-	uint32MinusSels = uint32MinusSelsPure
-	uint32MinusScalarSels = uint32MinusScalarSelsPure
-	uint32MinusByScalarSels = uint32MinusByScalarSelsPure
-	uint64MinusSels = uint64MinusSelsPure
-	uint64MinusScalarSels = uint64MinusScalarSelsPure
-	uint64MinusByScalarSels = uint64MinusByScalarSelsPure
-	float32MinusSels = float32MinusSelsPure
-	float32MinusScalarSels = float32MinusScalarSelsPure
-	float32MinusByScalarSels = float32MinusByScalarSelsPure
-	float64MinusSels = float64MinusSelsPure
-	float64MinusScalarSels = float64MinusScalarSelsPure
-	float64MinusByScalarSels = float64MinusByScalarSelsPure
-}
-
-func Int8Minus(xs, ys, rs []int8) []int8 {
-	return int8Minus(xs, ys, rs)
-}
-
-func int8MinusPure(xs, ys, rs []int8) []int8 {
+        int8Sub = int8SubPure
+        //int8SubSels = int8SubSelsPure
+        int8SubScalar = int8SubScalarPure
+        //int8SubScalarSels = int8SubScalarSelsPure
+        int8SubByScalar = int8SubByScalarPure
+        //int8SubByScalarSels = int8SubByScalarSelsPure
+        int16Sub = int16SubPure
+        //int16SubSels = int16SubSelsPure
+        int16SubScalar = int16SubScalarPure
+        //int16SubScalarSels = int16SubScalarSelsPure
+        int16SubByScalar = int16SubByScalarPure
+        //int16SubByScalarSels = int16SubByScalarSelsPure
+        int32Sub = int32SubPure
+        //int32SubSels = int32SubSelsPure
+        int32SubScalar = int32SubScalarPure
+        //int32SubScalarSels = int32SubScalarSelsPure
+        int32SubByScalar = int32SubByScalarPure
+        //int32SubByScalarSels = int32SubByScalarSelsPure
+        int64Sub = int64SubPure
+        //int64SubSels = int64SubSelsPure
+        int64SubScalar = int64SubScalarPure
+        //int64SubScalarSels = int64SubScalarSelsPure
+        int64SubByScalar = int64SubByScalarPure
+        //int64SubByScalarSels = int64SubByScalarSelsPure
+        uint8Sub = uint8SubPure
+        //uint8SubSels = uint8SubSelsPure
+        uint8SubScalar = uint8SubScalarPure
+        //uint8SubScalarSels = uint8SubScalarSelsPure
+        uint8SubByScalar = uint8SubByScalarPure
+        //uint8SubByScalarSels = uint8SubByScalarSelsPure
+        uint16Sub = uint16SubPure
+        //uint16SubSels = uint16SubSelsPure
+        uint16SubScalar = uint16SubScalarPure
+        //uint16SubScalarSels = uint16SubScalarSelsPure
+        uint16SubByScalar = uint16SubByScalarPure
+        //uint16SubByScalarSels = uint16SubByScalarSelsPure
+        uint32Sub = uint32SubPure
+        //uint32SubSels = uint32SubSelsPure
+        uint32SubScalar = uint32SubScalarPure
+        //uint32SubScalarSels = uint32SubScalarSelsPure
+        uint32SubByScalar = uint32SubByScalarPure
+        //uint32SubByScalarSels = uint32SubByScalarSelsPure
+        uint64Sub = uint64SubPure
+        //uint64SubSels = uint64SubSelsPure
+        uint64SubScalar = uint64SubScalarPure
+        //uint64SubScalarSels = uint64SubScalarSelsPure
+        uint64SubByScalar = uint64SubByScalarPure
+        //uint64SubByScalarSels = uint64SubByScalarSelsPure
+        float32Sub = float32SubPure
+        //float32SubSels = float32SubSelsPure
+        float32SubScalar = float32SubScalarPure
+        //float32SubScalarSels = float32SubScalarSelsPure
+        float32SubByScalar = float32SubByScalarPure
+        //float32SubByScalarSels = float32SubByScalarSelsPure
+        float64Sub = float64SubPure
+        //float64SubSels = float64SubSelsPure
+        float64SubScalar = float64SubScalarPure
+        //float64SubScalarSels = float64SubScalarSelsPure
+        float64SubByScalar = float64SubByScalarPure
+        //float64SubByScalarSels = float64SubByScalarSelsPure
+	}
+	int8SubSels = int8SubSelsPure
+	int8SubScalarSels = int8SubScalarSelsPure
+	int8SubByScalarSels = int8SubByScalarSelsPure
+	int16SubSels = int16SubSelsPure
+	int16SubScalarSels = int16SubScalarSelsPure
+	int16SubByScalarSels = int16SubByScalarSelsPure
+	int32SubSels = int32SubSelsPure
+	int32SubScalarSels = int32SubScalarSelsPure
+	int32SubByScalarSels = int32SubByScalarSelsPure
+	int64SubSels = int64SubSelsPure
+	int64SubScalarSels = int64SubScalarSelsPure
+	int64SubByScalarSels = int64SubByScalarSelsPure
+	uint8SubSels = uint8SubSelsPure
+	uint8SubScalarSels = uint8SubScalarSelsPure
+	uint8SubByScalarSels = uint8SubByScalarSelsPure
+	uint16SubSels = uint16SubSelsPure
+	uint16SubScalarSels = uint16SubScalarSelsPure
+	uint16SubByScalarSels = uint16SubByScalarSelsPure
+	uint32SubSels = uint32SubSelsPure
+	uint32SubScalarSels = uint32SubScalarSelsPure
+	uint32SubByScalarSels = uint32SubByScalarSelsPure
+	uint64SubSels = uint64SubSelsPure
+	uint64SubScalarSels = uint64SubScalarSelsPure
+	uint64SubByScalarSels = uint64SubByScalarSelsPure
+	float32SubSels = float32SubSelsPure
+	float32SubScalarSels = float32SubScalarSelsPure
+	float32SubByScalarSels = float32SubByScalarSelsPure
+	float64SubSels = float64SubSelsPure
+	float64SubScalarSels = float64SubScalarSelsPure
+	float64SubByScalarSels = float64SubByScalarSelsPure
+}
+
+func Int8Sub(xs, ys, rs []int8) []int8 {
+	return int8Sub(xs, ys, rs)
+}
+
+func int8SubPure(xs, ys, rs []int8) []int8 {
 	for i, x := range xs {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func int8MinusAvx2(xs, ys, rs []int8) []int8 {
+func int8SubAvx2(xs, ys, rs []int8) []int8 {
     const regItems = 32 / 1
 	n := len(xs) / regItems
-	int8MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	int8SubAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func int8MinusAvx512(xs, ys, rs []int8) []int8 {
+func int8SubAvx512(xs, ys, rs []int8) []int8 {
     const regItems = 64 / 1
 	n := len(xs) / regItems
-	int8MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	int8SubAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func Int8MinusSels(xs, ys, rs []int8, sels []int64) []int8 {
-	return int8MinusSels(xs, ys, rs, sels)
+func Int8SubSels(xs, ys, rs []int8, sels []int64) []int8 {
+	return int8SubSels(xs, ys, rs, sels)
 }
 
-func int8MinusSelsPure(xs, ys, rs []int8, sels []int64) []int8 {
+func int8SubSelsPure(xs, ys, rs []int8, sels []int64) []int8 {
 	for i, sel := range sels {
 		rs[i] = xs[sel] - ys[sel]
 	}
 	return rs
 }
 
-//func int8MinusSelsAvx2(xs, ys, rs []int8, sels []int64) []int8 {
+//func int8SubSelsAvx2(xs, ys, rs []int8, sels []int64) []int8 {
 //    const regItems = 32 / 1
 //	n := len(sels) / regItems
-//	int8MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
+//	int8SubSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func int8MinusSelsAvx512(xs, ys, rs []int8, sels []int64) []int8 {
+//func int8SubSelsAvx512(xs, ys, rs []int8, sels []int64) []int8 {
 //    const regItems = 64 / 1
 //	n := len(sels) / regItems
-//	int8MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
+//	int8SubSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Int8MinusScalar(x int8, ys, rs []int8) []int8 {
-	return int8MinusScalar(x, ys, rs)
+func Int8SubScalar(x int8, ys, rs []int8) []int8 {
+	return int8SubScalar(x, ys, rs)
 }
 
-func int8MinusScalarPure(x int8, ys, rs []int8) []int8 {
+func int8SubScalarPure(x int8, ys, rs []int8) []int8 {
 	for i, y := range ys {
 		rs[i] = x - y
 	}
 	return rs
 }
 
-func int8MinusScalarAvx2(x int8, ys, rs []int8) []int8 {
+func int8SubScalarAvx2(x int8, ys, rs []int8) []int8 {
     const regItems = 32 / 1
 	n := len(ys) / regItems
-	int8MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	int8SubScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func int8MinusScalarAvx512(x int8, ys, rs []int8) []int8 {
+func int8SubScalarAvx512(x int8, ys, rs []int8) []int8 {
     const regItems = 64 / 1
 	n := len(ys) / regItems
-	int8MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	int8SubScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func Int8MinusScalarSels(x int8, ys, rs []int8, sels []int64) []int8 {
-	return int8MinusScalarSels(x, ys, rs, sels)
+func Int8SubScalarSels(x int8, ys, rs []int8, sels []int64) []int8 {
+	return int8SubScalarSels(x, ys, rs, sels)
 }
 
-func int8MinusScalarSelsPure(x int8, ys, rs []int8, sels []int64) []int8 {
+func int8SubScalarSelsPure(x int8, ys, rs []int8, sels []int64) []int8 {
 	for i, sel := range sels {
 		rs[i] = x - ys[sel]
 	}
 	return rs
 }
 
-//func int8MinusScalarSelsAvx2(x int8, ys, rs []int8, sels []int64) []int8 {
+//func int8SubScalarSelsAvx2(x int8, ys, rs []int8, sels []int64) []int8 {
 //    const regItems = 32 / 1
 //	n := len(sels) / regItems
-//	int8MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	int8SubScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func int8MinusScalarSelsAvx512(x int8, ys, rs []int8, sels []int64) []int8 {
+//func int8SubScalarSelsAvx512(x int8, ys, rs []int8, sels []int64) []int8 {
 //    const regItems = 64 / 1
 //	n := len(sels) / regItems
-//	int8MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	int8SubScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Int8MinusByScalar(x int8, ys, rs []int8) []int8 {
-	return int8MinusByScalar(x, ys, rs)
+func Int8SubByScalar(x int8, ys, rs []int8) []int8 {
+	return int8SubByScalar(x, ys, rs)
 }
 
-func int8MinusByScalarPure(x int8, ys, rs []int8) []int8 {
+func int8SubByScalarPure(x int8, ys, rs []int8) []int8 {
 	for i, y := range ys {
 		rs[i] = y - x
 	}
 	return rs
 }
 
-func int8MinusByScalarAvx2(x int8, ys, rs []int8) []int8 {
+func int8SubByScalarAvx2(x int8, ys, rs []int8) []int8 {
     const regItems = 32 / 1
 	n := len(ys) / regItems
-	int8MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	int8SubByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func int8MinusByScalarAvx512(x int8, ys, rs []int8) []int8 {
+func int8SubByScalarAvx512(x int8, ys, rs []int8) []int8 {
     const regItems = 64 / 1
 	n := len(ys) / regItems
-	int8MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	int8SubByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func Int8MinusByScalarSels(x int8, ys, rs []int8, sels []int64) []int8 {
-	return int8MinusByScalarSels(x, ys, rs, sels)
+func Int8SubByScalarSels(x int8, ys, rs []int8, sels []int64) []int8 {
+	return int8SubByScalarSels(x, ys, rs, sels)
 }
 
-func int8MinusByScalarSelsPure(x int8, ys, rs []int8, sels []int64) []int8 {
+func int8SubByScalarSelsPure(x int8, ys, rs []int8, sels []int64) []int8 {
 	for i, sel := range sels {
 		rs[i] = ys[sel] - x
 	}
 	return rs
 }
 
-//func int8MinusByScalarSelsAvx2(x int8, ys, rs []int8, sels []int64) []int8 {
+//func int8SubByScalarSelsAvx2(x int8, ys, rs []int8, sels []int64) []int8 {
 //    const regItems = 32 / 1
 //	n := len(sels) / regItems
-//	int8MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	int8SubByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-//func int8MinusByScalarSelsAvx512(x int8, ys, rs []int8, sels []int64) []int8 {
+//func int8SubByScalarSelsAvx512(x int8, ys, rs []int8, sels []int64) []int8 {
 //    const regItems = 64 / 1
 //	n := len(sels) / regItems
-//	int8MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	int8SubByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-func Int16Minus(xs, ys, rs []int16) []int16 {
-	return int16Minus(xs, ys, rs)
+func Int16Sub(xs, ys, rs []int16) []int16 {
+	return int16Sub(xs, ys, rs)
 }
 
-func int16MinusPure(xs, ys, rs []int16) []int16 {
+func int16SubPure(xs, ys, rs []int16) []int16 {
 	for i, x := range xs {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func int16MinusAvx2(xs, ys, rs []int16) []int16 {
+func int16SubAvx2(xs, ys, rs []int16) []int16 {
     const regItems = 32 / 2
 	n := len(xs) / regItems
-	int16MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	int16SubAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func int16MinusAvx512(xs, ys, rs []int16) []int16 {
+func int16SubAvx512(xs, ys, rs []int16) []int16 {
     const regItems = 64 / 2
 	n := len(xs) / regItems
-	int16MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	int16SubAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func Int16MinusSels(xs, ys, rs []int16, sels []int64) []int16 {
-	return int16MinusSels(xs, ys, rs, sels)
+func Int16SubSels(xs, ys, rs []int16, sels []int64) []int16 {
+	return int16SubSels(xs, ys, rs, sels)
 }
 
-func int16MinusSelsPure(xs, ys, rs []int16, sels []int64) []int16 {
+func int16SubSelsPure(xs, ys, rs []int16, sels []int64) []int16 {
 	for i, sel := range sels {
 		rs[i] = xs[sel] - ys[sel]
 	}
 	return rs
 }
 
-//func int16MinusSelsAvx2(xs, ys, rs []int16, sels []int64) []int16 {
+//func int16SubSelsAvx2(xs, ys, rs []int16, sels []int64) []int16 {
 //    const regItems = 32 / 2
 //	n := len(sels) / regItems
-//	int16MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
+//	int16SubSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func int16MinusSelsAvx512(xs, ys, rs []int16, sels []int64) []int16 {
+//func int16SubSelsAvx512(xs, ys, rs []int16, sels []int64) []int16 {
 //    const regItems = 64 / 2
 //	n := len(sels) / regItems
-//	int16MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
+//	int16SubSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Int16MinusScalar(x int16, ys, rs []int16) []int16 {
-	return int16MinusScalar(x, ys, rs)
+func Int16SubScalar(x int16, ys, rs []int16) []int16 {
+	return int16SubScalar(x, ys, rs)
 }
 
-func int16MinusScalarPure(x int16, ys, rs []int16) []int16 {
+func int16SubScalarPure(x int16, ys, rs []int16) []int16 {
 	for i, y := range ys {
 		rs[i] = x - y
 	}
 	return rs
 }
 
-func int16MinusScalarAvx2(x int16, ys, rs []int16) []int16 {
+func int16SubScalarAvx2(x int16, ys, rs []int16) []int16 {
     const regItems = 32 / 2
 	n := len(ys) / regItems
-	int16MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	int16SubScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func int16MinusScalarAvx512(x int16, ys, rs []int16) []int16 {
+func int16SubScalarAvx512(x int16, ys, rs []int16) []int16 {
     const regItems = 64 / 2
 	n := len(ys) / regItems
-	int16MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	int16SubScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func Int16MinusScalarSels(x int16, ys, rs []int16, sels []int64) []int16 {
-	return int16MinusScalarSels(x, ys, rs, sels)
+func Int16SubScalarSels(x int16, ys, rs []int16, sels []int64) []int16 {
+	return int16SubScalarSels(x, ys, rs, sels)
 }
 
-func int16MinusScalarSelsPure(x int16, ys, rs []int16, sels []int64) []int16 {
+func int16SubScalarSelsPure(x int16, ys, rs []int16, sels []int64) []int16 {
 	for i, sel := range sels {
 		rs[i] = x - ys[sel]
 	}
 	return rs
 }
 
-//func int16MinusScalarSelsAvx2(x int16, ys, rs []int16, sels []int64) []int16 {
+//func int16SubScalarSelsAvx2(x int16, ys, rs []int16, sels []int64) []int16 {
 //    const regItems = 32 / 2
 //	n := len(sels) / regItems
-//	int16MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	int16SubScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func int16MinusScalarSelsAvx512(x int16, ys, rs []int16, sels []int64) []int16 {
+//func int16SubScalarSelsAvx512(x int16, ys, rs []int16, sels []int64) []int16 {
 //    const regItems = 64 / 2
 //	n := len(sels) / regItems
-//	int16MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	int16SubScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Int16MinusByScalar(x int16, ys, rs []int16) []int16 {
-	return int16MinusByScalar(x, ys, rs)
+func Int16SubByScalar(x int16, ys, rs []int16) []int16 {
+	return int16SubByScalar(x, ys, rs)
 }
 
-func int16MinusByScalarPure(x int16, ys, rs []int16) []int16 {
+func int16SubByScalarPure(x int16, ys, rs []int16) []int16 {
 	for i, y := range ys {
 		rs[i] = y - x
 	}
 	return rs
 }
 
-func int16MinusByScalarAvx2(x int16, ys, rs []int16) []int16 {
+func int16SubByScalarAvx2(x int16, ys, rs []int16) []int16 {
     const regItems = 32 / 2
 	n := len(ys) / regItems
-	int16MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	int16SubByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func int16MinusByScalarAvx512(x int16, ys, rs []int16) []int16 {
+func int16SubByScalarAvx512(x int16, ys, rs []int16) []int16 {
     const regItems = 64 / 2
 	n := len(ys) / regItems
-	int16MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	int16SubByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func Int16MinusByScalarSels(x int16, ys, rs []int16, sels []int64) []int16 {
-	return int16MinusByScalarSels(x, ys, rs, sels)
+func Int16SubByScalarSels(x int16, ys, rs []int16, sels []int64) []int16 {
+	return int16SubByScalarSels(x, ys, rs, sels)
 }
 
-func int16MinusByScalarSelsPure(x int16, ys, rs []int16, sels []int64) []int16 {
+func int16SubByScalarSelsPure(x int16, ys, rs []int16, sels []int64) []int16 {
 	for i, sel := range sels {
 		rs[i] = ys[sel] - x
 	}
 	return rs
 }
 
-//func int16MinusByScalarSelsAvx2(x int16, ys, rs []int16, sels []int64) []int16 {
+//func int16SubByScalarSelsAvx2(x int16, ys, rs []int16, sels []int64) []int16 {
 //    const regItems = 32 / 2
 //	n := len(sels) / regItems
-//	int16MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	int16SubByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-//func int16MinusByScalarSelsAvx512(x int16, ys, rs []int16, sels []int64) []int16 {
+//func int16SubByScalarSelsAvx512(x int16, ys, rs []int16, sels []int64) []int16 {
 //    const regItems = 64 / 2
 //	n := len(sels) / regItems
-//	int16MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	int16SubByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-func Int32Minus(xs, ys, rs []int32) []int32 {
-	return int32Minus(xs, ys, rs)
+func Int32Sub(xs, ys, rs []int32) []int32 {
+	return int32Sub(xs, ys, rs)
 }
 
-func int32MinusPure(xs, ys, rs []int32) []int32 {
+func int32SubPure(xs, ys, rs []int32) []int32 {
 	for i, x := range xs {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func int32MinusAvx2(xs, ys, rs []int32) []int32 {
+func int32SubAvx2(xs, ys, rs []int32) []int32 {
     const regItems = 32 / 4
 	n := len(xs) / regItems
-	int32MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	int32SubAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func int32MinusAvx512(xs, ys, rs []int32) []int32 {
+func int32SubAvx512(xs, ys, rs []int32) []int32 {
     const regItems = 64 / 4
 	n := len(xs) / regItems
-	int32MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	int32SubAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func Int32MinusSels(xs, ys, rs []int32, sels []int64) []int32 {
-	return int32MinusSels(xs, ys, rs, sels)
+func Int32SubSels(xs, ys, rs []int32, sels []int64) []int32 {
+	return int32SubSels(xs, ys, rs, sels)
 }
 
-func int32MinusSelsPure(xs, ys, rs []int32, sels []int64) []int32 {
+func int32SubSelsPure(xs, ys, rs []int32, sels []int64) []int32 {
 	for i, sel := range sels {
 		rs[i] = xs[sel] - ys[sel]
 	}
 	return rs
 }
 
-//func int32MinusSelsAvx2(xs, ys, rs []int32, sels []int64) []int32 {
+//func int32SubSelsAvx2(xs, ys, rs []int32, sels []int64) []int32 {
 //    const regItems = 32 / 4
 //	n := len(sels) / regItems
-//	int32MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
+//	int32SubSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func int32MinusSelsAvx512(xs, ys, rs []int32, sels []int64) []int32 {
+//func int32SubSelsAvx512(xs, ys, rs []int32, sels []int64) []int32 {
 //    const regItems = 64 / 4
 //	n := len(sels) / regItems
-//	int32MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
+//	int32SubSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Int32MinusScalar(x int32, ys, rs []int32) []int32 {
-	return int32MinusScalar(x, ys, rs)
+func Int32SubScalar(x int32, ys, rs []int32) []int32 {
+	return int32SubScalar(x, ys, rs)
 }
 
-func int32MinusScalarPure(x int32, ys, rs []int32) []int32 {
+func int32SubScalarPure(x int32, ys, rs []int32) []int32 {
 	for i, y := range ys {
 		rs[i] = x - y
 	}
 	return rs
 }
 
-func int32MinusScalarAvx2(x int32, ys, rs []int32) []int32 {
+func int32SubScalarAvx2(x int32, ys, rs []int32) []int32 {
     const regItems = 32 / 4
 	n := len(ys) / regItems
-	int32MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	int32SubScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func int32MinusScalarAvx512(x int32, ys, rs []int32) []int32 {
+func int32SubScalarAvx512(x int32, ys, rs []int32) []int32 {
     const regItems = 64 / 4
 	n := len(ys) / regItems
-	int32MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	int32SubScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func Int32MinusScalarSels(x int32, ys, rs []int32, sels []int64) []int32 {
-	return int32MinusScalarSels(x, ys, rs, sels)
+func Int32SubScalarSels(x int32, ys, rs []int32, sels []int64) []int32 {
+	return int32SubScalarSels(x, ys, rs, sels)
 }
 
-func int32MinusScalarSelsPure(x int32, ys, rs []int32, sels []int64) []int32 {
+func int32SubScalarSelsPure(x int32, ys, rs []int32, sels []int64) []int32 {
 	for i, sel := range sels {
 		rs[i] = x - ys[sel]
 	}
 	return rs
 }
 
-//func int32MinusScalarSelsAvx2(x int32, ys, rs []int32, sels []int64) []int32 {
+//func int32SubScalarSelsAvx2(x int32, ys, rs []int32, sels []int64) []int32 {
 //    const regItems = 32 / 4
 //	n := len(sels) / regItems
-//	int32MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	int32SubScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func int32MinusScalarSelsAvx512(x int32, ys, rs []int32, sels []int64) []int32 {
+//func int32SubScalarSelsAvx512(x int32, ys, rs []int32, sels []int64) []int32 {
 //    const regItems = 64 / 4
 //	n := len(sels) / regItems
-//	int32MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	int32SubScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Int32MinusByScalar(x int32, ys, rs []int32) []int32 {
-	return int32MinusByScalar(x, ys, rs)
+func Int32SubByScalar(x int32, ys, rs []int32) []int32 {
+	return int32SubByScalar(x, ys, rs)
 }
 
-func int32MinusByScalarPure(x int32, ys, rs []int32) []int32 {
+func int32SubByScalarPure(x int32, ys, rs []int32) []int32 {
 	for i, y := range ys {
 		rs[i] = y - x
 	}
 	return rs
 }
 
-func int32MinusByScalarAvx2(x int32, ys, rs []int32) []int32 {
+func int32SubByScalarAvx2(x int32, ys, rs []int32) []int32 {
     const regItems = 32 / 4
 	n := len(ys) / regItems
-	int32MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	int32SubByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func int32MinusByScalarAvx512(x int32, ys, rs []int32) []int32 {
+func int32SubByScalarAvx512(x int32, ys, rs []int32) []int32 {
     const regItems = 64 / 4
 	n := len(ys) / regItems
-	int32MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	int32SubByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func Int32MinusByScalarSels(x int32, ys, rs []int32, sels []int64) []int32 {
-	return int32MinusByScalarSels(x, ys, rs, sels)
+func Int32SubByScalarSels(x int32, ys, rs []int32, sels []int64) []int32 {
+	return int32SubByScalarSels(x, ys, rs, sels)
 }
 
-func int32MinusByScalarSelsPure(x int32, ys, rs []int32, sels []int64) []int32 {
+func int32SubByScalarSelsPure(x int32, ys, rs []int32, sels []int64) []int32 {
 	for i, sel := range sels {
 		rs[i] = ys[sel] - x
 	}
 	return rs
 }
 
-//func int32MinusByScalarSelsAvx2(x int32, ys, rs []int32, sels []int64) []int32 {
+//func int32SubByScalarSelsAvx2(x int32, ys, rs []int32, sels []int64) []int32 {
 //    const regItems = 32 / 4
 //	n := len(sels) / regItems
-//	int32MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	int32SubByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-//func int32MinusByScalarSelsAvx512(x int32, ys, rs []int32, sels []int64) []int32 {
+//func int32SubByScalarSelsAvx512(x int32, ys, rs []int32, sels []int64) []int32 {
 //    const regItems = 64 / 4
 //	n := len(sels) / regItems
-//	int32MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	int32SubByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-func Int64Minus(xs, ys, rs []int64) []int64 {
-	return int64Minus(xs, ys, rs)
+func Int64Sub(xs, ys, rs []int64) []int64 {
+	return int64Sub(xs, ys, rs)
 }
 
-func int64MinusPure(xs, ys, rs []int64) []int64 {
+func int64SubPure(xs, ys, rs []int64) []int64 {
 	for i, x := range xs {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func int64MinusAvx2(xs, ys, rs []int64) []int64 {
+func int64SubAvx2(xs, ys, rs []int64) []int64 {
     const regItems = 32 / 8
 	n := len(xs) / regItems
-	int64MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	int64SubAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func int64MinusAvx512(xs, ys, rs []int64) []int64 {
+func int64SubAvx512(xs, ys, rs []int64) []int64 {
     const regItems = 64 / 8
 	n := len(xs) / regItems
-	int64MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	int64SubAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func Int64MinusSels(xs, ys, rs []int64, sels []int64) []int64 {
-	return int64MinusSels(xs, ys, rs, sels)
+func Int64SubSels(xs, ys, rs []int64, sels []int64) []int64 {
+	return int64SubSels(xs, ys, rs, sels)
 }
 
-func int64MinusSelsPure(xs, ys, rs []int64, sels []int64) []int64 {
+func int64SubSelsPure(xs, ys, rs []int64, sels []int64) []int64 {
 	for i, sel := range sels {
 		rs[i] = xs[sel] - ys[sel]
 	}
 	return rs
 }
 
-//func int64MinusSelsAvx2(xs, ys, rs []int64, sels []int64) []int64 {
+//func int64SubSelsAvx2(xs, ys, rs []int64, sels []int64) []int64 {
 //    const regItems = 32 / 8
 //	n := len(sels) / regItems
-//	int64MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
+//	int64SubSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func int64MinusSelsAvx512(xs, ys, rs []int64, sels []int64) []int64 {
+//func int64SubSelsAvx512(xs, ys, rs []int64, sels []int64) []int64 {
 //    const regItems = 64 / 8
 //	n := len(sels) / regItems
-//	int64MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
+//	int64SubSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Int64MinusScalar(x int64, ys, rs []int64) []int64 {
-	return int64MinusScalar(x, ys, rs)
+func Int64SubScalar(x int64, ys, rs []int64) []int64 {
+	return int64SubScalar(x, ys, rs)
 }
 
-func int64MinusScalarPure(x int64, ys, rs []int64) []int64 {
+func int64SubScalarPure(x int64, ys, rs []int64) []int64 {
 	for i, y := range ys {
 		rs[i] = x - y
 	}
 	return rs
 }
 
-func int64MinusScalarAvx2(x int64, ys, rs []int64) []int64 {
+func int64SubScalarAvx2(x int64, ys, rs []int64) []int64 {
     const regItems = 32 / 8
 	n := len(ys) / regItems
-	int64MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	int64SubScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func int64MinusScalarAvx512(x int64, ys, rs []int64) []int64 {
+func int64SubScalarAvx512(x int64, ys, rs []int64) []int64 {
     const regItems = 64 / 8
 	n := len(ys) / regItems
-	int64MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	int64SubScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func Int64MinusScalarSels(x int64, ys, rs []int64, sels []int64) []int64 {
-	return int64MinusScalarSels(x, ys, rs, sels)
+func Int64SubScalarSels(x int64, ys, rs []int64, sels []int64) []int64 {
+	return int64SubScalarSels(x, ys, rs, sels)
 }
 
-func int64MinusScalarSelsPure(x int64, ys, rs []int64, sels []int64) []int64 {
+func int64SubScalarSelsPure(x int64, ys, rs []int64, sels []int64) []int64 {
 	for i, sel := range sels {
 		rs[i] = x - ys[sel]
 	}
 	return rs
 }
 
-//func int64MinusScalarSelsAvx2(x int64, ys, rs []int64, sels []int64) []int64 {
+//func int64SubScalarSelsAvx2(x int64, ys, rs []int64, sels []int64) []int64 {
 //    const regItems = 32 / 8
 //	n := len(sels) / regItems
-//	int64MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	int64SubScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func int64MinusScalarSelsAvx512(x int64, ys, rs []int64, sels []int64) []int64 {
+//func int64SubScalarSelsAvx512(x int64, ys, rs []int64, sels []int64) []int64 {
 //    const regItems = 64 / 8
 //	n := len(sels) / regItems
-//	int64MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	int64SubScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Int64MinusByScalar(x int64, ys, rs []int64) []int64 {
-	return int64MinusByScalar(x, ys, rs)
+func Int64SubByScalar(x int64, ys, rs []int64) []int64 {
+	return int64SubByScalar(x, ys, rs)
 }
 
-func int64MinusByScalarPure(x int64, ys, rs []int64) []int64 {
+func int64SubByScalarPure(x int64, ys, rs []int64) []int64 {
 	for i, y := range ys {
 		rs[i] = y - x
 	}
 	return rs
 }
 
-func int64MinusByScalarAvx2(x int64, ys, rs []int64) []int64 {
+func int64SubByScalarAvx2(x int64, ys, rs []int64) []int64 {
     const regItems = 32 / 8
 	n := len(ys) / regItems
-	int64MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	int64SubByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func int64MinusByScalarAvx512(x int64, ys, rs []int64) []int64 {
+func int64SubByScalarAvx512(x int64, ys, rs []int64) []int64 {
     const regItems = 64 / 8
 	n := len(ys) / regItems
-	int64MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	int64SubByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func Int64MinusByScalarSels(x int64, ys, rs []int64, sels []int64) []int64 {
-	return int64MinusByScalarSels(x, ys, rs, sels)
+func Int64SubByScalarSels(x int64, ys, rs []int64, sels []int64) []int64 {
+	return int64SubByScalarSels(x, ys, rs, sels)
 }
 
-func int64MinusByScalarSelsPure(x int64, ys, rs []int64, sels []int64) []int64 {
+func int64SubByScalarSelsPure(x int64, ys, rs []int64, sels []int64) []int64 {
 	for i, sel := range sels {
 		rs[i] = ys[sel] - x
 	}
 	return rs
 }
 
-//func int64MinusByScalarSelsAvx2(x int64, ys, rs []int64, sels []int64) []int64 {
+//func int64SubByScalarSelsAvx2(x int64, ys, rs []int64, sels []int64) []int64 {
 //    const regItems = 32 / 8
 //	n := len(sels) / regItems
-//	int64MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	int64SubByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-//func int64MinusByScalarSelsAvx512(x int64, ys, rs []int64, sels []int64) []int64 {
+//func int64SubByScalarSelsAvx512(x int64, ys, rs []int64, sels []int64) []int64 {
 //    const regItems = 64 / 8
 //	n := len(sels) / regItems
-//	int64MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	int64SubByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-func Uint8Minus(xs, ys, rs []uint8) []uint8 {
-	return uint8Minus(xs, ys, rs)
+func Uint8Sub(xs, ys, rs []uint8) []uint8 {
+	return uint8Sub(xs, ys, rs)
 }
 
-func uint8MinusPure(xs, ys, rs []uint8) []uint8 {
+func uint8SubPure(xs, ys, rs []uint8) []uint8 {
 	for i, x := range xs {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func uint8MinusAvx2(xs, ys, rs []uint8) []uint8 {
+func uint8SubAvx2(xs, ys, rs []uint8) []uint8 {
     const regItems = 32 / 1
 	n := len(xs) / regItems
-	uint8MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	uint8SubAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func uint8MinusAvx512(xs, ys, rs []uint8) []uint8 {
+func uint8SubAvx512(xs, ys, rs []uint8) []uint8 {
     const regItems = 64 / 1
 	n := len(xs) / regItems
-	uint8MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	uint8SubAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func Uint8MinusSels(xs, ys, rs []uint8, sels []int64) []uint8 {
-	return uint8MinusSels(xs, ys, rs, sels)
+func Uint8SubSels(xs, ys, rs []uint8, sels []int64) []uint8 {
+	return uint8SubSels(xs, ys, rs, sels)
 }
 
-func uint8MinusSelsPure(xs, ys, rs []uint8, sels []int64) []uint8 {
+func uint8SubSelsPure(xs, ys, rs []uint8, sels []int64) []uint8 {
 	for i, sel := range sels {
 		rs[i] = xs[sel] - ys[sel]
 	}
 	return rs
 }
 
-//func uint8MinusSelsAvx2(xs, ys, rs []uint8, sels []int64) []uint8 {
+//func uint8SubSelsAvx2(xs, ys, rs []uint8, sels []int64) []uint8 {
 //    const regItems = 32 / 1
 //	n := len(sels) / regItems
-//	uint8MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
+//	uint8SubSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func uint8MinusSelsAvx512(xs, ys, rs []uint8, sels []int64) []uint8 {
+//func uint8SubSelsAvx512(xs, ys, rs []uint8, sels []int64) []uint8 {
 //    const regItems = 64 / 1
 //	n := len(sels) / regItems
-//	uint8MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
+//	uint8SubSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Uint8MinusScalar(x uint8, ys, rs []uint8) []uint8 {
-	return uint8MinusScalar(x, ys, rs)
+func Uint8SubScalar(x uint8, ys, rs []uint8) []uint8 {
+	return uint8SubScalar(x, ys, rs)
 }
 
-func uint8MinusScalarPure(x uint8, ys, rs []uint8) []uint8 {
+func uint8SubScalarPure(x uint8, ys, rs []uint8) []uint8 {
 	for i, y := range ys {
 		rs[i] = x - y
 	}
 	return rs
 }
 
-func uint8MinusScalarAvx2(x uint8, ys, rs []uint8) []uint8 {
+func uint8SubScalarAvx2(x uint8, ys, rs []uint8) []uint8 {
     const regItems = 32 / 1
 	n := len(ys) / regItems
-	uint8MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	uint8SubScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func uint8MinusScalarAvx512(x uint8, ys, rs []uint8) []uint8 {
+func uint8SubScalarAvx512(x uint8, ys, rs []uint8) []uint8 {
     const regItems = 64 / 1
 	n := len(ys) / regItems
-	uint8MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	uint8SubScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func Uint8MinusScalarSels(x uint8, ys, rs []uint8, sels []int64) []uint8 {
-	return uint8MinusScalarSels(x, ys, rs, sels)
+func Uint8SubScalarSels(x uint8, ys, rs []uint8, sels []int64) []uint8 {
+	return uint8SubScalarSels(x, ys, rs, sels)
 }
 
-func uint8MinusScalarSelsPure(x uint8, ys, rs []uint8, sels []int64) []uint8 {
+func uint8SubScalarSelsPure(x uint8, ys, rs []uint8, sels []int64) []uint8 {
 	for i, sel := range sels {
 		rs[i] = x - ys[sel]
 	}
 	return rs
 }
 
-//func uint8MinusScalarSelsAvx2(x uint8, ys, rs []uint8, sels []int64) []uint8 {
+//func uint8SubScalarSelsAvx2(x uint8, ys, rs []uint8, sels []int64) []uint8 {
 //    const regItems = 32 / 1
 //	n := len(sels) / regItems
-//	uint8MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	uint8SubScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func uint8MinusScalarSelsAvx512(x uint8, ys, rs []uint8, sels []int64) []uint8 {
+//func uint8SubScalarSelsAvx512(x uint8, ys, rs []uint8, sels []int64) []uint8 {
 //    const regItems = 64 / 1
 //	n := len(sels) / regItems
-//	uint8MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	uint8SubScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Uint8MinusByScalar(x uint8, ys, rs []uint8) []uint8 {
-	return uint8MinusByScalar(x, ys, rs)
+func Uint8SubByScalar(x uint8, ys, rs []uint8) []uint8 {
+	return uint8SubByScalar(x, ys, rs)
 }
 
-func uint8MinusByScalarPure(x uint8, ys, rs []uint8) []uint8 {
+func uint8SubByScalarPure(x uint8, ys, rs []uint8) []uint8 {
 	for i, y := range ys {
 		rs[i] = y - x
 	}
 	return rs
 }
 
-func uint8MinusByScalarAvx2(x uint8, ys, rs []uint8) []uint8 {
+func uint8SubByScalarAvx2(x uint8, ys, rs []uint8) []uint8 {
     const regItems = 32 / 1
 	n := len(ys) / regItems
-	uint8MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	uint8SubByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func uint8MinusByScalarAvx512(x uint8, ys, rs []uint8) []uint8 {
+func uint8SubByScalarAvx512(x uint8, ys, rs []uint8) []uint8 {
     const regItems = 64 / 1
 	n := len(ys) / regItems
-	uint8MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	uint8SubByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func Uint8MinusByScalarSels(x uint8, ys, rs []uint8, sels []int64) []uint8 {
-	return uint8MinusByScalarSels(x, ys, rs, sels)
+func Uint8SubByScalarSels(x uint8, ys, rs []uint8, sels []int64) []uint8 {
+	return uint8SubByScalarSels(x, ys, rs, sels)
 }
 
-func uint8MinusByScalarSelsPure(x uint8, ys, rs []uint8, sels []int64) []uint8 {
+func uint8SubByScalarSelsPure(x uint8, ys, rs []uint8, sels []int64) []uint8 {
 	for i, sel := range sels {
 		rs[i] = ys[sel] - x
 	}
 	return rs
 }
 
-//func uint8MinusByScalarSelsAvx2(x uint8, ys, rs []uint8, sels []int64) []uint8 {
+//func uint8SubByScalarSelsAvx2(x uint8, ys, rs []uint8, sels []int64) []uint8 {
 //    const regItems = 32 / 1
 //	n := len(sels) / regItems
-//	uint8MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	uint8SubByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-//func uint8MinusByScalarSelsAvx512(x uint8, ys, rs []uint8, sels []int64) []uint8 {
+//func uint8SubByScalarSelsAvx512(x uint8, ys, rs []uint8, sels []int64) []uint8 {
 //    const regItems = 64 / 1
 //	n := len(sels) / regItems
-//	uint8MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	uint8SubByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-func Uint16Minus(xs, ys, rs []uint16) []uint16 {
-	return uint16Minus(xs, ys, rs)
+func Uint16Sub(xs, ys, rs []uint16) []uint16 {
+	return uint16Sub(xs, ys, rs)
 }
 
-func uint16MinusPure(xs, ys, rs []uint16) []uint16 {
+func uint16SubPure(xs, ys, rs []uint16) []uint16 {
 	for i, x := range xs {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func uint16MinusAvx2(xs, ys, rs []uint16) []uint16 {
+func uint16SubAvx2(xs, ys, rs []uint16) []uint16 {
     const regItems = 32 / 2
 	n := len(xs) / regItems
-	uint16MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	uint16SubAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func uint16MinusAvx512(xs, ys, rs []uint16) []uint16 {
+func uint16SubAvx512(xs, ys, rs []uint16) []uint16 {
     const regItems = 64 / 2
 	n := len(xs) / regItems
-	uint16MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	uint16SubAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func Uint16MinusSels(xs, ys, rs []uint16, sels []int64) []uint16 {
-	return uint16MinusSels(xs, ys, rs, sels)
+func Uint16SubSels(xs, ys, rs []uint16, sels []int64) []uint16 {
+	return uint16SubSels(xs, ys, rs, sels)
 }
 
-func uint16MinusSelsPure(xs, ys, rs []uint16, sels []int64) []uint16 {
+func uint16SubSelsPure(xs, ys, rs []uint16, sels []int64) []uint16 {
 	for i, sel := range sels {
 		rs[i] = xs[sel] - ys[sel]
 	}
 	return rs
 }
 
-//func uint16MinusSelsAvx2(xs, ys, rs []uint16, sels []int64) []uint16 {
+//func uint16SubSelsAvx2(xs, ys, rs []uint16, sels []int64) []uint16 {
 //    const regItems = 32 / 2
 //	n := len(sels) / regItems
-//	uint16MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
+//	uint16SubSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func uint16MinusSelsAvx512(xs, ys, rs []uint16, sels []int64) []uint16 {
+//func uint16SubSelsAvx512(xs, ys, rs []uint16, sels []int64) []uint16 {
 //    const regItems = 64 / 2
 //	n := len(sels) / regItems
-//	uint16MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
+//	uint16SubSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Uint16MinusScalar(x uint16, ys, rs []uint16) []uint16 {
-	return uint16MinusScalar(x, ys, rs)
+func Uint16SubScalar(x uint16, ys, rs []uint16) []uint16 {
+	return uint16SubScalar(x, ys, rs)
 }
 
-func uint16MinusScalarPure(x uint16, ys, rs []uint16) []uint16 {
+func uint16SubScalarPure(x uint16, ys, rs []uint16) []uint16 {
 	for i, y := range ys {
 		rs[i] = x - y
 	}
 	return rs
 }
 
-func uint16MinusScalarAvx2(x uint16, ys, rs []uint16) []uint16 {
+func uint16SubScalarAvx2(x uint16, ys, rs []uint16) []uint16 {
     const regItems = 32 / 2
 	n := len(ys) / regItems
-	uint16MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	uint16SubScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func uint16MinusScalarAvx512(x uint16, ys, rs []uint16) []uint16 {
+func uint16SubScalarAvx512(x uint16, ys, rs []uint16) []uint16 {
     const regItems = 64 / 2
 	n := len(ys) / regItems
-	uint16MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	uint16SubScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func Uint16MinusScalarSels(x uint16, ys, rs []uint16, sels []int64) []uint16 {
-	return uint16MinusScalarSels(x, ys, rs, sels)
+func Uint16SubScalarSels(x uint16, ys, rs []uint16, sels []int64) []uint16 {
+	return uint16SubScalarSels(x, ys, rs, sels)
 }
 
-func uint16MinusScalarSelsPure(x uint16, ys, rs []uint16, sels []int64) []uint16 {
+func uint16SubScalarSelsPure(x uint16, ys, rs []uint16, sels []int64) []uint16 {
 	for i, sel := range sels {
 		rs[i] = x - ys[sel]
 	}
 	return rs
 }
 
-//func uint16MinusScalarSelsAvx2(x uint16, ys, rs []uint16, sels []int64) []uint16 {
+//func uint16SubScalarSelsAvx2(x uint16, ys, rs []uint16, sels []int64) []uint16 {
 //    const regItems = 32 / 2
 //	n := len(sels) / regItems
-//	uint16MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	uint16SubScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func uint16MinusScalarSelsAvx512(x uint16, ys, rs []uint16, sels []int64) []uint16 {
+//func uint16SubScalarSelsAvx512(x uint16, ys, rs []uint16, sels []int64) []uint16 {
 //    const regItems = 64 / 2
 //	n := len(sels) / regItems
-//	uint16MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	uint16SubScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Uint16MinusByScalar(x uint16, ys, rs []uint16) []uint16 {
-	return uint16MinusByScalar(x, ys, rs)
+func Uint16SubByScalar(x uint16, ys, rs []uint16) []uint16 {
+	return uint16SubByScalar(x, ys, rs)
 }
 
-func uint16MinusByScalarPure(x uint16, ys, rs []uint16) []uint16 {
+func uint16SubByScalarPure(x uint16, ys, rs []uint16) []uint16 {
 	for i, y := range ys {
 		rs[i] = y - x
 	}
 	return rs
 }
 
-func uint16MinusByScalarAvx2(x uint16, ys, rs []uint16) []uint16 {
+func uint16SubByScalarAvx2(x uint16, ys, rs []uint16) []uint16 {
     const regItems = 32 / 2
 	n := len(ys) / regItems
-	uint16MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	uint16SubByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func uint16MinusByScalarAvx512(x uint16, ys, rs []uint16) []uint16 {
+func uint16SubByScalarAvx512(x uint16, ys, rs []uint16) []uint16 {
     const regItems = 64 / 2
 	n := len(ys) / regItems
-	uint16MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	uint16SubByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func Uint16MinusByScalarSels(x uint16, ys, rs []uint16, sels []int64) []uint16 {
-	return uint16MinusByScalarSels(x, ys, rs, sels)
+func Uint16SubByScalarSels(x uint16, ys, rs []uint16, sels []int64) []uint16 {
+	return uint16SubByScalarSels(x, ys, rs, sels)
 }
 
-func uint16MinusByScalarSelsPure(x uint16, ys, rs []uint16, sels []int64) []uint16 {
+func uint16SubByScalarSelsPure(x uint16, ys, rs []uint16, sels []int64) []uint16 {
 	for i, sel := range sels {
 		rs[i] = ys[sel] - x
 	}
 	return rs
 }
 
-//func uint16MinusByScalarSelsAvx2(x uint16, ys, rs []uint16, sels []int64) []uint16 {
+//func uint16SubByScalarSelsAvx2(x uint16, ys, rs []uint16, sels []int64) []uint16 {
 //    const regItems = 32 / 2
 //	n := len(sels) / regItems
-//	uint16MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	uint16SubByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-//func uint16MinusByScalarSelsAvx512(x uint16, ys, rs []uint16, sels []int64) []uint16 {
+//func uint16SubByScalarSelsAvx512(x uint16, ys, rs []uint16, sels []int64) []uint16 {
 //    const regItems = 64 / 2
 //	n := len(sels) / regItems
-//	uint16MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	uint16SubByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-func Uint32Minus(xs, ys, rs []uint32) []uint32 {
-	return uint32Minus(xs, ys, rs)
+func Uint32Sub(xs, ys, rs []uint32) []uint32 {
+	return uint32Sub(xs, ys, rs)
 }
 
-func uint32MinusPure(xs, ys, rs []uint32) []uint32 {
+func uint32SubPure(xs, ys, rs []uint32) []uint32 {
 	for i, x := range xs {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func uint32MinusAvx2(xs, ys, rs []uint32) []uint32 {
+func uint32SubAvx2(xs, ys, rs []uint32) []uint32 {
     const regItems = 32 / 4
 	n := len(xs) / regItems
-	uint32MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	uint32SubAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func uint32MinusAvx512(xs, ys, rs []uint32) []uint32 {
+func uint32SubAvx512(xs, ys, rs []uint32) []uint32 {
     const regItems = 64 / 4
 	n := len(xs) / regItems
-	uint32MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	uint32SubAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func Uint32MinusSels(xs, ys, rs []uint32, sels []int64) []uint32 {
-	return uint32MinusSels(xs, ys, rs, sels)
+func Uint32SubSels(xs, ys, rs []uint32, sels []int64) []uint32 {
+	return uint32SubSels(xs, ys, rs, sels)
 }
 
-func uint32MinusSelsPure(xs, ys, rs []uint32, sels []int64) []uint32 {
+func uint32SubSelsPure(xs, ys, rs []uint32, sels []int64) []uint32 {
 	for i, sel := range sels {
 		rs[i] = xs[sel] - ys[sel]
 	}
 	return rs
 }
 
-//func uint32MinusSelsAvx2(xs, ys, rs []uint32, sels []int64) []uint32 {
+//func uint32SubSelsAvx2(xs, ys, rs []uint32, sels []int64) []uint32 {
 //    const regItems = 32 / 4
 //	n := len(sels) / regItems
-//	uint32MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
+//	uint32SubSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func uint32MinusSelsAvx512(xs, ys, rs []uint32, sels []int64) []uint32 {
+//func uint32SubSelsAvx512(xs, ys, rs []uint32, sels []int64) []uint32 {
 //    const regItems = 64 / 4
 //	n := len(sels) / regItems
-//	uint32MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
+//	uint32SubSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Uint32MinusScalar(x uint32, ys, rs []uint32) []uint32 {
-	return uint32MinusScalar(x, ys, rs)
+func Uint32SubScalar(x uint32, ys, rs []uint32) []uint32 {
+	return uint32SubScalar(x, ys, rs)
 }
 
-func uint32MinusScalarPure(x uint32, ys, rs []uint32) []uint32 {
+func uint32SubScalarPure(x uint32, ys, rs []uint32) []uint32 {
 	for i, y := range ys {
 		rs[i] = x - y
 	}
 	return rs
 }
 
-func uint32MinusScalarAvx2(x uint32, ys, rs []uint32) []uint32 {
+func uint32SubScalarAvx2(x uint32, ys, rs []uint32) []uint32 {
     const regItems = 32 / 4
 	n := len(ys) / regItems
-	uint32MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	uint32SubScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func uint32MinusScalarAvx512(x uint32, ys, rs []uint32) []uint32 {
+func uint32SubScalarAvx512(x uint32, ys, rs []uint32) []uint32 {
     const regItems = 64 / 4
 	n := len(ys) / regItems
-	uint32MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	uint32SubScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func Uint32MinusScalarSels(x uint32, ys, rs []uint32, sels []int64) []uint32 {
-	return uint32MinusScalarSels(x, ys, rs, sels)
+func Uint32SubScalarSels(x uint32, ys, rs []uint32, sels []int64) []uint32 {
+	return uint32SubScalarSels(x, ys, rs, sels)
 }
 
-func uint32MinusScalarSelsPure(x uint32, ys, rs []uint32, sels []int64) []uint32 {
+func uint32SubScalarSelsPure(x uint32, ys, rs []uint32, sels []int64) []uint32 {
 	for i, sel := range sels {
 		rs[i] = x - ys[sel]
 	}
 	return rs
 }
 
-//func uint32MinusScalarSelsAvx2(x uint32, ys, rs []uint32, sels []int64) []uint32 {
+//func uint32SubScalarSelsAvx2(x uint32, ys, rs []uint32, sels []int64) []uint32 {
 //    const regItems = 32 / 4
 //	n := len(sels) / regItems
-//	uint32MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	uint32SubScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func uint32MinusScalarSelsAvx512(x uint32, ys, rs []uint32, sels []int64) []uint32 {
+//func uint32SubScalarSelsAvx512(x uint32, ys, rs []uint32, sels []int64) []uint32 {
 //    const regItems = 64 / 4
 //	n := len(sels) / regItems
-//	uint32MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	uint32SubScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Uint32MinusByScalar(x uint32, ys, rs []uint32) []uint32 {
-	return uint32MinusByScalar(x, ys, rs)
+func Uint32SubByScalar(x uint32, ys, rs []uint32) []uint32 {
+	return uint32SubByScalar(x, ys, rs)
 }
 
-func uint32MinusByScalarPure(x uint32, ys, rs []uint32) []uint32 {
+func uint32SubByScalarPure(x uint32, ys, rs []uint32) []uint32 {
 	for i, y := range ys {
 		rs[i] = y - x
 	}
 	return rs
 }
 
-func uint32MinusByScalarAvx2(x uint32, ys, rs []uint32) []uint32 {
+func uint32SubByScalarAvx2(x uint32, ys, rs []uint32) []uint32 {
     const regItems = 32 / 4
 	n := len(ys) / regItems
-	uint32MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	uint32SubByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func uint32MinusByScalarAvx512(x uint32, ys, rs []uint32) []uint32 {
+func uint32SubByScalarAvx512(x uint32, ys, rs []uint32) []uint32 {
     const regItems = 64 / 4
 	n := len(ys) / regItems
-	uint32MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	uint32SubByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func Uint32MinusByScalarSels(x uint32, ys, rs []uint32, sels []int64) []uint32 {
-	return uint32MinusByScalarSels(x, ys, rs, sels)
+func Uint32SubByScalarSels(x uint32, ys, rs []uint32, sels []int64) []uint32 {
+	return uint32SubByScalarSels(x, ys, rs, sels)
 }
 
-func uint32MinusByScalarSelsPure(x uint32, ys, rs []uint32, sels []int64) []uint32 {
+func uint32SubByScalarSelsPure(x uint32, ys, rs []uint32, sels []int64) []uint32 {
 	for i, sel := range sels {
 		rs[i] = ys[sel] - x
 	}
 	return rs
 }
 
-//func uint32MinusByScalarSelsAvx2(x uint32, ys, rs []uint32, sels []int64) []uint32 {
+//func uint32SubByScalarSelsAvx2(x uint32, ys, rs []uint32, sels []int64) []uint32 {
 //    const regItems = 32 / 4
 //	n := len(sels) / regItems
-//	uint32MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	uint32SubByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-//func uint32MinusByScalarSelsAvx512(x uint32, ys, rs []uint32, sels []int64) []uint32 {
+//func uint32SubByScalarSelsAvx512(x uint32, ys, rs []uint32, sels []int64) []uint32 {
 //    const regItems = 64 / 4
 //	n := len(sels) / regItems
-//	uint32MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	uint32SubByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-func Uint64Minus(xs, ys, rs []uint64) []uint64 {
-	return uint64Minus(xs, ys, rs)
+func Uint64Sub(xs, ys, rs []uint64) []uint64 {
+	return uint64Sub(xs, ys, rs)
 }
 
-func uint64MinusPure(xs, ys, rs []uint64) []uint64 {
+func uint64SubPure(xs, ys, rs []uint64) []uint64 {
 	for i, x := range xs {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func uint64MinusAvx2(xs, ys, rs []uint64) []uint64 {
+func uint64SubAvx2(xs, ys, rs []uint64) []uint64 {
     const regItems = 32 / 8
 	n := len(xs) / regItems
-	uint64MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	uint64SubAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func uint64MinusAvx512(xs, ys, rs []uint64) []uint64 {
+func uint64SubAvx512(xs, ys, rs []uint64) []uint64 {
     const regItems = 64 / 8
 	n := len(xs) / regItems
-	uint64MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	uint64SubAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func Uint64MinusSels(xs, ys, rs []uint64, sels []int64) []uint64 {
-	return uint64MinusSels(xs, ys, rs, sels)
+func Uint64SubSels(xs, ys, rs []uint64, sels []int64) []uint64 {
+	return uint64SubSels(xs, ys, rs, sels)
 }
 
-func uint64MinusSelsPure(xs, ys, rs []uint64, sels []int64) []uint64 {
+func uint64SubSelsPure(xs, ys, rs []uint64, sels []int64) []uint64 {
 	for i, sel := range sels {
 		rs[i] = xs[sel] - ys[sel]
 	}
 	return rs
 }
 
-//func uint64MinusSelsAvx2(xs, ys, rs []uint64, sels []int64) []uint64 {
+//func uint64SubSelsAvx2(xs, ys, rs []uint64, sels []int64) []uint64 {
 //    const regItems = 32 / 8
 //	n := len(sels) / regItems
-//	uint64MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
+//	uint64SubSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func uint64MinusSelsAvx512(xs, ys, rs []uint64, sels []int64) []uint64 {
+//func uint64SubSelsAvx512(xs, ys, rs []uint64, sels []int64) []uint64 {
 //    const regItems = 64 / 8
 //	n := len(sels) / regItems
-//	uint64MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
+//	uint64SubSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Uint64MinusScalar(x uint64, ys, rs []uint64) []uint64 {
-	return uint64MinusScalar(x, ys, rs)
+func Uint64SubScalar(x uint64, ys, rs []uint64) []uint64 {
+	return uint64SubScalar(x, ys, rs)
 }
 
-func uint64MinusScalarPure(x uint64, ys, rs []uint64) []uint64 {
+func uint64SubScalarPure(x uint64, ys, rs []uint64) []uint64 {
 	for i, y := range ys {
 		rs[i] = x - y
 	}
 	return rs
 }
 
-func uint64MinusScalarAvx2(x uint64, ys, rs []uint64) []uint64 {
+func uint64SubScalarAvx2(x uint64, ys, rs []uint64) []uint64 {
     const regItems = 32 / 8
 	n := len(ys) / regItems
-	uint64MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	uint64SubScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func uint64MinusScalarAvx512(x uint64, ys, rs []uint64) []uint64 {
+func uint64SubScalarAvx512(x uint64, ys, rs []uint64) []uint64 {
     const regItems = 64 / 8
 	n := len(ys) / regItems
-	uint64MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	uint64SubScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func Uint64MinusScalarSels(x uint64, ys, rs []uint64, sels []int64) []uint64 {
-	return uint64MinusScalarSels(x, ys, rs, sels)
+func Uint64SubScalarSels(x uint64, ys, rs []uint64, sels []int64) []uint64 {
+	return uint64SubScalarSels(x, ys, rs, sels)
 }
 
-func uint64MinusScalarSelsPure(x uint64, ys, rs []uint64, sels []int64) []uint64 {
+func uint64SubScalarSelsPure(x uint64, ys, rs []uint64, sels []int64) []uint64 {
 	for i, sel := range sels {
 		rs[i] = x - ys[sel]
 	}
 	return rs
 }
 
-//func uint64MinusScalarSelsAvx2(x uint64, ys, rs []uint64, sels []int64) []uint64 {
+//func uint64SubScalarSelsAvx2(x uint64, ys, rs []uint64, sels []int64) []uint64 {
 //    const regItems = 32 / 8
 //	n := len(sels) / regItems
-//	uint64MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	uint64SubScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func uint64MinusScalarSelsAvx512(x uint64, ys, rs []uint64, sels []int64) []uint64 {
+//func uint64SubScalarSelsAvx512(x uint64, ys, rs []uint64, sels []int64) []uint64 {
 //    const regItems = 64 / 8
 //	n := len(sels) / regItems
-//	uint64MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	uint64SubScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Uint64MinusByScalar(x uint64, ys, rs []uint64) []uint64 {
-	return uint64MinusByScalar(x, ys, rs)
+func Uint64SubByScalar(x uint64, ys, rs []uint64) []uint64 {
+	return uint64SubByScalar(x, ys, rs)
 }
 
-func uint64MinusByScalarPure(x uint64, ys, rs []uint64) []uint64 {
+func uint64SubByScalarPure(x uint64, ys, rs []uint64) []uint64 {
 	for i, y := range ys {
 		rs[i] = y - x
 	}
 	return rs
 }
 
-func uint64MinusByScalarAvx2(x uint64, ys, rs []uint64) []uint64 {
+func uint64SubByScalarAvx2(x uint64, ys, rs []uint64) []uint64 {
     const regItems = 32 / 8
 	n := len(ys) / regItems
-	uint64MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	uint64SubByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func uint64MinusByScalarAvx512(x uint64, ys, rs []uint64) []uint64 {
+func uint64SubByScalarAvx512(x uint64, ys, rs []uint64) []uint64 {
     const regItems = 64 / 8
 	n := len(ys) / regItems
-	uint64MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	uint64SubByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func Uint64MinusByScalarSels(x uint64, ys, rs []uint64, sels []int64) []uint64 {
-	return uint64MinusByScalarSels(x, ys, rs, sels)
+func Uint64SubByScalarSels(x uint64, ys, rs []uint64, sels []int64) []uint64 {
+	return uint64SubByScalarSels(x, ys, rs, sels)
 }
 
-func uint64MinusByScalarSelsPure(x uint64, ys, rs []uint64, sels []int64) []uint64 {
+func uint64SubByScalarSelsPure(x uint64, ys, rs []uint64, sels []int64) []uint64 {
 	for i, sel := range sels {
 		rs[i] = ys[sel] - x
 	}
 	return rs
 }
 
-//func uint64MinusByScalarSelsAvx2(x uint64, ys, rs []uint64, sels []int64) []uint64 {
+//func uint64SubByScalarSelsAvx2(x uint64, ys, rs []uint64, sels []int64) []uint64 {
 //    const regItems = 32 / 8
 //	n := len(sels) / regItems
-//	uint64MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	uint64SubByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-//func uint64MinusByScalarSelsAvx512(x uint64, ys, rs []uint64, sels []int64) []uint64 {
+//func uint64SubByScalarSelsAvx512(x uint64, ys, rs []uint64, sels []int64) []uint64 {
 //    const regItems = 64 / 8
 //	n := len(sels) / regItems
-//	uint64MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	uint64SubByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-func Float32Minus(xs, ys, rs []float32) []float32 {
-	return float32Minus(xs, ys, rs)
+func Float32Sub(xs, ys, rs []float32) []float32 {
+	return float32Sub(xs, ys, rs)
 }
 
-func float32MinusPure(xs, ys, rs []float32) []float32 {
+func float32SubPure(xs, ys, rs []float32) []float32 {
 	for i, x := range xs {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func float32MinusAvx2(xs, ys, rs []float32) []float32 {
+func float32SubAvx2(xs, ys, rs []float32) []float32 {
     const regItems = 32 / 4
 	n := len(xs) / regItems
-	float32MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	float32SubAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func float32MinusAvx512(xs, ys, rs []float32) []float32 {
+func float32SubAvx512(xs, ys, rs []float32) []float32 {
     const regItems = 64 / 4
 	n := len(xs) / regItems
-	float32MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	float32SubAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func Float32MinusSels(xs, ys, rs []float32, sels []int64) []float32 {
-	return float32MinusSels(xs, ys, rs, sels)
+func Float32SubSels(xs, ys, rs []float32, sels []int64) []float32 {
+	return float32SubSels(xs, ys, rs, sels)
 }
 
-func float32MinusSelsPure(xs, ys, rs []float32, sels []int64) []float32 {
+func float32SubSelsPure(xs, ys, rs []float32, sels []int64) []float32 {
 	for i, sel := range sels {
 		rs[i] = xs[sel] - ys[sel]
 	}
 	return rs
 }
 
-//func float32MinusSelsAvx2(xs, ys, rs []float32, sels []int64) []float32 {
+//func float32SubSelsAvx2(xs, ys, rs []float32, sels []int64) []float32 {
 //    const regItems = 32 / 4
 //	n := len(sels) / regItems
-//	float32MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
+//	float32SubSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func float32MinusSelsAvx512(xs, ys, rs []float32, sels []int64) []float32 {
+//func float32SubSelsAvx512(xs, ys, rs []float32, sels []int64) []float32 {
 //    const regItems = 64 / 4
 //	n := len(sels) / regItems
-//	float32MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
+//	float32SubSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Float32MinusScalar(x float32, ys, rs []float32) []float32 {
-	return float32MinusScalar(x, ys, rs)
+func Float32SubScalar(x float32, ys, rs []float32) []float32 {
+	return float32SubScalar(x, ys, rs)
 }
 
-func float32MinusScalarPure(x float32, ys, rs []float32) []float32 {
+func float32SubScalarPure(x float32, ys, rs []float32) []float32 {
 	for i, y := range ys {
 		rs[i] = x - y
 	}
 	return rs
 }
 
-func float32MinusScalarAvx2(x float32, ys, rs []float32) []float32 {
+func float32SubScalarAvx2(x float32, ys, rs []float32) []float32 {
     const regItems = 32 / 4
 	n := len(ys) / regItems
-	float32MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	float32SubScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func float32MinusScalarAvx512(x float32, ys, rs []float32) []float32 {
+func float32SubScalarAvx512(x float32, ys, rs []float32) []float32 {
     const regItems = 64 / 4
 	n := len(ys) / regItems
-	float32MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	float32SubScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func Float32MinusScalarSels(x float32, ys, rs []float32, sels []int64) []float32 {
-	return float32MinusScalarSels(x, ys, rs, sels)
+func Float32SubScalarSels(x float32, ys, rs []float32, sels []int64) []float32 {
+	return float32SubScalarSels(x, ys, rs, sels)
 }
 
-func float32MinusScalarSelsPure(x float32, ys, rs []float32, sels []int64) []float32 {
+func float32SubScalarSelsPure(x float32, ys, rs []float32, sels []int64) []float32 {
 	for i, sel := range sels {
 		rs[i] = x - ys[sel]
 	}
 	return rs
 }
 
-//func float32MinusScalarSelsAvx2(x float32, ys, rs []float32, sels []int64) []float32 {
+//func float32SubScalarSelsAvx2(x float32, ys, rs []float32, sels []int64) []float32 {
 //    const regItems = 32 / 4
 //	n := len(sels) / regItems
-//	float32MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	float32SubScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func float32MinusScalarSelsAvx512(x float32, ys, rs []float32, sels []int64) []float32 {
+//func float32SubScalarSelsAvx512(x float32, ys, rs []float32, sels []int64) []float32 {
 //    const regItems = 64 / 4
 //	n := len(sels) / regItems
-//	float32MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	float32SubScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Float32MinusByScalar(x float32, ys, rs []float32) []float32 {
-	return float32MinusByScalar(x, ys, rs)
+func Float32SubByScalar(x float32, ys, rs []float32) []float32 {
+	return float32SubByScalar(x, ys, rs)
 }
 
-func float32MinusByScalarPure(x float32, ys, rs []float32) []float32 {
+func float32SubByScalarPure(x float32, ys, rs []float32) []float32 {
 	for i, y := range ys {
 		rs[i] = y - x
 	}
 	return rs
 }
 
-func float32MinusByScalarAvx2(x float32, ys, rs []float32) []float32 {
+func float32SubByScalarAvx2(x float32, ys, rs []float32) []float32 {
     const regItems = 32 / 4
 	n := len(ys) / regItems
-	float32MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	float32SubByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func float32MinusByScalarAvx512(x float32, ys, rs []float32) []float32 {
+func float32SubByScalarAvx512(x float32, ys, rs []float32) []float32 {
     const regItems = 64 / 4
 	n := len(ys) / regItems
-	float32MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	float32SubByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func Float32MinusByScalarSels(x float32, ys, rs []float32, sels []int64) []float32 {
-	return float32MinusByScalarSels(x, ys, rs, sels)
+func Float32SubByScalarSels(x float32, ys, rs []float32, sels []int64) []float32 {
+	return float32SubByScalarSels(x, ys, rs, sels)
 }
 
-func float32MinusByScalarSelsPure(x float32, ys, rs []float32, sels []int64) []float32 {
+func float32SubByScalarSelsPure(x float32, ys, rs []float32, sels []int64) []float32 {
 	for i, sel := range sels {
 		rs[i] = ys[sel] - x
 	}
 	return rs
 }
 
-//func float32MinusByScalarSelsAvx2(x float32, ys, rs []float32, sels []int64) []float32 {
+//func float32SubByScalarSelsAvx2(x float32, ys, rs []float32, sels []int64) []float32 {
 //    const regItems = 32 / 4
 //	n := len(sels) / regItems
-//	float32MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	float32SubByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-//func float32MinusByScalarSelsAvx512(x float32, ys, rs []float32, sels []int64) []float32 {
+//func float32SubByScalarSelsAvx512(x float32, ys, rs []float32, sels []int64) []float32 {
 //    const regItems = 64 / 4
 //	n := len(sels) / regItems
-//	float32MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	float32SubByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-func Float64Minus(xs, ys, rs []float64) []float64 {
-	return float64Minus(xs, ys, rs)
+func Float64Sub(xs, ys, rs []float64) []float64 {
+	return float64Sub(xs, ys, rs)
 }
 
-func float64MinusPure(xs, ys, rs []float64) []float64 {
+func float64SubPure(xs, ys, rs []float64) []float64 {
 	for i, x := range xs {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func float64MinusAvx2(xs, ys, rs []float64) []float64 {
+func float64SubAvx2(xs, ys, rs []float64) []float64 {
     const regItems = 32 / 8
 	n := len(xs) / regItems
-	float64MinusAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	float64SubAvx2Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func float64MinusAvx512(xs, ys, rs []float64) []float64 {
+func float64SubAvx512(xs, ys, rs []float64) []float64 {
     const regItems = 64 / 8
 	n := len(xs) / regItems
-	float64MinusAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
+	float64SubAvx512Asm(xs[:n*regItems], ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = xs[i] - ys[i]
 	}
 	return rs
 }
 
-func Float64MinusSels(xs, ys, rs []float64, sels []int64) []float64 {
-	return float64MinusSels(xs, ys, rs, sels)
+func Float64SubSels(xs, ys, rs []float64, sels []int64) []float64 {
+	return float64SubSels(xs, ys, rs, sels)
 }
 
-func float64MinusSelsPure(xs, ys, rs []float64, sels []int64) []float64 {
+func float64SubSelsPure(xs, ys, rs []float64, sels []int64) []float64 {
 	for i, sel := range sels {
 		rs[i] = xs[sel] - ys[sel]
 	}
 	return rs
 }
 
-//func float64MinusSelsAvx2(xs, ys, rs []float64, sels []int64) []float64 {
+//func float64SubSelsAvx2(xs, ys, rs []float64, sels []int64) []float64 {
 //    const regItems = 32 / 8
 //	n := len(sels) / regItems
-//	float64MinusSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
+//	float64SubSelsAvx2Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func float64MinusSelsAvx512(xs, ys, rs []float64, sels []int64) []float64 {
+//func float64SubSelsAvx512(xs, ys, rs []float64, sels []int64) []float64 {
 //    const regItems = 64 / 8
 //	n := len(sels) / regItems
-//	float64MinusSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
+//	float64SubSelsAvx512Asm(xs, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = xs[sels[i]] - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Float64MinusScalar(x float64, ys, rs []float64) []float64 {
-	return float64MinusScalar(x, ys, rs)
+func Float64SubScalar(x float64, ys, rs []float64) []float64 {
+	return float64SubScalar(x, ys, rs)
 }
 
-func float64MinusScalarPure(x float64, ys, rs []float64) []float64 {
+func float64SubScalarPure(x float64, ys, rs []float64) []float64 {
 	for i, y := range ys {
 		rs[i] = x - y
 	}
 	return rs
 }
 
-func float64MinusScalarAvx2(x float64, ys, rs []float64) []float64 {
+func float64SubScalarAvx2(x float64, ys, rs []float64) []float64 {
     const regItems = 32 / 8
 	n := len(ys) / regItems
-	float64MinusScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	float64SubScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func float64MinusScalarAvx512(x float64, ys, rs []float64) []float64 {
+func float64SubScalarAvx512(x float64, ys, rs []float64) []float64 {
     const regItems = 64 / 8
 	n := len(ys) / regItems
-	float64MinusScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	float64SubScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = x - ys[i]
 	}
 	return rs
 }
 
-func Float64MinusScalarSels(x float64, ys, rs []float64, sels []int64) []float64 {
-	return float64MinusScalarSels(x, ys, rs, sels)
+func Float64SubScalarSels(x float64, ys, rs []float64, sels []int64) []float64 {
+	return float64SubScalarSels(x, ys, rs, sels)
 }
 
-func float64MinusScalarSelsPure(x float64, ys, rs []float64, sels []int64) []float64 {
+func float64SubScalarSelsPure(x float64, ys, rs []float64, sels []int64) []float64 {
 	for i, sel := range sels {
 		rs[i] = x - ys[sel]
 	}
 	return rs
 }
 
-//func float64MinusScalarSelsAvx2(x float64, ys, rs []float64, sels []int64) []float64 {
+//func float64SubScalarSelsAvx2(x float64, ys, rs []float64, sels []int64) []float64 {
 //    const regItems = 32 / 8
 //	n := len(sels) / regItems
-//	float64MinusScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	float64SubScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-//func float64MinusScalarSelsAvx512(x float64, ys, rs []float64, sels []int64) []float64 {
+//func float64SubScalarSelsAvx512(x float64, ys, rs []float64, sels []int64) []float64 {
 //    const regItems = 64 / 8
 //	n := len(sels) / regItems
-//	float64MinusScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	float64SubScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = x - ys[sels[i]]
 //	}
 //	return rs
 //}
 
-func Float64MinusByScalar(x float64, ys, rs []float64) []float64 {
-	return float64MinusByScalar(x, ys, rs)
+func Float64SubByScalar(x float64, ys, rs []float64) []float64 {
+	return float64SubByScalar(x, ys, rs)
 }
 
-func float64MinusByScalarPure(x float64, ys, rs []float64) []float64 {
+func float64SubByScalarPure(x float64, ys, rs []float64) []float64 {
 	for i, y := range ys {
 		rs[i] = y - x
 	}
 	return rs
 }
 
-func float64MinusByScalarAvx2(x float64, ys, rs []float64) []float64 {
+func float64SubByScalarAvx2(x float64, ys, rs []float64) []float64 {
     const regItems = 32 / 8
 	n := len(ys) / regItems
-	float64MinusByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
+	float64SubByScalarAvx2Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func float64MinusByScalarAvx512(x float64, ys, rs []float64) []float64 {
+func float64SubByScalarAvx512(x float64, ys, rs []float64) []float64 {
     const regItems = 64 / 8
 	n := len(ys) / regItems
-	float64MinusByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
+	float64SubByScalarAvx512Asm(x, ys[:n*regItems], rs[:n*regItems])
 	for i, j := n * regItems, len(xs); i < j; i++ {
 		rs[i] = ys[i] - x
 	}
 	return rs
 }
 
-func Float64MinusByScalarSels(x float64, ys, rs []float64, sels []int64) []float64 {
-	return float64MinusByScalarSels(x, ys, rs, sels)
+func Float64SubByScalarSels(x float64, ys, rs []float64, sels []int64) []float64 {
+	return float64SubByScalarSels(x, ys, rs, sels)
 }
 
-func float64MinusByScalarSelsPure(x float64, ys, rs []float64, sels []int64) []float64 {
+func float64SubByScalarSelsPure(x float64, ys, rs []float64, sels []int64) []float64 {
 	for i, sel := range sels {
 		rs[i] = ys[sel] - x
 	}
 	return rs
 }
 
-//func float64MinusByScalarSelsAvx2(x float64, ys, rs []float64, sels []int64) []float64 {
+//func float64SubByScalarSelsAvx2(x float64, ys, rs []float64, sels []int64) []float64 {
 //    const regItems = 32 / 8
 //	n := len(sels) / regItems
-//	float64MinusByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
+//	float64SubByScalarSelsAvx2Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}
 //	return rs
 //}
 
-//func float64MinusByScalarSelsAvx512(x float64, ys, rs []float64, sels []int64) []float64 {
+//func float64SubByScalarSelsAvx512(x float64, ys, rs []float64, sels []int64) []float64 {
 //    const regItems = 64 / 8
 //	n := len(sels) / regItems
-//	float64MinusByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
+//	float64SubByScalarSelsAvx512Asm(x, ys, rs, sels[:n*regItems])
 //	for i, j := n * regItems, len(sels); i < j; i++ {
 //		rs[i] = ys[sels[i]] - x
 //	}