diff --git a/pkg/vectorize/div/div.go b/pkg/vectorize/div/div.go
index 359cea06a499e6f7ef36c6c2fca6d302d4d7a0a8..031b7b60e8598709baf341a7a833ed9d8706ef77 100644
--- a/pkg/vectorize/div/div.go
+++ b/pkg/vectorize/div/div.go
@@ -1,149 +1,787 @@
 package div
 
 var (
-	i64DivOne     func(int64, []int64, []int64) []int64
-	i64DivOneBy   func(int64, []int64, []int64) []int64
-	i64Div        func([]int64, []int64, []int64) []int64
-	i64DivOneSels func(int64, []int64, []int64, []int64) []int64
-	i64DivSels    func([]int64, []int64, []int64, []int64) []int64
-
-	f64DivOne     func(float64, []float64, []float64) []float64
-	f64DivOneBy   func(float64, []float64, []float64) []float64
-	f64Div        func([]float64, []float64, []float64) []float64
-	f64DivOneSels func(float64, []float64, []float64, []int64) []float64
-	f64DivSels    func([]float64, []float64, []float64, []int64) []float64
+	int8Div                func([]int8, []int8, []int8) []int8
+	int8DivSels            func([]int8, []int8, []int8, []int64) []int8
+	int8DivScalar          func(int8, []int8, []int8) []int8
+	int8DivScalarSels      func(int8, []int8, []int8, []int64) []int8
+	int8DivByScalar        func(int8, []int8, []int8) []int8
+	int8DivByScalarSels    func(int8, []int8, []int8, []int64) []int8
+	int16Div               func([]int16, []int16, []int16) []int16
+	int16DivSels           func([]int16, []int16, []int16, []int64) []int16
+	int16DivScalar         func(int16, []int16, []int16) []int16
+	int16DivScalarSels     func(int16, []int16, []int16, []int64) []int16
+	int16DivByScalar       func(int16, []int16, []int16) []int16
+	int16DivByScalarSels   func(int16, []int16, []int16, []int64) []int16
+	int32Div               func([]int32, []int32, []int32) []int32
+	int32DivSels           func([]int32, []int32, []int32, []int64) []int32
+	int32DivScalar         func(int32, []int32, []int32) []int32
+	int32DivScalarSels     func(int32, []int32, []int32, []int64) []int32
+	int32DivByScalar       func(int32, []int32, []int32) []int32
+	int32DivByScalarSels   func(int32, []int32, []int32, []int64) []int32
+	int64Div               func([]int64, []int64, []int64) []int64
+	int64DivSels           func([]int64, []int64, []int64, []int64) []int64
+	int64DivScalar         func(int64, []int64, []int64) []int64
+	int64DivScalarSels     func(int64, []int64, []int64, []int64) []int64
+	int64DivByScalar       func(int64, []int64, []int64) []int64
+	int64DivByScalarSels   func(int64, []int64, []int64, []int64) []int64
+	uint8Div               func([]uint8, []uint8, []uint8) []uint8
+	uint8DivSels           func([]uint8, []uint8, []uint8, []int64) []uint8
+	uint8DivScalar         func(uint8, []uint8, []uint8) []uint8
+	uint8DivScalarSels     func(uint8, []uint8, []uint8, []int64) []uint8
+	uint8DivByScalar       func(uint8, []uint8, []uint8) []uint8
+	uint8DivByScalarSels   func(uint8, []uint8, []uint8, []int64) []uint8
+	uint16Div              func([]uint16, []uint16, []uint16) []uint16
+	uint16DivSels          func([]uint16, []uint16, []uint16, []int64) []uint16
+	uint16DivScalar        func(uint16, []uint16, []uint16) []uint16
+	uint16DivScalarSels    func(uint16, []uint16, []uint16, []int64) []uint16
+	uint16DivByScalar      func(uint16, []uint16, []uint16) []uint16
+	uint16DivByScalarSels  func(uint16, []uint16, []uint16, []int64) []uint16
+	uint32Div              func([]uint32, []uint32, []uint32) []uint32
+	uint32DivSels          func([]uint32, []uint32, []uint32, []int64) []uint32
+	uint32DivScalar        func(uint32, []uint32, []uint32) []uint32
+	uint32DivScalarSels    func(uint32, []uint32, []uint32, []int64) []uint32
+	uint32DivByScalar      func(uint32, []uint32, []uint32) []uint32
+	uint32DivByScalarSels  func(uint32, []uint32, []uint32, []int64) []uint32
+	uint64Div              func([]uint64, []uint64, []uint64) []uint64
+	uint64DivSels          func([]uint64, []uint64, []uint64, []int64) []uint64
+	uint64DivScalar        func(uint64, []uint64, []uint64) []uint64
+	uint64DivScalarSels    func(uint64, []uint64, []uint64, []int64) []uint64
+	uint64DivByScalar      func(uint64, []uint64, []uint64) []uint64
+	uint64DivByScalarSels  func(uint64, []uint64, []uint64, []int64) []uint64
+	float32Div             func([]float32, []float32, []float32) []float32
+	float32DivSels         func([]float32, []float32, []float32, []int64) []float32
+	float32DivScalar       func(float32, []float32, []float32) []float32
+	float32DivScalarSels   func(float32, []float32, []float32, []int64) []float32
+	float32DivByScalar     func(float32, []float32, []float32) []float32
+	float32DivByScalarSels func(float32, []float32, []float32, []int64) []float32
+	float64Div             func([]float64, []float64, []float64) []float64
+	float64DivSels         func([]float64, []float64, []float64, []int64) []float64
+	float64DivScalar       func(float64, []float64, []float64) []float64
+	float64DivScalarSels   func(float64, []float64, []float64, []int64) []float64
+	float64DivByScalar     func(float64, []float64, []float64) []float64
+	float64DivByScalarSels func(float64, []float64, []float64, []int64) []float64
 )
 
 func init() {
-	i64Div = i64DivPure
-	i64DivOne = i64DivOnePure
-	i64DivOneBy = i64DivOneByPure
-	i64DivSels = i64DivSelsPure
-	i64DivOneSels = i64DivOneSelsPure
+	int8Div = int8DivPure
+	int8DivSels = int8DivSelsPure
+	int8DivScalar = int8DivScalarPure
+	int8DivScalarSels = int8DivScalarSelsPure
+	int8DivByScalar = int8DivByScalarPure
+	int8DivByScalarSels = int8DivByScalarSelsPure
+	int16Div = int16DivPure
+	int16DivSels = int16DivSelsPure
+	int16DivScalar = int16DivScalarPure
+	int16DivScalarSels = int16DivScalarSelsPure
+	int16DivByScalar = int16DivByScalarPure
+	int16DivByScalarSels = int16DivByScalarSelsPure
+	int32Div = int32DivPure
+	int32DivSels = int32DivSelsPure
+	int32DivScalar = int32DivScalarPure
+	int32DivScalarSels = int32DivScalarSelsPure
+	int32DivByScalar = int32DivByScalarPure
+	int32DivByScalarSels = int32DivByScalarSelsPure
+	int64Div = int64DivPure
+	int64DivSels = int64DivSelsPure
+	int64DivScalar = int64DivScalarPure
+	int64DivScalarSels = int64DivScalarSelsPure
+	int64DivByScalar = int64DivByScalarPure
+	int64DivByScalarSels = int64DivByScalarSelsPure
+	uint8Div = uint8DivPure
+	uint8DivSels = uint8DivSelsPure
+	uint8DivScalar = uint8DivScalarPure
+	uint8DivScalarSels = uint8DivScalarSelsPure
+	uint8DivByScalar = uint8DivByScalarPure
+	uint8DivByScalarSels = uint8DivByScalarSelsPure
+	uint16Div = uint16DivPure
+	uint16DivSels = uint16DivSelsPure
+	uint16DivScalar = uint16DivScalarPure
+	uint16DivScalarSels = uint16DivScalarSelsPure
+	uint16DivByScalar = uint16DivByScalarPure
+	uint16DivByScalarSels = uint16DivByScalarSelsPure
+	uint32Div = uint32DivPure
+	uint32DivSels = uint32DivSelsPure
+	uint32DivScalar = uint32DivScalarPure
+	uint32DivScalarSels = uint32DivScalarSelsPure
+	uint32DivByScalar = uint32DivByScalarPure
+	uint32DivByScalarSels = uint32DivByScalarSelsPure
+	uint64Div = uint64DivPure
+	uint64DivSels = uint64DivSelsPure
+	uint64DivScalar = uint64DivScalarPure
+	uint64DivScalarSels = uint64DivScalarSelsPure
+	uint64DivByScalar = uint64DivByScalarPure
+	uint64DivByScalarSels = uint64DivByScalarSelsPure
+	float32Div = float32DivPure
+	float32DivSels = float32DivSelsPure
+	float32DivScalar = float32DivScalarPure
+	float32DivScalarSels = float32DivScalarSelsPure
+	float32DivByScalar = float32DivByScalarPure
+	float32DivByScalarSels = float32DivByScalarSelsPure
+	float64Div = float64DivPure
+	float64DivSels = float64DivSelsPure
+	float64DivScalar = float64DivScalarPure
+	float64DivScalarSels = float64DivScalarSelsPure
+	float64DivByScalar = float64DivByScalarPure
+	float64DivByScalarSels = float64DivByScalarSelsPure
+}
 
-	f64Div = f64DivPure
-	f64DivOne = f64DivOnePure
-	i64DivOneBy = i64DivOneByPure
-	f64DivSels = f64DivSelsPure
-	f64DivOneSels = f64DivOneSelsPure
+func Int8Div(xs, ys, rs []int8) []int8 {
+	return int8Div(xs, ys, rs)
 }
 
-// check if 0 exists before the function call
-func I64Div(xs, ys, rs []int64) []int64 {
-	return i64Div(xs, ys, rs)
+func int8DivPure(xs, ys, rs []int8) []int8 {
+	for i, x := range xs {
+		rs[i] = x / ys[i]
+	}
+	return rs
+}
+
+func Int8DivSels(xs, ys, rs []int8, sels []int64) []int8 {
+	return int8DivSels(xs, ys, rs, sels)
+}
+
+func int8DivSelsPure(xs, ys, rs []int8, sels []int64) []int8 {
+	for i, sel := range sels {
+		rs[i] = xs[sel] / ys[sel]
+	}
+	return rs
 }
 
-// check if 0 exists before the function call
-func I64DivOne(x int64, ys, rs []int64) []int64 {
-	return i64DivOnePure(x, ys, rs)
+func Int8DivScalar(x int8, ys, rs []int8) []int8 {
+	return int8DivScalar(x, ys, rs)
 }
 
-// check if 0 exists before the function call
-func I64DivOneBy(x int64, ys, rs []int64) []int64 {
-	return i64DivOneByPure(x, ys, rs)
+func int8DivScalarPure(x int8, ys, rs []int8) []int8 {
+	for i, y := range ys {
+		rs[i] = x / y
+	}
+	return rs
 }
 
-// check if 0 exists before the function call
-func I64DivSels(xs, ys, rs []int64, sels []int64) []int64 {
-	return i64DivSels(xs, ys, rs, sels)
+func Int8DivScalarSels(x int8, ys, rs []int8, sels []int64) []int8 {
+	return int8DivScalarSels(x, ys, rs, sels)
 }
 
-// check if 0 exists before the function call
-func I64DivOneSels(x int64, ys, rs []int64, sels []int64) []int64 {
-	return i64DivOneSels(x, ys, rs, sels)
+func int8DivScalarSelsPure(x int8, ys, rs []int8, sels []int64) []int8 {
+	for i, sel := range sels {
+		rs[i] = x / ys[sel]
+	}
+	return rs
 }
 
-// check if 0 exists before the function call
-func F64Div(xs, ys, rs []float64) []float64 {
-	return f64Div(xs, ys, rs)
+func Int8DivByScalar(x int8, ys, rs []int8) []int8 {
+	return int8DivByScalar(x, ys, rs)
 }
 
-// check if 0 exists before the function call
-func F64DivOne(x float64, ys, rs []float64) []float64 {
-	return f64DivOnePure(x, ys, rs)
+func int8DivByScalarPure(x int8, ys, rs []int8) []int8 {
+	for i, y := range ys {
+		rs[i] = y / x
+	}
+	return rs
 }
 
-// check if 0 exists before the function call
-func F64DivOneBy(x float64, ys, rs []float64) []float64 {
-	return f64DivOneByPure(x, ys, rs)
+func Int8DivByScalarSels(x int8, ys, rs []int8, sels []int64) []int8 {
+	return int8DivByScalarSels(x, ys, rs, sels)
 }
 
-// check if 0 exists before the function call
-func F64DivSels(xs, ys, rs []float64, sels []int64) []float64 {
-	return f64DivSels(xs, ys, rs, sels)
+func int8DivByScalarSelsPure(x int8, ys, rs []int8, sels []int64) []int8 {
+	for i, sel := range sels {
+		rs[i] = ys[sel] / x
+	}
+	return rs
 }
 
-// check if 0 exists before the function call
-func F64DivOneSels(x float64, ys, rs []float64, sels []int64) []float64 {
-	return f64DivOneSels(x, ys, rs, sels)
+func Int16Div(xs, ys, rs []int16) []int16 {
+	return int16Div(xs, ys, rs)
 }
 
-func i64DivPure(xs, ys, rs []int64) []int64 {
+func int16DivPure(xs, ys, rs []int16) []int16 {
 	for i, x := range xs {
 		rs[i] = x / ys[i]
 	}
 	return rs
 }
 
-func i64DivOnePure(x int64, ys, rs []int64) []int64 {
+func Int16DivSels(xs, ys, rs []int16, sels []int64) []int16 {
+	return int16DivSels(xs, ys, rs, sels)
+}
+
+func int16DivSelsPure(xs, ys, rs []int16, sels []int64) []int16 {
+	for i, sel := range sels {
+		rs[i] = xs[sel] / ys[sel]
+	}
+	return rs
+}
+
+func Int16DivScalar(x int16, ys, rs []int16) []int16 {
+	return int16DivScalar(x, ys, rs)
+}
+
+func int16DivScalarPure(x int16, ys, rs []int16) []int16 {
 	for i, y := range ys {
 		rs[i] = x / y
 	}
 	return rs
 }
 
-func i64DivOneByPure(x int64, ys, rs []int64) []int64 {
+func Int16DivScalarSels(x int16, ys, rs []int16, sels []int64) []int16 {
+	return int16DivScalarSels(x, ys, rs, sels)
+}
+
+func int16DivScalarSelsPure(x int16, ys, rs []int16, sels []int64) []int16 {
+	for i, sel := range sels {
+		rs[i] = x / ys[sel]
+	}
+	return rs
+}
+
+func Int16DivByScalar(x int16, ys, rs []int16) []int16 {
+	return int16DivByScalar(x, ys, rs)
+}
+
+func int16DivByScalarPure(x int16, ys, rs []int16) []int16 {
 	for i, y := range ys {
 		rs[i] = y / x
 	}
 	return rs
 }
 
-func i64DivOneSelsPure(x int64, ys, rs []int64, sels []int64) []int64 {
-	for _, sel := range sels {
-		rs[sel] = x / ys[sel]
+func Int16DivByScalarSels(x int16, ys, rs []int16, sels []int64) []int16 {
+	return int16DivByScalarSels(x, ys, rs, sels)
+}
+
+func int16DivByScalarSelsPure(x int16, ys, rs []int16, sels []int64) []int16 {
+	for i, sel := range sels {
+		rs[i] = ys[sel] / x
+	}
+	return rs
+}
+
+func Int32Div(xs, ys, rs []int32) []int32 {
+	return int32Div(xs, ys, rs)
+}
+
+func int32DivPure(xs, ys, rs []int32) []int32 {
+	for i, x := range xs {
+		rs[i] = x / ys[i]
+	}
+	return rs
+}
+
+func Int32DivSels(xs, ys, rs []int32, sels []int64) []int32 {
+	return int32DivSels(xs, ys, rs, sels)
+}
+
+func int32DivSelsPure(xs, ys, rs []int32, sels []int64) []int32 {
+	for i, sel := range sels {
+		rs[i] = xs[sel] / ys[sel]
+	}
+	return rs
+}
+
+func Int32DivScalar(x int32, ys, rs []int32) []int32 {
+	return int32DivScalar(x, ys, rs)
+}
+
+func int32DivScalarPure(x int32, ys, rs []int32) []int32 {
+	for i, y := range ys {
+		rs[i] = x / y
+	}
+	return rs
+}
+
+func Int32DivScalarSels(x int32, ys, rs []int32, sels []int64) []int32 {
+	return int32DivScalarSels(x, ys, rs, sels)
+}
+
+func int32DivScalarSelsPure(x int32, ys, rs []int32, sels []int64) []int32 {
+	for i, sel := range sels {
+		rs[i] = x / ys[sel]
+	}
+	return rs
+}
+
+func Int32DivByScalar(x int32, ys, rs []int32) []int32 {
+	return int32DivByScalar(x, ys, rs)
+}
+
+func int32DivByScalarPure(x int32, ys, rs []int32) []int32 {
+	for i, y := range ys {
+		rs[i] = y / x
+	}
+	return rs
+}
+
+func Int32DivByScalarSels(x int32, ys, rs []int32, sels []int64) []int32 {
+	return int32DivByScalarSels(x, ys, rs, sels)
+}
+
+func int32DivByScalarSelsPure(x int32, ys, rs []int32, sels []int64) []int32 {
+	for i, sel := range sels {
+		rs[i] = ys[sel] / x
+	}
+	return rs
+}
+
+func Int64Div(xs, ys, rs []int64) []int64 {
+	return int64Div(xs, ys, rs)
+}
+
+func int64DivPure(xs, ys, rs []int64) []int64 {
+	for i, x := range xs {
+		rs[i] = x / ys[i]
+	}
+	return rs
+}
+
+func Int64DivSels(xs, ys, rs []int64, sels []int64) []int64 {
+	return int64DivSels(xs, ys, rs, sels)
+}
+
+func int64DivSelsPure(xs, ys, rs []int64, sels []int64) []int64 {
+	for i, sel := range sels {
+		rs[i] = xs[sel] / ys[sel]
+	}
+	return rs
+}
+
+func Int64DivScalar(x int64, ys, rs []int64) []int64 {
+	return int64DivScalar(x, ys, rs)
+}
+
+func int64DivScalarPure(x int64, ys, rs []int64) []int64 {
+	for i, y := range ys {
+		rs[i] = x / y
+	}
+	return rs
+}
+
+func Int64DivScalarSels(x int64, ys, rs []int64, sels []int64) []int64 {
+	return int64DivScalarSels(x, ys, rs, sels)
+}
+
+func int64DivScalarSelsPure(x int64, ys, rs []int64, sels []int64) []int64 {
+	for i, sel := range sels {
+		rs[i] = x / ys[sel]
 	}
 	return rs
 }
 
-func i64DivSelsPure(xs, ys, rs []int64, sels []int64) []int64 {
-	for _, sel := range sels {
-		rs[sel] = xs[sel] / ys[sel]
+func Int64DivByScalar(x int64, ys, rs []int64) []int64 {
+	return int64DivByScalar(x, ys, rs)
+}
+
+func int64DivByScalarPure(x int64, ys, rs []int64) []int64 {
+	for i, y := range ys {
+		rs[i] = y / x
+	}
+	return rs
+}
+
+func Int64DivByScalarSels(x int64, ys, rs []int64, sels []int64) []int64 {
+	return int64DivByScalarSels(x, ys, rs, sels)
+}
+
+func int64DivByScalarSelsPure(x int64, ys, rs []int64, sels []int64) []int64 {
+	for i, sel := range sels {
+		rs[i] = ys[sel] / x
 	}
 	return rs
 }
 
-func f64DivPure(xs, ys, rs []float64) []float64 {
+func Uint8Div(xs, ys, rs []uint8) []uint8 {
+	return uint8Div(xs, ys, rs)
+}
+
+func uint8DivPure(xs, ys, rs []uint8) []uint8 {
 	for i, x := range xs {
 		rs[i] = x / ys[i]
 	}
 	return rs
 }
 
-func f64DivOnePure(x float64, ys, rs []float64) []float64 {
+func Uint8DivSels(xs, ys, rs []uint8, sels []int64) []uint8 {
+	return uint8DivSels(xs, ys, rs, sels)
+}
+
+func uint8DivSelsPure(xs, ys, rs []uint8, sels []int64) []uint8 {
+	for i, sel := range sels {
+		rs[i] = xs[sel] / ys[sel]
+	}
+	return rs
+}
+
+func Uint8DivScalar(x uint8, ys, rs []uint8) []uint8 {
+	return uint8DivScalar(x, ys, rs)
+}
+
+func uint8DivScalarPure(x uint8, ys, rs []uint8) []uint8 {
 	for i, y := range ys {
 		rs[i] = x / y
 	}
 	return rs
 }
 
-func f64DivOneByPure(x float64, ys, rs []float64) []float64 {
+func Uint8DivScalarSels(x uint8, ys, rs []uint8, sels []int64) []uint8 {
+	return uint8DivScalarSels(x, ys, rs, sels)
+}
+
+func uint8DivScalarSelsPure(x uint8, ys, rs []uint8, sels []int64) []uint8 {
+	for i, sel := range sels {
+		rs[i] = x / ys[sel]
+	}
+	return rs
+}
+
+func Uint8DivByScalar(x uint8, ys, rs []uint8) []uint8 {
+	return uint8DivByScalar(x, ys, rs)
+}
+
+func uint8DivByScalarPure(x uint8, ys, rs []uint8) []uint8 {
 	for i, y := range ys {
 		rs[i] = y / x
 	}
 	return rs
 }
 
-func f64DivOneSelsPure(x float64, ys, rs []float64, sels []int64) []float64 {
-	for _, sel := range sels {
-		rs[sel] = x / ys[sel]
+func Uint8DivByScalarSels(x uint8, ys, rs []uint8, sels []int64) []uint8 {
+	return uint8DivByScalarSels(x, ys, rs, sels)
+}
+
+func uint8DivByScalarSelsPure(x uint8, ys, rs []uint8, sels []int64) []uint8 {
+	for i, sel := range sels {
+		rs[i] = ys[sel] / x
+	}
+	return rs
+}
+
+func Uint16Div(xs, ys, rs []uint16) []uint16 {
+	return uint16Div(xs, ys, rs)
+}
+
+func uint16DivPure(xs, ys, rs []uint16) []uint16 {
+	for i, x := range xs {
+		rs[i] = x / ys[i]
 	}
 	return rs
 }
 
-func f64DivSelsPure(xs, ys, rs []float64, sels []int64) []float64 {
-	for _, sel := range sels {
-		rs[sel] = xs[sel] / ys[sel]
+func Uint16DivSels(xs, ys, rs []uint16, sels []int64) []uint16 {
+	return uint16DivSels(xs, ys, rs, sels)
+}
+
+func uint16DivSelsPure(xs, ys, rs []uint16, sels []int64) []uint16 {
+	for i, sel := range sels {
+		rs[i] = xs[sel] / ys[sel]
+	}
+	return rs
+}
+
+func Uint16DivScalar(x uint16, ys, rs []uint16) []uint16 {
+	return uint16DivScalar(x, ys, rs)
+}
+
+func uint16DivScalarPure(x uint16, ys, rs []uint16) []uint16 {
+	for i, y := range ys {
+		rs[i] = x / y
+	}
+	return rs
+}
+
+func Uint16DivScalarSels(x uint16, ys, rs []uint16, sels []int64) []uint16 {
+	return uint16DivScalarSels(x, ys, rs, sels)
+}
+
+func uint16DivScalarSelsPure(x uint16, ys, rs []uint16, sels []int64) []uint16 {
+	for i, sel := range sels {
+		rs[i] = x / ys[sel]
+	}
+	return rs
+}
+
+func Uint16DivByScalar(x uint16, ys, rs []uint16) []uint16 {
+	return uint16DivByScalar(x, ys, rs)
+}
+
+func uint16DivByScalarPure(x uint16, ys, rs []uint16) []uint16 {
+	for i, y := range ys {
+		rs[i] = y / x
+	}
+	return rs
+}
+
+func Uint16DivByScalarSels(x uint16, ys, rs []uint16, sels []int64) []uint16 {
+	return uint16DivByScalarSels(x, ys, rs, sels)
+}
+
+func uint16DivByScalarSelsPure(x uint16, ys, rs []uint16, sels []int64) []uint16 {
+	for i, sel := range sels {
+		rs[i] = ys[sel] / x
+	}
+	return rs
+}
+
+func Uint32Div(xs, ys, rs []uint32) []uint32 {
+	return uint32Div(xs, ys, rs)
+}
+
+func uint32DivPure(xs, ys, rs []uint32) []uint32 {
+	for i, x := range xs {
+		rs[i] = x / ys[i]
+	}
+	return rs
+}
+
+func Uint32DivSels(xs, ys, rs []uint32, sels []int64) []uint32 {
+	return uint32DivSels(xs, ys, rs, sels)
+}
+
+func uint32DivSelsPure(xs, ys, rs []uint32, sels []int64) []uint32 {
+	for i, sel := range sels {
+		rs[i] = xs[sel] / ys[sel]
+	}
+	return rs
+}
+
+func Uint32DivScalar(x uint32, ys, rs []uint32) []uint32 {
+	return uint32DivScalar(x, ys, rs)
+}
+
+func uint32DivScalarPure(x uint32, ys, rs []uint32) []uint32 {
+	for i, y := range ys {
+		rs[i] = x / y
+	}
+	return rs
+}
+
+func Uint32DivScalarSels(x uint32, ys, rs []uint32, sels []int64) []uint32 {
+	return uint32DivScalarSels(x, ys, rs, sels)
+}
+
+func uint32DivScalarSelsPure(x uint32, ys, rs []uint32, sels []int64) []uint32 {
+	for i, sel := range sels {
+		rs[i] = x / ys[sel]
+	}
+	return rs
+}
+
+func Uint32DivByScalar(x uint32, ys, rs []uint32) []uint32 {
+	return uint32DivByScalar(x, ys, rs)
+}
+
+func uint32DivByScalarPure(x uint32, ys, rs []uint32) []uint32 {
+	for i, y := range ys {
+		rs[i] = y / x
+	}
+	return rs
+}
+
+func Uint32DivByScalarSels(x uint32, ys, rs []uint32, sels []int64) []uint32 {
+	return uint32DivByScalarSels(x, ys, rs, sels)
+}
+
+func uint32DivByScalarSelsPure(x uint32, ys, rs []uint32, sels []int64) []uint32 {
+	for i, sel := range sels {
+		rs[i] = ys[sel] / x
+	}
+	return rs
+}
+
+func Uint64Div(xs, ys, rs []uint64) []uint64 {
+	return uint64Div(xs, ys, rs)
+}
+
+func uint64DivPure(xs, ys, rs []uint64) []uint64 {
+	for i, x := range xs {
+		rs[i] = x / ys[i]
+	}
+	return rs
+}
+
+func Uint64DivSels(xs, ys, rs []uint64, sels []int64) []uint64 {
+	return uint64DivSels(xs, ys, rs, sels)
+}
+
+func uint64DivSelsPure(xs, ys, rs []uint64, sels []int64) []uint64 {
+	for i, sel := range sels {
+		rs[i] = xs[sel] / ys[sel]
+	}
+	return rs
+}
+
+func Uint64DivScalar(x uint64, ys, rs []uint64) []uint64 {
+	return uint64DivScalar(x, ys, rs)
+}
+
+func uint64DivScalarPure(x uint64, ys, rs []uint64) []uint64 {
+	for i, y := range ys {
+		rs[i] = x / y
+	}
+	return rs
+}
+
+func Uint64DivScalarSels(x uint64, ys, rs []uint64, sels []int64) []uint64 {
+	return uint64DivScalarSels(x, ys, rs, sels)
+}
+
+func uint64DivScalarSelsPure(x uint64, ys, rs []uint64, sels []int64) []uint64 {
+	for i, sel := range sels {
+		rs[i] = x / ys[sel]
+	}
+	return rs
+}
+
+func Uint64DivByScalar(x uint64, ys, rs []uint64) []uint64 {
+	return uint64DivByScalar(x, ys, rs)
+}
+
+func uint64DivByScalarPure(x uint64, ys, rs []uint64) []uint64 {
+	for i, y := range ys {
+		rs[i] = y / x
+	}
+	return rs
+}
+
+func Uint64DivByScalarSels(x uint64, ys, rs []uint64, sels []int64) []uint64 {
+	return uint64DivByScalarSels(x, ys, rs, sels)
+}
+
+func uint64DivByScalarSelsPure(x uint64, ys, rs []uint64, sels []int64) []uint64 {
+	for i, sel := range sels {
+		rs[i] = ys[sel] / x
+	}
+	return rs
+}
+
+func Float32Div(xs, ys, rs []float32) []float32 {
+	return float32Div(xs, ys, rs)
+}
+
+func float32DivPure(xs, ys, rs []float32) []float32 {
+	for i, x := range xs {
+		rs[i] = x / ys[i]
+	}
+	return rs
+}
+
+func Float32DivSels(xs, ys, rs []float32, sels []int64) []float32 {
+	return float32DivSels(xs, ys, rs, sels)
+}
+
+func float32DivSelsPure(xs, ys, rs []float32, sels []int64) []float32 {
+	for i, sel := range sels {
+		rs[i] = xs[sel] / ys[sel]
+	}
+	return rs
+}
+
+func Float32DivScalar(x float32, ys, rs []float32) []float32 {
+	return float32DivScalar(x, ys, rs)
+}
+
+func float32DivScalarPure(x float32, ys, rs []float32) []float32 {
+	for i, y := range ys {
+		rs[i] = x / y
+	}
+	return rs
+}
+
+func Float32DivScalarSels(x float32, ys, rs []float32, sels []int64) []float32 {
+	return float32DivScalarSels(x, ys, rs, sels)
+}
+
+func float32DivScalarSelsPure(x float32, ys, rs []float32, sels []int64) []float32 {
+	for i, sel := range sels {
+		rs[i] = x / ys[sel]
+	}
+	return rs
+}
+
+func Float32DivByScalar(x float32, ys, rs []float32) []float32 {
+	return float32DivByScalar(x, ys, rs)
+}
+
+func float32DivByScalarPure(x float32, ys, rs []float32) []float32 {
+	for i, y := range ys {
+		rs[i] = y / x
+	}
+	return rs
+}
+
+func Float32DivByScalarSels(x float32, ys, rs []float32, sels []int64) []float32 {
+	return float32DivByScalarSels(x, ys, rs, sels)
+}
+
+func float32DivByScalarSelsPure(x float32, ys, rs []float32, sels []int64) []float32 {
+	for i, sel := range sels {
+		rs[i] = ys[sel] / x
+	}
+	return rs
+}
+
+func Float64Div(xs, ys, rs []float64) []float64 {
+	return float64Div(xs, ys, rs)
+}
+
+func float64DivPure(xs, ys, rs []float64) []float64 {
+	for i, x := range xs {
+		rs[i] = x / ys[i]
+	}
+	return rs
+}
+
+func Float64DivSels(xs, ys, rs []float64, sels []int64) []float64 {
+	return float64DivSels(xs, ys, rs, sels)
+}
+
+func float64DivSelsPure(xs, ys, rs []float64, sels []int64) []float64 {
+	for i, sel := range sels {
+		rs[i] = xs[sel] / ys[sel]
+	}
+	return rs
+}
+
+func Float64DivScalar(x float64, ys, rs []float64) []float64 {
+	return float64DivScalar(x, ys, rs)
+}
+
+func float64DivScalarPure(x float64, ys, rs []float64) []float64 {
+	for i, y := range ys {
+		rs[i] = x / y
+	}
+	return rs
+}
+
+func Float64DivScalarSels(x float64, ys, rs []float64, sels []int64) []float64 {
+	return float64DivScalarSels(x, ys, rs, sels)
+}
+
+func float64DivScalarSelsPure(x float64, ys, rs []float64, sels []int64) []float64 {
+	for i, sel := range sels {
+		rs[i] = x / ys[sel]
+	}
+	return rs
+}
+
+func Float64DivByScalar(x float64, ys, rs []float64) []float64 {
+	return float64DivByScalar(x, ys, rs)
+}
+
+func float64DivByScalarPure(x float64, ys, rs []float64) []float64 {
+	for i, y := range ys {
+		rs[i] = y / x
+	}
+	return rs
+}
+
+func Float64DivByScalarSels(x float64, ys, rs []float64, sels []int64) []float64 {
+	return float64DivByScalarSels(x, ys, rs, sels)
+}
+
+func float64DivByScalarSelsPure(x float64, ys, rs []float64, sels []int64) []float64 {
+	for i, sel := range sels {
+		rs[i] = ys[sel] / x
 	}
 	return rs
 }
diff --git a/pkg/vectorize/mod/mod.go b/pkg/vectorize/mod/mod.go
index 24eb0754b4df5ae638669616a2ac142e8bf6c5bb..e00d6f435868b74471ee5fd2a9b42a1dd6c84be8 100644
--- a/pkg/vectorize/mod/mod.go
+++ b/pkg/vectorize/mod/mod.go
@@ -1,153 +1,423 @@
 package mod
 
-import (
-	"math"
-)
-
 var (
-	i64ModOne     func(int64, []int64, []int64) []int64
-	i64ModOneBy   func(int64, []int64, []int64) []int64
-	i64Mod        func([]int64, []int64, []int64) []int64
-	i64ModOneSels func(int64, []int64, []int64, []int64) []int64
-	i64ModSels    func([]int64, []int64, []int64, []int64) []int64
-
-	f64ModOne     func(float64, []float64, []float64) []float64
-	f64ModOneBy   func(float64, []float64, []float64) []float64
-	f64Mod        func([]float64, []float64, []float64) []float64
-	f64ModOneSels func(float64, []float64, []float64, []int64) []float64
-	f64ModSels    func([]float64, []float64, []float64, []int64) []float64
+	int8Mod             func([]int8, []int8, []int8) []int8 // each hook in this block is called by its exported wrapper; init assigns the Pure version
+	int8ModSels         func([]int8, []int8, []int8, []int64) []int8
+	int8ModScalar       func(int8, []int8, []int8) []int8
+	int8ModScalarSels   func(int8, []int8, []int8, []int64) []int8
+	int16Mod            func([]int16, []int16, []int16) []int16
+	int16ModSels        func([]int16, []int16, []int16, []int64) []int16
+	int16ModScalar      func(int16, []int16, []int16) []int16
+	int16ModScalarSels  func(int16, []int16, []int16, []int64) []int16
+	int32Mod            func([]int32, []int32, []int32) []int32
+	int32ModSels        func([]int32, []int32, []int32, []int64) []int32
+	int32ModScalar      func(int32, []int32, []int32) []int32
+	int32ModScalarSels  func(int32, []int32, []int32, []int64) []int32
+	int64Mod            func([]int64, []int64, []int64) []int64
+	int64ModSels        func([]int64, []int64, []int64, []int64) []int64
+	int64ModScalar      func(int64, []int64, []int64) []int64
+	int64ModScalarSels  func(int64, []int64, []int64, []int64) []int64
+	uint8Mod            func([]uint8, []uint8, []uint8) []uint8
+	uint8ModSels        func([]uint8, []uint8, []uint8, []int64) []uint8
+	uint8ModScalar      func(uint8, []uint8, []uint8) []uint8
+	uint8ModScalarSels  func(uint8, []uint8, []uint8, []int64) []uint8
+	uint16Mod           func([]uint16, []uint16, []uint16) []uint16
+	uint16ModSels       func([]uint16, []uint16, []uint16, []int64) []uint16
+	uint16ModScalar     func(uint16, []uint16, []uint16) []uint16
+	uint16ModScalarSels func(uint16, []uint16, []uint16, []int64) []uint16
+	uint32Mod           func([]uint32, []uint32, []uint32) []uint32
+	uint32ModSels       func([]uint32, []uint32, []uint32, []int64) []uint32
+	uint32ModScalar     func(uint32, []uint32, []uint32) []uint32
+	uint32ModScalarSels func(uint32, []uint32, []uint32, []int64) []uint32
+	uint64Mod           func([]uint64, []uint64, []uint64) []uint64
+	uint64ModSels       func([]uint64, []uint64, []uint64, []int64) []uint64
+	uint64ModScalar     func(uint64, []uint64, []uint64) []uint64
+	uint64ModScalarSels func(uint64, []uint64, []uint64, []int64) []uint64
 )
 
 func init() {
-	i64Mod = i64ModPure
-	i64ModOne = i64ModOnePure
-	i64ModOneBy = i64ModOneByPure
-	i64ModSels = i64ModSelsPure
-	i64ModOneSels = i64ModOneSelsPure
+	int8Mod = int8ModPure // every hook is bound to its pure-Go implementation below
+	int8ModSels = int8ModSelsPure
+	int8ModScalar = int8ModScalarPure
+	int8ModScalarSels = int8ModScalarSelsPure
+	int16Mod = int16ModPure
+	int16ModSels = int16ModSelsPure
+	int16ModScalar = int16ModScalarPure
+	int16ModScalarSels = int16ModScalarSelsPure
+	int32Mod = int32ModPure
+	int32ModSels = int32ModSelsPure
+	int32ModScalar = int32ModScalarPure
+	int32ModScalarSels = int32ModScalarSelsPure
+	int64Mod = int64ModPure
+	int64ModSels = int64ModSelsPure
+	int64ModScalar = int64ModScalarPure
+	int64ModScalarSels = int64ModScalarSelsPure
+	uint8Mod = uint8ModPure
+	uint8ModSels = uint8ModSelsPure
+	uint8ModScalar = uint8ModScalarPure
+	uint8ModScalarSels = uint8ModScalarSelsPure
+	uint16Mod = uint16ModPure
+	uint16ModSels = uint16ModSelsPure
+	uint16ModScalar = uint16ModScalarPure
+	uint16ModScalarSels = uint16ModScalarSelsPure
+	uint32Mod = uint32ModPure
+	uint32ModSels = uint32ModSelsPure
+	uint32ModScalar = uint32ModScalarPure
+	uint32ModScalarSels = uint32ModScalarSelsPure
+	uint64Mod = uint64ModPure
+	uint64ModSels = uint64ModSelsPure
+	uint64ModScalar = uint64ModScalarPure
+	uint64ModScalarSels = uint64ModScalarSelsPure
+}
+
+func Int8Mod(xs, ys, rs []int8) []int8 { // forwards to the int8Mod hook; init binds it to int8ModPure
+	return int8Mod(xs, ys, rs)
+}
+
+func int8ModPure(xs, ys, rs []int8) []int8 { // rs[k] = xs[k] % ys[k]; a zero divisor panics
+	for k := range xs {
+		rs[k] = xs[k] % ys[k]
+	}
+	return rs
+}
+
+func Int8ModSels(xs, ys, rs []int8, sels []int64) []int8 { // forwards to the int8ModSels hook; init binds it to int8ModSelsPure
+	return int8ModSels(xs, ys, rs, sels)
+}
+
+func int8ModSelsPure(xs, ys, rs []int8, sels []int64) []int8 { // rs[k] = xs[sels[k]] % ys[sels[k]], packed at k; a zero divisor panics
+	for k, src := range sels {
+		rs[k] = xs[src] % ys[src]
+	}
+	return rs
+}
+
+func Int8ModScalar(x int8, ys, rs []int8) []int8 { // forwards to the int8ModScalar hook; init binds it to int8ModScalarPure
+	return int8ModScalar(x, ys, rs)
+}
+
+func int8ModScalarPure(x int8, ys, rs []int8) []int8 { // rs[k] = x % ys[k]; a zero divisor panics
+	for k := range ys {
+		rs[k] = x % ys[k]
+	}
+	return rs
+}
 
-	f64Mod = f64ModPure
-	f64ModOne = f64ModOnePure
-	i64ModOneBy = i64ModOneByPure
-	f64ModSels = f64ModSelsPure
-	f64ModOneSels = f64ModOneSelsPure
+func Int8ModScalarSels(x int8, ys, rs []int8, sels []int64) []int8 { // forwards to the int8ModScalarSels hook; init binds it to int8ModScalarSelsPure
+	return int8ModScalarSels(x, ys, rs, sels)
 }
 
-// check if 0 exists before the function call
-func I64Mod(xs, ys, rs []int64) []int64 {
-	return i64Mod(xs, ys, rs)
+func int8ModScalarSelsPure(x int8, ys, rs []int8, sels []int64) []int8 { // rs[k] = x % ys[sels[k]], packed at k; a zero divisor panics
+	for k, src := range sels {
+		rs[k] = x % ys[src]
+	}
+	return rs
 }
 
-// check if 0 exists before the function call
-func I64ModOne(x int64, ys, rs []int64) []int64 {
-	return i64ModOnePure(x, ys, rs)
+func Int16Mod(xs, ys, rs []int16) []int16 { // forwards to the int16Mod hook; init binds it to int16ModPure
+	return int16Mod(xs, ys, rs)
 }
 
-// check if 0 exists before the function call
-func I64ModOneBy(x int64, ys, rs []int64) []int64 {
-	return i64ModOneByPure(x, ys, rs)
+func int16ModPure(xs, ys, rs []int16) []int16 { // rs[k] = xs[k] % ys[k]; a zero divisor panics
+	for k := range xs {
+		rs[k] = xs[k] % ys[k]
+	}
+	return rs
 }
 
-// check if 0 exists before the function call
-func I64ModSels(xs, ys, rs []int64, sels []int64) []int64 {
-	return i64ModSels(xs, ys, rs, sels)
+func Int16ModSels(xs, ys, rs []int16, sels []int64) []int16 { // forwards to the int16ModSels hook; init binds it to int16ModSelsPure
+	return int16ModSels(xs, ys, rs, sels)
 }
 
-// check if 0 exists before the function call
-func I64ModOneSels(x int64, ys, rs []int64, sels []int64) []int64 {
-	return i64ModOneSels(x, ys, rs, sels)
+func int16ModSelsPure(xs, ys, rs []int16, sels []int64) []int16 { // rs[k] = xs[sels[k]] % ys[sels[k]], packed at k; a zero divisor panics
+	for k, src := range sels {
+		rs[k] = xs[src] % ys[src]
+	}
+	return rs
 }
 
-// check if 0 exists before the function call
-func F64Mod(xs, ys, rs []float64) []float64 {
-	return f64Mod(xs, ys, rs)
+func Int16ModScalar(x int16, ys, rs []int16) []int16 { // forwards to the int16ModScalar hook; init binds it to int16ModScalarPure
+	return int16ModScalar(x, ys, rs)
 }
 
-// check if 0 exists before the function call
-func F64ModOne(x float64, ys, rs []float64) []float64 {
-	return f64ModOnePure(x, ys, rs)
+func int16ModScalarPure(x int16, ys, rs []int16) []int16 { // rs[k] = x % ys[k]; a zero divisor panics
+	for k := range ys {
+		rs[k] = x % ys[k]
+	}
+	return rs
 }
 
-// check if 0 exists before the function call
-func F64ModOneBy(x float64, ys, rs []float64) []float64 {
-	return f64ModOneByPure(x, ys, rs)
+func Int16ModScalarSels(x int16, ys, rs []int16, sels []int64) []int16 { // forwards to the int16ModScalarSels hook; init binds it to int16ModScalarSelsPure
+	return int16ModScalarSels(x, ys, rs, sels)
 }
 
-// check if 0 exists before the function call
-func F64ModSels(xs, ys, rs []float64, sels []int64) []float64 {
-	return f64ModSels(xs, ys, rs, sels)
+func int16ModScalarSelsPure(x int16, ys, rs []int16, sels []int64) []int16 { // rs[k] = x % ys[sels[k]], packed at k; a zero divisor panics
+	for k, src := range sels {
+		rs[k] = x % ys[src]
+	}
+	return rs
 }
 
-// check if 0 exists before the function call
-func F64ModOneSels(x float64, ys, rs []float64, sels []int64) []float64 {
-	return f64ModOneSels(x, ys, rs, sels)
+func Int32Mod(xs, ys, rs []int32) []int32 { // forwards to the int32Mod hook; init binds it to int32ModPure
+	return int32Mod(xs, ys, rs)
 }
 
-func i64ModPure(xs, ys, rs []int64) []int64 {
+func int32ModPure(xs, ys, rs []int32) []int32 { // rs[i] = xs[i] % ys[i] for each index i of xs; a zero divisor panics
 	for i, x := range xs {
 		rs[i] = x % ys[i]
 	}
 	return rs
 }
 
-func i64ModOnePure(x int64, ys, rs []int64) []int64 {
+func Int32ModSels(xs, ys, rs []int32, sels []int64) []int32 { // forwards to the int32ModSels hook; init binds it to int32ModSelsPure
+	return int32ModSels(xs, ys, rs, sels)
+}
+
+func int32ModSelsPure(xs, ys, rs []int32, sels []int64) []int32 { // rs[k] = xs[sels[k]] % ys[sels[k]], packed at k; a zero divisor panics
+	for k, src := range sels {
+		rs[k] = xs[src] % ys[src]
+	}
+	return rs
+}
+
+func Int32ModScalar(x int32, ys, rs []int32) []int32 { // forwards to the int32ModScalar hook; init binds it to int32ModScalarPure
+	return int32ModScalar(x, ys, rs)
+}
+
+func int32ModScalarPure(x int32, ys, rs []int32) []int32 { // rs[i] = x % ys[i] for each index i of ys; a zero divisor panics
 	for i, y := range ys {
 		rs[i] = x % y
 	}
 	return rs
 }
 
-func i64ModOneByPure(x int64, ys, rs []int64) []int64 {
+func Int32ModScalarSels(x int32, ys, rs []int32, sels []int64) []int32 { // forwards to the int32ModScalarSels hook; init binds it to int32ModScalarSelsPure
+	return int32ModScalarSels(x, ys, rs, sels)
+}
+
+func int32ModScalarSelsPure(x int32, ys, rs []int32, sels []int64) []int32 { // rs[k] = x % ys[sels[k]], packed at k; a zero divisor panics
+	for k, src := range sels {
+		rs[k] = x % ys[src]
+	}
+	return rs
+}
+
+func Int64Mod(xs, ys, rs []int64) []int64 { // forwards to the int64Mod hook; init binds it to int64ModPure
+	return int64Mod(xs, ys, rs)
+}
+
+func int64ModPure(xs, ys, rs []int64) []int64 { // rs[k] = xs[k] % ys[k]; a zero divisor panics
+	for k := range xs {
+		rs[k] = xs[k] % ys[k]
+	}
+	return rs
+}
+
+func Int64ModSels(xs, ys, rs []int64, sels []int64) []int64 { // forwards to the int64ModSels hook; init binds it to int64ModSelsPure
+	return int64ModSels(xs, ys, rs, sels)
+}
+
+func int64ModSelsPure(xs, ys, rs []int64, sels []int64) []int64 { // rs[k] = xs[sels[k]] % ys[sels[k]], packed at k; a zero divisor panics
+	for k, src := range sels {
+		rs[k] = xs[src] % ys[src]
+	}
+	return rs
+}
+
+func Int64ModScalar(x int64, ys, rs []int64) []int64 { // forwards to the int64ModScalar hook; init binds it to int64ModScalarPure
+	return int64ModScalar(x, ys, rs)
+}
+
+func int64ModScalarPure(x int64, ys, rs []int64) []int64 { // rs[i] = x % ys[i] (scalar is the dividend) for each index i of ys; a zero divisor panics
 	for i, y := range ys {
-		rs[i] = y % x
+		rs[i] = x % y
 	}
 	return rs
 }
 
-func i64ModOneSelsPure(x int64, ys, rs []int64, sels []int64) []int64 {
-	for _, sel := range sels {
-		rs[sel] = x % ys[sel]
+func Int64ModScalarSels(x int64, ys, rs []int64, sels []int64) []int64 { // forwards to the int64ModScalarSels hook; init binds it to int64ModScalarSelsPure
+	return int64ModScalarSels(x, ys, rs, sels)
+}
+
+func int64ModScalarSelsPure(x int64, ys, rs []int64, sels []int64) []int64 { // rs[k] = x % ys[sels[k]], packed at k; a zero divisor panics
+	for k, src := range sels {
+		rs[k] = x % ys[src]
 	}
 	return rs
 }
 
-func i64ModSelsPure(xs, ys, rs []int64, sels []int64) []int64 {
-	for _, sel := range sels {
-		rs[sel] = xs[sel] % ys[sel]
+func Uint8Mod(xs, ys, rs []uint8) []uint8 { // forwards to the uint8Mod hook; init binds it to uint8ModPure
+	return uint8Mod(xs, ys, rs)
+}
+
+func uint8ModPure(xs, ys, rs []uint8) []uint8 { // rs[k] = xs[k] % ys[k]; a zero divisor panics
+	for k := range xs {
+		rs[k] = xs[k] % ys[k]
 	}
 	return rs
 }
 
-func f64ModPure(xs, ys, rs []float64) []float64 {
+func Uint8ModSels(xs, ys, rs []uint8, sels []int64) []uint8 { // forwards to the uint8ModSels hook; init binds it to uint8ModSelsPure
+	return uint8ModSels(xs, ys, rs, sels)
+}
+
+func uint8ModSelsPure(xs, ys, rs []uint8, sels []int64) []uint8 { // rs[k] = xs[sels[k]] % ys[sels[k]], packed at k; a zero divisor panics
+	for k, src := range sels {
+		rs[k] = xs[src] % ys[src]
+	}
+	return rs
+}
+
+func Uint8ModScalar(x uint8, ys, rs []uint8) []uint8 { // forwards to the uint8ModScalar hook; init binds it to uint8ModScalarPure
+	return uint8ModScalar(x, ys, rs)
+}
+
+func uint8ModScalarPure(x uint8, ys, rs []uint8) []uint8 { // rs[k] = x % ys[k]; a zero divisor panics
+	for k := range ys {
+		rs[k] = x % ys[k]
+	}
+	return rs
+}
+
+func Uint8ModScalarSels(x uint8, ys, rs []uint8, sels []int64) []uint8 { // forwards to the uint8ModScalarSels hook; init binds it to uint8ModScalarSelsPure
+	return uint8ModScalarSels(x, ys, rs, sels)
+}
+
+func uint8ModScalarSelsPure(x uint8, ys, rs []uint8, sels []int64) []uint8 { // rs[k] = x % ys[sels[k]], packed at k; a zero divisor panics
+	for k, src := range sels {
+		rs[k] = x % ys[src]
+	}
+	return rs
+}
+
+func Uint16Mod(xs, ys, rs []uint16) []uint16 { // forwards to the uint16Mod hook; init binds it to uint16ModPure
+	return uint16Mod(xs, ys, rs)
+}
+
+func uint16ModPure(xs, ys, rs []uint16) []uint16 { // rs[i] = xs[i] % ys[i] for each index i of xs; a zero divisor panics
 	for i, x := range xs {
-		rs[i] = math.Mod(x, ys[i])
+		rs[i] = x % ys[i]
 	}
 	return rs
 }
 
-func f64ModOnePure(x float64, ys, rs []float64) []float64 {
+func Uint16ModSels(xs, ys, rs []uint16, sels []int64) []uint16 { // forwards to the uint16ModSels hook; init binds it to uint16ModSelsPure
+	return uint16ModSels(xs, ys, rs, sels)
+}
+
+func uint16ModSelsPure(xs, ys, rs []uint16, sels []int64) []uint16 { // rs[k] = xs[sels[k]] % ys[sels[k]], packed at k; a zero divisor panics
+	for k, src := range sels {
+		rs[k] = xs[src] % ys[src]
+	}
+	return rs
+}
+
+func Uint16ModScalar(x uint16, ys, rs []uint16) []uint16 { // forwards to the uint16ModScalar hook; init binds it to uint16ModScalarPure
+	return uint16ModScalar(x, ys, rs)
+}
+
+func uint16ModScalarPure(x uint16, ys, rs []uint16) []uint16 { // rs[i] = x % ys[i] for each index i of ys; a zero divisor panics
 	for i, y := range ys {
-		rs[i] = math.Mod(x, y)
+		rs[i] = x % y
+	}
+	return rs
+}
+
+func Uint16ModScalarSels(x uint16, ys, rs []uint16, sels []int64) []uint16 { // forwards to the uint16ModScalarSels hook; init binds it to uint16ModScalarSelsPure
+	return uint16ModScalarSels(x, ys, rs, sels)
+}
+
+func uint16ModScalarSelsPure(x uint16, ys, rs []uint16, sels []int64) []uint16 { // rs[k] = x % ys[sels[k]], packed at k; a zero divisor panics
+	for k, src := range sels {
+		rs[k] = x % ys[src]
+	}
+	return rs
+}
+
+func Uint32Mod(xs, ys, rs []uint32) []uint32 { // forwards to the uint32Mod hook; init binds it to uint32ModPure
+	return uint32Mod(xs, ys, rs)
+}
+
+func uint32ModPure(xs, ys, rs []uint32) []uint32 { // rs[k] = xs[k] % ys[k]; a zero divisor panics
+	for k := range xs {
+		rs[k] = xs[k] % ys[k]
+	}
+	return rs
+}
+
+func Uint32ModSels(xs, ys, rs []uint32, sels []int64) []uint32 { // forwards to the uint32ModSels hook; init binds it to uint32ModSelsPure
+	return uint32ModSels(xs, ys, rs, sels)
+}
+
+func uint32ModSelsPure(xs, ys, rs []uint32, sels []int64) []uint32 { // rs[k] = xs[sels[k]] % ys[sels[k]], packed at k; a zero divisor panics
+	for k, src := range sels {
+		rs[k] = xs[src] % ys[src]
 	}
 	return rs
 }
 
-func f64ModOneByPure(x float64, ys, rs []float64) []float64 {
+func Uint32ModScalar(x uint32, ys, rs []uint32) []uint32 { // forwards to the uint32ModScalar hook; init binds it to uint32ModScalarPure
+	return uint32ModScalar(x, ys, rs)
+}
+
+func uint32ModScalarPure(x uint32, ys, rs []uint32) []uint32 { // rs[i] = x % ys[i] (scalar is the dividend) for each index i of ys; a zero divisor panics
 	for i, y := range ys {
-		rs[i] = math.Mod(y, x)
+		rs[i] = x % y
 	}
 	return rs
 }
 
-func f64ModOneSelsPure(x float64, ys, rs []float64, sels []int64) []float64 {
-	for _, sel := range sels {
-		rs[sel] = math.Mod(x, ys[sel])
+func Uint32ModScalarSels(x uint32, ys, rs []uint32, sels []int64) []uint32 { // forwards to the uint32ModScalarSels hook; init binds it to uint32ModScalarSelsPure
+	return uint32ModScalarSels(x, ys, rs, sels)
+}
+
+func uint32ModScalarSelsPure(x uint32, ys, rs []uint32, sels []int64) []uint32 { // rs[k] = x % ys[sels[k]], packed at k; a zero divisor panics
+	for k, src := range sels {
+		rs[k] = x % ys[src]
 	}
 	return rs
 }
 
-func f64ModSelsPure(xs, ys, rs []float64, sels []int64) []float64 {
-	for _, sel := range sels {
-		rs[sel] = math.Mod(xs[sel], ys[sel])
+func Uint64Mod(xs, ys, rs []uint64) []uint64 { // forwards to the uint64Mod hook; init binds it to uint64ModPure
+	return uint64Mod(xs, ys, rs)
+}
+
+func uint64ModPure(xs, ys, rs []uint64) []uint64 { // rs[k] = xs[k] % ys[k]; a zero divisor panics
+	for k := range xs {
+		rs[k] = xs[k] % ys[k]
+	}
+	return rs
+}
+
+func Uint64ModSels(xs, ys, rs []uint64, sels []int64) []uint64 { // forwards to the uint64ModSels hook; init binds it to uint64ModSelsPure
+	return uint64ModSels(xs, ys, rs, sels)
+}
+
+func uint64ModSelsPure(xs, ys, rs []uint64, sels []int64) []uint64 { // rs[k] = xs[sels[k]] % ys[sels[k]], packed at k; a zero divisor panics
+	for k, src := range sels {
+		rs[k] = xs[src] % ys[src]
+	}
+	return rs
+}
+
+func Uint64ModScalar(x uint64, ys, rs []uint64) []uint64 { // forwards to the uint64ModScalar hook; init binds it to uint64ModScalarPure
+	return uint64ModScalar(x, ys, rs)
+}
+
+func uint64ModScalarPure(x uint64, ys, rs []uint64) []uint64 { // rs[k] = x % ys[k]; a zero divisor panics
+	for k := range ys {
+		rs[k] = x % ys[k]
+	}
+	return rs
+}
+
+func Uint64ModScalarSels(x uint64, ys, rs []uint64, sels []int64) []uint64 { // forwards to the uint64ModScalarSels hook; init binds it to uint64ModScalarSelsPure
+	return uint64ModScalarSels(x, ys, rs, sels)
+}
+
+func uint64ModScalarSelsPure(x uint64, ys, rs []uint64, sels []int64) []uint64 { // rs[k] = x % ys[sels[k]], packed at k; a zero divisor panics
+	for k, src := range sels {
+		rs[k] = x % ys[src]
 	}
 	return rs
 }
diff --git a/pkg/vectorize/mul/mul.go b/pkg/vectorize/mul/mul.go
new file mode 100644
index 0000000000000000000000000000000000000000..5ebe684f6848f13f124504a246036e67aff3e260
--- /dev/null
+++ b/pkg/vectorize/mul/mul.go
@@ -0,0 +1,527 @@
+package mul
+
+var ( // function hooks called by the exported wrappers; init binds each one to its Pure implementation
+	int8Mul              func([]int8, []int8, []int8) []int8
+	int8MulSels          func([]int8, []int8, []int8, []int64) []int8
+	int8MulScalar        func(int8, []int8, []int8) []int8
+	int8MulScalarSels    func(int8, []int8, []int8, []int64) []int8
+	int16Mul             func([]int16, []int16, []int16) []int16
+	int16MulSels         func([]int16, []int16, []int16, []int64) []int16
+	int16MulScalar       func(int16, []int16, []int16) []int16
+	int16MulScalarSels   func(int16, []int16, []int16, []int64) []int16
+	int32Mul             func([]int32, []int32, []int32) []int32
+	int32MulSels         func([]int32, []int32, []int32, []int64) []int32
+	int32MulScalar       func(int32, []int32, []int32) []int32
+	int32MulScalarSels   func(int32, []int32, []int32, []int64) []int32
+	int64Mul             func([]int64, []int64, []int64) []int64
+	int64MulSels         func([]int64, []int64, []int64, []int64) []int64
+	int64MulScalar       func(int64, []int64, []int64) []int64
+	int64MulScalarSels   func(int64, []int64, []int64, []int64) []int64
+	uint8Mul             func([]uint8, []uint8, []uint8) []uint8
+	uint8MulSels         func([]uint8, []uint8, []uint8, []int64) []uint8
+	uint8MulScalar       func(uint8, []uint8, []uint8) []uint8
+	uint8MulScalarSels   func(uint8, []uint8, []uint8, []int64) []uint8
+	uint16Mul            func([]uint16, []uint16, []uint16) []uint16
+	uint16MulSels        func([]uint16, []uint16, []uint16, []int64) []uint16
+	uint16MulScalar      func(uint16, []uint16, []uint16) []uint16
+	uint16MulScalarSels  func(uint16, []uint16, []uint16, []int64) []uint16
+	uint32Mul            func([]uint32, []uint32, []uint32) []uint32
+	uint32MulSels        func([]uint32, []uint32, []uint32, []int64) []uint32
+	uint32MulScalar      func(uint32, []uint32, []uint32) []uint32
+	uint32MulScalarSels  func(uint32, []uint32, []uint32, []int64) []uint32
+	uint64Mul            func([]uint64, []uint64, []uint64) []uint64
+	uint64MulSels        func([]uint64, []uint64, []uint64, []int64) []uint64
+	uint64MulScalar      func(uint64, []uint64, []uint64) []uint64
+	uint64MulScalarSels  func(uint64, []uint64, []uint64, []int64) []uint64
+	float32Mul           func([]float32, []float32, []float32) []float32
+	float32MulSels       func([]float32, []float32, []float32, []int64) []float32
+	float32MulScalar     func(float32, []float32, []float32) []float32
+	float32MulScalarSels func(float32, []float32, []float32, []int64) []float32
+	float64Mul           func([]float64, []float64, []float64) []float64
+	float64MulSels       func([]float64, []float64, []float64, []int64) []float64
+	float64MulScalar     func(float64, []float64, []float64) []float64
+	float64MulScalarSels func(float64, []float64, []float64, []int64) []float64
+)
+
+func init() { // binds every multiplication hook to its pure-Go implementation
+	int8Mul = int8MulPure
+	int8MulSels = int8MulSelsPure
+	int8MulScalar = int8MulScalarPure
+	int8MulScalarSels = int8MulScalarSelsPure
+	int16Mul = int16MulPure
+	int16MulSels = int16MulSelsPure
+	int16MulScalar = int16MulScalarPure
+	int16MulScalarSels = int16MulScalarSelsPure
+	int32Mul = int32MulPure
+	int32MulSels = int32MulSelsPure
+	int32MulScalar = int32MulScalarPure
+	int32MulScalarSels = int32MulScalarSelsPure
+	int64Mul = int64MulPure
+	int64MulSels = int64MulSelsPure
+	int64MulScalar = int64MulScalarPure
+	int64MulScalarSels = int64MulScalarSelsPure
+	uint8Mul = uint8MulPure
+	uint8MulSels = uint8MulSelsPure
+	uint8MulScalar = uint8MulScalarPure
+	uint8MulScalarSels = uint8MulScalarSelsPure
+	uint16Mul = uint16MulPure
+	uint16MulSels = uint16MulSelsPure
+	uint16MulScalar = uint16MulScalarPure
+	uint16MulScalarSels = uint16MulScalarSelsPure
+	uint32Mul = uint32MulPure
+	uint32MulSels = uint32MulSelsPure
+	uint32MulScalar = uint32MulScalarPure
+	uint32MulScalarSels = uint32MulScalarSelsPure
+	uint64Mul = uint64MulPure
+	uint64MulSels = uint64MulSelsPure
+	uint64MulScalar = uint64MulScalarPure
+	uint64MulScalarSels = uint64MulScalarSelsPure
+	float32Mul = float32MulPure
+	float32MulSels = float32MulSelsPure
+	float32MulScalar = float32MulScalarPure
+	float32MulScalarSels = float32MulScalarSelsPure
+	float64Mul = float64MulPure
+	float64MulSels = float64MulSelsPure
+	float64MulScalar = float64MulScalarPure
+	float64MulScalarSels = float64MulScalarSelsPure
+}
+
+func Int8Mul(xs, ys, rs []int8) []int8 { // forwards to the int8Mul hook; init binds it to int8MulPure
+	return int8Mul(xs, ys, rs)
+}
+
+func int8MulPure(xs, ys, rs []int8) []int8 { // rs[k] = xs[k] * ys[k] for each index k of xs
+	for k := range xs {
+		rs[k] = xs[k] * ys[k]
+	}
+	return rs
+}
+
+func Int8MulSels(xs, ys, rs []int8, sels []int64) []int8 { // forwards to the int8MulSels hook; init binds it to int8MulSelsPure
+	return int8MulSels(xs, ys, rs, sels)
+}
+
+func int8MulSelsPure(xs, ys, rs []int8, sels []int64) []int8 { // rs[k] = xs[sels[k]] * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = xs[src] * ys[src]
+	}
+	return rs
+}
+
+func Int8MulScalar(x int8, ys, rs []int8) []int8 { // forwards to the int8MulScalar hook; init binds it to int8MulScalarPure
+	return int8MulScalar(x, ys, rs)
+}
+
+func int8MulScalarPure(x int8, ys, rs []int8) []int8 { // rs[k] = x * ys[k] for each index k of ys
+	for k := range ys {
+		rs[k] = x * ys[k]
+	}
+	return rs
+}
+
+func Int8MulScalarSels(x int8, ys, rs []int8, sels []int64) []int8 { // forwards to the int8MulScalarSels hook; init binds it to int8MulScalarSelsPure
+	return int8MulScalarSels(x, ys, rs, sels)
+}
+
+func int8MulScalarSelsPure(x int8, ys, rs []int8, sels []int64) []int8 { // rs[k] = x * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = x * ys[src]
+	}
+	return rs
+}
+
+func Int16Mul(xs, ys, rs []int16) []int16 { // forwards to the int16Mul hook; init binds it to int16MulPure
+	return int16Mul(xs, ys, rs)
+}
+
+func int16MulPure(xs, ys, rs []int16) []int16 { // rs[k] = xs[k] * ys[k] for each index k of xs
+	for k := range xs {
+		rs[k] = xs[k] * ys[k]
+	}
+	return rs
+}
+
+func Int16MulSels(xs, ys, rs []int16, sels []int64) []int16 { // forwards to the int16MulSels hook; init binds it to int16MulSelsPure
+	return int16MulSels(xs, ys, rs, sels)
+}
+
+func int16MulSelsPure(xs, ys, rs []int16, sels []int64) []int16 { // rs[k] = xs[sels[k]] * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = xs[src] * ys[src]
+	}
+	return rs
+}
+
+func Int16MulScalar(x int16, ys, rs []int16) []int16 { // forwards to the int16MulScalar hook; init binds it to int16MulScalarPure
+	return int16MulScalar(x, ys, rs)
+}
+
+func int16MulScalarPure(x int16, ys, rs []int16) []int16 { // rs[k] = x * ys[k] for each index k of ys
+	for k := range ys {
+		rs[k] = x * ys[k]
+	}
+	return rs
+}
+
+func Int16MulScalarSels(x int16, ys, rs []int16, sels []int64) []int16 { // forwards to the int16MulScalarSels hook; init binds it to int16MulScalarSelsPure
+	return int16MulScalarSels(x, ys, rs, sels)
+}
+
+func int16MulScalarSelsPure(x int16, ys, rs []int16, sels []int64) []int16 { // rs[k] = x * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = x * ys[src]
+	}
+	return rs
+}
+
+func Int32Mul(xs, ys, rs []int32) []int32 { // forwards to the int32Mul hook; init binds it to int32MulPure
+	return int32Mul(xs, ys, rs)
+}
+
+func int32MulPure(xs, ys, rs []int32) []int32 { // rs[k] = xs[k] * ys[k] for each index k of xs
+	for k := range xs {
+		rs[k] = xs[k] * ys[k]
+	}
+	return rs
+}
+
+func Int32MulSels(xs, ys, rs []int32, sels []int64) []int32 { // forwards to the int32MulSels hook; init binds it to int32MulSelsPure
+	return int32MulSels(xs, ys, rs, sels)
+}
+
+func int32MulSelsPure(xs, ys, rs []int32, sels []int64) []int32 { // rs[k] = xs[sels[k]] * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = xs[src] * ys[src]
+	}
+	return rs
+}
+
+func Int32MulScalar(x int32, ys, rs []int32) []int32 { // forwards to the int32MulScalar hook; init binds it to int32MulScalarPure
+	return int32MulScalar(x, ys, rs)
+}
+
+func int32MulScalarPure(x int32, ys, rs []int32) []int32 { // rs[k] = x * ys[k] for each index k of ys
+	for k := range ys {
+		rs[k] = x * ys[k]
+	}
+	return rs
+}
+
+func Int32MulScalarSels(x int32, ys, rs []int32, sels []int64) []int32 { // forwards to the int32MulScalarSels hook; init binds it to int32MulScalarSelsPure
+	return int32MulScalarSels(x, ys, rs, sels)
+}
+
+func int32MulScalarSelsPure(x int32, ys, rs []int32, sels []int64) []int32 { // rs[k] = x * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = x * ys[src]
+	}
+	return rs
+}
+
+func Int64Mul(xs, ys, rs []int64) []int64 { // forwards to the int64Mul hook; init binds it to int64MulPure
+	return int64Mul(xs, ys, rs)
+}
+
+func int64MulPure(xs, ys, rs []int64) []int64 { // rs[k] = xs[k] * ys[k] for each index k of xs
+	for k := range xs {
+		rs[k] = xs[k] * ys[k]
+	}
+	return rs
+}
+
+func Int64MulSels(xs, ys, rs []int64, sels []int64) []int64 { // forwards to the int64MulSels hook; init binds it to int64MulSelsPure
+	return int64MulSels(xs, ys, rs, sels)
+}
+
+func int64MulSelsPure(xs, ys, rs []int64, sels []int64) []int64 { // rs[k] = xs[sels[k]] * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = xs[src] * ys[src]
+	}
+	return rs
+}
+
+func Int64MulScalar(x int64, ys, rs []int64) []int64 { // forwards to the int64MulScalar hook; init binds it to int64MulScalarPure
+	return int64MulScalar(x, ys, rs)
+}
+
+func int64MulScalarPure(x int64, ys, rs []int64) []int64 { // rs[k] = x * ys[k] for each index k of ys
+	for k := range ys {
+		rs[k] = x * ys[k]
+	}
+	return rs
+}
+
+func Int64MulScalarSels(x int64, ys, rs []int64, sels []int64) []int64 { // forwards to the int64MulScalarSels hook; init binds it to int64MulScalarSelsPure
+	return int64MulScalarSels(x, ys, rs, sels)
+}
+
+func int64MulScalarSelsPure(x int64, ys, rs []int64, sels []int64) []int64 { // rs[k] = x * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = x * ys[src]
+	}
+	return rs
+}
+
+func Uint8Mul(xs, ys, rs []uint8) []uint8 { // forwards to the uint8Mul hook; init binds it to uint8MulPure
+	return uint8Mul(xs, ys, rs)
+}
+
+func uint8MulPure(xs, ys, rs []uint8) []uint8 { // rs[k] = xs[k] * ys[k] for each index k of xs
+	for k := range xs {
+		rs[k] = xs[k] * ys[k]
+	}
+	return rs
+}
+
+func Uint8MulSels(xs, ys, rs []uint8, sels []int64) []uint8 { // forwards to the uint8MulSels hook; init binds it to uint8MulSelsPure
+	return uint8MulSels(xs, ys, rs, sels)
+}
+
+func uint8MulSelsPure(xs, ys, rs []uint8, sels []int64) []uint8 { // rs[k] = xs[sels[k]] * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = xs[src] * ys[src]
+	}
+	return rs
+}
+
+func Uint8MulScalar(x uint8, ys, rs []uint8) []uint8 { // forwards to the uint8MulScalar hook; init binds it to uint8MulScalarPure
+	return uint8MulScalar(x, ys, rs)
+}
+
+func uint8MulScalarPure(x uint8, ys, rs []uint8) []uint8 { // rs[k] = x * ys[k] for each index k of ys
+	for k := range ys {
+		rs[k] = x * ys[k]
+	}
+	return rs
+}
+
+func Uint8MulScalarSels(x uint8, ys, rs []uint8, sels []int64) []uint8 { // forwards to the uint8MulScalarSels hook; init binds it to uint8MulScalarSelsPure
+	return uint8MulScalarSels(x, ys, rs, sels)
+}
+
+func uint8MulScalarSelsPure(x uint8, ys, rs []uint8, sels []int64) []uint8 { // rs[k] = x * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = x * ys[src]
+	}
+	return rs
+}
+
+func Uint16Mul(xs, ys, rs []uint16) []uint16 { // forwards to the uint16Mul hook; init binds it to uint16MulPure
+	return uint16Mul(xs, ys, rs)
+}
+
+func uint16MulPure(xs, ys, rs []uint16) []uint16 { // rs[k] = xs[k] * ys[k] for each index k of xs
+	for k := range xs {
+		rs[k] = xs[k] * ys[k]
+	}
+	return rs
+}
+
+func Uint16MulSels(xs, ys, rs []uint16, sels []int64) []uint16 { // forwards to the uint16MulSels hook; init binds it to uint16MulSelsPure
+	return uint16MulSels(xs, ys, rs, sels)
+}
+
+func uint16MulSelsPure(xs, ys, rs []uint16, sels []int64) []uint16 { // rs[k] = xs[sels[k]] * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = xs[src] * ys[src]
+	}
+	return rs
+}
+
+func Uint16MulScalar(x uint16, ys, rs []uint16) []uint16 { // forwards to the uint16MulScalar hook; init binds it to uint16MulScalarPure
+	return uint16MulScalar(x, ys, rs)
+}
+
+func uint16MulScalarPure(x uint16, ys, rs []uint16) []uint16 { // rs[k] = x * ys[k] for each index k of ys
+	for k := range ys {
+		rs[k] = x * ys[k]
+	}
+	return rs
+}
+
+func Uint16MulScalarSels(x uint16, ys, rs []uint16, sels []int64) []uint16 { // forwards to the uint16MulScalarSels hook; init binds it to uint16MulScalarSelsPure
+	return uint16MulScalarSels(x, ys, rs, sels)
+}
+
+func uint16MulScalarSelsPure(x uint16, ys, rs []uint16, sels []int64) []uint16 { // rs[k] = x * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = x * ys[src]
+	}
+	return rs
+}
+
+func Uint32Mul(xs, ys, rs []uint32) []uint32 { // forwards to the uint32Mul hook; init binds it to uint32MulPure
+	return uint32Mul(xs, ys, rs)
+}
+
+func uint32MulPure(xs, ys, rs []uint32) []uint32 { // rs[k] = xs[k] * ys[k] for each index k of xs
+	for k := range xs {
+		rs[k] = xs[k] * ys[k]
+	}
+	return rs
+}
+
+func Uint32MulSels(xs, ys, rs []uint32, sels []int64) []uint32 { // forwards to the uint32MulSels hook; init binds it to uint32MulSelsPure
+	return uint32MulSels(xs, ys, rs, sels)
+}
+
+func uint32MulSelsPure(xs, ys, rs []uint32, sels []int64) []uint32 { // rs[k] = xs[sels[k]] * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = xs[src] * ys[src]
+	}
+	return rs
+}
+
+func Uint32MulScalar(x uint32, ys, rs []uint32) []uint32 { // forwards to the uint32MulScalar hook; init binds it to uint32MulScalarPure
+	return uint32MulScalar(x, ys, rs)
+}
+
+func uint32MulScalarPure(x uint32, ys, rs []uint32) []uint32 { // rs[k] = x * ys[k] for each index k of ys
+	for k := range ys {
+		rs[k] = x * ys[k]
+	}
+	return rs
+}
+
+func Uint32MulScalarSels(x uint32, ys, rs []uint32, sels []int64) []uint32 { // forwards to the uint32MulScalarSels hook; init binds it to uint32MulScalarSelsPure
+	return uint32MulScalarSels(x, ys, rs, sels)
+}
+
+func uint32MulScalarSelsPure(x uint32, ys, rs []uint32, sels []int64) []uint32 { // rs[k] = x * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = x * ys[src]
+	}
+	return rs
+}
+
+func Uint64Mul(xs, ys, rs []uint64) []uint64 { // forwards to the uint64Mul hook; init binds it to uint64MulPure
+	return uint64Mul(xs, ys, rs)
+}
+
+func uint64MulPure(xs, ys, rs []uint64) []uint64 { // rs[k] = xs[k] * ys[k] for each index k of xs
+	for k := range xs {
+		rs[k] = xs[k] * ys[k]
+	}
+	return rs
+}
+
+func Uint64MulSels(xs, ys, rs []uint64, sels []int64) []uint64 { // forwards to the uint64MulSels hook; init binds it to uint64MulSelsPure
+	return uint64MulSels(xs, ys, rs, sels)
+}
+
+func uint64MulSelsPure(xs, ys, rs []uint64, sels []int64) []uint64 { // rs[k] = xs[sels[k]] * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = xs[src] * ys[src]
+	}
+	return rs
+}
+
+func Uint64MulScalar(x uint64, ys, rs []uint64) []uint64 { // forwards to the uint64MulScalar hook; init binds it to uint64MulScalarPure
+	return uint64MulScalar(x, ys, rs)
+}
+
+func uint64MulScalarPure(x uint64, ys, rs []uint64) []uint64 { // rs[k] = x * ys[k] for each index k of ys
+	for k := range ys {
+		rs[k] = x * ys[k]
+	}
+	return rs
+}
+
+func Uint64MulScalarSels(x uint64, ys, rs []uint64, sels []int64) []uint64 { // forwards to the uint64MulScalarSels hook; init binds it to uint64MulScalarSelsPure
+	return uint64MulScalarSels(x, ys, rs, sels)
+}
+
+func uint64MulScalarSelsPure(x uint64, ys, rs []uint64, sels []int64) []uint64 { // rs[k] = x * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = x * ys[src]
+	}
+	return rs
+}
+
+func Float32Mul(xs, ys, rs []float32) []float32 { // forwards to the float32Mul hook; init binds it to float32MulPure
+	return float32Mul(xs, ys, rs)
+}
+
+func float32MulPure(xs, ys, rs []float32) []float32 { // rs[k] = xs[k] * ys[k] for each index k of xs
+	for k := range xs {
+		rs[k] = xs[k] * ys[k]
+	}
+	return rs
+}
+
+func Float32MulSels(xs, ys, rs []float32, sels []int64) []float32 { // forwards to the float32MulSels hook; init binds it to float32MulSelsPure
+	return float32MulSels(xs, ys, rs, sels)
+}
+
+func float32MulSelsPure(xs, ys, rs []float32, sels []int64) []float32 { // rs[k] = xs[sels[k]] * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = xs[src] * ys[src]
+	}
+	return rs
+}
+
+func Float32MulScalar(x float32, ys, rs []float32) []float32 { // forwards to the float32MulScalar hook; init binds it to float32MulScalarPure
+	return float32MulScalar(x, ys, rs)
+}
+
+func float32MulScalarPure(x float32, ys, rs []float32) []float32 { // rs[k] = x * ys[k] for each index k of ys
+	for k := range ys {
+		rs[k] = x * ys[k]
+	}
+	return rs
+}
+
+func Float32MulScalarSels(x float32, ys, rs []float32, sels []int64) []float32 { // forwards to the float32MulScalarSels hook; init binds it to float32MulScalarSelsPure
+	return float32MulScalarSels(x, ys, rs, sels)
+}
+
+func float32MulScalarSelsPure(x float32, ys, rs []float32, sels []int64) []float32 { // rs[k] = x * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = x * ys[src]
+	}
+	return rs
+}
+
+func Float64Mul(xs, ys, rs []float64) []float64 { // forwards to the float64Mul hook; init binds it to float64MulPure
+	return float64Mul(xs, ys, rs)
+}
+
+func float64MulPure(xs, ys, rs []float64) []float64 { // rs[k] = xs[k] * ys[k] for each index k of xs
+	for k := range xs {
+		rs[k] = xs[k] * ys[k]
+	}
+	return rs
+}
+
+func Float64MulSels(xs, ys, rs []float64, sels []int64) []float64 { // forwards to the float64MulSels hook; init binds it to float64MulSelsPure
+	return float64MulSels(xs, ys, rs, sels)
+}
+
+func float64MulSelsPure(xs, ys, rs []float64, sels []int64) []float64 { // rs[k] = xs[sels[k]] * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = xs[src] * ys[src]
+	}
+	return rs
+}
+
+func Float64MulScalar(x float64, ys, rs []float64) []float64 { // forwards to the float64MulScalar hook; init binds it to float64MulScalarPure
+	return float64MulScalar(x, ys, rs)
+}
+
+func float64MulScalarPure(x float64, ys, rs []float64) []float64 { // rs[k] = x * ys[k] for each index k of ys
+	for k := range ys {
+		rs[k] = x * ys[k]
+	}
+	return rs
+}
+
+func Float64MulScalarSels(x float64, ys, rs []float64, sels []int64) []float64 { // forwards to the float64MulScalarSels hook; init binds it to float64MulScalarSelsPure
+	return float64MulScalarSels(x, ys, rs, sels)
+}
+
+func float64MulScalarSelsPure(x float64, ys, rs []float64, sels []int64) []float64 { // rs[k] = x * ys[sels[k]], packed at k
+	for k, src := range sels {
+		rs[k] = x * ys[src]
+	}
+	return rs
+}
diff --git a/pkg/vectorize/mult/mult.go b/pkg/vectorize/mult/mult.go
deleted file mode 100644
index 4c81ad7e7fdae90ea2096bb4e7d6111040aab5dd..0000000000000000000000000000000000000000
--- a/pkg/vectorize/mult/mult.go
+++ /dev/null
@@ -1,61 +0,0 @@
-package mult
-
-var (
-	i64MultOne func(int64, []int64, []int64) []int64
-	i64Mult    func([]int64, []int64, []int64) []int64
-
-	f64MultOne func(float64, []float64, []float64) []float64
-	f64Mult    func([]float64, []float64, []float64) []float64
-)
-
-func init() {
-	i64Mult = i64MultPure
-	i64MultOne = i64MultOnePure
-
-	f64Mult = f64MultPure
-	f64MultOne = f64MultOnePure
-}
-
-func I64Mult(xs, ys, rs []int64) []int64 {
-	return i64Mult(xs, ys, rs)
-}
-
-func I64MultOne(x int64, ys, rs []int64) []int64 {
-	return i64MultOnePure(x, ys, rs)
-}
-
-func F64Mult(xs, ys, rs []float64) []float64 {
-	return f64Mult(xs, ys, rs)
-}
-
-func F64MultOne(x float64, ys, rs []float64) []float64 {
-	return f64MultOnePure(x, ys, rs)
-}
-
-func i64MultPure(xs, ys, rs []int64) []int64 {
-	for i, x := range xs {
-		rs[i] = x * ys[i]
-	}
-	return rs
-}
-
-func i64MultOnePure(x int64, ys, rs []int64) []int64 {
-	for i, y := range ys {
-		rs[i] = x * y
-	}
-	return rs
-}
-
-func f64MultPure(xs, ys, rs []float64) []float64 {
-	for i, x := range xs {
-		rs[i] = x * ys[i]
-	}
-	return rs
-}
-
-func f64MultOnePure(x float64, ys, rs []float64) []float64 {
-	for i, y := range ys {
-		rs[i] = x * y
-	}
-	return rs
-}