Various additions to SIMDRegister

2026-01-10 23:44:24 +00:00 · 2017-11-30 13:06:31 +01:00 · 2017-11-30 13:06:31 +01:00 · 8f02179bbf
commit 8f02179bbf
parent 6894e04356
8 changed files with 250 additions and 135 deletions
--- a/modules/juce_dsp/containers/juce_SIMDRegister.h
+++ b/modules/juce_dsp/containers/juce_SIMDRegister.h
@ -102,6 +102,18 @@ struct SIMDRegister

    vSIMDType value;

+    /** Default constructor. */
+    inline JUCE_VECTOR_CALLTYPE SIMDRegister() noexcept {}
+
+    /** Constructs an object from the native SIMD type. */
+    inline JUCE_VECTOR_CALLTYPE SIMDRegister (vSIMDType a) noexcept : value (a) {}
+
+    /** Constructs an object from a scalar type by broadcasting it to all elements. */
+    inline JUCE_VECTOR_CALLTYPE SIMDRegister (Type s) noexcept  { *this = s; }
+
+    /** Destructor. */
+    inline JUCE_VECTOR_CALLTYPE ~SIMDRegister() noexcept {}
+
    //==============================================================================
    /** Returns the number of elements in this vector. */
    static constexpr size_t size() noexcept    { return SIMDNumElements; }
@ -232,6 +244,19 @@ struct SIMDRegister
    /** Returns a vector where each element is the bit-xor'd value of the corresponding element in the receiver and the scalar s.*/
    inline SIMDRegister JUCE_VECTOR_CALLTYPE operator^ (MaskType s) const noexcept      { return { NativeOps::bit_xor (value, toVecType (s)) }; }

+    //==============================================================================
+    /** Returns true if all element-wise comparisons return true. */
+    inline bool JUCE_VECTOR_CALLTYPE operator== (SIMDRegister other) const noexcept    { return  NativeOps::allEqual (value, other.value); }
+
+    /** Returns true if any element-wise comparison returns false. */
+    inline bool JUCE_VECTOR_CALLTYPE operator!= (SIMDRegister other) const noexcept    { return ! (*this == other); }
+
+    /** Returns true if all elements are equal to the scalar. */
+    inline bool JUCE_VECTOR_CALLTYPE operator== (Type s) const noexcept                { return *this == SIMDRegister::expand (s); }
+
+    /** Returns true if any elements are not equal to the scalar. */
+    inline bool JUCE_VECTOR_CALLTYPE operator!= (Type s) const noexcept                { return ! (*this == s); }
+
    //==============================================================================
    /** Returns a SIMDRegister of the corresponding integral type where each element has each bit set
        if the corresponding element of a is equal to the corresponding element of b, or zero otherwise.
--- a/modules/juce_dsp/containers/juce_SIMDRegister_test.cpp
+++ b/modules/juce_dsp/containers/juce_SIMDRegister_test.cpp
@ -370,8 +370,10 @@ public:
                {
                    type array_a [SIMDRegister<type>::SIMDNumElements];

-                    union
+                    union ConversionUnion
                    {
+                        inline ConversionUnion() {}
+                        inline ~ConversionUnion() {}
                        SIMDRegister<type> floatVersion;
                        vMaskType intVersion;
                    } a, b;
@ -512,6 +514,39 @@ public:
                u.expect (vecEqualToArray (le,  array_le ));
                u.expect (vecEqualToArray (gt,  array_gt ));
                u.expect (vecEqualToArray (ge,  array_ge ));
+
+                do
+                {
+                    SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister<type>::SIMDNumElements, random);
+                    SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister<type>::SIMDNumElements, random);
+                } while (std::equal (array_a, array_a + SIMDRegister<type>::SIMDNumElements, array_b));
+
+                copy (a, array_a);
+                copy (b, array_b);
+                u.expect (a != b);
+                u.expect (b != a);
+                u.expect (! (a == b));
+                u.expect (! (b == a));
+
+                SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister<type>::SIMDNumElements, random);
+                copy (a, array_a);
+                copy (b, array_a);
+
+                u.expect (a == b);
+                u.expect (b == a);
+                u.expect (! (a != b));
+                u.expect (! (b != a));
+
+                auto scalar = a[0];
+                a = SIMDRegister<type>::expand (scalar);
+
+                u.expect (a == scalar);
+                u.expect (! (a != scalar));
+
+                scalar--;
+
+                u.expect (a != scalar);
+                u.expect (! (a == scalar));
            }
        }
    };
--- a/modules/juce_dsp/native/juce_avx_SIMDNativeOps.h
+++ b/modules/juce_dsp/native/juce_avx_SIMDNativeOps.h
@ -82,6 +82,7 @@ struct SIMDNativeOps<float>
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE notEqual (__m256 a, __m256 b) noexcept               { return _mm256_cmp_ps (a, b, _CMP_NEQ_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThan (__m256 a, __m256 b) noexcept            { return _mm256_cmp_ps (a, b, _CMP_GT_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256 a, __m256 b) noexcept     { return _mm256_cmp_ps (a, b, _CMP_GE_OQ); }
+    static forcedinline bool   JUCE_VECTOR_CALLTYPE allEqual (__m256 a, __m256 b) noexcept               { return (_mm256_movemask_ps (equal (a, b)) == 0xff); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE multiplyAdd (__m256 a, __m256 b, __m256 c) noexcept  { return _mm256_fmadd_ps (b, c, a); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupeven (__m256 a) noexcept                          { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupodd (__m256 a) noexcept                           { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
@ -141,6 +142,7 @@ struct SIMDNativeOps<double>
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept               { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept            { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept     { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); }
+    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256d a, __m256d b) noexcept                { return (_mm256_movemask_pd (equal (a, b)) == 0xf); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept                           { return _mm256_shuffle_pd (a, a, 0); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept                            { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); }
@ -261,6 +263,7 @@ struct SIMDNativeOps<uint8_t>
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept                   { return _mm256_cmpeq_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept             { return _mm256_cmpgt_epi8 (ssign (a), ssign (b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
+    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept                { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept                { return bit_not (equal (a, b)); }

@ -336,6 +339,7 @@ struct SIMDNativeOps<int16_t>
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept                { return bit_not (equal (a, b)); }
+    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept                { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
@ -390,6 +394,7 @@ struct SIMDNativeOps<uint16_t>
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept                { return bit_not (equal (a, b)); }
+    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept                { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
@ -443,6 +448,7 @@ struct SIMDNativeOps<int32_t>
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept                { return bit_not (equal (a, b)); }
+    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept                { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept
@ -495,6 +501,7 @@ struct SIMDNativeOps<uint32_t>
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept                { return bit_not (equal (a, b)); }
+    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept                { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
@ -543,6 +550,7 @@ struct SIMDNativeOps<int64_t>
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept                { return bit_not (equal (a, b)); }
+    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept                { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept
@ -614,6 +622,7 @@ struct SIMDNativeOps<uint64_t>
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept                { return bit_not (equal (a, b)); }
+    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept                { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
--- a/modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h
+++ b/modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h
@ -117,6 +117,19 @@ struct SIMDFallbackOps
        return retval;
    }

+    //==============================================================================
+    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept   // Returns true only if the first n ScalarType elements of a and b all compare equal.
+    {
+        auto* aSrc = reinterpret_cast<const ScalarType*> (&a);   // view the by-value copies as arrays of ScalarType
+        auto* bSrc = reinterpret_cast<const ScalarType*> (&b);
+
+        for (size_t i = 0; i < n; ++i)
+            if (aSrc[i] != bSrc[i])
+                return false;   // stop at the first differing element
+
+        return true;
+    }
+
    //==============================================================================
    static forcedinline vSIMDType cmplxmul (vSIMDType a, vSIMDType b) noexcept
    {
--- a/modules/juce_dsp/native/juce_neon_SIMDNativeOps.h
+++ b/modules/juce_dsp/native/juce_neon_SIMDNativeOps.h
@ -51,86 +51,82 @@ template <typename type>
 struct SIMDNativeOps;

 //==============================================================================
-/** Single-precision floating point NEON intrinsics. */
+/** Unsigned 32-bit integer NEON intrinsics. */
 template <>
-struct SIMDNativeOps<float>
+struct SIMDNativeOps<uint32_t>
 {
    //==============================================================================
-    typedef float32x4_t vSIMDType;
-    typedef uint32x4_t vMaskType;
-    typedef SIMDFallbackOps<float, vSIMDType> fb;
+    typedef uint32x4_t vSIMDType;
+    typedef SIMDFallbackOps<uint32_t, vSIMDType> fb;

    //==============================================================================
-    DECLARE_NEON_SIMD_CONST (int32_t, kAllBitsSet);
-    DECLARE_NEON_SIMD_CONST (int32_t, kEvenHighBit);
-    DECLARE_NEON_SIMD_CONST (float, kOne);
+    DECLARE_NEON_SIMD_CONST (uint32_t, kAllBitsSet);

    //==============================================================================
-    static forcedinline vSIMDType expand (float s) noexcept                         { return vdupq_n_f32 (s); }
-    static forcedinline vSIMDType load (const float* a) noexcept                    { return vld1q_f32 (a); }
-    static forcedinline void store (vSIMDType value, float* a) noexcept             { vst1q_f32 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept           { return vaddq_f32 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept           { return vsubq_f32 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept           { return vmulq_f32 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept       { return (vSIMDType) vandq_u32 ((vMaskType) a, (vMaskType) b); }
-    static forcedinline vSIMDType bit_or  (vSIMDType a, vSIMDType b) noexcept       { return (vSIMDType) vorrq_u32 ((vMaskType) a, (vMaskType) b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept       { return (vSIMDType) veorq_u32 ((vMaskType) a, (vMaskType) b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept    { return (vSIMDType) vbicq_u32 ((vMaskType) b, (vMaskType) a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept                    { return bit_notand (a, vld1q_f32 ((float*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept                    { return vminq_f32 (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept                    { return vmaxq_f32 (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept                  { return (vSIMDType) vceqq_f32 (a, b); }
+    static forcedinline vSIMDType expand (uint32_t s) noexcept                   { return vdupq_n_u32 (s); }
+    static forcedinline vSIMDType load (const uint32_t* a) noexcept              { return vld1q_u32 (a); }
+    static forcedinline void store (vSIMDType value, uint32_t* a) noexcept       { vst1q_u32 (a, value); }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept        { return vaddq_u32 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept        { return vsubq_u32 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept        { return vmulq_u32 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept    { return vandq_u32 (a, b); }
+    static forcedinline vSIMDType bit_or  (vSIMDType a, vSIMDType b) noexcept    { return vorrq_u32  (a, b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept    { return veorq_u32 (a, b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u32 (b, a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept                 { return bit_notand (a, vld1q_u32 ((uint32_t*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept                    { return vminq_u32 (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept                    { return vmaxq_u32 (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept                  { return (vSIMDType) vceqq_u32 (a, b); }
+    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return (sum (notEqual (a, b)) == 0); }
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return bit_not (equal (a, b)); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_f32 (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_f32 (a, b); }
-    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_f32 (a, b, c); }
-    static forcedinline vSIMDType dupeven (vSIMDType a) noexcept { return fb::shuffle<(0 << 0) | (0 << 2) | (2 << 4) | (2 << 6)>     (a); }
-    static forcedinline vSIMDType dupodd  (vSIMDType a) noexcept { return fb::shuffle<(1 << 0) | (1 << 2) | (3 << 4) | (3 << 6)>     (a); }
-    static forcedinline vSIMDType swapevenodd (vSIMDType a) noexcept { return fb::shuffle<(1 << 0) | (0 << 2) | (3 << 4) | (2 << 6)> (a); }
-    static forcedinline float sum (vSIMDType a) noexcept { return fb::sum (a); }
-    static forcedinline vSIMDType oddevensum (vSIMDType a) noexcept { return add (fb::shuffle<(2 << 0) | (3 << 2) | (0 << 4) | (1 << 6)> (a), a); }
-
-    //==============================================================================
-    static forcedinline vSIMDType cmplxmul (vSIMDType a, vSIMDType b) noexcept
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_u32 (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_u32 (a, b); }
+    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_u32 (a, b, c); }
+    static forcedinline uint32_t sum (vSIMDType a) noexcept
    {
-        vSIMDType rr_ir = mul (a, dupeven (b));
-        vSIMDType ii_ri = mul (swapevenodd (a), dupodd (b));
-        return add (rr_ir, bit_xor (ii_ri, vld1q_f32 ((float*) kEvenHighBit)));
+        auto rr = vadd_u32 (vget_high_u32 (a), vget_low_u32 (a));
+        return vget_lane_u32 (vpadd_u32 (rr, rr), 0);
    }
 };

 //==============================================================================
-/** Double-precision floating point NEON intrinsics does not exist in NEON
-    so we need to emulate this.
-*/
+/** Signed 32-bit integer NEON intrinsics. */
 template <>
-struct SIMDNativeOps<double>
+struct SIMDNativeOps<int32_t>
 {
    //==============================================================================
-    typedef struct { double values [2]; } vSIMDType;
-    typedef SIMDFallbackOps<double, vSIMDType> fb;
+    typedef int32x4_t vSIMDType;
+    typedef SIMDFallbackOps<int32_t, vSIMDType> fb;

-    static forcedinline vSIMDType expand (double s) noexcept                     { return fb::expand (s); }
-    static forcedinline vSIMDType load (const double* a) noexcept                { return fb::load (a); }
-    static forcedinline void store (vSIMDType value, double* a) noexcept         { fb::store (value, a); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept        { return fb::add (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept        { return fb::sub (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept        { return fb::mul (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept    { return fb::bit_and (a, b); }
-    static forcedinline vSIMDType bit_or  (vSIMDType a, vSIMDType b) noexcept    { return fb::bit_or  (a, b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept    { return fb::bit_xor (a, b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return fb::bit_notand (a, b); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept                 { return fb::bit_not (a); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept                    { return fb::min (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept                    { return fb::max (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept                  { return fb::equal (a, b); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return fb::notEqual (a, b); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return fb::greaterThan (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return fb::greaterThanOrEqual (a, b); }
-    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return fb::multiplyAdd (a, b, c); }
-    static forcedinline vSIMDType cmplxmul (vSIMDType a, vSIMDType b) noexcept { return fb::cmplxmul (a, b); }
-    static forcedinline double sum (vSIMDType a) noexcept { return fb::sum (a); }
-    static forcedinline vSIMDType oddevensum (vSIMDType a) noexcept { return a; }
+    //==============================================================================
+    DECLARE_NEON_SIMD_CONST (int32_t, kAllBitsSet);
+
+    //==============================================================================
+    static forcedinline vSIMDType expand (int32_t s) noexcept                    { return vdupq_n_s32 (s); }
+    static forcedinline vSIMDType load (const int32_t* a) noexcept               { return vld1q_s32 (a); }
+    static forcedinline void store (vSIMDType value, int32_t* a) noexcept        { vst1q_s32 (a, value); }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept        { return vaddq_s32 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept        { return vsubq_s32 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept        { return vmulq_s32 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept    { return vandq_s32 (a, b); }
+    static forcedinline vSIMDType bit_or  (vSIMDType a, vSIMDType b) noexcept    { return vorrq_s32 (a, b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept    { return veorq_s32 (a, b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s32 (b, a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept                 { return bit_notand (a, vld1q_s32 ((int32_t*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept                    { return vminq_s32 (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept                    { return vmaxq_s32 (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept                  { return (vSIMDType) vceqq_s32 (a, b); }
+    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return (sum (notEqual (a, b)) == 0); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return bit_not (equal (a, b)); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_s32 (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_s32 (a, b); }
+    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_s32 (a, b, c); }
+    static forcedinline int32_t sum (vSIMDType a) noexcept   // Returns the horizontal sum of the four 32-bit lanes of a.
+    {
+        auto rr = vadd_s32 (vget_high_s32 (a), vget_low_s32 (a));   // add the upper half to the lower half: two partial sums remain
+        rr = vpadd_s32 (rr, rr);   // pairwise add folds the two partial sums together
+        return vget_lane_s32 (rr, 0);   // lane 0 now holds the total
+    }
 };

 //==============================================================================
@ -163,6 +159,7 @@ struct SIMDNativeOps<int8_t>
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return bit_not (equal (a, b)); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_s8 (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_s8 (a, b); }
+    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return (SIMDNativeOps<int32_t>::sum (vreinterpretq_s32_s8 (notEqual (a, b))) == 0); }  // true when no lane differs; the 8-bit mask is reinterpreted explicitly as 32-bit lanes so the 32-bit horizontal sum can be reused without relying on lax vector conversions
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_s8 (a, b, c); }
    static forcedinline int8_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };
@ -197,6 +194,7 @@ struct SIMDNativeOps<uint8_t>
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return bit_not (equal (a, b)); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_u8 (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_u8 (a, b); }
+    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return (SIMDNativeOps<uint32_t>::sum (vreinterpretq_u32_u8 (notEqual (a, b))) == 0); }  // true when no lane differs; the 8-bit mask is reinterpreted explicitly as 32-bit lanes so the 32-bit horizontal sum can be reused without relying on lax vector conversions
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_u8 (a, b, c); }
    static forcedinline uint8_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };
@ -231,6 +229,7 @@ struct SIMDNativeOps<int16_t>
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return bit_not (equal (a, b)); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_s16 (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_s16 (a, b); }
+    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return (SIMDNativeOps<int32_t>::sum (vreinterpretq_s32_s16 (notEqual (a, b))) == 0); }  // true when no lane differs; the 16-bit mask is reinterpreted explicitly as 32-bit lanes so the 32-bit horizontal sum can be reused without relying on lax vector conversions
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_s16 (a, b, c); }
    static forcedinline int16_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };
@ -266,79 +265,11 @@ struct SIMDNativeOps<uint16_t>
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return bit_not (equal (a, b)); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_u16 (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_u16 (a, b); }
+    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return (SIMDNativeOps<uint32_t>::sum ((SIMDNativeOps<uint32_t>::vSIMDType) notEqual (a, b)) == 0); } // mask is explicitly viewed as uint32 lanes; the implicit conversion only compiles with lax vector conversions
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_u16 (a, b, c); }
    static forcedinline uint16_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };

-//==============================================================================
-/** Signed 32-bit integer NEON intrinsics. */
-template <>
-struct SIMDNativeOps<int32_t>
-{
-    //==============================================================================
-    typedef int32x4_t vSIMDType;
-    typedef SIMDFallbackOps<int32_t, vSIMDType> fb;
-
-    //==============================================================================
-    DECLARE_NEON_SIMD_CONST (int32_t, kAllBitsSet);
-
-    //==============================================================================
-    static forcedinline vSIMDType expand (int32_t s) noexcept                    { return vdupq_n_s32 (s); }
-    static forcedinline vSIMDType load (const int32_t* a) noexcept               { return vld1q_s32 (a); }
-    static forcedinline void store (vSIMDType value, int32_t* a) noexcept        { vst1q_s32 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept        { return vaddq_s32 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept        { return vsubq_s32 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept        { return vmulq_s32 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept    { return vandq_s32 (a, b); }
-    static forcedinline vSIMDType bit_or  (vSIMDType a, vSIMDType b) noexcept    { return vorrq_s32 (a, b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept    { return veorq_s32 (a, b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s32 (b, a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept                 { return bit_notand (a, vld1q_s32 ((int32_t*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept                    { return vminq_s32 (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept                    { return vmaxq_s32 (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept                  { return (vSIMDType) vceqq_s32 (a, b); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return bit_not (equal (a, b)); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_s32 (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_s32 (a, b); }
-    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_s32 (a, b, c); }
-    static forcedinline int32_t sum (vSIMDType a) noexcept { return fb::sum (a); }
-};
-
-
-//==============================================================================
-/** Unsigned 32-bit integer NEON intrinsics. */
-template <>
-struct SIMDNativeOps<uint32_t>
-{
-    //==============================================================================
-    typedef uint32x4_t vSIMDType;
-    typedef SIMDFallbackOps<uint32_t, vSIMDType> fb;
-
-    //==============================================================================
-    DECLARE_NEON_SIMD_CONST (uint32_t, kAllBitsSet);
-
-    //==============================================================================
-    static forcedinline vSIMDType expand (uint32_t s) noexcept                   { return vdupq_n_u32 (s); }
-    static forcedinline vSIMDType load (const uint32_t* a) noexcept              { return vld1q_u32 (a); }
-    static forcedinline void store (vSIMDType value, uint32_t* a) noexcept       { vst1q_u32 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept        { return vaddq_u32 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept        { return vsubq_u32 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept        { return vmulq_u32 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept    { return vandq_u32 (a, b); }
-    static forcedinline vSIMDType bit_or  (vSIMDType a, vSIMDType b) noexcept    { return vorrq_u32  (a, b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept    { return veorq_u32 (a, b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u32 (b, a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept                 { return bit_notand (a, vld1q_u32 ((uint32_t*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept                    { return vminq_u32 (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept                    { return vmaxq_u32 (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept                  { return (vSIMDType) vceqq_u32 (a, b); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return bit_not (equal (a, b)); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_u32 (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_u32 (a, b); }
-    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_u32 (a, b, c); }
-    static forcedinline uint32_t sum (vSIMDType a) noexcept { return fb::sum (a); }
-};
-
 //==============================================================================
 /** Signed 64-bit integer NEON intrinsics. */
 template <>
@ -369,6 +300,7 @@ struct SIMDNativeOps<int64_t>
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return fb::notEqual (a, b); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return fb::greaterThan (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return fb::greaterThanOrEqual (a, b); }
+    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return (SIMDNativeOps<int32_t>::sum ((SIMDNativeOps<int32_t>::vSIMDType) notEqual (a, b)) == 0); } // fb::notEqual mask is explicitly viewed as int32 lanes; the implicit conversion only compiles with lax vector conversions
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return fb::multiplyAdd (a, b, c); }
    static forcedinline int64_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };
@ -404,10 +336,101 @@ struct SIMDNativeOps<uint64_t>
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return fb::notEqual (a, b); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return fb::greaterThan (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return fb::greaterThanOrEqual (a, b); }
+    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return (SIMDNativeOps<uint32_t>::sum ((SIMDNativeOps<uint32_t>::vSIMDType) notEqual (a, b)) == 0); } // fb::notEqual mask is explicitly viewed as uint32 lanes; the implicit conversion only compiles with lax vector conversions
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return fb::multiplyAdd (a, b, c); }
    static forcedinline uint64_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };

+//==============================================================================
+/** Single-precision floating point NEON intrinsics.
+
+    Operates on float32x4_t (vSIMDType). Bitwise operations are carried out on
+    the same value cast to uint32x4_t (vMaskType), and the comparison functions
+    return their result cast back to vSIMDType.
+*/
+template <>
+struct SIMDNativeOps<float>
+{
+    //==============================================================================
+    typedef float32x4_t vSIMDType;
+    typedef uint32x4_t vMaskType;
+    typedef SIMDFallbackOps<float, vSIMDType> fb;
+
+    //==============================================================================
+    // Constants declared through DECLARE_NEON_SIMD_CONST (macro defined elsewhere).
+    // kAllBitsSet and kEvenHighBit are loaded below as raw bit patterns via vld1q_f32.
+    DECLARE_NEON_SIMD_CONST (int32_t, kAllBitsSet);
+    DECLARE_NEON_SIMD_CONST (int32_t, kEvenHighBit);
+    DECLARE_NEON_SIMD_CONST (float, kOne);
+
+    //==============================================================================
+    static forcedinline vSIMDType expand (float s) noexcept                          { return vdupq_n_f32 (s); }
+    static forcedinline vSIMDType load (const float* a) noexcept                     { return vld1q_f32 (a); }
+    static forcedinline void store (vSIMDType value, float* a) noexcept              { vst1q_f32 (a, value); }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept        { return vaddq_f32 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept        { return vsubq_f32 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept        { return vmulq_f32 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept    { return (vSIMDType) vandq_u32 ((vMaskType) a, (vMaskType) b); }
+    static forcedinline vSIMDType bit_or  (vSIMDType a, vSIMDType b) noexcept    { return (vSIMDType) vorrq_u32 ((vMaskType) a, (vMaskType) b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept    { return (vSIMDType) veorq_u32 ((vMaskType) a, (vMaskType) b); }
+    // vbicq computes (first & ~second), so the operands are swapped to yield (~a & b).
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vbicq_u32 ((vMaskType) b, (vMaskType) a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept                   { return bit_notand (a, vld1q_f32 ((float*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept                    { return vminq_f32 (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept                    { return vmaxq_f32 (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept                  { return (vSIMDType) vceqq_f32 (a, b); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return bit_not (equal (a, b)); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_f32 (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_f32 (a, b); }
+    // The notEqual mask is a float32x4_t; it is explicitly cast to uint32 lanes before being summed,
+    // because the implicit conversion between the two vector types only compiles with lax vector conversions.
+    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return (SIMDNativeOps<uint32_t>::sum ((vMaskType) notEqual (a, b)) == 0); }
+    // Computes a + (b * c).
+    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_f32 (a, b, c); }
+    // Shuffle masks below hold one 2-bit lane index per output lane, lowest lane first
+    // (as read from the shift pattern; fb::shuffle itself is defined elsewhere).
+    static forcedinline vSIMDType dupeven (vSIMDType a) noexcept { return fb::shuffle<(0 << 0) | (0 << 2) | (2 << 4) | (2 << 6)>     (a); }
+    static forcedinline vSIMDType dupodd  (vSIMDType a) noexcept { return fb::shuffle<(1 << 0) | (1 << 2) | (3 << 4) | (3 << 6)>     (a); }
+    static forcedinline vSIMDType swapevenodd (vSIMDType a) noexcept { return fb::shuffle<(1 << 0) | (0 << 2) | (3 << 4) | (2 << 6)> (a); }
+    // Adds the vector to a copy of itself shuffled with lane indices (2, 3, 0, 1).
+    static forcedinline vSIMDType oddevensum (vSIMDType a) noexcept { return add (fb::shuffle<(2 << 0) | (3 << 2) | (0 << 4) | (1 << 6)> (a), a); }
+
+    //==============================================================================
+    /** Multiplies a and b treated as interleaved complex values (real in even
+        lanes, imaginary in odd lanes, judging by the shuffles used).
+    */
+    static forcedinline vSIMDType cmplxmul (vSIMDType a, vSIMDType b) noexcept
+    {
+        vSIMDType rr_ir = mul (a, dupeven (b));
+        vSIMDType ii_ri = mul (swapevenodd (a), dupodd (b));
+        // NOTE(review): kEvenHighBit presumably holds the sign bit in the even lanes only,
+        // so the xor negates the ii terms before the add - confirm against its definition.
+        return add (rr_ir, bit_xor (ii_ri, vld1q_f32 ((float*) kEvenHighBit)));
+    }
+
+    /** Returns the sum of all four lanes. */
+    static forcedinline float sum (vSIMDType a) noexcept
+    {
+        // Fold the high half onto the low half, then pairwise-add the two remaining lanes.
+        auto rr = vadd_f32 (vget_high_f32 (a), vget_low_f32 (a));
+        return vget_lane_f32 (vpadd_f32 (rr, rr), 0);
+    }
+};
+
+//==============================================================================
+/** Double-precision floating point intrinsics do not exist in NEON,
+    so we need to emulate them.
+
+    Every operation in this specialisation forwards to SIMDFallbackOps.
+*/
+template <>
+struct SIMDNativeOps<double>
+{
+    //==============================================================================
+    // Two doubles held in a plain struct instead of a native vector type.
+    typedef struct { double values [2]; } vSIMDType;
+    typedef SIMDFallbackOps<double, vSIMDType> fb;
+
+    static forcedinline vSIMDType expand (double s) noexcept                     { return fb::expand (s); }
+    static forcedinline vSIMDType load (const double* a) noexcept                { return fb::load (a); }
+    static forcedinline void store (vSIMDType value, double* a) noexcept         { fb::store (value, a); }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept        { return fb::add (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept        { return fb::sub (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept        { return fb::mul (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept    { return fb::bit_and (a, b); }
+    static forcedinline vSIMDType bit_or  (vSIMDType a, vSIMDType b) noexcept    { return fb::bit_or  (a, b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept    { return fb::bit_xor (a, b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return fb::bit_notand (a, b); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept                 { return fb::bit_not (a); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept                    { return fb::min (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept                    { return fb::max (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept                  { return fb::equal (a, b); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return fb::notEqual (a, b); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return fb::greaterThan (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return fb::greaterThanOrEqual (a, b); }
+    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return fb::allEqual (a, b); }
+    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return fb::multiplyAdd (a, b, c); }
+    static forcedinline vSIMDType cmplxmul (vSIMDType a, vSIMDType b) noexcept { return fb::cmplxmul (a, b); }
+    static forcedinline double sum (vSIMDType a) noexcept { return fb::sum (a); }
+    // Returns the input unchanged. NOTE(review): presumably because two lanes hold a single
+    // even/odd pair, leaving nothing to accumulate - confirm against the other specialisations.
+    static forcedinline vSIMDType oddevensum (vSIMDType a) noexcept { return a; }
+};
+
 #endif

 } // namespace dsp
--- a/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h
+++ b/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h
@ -81,6 +81,7 @@ struct SIMDNativeOps<float>
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE notEqual (__m128 a, __m128 b) noexcept               { return _mm_cmpneq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThan (__m128 a, __m128 b) noexcept            { return _mm_cmpgt_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128 a, __m128 b) noexcept     { return _mm_cmpge_ps (a, b); }
+    static forcedinline bool   JUCE_VECTOR_CALLTYPE allEqual (__m128 a, __m128 b ) noexcept              { return (_mm_movemask_ps (equal (a, b)) == 0xf); } // 0xf: the sign bits of all four float lanes of the equal() mask are set
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE multiplyAdd (__m128 a, __m128 b, __m128 c) noexcept  { return _mm_add_ps (a, _mm_mul_ps (b, c)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupeven (__m128 a) noexcept                          { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupodd (__m128 a) noexcept                           { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
@ -142,6 +143,7 @@ struct SIMDNativeOps<double>
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE notEqual (__m128d a, __m128d b) noexcept                { return _mm_cmpneq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThan (__m128d a, __m128d b) noexcept             { return _mm_cmpgt_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128d a, __m128d b) noexcept      { return _mm_cmpge_pd (a, b); }
+    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m128d a, __m128d b ) noexcept               { return (_mm_movemask_pd (equal (a, b)) == 0x3); } // 0x3: the sign bits of both double lanes of the equal() mask are set
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE multiplyAdd (__m128d a, __m128d b, __m128d c) noexcept  { return _mm_add_pd (a, _mm_mul_pd (b, c)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupeven (__m128d a) noexcept                            { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 0)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupodd (__m128d a) noexcept                             { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (1, 1)); }
@ -201,6 +203,7 @@ struct SIMDNativeOps<int8_t>
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept                { return bit_not (equal (a, b)); }
+    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept                { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); } // 0xffff: the top bit of each of the 16 bytes of the equal() mask is set

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept
@ -282,6 +285,7 @@ struct SIMDNativeOps<uint8_t>
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept                { return bit_not (equal (a, b)); }
+    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept                { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); } // 0xffff: the top bit of each of the 16 bytes of the equal() mask is set

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept
@ -363,6 +367,7 @@ struct SIMDNativeOps<int16_t>
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept                { return bit_not (equal (a, b)); }
+    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept                { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); } // 0xffff: the top bit of each of the 16 bytes of the equal() mask is set

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
@ -431,6 +436,7 @@ struct SIMDNativeOps<uint16_t>
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept                { return bit_not (equal (a, b)); }
+    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept                { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); } // 0xffff: the top bit of each of the 16 bytes of the equal() mask is set

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
@ -490,6 +496,7 @@ struct SIMDNativeOps<int32_t>
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept                { return bit_not (equal (a, b)); }
+    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept                { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); } // 0xffff: the top bit of each of the 16 bytes of the equal() mask is set

    //==============================================================================
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int32_t* dest) noexcept
@ -575,6 +582,7 @@ struct SIMDNativeOps<uint32_t>
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept                { return bit_not (equal (a, b)); }
+    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept                { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); } // 0xffff: the top bit of each of the 16 bytes of the equal() mask is set

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
@ -671,6 +679,7 @@ struct SIMDNativeOps<int64_t>
    static forcedinline __m128i greaterThanOrEqual (__m128i a, __m128i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept                { return bit_not (equal (a, b)); }
+    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept                { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); } // 0xffff: the top bit of each of the 16 bytes of the equal() mask is set

    //==============================================================================
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int64_t* dest) noexcept
@ -762,6 +771,7 @@ struct SIMDNativeOps<uint64_t>
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept                { return bit_not (equal (a, b)); }
+    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept                { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); } // 0xffff: the top bit of each of the 16 bytes of the equal() mask is set

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
--- a/modules/juce_dsp/processors/juce_FIRFilter.h
+++ b/modules/juce_dsp/processors/juce_FIRFilter.h
@ -172,7 +172,7 @@ namespace FIR
        static SampleType JUCE_VECTOR_CALLTYPE processSingleSample (SampleType sample, SampleType* buf,
                                                                    const NumericType* fir, size_t m, size_t& p) noexcept
        {
-            SampleType out = {};
+            SampleType out (0);

            buf[p] = sample;

--- a/modules/juce_dsp/processors/juce_FIRFilter_test.cpp
+++ b/modules/juce_dsp/processors/juce_FIRFilter_test.cpp
@ -106,7 +106,7 @@ class FIRFilterTest : public UnitTest

            buffer[0] = input[i];

-            SampleType sum{};
+            SampleType sum (0);

            for (size_t j = 0; j < numCoefficients; ++j)
                sum += buffer[j] * firCoefficients[j];