IPP - Performance Issue

Hi,

I'm using IPP (dynamically linked) with clang 11.0.0.

Hardware:

Processor Name: 6-Core Intel Core i5
Processor Speed: 3 GHz
Number of Processors: 1
Total Number of Cores: 6
L2 Cache (per Core): 256 KB
L3 Cache: 9 MB

I might be missing something, but should the IPP code below be slower than the same computation written with simple std:: functions?

If yes, how can I make it faster?

struct Vect3DArray
{
    Ipp64f* x_;
    Ipp64f* y_;
    Ipp64f* z_;

    Vect3DArray(int size)
    {
        x_ = ippsMalloc_64f(size * sizeof(Ipp64f));
        y_ = ippsMalloc_64f(size * sizeof(Ipp64f));
        z_ = ippsMalloc_64f(size * sizeof(Ipp64f));
    }

    ~Vect3DArray() { ippFree(x_); ippFree(y_); ippFree(z_); }
};

int main() {
    Vect3DArray vectArray(kAmount);
    Vect3DArray dstVectArray(kAmount);
    Ipp64f* sums = ippsMalloc_64f(kAmount * sizeof(Ipp64f));
    for (std::size_t i = 1; i < kAmount; ++i) {
        vectArray.x_[i] = i * 2.5;
        vectArray.y_[i] = i * 3.3;
        vectArray.z_[i] = i * 4.7;
    }

    auto start = std::chrono::high_resolution_clock::now();

    ippsMul_64f(vectArray.x_, vectArray.x_, dstVectArray.x_, static_cast<int>(kAmount));
    ippsMul_64f(vectArray.y_, vectArray.y_, dstVectArray.y_, static_cast<int>(kAmount));
    ippsMul_64f(vectArray.z_, vectArray.z_, dstVectArray.z_, static_cast<int>(kAmount));

    ippsAdd_64f(dstVectArray.x_, dstVectArray.y_, sums, kAmount);
    ippsAdd_64f(sums, vectArray.z_, sums, kAmount);
    ippsSqr_64f_I(sums, kAmount);

    ippsDiv_64f_I(sums, vectArray.x_, kAmount);
    ippsDiv_64f_I(sums, vectArray.y_, kAmount);
    ippsDiv_64f_I(sums, vectArray.z_, kAmount);

    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
    std::cout << "#"<< duration << std::endl;
}

TCE Level:

Level 1

TCE Open Date:

Friday, March 13, 2020 - 04:48