Hi,
I'm using IPP (dynamically linked) with clang 11.0.0.
Hardware:
- Processor Name: 6-Core Intel Core i5
- Processor Speed: 3 GHz
- Number of Processors: 1
- Total Number of Cores: 6
- L2 Cache (per Core): 256 KB
- L3 Cache: 9 MB
I might be missing something, but should this be slower than the equivalent code written with plain std:: functions?
If yes, how can I make it faster?
struct Vect3DArray { Ipp64f* x_; Ipp64f* y_; Ipp64f* z_; Vect3DArray(int size) { x_ = ippsMalloc_64f(size * sizeof(Ipp64f)); y_ = ippsMalloc_64f(size * sizeof(Ipp64f)); z_ = ippsMalloc_64f(size * sizeof(Ipp64f)); } ~Vect3DArray() { ippFree(x_); ippFree(y_); ippFree(z_); } }; int main() { Vect3DArray vectArray(kAmount); Vect3DArray dstVectArray(kAmount); Ipp64f* sums = ippsMalloc_64f(kAmount * sizeof(Ipp64f)); for (std::size_t i = 1; i < kAmount; ++i) { vectArray.x_[i] = i * 2.5; vectArray.y_[i] = i * 3.3; vectArray.z_[i] = i * 4.7; } auto start = std::chrono::high_resolution_clock::now(); ippsMul_64f(vectArray.x_, vectArray.x_, dstVectArray.x_, static_cast<int>(kAmount)); ippsMul_64f(vectArray.y_, vectArray.y_, dstVectArray.y_, static_cast<int>(kAmount)); ippsMul_64f(vectArray.z_, vectArray.z_, dstVectArray.z_, static_cast<int>(kAmount)); ippsAdd_64f(dstVectArray.x_, dstVectArray.y_, sums, kAmount); ippsAdd_64f(sums, vectArray.z_, sums, kAmount); ippsSqr_64f_I(sums, kAmount); ippsDiv_64f_I(sums, vectArray.x_, kAmount); ippsDiv_64f_I(sums, vectArray.y_, kAmount); ippsDiv_64f_I(sums, vectArray.z_, kAmount); auto end = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count(); std::cout << "#"<< duration << std::endl; }
TCE Level:
TCE Open Date:
Friday, March 13, 2020 - 04:48