/////////////////////////////////////////////////////////////////////// // File: dotproductsse.cpp // Description: Architecture-specific dot-product function. // Author: Ray Smith // // (C) Copyright 2015, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. /////////////////////////////////////////////////////////////////////// #if !defined(__SSE4_1__) #error Implementation only for SSE 4.1 capable architectures #endif #include #include #include #include "dotproduct.h" namespace tesseract { // Computes and returns the dot product of the n-vectors u and v. // Uses Intel SSE intrinsics to access the SIMD instruction set. double DotProductSSE(const double* u, const double* v, int n) { int max_offset = n - 2; int offset = 0; // Accumulate a set of 2 sums in sum, by loading pairs of 2 values from u and // v, and multiplying them together in parallel. __m128d sum = _mm_setzero_pd(); if (offset <= max_offset) { offset = 2; // Aligned load is reputedly faster but requires 16 byte aligned input. if ((reinterpret_cast(u) & 15) == 0 && (reinterpret_cast(v) & 15) == 0) { // Use aligned load. sum = _mm_load_pd(u); __m128d floats2 = _mm_load_pd(v); // Multiply. sum = _mm_mul_pd(sum, floats2); while (offset <= max_offset) { __m128d floats1 = _mm_load_pd(u + offset); floats2 = _mm_load_pd(v + offset); offset += 2; floats1 = _mm_mul_pd(floats1, floats2); sum = _mm_add_pd(sum, floats1); } } else { // Use unaligned load. sum = _mm_loadu_pd(u); __m128d floats2 = _mm_loadu_pd(v); // Multiply. sum = _mm_mul_pd(sum, floats2); while (offset <= max_offset) { __m128d floats1 = _mm_loadu_pd(u + offset); floats2 = _mm_loadu_pd(v + offset); offset += 2; floats1 = _mm_mul_pd(floats1, floats2); sum = _mm_add_pd(sum, floats1); } } } // Add the 2 sums in sum horizontally. sum = _mm_hadd_pd(sum, sum); // Extract the low result. double result = _mm_cvtsd_f64(sum); // Add on any left-over products. while (offset < n) { result += u[offset] * v[offset]; ++offset; } return result; } } // namespace tesseract.