compare_gcc.cc

/*
 *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
  uint32 sse;
  asm volatile (
    "pxor      %%xmm0,%%xmm0                   \n"  // xmm0 = running sum
    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = zero for unpack
    LABELALIGN
  "1:                                          \n"
    // Load 16 bytes from each source and advance both pointers.
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10, 0) ",%0          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10, 1) ",%1          \n"
    // Absolute difference: saturating subtract in both directions, then OR.
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psubusb   %%xmm2,%%xmm1                   \n"
    "psubusb   %%xmm3,%%xmm2                   \n"
    "por       %%xmm2,%%xmm1                   \n"
    // Widen to 16 bits, square and pairwise-add with pmaddwd, accumulate.
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpckhbw %%xmm5,%%xmm2                   \n"
    "pmaddwd   %%xmm1,%%xmm1                   \n"
    "pmaddwd   %%xmm2,%%xmm2                   \n"
    "paddd     %%xmm1,%%xmm0                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    // Horizontal add of the four 32-bit lanes of xmm0 into lane 0.
    "pshufd    $0xee,%%xmm0,%%xmm1             \n"
    "paddd     %%xmm1,%%xmm0                   \n"
    "pshufd    $0x1,%%xmm0,%%xmm1              \n"
    "paddd     %%xmm1,%%xmm0                   \n"
    "movd      %%xmm0,%3                       \n"
  : "+r"(src_a),  // %0
    "+r"(src_b),  // %1
    "+r"(count),  // %2
    "=g"(sse)     // %3
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
  return sse;
}
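
// Illustrative scalar equivalent (an editorial sketch, not part of the
// upstream file; the function name below is hypothetical): the SSE2 routine
// above returns the sum of squared byte differences, consuming 16 bytes per
// loop iteration.
static uint32 SumSquareError_Sketch(const uint8* src_a, const uint8* src_b,
                                    int count) {
  uint32 sse = 0u;
  int i;
  for (i = 0; i < count; ++i) {
    int diff = (int)src_a[i] - (int)src_b[i];
    sse += (uint32)(diff * diff);
  }
  return sse;
}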
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16

static uvec32 kHashMul0 = {
  0x0c3525e1,  // 33 ^ 15
  0xa3476dc1,  // 33 ^ 14
  0x3b4039a1,  // 33 ^ 13
  0x4f5f0981,  // 33 ^ 12
};
static uvec32 kHashMul1 = {
  0x30f35d61,  // 33 ^ 11
  0x855cb541,  // 33 ^ 10
  0x040a9121,  // 33 ^ 9
  0x747c7101,  // 33 ^ 8
};
static uvec32 kHashMul2 = {
  0xec41d4e1,  // 33 ^ 7
  0x4cfa3cc1,  // 33 ^ 6
  0x025528a1,  // 33 ^ 5
  0x00121881,  // 33 ^ 4
};
static uvec32 kHashMul3 = {
  0x00008c61,  // 33 ^ 3
  0x00000441,  // 33 ^ 2
  0x00000021,  // 33 ^ 1
  0x00000001,  // 33 ^ 0
};
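
// Explanatory note on the constants above: the djb2 update
// hash = hash * 33 + byte, applied to 16 consecutive bytes, folds into
//   hash' = hash * 33^16 + src[0]*33^15 + src[1]*33^14 + ... + src[15]*33^0
// so kHash16x33 advances the running hash by one 16-byte block while
// kHashMul0..kHashMul3 weight the 16 bytes by their positional powers of 33.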
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
  uint32 hash;
  asm volatile (
    "movd      %2,%%xmm0                       \n"  // xmm0 lane 0 = seed
    "pxor      %%xmm7,%%xmm7                   \n"  // xmm7 = zero for unpack
    "movdqa    %4,%%xmm6                       \n"  // xmm6 = {33^16, 0, 0, 0}
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"  // load 16 source bytes
    "lea       " MEMLEA(0x10, 0) ",%0          \n"
    "pmulld    %%xmm6,%%xmm0                   \n"  // hash (lane 0) *= 33^16
    // Widen the 16 bytes to 32 bits in groups of 4 and multiply each group
    // by its positional powers of 33 (kHashMul0..kHashMul3).
    "movdqa    %5,%%xmm5                       \n"
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm7,%%xmm3                   \n"
    "pmulld    %%xmm5,%%xmm3                   \n"
    "movdqa    %6,%%xmm5                       \n"
    "movdqa    %%xmm2,%%xmm4                   \n"
    "punpckhwd %%xmm7,%%xmm4                   \n"
    "pmulld    %%xmm5,%%xmm4                   \n"
    "movdqa    %7,%%xmm5                       \n"
    "punpckhbw %%xmm7,%%xmm1                   \n"
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklwd %%xmm7,%%xmm2                   \n"
    "pmulld    %%xmm5,%%xmm2                   \n"
    "movdqa    %8,%%xmm5                       \n"
    "punpckhwd %%xmm7,%%xmm1                   \n"
    "pmulld    %%xmm5,%%xmm1                   \n"
    // Sum the 16 weighted bytes horizontally and add into the running hash.
    "paddd     %%xmm4,%%xmm3                   \n"
    "paddd     %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm1                   \n"
    "pshufd    $0xe,%%xmm1,%%xmm2              \n"
    "paddd     %%xmm2,%%xmm1                   \n"
    "pshufd    $0x1,%%xmm1,%%xmm2              \n"
    "paddd     %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%1                        \n"
    "jg        1b                              \n"
    "movd      %%xmm0,%3                       \n"
  : "+r"(src),   // %0
    "+r"(count), // %1
    "+rm"(seed), // %2
    "=g"(hash)   // %3
  : "m"(kHash16x33),  // %4
    "m"(kHashMul0),   // %5
    "m"(kHashMul1),   // %6
    "m"(kHashMul2),   // %7
    "m"(kHashMul3)    // %8
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
  return hash;
}
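
// Illustrative scalar equivalent (an editorial sketch, not part of the
// upstream file; the function name below is hypothetical): the SSE4.1 routine
// above computes the seeded djb2 hash, hash = hash * 33 + byte, consuming
// 16 bytes per loop iteration.
static uint32 HashDjb2_Sketch(const uint8* src, int count, uint32 seed) {
  uint32 hash = seed;
  int i;
  for (i = 0; i < count; ++i) {
    hash = hash * 33u + src[i];
  }
  return hash;
}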
#endif  // defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif