vf_scale_cuda.cu 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. /*
  2. * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
  3. *
  4. * Permission is hereby granted, free of charge, to any person obtaining a
  5. * copy of this software and associated documentation files (the "Software"),
  6. * to deal in the Software without restriction, including without limitation
  7. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8. * and/or sell copies of the Software, and to permit persons to whom the
  9. * Software is furnished to do so, subject to the following conditions:
  10. *
  11. * The above copyright notice and this permission notice shall be included in
  12. * all copies or substantial portions of the Software.
  13. *
  14. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  17. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19. * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  20. * DEALINGS IN THE SOFTWARE.
  21. */
  22. extern "C" {
  23. __global__ void Subsample_Bilinear_uchar(cudaTextureObject_t uchar_tex,
  24. unsigned char *dst,
  25. int dst_width, int dst_height, int dst_pitch,
  26. int src_width, int src_height)
  27. {
  28. int xo = blockIdx.x * blockDim.x + threadIdx.x;
  29. int yo = blockIdx.y * blockDim.y + threadIdx.y;
  30. if (yo < dst_height && xo < dst_width)
  31. {
  32. float hscale = (float)src_width / (float)dst_width;
  33. float vscale = (float)src_height / (float)dst_height;
  34. float xi = (xo + 0.5f) * hscale;
  35. float yi = (yo + 0.5f) * vscale;
  36. // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
  37. float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
  38. float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
  39. // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
  40. float dx = wh / (0.5f + wh);
  41. float dy = wv / (0.5f + wv);
  42. int y0 = tex2D<unsigned char>(uchar_tex, xi-dx, yi-dy);
  43. int y1 = tex2D<unsigned char>(uchar_tex, xi+dx, yi-dy);
  44. int y2 = tex2D<unsigned char>(uchar_tex, xi-dx, yi+dy);
  45. int y3 = tex2D<unsigned char>(uchar_tex, xi+dx, yi+dy);
  46. dst[yo*dst_pitch+xo] = (unsigned char)((y0+y1+y2+y3+2) >> 2);
  47. }
  48. }
  49. __global__ void Subsample_Bilinear_uchar2(cudaTextureObject_t uchar2_tex,
  50. uchar2 *dst,
  51. int dst_width, int dst_height, int dst_pitch2,
  52. int src_width, int src_height)
  53. {
  54. int xo = blockIdx.x * blockDim.x + threadIdx.x;
  55. int yo = blockIdx.y * blockDim.y + threadIdx.y;
  56. if (yo < dst_height && xo < dst_width)
  57. {
  58. float hscale = (float)src_width / (float)dst_width;
  59. float vscale = (float)src_height / (float)dst_height;
  60. float xi = (xo + 0.5f) * hscale;
  61. float yi = (yo + 0.5f) * vscale;
  62. // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
  63. float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
  64. float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
  65. // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
  66. float dx = wh / (0.5f + wh);
  67. float dy = wv / (0.5f + wv);
  68. uchar2 c0 = tex2D<uchar2>(uchar2_tex, xi-dx, yi-dy);
  69. uchar2 c1 = tex2D<uchar2>(uchar2_tex, xi+dx, yi-dy);
  70. uchar2 c2 = tex2D<uchar2>(uchar2_tex, xi-dx, yi+dy);
  71. uchar2 c3 = tex2D<uchar2>(uchar2_tex, xi+dx, yi+dy);
  72. int2 uv;
  73. uv.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
  74. uv.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
  75. dst[yo*dst_pitch2+xo] = make_uchar2((unsigned char)uv.x, (unsigned char)uv.y);
  76. }
  77. }
  78. __global__ void Subsample_Bilinear_uchar4(cudaTextureObject_t uchar4_tex,
  79. uchar4 *dst,
  80. int dst_width, int dst_height, int dst_pitch,
  81. int src_width, int src_height)
  82. {
  83. int xo = blockIdx.x * blockDim.x + threadIdx.x;
  84. int yo = blockIdx.y * blockDim.y + threadIdx.y;
  85. if (yo < dst_height && xo < dst_width)
  86. {
  87. float hscale = (float)src_width / (float)dst_width;
  88. float vscale = (float)src_height / (float)dst_height;
  89. float xi = (xo + 0.5f) * hscale;
  90. float yi = (yo + 0.5f) * vscale;
  91. // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
  92. float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
  93. float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
  94. // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
  95. float dx = wh / (0.5f + wh);
  96. float dy = wv / (0.5f + wv);
  97. uchar4 c0 = tex2D<uchar4>(uchar4_tex, xi-dx, yi-dy);
  98. uchar4 c1 = tex2D<uchar4>(uchar4_tex, xi+dx, yi-dy);
  99. uchar4 c2 = tex2D<uchar4>(uchar4_tex, xi-dx, yi+dy);
  100. uchar4 c3 = tex2D<uchar4>(uchar4_tex, xi+dx, yi+dy);
  101. int4 res;
  102. res.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
  103. res.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
  104. res.z = ((int)c0.z+(int)c1.z+(int)c2.z+(int)c3.z+2) >> 2;
  105. res.w = ((int)c0.w+(int)c1.w+(int)c2.w+(int)c3.w+2) >> 2;
  106. dst[yo*dst_pitch+xo] = make_uchar4(
  107. (unsigned char)res.x, (unsigned char)res.y, (unsigned char)res.z, (unsigned char)res.w);
  108. }
  109. }
  110. __global__ void Subsample_Bilinear_ushort(cudaTextureObject_t ushort_tex,
  111. unsigned short *dst,
  112. int dst_width, int dst_height, int dst_pitch,
  113. int src_width, int src_height)
  114. {
  115. int xo = blockIdx.x * blockDim.x + threadIdx.x;
  116. int yo = blockIdx.y * blockDim.y + threadIdx.y;
  117. if (yo < dst_height && xo < dst_width)
  118. {
  119. float hscale = (float)src_width / (float)dst_width;
  120. float vscale = (float)src_height / (float)dst_height;
  121. float xi = (xo + 0.5f) * hscale;
  122. float yi = (yo + 0.5f) * vscale;
  123. // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
  124. float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
  125. float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
  126. // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
  127. float dx = wh / (0.5f + wh);
  128. float dy = wv / (0.5f + wv);
  129. int y0 = tex2D<unsigned short>(ushort_tex, xi-dx, yi-dy);
  130. int y1 = tex2D<unsigned short>(ushort_tex, xi+dx, yi-dy);
  131. int y2 = tex2D<unsigned short>(ushort_tex, xi-dx, yi+dy);
  132. int y3 = tex2D<unsigned short>(ushort_tex, xi+dx, yi+dy);
  133. dst[yo*dst_pitch+xo] = (unsigned short)((y0+y1+y2+y3+2) >> 2);
  134. }
  135. }
  136. __global__ void Subsample_Bilinear_ushort2(cudaTextureObject_t ushort2_tex,
  137. ushort2 *dst,
  138. int dst_width, int dst_height, int dst_pitch2,
  139. int src_width, int src_height)
  140. {
  141. int xo = blockIdx.x * blockDim.x + threadIdx.x;
  142. int yo = blockIdx.y * blockDim.y + threadIdx.y;
  143. if (yo < dst_height && xo < dst_width)
  144. {
  145. float hscale = (float)src_width / (float)dst_width;
  146. float vscale = (float)src_height / (float)dst_height;
  147. float xi = (xo + 0.5f) * hscale;
  148. float yi = (yo + 0.5f) * vscale;
  149. // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
  150. float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
  151. float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
  152. // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
  153. float dx = wh / (0.5f + wh);
  154. float dy = wv / (0.5f + wv);
  155. ushort2 c0 = tex2D<ushort2>(ushort2_tex, xi-dx, yi-dy);
  156. ushort2 c1 = tex2D<ushort2>(ushort2_tex, xi+dx, yi-dy);
  157. ushort2 c2 = tex2D<ushort2>(ushort2_tex, xi-dx, yi+dy);
  158. ushort2 c3 = tex2D<ushort2>(ushort2_tex, xi+dx, yi+dy);
  159. int2 uv;
  160. uv.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
  161. uv.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
  162. dst[yo*dst_pitch2+xo] = make_ushort2((unsigned short)uv.x, (unsigned short)uv.y);
  163. }
  164. }
  165. __global__ void Subsample_Bilinear_ushort4(cudaTextureObject_t ushort4_tex,
  166. ushort4 *dst,
  167. int dst_width, int dst_height, int dst_pitch,
  168. int src_width, int src_height)
  169. {
  170. int xo = blockIdx.x * blockDim.x + threadIdx.x;
  171. int yo = blockIdx.y * blockDim.y + threadIdx.y;
  172. if (yo < dst_height && xo < dst_width)
  173. {
  174. float hscale = (float)src_width / (float)dst_width;
  175. float vscale = (float)src_height / (float)dst_height;
  176. float xi = (xo + 0.5f) * hscale;
  177. float yi = (yo + 0.5f) * vscale;
  178. // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
  179. float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
  180. float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
  181. // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
  182. float dx = wh / (0.5f + wh);
  183. float dy = wv / (0.5f + wv);
  184. ushort4 c0 = tex2D<ushort4>(ushort4_tex, xi-dx, yi-dy);
  185. ushort4 c1 = tex2D<ushort4>(ushort4_tex, xi+dx, yi-dy);
  186. ushort4 c2 = tex2D<ushort4>(ushort4_tex, xi-dx, yi+dy);
  187. ushort4 c3 = tex2D<ushort4>(ushort4_tex, xi+dx, yi+dy);
  188. int4 res;
  189. res.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
  190. res.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
  191. res.z = ((int)c0.z+(int)c1.z+(int)c2.z+(int)c3.z+2) >> 2;
  192. res.w = ((int)c0.w+(int)c1.w+(int)c2.w+(int)c3.w+2) >> 2;
  193. dst[yo*dst_pitch+xo] = make_ushort4(
  194. (unsigned short)res.x, (unsigned short)res.y, (unsigned short)res.z, (unsigned short)res.w);
  195. }
  196. }
  197. }