/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software.  If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* CHROMA UPSAMPLING */

#include "jsimd_altivec.h"
/*
 * Fancy (triangular-filter) 2:1 horizontal chroma upsampling.
 *
 * For each input sample, two output samples are produced:
 *   out_even = (3 * this + last + 1) >> 2
 *   out_odd  = (3 * this + next + 2) >> 2
 * where "last"/"next" are the horizontally adjacent input samples.  The
 * +1/+2 biases come from pw_one/pw_two below; the >> 2 is the vec_sr by
 * pw_two.  16 input samples are processed per iteration, yielding 32
 * output samples.
 *
 * max_v_samp_factor  = number of rows to process
 * downsampled_width  = width of each input row in samples
 * input_data         = input rows
 * *output_data_ptr   = output rows (each 2x the input width)
 */
void
jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
                                   JDIMENSION downsampled_width,
                                   JSAMPARRAY input_data,
                                   JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow, incol;
  __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0,
    out;
  __vector short this0e, this0o, this0l, this0h, last0l, last0h,
    next0l, next0h, outle, outhe, outlo, outho;

  /* Constants */
  __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
    /* Byte permute indices.  last_index_col0 duplicates byte 0 (first
     * column has no left neighbor); last_index shifts the 32-byte pair
     * (last0,this0) right by one byte; next_index shifts (this0,next0)
     * left by one; next_index_lastcol duplicates the final byte (last
     * column has no right neighbor). */
    last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14},
    last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
    next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
    next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
    /* Interleaves the low bytes of the even-position (outle/outhe) and
     * odd-position (outlo/outho) 16-bit results, packing them back to
     * bytes.  The byte offsets differ by endianness. */
#if __BIG_ENDIAN__
    merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
#else
    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
#endif
  __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];

    /* If the row is not a multiple of 16 samples, replicate the last
     * sample so the final partial vector reads a defined value.
     * NOTE(review): writes one sample past downsampled_width — assumes
     * the row buffer is padded, as libjpeg-allocated rows are. */
    if (downsampled_width & 15)
      inptr[downsampled_width] = inptr[downsampled_width - 1];

    /* Prime the pipeline: for column 0 the "last" neighbor is the
     * sample itself (edge replication). */
    this0 = vec_ld(0, inptr);
    p_last0 = vec_perm(this0, this0, last_index_col0);
    last0 = this0;

    for (incol = downsampled_width; incol > 0;
         incol -= 16, inptr += 16, outptr += 32) {

      /* Not the first iteration: the "last" neighbor straddles the
       * previous and current vectors. */
      if (downsampled_width - incol > 0) {
        p_last0 = vec_perm(last0, this0, last_index);
        last0 = this0;
      }

      if (incol <= 16)
        /* Final vector: replicate the last sample as the "next"
         * neighbor of the last column. */
        p_next0 = vec_perm(this0, this0, next_index_lastcol);
      else {
        /* Peek at the next 16 samples to form the "next" neighbors. */
        next0 = vec_ld(16, inptr);
        p_next0 = vec_perm(this0, next0, next_index);
      }

      /* 3 * this, widened to 16 bits.  vec_mule/vec_mulo produce the
       * even/odd byte products; the merges restore column order,
       * giving the low (this0l) and high (this0h) 8 columns. */
      this0e = (__vector short)vec_mule(this0, pb_three);
      this0o = (__vector short)vec_mulo(this0, pb_three);
      this0l = vec_mergeh(this0e, this0o);
      this0h = vec_mergel(this0e, this0o);

      /* Widen the neighbors to 16 bits and add the rounding biases:
       * +1 for the even outputs, +2 for the odd outputs. */
      last0l = (__vector short)VEC_UNPACKHU(p_last0);
      last0h = (__vector short)VEC_UNPACKLU(p_last0);
      last0l = vec_add(last0l, pw_one);

      next0l = (__vector short)VEC_UNPACKHU(p_next0);
      next0h = (__vector short)VEC_UNPACKLU(p_next0);
      next0l = vec_add(next0l, pw_two);

      /* Low 8 columns: (3*this + last + 1) >> 2 and
       * (3*this + next + 2) >> 2. */
      outle = vec_add(this0l, last0l);
      outlo = vec_add(this0l, next0l);
      outle = vec_sr(outle, (__vector unsigned short)pw_two);
      outlo = vec_sr(outlo, (__vector unsigned short)pw_two);

      /* Interleave even/odd results back into 16 output bytes. */
      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr);

      /* High 8 columns, only if this vector actually contains more
       * than 8 valid samples. */
      if (incol > 8) {
        last0h = vec_add(last0h, pw_one);
        next0h = vec_add(next0h, pw_two);
        outhe = vec_add(this0h, last0h);
        outho = vec_add(this0h, next0h);
        outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
        outho = vec_sr(outho, (__vector unsigned short)pw_two);
        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr);
      }

      /* The peeked-ahead vector becomes the current one. */
      this0 = next0;
    }
  }
}
/*
 * Fancy (triangular-filter) 2:2 chroma upsampling.
 *
 * Each input row produces two output rows, each 2x the input width.
 * Vertically, a column sum is formed per output row:
 *   colsum = 3 * this_row + neighbor_row   (neighbor = row above or below)
 * Horizontally, each column sum then yields two output samples:
 *   out_even = (3 * colsum + last_colsum + 8) >> 4
 *   out_odd  = (3 * colsum + next_colsum + 7) >> 4
 * The +8/+7 biases are pw_eight/pw_seven; the >> 4 is the vec_sr by
 * pw_four.  16 input columns are processed per iteration.
 *
 * NOTE(review): the first iteration reads input_data[inrow - 1] — the
 * caller presumably provides access to the row above (libjpeg's fancy
 * upsamplers are given context rows); confirm against the dispatcher.
 */
void
jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
                                   JDIMENSION downsampled_width,
                                   JSAMPARRAY input_data,
                                   JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  /* _1 suffix = row above (inrow - 1), 0 = current row, 1 = row below. */
  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
  int inrow, outrow, incol;
  __vector unsigned char this_1, this0, this1, out;
  /* l/h suffixes = low/high 8 columns of a 16-column vector;
   * p_last*/p_next* are the column sums shifted one column right/left. */
  __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
    lastcolsum_1h, lastcolsum1h,
    p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
    thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
    nextcolsum_1l = {0}, nextcolsum_1h = {0},
    nextcolsum1l = {0}, nextcolsum1h = {0},
    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
    tmpl, tmph, outle, outhe, outlo, outho;

  /* Constants */
  __vector unsigned char pb_zero = { __16X(0) },
    /* Byte permute indices operating on vectors of 16-bit column sums
     * (hence steps of 2).  last_index_col0 duplicates element 0 for the
     * first column; last_index shifts right by one element across a
     * vector pair; next_index shifts left by one; next_index_lastcol
     * duplicates the final element for the last column. */
    last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13},
    last_index={14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
    next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
    next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
    /* Interleaves the low bytes of the even (outle/outhe) and odd
     * (outlo/outho) 16-bit results, packing them back to bytes. */
#if __BIG_ENDIAN__
    merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
#else
    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
#endif
  __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
    pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
  __vector unsigned short pw_four = { __8X(4) };

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

    inptr_1 = input_data[inrow - 1];
    inptr0 = input_data[inrow];
    inptr1 = input_data[inrow + 1];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    /* If the rows are not a multiple of 16 samples, replicate the last
     * sample of each so the final partial vector reads defined values.
     * NOTE(review): writes one sample past downsampled_width — assumes
     * padded row buffers, as libjpeg-allocated rows are. */
    if (downsampled_width & 15) {
      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
    }

    /* Prime the pipeline: 3 * current row, widened to 16 bits. */
    this0 = vec_ld(0, inptr0);
    this0l = (__vector short)VEC_UNPACKHU(this0);
    this0h = (__vector short)VEC_UNPACKLU(this0);
    this0l = vec_mladd(this0l, pw_three, pw_zero);
    this0h = vec_mladd(this0h, pw_three, pw_zero);

    /* Column sums vs. the row above; for column 0 the "last" sum is
     * the sum itself (edge replication). */
    this_1 = vec_ld(0, inptr_1);
    this_1l = (__vector short)VEC_UNPACKHU(this_1);
    this_1h = (__vector short)VEC_UNPACKLU(this_1);
    thiscolsum_1l = vec_add(this0l, this_1l);
    thiscolsum_1h = vec_add(this0h, this_1h);
    lastcolsum_1h = thiscolsum_1h;
    p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
    p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);

    /* Same for the row below. */
    this1 = vec_ld(0, inptr1);
    this1l = (__vector short)VEC_UNPACKHU(this1);
    this1h = (__vector short)VEC_UNPACKLU(this1);
    thiscolsum1l = vec_add(this0l, this1l);
    thiscolsum1h = vec_add(this0h, this1h);
    lastcolsum1h = thiscolsum1h;
    p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
    p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);

    for (incol = downsampled_width; incol > 0;
         incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
         outptr0 += 32, outptr1 += 32) {

      /* Not the first iteration: "last" column sums straddle the
       * previous high half and the current low half. */
      if (downsampled_width - incol > 0) {
        p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
        p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
        p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
        p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
        lastcolsum_1h = thiscolsum_1h; lastcolsum1h = thiscolsum1h;
      }

      if (incol <= 16) {
        /* Final vector: replicate the last column sum as the "next"
         * neighbor of the last column. */
        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
        p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
                                   next_index_lastcol);
        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
        p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
                                  next_index_lastcol);
      } else {
        /* Peek ahead 16 columns and compute their column sums so the
         * "next" neighbors of the current high half are available. */
        this0 = vec_ld(16, inptr0);
        this0l = (__vector short)VEC_UNPACKHU(this0);
        this0h = (__vector short)VEC_UNPACKLU(this0);
        this0l = vec_mladd(this0l, pw_three, pw_zero);
        this0h = vec_mladd(this0h, pw_three, pw_zero);

        this_1 = vec_ld(16, inptr_1);
        this_1l = (__vector short)VEC_UNPACKHU(this_1);
        this_1h = (__vector short)VEC_UNPACKLU(this_1);
        nextcolsum_1l = vec_add(this0l, this_1l);
        nextcolsum_1h = vec_add(this0h, this_1h);
        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
        p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);

        this1 = vec_ld(16, inptr1);
        this1l = (__vector short)VEC_UNPACKHU(this1);
        this1h = (__vector short)VEC_UNPACKLU(this1);
        nextcolsum1l = vec_add(this0l, this1l);
        nextcolsum1h = vec_add(this0h, this1h);
        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
        p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
      }

      /* Process the upper row:
       * (3*colsum + last + 8) >> 4 and (3*colsum + next + 7) >> 4. */
      tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
      outle = vec_add(tmpl, p_lastcolsum_1l);
      outle = vec_add(outle, pw_eight);
      outle = vec_sr(outle, pw_four);

      outlo = vec_add(tmpl, p_nextcolsum_1l);
      outlo = vec_add(outlo, pw_seven);
      outlo = vec_sr(outlo, pw_four);

      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr0);

      /* High 8 columns, only if more than 8 valid samples remain. */
      if (incol > 8) {
        tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
        outhe = vec_add(tmph, p_lastcolsum_1h);
        outhe = vec_add(outhe, pw_eight);
        outhe = vec_sr(outhe, pw_four);

        outho = vec_add(tmph, p_nextcolsum_1h);
        outho = vec_add(outho, pw_seven);
        outho = vec_sr(outho, pw_four);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr0);
      }

      /* Process the lower row (same filter, row-below column sums). */
      tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
      outle = vec_add(tmpl, p_lastcolsum1l);
      outle = vec_add(outle, pw_eight);
      outle = vec_sr(outle, pw_four);

      outlo = vec_add(tmpl, p_nextcolsum1l);
      outlo = vec_add(outlo, pw_seven);
      outlo = vec_sr(outlo, pw_four);

      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr1);

      if (incol > 8) {
        tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
        outhe = vec_add(tmph, p_lastcolsum1h);
        outhe = vec_add(outhe, pw_eight);
        outhe = vec_sr(outhe, pw_four);

        outho = vec_add(tmph, p_nextcolsum1h);
        outho = vec_add(outho, pw_seven);
        outho = vec_sr(outho, pw_four);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr1);
      }

      /* The peeked-ahead column sums become the current ones. */
      thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h;
      thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h;
    }
  }
}
  254. /* These are rarely used (mainly just for decompressing YCCK images) */
  255. void
  256. jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
  257. JDIMENSION output_width,
  258. JSAMPARRAY input_data,
  259. JSAMPARRAY *output_data_ptr)
  260. {
  261. JSAMPARRAY output_data = *output_data_ptr;
  262. JSAMPROW inptr, outptr;
  263. int inrow, incol;
  264. __vector unsigned char in, inl, inh;
  265. for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
  266. inptr = input_data[inrow];
  267. outptr = output_data[inrow];
  268. for (incol = (output_width + 31) & (~31); incol > 0;
  269. incol -= 64, inptr += 32, outptr += 64) {
  270. in = vec_ld(0, inptr);
  271. inl = vec_mergeh(in, in);
  272. inh = vec_mergel(in, in);
  273. vec_st(inl, 0, outptr);
  274. vec_st(inh, 16, outptr);
  275. if (incol > 32) {
  276. in = vec_ld(16, inptr);
  277. inl = vec_mergeh(in, in);
  278. inh = vec_mergel(in, in);
  279. vec_st(inl, 32, outptr);
  280. vec_st(inh, 48, outptr);
  281. }
  282. }
  283. }
  284. }
  285. void
  286. jsimd_h2v2_upsample_altivec (int max_v_samp_factor,
  287. JDIMENSION output_width,
  288. JSAMPARRAY input_data,
  289. JSAMPARRAY *output_data_ptr)
  290. {
  291. JSAMPARRAY output_data = *output_data_ptr;
  292. JSAMPROW inptr, outptr0, outptr1;
  293. int inrow, outrow, incol;
  294. __vector unsigned char in, inl, inh;
  295. for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
  296. inptr = input_data[inrow];
  297. outptr0 = output_data[outrow++];
  298. outptr1 = output_data[outrow++];
  299. for (incol = (output_width + 31) & (~31); incol > 0;
  300. incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) {
  301. in = vec_ld(0, inptr);
  302. inl = vec_mergeh(in, in);
  303. inh = vec_mergel(in, in);
  304. vec_st(inl, 0, outptr0);
  305. vec_st(inl, 0, outptr1);
  306. vec_st(inh, 16, outptr0);
  307. vec_st(inh, 16, outptr1);
  308. if (incol > 32) {
  309. in = vec_ld(16, inptr);
  310. inl = vec_mergeh(in, in);
  311. inh = vec_mergel(in, in);
  312. vec_st(inl, 32, outptr0);
  313. vec_st(inl, 32, outptr1);
  314. vec_st(inh, 48, outptr0);
  315. vec_st(inh, 48, outptr1);
  316. }
  317. }
  318. }
  319. }