rotate_neon.cc 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534
  1. /*
  2. * Copyright 2011 The LibYuv Project Authors. All rights reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "libyuv/row.h"
  11. #include "libyuv/rotate_row.h"
  12. #include "libyuv/basic_types.h"
  13. #ifdef __cplusplus
  14. namespace libyuv {
  15. extern "C" {
  16. #endif
  17. #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
  18. !defined(__aarch64__)
  19. static uvec8 kVTbl4x4Transpose =
  20. { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
  21. void TransposeWx8_NEON(const uint8* src, int src_stride,
  22. uint8* dst, int dst_stride,
  23. int width) {
  24. const uint8* src_temp;
  25. asm volatile (
  26. // loops are on blocks of 8. loop will stop when
  27. // counter gets to or below 0. starting the counter
  28. // at w-8 allow for this
  29. "sub %5, #8 \n"
  30. // handle 8x8 blocks. this should be the majority of the plane
  31. "1: \n"
  32. "mov %0, %1 \n"
  33. MEMACCESS(0)
  34. "vld1.8 {d0}, [%0], %2 \n"
  35. MEMACCESS(0)
  36. "vld1.8 {d1}, [%0], %2 \n"
  37. MEMACCESS(0)
  38. "vld1.8 {d2}, [%0], %2 \n"
  39. MEMACCESS(0)
  40. "vld1.8 {d3}, [%0], %2 \n"
  41. MEMACCESS(0)
  42. "vld1.8 {d4}, [%0], %2 \n"
  43. MEMACCESS(0)
  44. "vld1.8 {d5}, [%0], %2 \n"
  45. MEMACCESS(0)
  46. "vld1.8 {d6}, [%0], %2 \n"
  47. MEMACCESS(0)
  48. "vld1.8 {d7}, [%0] \n"
  49. "vtrn.8 d1, d0 \n"
  50. "vtrn.8 d3, d2 \n"
  51. "vtrn.8 d5, d4 \n"
  52. "vtrn.8 d7, d6 \n"
  53. "vtrn.16 d1, d3 \n"
  54. "vtrn.16 d0, d2 \n"
  55. "vtrn.16 d5, d7 \n"
  56. "vtrn.16 d4, d6 \n"
  57. "vtrn.32 d1, d5 \n"
  58. "vtrn.32 d0, d4 \n"
  59. "vtrn.32 d3, d7 \n"
  60. "vtrn.32 d2, d6 \n"
  61. "vrev16.8 q0, q0 \n"
  62. "vrev16.8 q1, q1 \n"
  63. "vrev16.8 q2, q2 \n"
  64. "vrev16.8 q3, q3 \n"
  65. "mov %0, %3 \n"
  66. MEMACCESS(0)
  67. "vst1.8 {d1}, [%0], %4 \n"
  68. MEMACCESS(0)
  69. "vst1.8 {d0}, [%0], %4 \n"
  70. MEMACCESS(0)
  71. "vst1.8 {d3}, [%0], %4 \n"
  72. MEMACCESS(0)
  73. "vst1.8 {d2}, [%0], %4 \n"
  74. MEMACCESS(0)
  75. "vst1.8 {d5}, [%0], %4 \n"
  76. MEMACCESS(0)
  77. "vst1.8 {d4}, [%0], %4 \n"
  78. MEMACCESS(0)
  79. "vst1.8 {d7}, [%0], %4 \n"
  80. MEMACCESS(0)
  81. "vst1.8 {d6}, [%0] \n"
  82. "add %1, #8 \n" // src += 8
  83. "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
  84. "subs %5, #8 \n" // w -= 8
  85. "bge 1b \n"
  86. // add 8 back to counter. if the result is 0 there are
  87. // no residuals.
  88. "adds %5, #8 \n"
  89. "beq 4f \n"
  90. // some residual, so between 1 and 7 lines left to transpose
  91. "cmp %5, #2 \n"
  92. "blt 3f \n"
  93. "cmp %5, #4 \n"
  94. "blt 2f \n"
  95. // 4x8 block
  96. "mov %0, %1 \n"
  97. MEMACCESS(0)
  98. "vld1.32 {d0[0]}, [%0], %2 \n"
  99. MEMACCESS(0)
  100. "vld1.32 {d0[1]}, [%0], %2 \n"
  101. MEMACCESS(0)
  102. "vld1.32 {d1[0]}, [%0], %2 \n"
  103. MEMACCESS(0)
  104. "vld1.32 {d1[1]}, [%0], %2 \n"
  105. MEMACCESS(0)
  106. "vld1.32 {d2[0]}, [%0], %2 \n"
  107. MEMACCESS(0)
  108. "vld1.32 {d2[1]}, [%0], %2 \n"
  109. MEMACCESS(0)
  110. "vld1.32 {d3[0]}, [%0], %2 \n"
  111. MEMACCESS(0)
  112. "vld1.32 {d3[1]}, [%0] \n"
  113. "mov %0, %3 \n"
  114. MEMACCESS(6)
  115. "vld1.8 {q3}, [%6] \n"
  116. "vtbl.8 d4, {d0, d1}, d6 \n"
  117. "vtbl.8 d5, {d0, d1}, d7 \n"
  118. "vtbl.8 d0, {d2, d3}, d6 \n"
  119. "vtbl.8 d1, {d2, d3}, d7 \n"
  120. // TODO(frkoenig): Rework shuffle above to
  121. // write out with 4 instead of 8 writes.
  122. MEMACCESS(0)
  123. "vst1.32 {d4[0]}, [%0], %4 \n"
  124. MEMACCESS(0)
  125. "vst1.32 {d4[1]}, [%0], %4 \n"
  126. MEMACCESS(0)
  127. "vst1.32 {d5[0]}, [%0], %4 \n"
  128. MEMACCESS(0)
  129. "vst1.32 {d5[1]}, [%0] \n"
  130. "add %0, %3, #4 \n"
  131. MEMACCESS(0)
  132. "vst1.32 {d0[0]}, [%0], %4 \n"
  133. MEMACCESS(0)
  134. "vst1.32 {d0[1]}, [%0], %4 \n"
  135. MEMACCESS(0)
  136. "vst1.32 {d1[0]}, [%0], %4 \n"
  137. MEMACCESS(0)
  138. "vst1.32 {d1[1]}, [%0] \n"
  139. "add %1, #4 \n" // src += 4
  140. "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
  141. "subs %5, #4 \n" // w -= 4
  142. "beq 4f \n"
  143. // some residual, check to see if it includes a 2x8 block,
  144. // or less
  145. "cmp %5, #2 \n"
  146. "blt 3f \n"
  147. // 2x8 block
  148. "2: \n"
  149. "mov %0, %1 \n"
  150. MEMACCESS(0)
  151. "vld1.16 {d0[0]}, [%0], %2 \n"
  152. MEMACCESS(0)
  153. "vld1.16 {d1[0]}, [%0], %2 \n"
  154. MEMACCESS(0)
  155. "vld1.16 {d0[1]}, [%0], %2 \n"
  156. MEMACCESS(0)
  157. "vld1.16 {d1[1]}, [%0], %2 \n"
  158. MEMACCESS(0)
  159. "vld1.16 {d0[2]}, [%0], %2 \n"
  160. MEMACCESS(0)
  161. "vld1.16 {d1[2]}, [%0], %2 \n"
  162. MEMACCESS(0)
  163. "vld1.16 {d0[3]}, [%0], %2 \n"
  164. MEMACCESS(0)
  165. "vld1.16 {d1[3]}, [%0] \n"
  166. "vtrn.8 d0, d1 \n"
  167. "mov %0, %3 \n"
  168. MEMACCESS(0)
  169. "vst1.64 {d0}, [%0], %4 \n"
  170. MEMACCESS(0)
  171. "vst1.64 {d1}, [%0] \n"
  172. "add %1, #2 \n" // src += 2
  173. "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
  174. "subs %5, #2 \n" // w -= 2
  175. "beq 4f \n"
  176. // 1x8 block
  177. "3: \n"
  178. MEMACCESS(1)
  179. "vld1.8 {d0[0]}, [%1], %2 \n"
  180. MEMACCESS(1)
  181. "vld1.8 {d0[1]}, [%1], %2 \n"
  182. MEMACCESS(1)
  183. "vld1.8 {d0[2]}, [%1], %2 \n"
  184. MEMACCESS(1)
  185. "vld1.8 {d0[3]}, [%1], %2 \n"
  186. MEMACCESS(1)
  187. "vld1.8 {d0[4]}, [%1], %2 \n"
  188. MEMACCESS(1)
  189. "vld1.8 {d0[5]}, [%1], %2 \n"
  190. MEMACCESS(1)
  191. "vld1.8 {d0[6]}, [%1], %2 \n"
  192. MEMACCESS(1)
  193. "vld1.8 {d0[7]}, [%1] \n"
  194. MEMACCESS(3)
  195. "vst1.64 {d0}, [%3] \n"
  196. "4: \n"
  197. : "=&r"(src_temp), // %0
  198. "+r"(src), // %1
  199. "+r"(src_stride), // %2
  200. "+r"(dst), // %3
  201. "+r"(dst_stride), // %4
  202. "+r"(width) // %5
  203. : "r"(&kVTbl4x4Transpose) // %6
  204. : "memory", "cc", "q0", "q1", "q2", "q3"
  205. );
  206. }
  207. static uvec8 kVTbl4x4TransposeDi =
  208. { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
  209. void TransposeUVWx8_NEON(const uint8* src, int src_stride,
  210. uint8* dst_a, int dst_stride_a,
  211. uint8* dst_b, int dst_stride_b,
  212. int width) {
  213. const uint8* src_temp;
  214. asm volatile (
  215. // loops are on blocks of 8. loop will stop when
  216. // counter gets to or below 0. starting the counter
  217. // at w-8 allow for this
  218. "sub %7, #8 \n"
  219. // handle 8x8 blocks. this should be the majority of the plane
  220. "1: \n"
  221. "mov %0, %1 \n"
  222. MEMACCESS(0)
  223. "vld2.8 {d0, d1}, [%0], %2 \n"
  224. MEMACCESS(0)
  225. "vld2.8 {d2, d3}, [%0], %2 \n"
  226. MEMACCESS(0)
  227. "vld2.8 {d4, d5}, [%0], %2 \n"
  228. MEMACCESS(0)
  229. "vld2.8 {d6, d7}, [%0], %2 \n"
  230. MEMACCESS(0)
  231. "vld2.8 {d16, d17}, [%0], %2 \n"
  232. MEMACCESS(0)
  233. "vld2.8 {d18, d19}, [%0], %2 \n"
  234. MEMACCESS(0)
  235. "vld2.8 {d20, d21}, [%0], %2 \n"
  236. MEMACCESS(0)
  237. "vld2.8 {d22, d23}, [%0] \n"
  238. "vtrn.8 q1, q0 \n"
  239. "vtrn.8 q3, q2 \n"
  240. "vtrn.8 q9, q8 \n"
  241. "vtrn.8 q11, q10 \n"
  242. "vtrn.16 q1, q3 \n"
  243. "vtrn.16 q0, q2 \n"
  244. "vtrn.16 q9, q11 \n"
  245. "vtrn.16 q8, q10 \n"
  246. "vtrn.32 q1, q9 \n"
  247. "vtrn.32 q0, q8 \n"
  248. "vtrn.32 q3, q11 \n"
  249. "vtrn.32 q2, q10 \n"
  250. "vrev16.8 q0, q0 \n"
  251. "vrev16.8 q1, q1 \n"
  252. "vrev16.8 q2, q2 \n"
  253. "vrev16.8 q3, q3 \n"
  254. "vrev16.8 q8, q8 \n"
  255. "vrev16.8 q9, q9 \n"
  256. "vrev16.8 q10, q10 \n"
  257. "vrev16.8 q11, q11 \n"
  258. "mov %0, %3 \n"
  259. MEMACCESS(0)
  260. "vst1.8 {d2}, [%0], %4 \n"
  261. MEMACCESS(0)
  262. "vst1.8 {d0}, [%0], %4 \n"
  263. MEMACCESS(0)
  264. "vst1.8 {d6}, [%0], %4 \n"
  265. MEMACCESS(0)
  266. "vst1.8 {d4}, [%0], %4 \n"
  267. MEMACCESS(0)
  268. "vst1.8 {d18}, [%0], %4 \n"
  269. MEMACCESS(0)
  270. "vst1.8 {d16}, [%0], %4 \n"
  271. MEMACCESS(0)
  272. "vst1.8 {d22}, [%0], %4 \n"
  273. MEMACCESS(0)
  274. "vst1.8 {d20}, [%0] \n"
  275. "mov %0, %5 \n"
  276. MEMACCESS(0)
  277. "vst1.8 {d3}, [%0], %6 \n"
  278. MEMACCESS(0)
  279. "vst1.8 {d1}, [%0], %6 \n"
  280. MEMACCESS(0)
  281. "vst1.8 {d7}, [%0], %6 \n"
  282. MEMACCESS(0)
  283. "vst1.8 {d5}, [%0], %6 \n"
  284. MEMACCESS(0)
  285. "vst1.8 {d19}, [%0], %6 \n"
  286. MEMACCESS(0)
  287. "vst1.8 {d17}, [%0], %6 \n"
  288. MEMACCESS(0)
  289. "vst1.8 {d23}, [%0], %6 \n"
  290. MEMACCESS(0)
  291. "vst1.8 {d21}, [%0] \n"
  292. "add %1, #8*2 \n" // src += 8*2
  293. "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
  294. "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
  295. "subs %7, #8 \n" // w -= 8
  296. "bge 1b \n"
  297. // add 8 back to counter. if the result is 0 there are
  298. // no residuals.
  299. "adds %7, #8 \n"
  300. "beq 4f \n"
  301. // some residual, so between 1 and 7 lines left to transpose
  302. "cmp %7, #2 \n"
  303. "blt 3f \n"
  304. "cmp %7, #4 \n"
  305. "blt 2f \n"
  306. // TODO(frkoenig): Clean this up
  307. // 4x8 block
  308. "mov %0, %1 \n"
  309. MEMACCESS(0)
  310. "vld1.64 {d0}, [%0], %2 \n"
  311. MEMACCESS(0)
  312. "vld1.64 {d1}, [%0], %2 \n"
  313. MEMACCESS(0)
  314. "vld1.64 {d2}, [%0], %2 \n"
  315. MEMACCESS(0)
  316. "vld1.64 {d3}, [%0], %2 \n"
  317. MEMACCESS(0)
  318. "vld1.64 {d4}, [%0], %2 \n"
  319. MEMACCESS(0)
  320. "vld1.64 {d5}, [%0], %2 \n"
  321. MEMACCESS(0)
  322. "vld1.64 {d6}, [%0], %2 \n"
  323. MEMACCESS(0)
  324. "vld1.64 {d7}, [%0] \n"
  325. MEMACCESS(8)
  326. "vld1.8 {q15}, [%8] \n"
  327. "vtrn.8 q0, q1 \n"
  328. "vtrn.8 q2, q3 \n"
  329. "vtbl.8 d16, {d0, d1}, d30 \n"
  330. "vtbl.8 d17, {d0, d1}, d31 \n"
  331. "vtbl.8 d18, {d2, d3}, d30 \n"
  332. "vtbl.8 d19, {d2, d3}, d31 \n"
  333. "vtbl.8 d20, {d4, d5}, d30 \n"
  334. "vtbl.8 d21, {d4, d5}, d31 \n"
  335. "vtbl.8 d22, {d6, d7}, d30 \n"
  336. "vtbl.8 d23, {d6, d7}, d31 \n"
  337. "mov %0, %3 \n"
  338. MEMACCESS(0)
  339. "vst1.32 {d16[0]}, [%0], %4 \n"
  340. MEMACCESS(0)
  341. "vst1.32 {d16[1]}, [%0], %4 \n"
  342. MEMACCESS(0)
  343. "vst1.32 {d17[0]}, [%0], %4 \n"
  344. MEMACCESS(0)
  345. "vst1.32 {d17[1]}, [%0], %4 \n"
  346. "add %0, %3, #4 \n"
  347. MEMACCESS(0)
  348. "vst1.32 {d20[0]}, [%0], %4 \n"
  349. MEMACCESS(0)
  350. "vst1.32 {d20[1]}, [%0], %4 \n"
  351. MEMACCESS(0)
  352. "vst1.32 {d21[0]}, [%0], %4 \n"
  353. MEMACCESS(0)
  354. "vst1.32 {d21[1]}, [%0] \n"
  355. "mov %0, %5 \n"
  356. MEMACCESS(0)
  357. "vst1.32 {d18[0]}, [%0], %6 \n"
  358. MEMACCESS(0)
  359. "vst1.32 {d18[1]}, [%0], %6 \n"
  360. MEMACCESS(0)
  361. "vst1.32 {d19[0]}, [%0], %6 \n"
  362. MEMACCESS(0)
  363. "vst1.32 {d19[1]}, [%0], %6 \n"
  364. "add %0, %5, #4 \n"
  365. MEMACCESS(0)
  366. "vst1.32 {d22[0]}, [%0], %6 \n"
  367. MEMACCESS(0)
  368. "vst1.32 {d22[1]}, [%0], %6 \n"
  369. MEMACCESS(0)
  370. "vst1.32 {d23[0]}, [%0], %6 \n"
  371. MEMACCESS(0)
  372. "vst1.32 {d23[1]}, [%0] \n"
  373. "add %1, #4*2 \n" // src += 4 * 2
  374. "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a
  375. "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b
  376. "subs %7, #4 \n" // w -= 4
  377. "beq 4f \n"
  378. // some residual, check to see if it includes a 2x8 block,
  379. // or less
  380. "cmp %7, #2 \n"
  381. "blt 3f \n"
  382. // 2x8 block
  383. "2: \n"
  384. "mov %0, %1 \n"
  385. MEMACCESS(0)
  386. "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
  387. MEMACCESS(0)
  388. "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
  389. MEMACCESS(0)
  390. "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
  391. MEMACCESS(0)
  392. "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
  393. MEMACCESS(0)
  394. "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
  395. MEMACCESS(0)
  396. "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
  397. MEMACCESS(0)
  398. "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
  399. MEMACCESS(0)
  400. "vld2.16 {d1[3], d3[3]}, [%0] \n"
  401. "vtrn.8 d0, d1 \n"
  402. "vtrn.8 d2, d3 \n"
  403. "mov %0, %3 \n"
  404. MEMACCESS(0)
  405. "vst1.64 {d0}, [%0], %4 \n"
  406. MEMACCESS(0)
  407. "vst1.64 {d2}, [%0] \n"
  408. "mov %0, %5 \n"
  409. MEMACCESS(0)
  410. "vst1.64 {d1}, [%0], %6 \n"
  411. MEMACCESS(0)
  412. "vst1.64 {d3}, [%0] \n"
  413. "add %1, #2*2 \n" // src += 2 * 2
  414. "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a
  415. "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b
  416. "subs %7, #2 \n" // w -= 2
  417. "beq 4f \n"
  418. // 1x8 block
  419. "3: \n"
  420. MEMACCESS(1)
  421. "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
  422. MEMACCESS(1)
  423. "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
  424. MEMACCESS(1)
  425. "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
  426. MEMACCESS(1)
  427. "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
  428. MEMACCESS(1)
  429. "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
  430. MEMACCESS(1)
  431. "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
  432. MEMACCESS(1)
  433. "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
  434. MEMACCESS(1)
  435. "vld2.8 {d0[7], d1[7]}, [%1] \n"
  436. MEMACCESS(3)
  437. "vst1.64 {d0}, [%3] \n"
  438. MEMACCESS(5)
  439. "vst1.64 {d1}, [%5] \n"
  440. "4: \n"
  441. : "=&r"(src_temp), // %0
  442. "+r"(src), // %1
  443. "+r"(src_stride), // %2
  444. "+r"(dst_a), // %3
  445. "+r"(dst_stride_a), // %4
  446. "+r"(dst_b), // %5
  447. "+r"(dst_stride_b), // %6
  448. "+r"(width) // %7
  449. : "r"(&kVTbl4x4TransposeDi) // %8
  450. : "memory", "cc",
  451. "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  452. );
  453. }
  454. #endif // defined(__ARM_NEON__) && !defined(__aarch64__)
  455. #ifdef __cplusplus
  456. } // extern "C"
  457. } // namespace libyuv
  458. #endif