/*
 *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC MIPS DSPR2.
#if !defined(LIBYUV_DISABLE_MIPS) && \
    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
    (_MIPS_SIM == _MIPS_SIM_ABI32)
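
// ScaleRowDown2: 1/2 horizontal scale that keeps the even source pixels.
// The main loop stores 16 destination bytes per pass; a byte-at-a-time tail
// handles any remaining width. For reference, a plain-C sketch of the same
// selection (illustrative only, not built; the *_C_Sketch name is ours,
// libyuv's portable row functions live in scale_common.cc):
#if 0
static void ScaleRowDown2_C_Sketch(const uint8* src_ptr, uint8* dst,
                                   int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 2];  // keep the even pixel, as the asm below does
  }
}
#endif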
void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst, int dst_width) {
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"

      "srl $t9, %[dst_width], 4 \n"  // iterations -> by 16
      "beqz $t9, 2f \n"
      " nop \n"

      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n"  // |3|2|1|0|
      "lw $t1, 4(%[src_ptr]) \n"  // |7|6|5|4|
      "lw $t2, 8(%[src_ptr]) \n"  // |11|10|9|8|
      "lw $t3, 12(%[src_ptr]) \n"  // |15|14|13|12|
      "lw $t4, 16(%[src_ptr]) \n"  // |19|18|17|16|
      "lw $t5, 20(%[src_ptr]) \n"  // |23|22|21|20|
      "lw $t6, 24(%[src_ptr]) \n"  // |27|26|25|24|
      "lw $t7, 28(%[src_ptr]) \n"  // |31|30|29|28|
      // TODO(fbarchard): Use odd pixels instead of even.
      "precr.qb.ph $t8, $t1, $t0 \n"  // |6|4|2|0|
      "precr.qb.ph $t0, $t3, $t2 \n"  // |14|12|10|8|
      "precr.qb.ph $t1, $t5, $t4 \n"  // |22|20|18|16|
      "precr.qb.ph $t2, $t7, $t6 \n"  // |30|28|26|24|
      "addiu %[src_ptr], %[src_ptr], 32 \n"
      "addiu $t9, $t9, -1 \n"
      "sw $t8, 0(%[dst]) \n"
      "sw $t0, 4(%[dst]) \n"
      "sw $t1, 8(%[dst]) \n"
      "sw $t2, 12(%[dst]) \n"
      "bgtz $t9, 1b \n"
      " addiu %[dst], %[dst], 16 \n"

      "2: \n"
      "andi $t9, %[dst_width], 0xf \n"  // residue
      "beqz $t9, 3f \n"
      " nop \n"

      "21: \n"
      "lbu $t0, 0(%[src_ptr]) \n"
      "addiu %[src_ptr], %[src_ptr], 2 \n"
      "addiu $t9, $t9, -1 \n"
      "sb $t0, 0(%[dst]) \n"
      "bgtz $t9, 21b \n"
      " addiu %[dst], %[dst], 1 \n"

      "3: \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
      : [dst_width] "r"(dst_width)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
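
// ScaleRowDown2Box: 1/2 scale with a 2x2 box filter, averaging this row and
// the row at src_stride with rounding. 8 destination bytes per pass, with the
// residue handled two at a time. A plain-C sketch of the per-pixel math
// (illustrative only, not built; the name is ours):
#if 0
static void ScaleRowDown2Box_C_Sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride, uint8* dst,
                                      int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    // Rounded average of a 2x2 block, matching raddu.w.qb + shra_r.w below.
    dst[x] = (s[2 * x] + s[2 * x + 1] + t[2 * x] + t[2 * x + 1] + 2) >> 2;
  }
}
#endif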
void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  const uint8* t = src_ptr + src_stride;
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"

      "srl $t9, %[dst_width], 3 \n"  // iterations -> step 8
      "bltz $t9, 2f \n"
      " nop \n"

      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n"  // |3|2|1|0|
      "lw $t1, 4(%[src_ptr]) \n"  // |7|6|5|4|
      "lw $t2, 8(%[src_ptr]) \n"  // |11|10|9|8|
      "lw $t3, 12(%[src_ptr]) \n"  // |15|14|13|12|
      "lw $t4, 0(%[t]) \n"  // |19|18|17|16|
      "lw $t5, 4(%[t]) \n"  // |23|22|21|20|
      "lw $t6, 8(%[t]) \n"  // |27|26|25|24|
      "lw $t7, 12(%[t]) \n"  // |31|30|29|28|
      "addiu $t9, $t9, -1 \n"
      "srl $t8, $t0, 16 \n"  // |X|X|3|2|
      "ins $t0, $t4, 16, 16 \n"  // |17|16|1|0|
      "ins $t4, $t8, 0, 16 \n"  // |19|18|3|2|
      "raddu.w.qb $t0, $t0 \n"  // |17+16+1+0|
      "raddu.w.qb $t4, $t4 \n"  // |19+18+3+2|
      "shra_r.w $t0, $t0, 2 \n"  // |t0+2|>>2
      "shra_r.w $t4, $t4, 2 \n"  // |t4+2|>>2
      "srl $t8, $t1, 16 \n"  // |X|X|7|6|
      "ins $t1, $t5, 16, 16 \n"  // |21|20|5|4|
      "ins $t5, $t8, 0, 16 \n"  // |23|22|7|6|
      "raddu.w.qb $t1, $t1 \n"  // |21+20+5+4|
      "raddu.w.qb $t5, $t5 \n"  // |23+22+7+6|
      "shra_r.w $t1, $t1, 2 \n"  // |t1+2|>>2
      "shra_r.w $t5, $t5, 2 \n"  // |t5+2|>>2
      "srl $t8, $t2, 16 \n"  // |X|X|11|10|
      "ins $t2, $t6, 16, 16 \n"  // |25|24|9|8|
      "ins $t6, $t8, 0, 16 \n"  // |27|26|11|10|
      "raddu.w.qb $t2, $t2 \n"  // |25+24+9+8|
      "raddu.w.qb $t6, $t6 \n"  // |27+26+11+10|
      "shra_r.w $t2, $t2, 2 \n"  // |t2+2|>>2
      "shra_r.w $t6, $t6, 2 \n"  // |t6+2|>>2
      "srl $t8, $t3, 16 \n"  // |X|X|15|14|
      "ins $t3, $t7, 16, 16 \n"  // |29|28|13|12|
      "ins $t7, $t8, 0, 16 \n"  // |31|30|15|14|
      "raddu.w.qb $t3, $t3 \n"  // |29+28+13+12|
      "raddu.w.qb $t7, $t7 \n"  // |31+30+15+14|
      "shra_r.w $t3, $t3, 2 \n"  // |t3+2|>>2
      "shra_r.w $t7, $t7, 2 \n"  // |t7+2|>>2
      "addiu %[src_ptr], %[src_ptr], 16 \n"
      "addiu %[t], %[t], 16 \n"
      "sb $t0, 0(%[dst]) \n"
      "sb $t4, 1(%[dst]) \n"
      "sb $t1, 2(%[dst]) \n"
      "sb $t5, 3(%[dst]) \n"
      "sb $t2, 4(%[dst]) \n"
      "sb $t6, 5(%[dst]) \n"
      "sb $t3, 6(%[dst]) \n"
      "sb $t7, 7(%[dst]) \n"
      "bgtz $t9, 1b \n"
      " addiu %[dst], %[dst], 8 \n"

      "2: \n"
      "andi $t9, %[dst_width], 0x7 \n"  // x = residue
      "beqz $t9, 3f \n"
      " nop \n"

      "21: \n"
      "lwr $t1, 0(%[src_ptr]) \n"
      "lwl $t1, 3(%[src_ptr]) \n"
      "lwr $t2, 0(%[t]) \n"
      "lwl $t2, 3(%[t]) \n"
      "srl $t8, $t1, 16 \n"
      "ins $t1, $t2, 16, 16 \n"
      "ins $t2, $t8, 0, 16 \n"
      "raddu.w.qb $t1, $t1 \n"
      "raddu.w.qb $t2, $t2 \n"
      "shra_r.w $t1, $t1, 2 \n"
      "shra_r.w $t2, $t2, 2 \n"
      "sb $t1, 0(%[dst]) \n"
      "sb $t2, 1(%[dst]) \n"
      "addiu %[src_ptr], %[src_ptr], 4 \n"
      "addiu $t9, $t9, -2 \n"
      "addiu %[t], %[t], 4 \n"
      "bgtz $t9, 21b \n"
      " addiu %[dst], %[dst], 2 \n"

      "3: \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [t] "+r"(t)
      : [dst_width] "r"(dst_width)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
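
// ScaleRowDown4: 1/4 scale that keeps every 4th source pixel (8 destination
// bytes per pass, byte-at-a-time tail). Plain-C sketch of the selection
// (illustrative only, not built; the name is ours):
#if 0
static void ScaleRowDown4_C_Sketch(const uint8* src_ptr, uint8* dst,
                                   int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 4];
  }
}
#endif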
void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst, int dst_width) {
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"

      "srl $t9, %[dst_width], 3 \n"
      "beqz $t9, 2f \n"
      " nop \n"

      "1: \n"
      "lw $t1, 0(%[src_ptr]) \n"  // |3|2|1|0|
      "lw $t2, 4(%[src_ptr]) \n"  // |7|6|5|4|
      "lw $t3, 8(%[src_ptr]) \n"  // |11|10|9|8|
      "lw $t4, 12(%[src_ptr]) \n"  // |15|14|13|12|
      "lw $t5, 16(%[src_ptr]) \n"  // |19|18|17|16|
      "lw $t6, 20(%[src_ptr]) \n"  // |23|22|21|20|
      "lw $t7, 24(%[src_ptr]) \n"  // |27|26|25|24|
      "lw $t8, 28(%[src_ptr]) \n"  // |31|30|29|28|
      "precr.qb.ph $t1, $t2, $t1 \n"  // |6|4|2|0|
      "precr.qb.ph $t2, $t4, $t3 \n"  // |14|12|10|8|
      "precr.qb.ph $t5, $t6, $t5 \n"  // |22|20|18|16|
      "precr.qb.ph $t6, $t8, $t7 \n"  // |30|28|26|24|
      "precr.qb.ph $t1, $t2, $t1 \n"  // |12|8|4|0|
      "precr.qb.ph $t5, $t6, $t5 \n"  // |28|24|20|16|
      "addiu %[src_ptr], %[src_ptr], 32 \n"
      "addiu $t9, $t9, -1 \n"
      "sw $t1, 0(%[dst]) \n"
      "sw $t5, 4(%[dst]) \n"
      "bgtz $t9, 1b \n"
      " addiu %[dst], %[dst], 8 \n"

      "2: \n"
      "andi $t9, %[dst_width], 7 \n"  // residue
      "beqz $t9, 3f \n"
      " nop \n"

      "21: \n"
      "lbu $t1, 0(%[src_ptr]) \n"
      "addiu %[src_ptr], %[src_ptr], 4 \n"
      "addiu $t9, $t9, -1 \n"
      "sb $t1, 0(%[dst]) \n"
      "bgtz $t9, 21b \n"
      " addiu %[dst], %[dst], 1 \n"

      "3: \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
      : [dst_width] "r"(dst_width)
      : "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
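
// ScaleRowDown4Box: 1/4 scale with a 4x4 box filter over four source rows,
// two destination pixels per pass plus an optional trailing odd pixel.
// Plain-C sketch of the per-pixel math (illustrative only, not built; the
// name is ours):
#if 0
static void ScaleRowDown4Box_C_Sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride, uint8* dst,
                                      int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (int row = 0; row < 4; ++row) {
      for (int col = 0; col < 4; ++col) {
        sum += src_ptr[row * src_stride + x * 4 + col];
      }
    }
    dst[x] = (sum + 8) >> 4;  // rounded average of 16 pixels
  }
}
#endif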
void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  intptr_t stride = src_stride;
  const uint8* s1 = src_ptr + stride;
  const uint8* s2 = s1 + stride;
  const uint8* s3 = s2 + stride;
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"

      "srl $t9, %[dst_width], 1 \n"
      "andi $t8, %[dst_width], 1 \n"

      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n"  // |3|2|1|0|
      "lw $t1, 0(%[s1]) \n"  // |7|6|5|4|
      "lw $t2, 0(%[s2]) \n"  // |11|10|9|8|
      "lw $t3, 0(%[s3]) \n"  // |15|14|13|12|
      "lw $t4, 4(%[src_ptr]) \n"  // |19|18|17|16|
      "lw $t5, 4(%[s1]) \n"  // |23|22|21|20|
      "lw $t6, 4(%[s2]) \n"  // |27|26|25|24|
      "lw $t7, 4(%[s3]) \n"  // |31|30|29|28|
      "raddu.w.qb $t0, $t0 \n"  // |3 + 2 + 1 + 0|
      "raddu.w.qb $t1, $t1 \n"  // |7 + 6 + 5 + 4|
      "raddu.w.qb $t2, $t2 \n"  // |11 + 10 + 9 + 8|
      "raddu.w.qb $t3, $t3 \n"  // |15 + 14 + 13 + 12|
      "raddu.w.qb $t4, $t4 \n"  // |19 + 18 + 17 + 16|
      "raddu.w.qb $t5, $t5 \n"  // |23 + 22 + 21 + 20|
      "raddu.w.qb $t6, $t6 \n"  // |27 + 26 + 25 + 24|
      "raddu.w.qb $t7, $t7 \n"  // |31 + 30 + 29 + 28|
      "add $t0, $t0, $t1 \n"
      "add $t1, $t2, $t3 \n"
      "add $t0, $t0, $t1 \n"
      "add $t4, $t4, $t5 \n"
      "add $t6, $t6, $t7 \n"
      "add $t4, $t4, $t6 \n"
      "shra_r.w $t0, $t0, 4 \n"
      "shra_r.w $t4, $t4, 4 \n"
      "sb $t0, 0(%[dst]) \n"
      "sb $t4, 1(%[dst]) \n"
      "addiu %[src_ptr], %[src_ptr], 8 \n"
      "addiu %[s1], %[s1], 8 \n"
      "addiu %[s2], %[s2], 8 \n"
      "addiu %[s3], %[s3], 8 \n"
      "addiu $t9, $t9, -1 \n"
      "bgtz $t9, 1b \n"
      " addiu %[dst], %[dst], 2 \n"
      "beqz $t8, 2f \n"
      " nop \n"

      "lw $t0, 0(%[src_ptr]) \n"  // |3|2|1|0|
      "lw $t1, 0(%[s1]) \n"  // |7|6|5|4|
      "lw $t2, 0(%[s2]) \n"  // |11|10|9|8|
      "lw $t3, 0(%[s3]) \n"  // |15|14|13|12|
      "raddu.w.qb $t0, $t0 \n"  // |3 + 2 + 1 + 0|
      "raddu.w.qb $t1, $t1 \n"  // |7 + 6 + 5 + 4|
      "raddu.w.qb $t2, $t2 \n"  // |11 + 10 + 9 + 8|
      "raddu.w.qb $t3, $t3 \n"  // |15 + 14 + 13 + 12|
      "add $t0, $t0, $t1 \n"
      "add $t1, $t2, $t3 \n"
      "add $t0, $t0, $t1 \n"
      "shra_r.w $t0, $t0, 4 \n"
      "sb $t0, 0(%[dst]) \n"

      "2: \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [s1] "+r"(s1),
        [s2] "+r"(s2), [s3] "+r"(s3)
      : [dst_width] "r"(dst_width)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
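
// ScaleRowDown34: 3/4 scale that keeps pixels 0, 1 and 3 of every group of 4
// (24 destination bytes from 32 source bytes per pass). Plain-C sketch of the
// selection (illustrative only, not built; the name is ours):
#if 0
static void ScaleRowDown34_C_Sketch(const uint8* src_ptr, uint8* dst,
                                    int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[1];
    dst[2] = src_ptr[3];
    dst += 3;
    src_ptr += 4;
  }
}
#endif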
void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst, int dst_width) {
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"

      "1: \n"
      "lw $t1, 0(%[src_ptr]) \n"  // |3|2|1|0|
      "lw $t2, 4(%[src_ptr]) \n"  // |7|6|5|4|
      "lw $t3, 8(%[src_ptr]) \n"  // |11|10|9|8|
      "lw $t4, 12(%[src_ptr]) \n"  // |15|14|13|12|
      "lw $t5, 16(%[src_ptr]) \n"  // |19|18|17|16|
      "lw $t6, 20(%[src_ptr]) \n"  // |23|22|21|20|
      "lw $t7, 24(%[src_ptr]) \n"  // |27|26|25|24|
      "lw $t8, 28(%[src_ptr]) \n"  // |31|30|29|28|
      "precrq.qb.ph $t0, $t2, $t4 \n"  // |7|5|15|13|
      "precrq.qb.ph $t9, $t6, $t8 \n"  // |23|21|31|29|
      "addiu %[dst_width], %[dst_width], -24 \n"
      "ins $t1, $t1, 8, 16 \n"  // |3|1|0|X|
      "ins $t4, $t0, 8, 16 \n"  // |X|15|13|12|
      "ins $t5, $t5, 8, 16 \n"  // |19|17|16|X|
      "ins $t8, $t9, 8, 16 \n"  // |X|31|29|28|
      "addiu %[src_ptr], %[src_ptr], 32 \n"
      "packrl.ph $t0, $t3, $t0 \n"  // |9|8|7|5|
      "packrl.ph $t9, $t7, $t9 \n"  // |25|24|23|21|
      "prepend $t1, $t2, 8 \n"  // |4|3|1|0|
      "prepend $t3, $t4, 24 \n"  // |15|13|12|11|
      "prepend $t5, $t6, 8 \n"  // |20|19|17|16|
      "prepend $t7, $t8, 24 \n"  // |31|29|28|27|
      "sw $t1, 0(%[dst]) \n"
      "sw $t0, 4(%[dst]) \n"
      "sw $t3, 8(%[dst]) \n"
      "sw $t5, 12(%[dst]) \n"
      "sw $t9, 16(%[dst]) \n"
      "sw $t7, 20(%[dst]) \n"
      "bnez %[dst_width], 1b \n"
      " addiu %[dst], %[dst], 24 \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
      :
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
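
// ScaleRowDown34_0_Box: 3/4 scale with box filtering. Each pass reads 4
// pixels from this row and 4 from the row at src_stride, filters each group
// of 4 down to 3 output pixels, and blends the two rows with the src_ptr row
// weighted 3:1 over the next row (rounded with shra_r).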
void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* d, int dst_width) {
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"
      "repl.ph $t3, 3 \n"  // 0x00030003

      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n"  // |S3|S2|S1|S0|
      "lwx $t1, %[src_stride](%[src_ptr]) \n"  // |T3|T2|T1|T0|
      "rotr $t2, $t0, 8 \n"  // |S0|S3|S2|S1|
      "rotr $t6, $t1, 8 \n"  // |T0|T3|T2|T1|
      "muleu_s.ph.qbl $t4, $t2, $t3 \n"  // |S0*3|S3*3|
      "muleu_s.ph.qbl $t5, $t6, $t3 \n"  // |T0*3|T3*3|
      "andi $t0, $t2, 0xFFFF \n"  // |0|0|S2|S1|
      "andi $t1, $t6, 0xFFFF \n"  // |0|0|T2|T1|
      "raddu.w.qb $t0, $t0 \n"
      "raddu.w.qb $t1, $t1 \n"
      "shra_r.w $t0, $t0, 1 \n"
      "shra_r.w $t1, $t1, 1 \n"
      "preceu.ph.qbr $t2, $t2 \n"  // |0|S2|0|S1|
      "preceu.ph.qbr $t6, $t6 \n"  // |0|T2|0|T1|
      "rotr $t2, $t2, 16 \n"  // |0|S1|0|S2|
      "rotr $t6, $t6, 16 \n"  // |0|T1|0|T2|
      "addu.ph $t2, $t2, $t4 \n"
      "addu.ph $t6, $t6, $t5 \n"
      "sll $t5, $t0, 1 \n"
      "add $t0, $t5, $t0 \n"
      "shra_r.ph $t2, $t2, 2 \n"
      "shra_r.ph $t6, $t6, 2 \n"
      "shll.ph $t4, $t2, 1 \n"
      "addq.ph $t4, $t4, $t2 \n"
      "addu $t0, $t0, $t1 \n"
      "addiu %[src_ptr], %[src_ptr], 4 \n"
      "shra_r.w $t0, $t0, 2 \n"
      "addu.ph $t6, $t6, $t4 \n"
      "shra_r.ph $t6, $t6, 2 \n"
      "srl $t1, $t6, 16 \n"
      "addiu %[dst_width], %[dst_width], -3 \n"
      "sb $t1, 0(%[d]) \n"
      "sb $t0, 1(%[d]) \n"
      "sb $t6, 2(%[d]) \n"
      "bgtz %[dst_width], 1b \n"
      " addiu %[d], %[d], 3 \n"

      "3: \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride),
        [d] "+r"(d), [dst_width] "+r"(dst_width)
      :
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}
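
// ScaleRowDown34_1_Box: same 3/4 filtering as the *_0 variant above, but the
// two source rows are blended with equal weight (rounded average).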
void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* d, int dst_width) {
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"
      "repl.ph $t2, 3 \n"  // 0x00030003

      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n"  // |S3|S2|S1|S0|
      "lwx $t1, %[src_stride](%[src_ptr]) \n"  // |T3|T2|T1|T0|
      "rotr $t4, $t0, 8 \n"  // |S0|S3|S2|S1|
      "rotr $t6, $t1, 8 \n"  // |T0|T3|T2|T1|
      "muleu_s.ph.qbl $t3, $t4, $t2 \n"  // |S0*3|S3*3|
      "muleu_s.ph.qbl $t5, $t6, $t2 \n"  // |T0*3|T3*3|
      "andi $t0, $t4, 0xFFFF \n"  // |0|0|S2|S1|
      "andi $t1, $t6, 0xFFFF \n"  // |0|0|T2|T1|
      "raddu.w.qb $t0, $t0 \n"
      "raddu.w.qb $t1, $t1 \n"
      "shra_r.w $t0, $t0, 1 \n"
      "shra_r.w $t1, $t1, 1 \n"
      "preceu.ph.qbr $t4, $t4 \n"  // |0|S2|0|S1|
      "preceu.ph.qbr $t6, $t6 \n"  // |0|T2|0|T1|
      "rotr $t4, $t4, 16 \n"  // |0|S1|0|S2|
      "rotr $t6, $t6, 16 \n"  // |0|T1|0|T2|
      "addu.ph $t4, $t4, $t3 \n"
      "addu.ph $t6, $t6, $t5 \n"
      "shra_r.ph $t6, $t6, 2 \n"
      "shra_r.ph $t4, $t4, 2 \n"
      "addu.ph $t6, $t6, $t4 \n"
      "addiu %[src_ptr], %[src_ptr], 4 \n"
      "shra_r.ph $t6, $t6, 1 \n"
      "addu $t0, $t0, $t1 \n"
      "addiu %[dst_width], %[dst_width], -3 \n"
      "shra_r.w $t0, $t0, 1 \n"
      "srl $t1, $t6, 16 \n"
      "sb $t1, 0(%[d]) \n"
      "sb $t0, 1(%[d]) \n"
      "sb $t6, 2(%[d]) \n"
      "bgtz %[dst_width], 1b \n"
      " addiu %[d], %[d], 3 \n"

      "3: \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride),
        [d] "+r"(d), [dst_width] "+r"(dst_width)
      :
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}
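
// ScaleRowDown38: 3/8 scale that keeps pixels 0, 3 and 6 of every group of 8
// (12 destination bytes from 32 source bytes per pass). Plain-C sketch of the
// selection (illustrative only, not built; the name is ours):
#if 0
static void ScaleRowDown38_C_Sketch(const uint8* src_ptr, uint8* dst,
                                    int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[3];
    dst[2] = src_ptr[6];
    dst += 3;
    src_ptr += 8;
  }
}
#endif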
void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst, int dst_width) {
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"

      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n"  // |3|2|1|0|
      "lw $t1, 4(%[src_ptr]) \n"  // |7|6|5|4|
      "lw $t2, 8(%[src_ptr]) \n"  // |11|10|9|8|
      "lw $t3, 12(%[src_ptr]) \n"  // |15|14|13|12|
      "lw $t4, 16(%[src_ptr]) \n"  // |19|18|17|16|
      "lw $t5, 20(%[src_ptr]) \n"  // |23|22|21|20|
      "lw $t6, 24(%[src_ptr]) \n"  // |27|26|25|24|
      "lw $t7, 28(%[src_ptr]) \n"  // |31|30|29|28|
      "wsbh $t0, $t0 \n"  // |2|3|0|1|
      "wsbh $t6, $t6 \n"  // |26|27|24|25|
      "srl $t0, $t0, 8 \n"  // |X|2|3|0|
      "srl $t3, $t3, 16 \n"  // |X|X|15|14|
      "srl $t5, $t5, 16 \n"  // |X|X|23|22|
      "srl $t7, $t7, 16 \n"  // |X|X|31|30|
      "ins $t1, $t2, 24, 8 \n"  // |8|6|5|4|
      "ins $t6, $t5, 0, 8 \n"  // |26|27|24|22|
      "ins $t1, $t0, 0, 16 \n"  // |8|6|3|0|
      "ins $t6, $t7, 24, 8 \n"  // |30|27|24|22|
      "prepend $t2, $t3, 24 \n"  // |X|15|14|11|
      "ins $t4, $t4, 16, 8 \n"  // |19|16|17|X|
      "ins $t4, $t2, 0, 16 \n"  // |19|16|14|11|
      "addiu %[src_ptr], %[src_ptr], 32 \n"
      "addiu %[dst_width], %[dst_width], -12 \n"
      "addiu $t8, %[dst_width], -12 \n"
      "sw $t1, 0(%[dst]) \n"
      "sw $t4, 4(%[dst]) \n"
      "sw $t6, 8(%[dst]) \n"
      "bgez $t8, 1b \n"
      " addiu %[dst], %[dst], 12 \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
      :
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
}
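
// ScaleRowDown38_2_Box: 3/8 scale with box filtering over two source rows.
// Two of each triple of output pixels average a 3x2 block and the third a
// 2x2 block; 0x2AAA is ~65536/6, so multiplying a 6-pixel sum by it and
// keeping the high 16 bits divides by 6.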
void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  intptr_t stride = src_stride;
  const uint8* t = src_ptr + stride;
  const int c = 0x2AAA;
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"

      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n"  // |S3|S2|S1|S0|
      "lw $t1, 4(%[src_ptr]) \n"  // |S7|S6|S5|S4|
      "lw $t2, 0(%[t]) \n"  // |T3|T2|T1|T0|
      "lw $t3, 4(%[t]) \n"  // |T7|T6|T5|T4|
      "rotr $t1, $t1, 16 \n"  // |S5|S4|S7|S6|
      "packrl.ph $t4, $t1, $t3 \n"  // |S7|S6|T7|T6|
      "packrl.ph $t5, $t3, $t1 \n"  // |T5|T4|S5|S4|
      "raddu.w.qb $t4, $t4 \n"  // S7+S6+T7+T6
      "raddu.w.qb $t5, $t5 \n"  // T5+T4+S5+S4
      "precrq.qb.ph $t6, $t0, $t2 \n"  // |S3|S1|T3|T1|
      "precrq.qb.ph $t6, $t6, $t6 \n"  // |S3|T3|S3|T3|
      "srl $t4, $t4, 2 \n"  // t4 / 4
      "srl $t6, $t6, 16 \n"  // |0|0|S3|T3|
      "raddu.w.qb $t6, $t6 \n"  // 0+0+S3+T3
      "addu $t6, $t5, $t6 \n"
      "mul $t6, $t6, %[c] \n"  // t6 * 0x2AAA
      "sll $t0, $t0, 8 \n"  // |S2|S1|S0|0|
      "sll $t2, $t2, 8 \n"  // |T2|T1|T0|0|
      "raddu.w.qb $t0, $t0 \n"  // S2+S1+S0+0
      "raddu.w.qb $t2, $t2 \n"  // T2+T1+T0+0
      "addu $t0, $t0, $t2 \n"
      "mul $t0, $t0, %[c] \n"  // t0 * 0x2AAA
      "addiu %[src_ptr], %[src_ptr], 8 \n"
      "addiu %[t], %[t], 8 \n"
      "addiu %[dst_width], %[dst_width], -3 \n"
      "addiu %[dst_ptr], %[dst_ptr], 3 \n"
      "srl $t6, $t6, 16 \n"
      "srl $t0, $t0, 16 \n"
      "sb $t4, -1(%[dst_ptr]) \n"
      "sb $t6, -2(%[dst_ptr]) \n"
      "bgtz %[dst_width], 1b \n"
      " sb $t0, -3(%[dst_ptr]) \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [t] "+r"(t),
        [dst_width] "+r"(dst_width)
      : [c] "r"(c)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}
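
// ScaleRowDown38_3_Box: 3/8 scale with box filtering over three source rows.
// 0x1C71 is ~65536/9 (divides a 9-pixel sum by 9) and 0x2AAA is ~65536/6
// (divides a 6-pixel sum by 6), applied via multiply plus high-half extract.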
void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  intptr_t stride = src_stride;
  const uint8* s1 = src_ptr + stride;
  stride += stride;
  const uint8* s2 = src_ptr + stride;
  const int c1 = 0x1C71;
  const int c2 = 0x2AAA;
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"

      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n"  // |S3|S2|S1|S0|
      "lw $t1, 4(%[src_ptr]) \n"  // |S7|S6|S5|S4|
      "lw $t2, 0(%[s1]) \n"  // |T3|T2|T1|T0|
      "lw $t3, 4(%[s1]) \n"  // |T7|T6|T5|T4|
      "lw $t4, 0(%[s2]) \n"  // |R3|R2|R1|R0|
      "lw $t5, 4(%[s2]) \n"  // |R7|R6|R5|R4|
      "rotr $t1, $t1, 16 \n"  // |S5|S4|S7|S6|
      "packrl.ph $t6, $t1, $t3 \n"  // |S7|S6|T7|T6|
      "raddu.w.qb $t6, $t6 \n"  // S7+S6+T7+T6
      "packrl.ph $t7, $t3, $t1 \n"  // |T5|T4|S5|S4|
      "raddu.w.qb $t7, $t7 \n"  // T5+T4+S5+S4
      "sll $t8, $t5, 16 \n"  // |R5|R4|0|0|
      "raddu.w.qb $t8, $t8 \n"  // R5+R4
      "addu $t7, $t7, $t8 \n"
      "srl $t8, $t5, 16 \n"  // |0|0|R7|R6|
      "raddu.w.qb $t8, $t8 \n"  // R7+R6
      "addu $t6, $t6, $t8 \n"
      "mul $t6, $t6, %[c2] \n"  // t6 * 0x2AAA
      "precrq.qb.ph $t8, $t0, $t2 \n"  // |S3|S1|T3|T1|
      "precrq.qb.ph $t8, $t8, $t4 \n"  // |S3|T3|R3|R1|
      "srl $t8, $t8, 8 \n"  // |0|S3|T3|R3|
      "raddu.w.qb $t8, $t8 \n"  // S3+T3+R3
      "addu $t7, $t7, $t8 \n"
      "mul $t7, $t7, %[c1] \n"  // t7 * 0x1C71
      "sll $t0, $t0, 8 \n"  // |S2|S1|S0|0|
      "sll $t2, $t2, 8 \n"  // |T2|T1|T0|0|
      "sll $t4, $t4, 8 \n"  // |R2|R1|R0|0|
      "raddu.w.qb $t0, $t0 \n"
      "raddu.w.qb $t2, $t2 \n"
      "raddu.w.qb $t4, $t4 \n"
      "addu $t0, $t0, $t2 \n"
      "addu $t0, $t0, $t4 \n"
      "mul $t0, $t0, %[c1] \n"  // t0 * 0x1C71
      "addiu %[src_ptr], %[src_ptr], 8 \n"
      "addiu %[s1], %[s1], 8 \n"
      "addiu %[s2], %[s2], 8 \n"
      "addiu %[dst_width], %[dst_width], -3 \n"
      "addiu %[dst_ptr], %[dst_ptr], 3 \n"
      "srl $t6, $t6, 16 \n"
      "srl $t7, $t7, 16 \n"
      "srl $t0, $t0, 16 \n"
      "sb $t6, -1(%[dst_ptr]) \n"
      "sb $t7, -2(%[dst_ptr]) \n"
      "bgtz %[dst_width], 1b \n"
      " sb $t0, -3(%[dst_ptr]) \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [s1] "+r"(s1),
        [s2] "+r"(s2), [dst_width] "+r"(dst_width)
      : [c1] "r"(c1), [c2] "r"(c2)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
}

#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif