; variance_media.asm
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
  ; ---------------------------------------------------------------------
  ; Public entry points provided by this file.
  ; ---------------------------------------------------------------------
    EXPORT  |vpx_variance16x16_media|
    EXPORT  |vpx_variance8x8_media|
    EXPORT  |vpx_mse16x16_media|

  ; Assemble as ARM (not Thumb) instructions.
    ARM

  ; Stack-alignment attributes: this file requires, and claims to
  ; preserve, 8-byte alignment of the stack.
    REQUIRE8
    PRESERVE8

  ; Read-only code section, aligned to 2^2 = 4 bytes.
    AREA    ||.text||, CODE, READONLY, ALIGN=2
  ;-----------------------------------------------------------------------
  ; vpx_variance16x16_media
  ;
  ; Variance of a 16x16 block of 8-bit pixels against a reference block.
  ;
  ; In:    r0      unsigned char *src_ptr
  ;        r1      int            source_stride
  ;        r2      unsigned char *ref_ptr
  ;        r3      int            recon_stride
  ;        [sp]    unsigned int  *sse      (5th argument, on the stack)
  ; Out:   r0      sse - ((sum * sum) >> 8)
  ;        *sse    sum of squared differences over the 256 pixels
  ;
  ; Register roles inside the loop:
  ;        r8      sum  = running signed sum of (src - ref)
  ;        r11     sse  = running sum of (src - ref)^2
  ;        r12     rows remaining
  ;        lr      constant zero (second operand for sel / usad8)
  ;        r4, r5  four packed src / ref pixels, then partial sums
  ;        r6, r7  per-byte differences selected by sign
  ;        r9      reversed-operand difference (ref - src)
  ;        r10     pixels 1 and 3 widened to halfwords
  ;
  ; Each row is processed as four groups of 4 pixels. For one group:
  ;   usub8 a-b sets a GE flag per byte lane where a >= b; sel then keeps
  ;   that lane of the difference and takes 0 (from lr) elsewhere. Doing
  ;   this for src-ref and for ref-src yields the positive and the negative
  ;   differences as unsigned bytes; OR-ing both gives |src - ref| per lane.
  ;   usad8 against zero adds the four bytes of a register; uxtb16 plus
  ;   smlad squares and accumulates the lanes two at a time.
  ;
  ; Saves and restores r4-r12 and lr (40 bytes, hence sse at [sp, #40]).
  ;-----------------------------------------------------------------------
|vpx_variance16x16_media| PROC

    stmfd   sp!, {r4-r12, lr}

    pld     [r0, r1, lsl #0]            ; preload src + stride
    pld     [r2, r3, lsl #0]            ; preload ref + stride

    mov     r8, #0                      ; sum = 0
    mov     r11, #0                     ; sse = 0
    mov     r12, #16                    ; rows remaining (= block height)
    mov     lr, #0                      ; constant zero, never written in the loop

loop16x16
    ; ---- pixels 0..3 ----
    ldr     r4, [r0, #0]                ; 4 src pixels
    ldr     r5, [r2, #0]                ; 4 ref pixels

    usub8   r6, r4, r5                  ; src - ref, GE[i] = (src[i] >= ref[i])
    pld     [r0, r1, lsl #1]            ; preload src + 2 * stride
    sel     r7, r6, lr                  ; positive differences, else 0
    usub8   r9, r5, r4                  ; ref - src, GE[i] = (ref[i] >= src[i])
    pld     [r2, r3, lsl #1]            ; preload ref + 2 * stride
    sel     r6, r9, lr                  ; negative differences (magnitude), else 0

    usad8   r4, r7, lr                  ; sum of positive differences
    usad8   r5, r6, lr                  ; sum of negative differences
    orr     r6, r6, r7                  ; |diff| for all 4 pixels

    add     r8, r8, r4                  ; sum += positive part
    sub     r8, r8, r5                  ; sum -= negative part

    uxtb16  r5, r6                      ; |diff| of pixels 0 and 2 as halfwords
    uxtb16  r10, r6, ror #8             ; |diff| of pixels 1 and 3 as halfwords
    smlad   r11, r5, r5, r11            ; sse += d0*d0 + d2*d2

    ; ---- pixels 4..7 ----
    ldr     r4, [r0, #4]                ; 4 src pixels
    ldr     r5, [r2, #4]                ; 4 ref pixels
    smlad   r11, r10, r10, r11          ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r6, r4, r5
    sel     r7, r6, lr                  ; positive differences
    usub8   r9, r5, r4
    sel     r6, r9, lr                  ; negative differences

    usad8   r4, r7, lr                  ; sum of positive differences
    usad8   r5, r6, lr                  ; sum of negative differences
    orr     r6, r6, r7                  ; |diff| for all 4 pixels

    add     r8, r8, r4
    sub     r8, r8, r5

    uxtb16  r5, r6
    uxtb16  r10, r6, ror #8
    smlad   r11, r5, r5, r11

    ; ---- pixels 8..11 ----
    ldr     r4, [r0, #8]                ; 4 src pixels
    ldr     r5, [r2, #8]                ; 4 ref pixels
    smlad   r11, r10, r10, r11          ; finish previous group

    usub8   r6, r4, r5
    sel     r7, r6, lr                  ; positive differences
    usub8   r9, r5, r4
    sel     r6, r9, lr                  ; negative differences

    usad8   r4, r7, lr                  ; sum of positive differences
    usad8   r5, r6, lr                  ; sum of negative differences
    orr     r6, r6, r7                  ; |diff| for all 4 pixels

    add     r8, r8, r4
    sub     r8, r8, r5

    uxtb16  r5, r6
    uxtb16  r10, r6, ror #8
    smlad   r11, r5, r5, r11

    ; ---- pixels 12..15 ----
    ldr     r4, [r0, #12]               ; 4 src pixels
    ldr     r5, [r2, #12]               ; 4 ref pixels
    smlad   r11, r10, r10, r11          ; finish previous group

    usub8   r6, r4, r5
    add     r0, r0, r1                  ; src_ptr -> next row
    sel     r7, r6, lr                  ; positive differences
    usub8   r9, r5, r4
    add     r2, r2, r3                  ; ref_ptr -> next row
    sel     r6, r9, lr                  ; negative differences

    usad8   r4, r7, lr                  ; sum of positive differences
    usad8   r5, r6, lr                  ; sum of negative differences
    orr     r6, r6, r7                  ; |diff| for all 4 pixels

    add     r8, r8, r4
    sub     r8, r8, r5

    uxtb16  r5, r6
    uxtb16  r10, r6, ror #8
    smlad   r11, r5, r5, r11
    smlad   r11, r10, r10, r11

    subs    r12, r12, #1                ; one row done
    bne     loop16x16

    ; ---- result ----
    ldr     r6, [sp, #40]               ; sse pointer sits above the 10 saved words
    mul     r0, r8, r8                  ; sum * sum
    str     r11, [r6]                   ; *sse = sse
    sub     r0, r11, r0, lsr #8         ; return sse - ((sum * sum) >> 8)

    ldmfd   sp!, {r4-r12, pc}

    ENDP
  ;-----------------------------------------------------------------------
  ; vpx_variance8x8_media
  ;
  ; Variance of an 8x8 block of 8-bit pixels against a reference block.
  ;
  ; In:    r0      unsigned char *src_ptr
  ;        r1      int            source_stride
  ;        r2      unsigned char *ref_ptr
  ;        r3      int            recon_stride
  ;        [sp]    unsigned int  *sse      (5th argument, on the stack)
  ; Out:   r0      sse - ((sum * sum) >> 6)
  ;        *sse    sum of squared differences over the 64 pixels
  ;
  ; Register roles inside the loop:
  ;        r4      sum  = running signed sum of (src - ref)
  ;        r5      sse  = running sum of (src - ref)^2
  ;        r12     rows remaining
  ;        lr      constant zero (second operand for sel / usad8)
  ;        r6, r7  four packed src / ref pixels, then partial sums
  ;        r8, r10 per-byte differences selected by sign
  ;        r9      reversed-operand difference (ref - src)
  ;
  ; Each row is processed as two groups of 4 pixels. usub8 sets a GE flag
  ; per byte lane where the minuend is >= the subtrahend; sel keeps those
  ; lanes and takes 0 (from lr) for the rest, which separates the positive
  ; from the negative differences. OR-ing both gives |src - ref| per lane.
  ;
  ; Saves and restores r4-r10 and lr (32 bytes, hence sse at [sp, #32]).
  ;-----------------------------------------------------------------------
|vpx_variance8x8_media| PROC

    push    {r4-r10, lr}

    pld     [r0, r1, lsl #0]            ; preload src + stride
    pld     [r2, r3, lsl #0]            ; preload ref + stride

    mov     r12, #8                     ; rows remaining (= block height)
    mov     r4, #0                      ; sum = 0
    mov     r5, #0                      ; sse = 0
    mov     lr, #0                      ; constant zero, never written in the loop

loop8x8
    ; ---- pixels 0..3 ----
    ldr     r6, [r0, #0x0]              ; 4 src pixels
    ldr     r7, [r2, #0x0]              ; 4 ref pixels

    usub8   r8, r6, r7                  ; src - ref, GE[i] = (src[i] >= ref[i])
    pld     [r0, r1, lsl #1]            ; preload src + 2 * stride
    sel     r10, r8, lr                 ; positive differences, else 0
    usub8   r9, r7, r6                  ; ref - src, GE[i] = (ref[i] >= src[i])
    pld     [r2, r3, lsl #1]            ; preload ref + 2 * stride
    sel     r8, r9, lr                  ; negative differences (magnitude), else 0

    usad8   r6, r10, lr                 ; sum of positive differences
    usad8   r7, r8, lr                  ; sum of negative differences
    orr     r8, r8, r10                 ; |diff| for all 4 pixels

    add     r4, r4, r6                  ; sum += positive part
    sub     r4, r4, r7                  ; sum -= negative part

    uxtb16  r7, r8                      ; |diff| of pixels 0 and 2 as halfwords
    uxtb16  r10, r8, ror #8             ; |diff| of pixels 1 and 3 as halfwords
    smlad   r5, r7, r7, r5              ; sse += d0*d0 + d2*d2

    ; ---- pixels 4..7 ----
    ldr     r6, [r0, #0x4]              ; 4 src pixels
    ldr     r7, [r2, #0x4]              ; 4 ref pixels
    smlad   r5, r10, r10, r5            ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r8, r6, r7
    add     r0, r0, r1                  ; src_ptr -> next row
    sel     r10, r8, lr                 ; positive differences
    usub8   r9, r7, r6
    add     r2, r2, r3                  ; ref_ptr -> next row
    sel     r8, r9, lr                  ; negative differences

    usad8   r6, r10, lr                 ; sum of positive differences
    usad8   r7, r8, lr                  ; sum of negative differences
    orr     r8, r8, r10                 ; |diff| for all 4 pixels

    add     r4, r4, r6
    sub     r4, r4, r7

    uxtb16  r7, r8
    uxtb16  r10, r8, ror #8
    smlad   r5, r7, r7, r5
    subs    r12, r12, #1                ; one row done (smlad leaves N/Z/C/V alone)
    smlad   r5, r10, r10, r5

    bne     loop8x8

    ; ---- result ----
    ldr     r8, [sp, #32]               ; sse pointer sits above the 8 saved words
    mul     r1, r4, r4                  ; sum * sum
    str     r5, [r8]                    ; *sse = sse
    ; sum * sum is non-negative and at most (64 * 255)^2 < 2^31, so a
    ; logical shift is the natural choice and equals an arithmetic one.
    sub     r0, r5, r1, lsr #6          ; return sse - ((sum * sum) >> 6)

    pop     {r4-r10, pc}

    ENDP
  ;-----------------------------------------------------------------------
  ; vpx_mse16x16_media
  ;
  ; Sum of squared differences between a 16x16 block of 8-bit pixels and a
  ; reference block. Same per-pixel scheme as vpx_variance16x16_media, but
  ; the sum of differences is not needed here, so it is not computed.
  ;
  ; In:    r0      unsigned char *src_ptr
  ;        r1      int            source_stride
  ;        r2      unsigned char *ref_ptr
  ;        r3      int            recon_stride
  ;        [sp]    unsigned int  *sse      (5th argument, on the stack)
  ; Out:   r0      sse
  ;        *sse    sse (same value as the return value)
  ;
  ; Register roles inside the loop:
  ;        r4      sse  = running sum of (src - ref)^2
  ;        r12     rows remaining
  ;        lr      constant zero (second operand for sel)
  ;        r5, r6  four packed src / ref pixels; r6 is reused for halfwords
  ;        r7      positive differences, then pixels 1 and 3 as halfwords
  ;        r8      difference scratch, then |diff| for all 4 pixels
  ;
  ; usub8 sets a GE flag per byte lane where the minuend is >= the
  ; subtrahend; sel keeps those lanes and takes 0 (from lr) for the rest.
  ; The first sel consumes the GE flags before the second usub8 replaces
  ; them, so r8 can be reused as the destination of both subtractions.
  ;
  ; Saves and restores r4-r8 and lr: an even number of words (24 bytes,
  ; hence sse at [sp, #24]), so sp stays 8-byte aligned as PRESERVE8 claims.
  ;-----------------------------------------------------------------------
|vpx_mse16x16_media| PROC

    push    {r4-r8, lr}

    pld     [r0, r1, lsl #0]            ; preload src + stride
    pld     [r2, r3, lsl #0]            ; preload ref + stride

    mov     r12, #16                    ; rows remaining (= block height)
    mov     r4, #0                      ; sse = 0
    mov     lr, #0                      ; constant zero, never written in the loop

loopmse
    ; ---- pixels 0..3 ----
    ldr     r5, [r0, #0x0]              ; 4 src pixels
    ldr     r6, [r2, #0x0]              ; 4 ref pixels

    usub8   r8, r5, r6                  ; src - ref, GE[i] = (src[i] >= ref[i])
    pld     [r0, r1, lsl #1]            ; preload src + 2 * stride
    sel     r7, r8, lr                  ; positive differences, else 0
    usub8   r8, r6, r5                  ; ref - src, GE[i] = (ref[i] >= src[i])
    pld     [r2, r3, lsl #1]            ; preload ref + 2 * stride
    sel     r8, r8, lr                  ; negative differences (magnitude), else 0
    orr     r8, r8, r7                  ; |diff| for all 4 pixels

    ldr     r5, [r0, #0x4]              ; next 4 src pixels (loaded early)

    uxtb16  r6, r8                      ; |diff| of pixels 0 and 2 as halfwords
    uxtb16  r7, r8, ror #8              ; |diff| of pixels 1 and 3 as halfwords
    smlad   r4, r6, r6, r4              ; sse += d0*d0 + d2*d2

    ; ---- pixels 4..7 ----
    ldr     r6, [r2, #0x4]              ; 4 ref pixels
    smlad   r4, r7, r7, r4              ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r8, r5, r6
    sel     r7, r8, lr                  ; positive differences
    usub8   r8, r6, r5
    sel     r8, r8, lr                  ; negative differences
    orr     r8, r8, r7                  ; |diff| for all 4 pixels

    ldr     r5, [r0, #0x8]              ; next 4 src pixels (loaded early)

    uxtb16  r6, r8
    uxtb16  r7, r8, ror #8
    smlad   r4, r6, r6, r4

    ; ---- pixels 8..11 ----
    ldr     r6, [r2, #0x8]              ; 4 ref pixels
    smlad   r4, r7, r7, r4              ; finish previous group

    usub8   r8, r5, r6
    sel     r7, r8, lr                  ; positive differences
    usub8   r8, r6, r5
    sel     r8, r8, lr                  ; negative differences
    orr     r8, r8, r7                  ; |diff| for all 4 pixels

    ldr     r5, [r0, #0xc]              ; next 4 src pixels (loaded early)

    uxtb16  r6, r8
    uxtb16  r7, r8, ror #8
    smlad   r4, r6, r6, r4

    ; ---- pixels 12..15 ----
    ldr     r6, [r2, #0xc]              ; 4 ref pixels
    smlad   r4, r7, r7, r4              ; finish previous group

    usub8   r8, r5, r6
    add     r0, r0, r1                  ; src_ptr -> next row
    sel     r7, r8, lr                  ; positive differences
    usub8   r8, r6, r5
    add     r2, r2, r3                  ; ref_ptr -> next row
    sel     r8, r8, lr                  ; negative differences
    orr     r8, r8, r7                  ; |diff| for all 4 pixels

    subs    r12, r12, #1                ; one row done (nothing below touches N/Z/C/V)

    uxtb16  r6, r8
    uxtb16  r7, r8, ror #8
    smlad   r4, r6, r6, r4
    smlad   r4, r7, r7, r4

    bne     loopmse

    ; ---- result ----
    ldr     r1, [sp, #24]               ; sse pointer sits above the 6 saved words
    mov     r0, r4                      ; return sse
    str     r4, [r1]                    ; *sse = sse

    pop     {r4-r8, pc}

    ENDP

    END