armfrag.s 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663
  1. ;********************************************************************
  2. ;* *
  3. ;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
  4. ;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
  5. ;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
  6. ;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
  7. ;* *
  8. ;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
  9. ;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  10. ;* *
  11. ;********************************************************************
  12. ; Original implementation:
  13. ; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
  14. ; last mod: $Id: armfrag.s 17481 2010-10-03 22:49:42Z tterribe $
  15. ;********************************************************************
  16. AREA |.text|, CODE, READONLY
  17. ; Explicitly specifying alignment here because some versions of
  18. ; gas don't align code correctly. See
  19. ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html
  20. ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992
  21. ALIGN
  22. GET armopts.s
  23. ; Vanilla ARM v4 versions
  24. EXPORT oc_frag_copy_list_arm
  25. EXPORT oc_frag_recon_intra_arm
  26. EXPORT oc_frag_recon_inter_arm
  27. EXPORT oc_frag_recon_inter2_arm
  ; oc_frag_copy_list_arm: copy a list of 8x8 byte fragments between frames.
  ; For each index in _fragis, the byte offset _frag_buf_offs[index] is added
  ; to both frame pointers and eight rows of eight bytes are copied as two
  ; 32-bit words per row.
oc_frag_copy_list_arm PROC
	; r0 = _dst_frame
	; r1 = _src_frame
	; r2 = _ystride
	; r3 = _fragis
	; <> = _nfragis		(first stack argument)
	; <> = _frag_buf_offs	(second stack argument)
	LDR	r12, [sp]		; r12 = _nfragis
	STMFD	sp!, {r4-r6,r11,lr}
	SUBS	r12, r12, #1
	; Only touch _fragis[0] when the list is non-empty.
	LDRGE	r4, [r3], #4		; r4 = _fragis[fragii]
	; Five registers were pushed, so the second stack argument is at sp+4*6.
	LDRGE	lr, [sp, #4*6]		; lr = _frag_buf_offs
	BLT	ofcl_arm_end
	; Each row advances by 4 (first word) and then by _ystride-4.
	SUB	r2, r2, #4
ofcl_arm_lp
	LDR	r11, [lr, r4, LSL #2]	; r11 = _frag_buf_offs[_fragis[fragii]]
	SUBS	r12, r12, #1		; Flags stay live until the BGE below.
	; Stall (on XScale)
	ADD	r4, r1, r11		; r4 = _src_frame+frag_buf_off
	LDR	r6, [r4], #4		; Row 0
	ADD	r11, r0, r11		; r11 = _dst_frame+frag_buf_off
	LDR	r5, [r4], r2
	STR	r6, [r11], #4
	LDR	r6, [r4], #4		; Row 1
	STR	r5, [r11], r2
	LDR	r5, [r4], r2
	STR	r6, [r11], #4
	LDR	r6, [r4], #4		; Row 2
	STR	r5, [r11], r2
	LDR	r5, [r4], r2
	STR	r6, [r11], #4
	LDR	r6, [r4], #4		; Row 3
	STR	r5, [r11], r2
	LDR	r5, [r4], r2
	STR	r6, [r11], #4
	LDR	r6, [r4], #4		; Row 4
	STR	r5, [r11], r2
	LDR	r5, [r4], r2
	STR	r6, [r11], #4
	LDR	r6, [r4], #4		; Row 5
	STR	r5, [r11], r2
	LDR	r5, [r4], r2
	STR	r6, [r11], #4
	LDR	r6, [r4], #4		; Row 6
	STR	r5, [r11], r2
	LDR	r5, [r4], r2
	STR	r6, [r11], #4
	LDR	r6, [r4], #4		; Row 7
	STR	r5, [r11], r2
	LDR	r5, [r4]
	LDRGE	r4, [r3], #4		; r4 = _fragis[fragii] for the next pass
	STR	r6, [r11], #4
	STR	r5, [r11]
	BGE	ofcl_arm_lp
ofcl_arm_end
	LDMFD	sp!, {r4-r6,r11,pc}
	ENDP

; oc_frag_recon_intra_arm: _dst[x] = clamp(_residue[x]+128, 0, 255) for an
; 8x8 block; _residue is read as 64 consecutive signed halfwords.
oc_frag_recon_intra_arm PROC
	; r0 =       unsigned char *_dst
	; r1 =       int            _ystride
	; r2 = const ogg_int16_t    _residue[64]
	STMFD	sp!, {r4,r5,lr}
	MOV	lr, #8			; lr = rows remaining
	MOV	r5, #255		; r5 = upper clamp bound
	SUB	r1, r1, #7		; Seven of the eight stores step by 1.
ofrintra_lp_arm
	; Clamp idiom used for every pixel:
	;   ADDS  v, v, #128	 flags from the biased value
	;   CMPGT r5, v		 if v>0, compare 255 against it
	;   EORLT v, r5, v, ASR #32
	; v<0 leaves a zero low byte (255^-1); v>255 gives 255 (255^0).
	LDRSH	r3, [r2], #2
	LDRSH	r4, [r2], #2
	LDRSH	r12, [r2], #2
	ADDS	r3, r3, #128		; Pixel 0
	CMPGT	r5, r3
	EORLT	r3, r5, r3, ASR #32
	STRB	r3, [r0], #1
	ADDS	r4, r4, #128		; Pixel 1
	CMPGT	r5, r4
	EORLT	r4, r5, r4, ASR #32
	LDRSH	r3, [r2], #2
	STRB	r4, [r0], #1
	ADDS	r12, r12, #128		; Pixel 2
	CMPGT	r5, r12
	EORLT	r12, r5, r12, ASR #32
	LDRSH	r4, [r2], #2
	STRB	r12, [r0], #1
	ADDS	r3, r3, #128		; Pixel 3
	CMPGT	r5, r3
	EORLT	r3, r5, r3, ASR #32
	LDRSH	r12, [r2], #2
	STRB	r3, [r0], #1
	ADDS	r4, r4, #128		; Pixel 4
	CMPGT	r5, r4
	EORLT	r4, r5, r4, ASR #32
	LDRSH	r3, [r2], #2
	STRB	r4, [r0], #1
	ADDS	r12, r12, #128		; Pixel 5
	CMPGT	r5, r12
	EORLT	r12, r5, r12, ASR #32
	LDRSH	r4, [r2], #2
	STRB	r12, [r0], #1
	ADDS	r3, r3, #128		; Pixel 6
	CMPGT	r5, r3
	EORLT	r3, r5, r3, ASR #32
	STRB	r3, [r0], #1
	ADDS	r4, r4, #128		; Pixel 7
	CMPGT	r5, r4
	EORLT	r4, r5, r4, ASR #32
	STRB	r4, [r0], r1		; Step to the start of the next row.
	SUBS	lr, lr, #1
	BGT	ofrintra_lp_arm
	LDMFD	sp!, {r4,r5,pc}
	ENDP
  ; oc_frag_recon_inter_arm: dst[x] = clamp(src[x]+residue[x], 0, 255) over
  ; an 8x8 block.  Pixels are processed one byte at a time, with the loads
  ; for the next pixel slotted between the compare and the fix-up of the
  ; current one (loads do not alter the flags).
oc_frag_recon_inter_arm PROC
	; r0 =       unsigned char *dst
	; r1 = const unsigned char *src
	; r2 =       int            ystride
	; r3 = const ogg_int16_t    residue[64]
	STMFD	sp!, {r5,r9-r11,lr}
	SUB	r2, r2, #7		; Row step left after seven +1 steps.
	MOV	r5, #255		; r5 = saturation ceiling
	MOV	r9, #8			; r9 = row counter
ofrinter_arm_row
	LDRSH	r12, [r3], #2		; Pixel 0: residue
	LDRB	lr, [r1], #1		;          predictor
	LDRSH	r11, [r3], #2		; Pixel 1: residue
	LDRB	r10, [r1], #1		;          predictor
	ADDS	r12, r12, lr		; Pixel 0: sum, flags for the clamp
	CMPGT	r5, r12			;   sum>0: test against 255
	EORLT	r12, r5, r12, ASR #32	;   <0 -> low byte 0, >255 -> 255
	STRB	r12, [r0], #1
	ADDS	r11, r11, r10		; Pixel 1
	CMPGT	r5, r11
	LDRSH	r12, [r3], #2		; (pixel 2 operands)
	LDRB	lr, [r1], #1
	EORLT	r11, r5, r11, ASR #32
	STRB	r11, [r0], #1
	ADDS	r12, r12, lr		; Pixel 2
	CMPGT	r5, r12
	LDRSH	r11, [r3], #2		; (pixel 3 operands)
	LDRB	r10, [r1], #1
	EORLT	r12, r5, r12, ASR #32
	STRB	r12, [r0], #1
	ADDS	r11, r11, r10		; Pixel 3
	CMPGT	r5, r11
	LDRSH	r12, [r3], #2		; (pixel 4 operands)
	LDRB	lr, [r1], #1
	EORLT	r11, r5, r11, ASR #32
	STRB	r11, [r0], #1
	ADDS	r12, r12, lr		; Pixel 4
	CMPGT	r5, r12
	LDRSH	r11, [r3], #2		; (pixel 5 operands)
	LDRB	r10, [r1], #1
	EORLT	r12, r5, r12, ASR #32
	STRB	r12, [r0], #1
	ADDS	r11, r11, r10		; Pixel 5
	CMPGT	r5, r11
	LDRSH	r12, [r3], #2		; (pixel 6 operands)
	LDRB	lr, [r1], #1
	EORLT	r11, r5, r11, ASR #32
	STRB	r11, [r0], #1
	ADDS	r12, r12, lr		; Pixel 6
	CMPGT	r5, r12
	LDRSH	r11, [r3], #2		; (pixel 7 operands)
	LDRB	r10, [r1], r2		; src moves on to the next row
	EORLT	r12, r5, r12, ASR #32
	STRB	r12, [r0], #1
	ADDS	r11, r11, r10		; Pixel 7
	CMPGT	r5, r11
	EORLT	r11, r5, r11, ASR #32
	STRB	r11, [r0], r2		; dst moves on to the next row
	SUBS	r9, r9, #1
	BGT	ofrinter_arm_row
	LDMFD	sp!, {r5,r9-r11,pc}
	ENDP
  ; oc_frag_recon_inter2_arm:
  ;   dst[x] = clamp(((src1[x]+src2[x])>>1)+residue[x], 0, 255)
  ; over an 8x8 block, one byte at a time.  The residue pointer is the
  ; first stack argument and is fetched before the register save.
oc_frag_recon_inter2_arm PROC
	; r0 =       unsigned char *dst
	; r1 = const unsigned char *src1
	; r2 = const unsigned char *src2
	; r3 =       int            ystride
	LDR	r12, [sp]
	; r12= const ogg_int16_t    residue[64]
	STMFD	sp!, {r4-r8,lr}
	SUB	r3, r3, #7		; Row step left after seven +1 steps.
	MOV	r8, #255		; r8 = saturation ceiling
	MOV	lr, #8			; lr = row counter
ofrinter2_arm_row
	LDRB	r5, [r1], #1		; Pixel 0: src1
	LDRB	r6, [r2], #1		;          src2
	LDRSH	r4, [r12], #2		;          residue
	LDRB	r7, [r1], #1		; Pixel 1: src1 (fetched early)
	ADD	r5, r5, r6
	ADDS	r5, r4, r5, LSR #1	; Pixel 0: residue + truncated average
	CMPGT	r8, r5			;   >0: test against 255
	LDRB	r6, [r2], #1		; Pixel 1: src2
	LDRSH	r4, [r12], #2		;          residue
	EORLT	r5, r8, r5, ASR #32	;   <0 -> low byte 0, >255 -> 255
	STRB	r5, [r0], #1
	ADD	r7, r7, r6
	ADDS	r7, r4, r7, LSR #1	; Pixel 1
	CMPGT	r8, r7
	LDRB	r5, [r1], #1		; (pixel 2 operands)
	LDRB	r6, [r2], #1
	LDRSH	r4, [r12], #2
	EORLT	r7, r8, r7, ASR #32
	STRB	r7, [r0], #1
	ADD	r5, r5, r6
	ADDS	r5, r4, r5, LSR #1	; Pixel 2
	CMPGT	r8, r5
	LDRB	r7, [r1], #1		; (pixel 3 operands)
	LDRB	r6, [r2], #1
	LDRSH	r4, [r12], #2
	EORLT	r5, r8, r5, ASR #32
	STRB	r5, [r0], #1
	ADD	r7, r7, r6
	ADDS	r7, r4, r7, LSR #1	; Pixel 3
	CMPGT	r8, r7
	LDRB	r5, [r1], #1		; (pixel 4 operands)
	LDRB	r6, [r2], #1
	LDRSH	r4, [r12], #2
	EORLT	r7, r8, r7, ASR #32
	STRB	r7, [r0], #1
	ADD	r5, r5, r6
	ADDS	r5, r4, r5, LSR #1	; Pixel 4
	CMPGT	r8, r5
	LDRB	r7, [r1], #1		; (pixel 5 operands)
	LDRB	r6, [r2], #1
	LDRSH	r4, [r12], #2
	EORLT	r5, r8, r5, ASR #32
	STRB	r5, [r0], #1
	ADD	r7, r7, r6
	ADDS	r7, r4, r7, LSR #1	; Pixel 5
	CMPGT	r8, r7
	LDRB	r5, [r1], #1		; (pixel 6 operands)
	LDRB	r6, [r2], #1
	LDRSH	r4, [r12], #2
	EORLT	r7, r8, r7, ASR #32
	STRB	r7, [r0], #1
	ADD	r5, r5, r6
	ADDS	r5, r4, r5, LSR #1	; Pixel 6
	CMPGT	r8, r5
	LDRB	r7, [r1], r3		; (pixel 7) src1 moves to the next row
	LDRB	r6, [r2], r3		;           src2 moves to the next row
	LDRSH	r4, [r12], #2
	EORLT	r5, r8, r5, ASR #32
	STRB	r5, [r0], #1
	ADD	r7, r7, r6
	ADDS	r7, r4, r7, LSR #1	; Pixel 7
	CMPGT	r8, r7
	EORLT	r7, r8, r7, ASR #32
	STRB	r7, [r0], r3		; dst moves to the next row
	SUBS	lr, lr, #1
	BGT	ofrinter2_arm_row
	LDMFD	sp!, {r4-r8,pc}
	ENDP
  279. [ OC_ARM_ASM_EDSP
  280. EXPORT oc_frag_copy_list_edsp
  ; oc_frag_copy_list_edsp: fragment list copy using doubleword transfers.
  ; Every listed fragment is moved as eight LDRD/STRD pairs (one 8-byte row
  ; each), with pre-indexed writeback stepping both pointers by _ystride.
oc_frag_copy_list_edsp PROC
	; r0 = _dst_frame
	; r1 = _src_frame
	; r2 = _ystride
	; r3 = _fragis
	; <> = _nfragis		(first stack argument)
	; <> = _frag_buf_offs	(second stack argument)
	LDR	r12, [sp]		; r12 = _nfragis
	STMFD	sp!, {r4-r11,lr}
	SUBS	r12, r12, #1
	LDRGE	r5, [r3], #4		; r5 = _fragis[fragii]
	; Nine registers were pushed: second stack argument is at sp+40.
	LDRGE	lr, [sp, #40]		; lr = _frag_buf_offs
	BLT	ofcl_edsp_done
ofcl_edsp_next
	MOV	r4, r1
	LDR	r5, [lr, r5, LSL #2]	; r5 = _frag_buf_offs[_fragis[fragii]]
	SUBS	r12, r12, #1		; Flags stay live until the BGE below.
	; Stall (on XScale)
	LDRD	r6, [r4, r5]!		; Row 0; r4 = _src_frame+frag_buf_off
	LDRD	r8, [r4, r2]!		; Row 1
	; Stall
	STRD	r6, [r5, r0]!		; Row 0; r5 = _dst_frame+frag_buf_off
	STRD	r8, [r5, r2]!		; Row 1
	; Stall
	; Original scheduling note: on XScale at least, three consecutive
	; loads cause a stall, but that is no worse than doing two and then
	; needing another LDRD/STRD pair later on.
	LDRD	r6, [r4, r2]!		; Row 2
	LDRD	r8, [r4, r2]!		; Row 3
	LDRD	r10, [r4, r2]!		; Row 4
	; Stall
	STRD	r6, [r5, r2]!		; Row 2
	STRD	r8, [r5, r2]!		; Row 3
	STRD	r10, [r5, r2]!		; Row 4
	LDRD	r6, [r4, r2]!		; Row 5
	LDRD	r8, [r4, r2]!		; Row 6
	LDRD	r10, [r4, r2]!		; Row 7
	STRD	r6, [r5, r2]!		; Row 5
	STRD	r8, [r5, r2]!		; Row 6
	STRD	r10, [r5, r2]!		; Row 7
	LDRGE	r5, [r3], #4		; r5 = _fragis[fragii] for the next pass
	BGE	ofcl_edsp_next
ofcl_edsp_done
	LDMFD	sp!, {r4-r11,pc}
	ENDP
  324. ]
  325. [ OC_ARM_ASM_MEDIA
  326. EXPORT oc_frag_recon_intra_v6
  327. EXPORT oc_frag_recon_inter_v6
  328. EXPORT oc_frag_recon_inter2_v6
  ; oc_frag_recon_intra_v6: _dst[x] = clamp(_residue[x]+128, 0, 255) for an
  ; 8x8 block, one row (eight halfword residues) per pass using the packed
  ; halfword saturating instructions.
oc_frag_recon_intra_v6 PROC
	; r0 =       unsigned char *_dst
	; r1 =       int            _ystride
	; r2 = const ogg_int16_t    _residue[64]
	STMFD	sp!, {r4-r6,lr}
	LDR	r6, =0x00800080		; +128 in each halfword lane
	MOV	r12, r2			; Free r2/r3 to receive LDRD data.
	MOV	lr, #8			; lr = rows remaining
ofrintra_v6_row
	LDRD	r2, [r12], #8		; r2 = 11110000 r3 = 33332222
	LDRD	r4, [r12], #8		; r4 = 55554444 r5 = 77776666
	SUBS	lr, lr, #1		; Nothing below touches NZCV before BGT.
	QADD16	r2, r2, r6		; Add the bias with signed saturation.
	QADD16	r3, r3, r6
	QADD16	r4, r4, r6
	QADD16	r5, r5, r6
	USAT16	r2, #8, r2		; r2 = __11__00  (each lane in 0..255)
	USAT16	r3, #8, r3		; r3 = __33__22
	USAT16	r4, #8, r4		; r4 = __55__44
	USAT16	r5, #8, r5		; r5 = __77__66
	ORR	r2, r2, r2, LSR #8	; r2 = __111100
	ORR	r3, r3, r3, LSR #8	; r3 = __333322
	ORR	r4, r4, r4, LSR #8	; r4 = __555544
	ORR	r5, r5, r5, LSR #8	; r5 = __777766
	PKHBT	r2, r2, r3, LSL #16	; r2 = 33221100
	PKHBT	r3, r4, r5, LSL #16	; r3 = 77665544
	STRD	r2, [r0], r1		; Store the row and step to the next.
	BGT	ofrintra_v6_row
	LDMFD	sp!, {r4-r6,pc}
	ENDP
  ; oc_frag_recon_inter_v6: _dst[x] = clamp(_src[x]+_residue[x], 0, 255)
  ; for an 8x8 block, one row per pass.  Residues are regrouped into
  ; even/odd lanes so that they line up with the UXTB16-unpacked predictor
  ; bytes before the packed saturating add.
oc_frag_recon_inter_v6 PROC
	; r0 =       unsigned char *_dst
	; r1 = const unsigned char *_src
	; r2 =       int            _ystride
	; r3 = const ogg_int16_t    _residue[64]
	STMFD	sp!, {r4-r7,lr}
	MOV	lr, #8			; lr = rows remaining
ofrinter_v6_row
	LDRD	r6, [r3], #8		; r6 = 11110000 r7 = 33332222
	SUBS	lr, lr, #1		; Nothing below touches NZCV before BGT.
 [ OC_ARM_CAN_UNALIGN_LDRD
	LDRD	r4, [r1], r2	; Unaligned ; r4 = 33221100 r5 = 77665544
 |
	LDR	r5, [r1, #4]		; r5 = 77665544
	LDR	r4, [r1], r2		; r4 = 33221100, then step src one row
 ]
	; Low four pixels.
	PKHBT	r12, r6, r7, LSL #16	; r12= 22220000
	PKHTB	r7, r7, r6, ASR #16	; r7 = 33331111
	UXTB16	r6, r4			; r6 = __22__00
	UXTB16	r4, r4, ROR #8		; r4 = __33__11
	QADD16	r12, r12, r6		; r12= xx22xx00
	QADD16	r4, r7, r4		; r4 = xx33xx11
	LDRD	r6, [r3], #8		; r6 = 55554444 r7 = 77776666
	USAT16	r4, #8, r4		; r4 = __33__11
	USAT16	r12, #8, r12		; r12= __22__00
	ORR	r4, r12, r4, LSL #8	; r4 = 33221100
	; High four pixels.
	PKHBT	r12, r6, r7, LSL #16	; r12= 66664444
	PKHTB	r7, r7, r6, ASR #16	; r7 = 77775555
	UXTB16	r6, r5			; r6 = __66__44
	UXTB16	r5, r5, ROR #8		; r5 = __77__55
	QADD16	r12, r12, r6		; r12= xx66xx44
	QADD16	r5, r7, r5		; r5 = xx77xx55
	USAT16	r12, #8, r12		; r12= __66__44
	USAT16	r5, #8, r5		; r5 = __77__55
	ORR	r5, r12, r5, LSL #8	; r5 = 77665544
	STRD	r4, [r0], r2		; Store the row and step dst one row.
	BGT	ofrinter_v6_row
	LDMFD	sp!, {r4-r7,pc}
	ENDP
  ; oc_frag_recon_inter2_v6:
  ;   _dst[x] = clamp(((_src1[x]+_src2[x])>>1)+_residue[x], 0, 255)
  ; for an 8x8 block, one row per pass.  The upper four pixels of a row are
  ; processed first, then the lower four; UHADD8 forms the truncated byte
  ; averages.  The residue pointer is the first stack argument.
oc_frag_recon_inter2_v6 PROC
	; r0 =       unsigned char *_dst
	; r1 = const unsigned char *_src1
	; r2 = const unsigned char *_src2
	; r3 =       int            _ystride
	LDR	r12, [sp]
	; r12= const ogg_int16_t    _residue[64]
	STMFD	sp!, {r4-r9,lr}
	MOV	lr, #8			; lr = rows remaining
ofrinter2_v6_row
	LDRD	r6, [r12, #8]		; r6 = 55554444 r7 = 77776666
	SUBS	lr, lr, #1		; Nothing below touches NZCV before BGT.
	LDR	r4, [r1, #4]	; Unaligned	; r4 = src1[1] = 77665544
	LDR	r5, [r2, #4]	; Unaligned	; r5 = src2[1] = 77665544
	PKHBT	r8, r6, r7, LSL #16	; r8 = 66664444
	PKHTB	r9, r7, r6, ASR #16	; r9 = 77775555
	UHADD8	r4, r4, r5		; r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
	UXTB16	r5, r4			; r5 = __66__44
	UXTB16	r4, r4, ROR #8		; r4 = __77__55
	QADD16	r8, r8, r5		; r8 = xx66xx44
	QADD16	r9, r9, r4		; r9 = xx77xx55
	; Fetch the low half of the residue row and step to the next row.
	LDRD	r6, [r12], #16		; r6 = 11110000 r7 = 33332222
	USAT16	r8, #8, r8		; r8 = __66__44
	LDR	r4, [r1], r3	; Unaligned	; r4 = src1[0] = 33221100
	USAT16	r9, #8, r9		; r9 = __77__55
	LDR	r5, [r2], r3	; Unaligned	; r5 = src2[0] = 33221100
	ORR	r9, r8, r9, LSL #8	; r9 = 77665544
	PKHBT	r8, r6, r7, LSL #16	; r8 = 22220000
	UHADD8	r4, r4, r5		; r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
	PKHTB	r7, r7, r6, ASR #16	; r7 = 33331111
	UXTB16	r5, r4			; r5 = __22__00
	UXTB16	r4, r4, ROR #8		; r4 = __33__11
	QADD16	r8, r8, r5		; r8 = xx22xx00
	QADD16	r7, r7, r4		; r7 = xx33xx11
	USAT16	r8, #8, r8		; r8 = __22__00
	USAT16	r7, #8, r7		; r7 = __33__11
	ORR	r8, r8, r7, LSL #8	; r8 = 33221100
	STRD	r8, [r0], r3		; Store r8:r9 as the row, step dst.
	BGT	ofrinter2_v6_row
	LDMFD	sp!, {r4-r9,pc}
	ENDP
  439. ]
  440. [ OC_ARM_ASM_NEON
  441. EXPORT oc_frag_copy_list_neon
  442. EXPORT oc_frag_recon_intra_neon
  443. EXPORT oc_frag_recon_inter_neon
  444. EXPORT oc_frag_recon_inter2_neon
  ; oc_frag_copy_list_neon: fragment list copy using 64-bit NEON transfers.
  ; Each fragment is read into D0-D7 (one 8-byte row per register) and then
  ; written out, while PLD hints are issued for the rows of the following
  ; fragment.  r7 is the PLD base pointer.
oc_frag_copy_list_neon PROC
	; r0 = _dst_frame
	; r1 = _src_frame
	; r2 = _ystride
	; r3 = _fragis
	; <> = _nfragis		(first stack argument)
	; <> = _frag_buf_offs	(second stack argument)
	LDR	r12, [sp]		; r12 = _nfragis
	STMFD	sp!, {r4-r7,lr}
	CMP	r12, #1
	LDRGE	r6, [r3]		; r6 = _fragis[fragii]
	; Five registers were pushed: second stack argument is at sp+4*6.
	LDRGE	lr, [sp, #4*6]		; lr = _frag_buf_offs
	BLT	ofcl_neon_end
	; Stall (2 on Xscale)
	LDR	r6, [lr, r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
	; Stall (on XScale)
	; Seed the PLD base with a real address (the first source fragment).
	; It is only used as-is when the list has a single entry, because
	; the ADDGT below never executes in that case.
	ADD	r7, r1, r6
ofcl_neon_lp
	ADD	r4, r1, r6		; r4 = _src_frame+frag_buf_off
	VLD1.64	{D0}, [r4@64], r2
	ADD	r5, r0, r6		; r5 = _dst_frame+frag_buf_off
	VLD1.64	{D1}, [r4@64], r2
	SUBS	r12, r12, #1		; GT below means another fragment follows.
	VLD1.64	{D2}, [r4@64], r2
	LDRGT	r6, [r3, #4]!		; r6 = _fragis[fragii]
	VLD1.64	{D3}, [r4@64], r2
	LDRGT	r6, [lr, r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
	VLD1.64	{D4}, [r4@64], r2
	ADDGT	r7, r1, r6		; r7 = next source fragment
	VLD1.64	{D5}, [r4@64], r2
	PLD	[r7]			; Next fragment, row 0
	VLD1.64	{D6}, [r4@64], r2
	PLD	[r7, r2]		; row 1
	VLD1.64	{D7}, [r4@64]
	PLD	[r7, r2, LSL #1]	; row 2
	VST1.64	{D0}, [r5@64], r2
	ADDGT	r7, r7, r2, LSL #2	; r7 = row 4 of the next fragment
	VST1.64	{D1}, [r5@64], r2
	PLD	[r7, -r2]		; row 3
	VST1.64	{D2}, [r5@64], r2
	PLD	[r7]			; row 4
	VST1.64	{D3}, [r5@64], r2
	PLD	[r7, r2]		; row 5
	VST1.64	{D4}, [r5@64], r2
	PLD	[r7, r2, LSL #1]	; row 6
	VST1.64	{D5}, [r5@64], r2
	ADDGT	r7, r7, r2, LSL #2	; r7 = one row past the next fragment
	VST1.64	{D6}, [r5@64], r2
	PLD	[r7, -r2]		; row 7
	VST1.64	{D7}, [r5@64]
	BGT	ofcl_neon_lp
ofcl_neon_end
	LDMFD	sp!, {r4-r7,pc}
	ENDP
  ; oc_frag_recon_intra_neon: _dst[x] = clamp(_residue[x]+128, 0, 255) for
  ; an 8x8 block.  All 64 residues are loaded at once into Q8-Q15 (one row
  ; per Q register), biased, then narrowed with unsigned saturation.
  ; Leaf routine: no stack use, returns through lr.
oc_frag_recon_intra_neon PROC
	; r0 =       unsigned char *_dst
	; r1 =       int            _ystride
	; r2 = const ogg_int16_t    _residue[64]
	MOV	r3, #128
	VDUP.S16	Q0, r3			; Q0 = 128 in every 16-bit lane
	VLDMIA	r2, {D16-D31}	; D16= 3333222211110000 etc	; 9(8) cycles
	VQADD.S16	Q8, Q8, Q0		; Row 0 + bias
	VQADD.S16	Q9, Q9, Q0		; Row 1
	VQADD.S16	Q10, Q10, Q0		; Row 2
	VQADD.S16	Q11, Q11, Q0		; Row 3
	VQADD.S16	Q12, Q12, Q0		; Row 4
	VQADD.S16	Q13, Q13, Q0		; Row 5
	VQADD.S16	Q14, Q14, Q0		; Row 6
	VQADD.S16	Q15, Q15, Q0		; Row 7
	; Narrow each row to eight unsigned bytes; stores are interleaved
	; with the remaining narrowing operations.
	VQMOVUN.S16	D16, Q8		; D16= 7766554433221100	; 1 cycle
	VQMOVUN.S16	D17, Q9		; D17= FFEEDDCCBBAA9988	; 1 cycle
	VQMOVUN.S16	D18, Q10	; D18= NNMMLLKKJJIIHHGG	; 1 cycle
	VST1.64	{D16}, [r0@64], r1
	VQMOVUN.S16	D19, Q11	; D19= VVUUTTSSRRQQPPOO	; 1 cycle
	VST1.64	{D17}, [r0@64], r1
	VQMOVUN.S16	D20, Q12	; D20= ddccbbaaZZYYXXWW	; 1 cycle
	VST1.64	{D18}, [r0@64], r1
	VQMOVUN.S16	D21, Q13	; D21= llkkjjiihhggffee	; 1 cycle
	VST1.64	{D19}, [r0@64], r1
	VQMOVUN.S16	D22, Q14	; D22= ttssrrqqppoonnmm	; 1 cycle
	VST1.64	{D20}, [r0@64], r1
	VQMOVUN.S16	D23, Q15	; D23= !!@@zzyyxxwwvvuu	; 1 cycle
	VST1.64	{D21}, [r0@64], r1
	VST1.64	{D22}, [r0@64], r1
	VST1.64	{D23}, [r0@64], r1
	MOV	pc, lr
	ENDP
  ; oc_frag_recon_inter_neon: _dst[x] = clamp(_src[x]+_residue[x], 0, 255)
  ; for an 8x8 block.  Residue rows live in Q8-Q15; predictor rows are
  ; loaded four at a time into D0/D2/D4/D6, widened to 16 bits in Q0-Q3,
  ; added with saturation, then narrowed with unsigned saturation.
  ; Leaf routine: no stack use, returns through lr.
oc_frag_recon_inter_neon PROC
	; r0 =       unsigned char *_dst
	; r1 = const unsigned char *_src
	; r2 =       int            _ystride
	; r3 = const ogg_int16_t    _residue[64]
	VLDMIA	r3, {D16-D31}	; D16= 3333222211110000 etc	; 9(8) cycles
	VLD1.64	{D0}, [r1], r2		; src row 0
	VLD1.64	{D2}, [r1], r2		; src row 1
	VMOVL.U8	Q0, D0	; Q0 = __77__66__55__44__33__22__11__00
	VLD1.64	{D4}, [r1], r2		; src row 2
	VMOVL.U8	Q1, D2	; etc
	VLD1.64	{D6}, [r1], r2		; src row 3
	VMOVL.U8	Q2, D4
	VMOVL.U8	Q3, D6
	VQADD.S16	Q8, Q8, Q0		; Row 0 sum
	VLD1.64	{D0}, [r1], r2		; src row 4
	VQADD.S16	Q9, Q9, Q1		; Row 1 sum
	VLD1.64	{D2}, [r1], r2		; src row 5
	VQADD.S16	Q10, Q10, Q2		; Row 2 sum
	VLD1.64	{D4}, [r1], r2		; src row 6
	VQADD.S16	Q11, Q11, Q3		; Row 3 sum
	VLD1.64	{D6}, [r1], r2		; src row 7
	VMOVL.U8	Q0, D0
	VMOVL.U8	Q1, D2
	VMOVL.U8	Q2, D4
	VMOVL.U8	Q3, D6
	VQADD.S16	Q12, Q12, Q0		; Row 4 sum
	VQADD.S16	Q13, Q13, Q1		; Row 5 sum
	VQADD.S16	Q14, Q14, Q2		; Row 6 sum
	VQADD.S16	Q15, Q15, Q3		; Row 7 sum
	; Narrow to bytes (D16-D23) and store, interleaved.
	VQMOVUN.S16	D16, Q8
	VQMOVUN.S16	D17, Q9
	VQMOVUN.S16	D18, Q10
	VST1.64	{D16}, [r0@64], r2
	VQMOVUN.S16	D19, Q11
	VST1.64	{D17}, [r0@64], r2
	VQMOVUN.S16	D20, Q12
	VST1.64	{D18}, [r0@64], r2
	VQMOVUN.S16	D21, Q13
	VST1.64	{D19}, [r0@64], r2
	VQMOVUN.S16	D22, Q14
	VST1.64	{D20}, [r0@64], r2
	VQMOVUN.S16	D23, Q15
	VST1.64	{D21}, [r0@64], r2
	VST1.64	{D22}, [r0@64], r2
	VST1.64	{D23}, [r0@64], r2
	MOV	pc, lr
	ENDP
  ; oc_frag_recon_inter2_neon:
  ;   _dst[x] = clamp(((_src1[x]+_src2[x])>>1)+_residue[x], 0, 255)
  ; for an 8x8 block.  Rows are handled in pairs: two src1 rows fill one Q
  ; register, two src2 rows another, VHADD.U8 averages them, and each half
  ; is widened and added to the matching residue row in Q8-Q15.
  ; Leaf routine: the residue pointer is read from the stack but nothing is
  ; pushed; returns through lr.
oc_frag_recon_inter2_neon PROC
	; r0 =       unsigned char *_dst
	; r1 = const unsigned char *_src1
	; r2 = const unsigned char *_src2
	; r3 =       int            _ystride
	LDR	r12, [sp]
	; r12= const ogg_int16_t    _residue[64]
	VLDMIA	r12, {D16-D31}		; Q8..Q15 = residue rows 0..7
	VLD1.64	{D0}, [r1], r3		; src1 row 0
	VLD1.64	{D4}, [r2], r3		; src2 row 0
	VLD1.64	{D1}, [r1], r3		; src1 row 1
	VLD1.64	{D5}, [r2], r3		; src2 row 1
	VHADD.U8	Q2, Q0, Q2	; Q2 = FFEEDDCCBBAA99887766554433221100
	VLD1.64	{D2}, [r1], r3		; src1 row 2
	VLD1.64	{D6}, [r2], r3		; src2 row 2
	VMOVL.U8	Q0, D4	; Q0 = __77__66__55__44__33__22__11__00
	VLD1.64	{D3}, [r1], r3		; src1 row 3
	VMOVL.U8	Q2, D5	; etc
	VLD1.64	{D7}, [r2], r3		; src2 row 3
	VHADD.U8	Q3, Q1, Q3		; Averages of rows 2 and 3
	VQADD.S16	Q8, Q8, Q0		; Row 0 sum
	VQADD.S16	Q9, Q9, Q2		; Row 1 sum
	VLD1.64	{D0}, [r1], r3		; src1 row 4
	VMOVL.U8	Q1, D6
	VLD1.64	{D4}, [r2], r3		; src2 row 4
	VMOVL.U8	Q3, D7
	VLD1.64	{D1}, [r1], r3		; src1 row 5
	VQADD.S16	Q10, Q10, Q1		; Row 2 sum
	VLD1.64	{D5}, [r2], r3		; src2 row 5
	VQADD.S16	Q11, Q11, Q3		; Row 3 sum
	VLD1.64	{D2}, [r1], r3		; src1 row 6
	VHADD.U8	Q2, Q0, Q2		; Averages of rows 4 and 5
	VLD1.64	{D6}, [r2], r3		; src2 row 6
	VLD1.64	{D3}, [r1], r3		; src1 row 7
	VMOVL.U8	Q0, D4
	VLD1.64	{D7}, [r2], r3		; src2 row 7
	VMOVL.U8	Q2, D5
	VHADD.U8	Q3, Q1, Q3		; Averages of rows 6 and 7
	VQADD.S16	Q12, Q12, Q0		; Row 4 sum
	VQADD.S16	Q13, Q13, Q2		; Row 5 sum
	VMOVL.U8	Q1, D6
	VMOVL.U8	Q3, D7
	VQADD.S16	Q14, Q14, Q1		; Row 6 sum
	VQADD.S16	Q15, Q15, Q3		; Row 7 sum
	; Narrow to bytes (D16-D23) and store, interleaved.
	VQMOVUN.S16	D16, Q8
	VQMOVUN.S16	D17, Q9
	VQMOVUN.S16	D18, Q10
	VST1.64	{D16}, [r0@64], r3
	VQMOVUN.S16	D19, Q11
	VST1.64	{D17}, [r0@64], r3
	VQMOVUN.S16	D20, Q12
	VST1.64	{D18}, [r0@64], r3
	VQMOVUN.S16	D21, Q13
	VST1.64	{D19}, [r0@64], r3
	VQMOVUN.S16	D22, Q14
	VST1.64	{D20}, [r0@64], r3
	VQMOVUN.S16	D23, Q15
	VST1.64	{D21}, [r0@64], r3
	VST1.64	{D22}, [r0@64], r3
	VST1.64	{D23}, [r0@64], r3
	MOV	pc, lr
	ENDP
  642. ]
  643. END