field_5x52_asm.asm 7.4 KB


  1. ;; Added by Diederik Huys, March 2013
  2. ;;
  3. ;; Provided public procedures:
  4. ;; secp256k1_fe_mul_inner
  5. ;; secp256k1_fe_sqr_inner
  6. ;;
  7. ;; Needed tools: YASM (http://yasm.tortall.net)
  8. ;;
  9. ;;
  10. BITS 64
  ;; ---------------------------------------------------------------------
  ;; secp256k1_fe_mul_inner
  ;;
  ;; Multiplies two operands stored as five 64-bit limbs each and writes a
  ;; reduced five-limb result.  Limbs are split at 52 bits (mask
  ;; 0FFFFFFFFFFFFFh); the top output limb is cut at 48 bits
  ;; (4*52 + 48 = 256).
  ;;
  ;; NOTE(review): presumably called from C as
  ;;     void secp256k1_fe_mul_inner(const uint64_t *a,
  ;;                                 const uint64_t *b, uint64_t *r);
  ;; confirm against the C-side declaration.
  ;;
  ;; Register layout:
  ;;   INPUT:     rdi      = a->n  (5 qwords, read only)
  ;;              rsi      = b->n  (5 qwords, read only)
  ;;              rdx      = r->n  (5 qwords, written)
  ;;              These are the first three System V AMD64 integer
  ;;              argument registers; a Win64 caller would pass them
  ;;              elsewhere.
  ;;
  ;;   INTERNAL:  rdx:rax  = multiplication result
  ;;              r9:r8    = c, the 128-bit column accumulator
  ;;              r10-r13  = t0-t3
  ;;              r14      = b.n[0] / t4
  ;;              r15      = b.n[1] / t5
  ;;              rbx      = b.n[2] / t6
  ;;              rcx      = b.n[3] / t7
  ;;              rbp      = limb mask 0FFFFFFFFFFFFFh / t8
  ;;              rsi      = b.n / b.n[4] / t9
  ;;
  ;; Every column ends with "tk = c & mask; c >>= 52".  The shrd keeps
  ;; only bits 52..115 of c and the following xor clears r9, so the code
  ;; assumes each column sum stays below 2^116.
  ;; NOTE(review): that bounds how far above 52 bits the input limbs may
  ;; be; confirm the limit the callers guarantee.
  ;;
  ;; Stack: the six callee-saved registers are pushed, then the output
  ;; pointer.  common_exit_norm pops them in the reverse order.
  ;; ---------------------------------------------------------------------
        GLOBAL  secp256k1_fe_mul_inner
        ALIGN   32
secp256k1_fe_mul_inner:
        push    rbp
        push    rbx
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdx                     ; output pointer, popped into rbx later
        mov     r14,[rsi+8*0]           ; preload b.n[0]; r14 keeps it until the
                                        ; last use of b.n[0], then becomes t4

        ;; c = a.n[0] * b.n[0]
        mov     rax,[rdi+0*8]           ; load a.n[0]
        mov     rbp,0FFFFFFFFFFFFFh     ; 52-bit limb mask
        mul     r14                     ; rdx:rax = a.n[0]*b.n[0]
        mov     r15,[rsi+1*8]           ; preload b.n[1]
        mov     r10,rbp                 ; mask into the t0 register
        mov     r8,rax
        and     r10,rax                 ; t0 = low 52 bits of c
        shrd    r8,rdx,52               ; c >>= 52
        xor     r9,r9                   ; high qword of c assumed empty here

        ;; c += a.n[0]*b.n[1] + a.n[1]*b.n[0]
        mov     rax,[rdi+0*8]
        mul     r15
        add     r8,rax
        adc     r9,rdx
        mov     rax,[rdi+1*8]
        mul     r14
        mov     r11,rbp                 ; mask into the t1 register
        mov     rbx,[rsi+2*8]           ; preload b.n[2]
        add     r8,rax
        adc     r9,rdx
        and     r11,r8                  ; t1
        shrd    r8,r9,52
        xor     r9,r9

        ;; c += a.n[0 1 2] * b.n[2 1 0]
        mov     rax,[rdi+0*8]
        mul     rbx
        add     r8,rax
        adc     r9,rdx
        mov     rax,[rdi+1*8]
        mul     r15
        add     r8,rax
        adc     r9,rdx
        mov     rax,[rdi+2*8]
        mul     r14
        mov     r12,rbp                 ; mask into the t2 register
        mov     rcx,[rsi+3*8]           ; preload b.n[3]
        add     r8,rax
        adc     r9,rdx
        and     r12,r8                  ; t2
        shrd    r8,r9,52
        xor     r9,r9

        ;; c += a.n[0 1 2 3] * b.n[3 2 1 0]
        mov     rax,[rdi+0*8]
        mul     rcx
        add     r8,rax
        adc     r9,rdx
        mov     rax,[rdi+1*8]
        mul     rbx
        add     r8,rax
        adc     r9,rdx
        mov     rax,[rdi+2*8]
        mul     r15
        add     r8,rax
        adc     r9,rdx
        mov     rax,[rdi+3*8]
        mul     r14
        mov     r13,rbp                 ; mask into the t3 register
        mov     rsi,[rsi+4*8]           ; load b.n[4]; the b pointer is gone now
        add     r8,rax
        adc     r9,rdx
        and     r13,r8                  ; t3
        shrd    r8,r9,52
        xor     r9,r9

        ;; c += a.n[0 1 2 3 4] * b.n[4 3 2 1 0]
        mov     rax,[rdi+0*8]
        mul     rsi
        add     r8,rax
        adc     r9,rdx
        mov     rax,[rdi+1*8]
        mul     rcx
        add     r8,rax
        adc     r9,rdx
        mov     rax,[rdi+2*8]
        mul     rbx
        add     r8,rax
        adc     r9,rdx
        mov     rax,[rdi+3*8]
        mul     r15
        add     r8,rax
        adc     r9,rdx
        mov     rax,[rdi+4*8]
        mul     r14                     ; last use of b.n[0]
        mov     r14,rbp                 ; mask into t4, overwriting b.n[0]
        add     r8,rax
        adc     r9,rdx
        and     r14,r8                  ; t4
        shrd    r8,r9,52
        xor     r9,r9

        ;; c += a.n[1 2 3 4] * b.n[4 3 2 1]
        mov     rax,[rdi+1*8]
        mul     rsi
        add     r8,rax
        adc     r9,rdx
        mov     rax,[rdi+2*8]
        mul     rcx
        add     r8,rax
        adc     r9,rdx
        mov     rax,[rdi+3*8]
        mul     rbx
        add     r8,rax
        adc     r9,rdx
        mov     rax,[rdi+4*8]
        mul     r15                     ; last use of b.n[1]
        mov     r15,rbp                 ; mask into t5, overwriting b.n[1]
        add     r8,rax
        adc     r9,rdx
        and     r15,r8                  ; t5
        shrd    r8,r9,52
        xor     r9,r9

        ;; c += a.n[2 3 4] * b.n[4 3 2]
        mov     rax,[rdi+2*8]
        mul     rsi
        add     r8,rax
        adc     r9,rdx
        mov     rax,[rdi+3*8]
        mul     rcx
        add     r8,rax
        adc     r9,rdx
        mov     rax,[rdi+4*8]
        mul     rbx                     ; last use of b.n[2]
        mov     rbx,rbp                 ; mask into t6, overwriting b.n[2]
        add     r8,rax
        adc     r9,rdx
        and     rbx,r8                  ; t6
        shrd    r8,r9,52
        xor     r9,r9

        ;; c += a.n[3 4] * b.n[4 3]
        mov     rax,[rdi+3*8]
        mul     rsi
        add     r8,rax
        adc     r9,rdx
        mov     rax,[rdi+4*8]
        mul     rcx                     ; last use of b.n[3]
        mov     rcx,rbp                 ; mask into t7, overwriting b.n[3]
        add     r8,rax
        adc     r9,rdx
        and     rcx,r8                  ; t7
        shrd    r8,r9,52
        xor     r9,r9

        ;; c += a.n[4] * b.n[4]
        mov     rax,[rdi+4*8]
        mul     rsi
        add     r8,rax
        adc     r9,rdx
        and     rbp,r8                  ; t8: rbp already holds the mask, so it
                                        ; is masked in place
        shrd    r8,r9,52
        xor     r9,r9
        mov     rsi,r8                  ; t9 = remaining c, overwriting b.n[4]

        ;; *******************************************************
        ;; common_exit_norm -- shared reduction tail.  It is also entered
        ;; by a jmp from secp256k1_fe_sqr_inner, so the label has to stay
        ;; a normal (non-local) label.
        ;;
        ;; On entry:
        ;;   r10-r13 = t0-t3, r14 = t4, r15 = t5, rbx = t6, rcx = t7,
        ;;   rbp = t8, rsi = t9
        ;;   [rsp] = output pointer, followed by the saved r15, r14, r13,
        ;;           r12, rbx, rbp and the return address
        ;;
        ;; Folds the upper limbs onto the lower ones:
        ;;   c  = t5*01000003D10h + t0 ; n0'  = c & mask   ; c >>= 52
        ;;   c += t6*01000003D10h + t1 ; n1'  = c & mask   ; c >>= 52
        ;;   c += t7*01000003D10h + t2 ; n[2] = c & mask   ; c >>= 52
        ;;   c += t8*01000003D10h + t3 ; n[3] = c & mask   ; c >>= 52
        ;;   c += t9*01000003D10h + t4 ; n[4] = c & 48 bits; c >>= 48
        ;;   c  = c *01000003D1h + n0' ; n[0] = c & mask   ; c >>= 52
        ;;   n[1] = c + n1'                        (stored unmasked)
        ;; 01000003D10h is 01000003D1h shifted left by 4.
        ;; NOTE(review): 01000003D1h is presumably 2^256 mod p for the
        ;; field prime; the code shows only the constants.
        ;; *******************************************************
common_exit_norm:
        mov     rdi,01000003D10h        ; fold constant; rdi is free now

        mov     rax,r15                 ; get t5
        mul     rdi
        add     rax,r10                 ; +t0
        adc     rdx,0
        mov     r10,0FFFFFFFFFFFFFh     ; limb mask reloaded as an immediate:
                                        ; no register is left to keep it in
        mov     r8,rax                  ; c starts from this sum
        and     r10,rax                 ; n0', kept for the final fold
        shrd    r8,rdx,52
        xor     r9,r9

        mov     rax,rbx                 ; get t6
        mul     rdi
        add     rax,r11                 ; +t1
        adc     rdx,0
        mov     r11,0FFFFFFFFFFFFFh     ; limb mask
        add     r8,rax                  ; +c
        adc     r9,rdx
        and     r11,r8                  ; n1', kept for the final fold
        shrd    r8,r9,52
        xor     r9,r9

        mov     rax,rcx                 ; get t7
        mul     rdi
        add     rax,r12                 ; +t2
        adc     rdx,0
        pop     rbx                     ; output pointer (t6 in rbx is used up)
        mov     r12,0FFFFFFFFFFFFFh     ; limb mask
        add     r8,rax                  ; +c
        adc     r9,rdx
        and     r12,r8
        mov     [rbx+2*8],r12           ; -> r.n[2]
        shrd    r8,r9,52
        xor     r9,r9

        mov     rax,rbp                 ; get t8
        mul     rdi
        add     rax,r13                 ; +t3
        adc     rdx,0
        mov     r13,0FFFFFFFFFFFFFh     ; limb mask
        add     r8,rax                  ; +c
        adc     r9,rdx
        and     r13,r8
        mov     [rbx+3*8],r13           ; -> r.n[3]
        shrd    r8,r9,52
        xor     r9,r9

        mov     rax,rsi                 ; get t9
        mul     rdi
        add     rax,r14                 ; +t4
        adc     rdx,0
        mov     r14,0FFFFFFFFFFFFh      ; 48-bit mask: the top limb is shorter
        add     r8,rax                  ; +c
        adc     r9,rdx
        and     r14,r8
        mov     [rbx+4*8],r14           ; -> r.n[4]
        shrd    r8,r9,48                ; shift matches the 48-bit top limb
        xor     r9,r9

        mov     rax,01000003D1h         ; unshifted fold constant
        mul     r8                      ; fold what overflowed the top limb
        add     rax,r10                 ; +n0'
        adc     rdx,0
        mov     r10,0FFFFFFFFFFFFFh     ; limb mask
        mov     r8,rax
        and     rax,r10
        shrd    r8,rdx,52
        mov     [rbx+0*8],rax           ; -> r.n[0]
        add     r8,r11                  ; carry + n1'; not masked again
        mov     [rbx+1*8],r8            ; -> r.n[1]

        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbx
        pop     rbp
        ret
  ;; ---------------------------------------------------------------------
  ;; secp256k1_fe_sqr_inner
  ;;
  ;; Squares an operand stored as five 64-bit limbs (split at 52 bits,
  ;; mask 0FFFFFFFFFFFFFh).  It builds the ten column values t0-t9 and
  ;; then jumps to common_exit_norm, the label inside
  ;; secp256k1_fe_mul_inner, which reduces them, stores the result,
  ;; restores the saved registers and returns.
  ;;
  ;; NOTE(review): presumably called from C as
  ;;     void secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r);
  ;; confirm against the C-side declaration.
  ;;
  ;; Register layout:
  ;;   INPUT:     rdi      = a->n  (5 qwords, read only)
  ;;              rsi      = r->n  (5 qwords, written by the shared tail)
  ;;              These are the first two System V AMD64 integer
  ;;              argument registers.
  ;;
  ;;   INTERNAL:  rdx:rax  = multiplication result
  ;;              r9:r8    = c, the 128-bit column accumulator
  ;;              r10-r13  = t0-t3
  ;;              r14      = a.n[0], then 2*a.n[0] / t4
  ;;              r15      = a.n[1], then 2*a.n[1] / t5
  ;;              rbx      = a.n[2], then 2*a.n[2] / t6
  ;;              rcx      = a.n[3] / t7
  ;;              rbp      = limb mask 0FFFFFFFFFFFFFh / t8
  ;;              rsi      = a.n[4] / t9
  ;;
  ;; Cross terms are formed by doubling one factor in a register, so the
  ;; doubled limb must still fit in 64 bits.  As in the multiply, each
  ;; "shrd ...,52 / xor r9,r9" pair keeps only bits 52..115 of c.
  ;; NOTE(review): confirm the limb bounds the callers guarantee.
  ;;
  ;; Stack: the push order here (rbp, rbx, r12, r13, r14, r15, output
  ;; pointer) has to stay identical to secp256k1_fe_mul_inner, because
  ;; common_exit_norm pops that exact layout.
  ;; ---------------------------------------------------------------------
        GLOBAL  secp256k1_fe_sqr_inner
        ALIGN   32
secp256k1_fe_sqr_inner:
        push    rbp
        push    rbx
        push    r12
        push    r13
        push    r14
        push    r15
        push    rsi                     ; output pointer, popped by the shared tail
        mov     rbp,0FFFFFFFFFFFFFh     ; 52-bit limb mask

        ;; c = a.n[0] * a.n[0]
        mov     r14,[rdi+0*8]           ; r14 = a.n[0]
        mov     r10,rbp                 ; mask into the t0 register
        mov     rax,r14
        mul     rax
        mov     r15,[rdi+1*8]           ; r15 = a.n[1]
        add     r14,r14                 ; r14 = 2*a.n[0]
        mov     r8,rax
        and     r10,rax                 ; t0 = low 52 bits of c
        shrd    r8,rdx,52               ; c >>= 52
        xor     r9,r9

        ;; c += 2*a.n[0] * a.n[1]
        mov     rax,r14                 ; 2*a.n[0]
        mul     r15
        mov     rbx,[rdi+2*8]           ; rbx = a.n[2]
        mov     r11,rbp                 ; mask into the t1 register
        add     r8,rax
        adc     r9,rdx
        and     r11,r8                  ; t1
        shrd    r8,r9,52
        xor     r9,r9

        ;; c += 2*a.n[0]*a.n[2] + a.n[1]*a.n[1]
        mov     rax,r14
        mul     rbx
        add     r8,rax
        adc     r9,rdx
        mov     rax,r15
        mov     r12,rbp                 ; mask into the t2 register
        mul     rax
        mov     rcx,[rdi+3*8]           ; rcx = a.n[3]
        add     r15,r15                 ; r15 = 2*a.n[1]
        add     r8,rax
        adc     r9,rdx
        and     r12,r8                  ; t2 (only the low qword of c matters)
        shrd    r8,r9,52
        xor     r9,r9

        ;; c += 2*a.n[0]*a.n[3] + 2*a.n[1]*a.n[2]
        mov     rax,r14
        mul     rcx
        add     r8,rax
        adc     r9,rdx
        mov     rax,r15                 ; 2*a.n[1]
        mov     r13,rbp                 ; mask into the t3 register
        mul     rbx
        mov     rsi,[rdi+4*8]           ; rsi = a.n[4]
        add     r8,rax
        adc     r9,rdx
        and     r13,r8                  ; t3
        shrd    r8,r9,52
        xor     r9,r9

        ;; c += 2*a.n[0]*a.n[4] + 2*a.n[1]*a.n[3] + a.n[2]*a.n[2]
        mov     rax,r14                 ; last use of 2*a.n[0]
        mul     rsi
        add     r8,rax
        adc     r9,rdx
        mov     rax,r15
        mul     rcx
        mov     r14,rbp                 ; mask into t4, overwriting 2*a.n[0]
        add     r8,rax
        adc     r9,rdx
        mov     rax,rbx
        mul     rax
        add     rbx,rbx                 ; rbx = 2*a.n[2]
        add     r8,rax
        adc     r9,rdx
        and     r14,r8                  ; t4
        shrd    r8,r9,52
        xor     r9,r9

        ;; c += 2*a.n[1]*a.n[4] + 2*a.n[2]*a.n[3]
        mov     rax,r15                 ; last use of 2*a.n[1]
        mul     rsi
        add     r8,rax
        adc     r9,rdx
        mov     rax,rbx
        mul     rcx
        mov     r15,rbp                 ; mask into t5, overwriting 2*a.n[1]
        add     r8,rax
        adc     r9,rdx
        and     r15,r8                  ; t5
        shrd    r8,r9,52
        xor     r9,r9

        ;; c += 2*a.n[2]*a.n[4] + a.n[3]*a.n[3]
        mov     rax,rbx                 ; last use of 2*a.n[2]
        mul     rsi
        add     r8,rax
        adc     r9,rdx
        mov     rax,rcx                 ; a.n[3]
        mul     rax
        mov     rbx,rbp                 ; mask into t6, overwriting 2*a.n[2]
        add     r8,rax
        adc     r9,rdx
        and     rbx,r8                  ; t6 (only the low qword of c matters)
        lea     rax,[2*rcx]             ; rax = 2*a.n[3], set up for the next
                                        ; column without touching rcx or flags
        shrd    r8,r9,52
        xor     r9,r9

        ;; c += 2*a.n[3]*a.n[4]
        mul     rsi                     ; rax still holds 2*a.n[3]
        mov     rcx,rbp                 ; mask into t7, overwriting a.n[3]
        add     r8,rax
        adc     r9,rdx
        and     rcx,r8                  ; t7 (only the low qword of c matters)
        shrd    r8,r9,52
        xor     r9,r9

        ;; c += a.n[4]*a.n[4]
        mov     rax,rsi
        mul     rax
        add     r8,rax
        adc     r9,rdx
        and     rbp,r8                  ; t8: rbp already holds the mask, so it
                                        ; is masked in place
        shrd    r8,r9,52
        xor     r9,r9
        mov     rsi,r8                  ; t9 = remaining c, overwriting a.n[4]

        ;; *******************************************************
        ;; Registers and stack now match what common_exit_norm expects.
        jmp     common_exit_norm