123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464 |
- ;; Added by Diederik Huys, March 2013
- ;;
- ;; Provided public procedures:
- ;; secp256k1_fe_mul_inner
- ;; secp256k1_fe_sqr_inner
- ;;
- ;; Needed tools: YASM (http://yasm.tortall.net)
- ;;
- ;;
- BITS 64
- ;; Procedure ExSetMult
- ;; Register Layout:
- ;; INPUT: rdi = a->n
- ;; rsi = b->n
- ;; rdx = r->a
- ;;
- ;; INTERNAL: rdx:rax = multiplication accumulator
- ;; r9:r8 = c
- ;; r10-r13 = t0-t3
- ;; r14 = b.n[0] / t4
- ;; r15 = b.n[1] / t5
- ;; rbx = b.n[2] / t6
- ;; rcx = b.n[3] / t7
- ;; rbp = Constant 0FFFFFFFFFFFFFh / t8
- ;; rsi = b.n / b.n[4] / t9
- GLOBAL secp256k1_fe_mul_inner
- ALIGN 32
- secp256k1_fe_mul_inner:
- push rbp
- push rbx
- push r12
- push r13
- push r14
- push r15
- push rdx
- mov r14,[rsi+8*0] ; preload b.n[0]. This will be the case until
- ; b.n[0] is no longer needed, then we reassign
- ; r14 to t4
- ;; c=a.n[0] * b.n[0]
- mov rax,[rdi+0*8] ; load a.n[0]
- mov rbp,0FFFFFFFFFFFFFh
- mul r14 ; rdx:rax=a.n[0]*b.n[0]
- mov r15,[rsi+1*8]
- mov r10,rbp ; load modulus into target register for t0
- mov r8,rax
- and r10,rax ; only need lower qword of c
- shrd r8,rdx,52
- xor r9,r9 ; c < 2^64, so we ditch the HO part
- ;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0]
- mov rax,[rdi+0*8]
- mul r15
- add r8,rax
- adc r9,rdx
- mov rax,[rdi+1*8]
- mul r14
- mov r11,rbp
- mov rbx,[rsi+2*8]
- add r8,rax
- adc r9,rdx
- and r11,r8
- shrd r8,r9,52
- xor r9,r9
-
- ;; c+=a.n[0 1 2] * b.n[2 1 0]
- mov rax,[rdi+0*8]
- mul rbx
- add r8,rax
- adc r9,rdx
- mov rax,[rdi+1*8]
- mul r15
- add r8,rax
- adc r9,rdx
- mov rax,[rdi+2*8]
- mul r14
- mov r12,rbp
- mov rcx,[rsi+3*8]
- add r8,rax
- adc r9,rdx
- and r12,r8
- shrd r8,r9,52
- xor r9,r9
- ;; c+=a.n[0 1 2 3] * b.n[3 2 1 0]
- mov rax,[rdi+0*8]
- mul rcx
- add r8,rax
- adc r9,rdx
- mov rax,[rdi+1*8]
- mul rbx
- add r8,rax
- adc r9,rdx
- mov rax,[rdi+2*8]
- mul r15
- add r8,rax
- adc r9,rdx
-
- mov rax,[rdi+3*8]
- mul r14
- mov r13,rbp
- mov rsi,[rsi+4*8] ; load b.n[4] and destroy pointer
- add r8,rax
- adc r9,rdx
- and r13,r8
- shrd r8,r9,52
- xor r9,r9
- ;; c+=a.n[0 1 2 3 4] * b.n[4 3 2 1 0]
- mov rax,[rdi+0*8]
- mul rsi
- add r8,rax
- adc r9,rdx
- mov rax,[rdi+1*8]
- mul rcx
- add r8,rax
- adc r9,rdx
- mov rax,[rdi+2*8]
- mul rbx
- add r8,rax
- adc r9,rdx
- mov rax,[rdi+3*8]
- mul r15
- add r8,rax
- adc r9,rdx
- mov rax,[rdi+4*8]
- mul r14
- mov r14,rbp ; load modulus into t4 and destroy a.n[0]
- add r8,rax
- adc r9,rdx
- and r14,r8
- shrd r8,r9,52
- xor r9,r9
- ;; c+=a.n[1 2 3 4] * b.n[4 3 2 1]
- mov rax,[rdi+1*8]
- mul rsi
- add r8,rax
- adc r9,rdx
- mov rax,[rdi+2*8]
- mul rcx
- add r8,rax
- adc r9,rdx
- mov rax,[rdi+3*8]
- mul rbx
- add r8,rax
- adc r9,rdx
- mov rax,[rdi+4*8]
- mul r15
- mov r15,rbp
- add r8,rax
- adc r9,rdx
- and r15,r8
- shrd r8,r9,52
- xor r9,r9
- ;; c+=a.n[2 3 4] * b.n[4 3 2]
- mov rax,[rdi+2*8]
- mul rsi
- add r8,rax
- adc r9,rdx
- mov rax,[rdi+3*8]
- mul rcx
- add r8,rax
- adc r9,rdx
- mov rax,[rdi+4*8]
- mul rbx
- mov rbx,rbp
- add r8,rax
- adc r9,rdx
- and rbx,r8
- shrd r8,r9,52
- xor r9,r9
- ;; c+=a.n[3 4] * b.n[4 3]
- mov rax,[rdi+3*8]
- mul rsi
- add r8,rax
- adc r9,rdx
- mov rax,[rdi+4*8]
- mul rcx
- mov rcx,rbp
- add r8,rax
- adc r9,rdx
- and rcx,r8
- shrd r8,r9,52
- xor r9,r9
- ;; c+=a.n[4] * b.n[4]
- mov rax,[rdi+4*8]
- mul rsi
- ;; mov rbp,rbp ; modulus already there!
- add r8,rax
- adc r9,rdx
- and rbp,r8
- shrd r8,r9,52
- xor r9,r9
- mov rsi,r8 ; load c into t9 and destroy b.n[4]
- ;; *******************************************************
- common_exit_norm:
- mov rdi,01000003D10h ; load constant
- mov rax,r15 ; get t5
- mul rdi
- add rax,r10 ; +t0
- adc rdx,0
- mov r10,0FFFFFFFFFFFFFh ; modulus. Sadly, we ran out of registers!
- mov r8,rax ; +c
- and r10,rax
- shrd r8,rdx,52
- xor r9,r9
- mov rax,rbx ; get t6
- mul rdi
- add rax,r11 ; +t1
- adc rdx,0
- mov r11,0FFFFFFFFFFFFFh ; modulus
- add r8,rax ; +c
- adc r9,rdx
- and r11,r8
- shrd r8,r9,52
- xor r9,r9
- mov rax,rcx ; get t7
- mul rdi
- add rax,r12 ; +t2
- adc rdx,0
- pop rbx ; retrieve pointer to this.n
- mov r12,0FFFFFFFFFFFFFh ; modulus
- add r8,rax ; +c
- adc r9,rdx
- and r12,r8
- mov [rbx+2*8],r12 ; mov into this.n[2]
- shrd r8,r9,52
- xor r9,r9
-
- mov rax,rbp ; get t8
- mul rdi
- add rax,r13 ; +t3
- adc rdx,0
- mov r13,0FFFFFFFFFFFFFh ; modulus
- add r8,rax ; +c
- adc r9,rdx
- and r13,r8
- mov [rbx+3*8],r13 ; -> this.n[3]
- shrd r8,r9,52
- xor r9,r9
-
- mov rax,rsi ; get t9
- mul rdi
- add rax,r14 ; +t4
- adc rdx,0
- mov r14,0FFFFFFFFFFFFh ; !!!
- add r8,rax ; +c
- adc r9,rdx
- and r14,r8
- mov [rbx+4*8],r14 ; -> this.n[4]
- shrd r8,r9,48 ; !!!
- xor r9,r9
-
- mov rax,01000003D1h
- mul r8
- add rax,r10
- adc rdx,0
- mov r10,0FFFFFFFFFFFFFh ; modulus
- mov r8,rax
- and rax,r10
- shrd r8,rdx,52
- mov [rbx+0*8],rax ; -> this.n[0]
- add r8,r11
- mov [rbx+1*8],r8 ; -> this.n[1]
- pop r15
- pop r14
- pop r13
- pop r12
- pop rbx
- pop rbp
- ret
-
- ;; PROC ExSetSquare
- ;; Register Layout:
- ;; INPUT: rdi = a.n
- ;; rsi = this.a
- ;; INTERNAL: rdx:rax = multiplication accumulator
- ;; r9:r8 = c
- ;; r10-r13 = t0-t3
- ;; r14 = a.n[0] / t4
- ;; r15 = a.n[1] / t5
- ;; rbx = a.n[2] / t6
- ;; rcx = a.n[3] / t7
- ;; rbp = 0FFFFFFFFFFFFFh / t8
- ;; rsi = a.n[4] / t9
- GLOBAL secp256k1_fe_sqr_inner
- ALIGN 32
- secp256k1_fe_sqr_inner:
- push rbp
- push rbx
- push r12
- push r13
- push r14
- push r15
- push rsi
- mov rbp,0FFFFFFFFFFFFFh
-
- ;; c=a.n[0] * a.n[0]
- mov r14,[rdi+0*8] ; r14=a.n[0]
- mov r10,rbp ; modulus
- mov rax,r14
- mul rax
- mov r15,[rdi+1*8] ; a.n[1]
- add r14,r14 ; r14=2*a.n[0]
- mov r8,rax
- and r10,rax ; only need lower qword
- shrd r8,rdx,52
- xor r9,r9
- ;; c+=2*a.n[0] * a.n[1]
- mov rax,r14 ; r14=2*a.n[0]
- mul r15
- mov rbx,[rdi+2*8] ; rbx=a.n[2]
- mov r11,rbp ; modulus
- add r8,rax
- adc r9,rdx
- and r11,r8
- shrd r8,r9,52
- xor r9,r9
-
- ;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1]
- mov rax,r14
- mul rbx
- add r8,rax
- adc r9,rdx
- mov rax,r15
- mov r12,rbp ; modulus
- mul rax
- mov rcx,[rdi+3*8] ; rcx=a.n[3]
- add r15,r15 ; r15=a.n[1]*2
- add r8,rax
- adc r9,rdx
- and r12,r8 ; only need lower dword
- shrd r8,r9,52
- xor r9,r9
- ;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2]
- mov rax,r14
- mul rcx
- add r8,rax
- adc r9,rdx
- mov rax,r15 ; rax=2*a.n[1]
- mov r13,rbp ; modulus
- mul rbx
- mov rsi,[rdi+4*8] ; rsi=a.n[4]
- add r8,rax
- adc r9,rdx
- and r13,r8
- shrd r8,r9,52
- xor r9,r9
- ;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2]
- mov rax,r14 ; last time we need 2*a.n[0]
- mul rsi
- add r8,rax
- adc r9,rdx
- mov rax,r15
- mul rcx
- mov r14,rbp ; modulus
- add r8,rax
- adc r9,rdx
- mov rax,rbx
- mul rax
- add rbx,rbx ; rcx=2*a.n[2]
- add r8,rax
- adc r9,rdx
- and r14,r8
- shrd r8,r9,52
- xor r9,r9
- ;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3]
- mov rax,r15 ; last time we need 2*a.n[1]
- mul rsi
- add r8,rax
- adc r9,rdx
- mov rax,rbx
- mul rcx
- mov r15,rbp ; modulus
- add r8,rax
- adc r9,rdx
- and r15,r8
- shrd r8,r9,52
- xor r9,r9
- ;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3]
- mov rax,rbx ; last time we need 2*a.n[2]
- mul rsi
- add r8,rax
- adc r9,rdx
- mov rax,rcx ; a.n[3]
- mul rax
- mov rbx,rbp ; modulus
- add r8,rax
- adc r9,rdx
- and rbx,r8 ; only need lower dword
- lea rax,[2*rcx]
- shrd r8,r9,52
- xor r9,r9
- ;; c+=2*a.n[3]*a.n[4]
- mul rsi
- mov rcx,rbp ; modulus
- add r8,rax
- adc r9,rdx
- and rcx,r8 ; only need lower dword
- shrd r8,r9,52
- xor r9,r9
- ;; c+=a.n[4]*a.n[4]
- mov rax,rsi
- mul rax
- ;; mov rbp,rbp ; modulus is already there!
- add r8,rax
- adc r9,rdx
- and rbp,r8
- shrd r8,r9,52
- xor r9,r9
- mov rsi,r8
- ;; *******************************************************
- jmp common_exit_norm
- end
-
|