- /* memset.S: optimised assembly memset
- *
- * Copyright (C) 2003 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
- .text
- .p2align 4
- ###############################################################################
- #
- # void *memset(void *p, char ch, size_t count)
- #
- # - NOTE: must not use any stack. exception detection performs function return
- # to caller's fixup routine, aborting the remainder of the set
- # GR4, GR7, GR8, and GR11 must be managed
- #
- ###############################################################################
- .globl memset,__memset_end
- .type memset,@function
- memset:
- orcc.p gr10,gr0,gr5,icc3 ; GR5 = count
- andi gr9,#0xff,gr9
- or.p gr8,gr0,gr4 ; GR4 = address
- beqlr icc3,#0
- # conditionally write a byte to 2b-align the address
- setlos.p #1,gr6
- andicc gr4,#1,gr0,icc0
- ckne icc0,cc7
- cstb.p gr9,@(gr4,gr0) ,cc7,#1
- csubcc gr5,gr6,gr5 ,cc7,#1 ; also set ICC3
- cadd.p gr4,gr6,gr4 ,cc7,#1
- beqlr icc3,#0
- # conditionally write a word to 4b-align the address
- andicc.p gr4,#2,gr0,icc0
- subicc gr5,#2,gr0,icc1
- setlos.p #2,gr6
- ckne icc0,cc7
- slli.p gr9,#8,gr12 ; need to double up the pattern
- cknc icc1,cc5
- or.p gr9,gr12,gr12
- andcr cc7,cc5,cc7
- csth.p gr12,@(gr4,gr0) ,cc7,#1
- csubcc gr5,gr6,gr5 ,cc7,#1 ; also set ICC3
- cadd.p gr4,gr6,gr4 ,cc7,#1
- beqlr icc3,#0
- # conditionally write a dword to 8b-align the address
- andicc.p gr4,#4,gr0,icc0
- subicc gr5,#4,gr0,icc1
- setlos.p #4,gr6
- ckne icc0,cc7
- slli.p gr12,#16,gr13 ; need to quadruple-up the pattern
- cknc icc1,cc5
- or.p gr13,gr12,gr12
- andcr cc7,cc5,cc7
- cst.p gr12,@(gr4,gr0) ,cc7,#1
- csubcc gr5,gr6,gr5 ,cc7,#1 ; also set ICC3
- cadd.p gr4,gr6,gr4 ,cc7,#1
- beqlr icc3,#0
- or.p gr12,gr12,gr13 ; need to octuple-up the pattern
- # the address is now 8b-aligned - loop around writing 64b chunks
- setlos #8,gr7
- subi.p gr4,#8,gr4 ; store with update index does weird stuff
- setlos #64,gr6
- subicc gr5,#64,gr0,icc0
- 0: cknc icc0,cc7
- cstdu gr12,@(gr4,gr7) ,cc7,#1
- cstdu gr12,@(gr4,gr7) ,cc7,#1
- cstdu gr12,@(gr4,gr7) ,cc7,#1
- cstdu gr12,@(gr4,gr7) ,cc7,#1
- cstdu gr12,@(gr4,gr7) ,cc7,#1
- cstdu.p gr12,@(gr4,gr7) ,cc7,#1
- csubcc gr5,gr6,gr5 ,cc7,#1 ; also set ICC3
- cstdu.p gr12,@(gr4,gr7) ,cc7,#1
- subicc gr5,#64,gr0,icc0
- cstdu.p gr12,@(gr4,gr7) ,cc7,#1
- beqlr icc3,#0
- bnc icc0,#2,0b
- # now do 32-byte remnant
- subicc.p gr5,#32,gr0,icc0
- setlos #32,gr6
- cknc icc0,cc7
- cstdu.p gr12,@(gr4,gr7) ,cc7,#1
- csubcc gr5,gr6,gr5 ,cc7,#1 ; also set ICC3
- cstdu.p gr12,@(gr4,gr7) ,cc7,#1
- setlos #16,gr6
- cstdu.p gr12,@(gr4,gr7) ,cc7,#1
- subicc gr5,#16,gr0,icc0
- cstdu.p gr12,@(gr4,gr7) ,cc7,#1
- beqlr icc3,#0
- # now do 16-byte remnant
- cknc icc0,cc7
- cstdu.p gr12,@(gr4,gr7) ,cc7,#1
- csubcc gr5,gr6,gr5 ,cc7,#1 ; also set ICC3
- cstdu.p gr12,@(gr4,gr7) ,cc7,#1
- beqlr icc3,#0
- # now do 8-byte remnant
- subicc gr5,#8,gr0,icc1
- cknc icc1,cc7
- cstdu.p gr12,@(gr4,gr7) ,cc7,#1
- csubcc gr5,gr7,gr5 ,cc7,#1 ; also set ICC3
- setlos.p #4,gr7
- beqlr icc3,#0
- # now do 4-byte remnant
- subicc gr5,#4,gr0,icc0
- addi.p gr4,#4,gr4
- cknc icc0,cc7
- cstu.p gr12,@(gr4,gr7) ,cc7,#1
- csubcc gr5,gr7,gr5 ,cc7,#1 ; also set ICC3
- subicc.p gr5,#2,gr0,icc1
- beqlr icc3,#0
- # now do 2-byte remnant
- setlos #2,gr7
- addi.p gr4,#2,gr4
- cknc icc1,cc7
- csthu.p gr12,@(gr4,gr7) ,cc7,#1
- csubcc gr5,gr7,gr5 ,cc7,#1 ; also set ICC3
- subicc.p gr5,#1,gr0,icc0
- beqlr icc3,#0
- # now do 1-byte remnant
- setlos #0,gr7
- addi.p gr4,#2,gr4
- cknc icc0,cc7
- cstb.p gr12,@(gr4,gr0) ,cc7,#1
- bralr
- __memset_end:
- .size memset, __memset_end-memset
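For readers less familiar with FRV conditional instructions, here is a rough C sketch of the strategy the routine above implements: replicate the fill byte into a doubleword pattern, bring the destination up to 8-byte alignment with small conditional stores, then write the bulk with aligned doubleword stores before mopping up the tail. The function name and structure are illustrative only; the unrolled 64/32/16-byte stages of the assembly are collapsed into a single 8-byte loop here.

#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch, not the kernel code. */
static void *memset_sketch(void *p, int ch, size_t count)
{
	uint8_t *d = p;
	uint64_t pat = (uint8_t)ch;

	pat |= pat << 8;	/* double up the pattern */
	pat |= pat << 16;	/* quadruple up the pattern */
	pat |= pat << 32;	/* octuple up the pattern */

	/* conditionally write 1, 2 and 4 bytes to 8-byte-align the address */
	if (count >= 1 && ((uintptr_t)d & 1)) {
		*d = (uint8_t)pat;
		d += 1;
		count -= 1;
	}
	if (count >= 2 && ((uintptr_t)d & 2)) {
		*(uint16_t *)d = (uint16_t)pat;	/* address is 2-byte aligned here */
		d += 2;
		count -= 2;
	}
	if (count >= 4 && ((uintptr_t)d & 4)) {
		*(uint32_t *)d = (uint32_t)pat;	/* address is 4-byte aligned here */
		d += 4;
		count -= 4;
	}

	/* bulk of the set: aligned doubleword stores */
	while (count >= 8) {
		*(uint64_t *)d = pat;
		d += 8;
		count -= 8;
	}

	/* 4-, 2- and 1-byte remnants */
	if (count >= 4) {
		*(uint32_t *)d = (uint32_t)pat;
		d += 4;
		count -= 4;
	}
	if (count >= 2) {
		*(uint16_t *)d = (uint16_t)pat;
		d += 2;
		count -= 2;
	}
	if (count)
		*d = (uint8_t)pat;

	return p;
}

The assembly achieves the same effect without branching in the alignment steps by predicating the stores on CC7, which is why every conditional subtract also updates ICC3 so that "beqlr icc3,#0" can return as soon as the remaining count reaches zero.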
- ###############################################################################
- #
- # clear memory in userspace
- # - return the number of bytes that could not be cleared (0 on complete success)
- #
- # long __memset_user(void *p, size_t count)
- #
- ###############################################################################
- .globl __memset_user, __memset_user_error_lr, __memset_user_error_handler
- .type __memset_user,@function
- __memset_user:
- movsg lr,gr11
- # abuse memset to do the dirty work
- or.p gr9,gr9,gr10
- setlos #0,gr9
- call memset
- __memset_user_error_lr:
- jmpl.p @(gr11,gr0)
- setlos #0,gr8
- # deal with any exception generated by memset
- # GR4 - memset's address tracking pointer
- # GR7 - memset's step value (index register for store insns)
- # GR8 - memset's original start address
- # GR10 - memset's original count
- __memset_user_error_handler:
- add.p gr4,gr7,gr4
- add gr8,gr10,gr8
- jmpl.p @(gr11,gr0)
- sub gr8,gr4,gr8 ; we return the amount left uncleared
- .size __memset_user, .-__memset_user
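For reference, the fixup arithmetic above can be written out in C. The helper name and parameters below are purely illustrative; they correspond to the registers listed in the comment (GR4, GR7, GR8, GR10) at the moment a faulting store is aborted.

#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch of the __memset_user fixup, not kernel code. */
static long memset_user_remaining(uintptr_t addr, uintptr_t step,
				  uintptr_t start, size_t count)
{
	uintptr_t faulted_at = addr + step;	/* the faulting store addressed GR4 + GR7 */
	uintptr_t end = start + count;		/* one past the last byte to be cleared */

	return (long)(end - faulted_at);	/* bytes left uncleared */
}

A return value of 0 therefore means the whole range was cleared; the handler is only entered when one of memset's stores faults partway through.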