! Copyright (C) 2008-2012 Imagination Technologies Ltd.

        .text
        .global _memcpy
        .type   _memcpy,function
! D1Ar1 dst
! D0Ar2 src
! D1Ar3 cnt
! D0Re0 dst
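! Equivalent C prototype for reference (register mapping above):
!   void *memcpy(void *dst, const void *src, size_t cnt);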
_memcpy:
        CMP     D1Ar3, #16
        MOV     A1.2, D0Ar2             ! source pointer
        MOV     A0.2, D1Ar1             ! destination pointer
        MOV     A0.3, D1Ar1             ! for return value
! If there are fewer than 16 bytes to copy, use the byte copy loop
        BGE     $Llong_copy

$Lbyte_copy:
! Simply copy a byte at a time
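! TXRPT is the hardware repeat counter: SUBS below arms it with (cnt - 1)
! and each BR re-runs the loop body until the counter expires, i.e. cnt
! iterations in total. Roughly equivalent C (illustrative sketch):
!   while (cnt--)
!           *dst++ = *src++;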
        SUBS    TXRPT, D1Ar3, #1
        BLT     $Lend
$Lloop_byte:
        GETB    D1Re0, [A1.2++]
        SETB    [A0.2++], D1Re0
        BR      $Lloop_byte

$Lend:
! Finally, set the return value and return
        MOV     D0Re0, A0.3
        MOV     PC, D1RtP

$Llong_copy:
        ANDS    D1Ar5, D1Ar1, #7        ! test destination alignment
        BZ      $Laligned_dst

! The destination address is not 8 byte aligned. We will copy bytes from
! the source to the destination until the remaining data has an 8 byte
! destination address alignment (i.e. we should never copy more than 7
! bytes here).
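! Worked example (illustrative): for dst = 0x1003, D1Ar5 starts at 3 and
! 5 single bytes are copied before D1Ar5 reaches 8, leaving the
! destination on an 8 byte boundary.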
$Lalign_dst:
        GETB    D0Re0, [A1.2++]
        ADD     D1Ar5, D1Ar5, #1        ! dest is aligned when D1Ar5 reaches #8
        SUB     D1Ar3, D1Ar3, #1        ! decrement count of remaining bytes
        SETB    [A0.2++], D0Re0
        CMP     D1Ar5, #8
        BNZ     $Lalign_dst

! We have at least (16 - 7) = 9 bytes to copy - calculate the number of 8 byte
! blocks, then jump to the unaligned copy loop or fall through to the aligned
! copy loop as appropriate.
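! Roughly equivalent C for the setup below (illustrative sketch):
!   blocks = cnt >> 3;              /* number of 8 byte blocks */
!   if (src & 7)
!           goto unaligned_copy;    /* source not 8 byte aligned */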
$Laligned_dst:
        MOV     D0Ar4, A1.2
        LSR     D1Ar5, D1Ar3, #3        ! D1Ar5 = number of 8 byte blocks
        ANDS    D0Ar4, D0Ar4, #7        ! test source alignment
        BNZ     $Lunaligned_copy        ! if unaligned, use unaligned copy loop

! Both source and destination are 8 byte aligned - the easy case.
$Laligned_copy:
        LSRS    D1Ar5, D1Ar3, #5        ! D1Ar5 = number of 32 byte blocks
        BZ      $Lbyte_copy
        SUB     TXRPT, D1Ar5, #1

$Laligned_32:
        GETL    D0Re0, D1Re0, [A1.2++]
        GETL    D0Ar6, D1Ar5, [A1.2++]
        SETL    [A0.2++], D0Re0, D1Re0
        SETL    [A0.2++], D0Ar6, D1Ar5
        GETL    D0Re0, D1Re0, [A1.2++]
        GETL    D0Ar6, D1Ar5, [A1.2++]
        SETL    [A0.2++], D0Re0, D1Re0
        SETL    [A0.2++], D0Ar6, D1Ar5
        BR      $Laligned_32

! If there are any remaining bytes, use the byte copy loop; otherwise we
! are done
        ANDS    D1Ar3, D1Ar3, #0x1f
        BNZ     $Lbyte_copy
        B       $Lend
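! Each GETL/SETL pair above moves 8 bytes, so one pass of $Laligned_32
! moves 32 bytes. Roughly equivalent C (illustrative sketch, 64-bit
! accesses):
!   for (i = 0; i < cnt >> 5; i++) {
!           *dst64++ = *src64++;
!           *dst64++ = *src64++;
!           *dst64++ = *src64++;
!           *dst64++ = *src64++;
!   }
!   cnt &= 0x1f;    /* 0-31 bytes left over */
! e.g. cnt = 100 runs 3 passes (96 bytes) and leaves 4 bytes for the
! byte copy loop.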

! The destination is 8 byte aligned but the source is not, and there are 8
! or more bytes to be copied.
$Lunaligned_copy:
! Adjust the source pointer (A1.2) to the 8 byte boundary before its
! current value
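! ANDMB applies the 16-bit immediate mask to the bottom half-word (the
! top half is unchanged), so #0xfff8 clears the low 3 address bits.
! Roughly equivalent C (illustrative sketch):
!   misalign = src & 7;     /* kept in D0Ar4 for the final fix-up */
!   src     &= ~7;          /* round down to the 8 byte boundary */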
        MOV     D0Ar4, A1.2
        MOV     D0Ar6, A1.2
        ANDMB   D0Ar4, D0Ar4, #0xfff8
        MOV     A1.2, D0Ar4
! Save the number of bytes of mis-alignment in D0Ar4 for use later
        SUBS    D0Ar6, D0Ar6, D0Ar4
        MOV     D0Ar4, D0Ar6
! If there is no mis-alignment after all, use the aligned copy loop
        BZ      $Laligned_copy

! prefetch 8 bytes
        GETL    D0Re0, D1Re0, [A1.2]

        SUB     TXRPT, D1Ar5, #1

! There are 3 mis-alignment cases to be considered: less than 4 bytes,
! exactly 4 bytes, and more than 4 bytes.
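! GETL reads an 8 byte block as two 32-bit halves, so a 4 byte
! mis-alignment needs only a register shuffle, while the other cases
! need cross-word shifts.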
        CMP     D0Ar6, #4
        BLT     $Lunaligned_1_2_3       ! use 1-3 byte mis-alignment loop
        BZ      $Lunaligned_4           ! use 4 byte mis-alignment loop

! The mis-alignment is more than 4 bytes
$Lunaligned_5_6_7:
        SUB     D0Ar6, D0Ar6, #4
! Calculate the bit offsets required for the shift operations necessary
! to align the data.
! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
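! Worked example (illustrative): a 6 byte mis-alignment gives
! D0Ar6 = (6 - 4) * 8 = 16 and D1Ar5 = 32 - 16 = 16.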
        MULW    D0Ar6, D0Ar6, #8
        MOV     D1Ar5, #32
        SUB     D1Ar5, D1Ar5, D0Ar6
! Move data 4 bytes before we enter the main loop
        MOV     D0Re0, D1Re0

$Lloop_5_6_7:
        GETL    D0Ar2, D1Ar1, [++A1.2]
! form 64-bit data in D0Re0, D1Re0
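! Roughly, with off = (misalign - 4) * 8 (illustrative C; the ADDs act
! as ORs because the shifted fields never overlap):
!   out.lo = (carry >> off) | (in.lo << (32 - off));
!   out.hi = (in.lo >> off) | (in.hi << (32 - off));
!   carry  = in.hi;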
        LSR     D0Re0, D0Re0, D0Ar6
        MOV     D1Re0, D0Ar2
        LSL     D1Re0, D1Re0, D1Ar5
        ADD     D0Re0, D0Re0, D1Re0

        LSR     D0Ar2, D0Ar2, D0Ar6
        LSL     D1Re0, D1Ar1, D1Ar5
        ADD     D1Re0, D1Re0, D0Ar2

        SETL    [A0.2++], D0Re0, D1Re0
        MOV     D0Re0, D1Ar1
        BR      $Lloop_5_6_7

        B       $Lunaligned_end

$Lunaligned_1_2_3:
! Calculate the bit offsets required for the shift operations necessary
! to align the data.
! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
        MULW    D0Ar6, D0Ar6, #8
        MOV     D1Ar5, #32
        SUB     D1Ar5, D1Ar5, D0Ar6

$Lloop_1_2_3:
! form 64-bit data in D0Re0, D1Re0
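! Roughly, with off = misalign * 8 (illustrative C):
!   out.lo = (cur.lo >> off) | (cur.hi << (32 - off));
!   out.hi = (cur.hi >> off) | (next.lo << (32 - off));
!   cur    = next;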
        LSR     D0Re0, D0Re0, D0Ar6
        LSL     D1Ar1, D1Re0, D1Ar5
        ADD     D0Re0, D0Re0, D1Ar1
        MOV     D0Ar2, D1Re0
        LSR     D0FrT, D0Ar2, D0Ar6
        GETL    D0Ar2, D1Ar1, [++A1.2]

        MOV     D1Re0, D0Ar2
        LSL     D1Re0, D1Re0, D1Ar5
        ADD     D1Re0, D1Re0, D0FrT

        SETL    [A0.2++], D0Re0, D1Re0
        MOV     D0Re0, D0Ar2
        MOV     D1Re0, D1Ar1
        BR      $Lloop_1_2_3

        B       $Lunaligned_end

! The 4 byte mis-alignment case - this does not require any shifting, just a
! shuffling of registers.
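! Roughly (illustrative C):
!   out.lo = cur.hi;
!   out.hi = next.lo;
!   cur.hi = next.hi;       /* carried into the next pass */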
$Lunaligned_4:
        MOV     D0Re0, D1Re0
$Lloop_4:
        GETL    D0Ar2, D1Ar1, [++A1.2]
        MOV     D1Re0, D0Ar2
        SETL    [A0.2++], D0Re0, D1Re0
        MOV     D0Re0, D1Ar1
        BR      $Lloop_4

$Lunaligned_end:
! If there are no remaining bytes to copy, we are done.
        ANDS    D1Ar3, D1Ar3, #7
        BZ      $Lend
! Re-adjust the source pointer (A1.2) back to the actual (unaligned) byte
! address of the remaining bytes, and fall through to the byte copy loop.
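! After the copy loop A1.2 is back on an 8 byte boundary; adding the
! mis-alignment saved in D0Ar4 yields the address of the first uncopied
! source byte.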
        MOV     D0Ar6, A1.2
        ADD     D1Ar5, D0Ar4, D0Ar6
        MOV     A1.2, D1Ar5
        B       $Lbyte_copy

        .size _memcpy,.-_memcpy