! Copyright (C) 2008-2012 Imagination Technologies Ltd.

| 3 | .text |
| 4 | .global _memcpy |
| 5 | .type _memcpy,function |
| 6 | ! D1Ar1 dst |
| 7 | ! D0Ar2 src |
| 8 | ! D1Ar3 cnt |
| 9 | ! D0Re0 dst |
| 10 | _memcpy: |
| 11 | CMP D1Ar3, #16 |
| 12 | MOV A1.2, D0Ar2 ! source pointer |
| 13 | MOV A0.2, D1Ar1 ! destination pointer |
| 14 | MOV A0.3, D1Ar1 ! for return value |
| 15 | ! If there are less than 16 bytes to copy use the byte copy loop |
| 16 | BGE $Llong_copy |
| 17 | |
| 18 | $Lbyte_copy: |
| 19 | ! Simply copy a byte at a time |
| 20 | SUBS TXRPT, D1Ar3, #1 |
| 21 | BLT $Lend |
| 22 | $Lloop_byte: |
| 23 | GETB D1Re0, [A1.2++] |
| 24 | SETB [A0.2++], D1Re0 |
| 25 | BR $Lloop_byte |
| 26 | |
| 27 | $Lend: |
| 28 | ! Finally set return value and return |
| 29 | MOV D0Re0, A0.3 |
| 30 | MOV PC, D1RtP |
| 31 | |
| 32 | $Llong_copy: |
| 33 | ANDS D1Ar5, D1Ar1, #7 ! test destination alignment |
| 34 | BZ $Laligned_dst |
| 35 | |
| 36 | ! The destination address is not 8 byte aligned. We will copy bytes from |
| 37 | ! the source to the destination until the remaining data has an 8 byte |
| 38 | ! destination address alignment (i.e we should never copy more than 7 |
| 39 | ! bytes here). |
| 40 | $Lalign_dst: |
| 41 | GETB D0Re0, [A1.2++] |
| 42 | ADD D1Ar5, D1Ar5, #1 ! dest is aligned when D1Ar5 reaches #8 |
| 43 | SUB D1Ar3, D1Ar3, #1 ! decrement count of remaining bytes |
| 44 | SETB [A0.2++], D0Re0 |
| 45 | CMP D1Ar5, #8 |
| 46 | BNE $Lalign_dst |
| 47 | |
| 48 | ! We have at least (16 - 7) = 9 bytes to copy - calculate the number of 8 byte |
| 49 | ! blocks, then jump to the unaligned copy loop or fall through to the aligned |
| 50 | ! copy loop as appropriate. |
| 51 | $Laligned_dst: |
| 52 | MOV D0Ar4, A1.2 |
| 53 | LSR D1Ar5, D1Ar3, #3 ! D1Ar5 = number of 8 byte blocks |
| 54 | ANDS D0Ar4, D0Ar4, #7 ! test source alignment |
| 55 | BNZ $Lunaligned_copy ! if unaligned, use unaligned copy loop |
| 56 | |
| 57 | ! Both source and destination are 8 byte aligned - the easy case. |
| 58 | $Laligned_copy: |
| 59 | LSRS D1Ar5, D1Ar3, #5 ! D1Ar5 = number of 32 byte blocks |
| 60 | BZ $Lbyte_copy |
| 61 | SUB TXRPT, D1Ar5, #1 |
| 62 | |
| 63 | $Laligned_32: |
| 64 | GETL D0Re0, D1Re0, [A1.2++] |
| 65 | GETL D0Ar6, D1Ar5, [A1.2++] |
| 66 | SETL [A0.2++], D0Re0, D1Re0 |
| 67 | SETL [A0.2++], D0Ar6, D1Ar5 |
| 68 | GETL D0Re0, D1Re0, [A1.2++] |
| 69 | GETL D0Ar6, D1Ar5, [A1.2++] |
| 70 | SETL [A0.2++], D0Re0, D1Re0 |
| 71 | SETL [A0.2++], D0Ar6, D1Ar5 |
| 72 | BR $Laligned_32 |
| 73 | |
| 74 | ! If there are any remaining bytes use the byte copy loop, otherwise we are done |
| 75 | ANDS D1Ar3, D1Ar3, #0x1f |
| 76 | BNZ $Lbyte_copy |
| 77 | B $Lend |
| 78 | |
| 79 | ! The destination is 8 byte aligned but the source is not, and there are 8 |
| 80 | ! or more bytes to be copied. |
| 81 | $Lunaligned_copy: |
| 82 | ! Adjust the source pointer (A1.2) to the 8 byte boundary before its |
| 83 | ! current value |
| 84 | MOV D0Ar4, A1.2 |
| 85 | MOV D0Ar6, A1.2 |
| 86 | ANDMB D0Ar4, D0Ar4, #0xfff8 |
| 87 | MOV A1.2, D0Ar4 |
| 88 | ! Save the number of bytes of mis-alignment in D0Ar4 for use later |
| 89 | SUBS D0Ar6, D0Ar6, D0Ar4 |
| 90 | MOV D0Ar4, D0Ar6 |
| 91 | ! if there is no mis-alignment after all, use the aligned copy loop |
| 92 | BZ $Laligned_copy |
| 93 | |
| 94 | ! prefetch 8 bytes |
| 95 | GETL D0Re0, D1Re0, [A1.2] |
| 96 | |
| 97 | SUB TXRPT, D1Ar5, #1 |
| 98 | |
| 99 | ! There are 3 mis-alignment cases to be considered. Less than 4 bytes, exactly |
| 100 | ! 4 bytes, and more than 4 bytes. |
| 101 | CMP D0Ar6, #4 |
| 102 | BLT $Lunaligned_1_2_3 ! use 1-3 byte mis-alignment loop |
| 103 | BZ $Lunaligned_4 ! use 4 byte mis-alignment loop |
| 104 | |
| 105 | ! The mis-alignment is more than 4 bytes |
| 106 | $Lunaligned_5_6_7: |
| 107 | SUB D0Ar6, D0Ar6, #4 |
| 108 | ! Calculate the bit offsets required for the shift operations necesssary |
| 109 | ! to align the data. |
| 110 | ! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset) |
| 111 | MULW D0Ar6, D0Ar6, #8 |
| 112 | MOV D1Ar5, #32 |
| 113 | SUB D1Ar5, D1Ar5, D0Ar6 |
| 114 | ! Move data 4 bytes before we enter the main loop |
| 115 | MOV D0Re0, D1Re0 |
| 116 | |
| 117 | $Lloop_5_6_7: |
| 118 | GETL D0Ar2, D1Ar1, [++A1.2] |
| 119 | ! form 64-bit data in D0Re0, D1Re0 |
| 120 | LSR D0Re0, D0Re0, D0Ar6 |
| 121 | MOV D1Re0, D0Ar2 |
| 122 | LSL D1Re0, D1Re0, D1Ar5 |
| 123 | ADD D0Re0, D0Re0, D1Re0 |
| 124 | |
| 125 | LSR D0Ar2, D0Ar2, D0Ar6 |
| 126 | LSL D1Re0, D1Ar1, D1Ar5 |
| 127 | ADD D1Re0, D1Re0, D0Ar2 |
| 128 | |
| 129 | SETL [A0.2++], D0Re0, D1Re0 |
| 130 | MOV D0Re0, D1Ar1 |
| 131 | BR $Lloop_5_6_7 |
| 132 | |
| 133 | B $Lunaligned_end |
| 134 | |
| 135 | $Lunaligned_1_2_3: |
| 136 | ! Calculate the bit offsets required for the shift operations necesssary |
| 137 | ! to align the data. |
| 138 | ! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset) |
| 139 | MULW D0Ar6, D0Ar6, #8 |
| 140 | MOV D1Ar5, #32 |
| 141 | SUB D1Ar5, D1Ar5, D0Ar6 |
| 142 | |
| 143 | $Lloop_1_2_3: |
| 144 | ! form 64-bit data in D0Re0,D1Re0 |
| 145 | LSR D0Re0, D0Re0, D0Ar6 |
| 146 | LSL D1Ar1, D1Re0, D1Ar5 |
| 147 | ADD D0Re0, D0Re0, D1Ar1 |
| 148 | MOV D0Ar2, D1Re0 |
| 149 | LSR D0FrT, D0Ar2, D0Ar6 |
| 150 | GETL D0Ar2, D1Ar1, [++A1.2] |
| 151 | |
| 152 | MOV D1Re0, D0Ar2 |
| 153 | LSL D1Re0, D1Re0, D1Ar5 |
| 154 | ADD D1Re0, D1Re0, D0FrT |
| 155 | |
| 156 | SETL [A0.2++], D0Re0, D1Re0 |
| 157 | MOV D0Re0, D0Ar2 |
| 158 | MOV D1Re0, D1Ar1 |
| 159 | BR $Lloop_1_2_3 |
| 160 | |
| 161 | B $Lunaligned_end |
| 162 | |
| 163 | ! The 4 byte mis-alignment case - this does not require any shifting, just a |
| 164 | ! shuffling of registers. |
| 165 | $Lunaligned_4: |
| 166 | MOV D0Re0, D1Re0 |
| 167 | $Lloop_4: |
| 168 | GETL D0Ar2, D1Ar1, [++A1.2] |
| 169 | MOV D1Re0, D0Ar2 |
| 170 | SETL [A0.2++], D0Re0, D1Re0 |
| 171 | MOV D0Re0, D1Ar1 |
| 172 | BR $Lloop_4 |
| 173 | |
| 174 | $Lunaligned_end: |
| 175 | ! If there are no remaining bytes to copy, we are done. |
| 176 | ANDS D1Ar3, D1Ar3, #7 |
| 177 | BZ $Lend |
| 178 | ! Re-adjust the source pointer (A1.2) back to the actual (unaligned) byte |
| 179 | ! address of the remaining bytes, and fall through to the byte copy loop. |
| 180 | MOV D0Ar6, A1.2 |
| 181 | ADD D1Ar5, D0Ar4, D0Ar6 |
| 182 | MOV A1.2, D1Ar5 |
| 183 | B $Lbyte_copy |
| 184 | |
| 185 | .size _memcpy,.-_memcpy |