/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
 */

#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/asm.h>

/*
 * copy_user_nocache - Uncached memory copy with exception handling
 *
 * This copies from user space into kernel space, but the kernel
 * space accesses can take a machine check exception, so they too
 * need exception handling.
 *
 * Note: only 32-bit and 64-bit stores have non-temporal versions,
 * and we only use naturally aligned ones. Any 1- or 2-byte parts
 * at the start or end of the copy are done with normal cached
 * stores.
 *
 * Input:
 * rdi destination
 * rsi source
 * edx count
 *
 * Output:
 * rax uncopied bytes or 0 if successful.
 */
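/*
 * From C, callers see this as roughly the following (shown for
 * illustration only; the exact declaration lives in the kernel's
 * uaccess headers):
 *
 *	long __copy_user_nocache(void *dst, const void __user *src, unsigned size);
 *
 * where the return value is the number of bytes left uncopied.
 */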
SYM_FUNC_START(__copy_user_nocache)
	/* If the destination is not 8-byte aligned, we'll have to align it */
	testb $7,%dil
	jne .Lalign

.Lis_aligned:
	cmp $64,%edx
	jb .Lquadwords

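	/*
	 * Main unrolled loop: move 64 bytes per iteration, using
	 * cached loads from the user-space source and non-temporal
	 * stores to the now 8-byte-aligned destination.
	 */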
	.p2align 4,0x90
.Lunrolled:
10:	movq (%rsi),%r8
11:	movq 8(%rsi),%r9
12:	movq 16(%rsi),%r10
13:	movq 24(%rsi),%r11
20:	movnti %r8,(%rdi)
21:	movnti %r9,8(%rdi)
22:	movnti %r10,16(%rdi)
23:	movnti %r11,24(%rdi)
30:	movq 32(%rsi),%r8
31:	movq 40(%rsi),%r9
32:	movq 48(%rsi),%r10
33:	movq 56(%rsi),%r11
40:	movnti %r8,32(%rdi)
41:	movnti %r9,40(%rdi)
42:	movnti %r10,48(%rdi)
43:	movnti %r11,56(%rdi)

	addq $64,%rsi
	addq $64,%rdi
	sub $64,%edx
	cmp $64,%edx
	jae .Lunrolled

	/*
	 * The first set of user-mode loads has been done
	 * without any stores, so if one of them faults we
	 * can just fall back to the non-unrolled loop.
	 */
	_ASM_EXTABLE_UA(10b, .Lquadwords)
	_ASM_EXTABLE_UA(11b, .Lquadwords)
	_ASM_EXTABLE_UA(12b, .Lquadwords)
	_ASM_EXTABLE_UA(13b, .Lquadwords)

	/*
	 * The second set of user-mode loads has been done
	 * with 32 bytes already stored to the destination,
	 * so we need to take that into account before
	 * falling back to the non-unrolled loop.
	 */
	_ASM_EXTABLE_UA(30b, .Lfixup32)
	_ASM_EXTABLE_UA(31b, .Lfixup32)
	_ASM_EXTABLE_UA(32b, .Lfixup32)
	_ASM_EXTABLE_UA(33b, .Lfixup32)

	/*
	 * An exception on a write means that we're
	 * done, but we need to update the count
	 * depending on where in the unrolled loop
	 * we were.
	 */
	_ASM_EXTABLE_UA(20b, .Ldone0)
	_ASM_EXTABLE_UA(21b, .Ldone8)
	_ASM_EXTABLE_UA(22b, .Ldone16)
	_ASM_EXTABLE_UA(23b, .Ldone24)
	_ASM_EXTABLE_UA(40b, .Ldone32)
	_ASM_EXTABLE_UA(41b, .Ldone40)
	_ASM_EXTABLE_UA(42b, .Ldone48)
	_ASM_EXTABLE_UA(43b, .Ldone56)

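	/*
	 * Non-unrolled loop: copy the remaining full quadwords one
	 * at a time, still using non-temporal stores. This is also
	 * the fallback target when the first set of unrolled loads
	 * faults.
	 */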
.Lquadwords:
	cmp $8,%edx
	jb .Llong
50:	movq (%rsi),%rax
51:	movnti %rax,(%rdi)
	addq $8,%rsi
	addq $8,%rdi
	sub $8,%edx
	jmp .Lquadwords

	/*
	 * If we fail on the last full quadword, we will
	 * not try to do any byte-wise cached accesses.
	 * We will try one final 4-byte uncached copy,
	 * though.
	 */
	_ASM_EXTABLE_UA(50b, .Llast4)
	_ASM_EXTABLE_UA(51b, .Ldone0)

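	/*
	 * Tail: fewer than 8 bytes remain. A 4-byte remainder still
	 * gets a non-temporal store; the final 2- and 1-byte pieces
	 * use regular cached stores, as noted at the top.
	 */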
.Llong:
	test $4,%dl
	je .Lword
60:	movl (%rsi),%eax
61:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
.Lword:
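	/*
	 * Only regular cached stores remain after this point, so
	 * this is where we fence the weakly-ordered non-temporal
	 * stores done above, before finishing the tail and
	 * returning.
	 */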
	sfence
	test $2,%dl
	je .Lbyte
70:	movw (%rsi),%ax
71:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lbyte:
	test $1,%dl
	je .Ldone
80:	movb (%rsi),%al
81:	movb %al,(%rdi)
	dec %edx
.Ldone:
	mov %edx,%eax
	RET

	/*
	 * If we fault on the small tail accesses (the final
	 * 4-, 2- or 1-byte copies), we won't bother with any
	 * fixups. It's dead, Jim. Note that there's no need
	 * for an 'sfence' for any of this, since the exception
	 * will have been serializing.
	 */
	_ASM_EXTABLE_UA(60b, .Ldone)
	_ASM_EXTABLE_UA(61b, .Ldone)
	_ASM_EXTABLE_UA(70b, .Ldone)
	_ASM_EXTABLE_UA(71b, .Ldone)
	_ASM_EXTABLE_UA(80b, .Ldone)
	_ASM_EXTABLE_UA(81b, .Ldone)

	/*
	 * This is the "head needs aligning" case when
	 * the destination isn't 8-byte aligned. The
	 * 4-byte case can be done uncached, but any
	 * smaller alignment is done with regular
	 * cached stores.
	 */
.Lalign:
	test $1,%dil
	je .Lalign_word
	test %edx,%edx
	je .Ldone
90:	movb (%rsi),%al
91:	movb %al,(%rdi)
	inc %rsi
	inc %rdi
	dec %edx
.Lalign_word:
	test $2,%dil
	je .Lalign_long
	cmp $2,%edx
	jb .Lbyte
92:	movw (%rsi),%ax
93:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lalign_long:
	test $4,%dil
	je .Lis_aligned
	cmp $4,%edx
	jb .Lword
94:	movl (%rsi),%eax
95:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
	jmp .Lis_aligned

	/*
	 * If we fail on the initial alignment accesses,
	 * we're all done. Again, no point in trying to
	 * do byte-by-byte probing if the 4-byte load
	 * fails - we're not doing any uncached accesses
	 * any more.
	 */
	_ASM_EXTABLE_UA(90b, .Ldone)
	_ASM_EXTABLE_UA(91b, .Ldone)
	_ASM_EXTABLE_UA(92b, .Ldone)
	_ASM_EXTABLE_UA(93b, .Ldone)
	_ASM_EXTABLE_UA(94b, .Ldone)
	_ASM_EXTABLE_UA(95b, .Ldone)

	/*
	 * Exception table fixups for write faults in the middle of
	 * the unrolled loop: .LdoneN means that N bytes of the
	 * current 64-byte iteration had already been stored when
	 * the fault hit, so the fall-through chain below subtracts
	 * them from the count before returning the number of
	 * uncopied bytes.
	 */
.Ldone56: sub $8,%edx
.Ldone48: sub $8,%edx
.Ldone40: sub $8,%edx
.Ldone32: sub $8,%edx
.Ldone24: sub $8,%edx
.Ldone16: sub $8,%edx
.Ldone8: sub $8,%edx
.Ldone0:
	mov %edx,%eax
	RET

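	/*
	 * A fault in the second set of unrolled loads: the first
	 * 32 bytes of this iteration were already stored, so adjust
	 * the pointers and the count and continue with the
	 * non-unrolled loop.
	 */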
.Lfixup32:
	addq $32,%rsi
	addq $32,%rdi
	sub $32,%edx
	jmp .Lquadwords

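	/*
	 * The full-quadword user load faulted: try one last 4-byte
	 * uncached copy before giving up.
	 */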
.Llast4:
52:	movl (%rsi),%eax
53:	movnti %eax,(%rdi)
	sfence
	sub $4,%edx
	mov %edx,%eax
	RET
	_ASM_EXTABLE_UA(52b, .Ldone0)
	_ASM_EXTABLE_UA(53b, .Ldone0)

SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)