| #include "libcflat.h" |
| #include "smp.h" |
| #include "atomic.h" |
| #include "processor.h" |
| #include "kvmclock.h" |
| #include "asm/barrier.h" |
| |
/*
 * Branch-prediction hints for the compiler.  Each macro evaluates to 1 when
 * its argument is non-zero and to 0 otherwise; the second argument of
 * __builtin_expect() only states which of the two outcomes is expected.
 */
#define likely(x)	__builtin_expect((x) != 0, 1)
#define unlikely(x)	__builtin_expect((x) != 0, 0)
| |
| |
| struct pvclock_vcpu_time_info __attribute__((aligned(4))) hv_clock[MAX_CPU]; |
| struct pvclock_wall_clock wall_clock; |
| static unsigned char valid_flags = 0; |
| static atomic64_t last_value = ATOMIC64_INIT(0); |
| |
| /* |
| * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, |
| * yielding a 64-bit result. |
| */ |
| static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) |
| { |
| u64 product; |
| #ifdef __i386__ |
| u32 tmp1, tmp2; |
| #endif |
| |
| if (shift < 0) |
| delta >>= -shift; |
| else |
| delta <<= shift; |
| |
| #ifdef __i386__ |
| __asm__ ( |
| "mul %5 ; " |
| "mov %4,%%eax ; " |
| "mov %%edx,%4 ; " |
| "mul %5 ; " |
| "xor %5,%5 ; " |
| "add %4,%%eax ; " |
| "adc %5,%%edx ; " |
| : "=A" (product), "=r" (tmp1), "=r" (tmp2) |
| : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); |
| #elif defined(__x86_64__) |
| __asm__ ( |
| "mul %%rdx ; shrd $32,%%rdx,%%rax" |
| : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); |
| #else |
| #error implement me! |
| #endif |
| |
| return product; |
| } |
| |
| #ifdef __i386__ |
/*
 * do_div - divide a 64-bit value in place by a 32-bit divisor.
 *
 * Stores the quotient back into @n and evaluates to the 32-bit remainder.
 * @n must be a modifiable lvalue without side effects.
 */
# define do_div(n,base) ({					\
	const u32 __divisor = (base);				\
	const u64 __dividend = (u64)(n);			\
	(n) = __dividend / __divisor;				\
	(u32)(__dividend % __divisor);				\
 })
| #else |
| u32 __attribute__((weak)) __div64_32(u64 *n, u32 base); |
| u32 __attribute__((weak)) __div64_32(u64 *n, u32 base) |
| { |
| u64 rem = *n; |
| u64 b = base; |
| u64 res, d = 1; |
| u32 high = rem >> 32; |
| |
| /* Reduce the thing a bit first */ |
| res = 0; |
| if (high >= base) { |
| high /= base; |
| res = (u64) high << 32; |
| rem -= (u64) (high*base) << 32; |
| } |
| |
| while ((s64)b > 0 && b < rem) { |
| b = b+b; |
| d = d+d; |
| } |
| |
| do { |
| if (rem >= b) { |
| rem -= b; |
| res += d; |
| } |
| b >>= 1; |
| d >>= 1; |
| } while (d); |
| |
| *n = res; |
| return rem; |
| } |
| |
/*
 * do_div - divide a 64-bit value in place by a 32-bit divisor.
 *
 * Stores the quotient back into @n and evaluates to the 32-bit remainder.
 * @n must be a u64 lvalue (the pointer comparison below makes the compiler
 * complain otherwise).  Values that fit in 32 bits are divided directly;
 * anything larger goes through __div64_32().
 */
# define do_div(n,base) ({					\
	u32 __divisor = (base);					\
	u32 __remainder;					\
	(void)(((typeof((n)) *)0) == ((u64 *)0));		\
	if (unlikely(((n) >> 32) != 0)) {			\
		__remainder = __div64_32(&(n), __divisor);	\
	} else {						\
		__remainder = (u32)(n) % __divisor;		\
		(n) = (u32)(n) / __divisor;			\
	}							\
	__remainder;						\
 })
| #endif |
| |
| /** |
| * set_normalized_timespec - set timespec sec and nsec parts and normalize |
| * |
| * @ts: pointer to timespec variable to be set |
| * @sec: seconds to set |
| * @nsec: nanoseconds to set |
| * |
| * Set seconds and nanoseconds field of a timespec variable and |
| * normalize to the timespec storage format |
| * |
| * Note: The tv_nsec part is always in the range of |
| * 0 <= tv_nsec < NSEC_PER_SEC |
| * For negative values only the tv_sec field is negative ! |
| */ |
| static void set_normalized_timespec(struct timespec *ts, long sec, s64 nsec) |
| { |
| while (nsec >= NSEC_PER_SEC) { |
| /* |
| * The following asm() prevents the compiler from |
| * optimising this loop into a modulo operation. See |
| * also __iter_div_u64_rem() in include/linux/time.h |
| */ |
| asm("" : "+rm"(nsec)); |
| nsec -= NSEC_PER_SEC; |
| ++sec; |
| } |
| while (nsec < 0) { |
| asm("" : "+rm"(nsec)); |
| nsec += NSEC_PER_SEC; |
| --sec; |
| } |
| ts->tv_sec = sec; |
| ts->tv_nsec = nsec; |
| } |
| |
| static inline |
| unsigned pvclock_read_begin(const struct pvclock_vcpu_time_info *src) |
| { |
| unsigned version = src->version & ~1; |
| /* Make sure that the version is read before the data. */ |
| smp_rmb(); |
| return version; |
| } |
| |
| static inline |
| bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src, |
| unsigned version) |
| { |
| /* Make sure that the version is re-read after the data. */ |
| smp_rmb(); |
| return version != src->version; |
| } |
| |
| static inline u64 rdtsc_ordered(void) |
| { |
| /* |
| * FIXME: on Intel CPUs rmb() aka lfence is sufficient which brings up |
| * to 2x speedup |
| */ |
| mb(); |
| return rdtsc(); |
| } |
| |
| static inline |
| cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src) |
| { |
| u64 delta = rdtsc_ordered() - src->tsc_timestamp; |
| cycle_t offset = scale_delta(delta, src->tsc_to_system_mul, |
| src->tsc_shift); |
| return src->system_time + offset; |
| } |
| |
| static cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) |
| { |
| unsigned version; |
| cycle_t ret; |
| u64 last; |
| u8 flags; |
| |
| do { |
| version = pvclock_read_begin(src); |
| ret = __pvclock_read_cycles(src); |
| flags = src->flags; |
| } while (pvclock_read_retry(src, version)); |
| |
| if ((valid_flags & PVCLOCK_RAW_CYCLE_BIT) || |
| ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && |
| (flags & PVCLOCK_TSC_STABLE_BIT))) |
| return ret; |
| |
| /* |
| * Assumption here is that last_value, a global accumulator, always goes |
| * forward. If we are less than that, we should not be much smaller. |
| * We assume there is an error margin we're inside, and then the |
| * correction does not sacrifice accuracy. |
| * |
| * For reads: global may have changed between test and return, |
| * but this means someone else updated poked the clock at a later time. |
| * We just need to make sure we are not seeing a backwards event. |
| * |
| * For updates: last_value = ret is not enough, since two vcpus could be |
| * updating at the same time, and one of them could be slightly behind, |
| * making the assumption that last_value always go forward fail to hold. |
| */ |
| last = atomic64_read(&last_value); |
| do { |
| if (ret < last) |
| return last; |
| last = atomic64_cmpxchg(&last_value, last, ret); |
| } while (unlikely(last != ret)); |
| |
| return ret; |
| } |
| |
| cycle_t kvm_clock_read() |
| { |
| struct pvclock_vcpu_time_info *src; |
| cycle_t ret; |
| int index = smp_id(); |
| |
| src = &hv_clock[index]; |
| ret = pvclock_clocksource_read(src); |
| return ret; |
| } |
| |
| void kvm_clock_init(void *data) |
| { |
| int index = smp_id(); |
| struct pvclock_vcpu_time_info *hvc = &hv_clock[index]; |
| |
| printf("kvm-clock: cpu %d, msr %p\n", index, hvc); |
| wrmsr(MSR_KVM_SYSTEM_TIME_NEW, (unsigned long)hvc | 1); |
| } |
| |
| void kvm_clock_clear(void *data) |
| { |
| wrmsr(MSR_KVM_SYSTEM_TIME_NEW, 0LL); |
| } |
| |
| static void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, |
| struct pvclock_vcpu_time_info *vcpu_time, |
| struct timespec *ts) |
| { |
| u32 version; |
| u64 delta; |
| struct timespec now; |
| |
| /* get wallclock at system boot */ |
| do { |
| version = wall_clock->version; |
| rmb(); /* fetch version before time */ |
| now.tv_sec = wall_clock->sec; |
| now.tv_nsec = wall_clock->nsec; |
| rmb(); /* fetch time before checking version */ |
| } while ((wall_clock->version & 1) || (version != wall_clock->version)); |
| |
| delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */ |
| delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; |
| |
| now.tv_nsec = do_div(delta, NSEC_PER_SEC); |
| now.tv_sec = delta; |
| |
| set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); |
| } |
| |
| void kvm_get_wallclock(struct timespec *ts) |
| { |
| struct pvclock_vcpu_time_info *vcpu_time; |
| int index = smp_id(); |
| |
| wrmsr(MSR_KVM_WALL_CLOCK_NEW, (unsigned long)&wall_clock); |
| vcpu_time = &hv_clock[index]; |
| pvclock_read_wallclock(&wall_clock, vcpu_time, ts); |
| } |
| |
| void pvclock_set_flags(unsigned char flags) |
| { |
| valid_flags = flags; |
| } |