| /* SPDX-License-Identifier: GPL-2.0-only */ |
| /* |
| -*- linux-c -*- |
| drbd_receiver.c |
| This file is part of DRBD by Philipp Reisner and Lars Ellenberg. |
| |
| Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. |
| Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. |
| Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. |
| |
| */ |
| |
| #ifndef _DRBD_VLI_H |
| #define _DRBD_VLI_H |
| |
| /* |
| * At a granularity of 4KiB storage represented per bit, |
| * and stroage sizes of several TiB, |
| * and possibly small-bandwidth replication, |
| * the bitmap transfer time can take much too long, |
| * if transmitted in plain text. |
| * |
| * We try to reduce the transferred bitmap information |
| * by encoding runlengths of bit polarity. |
| * |
| * We never actually need to encode a "zero" (runlengths are positive). |
| * But then we have to store the value of the first bit. |
| * The first bit of information thus shall encode if the first runlength |
| * gives the number of set or unset bits. |
| * |
| * We assume that large areas are either completely set or unset, |
| * which gives good compression with any runlength method, |
| * even when encoding the runlength as fixed size 32bit/64bit integers. |
| * |
| * Still, there may be areas where the polarity flips every few bits, |
| * and encoding the runlength sequence of those areas with fix size |
| * integers would be much worse than plaintext. |
| * |
| * We want to encode small runlength values with minimum code length, |
| * while still being able to encode a Huge run of all zeros. |
| * |
| * Thus we need a Variable Length Integer encoding, VLI. |
| * |
| * For some cases, we produce more code bits than plaintext input. |
| * We need to send incompressible chunks as plaintext, skip over them |
| * and then see if the next chunk compresses better. |
| * |
| * We don't care too much about "excellent" compression ratio for large |
| * runlengths (all set/all clear): whether we achieve a factor of 100 |
| * or 1000 is not that much of an issue. |
| * We do not want to waste too much on short runlengths in the "noisy" |
| * parts of the bitmap, though. |
| * |
| * There are endless variants of VLI, we experimented with: |
| * * simple byte-based |
| * * various bit based with different code word length. |
| * |
| * To avoid yet an other configuration parameter (choice of bitmap compression |
| * algorithm) which was difficult to explain and tune, we just chose the one |
| * variant that turned out best in all test cases. |
| * Based on real world usage patterns, with device sizes ranging from a few GiB |
| * to several TiB, file server/mailserver/webserver/mysql/postgress, |
| * mostly idle to really busy, the all time winner (though sometimes only |
| * marginally better) is: |
| */ |
| |
| /* |
| * encoding is "visualised" as |
| * __little endian__ bitstream, least significant bit first (left most) |
| * |
| * this particular encoding is chosen so that the prefix code |
| * starts as unary encoding the level, then modified so that |
| * 10 levels can be described in 8bit, with minimal overhead |
| * for the smaller levels. |
| * |
| * Number of data bits follow fibonacci sequence, with the exception of the |
| * last level (+1 data bit, so it makes 64bit total). The only worse code when |
| * encoding bit polarity runlength is 1 plain bits => 2 code bits. |
| prefix data bits max val NÂș data bits |
| 0 x 0x2 1 |
| 10 x 0x4 1 |
| 110 xx 0x8 2 |
| 1110 xxx 0x10 3 |
| 11110 xxx xx 0x30 5 |
| 111110 xx xxxxxx 0x130 8 |
| 11111100 xxxxxxxx xxxxx 0x2130 13 |
| 11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21 |
| 11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34 |
| 11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56 |
| * maximum encodable value: 0x100000400202130 == 2**56 + some */ |
| |
| /* compression "table": |
| transmitted x 0.29 |
| as plaintext x ........................ |
| x ........................ |
| x ........................ |
| x 0.59 0.21........................ |
| x ........................................................ |
| x .. c ................................................... |
| x 0.44.. o ................................................... |
| x .......... d ................................................... |
| x .......... e ................................................... |
| X............. ................................................... |
| x.............. b ................................................... |
| 2.0x............... i ................................................... |
| #X................ t ................................................... |
| #................. s ........................... plain bits .......... |
| -+----------------------------------------------------------------------- |
| 1 16 32 64 |
| */ |
| |
| /* LEVEL: (total bits, prefix bits, prefix value), |
| * sorted ascending by number of total bits. |
| * The rest of the code table is calculated at compiletime from this. */ |
| |
| /* fibonacci data 1, 1, ... */ |
| #define VLI_L_1_1() do { \ |
| LEVEL( 2, 1, 0x00); \ |
| LEVEL( 3, 2, 0x01); \ |
| LEVEL( 5, 3, 0x03); \ |
| LEVEL( 7, 4, 0x07); \ |
| LEVEL(10, 5, 0x0f); \ |
| LEVEL(14, 6, 0x1f); \ |
| LEVEL(21, 8, 0x3f); \ |
| LEVEL(29, 8, 0x7f); \ |
| LEVEL(42, 8, 0xbf); \ |
| LEVEL(64, 8, 0xff); \ |
| } while (0) |
| |
| /* finds a suitable level to decode the least significant part of in. |
| * returns number of bits consumed. |
| * |
| * BUG() for bad input, as that would mean a buggy code table. */ |
| static inline int vli_decode_bits(u64 *out, const u64 in) |
| { |
| u64 adj = 1; |
| |
| #define LEVEL(t,b,v) \ |
| do { \ |
| if ((in & ((1 << b) -1)) == v) { \ |
| *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \ |
| return t; \ |
| } \ |
| adj += 1ULL << (t - b); \ |
| } while (0) |
| |
| VLI_L_1_1(); |
| |
| /* NOT REACHED, if VLI_LEVELS code table is defined properly */ |
| BUG(); |
| #undef LEVEL |
| } |
| |
| /* return number of code bits needed, |
| * or negative error number */ |
| static inline int __vli_encode_bits(u64 *out, const u64 in) |
| { |
| u64 max = 0; |
| u64 adj = 1; |
| |
| if (in == 0) |
| return -EINVAL; |
| |
| #define LEVEL(t,b,v) do { \ |
| max += 1ULL << (t - b); \ |
| if (in <= max) { \ |
| if (out) \ |
| *out = ((in - adj) << b) | v; \ |
| return t; \ |
| } \ |
| adj = max + 1; \ |
| } while (0) |
| |
| VLI_L_1_1(); |
| |
| return -EOVERFLOW; |
| #undef LEVEL |
| } |
| |
| #undef VLI_L_1_1 |
| |
| /* code from here down is independend of actually used bit code */ |
| |
| /* |
| * Code length is determined by some unique (e.g. unary) prefix. |
| * This encodes arbitrary bit length, not whole bytes: we have a bit-stream, |
| * not a byte stream. |
| */ |
| |
| /* for the bitstream, we need a cursor */ |
| struct bitstream_cursor { |
| /* the current byte */ |
| u8 *b; |
| /* the current bit within *b, nomalized: 0..7 */ |
| unsigned int bit; |
| }; |
| |
| /* initialize cursor to point to first bit of stream */ |
| static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s) |
| { |
| cur->b = s; |
| cur->bit = 0; |
| } |
| |
| /* advance cursor by that many bits; maximum expected input value: 64, |
| * but depending on VLI implementation, it may be more. */ |
| static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits) |
| { |
| bits += cur->bit; |
| cur->b = cur->b + (bits >> 3); |
| cur->bit = bits & 7; |
| } |
| |
| /* the bitstream itself knows its length */ |
| struct bitstream { |
| struct bitstream_cursor cur; |
| unsigned char *buf; |
| size_t buf_len; /* in bytes */ |
| |
| /* for input stream: |
| * number of trailing 0 bits for padding |
| * total number of valid bits in stream: buf_len * 8 - pad_bits */ |
| unsigned int pad_bits; |
| }; |
| |
| static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits) |
| { |
| bs->buf = s; |
| bs->buf_len = len; |
| bs->pad_bits = pad_bits; |
| bitstream_cursor_reset(&bs->cur, bs->buf); |
| } |
| |
| static inline void bitstream_rewind(struct bitstream *bs) |
| { |
| bitstream_cursor_reset(&bs->cur, bs->buf); |
| memset(bs->buf, 0, bs->buf_len); |
| } |
| |
| /* Put (at most 64) least significant bits of val into bitstream, and advance cursor. |
| * Ignores "pad_bits". |
| * Returns zero if bits == 0 (nothing to do). |
| * Returns number of bits used if successful. |
| * |
| * If there is not enough room left in bitstream, |
| * leaves bitstream unchanged and returns -ENOBUFS. |
| */ |
| static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits) |
| { |
| unsigned char *b = bs->cur.b; |
| unsigned int tmp; |
| |
| if (bits == 0) |
| return 0; |
| |
| if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len) |
| return -ENOBUFS; |
| |
| /* paranoia: strip off hi bits; they should not be set anyways. */ |
| if (bits < 64) |
| val &= ~0ULL >> (64 - bits); |
| |
| *b++ |= (val & 0xff) << bs->cur.bit; |
| |
| for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8) |
| *b++ |= (val >> tmp) & 0xff; |
| |
| bitstream_cursor_advance(&bs->cur, bits); |
| return bits; |
| } |
| |
| /* Fetch (at most 64) bits from bitstream into *out, and advance cursor. |
| * |
| * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged. |
| * |
| * If there are less than the requested number of valid bits left in the |
| * bitstream, still fetches all available bits. |
| * |
| * Returns number of actually fetched bits. |
| */ |
| static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) |
| { |
| u64 val; |
| unsigned int n; |
| |
| if (bits > 64) |
| return -EINVAL; |
| |
| if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len) |
| bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3) |
| - bs->cur.bit - bs->pad_bits; |
| |
| if (bits == 0) { |
| *out = 0; |
| return 0; |
| } |
| |
| /* get the high bits */ |
| val = 0; |
| n = (bs->cur.bit + bits + 7) >> 3; |
| /* n may be at most 9, if cur.bit + bits > 64 */ |
| /* which means this copies at most 8 byte */ |
| if (n) { |
| memcpy(&val, bs->cur.b+1, n - 1); |
| val = le64_to_cpu(val) << (8 - bs->cur.bit); |
| } |
| |
| /* we still need the low bits */ |
| val |= bs->cur.b[0] >> bs->cur.bit; |
| |
| /* and mask out bits we don't want */ |
| val &= ~0ULL >> (64 - bits); |
| |
| bitstream_cursor_advance(&bs->cur, bits); |
| *out = val; |
| |
| return bits; |
| } |
| |
| /* encodes @in as vli into @bs; |
| |
| * return values |
| * > 0: number of bits successfully stored in bitstream |
| * -ENOBUFS @bs is full |
| * -EINVAL input zero (invalid) |
| * -EOVERFLOW input too large for this vli code (invalid) |
| */ |
| static inline int vli_encode_bits(struct bitstream *bs, u64 in) |
| { |
| u64 code; |
| int bits = __vli_encode_bits(&code, in); |
| |
| if (bits <= 0) |
| return bits; |
| |
| return bitstream_put_bits(bs, code, bits); |
| } |
| |
| #endif |