123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352 |
- /*
- -*- linux-c -*-
- drbd_receiver.c
- This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
- Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
- Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
- Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
- drbd is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2, or (at your option)
- any later version.
- drbd is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with drbd; see the file COPYING. If not, write to
- the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
- */
- #ifndef _DRBD_VLI_H
- #define _DRBD_VLI_H
- /*
- * At a granularity of 4KiB storage represented per bit,
- * and stroage sizes of several TiB,
- * and possibly small-bandwidth replication,
- * the bitmap transfer time can take much too long,
- * if transmitted in plain text.
- *
- * We try to reduce the transferred bitmap information
- * by encoding runlengths of bit polarity.
- *
- * We never actually need to encode a "zero" (runlengths are positive).
- * But then we have to store the value of the first bit.
- * The first bit of information thus shall encode if the first runlength
- * gives the number of set or unset bits.
- *
- * We assume that large areas are either completely set or unset,
- * which gives good compression with any runlength method,
- * even when encoding the runlength as fixed size 32bit/64bit integers.
- *
- * Still, there may be areas where the polarity flips every few bits,
- * and encoding the runlength sequence of those areas with fix size
- * integers would be much worse than plaintext.
- *
- * We want to encode small runlength values with minimum code length,
- * while still being able to encode a Huge run of all zeros.
- *
- * Thus we need a Variable Length Integer encoding, VLI.
- *
- * For some cases, we produce more code bits than plaintext input.
- * We need to send incompressible chunks as plaintext, skip over them
- * and then see if the next chunk compresses better.
- *
- * We don't care too much about "excellent" compression ratio for large
- * runlengths (all set/all clear): whether we achieve a factor of 100
- * or 1000 is not that much of an issue.
- * We do not want to waste too much on short runlengths in the "noisy"
- * parts of the bitmap, though.
- *
- * There are endless variants of VLI, we experimented with:
- * * simple byte-based
- * * various bit based with different code word length.
- *
- * To avoid yet an other configuration parameter (choice of bitmap compression
- * algorithm) which was difficult to explain and tune, we just chose the one
- * variant that turned out best in all test cases.
- * Based on real world usage patterns, with device sizes ranging from a few GiB
- * to several TiB, file server/mailserver/webserver/mysql/postgress,
- * mostly idle to really busy, the all time winner (though sometimes only
- * marginally better) is:
- */
- /*
- * encoding is "visualised" as
- * __little endian__ bitstream, least significant bit first (left most)
- *
- * this particular encoding is chosen so that the prefix code
- * starts as unary encoding the level, then modified so that
- * 10 levels can be described in 8bit, with minimal overhead
- * for the smaller levels.
- *
- * Number of data bits follow fibonacci sequence, with the exception of the
- * last level (+1 data bit, so it makes 64bit total). The only worse code when
- * encoding bit polarity runlength is 1 plain bits => 2 code bits.
- prefix data bits max val Nº data bits
- 0 x 0x2 1
- 10 x 0x4 1
- 110 xx 0x8 2
- 1110 xxx 0x10 3
- 11110 xxx xx 0x30 5
- 111110 xx xxxxxx 0x130 8
- 11111100 xxxxxxxx xxxxx 0x2130 13
- 11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21
- 11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34
- 11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
- * maximum encodable value: 0x100000400202130 == 2**56 + some */
- /* compression "table":
- transmitted x 0.29
- as plaintext x ........................
- x ........................
- x ........................
- x 0.59 0.21........................
- x ........................................................
- x .. c ...................................................
- x 0.44.. o ...................................................
- x .......... d ...................................................
- x .......... e ...................................................
- X............. ...................................................
- x.............. b ...................................................
- 2.0x............... i ...................................................
- #X................ t ...................................................
- #................. s ........................... plain bits ..........
- -+-----------------------------------------------------------------------
- 1 16 32 64
- */
- /* LEVEL: (total bits, prefix bits, prefix value),
- * sorted ascending by number of total bits.
- * The rest of the code table is calculated at compiletime from this. */
- /* fibonacci data 1, 1, ... */
- #define VLI_L_1_1() do { \
- LEVEL( 2, 1, 0x00); \
- LEVEL( 3, 2, 0x01); \
- LEVEL( 5, 3, 0x03); \
- LEVEL( 7, 4, 0x07); \
- LEVEL(10, 5, 0x0f); \
- LEVEL(14, 6, 0x1f); \
- LEVEL(21, 8, 0x3f); \
- LEVEL(29, 8, 0x7f); \
- LEVEL(42, 8, 0xbf); \
- LEVEL(64, 8, 0xff); \
- } while (0)
- /* finds a suitable level to decode the least significant part of in.
- * returns number of bits consumed.
- *
- * BUG() for bad input, as that would mean a buggy code table. */
- static inline int vli_decode_bits(u64 *out, const u64 in)
- {
- u64 adj = 1;
- #define LEVEL(t,b,v) \
- do { \
- if ((in & ((1 << b) -1)) == v) { \
- *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
- return t; \
- } \
- adj += 1ULL << (t - b); \
- } while (0)
- VLI_L_1_1();
- /* NOT REACHED, if VLI_LEVELS code table is defined properly */
- BUG();
- #undef LEVEL
- }
- /* return number of code bits needed,
- * or negative error number */
- static inline int __vli_encode_bits(u64 *out, const u64 in)
- {
- u64 max = 0;
- u64 adj = 1;
- if (in == 0)
- return -EINVAL;
- #define LEVEL(t,b,v) do { \
- max += 1ULL << (t - b); \
- if (in <= max) { \
- if (out) \
- *out = ((in - adj) << b) | v; \
- return t; \
- } \
- adj = max + 1; \
- } while (0)
- VLI_L_1_1();
- return -EOVERFLOW;
- #undef LEVEL
- }
- #undef VLI_L_1_1
- /* code from here down is independend of actually used bit code */
- /*
- * Code length is determined by some unique (e.g. unary) prefix.
- * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
- * not a byte stream.
- */
- /* for the bitstream, we need a cursor */
- struct bitstream_cursor {
- /* the current byte */
- u8 *b;
- /* the current bit within *b, nomalized: 0..7 */
- unsigned int bit;
- };
- /* initialize cursor to point to first bit of stream */
- static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
- {
- cur->b = s;
- cur->bit = 0;
- }
- /* advance cursor by that many bits; maximum expected input value: 64,
- * but depending on VLI implementation, it may be more. */
- static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
- {
- bits += cur->bit;
- cur->b = cur->b + (bits >> 3);
- cur->bit = bits & 7;
- }
- /* the bitstream itself knows its length */
- struct bitstream {
- struct bitstream_cursor cur;
- unsigned char *buf;
- size_t buf_len; /* in bytes */
- /* for input stream:
- * number of trailing 0 bits for padding
- * total number of valid bits in stream: buf_len * 8 - pad_bits */
- unsigned int pad_bits;
- };
- static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
- {
- bs->buf = s;
- bs->buf_len = len;
- bs->pad_bits = pad_bits;
- bitstream_cursor_reset(&bs->cur, bs->buf);
- }
- static inline void bitstream_rewind(struct bitstream *bs)
- {
- bitstream_cursor_reset(&bs->cur, bs->buf);
- memset(bs->buf, 0, bs->buf_len);
- }
- /* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
- * Ignores "pad_bits".
- * Returns zero if bits == 0 (nothing to do).
- * Returns number of bits used if successful.
- *
- * If there is not enough room left in bitstream,
- * leaves bitstream unchanged and returns -ENOBUFS.
- */
- static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
- {
- unsigned char *b = bs->cur.b;
- unsigned int tmp;
- if (bits == 0)
- return 0;
- if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
- return -ENOBUFS;
- /* paranoia: strip off hi bits; they should not be set anyways. */
- if (bits < 64)
- val &= ~0ULL >> (64 - bits);
- *b++ |= (val & 0xff) << bs->cur.bit;
- for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
- *b++ |= (val >> tmp) & 0xff;
- bitstream_cursor_advance(&bs->cur, bits);
- return bits;
- }
- /* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
- *
- * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
- *
- * If there are less than the requested number of valid bits left in the
- * bitstream, still fetches all available bits.
- *
- * Returns number of actually fetched bits.
- */
- static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
- {
- u64 val;
- unsigned int n;
- if (bits > 64)
- return -EINVAL;
- if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
- bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
- - bs->cur.bit - bs->pad_bits;
- if (bits == 0) {
- *out = 0;
- return 0;
- }
- /* get the high bits */
- val = 0;
- n = (bs->cur.bit + bits + 7) >> 3;
- /* n may be at most 9, if cur.bit + bits > 64 */
- /* which means this copies at most 8 byte */
- if (n) {
- memcpy(&val, bs->cur.b+1, n - 1);
- val = le64_to_cpu(val) << (8 - bs->cur.bit);
- }
- /* we still need the low bits */
- val |= bs->cur.b[0] >> bs->cur.bit;
- /* and mask out bits we don't want */
- val &= ~0ULL >> (64 - bits);
- bitstream_cursor_advance(&bs->cur, bits);
- *out = val;
- return bits;
- }
- /* encodes @in as vli into @bs;
- * return values
- * > 0: number of bits successfully stored in bitstream
- * -ENOBUFS @bs is full
- * -EINVAL input zero (invalid)
- * -EOVERFLOW input too large for this vli code (invalid)
- */
- static inline int vli_encode_bits(struct bitstream *bs, u64 in)
- {
- u64 code = code;
- int bits = __vli_encode_bits(&code, in);
- if (bits <= 0)
- return bits;
- return bitstream_put_bits(bs, code, bits);
- }
- #endif
|