changeset 3640:195f5ad43476 draft

Compact serialization for variable-length integers Variable-length integers: bytes are a MSB base-128 encoding of the number. The high bit in each byte signifies whether another digit follows. To make the encoding is one-to-one, one is subtracted from all but the last digit. Thus, the byte sequence a[] with length len, where all but the last byte has bit 128 set, encodes the number: (a[len-1] & 0x7F) + sum(i=1..len-1, 128^i*((a[len-i-1] & 0x7F)+1)) Properties: * Very small (0-127: 1 byte, 128-16511: 2 bytes, 16512-2113663: 3 bytes) * Every integer has exactly one encoding * Encoding does not depend on size of original integer type
author Pieter Wuille <pieter.wuille@gmail.com>
date Fri, 15 Jun 2012 14:19:11 +0200
parents 61a0d2ebf491
children d489f6576318
files src/serialize.h src/test/serialize_tests.cpp
diffstat 2 files changed, 138 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/src/serialize.h
+++ b/src/serialize.h
@@ -238,9 +238,75 @@
     return nSizeRet;
 }
 
+// Variable-length integers: bytes are a MSB base-128 encoding of the number.
+// The high bit in each byte signifies whether another digit follows. To make
+// the encoding is one-to-one, one is subtracted from all but the last digit.
+// Thus, the byte sequence a[] with length len, where all but the last byte
+// has bit 128 set, encodes the number:
+//
+//   (a[len-1] & 0x7F) + sum(i=1..len-1, 128^i*((a[len-i-1] & 0x7F)+1))
+//
+// Properties:
+// * Very small (0-127: 1 byte, 128-16511: 2 bytes, 16512-2113663: 3 bytes)
+// * Every integer has exactly one encoding
+// * Encoding does not depend on size of original integer type
+// * No redundancy: every (infinite) byte sequence corresponds to a list
+//   of encoded integers.
+//
+// 0:         [0x00]  256:        [0x81 0x00]
+// 1:         [0x01]  16383:      [0xFE 0x7F]
+// 127:       [0x7F]  16384:      [0xFF 0x00]
+// 128:  [0x80 0x00]  16511: [0x80 0xFF 0x7F]
+// 255:  [0x80 0x7F]  65535: [0x82 0xFD 0x7F]
+// 2^32:           [0x8E 0xFE 0xFE 0xFF 0x00]
 
+template<typename I>
+inline unsigned int GetSizeOfVarInt(I n)
+{
+    int nRet = 0;
+    while(true) {
+        nRet++;
+        if (n <= 0x7F)
+            break;
+        n = (n >> 7) - 1;
+    }
+    return nRet;
+}
 
-#define FLATDATA(obj)   REF(CFlatData((char*)&(obj), (char*)&(obj) + sizeof(obj)))
+template<typename Stream, typename I>
+void WriteVarInt(Stream& os, I n)
+{
+    unsigned char tmp[(sizeof(n)*8+6)/7];
+    int len=0;
+    while(true) {
+        tmp[len] = (n & 0x7F) | (len ? 0x80 : 0x00);
+        if (n <= 0x7F)
+            break;
+        n = (n >> 7) - 1;
+        len++;
+    }
+    do {
+        WRITEDATA(os, tmp[len]);
+    } while(len--);
+}
+
+template<typename Stream, typename I>
+I ReadVarInt(Stream& is)
+{
+    I n = 0;
+    while(true) {
+        unsigned char chData;
+        READDATA(is, chData);
+        n = (n << 7) | (chData & 0x7F);
+        if (chData & 0x80)
+            n++;
+        else
+            return n;
+    }
+}
+
+#define FLATDATA(obj)  REF(CFlatData((char*)&(obj), (char*)&(obj) + sizeof(obj)))
+#define VARINT(obj)    REF(WrapVarInt(REF(obj)))
 
 /** Wrapper for serializing arrays and POD.
  */
@@ -274,6 +340,32 @@
     }
 };
 
+template<typename I>
+class CVarInt
+{
+protected:
+    I &n;
+public:
+    CVarInt(I& nIn) : n(nIn) { }
+
+    unsigned int GetSerializeSize(int, int) const {
+        return GetSizeOfVarInt<I>(n);
+    }
+
+    template<typename Stream>
+    void Serialize(Stream &s, int, int) const {
+        WriteVarInt<Stream,I>(s, n);
+    }
+
+    template<typename Stream>
+    void Unserialize(Stream& s, int, int) {
+        n = ReadVarInt<Stream,I>(s);
+    }
+};
+
+template<typename I>
+CVarInt<I> WrapVarInt(I& n) { return CVarInt<I>(n); }
+
 //
 // Forward declarations
 //
new file mode 100644
--- /dev/null
+++ b/src/test/serialize_tests.cpp
@@ -0,0 +1,45 @@
+#include <boost/test/unit_test.hpp>
+
+#include <string>
+#include <vector>
+
+#include "serialize.h"
+
+using namespace std;
+
+BOOST_AUTO_TEST_SUITE(serialize_tests)
+
+BOOST_AUTO_TEST_CASE(varints)
+{
+    // encode
+
+    CDataStream ss(SER_DISK, 0);
+    CDataStream::size_type size = 0;
+    for (int i = 0; i < 100000; i++) {
+        ss << VARINT(i);
+        size += ::GetSerializeSize(VARINT(i), 0, 0);
+        BOOST_CHECK(size == ss.size());
+    }
+
+    for (uint64 i = 0;  i < 100000000000ULL; i += 999999937) {
+        ss << VARINT(i);
+        size += ::GetSerializeSize(VARINT(i), 0, 0);
+        BOOST_CHECK(size == ss.size());
+    }
+
+    // decode
+    for (int i = 0; i < 100000; i++) {
+        int j;
+        ss >> VARINT(j);
+        BOOST_CHECK_MESSAGE(i == j, "decoded:" << j << " expected:" << i);
+    }
+
+    for (uint64 i = 0;  i < 100000000000ULL; i += 999999937) {
+        uint64 j;
+        ss >> VARINT(j);
+        BOOST_CHECK_MESSAGE(i == j, "decoded:" << j << " expected:" << i);
+    }
+
+}
+
+BOOST_AUTO_TEST_SUITE_END()