summaryrefslogtreecommitdiff
path: root/improved-HyperLogLog-cardinality-estimation.patch
diff options
context:
space:
mode:
Diffstat (limited to 'improved-HyperLogLog-cardinality-estimation.patch')
-rw-r--r--improved-HyperLogLog-cardinality-estimation.patch328
1 files changed, 328 insertions, 0 deletions
diff --git a/improved-HyperLogLog-cardinality-estimation.patch b/improved-HyperLogLog-cardinality-estimation.patch
new file mode 100644
index 0000000..0a2256b
--- /dev/null
+++ b/improved-HyperLogLog-cardinality-estimation.patch
@@ -0,0 +1,328 @@
+From 1e9a7748716e1cd234893dd858d07ffa77920e41 Mon Sep 17 00:00:00 2001
+From: Otmar Ertl <otmar.ertl@gmail.com>
+Date: Sat, 10 Mar 2018 20:13:21 +0100
+Subject: [PATCH] improved HyperLogLog cardinality estimation
+
+based on method described in https://arxiv.org/abs/1702.01284
+that does not rely on any magic constants
+---
+ src/hyperloglog.c | 230 +++++++++++++++++++++++-----------------------
+ 1 file changed, 117 insertions(+), 113 deletions(-)
+
+diff --git a/src/hyperloglog.c b/src/hyperloglog.c
+index 8ab9d2a30c0..7f5f62445c9 100644
+--- a/src/hyperloglog.c
++++ b/src/hyperloglog.c
+@@ -192,6 +192,7 @@ struct hllhdr {
+ #define HLL_VALID_CACHE(hdr) (((hdr)->card[7] & (1<<7)) == 0)
+
+ #define HLL_P 14 /* The greater is P, the smaller the error. */
++#define HLL_Q (63-HLL_P)
+ #define HLL_REGISTERS (1<<HLL_P) /* With P=14, 16384 registers. */
+ #define HLL_P_MASK (HLL_REGISTERS-1) /* Mask to index register. */
+ #define HLL_BITS 6 /* Enough to count up to 63 leading zeroes. */
+@@ -510,13 +511,9 @@ int hllDenseAdd(uint8_t *registers, unsigned char *ele, size_t elesize) {
+ return hllDenseSet(registers,index,count);
+ }
+
+-/* Compute SUM(2^-reg) in the dense representation.
+- * PE is an array with a pre-computer table of values 2^-reg indexed by reg.
+- * As a side effect the integer pointed by 'ezp' is set to the number
+- * of zero registers. */
+-double hllDenseSum(uint8_t *registers, double *PE, int *ezp) {
+- double E = 0;
+- int j, ez = 0;
++/* Compute the register histogram in the dense representation. */
++void hllDenseRegHisto(uint8_t *registers, int* regHisto) {
++ int j;
+
+ /* Redis default is to use 16384 registers 6 bits each. The code works
+ * with other values by modifying the defines, but for our target value
+@@ -527,47 +524,49 @@ double hllDenseSum(uint8_t *registers, double *PE, int *ezp) {
+ r10, r11, r12, r13, r14, r15;
+ for (j = 0; j < 1024; j++) {
+ /* Handle 16 registers per iteration. */
+- r0 = r[0] & 63; if (r0 == 0) ez++;
+- r1 = (r[0] >> 6 | r[1] << 2) & 63; if (r1 == 0) ez++;
+- r2 = (r[1] >> 4 | r[2] << 4) & 63; if (r2 == 0) ez++;
+- r3 = (r[2] >> 2) & 63; if (r3 == 0) ez++;
+- r4 = r[3] & 63; if (r4 == 0) ez++;
+- r5 = (r[3] >> 6 | r[4] << 2) & 63; if (r5 == 0) ez++;
+- r6 = (r[4] >> 4 | r[5] << 4) & 63; if (r6 == 0) ez++;
+- r7 = (r[5] >> 2) & 63; if (r7 == 0) ez++;
+- r8 = r[6] & 63; if (r8 == 0) ez++;
+- r9 = (r[6] >> 6 | r[7] << 2) & 63; if (r9 == 0) ez++;
+- r10 = (r[7] >> 4 | r[8] << 4) & 63; if (r10 == 0) ez++;
+- r11 = (r[8] >> 2) & 63; if (r11 == 0) ez++;
+- r12 = r[9] & 63; if (r12 == 0) ez++;
+- r13 = (r[9] >> 6 | r[10] << 2) & 63; if (r13 == 0) ez++;
+- r14 = (r[10] >> 4 | r[11] << 4) & 63; if (r14 == 0) ez++;
+- r15 = (r[11] >> 2) & 63; if (r15 == 0) ez++;
+-
+- /* Additional parens will allow the compiler to optimize the
+- * code more with a loss of precision that is not very relevant
+- * here (floating point math is not commutative!). */
+- E += (PE[r0] + PE[r1]) + (PE[r2] + PE[r3]) + (PE[r4] + PE[r5]) +
+- (PE[r6] + PE[r7]) + (PE[r8] + PE[r9]) + (PE[r10] + PE[r11]) +
+- (PE[r12] + PE[r13]) + (PE[r14] + PE[r15]);
++ r0 = r[0] & 63;
++ r1 = (r[0] >> 6 | r[1] << 2) & 63;
++ r2 = (r[1] >> 4 | r[2] << 4) & 63;
++ r3 = (r[2] >> 2) & 63;
++ r4 = r[3] & 63;
++ r5 = (r[3] >> 6 | r[4] << 2) & 63;
++ r6 = (r[4] >> 4 | r[5] << 4) & 63;
++ r7 = (r[5] >> 2) & 63;
++ r8 = r[6] & 63;
++ r9 = (r[6] >> 6 | r[7] << 2) & 63;
++ r10 = (r[7] >> 4 | r[8] << 4) & 63;
++ r11 = (r[8] >> 2) & 63;
++ r12 = r[9] & 63;
++ r13 = (r[9] >> 6 | r[10] << 2) & 63;
++ r14 = (r[10] >> 4 | r[11] << 4) & 63;
++ r15 = (r[11] >> 2) & 63;
++
++ regHisto[r0] += 1;
++ regHisto[r1] += 1;
++ regHisto[r2] += 1;
++ regHisto[r3] += 1;
++ regHisto[r4] += 1;
++ regHisto[r5] += 1;
++ regHisto[r6] += 1;
++ regHisto[r7] += 1;
++ regHisto[r8] += 1;
++ regHisto[r9] += 1;
++ regHisto[r10] += 1;
++ regHisto[r11] += 1;
++ regHisto[r12] += 1;
++ regHisto[r13] += 1;
++ regHisto[r14] += 1;
++ regHisto[r15] += 1;
++
+ r += 12;
+ }
+ } else {
+- for (j = 0; j < HLL_REGISTERS; j++) {
++ for(j = 0; j < HLL_REGISTERS; j++) {
+ unsigned long reg;
+-
+ HLL_DENSE_GET_REGISTER(reg,registers,j);
+- if (reg == 0) {
+- ez++;
+- /* Increment E at the end of the loop. */
+- } else {
+- E += PE[reg]; /* Precomputed 2^(-reg[j]). */
+- }
++ regHisto[reg] += 1;
+ }
+- E += ez; /* Add 2^0 'ez' times. */
+ }
+- *ezp = ez;
+- return E;
+ }
+
+ /* ================== Sparse representation implementation ================= */
+@@ -903,76 +902,96 @@ int hllSparseAdd(robj *o, unsigned char *ele, size_t elesize) {
+ return hllSparseSet(o,index,count);
+ }
+
+-/* Compute SUM(2^-reg) in the sparse representation.
+- * PE is an array with a pre-computer table of values 2^-reg indexed by reg.
+- * As a side effect the integer pointed by 'ezp' is set to the number
+- * of zero registers. */
+-double hllSparseSum(uint8_t *sparse, int sparselen, double *PE, int *ezp, int *invalid) {
+- double E = 0;
+- int ez = 0, idx = 0, runlen, regval;
++/* Compute the register histogram in the sparse representation. */
++void hllSparseRegHisto(uint8_t *sparse, int sparselen, int *invalid, int* regHisto) {
++ int idx = 0, runlen, regval;
+ uint8_t *end = sparse+sparselen, *p = sparse;
+
+ while(p < end) {
+ if (HLL_SPARSE_IS_ZERO(p)) {
+ runlen = HLL_SPARSE_ZERO_LEN(p);
+ idx += runlen;
+- ez += runlen;
+- /* Increment E at the end of the loop. */
++ regHisto[0] += runlen;
+ p++;
+ } else if (HLL_SPARSE_IS_XZERO(p)) {
+ runlen = HLL_SPARSE_XZERO_LEN(p);
+ idx += runlen;
+- ez += runlen;
+- /* Increment E at the end of the loop. */
++ regHisto[0] += runlen;
+ p += 2;
+ } else {
+ runlen = HLL_SPARSE_VAL_LEN(p);
+ regval = HLL_SPARSE_VAL_VALUE(p);
+ idx += runlen;
+- E += PE[regval]*runlen;
++ regHisto[regval] += runlen;
+ p++;
+ }
+ }
+ if (idx != HLL_REGISTERS && invalid) *invalid = 1;
+- E += ez; /* Add 2^0 'ez' times. */
+- *ezp = ez;
+- return E;
+ }
+
+ /* ========================= HyperLogLog Count ==============================
+ * This is the core of the algorithm where the approximated count is computed.
+- * The function uses the lower level hllDenseSum() and hllSparseSum() functions
+- * as helpers to compute the SUM(2^-reg) part of the computation, which is
+- * representation-specific, while all the rest is common. */
+-
+-/* Implements the SUM operation for uint8_t data type which is only used
+- * internally as speedup for PFCOUNT with multiple keys. */
+-double hllRawSum(uint8_t *registers, double *PE, int *ezp) {
+- double E = 0;
+- int j, ez = 0;
++ * The function uses the lower level hllDenseRegHisto() and hllSparseRegHisto()
++ * functions as helpers to compute histogram of register values part of the
++ * computation, which is representation-specific, while all the rest is common. */
++
++/* Implements the register histogram calculation for uint8_t data type
++ * which is only used internally as speedup for PFCOUNT with multiple keys. */
++void hllRawRegHisto(uint8_t *registers, int* regHisto) {
+ uint64_t *word = (uint64_t*) registers;
+ uint8_t *bytes;
++ int j;
+
+ for (j = 0; j < HLL_REGISTERS/8; j++) {
+ if (*word == 0) {
+- ez += 8;
++ regHisto[0] += 8;
+ } else {
+ bytes = (uint8_t*) word;
+- if (bytes[0]) E += PE[bytes[0]]; else ez++;
+- if (bytes[1]) E += PE[bytes[1]]; else ez++;
+- if (bytes[2]) E += PE[bytes[2]]; else ez++;
+- if (bytes[3]) E += PE[bytes[3]]; else ez++;
+- if (bytes[4]) E += PE[bytes[4]]; else ez++;
+- if (bytes[5]) E += PE[bytes[5]]; else ez++;
+- if (bytes[6]) E += PE[bytes[6]]; else ez++;
+- if (bytes[7]) E += PE[bytes[7]]; else ez++;
++ regHisto[bytes[0]] += 1;
++ regHisto[bytes[1]] += 1;
++ regHisto[bytes[2]] += 1;
++ regHisto[bytes[3]] += 1;
++ regHisto[bytes[4]] += 1;
++ regHisto[bytes[5]] += 1;
++ regHisto[bytes[6]] += 1;
++ regHisto[bytes[7]] += 1;
+ }
+ word++;
+ }
+- E += ez; /* 2^(-reg[j]) is 1 when m is 0, add it 'ez' times for every
+- zero register in the HLL. */
+- *ezp = ez;
+- return E;
++}
++
++/* Helper function sigma as defined in
++ * "New cardinality estimation algorithms for HyperLogLog sketches"
++ * Otmar Ertl, arXiv:1702.01284 */
++double hllSigma(double x) {
++ if (x == 1.) return INFINITY;
++ double zPrime;
++ double y = 1;
++ double z = x;
++ do {
++ x *= x;
++ zPrime = z;
++ z += x * y;
++ y += y;
++ } while(zPrime != z);
++ return z;
++}
++
++/* Helper function tau as defined in
++ * "New cardinality estimation algorithms for HyperLogLog sketches"
++ * Otmar Ertl, arXiv:1702.01284 */
++double hllTau(double x) {
++ if (x == 0. || x == 1.) return 0.;
++ double zPrime;
++ double y = 1.0;
++ double z = 1 - x;
++ do {
++ x = sqrt(x);
++ zPrime = z;
++ y *= 0.5;
++ z -= pow(1 - x, 2)*y;
++ } while(zPrime != z);
++ return z / 3;
+ }
+
+ /* Return the approximated cardinality of the set based on the harmonic
+@@ -988,49 +1007,34 @@ double hllRawSum(uint8_t *registers, double *PE, int *ezp) {
+ * keys (no need to work with 6-bit integers encoding). */
+ uint64_t hllCount(struct hllhdr *hdr, int *invalid) {
+ double m = HLL_REGISTERS;
+- double E, alpha = 0.7213/(1+1.079/m);
+- int j, ez; /* Number of registers equal to 0. */
+-
+- /* We precompute 2^(-reg[j]) in a small table in order to
+- * speedup the computation of SUM(2^-register[0..i]). */
+- static int initialized = 0;
+- static double PE[64];
+- if (!initialized) {
+- PE[0] = 1; /* 2^(-reg[j]) is 1 when m is 0. */
+- for (j = 1; j < 64; j++) {
+- /* 2^(-reg[j]) is the same as 1/2^reg[j]. */
+- PE[j] = 1.0/(1ULL << j);
+- }
+- initialized = 1;
+- }
++ double E;
++ int j;
++ double alphaInf = 0.5 / log(2.);
++ int regHisto[HLL_Q+2] = {0};
+
+- /* Compute SUM(2^-register[0..i]). */
++ /* Compute register histogram */
+ if (hdr->encoding == HLL_DENSE) {
+- E = hllDenseSum(hdr->registers,PE,&ez);
++ hllDenseRegHisto(hdr->registers,regHisto);
+ } else if (hdr->encoding == HLL_SPARSE) {
+- E = hllSparseSum(hdr->registers,
+- sdslen((sds)hdr)-HLL_HDR_SIZE,PE,&ez,invalid);
++ hllSparseRegHisto(hdr->registers,
++ sdslen((sds)hdr)-HLL_HDR_SIZE,invalid,regHisto);
+ } else if (hdr->encoding == HLL_RAW) {
+- E = hllRawSum(hdr->registers,PE,&ez);
++ hllRawRegHisto(hdr->registers,regHisto);
+ } else {
+ serverPanic("Unknown HyperLogLog encoding in hllCount()");
+ }
+
+- /* Apply loglog-beta to the raw estimate. See:
+- * "LogLog-Beta and More: A New Algorithm for Cardinality Estimation
+- * Based on LogLog Counting" Jason Qin, Denys Kim, Yumei Tung
+- * arXiv:1612.02284 */
+- double zl = log(ez + 1);
+- double beta = -0.370393911*ez +
+- 0.070471823*zl +
+- 0.17393686*pow(zl,2) +
+- 0.16339839*pow(zl,3) +
+- -0.09237745*pow(zl,4) +
+- 0.03738027*pow(zl,5) +
+- -0.005384159*pow(zl,6) +
+- 0.00042419*pow(zl,7);
+-
+- E = llroundl(alpha*m*(m-ez)*(1/(E+beta)));
++ /* Estimate cardinality form register histogram. See:
++ * "New cardinality estimation algorithms for HyperLogLog sketches"
++ * Otmar Ertl, arXiv:1702.01284 */
++ double z = m * hllTau((m-regHisto[HLL_Q+1])/(double)m);
++ for (j = HLL_Q; j >= 1; --j) {
++ z += regHisto[j];
++ z *= 0.5;
++ }
++ z += m * hllSigma(regHisto[0]/(double)m);
++ E = llroundl(alphaInf*m*m/z);
++
+ return (uint64_t) E;
+ }
+