diff options
Diffstat (limited to 'improved-HyperLogLog-cardinality-estimation.patch')
-rw-r--r-- | improved-HyperLogLog-cardinality-estimation.patch | 328 |
1 files changed, 328 insertions, 0 deletions
diff --git a/improved-HyperLogLog-cardinality-estimation.patch b/improved-HyperLogLog-cardinality-estimation.patch new file mode 100644 index 0000000..0a2256b --- /dev/null +++ b/improved-HyperLogLog-cardinality-estimation.patch @@ -0,0 +1,328 @@ +From 1e9a7748716e1cd234893dd858d07ffa77920e41 Mon Sep 17 00:00:00 2001 +From: Otmar Ertl <otmar.ertl@gmail.com> +Date: Sat, 10 Mar 2018 20:13:21 +0100 +Subject: [PATCH] improved HyperLogLog cardinality estimation + +based on method described in https://arxiv.org/abs/1702.01284 +that does not rely on any magic constants +--- + src/hyperloglog.c | 230 +++++++++++++++++++++++----------------------- + 1 file changed, 117 insertions(+), 113 deletions(-) + +diff --git a/src/hyperloglog.c b/src/hyperloglog.c +index 8ab9d2a30c0..7f5f62445c9 100644 +--- a/src/hyperloglog.c ++++ b/src/hyperloglog.c +@@ -192,6 +192,7 @@ struct hllhdr { + #define HLL_VALID_CACHE(hdr) (((hdr)->card[7] & (1<<7)) == 0) + + #define HLL_P 14 /* The greater is P, the smaller the error. */ ++#define HLL_Q (63-HLL_P) + #define HLL_REGISTERS (1<<HLL_P) /* With P=14, 16384 registers. */ + #define HLL_P_MASK (HLL_REGISTERS-1) /* Mask to index register. */ + #define HLL_BITS 6 /* Enough to count up to 63 leading zeroes. */ +@@ -510,13 +511,9 @@ int hllDenseAdd(uint8_t *registers, unsigned char *ele, size_t elesize) { + return hllDenseSet(registers,index,count); + } + +-/* Compute SUM(2^-reg) in the dense representation. +- * PE is an array with a pre-computer table of values 2^-reg indexed by reg. +- * As a side effect the integer pointed by 'ezp' is set to the number +- * of zero registers. */ +-double hllDenseSum(uint8_t *registers, double *PE, int *ezp) { +- double E = 0; +- int j, ez = 0; ++/* Compute the register histogram in the dense representation. */ ++void hllDenseRegHisto(uint8_t *registers, int* regHisto) { ++ int j; + + /* Redis default is to use 16384 registers 6 bits each. The code works + * with other values by modifying the defines, but for our target value +@@ -527,47 +524,49 @@ double hllDenseSum(uint8_t *registers, double *PE, int *ezp) { + r10, r11, r12, r13, r14, r15; + for (j = 0; j < 1024; j++) { + /* Handle 16 registers per iteration. */ +- r0 = r[0] & 63; if (r0 == 0) ez++; +- r1 = (r[0] >> 6 | r[1] << 2) & 63; if (r1 == 0) ez++; +- r2 = (r[1] >> 4 | r[2] << 4) & 63; if (r2 == 0) ez++; +- r3 = (r[2] >> 2) & 63; if (r3 == 0) ez++; +- r4 = r[3] & 63; if (r4 == 0) ez++; +- r5 = (r[3] >> 6 | r[4] << 2) & 63; if (r5 == 0) ez++; +- r6 = (r[4] >> 4 | r[5] << 4) & 63; if (r6 == 0) ez++; +- r7 = (r[5] >> 2) & 63; if (r7 == 0) ez++; +- r8 = r[6] & 63; if (r8 == 0) ez++; +- r9 = (r[6] >> 6 | r[7] << 2) & 63; if (r9 == 0) ez++; +- r10 = (r[7] >> 4 | r[8] << 4) & 63; if (r10 == 0) ez++; +- r11 = (r[8] >> 2) & 63; if (r11 == 0) ez++; +- r12 = r[9] & 63; if (r12 == 0) ez++; +- r13 = (r[9] >> 6 | r[10] << 2) & 63; if (r13 == 0) ez++; +- r14 = (r[10] >> 4 | r[11] << 4) & 63; if (r14 == 0) ez++; +- r15 = (r[11] >> 2) & 63; if (r15 == 0) ez++; +- +- /* Additional parens will allow the compiler to optimize the +- * code more with a loss of precision that is not very relevant +- * here (floating point math is not commutative!). */ +- E += (PE[r0] + PE[r1]) + (PE[r2] + PE[r3]) + (PE[r4] + PE[r5]) + +- (PE[r6] + PE[r7]) + (PE[r8] + PE[r9]) + (PE[r10] + PE[r11]) + +- (PE[r12] + PE[r13]) + (PE[r14] + PE[r15]); ++ r0 = r[0] & 63; ++ r1 = (r[0] >> 6 | r[1] << 2) & 63; ++ r2 = (r[1] >> 4 | r[2] << 4) & 63; ++ r3 = (r[2] >> 2) & 63; ++ r4 = r[3] & 63; ++ r5 = (r[3] >> 6 | r[4] << 2) & 63; ++ r6 = (r[4] >> 4 | r[5] << 4) & 63; ++ r7 = (r[5] >> 2) & 63; ++ r8 = r[6] & 63; ++ r9 = (r[6] >> 6 | r[7] << 2) & 63; ++ r10 = (r[7] >> 4 | r[8] << 4) & 63; ++ r11 = (r[8] >> 2) & 63; ++ r12 = r[9] & 63; ++ r13 = (r[9] >> 6 | r[10] << 2) & 63; ++ r14 = (r[10] >> 4 | r[11] << 4) & 63; ++ r15 = (r[11] >> 2) & 63; ++ ++ regHisto[r0] += 1; ++ regHisto[r1] += 1; ++ regHisto[r2] += 1; ++ regHisto[r3] += 1; ++ regHisto[r4] += 1; ++ regHisto[r5] += 1; ++ regHisto[r6] += 1; ++ regHisto[r7] += 1; ++ regHisto[r8] += 1; ++ regHisto[r9] += 1; ++ regHisto[r10] += 1; ++ regHisto[r11] += 1; ++ regHisto[r12] += 1; ++ regHisto[r13] += 1; ++ regHisto[r14] += 1; ++ regHisto[r15] += 1; ++ + r += 12; + } + } else { +- for (j = 0; j < HLL_REGISTERS; j++) { ++ for(j = 0; j < HLL_REGISTERS; j++) { + unsigned long reg; +- + HLL_DENSE_GET_REGISTER(reg,registers,j); +- if (reg == 0) { +- ez++; +- /* Increment E at the end of the loop. */ +- } else { +- E += PE[reg]; /* Precomputed 2^(-reg[j]). */ +- } ++ regHisto[reg] += 1; + } +- E += ez; /* Add 2^0 'ez' times. */ + } +- *ezp = ez; +- return E; + } + + /* ================== Sparse representation implementation ================= */ +@@ -903,76 +902,96 @@ int hllSparseAdd(robj *o, unsigned char *ele, size_t elesize) { + return hllSparseSet(o,index,count); + } + +-/* Compute SUM(2^-reg) in the sparse representation. +- * PE is an array with a pre-computer table of values 2^-reg indexed by reg. +- * As a side effect the integer pointed by 'ezp' is set to the number +- * of zero registers. */ +-double hllSparseSum(uint8_t *sparse, int sparselen, double *PE, int *ezp, int *invalid) { +- double E = 0; +- int ez = 0, idx = 0, runlen, regval; ++/* Compute the register histogram in the sparse representation. */ ++void hllSparseRegHisto(uint8_t *sparse, int sparselen, int *invalid, int* regHisto) { ++ int idx = 0, runlen, regval; + uint8_t *end = sparse+sparselen, *p = sparse; + + while(p < end) { + if (HLL_SPARSE_IS_ZERO(p)) { + runlen = HLL_SPARSE_ZERO_LEN(p); + idx += runlen; +- ez += runlen; +- /* Increment E at the end of the loop. */ ++ regHisto[0] += runlen; + p++; + } else if (HLL_SPARSE_IS_XZERO(p)) { + runlen = HLL_SPARSE_XZERO_LEN(p); + idx += runlen; +- ez += runlen; +- /* Increment E at the end of the loop. */ ++ regHisto[0] += runlen; + p += 2; + } else { + runlen = HLL_SPARSE_VAL_LEN(p); + regval = HLL_SPARSE_VAL_VALUE(p); + idx += runlen; +- E += PE[regval]*runlen; ++ regHisto[regval] += runlen; + p++; + } + } + if (idx != HLL_REGISTERS && invalid) *invalid = 1; +- E += ez; /* Add 2^0 'ez' times. */ +- *ezp = ez; +- return E; + } + + /* ========================= HyperLogLog Count ============================== + * This is the core of the algorithm where the approximated count is computed. +- * The function uses the lower level hllDenseSum() and hllSparseSum() functions +- * as helpers to compute the SUM(2^-reg) part of the computation, which is +- * representation-specific, while all the rest is common. */ +- +-/* Implements the SUM operation for uint8_t data type which is only used +- * internally as speedup for PFCOUNT with multiple keys. */ +-double hllRawSum(uint8_t *registers, double *PE, int *ezp) { +- double E = 0; +- int j, ez = 0; ++ * The function uses the lower level hllDenseRegHisto() and hllSparseRegHisto() ++ * functions as helpers to compute histogram of register values part of the ++ * computation, which is representation-specific, while all the rest is common. */ ++ ++/* Implements the register histogram calculation for uint8_t data type ++ * which is only used internally as speedup for PFCOUNT with multiple keys. */ ++void hllRawRegHisto(uint8_t *registers, int* regHisto) { + uint64_t *word = (uint64_t*) registers; + uint8_t *bytes; ++ int j; + + for (j = 0; j < HLL_REGISTERS/8; j++) { + if (*word == 0) { +- ez += 8; ++ regHisto[0] += 8; + } else { + bytes = (uint8_t*) word; +- if (bytes[0]) E += PE[bytes[0]]; else ez++; +- if (bytes[1]) E += PE[bytes[1]]; else ez++; +- if (bytes[2]) E += PE[bytes[2]]; else ez++; +- if (bytes[3]) E += PE[bytes[3]]; else ez++; +- if (bytes[4]) E += PE[bytes[4]]; else ez++; +- if (bytes[5]) E += PE[bytes[5]]; else ez++; +- if (bytes[6]) E += PE[bytes[6]]; else ez++; +- if (bytes[7]) E += PE[bytes[7]]; else ez++; ++ regHisto[bytes[0]] += 1; ++ regHisto[bytes[1]] += 1; ++ regHisto[bytes[2]] += 1; ++ regHisto[bytes[3]] += 1; ++ regHisto[bytes[4]] += 1; ++ regHisto[bytes[5]] += 1; ++ regHisto[bytes[6]] += 1; ++ regHisto[bytes[7]] += 1; + } + word++; + } +- E += ez; /* 2^(-reg[j]) is 1 when m is 0, add it 'ez' times for every +- zero register in the HLL. */ +- *ezp = ez; +- return E; ++} ++ ++/* Helper function sigma as defined in ++ * "New cardinality estimation algorithms for HyperLogLog sketches" ++ * Otmar Ertl, arXiv:1702.01284 */ ++double hllSigma(double x) { ++ if (x == 1.) return INFINITY; ++ double zPrime; ++ double y = 1; ++ double z = x; ++ do { ++ x *= x; ++ zPrime = z; ++ z += x * y; ++ y += y; ++ } while(zPrime != z); ++ return z; ++} ++ ++/* Helper function tau as defined in ++ * "New cardinality estimation algorithms for HyperLogLog sketches" ++ * Otmar Ertl, arXiv:1702.01284 */ ++double hllTau(double x) { ++ if (x == 0. || x == 1.) return 0.; ++ double zPrime; ++ double y = 1.0; ++ double z = 1 - x; ++ do { ++ x = sqrt(x); ++ zPrime = z; ++ y *= 0.5; ++ z -= pow(1 - x, 2)*y; ++ } while(zPrime != z); ++ return z / 3; + } + + /* Return the approximated cardinality of the set based on the harmonic +@@ -988,49 +1007,34 @@ double hllRawSum(uint8_t *registers, double *PE, int *ezp) { + * keys (no need to work with 6-bit integers encoding). */ + uint64_t hllCount(struct hllhdr *hdr, int *invalid) { + double m = HLL_REGISTERS; +- double E, alpha = 0.7213/(1+1.079/m); +- int j, ez; /* Number of registers equal to 0. */ +- +- /* We precompute 2^(-reg[j]) in a small table in order to +- * speedup the computation of SUM(2^-register[0..i]). */ +- static int initialized = 0; +- static double PE[64]; +- if (!initialized) { +- PE[0] = 1; /* 2^(-reg[j]) is 1 when m is 0. */ +- for (j = 1; j < 64; j++) { +- /* 2^(-reg[j]) is the same as 1/2^reg[j]. */ +- PE[j] = 1.0/(1ULL << j); +- } +- initialized = 1; +- } ++ double E; ++ int j; ++ double alphaInf = 0.5 / log(2.); ++ int regHisto[HLL_Q+2] = {0}; + +- /* Compute SUM(2^-register[0..i]). */ ++ /* Compute register histogram */ + if (hdr->encoding == HLL_DENSE) { +- E = hllDenseSum(hdr->registers,PE,&ez); ++ hllDenseRegHisto(hdr->registers,regHisto); + } else if (hdr->encoding == HLL_SPARSE) { +- E = hllSparseSum(hdr->registers, +- sdslen((sds)hdr)-HLL_HDR_SIZE,PE,&ez,invalid); ++ hllSparseRegHisto(hdr->registers, ++ sdslen((sds)hdr)-HLL_HDR_SIZE,invalid,regHisto); + } else if (hdr->encoding == HLL_RAW) { +- E = hllRawSum(hdr->registers,PE,&ez); ++ hllRawRegHisto(hdr->registers,regHisto); + } else { + serverPanic("Unknown HyperLogLog encoding in hllCount()"); + } + +- /* Apply loglog-beta to the raw estimate. See: +- * "LogLog-Beta and More: A New Algorithm for Cardinality Estimation +- * Based on LogLog Counting" Jason Qin, Denys Kim, Yumei Tung +- * arXiv:1612.02284 */ +- double zl = log(ez + 1); +- double beta = -0.370393911*ez + +- 0.070471823*zl + +- 0.17393686*pow(zl,2) + +- 0.16339839*pow(zl,3) + +- -0.09237745*pow(zl,4) + +- 0.03738027*pow(zl,5) + +- -0.005384159*pow(zl,6) + +- 0.00042419*pow(zl,7); +- +- E = llroundl(alpha*m*(m-ez)*(1/(E+beta))); ++ /* Estimate cardinality form register histogram. See: ++ * "New cardinality estimation algorithms for HyperLogLog sketches" ++ * Otmar Ertl, arXiv:1702.01284 */ ++ double z = m * hllTau((m-regHisto[HLL_Q+1])/(double)m); ++ for (j = HLL_Q; j >= 1; --j) { ++ z += regHisto[j]; ++ z *= 0.5; ++ } ++ z += m * hllSigma(regHisto[0]/(double)m); ++ E = llroundl(alphaInf*m*m/z); ++ + return (uint64_t) E; + } + |