API Snapshots: Java Core, C++ Core, Python, Memory, Pig, Hive

CPC Sketch C++ Example

#include <iostream>
#include <fstream>

#include <cpc_sketch.hpp>
#include <cpc_union.hpp>

// Example: build two overlapping CPC sketches, round-trip them through files,
// union them and print the distinct-count estimate with its bounds.
// File operations are simplified and there is no error handling, for clarity.
int main(int argc, char **argv) {
  const int lg_k = 10;

  // this section generates two sketches with some overlap and serializes them into files
  {
    // 100000 distinct keys: 0 .. 99999
    datasketches::cpc_sketch sketch1(lg_k);
    for (int key = 0; key < 100000; key++) sketch1.update(key);
    // binary mode: the serialized image is raw bytes, so newline translation
    // performed by text-mode streams on some platforms must be disabled
    std::ofstream os1("cpc_sketch1.bin", std::ios::binary);
    sketch1.serialize(os1);

    // 100000 distinct keys: 50000 .. 149999 (the upper half of the first range is shared)
    datasketches::cpc_sketch sketch2(lg_k);
    for (int key = 50000; key < 150000; key++) sketch2.update(key);
    std::ofstream os2("cpc_sketch2.bin", std::ios::binary);
    sketch2.serialize(os2);
  } // both output streams are closed here, before the files are read back

  // this section deserializes the sketches, produces union and prints the result
  {
    std::ifstream is1("cpc_sketch1.bin", std::ios::binary);
    auto sketch1 = datasketches::cpc_sketch::deserialize(is1);

    std::ifstream is2("cpc_sketch2.bin", std::ios::binary);
    auto sketch2 = datasketches::cpc_sketch::deserialize(is2);

    datasketches::cpc_union u(lg_k);
    u.update(sketch1);
    u.update(sketch2);
    auto sketch = u.get_result();

    // debug summary of the union result sketch; to_string() only builds the
    // text, so it has to be written to the output stream explicitly
    std::cout << sketch.to_string();

    // the argument 2 selects the bounds labelled as 95% confidence below
    std::cout << "Distinct count estimate: " << sketch.get_estimate() << '\n';
    std::cout << "Distinct count lower bound 95% confidence: " << sketch.get_lower_bound(2) << '\n';
    std::cout << "Distinct count upper bound 95% confidence: " << sketch.get_upper_bound(2) << '\n';
  }

  return 0;
}

Output:
### CPC sketch summary:
   lg_k           : 10
   seed hash      : 93cc
   C              : 7706
   flavor         : 4
   merged         : true
   intresting col : 4
   table entries  : 27
   window         : allocated
   window offset  : 5
### End sketch summary
Distinct count estimate: 149797
Distinct count lower bound 95% confidence: 143416
Distinct count upper bound 95% confidence: 156397