API Snapshots: Java Core, C++ Core, Python, Memory, Pig, Hive,

KLL Sketch C++ Example

#include <iostream>
#include <fstream>
#include <random>
#include <chrono>

#include <kll_sketch.hpp>

//simplified file operations and no error handling for clarity
int main(int argc, char **argv) {
  // this section generates two sketches from random data and serializes them into files
  {
    std::default_random_engine generator(std::chrono::system_clock::now().time_since_epoch().count());
    std::normal_distribution<float> nd(0, 1); // mean=0, stddev=1

    datasketches::kll_sketch<float> sketch1; // default k=200
    for (int i = 0; i < 10000; i++) {
      sketch1.update(nd(generator)); // mean=0, stddev=1
    }
    std::ofstream os1("kll_sketch_float1.bin");
    sketch1.serialize(os1);

    datasketches::kll_sketch<float> sketch2; // default k=200
    for (int i = 0; i < 10000; i++) {
      sketch2.update(nd(generator) + 1); // shift the mean for the second sketch
    }
    std::ofstream os2("kll_sketch_float2.bin");
    sketch2.serialize(os2);
  }

  // this section deserializes the sketches, produces a union and prints some results
  {
    std::ifstream is1("kll_sketch_float1.bin");
    auto sketch1 = datasketches::kll_sketch<float>::deserialize(is1);

    std::ifstream is2("kll_sketch_float2.bin");
    auto sketch2 = datasketches::kll_sketch<float>::deserialize(is2);

    // we could merge sketch2 into sketch1 or the other way around
    // this is an example of using a new sketch as a union and keeping the original sketches intact
    datasketches::kll_sketch<float> u; // default k=200
    u.merge(sketch1);
    u.merge(sketch2);

    // Debug output
    std::cout << u.to_string();

    std::cout << "Min: " << u.get_min_item() << std::endl;
    std::cout << "Max: " << u.get_max_item() << std::endl;
    auto quantiles = u.get_quantiles((double[3]){0.5, 0.75, 0.9}, 3);
    std::cout << "Quantiles: 0.5 (median), 0.75, 0.9:\n";
    std::cout << quantiles[0] << ", " << quantiles[1] << ", " << quantiles[2] << std::endl;

    std::cout << "Probability Histogram: estimated probability mass in 4 bins: (-inf, -2), [-2, 0), [0, 2), [2, +inf)" << std::endl;
    const float split_points[] {-2, 0, 2};
    const int num_split_points = 3;
    auto pmf = u.get_PMF(split_points, num_split_points);
    std::cout << pmf[0] << ", " << pmf[1] << ", " << pmf[2] << ", " << pmf[3] << std::endl;

    std::cout << "Frequency Histogram: estimated number of original values in the same bins" << std::endl;
    const int num_bins = num_split_points + 1;
    int histogram[num_bins];
    for (int i = 0; i < num_bins; i++) {
      histogram[i] = pmf[i] * u.get_n(); // scale the fractions by the total count of values
    }
    std::cout << histogram[0] << ", " << histogram[1] << ", " << histogram[2] << ", " << histogram[3] << std::endl;
  }

  return 0;
}

Output (will be slightly different every time due to random input):
### KLL sketch summary:
   K              : 200
   min K          : 200
   M              : 8
   N              : 20000
   Epsilon        : 1.33%
   Epsilon PMF    : 1.65%
   Empty          : false
   Estimation mode: true
   Levels         : 7
   Sorted         : false
   Capacity items : 565
   Retained items : 394
   Min item      : -3.75
   Max item      : 4.6
### End sketch summary
Min: -3.75359
Max: 4.60465
Quantiles: 0.5 (median), 0.75, 0.9:
0.508168, 1.25914, 1.938
Probability Histogram: estimated probability mass in 4 bins: (-inf, -2), [-2, 0), [0, 2), [2, +inf)
0.0118, 0.3134, 0.58475, 0.09005
Frequency Histogram: estimated number of original values in the same bins
236, 6268, 11695, 1800