#include <iostream>
#include <fstream>
#include <random>
#include <chrono>
#include <kll_sketch.hpp>
//simplified file operations and no error handling for clarity
int main(int argc, char **argv) {
// this section generates two sketches from random data and serializes them into files
{
std::default_random_engine generator(std::chrono::system_clock::now().time_since_epoch().count());
std::normal_distribution<float> nd(0, 1); // mean=0, stddev=1
datasketches::kll_sketch<float> sketch1; // default k=200
for (int i = 0; i < 10000; i++) {
sketch1.update(nd(generator)); // mean=0, stddev=1
}
std::ofstream os1("kll_sketch_float1.bin");
sketch1.serialize(os1);
datasketches::kll_sketch<float> sketch2; // default k=200
for (int i = 0; i < 10000; i++) {
sketch2.update(nd(generator) + 1); // shift the mean for the second sketch
}
std::ofstream os2("kll_sketch_float2.bin");
sketch2.serialize(os2);
}
// this section deserializes the sketches, produces a union and prints some results
{
std::ifstream is1("kll_sketch_float1.bin");
auto sketch1 = datasketches::kll_sketch<float>::deserialize(is1);
std::ifstream is2("kll_sketch_float2.bin");
auto sketch2 = datasketches::kll_sketch<float>::deserialize(is2);
// we could merge sketch2 into sketch1 or the other way around
// this is an example of using a new sketch as a union and keeping the original sketches intact
datasketches::kll_sketch<float> u; // default k=200
u.merge(sketch1);
u.merge(sketch2);
// Debug output
std::cout << u.to_string();
std::cout << "Min: " << u.get_min_item() << std::endl;
std::cout << "Max: " << u.get_max_item() << std::endl;
auto quantiles = u.get_quantiles((double[3]){0.5, 0.75, 0.9}, 3);
std::cout << "Quantiles: 0.5 (median), 0.75, 0.9:\n";
std::cout << quantiles[0] << ", " << quantiles[1] << ", " << quantiles[2] << std::endl;
std::cout << "Probability Histogram: estimated probability mass in 4 bins: (-inf, -2), [-2, 0), [0, 2), [2, +inf)" << std::endl;
const float split_points[] {-2, 0, 2};
const int num_split_points = 3;
auto pmf = u.get_PMF(split_points, num_split_points);
std::cout << pmf[0] << ", " << pmf[1] << ", " << pmf[2] << ", " << pmf[3] << std::endl;
std::cout << "Frequency Histogram: estimated number of original values in the same bins" << std::endl;
const int num_bins = num_split_points + 1;
int histogram[num_bins];
for (int i = 0; i < num_bins; i++) {
histogram[i] = pmf[i] * u.get_n(); // scale the fractions by the total count of values
}
std::cout << histogram[0] << ", " << histogram[1] << ", " << histogram[2] << ", " << histogram[3] << std::endl;
}
return 0;
}
Output (will be slightly different every time due to random input):
### KLL sketch summary:
K : 200
min K : 200
M : 8
N : 20000
Epsilon : 1.33%
Epsilon PMF : 1.65%
Empty : false
Estimation mode: true
Levels : 7
Sorted : false
Capacity items : 565
Retained items : 394
Min item : -3.75
Max item : 4.6
### End sketch summary
Min: -3.75359
Max: 4.60465
Quantiles: 0.5 (median), 0.75, 0.9:
0.508168, 1.25914, 1.938
Probability Histogram: estimated probability mass in 4 bins: (-inf, -2), [-2, 0), [0, 2), [2, +inf)
0.0118, 0.3134, 0.58475, 0.09005
Frequency Histogram: estimated number of original values in the same bins
236, 6268, 11695, 1800