-- Make the DataSketches libraries available to this Hive session.
add jar datasketches-memory-2.0.0.jar;
add jar datasketches-java-3.1.0.jar;
add jar datasketches-hive-1.2.0.jar;

-- Bind short, session-local names to the CPC sketch functions used below.
create temporary function data2sketch
  as 'org.apache.datasketches.hive.cpc.DataToSketchUDAF';
create temporary function unionSketches
  as 'org.apache.datasketches.hive.cpc.UnionSketchUDAF';
create temporary function estimate
  as 'org.apache.datasketches.hive.cpc.GetEstimateUDF';
create temporary function estimateAndBounds
  as 'org.apache.datasketches.hive.cpc.GetEstimateAndErrorBoundsUDF';

-- Replace the placeholder with the name of the database to work in.
use <your-db-name-here>;

-- Sample input: ids 1..10 in category 'a' and ids 6..15 in category 'b',
-- so ids 6..10 appear in both categories.
create temporary table sketch_input (
  id int,
  category char(1)
);

insert into table sketch_input values
  (1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'),
  (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'),
  (6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'),
  (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b');

-- One serialized sketch per category, stored as a binary column.
create temporary table sketch_intermediate (
  category char(1),
  sketch binary
);

insert into sketch_intermediate
select
  category,
  data2sketch(id)
from sketch_input
group by category;

-- Estimated number of distinct ids in each category.
select
  category,
  estimate(sketch)
from sketch_intermediate;
Output:
a 10.007331400971685
b 10.007331400971685
-- Merge every stored sketch into one with the aggregate union function,
-- then estimate the number of distinct ids across all categories.
select
  estimate(unionSketches(sketch))
from sketch_intermediate;
Output:
15.017114660336853
-- Estimate over the union of all stored sketches, returned together with
-- its error bounds. Judging by the sample output, the array appears to be
-- [estimate, lower bound, upper bound] -- confirm against the UDF docs.
select
  estimateAndBounds(unionSketches(sketch))
from sketch_intermediate;
Output:
[15.017114660336853,15.0,16.0]
Notice the difference between UnionSketchUDF in the example below, which is a regular function taking two sketches as separate parameters, and UnionSketchUDAF in the example above, which is an aggregate function taking a collection of sketches as one parameter.
-- Make the DataSketches libraries available to this Hive session.
add jar datasketches-memory-2.0.0.jar;
add jar datasketches-java-3.1.0.jar;
add jar datasketches-hive-1.2.0.jar;

-- Bind short, session-local names to the CPC sketch functions used below.
-- union2 maps to UnionSketchUDF, which is called here with two sketch
-- arguments taken from the same row.
create temporary function data2sketch
  as 'org.apache.datasketches.hive.cpc.DataToSketchUDAF';
create temporary function union2
  as 'org.apache.datasketches.hive.cpc.UnionSketchUDF';
create temporary function estimate
  as 'org.apache.datasketches.hive.cpc.GetEstimateUDF';

-- Replace the placeholder with the name of the database to work in.
use <your-db-name-here>;

-- Sample input: id1 holds 1..10 and id2 holds the even numbers 2..20,
-- so the values 2, 4, 6, 8 and 10 occur in both columns.
create temporary table sketch_input (
  id1 int,
  id2 int
);

insert into table sketch_input values
  (1, 2), (2, 4), (3, 6), (4, 8), (5, 10),
  (6, 12), (7, 14), (8, 16), (9, 18), (10, 20);

-- A single row holding one sketch per input column.
create temporary table sketch_intermediate (
  sketch1 binary,
  sketch2 binary
);

insert into sketch_intermediate
select
  data2sketch(id1),
  data2sketch(id2)
from sketch_input;

-- Distinct-count estimates for each column and for the union of both.
select
  estimate(sketch1),
  estimate(sketch2),
  estimate(union2(sketch1, sketch2))
from sketch_intermediate;
Output:
10.007331400971685 10.007331400971685 15.017114660336853