1
1
/********************************************************************
4
G. Cormode 2003,2004, 2010, 2012
6
Updated: 2004-06 Added a floating point sketch and support for
6
Updated: 2004-06 Added a floating point sketch and support for
7
7
inner product point estimation
8
8
Initial version: 2003-12
10
This work is licensed under the Creative Commons
11
Attribution-NonCommercial License. To view a copy of this license,
12
visit http://creativecommons.org/licenses/by-nc/1.0/ or send a letter
13
to Creative Commons, 559 Nathan Abbott Way, Stanford, California
11
This program is free software; you can redistribute it and/or modify
12
it under the terms of the GNU General Public License as published by
13
the Free Software Foundation; either version 2 of the License, or
14
(at your option) any later version.
16
This program is distributed in the hope that it will be useful,
17
but WITHOUT ANY WARRANTY; without even the implied warranty of
18
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
GNU General Public License for more details.
21
You should have received a copy of the GNU General Public License along
22
with this program; if not, write to the Free Software Foundation, Inc.,
23
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
15
24
*********************************************************************/
17
26
#include <stdlib.h>
18
28
#include "countmin.h"
20
30
#define min(x,y) ((x) < (y) ? (x) : (y))
23
33
double eps; /* 1+epsilon = approximation factor */
24
34
double delta; /* probability of failure */
28
36
/************************************************************************/
29
37
/* Routines to support Count-Min sketches */
30
38
/************************************************************************/
38
46
cm=(CM_type *) malloc(sizeof(CM_type));
39
prng=prng_Init(-abs(seed),2);
47
prng=prng_Init(-abs(seed),2);
40
48
// initialize the generator to pick the hash functions
47
cm->prng = prng; /* L. Deri */
48
55
cm->counts=(int **)calloc(sizeof(int *),cm->depth);
49
56
cm->counts[0]=(int *)calloc(sizeof(int), cm->depth*cm->width);
50
57
cm->hasha=(unsigned int *)calloc(sizeof(unsigned int),cm->depth);
190
int CM_InnerProd(CM_type * cm1, CM_type * cm2)
198
long long CM_InnerProd(CM_type * cm1, CM_type * cm2)
191
199
{ // Estimate the inner product of two vectors by comparing their sketches
192
int i,j, tmp, result;
201
long long result, tmp;
195
204
if (CM_Compatible(cm1,cm2))
220
long long CM_F2Est(CM_type * cm)
221
{ // Estimate the second frequency moment of the stream
223
long long result, tmp, *ans;
226
ans=(long long *) calloc(1+cm->depth,sizeof(long long));
228
for (j=0;j<cm->depth;j++)
231
for (i=0;i<cm->width;i+=2)
233
tmp=(cm->counts[j][i]-cm->counts[j][i+1]);
238
result=LLMedSelect((cm->depth+1)/2,cm->depth,ans);
210
243
int CM_Residue(CM_type * cm, unsigned int * Q)
212
// CM_Residue computes the sum of everything left after the points
245
// CM_Residue computes the sum of everything left after the points
213
246
// from Q have been removed
214
247
// Q is a list of points, where Q[0] gives the length of the list
225
258
for (i=0;i<cm->width;i++)
260
for (i=1;i<(int) Q[0];i++)
228
261
bitmap[hash31(cm->hasha[j],cm->hashb[j],Q[i]) % cm->width]=1;
229
262
for (i=0;i<cm->width;i++)
230
263
if (bitmap[i]==0) nextest+=cm->counts[j][i];
246
279
cm=(CMF_type *) malloc(sizeof(CMF_type));
248
prng=prng_Init(-abs(seed),2);
281
prng=prng_Init(-abs(seed),2);
249
282
// initialize the generator to pick the hash functions
339
372
// this can be done more efficiently if the width is a power of two
342
int CMF_PointEst(CMF_type * cm, unsigned int query)
375
double CMF_PointEst(CMF_type * cm, unsigned int query)
344
377
// return an estimate of the count of an item by taking the minimum
347
381
if (!cm) return 0;
348
382
ans=cm->counts[0][hash31(cm->hasha[0],cm->hashb[0],query) % cm->width];
381
415
loc=hash31(cm1->hasha[j],cm1->hashb[j],query) % cm1->width;
382
416
tmp=cm1->counts[j][loc]*cm2->counts[j][loc];
389
423
double CMF_InnerProd(CMF_type * cm1, CMF_type * cm2)
390
424
{ // Estimate the inner product of two vectors by comparing their sketches
414
448
CMH_type * CMH_Init(int width, int depth, int U, int gran)
416
// initialize a hierarchical set of sketches for range queries
450
// initialize a hierarchical set of sketches for range queries
417
451
// heavy hitters or quantiles
424
458
// U is the log the size of the universe in bits
426
460
if (gran>U || gran<1) return(NULL);
427
// gran is the granularity to look at the universe in
461
// gran is the granularity to look at the universe in
428
462
// check that the parameters make sense...
430
cmh=(CMH_type *) malloc(sizeof(CMH_type));
464
cmh=(CMH_type *) calloc(1,sizeof(CMH_type));
432
466
prng=prng_Init(-12784,2);
433
467
// initialize the generator for picking the hash functions
442
476
cmh->levels=(int) ceil(((float) U)/((float) gran));
443
477
for (j=0;j<cmh->levels;j++)
444
if (1<<(cmh->gran*j) <= cmh->depth*cmh->width)
478
if ((long long) 1<<(cmh->gran*j) <= cmh->depth*cmh->width)
446
480
//find the level up to which it is cheaper to keep exact counts
447
481
cmh->freelim=cmh->levels-cmh->freelim;
449
483
cmh->counts=(int **) calloc(sizeof(int *), 1+cmh->levels);
450
484
cmh->hasha=(unsigned int **)calloc(sizeof(unsigned int *),1+cmh->levels);
451
485
cmh->hashb=(unsigned int **)calloc(sizeof(unsigned int *),1+cmh->levels);
455
489
if (i>=cmh->freelim)
456
490
{ // allocate space for representing things exactly at high levels
457
cmh->counts[i]=calloc(1<<(cmh->gran*j),sizeof(int));
491
cmh->counts[i]=(int *) calloc(1<<(cmh->gran*j),sizeof(int));
459
493
cmh->hasha[i]=NULL;
460
494
cmh->hashb[i]=NULL;
463
497
{ // allocate space for a sketch
464
498
cmh->counts[i]=(int *)calloc(sizeof(int), cmh->depth*cmh->width);
465
499
cmh->hasha[i]=(unsigned int *)
515
551
if (i>=cmh->freelim)
552
{ // level 0 = leaves, higher levels = internal nodes
517
553
cmh->counts[i][item]+=diff;
518
// keep exact counts at high levels in the hierarchy
554
// keep exact counts at high levels in the hierarchy
521
557
for (j=0;j<cmh->depth;j++)
523
cmh->counts[i][(hash31(cmh->hasha[i][j],cmh->hashb[i][j],item)
559
cmh->counts[i][(hash31(cmh->hasha[i][j],cmh->hashb[i][j],item)
524
560
% cmh->width) + offset]+=diff;
525
561
// this can be done more efficiently if the width is a power of two
526
562
offset+=cmh->width;
545
581
return(admin + hashes + counts);
548
int CMH_count(CMH_type * cmh, int depth, int item)
584
int CMH_count(CMH_type * cmh, int depth, unsigned int item)
550
586
// return an estimate of item at level depth
561
597
// else, use the appropriate sketch to make an estimate
563
599
estimate=cmh->counts[depth][(hash31(cmh->hasha[depth][0],
564
cmh->hashb[depth][0],item)
600
cmh->hashb[depth][0],item)
565
601
% cmh->width) + offset];
566
602
for (j=1;j<cmh->depth;j++)
568
604
offset+=cmh->width;
569
605
estimate=min(estimate,
570
606
cmh->counts[depth][(hash31(cmh->hasha[depth][j],
571
cmh->hashb[depth][j],item)
607
cmh->hashb[depth][j],item)
572
608
% cmh->width) + offset]);
574
610
return(estimate);
577
void CMH_recursive(CMH_type * cmh, int depth, int start,
613
void CMH_recursive(CMH_type * cmh, int depth, int start,
578
614
int thresh, unsigned int * results)
580
// for finding heavy hitters, recursively descend looking
616
// for finding heavy hitters, recursively descend looking
581
617
// for ranges that exceed the threshold
588
624
estcount=CMH_count(cmh,depth,start);
589
if (estcount>=thresh)
625
if (estcount>=thresh)
593
if (results[0]<cmh->width)
629
if (results[0]<(unsigned int) cmh->width)
596
632
results[results[0]]=start;
621
int CMH_Rangesum(CMH_type * cmh, int start, int end)
657
int CMH_Rangesum(CMH_type * cmh, long long start, long long end)
623
// compute a range sum:
659
// compute a range sum:
624
660
// start at bottom level
625
661
// compute any estimates needed at each level
628
int leftend,rightend,i,depth, result, topend;
665
long long range, leftend, rightend;
668
topend=((long long) 1)<<cmh->U;
631
669
end=min(topend,end);
632
if ((end>topend) && (start==0))
670
if ((end>topend) && (start==0)) {
633
671
return cmh->count;
635
674
end+=1; // adjust for end effects
637
676
for (depth=0;depth<=cmh->levels;depth++)
639
678
if (start==end) break;
640
if ((end-start+1)<(1<<cmh->gran))
641
{ // at the highest level, avoid overcounting
680
if ((unsigned int) (end-start+1)<(((unsigned int) 1)<<cmh->gran))
681
{ // if only a few nodes to probe at this level, probe them all
642
682
for (i=start;i<end;i++)
643
683
result+=CMH_count(cmh,depth,i);
647
687
{ // figure out what needs to be done at each end
648
688
leftend=(((start>>cmh->gran)+1)<<cmh->gran) - start;
689
if (leftend>= 1<<cmh->gran) leftend=0;
649
690
rightend=(end)-((end>>cmh->gran)<<cmh->gran);
650
691
if ((leftend>0) && (start<end))
651
692
for (i=0;i<leftend;i++)
668
int CMH_FindRange(CMH_type * cmh, int sum)
709
long long CMH_FindRange(CMH_type * cmh, int sum)
670
unsigned long low, high, mid=0, est;
711
long long low, high, mid=0;
672
713
// find a range starting from zero that adds up to sum
674
715
if (cmh->count<sum) return 1<<(cmh->U);
717
high=((long long) 1)<<cmh->U;
677
719
for (i=0;i<cmh->U;i++)
679
721
mid=(low+high)/2;
680
est=CMH_Rangesum(cmh,0,mid);
722
est=CMH_Rangesum(cmh,0,(unsigned int) mid);
690
int CMH_AltFindRange(CMH_type * cmh, int sum)
732
long long CMH_AltFindRange(CMH_type * cmh, int sum)
692
unsigned long low, high, mid=0, est, top;
734
long long low, high, mid=0,top;
694
736
// find a range starting from the right hand side that adds up to sum
696
738
if (cmh->count<sum) return 1<<(cmh->U);
740
top=((long long) 1)<<cmh->U;
700
for (i=0;i<cmh->U;i++)
742
for (i=0;i<cmh->U;i++) {
702
743
mid=(low+high)/2;
703
744
est=CMH_Rangesum(cmh,mid,top);
713
int CMH_Quantile(CMH_type * cmh, float frac)
754
long long CMH_Quantile(CMH_type * cmh, float frac)
715
756
// find a quantile by doing the appropriate range search
716
757
if (frac<0) return 0;
718
759
return 1<<cmh->U;
719
return ((CMH_FindRange(cmh,cmh->count*frac)+
720
CMH_AltFindRange(cmh,cmh->count*(1-frac)))/2);
760
return ((CMH_FindRange(cmh,(long long) (cmh->count*frac))+
761
CMH_AltFindRange(cmh,(long long) (cmh->count*(1-frac))))/2);
721
762
// each result gives a lower/upper bound on the location of the quantile
722
763
// with high probability, these will be close: only a small number of values
723
// will be between the estimates.
764
// will be between the estimates.
726
767
long long CMH_F2Est(CMH_type * cmh)