JAVA实现较完善的布隆过滤器的示例代码

布隆过滤器是可以用于判断一个元素是不是在一个集合里，并且相比于其它的数据结构，布隆过滤器在空间和时间方面都有巨大的优势。布隆过滤器存储空间和插入/查询时间都是常数。但是它也是拥有一定的缺点：布隆过滤器是有一定的误识别率以及删除困难的。本文中给出的布隆过滤器的实现，基本满足了日常使用所需要的功能。

先简单来说一下布隆过滤器。其实现方法就是：利用内存中一个长度为m的位数组b并初始化里面的所有位都为0，如下面的表格所示：

然后我们根据h个不同的散列函数，对传进来的字符串进行散列，并且每次的散列结果都不能大于位数组的长度。布隆过滤器的误判率取决于你使用多少个不同的散列函数，下面给出的代码中，给出了一些参考的误判率（参考代码中的枚举类：misjudgmentrate）。现在我们先假定有4个不同散列函数，传入一个字符串并进行一次插入操作，这时会进行4次散列，假设到了4个不同的下标，这个时候我们就会去数组中，将这些下标的位置置为1，数组变更为：

如果接下来我们再传入同一个字符串时，因为4次的散列结果都是跟上一次一样的，所以会得出跟上面一样的结果，所有应该置1的位都已经置1了，这个时候我们就可以认为这个字符串是已经存在的了。因此不难发现，这是会存在一定的误判率的，具体由你采用的散列函数质量，以及散列函数的数量确定。

代码如下：

								
									 import   java.io.fileinputstream; 

									 import   java.io.fileoutputstream; 

									 import   java.io.objectinputstream; 

									 import   java.io.objectoutputstream; 

									 import   java.io.serializable; 

									 import   java.util.bitset; 

									 import   java.util.concurrent.atomic.atomicinteger; 

									 public   class   bloomfileter   implements   serializable { 

									    private   static   final   long   serialversionuid = -5221305273707291280l; 

									    private   final   int  [] seeds; 

									    private   final   int   size; 

									    private   final   bitset notebook; 

									    private   final   misjudgmentrate rate; 

									    private   final   atomicinteger usecount =   new   atomicinteger(  0  ); 

									    private   final   double   autoclearrate; 

									    /** 

									    * 默认中等程序的误判率：misjudgmentrate.middle 以及不自动清空数据（性能会有少许提升） 

									    *  

									    * @param datacount 

									    *      预期处理的数据规模，如预期用于处理1百万数据的查重，这里则填写1000000 

									    */ 

									    public   bloomfileter(  int   datacount) { 

									    this  (misjudgmentrate.middle, datacount,   null  ); 

									    } 

									    /** 

									    *  

									    * @param rate 

									    *      一个枚举类型的误判率 

									    * @param datacount 

									    *      预期处理的数据规模，如预期用于处理1百万数据的查重，这里则填写1000000 

									    * @param autoclearrate 

									    *      自动清空过滤器内部信息的使用比率，传null则表示不会自动清理， 

									    *      当过滤器使用率达到100%时，则无论传入什么数据，都会认为在数据已经存在了 

									    *      当希望过滤器使用率达到80%时自动清空重新使用，则传入0.8 

									    */ 

									    public   bloomfileter(misjudgmentrate rate,   int   datacount,   double   autoclearrate) { 

									    long   bitsize = rate.seeds.length * datacount; 

									    if   (bitsize <   0   || bitsize > integer.max_value) { 

									     throw   new   runtimeexception(  "位数太大溢出了，请降低误判率或者降低数据大小"  ); 

									    } 

									    this  .rate = rate; 

									    seeds = rate.seeds; 

									    size = (  int  ) bitsize; 

									    notebook =   new   bitset(size); 

									    this  .autoclearrate = autoclearrate; 

									    } 

									    public   void   add(string data) { 

									    checkneedclear(); 

									    for   (  int   i =   0  ; i < seeds.length; i++) { 

									     int   index = hash(data, seeds[i]); 

									     settrue(index); 

									    } 

									    } 

									    public   boolean   check(string data) { 

									    for   (  int   i =   0  ; i < seeds.length; i++) { 

									     int   index = hash(data, seeds[i]); 

									     if   (!notebook.get(index)) { 

									     return   false  ; 

									     } 

									    } 

									    return   true  ; 

									    } 

									    /** 

									    * 如果不存在就进行记录并返回false，如果存在了就返回true 

									    *  

									    * @param data 

									    * @return 

									    */ 

									    public   boolean   addifnotexist(string data) { 

									    checkneedclear(); 

									    int  [] indexs =   new   int  [seeds.length]; 

									    // 先假定存在 

									    boolean   exist =   true  ; 

									    int   index; 

									    for   (  int   i =   0  ; i < seeds.length; i++) { 

									     indexs[i] = index = hash(data, seeds[i]); 

									     if   (exist) { 

									     if   (!notebook.get(index)) { 

									      // 只要有一个不存在，就可以认为整个字符串都是第一次出现的 

									      exist =   false  ; 

									      // 补充之前的信息 

									      for   (  int   j =   0  ; j <= i; j++) { 

									      settrue(indexs[j]); 

									      } 

									     } 

									     }   else   { 

									     settrue(index); 

									     } 

									    } 

									    return   exist; 

									    } 

									    private   void   checkneedclear() { 

									    if   (autoclearrate !=   null  ) { 

									     if   (getuserate() >= autoclearrate) { 

									     synchronized   (  this  ) { 

									      if   (getuserate() >= autoclearrate) { 

									      notebook.clear(); 

									      usecount.set(  0  ); 

									      } 

									     } 

									     } 

									    } 

									    } 

									    public   void   settrue(  int   index) { 

									    usecount.incrementandget(); 

									    notebook.set(index,   true  ); 

									    } 

									    private   int   hash(string data,   int   seeds) { 

									    char  [] value = data.tochararray(); 

									    int   hash =   0  ; 

									    if   (value.length >   0  ) { 

									     for   (  int   i =   0  ; i < value.length; i++) { 

									     hash = i * hash + value[i]; 

									     } 

									    } 

									    hash = hash * seeds % size; 

									    // 防止溢出变成负数 

									    return   math.abs(hash); 

									    } 

									    public   double   getuserate() { 

									    return   (  double  ) usecount.intvalue() / (  double  ) size; 

									    } 

									    public   void   savefiltertofile(string path) { 

									    try   (objectoutputstream oos =   new   objectoutputstream(  new   fileoutputstream(path))) { 

									     oos.writeobject(  this  ); 

									    }   catch   (exception e) { 

									     throw   new   runtimeexception(e); 

									    } 

									    } 

									    public   static   bloomfileter readfilterfromfile(string path) { 

									    try   (objectinputstream ois =   new   objectinputstream(  new   fileinputstream(path))) { 

									     return   (bloomfileter) ois.readobject(); 

									    }   catch   (exception e) { 

									     throw   new   runtimeexception(e); 

									    } 

									    } 

									    /** 

									    * 清空过滤器中的记录信息 

									    */ 

									    public   void   clear() { 

									    usecount.set(  0  ); 

									    notebook.clear(); 

									    } 

									    public   misjudgmentrate getrate() { 

									    return   rate; 

									    } 

									    /** 

									    * 分配的位数越多，误判率越低但是越占内存 

									    *  

									    * 4个位误判率大概是0.14689159766308 

									    *  

									    * 8个位误判率大概是0.02157714146322 

									    *  

									    * 16个位误判率大概是0.00046557303372 

									    *  

									    * 32个位误判率大概是0.00000021167340 

									    *  

									    * @author lianghaohui 

									    * 

									    */ 

									    public   enum   misjudgmentrate { 

									    // 这里要选取质数，能很好的降低错误率 

									    /** 

									     * 每个字符串分配4个位 

									     */ 

									    very_small(  new   int  [] {   2  ,   3  ,   5  ,   7   }), 

									    /** 

									     * 每个字符串分配8个位 

									     */ 

									    small(  new   int  [] {   2  ,   3  ,   5  ,   7  ,   11  ,   13  ,   17  ,   19   }),   // 

									    /** 

									     * 每个字符串分配16个位 

									     */ 

									    middle(  new   int  [] {   2  ,   3  ,   5  ,   7  ,   11  ,   13  ,   17  ,   19  ,   23  ,   29  ,   31  ,   37  ,   41  ,   43  ,   47  ,   53   }),   // 

									    /** 

									     * 每个字符串分配32个位 

									     */ 

									    high(  new   int  [] {   2  ,   3  ,   5  ,   7  ,   11  ,   13  ,   17  ,   19  ,   23  ,   29  ,   31  ,   37  ,   41  ,   43  ,   47  ,   53  ,   59  ,   61  ,   67  ,   71  ,   73  ,   79  ,   83  ,   89  ,   97  , 

									     101  ,   103  ,   107  ,   109  ,   113  ,   127  ,   131   }); 

									    private   int  [] seeds; 

									    private   misjudgmentrate(  int  [] seeds) { 

									     this  .seeds = seeds; 

									    } 

									    public   int  [] getseeds() { 

									     return   seeds; 

									    } 

									    public   void   setseeds(  int  [] seeds) { 

									     this  .seeds = seeds; 

									    } 

									    } 

									    public   static   void   main(string[] args) { 

									    bloomfileter fileter =   new   bloomfileter(  7  ); 

									    system.out.println(fileter.addifnotexist(  "1111111111111"  )); 

									    system.out.println(fileter.addifnotexist(  "2222222222222222"  )); 

									    system.out.println(fileter.addifnotexist(  "3333333333333333"  )); 

									    system.out.println(fileter.addifnotexist(  "444444444444444"  )); 

									    system.out.println(fileter.addifnotexist(  "5555555555555"  )); 

									    system.out.println(fileter.addifnotexist(  "6666666666666"  )); 

									    system.out.println(fileter.addifnotexist(  "1111111111111"  )); 

									    fileter.savefiltertofile(  "c:\\users\\john\\desktop\\1111\\11.obj"  ); 

									    fileter = readfilterfromfile(  "c:\\users\\john\\desktop\\111\\11.obj"  ); 

									    system.out.println(fileter.getuserate()); 

									    system.out.println(fileter.addifnotexist(  "1111111111111"  )); 

									    } 

									 }

以上就是本文的全部内容，希望对大家的学习有所帮助，也希望大家多多支持。

原文链接：https://blog.csdn.net/u014653197/article/details/76397037

查看更多关于JAVA实现较完善的布隆过滤器的示例代码的详细内容...

声明：本文来自网络，不代表【好得很程序员自学网】立场，转载请注明出处：http://www.haodehen.cn/did250398

更新时间：2023-07-13 阅读：41次