基于Hadoop实现Knn算法

knn算法的核心思想是如果一个样本在特征空间中的k个最相邻的样本中的大多数属于某一个类别，则该样本也属于这个类别，并具有这个类别上样本的特性。该方法在确定分类决策上只依据最邻近的一个或者几个样本的类别来决定待分样本所属的类别。knn方法在类别决策时，只与极少量的相邻样本有关。由于knn方法主要靠周围有限的邻近的样本，而不是靠判别类域的方法来确定所属类别的，因此对于类域的交叉或重叠较多的待分样本集来说，knn方法较其他方法更为合适。

knn算法流程如下：

1. 计算当前测试数据与训练数据中的每条数据的距离

2. 圈定距离最近的k个训练对象，作为测试对象的近邻

3. 计算这k个训练对象中出现最多的那个类别，并将这个类别作为当前测试数据的类别

以上是knn的大致流程。严格按照这个流程实现的MapReduce（下文简称mr）程序效率并不高，还可以在此基础上进行优化；本文只给出按照该流程实现的mr版本。

mapper的设计：

由于测试数据相比于训练数据来说会小很多，因此用Java API读取测试数据并放到内存中，所以需要在setup中对测试数据进行初始化。在map中，计算当前这条训练数据与每条测试数据的距离。mapper的泛型参数为：<Object, Text, IntWritable, MyWritable>，依次是输入键、输入值、输出键、输出值的类型。map输出键类型为IntWritable，存放测试数据的下标；输出值类型为自定义的MyWritable，其中存放的是距离以及参与比较的那条训练数据的类别。

								
									 public   class   knnmapper   extends   mapper<object, text, intwritable,mywritable> { 

									    logger log = loggerfactory.getlogger(knnmapper.  class  ); 

									    private   list<  float  []> testdata; 

									    @override 

									    protected   void   setup(context context) 

									    throws   ioexception, interruptedexception { 

									    // todo auto-generated method stub 

									    configuration conf= context.getconfiguration(); 

									    conf.set(  "fs.defaultfs"  ,   "master:8020"  ); 

									    string testpath= conf.get(  "testfilepath"  ); 

									    path testdatapath=   new   path(testpath); 

									    filesystem fs = filesystem.get(conf); 

									    this  .testdata = readtestdata(fs,testdatapath); 

									    } 

									    @override 

									    protected   void   map(object key, text value, context context) 

									    throws   ioexception, interruptedexception { 

									    // todo auto-generated method stub 

									    string[] line = value.tostring().split(  ","  ); 

									    float  [] traindata =   new   float  [line.length-  1  ]; 

									    for  (  int   i=  0  ;i<traindata.length;i++){ 

									    traindata[i] =   float  .valueof(line[i]); 

									    log.info(  "训练数据："  +line[i]+  "类别："  +line[line.length-  1  ]); 

									    } 

									    for  (  int   i=  0  ; i<   this  .testdata.size();i++){ 

									    float  [] testi =   this  .testdata.get(i); 

									    float   distance = outh(testi, traindata); 

									    log.info(  "距离："  +distance); 

									    context.write(  new   intwritable(i),   new   mywritable(distance, line[line.length-  1  ])); 

									    } 

									    } 

									    private   list<  float  []> readtestdata(filesystem fs,path path)   throws   ioexception { 

									    //补充代码完整 

									    fsdatainputstream data = fs.open(path); 

									    bufferedreader bf =   new   bufferedreader(  new   inputstreamreader(data)); 

									    string line =   ""  ; 

									    list<  float  []> list =   new   arraylist<>(); 

									    while   ((line = bf.readline()) !=   null  ) { 

									    string[] items = line.split(  ","  ); 

									    float  [] item =   new   float  [items.length]; 

									    for  (  int   i=  0  ;i<items.length;i++){ 

									    item[i] =   float  .valueof(items[i]); 

									    } 

									    list.add(item); 

									    } 

									    return   list; 

									    } 

									    // 计算欧式距离 

									    private   static   float   outh(  float  [] testdata,   float  [] indata) { 

									    float   distance =  0  .0f; 

									    for  (  int   i=  0  ;i<testdata.length;i++){ 

									    distance += (testdata[i]-indata[i])*(testdata[i]-indata[i]); 

									    } 

									    distance = (  float  )math.sqrt(distance); 

									    return   distance; 

									    } 

									 }

自定义值类型mywritable如下：

								
									 public   class   mywritable   implements   writable{ 

									    private   float   distance; 

									    private   string label; 

									    public   mywritable() { 

									    // todo auto-generated constructor stub 

									    } 

									    public   mywritable(  float   distance, string label){ 

									    this  .distance = distance; 

									    this  .label = label; 

									    } 

									    @override 

									    public   string tostring() { 

									    // todo auto-generated method stub 

									    return   this  .distance+  ","  +  this  .label; 

									    } 

									    @override 

									    public   void   write(dataoutput out)   throws   ioexception { 

									    // todo auto-generated method stub 

									    out.writefloat(distance); 

									    out.writeutf(label); 

									    } 

									    @override 

									    public   void   readfields(datainput in)   throws   ioexception { 

									    // todo auto-generated method stub 

									    this  .distance = in.readfloat(); 

									    this  .label = in.readutf(); 

									    } 

									    public   float   getdistance() { 

									    return   distance; 

									    } 

									    public   void   setdistance(  float   distance) { 

									    this  .distance = distance; 

									    } 

									    public   string getlabel() { 

									    return   label; 

									    } 

									    public   void   setlabel(string label) { 

									    this  .label = label; 

									    } 

									 }

在reducer端，需要先在setup中初始化参数k，也就是圈定距离最近的k个训练对象时所用的k值。在reduce中，需要把同一条测试数据对应的所有距离按从小到大排序，选取前k条数据，再统计这k条数据中出现次数最多的类别，并将该类别与测试数据的下标对应起来，以<key, value>的形式输出到HDFS上。

								
									 public   class   knnreducer   extends   reducer<intwritable, mywritable, intwritable, text> { 

									    private   int   k; 

									    @override 

									    protected   void   setup(context context) 

									    throws   ioexception, interruptedexception { 

									    // todo auto-generated method stub 

									    this  .k = context.getconfiguration().getint(  "k"  ,   5  ); 

									    } 

									    @override 

									    /*** 

									    * key => 0 

									    * values =>([1,lable1],[2,lable2],[3,label2],[2.5,lable2]) 

									    */ 

									    protected   void   reduce(intwritable key, iterable<mywritable> values, 

									    context context)   throws   ioexception, interruptedexception { 

									    // todo auto-generated method stub 

									    mywritable[] mywrit =   new   mywritable[k]; 

									    for  (  int   i=  0  ;i<k;i++){ 

									    mywrit[i] =   new   mywritable(  float  .max_value,   "-1"  ); 

									    } 

									    // 找出距离最小的前k个 

									    for   (mywritable m : values) { 

									    float   distance = m.getdistance(); 

									    string label = m.getlabel(); 

									    for  (mywritable m1: mywrit){ 

									    if   (distance < m1.getdistance()){ 

									     m1.setdistance(distance); 

									     m1.setlabel(label); 

									    } 

									    } 

									    } 

									    // 找出前k个中，出现次数最多的类别 

									    string[] testclass =   new   string[k]; 

									    for  (  int   i=  0  ;i<k;i++){ 

									    testclass[i] = mywrit[i].getlabel(); 

									    } 

									    string countmost = mostele(testclass); 

									    context.write(key,   new   text(countmost)); 

									    } 

									    public   static   string mostele(string[] strarray) {  

									     hashmap<string, integer> map =   new   hashmap<>();  

									     for   (  int   i =   0  ; i < strarray.length; i++) { 

									    string str = strarray[i]; 

									      if   (map.containskey(str)) { 

									    int   tmp = map.get(str); 

									    map.put(str, tmp+  1  ); 

									    }  else  { 

									    map.put(str,   1  ); 

									    } 

									    } 

									     // 得到hashmap中值最大的键，也就是出现次数最多的类别 

									     collection<integer> count = map.values(); 

									     int   maxcount = collections.max(count); 

									     string maxstring =   ""  ; 

									     for  (map.entry<string, integer> entry: map.entryset()){ 

									      if   (maxcount == entry.getvalue()) { 

									    maxstring = entry.getkey(); 

									    } 

									     } 

									     return   maxstring;  

									    } 

									 }

最后输出结果如下：

以上就是本文的全部内容，希望对大家的学习有所帮助，也希望大家多多支持。

原文链接：https://blog.csdn.net/Angelababy_huan/article/details/53045579

查看更多关于基于Hadoop实现Knn算法的详细内容...

声明：本文来自网络，不代表【好得很程序员自学网】立场，转载请注明出处：http://www.haodehen.cn/did249675

更新时间：2023-07-10 阅读：28次