informations
[mldemos:ashwini_shuklas-mldemos.git] / _AlgorithmsPlugins / DBSCAN / clustererDBSCAN.cpp
1 /*********************************************************************\r
2 MLDemos: A User-Friendly visualization toolkit for machine learning\r
3 Copyright (C) 2010  Basilio Noris\r
4 Contact: mldemos@b4silio.com\r
5 \r
6 This library is free software; you can redistribute it and/or\r
7 modify it under the terms of the GNU Lesser General Public\r
8 License as published by the Free Software Foundation; either\r
9 version 2.1 of the License, or (at your option) any later version.\r
10 \r
11 This library is distributed in the hope that it will be useful,\r
12 but WITHOUT ANY WARRANTY; without even the implied warranty of\r
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\r
14 Library General Public License for more details.\r
15 \r
16 You should have received a copy of the GNU Lesser General Public\r
17 License along with this library; if not, write to the Free\r
18 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.\r
19 *********************************************************************/\r
20 #include "public.h"\r
21 #include "clustererDBSCAN.h"\r
22 #include <boost/foreach.hpp>\r
23 \r
24 using namespace std;\r
25 \r
26 \r
27 void ClustererDBSCAN::Train(std::vector< fvec > samples)\r
28 {\r
29     if(!samples.size()) return; // no sample :(\r
30 \r
31     // prepare the different array storing information about the clusters\r
32     _noise.resize(samples.size(), false);\r
33     _visited.resize(samples.size(), false);\r
34     _core.resize(samples.size(), false);\r
35     _pointId_to_clusterId.resize(samples.size(), 0);\r
36     _reachability.resize(samples.size(), -1);\r
37 \r
38     // convert from fvec to Point\r
39     for (int j = 0; j < samples.size(); ++j)\r
40     {\r
41         Point v (samples[j].size());\r
42         for (int i = 0; i < samples[j].size(); ++i)\r
43         {\r
44             v(i)=samples[j][i];\r
45         }\r
46         pts.push_back(v);\r
47     }\r
48 \r
49     // build the similarity matrix according to the selected metric\r
50     if(_metric == 0)\r
51     {\r
52         Metrics::Distance<Metrics::Cosine<Point> > d;\r
53         computeSimilarity(d,pts);\r
54     }\r
55     else\r
56     {\r
57         Metrics::Distance<Metrics::Euclidean<Point> > d;\r
58         computeSimilarity(d,pts);\r
59     }\r
60 \r
61     // run clustering\r
62 \r
63     if (_type>0) //OPTICS\r
64     {\r
65         run_optics(pts);\r
66 \r
67         if(_type>1) //OPTICS WP\r
68         {\r
69             find_clusters_WF(); // old name was for "water-filling"\r
70         }\r
71         else //OPTICS default\r
72         {\r
73             find_clusters();\r
74         }\r
75     }\r
76     else // DBSCAN\r
77     {\r
78         run_cluster(pts);\r
79     }\r
80 }\r
81 \r
82 fvec ClustererDBSCAN::Test( const fvec &sample)\r
83 {\r
84     fvec res;\r
85     res.resize(nbClusters+1,0);\r
86 \r
87     //convert input to Point\r
88     Point v (sample.size());\r
89     for (int i = 0; i < sample.size(); ++i)\r
90     {\r
91         v(i)=sample[i];\r
92     }\r
93 \r
94     // find the nearest point in our samples\r
95     int nearest = -1;\r
96     double dist = INFINITY;\r
97     double temp_d = 0;\r
98 \r
99     if (_type==0) //if DBSCAN we set _depth like _eps\r
100     {\r
101         _depth=_eps;\r
102     }\r
103     for (int j = 0; j < pts.size(); ++j)\r
104     {\r
105         // according to the selected metric\r
106         if(_metric == 0)\r
107         {\r
108             Metrics::Distance<Metrics::Cosine<Point> > d;\r
109             temp_d = d.distance(v, pts[j]);\r
110         }\r
111         else\r
112         {\r
113             Metrics::Distance<Metrics::Euclidean<Point> > d;\r
114             temp_d = d.distance(v, pts[j]);\r
115         }\r
116         if (temp_d < dist && temp_d < _eps && _pointId_to_clusterId[j] > 0 && _core[j]){\r
117             dist = temp_d;\r
118             nearest = j;\r
119         }\r
120     }\r
121 \r
122     // did we find something?\r
123     if (nearest > -1){\r
124         if (dist < _depth){ // is it near enough?\r
125             res[_pointId_to_clusterId[nearest]-1] = 1; //take the color of that cluster\r
126         }\r
127         else if (abs(dist - _eps) < _eps*0.01) //in OPTICS, we are at the border of _eps : draw a thin line, darker\r
128         {\r
129             res[_pointId_to_clusterId[nearest]-1] = 0.5;\r
130         }\r
131     }\r
132 \r
133     return res;\r
134 }\r
135 \r
136 const char *ClustererDBSCAN::GetInfoString()\r
137 {\r
138     char *text = new char[1024];\r
139     if (_type==0)\r
140     {\r
141         sprintf(text, "DBSCAN\n\nTraining information: minPts : %d, eps: %f, metric: %d\n",_minPts,_eps,_metric);\r
142     }\r
143     else if (_type==1)\r
144     {\r
145         sprintf(text, "OPTICS\n\nTraining information: minPts : %d, eps: %f, depth: %f, metric: %d\n",_minPts,_eps,_depth,_metric);\r
146     }\r
147     else\r
148     {\r
149         sprintf(text, "OPTICS WP\n\nTraining information: minPts : %d, eps: %f, depth: %f, metric: %d\n",_minPts,_eps,_depth,_metric);\r
150     }\r
151 \r
152     sprintf(text, "%sNumber of clusters: %d\n",text,_clusters.size());\r
153 \r
154     int countN = 0;\r
155     int countC = 0;\r
156 \r
157     for (int i=0;i<_noise.size();i++)\r
158     {\r
159        if (_noise[i]) countN++;\r
160        if (_core[i]) countC++;\r
161     }\r
162     sprintf(text, "%sNumber of core points: %d\nNumber of noise points: %d\n",text,countC,countN);\r
163 \r
164     return text;\r
165 }\r
166 \r
167 void ClustererDBSCAN::SetParams(float minpts, float eps, int metric, float depth, int type)\r
168 {\r
169     _eps = eps;\r
170     _metric = metric;\r
171     _minPts = minpts;\r
172     _depth = depth;\r
173     _type = type;\r
174 }\r
175 \r
176 void ClustererDBSCAN::run_cluster(Points samples)\r
177 {\r
178 \r
179         ClusterId cid = 1;\r
180         // foreach pid\r
181         for (PointId pid = 0; pid < samples.size(); pid++)\r
182         {\r
183                 // not already visited\r
184                 if (!_visited[pid]){\r
185 \r
186                         _visited[pid] = true;\r
187 \r
188                         // get the neighbors\r
189                         Neighbors ne = findNeighbors(pid, _eps);\r
190 \r
191                         // not enough support -> mark as noise\r
192                         if (ne.size() < _minPts)\r
193                         {\r
194                                 _noise[pid] = true;\r
195                         }\r
196                         else\r
197                         {\r
198                             //else it's a core point\r
199                                 _core[pid] = true;\r
200 \r
201                                 // Add p to current cluster\r
202 \r
203                                 CCluster c;              // a new cluster\r
204                                 c.push_back(pid);       // assign pid to cluster\r
205                                 _pointId_to_clusterId[pid]=cid;\r
206 \r
207                                 // go to neighbors\r
208                                 for (unsigned int i = 0; i < ne.size(); i++)\r
209                                 {\r
210                                         PointId nPid = ne[i];\r
211 \r
212                                         // not already visited\r
213                                         if (!_visited[nPid])\r
214                                         {\r
215                                                 _visited[nPid] = true;\r
216 \r
217                                                 // go to neighbors\r
218                                                 Neighbors ne1 = findNeighbors(nPid, _eps);\r
219 \r
220                                                 // enough support\r
221                                                 if (ne1.size() >= _minPts)\r
222                                                 {\r
223                                                         _core[nPid] = true;\r
224 \r
225                                                         // join\r
226                                                         BOOST_FOREACH(Neighbors::value_type n1, ne1)\r
227                                                         {\r
228                                                                 // join neighbord\r
229                                                                 ne.push_back(n1);    \r
230                                                         }\r
231                                                 }\r
232                                         }\r
233 \r
234                                         // not already assigned to a cluster\r
235                                         if (!_pointId_to_clusterId[nPid])\r
236                                         {\r
237                                             // add it to the current cluster\r
238                                                 c.push_back(nPid);\r
239                                                 _pointId_to_clusterId[nPid]=cid;\r
240                                         }\r
241                                 }\r
242                                 // start a new cluster\r
243                                 _clusters.push_back(c);\r
244                                 cid++;\r
245                         }\r
246                 } // if (!visited\r
247         } // for\r
248 \r
249         nbClusters = cid;\r
250 }\r
251 \r
252 \r
253 void ClustererDBSCAN::run_optics(Points samples)\r
254 {\r
255         // foreach pid\r
256         for (PointId pid = 0; pid < samples.size(); pid++)\r
257         {\r
258                 // not already visited\r
259                 if (!_visited[pid]){\r
260 \r
261                         _visited[pid] = true;\r
262 \r
263                         // get the neighbors\r
264                         Neighbors ne = findNeighbors(pid, _eps);\r
265                         // add it to the ordered list\r
266                         _optics_list.push_back(pid);\r
267                         // use the multiMap as priority queue\r
268                         QMultiMap<double,PointId> queue;\r
269 \r
270                         double d = this->core_distance(pid,_eps);\r
271                         // not enough support -> mark as noise\r
272                         if (d < 0)\r
273                         {\r
274                                 _noise[pid] = true;\r
275                         }\r
276                         else\r
277                         {\r
278                             //else it is a core point\r
279                                 _core[pid] = true;\r
280                                 this->update_reachability(ne,pid,d,queue);\r
281 \r
282                                 // go to neighbors in the good order\r
283                                 while(queue.size()>0)\r
284                                 {\r
285                                     //take element with lowest distance from the queue\r
286                                         PointId nPid = queue.begin().value();\r
287                                         queue.erase(queue.begin());\r
288 \r
289                                         // not already visited\r
290                                         if (!_visited[nPid])\r
291                                         {\r
292                                                 _visited[nPid] = true;\r
293 \r
294                                                 // go to neighbors\r
295                                                 Neighbors ne1 = findNeighbors(nPid, _eps);\r
296 \r
297                                                 _optics_list.push_back(nPid);\r
298 \r
299                                                 double dd = this->core_distance(nPid,_eps);\r
300                                                 // enough support\r
301                                                 if (dd >= 0)\r
302                                                 {\r
303                                                         _core[nPid] = true;\r
304                                                         this->update_reachability(ne1,nPid,dd,queue);\r
305 \r
306                                                 }\r
307                                         }\r
308                                 }\r
309 \r
310                         }\r
311                 } // if (!visited\r
312         } // for\r
313 \r
314 }\r
315 \r
316 void ClustererDBSCAN::update_reachability(Neighbors ne,PointId pid,double core_dist,QMultiMap<double,PointId> &queue)\r
317 {\r
318     BOOST_FOREACH(Neighbors::value_type n, ne)\r
319     {\r
320         if(!_visited[n])\r
321         {\r
322             double ndist = max(core_dist,_sim(pid,n));\r
323             if(_reachability[n]< 0)\r
324             {\r
325                 _reachability[n] = ndist;\r
326                 queue.insert(ndist,n);\r
327             }\r
328             else{\r
329                 if(_reachability[n]>ndist)\r
330                 {\r
331                     queue.remove(_reachability[n],n);\r
332                     _reachability[n] = ndist;\r
333                     queue.insert(ndist,n);\r
334                 }\r
335 \r
336             }\r
337         }\r
338     }\r
339 }\r
340 \r
341 void ClustererDBSCAN::find_clusters()\r
342 {\r
343 \r
344     ClusterId cid = 1;\r
345     CCluster c;   // a new cluster\r
346 \r
347     if (!_optics_list.empty()) _reachability[_optics_list[0]] = 0; //first element is anyway reachable\r
348 \r
349     // find pits below the threshold\r
350     BOOST_FOREACH(PointId pid, _optics_list)\r
351     {\r
352 \r
353         if (_reachability[pid]== -1) _reachability[pid] = _eps * 1.1;// undefined values are set to 10% more than Eps\r
354         if (_reachability[pid] > _depth && !c.empty()) //we pass the threshold up\r
355         {\r
356             if(c.size() >= _minPts) // cluster is valid\r
357             {\r
358                 _clusters.push_back(c);\r
359                 cid++;\r
360                 c = CCluster();\r
361             }\r
362             else     // cluster is too small -> noise\r
363             {\r
364                 BOOST_FOREACH(PointId cpid, c)\r
365                 {\r
366                     _noise[cpid] = true;\r
367                     _pointId_to_clusterId[cpid] = 0;\r
368                 }\r
369                 c.clear(); //restart\r
370             }\r
371         }\r
372         if (!_noise[pid])\r
373         {\r
374             c.push_back(pid);           // assign pid to cluster\r
375             _pointId_to_clusterId[pid]=cid;\r
376         }\r
377     }\r
378     // do it once again after the loop to finalize the last cluster\r
379     if(c.size() >= _minPts) // cluster is valid\r
380     {\r
381         _clusters.push_back(c);\r
382     }\r
383     else     // cluster is too small -> noise\r
384     {\r
385         BOOST_FOREACH(PointId cpid, c)\r
386         {\r
387             _noise[cpid] = true;\r
388             _pointId_to_clusterId[cpid] = 0;\r
389         }\r
390     }\r
391 \r
392 \r
393     nbClusters = cid;\r
394 }\r
395 \r
396 \r
397 void ClustererDBSCAN::find_clusters_WF()\r
398 {\r
399 \r
400     ClusterId cid = 1;\r
401     CCluster c;   // a new cluster\r
402     double bottom = _eps;\r
403     double top = _eps;\r
404 \r
405 \r
406     if (!_optics_list.empty()) {\r
407         _reachability[_optics_list[0]] = 0; //first element is anyway reachable\r
408         }\r
409 \r
410     for (int i = 0; i < _optics_list.size(); i++)\r
411     {\r
412         PointId pid = _optics_list[i];\r
413 \r
414         if (_reachability[pid]== -1) _reachability[pid] = _eps * 1.1;// undefined values are set to 10% more than Eps\r
415 \r
416         if (i>0 && _reachability[pid] < _reachability[_optics_list[i-1]]) // going down\r
417         {\r
418             if (_reachability[pid] < bottom)\r
419             {\r
420                 bottom = _reachability[pid]; //keep track of the minima\r
421             }\r
422             if (_reachability[_optics_list[i-1]] > top)\r
423             {\r
424                 top = _reachability[_optics_list[i-1]];  // previous maxima\r
425             }\r
426         }\r
427         else   // going up\r
428         {\r
429 \r
430             if (top > bottom + _depth && _reachability[pid] > bottom + _depth) // we detected a pit with enough depth\r
431             {\r
432                 int j = i;\r
433                 for(; j>0 && _reachability[_optics_list[j-1]] < bottom + _depth ; j--) // let's fill it backwards\r
434                 {\r
435                     if (!_noise[_optics_list[j-1]])\r
436                     {\r
437                         _pointId_to_clusterId[_optics_list[j-1]]=cid;\r
438                         c.push_back(_optics_list[j-1]);\r
439                     }\r
440 \r
441                 }\r
442                 if (j>0){\r
443                     _pointId_to_clusterId[_optics_list[j-1]]=cid;//add first point\r
444                     c.push_back(_optics_list[j-1]);\r
445                 }\r
446                 if (!c.empty())\r
447                 {\r
448                     if(c.size() >= _minPts) // cluster is valid\r
449                     {\r
450                         _clusters.push_back(c);\r
451                         cid++;\r
452                         c = CCluster();\r
453                     }\r
454                     else     // cluster is too small -> noise\r
455                     {\r
456                         BOOST_FOREACH(PointId cpid, c)\r
457                         {\r
458                             _noise[cpid] = true;\r
459                             _pointId_to_clusterId[cpid] = 0;\r
460                         }\r
461                         c.clear();\r
462                     }\r
463                 }\r
464                 top = _reachability[pid]; // restart from here\r
465                 bottom = _eps;\r
466 \r
467             }\r
468         }\r
469 \r
470 \r
471     }\r
472     // and check if there is a last cluster\r
473 \r
474     if (top > bottom + _depth) // there is a last cluster here\r
475     {\r
476         int j = _optics_list.size();\r
477         for(; j>0 && _reachability[_optics_list[j-1]] < bottom + _depth && _reachability[_optics_list[j-1]] < _eps ; j--)\r
478         {\r
479             if (!_noise[_optics_list[j-1]])\r
480             {\r
481                 _pointId_to_clusterId[_optics_list[j-1]]=cid;\r
482                 c.push_back(_optics_list[j-1]);\r
483             }\r
484 \r
485         }\r
486         if (j>0){\r
487             _pointId_to_clusterId[_optics_list[j-1]]=cid;//add first point\r
488             c.push_back(_optics_list[j-1]);\r
489         }\r
490         if (!c.empty())\r
491         {\r
492             if(c.size() >= _minPts) // cluster is valid\r
493             {\r
494                 _clusters.push_back(c);\r
495                 cid++;\r
496                 c = CCluster();\r
497             }\r
498             else     // cluster is too small -> noise\r
499             {\r
500                 BOOST_FOREACH(PointId cpid, c)\r
501                 {\r
502                     _noise[cpid] = true;\r
503                     _pointId_to_clusterId[cpid] = 0;\r
504                 }\r
505                 c.clear();\r
506             }\r
507         }\r
508     }\r
509 \r
510     nbClusters = cid;\r
511 }\r
512 \r
513 \r
514 // compute the core-distance\r
515 double ClustererDBSCAN::core_distance(PointId pid,double threshold)\r
516 {\r
517         QMultiMap<double,int> p;\r
518         for (unsigned int j=0; j < _sim.size1(); j++)\r
519         {\r
520                 if      ((pid != j ) && (_sim(pid, j)) < threshold)\r
521                 {\r
522                     p.insert(_sim(pid, j),j);\r
523                 }\r
524         }\r
525         QList<double> k = p.keys();\r
526         if (k.size()<_minPts)\r
527         {\r
528             return -1;\r
529         }\r
530         else\r
531         {\r
532             return k[_minPts-1];\r
533         }\r
534 \r
535 }\r
536 \r
537 \r
538 Neighbors ClustererDBSCAN::findNeighbors(PointId pid, double threshold)\r
539 {\r
540         Neighbors ne;\r
541 \r
542         for (unsigned int j=0; j < _sim.size1(); j++)\r
543         {\r
544                 if      ((pid != j ) && (_sim(pid, j)) < threshold)\r
545                 {\r
546                         ne.push_back(j);\r
547                 }\r
548         }\r
549         return ne;\r
550 }\r
551 \r
552 template <typename Distance_type>\r
553 void ClustererDBSCAN::computeSimilarity(Distance_type & d,Points samples)\r
554 {\r
555         unsigned int size = samples.size();\r
556         _sim.resize(size, size, false);\r
557         for (unsigned int i=0; i < size; i++)\r
558         {\r
559                 for (unsigned int j=i+1; j < size; j++)\r
560                 {\r
561                         _sim(j, i) = _sim(i, j) = d.distance(samples[i], samples[j]);\r
562 \r
563                 }\r
564         }\r
565 }\r
566 \r