
rf_algorithm.hxx
1/************************************************************************/
2/* */
3/* Copyright 2008-2009 by Rahul Nair */
4/* */
5/* This file is part of the VIGRA computer vision library. */
6/* The VIGRA Website is */
7/* http://hci.iwr.uni-heidelberg.de/vigra/ */
8/* Please direct questions, bug reports, and contributions to */
9/* ullrich.koethe@iwr.uni-heidelberg.de or */
10/* vigra@informatik.uni-hamburg.de */
11/* */
12/* Permission is hereby granted, free of charge, to any person */
13/* obtaining a copy of this software and associated documentation */
14/* files (the "Software"), to deal in the Software without */
15/* restriction, including without limitation the rights to use, */
16/* copy, modify, merge, publish, distribute, sublicense, and/or */
17/* sell copies of the Software, and to permit persons to whom the */
18/* Software is furnished to do so, subject to the following */
19/* conditions: */
20/* */
21/* The above copyright notice and this permission notice shall be */
22/* included in all copies or substantial portions of the */
23/* Software. */
24/* */
25/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND */
26/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES */
27/* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */
28/* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT */
29/* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, */
30/* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING */
31/* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR */
32/* OTHER DEALINGS IN THE SOFTWARE. */
33/* */
34/************************************************************************/
35#ifndef VIGRA_RF_ALGORITHM_HXX
36#define VIGRA_RF_ALGORITHM_HXX
37#include <vector>
38#include <random>
39#include "splices.hxx"
40#include <queue>
41#include <fstream>
#include <algorithm>
#include <map>
42namespace vigra
43{
44
45namespace rf
46{
47/** This namespace contains all algorithms developed for feature
48 * selection
49 *
50 */
51namespace algorithms
52{
53
54namespace detail
55{
56 /** create a MultiArray containing only columns supplied between iterators
57 b and e
58 */
59 template<class OrigMultiArray,
60 class Iter,
61 class DestMultiArray>
62 void choose(OrigMultiArray const & in,
63 Iter const & b,
64 Iter const & e,
 65 DestMultiArray & out)
66 {
67 int columnCount = std::distance(b, e);
68 int rowCount = in.shape(0);
69 out.reshape(MultiArrayShape<2>::type(rowCount, columnCount));
70 int ii = 0;
71 for(Iter iter = b; iter != e; ++iter, ++ii)
72 {
73 columnVector(out, ii) = columnVector(in, *iter);
74 }
75 }
76}
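
/* Usage sketch for detail::choose() - a minimal example, assuming a
 * 100 x 4 feature matrix from which columns 0 and 3 are extracted:
 * \code
 * MultiArray<2, double> all_feats(MultiArrayShape<2>::type(100, 4));
 * std::vector<int> cols;
 * cols.push_back(0);
 * cols.push_back(3);
 * MultiArray<2, double> sub_feats;
 * detail::choose(all_feats, cols.begin(), cols.end(), sub_feats);
 * // sub_feats now has shape 100 x 2
 * \endcode
 */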
77
78
79
80/** Standard random forest error rate callback functor
 81 *
 82 * Returns the random forest OOB error estimate when invoked.
83 */
84class RFErrorCallback
85{
86 RandomForestOptions options;
87
88 public:
89 /** Default constructor
90 *
91 * optionally supply options to the random forest classifier
92 * \sa RandomForestOptions
93 */
 94 RFErrorCallback(RandomForestOptions opt = RandomForestOptions())
 95 : options(opt)
 96 {}
 97
98 /** returns the RF OOB error estimate given features and
99 * labels
100 */
101 template<class Feature_t, class Response_t>
102 double operator() (Feature_t const & features,
103 Response_t const & response)
104 {
 105 RandomForest<> rf(options);
 106 visitors::OOB_Error oob;
 107 rf.learn(features,
 108 response,
 109 visitors::create_visitor(oob));
 110 return oob.oob_breiman;
111 }
112};
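
/* Usage sketch: an RFErrorCallback can also be invoked directly; the
 * options shown here are illustrative:
 * \code
 * RandomForestOptions opt;
 * opt.tree_count(255);
 * RFErrorCallback callback(opt);
 * double oob_error = callback(features, labels);
 * \endcode
 */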
113
114
115/** Structure to hold Variable Selection results
116 */
117class VariableSelectionResult
118{
119 bool initialized;
120
121 public:
 122 VariableSelectionResult()
123 : initialized(false)
124 {}
125
126 typedef std::vector<int> FeatureList_t;
127 typedef std::vector<double> ErrorList_t;
128 typedef FeatureList_t::iterator Pivot_t;
129
130 Pivot_t pivot;
131
132 /** list of features.
133 */
134 FeatureList_t selected;
135
136 /** vector of size (number of features)
137 *
138 * the i-th entry encodes the error rate obtained
 139 * while using features [0 - i] (including i).
 140 *
 141 * If the i-th entry is -1, no error rate was obtained.
 142 * This may happen if more than one feature is added to the
 143 * selected list in one step of the algorithm.
 144 *
 145 * During initialisation, the last entry (the error using all features) is always filled.
146 */
147 ErrorList_t errors;
148
149
 150 /** error rate when using no features
151 */
 152 double no_features;
153
154 template<class FeatureT,
155 class ResponseT,
156 class Iter,
157 class ErrorRateCallBack>
158 bool init(FeatureT const & all_features,
159 ResponseT const & response,
160 Iter b,
161 Iter e,
 162 ErrorRateCallBack errorcallback)
163 {
164 bool ret_ = init(all_features, response, errorcallback);
165 if(!ret_)
166 return false;
167 vigra_precondition(std::distance(b, e) == static_cast<std::ptrdiff_t>(selected.size()),
168 "Number of features in ranking != number of features matrix");
169 std::copy(b, e, selected.begin());
170 return true;
171 }
172
173 template<class FeatureT,
174 class ResponseT,
175 class Iter>
176 bool init(FeatureT const & all_features,
177 ResponseT const & response,
178 Iter b,
179 Iter e)
180 {
 181 RFErrorCallback ecallback;
182 return init(all_features, response, b, e, ecallback);
183 }
184
185
186 template<class FeatureT,
187 class ResponseT>
188 bool init(FeatureT const & all_features,
189 ResponseT const & response)
190 {
191 return init(all_features, response, RFErrorCallback());
192 }
 193 /** Initialization routine. Will be called only once in the lifetime
 194 * of a VariableSelectionResult. Subsequent calls will not reinitialize
 195 * the member variables.
 196 *
 197 * This is intentional, to allow continuing variable selection from the
 198 * point where an earlier run stopped.
199 *
200 * returns true if initialization was successful and false if
201 * the object was already initialized before.
202 */
203 template<class FeatureT,
204 class ResponseT,
205 class ErrorRateCallBack>
 206 bool init(FeatureT const & all_features,
207 ResponseT const & response,
 208 ErrorRateCallBack errorcallback)
209 {
210 if(initialized)
211 {
212 return false;
213 }
214 initialized = true;
215 // calculate error with all features
216 selected.resize(all_features.shape(1), 0);
217 for(unsigned int ii = 0; ii < selected.size(); ++ii)
218 selected[ii] = ii;
219 errors.resize(all_features.shape(1), -1);
220 errors.back() = errorcallback(all_features, response);
221
222 // calculate error rate if no features are chosen
223 // corresponds to max(prior probability) of the classes
224 std::map<typename ResponseT::value_type, int> res_map;
225 std::vector<int> cts;
226 int counter = 0;
227 for(int ii = 0; ii < response.shape(0); ++ii)
228 {
229 if(res_map.find(response(ii, 0)) == res_map.end())
230 {
231 res_map[response(ii, 0)] = counter;
232 ++counter;
233 cts.push_back(0);
234 }
235 cts[res_map[response(ii,0)]] +=1;
236 }
237 no_features = double(*(std::max_element(cts.begin(),
238 cts.end())))
239 / double(response.shape(0));
240
241 /*init not_selected vector;
242 not_selected.resize(all_features.shape(1), 0);
243 for(int ii = 0; ii < not_selected.size(); ++ii)
244 {
245 not_selected[ii] = ii;
246 }
247 initialized = true;
248 */
249 pivot = selected.begin();
250 return true;
251 }
252};
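
/* Sketch of reading out a VariableSelectionResult after a run of
 * forward_selection() (declared below); selected[ii] and errors[ii]
 * are aligned as documented above:
 * \code
 * VariableSelectionResult result;
 * forward_selection(features, labels, result);
 * for(unsigned int ii = 0; ii < result.selected.size(); ++ii)
 * {
 *     // errors[ii] == -1 means no error rate was recorded for this size
 *     std::cout << "feature " << result.selected[ii]
 *               << " -> error " << result.errors[ii] << std::endl;
 * }
 * \endcode
 */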
253
254
255
256/** Perform forward selection
257 *
258 * \param features IN: n x p matrix containing n instances with p attributes/features
259 * used in the variable selection algorithm
260 * \param response IN: n x 1 matrix containing the corresponding response
261 * \param result IN/OUT: VariableSelectionResult struct which will contain the results
262 * of the algorithm.
263 * Features between result.selected.begin() and result.pivot will
264 * be left untouched.
265 * \sa VariableSelectionResult
266 * \param errorcallback
267 * IN, OPTIONAL:
268 * Functor that returns the error rate given a set of
269 * features and labels. Default is the RandomForest OOB Error.
270 *
 271 * Forward selection successively chooses the next feature that decreases the error rate most.
272 *
273 * usage:
274 * \code
275 * MultiArray<2, double> features = createSomeFeatures();
276 * MultiArray<2, int> labels = createCorrespondingLabels();
277 * VariableSelectionResult result;
278 * forward_selection(features, labels, result);
279 * \endcode
 280 * To use forward selection but ensure that a specific feature, e.g. feature 5, is always
 281 * included, one would do the following:
282 *
283 * \code
284 * VariableSelectionResult result;
285 * result.init(features, labels);
286 * std::swap(result.selected[0], result.selected[5]);
 287 * result.pivot = result.selected.begin() + 1;
288 * forward_selection(features, labels, result);
289 * \endcode
290 *
291 * \sa VariableSelectionResult
292 *
293 */
294template<class FeatureT, class ResponseT, class ErrorRateCallBack>
295void forward_selection(FeatureT const & features,
296 ResponseT const & response,
 297 VariableSelectionResult & result,
 298 ErrorRateCallBack errorcallback)
299{
300 VariableSelectionResult::FeatureList_t & selected = result.selected;
301 VariableSelectionResult::ErrorList_t & errors = result.errors;
302 VariableSelectionResult::Pivot_t & pivot = result.pivot;
303 int featureCount = features.shape(1);
304 // initialize result struct if in use for the first time
305 if(!result.init(features, response, errorcallback))
306 {
307 //result is being reused just ensure that the number of features is
308 //the same.
309 vigra_precondition(static_cast<int>(selected.size()) == featureCount,
310 "forward_selection(): Number of features in Feature "
311 "matrix and number of features in previously used "
312 "result struct mismatch!");
313 }
314
315
316 int not_selected_size = std::distance(pivot, selected.end());
317 while(not_selected_size > 1)
318 {
319 std::vector<double> current_errors;
320 VariableSelectionResult::Pivot_t next = pivot;
321 for(int ii = 0; ii < not_selected_size; ++ii, ++next)
322 {
323 std::swap(*pivot, *next);
 324 MultiArray<2, double> cur_feats;
325 detail::choose( features,
326 selected.begin(),
327 pivot+1,
328 cur_feats);
329 double error = errorcallback(cur_feats, response);
330 current_errors.push_back(error);
331 std::swap(*pivot, *next);
332 }
333 int pos = std::distance(current_errors.begin(),
334 std::min_element(current_errors.begin(),
 335 current_errors.end()));
336 next = pivot;
337 std::advance(next, pos);
338 std::swap(*pivot, *next);
339 errors[std::distance(selected.begin(), pivot)] = current_errors[pos];
340#ifdef RN_VERBOSE
341 std::copy(current_errors.begin(), current_errors.end(), std::ostream_iterator<double>(std::cerr, ", "));
342 std::cerr << "Choosing " << *pivot << " at error of " << current_errors[pos] << std::endl;
343#endif
344 ++pivot;
345 not_selected_size = std::distance(pivot, selected.end());
346 }
347}
348template<class FeatureT, class ResponseT>
349void forward_selection(FeatureT const & features,
350 ResponseT const & response,
351 VariableSelectionResult & result)
352{
353 forward_selection(features, response, result, RFErrorCallback());
354}
355
356
357/** Perform backward elimination
358 *
359 * \param features IN: n x p matrix containing n instances with p attributes/features
360 * used in the variable selection algorithm
361 * \param response IN: n x 1 matrix containing the corresponding response
362 * \param result IN/OUT: VariableSelectionResult struct which will contain the results
363 * of the algorithm.
364 * Features between result.pivot and result.selected.end() will
365 * be left untouched.
366 * \sa VariableSelectionResult
367 * \param errorcallback
368 * IN, OPTIONAL:
369 * Functor that returns the error rate given a set of
370 * features and labels. Default is the RandomForest OOB Error.
371 *
 372 * Backward elimination successively eliminates the feature that has the least influence
 373 * on the error rate.
374 *
375 * usage:
376 * \code
377 * MultiArray<2, double> features = createSomeFeatures();
378 * MultiArray<2, int> labels = createCorrespondingLabels();
379 * VariableSelectionResult result;
380 * backward_elimination(features, labels, result);
381 * \endcode
 382 * To use backward elimination but ensure that a specific feature, e.g. feature 5, is always
 383 * excluded, one would do the following:
384 *
385 * \code
386 * VariableSelectionResult result;
387 * result.init(features, labels);
388 * std::swap(result.selected[result.selected.size()-1], result.selected[5]);
 389 * result.pivot = result.selected.end() - 1;
390 * backward_elimination(features, labels, result);
391 * \endcode
392 *
393 * \sa VariableSelectionResult
394 *
395 */
396template<class FeatureT, class ResponseT, class ErrorRateCallBack>
397void backward_elimination(FeatureT const & features,
398 ResponseT const & response,
 399 VariableSelectionResult & result,
 400 ErrorRateCallBack errorcallback)
401{
402 int featureCount = features.shape(1);
403 VariableSelectionResult::FeatureList_t & selected = result.selected;
404 VariableSelectionResult::ErrorList_t & errors = result.errors;
405 VariableSelectionResult::Pivot_t & pivot = result.pivot;
406
407 // initialize result struct if in use for the first time
408 if(!result.init(features, response, errorcallback))
409 {
410 //result is being reused just ensure that the number of features is
411 //the same.
412 vigra_precondition(static_cast<int>(selected.size()) == featureCount,
413 "backward_elimination(): Number of features in Feature "
414 "matrix and number of features in previously used "
415 "result struct mismatch!");
416 }
417 pivot = selected.end() - 1;
418
419 int selected_size = std::distance(selected.begin(), pivot);
420 while(selected_size > 1)
421 {
422 VariableSelectionResult::Pivot_t next = selected.begin();
423 std::vector<double> current_errors;
424 for(int ii = 0; ii < selected_size; ++ii, ++next)
425 {
426 std::swap(*pivot, *next);
 427 MultiArray<2, double> cur_feats;
428 detail::choose( features,
429 selected.begin(),
430 pivot+1,
431 cur_feats);
432 double error = errorcallback(cur_feats, response);
433 current_errors.push_back(error);
434 std::swap(*pivot, *next);
435 }
436 int pos = std::distance(current_errors.begin(),
437 std::min_element(current_errors.begin(),
 438 current_errors.end()));
439 next = selected.begin();
440 std::advance(next, pos);
441 std::swap(*pivot, *next);
442// std::cerr << std::distance(selected.begin(), pivot) << " " << pos << " " << current_errors.size() << " " << errors.size() << std::endl;
443 errors[std::distance(selected.begin(), pivot)-1] = current_errors[pos];
444 selected_size = std::distance(selected.begin(), pivot);
445#ifdef RN_VERBOSE
446 std::copy(current_errors.begin(), current_errors.end(), std::ostream_iterator<double>(std::cerr, ", "));
447 std::cerr << "Eliminating " << *pivot << " at error of " << current_errors[pos] << std::endl;
448#endif
449 --pivot;
450 }
451}
452
453template<class FeatureT, class ResponseT>
454void backward_elimination(FeatureT const & features,
455 ResponseT const & response,
456 VariableSelectionResult & result)
457{
458 backward_elimination(features, response, result, RFErrorCallback());
459}
460
461/** Perform rank selection using a predefined ranking
462 *
463 * \param features IN: n x p matrix containing n instances with p attributes/features
464 * used in the variable selection algorithm
465 * \param response IN: n x 1 matrix containing the corresponding response
466 * \param result IN/OUT: VariableSelectionResult struct which will contain the results
467 * of the algorithm. The struct should be initialized with the
468 * predefined ranking.
469 *
470 * \sa VariableSelectionResult
471 * \param errorcallback
472 * IN, OPTIONAL:
473 * Functor that returns the error rate given a set of
474 * features and labels. Default is the RandomForest OOB Error.
475 *
 476 * Often some variable importance or score measure is used to create the ordering in which
477 * variables have to be selected. This method takes such a ranking and calculates the
478 * corresponding error rates.
479 *
480 * usage:
481 * \code
482 * MultiArray<2, double> features = createSomeFeatures();
483 * MultiArray<2, int> labels = createCorrespondingLabels();
484 * std::vector<int> ranking = createRanking(features);
485 * VariableSelectionResult result;
486 * result.init(features, labels, ranking.begin(), ranking.end());
 487 * rank_selection(features, labels, result);
488 * \endcode
489 *
490 * \sa VariableSelectionResult
491 *
492 */
493template<class FeatureT, class ResponseT, class ErrorRateCallBack>
494void rank_selection (FeatureT const & features,
495 ResponseT const & response,
 496 VariableSelectionResult & result,
 497 ErrorRateCallBack errorcallback)
498{
499 VariableSelectionResult::FeatureList_t & selected = result.selected;
500 VariableSelectionResult::ErrorList_t & errors = result.errors;
501 VariableSelectionResult::Pivot_t & iter = result.pivot;
502 int featureCount = features.shape(1);
503 // initialize result struct if in use for the first time
504 if(!result.init(features, response, errorcallback))
505 {
506 //result is being reused just ensure that the number of features is
507 //the same.
508 vigra_precondition(static_cast<int>(selected.size()) == featureCount,
509 "forward_selection(): Number of features in Feature "
510 "matrix and number of features in previously used "
511 "result struct mismatch!");
512 }
513
514 for(; iter != selected.end(); ++iter)
515 {
 516 MultiArray<2, double> cur_feats;
517 detail::choose( features,
518 selected.begin(),
519 iter+1,
520 cur_feats);
521 double error = errorcallback(cur_feats, response);
522 errors[std::distance(selected.begin(), iter)] = error;
523#ifdef RN_VERBOSE
524 std::copy(selected.begin(), iter+1, std::ostream_iterator<int>(std::cerr, ", "));
 525 std::cerr << "Choosing " << *iter << " at error of " << error << std::endl;
526#endif
527
528 }
529}
530
531template<class FeatureT, class ResponseT>
532void rank_selection (FeatureT const & features,
533 ResponseT const & response,
534 VariableSelectionResult & result)
535{
536 rank_selection(features, response, result, RFErrorCallback());
537}
538
539
540
541enum ClusterLeafTypes{c_Leaf = 95, c_Node = 99};
542
543/* View of a node in the hierarchical clustering
 544 * tree.
 545 * For internal use only.
546 * \sa NodeBase
547 */
548class ClusterNode
549: public NodeBase
550{
551 public:
552
553 typedef NodeBase BT;
554
555 /**constructors **/
556 ClusterNode():NodeBase(){}
557 ClusterNode( int nCol,
558 BT::T_Container_type & topology,
559 BT::P_Container_type & split_param)
560 : BT(nCol + 5, 5,topology, split_param)
561 {
562 status() = 0;
563 BT::column_data()[0] = nCol;
564 if(nCol == 1)
565 BT::typeID() = c_Leaf;
566 else
567 BT::typeID() = c_Node;
568 }
569
570 ClusterNode( BT::T_Container_type const & topology,
571 BT::P_Container_type const & split_param,
572 int n )
573 : NodeBase(5 , 5,topology, split_param, n)
574 {
575 //TODO : is there a more elegant way to do this?
576 BT::topology_size_ += BT::column_data()[0];
577 }
578
579 ClusterNode( BT & node_)
580 : BT(5, 5, node_)
581 {
582 //TODO : is there a more elegant way to do this?
583 BT::topology_size_ += BT::column_data()[0];
584 BT::parameter_size_ += 0;
585 }
586 int index()
587 {
588 return static_cast<int>(BT::parameters_begin()[1]);
589 }
590 void set_index(int in)
591 {
592 BT::parameters_begin()[1] = in;
593 }
594 double& mean()
595 {
596 return BT::parameters_begin()[2];
597 }
598 double& stdev()
599 {
600 return BT::parameters_begin()[3];
601 }
602 double& status()
603 {
604 return BT::parameters_begin()[4];
605 }
606};
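
/* The five parameters of a ClusterNode are laid out as [merge distance,
 * index, mean, stdev, status]; a sketch of viewing an existing node (the
 * names topology, parameters and addr are placeholders) and reading it
 * back through the accessors:
 * \code
 * ClusterNode node(topology, parameters, addr); // view at address addr
 * int    id = node.index();   // creation index of the node
 * double st = node.status();  // importance score set by the visitors
 * \endcode
 */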
607
608/** Stack entry class for HClustering
609 */
610class HC_Entry
611{
612 int parent;
613 int level;
614 int addr;
615 bool infm;
616 HC_Entry(int p, int l, int a, bool in)
617 : parent(p), level(l), addr(a), infm(in)
618 {}
619};
620
621
622/** Hierarchical Clustering class.
623 * Performs single linkage clustering
624 * \code
625 * Matrix<double> distance = get_distance_matrix();
 626 * HClustering linkage; linkage.cluster(distance);
627 * // Draw clustering tree.
628 * Draw<double, int> draw(features, labels, "linkagetree.graph");
629 * linkage.breadth_first_traversal(draw);
630 * \endcode
631 * \sa ClusterImportanceVisitor
632 *
 633 * Once the clustering has taken place, information queries can be made
 634 * using the breadth_first_traversal() and iterate() methods.
635 *
636 */
637class HClustering
638{
639public:
 640 typedef MultiArrayShape<2>::type Shp;
641 ArrayVector<int> topology_;
642 ArrayVector<double> parameters_;
643 int begin_addr;
644
 645 // Calculates the distance between two clusters (single linkage: the minimum of the two distances)
646 double dist_func(double a, double b)
647 {
648 return std::min(a, b);
649 }
650
 651 /** Visit each node with a Functor
 652 * (depth first, starting at the root)
653 */
654 template<class Functor>
655 void iterate(Functor & tester)
656 {
657
658 std::vector<int> stack;
659 stack.push_back(begin_addr);
660 while(!stack.empty())
661 {
662 ClusterNode node(topology_, parameters_, stack.back());
663 stack.pop_back();
664 if(!tester(node))
665 {
666 if(node.columns_size() != 1)
667 {
668 stack.push_back(node.child(0));
669 stack.push_back(node.child(1));
670 }
671 }
672 }
673 }
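
    /* Sketch of a functor suitable for iterate(): returning true stops
     * the descent below the current node, returning false continues
     * into the children:
     * \code
     * struct CountLeaves
     * {
     *     int count;
     *     CountLeaves() : count(0) {}
     *     bool operator()(ClusterNode & node)
     *     {
     *         if(node.columns_size() == 1)
     *             ++count;            // node represents a single variable
     *         return false;           // keep traversing
     *     }
     * };
     * \endcode
     */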
674
675 /** Perform breadth first traversal of hierarchical cluster tree
676 */
677 template<class Functor>
 678 void breadth_first_traversal(Functor & tester)
679 {
680
681 std::queue<HC_Entry> queue;
682 int level = 0;
683 int parent = -1;
684 int addr = -1;
685 bool infm = false;
686 queue.push(HC_Entry(parent,level,begin_addr, infm));
687 while(!queue.empty())
688 {
689 level = queue.front().level;
690 parent = queue.front().parent;
691 addr = queue.front().addr;
692 infm = queue.front().infm;
693 ClusterNode node(topology_, parameters_, queue.front().addr);
694 ClusterNode parnt;
695 if(parent != -1)
696 {
697 parnt = ClusterNode(topology_, parameters_, parent);
698 }
699 queue.pop();
700 bool istrue = tester(node, level, parnt, infm);
701 if(node.columns_size() != 1)
702 {
703 queue.push(HC_Entry(addr, level +1,node.child(0),istrue));
704 queue.push(HC_Entry(addr, level +1,node.child(1),istrue));
705 }
706 }
707 }
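
    /* A breadth_first_traversal() functor additionally receives the
     * level, the parent node (parent.hasData_ is false at the root) and
     * the flag returned for the parent - a minimal sketch:
     * \code
     * struct PrintLevel
     * {
     *     bool operator()(ClusterNode & cur, int level,
     *                     ClusterNode parent, bool parent_flag)
     *     {
     *         std::cerr << "level " << level << ": "
     *                   << cur.columns_size() << " variables\n";
     *         return true;
     *     }
     * };
     * \endcode
     */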
 708 /** Save to HDF5 - defunct - has to be updated to the new HDF5 interface
709 */
710#ifdef HasHDF5
711 void save(std::string file, std::string prefix)
712 {
713
714 vigra::writeHDF5(file.c_str(), (prefix + "topology").c_str(),
 715 MultiArrayView<2, int>(
716 Shp(topology_.size(),1),
717 topology_.data()));
718 vigra::writeHDF5(file.c_str(), (prefix + "parameters").c_str(),
 719 MultiArrayView<2, double>(
720 Shp(parameters_.size(), 1),
721 parameters_.data()));
722 vigra::writeHDF5(file.c_str(), (prefix + "begin_addr").c_str(),
723 MultiArrayView<2, int>(Shp(1,1), &begin_addr));
724
725 }
726#endif
727
728 /**Perform single linkage clustering
729 * \param distance distance matrix used. \sa CorrelationVisitor
730 */
731 template<class T, class C>
 732 void cluster(MultiArrayView<2, T, C> distance)
733 {
734 MultiArray<2, T> dist(distance);
735 std::vector<std::pair<int, int> > addr;
736 int index = 0;
737 for(int ii = 0; ii < distance.shape(0); ++ii)
738 {
739 addr.push_back(std::make_pair(topology_.size(), ii));
740 ClusterNode leaf(1, topology_, parameters_);
741 leaf.set_index(index);
742 ++index;
743 leaf.columns_begin()[0] = ii;
744 }
745
746 while(addr.size() != 1)
747 {
748 //find the two nodes with the smallest distance
749 int ii_min = 0;
750 int jj_min = 1;
751 double min_dist = dist((addr.begin()+ii_min)->second,
752 (addr.begin()+jj_min)->second);
753 for(unsigned int ii = 0; ii < addr.size(); ++ii)
754 {
755 for(unsigned int jj = ii+1; jj < addr.size(); ++jj)
756 {
757 if( dist((addr.begin()+ii_min)->second,
758 (addr.begin()+jj_min)->second)
759 > dist((addr.begin()+ii)->second,
760 (addr.begin()+jj)->second))
761 {
762 min_dist = dist((addr.begin()+ii)->second,
763 (addr.begin()+jj)->second);
764 ii_min = ii;
765 jj_min = jj;
766 }
767 }
768 }
769
770 //merge two nodes
771 int col_size = 0;
772 // The problem is that creating a new node invalidates the iterators stored
773 // in firstChild and secondChild.
774 {
775 ClusterNode firstChild(topology_,
776 parameters_,
777 (addr.begin() +ii_min)->first);
778 ClusterNode secondChild(topology_,
779 parameters_,
780 (addr.begin() +jj_min)->first);
781 col_size = firstChild.columns_size() + secondChild.columns_size();
782 }
783 int cur_addr = topology_.size();
784 begin_addr = cur_addr;
785// std::cerr << col_size << std::endl;
786 ClusterNode parent(col_size,
787 topology_,
788 parameters_);
789 ClusterNode firstChild(topology_,
790 parameters_,
791 (addr.begin() +ii_min)->first);
792 ClusterNode secondChild(topology_,
793 parameters_,
794 (addr.begin() +jj_min)->first);
795 parent.parameters_begin()[0] = min_dist;
796 parent.set_index(index);
797 ++index;
798 std::merge(firstChild.columns_begin(), firstChild.columns_end(),
799 secondChild.columns_begin(),secondChild.columns_end(),
800 parent.columns_begin());
801 //merge nodes in addr
802 int to_desc;
803 int ii_keep;
804 if(*parent.columns_begin() == *firstChild.columns_begin())
805 {
806 parent.child(0) = (addr.begin()+ii_min)->first;
807 parent.child(1) = (addr.begin()+jj_min)->first;
808 (addr.begin()+ii_min)->first = cur_addr;
809 ii_keep = ii_min;
810 to_desc = (addr.begin()+jj_min)->second;
811 addr.erase(addr.begin()+jj_min);
812 }
813 else
814 {
815 parent.child(1) = (addr.begin()+ii_min)->first;
816 parent.child(0) = (addr.begin()+jj_min)->first;
817 (addr.begin()+jj_min)->first = cur_addr;
818 ii_keep = jj_min;
819 to_desc = (addr.begin()+ii_min)->second;
820 addr.erase(addr.begin()+ii_min);
821 }
822 //update distances;
823
824 for(int jj = 0 ; jj < static_cast<int>(addr.size()); ++jj)
825 {
826 if(jj == ii_keep)
827 continue;
 828 double new_dist = dist_func(
 829 dist(to_desc, (addr.begin()+jj)->second),
 830 dist((addr.begin()+ii_keep)->second,
 831 (addr.begin()+jj)->second));
 832
 833 dist((addr.begin()+ii_keep)->second,
 834 (addr.begin()+jj)->second) = new_dist;
 835 dist((addr.begin()+jj)->second,
 836 (addr.begin()+ii_keep)->second) = new_dist;
837 }
838 }
839 }
840
841};
842
843
844/** Normalize the status value in the HClustering tree (HClustering Visitor)
845 */
846class NormalizeStatus
847{
848public:
849 double n;
850 /** Constructor
851 * \param m normalize status() by m
852 */
 853 NormalizeStatus(double m)
854 :n(m)
855 {}
856 template<class Node>
857 bool operator()(Node& node)
858 {
859 node.status()/=n;
860 return false;
861 }
862};
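
/* Usage sketch (this is how visit_at_end() of ClusterImportanceVisitor
 * below applies it):
 * \code
 * NormalizeStatus nrm(rf.tree_count());
 * clustering.iterate(nrm);
 * \endcode
 */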
863
864
865/** Perform permutation importance on HClustering clusters.
 866 * See the visit_after_tree() method of visitors::VariableImportance for
 867 * the basic idea; here the permutation is applied not only to single
 868 * variables but to whole clusters of variables.
869 */
870template<class Iter, class DT>
871class PermuteCluster
872{
873public:
875 Matrix<double> tmp_mem_;
 876 MultiArrayView<2, double> perm_imp;
 877 MultiArrayView<2, double> orig_imp;
878 Matrix<double> feats_;
879 Matrix<int> labels_;
880 const int nPerm;
881 DT const & dt;
882 int index;
883 int oob_size;
884
885 template<class Feat_T, class Label_T>
886 PermuteCluster(Iter a,
887 Iter b,
888 Feat_T const & feats,
889 Label_T const & labls,
 890 MultiArrayView<2, double> p_imp,
 891 MultiArrayView<2, double> o_imp,
892 int np,
893 DT const & dt_)
894 :tmp_mem_(_spl(a, b).size(), feats.shape(1)),
895 perm_imp(p_imp),
896 orig_imp(o_imp),
897 feats_(_spl(a,b).size(), feats.shape(1)),
898 labels_(_spl(a,b).size(),1),
899 nPerm(np),
900 dt(dt_),
901 index(0),
902 oob_size(b-a)
903 {
904 copy_splice(_spl(a,b),
905 _spl(feats.shape(1)),
906 feats,
907 feats_);
908 copy_splice(_spl(a,b),
909 _spl(labls.shape(1)),
910 labls,
911 labels_);
912 }
913
914 template<class Node>
915 bool operator()(Node& node)
916 {
917 tmp_mem_ = feats_;
918 RandomMT19937 random;
919 int class_count = perm_imp.shape(1) - 1;
920 //permute columns together
921 for(int kk = 0; kk < nPerm; ++kk)
922 {
923 tmp_mem_ = feats_;
924 for(int ii = 0; ii < rowCount(feats_); ++ii)
925 {
926 int index = random.uniformInt(rowCount(feats_) - ii) +ii;
927 for(int jj = 0; jj < node.columns_size(); ++jj)
928 {
929 if(node.columns_begin()[jj] != feats_.shape(1))
930 tmp_mem_(ii, node.columns_begin()[jj])
931 = tmp_mem_(index, node.columns_begin()[jj]);
932 }
933 }
934
935 for(int ii = 0; ii < rowCount(tmp_mem_); ++ii)
936 {
937 if(dt
938 .predictLabel(rowVector(tmp_mem_, ii))
939 == labels_(ii, 0))
940 {
941 //per class
942 ++perm_imp(index,labels_(ii, 0));
943 //total
944 ++perm_imp(index, class_count);
945 }
946 }
947 }
948 double node_status = perm_imp(index, class_count);
949 node_status /= nPerm;
950 node_status -= orig_imp(0, class_count);
951 node_status *= -1;
952 node_status /= oob_size;
953 node.status() += node_status;
954 ++index;
955
956 return false;
957 }
958};
959
960/** Convert ClusteringTree into a list (HClustering visitor)
961 */
962class GetClusterVariables
963{
964public:
965 /** NumberOfClusters x NumberOfVariables MultiArrayView containing
 966 * in each row the variables belonging to a cluster
967 */
 968 MultiArrayView<2, int> variables;
969 int index;
 970 GetClusterVariables(MultiArrayView<2, int> vars)
971 :variables(vars), index(0)
972 {}
973#ifdef HasHDF5
974 void save(std::string file, std::string prefix)
975 {
976 vigra::writeHDF5(file.c_str(), (prefix + "_variables").c_str(),
977 variables);
978 }
979#endif
980
981 template<class Node>
982 bool operator()(Node& node)
983 {
984 for(int ii = 0; ii < node.columns_size(); ++ii)
985 variables(index, ii) = node.columns_begin()[ii];
986 ++index;
987 return false;
988 }
989};
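
/* Usage sketch, assuming ncols variables were clustered with an
 * HClustering object named linkage: a full linkage tree over ncols
 * leaves has 2*ncols-1 nodes, so the matrix is sized accordingly and
 * filled with -1 for unused entries:
 * \code
 * MultiArray<2, int> vars(MultiArrayShape<2>::type(2*ncols-1, ncols), -1);
 * GetClusterVariables gcv(vars);
 * linkage.iterate(gcv);
 * \endcode
 */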
990/** corrects the status fields of a linkage Clustering (HClustering Visitor)
991 *
992 * such that status(currentNode) = min(status(parent), status(currentNode))
993 * \sa cluster_permutation_importance()
994 */
995class CorrectStatus
996{
997public:
998 template<class Nde>
999 bool operator()(Nde & cur, int /*level*/, Nde parent, bool /*infm*/)
1000 {
1001 if(parent.hasData_)
1002 cur.status() = std::min(parent.status(), cur.status());
1003 return true;
1004 }
1005};
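
/* Usage sketch (as done in cluster_permutation_importance() below):
 * \code
 * CorrectStatus cs;
 * linkage.breadth_first_traversal(cs);
 * \endcode
 */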
1006
1007
1008/** draw current linkage Clustering (HClustering Visitor)
1009 *
1010 * create a graphviz .dot file
1011 * usage:
1012 * \code
1013 * Matrix<double> distance = get_distance_matrix();
1014 * linkage.cluster(distance);
1015 * Draw<double, int> draw(features, labels, "linkagetree.graph");
1016 * linkage.breadth_first_traversal(draw);
1017 * \endcode
1018 */
1019template<class T1,
1020 class T2,
1021 class C1 = UnstridedArrayTag,
1022 class C2 = UnstridedArrayTag>
1023class Draw
1024{
1025public:
1027 MultiArrayView<2, T1, C1> const & features_;
1028 MultiArrayView<2, T2, C2> const & labels_;
1029 std::ofstream graphviz;
1030
1031
1032 Draw(MultiArrayView<2, T1, C1> const & features,
1033 MultiArrayView<2, T2, C2> const& labels,
1034 std::string const gz)
1035 :features_(features), labels_(labels),
1036 graphviz(gz.c_str(), std::ios::out)
1037 {
1038 graphviz << "digraph G\n{\n node [shape=\"record\"]";
1039 }
1040 ~Draw()
1041 {
1042 graphviz << "\n}\n";
1043 graphviz.close();
1044 }
1045
1046 template<class Nde>
1047 bool operator()(Nde & cur, int /*level*/, Nde parent, bool /*infm*/)
1048 {
1049 graphviz << "node" << cur.index() << " [style=\"filled\"][label = \" #Feats: "<< cur.columns_size() << "\\n";
1050 graphviz << " status: " << cur.status() << "\\n";
1051 for(int kk = 0; kk < cur.columns_size(); ++kk)
1052 {
1053 graphviz << cur.columns_begin()[kk] << " ";
1054 if(kk % 15 == 14)
1055 graphviz << "\\n";
1056 }
1057 graphviz << "\"] [color = \"" <<cur.status() << " 1.000 1.000\"];\n";
1058 if(parent.hasData_)
1059 graphviz << "\"node" << parent.index() << "\" -> \"node" << cur.index() <<"\";\n";
1060 return true;
1061 }
1062};
1063
1064/** calculate Cluster based permutation importance while learning. (RandomForestVisitor)
1065 */
1066class ClusterImportanceVisitor : public visitors::VisitorBase
1067{
1068 public:
1069
1070 /** List of variables as produced by GetClusterVariables
1071 */
 1072 MultiArray<2, int> variables;
1073 /** Corresponding importance measures
1074 */
 1075 MultiArray<2, double> cluster_importance_;
 1076 /** Corresponding standard deviation
1077 */
 1078 MultiArray<2, double> cluster_stdev_;
1079 int repetition_count_;
1080 bool in_place_;
1081 HClustering & clustering;
1082
1083
1084#ifdef HasHDF5
1085 void save(std::string filename, std::string prefix)
1086 {
1087 std::string prefix1 = "cluster_importance_" + prefix;
1088 writeHDF5(filename.c_str(),
1089 prefix1.c_str(),
 1090 cluster_importance_);
1091 prefix1 = "vars_" + prefix;
1092 writeHDF5(filename.c_str(),
1093 prefix1.c_str(),
1094 variables);
1095 }
1096#endif
1097
 1098 ClusterImportanceVisitor(HClustering & clst, int rep_cnt = 10)
1099 : repetition_count_(rep_cnt), clustering(clst)
1100
1101 {}
1102
1103 /** Allocate enough memory
1104 */
1105 template<class RF, class PR>
1106 void visit_at_beginning(RF const & rf, PR const & /*pr*/)
1107 {
1108 Int32 const class_count = rf.ext_param_.class_count_;
1109 Int32 const column_count = rf.ext_param_.column_count_+1;
 1110 cluster_importance_
1111 .reshape(MultiArrayShape<2>::type(2*column_count-1,
1112 class_count+1));
 1113 cluster_stdev_
1114 .reshape(MultiArrayShape<2>::type(2*column_count-1,
1115 class_count+1));
1116 variables
1117 .reshape(MultiArrayShape<2>::type(2*column_count-1,
1118 column_count), -1);
 1119 GetClusterVariables gcv(variables);
1120 clustering.iterate(gcv);
1121
1122 }
1123
 1124 /** Compute permutation based variable importance.
 1125 * (Only an array of size oob_sample_count x 1 is created
 1126 * - as opposed to oob_sample_count x feature_count in the other method.)
1127 *
1128 * \sa FieldProxy
1129 */
1130 template<class RF, class PR, class SM, class ST>
1131 void after_tree_ip_impl(RF& rf, PR & pr, SM & sm, ST & /*st*/, int index)
1132 {
 1133 typedef MultiArrayShape<2>::type Shp_t;
1134 Int32 column_count = rf.ext_param_.column_count_ +1;
1135 Int32 class_count = rf.ext_param_.class_count_;
1136
 1137 // cast away the constness of the features (yep, I know what I am
 1138 // doing here) - the data is not modified.
1139 typename PR::Feature_t & features
1140 = const_cast<typename PR::Feature_t &>(pr.features());
1141
1142 //find the oob indices of current tree.
 1143 ArrayVector<Int32> oob_indices;
1144 ArrayVector<Int32>::iterator
1145 iter;
1146
1147 if(rf.ext_param_.actual_msample_ < pr.features().shape(0)- 10000)
1148 {
1149 ArrayVector<int> cts(2, 0);
1150 ArrayVector<Int32> indices(pr.features().shape(0));
1151 for(int ii = 0; ii < pr.features().shape(0); ++ii)
 1152 indices[ii] = ii;
1153 std::random_device rd;
1154 std::mt19937 g(rd());
1155 std::shuffle(indices.begin(), indices.end(), g);
1156 for(int ii = 0; ii < rf.ext_param_.row_count_; ++ii)
1157 {
1158 if(!sm.is_used()[indices[ii]] && cts[pr.response()(indices[ii], 0)] < 3000)
1159 {
1160 oob_indices.push_back(indices[ii]);
1161 ++cts[pr.response()(indices[ii], 0)];
1162 }
1163 }
1164 }
1165 else
1166 {
1167 for(int ii = 0; ii < rf.ext_param_.row_count_; ++ii)
1168 if(!sm.is_used()[ii])
1169 oob_indices.push_back(ii);
1170 }
1171
1172 // Random foo
1173 RandomMT19937 random(RandomSeed);
 1174 UniformIntRandomFunctor<RandomMT19937>
1175 randint(random);
1176
1177 //make some space for the results
 1178 MultiArray<2, double>
1179 oob_right(Shp_t(1, class_count + 1));
1180
1181 // get the oob success rate with the original samples
1182 for(iter = oob_indices.begin();
1183 iter != oob_indices.end();
1184 ++iter)
1185 {
1186 if(rf.tree(index)
1187 .predictLabel(rowVector(features, *iter))
1188 == pr.response()(*iter, 0))
1189 {
1190 //per class
1191 ++oob_right[pr.response()(*iter,0)];
1192 //total
1193 ++oob_right[class_count];
1194 }
1195 }
1196
1198 perm_oob_right (Shp_t(2* column_count-1, class_count + 1));
1199
1200 PermuteCluster<ArrayVector<Int32>::iterator,typename RF::DecisionTree_t>
 1201 pc(oob_indices.begin(), oob_indices.end(),
1202 pr.features(),
1203 pr.response(),
 1204 perm_oob_right,
1205 oob_right,
1206 repetition_count_,
1207 rf.tree(index));
1208 clustering.iterate(pc);
1209
1210 perm_oob_right /= repetition_count_;
1211 for(int ii = 0; ii < rowCount(perm_oob_right); ++ii)
1212 rowVector(perm_oob_right, ii) -= oob_right;
1213
1214 perm_oob_right *= -1;
 1215 perm_oob_right /= oob_indices.size();
 1216 cluster_importance_ += perm_oob_right;
1217 }
1218
 1219 /** Calculate permutation based importance after every tree has been
 1220 * learned. Default behaviour is that this happens out of place.
 1221 * If you have very big data sets and want to avoid copying of data,
1222 * set the in_place_ flag to true.
1223 */
1224 template<class RF, class PR, class SM, class ST>
1225 void visit_after_tree(RF& rf, PR & pr, SM & sm, ST & st, int index)
1226 {
1227 after_tree_ip_impl(rf, pr, sm, st, index);
1228 }
1229
1230 /** Normalise variable importance after the number of trees is known.
1231 */
1232 template<class RF, class PR>
1233 void visit_at_end(RF & rf, PR & /*pr*/)
1234 {
1235 NormalizeStatus nrm(rf.tree_count());
1236 clustering.iterate(nrm);
1237 cluster_importance_ /= rf.trees_.size();
1238 }
1239};
1240
1241/** Perform hierarchical clustering of variables and assess importance of clusters
1242 *
1243 * \param features IN: n x p matrix containing n instances with p attributes/features
1244 * used in the variable selection algorithm
1245 * \param response IN: n x 1 matrix containing the corresponding response
1246 * \param linkage OUT: Hierarchical grouping of variables.
1247 * \param distance OUT: distance matrix used for creating the linkage
1248 *
 1249 * Performs hierarchical clustering of variables and calculates the permutation importance
 1250 * measures of each of the clusters. Use the Draw functor to create human readable output.
 1251 * The cluster-permutation importance measure corresponds to the normal permutation importance
 1252 * measure with all columns corresponding to a cluster permuted.
 1253 * The importance measure for each cluster is stored as the status() field of each cluster node.
1254 * \sa HClustering
1255 *
1256 * usage:
1257 * \code
1258 * MultiArray<2, double> features = createSomeFeatures();
1259 * MultiArray<2, int> labels = createCorrespondingLabels();
1260 * HClustering linkage;
1261 * MultiArray<2, double> distance;
 1262 * cluster_permutation_importance(features, labels, linkage, distance);
1263 * // create graphviz output
1264 *
1265 * Draw<double, int> draw(features, labels, "linkagetree.graph");
1266 * linkage.breadth_first_traversal(draw);
1267 *
1268 * \endcode
1269 *
1270 *
1271 */
1272template<class FeatureT, class ResponseT>
 1273void cluster_permutation_importance(FeatureT const & features,
1274 ResponseT const & response,
 1275 HClustering & linkage,
1276 MultiArray<2, double> & distance)
1277{
1278
 1279 RandomForestOptions opt;
1280 opt.tree_count(100);
1281 if(features.shape(0) > 40000)
1282 opt.samples_per_tree(20000).use_stratification(RF_EQUAL);
1283
1284
 1285 RandomForest<> RF(opt);
 1286 visitors::RandomForestProgressVisitor progress;
 1287 visitors::CorrelationVisitor missc;
1288 RF.learn(features, response,
1289 create_visitor(missc, progress));
1290 distance = missc.distance;
1291 /*
1292 missc.save(exp_dir + dset.name() + "_result.h5", dset.name()+"MACH");
1293 */
1294
1295
1296 // Produce linkage
1297 linkage.cluster(distance);
1298
1299 //linkage.save(exp_dir + dset.name() + "_result.h5", "_linkage_CC/");
 1300 RandomForest<> RF2(opt);
 1301 ClusterImportanceVisitor ci(linkage);
1302 RF2.learn(features,
1303 response,
1304 create_visitor(progress, ci));
1305
1306
 1307 CorrectStatus cs;
1308 linkage.breadth_first_traversal(cs);
1309
1310 //ci.save(exp_dir + dset.name() + "_result.h5", dset.name());
1311 //Draw<double, int> draw(dset.features(), dset.response(), exp_dir+ dset.name() + ".graph");
1312 //linkage.breadth_first_traversal(draw);
1313
1314}
1315
1316
1317template<class FeatureT, class ResponseT>
1318void cluster_permutation_importance(FeatureT const & features,
1319 ResponseT const & response,
1320 HClustering & linkage)
1321{
1322 MultiArray<2, double> distance;
1323 cluster_permutation_importance(features, response, linkage, distance);
1324}
1325
1326
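/* Derive a ranking from an importance array: get_ranking() fills "out"
 * with the indices of "in" ordered by decreasing value. Note that
 * entries with identical values collapse, because a std::map is used
 * for sorting. A sketch combining it with rank_selection():
 * \code
 * std::vector<int> ranking;
 * get_ranking(importances, ranking);
 * VariableSelectionResult result;
 * result.init(features, labels, ranking.begin(), ranking.end());
 * rank_selection(features, labels, result);
 * \endcode
 */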
1327template<class Array1, class Vector1>
1328void get_ranking(Array1 const & in, Vector1 & out)
1329{
1330 std::map<double, int> mymap;
1331 for(int ii = 0; ii < in.size(); ++ii)
1332 mymap[in[ii]] = ii;
1333 for(std::map<double, int>::reverse_iterator iter = mymap.rbegin(); iter!= mymap.rend(); ++iter)
1334 {
1335 out.push_back(iter->second);
1336 }
1337}
1338}//namespace algorithms
1339}//namespace rf
1340}//namespace vigra
1341#endif //VIGRA_RF_ALGORITHM_HXX