[ VIGRA Homepage | Function Index | Class Index | Namespaces | File List | Main Page ]

sampling.hxx
1/************************************************************************/
2/* */
3/* Copyright 2008-2009 by Ullrich Koethe and Rahul Nair */
4/* */
5/* This file is part of the VIGRA computer vision library. */
6/* The VIGRA Website is */
7/* http://hci.iwr.uni-heidelberg.de/vigra/ */
8/* Please direct questions, bug reports, and contributions to */
9/* ullrich.koethe@iwr.uni-heidelberg.de or */
10/* vigra@informatik.uni-hamburg.de */
11/* */
12/* Permission is hereby granted, free of charge, to any person */
13/* obtaining a copy of this software and associated documentation */
14/* files (the "Software"), to deal in the Software without */
15/* restriction, including without limitation the rights to use, */
16/* copy, modify, merge, publish, distribute, sublicense, and/or */
17/* sell copies of the Software, and to permit persons to whom the */
18/* Software is furnished to do so, subject to the following */
19/* conditions: */
20/* */
21/* The above copyright notice and this permission notice shall be */
22/* included in all copies or substantial portions of the */
23/* Software. */
24/* */
25/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND */
26/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES */
27/* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */
28/* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT */
29/* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, */
30/* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING */
31/* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR */
32/* OTHER DEALINGS IN THE SOFTWARE. */
33/* */
34/************************************************************************/
35
36#ifndef VIGRA_SAMPLING_HXX
37#define VIGRA_SAMPLING_HXX
38
39#include "array_vector.hxx"
40#include "random.hxx"
41#include <map>
42#include <memory>
43#include <cmath>
44
45namespace vigra
46{
47
48/**\brief Options object for the Sampler class.
49
50 \ingroup MachineLearning
51
52 <b>Usage:</b>
53
54 \code
55 SamplerOptions opt = SamplerOptions()
56 .withReplacement()
57 .sampleProportion(0.5);
58 \endcode
59
60 Note that the return value of all methods is <tt>*this</tt> which makes
61 concatenating of options as above possible.
62*/
64{
65 public:
66
67 double sample_proportion;
68 unsigned int sample_size;
69 bool sample_with_replacement;
70 bool stratified_sampling;
71
73 : sample_proportion(1.0),
74 sample_size(0),
75 sample_with_replacement(true),
76 stratified_sampling(false)
77 {}
78
79 /**\brief Sample from training population with replacement.
80 *
81 * <br> Default: true
82 */
84 {
85 sample_with_replacement = in;
86 return *this;
87 }
88
89 /**\brief Sample from training population without replacement.
90 *
91 * <br> Default (if you don't call this function): false
92 */
94 {
95 sample_with_replacement = !in;
96 return *this;
97 }
98
99 /**\brief Draw the given number of samples.
100 * If stratifiedSampling is true, the \a size is equally distributed
101 * across all strata (e.g. <tt>size / strataCount</tt> samples are taken
102 * from each stratum, subject to rounding).
103 *
104 * <br> Default: 0 (i.e. determine the count by means of sampleProportion())
105 */
106 SamplerOptions& sampleSize(unsigned int size)
107 {
108 sample_size = size;
109 return *this;
110 }
111
112
113 /**\brief Determine the number of samples to draw as a proportion of the total
114 * number. That is, we draw <tt>count = totalCount * proportion</tt> samples.
115 * This option is overridden when an absolute count is specified by sampleSize().
116 *
117 * If stratifiedSampling is true, the count is equally distributed
118 * across all strata (e.g. <tt>totalCount * proportion / strataCount</tt> samples are taken
119 * from each stratum).
120 *
121 * <br> Default: 1.0
122 */
124 {
125 vigra_precondition(proportion >= 0.0,
126 "SamplerOptions::sampleProportion(): argument must not be negative.");
127 sample_proportion = proportion;
128 return *this;
129 }
130
131 /**\brief Draw equally many samples from each "stratum".
132 * A stratum is a group of like entities, e.g. pixels belonging
133 * to the same object class. This is useful to create balanced samples
134 * when the class probabilities are very unbalanced (e.g.
135 * when there are many background and few foreground pixels).
136 * Stratified sampling thus avoids that a trained classifier is biased
137 * towards the majority class.
138 *
139 * <br> Default (if you don't call this function): false
140 */
142 {
143 stratified_sampling = in;
144 return *this;
145 }
146};
147
148/************************************************************/
149/* */
150/* Sampler */
151/* */
152/************************************************************/
153
154/** \brief Create random samples from a sequence of indices.
155
156 \ingroup MachineLearning
157
158 Selecting data items at random is a basic task of machine learning,
159 for example in boostrapping, RandomForest training, and cross validation.
160 This class implements various ways to select random samples via their indices.
161 Indices are assumed to be consecutive in
162 the range <tt>0 &lt;= index &lt; total_sample_count</tt>.
163
164 The class always contains a current sample which can be accessed by
165 the index operator or by the function sampledIndices(). The indices
166 that are not in the current sample (out-of-bag indices) can be accessed
167 via the function oobIndices().
168
169 The sampling method (with/without replacement, stratified or not) and the
170 number of samples to draw are determined by the option object
171 SamplerOptions.
172
173 <b>Usage:</b>
174
175 <b>\#include</b> <vigra/sampling.hxx><br>
176 Namespace: vigra
177
178 Create a Sampler with default options, i.e. sample as many indices as there
179 are data elements, with replacement. On average, the sample will contain
180 <tt>0.63*totalCount</tt> distinct indices.
181
182 \code
183 int totalCount = 10000; // total number of data elements
184 int numberOfSamples = 20; // repeat experiment 20 times
185 Sampler<> sampler(totalCount);
186 for(int k=0; k<numberOfSamples; ++k)
187 {
188 // process current sample
189 for(int i=0; i<sampler.sampleSize(); ++i)
190 {
191 int currentIndex = sampler[i];
192 processData(data[currentIndex]);
193 }
194 // create next sample
195 sampler.sample();
196 }
197 \endcode
198
199 Create a Sampler for stratified sampling, without replacement.
200
201 \code
202 // prepare the strata (i.e. specify which stratum each element belongs to)
203 int stratumSize1 = 2000, stratumSize2 = 8000,
204 totalCount = stratumSize1 + stratumSize2;
205 ArrayVerctor<int> strata(totalCount);
206 for(int i=0; i<stratumSize1; ++i)
207 strata[i] = 1;
208 for(int i=stratumSize1; i<stratumSize2; ++i)
209 strata[i] = 2;
210
211 int sampleSize = 200; // i.e. sample 100 elements from each of the two strata
212 int numberOfSamples = 20; // repeat experiment 20 times
213 Sampler<> stratifiedSampler(strata.begin(), strata.end(),
214 SamplerOptions().withoutReplacement().stratified().sampleSize(sampleSize));
215 // create first sample
216 sampler.sample();
217
218 for(int k=0; k<numberOfSamples; ++k)
219 {
220 // process current sample
221 for(int i=0; i<sampler.sampleSize(); ++i)
222 {
223 int currentIndex = sampler[i];
224 processData(data[currentIndex]);
225 }
226 // create next sample
227 sampler.sample();
228 }
229 \endcode
230*/
231template<class Random = MersenneTwister >
233{
234 public:
235 /** Internal type of the indices.
236 Currently, 64-bit indices are not supported because this
237 requires extension of the random number generator classes.
238 */
240
242
243 /** Type of the array view object that is returned by
244 sampledIndices() and oobIndices().
245 */
247
248 private:
249 typedef std::map<IndexType, IndexArrayType> StrataIndicesType;
250 typedef std::map<IndexType, int> StrataSizesType;
253
254 static const int oobInvalid = -1;
255
256 int total_count_, sample_size_;
257 mutable int current_oob_count_;
258 StrataIndicesType strata_indices_;
259 StrataSizesType strata_sample_size_;
260 IndexArrayType current_sample_;
261 mutable IndexArrayType current_oob_sample_;
262 IsUsedArrayType is_used_;
263 Random default_random_;
264 Random const & random_;
265 SamplerOptions options_;
266
267 void initStrataCount()
268 {
269 // compute how many samples to take from each stratum
270 // (may be unequal if sample_size_ is not a multiple of strataCount())
271 int strata_sample_size = static_cast<int>(std::ceil(double(sample_size_) / strataCount()));
273
274 for(StrataIndicesType::iterator i = strata_indices_.begin();
275 i != strata_indices_.end(); ++i)
276 {
277 if(strata_total_count > sample_size_)
278 {
279 strata_sample_size_[i->first] = strata_sample_size - 1;
281 }
282 else
283 {
284 strata_sample_size_[i->first] = strata_sample_size;
285 }
286 }
287 }
288
289 public:
290
291 /** Create a sampler for \a totalCount data objects.
292
293 In each invocation of <tt>sample()</tt> below, it will sample
294 indices according to the options passed. If no options are given,
295 <tt>totalCount</tt> indices will be drawn with replacement.
296 */
298 Random const * rnd = 0)
299 : total_count_(totalCount),
300 sample_size_(opt.sample_size == 0
301 ? static_cast<int>((std::ceil(total_count_ * opt.sample_proportion)))
302 : opt.sample_size),
303 current_oob_count_(oobInvalid),
304 current_sample_(sample_size_),
305 current_oob_sample_(total_count_),
306 is_used_(total_count_),
307 default_random_(RandomSeed),
308 random_(rnd ? *rnd : default_random_),
309 options_(opt)
310 {
311 vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_,
312 "Sampler(): Cannot draw without replacement when data size is smaller than sample count.");
313
314 vigra_precondition(!opt.stratified_sampling,
315 "Sampler(): Stratified sampling requested, but no strata given.");
316
317 // initialize a single stratum containing all data
318 strata_indices_[0].resize(total_count_);
319 for(int i=0; i<total_count_; ++i)
320 strata_indices_[0][i] = i;
321
322 initStrataCount();
323 //this is screwing up the random forest tests.
324 //sample();
325 }
326
327 /** Create a sampler for stratified sampling.
328
329 <tt>strataBegin</tt> and <tt>strataEnd</tt> must refer to a sequence
330 which specifies for each sample the stratum it belongs to. The
331 total number of data objects will be set to <tt>strataEnd - strataBegin</tt>.
332 Equally many samples (subject to rounding) will be drawn from each stratum,
333 unless the option object explicitly requests unstratified sampling,
334 in which case the strata are ignored.
335 */
336 template <class Iterator>
338 Random const * rnd = 0)
339 : total_count_(strataEnd - strataBegin),
340 sample_size_(opt.sample_size == 0
341 ? static_cast<int>((std::ceil(total_count_ * opt.sample_proportion)))
342 : opt.sample_size),
343 current_oob_count_(oobInvalid),
344 current_sample_(sample_size_),
345 current_oob_sample_(total_count_),
346 is_used_(total_count_),
347 default_random_(RandomSeed),
348 random_(rnd ? *rnd : default_random_),
349 options_(opt)
350 {
351 vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_,
352 "Sampler(): Cannot draw without replacement when data size is smaller than sample count.");
353
354 // copy the strata indices
355 if(opt.stratified_sampling)
356 {
357 for(int i = 0; strataBegin != strataEnd; ++i, ++strataBegin)
358 {
359 strata_indices_[*strataBegin].push_back(i);
360 }
361 }
362 else
363 {
364 strata_indices_[0].resize(total_count_);
365 for(int i=0; i<total_count_; ++i)
366 strata_indices_[0][i] = i;
367 }
368
369 vigra_precondition(sample_size_ >= static_cast<int>(strata_indices_.size()),
370 "Sampler(): Requested sample count must be at least as large as the number of strata.");
371
372 initStrataCount();
373 //this is screwing up the random forest tests.
374 //sample();
375 }
376
377 /** Return the k-th index in the current sample.
378 */
380 {
381 return current_sample_[k];
382 }
383
384 /** Create a new sample.
385 */
386 void sample();
387
388 /** The total number of data elements.
389 */
390 int totalCount() const
391 {
392 return total_count_;
393 }
394
395 /** The number of data elements that have been sampled.
396 */
397 int sampleSize() const
398 {
399 return sample_size_;
400 }
401
402 /** Same as sampleSize().
403 */
404 int size() const
405 {
406 return sample_size_;
407 }
408
409 /** The number of strata to be used.
410 Will be 1 if no strata are given. Will be ignored when
411 stratifiedSampling() is false.
412 */
413 int strataCount() const
414 {
415 return strata_indices_.size();
416 }
417
418 /** Whether to use stratified sampling.
419 (If this is false, strata will be ignored even if present.)
420 */
422 {
423 return options_.stratified_sampling;
424 }
425
426 /** Whether sampling should be performed with replacement.
427 */
428 bool withReplacement() const
429 {
430 return options_.sample_with_replacement;
431 }
432
433 /** Return an array view containing the indices in the current sample.
434 */
436 {
437 return current_sample_;
438 }
439
440 /** Return an array view containing the out-of-bag indices.
441 (i.e. the indices that are not in the current sample)
442 */
444 {
445 if(current_oob_count_ == oobInvalid)
446 {
447 current_oob_count_ = 0;
448 for(int i = 0; i<total_count_; ++i)
449 {
450 if(!is_used_[i])
451 {
452 current_oob_sample_[current_oob_count_] = i;
453 ++current_oob_count_;
454 }
455 }
456 }
457 return current_oob_sample_.subarray(0, current_oob_count_);
458 }
459 IsUsedArrayType const & is_used() const
460 {
461 return is_used_;
462 }
463};
464
465
466template<class Random>
468{
469 current_oob_count_ = oobInvalid;
470 is_used_.init(false);
471
472 if(options_.sample_with_replacement)
473 {
474 //Go thru all strata
475 int j = 0;
476 StrataIndicesType::iterator iter;
477 for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter)
478 {
479 // do sampling with replacement in each strata and copy data.
480 int stratum_size = iter->second.size();
481 for(int i = 0; i < static_cast<int>(strata_sample_size_[iter->first]); ++i, ++j)
482 {
483 current_sample_[j] = iter->second[random_.uniformInt(stratum_size)];
484 is_used_[current_sample_[j]] = true;
485 }
486 }
487 }
488 else
489 {
490 //Go thru all strata
491 int j = 0;
492 StrataIndicesType::iterator iter;
493 for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter)
494 {
495 // do sampling without replacement in each strata and copy data.
496 int stratum_size = iter->second.size();
497 for(int i = 0; i < static_cast<int>(strata_sample_size_[iter->first]); ++i, ++j)
498 {
499 std::swap(iter->second[i], iter->second[i+ random_.uniformInt(stratum_size - i)]);
500 current_sample_[j] = iter->second[i];
501 is_used_[current_sample_[j]] = true;
502 }
503 }
504 }
505}
506
507template<class Random =RandomTT800 >
508class PoissonSampler
509{
510public:
511 Random randfloat;
512 typedef Int32 IndexType;
513 typedef vigra::ArrayVector <IndexType> IndexArrayType;
514 IndexArrayType used_indices_;
515 double lambda;
516 int minIndex;
517 int maxIndex;
518
519 PoissonSampler(double lambda,IndexType minIndex,IndexType maxIndex)
520 : lambda(lambda),
521 minIndex(minIndex),
522 maxIndex(maxIndex)
523 {}
524
525 void sample( )
526 {
527 used_indices_.clear();
528 IndexType i;
529 for(i=minIndex;i<maxIndex;++i)
530 {
531 //from http://en.wikipedia.org/wiki/Poisson_distribution
532 int k=0;
533 double p=1;
534 double L=exp(-lambda);
535 do
536 {
537 ++k;
538 p*=randfloat.uniform53();
539
540 }while(p>L);
541 --k;
542 //Insert i this many time
543 while(k>0)
544 {
545 used_indices_.push_back(i);
546 --k;
547 }
548 }
549 }
550
551 IndexType const & operator[](int in) const
552 {
553 return used_indices_[in];
554 }
555
556 int numOfSamples() const
557 {
558 return used_indices_.size();
559 }
560};
561
562} // namespace vigra
563
564#endif /*VIGRA_SAMPLING_HXX*/
void init(U const &initial)
Definition array_vector.hxx:146
this_type subarray(size_type begin, size_type end) const
Definition array_vector.hxx:200
Class for a single RGB value.
Definition rgbvalue.hxx:128
RGBValue()
Definition rgbvalue.hxx:209
Options object for the Sampler class.
Definition sampling.hxx:64
SamplerOptions & withReplacement(bool in=true)
Sample from training population with replacement.
Definition sampling.hxx:83
SamplerOptions & stratified(bool in=true)
Draw equally many samples from each "stratum". A stratum is a group of like entities,...
Definition sampling.hxx:141
SamplerOptions & withoutReplacement(bool in=true)
Sample from training population without replacement.
Definition sampling.hxx:93
SamplerOptions & sampleProportion(double proportion)
Determine the number of samples to draw as a proportion of the total number. That is,...
Definition sampling.hxx:123
SamplerOptions & sampleSize(unsigned int size)
Draw the given number of samples. If stratifiedSampling is true, the size is equally distributed acro...
Definition sampling.hxx:106
Create random samples from a sequence of indices.
Definition sampling.hxx:233
int totalCount() const
Definition sampling.hxx:390
bool withReplacement() const
Definition sampling.hxx:428
ArrayVectorView< IndexType > IndexArrayViewType
Definition sampling.hxx:246
IndexType operator[](int k) const
Definition sampling.hxx:379
int strataCount() const
Definition sampling.hxx:413
Int32 IndexType
Definition sampling.hxx:239
IndexArrayViewType sampledIndices() const
Definition sampling.hxx:435
void sample()
Definition sampling.hxx:467
Sampler(Iterator strataBegin, Iterator strataEnd, SamplerOptions const &opt=SamplerOptions(), Random const *rnd=0)
Definition sampling.hxx:337
IndexArrayViewType oobIndices() const
Definition sampling.hxx:443
int sampleSize() const
Definition sampling.hxx:397
Sampler(UInt32 totalCount, SamplerOptions const &opt=SamplerOptions(), Random const *rnd=0)
Definition sampling.hxx:297
int size() const
Definition sampling.hxx:404
bool stratifiedSampling() const
Definition sampling.hxx:421
size_type size() const
Definition tinyvector.hxx:913
iterator end()
Definition tinyvector.hxx:864
iterator begin()
Definition tinyvector.hxx:861
detail::SelectIntegerType< 32, detail::UnsignedIntTypes >::type UInt32
32-bit unsigned int
Definition sized_int.hxx:183
detail::SelectIntegerType< 32, detail::SignedIntTypes >::type Int32
32-bit signed int
Definition sized_int.hxx:175
int ceil(FixedPoint< IntBits, FracBits > v)
rounding up.
Definition fixedpoint.hxx:675

© Ullrich Köthe (ullrich.koethe@iwr.uni-heidelberg.de)
Heidelberg Collaboratory for Image Processing, University of Heidelberg, Germany

html generated using doxygen and Python
vigra 1.12.2