Rosetta
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
reservoir_sample.hh
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
10 /// @file numeric/random/reservoir_sample.hh
11 /// @brief Randomly select the best N elements from a stream of elements using one
12 /// pass over a dataset.
13 /// @details
14 /// @author James Thompson
15 
16 
17 #ifndef INCLUDED_numeric_random_reservoir_sample_hh
18 #define INCLUDED_numeric_random_reservoir_sample_hh
19 
20 // Unit headers
21 #include <numeric/types.hh>
22 #include <numeric/random/random.hh>
23 
24 // Utility headers
25 #include <utility/vector1.hh>
26 
27 // C++ Headers
28 
29 namespace numeric {
30 namespace random {
31 
32 /// @brief Returns the probability that the Nth value in a sequence
33 /// should be accepted using the reservoir sampling criterion.
34 /// @details If we've seen N values and we want to keep K of them,
35 /// the probability of the Nth value being accepted is min(K/N,1.0).
36 inline numeric::Real
38  numeric::Size n_wanted,
39  numeric::Size n_seen
40 ) {
41  Real const accept_prob(
42  static_cast< Real > (n_wanted) / static_cast< Real > (n_seen)
43  );
44  return std::min( accept_prob, 1.0 );
45 }
46 
47 /// @brief Simple container for keeping K random values.
48 /// @details Values are stochastically preserved so that the probability of
49 /// any value existing is always 1 / N, where N is the number of values seen.
50 template< typename T >
52 public:
53  ReservoirSampler( numeric::Size const wanted ) :
54  n_seen_(0),
55  n_wanted_( wanted )
56  {}
57 
59 
60  void add_value( T const & val ) {
61  ++n_seen_;
62  if ( n_vals() < n_wanted() ) {
63  values_.push_back( val );
64  } else {
65  Real const accept_prob(
67  );
68 
69  if ( numeric::random::uniform() <= accept_prob ) {
70  Size const idx = numeric::random::random_range( 1, n_vals() );
71  values_[ idx ] = val;
72  }
73  }
74  } // add_value
75 
77  return values_.size();
78  }
79 
81  return n_wanted_;
82  }
83 
85  return n_seen_;
86  }
87 
88  utility::vector1< T > const & values() const {
89  return values_;
90  }
91 
92 private:
93 
94  ReservoirSampler(); // Must pass n_wanted to constructor
95 
99 };
100 
101 template< typename T >
104  utility::vector1< T > const & vec,
105  numeric::Size n_wanted,
107 ) {
108  using numeric::Real;
110 
111  assert( n_wanted <= vec.size() );
112 
113  utility::vector1< T > values;
114  values.reserve( n_wanted );
115  Size n_seen(0);
116  for ( iter it = vec.begin(), end = vec.end(); it != end; ++it ) {
117  ++n_seen;
118  if ( values.size() < n_wanted ) {
119  values.push_back( *it );
120  } else {
121  Real const accept_prob(
122  reservoir_sample_accept_prob( n_wanted, n_seen )
123  );
124 
125  if ( rg.uniform() <= accept_prob ) {
126  Size const idx = rg.random_range( 1, values.size() );
127  values[ idx ] = *it;
128  }
129  }
130  }
131  return values;
132 } // reservoir_sample
133 
134 } // namespace random
135 } // namespace numeric
136 
137 #endif // INCLUDED_numeric_random_reservoir_sampling_HH
static T min(T x, T y)
Definition: Svm.cc:16
utility::vector1< T > reservoir_sample(utility::vector1< T > const &vec, numeric::Size n_wanted, RandomGenerator &rg=numeric::random::rg())
RandomGenerator & rg()
Return the one-per-thread "singleton" random generator.
Definition: random.cc:46
platform::Size Size
Definition: types.hh:42
Random number generator system.
Definition: random.hh:73
Simple container for keeping K random values.
Random number generator system.
int random_range(int low, int high)
Return a number uniformly drawn from the inclusive range between low and high. Threadsafe since each ...
Definition: random.cc:58
ReservoirSampler(numeric::Size const wanted)
super::const_iterator const_iterator
Definition: vector1.hh:62
rosetta project type declarations. Should be kept updated with core/types.hh. This exists because num...
double Real
Definition: types.hh:39
utility::vector1< T > const & values() const
vector1: std::vector with 1-based indexing
int random_range(int low, int high)
Returns a random int in the range specified by the arguments.
Definition: random.cc:126
double uniform()
Generate a random number between 0 and 1. Threadsafe since each thread uses its own random generator...
Definition: random.cc:56
numeric::Real reservoir_sample_accept_prob(numeric::Size n_wanted, numeric::Size n_seen)
Returns the probability that the Nth value in a sequence should be accepted using the reservoir sampl...
platform::Size Size
Definition: random.fwd.hh:30