Rosetta
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
reservoir_sample.hh
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
10 /// @file numeric/random/reservoir_sample.hh
11 /// @brief Randomly select the best N elements from a stream of elements using one
12 /// pass over a dataset.
13 /// @details
14 /// @author James Thompson
15 
16 
17 #ifndef INCLUDED_numeric_random_reservoir_sample_hh
18 #define INCLUDED_numeric_random_reservoir_sample_hh
19 
20 // Unit headers
21 #include <numeric/types.hh>
22 #include <numeric/random/random.hh>
23 
24 // Utility headers
25 #include <utility/vector1.hh>
26 
27 // C++ Headers
28 
29 namespace numeric {
30 namespace random {
31 
32 /// @brief Simple container for keeping K random values.
33 /// @details Values are stochastically preserved so that the probability of
34 /// any value existing is always 1 / N, where N is the number of values seen.
35 template< typename T >
37 public:
38  ReservoirSampler( numeric::Size const wanted ) : n_wanted_( wanted ) {}
40 
41  void add_value( T val ) {
42  if ( n_vals() < n_wanted() ) {
43  values_.push_back( val );
44  } else {
45  Real const accept_prob(
47  );
48 
49  if ( numeric::random::uniform() <= accept_prob ) {
50  Size const idx = numeric::random::random_range( 1, n_vals() );
51  values_[ idx ] = val;
52  }
53  }
54  } // add_value
55 
57  return values_.size();
58  }
59 
61  return n_wanted_;
62  }
63 
65  return n_seen_;
66  }
67 
69  return values_;
70  }
71 
72 private:
76 };
77 
78 /// @brief Returns the probability that the Nth value in a sequence
79 /// should be accepted using the reservoir sampling criterion.
80 /// @details If we've seen N values and we want to keep K of them,
81 /// the probability of the Nth value being accepted is min(K/N,1.0).
82 inline numeric::Real
84  numeric::Size n_wanted,
85  numeric::Size const n_seen
86 ) {
87  Real const accept_prob(
88  static_cast< Real > (n_wanted) / static_cast< Real > (n_seen)
89  );
90  return std::min( accept_prob, 1.0 );
91 }
92 
93 template< typename T >
96  utility::vector1< T > const & vec,
97  numeric::Size const n_wanted,
99 ) {
100  using numeric::Real;
101  typedef typename utility::vector1< T >::const_iterator iter;
102 
103  assert( n_wanted <= vec.size() );
104 
105  utility::vector1< T > values;
106  values.reserve( n_wanted );
107  Size n_seen(0);
108  for ( iter it = vec.begin(), end = vec.end(); it != end; ++it ) {
109  ++n_seen;
110  if ( values.size() < n_wanted ) {
111  values.push_back( *it );
112  } else {
113  Real const accept_prob(
114  reservoir_sample_accept_prob( n_wanted, n_seen )
115  );
116 
117  if ( rg.uniform() <= accept_prob ) {
118  Size const idx = rg.random_range( 1, values.size() );
119  values[ idx ] = *it;
120  }
121  }
122  }
123  return values;
124 } // reservoir_sample
125 
126 } // namespace random
127 } // namespace numeric
128 
129 #endif // INCLUDED_numeric_random_reservoir_sampling_HH
static T min(T x, T y)
Definition: Svm.cc:16
utility::vector1< T > values() const
RandomGenerator & rg()
Return the one-per-thread "singleton" random generator.
Definition: random.cc:46
platform::Size Size
Definition: types.hh:42
Random number generator system.
Definition: random.hh:73
Simple container for keeping K random values.
utility::keys::lookup::end< KeyType > const end
Random number generator system.
int random_range(int low, int high)
Return a number uniformly drawn from the inclusive range between low and high. Threadsafe since each ...
Definition: random.cc:58
ReservoirSampler(numeric::Size const wanted)
super::const_iterator const_iterator
Definition: vector1.hh:62
rosetta project type declarations. Should be kept updated with core/types.hh. This exists because num...
double Real
Definition: types.hh:39
vector1: std::vector with 1-based indexing
int random_range(int low, int high)
Returns a random int in the range specified by the arguments.
Definition: random.cc:126
numeric::Real reservoir_sample_accept_prob(numeric::Size n_wanted, numeric::Size const n_seen)
Returns the probability that the Nth value in a sequence should be accepted using the reservoir sampl...
double uniform()
Generate a random number between 0 and 1. Threadsafe since each thread uses its own random generator...
Definition: random.cc:56
platform::Size Size
Definition: random.fwd.hh:30
utility::vector1< T > reservoir_sample(utility::vector1< T > const &vec, numeric::Size const n_wanted, RandomGenerator &rg)