RDKit
Open-source cheminformatics and machine learning.
MaxMinPicker.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
3 // Copyright (C) 2017 Greg Landrum and NextMove Software
4 //
5 // @@ All Rights Reserved @@
6 // This file is part of the RDKit.
7 // The contents are covered by the terms of the BSD license
8 // which is included in the file license.txt, found at the root
9 // of the RDKit source tree.
10 //
11 #include <RDGeneral/export.h>
12 #ifndef RD_MAXMINPICKER_H
13 #define RD_MAXMINPICKER_H
14 
15 #include <RDGeneral/types.h>
16 #include <RDGeneral/utils.h>
17 #include <RDGeneral/Invariant.h>
18 #include <RDGeneral/RDLog.h>
19 #include <RDGeneral/Exceptions.h>
20 #include <cstdlib>
21 #include "DistPicker.h"
22 #include <boost/random.hpp>
23 #include <random>
24 
25 namespace RDPickers {
26 
27 namespace {
28 class RDKIT_SIMDIVPICKERS_EXPORT distmatFunctor {
29  public:
30  distmatFunctor(const double *distMat) : dp_distMat(distMat){};
31  double operator()(unsigned int i, unsigned int j) {
32  return getDistFromLTM(this->dp_distMat, i, j);
33  }
34 
35  private:
36  const double *dp_distMat;
37 };
38 } // namespace
39 
40 /*! \brief Implements the MaxMin algorithm for picking a subset of item from a
41  *pool
42  *
43  * This class inherits from the DistPicker and implements a specific picking
44  *strategy
45  * aimed at diversity. See documentation for "pick()" member function for the
46  *algorithm details
47  */
49  public:
50  /*! \brief Default Constructor
51  *
52  */
54 
55  /*! \brief Contains the implementation for a lazy MaxMin diversity picker
56  *
57  * See the documentation for the pick() method for details about the algorithm
58  *
59  * \param func - a function (or functor) taking two unsigned ints as
60  *arguments
61  * and returning the distance (as a double) between those two
62  *elements.
63  * \param poolSize - the size of the pool to pick the items from. It is
64  *assumed that the
65  * distance matrix above contains the right number of elements;
66  *i.e.
67  * poolSize*(poolSize-1)
68  * \param pickSize - the number items to pick from pool (<= poolSize)
69  * \param firstPicks - (optional)the first items in the pick list
70  * \param seed - (optional) seed for the random number generator.
71  * If this is <0 the generator will be seeded with a
72  * random number.
73  */
74  template <typename T>
75  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
76  unsigned int pickSize) const;
77 
78  template <typename T>
79  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
80  unsigned int pickSize,
81  const RDKit::INT_VECT &firstPicks,
82  int seed = -1) const;
83 
84  template <typename T>
85  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
86  unsigned int pickSize,
87  const RDKit::INT_VECT &firstPicks, int seed,
88  double &threshold) const;
89 
90  /*! \brief Contains the implementation for the MaxMin diversity picker
91  *
92  * Here is how the picking algorithm works, refer to
93  * Ashton, M. et. al., Quant. Struct.-Act. Relat., 21 (2002), 598-604
94  * for more detail:
95  *
96  * A subset of k items is to be selected from a pool containing N molecules.
97  * Then the MaxMin method is as follows:
98  * -# Initialise Subset with some appropriately chosen seed
99  * compound and set x = 1.
100  * -# For each of the N-x remaining compounds in Dataset
101  * calculate its dissimilarity with each of the x compounds in
102  * Subset and retain the smallest of these x dissimilarities for
103  * each compound in Dataset.
104  * -# Select the molecule from Dataset with the largest value
105  * for the smallest dissimilarity calculated in Step 2 and
106  * transfer it to Subset.
107  * -# Set x = x + 1 and return to Step 2 if x < k.
108  *
109  *
110  *
111  * \param distMat - distance matrix - a vector of double. It is assumed that
112  *only the
113  * lower triangle element of the matrix are supplied in a 1D
114  *array\n
115  * \param poolSize - the size of the pool to pick the items from. It is
116  *assumed that the
117  * distance matrix above contains the right number of elements;
118  *i.e.
119  * poolSize*(poolSize-1) \n
120  * \param pickSize - the number items to pick from pool (<= poolSize)
121  * \param firstPicks - indices of the items used to seed the pick set.
122  * \param seed - (optional) seed for the random number generator
123  * If this is <0 the generator will be seeded with a
124  * random number.
125  */
126  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
127  unsigned int pickSize, RDKit::INT_VECT firstPicks,
128  int seed = -1) const {
129  CHECK_INVARIANT(distMat, "Invalid Distance Matrix");
130  if (!poolSize) throw ValueErrorException("empty pool to pick from");
131  if (poolSize < pickSize)
132  throw ValueErrorException("pickSize cannot be larger than the poolSize");
133  distmatFunctor functor(distMat);
134  return this->lazyPick(functor, poolSize, pickSize, firstPicks, seed);
135  }
136 
137  /*! \overload */
138  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
139  unsigned int pickSize) const {
140  RDKit::INT_VECT iv;
141  return pick(distMat, poolSize, pickSize, iv);
142  }
143 };
144 
146  double dist_bound; // distance to closest reference
147  unsigned int picks; // number of references considered
148  unsigned int next; // singly linked list of candidates
149 };
150 
151 // we implement this here in order to allow arbitrary functors without link
152 // errors
153 template <typename T>
154 RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
155  unsigned int pickSize,
156  const RDKit::INT_VECT &firstPicks,
157  int seed, double &threshold) const {
158  if (!poolSize) throw ValueErrorException("empty pool to pick from");
159 
160  if (poolSize < pickSize)
161  throw ValueErrorException("pickSize cannot be larger than the poolSize");
162 
163  RDKit::INT_VECT picks;
164 
165  unsigned int memsize = (unsigned int)(poolSize * sizeof(MaxMinPickInfo));
166  MaxMinPickInfo *pinfo = new MaxMinPickInfo[memsize];
167  if (!pinfo) {
168  threshold = -1.0;
169  return picks;
170  }
171  memset(pinfo, 0, memsize);
172 
173  picks.reserve(pickSize);
174  unsigned int picked = 0; // picks.size()
175  unsigned int pick = 0;
176 
177  // pick the first entry
178  if (firstPicks.empty()) {
179  // get a seeded random number generator:
180  typedef boost::mt19937 rng_type;
181  typedef boost::uniform_int<> distrib_type;
182  typedef boost::variate_generator<rng_type &, distrib_type> source_type;
183  rng_type generator;
184  distrib_type dist(0, poolSize - 1);
185  if (seed >= 0) {
186  generator.seed(static_cast<rng_type::result_type>(seed));
187  } else {
188  generator.seed(std::random_device()());
189  }
190  source_type randomSource(generator, dist);
191  pick = randomSource();
192  // add the pick to the picks
193  picks.push_back(pick);
194  // and remove it from the pool
195  pinfo[pick].picks = 1;
196  picked = 1;
197 
198  } else {
199  for (RDKit::INT_VECT::const_iterator pIdx = firstPicks.begin();
200  pIdx != firstPicks.end(); ++pIdx) {
201  pick = static_cast<unsigned int>(*pIdx);
202  if (pick >= poolSize) {
203  delete[] pinfo;
204  throw ValueErrorException("pick index was larger than the poolSize");
205  }
206  picks.push_back(pick);
207  pinfo[pick].picks = 1;
208  picked++;
209  }
210  }
211 
212  if (picked >= pickSize) {
213  threshold = -1.0;
214  delete[] pinfo;
215  return picks;
216  }
217 
218  unsigned int pool_list = 0;
219  unsigned int *prev = &pool_list;
220  // enter the pool into a list so that we can pick out of it easily
221  for (unsigned int i = 0; i < poolSize; i++)
222  if (pinfo[i].picks == 0) {
223  *prev = i;
224  prev = &pinfo[i].next;
225  }
226  *prev = 0;
227 
228  unsigned int poolIdx;
229  unsigned int pickIdx;
230 
231  // First pass initialize dist_bound
232  prev = &pool_list;
233  pickIdx = picks[0];
234  do {
235  poolIdx = *prev;
236  pinfo[poolIdx].dist_bound = func(poolIdx, pickIdx);
237  pinfo[poolIdx].picks = 1;
238  prev = &pinfo[poolIdx].next;
239  } while (*prev != 0);
240 
241  // now pick 1 compound at a time
242  double maxOFmin = -1.0;
243  double tmpThreshold = -1.0;
244  while (picked < pickSize) {
245  unsigned int *pick_prev = 0;
246  maxOFmin = -1.0;
247  prev = &pool_list;
248  do {
249  poolIdx = *prev;
250  double minTOi = pinfo[poolIdx].dist_bound;
251  if (minTOi > maxOFmin) {
252  unsigned int pi = pinfo[poolIdx].picks;
253  while (pi < picked) {
254  unsigned int picki = picks[pi];
255  CHECK_INVARIANT(poolIdx != picki, "pool index != pick index");
256  double dist = func(poolIdx, picki);
257  pi++;
258  if (dist <= minTOi) {
259  minTOi = dist;
260  if (minTOi <= maxOFmin) break;
261  }
262  }
263  pinfo[poolIdx].dist_bound = minTOi;
264  pinfo[poolIdx].picks = pi;
265  if (minTOi > maxOFmin) {
266  maxOFmin = minTOi;
267  pick_prev = prev;
268  pick = poolIdx;
269  }
270  }
271  prev = &pinfo[poolIdx].next;
272  } while (*prev != 0);
273 
274  // if the current distance is closer then threshold, we're done
275  if (threshold >= 0.0 && maxOFmin < threshold) break;
276  tmpThreshold = maxOFmin;
277  // now add the new pick to picks and remove it from the pool
278  *pick_prev = pinfo[pick].next;
279  picks.push_back(pick);
280  picked++;
281  }
282 
283  threshold = tmpThreshold;
284  delete[] pinfo;
285  return picks;
286 }
287 
288 template <typename T>
289 RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
290  unsigned int pickSize,
291  const RDKit::INT_VECT &firstPicks,
292  int seed) const {
293  double threshold = -1.0;
294  return MaxMinPicker::lazyPick(func, poolSize, pickSize, firstPicks, seed,
295  threshold);
296 }
297 
298 template <typename T>
299 RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
300  unsigned int pickSize) const {
301  RDKit::INT_VECT firstPicks;
302  double threshold = -1.0;
303  int seed = -1;
304  return MaxMinPicker::lazyPick(func, poolSize, pickSize, firstPicks, seed,
305  threshold);
306 }
307 }; // namespace RDPickers
308 
309 #endif
boost::minstd_rand rng_type
Definition: utils.h:38
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:100
#define RDKIT_SIMDIVPICKERS_EXPORT
Definition: export.h:606
RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, unsigned int pickSize) const
Contains the implementation for a lazy MaxMin diversity picker.
Definition: MaxMinPicker.h:299
std::vector< int > INT_VECT
Definition: types.h:253
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const
Definition: MaxMinPicker.h:138
Implements the MaxMin algorithm for picking a subset of item from a pool.
Definition: MaxMinPicker.h:48
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize, RDKit::INT_VECT firstPicks, int seed=-1) const
Contains the implementation for the MaxMin diversity picker.
Definition: MaxMinPicker.h:126
MaxMinPicker()
Default Constructor.
Definition: MaxMinPicker.h:53
RDKIT_SIMDIVPICKERS_EXPORT double getDistFromLTM(const double *distMat, unsigned int i, unsigned int j)
function to lookup distance from 1D lower triangular distance matrix
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:33
Abstract base class to do perform item picking (typically molecules) using a distance matrix...
Definition: DistPicker.h:44