OpenWalnut 1.2.5
WValueSetHistogram.h
00001 //---------------------------------------------------------------------------
00002 //
00003 // Project: OpenWalnut ( http://www.openwalnut.org )
00004 //
00005 // Copyright 2009 OpenWalnut Community, BSV@Uni-Leipzig and CNCF@MPI-CBS
00006 // For more information see http://www.openwalnut.org/copying
00007 //
00008 // This file is part of OpenWalnut.
00009 //
00010 // OpenWalnut is free software: you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as published by
00012 // the Free Software Foundation, either version 3 of the License, or
00013 // (at your option) any later version.
00014 //
00015 // OpenWalnut is distributed in the hope that it will be useful,
00016 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00018 // GNU Lesser General Public License for more details.
00019 //
00020 // You should have received a copy of the GNU Lesser General Public License
00021 // along with OpenWalnut. If not, see <http://www.gnu.org/licenses/>.
00022 //
00023 //---------------------------------------------------------------------------
00024 
00025 #ifndef WVALUESETHISTOGRAM_H
00026 #define WVALUESETHISTOGRAM_H
00027 
00028 #include <map>
00029 #include <list>
00030 #include <utility>
00031 
00032 #include <boost/shared_ptr.hpp>
00033 #include <boost/scoped_array.hpp>
00034 #include <boost/shared_array.hpp>
00035 
00036 #include "../../common/WHistogram.h"
00037 #include "../WValueSet.h"
00038 #include "../WExportDataHandler.h"
00039 
00040 /**
00041  * Used to find the occurrence frequencies of values in a value set. It implements a classical histogram but allows easy modification of bucket
00042  * sizes without unnecessary recalculation of the whole histogram. This histogram uses right-open intervals for counting, which is why there
00043  * always is a bucket at the end from max to infinity which holds all the max values.
00044  *
00045  * \note This histogram is different from from WValueSetHistogram which is a generic histogram class.
00046  */
00047 class OWDATAHANDLER_EXPORT WValueSetHistogram: public WHistogram // NOLINT
00048 {
00049 friend class WValueSetHistogramTest;
00050 public:
00051     /**
00052      * Constructor. Creates the histogram for the specified value set.
00053      *
00054      * \param valueSet source of the data for the histogram
00055      * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
00056      */
00057     explicit WValueSetHistogram( boost::shared_ptr< WValueSetBase > valueSet, size_t buckets = 1000 );
00058 
00059     /**
00060      * Constructor. Creates the histogram for the specified value set.
00061      *
00062      * \param valueSet source of the data for the histogram
00063      * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
00064      */
00065     explicit WValueSetHistogram( const WValueSetBase& valueSet, size_t buckets = 1000 );
00066 
00067     /**
00068      * Constructor. Creates a histogram from the specified value set but allows cropping of values below the given min and above the given max.
00069      * It actually interprets all values below min and above max to be exactly min and exactly max and sorts them into the appropriate bin. This
00070      * is especially useful to filter out outliers in data.
00071      *
00072      * \param valueSet source data
00073      * \param min the new minimum to use
00074      * \param max the maximum to use
00075      * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
00076      */
00077      WValueSetHistogram( boost::shared_ptr< WValueSetBase > valueSet, double min, double max, size_t buckets = 1000 );
00078 
00079     /**
00080      * Constructor. Creates a histogram from the specified value set but allows cropping of values below the given min and above the given max.
00081      * It actually interprets all values below min and above max to be exactly min and exactly max and sorts them into the appropriate bin. This
00082      * is especially useful to filter out outliers in data.
00083      *
00084      * \param valueSet source data
00085      * \param min the new minimum to use
00086      * \param max the maximum to use
00087      * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
00088      */
00089     WValueSetHistogram( const WValueSetBase& valueSet, double min, double max, size_t buckets = 1000 );
00090 
00091     /**
00092      * Copy constructor. If another interval size is given the histogram gets matched to it using the initial bucket data.
00093      * \note this does not deep copy the m_initialBuckets and m_mappedBuckets array as these are shared_array instances.
00094      *
00095      * \param histogram another WValueSetHistogram
00096      * \param buckets the new number of buckets. Must be larger than 1 if specified.
00097      */
00098     WValueSetHistogram( const WValueSetHistogram& histogram, size_t buckets = 0 );
00099 
00100     /**
00101      * Destructor.
00102      */
00103     ~WValueSetHistogram();
00104 
00105     /**
00106      * Copy assignment. Copies the contents of the specified histogram to this instance.
00107      *
00108      * \param other the other instance
00109      *
00110      * \return this instance with the contents of the other one.
00111      * \note this does not deep copy the m_initialBuckets and m_mappedBuckets array as these are shared_array instances.
00112      */
00113     WValueSetHistogram& operator=( const WValueSetHistogram& other );
00114 
00115     /**
00116      * Get the count of the bucket.
00117      *
00118      * \param index which buckets count is to be returned; starts with 0 which is the bucket containing the smallest values.
00119      *
00120      * \return elements in the bucket.
00121      */
00122     virtual size_t operator[]( size_t index ) const;
00123 
00124     /**
00125      * Get the count of the bucket. Testing if the position is valid.
00126      *
00127      * \param index which buckets count is to be returned; starts with 0 which is the bucket containing the smallest values.
00128      *
00129      * \return elements in the bucket
00130      */
00131     virtual size_t at( size_t index ) const;
00132 
00133     /**
00134      * Returns the number of buckets in the histogram with the actual mapping.
00135      *
00136      * \return number of buckets
00137      */
00138     virtual size_t size() const;
00139 
00140     /**
00141      * Return the size of one bucket.
00142      *
00143      * \param index the width for this bucket is queried.
00144      *
00145      * \return the size of a bucket.
00146      */
00147     virtual double getBucketSize( size_t index = 0 ) const;
00148 
00149     /**
00150      * Returns the actual interval associated with the given index. The interval is open, meaning that
00151      * getIntervalForIndex( i ).second == getIntervalForIndex( i + 1 ).first but does not belong anymore to the interval itself but every value
00152      * smaller than getIntervalForIndex( i ).second.
00153      *
00154      * \param index the intex
00155      *
00156      * \return the open interval.
00157      */
00158     virtual std::pair< double, double > getIntervalForIndex( size_t index ) const;
00159 
00160     /**
00161      * Returns the right index to the bucket containing the given value. If a value larger than the maximum, the maximum index is returned. Same
00162      * for minimum; if the value is smaller than the minimum, 0 is returned.
00163      *
00164      * \param value the value to search the index for
00165      *
00166      * \return the index of the bucket
00167      */
00168     virtual size_t getIndexForValue( double value ) const;
00169 
00170     /**
00171      * This returns the number of value set entries added to the histogram. This is especially useful to normalize the histogram counts.
00172      *
00173      * \return the number of elements distributed in the buckets.
00174      */
00175     virtual size_t getTotalElementCount() const;
00176 
00177     /**
00178      * Sums up the buckets in the specified interval. Especially useful for cumulative distribution functions or similar.
00179      *
00180      * \param startIndex the index where to start counting including this one
00181      * \param endIndex the index where to end summing up excluding this one.
00182      *
00183      * \return the sum of all buckets in the interval.
00184      * \throw WOutOfBounds if one of the indices is invalid.
00185      */
00186     virtual size_t accumulate( size_t startIndex, size_t endIndex ) const;
00187 
00188 protected:
00189     /**
00190      * Return the initial buckets.
00191      *
00192      * \return m_initialBuckets
00193      */
00194     boost::shared_array< size_t > getInitialBuckets() const;
00195 
00196     /**
00197      * Return the number of initial buckets.
00198      *
00199      * \return m_nInitialBuckets
00200      */
00201     size_t getNInitialBuckets() const;
00202 
00203     /**
00204      * Return the size of one initial bucket.
00205      *
00206      * \return m_bucketSize
00207      */
00208     double getInitialBucketSize() const;
00209 
00210     /**
00211      * increment the value by one, contains the logic to find the element place in the array.
00212      * Should only be used in the constructor i.e. while iterating over WValueSet.
00213      *
00214      * \param value value to increment
00215      */
00216     virtual void insert( double value );
00217 
00218 private:
00219 
00220     /**
00221      * Size of one bucket in the initial histogram.
00222      */
00223     double m_initialBucketSize;
00224 
00225     /**
00226      * Pointer to all initial buckets of the histogram.
00227      */
00228     boost::shared_array< size_t > m_initialBuckets;
00229 
00230     /**
00231      * Number of buckets in the initial histogram.
00232      */
00233     size_t m_nInitialBuckets;
00234 
00235     /**
00236      * Pointer to all initial buckets of the histogram.
00237      */
00238     boost::shared_array< size_t > m_mappedBuckets;
00239 
00240     /**
00241      * Tracks the number of a buckets in the mapped histogram.
00242      */
00243     size_t m_nMappedBuckets;
00244 
00245     /**
00246      * Size of one bucket in the mapped histogram.
00247      */
00248     double m_mappedBucketSize;
00249 
00250     /**
00251      * The number of elements distributed in the buckets.
00252      */
00253     size_t m_nbTotalElements;
00254 
00255     /**
00256      * Actually builds the histogram. This function is simply used for avoiding code duplication in all these constructors.
00257      *
00258      * \param valueSet the value set.
00259      */
00260     void buildHistogram( const WValueSetBase& valueSet );
00261 };
00262 
00263 /**
00264  * Write a histogram in string representation to the given output stream.
00265  */
00266 std::ostream& operator<<( std::ostream& out, const WValueSetHistogram& h );
00267 
00268 inline size_t WValueSetHistogram::getIndexForValue( double value ) const
00269 {
00270     // the position on the scala
00271     double pos = ( value - m_minimum ) / static_cast< double >( m_mappedBucketSize );
00272     // the index is the floor( position )
00273     size_t idx = static_cast< size_t >( std::floor( pos ) );
00274 
00275     // is the index larger than the size?
00276     bool inU = ( idx < m_nMappedBuckets );
00277     // is the index smaller than the size?
00278     bool inL = ( pos >= 0.0 );
00279 
00280     // the trick done here is to clamp value into [m_minimum,m_maximum] without using if statements. The C++ Standard says that booleans are
00281     // always 1 if true.
00282     // NOTE: this is integral arithmetic
00283     return ( inL && inU ) * idx + ( !inU && inL ) * ( m_nMappedBuckets - 1 );
00284 }
00285 
00286 #endif  // WVALUESETHISTOGRAM_H
00287 
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends