SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
 All Classes Namespaces Files Functions Variables Typedefs Friends
sdsl/include/sdsl/enc_vector.hpp
Go to the documentation of this file.
00001 /* sdsl - succinct data structures library
00002     Copyright (C) 2008 Simon Gog
00003 
00004     This program is free software: you can redistribute it and/or modify
00005     it under the terms of the GNU General Public License as published by
00006     the Free Software Foundation, either version 3 of the License, or
00007     (at your option) any later version.
00008 
00009     This program is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012     GNU General Public License for more details.
00013 
00014     You should have received a copy of the GNU General Public License
00015     along with this program.  If not, see http://www.gnu.org/licenses/ .
00016 */
00021 #include "int_vector.hpp"
00022 #include "elias_delta_coder.hpp"
00023 #include "iterators.hpp"
00024 
00025 #ifndef SDSL_ENC_VECTOR
00026 #define SDSL_ENC_VECTOR
00027 
00029 namespace sdsl
00030 {
00031 
00032 template<uint8_t fixedIntWidth>
00033 struct enc_vector_trait {
00034     typedef int_vector<0> int_vector_type;
00035 };
00036 
00037 template<>
00038 struct enc_vector_trait<32> {
00039     typedef int_vector<32> int_vector_type;
00040 };
00041 
00043 
00050 template<class Coder=coder::elias_delta,
00051          uint32_t SampleDens = 8, uint8_t fixedIntWidth=0>
00052 class enc_vector
00053 {
00054     public:
00055         typedef uint64_t                                                        value_type;     // STL Container requirement
00056         typedef random_access_const_iterator<enc_vector> iterator;// STL Container requirement
00057         typedef iterator                                                        const_iterator; // STL Container requirement
00058         typedef const value_type                                        reference;
00059         typedef const value_type                                        const_reference;
00060         typedef const value_type*                                       const_pointer;
00061         typedef ptrdiff_t                                                       difference_type;// STL Container requirement
00062         typedef int_vector<>::size_type                         size_type;              // STL Container requirement
00063         typedef Coder                                                           coder;
00064         typedef typename enc_vector_trait<fixedIntWidth>::int_vector_type int_vector_type;
00065         static  const uint32_t                                          sample_dens     = SampleDens;   // Required member
00066 
00067         int_vector<0>   m_z;            // compressed bit stream
00068     private:
00069         int_vector_type   m_sample_vals_and_pointer;
00070         size_type                 m_elements;    // number of elements
00071 
00072         // workaround function for the constructor
00073         void construct() {
00074             m_elements = 0;
00075         }
00076         void copy(const enc_vector& v);
00077 
00078         void clear() {
00079             m_z.resize(0);
00080             m_elements = 0;
00081             m_sample_vals_and_pointer.resize(0);
00082         }
00083 
00084     public:
00086         enc_vector() {
00087             construct();
00088         };
00090 
00093         enc_vector(const enc_vector& v);
00094 
00096 
00099         template<class Container>
00100         enc_vector(const Container& c) {
00101             construct();
00102             init(c);
00103         }
00104 
00106         /*
00107             \param v_buf A int_vector_file_buf.
00108                 \pre No two adjacent values should be equal.
00109         */
00110         template<uint8_t int_width, class size_type_class>
00111         enc_vector(int_vector_file_buffer<int_width, size_type_class>& v_buf) {
00112             construct();
00113             init(v_buf);
00114         }
00115 
00116         template<class Container>
00117         void init(const Container& c);
00118 
00119         template<uint8_t int_width, class size_type_class>
00120         void init(int_vector_file_buffer<int_width, size_type_class>& v_buf);
00121 
00123         ~enc_vector() {
00124         }
00125 
00127 
00131         size_type size()const;
00132 
00134 
00136         static size_type max_size();
00137 
00139 
00144         bool empty() const;
00145 
00147 
00155         void swap(enc_vector& v);
00156 
00158 
00162         const const_iterator begin()const;
00163 
00165 
00169         const const_iterator end()const;
00170 
00171         // Iterator that points to the last element of the enc_vector.
00172         /*
00173          *      Required for the Container Concept of the STL.
00174          *  \sa rend()
00175          */
00176 //              reverse_iterator rbegin()const;
00177 
00178         // Iterator that points to the position before the first element of the enc_vector.
00179         /*
00180          *      Required for the Container Concept of the STL
00181          *  \sa rbegin()
00182          */
00183 //              reverse_iterator rend()const;
00184 
00186 
00190         value_type operator[](size_type i)const;
00191 
00193 
00196         enc_vector& operator=(const enc_vector& v);
00197 
00199 
00207         bool operator==(const enc_vector& v)const;
00208 
00210 
00216         bool operator!=(const enc_vector& v)const;
00217 
00219 
00222         size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const;
00223 
00225         void load(std::istream& in);
00226 
00228 
00231         value_type sample(const size_type i) const;
00232 
00233         uint32_t get_sample_dens() const;
00234 
00239 //              template<class Iterator>
00240         void get_inter_sampled_values(const size_type i, uint64_t* it)const {
00241             *(it++) = 0;
00242             if (i*SampleDens + SampleDens - 1 < size()) {
00243                 Coder::template decode<true, true>(m_z.data(), m_sample_vals_and_pointer[(i<<1)+1], SampleDens - 1, it);
00244             } else {
00245                 assert(i*SampleDens < size());
00246                 Coder::template decode<true, true>(m_z.data(), m_sample_vals_and_pointer[(i<<1)+1], size()-i*SampleDens - 1, it);
00247             }
00248         };
00249 };
00250 
00251 
00252 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth>
00253 inline uint32_t enc_vector<Coder, SampleDens, fixedIntWidth>::get_sample_dens() const
00254 {
00255     return SampleDens;
00256 }
00257 
00258 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth>
00259 inline typename enc_vector<Coder, SampleDens,fixedIntWidth>::value_type enc_vector<Coder, SampleDens,fixedIntWidth>::operator[](const size_type i)const
00260 {
00261     assert(i+1 != 0);
00262 #ifdef SDSL_DEBUG
00263     if (i >= m_elements) {
00264         throw std::out_of_range("OUT_OF_RANGE_ERROR: enc_vector::operator[](size_type); i >= size()!");
00265         return 0;
00266     }
00267 #endif
00268     size_type idx = i/get_sample_dens();
00269     return m_sample_vals_and_pointer[idx<<1] + Coder::decode_prefix_sum(m_z.data(), m_sample_vals_and_pointer[(idx<<1)+1], i-SampleDens*idx);
00270 }
00271 
00272 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth>
00273 inline typename enc_vector<Coder, SampleDens,fixedIntWidth>::value_type enc_vector<Coder, SampleDens,fixedIntWidth>::sample(const size_type i)const
00274 {
00275     assert(i*get_sample_dens()+1 != 0);
00276 #ifdef SDSL_DEBUG
00277     if (i*get_sample_dens() >= m_elements) {
00278         throw std::out_of_range("OUT_OF_RANGE_ERROR: enc_vector::sample(size_type); i*get_sample_dens() >= size()!");
00279         return 0;
00280     }
00281 #endif
00282     return m_sample_vals_and_pointer[i<<1];
00283 }
00284 
00285 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth>
00286 inline enc_vector<>::size_type enc_vector<Coder, SampleDens,fixedIntWidth>::size()const
00287 {
00288     return m_elements;
00289 }
00290 
00291 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth>
00292 inline enc_vector<>::size_type enc_vector<Coder, SampleDens,fixedIntWidth>::max_size()
00293 {
00294     return int_vector<>::max_size()/2; // each element could possible occupy double space with selfdelimiting codes
00295 }
00296 
00297 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth>
00298 inline bool enc_vector<Coder, SampleDens,fixedIntWidth>::empty()const
00299 {
00300     return 0==m_elements;
00301 }
00302 
00303 
00304 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth>
00305 void enc_vector<Coder, SampleDens,fixedIntWidth>::copy(const enc_vector<Coder, SampleDens,fixedIntWidth>& v)
00306 {
00307     m_z                                                 = v.m_z;                                // copy compressed bit stream
00308     m_sample_vals_and_pointer   = v.m_sample_vals_and_pointer;      // copy sample values
00309     m_elements                                  = v.m_elements;                 // copy number of stored elements
00310 }
00311 
00312 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth>
00313 enc_vector<Coder, SampleDens,fixedIntWidth>::enc_vector(const enc_vector& v)
00314 {
00315     copy(v);
00316 }
00317 
00318 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth>
00319 enc_vector<Coder, SampleDens,fixedIntWidth>& enc_vector<Coder, SampleDens,fixedIntWidth>::operator=(const enc_vector<Coder, SampleDens,fixedIntWidth>& v)
00320 {
00321     if (this != &v) {// if v and _this_ are not the same object
00322         copy(v);
00323     }
00324     return *this;
00325 }
00326 
00327 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth>
00328 bool enc_vector<Coder, SampleDens,fixedIntWidth>::operator==(const enc_vector<Coder, SampleDens,fixedIntWidth>& v)const
00329 {
00330     if (this == &v)
00331         return true;
00332     return              m_elements == v.m_elements
00333                 and     m_z == v.m_z
00334                 and     m_sample_vals_and_pointer == v.m_sample_vals_and_pointer;
00335 }
00336 
00337 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth>
00338 bool enc_vector<Coder, SampleDens,fixedIntWidth>::operator!=(const enc_vector<Coder, SampleDens,fixedIntWidth>& v)const
00339 {
00340     return !(*this == v);
00341 }
00342 
00343 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth>
00344 void enc_vector<Coder, SampleDens,fixedIntWidth>::swap(enc_vector<Coder, SampleDens,fixedIntWidth>& v)
00345 {
00346     if (this != &v) { // if v and _this_ are not the same object
00347         m_z.swap(v.m_z);                                        // swap compressed bit streams
00348         m_sample_vals_and_pointer.swap(v.m_sample_vals_and_pointer);
00349         std::swap(m_elements, v.m_elements);// swap the number of elements
00350     }
00351 }
00352 
00353 
00354 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth>
00355 template<class Container>
00356 void enc_vector<Coder, SampleDens,fixedIntWidth>::init(const Container& c)
00357 {
00358     // clear bit_vectors
00359     clear();
00360 
00361     if (c.empty())  // if c is empty there is nothing to do...
00362         return;
00363     typename Container::const_iterator  it                      = c.begin(), end = c.end();
00364     typename Container::value_type              v1                      = *it, v2, max_sample_value=0, x;
00365     size_type samples=0;
00366     size_type z_size = 0;
00367 //  (1) Calculate maximal value of samples and of deltas
00368     for (size_type i=0, no_sample=0; it != end; ++it,++i, --no_sample) {
00369         v2 = *it;
00370         if (!no_sample) { // add a sample
00371             no_sample = get_sample_dens();
00372             if (max_sample_value < v2) max_sample_value = v2;
00373             ++samples;
00374         } else {
00375             z_size += Coder::encoding_length(v2-v1);
00376         }
00377         v1=v2;
00378     }
00379 //      (2) Write sample values and deltas
00380     {
00381         if (max_sample_value > z_size+1)
00382             m_sample_vals_and_pointer.set_int_width(bit_magic::l1BP(max_sample_value) + 1);
00383         else
00384             m_sample_vals_and_pointer.set_int_width(bit_magic::l1BP(z_size+1) + 1);
00385         m_sample_vals_and_pointer.resize(2*samples+2); // add 2 for last entry
00386         util::set_zero_bits(m_sample_vals_and_pointer);
00387         typename int_vector_type::iterator sv_it = m_sample_vals_and_pointer.begin();
00388         z_size = 0;
00389         size_type no_sample=0;
00390         for (it = c.begin(); it != end; ++it, --no_sample) {
00391             v2 = *it;
00392             if (!no_sample) { // add a sample
00393                 no_sample = get_sample_dens();
00394                 *sv_it = v2; ++sv_it;
00395                 *sv_it = z_size; ++sv_it;
00396             } else {
00397                 x = v2-v1;
00398 //                if (v2 == v1) {
00399 //                    throw std::logic_error("enc_vector cannot decode adjacent equal values!");
00400 //                }
00401                 z_size += Coder::encoding_length(x);
00402             }
00403             v1=v2;
00404         }
00405         *sv_it = 0; ++sv_it;        // initialize
00406         *sv_it = z_size+1; ++sv_it; // last entry
00407 
00408         util::assign(m_z, int_vector<>(z_size, 0, 1));
00409         uint64_t* z_data = Coder::raw_data(m_z);
00410         uint8_t offset = 0;
00411         no_sample = 0;
00412         for (it = c.begin(); it != end; ++it, --no_sample) {
00413             v2 = *it;
00414             if (!no_sample) { // add a sample
00415                 no_sample = get_sample_dens();
00416             } else {
00417                 Coder::encode(v2-v1, z_data, offset);
00418             }
00419             v1=v2;
00420         }
00421 //              Coder::encode(delta_c, m_z); // encode delta_c to m_z
00422     }
00423 //      delta_c.resize(0);
00424 //std::cerr<<"Finished "<<std::endl;,
00425     m_elements = c.size();
00426 }
00427 
00428 
00429 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth>
00430 template<uint8_t int_width, class size_type_class>
00431 void enc_vector<Coder, SampleDens,fixedIntWidth>::init(int_vector_file_buffer<int_width, size_type_class>& v_buf)
00432 {
00433     // clear bit_vectors
00434     clear();
00435     size_type n = v_buf.int_vector_size;
00436     if (n == 0)  // if c is empty there is nothing to do...
00437         return;
00438     v_buf.reset();
00439     value_type  v1=0, v2=0, max_sample_value=0;
00440     size_type samples=0, z_size=0;
00441     const size_type sd = get_sample_dens();
00442 //  (1) Calculate maximal value of samples and of deltas
00443     for (size_type i=0, r_sum=0, r = v_buf.load_next_block(), no_sample = 0; r_sum < n;) {
00444         for (; i < r_sum+r; ++i, --no_sample) {
00445             v2 = v_buf[i-r_sum];
00446             if (!no_sample) { // is sample
00447                 no_sample = sd;
00448                 if (max_sample_value < v2) max_sample_value = v2;
00449                 ++samples;
00450             } else {
00451 //                if (v2 == v1) {
00452 //                    throw std::logic_error("enc_vector cannot decode adjacent equal values!");
00453 //                }
00454                 z_size += Coder::encoding_length(v2-v1);
00455             }
00456             v1 = v2;
00457         }
00458         r_sum += r; r = v_buf.load_next_block();
00459     }
00460 
00461 //      (2) Write sample values and deltas
00462 //     (a) Initialize array for sample values and pointers
00463     if (max_sample_value > z_size+1)
00464         m_sample_vals_and_pointer.set_int_width(bit_magic::l1BP(max_sample_value) + 1);
00465     else
00466         m_sample_vals_and_pointer.set_int_width(bit_magic::l1BP(z_size+1) + 1);
00467     m_sample_vals_and_pointer.resize(2*samples+2); // add 2 for last entry
00468     util::set_zero_bits(m_sample_vals_and_pointer);
00469 
00470 //     (b) Initilize bit_vector for encoded data
00471     util::assign(m_z, int_vector<>(z_size, 0, 1));
00472     uint64_t* z_data = Coder::raw_data(m_z);
00473     uint8_t offset = 0;
00474 
00475 //     (c) Write sample values and deltas
00476     v_buf.reset();
00477     z_size = 0;
00478     for (size_type i=0, j=0, r_sum=0, r = v_buf.load_next_block(), no_sample = 0; r_sum < n;) {
00479         for (; i < r_sum+r; ++i, --no_sample) {
00480             v2 = v_buf[i-r_sum];
00481             if (!no_sample) { // is sample
00482                 no_sample = sd;
00483                 m_sample_vals_and_pointer[j++] = v2;    // write samples
00484                 m_sample_vals_and_pointer[j++] = z_size;// write pointers
00485             } else {
00486                 z_size += Coder::encoding_length(v2-v1);
00487                 Coder::encode(v2-v1, z_data, offset);   // write encoded values
00488             }
00489             v1 = v2;
00490         }
00491         r_sum += r; r = v_buf.load_next_block();
00492     }
00493     m_elements = n;
00494 }
00495 
00496 
00497 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth>
00498 enc_vector<>::size_type enc_vector<Coder, SampleDens,fixedIntWidth>::serialize(std::ostream& out, structure_tree_node* v, std::string name)const
00499 {
00500     structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
00501     size_type written_bytes = 0;
00502     written_bytes += util::write_member(m_elements, out, child, "elements");
00503     written_bytes += m_z.serialize(out, child, "compressed differences");
00504     written_bytes += m_sample_vals_and_pointer.serialize(out, child, "samples_and_pointers");
00505     structure_tree::add_size(child, written_bytes);
00506     return written_bytes;
00507 }
00508 
00509 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth>
00510 void enc_vector<Coder, SampleDens,fixedIntWidth>::load(std::istream& in)
00511 {
00512     util::read_member(m_elements, in);
00513     m_z.load(in);
00514     m_sample_vals_and_pointer.load(in);
00515 }
00516 
00517 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth>
00518 const typename enc_vector<Coder,SampleDens,fixedIntWidth>::const_iterator enc_vector<Coder, SampleDens,fixedIntWidth>::begin()const
00519 {
00520     return const_iterator(this, 0);
00521 }
00522 
00523 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth>
00524 const typename enc_vector<Coder,SampleDens,fixedIntWidth>::const_iterator enc_vector<Coder, SampleDens,fixedIntWidth>::end()const
00525 {
00526     return const_iterator(this, this->m_elements);
00527 }
00528 
00529 
00530 } // end namespace sdsl
00531 
00532 #endif