SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
 All Classes Namespaces Files Functions Variables Typedefs Friends
sdsl/include/sdsl/lcp_dac.hpp
Go to the documentation of this file.
00001 /* sdsl - succinct data structures library
00002     Copyright (C) 2011 Simon Gog
00003 
00004     This program is free software: you can redistribute it and/or modify
00005     it under the terms of the GNU General Public License as published by
00006     the Free Software Foundation, either version 3 of the License, or
00007     (at your option) any later version.
00008 
00009     This program is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012     GNU General Public License for more details.
00013 
00014     You should have received a copy of the GNU General Public License
00015     along with this program.  If not, see http://www.gnu.org/licenses/ .
00016 */
00022 #ifndef INCLUDED_SDSL_LCP_DAC
00023 #define INCLUDED_SDSL_LCP_DAC
00024 
00025 #include "lcp.hpp"
00026 #include "int_vector.hpp"
00027 #include "algorithms.hpp"
00028 #include "iterators.hpp"
00029 #include "bitmagic.hpp"
00030 #include "util.hpp"
00031 #include "testutils.hpp"
00032 #include "rank_support_v.hpp"
00033 #include "rank_support_v5.hpp"
00034 #include <iostream>
00035 #include <cassert>
00036 #include <cstring> // for strlen
00037 #include <iomanip>
00038 #include <iterator>
00039 #include <vector>
00040 #include <algorithm> // for max
00041 #include <stdexcept>
00042 
00043 namespace sdsl
00044 {
00045 
00047 
00066 template<uint8_t b=4, class rank_support_type=rank_support_v5<> >
00067 class lcp_dac
00068 {
00069     public:
00070         typedef typename int_vector<>::value_type                value_type;    // STL Container requirement
00071         typedef random_access_const_iterator<lcp_dac>            const_iterator;// STL Container requirement
00072         typedef const_iterator                                                           iterator;              // STL Container requirement
00073         typedef const value_type                                                         const_reference;
00074         typedef const_reference                                                          reference;
00075         typedef const_reference*                                                         pointer;
00076         typedef const pointer                                                            const_pointer;
00077         typedef int_vector<>::size_type                                          size_type;             // STL Container requirement
00078         typedef ptrdiff_t                                                                        difference_type; // STL Container requirement
00079 
00080         typedef lcp_plain_tag                                                            lcp_category; // TODO?
00081 
00082         enum { fast_access = 0,
00083                text_order  = 0,
00084                sa_order   = 1
00085              }; // as the lcp_dac is not fast for texts with long repetition
00086 
00087         template<class Cst>  // template inner class which is used in CSTs to parametrize lcp classes
00088         class type           // with information about the CST. Thanks Stefan Arnold! (2011-03-02)
00089         {
00090             public:
00091                 typedef lcp_dac lcp_type;
00092         };
00093 
00094     private:
00095 
00096         int_vector<b>           m_data;                         // vector which holds the block data for every level
00097         bit_vector                      m_overflow;                     // indicates, if there exists another block for the current number
00098         rank_support_type       m_overflow_rank;        // rank data structure for m_overflow
00099         int_vector<64>          m_level_pointer_and_rank;
00100         uint8_t                         m_max_level;   // maximal number of levels, at most (log n)/b+1
00101 #ifdef LCP_DAC_CACHING
00102         int_vector<64>          m_rank_cache;
00103 #endif
00104 
00105 
00106         void copy(const lcp_dac& lcp_c) {
00107             m_data                                              = lcp_c.m_data;
00108             m_overflow                                  = lcp_c.m_overflow;
00109             m_overflow_rank                             = lcp_c.m_overflow_rank;
00110             m_overflow_rank.set_vector(&m_overflow);
00111             m_level_pointer_and_rank    = lcp_c.m_level_pointer_and_rank;
00112             m_max_level                                 = lcp_c.m_max_level;
00113         }
00114 
00115     public:
00117         lcp_dac() {
00118             // has to be initialized for size() methode
00119             // m_level_pointer_and_rank[2] contains the length of the LCP array
00120             m_level_pointer_and_rank = int_vector<64>(4,0);
00121         }
00123         ~lcp_dac() {}
00125         lcp_dac(const lcp_dac& lcp_c) {
00126             copy(lcp_c);
00127         }
00128 
00130         template<uint8_t int_width, class size_type_class>
00131         lcp_dac(int_vector_file_buffer<int_width, size_type_class>& lcp_buf);
00132 
00133         template<uint8_t int_width, class size_type_class>
00134         void construct(int_vector_file_buffer<int_width, size_type_class>& lcp_buf);
00135 
00137 
00140         size_type size()const {
00141             return m_level_pointer_and_rank[2];
00142         }
00143 
00145 
00148         static size_type max_size() {
00149             return int_vector<>::max_size();
00150         }
00151 
00153 
00156         bool empty()const {
00157             return m_level_pointer_and_rank[2]==0;
00158         }
00159 
00161 
00169         void swap(lcp_dac& lcp_c);
00170 
00172 
00175         const_iterator begin()const;
00176 
00178 
00181         const_iterator end()const;
00182 
00184 
00188         inline value_type operator[](size_type i)const;
00189 
00191 
00194         lcp_dac& operator=(const lcp_dac& lcp_c);
00195 
00197 
00202         bool operator==(const lcp_dac& lcp_c)const;
00203 
00205 
00210         bool operator!=(const lcp_dac& lcp_c)const;
00211 
00213 
00216         size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="") const;
00217 
00219 
00221         void load(std::istream& in);
00222 };
00223 
00224 // == template functions ==
00225 
00226 
00227 template<uint8_t b, class rank_support_type>
00228 template<uint8_t int_width, class size_type_class>
00229 lcp_dac<b, rank_support_type>::lcp_dac(int_vector_file_buffer<int_width, size_type_class>& lcp_buf)
00230 {
00231     construct(lcp_buf);
00232 }
00233 
00234 template<uint8_t b, class rank_support_type>
00235 template<uint8_t int_width, class size_type_class>
00236 void lcp_dac<b, rank_support_type>::construct(int_vector_file_buffer<int_width, size_type_class>& lcp_buf)
00237 {
00238 
00239 //  (1) Count for each level, how many blocks are needed for the representation
00240 //      Running time: \f$ O(n \times \frac{\log n}{b}  \f$
00241 //      Result is sorted in m_level_pointer_and_rank
00242     lcp_buf.reset();
00243     size_type n = lcp_buf.int_vector_size, val=0;
00244     if (n == 0)
00245         return;
00246 //              initialize counter
00247     m_level_pointer_and_rank.resize(std::max(4*bit_magic::l1BP(2), 2*(((bit_magic::l1BP(n)+1)+b-1) / b)));
00248     for (size_type i=0; i < m_level_pointer_and_rank.size(); ++i)
00249         m_level_pointer_and_rank[i] = 0;
00250     m_level_pointer_and_rank[0] = n; // level 0 has n entries
00251 
00252     uint8_t level_x_2 = 0;
00253     for (size_type i=0, r_sum=0, r = lcp_buf.load_next_block(); r_sum < n;) {
00254         for (; i < r_sum+r; ++i) {
00255             val=lcp_buf[i-r_sum];
00256             val >>= b; // shift value b bits to the right
00257             level_x_2 = 2;
00258             while (val) {
00259                 ++m_level_pointer_and_rank[level_x_2]; // increase counter for current level by 1
00260                 val >>= b; // shift value b bits to the right
00261                 level_x_2 += 2; // increase level by 1
00262             }
00263         }
00264         r_sum += r; r = lcp_buf.load_next_block();
00265     }
00266 
00267 //  (2) Determine maximum level and prefix sums of level counters
00268     m_max_level = 0;
00269     size_type sum_blocks = 0, last_block_size=0;
00270     for (size_type i=0, t=0; i < m_level_pointer_and_rank.size(); i+=2) {
00271         t = sum_blocks;
00272         sum_blocks += m_level_pointer_and_rank[i];
00273         m_level_pointer_and_rank[i] = t;
00274 //              std::cout<<"i="<<i<<" cnt="<<t<<std::endl;
00275         if (sum_blocks > t) {
00276             ++m_max_level;
00277             last_block_size = sum_blocks - t;
00278         }
00279     }
00280     m_overflow = bit_vector(sum_blocks - last_block_size, 0);
00281     m_data.resize(sum_blocks);
00282 
00283 //      std::cerr<<"m_overflow.size()="<<m_overflow.size()<<" "<<"m_data.size()="<<m_data.size()<<std::endl;
00284     if (last_block_size == 0) {
00285         std::cerr<<"ERROR: lcp_dac constructor: last_block_size=0"<<std::endl;
00286     }
00287 
00288 //  (3) Enter block and overflow data
00289     int_vector<64> cnt = m_level_pointer_and_rank;
00290     const uint64_t mask = bit_magic::Li1Mask[b];
00291 
00292     lcp_buf.reset();
00293     for (size_type i=0,j=0, r_sum=0, r = lcp_buf.load_next_block(); r_sum < n;) {
00294         for (; i < r_sum+r; ++i) {
00295             val=lcp_buf[i-r_sum];
00296             j = cnt[0]++;
00297             m_data[ j ] =  val & mask;
00298             val >>= b; // shift value b bits to the right
00299             level_x_2 = 2;
00300             while (val) {
00301 //                                      if(j >= m_overflow.size()){
00302 //                                              std::cerr<<"i="<<i<<" j="<<j<<" n="<<n<<" m_overflow.size()="<<m_overflow.size()<<" val="<<val<<" level="<<level_x_2/2<<std::endl;
00303 //                                              return;
00304 //                                      }
00305                 m_overflow[j] = 1;
00306                 j = cnt[level_x_2]++; // increase counter for current level by 1
00307 //                                      if(j >= m_data.size()){
00308 //                                              std::cerr<<"i="<<i<<" j="<<j<<" n="<<n<<" m_data.size()="<<m_data.size()<<" val="<<val<<" level="<<level_x_2/2<<std::endl;
00309 //                                      }
00310                 m_data[ j ] = val & mask;
00311                 val >>= b; // shift value b bits to the right
00312                 level_x_2 += 2; // increase level by 1
00313             }
00314         }
00315         r_sum += r; r = lcp_buf.load_next_block();
00316     }
00317 
00318 //      std::cerr<<"m_overflow.size()="<<m_overflow.size()<<" "<<"m_data.size()="<<m_data.size()<<std::endl;
00319 //  (4) Initialize rank data structure for m_overflow and precalc rank for
00320 //      pointers
00321     util::init_support(m_overflow_rank, &m_overflow);
00322     for (size_type i=0; 2*i < m_level_pointer_and_rank.size() and
00323          m_level_pointer_and_rank[2*i] < m_overflow.size(); ++i) {
00324         m_level_pointer_and_rank[2*i+1] = m_overflow_rank(m_level_pointer_and_rank[2*i]);
00325     }
00326 }
00327 
00328 template<uint8_t b, class rank_support_type>
00329 void lcp_dac<b, rank_support_type>::swap(lcp_dac& lcp_c)
00330 {
00331     m_data.swap(lcp_c.m_data);
00332     m_overflow.swap(lcp_c.m_overflow);
00333     util::swap_support(m_overflow_rank, lcp_c.m_overflow_rank, &m_overflow, &(lcp_c.m_overflow));
00334 
00335     m_level_pointer_and_rank.swap(lcp_c.m_level_pointer_and_rank);
00336     std::swap(m_max_level, lcp_c.m_max_level);
00337 }
00338 
00339 template<uint8_t b, class rank_support_type>
00340 inline typename lcp_dac<b, rank_support_type>::value_type lcp_dac<b, rank_support_type>::operator[](size_type i)const
00341 {
00342     uint8_t level = 1;
00343     uint8_t offset = b;
00344     size_type result = m_data[i];
00345 //      return result;
00346     const uint64_t* p = m_level_pointer_and_rank.data();
00347     uint64_t ppi = (*p)+i; // *p plus i
00348     while (level < m_max_level and m_overflow[ppi]) {
00349         p += 2;
00350         ppi = *p + (m_overflow_rank(ppi) - *(p-1));
00351         result |= (m_data[ppi] << (offset));
00352         ++level;
00353         offset += b;
00354     }
00355     return result;
00356 }
00357 
00358 
00359 template<uint8_t b, class rank_support_type>
00360 typename lcp_dac<b, rank_support_type>::size_type lcp_dac<b, rank_support_type>::serialize(std::ostream& out, structure_tree_node* v, std::string name) const
00361 {
00362     structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
00363     size_type written_bytes = 0;
00364     written_bytes += m_data.serialize(out, child, "data");
00365     written_bytes += m_overflow.serialize(out, child, "overflow");
00366     written_bytes += m_overflow_rank.serialize(out, child, "overflow_rank");
00367     written_bytes += m_level_pointer_and_rank.serialize(out, child, "level_pointer_and_rank");
00368     written_bytes += util::write_member(m_max_level, out, child, "max_level");
00369     structure_tree::add_size(child, written_bytes);
00370     return written_bytes;
00371 }
00372 
00373 template<uint8_t b, class rank_support_type>
00374 void lcp_dac<b, rank_support_type>::load(std::istream& in)
00375 {
00376     m_data.load(in);
00377     m_overflow.load(in);
00378     m_overflow_rank.load(in, &m_overflow); // FIXED that at 2012-02-01 15:34
00379     m_level_pointer_and_rank.load(in);
00380     util::read_member(m_max_level, in);
00381 }
00382 
00383 
00384 template<uint8_t b, class rank_support_type>
00385 lcp_dac<b, rank_support_type>& lcp_dac<b, rank_support_type>::operator=(const lcp_dac& lcp_c)
00386 {
00387     if (this != &lcp_c) {
00388         copy(lcp_c);
00389     }
00390     return *this;
00391 }
00392 
00393 
00394 template<uint8_t b, class rank_support_type>
00395 bool lcp_dac<b, rank_support_type>::operator==(const lcp_dac& lcp_c)const
00396 {
00397     if (this == &lcp_c)
00398         return true;
00399     return      m_data == lcp_c.m_data and m_overflow == lcp_c.overflow
00400             and m_overflow_rank == lcp_c.m_overflow_rank
00401             and m_level_pointer_and_rank = lcp_c.m_level_pointer_and_rank
00402                                            and m_max_level = lcp_c.m_max_level;
00403 }
00404 
00405 template<uint8_t b, class rank_support_type>
00406 bool lcp_dac<b, rank_support_type>::operator!=(const lcp_dac& lcp_c)const
00407 {
00408     return !(*this == lcp_c);
00409 }
00410 
00411 template<uint8_t b, class rank_support_type>
00412 typename lcp_dac<b, rank_support_type>::const_iterator lcp_dac<b, rank_support_type>::begin()const
00413 {
00414     return const_iterator(this, 0);
00415 }
00416 
00417 template<uint8_t b, class rank_support_type>
00418 typename lcp_dac<b, rank_support_type>::const_iterator lcp_dac<b, rank_support_type>::end()const
00419 {
00420     return const_iterator(this, size());
00421 }
00422 
00423 
00424 
00425 } // end namespace sdsl
00426 
00427 #endif