SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2011 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00022 #ifndef INCLUDED_SDSL_LCP_DAC 00023 #define INCLUDED_SDSL_LCP_DAC 00024 00025 #include "lcp.hpp" 00026 #include "int_vector.hpp" 00027 #include "algorithms.hpp" 00028 #include "iterators.hpp" 00029 #include "bitmagic.hpp" 00030 #include "util.hpp" 00031 #include "testutils.hpp" 00032 #include "rank_support_v.hpp" 00033 #include "rank_support_v5.hpp" 00034 #include <iostream> 00035 #include <cassert> 00036 #include <cstring> // for strlen 00037 #include <iomanip> 00038 #include <iterator> 00039 #include <vector> 00040 #include <algorithm> // for max 00041 #include <stdexcept> 00042 00043 namespace sdsl 00044 { 00045 00047 00066 template<uint8_t b=4, class rank_support_type=rank_support_v5<> > 00067 class lcp_dac 00068 { 00069 public: 00070 typedef typename int_vector<>::value_type value_type; // STL Container requirement 00071 typedef random_access_const_iterator<lcp_dac> const_iterator;// STL Container requirement 00072 typedef const_iterator iterator; // STL Container requirement 00073 typedef const value_type const_reference; 00074 typedef const_reference reference; 00075 typedef const_reference* pointer; 00076 typedef const pointer const_pointer; 00077 typedef int_vector<>::size_type size_type; // STL Container requirement 00078 typedef ptrdiff_t difference_type; // STL Container requirement 00079 00080 typedef lcp_plain_tag lcp_category; // TODO? 00081 00082 enum { fast_access = 0, 00083 text_order = 0, 00084 sa_order = 1 00085 }; // as the lcp_dac is not fast for texts with long repetition 00086 00087 template<class Cst> // template inner class which is used in CSTs to parametrize lcp classes 00088 class type // with information about the CST. Thanks Stefan Arnold! (2011-03-02) 00089 { 00090 public: 00091 typedef lcp_dac lcp_type; 00092 }; 00093 00094 private: 00095 00096 int_vector<b> m_data; // vector which holds the block data for every level 00097 bit_vector m_overflow; // indicates, if there exists another block for the current number 00098 rank_support_type m_overflow_rank; // rank data structure for m_overflow 00099 int_vector<64> m_level_pointer_and_rank; 00100 uint8_t m_max_level; // maximal number of levels, at most (log n)/b+1 00101 #ifdef LCP_DAC_CACHING 00102 int_vector<64> m_rank_cache; 00103 #endif 00104 00105 00106 void copy(const lcp_dac& lcp_c) { 00107 m_data = lcp_c.m_data; 00108 m_overflow = lcp_c.m_overflow; 00109 m_overflow_rank = lcp_c.m_overflow_rank; 00110 m_overflow_rank.set_vector(&m_overflow); 00111 m_level_pointer_and_rank = lcp_c.m_level_pointer_and_rank; 00112 m_max_level = lcp_c.m_max_level; 00113 } 00114 00115 public: 00117 lcp_dac() { 00118 // has to be initialized for size() methode 00119 // m_level_pointer_and_rank[2] contains the length of the LCP array 00120 m_level_pointer_and_rank = int_vector<64>(4,0); 00121 } 00123 ~lcp_dac() {} 00125 lcp_dac(const lcp_dac& lcp_c) { 00126 copy(lcp_c); 00127 } 00128 00130 template<uint8_t int_width, class size_type_class> 00131 lcp_dac(int_vector_file_buffer<int_width, size_type_class>& lcp_buf); 00132 00133 template<uint8_t int_width, class size_type_class> 00134 void construct(int_vector_file_buffer<int_width, size_type_class>& lcp_buf); 00135 00137 00140 size_type size()const { 00141 return m_level_pointer_and_rank[2]; 00142 } 00143 00145 00148 static size_type max_size() { 00149 return int_vector<>::max_size(); 00150 } 00151 00153 00156 bool empty()const { 00157 return m_level_pointer_and_rank[2]==0; 00158 } 00159 00161 00169 void swap(lcp_dac& lcp_c); 00170 00172 00175 const_iterator begin()const; 00176 00178 00181 const_iterator end()const; 00182 00184 00188 inline value_type operator[](size_type i)const; 00189 00191 00194 lcp_dac& operator=(const lcp_dac& lcp_c); 00195 00197 00202 bool operator==(const lcp_dac& lcp_c)const; 00203 00205 00210 bool operator!=(const lcp_dac& lcp_c)const; 00211 00213 00216 size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="") const; 00217 00219 00221 void load(std::istream& in); 00222 }; 00223 00224 // == template functions == 00225 00226 00227 template<uint8_t b, class rank_support_type> 00228 template<uint8_t int_width, class size_type_class> 00229 lcp_dac<b, rank_support_type>::lcp_dac(int_vector_file_buffer<int_width, size_type_class>& lcp_buf) 00230 { 00231 construct(lcp_buf); 00232 } 00233 00234 template<uint8_t b, class rank_support_type> 00235 template<uint8_t int_width, class size_type_class> 00236 void lcp_dac<b, rank_support_type>::construct(int_vector_file_buffer<int_width, size_type_class>& lcp_buf) 00237 { 00238 00239 // (1) Count for each level, how many blocks are needed for the representation 00240 // Running time: \f$ O(n \times \frac{\log n}{b} \f$ 00241 // Result is sorted in m_level_pointer_and_rank 00242 lcp_buf.reset(); 00243 size_type n = lcp_buf.int_vector_size, val=0; 00244 if (n == 0) 00245 return; 00246 // initialize counter 00247 m_level_pointer_and_rank.resize(std::max(4*bit_magic::l1BP(2), 2*(((bit_magic::l1BP(n)+1)+b-1) / b))); 00248 for (size_type i=0; i < m_level_pointer_and_rank.size(); ++i) 00249 m_level_pointer_and_rank[i] = 0; 00250 m_level_pointer_and_rank[0] = n; // level 0 has n entries 00251 00252 uint8_t level_x_2 = 0; 00253 for (size_type i=0, r_sum=0, r = lcp_buf.load_next_block(); r_sum < n;) { 00254 for (; i < r_sum+r; ++i) { 00255 val=lcp_buf[i-r_sum]; 00256 val >>= b; // shift value b bits to the right 00257 level_x_2 = 2; 00258 while (val) { 00259 ++m_level_pointer_and_rank[level_x_2]; // increase counter for current level by 1 00260 val >>= b; // shift value b bits to the right 00261 level_x_2 += 2; // increase level by 1 00262 } 00263 } 00264 r_sum += r; r = lcp_buf.load_next_block(); 00265 } 00266 00267 // (2) Determine maximum level and prefix sums of level counters 00268 m_max_level = 0; 00269 size_type sum_blocks = 0, last_block_size=0; 00270 for (size_type i=0, t=0; i < m_level_pointer_and_rank.size(); i+=2) { 00271 t = sum_blocks; 00272 sum_blocks += m_level_pointer_and_rank[i]; 00273 m_level_pointer_and_rank[i] = t; 00274 // std::cout<<"i="<<i<<" cnt="<<t<<std::endl; 00275 if (sum_blocks > t) { 00276 ++m_max_level; 00277 last_block_size = sum_blocks - t; 00278 } 00279 } 00280 m_overflow = bit_vector(sum_blocks - last_block_size, 0); 00281 m_data.resize(sum_blocks); 00282 00283 // std::cerr<<"m_overflow.size()="<<m_overflow.size()<<" "<<"m_data.size()="<<m_data.size()<<std::endl; 00284 if (last_block_size == 0) { 00285 std::cerr<<"ERROR: lcp_dac constructor: last_block_size=0"<<std::endl; 00286 } 00287 00288 // (3) Enter block and overflow data 00289 int_vector<64> cnt = m_level_pointer_and_rank; 00290 const uint64_t mask = bit_magic::Li1Mask[b]; 00291 00292 lcp_buf.reset(); 00293 for (size_type i=0,j=0, r_sum=0, r = lcp_buf.load_next_block(); r_sum < n;) { 00294 for (; i < r_sum+r; ++i) { 00295 val=lcp_buf[i-r_sum]; 00296 j = cnt[0]++; 00297 m_data[ j ] = val & mask; 00298 val >>= b; // shift value b bits to the right 00299 level_x_2 = 2; 00300 while (val) { 00301 // if(j >= m_overflow.size()){ 00302 // std::cerr<<"i="<<i<<" j="<<j<<" n="<<n<<" m_overflow.size()="<<m_overflow.size()<<" val="<<val<<" level="<<level_x_2/2<<std::endl; 00303 // return; 00304 // } 00305 m_overflow[j] = 1; 00306 j = cnt[level_x_2]++; // increase counter for current level by 1 00307 // if(j >= m_data.size()){ 00308 // std::cerr<<"i="<<i<<" j="<<j<<" n="<<n<<" m_data.size()="<<m_data.size()<<" val="<<val<<" level="<<level_x_2/2<<std::endl; 00309 // } 00310 m_data[ j ] = val & mask; 00311 val >>= b; // shift value b bits to the right 00312 level_x_2 += 2; // increase level by 1 00313 } 00314 } 00315 r_sum += r; r = lcp_buf.load_next_block(); 00316 } 00317 00318 // std::cerr<<"m_overflow.size()="<<m_overflow.size()<<" "<<"m_data.size()="<<m_data.size()<<std::endl; 00319 // (4) Initialize rank data structure for m_overflow and precalc rank for 00320 // pointers 00321 util::init_support(m_overflow_rank, &m_overflow); 00322 for (size_type i=0; 2*i < m_level_pointer_and_rank.size() and 00323 m_level_pointer_and_rank[2*i] < m_overflow.size(); ++i) { 00324 m_level_pointer_and_rank[2*i+1] = m_overflow_rank(m_level_pointer_and_rank[2*i]); 00325 } 00326 } 00327 00328 template<uint8_t b, class rank_support_type> 00329 void lcp_dac<b, rank_support_type>::swap(lcp_dac& lcp_c) 00330 { 00331 m_data.swap(lcp_c.m_data); 00332 m_overflow.swap(lcp_c.m_overflow); 00333 util::swap_support(m_overflow_rank, lcp_c.m_overflow_rank, &m_overflow, &(lcp_c.m_overflow)); 00334 00335 m_level_pointer_and_rank.swap(lcp_c.m_level_pointer_and_rank); 00336 std::swap(m_max_level, lcp_c.m_max_level); 00337 } 00338 00339 template<uint8_t b, class rank_support_type> 00340 inline typename lcp_dac<b, rank_support_type>::value_type lcp_dac<b, rank_support_type>::operator[](size_type i)const 00341 { 00342 uint8_t level = 1; 00343 uint8_t offset = b; 00344 size_type result = m_data[i]; 00345 // return result; 00346 const uint64_t* p = m_level_pointer_and_rank.data(); 00347 uint64_t ppi = (*p)+i; // *p plus i 00348 while (level < m_max_level and m_overflow[ppi]) { 00349 p += 2; 00350 ppi = *p + (m_overflow_rank(ppi) - *(p-1)); 00351 result |= (m_data[ppi] << (offset)); 00352 ++level; 00353 offset += b; 00354 } 00355 return result; 00356 } 00357 00358 00359 template<uint8_t b, class rank_support_type> 00360 typename lcp_dac<b, rank_support_type>::size_type lcp_dac<b, rank_support_type>::serialize(std::ostream& out, structure_tree_node* v, std::string name) const 00361 { 00362 structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); 00363 size_type written_bytes = 0; 00364 written_bytes += m_data.serialize(out, child, "data"); 00365 written_bytes += m_overflow.serialize(out, child, "overflow"); 00366 written_bytes += m_overflow_rank.serialize(out, child, "overflow_rank"); 00367 written_bytes += m_level_pointer_and_rank.serialize(out, child, "level_pointer_and_rank"); 00368 written_bytes += util::write_member(m_max_level, out, child, "max_level"); 00369 structure_tree::add_size(child, written_bytes); 00370 return written_bytes; 00371 } 00372 00373 template<uint8_t b, class rank_support_type> 00374 void lcp_dac<b, rank_support_type>::load(std::istream& in) 00375 { 00376 m_data.load(in); 00377 m_overflow.load(in); 00378 m_overflow_rank.load(in, &m_overflow); // FIXED that at 2012-02-01 15:34 00379 m_level_pointer_and_rank.load(in); 00380 util::read_member(m_max_level, in); 00381 } 00382 00383 00384 template<uint8_t b, class rank_support_type> 00385 lcp_dac<b, rank_support_type>& lcp_dac<b, rank_support_type>::operator=(const lcp_dac& lcp_c) 00386 { 00387 if (this != &lcp_c) { 00388 copy(lcp_c); 00389 } 00390 return *this; 00391 } 00392 00393 00394 template<uint8_t b, class rank_support_type> 00395 bool lcp_dac<b, rank_support_type>::operator==(const lcp_dac& lcp_c)const 00396 { 00397 if (this == &lcp_c) 00398 return true; 00399 return m_data == lcp_c.m_data and m_overflow == lcp_c.overflow 00400 and m_overflow_rank == lcp_c.m_overflow_rank 00401 and m_level_pointer_and_rank = lcp_c.m_level_pointer_and_rank 00402 and m_max_level = lcp_c.m_max_level; 00403 } 00404 00405 template<uint8_t b, class rank_support_type> 00406 bool lcp_dac<b, rank_support_type>::operator!=(const lcp_dac& lcp_c)const 00407 { 00408 return !(*this == lcp_c); 00409 } 00410 00411 template<uint8_t b, class rank_support_type> 00412 typename lcp_dac<b, rank_support_type>::const_iterator lcp_dac<b, rank_support_type>::begin()const 00413 { 00414 return const_iterator(this, 0); 00415 } 00416 00417 template<uint8_t b, class rank_support_type> 00418 typename lcp_dac<b, rank_support_type>::const_iterator lcp_dac<b, rank_support_type>::end()const 00419 { 00420 return const_iterator(this, size()); 00421 } 00422 00423 00424 00425 } // end namespace sdsl 00426 00427 #endif