SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2009 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00021 #ifndef INCLUDED_SDSL_CSA_WT 00022 #define INCLUDED_SDSL_CSA_WT 00023 00024 #ifdef SDSL_DEBUG 00025 #define SDSL_DEBUG_CSA_WT 00026 #endif 00027 00028 #include "wt_huff.hpp" 00029 #include "algorithms.hpp" 00030 #include "iterators.hpp" 00031 #include "util.hpp" 00032 #include "suffixarrays.hpp" 00033 #include "bwt_construct.hpp" 00034 #include "fast_cache.hpp" 00035 #include <iostream> 00036 #include <algorithm> // for std::swap 00037 #include <cassert> 00038 #include <cstring> // for strlen 00039 #include <iomanip> 00040 #include <iterator> 00041 00042 namespace sdsl 00043 { 00044 00045 template<class WaveletTree = wt_huff<>, uint32_t SampleDens = 32, uint32_t InvSampleDens = 64, uint8_t fixedIntWidth = 0, class charType=unsigned char> // forward declaration 00046 class csa_wt; 00047 00048 template<uint8_t fixedIntWidth> 00049 struct csa_wt_trait { 00050 typedef int_vector<0> int_vector_type; 00051 }; 00052 00053 template<> 00054 struct csa_wt_trait<32> { 00055 typedef int_vector<32> int_vector_type; 00056 }; 00057 00058 template<> 00059 struct csa_wt_trait<64> { 00060 typedef int_vector<64> int_vector_type; 00061 }; 00062 00064 template<class CsaWT> 00065 class psi_of_csa_wt 00066 { 00067 public: 00068 typedef typename CsaWT::value_type value_type; 00069 typedef typename CsaWT::size_type size_type; 00070 typedef typename CsaWT::char_type char_type; 00071 typedef typename CsaWT::difference_type difference_type; 00072 typedef random_access_const_iterator<psi_of_csa_wt> const_iterator;// STL Container requirement 00073 private: 00074 CsaWT* m_csa_wt; //<- pointer to the (compressed) suffix array that is based on a wavelet tree 00075 public: 00076 00078 psi_of_csa_wt(CsaWT* csa_wt=NULL) { 00079 m_csa_wt = csa_wt; 00080 } 00081 00083 psi_of_csa_wt(const psi_of_csa_wt& psi_of_csa) { 00084 m_csa_wt = psi_of_csa.m_csa_wt; 00085 } 00086 00088 00092 value_type operator[](size_type i)const { 00093 assert(m_csa_wt != NULL); 00094 char_type c = algorithm::get_ith_character_of_the_first_row(i, *m_csa_wt); 00095 return m_csa_wt->wavelet_tree.select(i - m_csa_wt->C[m_csa_wt->char2comp[c]] + 1 , c); 00096 } 00097 00099 00104 value_type psi_k(size_type i, size_type k)const { 00105 assert(m_csa_wt != NULL); 00106 if (k < m_csa_wt->sa_sample_dens + m_csa_wt->isa_sample_dens) { 00107 for (size_type j=0; j<k; ++j) { 00108 i = (*this)[i]; 00109 } 00110 return i; 00111 } else { 00112 size_type x = (*m_csa_wt)[i]; 00113 x += k; 00114 if (x >= m_csa_wt->size()) { 00115 x -= m_csa_wt->size(); 00116 } 00117 return (*m_csa_wt)(x); 00118 } 00119 } 00120 00122 00127 value_type operator()(size_type i)const { 00128 assert(m_csa_wt != NULL); 00129 assert(i < size()); 00130 // (1) get the ith character in the bwt, this takes \f$\log |\Sigma| \f$ time with use of the wavelet tree 00131 // value_type c = m_csa_wt->m_wavelet_tree[i]; 00132 // (2) next count how many symbols c are in the prefix [0..i-1] of the bwt, this takes \f$\log |\Sigma| \f$ time with use of the wavelet tree. 00133 // size_type j = m_csa_wt->m_wavelet_tree.rank(i, c); 00134 // TODO: (1) and (2) can be calculated in only one request to the wavelet tree -> method rank_ith_symbol !!! 00135 // (3) calculate the position of the j+1 th c in the sorted string of the bwt in constant time. 00136 typename CsaWT::char_type c; 00137 size_type j = m_csa_wt->m_wavelet_tree.rank_ith_symbol(i,c); // see documentation of rank_ith_symbol in wt_huff 00138 return m_csa_wt->C[ m_csa_wt->char2comp[c] ] + j; 00139 } 00140 00142 00147 value_type lf_k(size_type i, size_type k)const { 00148 assert(m_csa_wt != NULL); 00149 if (k < m_csa_wt->sa_sample_dens + m_csa_wt->isa_sample_dens) { 00150 for (size_type j=0; j<k; ++j) { 00151 i = (*this)(i); 00152 } 00153 return i; 00154 } else { 00155 size_type x = (*m_csa_wt)[i]; 00156 if (x < k) { 00157 x += m_csa_wt->size(); 00158 } 00159 x -= k; 00160 return (*m_csa_wt)(x); 00161 } 00162 } 00163 00165 psi_of_csa_wt& operator=(const psi_of_csa_wt& psi_of_csa) { 00166 if (this != &psi_of_csa) { 00167 m_csa_wt = psi_of_csa.m_csa_wt; 00168 } 00169 return *this; 00170 } 00171 00173 00175 bool operator==(const psi_of_csa_wt& psi) { 00176 return true; 00177 } 00178 00180 size_type size()const { 00181 return m_csa_wt->size(); 00182 } 00183 00185 size_type empty()const { 00186 return m_csa_wt->empty(); 00187 } 00188 00190 void swap(psi_of_csa_wt& psi_of_csa) { 00191 if (this != &psi_of_csa) { 00192 ;// std::swap(m_csa_wt, psi_of_csa.m_csa_wt); // do not exchange the supported structure! 00193 } 00194 } 00195 00197 00200 const_iterator begin()const { 00201 return const_iterator(this, 0); 00202 } 00203 00205 00208 const_iterator end()const { 00209 return const_iterator(this, size()); 00210 } 00211 00212 // Get the number of sampled \f$\Psi\f$ values. In the wavelet tree approach the number of sampled (i.e. explicitly stored) \f$\Psi\f$ values is zero. 00213 uint32_t get_sample_dens()const { 00214 return 0; 00215 } 00216 }; 00217 00218 template<class CsaWT> 00219 class bwt_of_csa_wt 00220 { 00221 public: 00222 typedef const typename CsaWT::char_type value_type; 00223 typedef typename CsaWT::size_type size_type; 00224 typedef typename CsaWT::difference_type difference_type; 00225 typedef random_access_const_iterator<bwt_of_csa_wt> const_iterator;// STL Container requirement 00226 private: 00227 CsaWT* m_csa_wt; //<- pointer to the (compressed) suffix array that is based on a wavelet tree 00228 public: 00229 00231 bwt_of_csa_wt(CsaWT* csa_wt=NULL) { 00232 m_csa_wt = csa_wt; 00233 } 00234 00236 bwt_of_csa_wt(const bwt_of_csa_wt& bwt_of_csa) { 00237 m_csa_wt = bwt_of_csa.m_csa_wt; 00238 } 00239 00241 00245 value_type operator[](size_type i)const { 00246 assert(m_csa_wt != NULL); 00247 assert(i < size()); 00248 return m_csa_wt->m_wavelet_tree[i]; 00249 } 00250 00252 bwt_of_csa_wt& operator=(const bwt_of_csa_wt& bwt_of_csa) { 00253 if (this != &bwt_of_csa) { 00254 m_csa_wt = bwt_of_csa.m_csa_wt; 00255 } 00256 return *this; 00257 } 00258 00260 size_type size()const { 00261 return m_csa_wt->size(); 00262 } 00263 00265 size_type empty()const { 00266 return m_csa_wt->empty(); 00267 } 00268 00270 void swap(bwt_of_csa_wt& bwt_of_csa) { 00271 if (this != &bwt_of_csa) { 00272 ;// std::swap(m_csa_wt, bwt_of_csa.m_csa_wt); // do not exchange the supported structure! 00273 } 00274 } 00275 00277 00280 const_iterator begin()const { 00281 return const_iterator(this, 0); 00282 } 00283 00285 00288 const_iterator end()const { 00289 return const_iterator(this, size()); 00290 } 00291 }; 00292 00293 00295 00302 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00303 class csa_wt 00304 { 00305 public: 00306 typedef uint64_t value_type; // STL Container requirement 00307 typedef random_access_const_iterator<csa_wt> const_iterator;// STL Container requirement 00308 typedef const_iterator iterator; // STL Container requirement 00309 typedef const value_type const_reference; 00310 typedef const_reference reference; 00311 typedef const_reference* pointer; 00312 typedef const pointer const_pointer; 00313 typedef int_vector<>::size_type size_type; // STL Container requirement 00314 typedef size_type csa_size_type; 00315 typedef ptrdiff_t difference_type; // STL Container requirement 00316 typedef psi_of_csa_wt<csa_wt> psi_type; 00317 typedef bwt_of_csa_wt<csa_wt> bwt_type; 00318 typedef WaveletTree wavelet_tree_type; 00319 typedef charType char_type; 00320 typedef const char_type* pattern_type; 00321 typedef typename csa_wt_trait<fixedIntWidth>::int_vector_type sa_sample_type; 00322 typedef typename csa_wt_trait<fixedIntWidth>::int_vector_type isa_sample_type; 00323 00324 typedef csa_tag index_category; 00325 00326 00327 00328 enum { sa_sample_dens = SampleDens, 00329 isa_sample_dens = InvSampleDens 00330 }; 00331 00332 friend class psi_of_csa_wt<csa_wt>; 00333 friend class bwt_of_csa_wt<csa_wt>; 00334 // static const uint32_t sample_dens = SampleDens; 00335 private: 00336 WaveletTree m_wavelet_tree; // the wavelet tree 00337 psi_type m_psi; // psi function 00338 bwt_type m_bwt; // bwt 00339 sa_sample_type m_sa_sample; // suffix array samples 00340 isa_sample_type m_isa_sample; // inverse suffix array samples 00341 int_vector<8> m_char2comp; 00342 int_vector<8> m_comp2char; 00343 00344 int_vector<64> m_C; // counts for the compact alphabet [0..sigma-1] 00345 uint16_t m_sigma; 00346 // uint32_t m_sample_dens; // additional to SampleDens value 00347 //#define USE_CSA_CACHE 00348 #ifdef USE_CSA_CACHE 00349 mutable fast_cache csa_cache; 00350 #endif 00351 00352 template<typename RandomAccessContainer> 00353 void construct(const RandomAccessContainer& sa, const char_type* str); 00354 00355 template<typename RandomAccessContainer> 00356 void construct_samples(const RandomAccessContainer& sa, const char_type* str); 00357 00358 void copy(const csa_wt& csa) { 00359 m_wavelet_tree = csa.m_wavelet_tree; 00360 m_sa_sample = csa.m_sa_sample; 00361 m_isa_sample = csa.m_isa_sample; 00362 m_sigma = csa.m_sigma; 00363 m_char2comp = csa.m_char2comp; 00364 m_comp2char = csa.m_comp2char; 00365 m_C = csa.m_C; 00366 m_psi = psi_type(this); 00367 m_bwt = bwt_type(this); 00368 } 00369 00370 public: 00371 const int_vector<8>& char2comp; 00372 const int_vector<8>& comp2char; 00373 const int_vector<64>& C; 00374 const uint16_t& sigma; 00375 const psi_type& psi; 00376 const bwt_type& bwt; 00377 const sa_sample_type& sa_sample; 00378 const isa_sample_type& isa_sample; 00379 const wavelet_tree_type& wavelet_tree; 00380 00382 csa_wt():char2comp(m_char2comp), comp2char(m_comp2char), C(m_C), sigma(m_sigma) ,psi(m_psi), bwt(m_bwt),sa_sample(m_sa_sample), isa_sample(m_isa_sample), wavelet_tree(m_wavelet_tree) {} 00384 ~csa_wt() {} 00386 csa_wt(const csa_wt& csa): m_sa_sample(csa.m_sa_sample), m_isa_sample(csa.m_isa_sample), char2comp(m_char2comp), comp2char(m_comp2char), C(m_C), sigma(m_sigma), psi(m_psi), bwt(m_bwt), sa_sample(m_sa_sample), isa_sample(m_isa_sample), wavelet_tree(m_wavelet_tree) { 00387 copy(csa); 00388 } 00389 00391 template<typename RandomAccessContainer> 00392 csa_wt(const RandomAccessContainer& sa, const char_type* str); 00393 00395 csa_wt(const char_type* str); 00396 00398 template<class size_type_class, uint8_t int_width, class size_type_class_1> 00399 csa_wt(int_vector_file_buffer<8, size_type_class>& bwt_buf, 00400 int_vector_file_buffer<int_width, size_type_class_1>& sa_buf 00401 ); 00402 00404 template<class size_type_class> 00405 csa_wt(int_vector_file_buffer<8, size_type_class>& bwt_buf); 00406 00407 csa_wt(tMSS& file_map, const std::string& dir, const std::string& id); 00408 00409 void construct(tMSS& file_map, const std::string& dir, const std::string& id); 00410 00412 00417 size_type size()const { 00418 return m_wavelet_tree.size(); 00419 } 00420 00422 00425 static size_type max_size() { 00426 return bit_vector::max_size(); 00427 } 00428 00430 00433 bool empty()const { 00434 return m_wavelet_tree.empty(); 00435 } 00436 00438 00446 void swap(csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>& csa); 00447 00449 00452 const_iterator begin()const; 00453 00455 00458 const_iterator end()const; 00459 00461 00467 inline value_type operator[](size_type i)const; 00468 00470 00475 inline value_type operator()(size_type i)const; 00476 00478 00481 csa_wt& operator=(const csa_wt& csa); 00482 00484 00489 bool operator==(const csa_wt& csa)const; 00490 00492 00497 bool operator!=(const csa_wt& csa)const; 00498 00500 00503 size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const; 00504 00506 00508 void load(std::istream& in); 00509 00510 // uint32_t get_sample_dens() const; 00511 // void set_sample_dens(const uint32_t sample_dens); 00512 00513 uint32_t get_psi_sample_dens() const; 00514 void set_psi_sample_dens(const uint32_t sample_dens); 00515 00517 00524 size_type rank_bwt(size_type i, const char_type c)const { 00525 return m_wavelet_tree.rank(i, c); 00526 } 00527 00529 00536 size_type select_bwt(size_type i, const char_type c)const { 00537 assert(i > 0); 00538 char_type cc = m_char2comp[c]; 00539 if (cc==0 and c!=0) // character is not in the text => return size() 00540 return size(); 00541 assert(cc != 255); 00542 if (m_C[cc]+i-1 < m_C[cc+1]) { 00543 return m_wavelet_tree.select(i, c); 00544 } else 00545 return size(); 00546 } 00547 }; 00548 00549 // == template functions == 00550 00551 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00552 csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::csa_wt(const char_type* str):char2comp(m_char2comp), comp2char(m_comp2char), C(m_C), sigma(m_sigma), psi(m_psi), bwt(m_bwt), sa_sample(m_sa_sample), isa_sample(m_isa_sample), wavelet_tree(m_wavelet_tree) 00553 { 00554 csa_uncompressed sa(str); 00555 /* size_type n = strlen((const char*)str); 00556 int_vector<> sa(n+1, 0, bit_magic::l1BP(n+1)+1); 00557 algorithm::calculate_sa(str, n+1, sa); // calculate the suffix array sa of str 00558 assert(sa.size() == n+1); 00559 */ 00560 algorithm::set_text<csa_wt>(str, sa.size(), m_C, m_char2comp, m_comp2char, m_sigma); 00561 construct(sa, str); 00562 // if( n+1 > 0 and n+1 != size() ) 00563 // throw std::logic_error(util::demangle(typeid(this).name())+": text size differ with sa size!"); 00564 } 00565 00566 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00567 template<typename RandomAccessContainer> 00568 csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::csa_wt(const RandomAccessContainer& sa, const char_type* str):char2comp(m_char2comp), comp2char(m_comp2char),C(m_C), sigma(m_sigma), psi(m_psi), bwt(m_bwt),sa_sample(m_sa_sample), isa_sample(m_isa_sample),wavelet_tree(m_wavelet_tree) 00569 { 00570 size_type n = 1; 00571 if (str != NULL) { 00572 n = strlen((const char*)str); 00573 } 00574 algorithm::set_text<csa_wt>(str, n+1, m_C, m_char2comp, m_comp2char, m_sigma); 00575 assert(sa.size() == n+1); 00576 construct(sa, str); 00577 if (n+1 > 0 and n+1 != size()) 00578 throw std::logic_error(util::demangle(typeid(this).name())+": text size differ with sa size! "); 00579 } 00580 00581 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00582 template<class size_type_class, uint8_t int_width, class size_type_class_1> 00583 csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::csa_wt(int_vector_file_buffer<8, size_type_class>& bwt_buf, 00584 int_vector_file_buffer<int_width, size_type_class_1>& sa_buf):char2comp(m_char2comp), comp2char(m_comp2char),C(m_C), sigma(m_sigma), psi(m_psi), bwt(m_bwt),sa_sample(m_sa_sample),isa_sample(m_isa_sample),wavelet_tree(m_wavelet_tree) 00585 { 00586 bwt_buf.reset(); sa_buf.reset(); 00587 size_type n = bwt_buf.int_vector_size; 00588 algorithm::set_text<csa_wt>(bwt_buf, n, m_C, m_char2comp, m_comp2char, m_sigma); 00589 assert(sa_buf.int_vetor_size == n); 00590 00591 // m_wavelet_tree = WaveletTree(bwt_buf, n); 00592 m_wavelet_tree.construct(bwt_buf, n); 00593 00594 algorithm::set_sa_and_isa_samples<csa_wt>(sa_buf, m_sa_sample, m_isa_sample); 00595 00596 m_psi = psi_type(this); 00597 m_bwt = bwt_type(this); 00598 } 00599 00600 00601 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00602 template<class size_type_class> 00603 csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::csa_wt(int_vector_file_buffer<8, size_type_class>& bwt_buf):char2comp(m_char2comp), comp2char(m_comp2char),C(m_C), sigma(m_sigma), psi(m_psi), bwt(m_bwt),sa_sample(m_sa_sample),isa_sample(m_isa_sample),wavelet_tree(m_wavelet_tree) 00604 { 00605 if (SampleDens != 0 or InvSampleDens !=0) { 00606 std::cerr << "Warning: Construct only the BWT part of the CSA, please make sure SampleDens and InvSampleDens equal 0!" << std::endl; 00607 } 00608 bwt_buf.reset(); 00609 size_type n = bwt_buf.int_vector_size; 00610 algorithm::set_text<csa_wt>(bwt_buf, n, m_C, m_char2comp, m_comp2char, m_sigma); 00611 00612 m_wavelet_tree.construct(bwt_buf, n); 00613 00614 m_psi = psi_type(this); 00615 m_bwt = bwt_type(this); 00616 } 00617 00618 00619 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00620 csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::csa_wt(tMSS& file_map, const std::string& dir, const std::string& id):char2comp(m_char2comp), comp2char(m_comp2char),C(m_C), sigma(m_sigma), psi(m_psi), bwt(m_bwt),sa_sample(m_sa_sample),isa_sample(m_isa_sample),wavelet_tree(m_wavelet_tree) 00621 { 00622 construct(file_map, dir, id); 00623 } 00624 00625 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00626 void csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::construct(tMSS& file_map, const std::string& dir, const std::string& id) 00627 { 00628 if (file_map.find("bwt") == file_map.end()) { // if bwt is not already stored on disk => construct bwt 00629 construct_bwt(file_map, dir, id); 00630 // construct_bwt2(file_map, dir, id); 00631 } 00632 int_vector_file_buffer<8> bwt_buf(file_map["bwt"].c_str()); 00633 int_vector_file_buffer<> sa_buf(file_map["sa"].c_str()); 00634 size_type n = bwt_buf.int_vector_size; 00635 algorithm::set_text<csa_wt>(bwt_buf, n, m_C, m_char2comp, m_comp2char, m_sigma); 00636 // m_wavelet_tree = WaveletTree(bwt_buf, n); 00637 write_R_output("csa", "construct WT", "begin", 1, 0); 00638 m_wavelet_tree.construct(bwt_buf, n); 00639 write_R_output("csa", "construct WT", "end", 1, 0); 00640 00641 algorithm::set_sa_and_isa_samples<csa_wt>(sa_buf, m_sa_sample, m_isa_sample); 00642 00643 m_psi = psi_type(this); 00644 m_bwt = bwt_type(this); 00645 } 00646 00647 /* 00648 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth> 00649 uint32_t csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth>::get_sample_dens()const{ 00650 if(SampleDens==0) 00651 return m_sample_dens; 00652 else 00653 return SampleDens; 00654 } 00655 00656 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth> 00657 void csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth>::set_sample_dens(const uint32_t sample_dens){ 00658 m_sample_dens = sample_dens; 00659 } 00660 */ 00661 00662 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00663 uint32_t csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::get_psi_sample_dens()const 00664 { 00665 return m_psi.get_sample_dens(); 00666 } 00667 00668 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00669 void csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::set_psi_sample_dens(const uint32_t sample_dens) 00670 { 00671 } 00672 00673 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00674 template<typename RandomAccessContainer> 00675 void csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::construct_samples(const RandomAccessContainer& sa, const char_type* str) 00676 { 00677 m_sa_sample.set_int_width(bit_magic::l1BP(sa.size())+1); 00678 m_sa_sample.resize((sa.size()+SampleDens-1)/SampleDens); 00679 typename RandomAccessContainer::size_type i=0, idx=0; 00680 for (typename RandomAccessContainer::const_iterator it = sa.begin(); i < sa.size(); it += (difference_type)SampleDens, i += SampleDens, ++idx) { 00681 m_sa_sample[idx] = *it; 00682 } 00683 m_isa_sample.set_int_width(bit_magic::l1BP(sa.size())+1); 00684 m_isa_sample.resize((sa.size()+(InvSampleDens)-1)/(InvSampleDens)); 00685 i = 0; 00686 for (typename RandomAccessContainer::const_iterator it = sa.begin(), end = sa.end(); it != end; ++it, ++i) { 00687 if ((*it % InvSampleDens) == 0) { 00688 m_isa_sample[*it/InvSampleDens] = i; 00689 } 00690 } 00691 } 00692 00693 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00694 template<typename RandomAccessContainer> 00695 void csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::construct(const RandomAccessContainer& sa, const char_type* str) 00696 { 00697 construct_samples(sa, str); 00698 const typename RandomAccessContainer::size_type n = sa.size(); 00699 typename RandomAccessContainer::size_type i=0; 00700 if (str == NULL) { 00701 throw std::logic_error(util::demangle(typeid(this).name())+": text is needed to construct the csa_wt!"); 00702 } else { 00703 char_type* _bwt = new char_type[n+1]; 00704 i = 0; 00705 assert(str[n-1]=='\0'); 00706 // TODO: %-operation is slow, replace by to_add[] lookup 00707 for (typename RandomAccessContainer::const_iterator it = sa.begin(), end = sa.end(); it != end; ++it, ++i) { 00708 _bwt[i] = str[(*it+n-1)%(n)]; 00709 } 00710 m_wavelet_tree = WaveletTree(_bwt, n); 00711 delete [] _bwt; 00712 00713 m_psi = psi_type(this); 00714 m_bwt = bwt_type(this); 00715 } 00716 } 00717 00718 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00719 typename csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::const_iterator csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::begin()const 00720 { 00721 return const_iterator(this, 0); 00722 } 00723 00724 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00725 typename csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::const_iterator csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::end()const 00726 { 00727 return const_iterator(this, size()); 00728 } 00729 00730 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00731 inline typename csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::value_type csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::operator[](size_type i)const 00732 { 00733 size_type off = 0; 00734 while (i % SampleDens) { 00735 i = m_psi(i); 00736 ++off; 00737 } 00738 value_type result = m_sa_sample[i/SampleDens]; 00739 if (result + off < size()) { 00740 return result + off; 00741 } else { 00742 return result + off - size(); 00743 } 00744 /* 00745 #ifdef USE_CSA_CACHE 00746 size_type r = 0; 00747 if( csa_cache.exists(i, r) ){ 00748 return r; 00749 } 00750 #endif 00751 size_type off = 0; 00752 while( i % SampleDens ){// while i mod SampleDens != 0 (SA[i] is not sampled) 00753 i = m_psi[i]; // go to the position where SA[i]+1 is located 00754 ++off; // add 1 to the offset 00755 } 00756 value_type result = m_sa_sample[i/SampleDens]; 00757 // TODO: try LF-function for iteration 00758 if( result < off ){ 00759 #ifdef USE_CSA_CACHE 00760 r = m_psi.size()-(off-result); 00761 return r; 00762 #endif 00763 return m_psi.size()-(off-result); 00764 } 00765 else{ 00766 #ifdef USE_CSA_CACHE 00767 r = result-off; 00768 return r; 00769 #endif 00770 return result-off; 00771 } 00772 */ 00773 } 00774 00775 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00776 inline typename csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::value_type csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::operator()(size_type i)const 00777 { 00778 size_type ii; 00779 value_type result = m_isa_sample[ ii = ((i+InvSampleDens-1)/InvSampleDens) ]; // get the leftmost sampled isa value to the right of i 00780 ii *= InvSampleDens; 00781 if (ii >= size()) { 00782 i = size() - 1 - i; 00783 } else { 00784 i = ii - i; 00785 } 00786 while (i--) { 00787 result = m_psi(result); 00788 } 00789 return result; 00790 } 00791 00792 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00793 csa_wt<WaveletTree,SampleDens, InvSampleDens, fixedIntWidth, charType>& csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::operator=(const csa_wt<WaveletTree,SampleDens, InvSampleDens, fixedIntWidth, charType>& csa) 00794 { 00795 if (this != &csa) { 00796 copy(csa); 00797 } 00798 return *this; 00799 } 00800 00801 00802 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00803 typename csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::size_type csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::serialize(std::ostream& out, structure_tree_node* v, std::string name)const 00804 { 00805 structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); 00806 size_type written_bytes = 0; 00807 written_bytes += m_wavelet_tree.serialize(out, child, "wavelet_tree"); 00808 written_bytes += m_sa_sample.serialize(out, child, "sa_samples"); 00809 written_bytes += m_isa_sample.serialize(out, child, "isa_samples"); 00810 written_bytes += m_char2comp.serialize(out, child, "char2comp"); 00811 written_bytes += m_comp2char.serialize(out, child, "comp2char"); 00812 written_bytes += m_C.serialize(out, child, "C"); 00813 written_bytes += util::write_member(m_sigma, out, child, "sigma"); 00814 structure_tree::add_size(child, written_bytes); 00815 return written_bytes; 00816 } 00817 00818 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00819 void csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::load(std::istream& in) 00820 { 00821 m_wavelet_tree.load(in); 00822 m_sa_sample.load(in); 00823 m_isa_sample.load(in); 00824 m_char2comp.load(in); 00825 m_comp2char.load(in); 00826 m_C.load(in); 00827 util::read_member(m_sigma, in); 00828 m_psi = psi_type(this); 00829 m_bwt = bwt_type(this); 00830 } 00831 00832 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth, class charType> 00833 void csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>::swap(csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth, charType>& csa) 00834 { 00835 if (this != &csa) { 00836 m_wavelet_tree.swap(csa.m_wavelet_tree); 00837 m_sa_sample.swap(csa.m_sa_sample); 00838 m_isa_sample.swap(csa.m_isa_sample); 00839 m_char2comp.swap(csa.m_char2comp); 00840 m_comp2char.swap(csa.m_comp2char); 00841 m_C.swap(csa.m_C); 00842 std::swap(m_sigma, csa.m_sigma); 00843 // m_psi.swap(csa.m_psi); 00844 m_psi = psi_type(this); 00845 csa.m_psi = psi_type(&csa); 00846 m_bwt = bwt_type(this); 00847 csa.m_bwt = bwt_type(&csa); 00848 } 00849 } 00850 00851 /* TODO 00852 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth> 00853 bool csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth>::operator==(const csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth> &csa)const{ 00854 for(uint16_t i=0;i<256;++i) 00855 if(m_char2comp[i] != csa.m_char2comp[i] or m_comp2char[i] != csa.m_comp2char[i] or m_C[i] != csa.m_C[i]) 00856 return false; 00857 return m_psi == csa.m_psi and m_sa_sample == csa.m_sa_sample and m_isa_sample == csa.m_isa_sample and m_C[256] == csa.m_C[256] and m_sigma == csa.m_sigma; 00858 } 00859 00860 template<class WaveletTree, uint32_t SampleDens, uint32_t InvSampleDens, uint8_t fixedIntWidth> 00861 bool csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth>::operator!=(const csa_wt<WaveletTree, SampleDens, InvSampleDens, fixedIntWidth> &csa)const{ 00862 return !(*this == csa); 00863 // return m_psi != csa.m_psi or m_sa_sample != csa.m_sa_sample or m_isa_sample != csa.m_isa_sample; 00864 } 00865 */ 00866 00867 } // end namespace sdsl 00868 00869 #endif