SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2008 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00021 #ifndef INCLUDED_SDSL_CSA_SADA_THEO 00022 #define INCLUDED_SDSL_CSA_SADA_THEO 00023 00024 #include "int_vector.hpp" 00025 #include "enc_vector.hpp" 00026 #include "algorithms.hpp" 00027 #include "iterators.hpp" 00028 #include "csa_uncompressed.hpp" 00029 #include <iostream> 00030 #include <algorithm> 00031 #include <cassert> 00032 #include <cstring> // for strlen 00033 #include <iomanip> 00034 #include <iterator> 00035 00036 namespace sdsl 00037 { 00038 00039 template<class EncVector, class RankSupport> 00040 class csa_sada_theo; 00041 00043 00049 template<class EncVector = enc_vector<>, class RankSupport = rank_support_v<> > 00050 class csa_sada_theo 00051 { 00052 public: 00053 typedef uint64_t value_type; // STL Container requirement 00054 typedef random_access_const_iterator<csa_sada_theo> const_iterator;// STL Container requirement 00055 typedef const_iterator iterator; // STL Container requirement 00056 typedef const value_type const_reference; 00057 typedef const_reference reference; 00058 typedef const_reference* pointer; 00059 typedef const pointer const_pointer; 00060 typedef int_vector<>::size_type size_type; // STL Container requirement 00061 typedef size_type csa_size_type; 00062 typedef ptrdiff_t difference_type; // STL Container requirement 00063 typedef const unsigned char* pattern_type; 00064 00065 private: 00066 uint8_t m_level; // number of levels of the data structure 00067 // size_type m_size; // number of eleements of the suffix array --> implicit stored in m_b[0].size() 00068 EncVector* m_psi; // psi function at different levels 00069 int_vector<1>* m_b; // bit vectors supporting navigation at different levels 00070 RankSupport* m_b_rank; // rank support for m_b 00071 int_vector<> m_sa; // suffix array at last level (m_level) 00072 int_vector<> m_last_pos; // isa[m_b[l].size()-1] 00073 00074 uint16_t m_char2comp[256]; 00075 unsigned char m_comp2char[256]; 00076 size_type m_C[257]; 00077 00078 uint8_t m_sigma; 00079 00080 void copy(const csa_sada_theo<EncVector, RankSupport>& csa); 00081 // lookup SA at position i in level l 00082 value_type lookup(size_type i, size_type l=0)const { 00083 //std::cerr<<"i="<<i<<" l="<<l<<std::endl; 00084 if (l < m_level) { 00085 if (m_b[l][i]) { 00086 return lookup(m_b_rank[l].rank(i), l+1)<<1; 00087 } else { 00088 if (i==m_last_pos[l]) // if i is the index of the greatest entry in SA_l 00089 return m_b[l].size()-1; 00090 // if( i - m_b_rank[l].rank(i) >= m_psi[l].size() ){ 00091 // std::cerr<<"i="<<i<<" l="<<l<<" m_psi.size()="<<m_psi[l].size()<<std::endl; 00092 // } 00093 return lookup(m_psi[l][i - m_b_rank[l].rank(i)], l)-1; 00094 } 00095 } else { // at the last level return the stored values 00096 return m_sa[i]; 00097 } 00098 } 00099 00100 // value_type lookup_psi(size_type i, size_type l=0)const{ 00101 // 00102 // } 00103 00104 // construct the compressed suffix array from a suffix array 00105 template<typename RandomAccessContainer> 00106 void construct(RandomAccessContainer& sa, typename RandomAccessContainer::value_type isa_0); 00107 00108 00109 void setText(const unsigned char* str, size_type len); 00110 public: 00111 const uint16_t* char2comp; 00112 const unsigned char* comp2char; 00113 const size_type* C; 00114 00116 csa_sada_theo():m_level(0),m_psi(NULL),m_b(NULL),m_b_rank(NULL) { } 00117 00119 csa_sada_theo(const csa_sada_theo<EncVector, RankSupport>& csa); 00120 00122 00125 template<typename RandomAccessContainer> 00126 csa_sada_theo(const RandomAccessContainer& sa, const unsigned char* str); 00127 00129 csa_sada_theo(const unsigned char* str); 00130 00132 ~csa_sada_theo(); 00133 00135 00138 size_type size()const { 00139 return m_level > 0 ? m_b[0].size() : m_sa.size(); 00140 } 00141 00143 00146 static size_type max_size() { 00147 return EncVector::max_size(); 00148 } 00149 00151 00153 bool empty()const { 00154 return size()==0; 00155 } 00156 00158 00166 void swap(csa_sada_theo<EncVector, RankSupport>& csa); 00167 00169 00172 const_iterator begin()const; 00173 00175 00178 const_iterator end()const; 00179 00181 00185 inline value_type operator[](size_type i)const; 00186 00188 00191 csa_sada_theo& operator=(const csa_sada_theo& csa); 00192 00194 00199 bool operator==(const csa_sada_theo& csa)const; 00200 00202 00207 bool operator!=(const csa_sada_theo& csa)const; 00208 00210 00213 size_type serialize(std::ostream& out) const; 00214 00216 void load(std::istream& in); 00217 }; 00218 00219 // == template functions == 00220 00221 template<class EncVector, class RankSupport> 00222 void csa_sada_theo<EncVector,RankSupport>::swap(csa_sada_theo<EncVector, RankSupport>& csa) 00223 { 00224 if (this!=&csa) { 00225 std::swap(m_level, csa.m_level); 00226 m_sa.swap(csa.m_sa); 00227 m_last_pos.swap(csa.m_last_pos); 00228 std::swap(m_psi , csa.m_psi); // only pointers => swap it with std::swap 00229 std::swap(m_b , csa.m_b); // only pointers => swap it with std::swap 00230 std::swap(m_b_rank , csa.m_b_rank); // only pointers => swap it with std::swap 00231 for (uint16_t i=0; i<256; ++i) { 00232 std::swap(m_char2comp[i], csa.m_char2comp[i]); 00233 std::swap(m_comp2char[i], csa.m_comp2char[i]); 00234 std::swap(m_C[i], csa.m_C[i]); 00235 } 00236 std::swap(m_C[256], csa.m_C[256]); 00237 } 00238 } 00239 00240 00241 template<class EncVector, class RankSupport> 00242 csa_sada_theo<EncVector,RankSupport>::csa_sada_theo(const csa_sada_theo<EncVector,RankSupport>& csa):m_psi(NULL),m_b(NULL),m_b_rank(NULL),char2comp(m_char2comp),comp2char(m_comp2char),C(m_C) 00243 { 00244 copy(csa); 00245 } 00246 00247 template<class EncVector, class RankSupport> 00248 csa_sada_theo<EncVector,RankSupport>& csa_sada_theo<EncVector,RankSupport>::operator=(const csa_sada_theo<EncVector,RankSupport>& csa) 00249 { 00250 if (this != &csa) { 00251 copy(csa); 00252 } 00253 return *this; 00254 } 00255 00256 00257 template<class EncVector, class RankSupport> 00258 void csa_sada_theo<EncVector, RankSupport>::setText(const unsigned char* str, size_type len) 00259 { 00260 for (uint16_t i=0; i<256; ++i) 00261 m_char2comp[i] = m_comp2char[i] = m_C[i] = 0; 00262 m_C[256] = 0; 00263 if (str == NULL or len ==0) 00264 return; 00265 const unsigned char* p = str; 00266 for (size_type i=0; i<len-1; ++i) { 00267 m_char2comp[*p++] = 1; 00268 } 00269 size_type value = 0; 00270 for (uint16_t i=0; i<256; ++i) 00271 if (m_char2comp[i]) 00272 m_char2comp[i] = value++; 00273 m_sigma = value; 00274 for (uint16_t i=0; i<256; ++i) 00275 if (m_char2comp[i]) 00276 m_comp2char[m_char2comp[i]] = i; 00277 for (uint16_t i=0; i<257; ++i) m_C[i]=0; 00278 for (size_type i=0; i<len; ++i) ++m_C[m_char2comp[str[i]]]; 00279 for (uint16_t i=256; i>0; --i) m_C[i] = m_C[i-1]; 00280 m_C[0]=0; 00281 for (uint16_t i=1; i<257; ++i) m_C[i] += m_C[i-1]; 00282 assert(m_C[256]==len); 00283 } 00284 00285 template<class EncVector, class RankSupport> 00286 void csa_sada_theo<EncVector,RankSupport>::copy(const csa_sada_theo<EncVector,RankSupport>& csa) 00287 { 00288 // first free the data structure if it is already in use 00289 if (m_b_rank != NULL) { 00290 delete [] m_b_rank; 00291 m_b_rank = NULL; 00292 } 00293 if (m_b != NULL) { 00294 delete [] m_b; 00295 m_b = NULL; 00296 } 00297 if (m_psi != NULL) { 00298 delete [] m_psi; 00299 m_psi = NULL; 00300 } 00301 // second copy the data from csa to this 00302 m_level = csa.m_level; 00303 m_sa = csa.m_sa; 00304 m_last_pos = csa.m_last_pos; 00305 00306 if (m_level > 0) { 00307 // copy arrays 00308 m_psi = new EncVector[m_level]; 00309 for (size_type i=0; i < m_level; ++i) { 00310 m_psi[i] = csa.m_psi[i]; 00311 } 00312 00313 m_b = new int_vector<1>[m_level]; 00314 for (size_type i=0; i < m_level; ++i) { 00315 m_b[i] = csa.m_b[i]; 00316 } 00317 00318 m_b_rank = new RankSupport[m_level]; 00319 for (size_type i=0; i < m_level; ++i) { 00320 m_b_rank[i] = csa.m_b_rank[i]; 00321 m_b_rank[i].set_vector(&m_b[i]); 00322 } 00323 } 00324 for (int i=0; i<256; ++i) { 00325 m_char2comp[i] = csa.m_char2comp[i]; 00326 m_comp2char[i] = csa.m_comp2char[i]; 00327 m_C[i] = csa.m_C[i]; 00328 } 00329 m_C[256] = csa.m_C[256]; 00330 } 00331 00332 template<class EncVector, class RankSupport> 00333 typename csa_sada_theo<EncVector,RankSupport>::const_iterator csa_sada_theo<EncVector,RankSupport>::begin()const 00334 { 00335 return const_iterator(this,0); 00336 } 00337 00338 template<class EncVector, class RankSupport> 00339 typename csa_sada_theo<EncVector,RankSupport>::const_iterator csa_sada_theo<EncVector,RankSupport>::end()const 00340 { 00341 return const_iterator(this, size()); 00342 } 00343 00344 template<class EncVector, class RankSupport> 00345 csa_sada_theo<EncVector, RankSupport>::~csa_sada_theo() 00346 { 00347 // free data structure in that order: m_b_rank depends on m_b 00348 if (m_b_rank != NULL) { 00349 delete [] m_b_rank; 00350 m_b_rank = NULL; 00351 } 00352 if (m_b != NULL) { 00353 delete [] m_b; 00354 m_b = NULL; 00355 } 00356 if (m_psi != NULL) { 00357 delete [] m_psi; 00358 m_psi = NULL; 00359 } 00360 } 00361 00362 template<class EncVector, class RankSupport> 00363 template<typename RandomAccessContainer> 00364 void csa_sada_theo<EncVector,RankSupport>::construct(RandomAccessContainer& psi, typename RandomAccessContainer::value_type isa_0) 00365 { 00366 // first free the data structure if it is already in use 00367 if (m_b_rank != NULL) { 00368 delete [] m_b_rank; 00369 m_b_rank = NULL; 00370 } 00371 if (m_b != NULL) { 00372 delete [] m_b; 00373 m_b = NULL; 00374 } 00375 if (m_psi != NULL) { 00376 delete [] m_psi; 00377 m_psi = NULL; 00378 } 00379 m_sa.resize(0); m_last_pos.resize(0); m_level = 0; 00380 // if psi is empty there is nothing to do.... 00381 if (psi.empty()) { 00382 return; 00383 } 00384 typename RandomAccessContainer::size_type n = psi.size(); 00385 EncVector psiz(psi);// Store psi compressed 00386 psi.resize(0); // free the memory for the uncompressed psi array 00387 m_level = 0; 00388 while (bit_magic::l1BP(n)+1 > (1ULL<<m_level)) 00389 ++m_level; 00390 // special case if m_level equals 0! 00391 if (m_level==0) { 00392 algorithm::psi2sa(psiz, isa_0, m_sa); 00393 m_b = new int_vector<1>[1]; m_b[0].resize(1); m_b[0][0] = 1; 00394 return; 00395 } 00396 m_psi = new EncVector[m_level]; 00397 m_b = new int_vector<1>[m_level]; 00398 m_b_rank = new RankSupport[m_level]; 00399 m_last_pos = int_vector<>(m_level, 0, bit_magic::l1BP(n)+1); 00400 util::set_zero_bits(m_last_pos); 00401 00402 size_type nn = n; 00403 int_vector<> temp_psi; 00404 for (size_type l = 0; l < m_level; ++l) { // build data structure for each level 00405 m_b[l].resize(nn); util::set_zero_bits(m_b[l]); // initialize b to zero values 00406 typename EncVector::value_type isa_2k = isa_0; // variable for values of isa[0], isa[2],...,isa[2k] 00407 // walk through the conceptual suffix array and calculate b at level l 00408 for (size_type k = 0; k < nn-2; k+=2, isa_2k = psiz[psiz[isa_2k]]) { // isa[2(k+1)] = \psi(\psi(isa[2k])) 00409 m_b[l][ isa_2k ] = 1; 00410 } 00411 m_b[l][ isa_2k ] = 1; 00412 m_last_pos[l] = nn&1 ? isa_2k : psiz[isa_2k]; // save position of entry nn-1 in SA 00413 util::init_support(m_b_rank[l], &m_b[l]); // build rank structure for b 00414 temp_psi.set_int_width(bit_magic::l1BP(nn)+1); 00415 temp_psi.resize(nn/2); 00416 for (size_type i=0, j=0; i<nn; ++i) { 00417 // collect the psi values where SA[i] is odd 00418 if (!m_b[l][i]) temp_psi[j++] = psiz[i]; 00419 } 00420 m_psi[l] = EncVector(temp_psi); // save compressed psi at level l 00421 // calculate psi function of the next level 00422 temp_psi = int_vector<>((nn+1)/2, 0,bit_magic::l1BP((nn+1)/2)+1); 00423 isa_2k = isa_0;//psiz[0]; 00424 for (size_type k=1, t; k+1 <= nn; k+=2) { 00425 temp_psi[ m_b_rank[l].rank(isa_2k) ] = m_b_rank[l].rank((t=psiz[psiz[isa_2k]])); 00426 isa_2k = t; 00427 } 00428 // if nn is odd add one entry 00429 if (nn&1) temp_psi[ m_b_rank[l].rank(isa_2k) ] = m_b_rank[l].rank(psiz[isa_2k]); 00430 nn = (nn+1)/2; // calculate new nn 00431 00432 isa_0 = m_b_rank[l].rank(isa_0); 00433 int_vector<> ttt; algorithm::psi2sa(temp_psi, isa_0, ttt); 00434 psiz = EncVector(temp_psi); 00435 } 00436 algorithm::psi2sa(temp_psi, isa_0, m_sa);// calc sa of the last level 00437 } 00438 00439 template<class EncVector, class RankSupport> 00440 template<typename RandomAccessContainer> 00441 csa_sada_theo<EncVector,RankSupport>::csa_sada_theo(const RandomAccessContainer& sa, const unsigned char* str):m_level(0),m_psi(NULL),m_b(NULL),m_b_rank(NULL),char2comp(m_char2comp),comp2char(m_comp2char),C(m_C) 00442 { 00443 assert(str != NULL); 00444 size_type n = 0; 00445 n = strlen((const char*)str)+1; 00446 setText(str, n); 00447 assert(n == sa.size()); 00448 typename RandomAccessContainer::value_type isa_0 = 0; 00449 for (typename RandomAccessContainer::const_iterator it = sa.begin(), end = sa.end(); it!=end; ++it) 00450 if (*it==0) { 00451 isa_0 = it-sa.begin(); 00452 break; 00453 } 00454 //std::cerr<<"verify sa = "<<(SDSAlgorithm::verify_sa(str, n, sa))<<std::endl; 00455 int_vector<> psi(sa.size(), 0, bit_magic::l1BP(sa.size())+1); 00456 algorithm::sa2psi(sa, psi); // calculate psi out of sa 00457 construct(psi, isa_0); 00458 } 00459 00460 template<class EncVector, class RankSupport> 00461 typename csa_sada_theo<EncVector, RankSupport>::value_type csa_sada_theo<EncVector, RankSupport>::operator[](size_type i)const 00462 { 00463 value_type result= 0; 00464 uint64_t history = 0; 00465 uint8_t history_len = 0; 00466 uint8_t l = 0; 00467 00468 for (l=0; l < m_level;) { 00469 history <<= 1; 00470 ++history_len; 00471 if (m_b[l][i]) { 00472 i = m_b_rank[l].rank(i); 00473 ++history; 00474 ++l; 00475 } else { 00476 if (i==m_last_pos[l]) { 00477 result = m_b[l].size()-1; 00478 history >>= 1; --history_len; 00479 goto ready; 00480 } 00481 i = m_psi[l][i-m_b_rank[l].rank(i)]; 00482 } 00483 } 00484 result = m_sa[i]; 00485 ready: 00486 while (history_len-- > 0) { 00487 if (history&1) 00488 result<<=1; 00489 else 00490 result-=1; 00491 history>>=1; 00492 } 00493 return result; 00494 } 00495 00496 00497 template<class EncVector, class RankSupport> 00498 typename csa_sada_theo<EncVector, RankSupport>::size_type csa_sada_theo<EncVector, RankSupport>::serialize(std::ostream& out)const 00499 { 00500 size_type written_bytes = 0; 00501 out.write((char*) &m_level, sizeof(m_level)); 00502 written_bytes += sizeof(m_level); 00503 written_bytes += m_sa.serialize(out); 00504 if (m_level > 0) { 00505 for (size_type l=0; l<m_level; ++l) written_bytes += m_psi[l].serialize(out); 00506 for (size_type l=0; l<m_level; ++l) written_bytes += m_b[l].serialize(out); 00507 // for(size_type l=0; l<m_level; ++l) written_bytes += m_b_rank[l].serialize(out); 00508 written_bytes += m_last_pos.serialize(out); 00509 } 00510 size_type wb = sizeof(m_char2comp[0])*256; 00511 out.write((char*)m_char2comp, wb); 00512 written_bytes += wb; 00513 wb = sizeof(m_comp2char[0])*256; 00514 out.write((char*)m_comp2char, wb); 00515 written_bytes += wb; 00516 wb = sizeof(m_C[0])*257; 00517 out.write((char*)C, wb); 00518 written_bytes += wb; 00519 return written_bytes; 00520 } 00521 00522 template<class EncVector, class RankSupport> 00523 void csa_sada_theo<EncVector, RankSupport>::load(std::istream& in) 00524 { 00525 in.read((char*) &m_level, sizeof(m_level)); 00526 m_sa.load(in); 00527 if (m_psi != NULL) { 00528 delete [] m_psi; 00529 m_psi = NULL; 00530 } 00531 if (m_b != NULL) { 00532 delete [] m_b; 00533 m_b = NULL; 00534 } 00535 if (m_b_rank != NULL) { 00536 delete [] m_b_rank; 00537 m_b_rank = NULL; 00538 } 00539 if (m_level > 0) { 00540 m_psi = new EncVector[m_level]; 00541 m_b = new int_vector<1>[m_level]; 00542 m_b_rank = new RankSupport[m_level]; 00543 for (size_type l=0; l<m_level; ++l) m_psi[l].load(in); 00544 for (size_type l=0; l<m_level; ++l) m_b[l].load(in); 00545 // for(size_type l=0; l<m_level; ++l) m_b_rank[l].load(in, &m_b[l]); 00546 for (size_type l=0; l<m_level; ++l) util::init_support(m_b_rank[l], &m_b[l]); 00547 m_last_pos.load(in); 00548 } 00549 in.read((char*)m_char2comp, sizeof(m_char2comp[0])*256); 00550 in.read((char*)m_comp2char, sizeof(m_comp2char[0])*256); 00551 in.read((char*)m_C, sizeof(m_C[0])*257); 00552 } 00553 00554 template<class EncVector, class RankSupport> 00555 csa_sada_theo<EncVector,RankSupport>::csa_sada_theo(const unsigned char* str):m_level(0),m_psi(NULL),m_b(NULL),m_b_rank(NULL),char2comp(m_char2comp),comp2char(m_comp2char),C(m_C) 00556 { 00557 // m_b = NULL; m_psi = NULL; m_b_rank = NULL; 00558 // assert(str!=NULL); 00559 csa_uncompressed sa(str); 00560 // uint64_t n = strlen((const char*)str); // get size of the text 00561 setText(str, sa.size()); 00562 // int_vector<> sa(n, 0, bit_magic::l1BP(n)+1 ); 00563 // algorithm::calculate_sa(str, n, sa); // calculate the suffix array sa of str 00564 00565 typename int_vector<>::value_type isa_0 = 0; 00566 for (csa_uncompressed::size_type i=0; i!=sa.size(); ++i) 00567 if (sa[i]==0) { 00568 isa_0 = i; 00569 break; 00570 } 00571 //std::cerr<<"verify sa = "<<(SDSAlgorithm::verify_sa(str, n, sa))<<std::endl; 00572 int_vector<> psi(sa.size(), bit_magic::l1BP(sa.size())+1); 00573 algorithm::sa2psi(sa, psi); // calculate psi out of sa 00574 sa = csa_uncompressed(); 00575 construct(psi, isa_0); 00576 } 00577 00578 template<class EncVector, class RankSupport> 00579 bool csa_sada_theo<EncVector,RankSupport>::operator==(const csa_sada_theo& csa)const 00580 { 00581 if (m_level != csa.m_level or m_sa != csa.m_sa or m_last_pos != csa.m_last_pos) 00582 return false; 00583 if ((m_psi == NULL and csa.m_psi != NULL) or (m_psi !=NULL and csa.m_psi == NULL)) 00584 return false; 00585 if ((m_b == NULL and csa.m_b != NULL) or (m_b !=NULL and csa.m_b == NULL)) 00586 return false; 00587 if ((m_b_rank == NULL and csa.m_b_rank != NULL) or (m_b_rank !=NULL and csa.m_b_rank == NULL)) 00588 return false; 00589 if (m_level > 0) { 00590 assert(m_psi != NULL and m_b != NULL and m_b_rank != NULL and 00591 csa.m_psi!=NULL and csa.m_b!=NULL and csa.m_b_rank!=NULL); 00592 for (size_type i=0; i<m_level; ++i) { 00593 if (m_psi[i] != csa.m_psi[i] or m_b[i] != csa.m_b[i] or m_b_rank[i] != csa.m_b_rank[i]) 00594 return false; 00595 } 00596 } 00597 return true; 00598 } 00599 00600 template<class EncVector, class RankSupport> 00601 bool csa_sada_theo<EncVector,RankSupport>::operator!=(const csa_sada_theo<EncVector,RankSupport>& csa)const 00602 { 00603 return !(*this==csa); 00604 } 00605 00606 00607 } // end namespace sdsl 00608 00609 #endif