SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
 All Classes Namespaces Files Functions Variables Typedefs Friends
sdsl/include/sdsl/csa_sada_theo.hpp
Go to the documentation of this file.
00001 /* sdsl - succinct data structures library
00002     Copyright (C) 2008 Simon Gog
00003 
00004     This program is free software: you can redistribute it and/or modify
00005     it under the terms of the GNU General Public License as published by
00006     the Free Software Foundation, either version 3 of the License, or
00007     (at your option) any later version.
00008 
00009     This program is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012     GNU General Public License for more details.
00013 
00014     You should have received a copy of the GNU General Public License
00015     along with this program.  If not, see http://www.gnu.org/licenses/ .
00016 */
00021 #ifndef INCLUDED_SDSL_CSA_SADA_THEO
00022 #define INCLUDED_SDSL_CSA_SADA_THEO
00023 
00024 #include "int_vector.hpp"
00025 #include "enc_vector.hpp"
00026 #include "algorithms.hpp"
00027 #include "iterators.hpp"
00028 #include "csa_uncompressed.hpp"
00029 #include <iostream>
00030 #include <algorithm>
00031 #include <cassert>
00032 #include <cstring> // for strlen
00033 #include <iomanip>
00034 #include <iterator>
00035 
00036 namespace sdsl
00037 {
00038 
00039 template<class EncVector, class RankSupport>
00040 class csa_sada_theo;
00041 
00043 
00049 template<class EncVector = enc_vector<>, class RankSupport = rank_support_v<> >
00050 class csa_sada_theo
00051 {
00052     public:
00053         typedef uint64_t                                                                                         value_type;    // STL Container requirement
00054         typedef random_access_const_iterator<csa_sada_theo> const_iterator;// STL Container requirement
00055         typedef const_iterator                                                                           iterator;              // STL Container requirement
00056         typedef const value_type                                                                         const_reference;
00057         typedef const_reference                                                                          reference;
00058         typedef const_reference*                                                                         pointer;
00059         typedef const pointer                                                                            const_pointer;
00060         typedef int_vector<>::size_type                                                          size_type;             // STL Container requirement
00061         typedef size_type                                                                                        csa_size_type;
00062         typedef ptrdiff_t                                                                                        difference_type; // STL Container requirement
00063         typedef const unsigned char*                                                             pattern_type;
00064 
00065     private:
00066         uint8_t m_level;   // number of levels of the data structure
00067 //              size_type m_size; // number of eleements of the suffix array --> implicit stored in m_b[0].size()
00068         EncVector* m_psi;  // psi function at different levels
00069         int_vector<1>* m_b; // bit vectors supporting navigation at different levels
00070         RankSupport* m_b_rank; // rank support for m_b
00071         int_vector<> m_sa; // suffix array at last level (m_level)
00072         int_vector<> m_last_pos; // isa[m_b[l].size()-1]
00073 
00074         uint16_t                m_char2comp[256];
00075         unsigned char   m_comp2char[256];
00076         size_type       m_C[257];
00077 
00078         uint8_t m_sigma;
00079 
00080         void copy(const csa_sada_theo<EncVector, RankSupport>& csa);
00081         // lookup SA at position i in level l
00082         value_type lookup(size_type i, size_type l=0)const {
00083 //std::cerr<<"i="<<i<<" l="<<l<<std::endl;
00084             if (l < m_level) {
00085                 if (m_b[l][i]) {
00086                     return lookup(m_b_rank[l].rank(i), l+1)<<1;
00087                 } else {
00088                     if (i==m_last_pos[l]) // if i is the index of the greatest entry in SA_l
00089                         return m_b[l].size()-1;
00090 //                                              if( i - m_b_rank[l].rank(i) >= m_psi[l].size() ){
00091 //                                                      std::cerr<<"i="<<i<<" l="<<l<<" m_psi.size()="<<m_psi[l].size()<<std::endl;
00092 //                                              }
00093                     return lookup(m_psi[l][i - m_b_rank[l].rank(i)], l)-1;
00094                 }
00095             } else { // at the last level return the stored values
00096                 return m_sa[i];
00097             }
00098         }
00099 
00100 //              value_type lookup_psi(size_type i, size_type l=0)const{
00101 //
00102 //              }
00103 
00104         // construct the compressed suffix array from a suffix array
00105         template<typename RandomAccessContainer>
00106         void construct(RandomAccessContainer& sa, typename RandomAccessContainer::value_type isa_0);
00107 
00108 
00109         void setText(const unsigned char* str, size_type len);
00110     public:
00111         const uint16_t* char2comp;
00112         const unsigned char* comp2char;
00113         const size_type* C;
00114 
00116         csa_sada_theo():m_level(0),m_psi(NULL),m_b(NULL),m_b_rank(NULL) { }
00117 
00119         csa_sada_theo(const csa_sada_theo<EncVector, RankSupport>& csa);
00120 
00122 
00125         template<typename RandomAccessContainer>
00126         csa_sada_theo(const RandomAccessContainer& sa, const unsigned char* str);
00127 
00129         csa_sada_theo(const unsigned char* str);
00130 
00132         ~csa_sada_theo();
00133 
00135 
00138         size_type size()const {
00139             return m_level > 0 ? m_b[0].size() : m_sa.size();
00140         }
00141 
00143 
00146         static size_type max_size() {
00147             return EncVector::max_size();
00148         }
00149 
00151 
00153         bool empty()const {
00154             return size()==0;
00155         }
00156 
00158 
00166         void swap(csa_sada_theo<EncVector, RankSupport>& csa);
00167 
00169 
00172         const_iterator begin()const;
00173 
00175 
00178         const_iterator end()const;
00179 
00181 
00185         inline value_type operator[](size_type i)const;
00186 
00188 
00191         csa_sada_theo& operator=(const csa_sada_theo& csa);
00192 
00194 
00199         bool operator==(const csa_sada_theo& csa)const;
00200 
00202 
00207         bool operator!=(const csa_sada_theo& csa)const;
00208 
00210 
00213         size_type serialize(std::ostream& out) const;
00214 
00216         void load(std::istream& in);
00217 };
00218 
00219 // == template functions ==
00220 
00221 template<class EncVector, class RankSupport>
00222 void csa_sada_theo<EncVector,RankSupport>::swap(csa_sada_theo<EncVector, RankSupport>& csa)
00223 {
00224     if (this!=&csa) {
00225         std::swap(m_level, csa.m_level);
00226         m_sa.swap(csa.m_sa);
00227         m_last_pos.swap(csa.m_last_pos);
00228         std::swap(m_psi         , csa.m_psi);     // only pointers => swap it with std::swap
00229         std::swap(m_b           , csa.m_b);       // only pointers => swap it with std::swap
00230         std::swap(m_b_rank      , csa.m_b_rank);  // only pointers => swap it with std::swap
00231         for (uint16_t i=0; i<256; ++i) {
00232             std::swap(m_char2comp[i], csa.m_char2comp[i]);
00233             std::swap(m_comp2char[i], csa.m_comp2char[i]);
00234             std::swap(m_C[i], csa.m_C[i]);
00235         }
00236         std::swap(m_C[256], csa.m_C[256]);
00237     }
00238 }
00239 
00240 
00241 template<class EncVector, class RankSupport>
00242 csa_sada_theo<EncVector,RankSupport>::csa_sada_theo(const csa_sada_theo<EncVector,RankSupport>& csa):m_psi(NULL),m_b(NULL),m_b_rank(NULL),char2comp(m_char2comp),comp2char(m_comp2char),C(m_C)
00243 {
00244     copy(csa);
00245 }
00246 
00247 template<class EncVector, class RankSupport>
00248 csa_sada_theo<EncVector,RankSupport>& csa_sada_theo<EncVector,RankSupport>::operator=(const csa_sada_theo<EncVector,RankSupport>& csa)
00249 {
00250     if (this != &csa) {
00251         copy(csa);
00252     }
00253     return *this;
00254 }
00255 
00256 
00257 template<class EncVector, class RankSupport>
00258 void csa_sada_theo<EncVector, RankSupport>::setText(const unsigned char* str, size_type len)
00259 {
00260     for (uint16_t i=0; i<256; ++i)
00261         m_char2comp[i] = m_comp2char[i] = m_C[i] = 0;
00262     m_C[256] = 0;
00263     if (str == NULL or len ==0)
00264         return;
00265     const unsigned char* p = str;
00266     for (size_type i=0; i<len-1; ++i) {
00267         m_char2comp[*p++] = 1;
00268     }
00269     size_type value = 0;
00270     for (uint16_t i=0; i<256; ++i)
00271         if (m_char2comp[i])
00272             m_char2comp[i] = value++;
00273     m_sigma = value;
00274     for (uint16_t i=0; i<256; ++i)
00275         if (m_char2comp[i])
00276             m_comp2char[m_char2comp[i]] = i;
00277     for (uint16_t i=0; i<257; ++i) m_C[i]=0;
00278     for (size_type i=0; i<len; ++i) ++m_C[m_char2comp[str[i]]];
00279     for (uint16_t i=256; i>0; --i) m_C[i] = m_C[i-1];
00280     m_C[0]=0;
00281     for (uint16_t i=1; i<257; ++i) m_C[i] += m_C[i-1];
00282     assert(m_C[256]==len);
00283 }
00284 
00285 template<class EncVector, class RankSupport>
00286 void csa_sada_theo<EncVector,RankSupport>::copy(const csa_sada_theo<EncVector,RankSupport>& csa)
00287 {
00288     // first free the data structure if it is already in use
00289     if (m_b_rank != NULL) {
00290         delete [] m_b_rank;
00291         m_b_rank = NULL;
00292     }
00293     if (m_b             != NULL) {
00294         delete [] m_b;
00295         m_b = NULL;
00296     }
00297     if (m_psi   != NULL) {
00298         delete [] m_psi;
00299         m_psi = NULL;
00300     }
00301     // second copy the data from csa to this
00302     m_level     = csa.m_level;
00303     m_sa                = csa.m_sa;
00304     m_last_pos  = csa.m_last_pos;
00305 
00306     if (m_level > 0) {
00307         // copy arrays
00308         m_psi   = new EncVector[m_level];
00309         for (size_type i=0; i < m_level; ++i) {
00310             m_psi[i]    = csa.m_psi[i];
00311         }
00312 
00313         m_b             = new int_vector<1>[m_level];
00314         for (size_type i=0; i < m_level; ++i) {
00315             m_b[i]      = csa.m_b[i];
00316         }
00317 
00318         m_b_rank = new RankSupport[m_level];
00319         for (size_type i=0; i < m_level; ++i) {
00320             m_b_rank[i] = csa.m_b_rank[i];
00321             m_b_rank[i].set_vector(&m_b[i]);
00322         }
00323     }
00324     for (int i=0; i<256; ++i) {
00325         m_char2comp[i]  = csa.m_char2comp[i];
00326         m_comp2char[i]  = csa.m_comp2char[i];
00327         m_C[i]                  = csa.m_C[i];
00328     }
00329     m_C[256] = csa.m_C[256];
00330 }
00331 
00332 template<class EncVector, class RankSupport>
00333 typename csa_sada_theo<EncVector,RankSupport>::const_iterator csa_sada_theo<EncVector,RankSupport>::begin()const
00334 {
00335     return const_iterator(this,0);
00336 }
00337 
00338 template<class EncVector, class RankSupport>
00339 typename csa_sada_theo<EncVector,RankSupport>::const_iterator csa_sada_theo<EncVector,RankSupport>::end()const
00340 {
00341     return const_iterator(this, size());
00342 }
00343 
00344 template<class EncVector, class RankSupport>
00345 csa_sada_theo<EncVector, RankSupport>::~csa_sada_theo()
00346 {
00347     // free data structure in that order: m_b_rank depends on m_b
00348     if (m_b_rank != NULL) {
00349         delete [] m_b_rank;
00350         m_b_rank = NULL;
00351     }
00352     if (m_b             != NULL) {
00353         delete [] m_b;
00354         m_b = NULL;
00355     }
00356     if (m_psi   != NULL) {
00357         delete [] m_psi;
00358         m_psi = NULL;
00359     }
00360 }
00361 
00362 template<class EncVector, class RankSupport>
00363 template<typename RandomAccessContainer>
00364 void csa_sada_theo<EncVector,RankSupport>::construct(RandomAccessContainer& psi, typename RandomAccessContainer::value_type isa_0)
00365 {
00366     // first free the data structure if it is already in use
00367     if (m_b_rank != NULL) {
00368         delete [] m_b_rank;
00369         m_b_rank = NULL;
00370     }
00371     if (m_b             != NULL) {
00372         delete [] m_b;
00373         m_b = NULL;
00374     }
00375     if (m_psi   != NULL) {
00376         delete [] m_psi;
00377         m_psi = NULL;
00378     }
00379     m_sa.resize(0); m_last_pos.resize(0); m_level = 0;
00380     // if psi is empty there is nothing to do....
00381     if (psi.empty()) {
00382         return;
00383     }
00384     typename RandomAccessContainer::size_type n = psi.size();
00385     EncVector psiz(psi);// Store psi compressed
00386     psi.resize(0);              // free the memory for the uncompressed psi array
00387     m_level = 0;
00388     while (bit_magic::l1BP(n)+1 > (1ULL<<m_level))
00389         ++m_level;
00390     // special case if m_level equals 0!
00391     if (m_level==0) {
00392         algorithm::psi2sa(psiz, isa_0, m_sa);
00393         m_b =  new int_vector<1>[1]; m_b[0].resize(1); m_b[0][0] = 1;
00394         return;
00395     }
00396     m_psi               = new EncVector[m_level];
00397     m_b                 = new int_vector<1>[m_level];
00398     m_b_rank    = new RankSupport[m_level];
00399     m_last_pos  = int_vector<>(m_level, 0, bit_magic::l1BP(n)+1);
00400     util::set_zero_bits(m_last_pos);
00401 
00402     size_type nn = n;
00403     int_vector<> temp_psi;
00404     for (size_type l = 0; l < m_level; ++l) { // build data structure for each level
00405         m_b[l].resize(nn); util::set_zero_bits(m_b[l]); // initialize b to zero values
00406         typename EncVector::value_type isa_2k = isa_0; // variable for values of isa[0], isa[2],...,isa[2k]
00407         // walk through the conceptual suffix array and calculate b at level l
00408         for (size_type k = 0; k < nn-2; k+=2, isa_2k = psiz[psiz[isa_2k]]) { // isa[2(k+1)] = \psi(\psi(isa[2k]))
00409             m_b[l][ isa_2k ] = 1;
00410         }
00411         m_b[l][ isa_2k ] = 1;
00412         m_last_pos[l] = nn&1 ? isa_2k : psiz[isa_2k]; // save position of entry nn-1 in SA
00413         util::init_support(m_b_rank[l], &m_b[l]); // build rank structure for b
00414         temp_psi.set_int_width(bit_magic::l1BP(nn)+1);
00415         temp_psi.resize(nn/2);
00416         for (size_type i=0, j=0; i<nn; ++i) {
00417             // collect the psi values where SA[i] is odd
00418             if (!m_b[l][i]) temp_psi[j++] = psiz[i];
00419         }
00420         m_psi[l] = EncVector(temp_psi); // save compressed psi at level l
00421         // calculate psi function of the next level
00422         temp_psi = int_vector<>((nn+1)/2, 0,bit_magic::l1BP((nn+1)/2)+1);
00423         isa_2k   = isa_0;//psiz[0];
00424         for (size_type k=1, t; k+1 <= nn; k+=2) {
00425             temp_psi[ m_b_rank[l].rank(isa_2k) ] = m_b_rank[l].rank((t=psiz[psiz[isa_2k]]));
00426             isa_2k = t;
00427         }
00428         // if nn is odd add one entry
00429         if (nn&1) temp_psi[ m_b_rank[l].rank(isa_2k) ] = m_b_rank[l].rank(psiz[isa_2k]);
00430         nn = (nn+1)/2; // calculate new nn
00431 
00432         isa_0 = m_b_rank[l].rank(isa_0);
00433         int_vector<> ttt; algorithm::psi2sa(temp_psi, isa_0, ttt);
00434         psiz = EncVector(temp_psi);
00435     }
00436     algorithm::psi2sa(temp_psi, isa_0, m_sa);// calc sa of the last level
00437 }
00438 
00439 template<class EncVector, class RankSupport>
00440 template<typename RandomAccessContainer>
00441 csa_sada_theo<EncVector,RankSupport>::csa_sada_theo(const RandomAccessContainer& sa, const unsigned char* str):m_level(0),m_psi(NULL),m_b(NULL),m_b_rank(NULL),char2comp(m_char2comp),comp2char(m_comp2char),C(m_C)
00442 {
00443     assert(str != NULL);
00444     size_type n = 0;
00445     n = strlen((const char*)str)+1;
00446     setText(str, n);
00447     assert(n == sa.size());
00448     typename RandomAccessContainer::value_type isa_0 = 0;
00449     for (typename RandomAccessContainer::const_iterator it = sa.begin(), end = sa.end(); it!=end; ++it)
00450         if (*it==0) {
00451             isa_0 = it-sa.begin();
00452             break;
00453         }
00454 //std::cerr<<"verify sa = "<<(SDSAlgorithm::verify_sa(str, n, sa))<<std::endl;
00455     int_vector<> psi(sa.size(), 0, bit_magic::l1BP(sa.size())+1);
00456     algorithm::sa2psi(sa, psi); // calculate psi out of sa
00457     construct(psi, isa_0);
00458 }
00459 
00460 template<class EncVector, class RankSupport>
00461 typename csa_sada_theo<EncVector, RankSupport>::value_type csa_sada_theo<EncVector, RankSupport>::operator[](size_type i)const
00462 {
00463     value_type result= 0;
00464     uint64_t history = 0;
00465     uint8_t  history_len = 0;
00466     uint8_t l            = 0;
00467 
00468     for (l=0; l < m_level;) {
00469         history <<= 1;
00470         ++history_len;
00471         if (m_b[l][i]) {
00472             i = m_b_rank[l].rank(i);
00473             ++history;
00474             ++l;
00475         } else {
00476             if (i==m_last_pos[l]) {
00477                 result = m_b[l].size()-1;
00478                 history >>= 1; --history_len;
00479                 goto ready;
00480             }
00481             i = m_psi[l][i-m_b_rank[l].rank(i)];
00482         }
00483     }
00484     result = m_sa[i];
00485 ready:
00486     while (history_len-- > 0) {
00487         if (history&1)
00488             result<<=1;
00489         else
00490             result-=1;
00491         history>>=1;
00492     }
00493     return result;
00494 }
00495 
00496 
00497 template<class EncVector, class RankSupport>
00498 typename csa_sada_theo<EncVector, RankSupport>::size_type csa_sada_theo<EncVector, RankSupport>::serialize(std::ostream& out)const
00499 {
00500     size_type written_bytes = 0;
00501     out.write((char*) &m_level, sizeof(m_level));
00502     written_bytes += sizeof(m_level);
00503     written_bytes += m_sa.serialize(out);
00504     if (m_level > 0) {
00505         for (size_type l=0; l<m_level; ++l) written_bytes += m_psi[l].serialize(out);
00506         for (size_type l=0; l<m_level; ++l) written_bytes += m_b[l].serialize(out);
00507         //      for(size_type l=0; l<m_level; ++l) written_bytes += m_b_rank[l].serialize(out);
00508         written_bytes += m_last_pos.serialize(out);
00509     }
00510     size_type wb   = sizeof(m_char2comp[0])*256;
00511     out.write((char*)m_char2comp, wb);
00512     written_bytes += wb;
00513     wb                     = sizeof(m_comp2char[0])*256;
00514     out.write((char*)m_comp2char, wb);
00515     written_bytes += wb;
00516     wb                     = sizeof(m_C[0])*257;
00517     out.write((char*)C, wb);
00518     written_bytes += wb;
00519     return written_bytes;
00520 }
00521 
00522 template<class EncVector, class RankSupport>
00523 void csa_sada_theo<EncVector, RankSupport>::load(std::istream& in)
00524 {
00525     in.read((char*) &m_level, sizeof(m_level));
00526     m_sa.load(in);
00527     if (m_psi    != NULL) {
00528         delete [] m_psi;
00529         m_psi = NULL;
00530     }
00531     if (m_b      != NULL) {
00532         delete [] m_b;
00533         m_b = NULL;
00534     }
00535     if (m_b_rank != NULL) {
00536         delete [] m_b_rank;
00537         m_b_rank = NULL;
00538     }
00539     if (m_level > 0) {
00540         m_psi           = new EncVector[m_level];
00541         m_b             = new int_vector<1>[m_level];
00542         m_b_rank        = new RankSupport[m_level];
00543         for (size_type l=0; l<m_level; ++l) m_psi[l].load(in);
00544         for (size_type l=0; l<m_level; ++l) m_b[l].load(in);
00545         //      for(size_type l=0; l<m_level; ++l) m_b_rank[l].load(in, &m_b[l]);
00546         for (size_type l=0; l<m_level; ++l) util::init_support(m_b_rank[l], &m_b[l]);
00547         m_last_pos.load(in);
00548     }
00549     in.read((char*)m_char2comp, sizeof(m_char2comp[0])*256);
00550     in.read((char*)m_comp2char, sizeof(m_comp2char[0])*256);
00551     in.read((char*)m_C, sizeof(m_C[0])*257);
00552 }
00553 
00554 template<class EncVector, class RankSupport>
00555 csa_sada_theo<EncVector,RankSupport>::csa_sada_theo(const unsigned char* str):m_level(0),m_psi(NULL),m_b(NULL),m_b_rank(NULL),char2comp(m_char2comp),comp2char(m_comp2char),C(m_C)
00556 {
00557 //      m_b = NULL; m_psi = NULL; m_b_rank = NULL;
00558 //      assert(str!=NULL);
00559     csa_uncompressed sa(str);
00560 //      uint64_t n = strlen((const char*)str); // get size of the text
00561     setText(str, sa.size());
00562 //      int_vector<> sa(n, 0, bit_magic::l1BP(n)+1 );
00563 //      algorithm::calculate_sa(str, n, sa); // calculate the suffix array sa of str
00564 
00565     typename int_vector<>::value_type isa_0 = 0;
00566     for (csa_uncompressed::size_type i=0; i!=sa.size(); ++i)
00567         if (sa[i]==0) {
00568             isa_0 = i;
00569             break;
00570         }
00571 //std::cerr<<"verify sa = "<<(SDSAlgorithm::verify_sa(str, n, sa))<<std::endl;
00572     int_vector<> psi(sa.size(), bit_magic::l1BP(sa.size())+1);
00573     algorithm::sa2psi(sa, psi); // calculate psi out of sa
00574     sa = csa_uncompressed();
00575     construct(psi, isa_0);
00576 }
00577 
00578 template<class EncVector, class RankSupport>
00579 bool csa_sada_theo<EncVector,RankSupport>::operator==(const csa_sada_theo& csa)const
00580 {
00581     if (m_level != csa.m_level or m_sa != csa.m_sa or m_last_pos != csa.m_last_pos)
00582         return false;
00583     if ((m_psi == NULL and csa.m_psi != NULL) or (m_psi !=NULL and csa.m_psi == NULL))
00584         return false;
00585     if ((m_b == NULL and csa.m_b != NULL) or (m_b !=NULL and csa.m_b == NULL))
00586         return false;
00587     if ((m_b_rank == NULL and csa.m_b_rank != NULL) or (m_b_rank !=NULL and csa.m_b_rank == NULL))
00588         return false;
00589     if (m_level > 0) {
00590         assert(m_psi != NULL and m_b != NULL and m_b_rank != NULL and
00591                csa.m_psi!=NULL and csa.m_b!=NULL and csa.m_b_rank!=NULL);
00592         for (size_type i=0; i<m_level; ++i) {
00593             if (m_psi[i] != csa.m_psi[i] or m_b[i] != csa.m_b[i] or m_b_rank[i] != csa.m_b_rank[i])
00594                 return false;
00595         }
00596     }
00597     return true;
00598 }
00599 
00600 template<class EncVector, class RankSupport>
00601 bool csa_sada_theo<EncVector,RankSupport>::operator!=(const csa_sada_theo<EncVector,RankSupport>& csa)const
00602 {
00603     return !(*this==csa);
00604 }
00605 
00606 
00607 } // end namespace sdsl
00608 
00609 #endif