SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
 All Classes Namespaces Files Functions Variables Typedefs Friends
sdsl/include/sdsl/algorithms_for_compressed_suffix_arrays.hpp
Go to the documentation of this file.
00001 /* sdsl - succinct data structures library
00002     Copyright (C) 20010 Simon Gog
00003 
00004     This program is free software: you can redistribute it and/or modify
00005     it under the terms of the GNU General Public License as published by
00006     the Free Software Foundation, either version 3 of the License, or
00007     (at your option) any later version.
00008 
00009     This program is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012     GNU General Public License for more details.
00013 
00014     You should have received a copy of the GNU General Public License
00015     along with this program.  If not, see http://www.gnu.org/licenses/ .
00016 */
00021 #ifndef INCLUDED_SDSL_ALGORITHMS_FOR_COMPRESSED_SUFFIX_ARRAYS
00022 #define INCLUDED_SDSL_ALGORITHMS_FOR_COMPRESSED_SUFFIX_ARRAYS
00023 
00024 #include "int_vector.hpp" // for bit_vector
00025 #include <stack> // for calculate_supercartesian_tree_bp
00026 
00027 #ifdef SDSL_DEBUG_ALGORITHMS_FOR_COMPRESSED_SUFFIX_ARRAYS
00028 #include "testutils.hpp"
00029 #endif
00030 
00031 namespace sdsl
00032 {
00033 
00034 namespace algorithm
00035 {
00036 
00037 template<class Csa>
00038 void set_text(const unsigned char* str, typename Csa::size_type len, int_vector<64>& C, int_vector<8>& char2comp, int_vector<8>& comp2char, uint16_t& sigma)
00039 {
00040 #ifdef SDSL_DEBUG_ALGORITHMS_FOR_COMPRESSED_SUFFIX_ARRAYS
00041     stop_watch sw; sw.start();
00042 #endif
00043     typedef typename Csa::size_type size_type;
00044     util::assign(C, int_vector<64>(257,0));
00045     util::assign(char2comp, int_vector<8>(256,0));
00046     util::assign(comp2char, int_vector<8>(256,0));
00047 
00048     if (str == NULL or len ==0)
00049         return;
00050     const unsigned char* p = str;
00051 //      assert(str[len-1]==0); // muss bei bwt z.B. nicht so sein
00052     for (size_type i=0; i<len; ++i) {
00053         ++C[*p++];
00054     }
00055     assert(1 == C[0]);
00056     size_type value = 0;
00057     for (uint16_t i=0; i<256; ++i)
00058         if (C[i]) {
00059             char2comp[i]        = value;
00060             C[value]            = C[i];
00061             if (i > value)
00062                 C[i]=0;
00063             ++value;
00064         }
00065     sigma = value;
00066     for (uint16_t i=0; i<256; ++i)
00067         if (char2comp[i])
00068             comp2char[char2comp[i]] = i;
00069     for (uint16_t i=256; i>0; --i) C[i] = C[i-1];
00070     C[0] = 0;
00071     for (uint16_t i=1; i<257; ++i) C[i] += C[i-1];
00072     if (C[256]!=len) {
00073         std::cerr<<"C[256]="<<C[256]<<" "<<len<<std::endl;
00074     }
00075     assert(C[256]==len);
00076     assert(C[sigma+1]==len);
00077 #ifdef SDSL_DEBUG_ALGORITHMS_FOR_COMPRESSED_SUFFIX_ARRAYS
00078     sw.stop();
00079     std::cerr<<"set_text takes "<<sw.get_real_time()<<" ms real time";
00080     std::cerr<<"set_text takes "<<sw.get_user_time()<<" ms user time";
00081 #endif
00082 }
00083 
00084 template<class Csa, class size_type_class>
00085 void set_text(int_vector_file_buffer<8, size_type_class>& str_buf, typename Csa::size_type len, int_vector<64>& C, int_vector<8>& char2comp, int_vector<8>& comp2char, uint16_t& sigma)
00086 {
00087 #ifdef SDSL_DEBUG_ALGORITHMS_FOR_COMPRESSED_SUFFIX_ARRAYS
00088     stop_watch sw; sw.start();
00089 #endif
00090     typedef typename Csa::size_type size_type;
00091     util::assign(C, int_vector<64>(257,0));
00092     util::assign(char2comp, int_vector<8>(256,0));
00093     util::assign(comp2char, int_vector<8>(256,0));
00094     str_buf.reset();
00095     if (0 == str_buf.int_vector_size)
00096         return;
00097     for (size_type i=0, r_sum=0, r = str_buf.load_next_block(); i < len;) {
00098         for (; i < r_sum+r; ++i) {
00099             ++C[str_buf[i-r_sum]];
00100         }
00101         r_sum += r; r = str_buf.load_next_block();
00102     }
00103     assert(1 == C[0]);
00104     uint16_t value = 0;
00105     for (int i=0; i<256; ++i)
00106         if (C[i]) {
00107             char2comp[i]        = value;
00108             C[value]            = C[i];
00109             if (i > value)
00110                 C[i]=0;
00111             ++value;
00112         }
00113     sigma = value;
00114     for (int i=0; i<256; ++i)
00115         if (char2comp[i])
00116             comp2char[char2comp[i]] = i;
00117     for (int i=256; i>0; --i) C[i] = C[i-1];
00118     C[0] = 0;
00119     for (int i=1; i<257; ++i) C[i] += C[i-1];
00120     if (C[256]!=len) {
00121         std::cerr<<"C[256]="<<C[256]<<" "<<len<<std::endl;
00122     }
00123     assert(C[256]==len);
00124     assert(C[sigma+1]==len);
00125 #ifdef SDSL_DEBUG_ALGORITHMS_FOR_COMPRESSED_SUFFIX_ARRAYS
00126     sw.stop();
00127     std::cerr<<"set_text takes "<<sw.get_real_time()<<" ms real time";
00128     std::cerr<<"set_text takes "<<sw.get_user_time()<<" ms user time";
00129 #endif
00130 }
00131 
00133 
00139 template<class Csa>
00140 bool char_occures_in_text_of_csa(const Csa& csa, typename Csa::char_type c)
00141 {
00142     return (csa.char2comp[c] > 0) or (csa.char2comp[c]==c);
00143 }
00144 
00145 template<class Csa, uint8_t int_width, class size_type_class>
00146 void set_sa_and_isa_samples(int_vector_file_buffer<int_width, size_type_class>& sa_buf, typename Csa::sa_sample_type& sa_sample, typename Csa::isa_sample_type& isa_sample)
00147 {
00148     typedef typename Csa::size_type size_type;
00149     size_type  n = sa_buf.int_vector_size;
00150 
00151     sa_sample.set_int_width(bit_magic::l1BP(n)+1);
00152     sa_sample.resize((n+Csa::sa_sample_dens-1)/Csa::sa_sample_dens);
00153 
00154     isa_sample.set_int_width(bit_magic::l1BP(n)+1);
00155     if (n >= 1) { // so n+Csa::isa_sample_dens >= 2
00156         isa_sample.resize((n-1+Csa::isa_sample_dens-1)/Csa::isa_sample_dens + 1);
00157     }
00158 
00159     util::set_one_bits(sa_sample);
00160     util::set_one_bits(isa_sample);
00161 
00162     sa_buf.reset();
00163     for (size_type i=0, r_sum = 0, r = sa_buf.load_next_block(), cnt_mod=Csa::sa_sample_dens, cnt_sum=0; r_sum < n;) {
00164         for (; i < r_sum+r; ++i, ++cnt_mod) {
00165             size_type sa = sa_buf[i-r_sum];
00166             if ((sa % Csa::isa_sample_dens) == 0) {
00167                 isa_sample[sa/Csa::isa_sample_dens] = i;
00168             } else if (sa+1 == n) {
00169                 isa_sample[(sa+Csa::isa_sample_dens-1)/Csa::isa_sample_dens] = i;
00170             }
00171             if (Csa::sa_sample_dens == cnt_mod) {
00172                 cnt_mod = 0;
00173                 sa_sample[cnt_sum++] = sa;
00174             }
00175         }
00176         r_sum += r; r = sa_buf.load_next_block();
00177     }
00178     /*  if(isa_sample.size() < 20 ){
00179                 std::cerr<<"isa_samples = ";
00180                 for(int_vector<>::size_type i=0;i<isa_sample.size(); ++i)
00181                         std::cerr<<isa_sample[i]<<" ";
00182                 std::cerr<<std::endl;
00183                 std::cerr<<"Csa::isa_sample_dens = "<<Csa::isa_sample_dens <<std::endl;
00184                 std::cerr<<"n = "<<n <<std::endl;
00185                 std::cerr<<"isa_sample.size() = "<< isa_sample.size() <<std::endl;
00186         }
00187     */
00188 }
00189 
00190 //template<class Csa>
00191 //bool char_at_char_pos_equals_char(const Csa &csa, typename Csa::size_type char_pos, typename Csa::char_type c){
00192 //      typename Csa::
00193 //}
00194 
00195 /*
00196  * \par Time complexity
00197  *    \f$ \Order{\log \sigma} \f$
00198  *  TODO: add hinted binary search? Two way binary search?
00199 */
00200 template<class Csa>
00201 const unsigned char get_ith_character_of_the_first_row(const typename Csa::size_type i, const Csa& csa)
00202 {
00203     assert(i < csa.size());
00204     if (csa.sigma < 16) { //<- if sigma is small search linear
00205         typename Csa::size_type res=1;
00206         while (csa.C[res] <= i)
00207             ++res;
00208         return csa.comp2char[res-1];
00209     } else {
00210         // binary search the character with C
00211         typename Csa::size_type upper_c = csa.sigma, lower_c = 0; // lower_c inclusive, upper_c exclusive
00212         typename Csa::size_type res=0;
00213 //std::cout<<" binary search: csa.sigma= "<<upper_c<<" i="<<i<<std::endl;
00214         do {
00215             res = (upper_c+lower_c)/2;
00216             if (i < csa.C[res]) {
00217                 upper_c = res;
00218             } else if (i >= csa.C[res+1]) {
00219                 lower_c = res+1;
00220             }
00221         } while (i < csa.C[res] or i >= csa.C[res+1]);  // i is not in the interval
00222         return csa.comp2char[res];
00223     }
00224 }
00225 
00226 }// end namespace algorithm
00227 
00228 }// end namespace sdsl
00229 
00230 #endif
00231