SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 20010 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00021 #ifndef INCLUDED_SDSL_ALGORITHMS_FOR_COMPRESSED_SUFFIX_ARRAYS 00022 #define INCLUDED_SDSL_ALGORITHMS_FOR_COMPRESSED_SUFFIX_ARRAYS 00023 00024 #include "int_vector.hpp" // for bit_vector 00025 #include <stack> // for calculate_supercartesian_tree_bp 00026 00027 #ifdef SDSL_DEBUG_ALGORITHMS_FOR_COMPRESSED_SUFFIX_ARRAYS 00028 #include "testutils.hpp" 00029 #endif 00030 00031 namespace sdsl 00032 { 00033 00034 namespace algorithm 00035 { 00036 00037 template<class Csa> 00038 void set_text(const unsigned char* str, typename Csa::size_type len, int_vector<64>& C, int_vector<8>& char2comp, int_vector<8>& comp2char, uint16_t& sigma) 00039 { 00040 #ifdef SDSL_DEBUG_ALGORITHMS_FOR_COMPRESSED_SUFFIX_ARRAYS 00041 stop_watch sw; sw.start(); 00042 #endif 00043 typedef typename Csa::size_type size_type; 00044 util::assign(C, int_vector<64>(257,0)); 00045 util::assign(char2comp, int_vector<8>(256,0)); 00046 util::assign(comp2char, int_vector<8>(256,0)); 00047 00048 if (str == NULL or len ==0) 00049 return; 00050 const unsigned char* p = str; 00051 // assert(str[len-1]==0); // muss bei bwt z.B. nicht so sein 00052 for (size_type i=0; i<len; ++i) { 00053 ++C[*p++]; 00054 } 00055 assert(1 == C[0]); 00056 size_type value = 0; 00057 for (uint16_t i=0; i<256; ++i) 00058 if (C[i]) { 00059 char2comp[i] = value; 00060 C[value] = C[i]; 00061 if (i > value) 00062 C[i]=0; 00063 ++value; 00064 } 00065 sigma = value; 00066 for (uint16_t i=0; i<256; ++i) 00067 if (char2comp[i]) 00068 comp2char[char2comp[i]] = i; 00069 for (uint16_t i=256; i>0; --i) C[i] = C[i-1]; 00070 C[0] = 0; 00071 for (uint16_t i=1; i<257; ++i) C[i] += C[i-1]; 00072 if (C[256]!=len) { 00073 std::cerr<<"C[256]="<<C[256]<<" "<<len<<std::endl; 00074 } 00075 assert(C[256]==len); 00076 assert(C[sigma+1]==len); 00077 #ifdef SDSL_DEBUG_ALGORITHMS_FOR_COMPRESSED_SUFFIX_ARRAYS 00078 sw.stop(); 00079 std::cerr<<"set_text takes "<<sw.get_real_time()<<" ms real time"; 00080 std::cerr<<"set_text takes "<<sw.get_user_time()<<" ms user time"; 00081 #endif 00082 } 00083 00084 template<class Csa, class size_type_class> 00085 void set_text(int_vector_file_buffer<8, size_type_class>& str_buf, typename Csa::size_type len, int_vector<64>& C, int_vector<8>& char2comp, int_vector<8>& comp2char, uint16_t& sigma) 00086 { 00087 #ifdef SDSL_DEBUG_ALGORITHMS_FOR_COMPRESSED_SUFFIX_ARRAYS 00088 stop_watch sw; sw.start(); 00089 #endif 00090 typedef typename Csa::size_type size_type; 00091 util::assign(C, int_vector<64>(257,0)); 00092 util::assign(char2comp, int_vector<8>(256,0)); 00093 util::assign(comp2char, int_vector<8>(256,0)); 00094 str_buf.reset(); 00095 if (0 == str_buf.int_vector_size) 00096 return; 00097 for (size_type i=0, r_sum=0, r = str_buf.load_next_block(); i < len;) { 00098 for (; i < r_sum+r; ++i) { 00099 ++C[str_buf[i-r_sum]]; 00100 } 00101 r_sum += r; r = str_buf.load_next_block(); 00102 } 00103 assert(1 == C[0]); 00104 uint16_t value = 0; 00105 for (int i=0; i<256; ++i) 00106 if (C[i]) { 00107 char2comp[i] = value; 00108 C[value] = C[i]; 00109 if (i > value) 00110 C[i]=0; 00111 ++value; 00112 } 00113 sigma = value; 00114 for (int i=0; i<256; ++i) 00115 if (char2comp[i]) 00116 comp2char[char2comp[i]] = i; 00117 for (int i=256; i>0; --i) C[i] = C[i-1]; 00118 C[0] = 0; 00119 for (int i=1; i<257; ++i) C[i] += C[i-1]; 00120 if (C[256]!=len) { 00121 std::cerr<<"C[256]="<<C[256]<<" "<<len<<std::endl; 00122 } 00123 assert(C[256]==len); 00124 assert(C[sigma+1]==len); 00125 #ifdef SDSL_DEBUG_ALGORITHMS_FOR_COMPRESSED_SUFFIX_ARRAYS 00126 sw.stop(); 00127 std::cerr<<"set_text takes "<<sw.get_real_time()<<" ms real time"; 00128 std::cerr<<"set_text takes "<<sw.get_user_time()<<" ms user time"; 00129 #endif 00130 } 00131 00133 00139 template<class Csa> 00140 bool char_occures_in_text_of_csa(const Csa& csa, typename Csa::char_type c) 00141 { 00142 return (csa.char2comp[c] > 0) or (csa.char2comp[c]==c); 00143 } 00144 00145 template<class Csa, uint8_t int_width, class size_type_class> 00146 void set_sa_and_isa_samples(int_vector_file_buffer<int_width, size_type_class>& sa_buf, typename Csa::sa_sample_type& sa_sample, typename Csa::isa_sample_type& isa_sample) 00147 { 00148 typedef typename Csa::size_type size_type; 00149 size_type n = sa_buf.int_vector_size; 00150 00151 sa_sample.set_int_width(bit_magic::l1BP(n)+1); 00152 sa_sample.resize((n+Csa::sa_sample_dens-1)/Csa::sa_sample_dens); 00153 00154 isa_sample.set_int_width(bit_magic::l1BP(n)+1); 00155 if (n >= 1) { // so n+Csa::isa_sample_dens >= 2 00156 isa_sample.resize((n-1+Csa::isa_sample_dens-1)/Csa::isa_sample_dens + 1); 00157 } 00158 00159 util::set_one_bits(sa_sample); 00160 util::set_one_bits(isa_sample); 00161 00162 sa_buf.reset(); 00163 for (size_type i=0, r_sum = 0, r = sa_buf.load_next_block(), cnt_mod=Csa::sa_sample_dens, cnt_sum=0; r_sum < n;) { 00164 for (; i < r_sum+r; ++i, ++cnt_mod) { 00165 size_type sa = sa_buf[i-r_sum]; 00166 if ((sa % Csa::isa_sample_dens) == 0) { 00167 isa_sample[sa/Csa::isa_sample_dens] = i; 00168 } else if (sa+1 == n) { 00169 isa_sample[(sa+Csa::isa_sample_dens-1)/Csa::isa_sample_dens] = i; 00170 } 00171 if (Csa::sa_sample_dens == cnt_mod) { 00172 cnt_mod = 0; 00173 sa_sample[cnt_sum++] = sa; 00174 } 00175 } 00176 r_sum += r; r = sa_buf.load_next_block(); 00177 } 00178 /* if(isa_sample.size() < 20 ){ 00179 std::cerr<<"isa_samples = "; 00180 for(int_vector<>::size_type i=0;i<isa_sample.size(); ++i) 00181 std::cerr<<isa_sample[i]<<" "; 00182 std::cerr<<std::endl; 00183 std::cerr<<"Csa::isa_sample_dens = "<<Csa::isa_sample_dens <<std::endl; 00184 std::cerr<<"n = "<<n <<std::endl; 00185 std::cerr<<"isa_sample.size() = "<< isa_sample.size() <<std::endl; 00186 } 00187 */ 00188 } 00189 00190 //template<class Csa> 00191 //bool char_at_char_pos_equals_char(const Csa &csa, typename Csa::size_type char_pos, typename Csa::char_type c){ 00192 // typename Csa:: 00193 //} 00194 00195 /* 00196 * \par Time complexity 00197 * \f$ \Order{\log \sigma} \f$ 00198 * TODO: add hinted binary search? Two way binary search? 00199 */ 00200 template<class Csa> 00201 const unsigned char get_ith_character_of_the_first_row(const typename Csa::size_type i, const Csa& csa) 00202 { 00203 assert(i < csa.size()); 00204 if (csa.sigma < 16) { //<- if sigma is small search linear 00205 typename Csa::size_type res=1; 00206 while (csa.C[res] <= i) 00207 ++res; 00208 return csa.comp2char[res-1]; 00209 } else { 00210 // binary search the character with C 00211 typename Csa::size_type upper_c = csa.sigma, lower_c = 0; // lower_c inclusive, upper_c exclusive 00212 typename Csa::size_type res=0; 00213 //std::cout<<" binary search: csa.sigma= "<<upper_c<<" i="<<i<<std::endl; 00214 do { 00215 res = (upper_c+lower_c)/2; 00216 if (i < csa.C[res]) { 00217 upper_c = res; 00218 } else if (i >= csa.C[res+1]) { 00219 lower_c = res+1; 00220 } 00221 } while (i < csa.C[res] or i >= csa.C[res+1]); // i is not in the interval 00222 return csa.comp2char[res]; 00223 } 00224 } 00225 00226 }// end namespace algorithm 00227 00228 }// end namespace sdsl 00229 00230 #endif 00231