SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2009 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00023 #ifndef INCLUDED_SDSL_WT 00024 #define INCLUDED_SDSL_WT 00025 00026 #include "int_vector.hpp" 00027 #include "bitmagic.hpp" 00028 #include "util.hpp" // for util::ssign 00029 #include <set> // for calculating the alphabet size 00030 #include <map> // for mapping a symbol to its lexicographical index 00031 #include <algorithm> // for std::swap 00032 #include <stdexcept> 00033 #include <vector> 00034 00035 #ifdef SDSL_DEBUG 00036 #define SDSL_DEBUG_WT 00037 #endif 00038 00039 00040 //#define SDSL_DEBUG_WT 00041 00042 #ifdef SDSL_DEBUG_WT 00043 #include "testutils.hpp" 00044 #endif 00045 00047 namespace sdsl 00048 { 00049 00050 struct unsigned_char_map { 00051 unsigned char m_map[256]; 00052 00053 unsigned char& operator[](unsigned char i) { 00054 return *(m_map+i); 00055 } 00056 00057 unsigned char operator[](unsigned char i)const { 00058 return m_map[i]; 00059 } 00060 00061 void clear() { 00062 for (uint16_t i=0; i<256; ++i) 00063 m_map[i] = 0; 00064 } 00065 00066 uint16_t serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const { 00067 structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); 00068 uint16_t written_bytes = 0; 00069 for (uint16_t i=0; i<256; ++i) { 00070 out.write((char*)&m_map[i], sizeof(m_map[i])); 00071 written_bytes += sizeof(m_map[256]); 00072 } 00073 structure_tree::add_size(child, written_bytes); 00074 return written_bytes; 00075 } 00076 00077 void load(std::istream& in) { 00078 for (uint16_t i=0; i<256; ++i) { 00079 in.read((char*) &m_map[i], sizeof(m_map[i])); 00080 } 00081 } 00082 00083 void swap(unsigned_char_map& map) { 00084 if (this != &map) { 00085 for (uint16_t i=0; i<256; ++i) { 00086 std::swap(m_map[i], map.m_map[i]); 00087 } 00088 } 00089 } 00090 }; 00091 00092 00093 template<class RandomAccessContainer> 00094 class wt_trait 00095 { 00096 public: 00097 typedef typename RandomAccessContainer::size_type size_type; 00098 typedef typename RandomAccessContainer::value_type value_type; 00099 typedef RandomAccessContainer& reference_type; 00100 typedef std::map<value_type, size_type> map_type; 00101 typedef std::map<value_type, size_type> inv_map_type; 00102 enum { char_node_map_size=0 }; 00103 00104 static size_type alphabet_size_and_map(const reference_type rac, size_type n, map_type& map, inv_map_type& inv_map, value_type& first_symbol) { 00105 if (n > 0) 00106 first_symbol = rac[0]; 00107 map.clear(); 00108 inv_map.clear(); 00109 size_type alphabet_size = 0; 00110 for (size_type i=0; i<n; ++i) { 00111 if (map.find(rac[i]) == map.end()) { 00112 map[rac[i]] = 1; 00113 } 00114 } 00115 for (typename map_type::iterator it = map.begin(); it != map.end(); ++it) { // this preserves the order 00116 it->second = alphabet_size; 00117 inv_map[alphabet_size] = it->first; 00118 ++alphabet_size; 00119 } 00120 return alphabet_size; 00121 } 00122 00123 static bool symbol_available(const map_type& map, const value_type c, const value_type first_symbol, const size_type) { 00124 return map.find(c)!=map.end(); 00125 } 00126 00127 static size_type serialize_maps(std::ostream& out, const map_type& map, const inv_map_type& inv_map, structure_tree_node* v=NULL, std::string name="") { 00128 throw std::logic_error(util::demangle(typeid(wt_trait<RandomAccessContainer>).name())+": serialize not implemented"); 00129 return 0; 00130 } 00131 00132 static size_type load_maps(std::istream& in, map_type& map, inv_map_type& inv_map) { 00133 throw std::logic_error(util::demangle(typeid(wt_trait<RandomAccessContainer>).name())+": load not implemented"); 00134 return 0; 00135 } 00136 }; 00137 00138 template<class character> 00139 class wt_trait<character*> 00140 { 00141 public: 00142 typedef bit_vector::size_type size_type; 00143 typedef character value_type; 00144 typedef character* reference_type; 00145 typedef std::map<value_type, size_type> map_type; 00146 typedef std::map<value_type, size_type> inv_map_type; 00147 enum { char_node_map_size=256 }; 00148 00149 static size_type alphabet_size_and_map(const reference_type rac, size_type n, map_type& map, inv_map_type& inv_map, value_type& first_symbol) { 00150 if (n > 0) 00151 first_symbol = *(rac+0); 00152 map.clear(); 00153 inv_map.clear(); 00154 size_type alphabet_size = 0; 00155 for (size_type i=0; i<n; ++i) { 00156 if (map.find(*(rac+i)) == map.end()) { 00157 map[*(rac+i)] = 1; 00158 } 00159 } 00160 00161 for (typename map_type::iterator it = map.begin(); it != map.end(); ++it) { // this preserves the order 00162 it->second = alphabet_size; 00163 inv_map[alphabet_size] = it->first; 00164 ++alphabet_size; 00165 } 00166 return alphabet_size; 00167 } 00168 00169 static bool symbol_available(const map_type& map, const value_type c, const value_type first_symbol, const size_type) { 00170 return map.find(c)!=map.end(); 00171 } 00172 00173 static size_type serialize_maps(std::ostream& out, const map_type& map, const inv_map_type& inv_map, structure_tree_node* v=NULL, std::string name="") { 00174 throw std::logic_error(util::demangle(typeid(wt_trait<character*>).name())+": serialize not implemented"); 00175 return 0; 00176 } 00177 00178 static void load_maps(std::istream& in, map_type& map, inv_map_type& inv_map) { 00179 throw std::logic_error(util::demangle(typeid(wt_trait<character*>).name())+": load not implemented"); 00180 } 00181 }; 00182 00183 template<> 00184 class wt_trait<unsigned char*> 00185 { 00186 public: 00187 typedef bit_vector::size_type size_type; 00188 typedef unsigned char value_type; 00189 typedef unsigned char* reference_type; 00190 typedef unsigned_char_map map_type; 00191 typedef unsigned_char_map inv_map_type; 00192 enum { char_node_map_size=256 }; 00193 00194 static size_type alphabet_size_and_map(const reference_type rac, size_type n, map_type& map, inv_map_type& inv_map, value_type& first_symbol) { 00195 map.clear(); 00196 inv_map.clear(); 00197 if (n==0) { 00198 for (size_type i=0; i<256; ++i) { 00199 map[i] = 255; // mark each symbol as absent 00200 } 00201 return 0; 00202 } 00203 first_symbol = *rac; 00204 map[*rac] = 0; 00205 inv_map[0] = *rac; 00206 size_type alphabet_size = 0; 00207 00208 for (size_type i=0; i<256; ++i) { 00209 map[i] = 0; 00210 } 00211 00212 for (size_type i=0; i<n; ++i) { 00213 value_type c = *(rac+i); 00214 map[c] = 1; 00215 } 00216 00217 for (size_type i=0; i<256; ++i) { 00218 if (map[i]) { 00219 map[i] = alphabet_size; 00220 ++alphabet_size; 00221 } else { 00222 map[i] = 255; 00223 } 00224 inv_map[map[i]] = i; 00225 } 00226 return alphabet_size; 00227 } 00228 00229 static bool symbol_available(const map_type& map, const value_type c, const value_type first_symbol, const size_type sigma) { 00230 return sigma==256 or map[c] < 255; 00231 } 00232 00233 static size_type serialize_maps(std::ostream& out, const map_type& map, const inv_map_type& inv_map, structure_tree_node* v=NULL, std::string name="") { 00234 size_type written_bytes = 0; 00235 written_bytes += map.serialize(out, v, "alphabet_map"); 00236 written_bytes += inv_map.serialize(out, v, "inverse_alphabet_map"); 00237 return written_bytes; 00238 } 00239 00240 static void load_maps(std::istream& in, map_type& map, inv_map_type& inv_map) { 00241 map.load(in); 00242 inv_map.load(in); 00243 } 00244 }; 00245 00246 template<class size_type_class> 00247 class wt_trait<int_vector_file_buffer<8, size_type_class> > 00248 { 00249 public: 00250 typedef size_type_class size_type; 00251 typedef unsigned char value_type; 00252 typedef int_vector_file_buffer<8, size_type_class>& reference_type; 00253 typedef unsigned_char_map map_type; 00254 typedef unsigned_char_map inv_map_type; 00255 enum { char_node_map_size=256 }; 00256 00257 static size_type alphabet_size_and_map(reference_type rac, size_type n, map_type& map, inv_map_type& inv_map, value_type& first_symbol) { 00258 map.clear(); 00259 inv_map.clear(); 00260 if (n==0) { 00261 for (size_type i=0; i<256; ++i) { 00262 map[i] = 255; // mark each symbol as absent 00263 } 00264 return 0; 00265 } 00266 rac.reset(); 00267 if (rac.int_vector_size < n) { 00268 throw std::logic_error("wt<int_vector_file_buffer<8> >: n > rac.int_vector_size!"); 00269 return 0; 00270 } 00271 00272 for (size_type i=0; i<256; ++i) { 00273 map[i] = 0; 00274 } 00275 00276 size_type alphabet_size = 0; 00277 size_type r = rac.load_next_block(); 00278 first_symbol = rac[0]; 00279 for (size_type i=0, r_sum=0; r_sum < n;) { 00280 if (r_sum +r > n) { // make sure that not more than n characters are read 00281 r = n-r_sum; 00282 } 00283 for (; i< r_sum+r; ++i) { 00284 value_type c = rac[i-r_sum]; 00285 map[c] = 1; 00286 } 00287 r_sum += r; r = rac.load_next_block(); 00288 } 00289 00290 for (size_type i=0; i<256; ++i) { 00291 if (map[i]) { 00292 map[i] = alphabet_size; 00293 ++alphabet_size; 00294 } else { 00295 map[i] = 255; 00296 } 00297 inv_map[map[i]] = i; 00298 } 00299 return alphabet_size; 00300 } 00301 00302 static bool symbol_available(const map_type& map, const value_type c, const value_type first_symbol, const size_type sigma) { 00303 return sigma==256 or map[c] < 255; 00304 } 00305 00306 static size_type serialize_maps(std::ostream& out, const map_type& map, const inv_map_type& inv_map, structure_tree_node* v=NULL, std::string name="") { 00307 size_type written_bytes = 0; 00308 written_bytes += map.serialize(out, v, "alphabet_map"); 00309 written_bytes += inv_map.serialize(out, v, "inverse_alphabet_map"); 00310 return written_bytes; 00311 } 00312 00313 static void load_maps(std::istream& in, map_type& map, inv_map_type& inv_map) { 00314 map.load(in); 00315 inv_map.load(in); 00316 } 00317 }; 00318 00320 00341 template<class RandomAccessContainer=unsigned char*, 00342 class BitVector = bit_vector, 00343 class RankSupport = typename BitVector::rank_1_type, 00344 class SelectSupport = typename BitVector::select_1_type, 00345 class SelectSupportZero = typename BitVector::select_0_type> 00346 class wt 00347 { 00348 public: 00349 typedef typename wt_trait<RandomAccessContainer>::size_type size_type; 00350 typedef typename wt_trait<RandomAccessContainer>::value_type value_type; 00351 typedef typename wt_trait<RandomAccessContainer>::map_type map_type; 00352 typedef typename wt_trait<RandomAccessContainer>::inv_map_type inv_map_type; 00353 private: 00354 size_type m_size; 00355 size_type m_sigma; //<- \f$ |\Sigma| \f$ 00356 BitVector m_tree; // bit vector to store the wavelet tree 00357 RankSupport m_tree_rank; // rank support for the wavelet tree bit vector 00358 SelectSupport m_tree_select1; // select support for the wavelet tree bit vector 00359 SelectSupportZero m_tree_select0; 00360 int_vector<64> m_node_pointers; 00361 int_vector<64> m_node_pointers_rank; 00362 size_type m_char_node_map[wt_trait<RandomAccessContainer>::char_node_map_size]; 00363 00364 value_type m_first_symbol; 00365 00366 mutable map_type m_char_map; // map the characters to integers; we use mutable as the std::map has no const version of the []-operator 00367 mutable inv_map_type m_inv_char_map; // map the integers back to characters; we use mutable " " " " " " " " " " " " " " " " 00368 00369 void copy(const wt& wt) { 00370 m_size = wt.m_size; 00371 m_sigma = wt.m_sigma; 00372 m_tree = wt.m_tree; 00373 m_tree_rank = wt.m_tree_rank; 00374 m_tree_rank.set_vector(&m_tree); 00375 m_tree_select1 = wt.m_tree_select1; 00376 m_tree_select1.set_vector(&m_tree); 00377 m_tree_select0 = wt.m_tree_select0; 00378 m_tree_select0.set_vector(&m_tree); 00379 m_node_pointers = wt.m_node_pointers; 00380 m_node_pointers_rank = wt.m_node_pointers_rank; 00381 m_first_symbol = wt.m_first_symbol; 00382 m_char_map = wt.m_char_map; 00383 m_inv_char_map = wt.m_inv_char_map; 00384 if (wt_trait<RandomAccessContainer>::char_node_map_size == 256) { 00385 for (size_type i=0; i<256; ++i) m_char_node_map[i] = wt.m_char_node_map[i]; 00386 } 00387 } 00388 00389 void init_char_node_map() { 00390 if (wt_trait<RandomAccessContainer>::char_node_map_size == 256) { 00391 for (size_type i=0; i<256; ++i) m_char_node_map[i] = 0; 00392 } 00393 } 00394 00395 public: 00396 00397 const size_type& sigma; 00398 00399 // Default constructor 00400 wt():m_size(0),m_sigma(0), sigma(m_sigma) {}; 00401 00402 00403 00405 00411 wt(const typename wt_trait<RandomAccessContainer>::reference_type rac, size_type size):m_size(size), m_sigma(0), sigma(m_sigma) { 00412 #ifdef SDSL_DEBUG_WT 00413 std::cerr<<"Wavelet Tree construct: size="<<size<<std::endl; 00414 #endif 00415 // calculate alphabet size and the mappings for the symbols to the integers and back 00416 m_sigma = wt_trait<RandomAccessContainer>::alphabet_size_and_map(rac, m_size, m_char_map, m_inv_char_map, m_first_symbol); 00417 init_char_node_map(); 00418 int_vector<> node_sizes = int_vector<>(2*m_sigma+1, 0, bit_magic::l1BP(m_size)+1); 00419 m_node_pointers = int_vector<64>(node_sizes.size()+1, 0); 00420 m_node_pointers_rank = int_vector<64>(node_sizes.size()+1, 0); 00421 00422 if (m_sigma < 2) { 00423 if (m_sigma == 1) { // ==> m_size > 0 00424 if (wt_trait<RandomAccessContainer>::char_node_map_size == 256) 00425 m_char_node_map[ m_first_symbol ] = 0; // first symbol corresponds to root node of the wavelet tree 00426 } 00427 return; 00428 } else { 00429 // TODO: slow compared to other constructor which uses 00430 // int_vector_file_buffer 00431 for (size_type i=0; i<m_size; ++i) { 00432 size_type lex_idx = m_char_map[rac[(const size_type)i]]; // lex_idx in [0..m_sigma-1] 00433 size_type sigma = m_sigma, node=0; 00434 while (sigma >= 2) { 00435 #ifdef SDSL_DEBUG_WT 00436 if (node>=node_sizes.size()) { 00437 std::cerr<<" node="<<node<<" node_sizes.size()="<<node_sizes.size()<<" rac["<<i<<"]="<<rac[i]<<" orig_lex_idx="<< (int)m_char_map[rac[i]] <<" sigma="<<sigma<<" lex_idx="<<lex_idx<<std::endl; 00438 } 00439 #endif 00440 assert(node<node_sizes.size()); 00441 node_sizes[node] = node_sizes[node]+1; 00442 if (lex_idx < (sigma+1)/2) { 00443 sigma = (sigma+1)/2; 00444 node = 2*node+1; 00445 } else { 00446 lex_idx -= (sigma+1)/2; 00447 sigma -= (sigma+1)/2; 00448 node = 2*node+2; 00449 } 00450 } 00451 00452 if (wt_trait<RandomAccessContainer>::char_node_map_size == 256) 00453 m_char_node_map[rac[i]] = node; 00454 if (lex_idx!=0) { 00455 std::cout<<" _"<<rac[i]<<" ("<<lex_idx<<")"; 00456 } 00457 } 00458 #ifdef SDSL_DEBUG_WT 00459 std::cerr<<"wavelet tree node sizes"<<std::endl; 00460 for (size_type i=0; i<node_sizes.size(); ++i) { 00461 std::cerr<<node_sizes[i]<<" "; 00462 } std::cerr << std::endl; 00463 #endif 00464 00465 size_type max_node = 0; 00466 m_node_pointers[0] = 0; 00467 for (size_type i=1; i < m_node_pointers.size(); ++i) { 00468 m_node_pointers[i] = m_node_pointers[i-1] + node_sizes[i-1]; 00469 if (node_sizes[i-1]!=0) 00470 max_node = i-1; 00471 } 00472 node_sizes = int_vector<>(max_node+1, 0, 64); 00473 #ifdef SDSL_DEBUG_WT 00474 std::cerr<<"all nodes size = "<<m_node_pointers[max_node+1]<<std::endl; 00475 #endif 00476 bit_vector tree = bit_vector(m_node_pointers[max_node+1]); 00477 for (size_type i=0; i<m_size; ++i) { 00478 size_type lex_idx = m_char_map[rac[i]]; // lex_idx in [0..m_sigma-1] 00479 //std::cerr<<"lex_idx="<<lex_idx<<" "<<rac[i]<<std::endl; 00480 size_type sigma = m_sigma, node=0; 00481 while (sigma >= 2) { 00482 if (lex_idx >= ((sigma+1)/2)) 00483 tree[m_node_pointers[node]+node_sizes[node]] = 1; 00484 node_sizes[node] = node_sizes[node]+1; 00485 if (lex_idx < (sigma+1)/2) { 00486 sigma = (sigma+1)/2; 00487 node = 2*node+1; 00488 } else { 00489 lex_idx -= (sigma+1)/2; 00490 sigma -= (sigma+1)/2; 00491 node = 2*node+2; 00492 } 00493 } 00494 } 00495 util::assign(m_tree, tree); 00496 util::init_support(m_tree_rank, &m_tree); 00497 for (size_type i=0; i < m_node_pointers.size(); ++i) { 00498 m_node_pointers_rank[i] = m_tree_rank(m_node_pointers[i]); 00499 } 00500 util::init_support(m_tree_select0, &m_tree); 00501 util::init_support(m_tree_select1, &m_tree); 00502 } 00503 } 00504 00505 template<class size_type_class> 00506 wt(int_vector_file_buffer<8, size_type_class>& rac, size_type size):m_size(size), m_sigma(0), sigma(m_sigma) { 00507 construct(rac, size); 00508 } 00509 00511 00514 template<class size_type_class> 00515 void construct(int_vector_file_buffer<8, size_type_class>& rac, size_type size) { 00516 m_size = size; 00517 init_char_node_map(); 00518 typedef int_vector_file_buffer<8, size_type_class> tIVFB; 00519 //#define SDSL_DEBUG_WT 00520 #ifdef SDSL_DEBUG_WT 00521 std::cerr<<"Wavelet Tree construct for int_vector_file_buffer: size="<<size<<std::endl; 00522 stop_watch sw; sw.start(); 00523 #endif 00524 00525 // std::cerr << wt_trait<RandomAccessContainer>::char_node_map_size << std::endl; 00526 // calculate alphabet size and the mappings for the symbols to the integers and back 00527 m_sigma = wt_trait<tIVFB>::alphabet_size_and_map(rac, m_size, m_char_map, m_inv_char_map, m_first_symbol); 00528 00529 int_vector<64> node_sizes = int_vector<64>(2*m_sigma+1, 0/*, bit_magic::l1BP(m_size)+1*/); 00530 m_node_pointers = int_vector<64>(node_sizes.size()+1, 0); 00531 m_node_pointers_rank = int_vector<64>(node_sizes.size()+1, 0); 00532 00533 if (m_sigma < 2) { 00534 if (1 == m_sigma) { // handle special case more efficient 00535 m_char_node_map[m_first_symbol] = 0; // map the first symbol to the root node of the wavelet tree 00536 } 00537 } else { 00538 // O(n + |\Sigma|\log|\Sigma|) algorithm for calculating node sizes 00539 size_type C[256] = {0}; 00540 rac.reset(); 00541 // 1. Count occurrences of characters 00542 for (size_type i=0, r_sum=0, r = rac.load_next_block(); r_sum < m_size;) { 00543 if (r_sum + r > m_size) { // read not more than size chars in the next loop 00544 r = m_size-r_sum; 00545 } 00546 for (; i < r_sum+r; ++i) { 00547 ++C[rac[i-r_sum]]; 00548 } 00549 r_sum += r; r = rac.load_next_block(); 00550 } 00551 // 2. Sum up the node sizes for each character 00552 for (size_type i=0; i < 256; ++i) { 00553 if (C[i]) { 00554 size_type lex_idx = m_char_map[i]; // lex_idx in [0..m_sigma-1] 00555 size_type sigma = m_sigma, node=0; 00556 while (sigma >= 2) { 00557 assert(node<node_sizes.size()); 00558 node_sizes[node] = node_sizes[node]+C[i]; 00559 if (lex_idx < ((sigma+1)>>1)) { 00560 sigma = ((sigma+1)>>1); 00561 node = (node<<1)+1; 00562 } else { 00563 lex_idx -= ((sigma+1)>>1); 00564 sigma -= ((sigma+1)>>1); 00565 node = (node<<1)+2; 00566 } 00567 } 00568 m_char_node_map[i] = node; 00569 } else { 00570 m_char_node_map[i] = 0; 00571 } 00572 } 00573 #ifdef SDSL_DEBUG_WT 00574 sw.stop(); 00575 std::cerr<<"Time for counting phase: "<< sw.get_real_time() << " ms real time , "<< sw.get_user_time()<<" ms user time"<< std::endl; 00576 // std::cerr<<"wavelet tree node sizes"<<std::endl; 00577 // for(size_type i=0; i<node_sizes.size(); ++i){ std::cerr<<node_sizes[i]<<" "; } std::cerr << std::endl; 00578 #endif 00579 size_type max_node = 0; 00580 m_node_pointers[0] = 0; 00581 for (size_type i=1; i < m_node_pointers.size(); ++i) { 00582 m_node_pointers[i] = m_node_pointers[i-1] + node_sizes[i-1]; 00583 if (node_sizes[i-1]!=0) 00584 max_node = i-1; 00585 } 00586 node_sizes = int_vector<64>(max_node+1, 0); 00587 #ifdef SDSL_DEBUG_WT 00588 std::cerr<<"all nodes size = "<<m_node_pointers[max_node+1]<<std::endl; 00589 sw.start(); 00590 #endif 00591 // initialize bit vector with 0's 00592 bit_vector tree = bit_vector(m_node_pointers[max_node+1], 0); 00593 // precalc paths in the tree for all symbols in the alphabet 00594 uint8_t path_len[256] = {0}; 00595 uint8_t path[256] = {0}; 00596 for (size_type i=0; i < 256; ++i) { 00597 if (C[i]) { 00598 size_type lex_idx = m_char_map[i]; // lex_idx in [0..m_sigma-1] 00599 size_type sigma = m_sigma, depth=0; 00600 while (sigma >= 2) { 00601 if (lex_idx < ((sigma+1)>>1)) { 00602 sigma = ((sigma+1)>>1); 00603 } else { 00604 lex_idx -= ((sigma+1)>>1); 00605 sigma -= ((sigma+1)>>1); 00606 path[i] |= (1<<depth); 00607 } 00608 ++path_len[i]; 00609 ++depth; 00610 } 00611 } 00612 } 00614 rac.reset(); 00615 for (size_type i=0, r_sum=0, r = rac.load_next_block(); r_sum < m_size;) { 00616 if (r_sum + r > size) { // read not more than size chars in the next loop 00617 r = size-r_sum; 00618 } 00619 uint8_t old_chr = rac[i-r_sum]; 00620 uint8_t times = 0; 00621 for (; i < r_sum+r; ++i) { 00622 uint8_t chr = rac[i-r_sum]; 00623 if (chr != old_chr) { 00624 uint8_t p = path[old_chr]; 00625 for (uint32_t l=0, node=0; l<path_len[old_chr]; ++l, p >>= 1) { 00626 if (p&1) { 00627 tree.set_int(m_node_pointers[node]+node_sizes[node], 0xFFFFFFFFFFFFFFFFULL,times); 00628 node_sizes[node] += times; node = (node<<1)+2; 00629 } else { 00630 node_sizes[node] += times; node = (node<<1)+1; 00631 } 00632 } 00633 times = 1; 00634 old_chr = chr; 00635 } else { // chr == old_chr 00636 ++times; 00637 if (times == 64) { 00638 uint8_t p = path[old_chr]; 00639 for (uint32_t l=0, node=0; l<path_len[old_chr]; ++l, p >>= 1) { 00640 if (p&1) { 00641 tree.set_int(m_node_pointers[node]+node_sizes[node], 0xFFFFFFFFFFFFFFFFULL,times); 00642 node_sizes[node] += times; node = (node<<1)+2; 00643 } else { 00644 node_sizes[node] += times; node = (node<<1)+1; 00645 } 00646 } 00647 times = 0; 00648 } 00649 } 00650 } 00651 if (times > 0) { 00652 uint8_t p = path[old_chr]; 00653 for (uint32_t l=0, node=0; l<path_len[old_chr]; ++l, p >>= 1) { 00654 if (p&1) { 00655 tree.set_int(m_node_pointers[node]+node_sizes[node], 0xFFFFFFFFFFFFFFFFULL,times); 00656 node_sizes[node] += times; node = (node<<1)+2; 00657 } else { 00658 node_sizes[node] += times; node = (node<<1)+1; 00659 } 00660 } 00661 } 00662 r_sum += r; r = rac.load_next_block(); 00663 } 00664 //*/ 00665 #ifdef SDSL_DEBUG_WT 00666 sw.stop(); 00667 std::cerr<<"Time for building phase: "<< sw.get_real_time() << " ms real time , "<< sw.get_user_time()<<" ms user time"<< std::endl; 00668 sw.start(); 00669 #endif 00670 util::assign(m_tree, tree); 00671 util::init_support(m_tree_rank,&m_tree); 00672 #ifdef SDSL_DEBUG_WT 00673 sw.stop(); 00674 std::cerr<<"Time for rank init: "<< sw.get_real_time() << " ms real time , "<< sw.get_user_time()<<" ms user time"<< std::endl; 00675 sw.start(); 00676 #endif 00677 for (size_type i=0; i < m_node_pointers.size(); ++i) { 00678 m_node_pointers_rank[i] = m_tree_rank(m_node_pointers[i]); 00679 } 00680 util::init_support(m_tree_select0,&m_tree); 00681 #ifdef SDSL_DEBUG_WT 00682 std::cerr<<"select0 init ready!"<<std::endl; 00683 #endif 00684 util::init_support(m_tree_select1,&m_tree); 00685 #ifdef SDSL_DEBUG_WT 00686 sw.stop(); 00687 std::cerr<<"Time for select init: "<< sw.get_real_time() << " ms real time , "<< sw.get_user_time()<<" ms user time"<< std::endl; 00688 sw.start(); 00689 #endif 00690 } 00691 } 00692 00694 wt(const wt& wt):sigma(m_sigma) { 00695 copy(wt); 00696 } 00697 00699 wt& operator=(const wt& wt) { 00700 if (this != &wt) { 00701 copy(wt); 00702 } 00703 return *this; 00704 } 00705 00707 void swap(wt& wt) { 00708 if (this != &wt) { 00709 std::swap(m_size, wt.m_size); 00710 std::swap(m_sigma, wt.m_sigma); 00711 m_tree.swap(wt.m_tree); 00712 util::swap_support(m_tree_rank, wt.m_tree_rank, &m_tree, &(wt.m_tree)); 00713 util::swap_support(m_tree_select1, wt.m_tree_select1, &m_tree, &(wt.m_tree)); 00714 util::swap_support(m_tree_select0, wt.m_tree_select0, &m_tree, &(wt.m_tree)); 00715 m_node_pointers.swap(wt.m_node_pointers); 00716 m_node_pointers_rank.swap(wt.m_node_pointers_rank); 00717 std::swap(m_first_symbol, wt.m_first_symbol); 00718 m_char_map.swap(wt.m_char_map); 00719 m_inv_char_map.swap(wt.m_inv_char_map); 00720 // swap char_node_map 00721 if (wt_trait<RandomAccessContainer>::char_node_map_size == 256) { 00722 for (size_type i=0; i<256; ++i) { 00723 std::swap(m_char_node_map[i],wt.m_char_node_map[i]); 00724 } 00725 } 00726 } 00727 } 00728 00730 size_type size()const { 00731 return m_size; 00732 } 00733 00735 bool empty()const { 00736 return m_size == 0; 00737 } 00738 00740 00743 value_type operator[](size_type i)const { 00744 size_type lex_idx = 0; 00745 size_type sigma = m_sigma; 00746 size_type node = 0; 00747 while (sigma >= 2) { 00748 if (m_tree[ m_node_pointers[node]+i ]) { // go to the right child 00749 lex_idx += (sigma+1)/2; 00750 i = m_tree_rank(m_node_pointers[node]+i) - m_node_pointers_rank[node]; //m_tree_rank( m_node_pointers[node] ); 00751 node = 2*node+2; 00752 sigma -= (sigma+1)/2; 00753 } else { // go to the left child 00754 i = i - (m_tree_rank(m_node_pointers[node]+i) - m_node_pointers_rank[node]); //m_tree_rank( m_node_pointers[node] ) ); 00755 node = 2*node+1; 00756 sigma = (sigma+1)/2; 00757 } 00758 } 00759 return m_inv_char_map[lex_idx]; 00760 }; 00761 00763 00770 size_type rank(size_type i, value_type c)const { 00771 if (!wt_trait<RandomAccessContainer>::symbol_available(m_char_map, c, m_first_symbol, m_sigma)) { 00772 return 0; 00773 } 00774 size_type lex_idx = m_char_map[c]; // koennte man auch nur path, path_len ersetzen 00775 size_type sigma = m_sigma; 00776 size_type node = 0; 00777 size_type result = i; 00778 while (sigma >= 2 and result > 0) { 00779 if (lex_idx < (sigma+1)/2) { 00780 result = result - (m_tree_rank(m_node_pointers[node]+result) - m_node_pointers_rank[node]); //m_tree_rank(m_node_pointers[node])); 00781 sigma = (sigma+1)/2; 00782 node = 2*node+1; 00783 } else { 00784 result = m_tree_rank(m_node_pointers[node]+result) - m_node_pointers_rank[node]; // m_tree_rank(m_node_pointers[node]); 00785 lex_idx -= (sigma+1)/2; 00786 sigma -= (sigma+1)/2; 00787 node = 2*node+2; 00788 } 00789 } 00790 return result; 00791 }; 00792 00794 00801 size_type rank_ith_symbol(size_type i, value_type& c)const { 00802 assert(i>=0 and i < size()); 00803 size_type lex_idx = 0; 00804 size_type sigma = m_sigma; 00805 size_type node = 0; 00806 while (sigma >= 2) { 00807 if (m_tree[ m_node_pointers[node]+i ]) { // go to the right child 00808 lex_idx += (sigma+1)/2; 00809 i = m_tree_rank(m_node_pointers[node]+i) - m_node_pointers_rank[node]; //m_tree_rank( m_node_pointers[node] ); 00810 node = 2*node+2; 00811 sigma -= (sigma+1)/2; 00812 } else { // go to the left child 00813 i = i - (m_tree_rank(m_node_pointers[node]+i) - m_node_pointers_rank[node]); //m_tree_rank( m_node_pointers[node] ) ); 00814 node = 2*node+1; 00815 sigma = (sigma+1)/2; 00816 } 00817 } 00818 c = m_inv_char_map[lex_idx]; 00819 return i; 00820 } 00821 00822 // recursive internal version of the method interval_symbols 00823 void _interval_symbols(size_type i, size_type j, size_type& k, 00824 std::vector<unsigned char>& cs, 00825 std::vector<size_type>& rank_c_i, 00826 std::vector<size_type>& rank_c_j, 00827 size_type lex_idx, size_type sigma, size_type node) const { 00828 // not in a leaf 00829 if (sigma >= 2) { 00830 size_type i_new = (m_tree_rank(m_node_pointers[node] + i) - m_node_pointers_rank[node]); 00831 size_type j_new = (m_tree_rank(m_node_pointers[node] + j) - m_node_pointers_rank[node]); 00832 // goto left child 00833 i -= i_new; j -= j_new; 00834 if (i != j) { 00835 _interval_symbols(i, j, k, cs, rank_c_i, rank_c_j, lex_idx, (sigma+1)>>1, (node<<1)|1 /*2*node+1*/); 00836 } 00837 // goto right child 00838 if (i_new != j_new) { 00839 _interval_symbols(i_new, j_new, k, cs, rank_c_i, rank_c_j, lex_idx + ((sigma+1)>>1), sigma-((sigma+1)>>1), (node+1)<<1 /*2*node+2*/); 00840 } 00841 } else { 00842 rank_c_i[k] = i; 00843 rank_c_j[k] = j; 00844 cs[k++] = m_inv_char_map[lex_idx]; 00845 return; 00846 } 00847 } 00848 00850 /* If the character c does not occur in the sequence 0 is returned. 00851 * 00852 */ 00853 size_type count_lex_smaller(size_type i, value_type c)const { 00854 if (!wt_trait<RandomAccessContainer>::symbol_available(m_char_map, c, m_first_symbol, m_sigma)) { 00855 return 0; 00856 } 00857 size_type lex_idx = m_char_map[c]; 00858 size_type sigma = m_sigma; // start with the whole alphabet 00859 size_type node = 0; 00860 size_type result = 0; 00861 while (sigma >= 2) { 00862 if (lex_idx < (sigma+1)/2) { // symbols belongs to the left half of the alphabet 00863 // calculate new i for the left child bit_vector 00864 i = i - (m_tree_rank(m_node_pointers[node]+i) - m_node_pointers_rank[node]); 00865 sigma = (sigma+1)/2; 00866 node = 2*node+1; 00867 } else { // symbol belongs to the right half of the alphabet 00868 size_type ones = m_tree_rank(m_node_pointers[node]+i) - m_node_pointers_rank[node]; 00869 result += (i - ones); // all elements prefixed with 0 are lexicographic smaller than c 00870 i = ones; 00871 // calclate new i for the right child bit_vector 00872 lex_idx -= (sigma+1)/2; 00873 sigma -= (sigma+1)/2; 00874 node = 2*node+2; 00875 } 00876 } 00877 return result; 00878 } 00879 00881 size_type count_lex_smaller(size_type i, size_type j, value_type c)const { 00882 if (i==j) 00883 return 0; 00884 if (i+1 == j) { 00885 return (*this)[i] < c; 00886 } else { 00887 return count_lex_smaller(j, c) - count_lex_smaller(i, c); 00888 } 00889 } 00890 00892 00908 void interval_symbols(size_type i, size_type j, size_type& k, 00909 std::vector<unsigned char>& cs, 00910 std::vector<size_type>& rank_c_i, 00911 std::vector<size_type>& rank_c_j) const { 00912 if (i==j) { 00913 k = 0; 00914 return; 00915 } else if ((j-i)==1) { 00916 k = 1; 00917 rank_c_i[0] = rank_ith_symbol(i, cs[0]); 00918 rank_c_j[0] = rank_c_i[0]+1; 00919 return; 00920 } else if ((j-i)==2) { 00921 rank_c_i[0] = rank_ith_symbol(i, cs[0]); 00922 rank_c_i[1] = rank_ith_symbol(i+1, cs[1]); 00923 if (cs[0]==cs[1]) { 00924 k = 1; 00925 rank_c_j[0] = rank_c_i[0]+2; 00926 return; 00927 } else { 00928 k = 2; 00929 rank_c_j[0] = rank_c_i[0]+1; 00930 rank_c_j[1] = rank_c_i[1]+1; 00931 return; 00932 } 00933 } else { 00934 k = 0; 00935 _interval_symbols(i, j, k, cs, rank_c_i, rank_c_j, 0, m_sigma, 0); 00936 } 00937 } 00938 00939 00941 00947 // TODO: was ist wenn c gar nicht vorkommt, oder es keine i Vorkommen gibt? 00948 size_type select(size_type i, value_type c)const { 00949 if (!wt_trait<RandomAccessContainer>::symbol_available(m_char_map, c, m_first_symbol, m_sigma)) { 00950 return size(); 00951 } 00952 assert(i > 0); 00953 #ifdef SDSL_DEBUG_WT 00954 std::cerr<<"wt: select("<<i<<", "<<c<<")"<<std::endl; 00955 std::cerr<<" i="<<i<< " <= "<< rank(size(),c) <<" ?" <<std::endl; 00956 #endif 00957 assert(i <= rank(size(), c)); 00958 00959 size_type node = 0; 00960 // first phase: go down to the node corresponding to the lex_idx of the character 00961 if (wt_trait<RandomAccessContainer>::char_node_map_size == 256) { 00962 node = m_char_node_map[c]; 00963 } else { 00964 size_type lex_idx = m_char_map[c]; 00965 size_type sigma = m_sigma; 00966 while (sigma >= 2) { 00967 if (lex_idx < (sigma+1)/2) { 00968 sigma = (sigma+1)/2; 00969 node = 2*node+1; 00970 } else { 00971 lex_idx -= (sigma+1)/2; 00972 sigma -= (sigma+1)/2; 00973 node = 2*node+2; 00974 } 00975 } 00976 } 00977 // second phase go up and select the right position 00978 size_type result = i-1; 00979 while (node != 0) { 00980 if (node&1) {// node is a left child 00981 node = (node-1)/2; 00982 result = m_tree_select0(m_node_pointers[node]-m_node_pointers_rank[node]+result+1)-m_node_pointers[node]; 00983 } else { //node is a right child 00984 node = (node-1)/2; 00985 result = m_tree_select1(m_node_pointers_rank[node]+result+1)-m_node_pointers[node]; 00986 } 00987 } 00988 return result; 00989 }; 00990 00992 void range_search_2d(size_type lb, size_type rb, value_type c1, value_type c2, std::vector<size_type>& result) const { 00993 size_type lex_idx_1 = m_char_map[c1]; 00994 size_type lex_idx_2 = m_char_map[c2]; 00995 _range_search_2d(0, lb, rb, lex_idx_1, lex_idx_2, m_sigma, result); 00996 } 00997 00998 // add parameter path 00999 void _range_search_2d(size_type node, size_type lb, size_type rb, size_type lex_idx_1, size_type lex_idx_2, size_type sigma, std::vector<size_type>& result)const { 01000 // std::cerr<<"lb="<<lb<<" rb="<<rb<<" lex_idx_1="<<lex_idx_1<<" lex_idx_2="<<lex_idx_2<<" sigma="<<sigma<<" node="<<node<<std::endl; 01001 // [lex_idx_1..lex_idx_2] in [0..sigma] 01002 if (lb > rb) 01003 return; 01004 if (sigma <= 1) { 01005 // result[i]... 01006 size_type r = 0; 01007 while (node != 0) { 01008 if (node&1) {// node is a left child 01009 node = (node-1)/2; 01010 size_type node_idx = m_node_pointers[node]; 01011 r = m_tree_select0(node_idx-m_tree_rank(node_idx)+r+1)-node_idx; 01012 } else { //node is a right child 01013 node = (node-1)/2; 01014 r = m_tree_select1(m_tree_rank(m_node_pointers[node])+r+1)-m_node_pointers[node]; 01015 } 01016 } 01017 result.push_back(r); 01018 return; 01019 } 01020 size_type lex_mid = (sigma+1)/2; 01021 size_type lsigma = lex_mid; 01022 size_type rsigma = sigma - lex_mid; 01023 01024 size_type ones_lb_1 = m_tree_rank(m_node_pointers[node]+lb) - m_node_pointers_rank[node]; // ones in [0,lb) 01025 size_type ones_rb = m_tree_rank(m_node_pointers[node]+rb+1) - m_node_pointers_rank[node]; // ones in [0,rb] 01026 size_type zeros_lb_1 = lb-ones_lb_1; // zeros in [0,lb) 01027 size_type zeros_rb = rb+1-ones_rb; // zeros in [0,rb] 01028 01029 // std::cerr<<"ones_lb_1="<<ones_lb_1<<" ones_rb="<<ones_rb<<std::endl; 01030 // std::cerr<<"zeros_lb_1="<<zeros_lb_1<<" zeros_rb="<<zeros_rb<<std::endl; 01031 01032 if (lex_idx_1 < lex_mid and zeros_rb) { 01033 _range_search_2d(2*node+1, zeros_lb_1, zeros_rb-1, lex_idx_1, std::min(lex_idx_2,lex_mid-1), lsigma, result); 01034 } 01035 01036 if (lex_idx_2 >= lex_mid and ones_rb) { 01037 size_type _lex_idx_1 = 0; 01038 if (lex_idx_1 > lex_mid) 01039 _lex_idx_1 = lex_idx_1 - lex_mid; 01040 _range_search_2d(2*node+2, ones_lb_1, ones_rb-1, _lex_idx_1, lex_idx_2-lex_mid, rsigma, result); 01041 } 01042 } 01043 01045 size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const { 01046 structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); 01047 size_type written_bytes = 0; 01048 written_bytes += util::write_member(m_size, out, child, "size"); 01049 written_bytes += util::write_member(m_sigma, out, child, "sigma"); 01050 written_bytes += m_tree.serialize(out, child, "tree"); 01051 written_bytes += m_tree_rank.serialize(out, child, "tree_rank"); 01052 written_bytes += m_tree_select1.serialize(out, child, "tree_select_1"); 01053 written_bytes += m_tree_select0.serialize(out, child, "tree_select_0"); 01054 written_bytes += m_node_pointers.serialize(out, child, "node_pointers"); 01055 written_bytes += m_node_pointers_rank.serialize(out, child, "node_pointers_rank"); 01056 written_bytes += wt_trait<RandomAccessContainer>::serialize_maps(out, m_char_map, m_inv_char_map, child, "alphabet_mapping"); 01057 written_bytes += util::write_member(m_first_symbol, out, child, "first symbol"); 01058 // serialize char_node_map 01059 if (wt_trait<RandomAccessContainer>::char_node_map_size == 256) { 01060 size_type written_bytes2 = 0; 01061 for (size_type i=0; i<256; ++i) { 01062 written_bytes2 += util::write_member(m_char_node_map[i], out); 01063 } 01064 structure_tree_node* child2 = structure_tree::add_child(child, "char_node_map", "char_map"); 01065 structure_tree::add_size(child2, written_bytes2); 01066 written_bytes += written_bytes2; 01067 } 01068 structure_tree::add_size(child, written_bytes); 01069 return written_bytes; 01070 } 01071 01073 void load(std::istream& in) { 01074 util::read_member(m_size, in); 01075 util::read_member(m_sigma, in); 01076 m_tree.load(in); 01077 m_tree_rank.load(in, &m_tree); 01078 m_tree_select1.load(in, &m_tree); 01079 m_tree_select0.load(in, &m_tree); 01080 m_node_pointers.load(in); 01081 m_node_pointers_rank.load(in); 01082 wt_trait<RandomAccessContainer>::load_maps(in, m_char_map, m_inv_char_map); 01083 util::read_member(m_first_symbol, in); 01084 // serialize char_node_map 01085 if (wt_trait<RandomAccessContainer>::char_node_map_size == 256) { 01086 for (size_type i=0; i<256; ++i) { 01087 util::read_member(m_char_node_map[i], in); 01088 } 01089 } 01090 } 01091 }; 01092 01093 }// end namespace sds 01094 01095 #endif // end file