SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
 All Classes Namespaces Files Functions Variables Typedefs Friends
sdsl/include/sdsl/wt.hpp
Go to the documentation of this file.
00001 /* sdsl - succinct data structures library
00002     Copyright (C) 2009 Simon Gog
00003 
00004     This program is free software: you can redistribute it and/or modify
00005     it under the terms of the GNU General Public License as published by
00006     the Free Software Foundation, either version 3 of the License, or
00007     (at your option) any later version.
00008 
00009     This program is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012     GNU General Public License for more details.
00013 
00014     You should have received a copy of the GNU General Public License
00015     along with this program.  If not, see http://www.gnu.org/licenses/ .
00016 */
00023 #ifndef INCLUDED_SDSL_WT
00024 #define INCLUDED_SDSL_WT
00025 
00026 #include "int_vector.hpp"
00027 #include "bitmagic.hpp"
00028 #include "util.hpp" // for util::ssign
00029 #include <set> // for calculating the alphabet size
00030 #include <map> // for mapping a symbol to its lexicographical index
00031 #include <algorithm> // for std::swap
00032 #include <stdexcept>
00033 #include <vector>
00034 
00035 #ifdef SDSL_DEBUG
00036 #define SDSL_DEBUG_WT
00037 #endif
00038 
00039 
00040 //#define SDSL_DEBUG_WT
00041 
00042 #ifdef SDSL_DEBUG_WT
00043 #include "testutils.hpp"
00044 #endif
00045 
00047 namespace sdsl
00048 {
00049 
00050 struct unsigned_char_map {
00051     unsigned char m_map[256];
00052 
00053     unsigned char& operator[](unsigned char i) {
00054         return *(m_map+i);
00055     }
00056 
00057     unsigned char operator[](unsigned char i)const {
00058         return m_map[i];
00059     }
00060 
00061     void clear() {
00062         for (uint16_t i=0; i<256; ++i)
00063             m_map[i] = 0;
00064     }
00065 
00066     uint16_t serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const {
00067         structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
00068         uint16_t written_bytes = 0;
00069         for (uint16_t i=0; i<256; ++i) {
00070             out.write((char*)&m_map[i], sizeof(m_map[i]));
00071             written_bytes += sizeof(m_map[256]);
00072         }
00073         structure_tree::add_size(child, written_bytes);
00074         return written_bytes;
00075     }
00076 
00077     void load(std::istream& in) {
00078         for (uint16_t i=0; i<256; ++i) {
00079             in.read((char*) &m_map[i], sizeof(m_map[i]));
00080         }
00081     }
00082 
00083     void swap(unsigned_char_map& map) {
00084         if (this != &map) {
00085             for (uint16_t i=0; i<256; ++i) {
00086                 std::swap(m_map[i], map.m_map[i]);
00087             }
00088         }
00089     }
00090 };
00091 
00092 
00093 template<class RandomAccessContainer>
00094 class wt_trait
00095 {
00096     public:
00097         typedef typename RandomAccessContainer::size_type               size_type;
00098         typedef typename RandomAccessContainer::value_type              value_type;
00099         typedef RandomAccessContainer&                                                  reference_type;
00100         typedef std::map<value_type, size_type>                                 map_type;
00101         typedef std::map<value_type, size_type>                                 inv_map_type;
00102         enum { char_node_map_size=0 };
00103 
00104         static size_type alphabet_size_and_map(const reference_type rac, size_type n, map_type& map, inv_map_type& inv_map, value_type& first_symbol) {
00105             if (n > 0)
00106                 first_symbol = rac[0];
00107             map.clear();
00108             inv_map.clear();
00109             size_type alphabet_size = 0;
00110             for (size_type i=0; i<n; ++i) {
00111                 if (map.find(rac[i]) == map.end()) {
00112                     map[rac[i]] = 1;
00113                 }
00114             }
00115             for (typename map_type::iterator it = map.begin(); it != map.end(); ++it) { // this preserves the order
00116                 it->second = alphabet_size;
00117                 inv_map[alphabet_size] = it->first;
00118                 ++alphabet_size;
00119             }
00120             return alphabet_size;
00121         }
00122 
00123         static bool symbol_available(const map_type& map, const value_type c, const value_type first_symbol, const size_type) {
00124             return map.find(c)!=map.end();
00125         }
00126 
00127         static size_type serialize_maps(std::ostream& out, const map_type& map, const inv_map_type& inv_map, structure_tree_node* v=NULL, std::string name="") {
00128             throw std::logic_error(util::demangle(typeid(wt_trait<RandomAccessContainer>).name())+": serialize not implemented");
00129             return 0;
00130         }
00131 
00132         static size_type load_maps(std::istream& in, map_type& map, inv_map_type& inv_map) {
00133             throw std::logic_error(util::demangle(typeid(wt_trait<RandomAccessContainer>).name())+": load not implemented");
00134             return 0;
00135         }
00136 };
00137 
00138 template<class character>
00139 class wt_trait<character*>
00140 {
00141     public:
00142         typedef bit_vector::size_type                   size_type;
00143         typedef character                                               value_type;
00144         typedef character*                                              reference_type;
00145         typedef std::map<value_type, size_type> map_type;
00146         typedef std::map<value_type, size_type> inv_map_type;
00147         enum { char_node_map_size=256 };
00148 
00149         static size_type alphabet_size_and_map(const reference_type rac, size_type n, map_type& map, inv_map_type& inv_map, value_type& first_symbol) {
00150             if (n > 0)
00151                 first_symbol = *(rac+0);
00152             map.clear();
00153             inv_map.clear();
00154             size_type alphabet_size = 0;
00155             for (size_type i=0; i<n; ++i) {
00156                 if (map.find(*(rac+i)) == map.end()) {
00157                     map[*(rac+i)]       = 1;
00158                 }
00159             }
00160 
00161             for (typename map_type::iterator it = map.begin(); it != map.end(); ++it) { // this preserves the order
00162                 it->second = alphabet_size;
00163                 inv_map[alphabet_size] = it->first;
00164                 ++alphabet_size;
00165             }
00166             return alphabet_size;
00167         }
00168 
00169         static bool symbol_available(const map_type& map, const value_type c, const value_type first_symbol, const size_type) {
00170             return map.find(c)!=map.end();
00171         }
00172 
00173         static size_type serialize_maps(std::ostream& out, const map_type& map, const inv_map_type& inv_map, structure_tree_node* v=NULL, std::string name="") {
00174             throw std::logic_error(util::demangle(typeid(wt_trait<character*>).name())+": serialize not implemented");
00175             return 0;
00176         }
00177 
00178         static void load_maps(std::istream& in, map_type& map, inv_map_type& inv_map) {
00179             throw std::logic_error(util::demangle(typeid(wt_trait<character*>).name())+": load not implemented");
00180         }
00181 };
00182 
00183 template<>
00184 class wt_trait<unsigned char*>
00185 {
00186     public:
00187         typedef bit_vector::size_type           size_type;
00188         typedef unsigned char                           value_type;
00189         typedef unsigned char*                          reference_type;
00190         typedef unsigned_char_map                       map_type;
00191         typedef unsigned_char_map                       inv_map_type;
00192         enum { char_node_map_size=256 };
00193 
00194         static size_type alphabet_size_and_map(const reference_type rac, size_type n, map_type& map, inv_map_type& inv_map, value_type& first_symbol) {
00195             map.clear();
00196             inv_map.clear();
00197             if (n==0) {
00198                 for (size_type i=0; i<256; ++i) {
00199                     map[i] = 255;    // mark each symbol as absent
00200                 }
00201                 return 0;
00202             }
00203             first_symbol        = *rac;
00204             map[*rac] = 0;
00205             inv_map[0] = *rac;
00206             size_type alphabet_size = 0;
00207 
00208             for (size_type i=0; i<256; ++i) {
00209                 map[i] = 0;
00210             }
00211 
00212             for (size_type i=0; i<n; ++i) {
00213                 value_type c = *(rac+i);
00214                 map[c] = 1;
00215             }
00216 
00217             for (size_type i=0; i<256; ++i) {
00218                 if (map[i]) {
00219                     map[i] = alphabet_size;
00220                     ++alphabet_size;
00221                 } else {
00222                     map[i] = 255;
00223                 }
00224                 inv_map[map[i]] = i;
00225             }
00226             return alphabet_size;
00227         }
00228 
00229         static bool symbol_available(const map_type& map, const value_type c, const value_type first_symbol, const size_type sigma) {
00230             return sigma==256 or map[c] < 255;
00231         }
00232 
00233         static size_type serialize_maps(std::ostream& out, const map_type& map, const inv_map_type& inv_map, structure_tree_node* v=NULL, std::string name="") {
00234             size_type written_bytes = 0;
00235             written_bytes += map.serialize(out, v, "alphabet_map");
00236             written_bytes += inv_map.serialize(out, v, "inverse_alphabet_map");
00237             return written_bytes;
00238         }
00239 
00240         static void load_maps(std::istream& in, map_type& map, inv_map_type& inv_map) {
00241             map.load(in);
00242             inv_map.load(in);
00243         }
00244 };
00245 
00246 template<class size_type_class>
00247 class wt_trait<int_vector_file_buffer<8, size_type_class> >
00248 {
00249     public:
00250         typedef size_type_class         size_type;
00251         typedef unsigned char           value_type;
00252         typedef int_vector_file_buffer<8, size_type_class>&             reference_type;
00253         typedef unsigned_char_map       map_type;
00254         typedef unsigned_char_map       inv_map_type;
00255         enum { char_node_map_size=256 };
00256 
00257         static size_type alphabet_size_and_map(reference_type rac, size_type n, map_type& map, inv_map_type& inv_map, value_type& first_symbol) {
00258             map.clear();
00259             inv_map.clear();
00260             if (n==0) {
00261                 for (size_type i=0; i<256; ++i) {
00262                     map[i] = 255;    // mark each symbol as absent
00263                 }
00264                 return 0;
00265             }
00266             rac.reset();
00267             if (rac.int_vector_size < n) {
00268                 throw std::logic_error("wt<int_vector_file_buffer<8> >: n > rac.int_vector_size!");
00269                 return 0;
00270             }
00271 
00272             for (size_type i=0; i<256; ++i) {
00273                 map[i] = 0;
00274             }
00275 
00276             size_type alphabet_size = 0;
00277             size_type r = rac.load_next_block();
00278             first_symbol = rac[0];
00279             for (size_type i=0, r_sum=0; r_sum < n;) {
00280                 if (r_sum +r > n) {  // make sure that not more than n characters are read
00281                     r = n-r_sum;
00282                 }
00283                 for (; i< r_sum+r; ++i) {
00284                     value_type c = rac[i-r_sum];
00285                     map[c] = 1;
00286                 }
00287                 r_sum += r; r = rac.load_next_block();
00288             }
00289 
00290             for (size_type i=0; i<256; ++i) {
00291                 if (map[i]) {
00292                     map[i] = alphabet_size;
00293                     ++alphabet_size;
00294                 } else {
00295                     map[i] = 255;
00296                 }
00297                 inv_map[map[i]] = i;
00298             }
00299             return alphabet_size;
00300         }
00301 
00302         static bool symbol_available(const map_type& map, const value_type c, const value_type first_symbol, const size_type sigma) {
00303             return sigma==256 or map[c] < 255;
00304         }
00305 
00306         static size_type serialize_maps(std::ostream& out, const map_type& map, const inv_map_type& inv_map, structure_tree_node* v=NULL, std::string name="") {
00307             size_type written_bytes = 0;
00308             written_bytes += map.serialize(out, v, "alphabet_map");
00309             written_bytes += inv_map.serialize(out, v, "inverse_alphabet_map");
00310             return written_bytes;
00311         }
00312 
00313         static void load_maps(std::istream& in, map_type& map, inv_map_type& inv_map) {
00314             map.load(in);
00315             inv_map.load(in);
00316         }
00317 };
00318 
00320 
00341 template<class RandomAccessContainer=unsigned char*,
00342          class BitVector         = bit_vector,
00343          class RankSupport       = typename BitVector::rank_1_type,
00344          class SelectSupport     = typename BitVector::select_1_type,
00345          class SelectSupportZero = typename BitVector::select_0_type>
00346 class wt
00347 {
00348     public:
00349         typedef typename wt_trait<RandomAccessContainer>::size_type             size_type;
00350         typedef typename wt_trait<RandomAccessContainer>::value_type            value_type;
00351         typedef typename wt_trait<RandomAccessContainer>::map_type                   map_type;
00352         typedef typename wt_trait<RandomAccessContainer>::inv_map_type  inv_map_type;
00353     private:
00354         size_type                       m_size;
00355         size_type                       m_sigma;                //<- \f$ |\Sigma| \f$
00356         BitVector                       m_tree;                 // bit vector to store the wavelet tree
00357         RankSupport                     m_tree_rank;    // rank support for the wavelet tree bit vector
00358         SelectSupport           m_tree_select1; // select support for the wavelet tree bit vector
00359         SelectSupportZero       m_tree_select0;
00360         int_vector<64>          m_node_pointers;
00361         int_vector<64>          m_node_pointers_rank;
00362         size_type                       m_char_node_map[wt_trait<RandomAccessContainer>::char_node_map_size];
00363 
00364         value_type                      m_first_symbol;
00365 
00366         mutable map_type                        m_char_map;             // map the characters to integers; we use mutable as the std::map has no const version of the []-operator
00367         mutable inv_map_type            m_inv_char_map; // map the integers back to characters; we use mutable " "  "  " "  "  "  "  "  "  "  "  "  "  "    "
00368 
00369         void copy(const wt& wt) {
00370             m_size                      = wt.m_size;
00371             m_sigma             = wt.m_sigma;
00372             m_tree                      = wt.m_tree;
00373             m_tree_rank         = wt.m_tree_rank;
00374             m_tree_rank.set_vector(&m_tree);
00375             m_tree_select1      = wt.m_tree_select1;
00376             m_tree_select1.set_vector(&m_tree);
00377             m_tree_select0      = wt.m_tree_select0;
00378             m_tree_select0.set_vector(&m_tree);
00379             m_node_pointers     = wt.m_node_pointers;
00380             m_node_pointers_rank = wt.m_node_pointers_rank;
00381             m_first_symbol      = wt.m_first_symbol;
00382             m_char_map          = wt.m_char_map;
00383             m_inv_char_map      = wt.m_inv_char_map;
00384             if (wt_trait<RandomAccessContainer>::char_node_map_size == 256) {
00385                 for (size_type i=0; i<256; ++i) m_char_node_map[i] = wt.m_char_node_map[i];
00386             }
00387         }
00388 
00389         void init_char_node_map() {
00390             if (wt_trait<RandomAccessContainer>::char_node_map_size == 256) {
00391                 for (size_type i=0; i<256; ++i) m_char_node_map[i] = 0;
00392             }
00393         }
00394 
00395     public:
00396 
00397         const size_type& sigma;
00398 
00399         // Default constructor
00400         wt():m_size(0),m_sigma(0), sigma(m_sigma) {};
00401 
00402 
00403 
00405 
00411         wt(const typename wt_trait<RandomAccessContainer>::reference_type rac, size_type size):m_size(size), m_sigma(0), sigma(m_sigma) {
00412 #ifdef SDSL_DEBUG_WT
00413             std::cerr<<"Wavelet Tree construct: size="<<size<<std::endl;
00414 #endif
00415             // calculate alphabet size and the mappings for the symbols to the integers and back
00416             m_sigma = wt_trait<RandomAccessContainer>::alphabet_size_and_map(rac, m_size, m_char_map, m_inv_char_map, m_first_symbol);
00417             init_char_node_map();
00418             int_vector<> node_sizes = int_vector<>(2*m_sigma+1, 0, bit_magic::l1BP(m_size)+1);
00419             m_node_pointers = int_vector<64>(node_sizes.size()+1, 0);
00420             m_node_pointers_rank = int_vector<64>(node_sizes.size()+1, 0);
00421 
00422             if (m_sigma < 2) {
00423                 if (m_sigma == 1) {  // ==> m_size > 0
00424                     if (wt_trait<RandomAccessContainer>::char_node_map_size == 256)
00425                         m_char_node_map[ m_first_symbol ] = 0; // first symbol corresponds to root node of the wavelet tree
00426                 }
00427                 return;
00428             } else {
00429                 // TODO: slow compared to other constructor which uses
00430                 // int_vector_file_buffer
00431                 for (size_type i=0; i<m_size; ++i) {
00432                     size_type lex_idx = m_char_map[rac[(const size_type)i]]; // lex_idx in [0..m_sigma-1]
00433                     size_type sigma = m_sigma, node=0;
00434                     while (sigma >= 2) {
00435 #ifdef SDSL_DEBUG_WT
00436                         if (node>=node_sizes.size()) {
00437                             std::cerr<<" node="<<node<<" node_sizes.size()="<<node_sizes.size()<<" rac["<<i<<"]="<<rac[i]<<" orig_lex_idx="<< (int)m_char_map[rac[i]] <<" sigma="<<sigma<<" lex_idx="<<lex_idx<<std::endl;
00438                         }
00439 #endif
00440                         assert(node<node_sizes.size());
00441                         node_sizes[node] = node_sizes[node]+1;
00442                         if (lex_idx < (sigma+1)/2) {
00443                             sigma = (sigma+1)/2;
00444                             node = 2*node+1;
00445                         } else {
00446                             lex_idx -= (sigma+1)/2;
00447                             sigma -= (sigma+1)/2;
00448                             node = 2*node+2;
00449                         }
00450                     }
00451 
00452                     if (wt_trait<RandomAccessContainer>::char_node_map_size == 256)
00453                         m_char_node_map[rac[i]] = node;
00454                     if (lex_idx!=0) {
00455                         std::cout<<" _"<<rac[i]<<" ("<<lex_idx<<")";
00456                     }
00457                 }
00458 #ifdef SDSL_DEBUG_WT
00459                 std::cerr<<"wavelet tree node sizes"<<std::endl;
00460                 for (size_type i=0; i<node_sizes.size(); ++i) {
00461                     std::cerr<<node_sizes[i]<<" ";
00462                 } std::cerr << std::endl;
00463 #endif
00464 
00465                 size_type max_node = 0;
00466                 m_node_pointers[0] = 0;
00467                 for (size_type i=1; i < m_node_pointers.size(); ++i) {
00468                     m_node_pointers[i] = m_node_pointers[i-1] + node_sizes[i-1];
00469                     if (node_sizes[i-1]!=0)
00470                         max_node = i-1;
00471                 }
00472                 node_sizes = int_vector<>(max_node+1, 0, 64);
00473 #ifdef SDSL_DEBUG_WT
00474                 std::cerr<<"all nodes size = "<<m_node_pointers[max_node+1]<<std::endl;
00475 #endif
00476                 bit_vector tree = bit_vector(m_node_pointers[max_node+1]);
00477                 for (size_type i=0; i<m_size; ++i) {
00478                     size_type lex_idx = m_char_map[rac[i]]; // lex_idx in [0..m_sigma-1]
00479 //std::cerr<<"lex_idx="<<lex_idx<<" "<<rac[i]<<std::endl;
00480                     size_type sigma = m_sigma, node=0;
00481                     while (sigma >= 2) {
00482                         if (lex_idx >= ((sigma+1)/2))
00483                             tree[m_node_pointers[node]+node_sizes[node]] = 1;
00484                         node_sizes[node] = node_sizes[node]+1;
00485                         if (lex_idx < (sigma+1)/2) {
00486                             sigma = (sigma+1)/2;
00487                             node = 2*node+1;
00488                         } else {
00489                             lex_idx -= (sigma+1)/2;
00490                             sigma -= (sigma+1)/2;
00491                             node = 2*node+2;
00492                         }
00493                     }
00494                 }
00495                 util::assign(m_tree, tree);
00496                 util::init_support(m_tree_rank, &m_tree);
00497                 for (size_type i=0; i < m_node_pointers.size(); ++i) {
00498                     m_node_pointers_rank[i] = m_tree_rank(m_node_pointers[i]);
00499                 }
00500                 util::init_support(m_tree_select0, &m_tree);
00501                 util::init_support(m_tree_select1, &m_tree);
00502             }
00503         }
00504 
00505         template<class size_type_class>
00506         wt(int_vector_file_buffer<8, size_type_class>& rac, size_type size):m_size(size), m_sigma(0), sigma(m_sigma) {
00507             construct(rac, size);
00508         }
00509 
00511 
00514         template<class size_type_class>
00515         void construct(int_vector_file_buffer<8, size_type_class>& rac, size_type size) {
00516             m_size = size;
00517             init_char_node_map();
00518             typedef int_vector_file_buffer<8, size_type_class> tIVFB;
00519 //#define SDSL_DEBUG_WT
00520 #ifdef SDSL_DEBUG_WT
00521             std::cerr<<"Wavelet Tree construct for int_vector_file_buffer: size="<<size<<std::endl;
00522             stop_watch sw; sw.start();
00523 #endif
00524 
00525 //              std::cerr << wt_trait<RandomAccessContainer>::char_node_map_size << std::endl;
00526             // calculate alphabet size and the mappings for the symbols to the integers and back
00527             m_sigma = wt_trait<tIVFB>::alphabet_size_and_map(rac, m_size, m_char_map, m_inv_char_map, m_first_symbol);
00528 
00529             int_vector<64> node_sizes = int_vector<64>(2*m_sigma+1, 0/*, bit_magic::l1BP(m_size)+1*/);
00530             m_node_pointers = int_vector<64>(node_sizes.size()+1, 0);
00531             m_node_pointers_rank = int_vector<64>(node_sizes.size()+1, 0);
00532 
00533             if (m_sigma < 2) {
00534                 if (1 == m_sigma) {  // handle special case more efficient
00535                     m_char_node_map[m_first_symbol] = 0; // map the first symbol to the root node of the wavelet tree
00536                 }
00537             } else {
00538                 // O(n + |\Sigma|\log|\Sigma|) algorithm for calculating node sizes
00539                 size_type C[256] = {0};
00540                 rac.reset();
00541                 //  1. Count occurrences of characters
00542                 for (size_type i=0, r_sum=0, r = rac.load_next_block(); r_sum < m_size;) {
00543                     if (r_sum + r > m_size) {  // read not more than size chars in the next loop
00544                         r = m_size-r_sum;
00545                     }
00546                     for (; i < r_sum+r; ++i) {
00547                         ++C[rac[i-r_sum]];
00548                     }
00549                     r_sum += r; r = rac.load_next_block();
00550                 }
00551                 //  2. Sum up the node sizes for each character
00552                 for (size_type i=0; i < 256; ++i) {
00553                     if (C[i]) {
00554                         size_type lex_idx = m_char_map[i]; // lex_idx in [0..m_sigma-1]
00555                         size_type sigma = m_sigma, node=0;
00556                         while (sigma >= 2) {
00557                             assert(node<node_sizes.size());
00558                             node_sizes[node] = node_sizes[node]+C[i];
00559                             if (lex_idx < ((sigma+1)>>1)) {
00560                                 sigma = ((sigma+1)>>1);
00561                                 node = (node<<1)+1;
00562                             } else {
00563                                 lex_idx -= ((sigma+1)>>1);
00564                                 sigma -= ((sigma+1)>>1);
00565                                 node = (node<<1)+2;
00566                             }
00567                         }
00568                         m_char_node_map[i] = node;
00569                     } else {
00570                         m_char_node_map[i] = 0;
00571                     }
00572                 }
00573 #ifdef SDSL_DEBUG_WT
00574                 sw.stop();
00575                 std::cerr<<"Time for counting phase: "<< sw.get_real_time() << " ms real time , "<< sw.get_user_time()<<" ms user time"<< std::endl;
00576 //                      std::cerr<<"wavelet tree node sizes"<<std::endl;
00577 //                      for(size_type i=0; i<node_sizes.size(); ++i){ std::cerr<<node_sizes[i]<<" "; } std::cerr << std::endl;
00578 #endif
00579                 size_type max_node = 0;
00580                 m_node_pointers[0] = 0;
00581                 for (size_type i=1; i < m_node_pointers.size(); ++i) {
00582                     m_node_pointers[i] = m_node_pointers[i-1] + node_sizes[i-1];
00583                     if (node_sizes[i-1]!=0)
00584                         max_node = i-1;
00585                 }
00586                 node_sizes = int_vector<64>(max_node+1, 0);
00587 #ifdef SDSL_DEBUG_WT
00588                 std::cerr<<"all nodes size = "<<m_node_pointers[max_node+1]<<std::endl;
00589                 sw.start();
00590 #endif
00591                 // initialize bit vector with 0's
00592                 bit_vector tree = bit_vector(m_node_pointers[max_node+1], 0);
00593                 // precalc paths in the tree for all symbols in the alphabet
00594                 uint8_t path_len[256] = {0};
00595                 uint8_t path[256] = {0};
00596                 for (size_type i=0; i < 256; ++i) {
00597                     if (C[i]) {
00598                         size_type lex_idx = m_char_map[i]; // lex_idx in [0..m_sigma-1]
00599                         size_type sigma = m_sigma, depth=0;
00600                         while (sigma >= 2) {
00601                             if (lex_idx < ((sigma+1)>>1)) {
00602                                 sigma = ((sigma+1)>>1);
00603                             } else {
00604                                 lex_idx -= ((sigma+1)>>1);
00605                                 sigma -= ((sigma+1)>>1);
00606                                 path[i] |= (1<<depth);
00607                             }
00608                             ++path_len[i];
00609                             ++depth;
00610                         }
00611                     }
00612                 }
00614                 rac.reset();
00615                 for (size_type i=0, r_sum=0, r = rac.load_next_block(); r_sum < m_size;) {
00616                     if (r_sum + r > size) {  // read not more than size chars in the next loop
00617                         r = size-r_sum;
00618                     }
00619                     uint8_t old_chr = rac[i-r_sum];
00620                     uint8_t times = 0;
00621                     for (; i < r_sum+r; ++i) {
00622                         uint8_t chr = rac[i-r_sum];
00623                         if (chr != old_chr) {
00624                             uint8_t p = path[old_chr];
00625                             for (uint32_t l=0, node=0; l<path_len[old_chr]; ++l, p >>= 1) {
00626                                 if (p&1) {
00627                                     tree.set_int(m_node_pointers[node]+node_sizes[node], 0xFFFFFFFFFFFFFFFFULL,times);
00628                                     node_sizes[node] += times; node = (node<<1)+2;
00629                                 } else {
00630                                     node_sizes[node] += times; node = (node<<1)+1;
00631                                 }
00632                             }
00633                             times = 1;
00634                             old_chr = chr;
00635                         } else { // chr == old_chr
00636                             ++times;
00637                             if (times == 64) {
00638                                 uint8_t p = path[old_chr];
00639                                 for (uint32_t l=0, node=0; l<path_len[old_chr]; ++l, p >>= 1) {
00640                                     if (p&1) {
00641                                         tree.set_int(m_node_pointers[node]+node_sizes[node], 0xFFFFFFFFFFFFFFFFULL,times);
00642                                         node_sizes[node] += times; node = (node<<1)+2;
00643                                     } else {
00644                                         node_sizes[node] += times; node = (node<<1)+1;
00645                                     }
00646                                 }
00647                                 times = 0;
00648                             }
00649                         }
00650                     }
00651                     if (times > 0) {
00652                         uint8_t p = path[old_chr];
00653                         for (uint32_t l=0, node=0; l<path_len[old_chr]; ++l, p >>= 1) {
00654                             if (p&1) {
00655                                 tree.set_int(m_node_pointers[node]+node_sizes[node], 0xFFFFFFFFFFFFFFFFULL,times);
00656                                 node_sizes[node] += times; node = (node<<1)+2;
00657                             } else {
00658                                 node_sizes[node] += times; node = (node<<1)+1;
00659                             }
00660                         }
00661                     }
00662                     r_sum += r; r = rac.load_next_block();
00663                 }
00664 //*/
00665 #ifdef SDSL_DEBUG_WT
00666                 sw.stop();
00667                 std::cerr<<"Time for building phase: "<< sw.get_real_time() << " ms real time , "<< sw.get_user_time()<<" ms user time"<< std::endl;
00668                 sw.start();
00669 #endif
00670                 util::assign(m_tree, tree);
00671                 util::init_support(m_tree_rank,&m_tree);
00672 #ifdef SDSL_DEBUG_WT
00673                 sw.stop();
00674                 std::cerr<<"Time for rank init: "<< sw.get_real_time() << " ms real time , "<< sw.get_user_time()<<" ms user time"<< std::endl;
00675                 sw.start();
00676 #endif
00677                 for (size_type i=0; i < m_node_pointers.size(); ++i) {
00678                     m_node_pointers_rank[i] = m_tree_rank(m_node_pointers[i]);
00679                 }
00680                 util::init_support(m_tree_select0,&m_tree);
00681 #ifdef SDSL_DEBUG_WT
00682                 std::cerr<<"select0 init ready!"<<std::endl;
00683 #endif
00684                 util::init_support(m_tree_select1,&m_tree);
00685 #ifdef SDSL_DEBUG_WT
00686                 sw.stop();
00687                 std::cerr<<"Time for select init: "<< sw.get_real_time() << " ms real time , "<< sw.get_user_time()<<" ms user time"<< std::endl;
00688                 sw.start();
00689 #endif
00690             }
00691         }
00692 
00694         wt(const wt& wt):sigma(m_sigma) {
00695             copy(wt);
00696         }
00697 
00699         wt& operator=(const wt& wt) {
00700             if (this != &wt) {
00701                 copy(wt);
00702             }
00703             return *this;
00704         }
00705 
00707         void swap(wt& wt) {
00708             if (this != &wt) {
00709                 std::swap(m_size, wt.m_size);
00710                 std::swap(m_sigma,  wt.m_sigma);
00711                 m_tree.swap(wt.m_tree);
00712                 util::swap_support(m_tree_rank, wt.m_tree_rank, &m_tree, &(wt.m_tree));
00713                 util::swap_support(m_tree_select1, wt.m_tree_select1, &m_tree, &(wt.m_tree));
00714                 util::swap_support(m_tree_select0, wt.m_tree_select0, &m_tree, &(wt.m_tree));
00715                 m_node_pointers.swap(wt.m_node_pointers);
00716                 m_node_pointers_rank.swap(wt.m_node_pointers_rank);
00717                 std::swap(m_first_symbol, wt.m_first_symbol);
00718                 m_char_map.swap(wt.m_char_map);
00719                 m_inv_char_map.swap(wt.m_inv_char_map);
00720                 // swap char_node_map
00721                 if (wt_trait<RandomAccessContainer>::char_node_map_size == 256) {
00722                     for (size_type i=0; i<256; ++i) {
00723                         std::swap(m_char_node_map[i],wt.m_char_node_map[i]);
00724                     }
00725                 }
00726             }
00727         }
00728 
00730         size_type size()const {
00731             return m_size;
00732         }
00733 
00735         bool empty()const {
00736             return m_size == 0;
00737         }
00738 
00740 
00743         value_type operator[](size_type i)const {
00744             size_type lex_idx   = 0;
00745             size_type sigma             = m_sigma;
00746             size_type node              = 0;
00747             while (sigma >= 2) {
00748                 if (m_tree[ m_node_pointers[node]+i ]) { // go to the right child
00749                     lex_idx += (sigma+1)/2;
00750                     i = m_tree_rank(m_node_pointers[node]+i) - m_node_pointers_rank[node];  //m_tree_rank( m_node_pointers[node] );
00751                     node = 2*node+2;
00752                     sigma -= (sigma+1)/2;
00753                 } else { // go to the left child
00754                     i = i - (m_tree_rank(m_node_pointers[node]+i) - m_node_pointers_rank[node]);   //m_tree_rank( m_node_pointers[node] ) );
00755                     node = 2*node+1;
00756                     sigma = (sigma+1)/2;
00757                 }
00758             }
00759             return m_inv_char_map[lex_idx];
00760         };
00761 
00763 
00770         size_type rank(size_type i, value_type c)const {
00771             if (!wt_trait<RandomAccessContainer>::symbol_available(m_char_map, c, m_first_symbol, m_sigma)) {
00772                 return 0;
00773             }
00774             size_type lex_idx   = m_char_map[c]; // koennte man auch nur path, path_len ersetzen
00775             size_type sigma     = m_sigma;
00776             size_type node              = 0;
00777             size_type result    = i;
00778             while (sigma >= 2 and result > 0) {
00779                 if (lex_idx < (sigma+1)/2) {
00780                     result      = result - (m_tree_rank(m_node_pointers[node]+result) -  m_node_pointers_rank[node]); //m_tree_rank(m_node_pointers[node]));
00781                     sigma       = (sigma+1)/2;
00782                     node        = 2*node+1;
00783                 } else {
00784                     result = m_tree_rank(m_node_pointers[node]+result) -  m_node_pointers_rank[node]; // m_tree_rank(m_node_pointers[node]);
00785                     lex_idx -= (sigma+1)/2;
00786                     sigma -= (sigma+1)/2;
00787                     node = 2*node+2;
00788                 }
00789             }
00790             return result;
00791         };
00792 
00794 
00801         size_type rank_ith_symbol(size_type i, value_type& c)const {
00802             assert(i>=0 and i < size());
00803             size_type lex_idx   = 0;
00804             size_type sigma             = m_sigma;
00805             size_type node              = 0;
00806             while (sigma >= 2) {
00807                 if (m_tree[ m_node_pointers[node]+i ]) { // go to the right child
00808                     lex_idx += (sigma+1)/2;
00809                     i = m_tree_rank(m_node_pointers[node]+i) - m_node_pointers_rank[node];  //m_tree_rank( m_node_pointers[node] );
00810                     node = 2*node+2;
00811                     sigma -= (sigma+1)/2;
00812                 } else { // go to the left child
00813                     i = i - (m_tree_rank(m_node_pointers[node]+i) - m_node_pointers_rank[node]);   //m_tree_rank( m_node_pointers[node] ) );
00814                     node = 2*node+1;
00815                     sigma = (sigma+1)/2;
00816                 }
00817             }
00818             c = m_inv_char_map[lex_idx];
00819             return i;
00820         }
00821 
00822         // recursive internal version of the method interval_symbols
00823         void _interval_symbols(size_type i, size_type j, size_type& k,
00824                                std::vector<unsigned char>& cs,
00825                                std::vector<size_type>& rank_c_i,
00826                                std::vector<size_type>& rank_c_j,
00827                                size_type lex_idx, size_type sigma, size_type node) const {
00828             // not in a leaf
00829             if (sigma >= 2) {
00830                 size_type i_new = (m_tree_rank(m_node_pointers[node] + i) - m_node_pointers_rank[node]);
00831                 size_type j_new = (m_tree_rank(m_node_pointers[node] + j) - m_node_pointers_rank[node]);
00832                 // goto left child
00833                 i -= i_new; j -= j_new;
00834                 if (i != j) {
00835                     _interval_symbols(i, j, k, cs, rank_c_i, rank_c_j, lex_idx, (sigma+1)>>1, (node<<1)|1 /*2*node+1*/);
00836                 }
00837                 // goto right child
00838                 if (i_new != j_new) {
00839                     _interval_symbols(i_new, j_new, k, cs, rank_c_i, rank_c_j, lex_idx + ((sigma+1)>>1), sigma-((sigma+1)>>1), (node+1)<<1 /*2*node+2*/);
00840                 }
00841             } else {
00842                 rank_c_i[k] = i;
00843                 rank_c_j[k] = j;
00844                 cs[k++] = m_inv_char_map[lex_idx];
00845                 return;
00846             }
00847         }
00848 
00850         /* If the character c does not occur in the sequence 0 is returned.
00851          *
00852          */
00853         size_type count_lex_smaller(size_type i, value_type c)const {
00854             if (!wt_trait<RandomAccessContainer>::symbol_available(m_char_map, c, m_first_symbol, m_sigma)) {
00855                 return 0;
00856             }
00857             size_type lex_idx   = m_char_map[c];
00858             size_type sigma     = m_sigma;  // start with the whole alphabet
00859             size_type node              = 0;
00860             size_type result    = 0;
00861             while (sigma >= 2) {
00862                 if (lex_idx < (sigma+1)/2) { // symbols belongs to the left half of the alphabet
00863                     // calculate new i for the left child bit_vector
00864                     i   = i - (m_tree_rank(m_node_pointers[node]+i) -  m_node_pointers_rank[node]);
00865                     sigma       = (sigma+1)/2;
00866                     node        = 2*node+1;
00867                 } else { // symbol belongs to the right half of the alphabet
00868                     size_type ones = m_tree_rank(m_node_pointers[node]+i) -  m_node_pointers_rank[node];
00869                     result += (i - ones); // all elements prefixed with 0 are lexicographic smaller than c
00870                     i = ones;
00871                     // calclate new i for the right child bit_vector
00872                     lex_idx -= (sigma+1)/2;
00873                     sigma -= (sigma+1)/2;
00874                     node = 2*node+2;
00875                 }
00876             }
00877             return result;
00878         }
00879 
00881         size_type count_lex_smaller(size_type i, size_type j, value_type c)const {
00882             if (i==j)
00883                 return 0;
00884             if (i+1 == j) {
00885                 return (*this)[i] < c;
00886             } else {
00887                 return count_lex_smaller(j, c) - count_lex_smaller(i, c);
00888             }
00889         }
00890 
00892 
00908         void interval_symbols(size_type i, size_type j, size_type& k,
00909                               std::vector<unsigned char>& cs,
00910                               std::vector<size_type>& rank_c_i,
00911                               std::vector<size_type>& rank_c_j) const {
00912             if (i==j) {
00913                 k = 0;
00914                 return;
00915             } else if ((j-i)==1) {
00916                 k = 1;
00917                 rank_c_i[0] = rank_ith_symbol(i, cs[0]);
00918                 rank_c_j[0] = rank_c_i[0]+1;
00919                 return;
00920             } else if ((j-i)==2) {
00921                 rank_c_i[0] = rank_ith_symbol(i, cs[0]);
00922                 rank_c_i[1] = rank_ith_symbol(i+1, cs[1]);
00923                 if (cs[0]==cs[1]) {
00924                     k = 1;
00925                     rank_c_j[0] = rank_c_i[0]+2;
00926                     return;
00927                 } else {
00928                     k = 2;
00929                     rank_c_j[0] = rank_c_i[0]+1;
00930                     rank_c_j[1] = rank_c_i[1]+1;
00931                     return;
00932                 }
00933             } else {
00934                 k = 0;
00935                 _interval_symbols(i, j, k, cs, rank_c_i, rank_c_j, 0, m_sigma, 0);
00936             }
00937         }
00938 
00939 
00941 
00947         // TODO: was ist wenn c gar nicht vorkommt, oder es keine i Vorkommen gibt?
00948         size_type select(size_type i, value_type c)const {
00949             if (!wt_trait<RandomAccessContainer>::symbol_available(m_char_map, c, m_first_symbol, m_sigma)) {
00950                 return size();
00951             }
00952             assert(i > 0);
00953 #ifdef SDSL_DEBUG_WT
00954             std::cerr<<"wt: select("<<i<<", "<<c<<")"<<std::endl;
00955             std::cerr<<" i="<<i<< " <= "<< rank(size(),c) <<" ?" <<std::endl;
00956 #endif
00957             assert(i <= rank(size(), c));
00958 
00959             size_type node              = 0;
00960             // first phase: go down to the node corresponding to the lex_idx of the character
00961             if (wt_trait<RandomAccessContainer>::char_node_map_size == 256) {
00962                 node = m_char_node_map[c];
00963             } else {
00964                 size_type lex_idx       = m_char_map[c];
00965                 size_type sigma         = m_sigma;
00966                 while (sigma >= 2) {
00967                     if (lex_idx < (sigma+1)/2) {
00968                         sigma = (sigma+1)/2;
00969                         node = 2*node+1;
00970                     } else {
00971                         lex_idx -= (sigma+1)/2;
00972                         sigma -= (sigma+1)/2;
00973                         node = 2*node+2;
00974                     }
00975                 }
00976             }
00977             // second phase go up and select the right position
00978             size_type result = i-1;
00979             while (node != 0) {
00980                 if (node&1) {// node is a left child
00981                     node = (node-1)/2;
00982                     result = m_tree_select0(m_node_pointers[node]-m_node_pointers_rank[node]+result+1)-m_node_pointers[node];
00983                 } else { //node is a right child
00984                     node = (node-1)/2;
00985                     result = m_tree_select1(m_node_pointers_rank[node]+result+1)-m_node_pointers[node];
00986                 }
00987             }
00988             return result;
00989         };
00990 
00992         void range_search_2d(size_type lb, size_type rb, value_type c1, value_type c2, std::vector<size_type>& result) const {
00993             size_type lex_idx_1 = m_char_map[c1];
00994             size_type lex_idx_2 = m_char_map[c2];
00995             _range_search_2d(0, lb, rb, lex_idx_1, lex_idx_2, m_sigma, result);
00996         }
00997 
00998         // add parameter path
00999         void _range_search_2d(size_type node, size_type lb, size_type rb, size_type lex_idx_1, size_type lex_idx_2, size_type sigma, std::vector<size_type>& result)const {
01000 //              std::cerr<<"lb="<<lb<<" rb="<<rb<<" lex_idx_1="<<lex_idx_1<<" lex_idx_2="<<lex_idx_2<<" sigma="<<sigma<<" node="<<node<<std::endl;
01001             // [lex_idx_1..lex_idx_2] in [0..sigma]
01002             if (lb > rb)
01003                 return;
01004             if (sigma <= 1) {
01005                 // result[i]...
01006                 size_type r = 0;
01007                 while (node != 0) {
01008                     if (node&1) {// node is a left child
01009                         node = (node-1)/2;
01010                         size_type node_idx = m_node_pointers[node];
01011                         r = m_tree_select0(node_idx-m_tree_rank(node_idx)+r+1)-node_idx;
01012                     } else { //node is a right child
01013                         node = (node-1)/2;
01014                         r = m_tree_select1(m_tree_rank(m_node_pointers[node])+r+1)-m_node_pointers[node];
01015                     }
01016                 }
01017                 result.push_back(r);
01018                 return;
01019             }
01020             size_type lex_mid = (sigma+1)/2;
01021             size_type lsigma = lex_mid;
01022             size_type rsigma = sigma - lex_mid;
01023 
01024             size_type ones_lb_1 = m_tree_rank(m_node_pointers[node]+lb) -  m_node_pointers_rank[node];  // ones in [0,lb)
01025             size_type ones_rb   = m_tree_rank(m_node_pointers[node]+rb+1) -     m_node_pointers_rank[node]; // ones in [0,rb]
01026             size_type zeros_lb_1 = lb-ones_lb_1; // zeros in [0,lb)
01027             size_type zeros_rb  = rb+1-ones_rb; // zeros in [0,rb]
01028 
01029 //              std::cerr<<"ones_lb_1="<<ones_lb_1<<" ones_rb="<<ones_rb<<std::endl;
01030 //              std::cerr<<"zeros_lb_1="<<zeros_lb_1<<" zeros_rb="<<zeros_rb<<std::endl;
01031 
01032             if (lex_idx_1 < lex_mid and zeros_rb) {
01033                 _range_search_2d(2*node+1, zeros_lb_1, zeros_rb-1, lex_idx_1, std::min(lex_idx_2,lex_mid-1), lsigma, result);
01034             }
01035 
01036             if (lex_idx_2 >= lex_mid  and  ones_rb) {
01037                 size_type _lex_idx_1 = 0;
01038                 if (lex_idx_1 > lex_mid)
01039                     _lex_idx_1 = lex_idx_1 - lex_mid;
01040                 _range_search_2d(2*node+2, ones_lb_1, ones_rb-1, _lex_idx_1, lex_idx_2-lex_mid, rsigma, result);
01041             }
01042         }
01043 
01045         size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const {
01046             structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
01047             size_type written_bytes = 0;
01048             written_bytes += util::write_member(m_size, out, child, "size");
01049             written_bytes += util::write_member(m_sigma, out, child, "sigma");
01050             written_bytes += m_tree.serialize(out, child, "tree");
01051             written_bytes += m_tree_rank.serialize(out, child, "tree_rank");
01052             written_bytes += m_tree_select1.serialize(out, child, "tree_select_1");
01053             written_bytes += m_tree_select0.serialize(out, child, "tree_select_0");
01054             written_bytes += m_node_pointers.serialize(out, child, "node_pointers");
01055             written_bytes += m_node_pointers_rank.serialize(out, child, "node_pointers_rank");
01056             written_bytes += wt_trait<RandomAccessContainer>::serialize_maps(out, m_char_map, m_inv_char_map, child, "alphabet_mapping");
01057             written_bytes += util::write_member(m_first_symbol, out, child, "first symbol");
01058             // serialize char_node_map
01059             if (wt_trait<RandomAccessContainer>::char_node_map_size == 256) {
01060                 size_type written_bytes2 = 0;
01061                 for (size_type i=0; i<256; ++i) {
01062                     written_bytes2 += util::write_member(m_char_node_map[i], out);
01063                 }
01064                 structure_tree_node* child2 = structure_tree::add_child(child, "char_node_map", "char_map");
01065                 structure_tree::add_size(child2, written_bytes2);
01066                 written_bytes += written_bytes2;
01067             }
01068             structure_tree::add_size(child, written_bytes);
01069             return written_bytes;
01070         }
01071 
01073         void load(std::istream& in) {
01074             util::read_member(m_size, in);
01075             util::read_member(m_sigma, in);
01076             m_tree.load(in);
01077             m_tree_rank.load(in, &m_tree);
01078             m_tree_select1.load(in, &m_tree);
01079             m_tree_select0.load(in, &m_tree);
01080             m_node_pointers.load(in);
01081             m_node_pointers_rank.load(in);
01082             wt_trait<RandomAccessContainer>::load_maps(in, m_char_map, m_inv_char_map);
01083             util::read_member(m_first_symbol, in);
01084             // serialize char_node_map
01085             if (wt_trait<RandomAccessContainer>::char_node_map_size == 256) {
01086                 for (size_type i=0; i<256; ++i) {
01087                     util::read_member(m_char_node_map[i], in);
01088                 }
01089             }
01090         }
01091 };
01092 
01093 }// end namespace sds
01094 
01095 #endif // end file