SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
 All Classes Namespaces Files Functions Variables Typedefs Friends
sdsl/include/sdsl/wt_int.hpp
Go to the documentation of this file.
00001 /* sdsl - succinct data structures library
00002     Copyright (C) 2009 Simon Gog
00003 
00004     This program is free software: you can redistribute it and/or modify
00005     it under the terms of the GNU General Public License as published by
00006     the Free Software Foundation, either version 3 of the License, or
00007     (at your option) any later version.
00008 
00009     This program is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012     GNU General Public License for more details.
00013 
00014     You should have received a copy of the GNU General Public License
00015     along with this program.  If not, see http://www.gnu.org/licenses/ .
00016 */
00023 #ifndef INCLUDED_SDSL_INT_WAVELET_TREE
00024 #define INCLUDED_SDSL_INT_WAVELET_TREE
00025 
00026 #include "int_vector.hpp"
00027 #include "rank_support_v.hpp"
00028 #include "select_support_mcl.hpp"
00029 #include "bitmagic.hpp"
00030 #include "testutils.hpp"
00031 #include "temp_write_read_buffer.hpp"
00032 #include "util.hpp"
00033 #include <set> // for calculating the alphabet size
00034 #include <map> // for mapping a symbol to its lexicographical index
00035 #include <algorithm> // for std::swap
00036 #include <stdexcept>
00037 #include <vector>
00038 
00040 namespace sdsl
00041 {
00042 
00044 
00062 template<class RandomAccessContainer=int_vector<>,
00063          class BitVector                 = bit_vector,
00064          class RankSupport               = typename BitVector::rank_1_type,
00065          class SelectSupport     = typename BitVector::select_1_type,
00066          class SelectSupportZero = typename BitVector::select_0_type>
00067 class wt_int
00068 {
00069     public:
00070         typedef typename RandomAccessContainer::size_type               size_type;
00071         typedef typename RandomAccessContainer::value_type              value_type;
00072         typedef BitVector                                                                               bit_vector_type;
00073         typedef RankSupport                                                                             rank_1_type;
00074         typedef SelectSupport                                                                   select_1_type;
00075         typedef SelectSupportZero                                                               select_0_type;
00076     protected:
00077         size_type                       m_size;
00078         size_type                       m_sigma;                //<- \f$ |\Sigma| \f$
00079         bit_vector_type         m_tree;                 // bit vector to store the wavelet tree
00080         rank_1_type                     m_tree_rank;    // rank support for the wavelet tree bit vector
00081         select_1_type           m_tree_select1; // select support for the wavelet tree bit vector
00082         select_0_type           m_tree_select0;
00083         uint32_t                        m_logn;
00084 
00085         void copy(const wt_int& wt) {
00086             m_size                      = wt.m_size;
00087             m_sigma             = wt.m_sigma;
00088             m_tree                      = wt.m_tree;
00089             m_tree_rank         = wt.m_tree_rank;
00090             m_tree_rank.set_vector(&m_tree);
00091             m_tree_select1      = wt.m_tree_select1;
00092             m_tree_select1.set_vector(&m_tree);
00093             m_tree_select0      = wt.m_tree_select0;
00094             m_tree_select0.set_vector(&m_tree);
00095             m_logn                      = wt.m_logn;
00096         }
00097 
00098     public:
00099 
00100         const size_type& sigma; 
00101         const bit_vector_type& tree; 
00102 
00104         wt_int():m_size(0),m_sigma(0), m_logn(0), sigma(m_sigma), tree(m_tree) {};
00105 
00106 
00108 
00117         wt_int(const RandomAccessContainer& rac, uint32_t logn=0):m_size(rac.size()), m_sigma(0), sigma(m_sigma), tree(m_tree) {
00118 #ifdef SDSL_DEBUG_INT_WAVELET_TREE
00119             std::cerr<<"wt_int construct: size="<<m_size<<std::endl;
00120 #endif
00121             size_type n =       m_size;
00122             if (logn == 0) { // detect the biggest value in rac and set logn properly
00123                 value_type x = 1;
00124                 for (size_type i=0; i < n; ++i)
00125                     if (rac[i] > x)
00126                         x = rac[i];
00127                 m_logn  = bit_magic::l1BP(x)+1; // we need logn bits to represent all values in the range [0..x]
00128             } else {
00129                 m_logn = logn;
00130             }
00131             bit_vector tree(n*m_logn, 0);  // initialize the tree
00132 
00133             int_vector<>        perms[2];
00134             perms[0].set_int_width(m_logn); perms[1].set_int_width(m_logn);
00135             perms[0].resize(n); perms[1].resize(n);
00136 
00137             for (size_type i=0; i < n; ++i) { // copy original array to perms[0]
00138                 perms[0][i] = rac[i];
00139             }
00140 
00141             size_type tree_pos = 0;
00142 
00143             uint64_t            mask_old = 1ULL<<(m_logn);
00144             for (uint32_t k=0; k<m_logn; ++k) {
00145                 int_vector<>&   act_perm         = perms[k%2];
00146                 int_vector<>&   next_perm        = perms[1-(k%2)];
00147                 size_type               start            = 0;
00148                 const uint64_t          mask_new = 1ULL<<(m_logn-k-1);
00149                 do {
00150                     size_type   i               = start;
00151                     size_type   cnt1    =       0;
00152                     uint64_t    start_value = (act_perm[i]&mask_old);
00153                     uint64_t    x;
00154                     while (i < n and((x=act_perm[i])&mask_old)==start_value) {
00155                         if (x&mask_new) {
00156                             ++cnt1; tree[tree_pos] = 1;
00157                         }
00158                         ++tree_pos;
00159                         ++i;
00160                     }
00161                     size_type cnt0 = i-start-cnt1;
00162                     if (k+1 < m_logn) { // inner node
00163                         size_type idxs[2] = {start, start+cnt0};
00164                         for (i=start, start = start+cnt0+cnt1; i < start; ++i) {
00165                             next_perm[idxs[(act_perm[i]&mask_new)>0 ]++] = act_perm[i];
00166                         }
00167                     } else { // leaf node
00168                         start += cnt0+cnt1;
00169                         ++m_sigma;  // increase sigma for each leaf
00170                     }
00171                 } while (start < n);
00172                 mask_old += mask_new;
00173             }
00174 #ifdef SDSL_DEBUG_INT_WAVELET_TREE
00175             if (tree.size()<100) {
00176                 std::cerr<<"tree="<<tree<<std::endl;
00177             }
00178 #endif
00179             util::assign(m_tree, tree);
00180             util::init_support(m_tree_rank, &m_tree);
00181             util::init_support(m_tree_select0, &m_tree);
00182             util::init_support(m_tree_select1, &m_tree);
00183         }
00184 
00186 
00195         template<uint8_t int_width>
00196         wt_int(int_vector_file_buffer<int_width>& buf, uint32_t logn=0, std::string dir="./")
00197             : m_size(0),m_sigma(0), m_logn(0), sigma(m_sigma), tree(m_tree) {
00198             construct(buf, logn, dir);
00199         }
00200 
00201 
00202         // TODO external construction method which occupies only additional constant memory
00203         template<uint8_t int_width>
00204         void construct(int_vector_file_buffer<int_width>& buf, uint32_t logn=0, std::string dir="./") {
00205             buf.reset();
00206             size_type n = buf.int_vector_size;  // set n
00207             m_size = n;                         // set sigma and size
00208             m_sigma = 0;
00209             temp_write_read_buffer<> buf1(5000000, buf.int_width, dir);   // buffer for elements in the rigth node
00210             int_vector<int_width> rac(n, 0, buf.int_width);                               // initialze rac
00211 
00212             value_type x = 1;  // variable for the biggest value in rac
00213             for (size_type i=0,r=0,r_sum=0; i<n;) { // detect the biggest value in rac
00214                 for (; i < r+r_sum; ++i) {
00215                     if (buf[i-r_sum] > x)
00216                         x = buf[i-r_sum];
00217                     rac[i] = buf[i-r_sum];
00218                 }
00219                 r_sum += r; r = buf.load_next_block();
00220             }
00221 
00222             if (logn == 0) {
00223                 m_logn  = bit_magic::l1BP(x)+1; // we need logn bits to represent all values in the range [0..x]
00224             } else {
00225                 m_logn = logn;
00226             }
00227             std::string tree_out_buf_file_name = (dir+"m_tree"+util::to_string(util::get_pid())+"_"+util::to_string(util::get_id()));
00228             std::ofstream tree_out_buf(tree_out_buf_file_name.c_str(),
00229                                        std::ios::binary | std::ios::trunc | std::ios::out);   // open buffer for tree
00230             size_type bit_size = n*m_logn;
00231             tree_out_buf.write((char*) &bit_size, sizeof(bit_size));    // write size of bit_vector
00232 
00233             size_type tree_pos = 0;
00234             uint64_t tree_word = 0;
00235 
00236             uint64_t            mask_old = 1ULL<<(m_logn);
00237             for (uint32_t k=0; k<m_logn; ++k) {
00238                 size_type               start            = 0;
00239                 const uint64_t          mask_new = 1ULL<<(m_logn-k-1);
00240                 do {
00241                     buf1.reset();
00242                     size_type   i               = start;
00243                     size_type   cnt0    =       0;
00244                     uint64_t    start_value = (rac[i]&mask_old);
00245                     uint64_t    x;
00246                     while (i < n and((x=rac[i])&mask_old)==start_value) {
00247                         if (x&mask_new) {
00248                             tree_word |= (1ULL << (tree_pos&0x3FULL));
00249                             buf1 << x;
00250                         } else {
00251                             rac[start + cnt0++ ] = x;
00252                         }
00253                         ++tree_pos;
00254                         if ((tree_pos & 0x3FULL) == 0) { // if tree_pos % 64 == 0 write old word
00255                             tree_out_buf.write((char*) &tree_word, sizeof(tree_word));
00256                             tree_word = 0;
00257                         }
00258                         ++i;
00259                     }
00260                     buf1.write_close();
00261                     size_type cnt1 = i-start-cnt0;
00262                     if (k+1 < m_logn) { // inner node
00263                         for (i=start + cnt0, start = start+cnt0+cnt1; i < start; ++i) {
00264                             buf1 >> x;
00265                             rac[ i ] = x;
00266                         }
00267                     } else { // leaf node
00268                         start += cnt0+cnt1;
00269                         ++m_sigma; // increase sigma for each leaf
00270                     }
00271 
00272                 } while (start < n);
00273                 mask_old += mask_new;
00274             }
00275             if ((tree_pos & 0x3FULL) != 0) { // if tree_pos % 64 > 0 => there are remaining entries we have to write
00276                 tree_out_buf.write((char*) &tree_word, sizeof(tree_word));
00277             }
00278             tree_out_buf.close();
00279             rac.resize(0);
00280             bit_vector tree;
00281             util::load_from_file(tree, tree_out_buf_file_name.c_str());
00282             std::remove(tree_out_buf_file_name.c_str());
00283 #ifdef SDSL_DEBUG_INT_WAVELET_TREE
00284             if (tree.size()<100) {
00285                 std::cerr<<"tree="<<tree<<std::endl;
00286             }
00287 #endif
00288             util::assign(m_tree, tree);
00289             util::init_support(m_tree_rank, &m_tree);
00290             util::init_support(m_tree_select0, &m_tree);
00291             util::init_support(m_tree_select1, &m_tree);
00292         }
00293 
00295         wt_int(const wt_int& wt):sigma(m_sigma) {
00296             copy(wt);
00297         }
00298 
00300         wt_int& operator=(const wt_int& wt) {
00301             if (this != &wt) {
00302                 copy(wt);
00303             }
00304             return *this;
00305         }
00306 
00308         void swap(wt_int& wt) {
00309             if (this != &wt) {
00310                 std::swap(m_size, wt.m_size);
00311                 std::swap(m_sigma,  wt.m_sigma);
00312                 m_tree.swap(wt.m_tree);
00313                 m_tree_rank.swap(wt.m_tree_rank); // rank swap after the swap of the bit vector m_tree
00314                 m_tree_rank.set_vector(&m_tree);
00315                 m_tree_select1.swap(wt.m_tree_select1); // select1 swap after the swap of the bit vector m_tree
00316                 m_tree_select1.set_vector(&m_tree);
00317                 m_tree_select0.swap(wt.m_tree_select0); // select0 swap after the swap of the bit vector m_tree
00318                 m_tree_select0.set_vector(&m_tree);
00319                 std::swap(m_logn,  wt.m_logn);
00320             }
00321         }
00322 
00324         size_type size()const {
00325             return m_size;
00326         }
00327 
00329         bool empty()const {
00330             return m_size == 0;
00331         }
00332 
00334 
00337         value_type operator[](size_type i)const {
00338 //              size_type zeros_left_of_pos = 0;
00339             size_type offset = 0;
00340             value_type res = 0;
00341             size_type node_size = m_size;
00342             for (uint32_t k=0; k < m_logn; ++k) {
00343                 res <<= 1;
00344                 size_type ones_before_o   = m_tree_rank(offset);
00345                 size_type ones_before_i   = m_tree_rank(offset + i) - ones_before_o;
00346                 size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_o;
00347 //                      std::cerr<<"k="<<k<<" i="<<i<<" offset="<<offset<<" node_size="<<node_size<<" obo="<<ones_before_o<<" obi="<<ones_before_i<<" obe="<<ones_before_end<<std::endl;
00348                 if (m_tree[offset+i]) { // one at position i => follow right child
00349 //                              zeros_left_of_pos += (node_size - ones_before_end);
00350                     offset += (node_size - ones_before_end);
00351                     node_size = ones_before_end;
00352                     i = ones_before_i;
00353                     res |= 1;
00354                 } else { // zero at position i => follow left child
00355                     node_size = (node_size - ones_before_end);
00356                     i = (i-ones_before_i);
00357                 }
00358                 //offset = (k+1)*m_size + zeros_left_of_pos;
00359                 offset += m_size;
00360             }
00361 //              std::cerr<<" offset="<<offset<<" node_size="<<node_size<<std::endl;
00362             return res;
00363         };
00364 
00365         //TODO: buchstaben, die nicht im orginaltext vorkommen behandeln!!!
00367 
00374         size_type rank(size_type i, value_type c)const {
00375             size_type offset = 0;
00376             uint64_t mask        = (1ULL) << (m_logn-1);
00377             size_type node_size = m_size;
00378             for (uint32_t k=0; k < m_logn and i; ++k) {
00379                 size_type ones_before_o   = m_tree_rank(offset);
00380                 size_type ones_before_i   = m_tree_rank(offset + i) - ones_before_o;
00381                 size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_o;
00382                 if (c & mask) { // search for a one at this level
00383                     offset += (node_size - ones_before_end);
00384                     node_size = ones_before_end;
00385                     i = ones_before_i;
00386                 } else { // search for a zero at this level
00387                     node_size = (node_size - ones_before_end);
00388                     i = (i-ones_before_i);
00389                 }
00390                 offset += m_size;
00391                 mask >>= 1;
00392             }
00393             return i;
00394         };
00395 
00397 
00403         // TODO: was ist wenn c gar nicht vorkommt, oder es keine i Vorkommen gibt?
00404         size_type select(size_type i, value_type c)const {
00405             // possible optimization: if the array is a permutation we can start at the bottom of the tree
00406             size_type offset = 0;
00407             uint64_t mask        = (1ULL) << (m_logn-1);
00408             size_type node_size = m_size;
00409             size_type offsets[m_logn+1];                        // offsets down the path
00410             size_type ones_before_os[m_logn+1];   // ones before the offsets
00411             offsets[0] = ones_before_os[0] = 0;
00412 
00413             for (uint32_t k=0; k < m_logn and node_size; ++k) {
00414                 size_type ones_before_o   = m_tree_rank(offset);
00415                 ones_before_os[k] = ones_before_o;
00416                 size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_o;
00417                 if (c & mask) { // search for a one at this level
00418                     offset += (node_size - ones_before_end);
00419                     node_size = ones_before_end;
00420                 } else { // search for a zero at this level
00421                     node_size = (node_size - ones_before_end);
00422                 }
00423                 offset += m_size;
00424                 offsets[k+1] = offset;
00425                 mask >>= 1;
00426             }
00427             if (node_size < i) {
00428                 std::cerr<<"c="<<c<<" does not occure "<<i<<" times in the WT"<<std::endl;
00429                 return m_size;
00430             }
00431             mask = 1ULL;
00432             for (uint32_t k=m_logn; k>0; --k) {
00433                 offset = offsets[k-1];
00434                 size_type ones_before_o = ones_before_os[k-1];
00435                 if (c & mask) { // right child => search i'th
00436                     i = m_tree_select1(ones_before_o + i) - offset + 1;
00437                 } else { // left child => search i'th zero
00438                     i = m_tree_select0(offset - ones_before_o + i) - offset + 1;
00439                 }
00440                 mask <<= 1;
00441             }
00442             return i-1;
00443         };
00444 
00446 
00453         size_type range_search_2d(size_type lb, size_type rb, value_type vlb, value_type vrb,
00454                                   std::vector<size_type>* idx_result=NULL,
00455                                   std::vector<value_type>* val_result=NULL
00456                                  ) const {
00457             size_type offsets[m_logn+1];
00458             size_type ones_before_os[m_logn+1];
00459             offsets[0] = 0;
00460             if (vrb > (1ULL << m_logn))
00461                 vrb = (1ULL << m_logn);
00462             if (vlb > vrb)
00463                 return 0;
00464             size_type cnt_answers = 0;
00465             _range_search_2d(lb, rb, vlb, vrb, 0, 0, m_size, offsets, ones_before_os, 0, idx_result, val_result, cnt_answers);
00466             return cnt_answers;
00467         }
00468 
00469         // add parameter path
00470         // ilb interval left bound
00471         // irb interval right bound
00472         void _range_search_2d(size_type lb, size_type rb, value_type vlb, value_type vrb, size_type depth,
00473                               size_type ilb, size_type node_size, size_type offsets[], size_type ones_before_os[], size_type path,
00474                               std::vector<size_type>* idx_result, std::vector<size_type>* val_result, size_type& cnt_answers)
00475         const {
00476             if (lb > rb)
00477                 return;
00478             if (depth == m_logn) {
00479                 if (idx_result != NULL) {
00480                     for (size_type j=1; j <= node_size; ++j) {
00481                         size_type i = j;
00482                         size_type c = path;
00483                         for (uint32_t k=m_logn; k>0; --k) {
00484                             size_type offset = offsets[k-1];
00485                             size_type ones_before_o = ones_before_os[k-1];
00486                             if (c&1) {
00487                                 i = m_tree_select1(ones_before_o + i) - offset + 1;
00488                             } else {
00489                                 i = m_tree_select0(offset - ones_before_o + i) - offset + 1;
00490                             }
00491                             c >>= 1;
00492                         }
00493                         idx_result->push_back(i-1); // add resulting index; -1 cause of 0 based indexing
00494                     }
00495                 }
00496                 if (val_result != NULL) {
00497                     for (size_type j=1; j <= node_size; ++j) {
00498                         val_result->push_back(path);
00499                     }
00500                 }
00501                 cnt_answers += node_size;
00502                 return;
00503             }
00504             size_type irb = ilb + (1ULL << (m_logn-depth));
00505             size_type mid = (irb + ilb)>>1;
00506 
00507             size_type offset = offsets[depth];
00508 
00509             size_type ones_before_o             = m_tree_rank(offset);
00510             ones_before_os[depth]               = ones_before_o;
00511             size_type ones_before_lb    = m_tree_rank(offset + lb);
00512             size_type ones_before_rb    = m_tree_rank(offset + rb + 1);
00513             size_type ones_before_end   = m_tree_rank(offset + node_size);
00514             size_type zeros_before_o    = offset - ones_before_o;
00515             size_type zeros_before_lb   = offset + lb - ones_before_lb;
00516             size_type zeros_before_rb   = offset + rb + 1 - ones_before_rb;
00517             size_type zeros_before_end  = offset + node_size - ones_before_end;
00518             if (vlb < mid and mid) {
00519                 size_type nlb                   = zeros_before_lb - zeros_before_o;
00520                 size_type nrb                   = zeros_before_rb - zeros_before_o;
00521                 offsets[depth+1]                = offset + m_size;
00522                 if (nrb)
00523                     _range_search_2d(nlb, nrb-1, vlb, std::min(vrb,mid-1), depth+1, ilb, zeros_before_end - zeros_before_o, offsets, ones_before_os, path<<1, idx_result, val_result, cnt_answers);
00524             }
00525             if (vrb >= mid) {
00526                 size_type nlb                   = ones_before_lb - ones_before_o;
00527                 size_type nrb                   = ones_before_rb - ones_before_o;
00528                 offsets[depth+1]                = offset + m_size + (zeros_before_end - zeros_before_o);
00529                 if (nrb)
00530                     _range_search_2d(nlb, nrb-1, std::max(mid, vlb), vrb, depth+1, mid, ones_before_end - ones_before_o, offsets, ones_before_os, (path<<1)+1 ,idx_result, val_result, cnt_answers);
00531             }
00532         }
00533 
00535         size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const {
00536             structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
00537             size_type written_bytes = 0;
00538             written_bytes += util::write_member(m_size, out, child, "size");
00539             written_bytes += util::write_member(m_sigma, out, child, "sigma");
00540             written_bytes += m_tree.serialize(out, child, "tree");
00541             written_bytes += m_tree_rank.serialize(out, child, "tree_rank");
00542             written_bytes += m_tree_select1.serialize(out, child, "tree_select_1");
00543             written_bytes += m_tree_select0.serialize(out, child, "tree_select_0");
00544             written_bytes += util::write_member(m_logn, out, child, "log_n");
00545             structure_tree::add_size(child, written_bytes);
00546             return written_bytes;
00547         }
00548 
00550         void load(std::istream& in) {
00551             util::read_member(m_size, in);
00552             util::read_member(m_sigma, in);
00553             m_tree.load(in);
00554             m_tree_rank.load(in, &m_tree);
00555             m_tree_select1.load(in, &m_tree);
00556             m_tree_select0.load(in, &m_tree);
00557             util::read_member(m_logn, in);
00558         }
00559         /*
00560                 void print_info()const{
00561                         size_type rle_ed = 0;
00562                         for(size_type i=0; i < m_tree.size(); ++i){
00563 
00564                         }
00565                 }
00566         */
00567 
00568 };
00569 
00570 }// end namespace sds
00571 
00572 #endif // end file