SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2009 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00023 #ifndef INCLUDED_SDSL_INT_WAVELET_TREE 00024 #define INCLUDED_SDSL_INT_WAVELET_TREE 00025 00026 #include "int_vector.hpp" 00027 #include "rank_support_v.hpp" 00028 #include "select_support_mcl.hpp" 00029 #include "bitmagic.hpp" 00030 #include "testutils.hpp" 00031 #include "temp_write_read_buffer.hpp" 00032 #include "util.hpp" 00033 #include <set> // for calculating the alphabet size 00034 #include <map> // for mapping a symbol to its lexicographical index 00035 #include <algorithm> // for std::swap 00036 #include <stdexcept> 00037 #include <vector> 00038 00040 namespace sdsl 00041 { 00042 00044 00062 template<class RandomAccessContainer=int_vector<>, 00063 class BitVector = bit_vector, 00064 class RankSupport = typename BitVector::rank_1_type, 00065 class SelectSupport = typename BitVector::select_1_type, 00066 class SelectSupportZero = typename BitVector::select_0_type> 00067 class wt_int 00068 { 00069 public: 00070 typedef typename RandomAccessContainer::size_type size_type; 00071 typedef typename RandomAccessContainer::value_type value_type; 00072 typedef BitVector bit_vector_type; 00073 typedef RankSupport rank_1_type; 00074 typedef SelectSupport select_1_type; 00075 typedef SelectSupportZero select_0_type; 00076 protected: 00077 size_type m_size; 00078 size_type m_sigma; //<- \f$ |\Sigma| \f$ 00079 bit_vector_type m_tree; // bit vector to store the wavelet tree 00080 rank_1_type m_tree_rank; // rank support for the wavelet tree bit vector 00081 select_1_type m_tree_select1; // select support for the wavelet tree bit vector 00082 select_0_type m_tree_select0; 00083 uint32_t m_logn; 00084 00085 void copy(const wt_int& wt) { 00086 m_size = wt.m_size; 00087 m_sigma = wt.m_sigma; 00088 m_tree = wt.m_tree; 00089 m_tree_rank = wt.m_tree_rank; 00090 m_tree_rank.set_vector(&m_tree); 00091 m_tree_select1 = wt.m_tree_select1; 00092 m_tree_select1.set_vector(&m_tree); 00093 m_tree_select0 = wt.m_tree_select0; 00094 m_tree_select0.set_vector(&m_tree); 00095 m_logn = wt.m_logn; 00096 } 00097 00098 public: 00099 00100 const size_type& sigma; 00101 const bit_vector_type& tree; 00102 00104 wt_int():m_size(0),m_sigma(0), m_logn(0), sigma(m_sigma), tree(m_tree) {}; 00105 00106 00108 00117 wt_int(const RandomAccessContainer& rac, uint32_t logn=0):m_size(rac.size()), m_sigma(0), sigma(m_sigma), tree(m_tree) { 00118 #ifdef SDSL_DEBUG_INT_WAVELET_TREE 00119 std::cerr<<"wt_int construct: size="<<m_size<<std::endl; 00120 #endif 00121 size_type n = m_size; 00122 if (logn == 0) { // detect the biggest value in rac and set logn properly 00123 value_type x = 1; 00124 for (size_type i=0; i < n; ++i) 00125 if (rac[i] > x) 00126 x = rac[i]; 00127 m_logn = bit_magic::l1BP(x)+1; // we need logn bits to represent all values in the range [0..x] 00128 } else { 00129 m_logn = logn; 00130 } 00131 bit_vector tree(n*m_logn, 0); // initialize the tree 00132 00133 int_vector<> perms[2]; 00134 perms[0].set_int_width(m_logn); perms[1].set_int_width(m_logn); 00135 perms[0].resize(n); perms[1].resize(n); 00136 00137 for (size_type i=0; i < n; ++i) { // copy original array to perms[0] 00138 perms[0][i] = rac[i]; 00139 } 00140 00141 size_type tree_pos = 0; 00142 00143 uint64_t mask_old = 1ULL<<(m_logn); 00144 for (uint32_t k=0; k<m_logn; ++k) { 00145 int_vector<>& act_perm = perms[k%2]; 00146 int_vector<>& next_perm = perms[1-(k%2)]; 00147 size_type start = 0; 00148 const uint64_t mask_new = 1ULL<<(m_logn-k-1); 00149 do { 00150 size_type i = start; 00151 size_type cnt1 = 0; 00152 uint64_t start_value = (act_perm[i]&mask_old); 00153 uint64_t x; 00154 while (i < n and((x=act_perm[i])&mask_old)==start_value) { 00155 if (x&mask_new) { 00156 ++cnt1; tree[tree_pos] = 1; 00157 } 00158 ++tree_pos; 00159 ++i; 00160 } 00161 size_type cnt0 = i-start-cnt1; 00162 if (k+1 < m_logn) { // inner node 00163 size_type idxs[2] = {start, start+cnt0}; 00164 for (i=start, start = start+cnt0+cnt1; i < start; ++i) { 00165 next_perm[idxs[(act_perm[i]&mask_new)>0 ]++] = act_perm[i]; 00166 } 00167 } else { // leaf node 00168 start += cnt0+cnt1; 00169 ++m_sigma; // increase sigma for each leaf 00170 } 00171 } while (start < n); 00172 mask_old += mask_new; 00173 } 00174 #ifdef SDSL_DEBUG_INT_WAVELET_TREE 00175 if (tree.size()<100) { 00176 std::cerr<<"tree="<<tree<<std::endl; 00177 } 00178 #endif 00179 util::assign(m_tree, tree); 00180 util::init_support(m_tree_rank, &m_tree); 00181 util::init_support(m_tree_select0, &m_tree); 00182 util::init_support(m_tree_select1, &m_tree); 00183 } 00184 00186 00195 template<uint8_t int_width> 00196 wt_int(int_vector_file_buffer<int_width>& buf, uint32_t logn=0, std::string dir="./") 00197 : m_size(0),m_sigma(0), m_logn(0), sigma(m_sigma), tree(m_tree) { 00198 construct(buf, logn, dir); 00199 } 00200 00201 00202 // TODO external construction method which occupies only additional constant memory 00203 template<uint8_t int_width> 00204 void construct(int_vector_file_buffer<int_width>& buf, uint32_t logn=0, std::string dir="./") { 00205 buf.reset(); 00206 size_type n = buf.int_vector_size; // set n 00207 m_size = n; // set sigma and size 00208 m_sigma = 0; 00209 temp_write_read_buffer<> buf1(5000000, buf.int_width, dir); // buffer for elements in the rigth node 00210 int_vector<int_width> rac(n, 0, buf.int_width); // initialze rac 00211 00212 value_type x = 1; // variable for the biggest value in rac 00213 for (size_type i=0,r=0,r_sum=0; i<n;) { // detect the biggest value in rac 00214 for (; i < r+r_sum; ++i) { 00215 if (buf[i-r_sum] > x) 00216 x = buf[i-r_sum]; 00217 rac[i] = buf[i-r_sum]; 00218 } 00219 r_sum += r; r = buf.load_next_block(); 00220 } 00221 00222 if (logn == 0) { 00223 m_logn = bit_magic::l1BP(x)+1; // we need logn bits to represent all values in the range [0..x] 00224 } else { 00225 m_logn = logn; 00226 } 00227 std::string tree_out_buf_file_name = (dir+"m_tree"+util::to_string(util::get_pid())+"_"+util::to_string(util::get_id())); 00228 std::ofstream tree_out_buf(tree_out_buf_file_name.c_str(), 00229 std::ios::binary | std::ios::trunc | std::ios::out); // open buffer for tree 00230 size_type bit_size = n*m_logn; 00231 tree_out_buf.write((char*) &bit_size, sizeof(bit_size)); // write size of bit_vector 00232 00233 size_type tree_pos = 0; 00234 uint64_t tree_word = 0; 00235 00236 uint64_t mask_old = 1ULL<<(m_logn); 00237 for (uint32_t k=0; k<m_logn; ++k) { 00238 size_type start = 0; 00239 const uint64_t mask_new = 1ULL<<(m_logn-k-1); 00240 do { 00241 buf1.reset(); 00242 size_type i = start; 00243 size_type cnt0 = 0; 00244 uint64_t start_value = (rac[i]&mask_old); 00245 uint64_t x; 00246 while (i < n and((x=rac[i])&mask_old)==start_value) { 00247 if (x&mask_new) { 00248 tree_word |= (1ULL << (tree_pos&0x3FULL)); 00249 buf1 << x; 00250 } else { 00251 rac[start + cnt0++ ] = x; 00252 } 00253 ++tree_pos; 00254 if ((tree_pos & 0x3FULL) == 0) { // if tree_pos % 64 == 0 write old word 00255 tree_out_buf.write((char*) &tree_word, sizeof(tree_word)); 00256 tree_word = 0; 00257 } 00258 ++i; 00259 } 00260 buf1.write_close(); 00261 size_type cnt1 = i-start-cnt0; 00262 if (k+1 < m_logn) { // inner node 00263 for (i=start + cnt0, start = start+cnt0+cnt1; i < start; ++i) { 00264 buf1 >> x; 00265 rac[ i ] = x; 00266 } 00267 } else { // leaf node 00268 start += cnt0+cnt1; 00269 ++m_sigma; // increase sigma for each leaf 00270 } 00271 00272 } while (start < n); 00273 mask_old += mask_new; 00274 } 00275 if ((tree_pos & 0x3FULL) != 0) { // if tree_pos % 64 > 0 => there are remaining entries we have to write 00276 tree_out_buf.write((char*) &tree_word, sizeof(tree_word)); 00277 } 00278 tree_out_buf.close(); 00279 rac.resize(0); 00280 bit_vector tree; 00281 util::load_from_file(tree, tree_out_buf_file_name.c_str()); 00282 std::remove(tree_out_buf_file_name.c_str()); 00283 #ifdef SDSL_DEBUG_INT_WAVELET_TREE 00284 if (tree.size()<100) { 00285 std::cerr<<"tree="<<tree<<std::endl; 00286 } 00287 #endif 00288 util::assign(m_tree, tree); 00289 util::init_support(m_tree_rank, &m_tree); 00290 util::init_support(m_tree_select0, &m_tree); 00291 util::init_support(m_tree_select1, &m_tree); 00292 } 00293 00295 wt_int(const wt_int& wt):sigma(m_sigma) { 00296 copy(wt); 00297 } 00298 00300 wt_int& operator=(const wt_int& wt) { 00301 if (this != &wt) { 00302 copy(wt); 00303 } 00304 return *this; 00305 } 00306 00308 void swap(wt_int& wt) { 00309 if (this != &wt) { 00310 std::swap(m_size, wt.m_size); 00311 std::swap(m_sigma, wt.m_sigma); 00312 m_tree.swap(wt.m_tree); 00313 m_tree_rank.swap(wt.m_tree_rank); // rank swap after the swap of the bit vector m_tree 00314 m_tree_rank.set_vector(&m_tree); 00315 m_tree_select1.swap(wt.m_tree_select1); // select1 swap after the swap of the bit vector m_tree 00316 m_tree_select1.set_vector(&m_tree); 00317 m_tree_select0.swap(wt.m_tree_select0); // select0 swap after the swap of the bit vector m_tree 00318 m_tree_select0.set_vector(&m_tree); 00319 std::swap(m_logn, wt.m_logn); 00320 } 00321 } 00322 00324 size_type size()const { 00325 return m_size; 00326 } 00327 00329 bool empty()const { 00330 return m_size == 0; 00331 } 00332 00334 00337 value_type operator[](size_type i)const { 00338 // size_type zeros_left_of_pos = 0; 00339 size_type offset = 0; 00340 value_type res = 0; 00341 size_type node_size = m_size; 00342 for (uint32_t k=0; k < m_logn; ++k) { 00343 res <<= 1; 00344 size_type ones_before_o = m_tree_rank(offset); 00345 size_type ones_before_i = m_tree_rank(offset + i) - ones_before_o; 00346 size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_o; 00347 // std::cerr<<"k="<<k<<" i="<<i<<" offset="<<offset<<" node_size="<<node_size<<" obo="<<ones_before_o<<" obi="<<ones_before_i<<" obe="<<ones_before_end<<std::endl; 00348 if (m_tree[offset+i]) { // one at position i => follow right child 00349 // zeros_left_of_pos += (node_size - ones_before_end); 00350 offset += (node_size - ones_before_end); 00351 node_size = ones_before_end; 00352 i = ones_before_i; 00353 res |= 1; 00354 } else { // zero at position i => follow left child 00355 node_size = (node_size - ones_before_end); 00356 i = (i-ones_before_i); 00357 } 00358 //offset = (k+1)*m_size + zeros_left_of_pos; 00359 offset += m_size; 00360 } 00361 // std::cerr<<" offset="<<offset<<" node_size="<<node_size<<std::endl; 00362 return res; 00363 }; 00364 00365 //TODO: buchstaben, die nicht im orginaltext vorkommen behandeln!!! 00367 00374 size_type rank(size_type i, value_type c)const { 00375 size_type offset = 0; 00376 uint64_t mask = (1ULL) << (m_logn-1); 00377 size_type node_size = m_size; 00378 for (uint32_t k=0; k < m_logn and i; ++k) { 00379 size_type ones_before_o = m_tree_rank(offset); 00380 size_type ones_before_i = m_tree_rank(offset + i) - ones_before_o; 00381 size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_o; 00382 if (c & mask) { // search for a one at this level 00383 offset += (node_size - ones_before_end); 00384 node_size = ones_before_end; 00385 i = ones_before_i; 00386 } else { // search for a zero at this level 00387 node_size = (node_size - ones_before_end); 00388 i = (i-ones_before_i); 00389 } 00390 offset += m_size; 00391 mask >>= 1; 00392 } 00393 return i; 00394 }; 00395 00397 00403 // TODO: was ist wenn c gar nicht vorkommt, oder es keine i Vorkommen gibt? 00404 size_type select(size_type i, value_type c)const { 00405 // possible optimization: if the array is a permutation we can start at the bottom of the tree 00406 size_type offset = 0; 00407 uint64_t mask = (1ULL) << (m_logn-1); 00408 size_type node_size = m_size; 00409 size_type offsets[m_logn+1]; // offsets down the path 00410 size_type ones_before_os[m_logn+1]; // ones before the offsets 00411 offsets[0] = ones_before_os[0] = 0; 00412 00413 for (uint32_t k=0; k < m_logn and node_size; ++k) { 00414 size_type ones_before_o = m_tree_rank(offset); 00415 ones_before_os[k] = ones_before_o; 00416 size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_o; 00417 if (c & mask) { // search for a one at this level 00418 offset += (node_size - ones_before_end); 00419 node_size = ones_before_end; 00420 } else { // search for a zero at this level 00421 node_size = (node_size - ones_before_end); 00422 } 00423 offset += m_size; 00424 offsets[k+1] = offset; 00425 mask >>= 1; 00426 } 00427 if (node_size < i) { 00428 std::cerr<<"c="<<c<<" does not occure "<<i<<" times in the WT"<<std::endl; 00429 return m_size; 00430 } 00431 mask = 1ULL; 00432 for (uint32_t k=m_logn; k>0; --k) { 00433 offset = offsets[k-1]; 00434 size_type ones_before_o = ones_before_os[k-1]; 00435 if (c & mask) { // right child => search i'th 00436 i = m_tree_select1(ones_before_o + i) - offset + 1; 00437 } else { // left child => search i'th zero 00438 i = m_tree_select0(offset - ones_before_o + i) - offset + 1; 00439 } 00440 mask <<= 1; 00441 } 00442 return i-1; 00443 }; 00444 00446 00453 size_type range_search_2d(size_type lb, size_type rb, value_type vlb, value_type vrb, 00454 std::vector<size_type>* idx_result=NULL, 00455 std::vector<value_type>* val_result=NULL 00456 ) const { 00457 size_type offsets[m_logn+1]; 00458 size_type ones_before_os[m_logn+1]; 00459 offsets[0] = 0; 00460 if (vrb > (1ULL << m_logn)) 00461 vrb = (1ULL << m_logn); 00462 if (vlb > vrb) 00463 return 0; 00464 size_type cnt_answers = 0; 00465 _range_search_2d(lb, rb, vlb, vrb, 0, 0, m_size, offsets, ones_before_os, 0, idx_result, val_result, cnt_answers); 00466 return cnt_answers; 00467 } 00468 00469 // add parameter path 00470 // ilb interval left bound 00471 // irb interval right bound 00472 void _range_search_2d(size_type lb, size_type rb, value_type vlb, value_type vrb, size_type depth, 00473 size_type ilb, size_type node_size, size_type offsets[], size_type ones_before_os[], size_type path, 00474 std::vector<size_type>* idx_result, std::vector<size_type>* val_result, size_type& cnt_answers) 00475 const { 00476 if (lb > rb) 00477 return; 00478 if (depth == m_logn) { 00479 if (idx_result != NULL) { 00480 for (size_type j=1; j <= node_size; ++j) { 00481 size_type i = j; 00482 size_type c = path; 00483 for (uint32_t k=m_logn; k>0; --k) { 00484 size_type offset = offsets[k-1]; 00485 size_type ones_before_o = ones_before_os[k-1]; 00486 if (c&1) { 00487 i = m_tree_select1(ones_before_o + i) - offset + 1; 00488 } else { 00489 i = m_tree_select0(offset - ones_before_o + i) - offset + 1; 00490 } 00491 c >>= 1; 00492 } 00493 idx_result->push_back(i-1); // add resulting index; -1 cause of 0 based indexing 00494 } 00495 } 00496 if (val_result != NULL) { 00497 for (size_type j=1; j <= node_size; ++j) { 00498 val_result->push_back(path); 00499 } 00500 } 00501 cnt_answers += node_size; 00502 return; 00503 } 00504 size_type irb = ilb + (1ULL << (m_logn-depth)); 00505 size_type mid = (irb + ilb)>>1; 00506 00507 size_type offset = offsets[depth]; 00508 00509 size_type ones_before_o = m_tree_rank(offset); 00510 ones_before_os[depth] = ones_before_o; 00511 size_type ones_before_lb = m_tree_rank(offset + lb); 00512 size_type ones_before_rb = m_tree_rank(offset + rb + 1); 00513 size_type ones_before_end = m_tree_rank(offset + node_size); 00514 size_type zeros_before_o = offset - ones_before_o; 00515 size_type zeros_before_lb = offset + lb - ones_before_lb; 00516 size_type zeros_before_rb = offset + rb + 1 - ones_before_rb; 00517 size_type zeros_before_end = offset + node_size - ones_before_end; 00518 if (vlb < mid and mid) { 00519 size_type nlb = zeros_before_lb - zeros_before_o; 00520 size_type nrb = zeros_before_rb - zeros_before_o; 00521 offsets[depth+1] = offset + m_size; 00522 if (nrb) 00523 _range_search_2d(nlb, nrb-1, vlb, std::min(vrb,mid-1), depth+1, ilb, zeros_before_end - zeros_before_o, offsets, ones_before_os, path<<1, idx_result, val_result, cnt_answers); 00524 } 00525 if (vrb >= mid) { 00526 size_type nlb = ones_before_lb - ones_before_o; 00527 size_type nrb = ones_before_rb - ones_before_o; 00528 offsets[depth+1] = offset + m_size + (zeros_before_end - zeros_before_o); 00529 if (nrb) 00530 _range_search_2d(nlb, nrb-1, std::max(mid, vlb), vrb, depth+1, mid, ones_before_end - ones_before_o, offsets, ones_before_os, (path<<1)+1 ,idx_result, val_result, cnt_answers); 00531 } 00532 } 00533 00535 size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const { 00536 structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); 00537 size_type written_bytes = 0; 00538 written_bytes += util::write_member(m_size, out, child, "size"); 00539 written_bytes += util::write_member(m_sigma, out, child, "sigma"); 00540 written_bytes += m_tree.serialize(out, child, "tree"); 00541 written_bytes += m_tree_rank.serialize(out, child, "tree_rank"); 00542 written_bytes += m_tree_select1.serialize(out, child, "tree_select_1"); 00543 written_bytes += m_tree_select0.serialize(out, child, "tree_select_0"); 00544 written_bytes += util::write_member(m_logn, out, child, "log_n"); 00545 structure_tree::add_size(child, written_bytes); 00546 return written_bytes; 00547 } 00548 00550 void load(std::istream& in) { 00551 util::read_member(m_size, in); 00552 util::read_member(m_sigma, in); 00553 m_tree.load(in); 00554 m_tree_rank.load(in, &m_tree); 00555 m_tree_select1.load(in, &m_tree); 00556 m_tree_select0.load(in, &m_tree); 00557 util::read_member(m_logn, in); 00558 } 00559 /* 00560 void print_info()const{ 00561 size_type rle_ed = 0; 00562 for(size_type i=0; i < m_tree.size(); ++i){ 00563 00564 } 00565 } 00566 */ 00567 00568 }; 00569 00570 }// end namespace sds 00571 00572 #endif // end file