SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2011 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00022 #ifndef SDSL_RRR_VECTOR 00023 #define SDSL_RRR_VECTOR 00024 00025 #include "int_vector.hpp" 00026 #include "bitmagic.hpp" 00027 #include "util.hpp" 00028 #include "rrr_helper.hpp" // for binomial helper class 00029 #include <vector> 00030 #include <algorithm> // for next_permutation 00031 #include <iostream> 00032 00034 namespace sdsl 00035 { 00036 00037 00038 template<uint8_t b=1, uint16_t block_size=15, class wt_type=int_vector<> > // forward declaration needed for friend declaration 00039 class rrr_rank_support; // in rrr_vector 00040 00041 template<uint8_t b=1, uint16_t block_size=15, class wt_type=int_vector<> > // forward declaration needed for friend declaration 00042 class rrr_select_support; // in rrr_vector 00043 00045 00060 template<uint16_t block_size=15, class wt_type=int_vector<> > 00061 class rrr_vector 00062 { 00063 public: 00064 typedef bit_vector::size_type size_type; 00065 typedef bit_vector::value_type value_type; 00066 00067 typedef rrr_rank_support<1, block_size, wt_type> rank_1_type; // typedef for default types for rank and select 00068 typedef rrr_rank_support<0, block_size, wt_type> rank_0_type; 00069 typedef rrr_select_support<1, block_size, wt_type> select_1_type; 00070 typedef rrr_select_support<0, block_size, wt_type> select_0_type; 00071 00072 // template<uint16_t b, uint16_t block_size, class wt_type> Hm, the second and third argument is 00073 // friend class rrr_rank_support; // fixed 00074 friend class rrr_rank_support<0, block_size, wt_type>; 00075 friend class rrr_rank_support<1, block_size, wt_type>; 00076 friend class rrr_select_support<0, block_size, wt_type>; 00077 friend class rrr_select_support<1, block_size, wt_type>; 00078 00079 typedef rrr_helper<block_size> rrr_helper_type; 00080 typedef typename rrr_helper_type::number_type number_type; 00081 00082 enum { rrr_block_size = block_size }; 00083 private: 00084 size_type m_size; // length of the original bit_vector 00085 uint16_t m_sample_rate; 00086 wt_type m_bt; // data structure, which stores the block types (bt). The block type equals the number 00087 // of ones in a block. Another option for this data structure is wt_huff 00088 bit_vector m_btnr; // data structure, which stores the block type numbers of the blocks 00089 int_vector<> m_btnrp; // sample pointers into btnr 00090 int_vector<> m_rank; // sample rank values 00091 bit_vector m_invert; // specifies if a superblock (i.e. sample_rate blocks) have to be considered as inverted 00092 // i.e. 1 and 0 are swapped 00093 00094 void copy(const rrr_vector& rrr) { 00095 m_size = rrr.m_size; 00096 m_sample_rate = rrr.m_sample_rate; 00097 m_bt = rrr.m_bt; 00098 m_btnr = rrr.m_btnr; 00099 m_btnrp = rrr.m_btnrp; 00100 m_rank = rrr.m_rank; 00101 m_invert = rrr.m_invert; 00102 } 00103 00104 public: 00105 const wt_type& bt; 00106 const bit_vector& btnr; 00107 00109 rrr_vector(uint16_t sample_rate=32):m_size(0), m_sample_rate(sample_rate), bt(m_bt), btnr(m_btnr) {}; 00110 00112 rrr_vector(const rrr_vector& rrr):bt(m_bt), btnr(m_btnr) { 00113 copy(rrr); 00114 } 00115 00117 00120 rrr_vector(const bit_vector& bv, uint16_t sample_rate=32): m_sample_rate(sample_rate), bt(m_bt), btnr(m_btnr) { 00121 m_size = bv.size(); 00122 int_vector<> bt_array; 00123 bt_array.set_int_width(bit_magic::l1BP(block_size)+1); 00124 bt_array.resize((m_size+block_size)/((size_type)block_size)); 00125 00126 // (1) calculate the block types and store them in m_bt 00127 size_type pos = 0, i = 0, x; 00128 size_type btnr_pos = 0; 00129 size_type sum_rank = 0; 00130 while (pos + block_size <= m_size) { // handle all blocks full blocks 00131 bt_array[ i++ ] = x = rrr_helper_type::get_bt(bv, pos, block_size); 00132 sum_rank += x; 00133 btnr_pos += rrr_helper_type::space_for_bt(x); 00134 pos += block_size; 00135 } 00136 if (pos < m_size) { // handle last not full block 00137 bt_array[ i++ ] = x = rrr_helper_type::get_bt(bv, pos, m_size - pos); 00138 sum_rank += x; 00139 btnr_pos += rrr_helper_type::space_for_bt(x); 00140 } 00141 util::assign(m_btnr, bit_vector(std::max(btnr_pos, (size_type)64), 0)); // max necessary for case: block_size == 1 00142 util::assign(m_btnrp, int_vector<>((bt_array.size()+m_sample_rate-1)/m_sample_rate, 0, bit_magic::l1BP(btnr_pos)+1)); 00143 util::assign(m_rank, int_vector<>((bt_array.size()+m_sample_rate-1)/m_sample_rate + 1, 0, bit_magic::l1BP(sum_rank)+1)); 00144 util::assign(m_invert, bit_vector((bt_array.size()+m_sample_rate-1)/m_sample_rate, 0)); 00145 00146 // (2) calculate block type numbers and pointers into btnr and rank samples 00147 pos = 0; i = 0; 00148 btnr_pos= 0, sum_rank = 0; 00149 bool invert = false; 00150 while (pos + block_size <= m_size) { // handle all full blocks 00151 if ((i % m_sample_rate) == (size_type)0) { 00152 m_btnrp[ i/m_sample_rate ] = btnr_pos; 00153 m_rank[ i/m_sample_rate ] = sum_rank; 00154 // calculate invert bit for that superblock 00155 if (i+m_sample_rate <= bt_array.size()) { 00156 size_type gt_half_block_size = 0; // counter for blocks greater than half of the blocksize 00157 for (size_type j=i; j < i+m_sample_rate; ++j) { 00158 if (bt_array[j] > block_size/2) 00159 ++gt_half_block_size; 00160 } 00161 if (gt_half_block_size > (m_sample_rate/2)) { 00162 m_invert[ i/m_sample_rate ] = 1; 00163 for (size_type j=i; j < i+m_sample_rate; ++j) { 00164 bt_array[j] = block_size - bt_array[j]; 00165 } 00166 invert = true; 00167 } else { 00168 invert = false; 00169 } 00170 } else { 00171 invert = false; 00172 } 00173 } 00174 uint16_t space_for_bt = rrr_helper_type::space_for_bt(x=bt_array[i++]); 00175 sum_rank += (invert ? (block_size - x) : x); 00176 if (space_for_bt) { 00177 number_type bin = rrr_helper_type::decode_btnr(bv, pos, block_size); 00178 number_type nr = rrr_helper_type::bin_to_nr(bin); 00179 rrr_helper_type::set_bt(m_btnr, btnr_pos, nr, space_for_bt); 00180 } 00181 btnr_pos += space_for_bt; 00182 pos += block_size; 00183 } 00184 if (pos < m_size) { // handle last not full block 00185 if ((i % m_sample_rate) == (size_type)0) { 00186 m_btnrp[ i/m_sample_rate ] = btnr_pos; 00187 m_rank[ i/m_sample_rate ] = sum_rank; 00188 m_invert[ i/m_sample_rate ] = 0; // default: set last block to not inverted 00189 invert = false; 00190 } 00191 uint16_t space_for_bt = rrr_helper_type::space_for_bt(x=bt_array[i++]); 00192 sum_rank += invert ? (block_size - x) : x; 00193 if (space_for_bt) { 00194 number_type bin = rrr_helper_type::decode_btnr(bv, pos, m_size-pos); 00195 number_type nr = rrr_helper_type::bin_to_nr(bin); 00196 rrr_helper_type::set_bt(m_btnr, btnr_pos, nr, space_for_bt); 00197 } 00198 btnr_pos += space_for_bt; 00199 } 00200 // for technical reasons add an additional element to m_rank 00201 m_rank[ m_rank.size()-1 ] = sum_rank; // sum_rank contains the total number of set bits in bv 00202 util::assign(m_bt, bt_array); 00203 } 00204 00206 void swap(rrr_vector& rrr) { 00207 if (this != &rrr) { 00208 std::swap(m_size, rrr.m_size); 00209 std::swap(m_sample_rate, rrr.m_sample_rate); 00210 m_bt.swap(rrr.m_bt); 00211 m_btnr.swap(rrr.m_btnr); 00212 m_btnrp.swap(rrr.m_btnrp); 00213 m_rank.swap(rrr.m_rank); 00214 m_invert.swap(rrr.m_invert); 00215 } 00216 } 00217 00219 00222 value_type operator[](size_type i)const { 00223 size_type bt_idx = i/block_size; 00224 uint16_t bt = m_bt[ bt_idx ]; 00225 size_type sample_pos = bt_idx/m_sample_rate; 00226 if (m_invert[sample_pos]) 00227 bt = block_size - bt; 00228 if (bt == 0 or bt == block_size) { // very effective optimization 00229 return bt>0; 00230 } 00231 uint16_t off = i % block_size; //i - bt_idx*block_size; 00232 size_type btnrp = m_btnrp[ sample_pos ]; 00233 for (size_type j = sample_pos*m_sample_rate; j < bt_idx; ++j) { 00234 btnrp += rrr_helper_type::space_for_bt(m_bt[j]); 00235 } 00236 uint16_t btnrlen = rrr_helper_type::space_for_bt(bt); 00237 number_type btnr = rrr_helper_type::decode_btnr(m_btnr, btnrp, btnrlen); 00238 return rrr_helper_type::decode_bit(bt, btnr, off); 00239 } 00240 00242 rrr_vector& operator=(const rrr_vector& rrr) { 00243 if (this != &rrr) { 00244 copy(rrr); 00245 } 00246 return *this; 00247 } 00248 00250 size_type size()const { 00251 return m_size; 00252 } 00253 00256 size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const { 00257 structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); 00258 size_type written_bytes = 0; 00259 written_bytes += util::write_member(m_size, out, child, "size"); 00260 written_bytes += util::write_member(m_sample_rate, out, child, "sample_rate"); 00261 written_bytes += m_bt.serialize(out, child, "bt"); 00262 written_bytes += m_btnr.serialize(out, child, "btnr"); 00263 written_bytes += m_btnrp.serialize(out, child, "btnrp"); 00264 written_bytes += m_rank.serialize(out, child, "rank_samples"); 00265 written_bytes += m_invert.serialize(out, child, "invert"); 00266 structure_tree::add_size(child, written_bytes); 00267 return written_bytes; 00268 } 00269 00271 void load(std::istream& in) { 00272 util::read_member(m_size, in); 00273 util::read_member(m_sample_rate, in); 00274 m_bt.load(in); 00275 m_btnr.load(in); 00276 m_btnrp.load(in); 00277 m_rank.load(in); 00278 m_invert.load(in); 00279 } 00280 00281 // Print information about the object to stdout. 00282 void print_info()const { 00283 size_type orig_bv_size = m_size; // size of the original bit vector in bits 00284 size_type rrr_size = util::get_size_in_bytes(*this)*8; 00285 size_type bt_size = util::get_size_in_bytes(m_bt)*8; 00286 size_type btnr_size = util::get_size_in_bytes(m_btnr)*8; 00287 size_type btnrp_and_rank_size = util::get_size_in_bytes(m_btnrp)*8 + util::get_size_in_bytes(m_rank)*8 + util::get_size_in_bytes(m_invert)*8; 00288 std::cout << "#block_size\tsample_rate\torig_bv_size\trrr_size\tbt_size\tbtnr_size\tbtnrp_and_rank_size" << std::endl; 00289 std::cout << (int)block_size << "\t" << m_sample_rate << "\t"; 00290 std::cout << orig_bv_size << "\t" << rrr_size << "\t" << bt_size << "\t" << btnr_size << "\t" 00291 << btnrp_and_rank_size << std::endl; 00292 } 00293 }; 00294 00295 template<uint8_t bit_pattern> 00296 struct rrr_rank_support_trait { 00297 typedef bit_vector::size_type size_type; 00298 00299 static size_type adjust_rank(size_type r, size_type n) { 00300 return r; 00301 } 00302 }; 00303 00304 template<> 00305 struct rrr_rank_support_trait<0> { 00306 typedef bit_vector::size_type size_type; 00307 00308 static size_type adjust_rank(size_type r, size_type n) { 00309 return n - r; 00310 } 00311 }; 00312 00314 00323 template< uint8_t b, uint16_t block_size, class wt_type> 00324 class rrr_rank_support 00325 { 00326 public: 00327 typedef rrr_vector<block_size, wt_type> bit_vector_type; 00328 typedef typename bit_vector_type::size_type size_type; 00329 typedef typename bit_vector_type::rrr_helper_type rrr_helper_type; 00330 typedef typename rrr_helper_type::number_type number_type; 00331 00332 private: 00333 const bit_vector_type* m_v; 00334 uint16_t m_sample_rate; 00335 00336 public: 00338 00340 explicit rrr_rank_support(const bit_vector_type* v=NULL) { 00341 init(v); 00342 } 00343 00345 void init(const bit_vector_type* v=NULL) { 00346 set_vector(v); 00347 } 00348 00350 00355 const size_type rank(size_type i)const { 00356 size_type bt_idx = i/block_size; 00357 size_type sample_pos = bt_idx/m_sample_rate; 00358 size_type btnrp = m_v->m_btnrp[ sample_pos ]; 00359 size_type rank = m_v->m_rank[ sample_pos ]; 00360 size_type diff_rank = m_v->m_rank[ sample_pos+1 ] - rank; 00361 if (diff_rank == (size_type)0) { 00362 return rrr_rank_support_trait<b>::adjust_rank(rank, i); 00363 } else if (diff_rank == (size_type)block_size*m_sample_rate) { 00364 return rrr_rank_support_trait<b>::adjust_rank( 00365 rank + i - sample_pos*m_sample_rate*block_size, i); 00366 } 00367 const bool inv = m_v->m_invert[ sample_pos ]; 00368 for (size_type j = sample_pos*m_sample_rate; j < bt_idx; ++j) { 00369 uint16_t r = m_v->m_bt[j]; 00370 rank += (inv ? block_size - r: r); 00371 btnrp += rrr_helper_type::space_for_bt(r); 00372 } 00373 uint16_t off = i % block_size; 00374 if (!off) { // needed for special case: if i=size() is a multiple of block_size 00375 // the access to m_bt would cause a invalid memory access 00376 return rrr_rank_support_trait<b>::adjust_rank(rank, i); 00377 } 00378 uint16_t bt = inv ? block_size - m_v->m_bt[ bt_idx ] : m_v->m_bt[ bt_idx ]; 00379 00380 uint16_t btnrlen = rrr_helper_type::space_for_bt(bt); 00381 number_type btnr = rrr_helper_type::decode_btnr(m_v->m_btnr, btnrp, btnrlen); 00382 uint16_t popcnt = rrr_helper_type::decode_popcount(bt, btnr, off); 00383 return rrr_rank_support_trait<b>::adjust_rank(rank + popcnt, i); 00384 } 00385 00387 const size_type operator()(size_type i)const { 00388 return rank(i); 00389 } 00390 00392 const size_type size()const { 00393 return m_v->size(); 00394 } 00395 00397 void set_vector(const bit_vector_type* v=NULL) { 00398 m_v = v; 00399 if (v != NULL) { 00400 m_sample_rate = m_v->m_sample_rate; 00401 } else { 00402 m_sample_rate = 0; 00403 } 00404 } 00405 00406 rrr_rank_support& operator=(const rrr_rank_support& rs) { 00407 if (this != &rs) { 00408 set_vector(rs.m_v); 00409 m_sample_rate = rs.m_sample_rate; 00410 } 00411 return *this; 00412 } 00413 00414 void swap(rrr_rank_support& rs) { 00415 if (this != &rs) { 00416 std::swap(m_sample_rate, rs.m_sample_rate); 00417 } 00418 } 00419 00420 bool operator==(const rrr_rank_support& rs)const { 00421 if (this == &rs) 00422 return true; 00423 return m_sample_rate == rs.m_sample_rate; 00424 } 00425 00426 bool operator!=(const rrr_rank_support& rs)const { 00427 return !(*this == rs); 00428 } 00429 00431 void load(std::istream& in, const bit_vector_type* v=NULL) { 00432 util::read_member(m_sample_rate, in); 00433 set_vector(v); 00434 } 00435 00437 size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const { 00438 structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); 00439 size_type written_bytes = 0; 00440 written_bytes += util::write_member(m_sample_rate, out, child, "sample_rate"); 00441 structure_tree::add_size(child, written_bytes); 00442 return written_bytes; 00443 } 00444 }; 00445 00446 00448 template< uint8_t b, uint16_t block_size, class wt_type> 00449 class rrr_select_support 00450 { 00451 public: 00452 typedef rrr_vector<block_size, wt_type> bit_vector_type; 00453 typedef typename bit_vector_type::size_type size_type; 00454 typedef typename bit_vector_type::rrr_helper_type rrr_helper_type; 00455 typedef typename rrr_helper_type::number_type number_type; 00456 00457 private: 00458 const bit_vector_type* m_v; 00459 uint16_t m_sample_rate; 00460 00461 size_type select1(size_type i)const { 00462 if (m_v->m_rank[m_v->m_rank.size()-1] < i) 00463 return size(); 00464 // (1) binary search for the answer in the rank_samples 00465 size_type begin=0, end=m_v->m_rank.size()-1; // min included, max excluded 00466 size_type idx, rank; 00467 // invariant: m_rank[end] >= i 00468 // m_rank[begin] < i 00469 while (end-begin > 1) { 00470 idx = (begin+end) >> 1; // idx in [0..m_rank.size()-1] 00471 rank = m_v->m_rank[idx]; 00472 if (rank >= i) 00473 end = idx; 00474 else { // rank < i 00475 begin = idx; 00476 } 00477 } 00478 // (2) linear search between the samples 00479 rank = m_v->m_rank[begin]; // now i>rank 00480 idx = begin * m_sample_rate; // initialize idx for select result 00481 size_type diff_rank = m_v->m_rank[end] - rank; 00482 if (diff_rank == (size_type)block_size*m_sample_rate) {// optimisation for select<1> 00483 return idx*block_size + i-rank -1; 00484 } 00485 const bool inv = m_v->m_invert[ begin ]; 00486 size_type btnrp = m_v->m_btnrp[ begin ]; 00487 uint16_t bt = 0, btnrlen = 0; // temp variables for block_type and space for block type 00488 while (i > rank) { 00489 bt = m_v->m_bt[idx++]; bt = inv ? block_size-bt : bt; 00490 rank += bt; 00491 btnrp += (btnrlen=rrr_helper_type::space_for_bt(bt)); 00492 } 00493 rank -= bt; 00494 number_type btnr = rrr_helper_type::decode_btnr(m_v->m_btnr, btnrp-btnrlen, btnrlen); 00495 return (idx-1) * block_size + rrr_helper_type::decode_select(bt, btnr, i-rank); 00496 } 00497 00498 size_type select0(size_type i)const { 00499 if ((size() - m_v->m_rank[m_v->m_rank.size()-1]) < i) { 00500 // std::cout<<"i="<<i<<" size()-m_v->m_rank[m_v->m_rank.size()-1] = " << m_v->m_rank[m_v->m_rank.size()-1]<< std::endl; 00501 return size(); 00502 } 00503 // (1) binary search for the answer in the rank_samples 00504 size_type begin=0, end=m_v->m_rank.size()-1; // min included, max excluded 00505 size_type idx, rank; 00506 // invariant: m_rank[end] >= i 00507 // m_rank[begin] < i 00508 while (end-begin > 1) { 00509 idx = (begin+end) >> 1; // idx in [0..m_rank.size()-1] 00510 rank = idx*block_size*m_sample_rate - m_v->m_rank[idx]; 00511 if (rank >= i) 00512 end = idx; 00513 else { // rank < i 00514 begin = idx; 00515 } 00516 } 00517 // (2) linear search between the samples 00518 rank = begin*block_size*m_sample_rate - m_v->m_rank[begin]; // now i>rank 00519 idx = begin * m_sample_rate; // initialize idx for select result 00520 if (m_v->m_rank[end] == m_v->m_rank[begin]) { // only for select<0> 00521 return idx*block_size + i-rank -1; 00522 } 00523 const bool inv = m_v->m_invert[ begin ]; 00524 size_type btnrp = m_v->m_btnrp[ begin ]; 00525 uint16_t bt = 0, btnrlen = 0; // temp variables for block_type and space for block type 00526 while (i > rank) { 00527 bt = m_v->m_bt[idx++]; bt = inv ? block_size-bt : bt; 00528 rank += (block_size-bt); 00529 btnrp += (btnrlen=rrr_helper_type::space_for_bt(bt)); 00530 } 00531 rank -= (block_size-bt); 00532 number_type btnr = rrr_helper_type::decode_btnr(m_v->m_btnr, btnrp-btnrlen, btnrlen); 00533 return (idx-1) * block_size + rrr_helper_type::template decode_select_bitpattern<0, 1>(bt, btnr, i-rank); 00534 } 00535 00536 00537 00538 public: 00539 explicit rrr_select_support(const bit_vector_type* v=NULL) { 00540 init(v); 00541 } 00542 00543 void init(const bit_vector_type* v=NULL) { 00544 set_vector(v); 00545 } 00546 00548 size_type select(size_type i)const { 00549 return b ? select1(i) : select0(i); 00550 } 00551 00552 00553 const size_type operator()(size_type i)const { 00554 return select(i); 00555 } 00556 00557 const size_type size()const { 00558 return m_v->size(); 00559 } 00560 00561 void set_vector(const bit_vector_type* v=NULL) { 00562 m_v = v; 00563 if (v != NULL) { 00564 m_sample_rate = m_v->m_sample_rate; 00565 } else { 00566 m_sample_rate = 0; 00567 } 00568 } 00569 00570 rrr_select_support& operator=(const rrr_select_support& rs) { 00571 if (this != &rs) { 00572 set_vector(rs.m_v); 00573 m_sample_rate = rs.m_sample_rate; 00574 } 00575 return *this; 00576 } 00577 00578 void swap(rrr_select_support& rs) { 00579 if (this != &rs) { 00580 std::swap(m_sample_rate, rs.m_sample_rate); 00581 } 00582 } 00583 00584 bool operator==(const rrr_select_support& rs)const { 00585 if (this == &rs) 00586 return true; 00587 return m_sample_rate == rs.m_sample_rate; 00588 } 00589 00590 bool operator!=(const rrr_select_support& rs)const { 00591 return !(*this == rs); 00592 } 00593 00594 00595 void load(std::istream& in, const bit_vector_type* v=NULL) { 00596 util::read_member(m_sample_rate, in); 00597 set_vector(v); 00598 } 00599 00600 size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const { 00601 structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); 00602 size_type written_bytes = 0; 00603 written_bytes += util::write_member(m_sample_rate, out, child, "sample_rate"); 00604 structure_tree::add_size(child, written_bytes); 00605 return written_bytes; 00606 } 00607 }; 00608 00609 }// end namespace sdsl 00610 #include "rrr_vector_15.hpp" // include specialization 00611 00612 #endif