SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2009 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00021 #ifndef INCLUDED_SDSL_BP_SUPPORT_J 00022 #define INCLUDED_SDSL_BP_SUPPORT_J 00023 00024 #include "int_vector.hpp" 00025 #include "rank_support.hpp" 00026 #include "select_support.hpp" 00027 #include "algorithms.hpp" 00028 #include <stack> 00029 #include <map> 00030 #include <set> 00031 #include <utility> 00032 #include <iostream> 00033 #include <sstream> // for get_info method 00034 #include <fstream> 00035 00036 namespace sdsl 00037 { 00038 00040 00044 template<class RankSupport = rank_support_v<>, class SelectSupport = select_support_mcl<1> > 00045 class bp_support_j 00046 { 00047 public: 00048 typedef bit_vector::size_type size_type; 00049 private: 00050 const bit_vector* m_bp; // the supported balanced parentheses sequence as bit_vector 00051 RankSupport m_rank; // a rank dictionary for calculation of excess 00052 SelectSupport m_select; // additional select dictionary 00053 00054 bit_vector m_pioneer_bitmap; // bitmap for pioneer positions 00055 RankSupport m_rank_pioneer_bitmap; // 00056 int_vector<> m_pioneer; // 00057 00058 bit_vector m_enclose_pioneer_bitmap; // bitmap for enclose pioneer positions 00059 RankSupport m_rank_enclose_pioneer_bitmap; // 00060 int_vector<> m_enclose_pioneer; // 00061 00062 uint32_t m_block_size; 00063 uint32_t m_blocks; // number of blocks 00064 size_type m_size; 00065 00066 00067 // TODO: implement this space efficient!!! with the sorted_stack_support!!! 2009-12-03 00068 //TODO: replace this by a call of algorithm::calculate_pioneer... 00069 void calculate_pioneers_bitmap() { 00070 std::map<size_type, size_type> pioneer_matches; 00071 m_pioneer_bitmap.resize(m_size); // resize pioneers bitmap 00072 util::set_zero_bits(m_pioneer_bitmap); // initialize bitmap with zeros 00073 00074 std::map<size_type, size_type> pioneer_matches_for_enclose; 00075 m_enclose_pioneer_bitmap.resize(m_size); // resize pioneers bitmap for enclose 00076 util::set_zero_bits(m_enclose_pioneer_bitmap); // initialize bitmap with zeros 00077 00078 // algorithm::calculate_pioneers_bitmap(*m_bp, m_block_size, m_pioneer_bitmap); 00079 // algorithm::calculate_matches_for_pioneers(*m_bp, m_pioneer_bitmap, m_pioneer); 00080 00081 std::stack<size_type> opening_parenthesis; 00082 00083 // calculate positions of findclose and findopen pioneers 00084 for (size_type block_nr = 0; block_nr < m_blocks; ++block_nr) { 00085 std::map<size_type, size_type> block_and_position; // for find_open and find_close 00086 std::map<size_type, size_type> matching_position; // for find_open and find_close 00087 std::map<size_type, size_type> block_and_position_for_enclose; // for enclose 00088 std::map<size_type, size_type> matching_position_for_enclose; // for enclose 00089 for (size_type i=0, j=block_nr*m_block_size; i < m_block_size and j < m_size; ++i, ++j) { 00090 if ((*m_bp)[j]) {//opening parenthesis 00091 if (!opening_parenthesis.empty()) { 00092 size_type position = opening_parenthesis.top(); 00093 size_type blockpos = position/m_block_size; 00094 // if( block_and_position_for_enclose.find(blockpos) == block_and_position_for_enclose.end() ){ // smallest j is pioneer 00095 block_and_position_for_enclose[blockpos] = position; 00096 matching_position_for_enclose[blockpos] = j; 00097 // } 00098 } 00099 opening_parenthesis.push(j); 00100 } else { // closing parenthesis 00101 size_type position = opening_parenthesis.top(); 00102 size_type blockpos = position/m_block_size; 00103 opening_parenthesis.pop(); 00104 block_and_position[blockpos] = position; 00105 matching_position[blockpos] = j; // greatest j is pioneer 00106 } 00107 } 00108 for (std::map<size_type, size_type>::const_iterator it = block_and_position.begin(), 00109 end = block_and_position.end(), 00110 mit = matching_position.begin(); it != end and it->first != block_nr; ++it, ++mit) { 00111 // opening and closing pioneers are symmetric 00112 m_pioneer_bitmap[it->second] = 1; 00113 pioneer_matches[it->second] = mit->second; 00114 m_pioneer_bitmap[mit->second] = 1; 00115 pioneer_matches[mit->second] = it->second; 00116 } 00117 for (std::map<size_type, size_type>::const_iterator it = block_and_position_for_enclose.begin(), 00118 end = block_and_position_for_enclose.end(), 00119 mit = matching_position_for_enclose.begin(); it != end and it->first != block_nr; ++it, ++mit) { 00120 m_enclose_pioneer_bitmap[mit->second] = 1; 00121 pioneer_matches_for_enclose[mit->second] = it->second ; 00122 } 00123 00124 } 00125 // assert that the sequence is balanced 00126 assert(opening_parenthesis.empty()); 00127 // store matching positions of pioneers 00128 m_pioneer.set_int_width(bit_magic::l1BP(m_size)+1); 00129 m_pioneer.resize(pioneer_matches.size()); 00130 size_type cnt=0; 00131 for (std::map<size_type, size_type>::const_iterator mit = pioneer_matches.begin(); mit!= pioneer_matches.end(); ++mit) { 00132 m_pioneer[cnt++] = mit->second; 00133 } 00134 00135 // initialize the rank dictionary for the pioneer bitmap 00136 util::init_support(m_rank_pioneer_bitmap, &m_pioneer_bitmap); 00137 00138 // store matching positions of enclose pioneers 00139 m_enclose_pioneer.set_int_width(bit_magic::l1BP(m_size)+1); 00140 m_enclose_pioneer.resize(pioneer_matches_for_enclose.size()); 00141 cnt = 0; 00142 for (std::map<size_type, size_type>::const_iterator mit = pioneer_matches_for_enclose.begin(); mit != pioneer_matches_for_enclose.end(); ++mit) { 00143 m_enclose_pioneer[cnt++] = mit->second; 00144 } 00145 // initialize the rank dictionary for the enclose pioneer bitmap 00146 util::init_support(m_rank_enclose_pioneer_bitmap, &m_enclose_pioneer_bitmap); 00147 00148 /* if(m_size<120 and m_size>0){ 00149 std::cerr<<"bp"<<std::endl; 00150 for(size_type i=0; i<m_size; ++i){ std::cerr<<(*m_bp)[i]; } 00151 std::cerr<<std::endl<<"pioneer bitmap"<<std::endl; 00152 for(size_type i=0; i<m_size; ++i){ std::cerr<<m_pioneer_bitmap[i]; } 00153 std::cerr<<std::endl; 00154 for(std::map<size_type, size_type>::const_iterator mit = pioneer_matches.begin(); mit!= pioneer_matches.end(); ++mit){ 00155 std::cerr<<"_"<<mit->first<<" "<<mit->second<<std::endl; 00156 } 00157 for(size_type i=0;i<m_pioneer.size();++i){ 00158 std::cerr<<"__"<<i<<" "<<m_pioneer[i]<<std::endl; 00159 } 00160 } 00161 */ 00162 } 00163 00164 void copy(const bp_support_j& bp_support) { 00165 m_bp = bp_support.m_bp; 00166 m_block_size = bp_support.m_block_size; 00167 m_size = bp_support.m_size; 00168 m_blocks = bp_support.m_blocks; 00169 00170 m_rank = bp_support.m_rank; 00171 m_rank.set_vector(m_bp); 00172 00173 m_select = bp_support.m_select; 00174 m_select.set_vector(m_bp); 00175 00176 m_pioneer_bitmap = bp_support.m_pioneer_bitmap; 00177 m_rank_pioneer_bitmap = bp_support.m_rank_pioneer_bitmap; 00178 m_rank_pioneer_bitmap.set_vector(&m_pioneer_bitmap); 00179 m_pioneer = bp_support.m_pioneer; 00180 00181 m_enclose_pioneer_bitmap = bp_support.m_enclose_pioneer_bitmap; 00182 m_rank_enclose_pioneer_bitmap = bp_support.m_rank_enclose_pioneer_bitmap; 00183 m_rank_enclose_pioneer_bitmap.set_vector(&m_enclose_pioneer_bitmap); 00184 m_enclose_pioneer = bp_support.m_enclose_pioneer; 00185 } 00186 00187 public: 00188 00190 bp_support_j(const bit_vector* bp = NULL, uint32_t ignore=0/*, uint32_t used_block_size = 64*/):m_bp(bp), m_block_size(64/*block_size==0?used_block_size:block_size*/), m_size(bp==NULL?0:bp->size()) { 00191 // assert(m_block_size > 0 and m_block_size <= 8192 and m_block_size%64 == 0); 00192 m_blocks = (m_size+m_block_size-1)/m_block_size; 00193 util::init_support(m_rank, m_bp); 00194 util::init_support(m_select, m_bp); 00195 calculate_pioneers_bitmap(); 00196 } 00197 00199 bp_support_j(const bp_support_j& bp_support) { 00200 copy(bp_support); 00201 } 00202 00204 bp_support_j& operator=(const bp_support_j& bp_support) { 00205 if (this != &bp_support) { 00206 copy(bp_support); 00207 } 00208 return *this; 00209 } 00210 00211 void set_vector(const bit_vector* bp) { 00212 m_bp = bp; 00213 m_rank.set_vector(bp); 00214 m_select.set_vector(bp); 00215 } 00216 00220 size_type excess(size_type i)const { 00221 return (m_rank(i+1)<<1)-i-1; 00222 } 00223 00227 size_type rank(size_type i)const { 00228 return m_rank(i+1); 00229 } 00230 00236 size_type select(size_type i)const { 00237 return m_select(i); 00238 } 00239 00246 size_type find_close(size_type i)const { 00247 #ifdef SDSL_DEBUG_BP 00248 if (i >= m_size) { 00249 throw std::out_of_range("OUT_OF_RANGE: bp_support_j::find_close"); 00250 } 00251 #endif 00252 if (!(*m_bp)[i]) {// if there is a closing parenthesis at index i return i 00253 return i; 00254 } 00255 if (m_pioneer_bitmap[i]) { 00256 return m_pioneer[m_rank_pioneer_bitmap.rank(i)]; 00257 } else { 00258 size_type ip1 = i+1; // move one position to the left 00259 uint64_t byte_prefix_sums_x_2; 00260 const uint64_t* data = m_bp->data() + (ip1>>6); 00261 uint8_t offset = ip1&0x3F; 00262 uint64_t w = (~(*data))>>offset; // add $offset$ opening parenthesis at the beginning of the word 00263 00264 uint8_t pos = bit_magic::first_excess_position(w, 1, byte_prefix_sums_x_2); 00265 if (pos != 64) {// found position of matching parenthesis 00266 return ip1+pos; 00267 } else { // the excess value at the end of w is lower or equal zero 00268 assert((i%64) != 0); 00269 offset = (offset + 63)&0x3F; 00270 // search the pioneer bitmap for the closesd preceding pioneer 00271 pos = bit_magic::l1BP(*(m_pioneer_bitmap.data() + ((i-1)>>6)) & *(m_bp->data() + ((i-1)>>6)) & bit_magic::Li1Mask[offset]); 00272 size_type pioneer_index = ((i-1)&0xFFFFFFFFFFFFFFC0ULL)+pos; 00273 assert(m_pioneer_bitmap[pioneer_index] == 1); 00274 size_type match_index = m_pioneer[m_rank_pioneer_bitmap.rank(pioneer_index)]; 00275 assert((match_index&0xFFFFFFFFFFFFFFC0ULL) > i); 00276 uint8_t excess_difference = excess((match_index&0xFFFFFFFFFFFFFFC0ULL)-1)-(excess(i)-1); 00277 assert(excess_difference <=63); 00278 return (match_index&0xFFFFFFFFFFFFFFC0ULL) 00279 + bit_magic::first_excess_position(~*(m_bp->data()+(match_index>>6)) , excess_difference, byte_prefix_sums_x_2); 00280 } 00281 } 00282 } 00283 00285 00290 size_type find_open(size_type i)const { 00291 #ifdef SDSL_DEBUG_BP 00292 if (i >= m_size) { 00293 throw std::out_of_range("OUT_OF_RANGE: bp_support_j::find_open"); 00294 } 00295 #endif 00296 if ((*m_bp)[i]) {// if there is a opening parenthesis at index i return i 00297 return i; 00298 } 00299 if (m_pioneer_bitmap[i]) { 00300 return m_pioneer[m_rank_pioneer_bitmap.rank(i)]; 00301 } else { 00302 size_type im1 = i-1; // move one position to the right 00303 const uint64_t* data = m_bp->data() + (im1>>6); 00304 uint8_t close_parenthesis_index = (i&0x3F) + ((i==0)<<6); 00305 uint8_t pos = bit_magic::find_open(*data, close_parenthesis_index); 00306 if (pos!=64) { // found position of the matching parenthesis 00307 return (im1&0xFFFFFFFFFFFFFFC0ULL)+pos; 00308 } else { 00309 assert((i%64)!=63); 00310 // search the pioneer bitmap for the closest succeeding pioneer 00311 pos = bit_magic::r1BP(*(m_pioneer_bitmap.data() + ((i+1)>>6)) & ~*(m_bp->data() + ((i+1)>>6)) & bit_magic::Li0Mask[i&0x3F]); 00312 size_type pioneer_index = ((i+1)&0xFFFFFFFFFFFFFFC0ULL)+pos; 00313 assert(m_pioneer_bitmap[pioneer_index] == 1); 00314 size_type match_index = m_pioneer[m_rank_pioneer_bitmap.rank(pioneer_index)]; 00315 assert(match_index < i); 00316 int8_t excess_difference = excess(i); 00317 if (match_index >= 64) { 00318 excess_difference -= excess((match_index&0xFFFFFFFFFFFFFFC0ULL)-1); 00319 } 00320 assert(excess_difference >=-64 and excess_difference <= 64); 00321 uint64_t dummy; 00322 if (excess_difference >= 0) { 00323 return (match_index&0xFFFFFFFFFFFFFFC0ULL) 00324 + bit_magic::last_excess_position(*(m_bp->data()+(match_index>>6)), excess_difference, dummy) + 1; 00325 } else { 00326 return (match_index&0xFFFFFFFFFFFFFFC0ULL) 00327 + bit_magic::last_excess_position(~*(m_bp->data()+(match_index>>6)), -excess_difference, dummy) + 1; 00328 } 00329 } 00330 } 00331 } 00332 00334 00338 size_type enclose(size_type i)const { 00339 #ifdef SDSL_DEBUG_BP 00340 if (i >= m_size) { 00341 throw std::out_of_range("OUT_OF_RANGE: bp_support_j::enclose."); 00342 } 00343 #endif 00344 if (!(*m_bp)[i]) { // if there is closing parenthesis at position i 00345 throw std::logic_error("LOGIC_ERROR: bp_support_j::enclose. A opening parenthesis is expected as argument."); 00346 } 00347 if (m_enclose_pioneer_bitmap[i]) { 00348 return m_enclose_pioneer[m_rank_enclose_pioneer_bitmap.rank(i)]; 00349 } else { 00350 if (i==0) 00351 return size(); 00352 size_type im1 = i-1; // move one position to the right 00353 const uint64_t* data = m_bp->data() + (im1>>6); 00354 uint8_t open_parenthesis_index = (i&0x3F) + ((i==0)<<6); 00355 uint8_t pos = bit_magic::find_open(*data, open_parenthesis_index); 00356 if (pos!=64) { 00357 return (im1&0xFFFFFFFFFFFFFFC0ULL)+pos; 00358 } else { 00359 assert((i%64)!= 63); 00360 // search the pioneer bitmap for the closest succeeding pioneer 00361 uint64_t w = *(m_enclose_pioneer_bitmap.data() + ((i+1)>>6)) & bit_magic::Li0Mask[i&0x3F]; 00362 if (w) { 00363 pos = bit_magic::r1BP(*(m_enclose_pioneer_bitmap.data() + ((i+1)>>6)) & bit_magic::Li0Mask[i&0x3F]); 00364 size_type pioneer_index = ((i+1)&0xFFFFFFFFFFFFFFC0ULL)+pos; 00365 assert(m_enclose_pioneer_bitmap[pioneer_index] == 1); 00366 size_type match_index = m_enclose_pioneer[m_rank_enclose_pioneer_bitmap.rank(pioneer_index)]; 00367 assert(match_index < i); 00368 int8_t excess_difference = excess(i)-2; 00369 if (match_index >= 64) { 00370 excess_difference -= excess((match_index&0xFFFFFFFFFFFFFFC0ULL)-1); 00371 } 00372 assert(excess_difference >=-64 and excess_difference <= 64); 00373 uint64_t dummy; 00374 if (excess_difference >= 0) { 00375 pos = bit_magic::last_excess_position(*(m_bp->data()+(match_index>>6)), excess_difference, dummy); 00376 if (pos==64) 00377 return match_index&0xFFFFFFFFFFFFFFC0ULL; 00378 else 00379 return (match_index&0xFFFFFFFFFFFFFFC0ULL) + pos + 1; 00380 // return (match_index&0xFFFFFFFFFFFFFFC0ULL) 00381 // + bit_magic::last_excess_position( *(m_bp.data()+(match_index>>6)), excess_difference, dummy) + 1; 00382 } else { 00383 pos = bit_magic::last_excess_position(~*(m_bp->data()+(match_index>>6)), -excess_difference, dummy); 00384 if (pos==64) 00385 return match_index&0xFFFFFFFFFFFFFFC0ULL; 00386 else 00387 return (match_index&0xFFFFFFFFFFFFFFC0ULL) + pos + 1; 00388 // return (match_index&0xFFFFFFFFFFFFFFC0ULL) 00389 // + bit_magic::last_excess_position( ~*(m_bp.data()+(match_index>>6)), -excess_difference, dummy) + 1; 00390 } 00391 } else { // there exists no pioneer => there exists no enclosing parentheses pair 00392 return m_size; 00393 } 00394 } 00395 } 00396 } 00397 00399 00404 size_type rr_enclose(size_type i, size_type j)const { 00405 assert(j > i and j < m_size); 00406 size_type mi = find_close(i); // matching parenthesis to i 00407 assert(mi > i and mi < j); 00408 assert(find_close(j) > j); 00409 size_type k = enclose(j); 00410 if (k == m_size or k < i) // there exists no opening parenthesis at position mi<k<j. 00411 return m_size; 00412 size_type kk; 00413 do { 00414 kk = k; 00415 k = enclose(k); 00416 } while (k != m_size and k > mi); 00417 return kk; 00418 } 00419 00420 size_type rr_enclose_naive(size_type i, size_type j)const { 00421 return rr_enclose(i, j); 00422 } 00423 00425 00430 size_type double_enclose(size_type i, size_type j)const { 00431 assert(j > i); 00432 assert((*m_bp)[i]==1 and (*m_bp)[j]==1); 00433 size_type k = rr_enclose(i, j); 00434 if (k == size()) 00435 return enclose(j); 00436 else 00437 return enclose(k); 00438 } 00439 00441 00443 size_type preceding_closing_parentheses(size_type i)const { 00444 assert(i < m_size); 00445 if (!i) return 0; 00446 size_type ones = m_rank(i); 00447 if (ones) { // ones > 0 00448 assert(m_select(ones) < i); 00449 return i - m_select(ones) - 1; 00450 } else { 00451 return i; 00452 } 00453 /* 00454 size_type im1 = i-1; 00455 const uint64_t *data = m_bp->data()+(im1>>6); 00456 uint8_t offset = (im1&0x3F)+1; 00457 uint64_t w = *data << (64-offset);//& bit_magic::Li1Mask[offset]; 00458 if( offset != 64 ){ 00459 w |= *(--data)>>(offset); 00460 } 00461 size_type result = 0; 00462 */ 00463 } 00464 /* 00465 size_type restricted_enclose2(size_type i, size_type j)const{ 00466 assert( j > i ); 00467 size_type mi = find_close(i); // matching parenthesis to i 00468 assert( find_close(i) > i and mi < j ); 00469 // 00470 // m_rank_pioneer_bitmap.rank(mi) 00471 00472 assert( find_close(j) > j ); 00473 size_type k = enclose(j); 00474 if( k == m_size or k < i )// there exists no opening parenthesis at position mi<k<j. 00475 return m_size; 00476 size_type kk; 00477 do{ 00478 kk = k; 00479 k = enclose(k); 00480 }while( k != m_size and k > mi ); 00481 return kk; 00482 } 00483 */ 00484 00488 size_type size() const { 00489 return m_size; 00490 } 00491 00493 00497 size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const { 00498 structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); 00499 size_type written_bytes = 0; 00500 written_bytes += m_rank.serialize(out, child, "bp_rank"); 00501 written_bytes += m_select.serialize(out, child, "bp_select"); 00502 00503 written_bytes += m_pioneer_bitmap.serialize(out, child, "pioneer_bitmap"); 00504 written_bytes += m_rank_pioneer_bitmap.serialize(out, child, "pioneer_bitmap_rank"); 00505 written_bytes += m_pioneer.serialize(out, child, "pioneer"); 00506 00507 written_bytes += m_enclose_pioneer_bitmap.serialize(out, child, "enclose"); 00508 written_bytes += m_rank_enclose_pioneer_bitmap.serialize(out, child, "enclose_rank"); 00509 written_bytes += m_enclose_pioneer.serialize(out, child, "enclose_pioneer"); 00510 00511 structure_tree::add_size(child, written_bytes); 00512 return written_bytes; 00513 } 00514 00516 00520 void load(std::istream& in, const bit_vector* bp) { 00521 m_bp = bp; 00522 if (m_bp == NULL) 00523 return; 00524 m_size = m_bp->size(); 00525 m_block_size = 64; 00526 m_blocks = (m_size+m_block_size-1)/m_block_size; 00527 00528 m_rank.load(in, m_bp); 00529 m_select.load(in, m_bp); 00530 00531 m_pioneer_bitmap.load(in); 00532 m_rank_pioneer_bitmap.load(in, &m_pioneer_bitmap); 00533 m_pioneer.load(in); 00534 00535 m_enclose_pioneer_bitmap.load(in); 00536 m_rank_enclose_pioneer_bitmap.load(in, &m_enclose_pioneer_bitmap); 00537 m_enclose_pioneer.load(in); 00538 } 00539 00540 std::string get_info()const { 00541 std::stringstream ss; 00542 if (m_bp == NULL) { 00543 ss<<"ERROR: bp_support_j is unitialized!"<<std::endl; 00544 return ss.str(); 00545 } 00546 ss<<"number of parentheses: "<<m_bp->size()<<std::endl; 00547 ss<<"number of pioneers: "<< m_pioneer.size()<<std::endl; 00548 std::ofstream out("/dev/null"); 00549 ss<<"size of parentheses sequence: "<< m_bp->serialize(out) << std::endl; 00550 ss<<"size of pioneer bitmap: "<< m_pioneer_bitmap.serialize(out) << std::endl; 00551 ss<<"size of pioneer data : "<< m_pioneer.serialize(out) << std::endl; 00552 ss<<"size of rank for pioneers: "<< m_rank_pioneer_bitmap.serialize(out) << std::endl; 00553 00554 ss<<"size of enclose pioneer bitmap: "<< m_enclose_pioneer_bitmap.serialize(out) << std::endl; 00555 ss<<"size of pioneer data : "<< m_pioneer.serialize(out) << std::endl; 00556 ss<<"size of rank for pioneers: "<< m_rank_enclose_pioneer_bitmap.serialize(out) << std::endl; 00557 00558 return ss.str(); 00559 } 00560 }; 00561 }// end namespace sdsl 00562 00563 00564 00565 00566 #endif