SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2009 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00021 #ifndef INCLUDED_SDSL_ALGORITHMS_FOR_COMPRESSED_SUFFIX_TREES 00022 #define INCLUDED_SDSL_ALGORITHMS_FOR_COMPRESSED_SUFFIX_TREES 00023 00024 #include "int_vector.hpp" // for bit_vector 00025 #include "sorted_stack_support.hpp" // for construct_supercartesian_tree_bp 00026 #include "sorted_multi_stack_support.hpp" // for first_p_index_construction 00027 #include "util.hpp" 00028 #include <stack> // for calculate_supercartesian_tree_bp 00029 00030 00031 namespace sdsl 00032 { 00033 00034 namespace algorithm 00035 { 00036 00038 00047 template<class RandomAccessContainer> 00048 void construct_supercartesian_tree_bp(const RandomAccessContainer& vec, bit_vector& bp, const bool minimum=true) 00049 { 00050 typedef typename RandomAccessContainer::size_type size_type; 00051 bp.resize(2*vec.size()); // resize bit vector for balanaced parantheses to 2 n bits 00052 util::set_zero_bits(bp); 00053 std::stack<typename RandomAccessContainer::value_type> vec_stack; 00054 00055 size_type k=0; 00056 for (size_type i=0; i < vec.size(); ++i) { 00057 typename RandomAccessContainer::value_type l = vec[i]; 00058 if (minimum) { 00059 while (vec_stack.size() > 0 and l < vec_stack.top()) { 00060 vec_stack.pop(); ++k; /*bp[k++] = 0; bp is already initialized to zero*/ // writing a closing parenthesis 00061 } 00062 } else { 00063 while (vec_stack.size() > 0 and l > vec_stack.top()) { 00064 vec_stack.pop(); ++k; /*bp[k++] = 0; bp is already initialized to zero*/ // writing a closing parenthesis 00065 } 00066 } 00067 vec_stack.push(l); 00068 bp[k++] = 1; // writing an opening parenthesis 00069 } 00070 while (vec_stack.size() > 0) { 00071 vec_stack.pop(); 00072 bp[k++] = 0; // writing a closing parenthesis 00073 } 00074 assert(k == 2*vec.size()); 00075 } 00076 00078 00087 // TODO: sorted_multi_stack_support einbauen, RandomAccessContainer durch int_vector_file_buffer ersetzen 00088 template<class RandomAccessContainer> 00089 void construct_supercartesian_tree_bp_succinct(const RandomAccessContainer& vec, bit_vector& bp, const bool minimum=true) 00090 { 00091 typedef typename RandomAccessContainer::size_type size_type; 00092 bp.resize(2*vec.size()); // resize bit vector for balanaced parantheses to 2 n bits 00093 util::set_zero_bits(bp); 00094 sorted_stack_support vec_stack(vec.size()); // <- das ist ein Problem fuer int_vector_file_buffer 00095 00096 size_type k=0; 00097 if (minimum) { 00098 bp[k++] = 1; 00099 for (size_type i=1; i < vec.size(); ++i) { 00100 if (vec[i] < vec[i-1]) { 00101 ++k; 00102 while (vec_stack.size() > 0 and vec[i] < vec[vec_stack.top()]) { 00103 vec_stack.pop(); ++k; // writing a closing parenthesis, bp is already initialized to zero 00104 } 00105 } else { 00106 vec_stack.push(i-1); // "lazy stack" trick: Beschleunigung: ca. 25% 00107 } 00108 bp[k++] = 1; // writing an opening parenthesis 00109 } 00110 /* 00111 vec_stack.push(0); 00112 bp[k++] = 1; 00113 for(size_type i=1,j, start_run=1; i < vec.size(); ++i){ 00114 if( vec[i] < vec[i-1] ){ 00115 j = i; 00116 while( --j >= start_run and vec[i] < vec[j]) ++k; 00117 while(start_run <= j){ // auf den stack pushen 00118 vec_stack.push(start_run++); 00119 } 00120 while( vec_stack.size() > 0 and vec[i] < vec[vec_stack.top()] ){ 00121 vec_stack.pop(); ++k; 00122 } 00123 start_run = i; 00124 } 00125 bp[k++] = 1; 00126 } 00127 */ 00128 } else { 00129 // hier noch ohne "lazy stack" trick 00130 for (size_type i=0; i < vec.size(); ++i) { 00131 while (vec_stack.size() > 0 and vec[i] > vec[vec_stack.top()]) { 00132 vec_stack.pop(); ++k; /*bp[k++] = 0; bp is already initialized to zero*/ // writing a closing parenthesis 00133 } 00134 vec_stack.push(i); 00135 bp[k++] = 1; // writing an opening parenthesis 00136 } 00137 } 00138 #ifdef SDSL_DEBUG 00139 // not neccessary as bp is already initialized to zero 00140 while (!vec_stack.empty()) { 00141 vec_stack.pop(); 00142 bp[k++] = 0; // writing a closing parenthesis 00143 } 00144 assert(k == 2*vec.size()); 00145 #endif 00146 } 00147 00149 00158 template<uint8_t fixedIntWidth, class size_type_class> 00159 void construct_supercartesian_tree_bp_succinct(int_vector_file_buffer<fixedIntWidth, size_type_class>& lcp_buf, bit_vector& bp, const bool minimum=true) 00160 { 00161 typedef size_type_class size_type; 00162 lcp_buf.reset(); 00163 size_type n = lcp_buf.int_vector_size; 00164 bp.resize(2*n); // resize bit vector for balanaced parantheses to 2 n bits 00165 if (n == 0) // if n == 0 we are done 00166 return; 00167 util::set_zero_bits(bp); 00168 sorted_multi_stack_support vec_stack(n); 00169 00170 size_type k=0; 00171 if (minimum) { 00172 bp[k++] = 1; 00173 size_type r = lcp_buf.load_next_block(); 00174 size_type last = lcp_buf[0]; 00175 for (size_type i=1, r_sum = 0, x; r_sum < n;) { 00176 for (; i < r_sum +r; ++i) { 00177 x = lcp_buf[i-r_sum]; 00178 if (x < last) { 00179 ++k; // writing a closing parenthesis for last 00180 while (!vec_stack.empty() and x < vec_stack.top()) { 00181 vec_stack.pop(); ++k; // writing a closing parenthesis, bp is already initialized to zeros 00182 } 00183 } else { 00184 vec_stack.push(last); // "lazy stack" trick: Beschleunigung: ca 25 % 00185 } 00186 bp[k++] = 1; // writing an opening parenthesis 00187 last = x; 00188 } 00189 r_sum += r; r = lcp_buf.load_next_block(); 00190 } 00191 } else { 00192 // hier noch ohne "lazy stack" trick 00193 for (size_type i=0, r_sum = 0, r = lcp_buf.load_next_block(), x; r_sum < n;) { 00194 for (; i < r_sum +r; ++i) { 00195 x = lcp_buf[i-r_sum]; 00196 while (!vec_stack.empty() and x > vec_stack.top()) { 00197 vec_stack.pop(); ++k; // writing a closing parenthesis, bp is already initialized to zeros 00198 } 00199 vec_stack.push(x); 00200 bp[k++] = 1; // writing an opening parenthesis 00201 } 00202 r_sum += r; r = lcp_buf.load_next_block(); 00203 } 00204 } 00205 #ifdef SDSL_DEBUG 00206 // not neccessary as bp is already initialized to zero 00207 while (!vec_stack.empty()) { 00208 vec_stack.pop(); 00209 bp[k++] = 0; // writing a closing parenthesis 00210 } 00211 assert(k == 2*vec.size()); 00212 #endif 00213 } 00214 00216 00226 template<uint8_t fixedIntWidth, class size_type_class> 00227 size_type_class construct_supercartesian_tree_bp_succinct_and_first_child(int_vector_file_buffer<fixedIntWidth, size_type_class>& lcp_buf, bit_vector& bp, bit_vector& bp_fc, const bool minimum=true) 00228 { 00229 typedef size_type_class size_type; 00230 lcp_buf.reset(); 00231 size_type n = lcp_buf.int_vector_size; 00232 bp.resize(2*n); // resize bit vector for balanaced parantheses to 2 n bits 00233 bp_fc.resize(n); 00234 if (n == 0) // if n == 0 we are done 00235 return 0; 00236 size_type fc_cnt=0; // first child counter 00237 util::set_zero_bits(bp); 00238 util::set_zero_bits(bp_fc); 00239 sorted_multi_stack_support vec_stack(n); 00240 00241 size_type k=0; 00242 size_type k_fc=0; // first child index 00243 if (minimum) { 00244 // hier noch ohne "lazy stack" trick 00245 for (size_type i=0, r_sum = 0, r = lcp_buf.load_next_block(), x; r_sum < n;) { 00246 for (; i < r_sum +r; ++i) { 00247 x = lcp_buf[i-r_sum]; 00248 while (!vec_stack.empty() and x < vec_stack.top()) { 00249 if (vec_stack.pop()) { 00250 bp_fc[k_fc] = 1; 00251 ++fc_cnt; 00252 } 00253 ++k; // writing a closing parenthesis, bp is already initialized to zeros 00254 ++k_fc; // write a bit in first_child 00255 } 00256 vec_stack.push(x); 00257 bp[k++] = 1; // writing an opening parenthesis 00258 } 00259 r_sum += r; r = lcp_buf.load_next_block(); 00260 } 00261 00262 } else { 00263 // hier noch ohne "lazy stack" trick 00264 for (size_type i=0, r_sum = 0, r = lcp_buf.load_next_block(), x; r_sum < n;) { 00265 for (; i < r_sum +r; ++i) { 00266 x = lcp_buf[i-r_sum]; 00267 while (!vec_stack.empty() and x > vec_stack.top()) { 00268 if (vec_stack.pop()) { 00269 bp_fc[k_fc] = 1; 00270 ++fc_cnt; 00271 } 00272 ++k; // writing a closing parenthesis, bp is already initialized to zeros 00273 ++k_fc; // write a bit in first_child 00274 } 00275 vec_stack.push(x); 00276 bp[k++] = 1; // writing an opening parenthesis 00277 } 00278 r_sum += r; r = lcp_buf.load_next_block(); 00279 } 00280 } 00281 while (!vec_stack.empty()) { 00282 if (vec_stack.pop()) { 00283 bp_fc[k_fc] = 1; 00284 ++fc_cnt; 00285 } 00286 // writing a closing parenthesis in bp, not necessary as bp is initalized with zeros 00287 ++k; 00288 ++k_fc; 00289 } 00290 // assert( k == 2*vec.size() ); 00291 return fc_cnt; 00292 } 00293 00294 template<uint8_t fixedIntWidth, class size_type_class> 00295 size_type_class construct_first_child_lcp(int_vector_file_buffer<fixedIntWidth, size_type_class>& lcp_buf, int_vector<fixedIntWidth, size_type_class>& fc_lcp, size_type_class first_child_size=0) 00296 { 00297 typedef size_type_class size_type; 00298 lcp_buf.reset(); 00299 size_type n = lcp_buf.int_vector_size; 00300 if (n == 0) { // if n == 0 we are done 00301 fc_lcp = int_vector<>(0); 00302 return 0; 00303 } 00304 if (first_child_size == 0) { 00305 util::assign(fc_lcp, int_vector<>(n, 0, bit_magic::l1BP(n)+1)); 00306 } 00307 00308 size_type fc_cnt=0; // first child counter 00309 sorted_multi_stack_support vec_stack(n); 00310 size_type y; 00311 for (size_type i=0, r_sum = 0, r = lcp_buf.load_next_block(), x; r_sum < n;) { 00312 for (; i < r_sum +r; ++i) { 00313 x = lcp_buf[i-r_sum]; 00314 while (!vec_stack.empty() and x < vec_stack.top()) { 00315 y = vec_stack.top(); 00316 if (vec_stack.pop()) { 00317 fc_lcp[fc_cnt++] = y; 00318 } 00319 } 00320 vec_stack.push(x); 00321 } 00322 r_sum += r; r = lcp_buf.load_next_block(); 00323 } 00324 00325 while (!vec_stack.empty()) { 00326 y = vec_stack.top(); 00327 if (vec_stack.pop()) { 00328 fc_lcp[fc_cnt++] = y; 00329 } 00330 // writing a closing parenthesis in bp, not necessary as bp is initalized with zeros 00331 } 00332 if (fc_cnt < fc_lcp.size()) { 00333 fc_lcp.resize(fc_cnt); 00334 } 00335 return fc_cnt; 00336 } 00337 00338 00339 template<uint32_t SampleDens, uint8_t fixedIntWidth, class size_type_class> 00340 size_type_class construct_first_child_and_lf_lcp(int_vector_file_buffer<fixedIntWidth, size_type_class>& lcp_buf, 00341 int_vector_file_buffer<8, size_type_class>& bwt_buf, 00342 std::string small_lcp_file_name, 00343 std::string big_lcp_file_name, 00344 int_vector<>& big_lcp, 00345 size_type_class first_child_size=0) 00346 { 00347 typedef size_type_class size_type; 00348 const size_type M = 255; 00349 size_type_class buf_len = 1000000; 00350 lcp_buf.reset(buf_len); 00351 bwt_buf.reset(buf_len); 00352 size_type n = lcp_buf.int_vector_size; 00353 00354 std::ofstream sml_lcp_out(small_lcp_file_name.c_str()); 00355 size_type bit_size = 8*n; 00356 sml_lcp_out.write((char*) &bit_size, sizeof(bit_size)); 00357 00358 std::ofstream big_lcp_out(big_lcp_file_name.c_str()); 00359 00360 size_type fc_cnt = 0; // number of lcp values at the first child r 00361 size_type fc_cnt_big = 0; // number of lcp values at the first child which are big and not reducable 00362 size_type fc_cnt_big2 = 0; 00363 sorted_multi_stack_support vec_stack(n); // occupies 2n bits 00364 bit_vector is_big_and_not_reducable(n); // initialized with 0s 00365 bool is_one_big_and_not_reducable = false; // all positions have to be reducable 00366 00367 size_type y, max_lcp=0; 00368 uint8_t last_bwti=0, val; 00369 for (size_type i=0, r_sum = 0, r = 0, x; r_sum < n;) { 00370 for (; i < r_sum + r; ++i) { 00371 x = lcp_buf[i-r_sum]; 00372 is_one_big_and_not_reducable = false; 00373 00374 while (!vec_stack.empty() and x < vec_stack.top()) { 00375 y = vec_stack.top(); 00376 is_one_big_and_not_reducable |= is_big_and_not_reducable[vec_stack.size()-1]; 00377 if (vec_stack.pop()) { // if y was the last copy of y on the stack 00378 if (y > M-2) { 00379 if (is_one_big_and_not_reducable) { 00380 val = M; 00381 big_lcp_out.write((char*)&y, sizeof(y)); 00382 ++fc_cnt_big; 00383 if (y > max_lcp) max_lcp = y; 00384 } else { 00385 val = M-1; 00386 ++fc_cnt_big2; 00387 } 00388 } else { 00389 val = y; 00390 } 00391 sml_lcp_out.write((const char*)&val, 1); 00392 ++fc_cnt; 00393 is_one_big_and_not_reducable = false; 00394 } 00395 } 00396 if (x > M-2 and (0 == i or last_bwti != bwt_buf[i - r_sum] or x % SampleDens == 0)) { 00397 is_big_and_not_reducable[vec_stack.size()] = 1; 00398 } else { 00399 is_big_and_not_reducable[vec_stack.size()] = 0; 00400 } 00401 vec_stack.push(x); 00402 last_bwti = bwt_buf[i - r_sum]; 00403 } 00404 r_sum += r; r = lcp_buf.load_next_block(); 00405 bwt_buf.load_next_block(); 00406 } 00407 00408 while (!vec_stack.empty()) { 00409 y = vec_stack.top(); 00410 if (vec_stack.pop()) { 00411 if (y > M-2) { 00412 if (is_big_and_not_reducable[vec_stack.size()]) { 00413 val = M; 00414 big_lcp_out.write((char*)&y, sizeof(y)); 00415 ++fc_cnt_big; 00416 if (y > max_lcp) max_lcp = y; 00417 } else { 00418 val = M-1; 00419 ++fc_cnt_big2; 00420 } 00421 } else { 00422 val = y; 00423 } 00424 sml_lcp_out.write((const char*)&val, 1); 00425 ++fc_cnt; 00426 } 00427 } 00428 00429 // write number of elements of sml_lcp into the out file stream 00430 sml_lcp_out.seekp(0); 00431 bit_size = 8*fc_cnt; 00432 sml_lcp_out.write((char*) &bit_size, sizeof(bit_size)); 00433 sml_lcp_out.close(); 00434 00435 big_lcp_out.close(); 00436 std::ifstream big_lcp_in(big_lcp_file_name.c_str()); 00437 big_lcp.set_int_width(bit_magic::l1BP(max_lcp)+1); 00438 big_lcp.resize(fc_cnt_big); 00439 00440 for (size_type i=0; i<fc_cnt_big; ++i) { 00441 big_lcp_in.read((char*)&y, sizeof(y)); 00442 big_lcp[i] = y; 00443 } 00444 big_lcp_in.close(); 00445 std::remove(big_lcp_file_name.c_str()); 00446 00447 // std::cout<<"#n="<<n<<" fc_cnt="<<fc_cnt<<" fc_cnt_big="<<fc_cnt_big<<" fc_cnt_big2="<<fc_cnt_big2<<std::endl; 00448 00449 return fc_cnt; 00450 } 00451 00452 00453 00454 template<class RandomAccessContainer> 00455 void construct_supercartesian_tree_bp_succinct2(const RandomAccessContainer& vec, bit_vector& bp, const bool minimum=true) 00456 { 00457 typedef typename RandomAccessContainer::size_type size_type; 00458 bp.resize(2*vec.size()); // resize bit vector for balanaced parantheses to 2 n bits 00459 util::set_zero_bits(bp); 00460 sorted_stack_support vec_stack(vec.size()); // <- das ist ein Problem fuer int_vector_file_buffer 00461 00462 size_type k=0; 00463 // uint64_t wbuf=0; 00464 for (size_type i=0/*, cnt64=0*/; i < vec.size(); ++i) { 00465 while (vec_stack.size() > 0 and vec[i] < vec[vec_stack.top()]) { 00466 vec_stack.pop(); ++k; /*bp[k++] = 0; bp is already initialized to zero*/ // writing a closing parenthesis 00467 } 00468 vec_stack.push(i); 00469 bp[k++] = 1; // writing an opening parenthesis 00470 while (i+1 < vec.size() and vec[i+1] >= vec[i]) { 00471 vec_stack.push(++i); 00472 bp[k++]; 00473 } 00474 } 00475 #ifdef SDSL_DEBUG 00476 // not neccessary as bp is already initialized to zero 00477 while (vec_stack.size() > 0) { 00478 vec_stack.pop(); 00479 bp[k++] = 0; // writing a closing parenthesis 00480 } 00481 assert(k == 2*vec.size()); 00482 #endif 00483 } 00484 00485 template<class RandomAccessContainer> 00486 typename RandomAccessContainer::size_type construct_first_p_index(const RandomAccessContainer& vec, bit_vector& bp, const bool minimum=true) 00487 { 00488 typedef typename RandomAccessContainer::size_type size_type; 00489 size_type nr_of_first_indices = 0; 00490 bp = bit_vector(vec.size(), 0); 00491 // std::cerr<<"bp.size()="<<bp.size()<<std::endl; 00492 sorted_stack_support vec_stack(vec.size()); 00493 size_type k=0; 00494 for (size_type i=0, t; i < vec.size(); ++i) { 00495 if (minimum) { 00496 while (vec_stack.size() > 0 and vec[i] < vec[vec_stack.top()]) { 00497 t = vec[vec_stack.top()]; 00498 vec_stack.pop(); 00499 if (vec_stack.size() == 0 or t != vec[vec_stack.top()]) { 00500 bp[k] = 1; 00501 ++nr_of_first_indices; 00502 } 00503 ++k; 00504 00505 } 00506 } else { 00507 while (vec_stack.size() > 0 and vec[i] > vec[vec_stack.top()]) { 00508 t = vec[vec_stack.top()]; 00509 vec_stack.pop(); 00510 if (vec_stack.size() == 0 or t != vec[vec_stack.top()]) { 00511 bp[k] = 1; 00512 ++nr_of_first_indices; 00513 } 00514 ++k; 00515 } 00516 } 00517 vec_stack.push(i); 00518 } 00519 while (vec_stack.size() > 0) { 00520 size_type t = vec[vec_stack.top()]; 00521 vec_stack.pop(); 00522 if (vec_stack.size() == 0 or t != vec[vec_stack.top()]) { 00523 bp[k] = 1; 00524 ++nr_of_first_indices; 00525 } 00526 ++k; 00527 } 00528 assert(k == vec.size()); 00529 return nr_of_first_indices; 00530 } 00531 00532 //TODO implementation 00533 template<uint8_t fixedIntWidth, class size_type_class> 00534 bit_vector::size_type construct_first_p_index(int_vector_file_buffer<fixedIntWidth, size_type_class>& lcp_buf, bit_vector& bp, const bool minimum=true) 00535 { 00536 typedef bit_vector::size_type size_type; 00537 size_type nr_of_first_indices = 0; 00538 lcp_buf.reset(); 00539 size_type n = lcp_buf.int_vector_size; 00540 00541 bp = bit_vector(n, 0); 00542 sorted_multi_stack_support vec_stack(n); 00543 size_type k=0; 00544 00545 if (minimum) { 00546 for (size_type i = 0, r_sum = 0, r = lcp_buf.load_next_block(),x; r_sum < n;) { 00547 for (; i<r_sum+r; ++i) { 00548 x = lcp_buf[i-r_sum]; 00549 while (!vec_stack.empty() and x < vec_stack.top()) { 00550 if (vec_stack.pop()) { 00551 bp[k] = 1; 00552 ++nr_of_first_indices; 00553 } 00554 ++k; 00555 } 00556 vec_stack.push(x); 00557 } 00558 r_sum += r; r = lcp_buf.load_next_block(); 00559 } 00560 } else { 00561 for (size_type i = 0, r_sum = 0, r = lcp_buf.load_next_block(),x; r_sum < n;) { 00562 for (; i<r_sum+r; ++i) { 00563 x = lcp_buf[i-r_sum]; 00564 while (!vec_stack.empty() and x > vec_stack.top()) { 00565 if (vec_stack.pop()) { 00566 bp[k] = 1; 00567 ++nr_of_first_indices; 00568 } 00569 ++k; 00570 } 00571 vec_stack.push(x); 00572 } 00573 r_sum += r; r = lcp_buf.load_next_block(); 00574 } 00575 } 00576 00577 while (!vec_stack.empty()) { 00578 if (vec_stack.pop()) { 00579 bp[k] = 1; 00580 ++nr_of_first_indices; 00581 } 00582 ++k; 00583 } 00584 // assert( k == vec.size() ); 00585 return nr_of_first_indices; 00586 } 00587 00588 }// end namespace algorithm 00589 00590 }// end namespace sdsl 00591 00592 #endif 00593