SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
 All Classes Namespaces Files Functions Variables Typedefs Friends
sdsl/include/sdsl/algorithms_for_compressed_suffix_trees.hpp
Go to the documentation of this file.
00001 /* sdsl - succinct data structures library
00002     Copyright (C) 2009 Simon Gog
00003 
00004     This program is free software: you can redistribute it and/or modify
00005     it under the terms of the GNU General Public License as published by
00006     the Free Software Foundation, either version 3 of the License, or
00007     (at your option) any later version.
00008 
00009     This program is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012     GNU General Public License for more details.
00013 
00014     You should have received a copy of the GNU General Public License
00015     along with this program.  If not, see http://www.gnu.org/licenses/ .
00016 */
00021 #ifndef INCLUDED_SDSL_ALGORITHMS_FOR_COMPRESSED_SUFFIX_TREES
00022 #define INCLUDED_SDSL_ALGORITHMS_FOR_COMPRESSED_SUFFIX_TREES
00023 
00024 #include "int_vector.hpp" // for bit_vector
00025 #include "sorted_stack_support.hpp" // for construct_supercartesian_tree_bp
00026 #include "sorted_multi_stack_support.hpp" // for first_p_index_construction
00027 #include "util.hpp"
00028 #include <stack> // for calculate_supercartesian_tree_bp
00029 
00030 
00031 namespace sdsl
00032 {
00033 
00034 namespace algorithm
00035 {
00036 
00038 
00047 template<class RandomAccessContainer>
00048 void construct_supercartesian_tree_bp(const RandomAccessContainer& vec, bit_vector& bp, const bool minimum=true)
00049 {
00050     typedef typename RandomAccessContainer::size_type size_type;
00051     bp.resize(2*vec.size());      // resize bit vector for balanaced parantheses to 2 n bits
00052     util::set_zero_bits(bp);
00053     std::stack<typename RandomAccessContainer::value_type> vec_stack;
00054 
00055     size_type k=0;
00056     for (size_type i=0; i < vec.size(); ++i) {
00057         typename RandomAccessContainer::value_type l = vec[i];
00058         if (minimum) {
00059             while (vec_stack.size() > 0 and l < vec_stack.top()) {
00060                 vec_stack.pop(); ++k; /*bp[k++] = 0; bp is already initialized to zero*/ // writing a closing parenthesis
00061             }
00062         } else {
00063             while (vec_stack.size() > 0 and l > vec_stack.top()) {
00064                 vec_stack.pop(); ++k; /*bp[k++] = 0; bp is already initialized to zero*/ // writing a closing parenthesis
00065             }
00066         }
00067         vec_stack.push(l);
00068         bp[k++] = 1; // writing an opening  parenthesis
00069     }
00070     while (vec_stack.size() > 0) {
00071         vec_stack.pop();
00072         bp[k++] = 0; // writing a closing parenthesis
00073     }
00074     assert(k == 2*vec.size());
00075 }
00076 
00078 
00087 // TODO: sorted_multi_stack_support einbauen, RandomAccessContainer durch int_vector_file_buffer ersetzen
00088 template<class RandomAccessContainer>
00089 void construct_supercartesian_tree_bp_succinct(const RandomAccessContainer& vec, bit_vector& bp, const bool minimum=true)
00090 {
00091     typedef typename RandomAccessContainer::size_type size_type;
00092     bp.resize(2*vec.size());      // resize bit vector for balanaced parantheses to 2 n bits
00093     util::set_zero_bits(bp);
00094     sorted_stack_support vec_stack(vec.size()); // <- das ist ein Problem fuer int_vector_file_buffer
00095 
00096     size_type k=0;
00097     if (minimum) {
00098         bp[k++] = 1;
00099         for (size_type i=1; i < vec.size(); ++i) {
00100             if (vec[i] < vec[i-1]) {
00101                 ++k;
00102                 while (vec_stack.size() > 0 and vec[i] < vec[vec_stack.top()]) {
00103                     vec_stack.pop(); ++k; // writing a closing parenthesis, bp is already initialized to zero
00104                 }
00105             } else {
00106                 vec_stack.push(i-1); // "lazy stack" trick: Beschleunigung: ca. 25%
00107             }
00108             bp[k++] = 1; // writing an opening  parenthesis
00109         }
00110         /*
00111         vec_stack.push(0);
00112         bp[k++] = 1;
00113         for(size_type i=1,j, start_run=1; i < vec.size(); ++i){
00114                 if( vec[i] < vec[i-1] ){
00115                         j = i;
00116                         while( --j >= start_run and vec[i] < vec[j]) ++k;
00117                         while(start_run <= j){  // auf den stack pushen
00118                                 vec_stack.push(start_run++);
00119                         }
00120                         while( vec_stack.size() > 0 and vec[i] < vec[vec_stack.top()] ){
00121                                 vec_stack.pop(); ++k;
00122                         }
00123                         start_run = i;
00124                 }
00125                 bp[k++] = 1;
00126         }
00127         */
00128     } else {
00129         // hier noch ohne "lazy stack" trick
00130         for (size_type i=0; i < vec.size(); ++i) {
00131             while (vec_stack.size() > 0 and vec[i] > vec[vec_stack.top()]) {
00132                 vec_stack.pop(); ++k; /*bp[k++] = 0; bp is already initialized to zero*/ // writing a closing parenthesis
00133             }
00134             vec_stack.push(i);
00135             bp[k++] = 1; // writing an opening  parenthesis
00136         }
00137     }
00138 #ifdef SDSL_DEBUG
00139 // not neccessary as bp is already initialized to zero
00140     while (!vec_stack.empty()) {
00141         vec_stack.pop();
00142         bp[k++] = 0; // writing a closing parenthesis
00143     }
00144     assert(k == 2*vec.size());
00145 #endif
00146 }
00147 
00149 
00158 template<uint8_t fixedIntWidth, class size_type_class>
00159 void construct_supercartesian_tree_bp_succinct(int_vector_file_buffer<fixedIntWidth, size_type_class>& lcp_buf, bit_vector& bp, const bool minimum=true)
00160 {
00161     typedef size_type_class size_type;
00162     lcp_buf.reset();
00163     size_type n = lcp_buf.int_vector_size;
00164     bp.resize(2*n);      // resize bit vector for balanaced parantheses to 2 n bits
00165     if (n == 0) // if n == 0 we are done
00166         return;
00167     util::set_zero_bits(bp);
00168     sorted_multi_stack_support vec_stack(n);
00169 
00170     size_type k=0;
00171     if (minimum) {
00172         bp[k++] = 1;
00173         size_type r = lcp_buf.load_next_block();
00174         size_type last = lcp_buf[0];
00175         for (size_type i=1, r_sum = 0, x; r_sum < n;) {
00176             for (; i < r_sum +r; ++i) {
00177                 x = lcp_buf[i-r_sum];
00178                 if (x < last) {
00179                     ++k; // writing a closing parenthesis for last
00180                     while (!vec_stack.empty() and x < vec_stack.top()) {
00181                         vec_stack.pop(); ++k; // writing a closing parenthesis, bp is already initialized to zeros
00182                     }
00183                 } else {
00184                     vec_stack.push(last); // "lazy stack" trick: Beschleunigung: ca 25 %
00185                 }
00186                 bp[k++] = 1; // writing an opening parenthesis
00187                 last = x;
00188             }
00189             r_sum += r; r = lcp_buf.load_next_block();
00190         }
00191     } else {
00192         // hier noch ohne "lazy stack" trick
00193         for (size_type i=0, r_sum = 0, r = lcp_buf.load_next_block(), x; r_sum < n;) {
00194             for (; i < r_sum +r; ++i) {
00195                 x = lcp_buf[i-r_sum];
00196                 while (!vec_stack.empty() and x > vec_stack.top()) {
00197                     vec_stack.pop(); ++k; // writing a closing parenthesis, bp is already initialized to zeros
00198                 }
00199                 vec_stack.push(x);
00200                 bp[k++] = 1; // writing an opening parenthesis
00201             }
00202             r_sum += r; r = lcp_buf.load_next_block();
00203         }
00204     }
00205 #ifdef SDSL_DEBUG
00206 // not neccessary as bp is already initialized to zero
00207     while (!vec_stack.empty()) {
00208         vec_stack.pop();
00209         bp[k++] = 0; // writing a closing parenthesis
00210     }
00211     assert(k == 2*vec.size());
00212 #endif
00213 }
00214 
00216 
00226 template<uint8_t fixedIntWidth, class size_type_class>
00227 size_type_class construct_supercartesian_tree_bp_succinct_and_first_child(int_vector_file_buffer<fixedIntWidth, size_type_class>& lcp_buf, bit_vector& bp, bit_vector& bp_fc, const bool minimum=true)
00228 {
00229     typedef size_type_class size_type;
00230     lcp_buf.reset();
00231     size_type n = lcp_buf.int_vector_size;
00232     bp.resize(2*n);      // resize bit vector for balanaced parantheses to 2 n bits
00233     bp_fc.resize(n);
00234     if (n == 0) // if n == 0 we are done
00235         return 0;
00236     size_type fc_cnt=0; // first child counter
00237     util::set_zero_bits(bp);
00238     util::set_zero_bits(bp_fc);
00239     sorted_multi_stack_support vec_stack(n);
00240 
00241     size_type k=0;
00242     size_type k_fc=0; // first child index
00243     if (minimum) {
00244         // hier noch ohne "lazy stack" trick
00245         for (size_type i=0, r_sum = 0, r = lcp_buf.load_next_block(), x; r_sum < n;) {
00246             for (; i < r_sum +r; ++i) {
00247                 x = lcp_buf[i-r_sum];
00248                 while (!vec_stack.empty() and x < vec_stack.top()) {
00249                     if (vec_stack.pop()) {
00250                         bp_fc[k_fc] = 1;
00251                         ++fc_cnt;
00252                     }
00253                     ++k; // writing a closing parenthesis, bp is already initialized to zeros
00254                     ++k_fc; // write a bit in first_child
00255                 }
00256                 vec_stack.push(x);
00257                 bp[k++] = 1; // writing an opening parenthesis
00258             }
00259             r_sum += r; r = lcp_buf.load_next_block();
00260         }
00261 
00262     } else {
00263         // hier noch ohne "lazy stack" trick
00264         for (size_type i=0, r_sum = 0, r = lcp_buf.load_next_block(), x; r_sum < n;) {
00265             for (; i < r_sum +r; ++i) {
00266                 x = lcp_buf[i-r_sum];
00267                 while (!vec_stack.empty() and x > vec_stack.top()) {
00268                     if (vec_stack.pop()) {
00269                         bp_fc[k_fc] = 1;
00270                         ++fc_cnt;
00271                     }
00272                     ++k; // writing a closing parenthesis, bp is already initialized to zeros
00273                     ++k_fc; // write a bit in first_child
00274                 }
00275                 vec_stack.push(x);
00276                 bp[k++] = 1; // writing an opening parenthesis
00277             }
00278             r_sum += r; r = lcp_buf.load_next_block();
00279         }
00280     }
00281     while (!vec_stack.empty()) {
00282         if (vec_stack.pop()) {
00283             bp_fc[k_fc] = 1;
00284             ++fc_cnt;
00285         }
00286         // writing a closing parenthesis in bp, not necessary as bp is initalized with zeros
00287         ++k;
00288         ++k_fc;
00289     }
00290 //      assert( k == 2*vec.size() );
00291     return fc_cnt;
00292 }
00293 
00294 template<uint8_t fixedIntWidth, class size_type_class>
00295 size_type_class construct_first_child_lcp(int_vector_file_buffer<fixedIntWidth, size_type_class>& lcp_buf, int_vector<fixedIntWidth, size_type_class>& fc_lcp, size_type_class first_child_size=0)
00296 {
00297     typedef size_type_class size_type;
00298     lcp_buf.reset();
00299     size_type n = lcp_buf.int_vector_size;
00300     if (n == 0) {       // if n == 0 we are done
00301         fc_lcp = int_vector<>(0);
00302         return 0;
00303     }
00304     if (first_child_size == 0) {
00305         util::assign(fc_lcp, int_vector<>(n, 0, bit_magic::l1BP(n)+1));
00306     }
00307 
00308     size_type fc_cnt=0; // first child counter
00309     sorted_multi_stack_support vec_stack(n);
00310     size_type y;
00311     for (size_type i=0, r_sum = 0, r = lcp_buf.load_next_block(), x; r_sum < n;) {
00312         for (; i < r_sum +r; ++i) {
00313             x = lcp_buf[i-r_sum];
00314             while (!vec_stack.empty() and x < vec_stack.top()) {
00315                 y = vec_stack.top();
00316                 if (vec_stack.pop()) {
00317                     fc_lcp[fc_cnt++] = y;
00318                 }
00319             }
00320             vec_stack.push(x);
00321         }
00322         r_sum += r; r = lcp_buf.load_next_block();
00323     }
00324 
00325     while (!vec_stack.empty()) {
00326         y = vec_stack.top();
00327         if (vec_stack.pop()) {
00328             fc_lcp[fc_cnt++] = y;
00329         }
00330         // writing a closing parenthesis in bp, not necessary as bp is initalized with zeros
00331     }
00332     if (fc_cnt < fc_lcp.size()) {
00333         fc_lcp.resize(fc_cnt);
00334     }
00335     return fc_cnt;
00336 }
00337 
00338 
00339 template<uint32_t SampleDens, uint8_t fixedIntWidth, class size_type_class>
00340 size_type_class construct_first_child_and_lf_lcp(int_vector_file_buffer<fixedIntWidth, size_type_class>& lcp_buf,
00341         int_vector_file_buffer<8, size_type_class>& bwt_buf,
00342         std::string small_lcp_file_name,
00343         std::string big_lcp_file_name,
00344         int_vector<>& big_lcp,
00345         size_type_class first_child_size=0)
00346 {
00347     typedef size_type_class size_type;
00348     const size_type M = 255;
00349     size_type_class buf_len = 1000000;
00350     lcp_buf.reset(buf_len);
00351     bwt_buf.reset(buf_len);
00352     size_type n = lcp_buf.int_vector_size;
00353 
00354     std::ofstream sml_lcp_out(small_lcp_file_name.c_str());
00355     size_type bit_size = 8*n;
00356     sml_lcp_out.write((char*) &bit_size, sizeof(bit_size));
00357 
00358     std::ofstream big_lcp_out(big_lcp_file_name.c_str());
00359 
00360     size_type fc_cnt = 0; // number of lcp values at the first child r
00361     size_type fc_cnt_big = 0; // number of lcp values at the first child which are big and not reducable
00362     size_type fc_cnt_big2 = 0;
00363     sorted_multi_stack_support vec_stack(n); // occupies 2n bits
00364     bit_vector is_big_and_not_reducable(n); // initialized with 0s
00365     bool is_one_big_and_not_reducable = false; // all positions have to be reducable
00366 
00367     size_type y, max_lcp=0;
00368     uint8_t last_bwti=0, val;
00369     for (size_type i=0, r_sum = 0, r = 0, x; r_sum < n;) {
00370         for (; i < r_sum + r; ++i) {
00371             x = lcp_buf[i-r_sum];
00372             is_one_big_and_not_reducable = false;
00373 
00374             while (!vec_stack.empty() and x < vec_stack.top()) {
00375                 y = vec_stack.top();
00376                 is_one_big_and_not_reducable |= is_big_and_not_reducable[vec_stack.size()-1];
00377                 if (vec_stack.pop()) { // if y was the last copy of y on the stack
00378                     if (y > M-2) {
00379                         if (is_one_big_and_not_reducable) {
00380                             val = M;
00381                             big_lcp_out.write((char*)&y, sizeof(y));
00382                             ++fc_cnt_big;
00383                             if (y > max_lcp) max_lcp = y;
00384                         } else {
00385                             val = M-1;
00386                             ++fc_cnt_big2;
00387                         }
00388                     } else {
00389                         val = y;
00390                     }
00391                     sml_lcp_out.write((const char*)&val, 1);
00392                     ++fc_cnt;
00393                     is_one_big_and_not_reducable = false;
00394                 }
00395             }
00396             if (x > M-2 and (0 == i or last_bwti != bwt_buf[i - r_sum] or x % SampleDens == 0)) {
00397                 is_big_and_not_reducable[vec_stack.size()] = 1;
00398             } else {
00399                 is_big_and_not_reducable[vec_stack.size()] = 0;
00400             }
00401             vec_stack.push(x);
00402             last_bwti = bwt_buf[i - r_sum];
00403         }
00404         r_sum += r; r = lcp_buf.load_next_block();
00405         bwt_buf.load_next_block();
00406     }
00407 
00408     while (!vec_stack.empty()) {
00409         y = vec_stack.top();
00410         if (vec_stack.pop()) {
00411             if (y > M-2) {
00412                 if (is_big_and_not_reducable[vec_stack.size()]) {
00413                     val = M;
00414                     big_lcp_out.write((char*)&y, sizeof(y));
00415                     ++fc_cnt_big;
00416                     if (y > max_lcp) max_lcp = y;
00417                 } else {
00418                     val = M-1;
00419                     ++fc_cnt_big2;
00420                 }
00421             } else {
00422                 val = y;
00423             }
00424             sml_lcp_out.write((const char*)&val, 1);
00425             ++fc_cnt;
00426         }
00427     }
00428 
00429     // write number of elements of sml_lcp into the out file stream
00430     sml_lcp_out.seekp(0);
00431     bit_size = 8*fc_cnt;
00432     sml_lcp_out.write((char*) &bit_size, sizeof(bit_size));
00433     sml_lcp_out.close();
00434 
00435     big_lcp_out.close();
00436     std::ifstream big_lcp_in(big_lcp_file_name.c_str());
00437     big_lcp.set_int_width(bit_magic::l1BP(max_lcp)+1);
00438     big_lcp.resize(fc_cnt_big);
00439 
00440     for (size_type i=0; i<fc_cnt_big; ++i) {
00441         big_lcp_in.read((char*)&y, sizeof(y));
00442         big_lcp[i] = y;
00443     }
00444     big_lcp_in.close();
00445     std::remove(big_lcp_file_name.c_str());
00446 
00447 //    std::cout<<"#n="<<n<<" fc_cnt="<<fc_cnt<<" fc_cnt_big="<<fc_cnt_big<<" fc_cnt_big2="<<fc_cnt_big2<<std::endl;
00448 
00449     return fc_cnt;
00450 }
00451 
00452 
00453 
00454 template<class RandomAccessContainer>
00455 void construct_supercartesian_tree_bp_succinct2(const RandomAccessContainer& vec, bit_vector& bp, const bool minimum=true)
00456 {
00457     typedef typename RandomAccessContainer::size_type size_type;
00458     bp.resize(2*vec.size());      // resize bit vector for balanaced parantheses to 2 n bits
00459     util::set_zero_bits(bp);
00460     sorted_stack_support vec_stack(vec.size()); // <- das ist ein Problem fuer int_vector_file_buffer
00461 
00462     size_type k=0;
00463 //      uint64_t wbuf=0;
00464     for (size_type i=0/*, cnt64=0*/; i < vec.size(); ++i) {
00465         while (vec_stack.size() > 0 and vec[i] < vec[vec_stack.top()]) {
00466             vec_stack.pop(); ++k; /*bp[k++] = 0; bp is already initialized to zero*/ // writing a closing parenthesis
00467         }
00468         vec_stack.push(i);
00469         bp[k++] = 1; // writing an opening  parenthesis
00470         while (i+1 < vec.size() and vec[i+1] >= vec[i]) {
00471             vec_stack.push(++i);
00472             bp[k++];
00473         }
00474     }
00475 #ifdef SDSL_DEBUG
00476 // not neccessary as bp is already initialized to zero
00477     while (vec_stack.size() > 0) {
00478         vec_stack.pop();
00479         bp[k++] = 0; // writing a closing parenthesis
00480     }
00481     assert(k == 2*vec.size());
00482 #endif
00483 }
00484 
00485 template<class RandomAccessContainer>
00486 typename RandomAccessContainer::size_type construct_first_p_index(const RandomAccessContainer& vec, bit_vector& bp, const bool minimum=true)
00487 {
00488     typedef typename RandomAccessContainer::size_type size_type;
00489     size_type nr_of_first_indices = 0;
00490     bp = bit_vector(vec.size(), 0);
00491 //      std::cerr<<"bp.size()="<<bp.size()<<std::endl;
00492     sorted_stack_support vec_stack(vec.size());
00493     size_type k=0;
00494     for (size_type i=0, t; i < vec.size(); ++i) {
00495         if (minimum) {
00496             while (vec_stack.size() > 0 and vec[i] < vec[vec_stack.top()]) {
00497                 t = vec[vec_stack.top()];
00498                 vec_stack.pop();
00499                 if (vec_stack.size() == 0 or t != vec[vec_stack.top()]) {
00500                     bp[k] = 1;
00501                     ++nr_of_first_indices;
00502                 }
00503                 ++k;
00504 
00505             }
00506         } else {
00507             while (vec_stack.size() > 0 and vec[i] > vec[vec_stack.top()]) {
00508                 t = vec[vec_stack.top()];
00509                 vec_stack.pop();
00510                 if (vec_stack.size() == 0 or t != vec[vec_stack.top()]) {
00511                     bp[k] = 1;
00512                     ++nr_of_first_indices;
00513                 }
00514                 ++k;
00515             }
00516         }
00517         vec_stack.push(i);
00518     }
00519     while (vec_stack.size() > 0) {
00520         size_type t = vec[vec_stack.top()];
00521         vec_stack.pop();
00522         if (vec_stack.size() == 0 or t != vec[vec_stack.top()]) {
00523             bp[k] = 1;
00524             ++nr_of_first_indices;
00525         }
00526         ++k;
00527     }
00528     assert(k == vec.size());
00529     return nr_of_first_indices;
00530 }
00531 
00532 //TODO implementation
00533 template<uint8_t fixedIntWidth, class size_type_class>
00534 bit_vector::size_type construct_first_p_index(int_vector_file_buffer<fixedIntWidth, size_type_class>& lcp_buf, bit_vector& bp, const bool minimum=true)
00535 {
00536     typedef bit_vector::size_type size_type;
00537     size_type nr_of_first_indices = 0;
00538     lcp_buf.reset();
00539     size_type n = lcp_buf.int_vector_size;
00540 
00541     bp = bit_vector(n, 0);
00542     sorted_multi_stack_support vec_stack(n);
00543     size_type k=0;
00544 
00545     if (minimum) {
00546         for (size_type i = 0, r_sum = 0, r = lcp_buf.load_next_block(),x; r_sum < n;) {
00547             for (; i<r_sum+r; ++i) {
00548                 x = lcp_buf[i-r_sum];
00549                 while (!vec_stack.empty() and x < vec_stack.top()) {
00550                     if (vec_stack.pop()) {
00551                         bp[k] = 1;
00552                         ++nr_of_first_indices;
00553                     }
00554                     ++k;
00555                 }
00556                 vec_stack.push(x);
00557             }
00558             r_sum += r; r = lcp_buf.load_next_block();
00559         }
00560     } else {
00561         for (size_type i = 0, r_sum = 0, r = lcp_buf.load_next_block(),x; r_sum < n;) {
00562             for (; i<r_sum+r; ++i) {
00563                 x = lcp_buf[i-r_sum];
00564                 while (!vec_stack.empty() and x > vec_stack.top()) {
00565                     if (vec_stack.pop()) {
00566                         bp[k] = 1;
00567                         ++nr_of_first_indices;
00568                     }
00569                     ++k;
00570                 }
00571                 vec_stack.push(x);
00572             }
00573             r_sum += r; r = lcp_buf.load_next_block();
00574         }
00575     }
00576 
00577     while (!vec_stack.empty()) {
00578         if (vec_stack.pop()) {
00579             bp[k] = 1;
00580             ++nr_of_first_indices;
00581         }
00582         ++k;
00583     }
00584 //      assert( k == vec.size() );
00585     return nr_of_first_indices;
00586 }
00587 
00588 }// end namespace algorithm
00589 
00590 }// end namespace sdsl
00591 
00592 #endif
00593