SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2010 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00021 #ifndef INCLUDED_SDSL_CST_SCT2 00022 #define INCLUDED_SDSL_CST_SCT2 00023 00024 #include "int_vector.hpp" 00025 #include "algorithms.hpp" 00026 #include "iterators.hpp" 00027 #include "lcp_support_sada.hpp" 00028 #include "bp_support.hpp" 00029 #include "csa_wt.hpp" // for std initialization of cst_sct2 00030 #include "csa_uncompressed.hpp" 00031 #include "cst_iterators.hpp" 00032 #include "cst_sct.hpp" // for lcp_interval 00033 #include "rank_support.hpp" 00034 #include "testutils.hpp" 00035 #include "util.hpp" 00036 #include <iostream> 00037 #include <algorithm> 00038 #include <cassert> 00039 #include <cstring> // for strlen 00040 #include <iomanip> 00041 #include <iterator> 00042 #include <stack> // for the calculation of the balanced parantheses sequence 00043 #include <ostream> 00044 00045 namespace sdsl 00046 { 00047 00049 00068 template<class Csa = csa_wt<>, class Lcp = lcp_support_sada<Csa>, class Bp_support = bp_support_sada<>, class Rank_support = rank_support_v5<> > 00069 class cst_sct2 00070 { 00071 public: 00072 typedef typename Csa::value_type value_type; // STL Container requirement/TODO: ist das nicht gleich node type??? 00073 typedef cst_dfs_const_forward_iterator<cst_sct2> const_iterator;// STL Container requirement 00074 typedef const_iterator iterator; // STL Container requirement 00075 typedef cst_bottom_up_const_forward_iterator<cst_sct2> const_bottom_up_iterator; 00076 typedef const_bottom_up_iterator bottom_up_iterator; 00077 typedef const value_type const_reference; 00078 typedef const_reference reference; 00079 typedef const_reference* pointer; 00080 typedef const pointer const_pointer; 00081 typedef typename Csa::size_type size_type; // STL Container requirement 00082 typedef size_type cst_size_type; 00083 typedef ptrdiff_t difference_type; // STL Container requirement 00084 typedef Csa csa_type; 00085 typedef Lcp lcp_type; 00086 typedef Bp_support bp_support_type; 00087 typedef typename Csa::pattern_type pattern_type; 00088 typedef typename Csa::char_type char_type; 00089 typedef lcp_interval<size_type> node_type; 00090 00091 typedef cst_tag index_category; 00092 private: 00093 Csa m_csa; 00094 Lcp m_lcp; 00095 bit_vector m_bp; 00096 Bp_support m_bp_support; 00097 bit_vector m_first_child; 00098 Rank_support m_first_child_rank; 00099 size_type m_nodes; 00100 00101 void copy(const cst_sct2& cst) { 00102 m_csa = cst.m_csa; 00103 // lcp_trait<Csa, Lcp>::copy_lcp(m_lcp, cst.m_lcp, m_csa); 00104 copy_lcp(m_lcp, cst.m_lcp, *this); 00105 m_bp = cst.m_bp; 00106 m_bp_support = cst.m_bp_support; 00107 m_bp_support.set_vector(&m_bp); 00108 m_first_child = cst.m_first_child; 00109 m_first_child_rank = cst.m_first_child_rank; 00110 m_first_child_rank.set_vector(&m_first_child); 00111 m_nodes = cst.m_nodes; 00112 } 00113 00114 // Get the first l index of a l-[i,j] interval. 00115 /* I.e. given an interval [i,j], the function returns the position of the smallest entry lcp[k] with \f$ i<k\leq j \f$ 00116 * Time complexity: O(1) + O(lcp access) 00117 */ // TODO: if interval is last child interval -> we don't need the m_lcp comparisson 00118 size_type get_first_l_index(size_type i, size_type j, size_type& kpos, size_type& find_close_k)const { 00119 if (j+1 != m_csa.size() and m_lcp[i] <= m_lcp[j+1]) { 00120 size_type jp1pos = m_bp_support.select(j+2); 00121 assert(m_bp[jp1pos] == 1); // opening parenthesis 00122 assert(jp1pos > 0 and m_bp[jp1pos-1] == 0); // closing parenthesis 00123 find_close_k = jp1pos-1; 00124 kpos = m_bp_support.find_open(find_close_k); 00125 return m_bp_support.rank(kpos)-1; 00126 } else { 00127 size_type ipos = m_bp_support.select(i+1); 00128 assert(m_bp[ipos] == 1); // opening parenthesis 00129 ipos = m_bp_support.find_close(ipos); 00130 assert(m_bp[ipos] == 0 and m_bp[ipos-1] == 0); // closing parenthesis 00131 find_close_k = ipos-1; 00132 kpos = m_bp_support.find_open(find_close_k); 00133 return m_bp_support.rank(kpos)-1; 00134 } 00135 } 00136 00137 // Get the postion of the first l index of a l-[i,j] interval in the balanced parentheses sequence. 00138 /* \par Time complexity 00139 * \f$ \Order{\lcpaccess} \f$ 00140 */ 00141 size_type get_pos_of_closing_parenthesis_of_the_first_l_index(size_type i, size_type j)const { 00142 if (j+1 != m_csa.size() and m_lcp[i] <= m_lcp[j+1]) { 00143 size_type jp1pos = m_bp_support.select(j+2); 00144 assert(m_bp[jp1pos] == 1); // opening parenthesis 00145 assert(jp1pos > 0 and m_bp[jp1pos-1] == 0); // closing parenthesis 00146 return jp1pos-1; 00147 } else { 00148 size_type ipos = m_bp_support.select(i+1); 00149 assert(m_bp[ipos] == 1); // opening parenthesis 00150 ipos = m_bp_support.find_close(ipos); 00151 assert(m_bp[ipos] == 0 and m_bp[ipos-1] == 0); // closing parenthesis 00152 return ipos-1; 00153 } 00154 } 00155 00156 // Get the next smaller value. 00157 /* \par Time complexity 00158 * \f$ \Order{1} \f$ 00159 */ 00160 size_type nsv(size_type i)const { 00161 size_type pos = m_bp_support.select(i+1); 00162 size_type rank = m_bp_support.find_close(pos); 00163 size_type result = m_bp_support.rank(rank); 00164 return result; 00165 } 00166 00167 // Get the previous smaller value. 00168 /* \pre lcp[i] > 0! 00169 * \par Time complexity 00170 * \f$ \Order{\sigma \cdot \lcpaccess} \f$ 00171 * \sa psv 00172 */ 00173 size_type psv_linear(size_type i, size_type lcpi)const { 00174 assert(lcpi>0); 00175 size_type pos = m_bp_support.select(i+1); 00176 size_type ec = m_bp_support.enclose(pos); 00177 size_type j = m_bp_support.rank(ec); 00178 while (lcpi == m_lcp[j-1]) { 00179 ec = m_bp_support.enclose(ec); 00180 j = m_bp_support.rank(ec); 00181 } 00182 return j-1; 00183 } 00184 00185 // Get the previous smaller value. 00186 /* \pre lcp[i] > 0! 00187 * \par Time complexity 00188 * \f$ \Order{\log\sigma \cdot \lcpaccess} \f$ 00189 */ 00190 size_type psv2(size_type i, size_type lcpi)const { 00191 assert(lcpi!=0);// if(m_lcp[i]==0) return 0; 00192 size_type begin = m_bp_support.find_close(m_bp_support.select(i+1)); // calculate NSV 00193 size_type end = m_bp_support.rank(begin)+1; // " " // exclusive 00194 if (end > m_csa.size()) { 00195 assert(end == m_csa.size()+1); 00196 end = m_bp.size(); 00197 } else { 00198 end = m_bp_support.select(end); 00199 } 00200 if (++begin < end) { 00201 size_type result = m_bp_support.rank(m_bp_support.find_open(end-1))-1; 00202 if (m_lcp[result] < lcpi) { // TODO: explain this condition 00203 // binary search 00204 --end; 00205 while (begin!=end) { 00206 size_type mid, temp; 00207 mid = (begin+end)>>1; // begin <= mid < end 00208 if (m_lcp[temp = m_bp_support.rank(m_bp_support.find_open(mid))-1] >= lcpi) { 00209 begin = mid+1; 00210 } else { 00211 result = temp; 00212 end = mid; 00213 } 00214 } 00215 return result; 00216 } 00217 } 00218 00219 return m_bp_support.rank(m_bp_support.enclose(end))-1; 00220 } 00221 00222 // Get the previous smaller value. 00223 /* \pre lcp[i] > 0! 00224 * \par Time complexity 00225 * \f$ \Order{\frac{\sigma}{w}} \f$, where w=64 is the word size 00226 */ 00227 size_type psv(size_type i, size_type lcpi)const { 00228 assert(lcpi!=0);// if(m_lcp[i]==0) return 0; 00229 size_type ipos = m_bp_support.select(i+1); 00230 size_type begin = m_bp_support.find_close(ipos); // calculate closing parenthesis 00231 size_type r0 = begin - m_bp_support.rank(begin); // index of clothing parenthesis in first_child bp 00232 size_type next_first_child = 0; 00233 const uint64_t* p = m_first_child.data() + (r0>>6); 00234 uint64_t w = (*p) >> (r0&0x3F); 00235 if (w) { // if w!=0 00236 next_first_child = begin + bit_magic::r1BP(w); 00237 if (begin == next_first_child and m_bp[next_first_child+1]) 00238 return m_bp_support.rank(m_bp_support.enclose(ipos))-1; 00239 } else { 00240 begin += 64-(r0&0x3F); 00241 ++p; 00242 while (!(w=*p)) { // while w==0 00243 ++p; 00244 begin += 64; 00245 } 00246 next_first_child = begin + bit_magic::r1BP(w); 00247 } 00248 assert(next_first_child < m_bp.size()); 00249 if (!m_bp[next_first_child+1]) { // if next parenthesis is a closing one 00250 return m_bp_support.rank(m_bp_support.find_open(next_first_child+1))-1; 00251 } else { 00252 return m_bp_support.rank(m_bp_support.enclose(m_bp_support.find_open(next_first_child)))-1; 00253 } 00254 } 00255 00256 // Range minimum query based on the rr_enclose method. 00257 /* \par Time complexity 00258 * \f$ \Order{\rrenclose} \f$ 00259 */ 00260 size_type rmq(size_type l, size_type r)const { 00261 size_type i = m_bp_support.select(l+1); 00262 size_type j = m_bp_support.select(r+1); 00263 size_type fc_i = m_bp_support.find_close(i); 00264 if (j < fc_i) { // i < j < find_close(j) < find_close(i) 00265 return l; 00266 } else { // i < find_close(i) < j < find_close(j) 00267 size_type ec = m_bp_support.rr_enclose(i,j); 00268 if (ec == m_bp_support.size()) {// no restricted enclosing pair found 00269 return r; 00270 } else { // found range restriced enclosing pair 00271 return m_bp_support.rank(ec)-1; // subtract 1, as the index is 0 based 00272 } 00273 } 00274 } 00275 00276 public: 00277 const Csa& csa; 00278 const Lcp& lcp; 00279 const bit_vector& bp; 00280 const Bp_support& bp_support; 00281 00282 const bit_vector& first_child_bv; 00283 00285 /* @{ */ 00286 00288 cst_sct2(): csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support), first_child_bv(m_first_child) {} 00289 00290 // Constructor for the cst_sct2 taking a string for that the compressed suffix tree should be calculated. 00291 /* 00292 * \param str Text for which the \f$ \CST \f$ should be constructed. The text should be terminated by a zero byte. 00293 * \pre The text has to be terminated by a zero byte. 00294 */ 00295 // cst_sct2(const unsigned char *str); 00296 00297 template<uint8_t int_width, class size_type_class, uint8_t int_width_1, class size_type_class_1, uint8_t int_width_2, class size_type_class_2> 00298 cst_sct2(const std::string& cst_file_name, 00299 int_vector_file_buffer<int_width, size_type_class>& lcp_buf, 00300 int_vector_file_buffer<int_width_1, size_type_class_1>& sa_buf, 00301 int_vector_file_buffer<int_width_2, size_type_class_2>& isa_buf, 00302 std::string dir="./", bool build_only_bps=false); 00303 00304 cst_sct2(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps=false); 00305 00307 00312 cst_sct2(const cst_sct2& cst):csa(m_csa),lcp(m_lcp),bp(m_bp),bp_support(m_bp_support),first_child_bv(m_first_child) { 00313 copy(cst); 00314 } 00315 00316 /* @} */ 00317 00318 void construct(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps=false); 00319 00321 ~cst_sct2() {} 00322 00324 00327 size_type size()const { 00328 return m_csa.size(); 00329 } 00330 00332 00335 static size_type max_size() { 00336 return Csa::max_size(); 00337 } 00338 00340 00343 bool empty()const { 00344 return m_csa.empty(); 00345 } 00346 00348 00356 void swap(cst_sct2<Csa, Lcp>& cst); 00357 00359 00362 const_iterator begin()const { 00363 if (0 == m_bp.size()) // special case: tree is uninitialized 00364 return end(); 00365 else if (2 == m_bp.size()) { // special case: the root is a leaf 00366 return const_iterator(this, root(), true, true); 00367 } 00368 return const_iterator(this, root(), false, true); 00369 }; 00370 00372 00375 const_iterator end()const { 00376 return const_iterator(this, root(), true, false); 00377 } 00378 00380 const_bottom_up_iterator begin_bottom_up()const { 00381 if (0 == m_bp.size()) // special case: tree is uninitialized 00382 return end_bottom_up(); 00383 return const_bottom_up_iterator(this, leftmost_leaf_in_the_subtree(root())); 00384 } 00385 00387 const_bottom_up_iterator end_bottom_up()const { 00388 return const_bottom_up_iterator(this, root(), false); 00389 } 00390 00392 00395 cst_sct2& operator=(const cst_sct2& cst); 00396 00398 00403 bool operator==(const cst_sct2& cst)const; 00404 00406 00411 bool operator!=(const cst_sct2& cst)const; 00412 00414 00417 size_type serialize(std::ostream& out) const; 00418 00420 00422 void load(std::istream& in); 00423 00425 /* @{ */ 00426 00428 00432 node_type root() const { 00433 return node_type(0, 0, m_csa.size()-1); 00434 } 00435 00437 00443 bool is_leaf(const node_type& v)const { 00444 return v.right==v.left; 00445 } 00446 00448 00455 node_type ith_leaf(size_type i)const { 00456 assert(i > 0 and i <= m_csa.size()); 00457 return node_type(m_csa.size()-m_csa[i-1], i-1, i-1); 00458 } 00459 00461 00468 size_type leaves_in_the_subtree(const node_type& v)const { 00469 return v.right-v.left+1; 00470 } 00471 00473 00478 node_type leftmost_leaf_in_the_subtree(const node_type& v)const { 00479 return node_type(m_csa.size()-m_csa[v.left], v.left, v.left); 00480 } 00481 00483 00488 node_type rightmost_leaf_in_the_subtree(const node_type& v)const { 00489 return node_type(m_csa.size()-m_csa[v.right], v.right, v.right); 00490 } 00491 00493 00500 size_type lb(const node_type& v)const { 00501 return v.left; 00502 } 00503 00505 00512 size_type rb(const node_type& v)const { 00513 return v.right; 00514 } 00515 00517 00522 node_type parent(const node_type& v) const { 00523 #ifndef NDEBUG 00524 // std::cout<<"parent interval of "<<v.l<<"-["<<v.left<<" "<<v.right<<"]"<<std::endl; 00525 #endif 00526 if (v.l <= 1) {// if v.l < 1 => root() 00527 return root(); // return root node 00528 } else { // l > 1 00529 if (v.right+1 == m_csa.size()) { 00530 // search for the previous smaller value 00531 size_type lcpi = m_lcp[v.left]; 00532 if (lcpi==0) 00533 return root(); 00534 assert(lcpi!=0); 00535 return node_type(lcpi, psv(v.left, lcpi), v.right); 00536 } else if (v.left == 0) { 00537 // search for the next smaller value 00538 return node_type(m_lcp[v.right+1],0, nsv(v.right+1)-1); 00539 } else { 00540 // TODO: speed up, only two cases? 00541 size_type lcpl = m_lcp[v.left], lcpr = m_lcp[v.right+1]; 00542 if (lcpl < lcpr) { 00543 return node_type(lcpr, v.left, nsv(v.right+1)-1); 00544 } else if (lcpl > lcpr) { 00545 return node_type(lcpl, psv(v.left, lcpl), v.right); 00546 } else { 00547 if (lcpl==0) 00548 return root(); 00549 return node_type(lcpl, psv(v.left, lcpl), nsv(v.right+1)-1); 00550 } 00551 } 00552 } 00553 } 00554 00555 00557 00566 node_type ith_child(const node_type& v, size_type i)const { 00567 if (is_leaf(v)) // if v is a leave, v has no child 00568 return root(); 00569 size_type k = 0, kpos = 0, k_find_close = 0; 00570 // v is not a leave: v has at least two children 00571 k = get_first_l_index(v.left, v.right, kpos, k_find_close);// get first l-index k and the position of k 00572 assert(m_lcp[k]==v.l); 00573 size_type left = v.left, right = k-1; 00574 size_type j=1; 00575 while (j < i) { 00576 // get next l-index m and the position of m 00577 size_type m = m_bp_support.rank(m_bp_support.find_open(k_find_close-j))-1; 00578 ++j; 00579 if (m==k or v.l != m_lcp[m]) { // no next l-index exists 00580 if (i!=j) 00581 return root(); 00582 left = k; 00583 right = v.right; 00584 break; 00585 } 00586 left = k; 00587 right = m-1; 00588 k = m; 00589 } 00590 if (left==right) { // child interval is a leaf 00591 assert(m_csa.size()-m_csa[left] > v.l); 00592 return node_type(m_csa.size()-m_csa[left],left,right); 00593 } else { 00594 // find l of the first child interval 00595 size_type k1 = get_first_l_index(left, right, kpos, kpos); 00596 assert(m_lcp[k1] > v.l); 00597 return node_type(m_lcp[k1], left, right); 00598 } 00599 return root(); 00600 } 00601 00603 00609 size_type degree(const node_type& v)const { 00610 if (is_leaf(v)) // if v is a leave, v has no child 00611 return 0; 00612 // v is not a leave: v has at least two children 00613 size_type r = get_pos_of_closing_parenthesis_of_the_first_l_index(v.left, v.right); 00614 assert(m_lcp[m_bp_support.rank(m_bp_support.find_open(r))-1]==v.l); 00615 size_type begin = m_bp_support.preceding_closing_parentheses(r); 00616 assert(m_csa.sigma>0); 00617 if (begin > (size_type)m_csa.sigma-1) begin = m_csa.sigma-1; 00618 begin = r-begin; 00619 size_type end = r, mid; 00620 while (end != begin) { 00621 mid = (begin+end)>>1; // begin <= mid < end 00622 if (m_lcp[m_bp_support.rank(m_bp_support.find_open(mid))-1] > v.l) 00623 begin = mid+1; 00624 else 00625 end = mid; 00626 } 00627 return r-end+2; 00628 } 00629 00631 00637 node_type sibling(const node_type& v)const { 00638 if (v == root()) 00639 return root(); 00640 if (v.right == m_csa.size()-1) // there exists no next sibling if the interval touches the right border 00641 return root(); 00642 // now let [p..q] be the interval [v.left, v.right] 00643 size_type lcp_p = m_lcp[v.left]; 00644 size_type lcp_qp1 = m_lcp[v.right+1]; // is defined as v.right < m_csa.size()-1 00645 if (lcp_p > lcp_qp1) 00646 return root(); 00647 else { // search next l-index 00648 size_type new_q; 00649 size_type qp1_closing_pos = m_bp_support.find_close(m_bp_support.select(v.right+2)); 00650 if (m_bp[qp1_closing_pos-1]) {// if there exists no closing parenthesis left to qp1_closing_pos 00651 new_q = nsv(v.right+1); // i.e. we could not find a next l-index, only a smaller value: last sibling 00652 assert(new_q > v.right+1); 00653 } else { 00654 new_q = m_bp_support.rank(m_bp_support.find_open(qp1_closing_pos-1))-1; 00655 assert(new_q > v.right+1); 00656 if (m_lcp[new_q] > lcp_qp1) { // no next l-index exists: find next smaller value, last sibling 00657 new_q = nsv(v.right+1); 00658 } else { 00659 assert(m_lcp[new_q] == lcp_qp1); 00660 } 00661 } 00662 assert(new_q > v.right+1); 00663 if (new_q-1 == v.right+1) { // sibling is a singlton interval 00664 return ith_leaf(new_q/*v.right+2*/); 00665 } else { 00666 // get l-index m of the sibling 00667 size_type m_pos, m_close_pos; 00668 size_type m = get_first_l_index(v.right+1, new_q-1, m_pos, m_close_pos); 00669 return node_type(m_lcp[m], v.right+1, new_q-1); 00670 } 00671 00672 } 00673 } 00674 00675 // Returns the next sibling of node v. 00676 // Only for tests. 00677 node_type sibling_naive(const node_type& v)const { 00678 if (v==root()) 00679 return root(); 00680 node_type parent = this->parent(v); 00681 assert(parent != v); 00682 size_type nr = degree(parent); 00683 for (size_type i=1; i <= nr; ++i) 00684 if (ith_child(parent, i) == v and i!=nr) 00685 return ith_child(parent, i+1); 00686 return root(); 00687 } 00688 00690 00698 node_type child_linear(const node_type& v, unsigned char c, size_type& char_pos)const { 00699 if (is_leaf(v)) // if v is a leave, it has no child 00700 return root(); 00701 size_type k, kpos, k_find_close; 00702 // v is not a leave: v has at least two children 00703 k = get_first_l_index(v.left, v.right, kpos, k_find_close);// get first l-index k 00704 assert(m_lcp[k]==v.l); 00705 size_type left = v.left, right = k-1; 00706 char_pos = m_csa(m_csa[left]+v.l); // char_pos = invSA[SA[left]+v.l+1-1] 00707 uint16_t cc = m_csa.char2comp[c]; 00708 size_type char_ex_max_pos = m_csa.C[cc+1], char_inc_min_pos = m_csa.C[cc]; 00709 while (char_pos < char_ex_max_pos) { 00710 if (char_pos >= char_inc_min_pos) { 00711 goto found_child; 00712 } 00713 // get next l-index 00714 size_type mpos = m_bp_support.find_open((m_bp_support.find_close(kpos)-1)); 00715 size_type m = m_bp_support.rank(mpos)-1; 00716 if (m==k or v.l != m_lcp[m]) { // no next l-index exists 00717 left = k; 00718 right = v.right; 00719 char_pos = m_csa(m_csa[left]+v.l); 00720 if (char_pos < char_inc_min_pos or char_pos >= char_ex_max_pos) // check if character is two small 00721 return root(); 00722 goto found_child; 00723 } 00724 left = k; 00725 right = m-1; 00726 k = m; 00727 kpos = mpos; 00728 assert(m_csa(m_csa[left]+v.l) > char_pos); 00729 char_pos = m_csa(m_csa[left]+v.l); 00730 } 00731 return root(); 00732 found_child: 00733 if (left==right) { // child interval is a leaf 00734 assert(m_csa.size()-m_csa[left] > v.l); 00735 return node_type(m_csa.size()-m_csa[left],left,right); 00736 } else { 00737 // find l of the first child interval 00738 size_type k1pos, k1 = get_first_l_index(left, right, k1pos, k_find_close); 00739 assert(m_lcp[k1] > v.l); 00740 return node_type(m_lcp[k1], left, right); 00741 } 00742 } 00743 00744 00745 00747 00755 // TODO const unsigned char c durch char_type ersetzen 00756 node_type child(node_type v, const unsigned char c, size_type& char_pos)const { 00757 if (is_leaf(v)) // if v is a leaf = (), v has no child 00758 return root(); 00759 // else v = ( ( )) 00760 uint16_t cc = m_csa.char2comp[c]; 00761 if (cc==0 and c!=0) // TODO: aendere char2comp so ab, dass man diesen sonderfall nicht braucht 00762 return root(); 00763 size_type char_ex_max_pos = m_csa.C[cc+1], char_inc_min_pos = m_csa.C[cc]; 00764 // v is not a leave: v has at least two children 00765 size_type k, kpos, k_find_close; 00766 k = get_first_l_index(v.left, v.right, kpos, k_find_close); 00767 assert(m_lcp[k]==v.l); 00768 size_type left = v.left, right = k-1; 00769 char_pos = m_csa(m_csa[left]+v.l); 00770 if (char_pos >= char_ex_max_pos) {// c is lex. greater than the first character of the first child interval 00771 return root(); 00772 } else if (char_pos >= char_inc_min_pos) { // found interval [left, right] 00773 goto found_child; 00774 } 00775 { 00776 size_type k_char_pos = m_csa(m_csa[k]+v.l); 00777 { 00778 size_type begin = m_bp_support.preceding_closing_parentheses(k_find_close); 00779 assert(m_csa.sigma>0); 00780 if (begin > (size_type)m_csa.sigma-1) begin = m_csa.sigma-1; 00781 begin = k_find_close - begin; 00782 size_type end = k_find_close, m_find_close, m; 00783 while (end != begin) { 00784 m_find_close = (begin+end)>>1; // begin <= m_find_close < end 00785 m = m_bp_support.rank(m_bp_support.find_open(m_find_close))-1; 00786 if (m_lcp[m] > v.l or /*m_lcp[m]==v.l and */ (char_pos = m_csa(m_csa[m]+v.l)) >= char_ex_max_pos) { 00787 begin = m_find_close+1; 00788 } else { // m_lcp[m] == v.l and char_pos < char_ex_max_pos 00789 end = m_find_close; 00790 k = m; 00791 k_char_pos = char_pos; 00792 } 00793 } 00794 assert(m_lcp[k] == v.l); 00795 assert(m_bp_support.find_close(m_bp_support.select(k+1)) == end); 00796 if (k_char_pos < char_ex_max_pos and k_char_pos >= char_inc_min_pos) {// child interval exists 00797 left = k; 00798 char_pos = k_char_pos; 00799 assert(end > 0); 00800 if (m_bp[end-1] == 1) { // last child interval 00801 right = v.right; 00802 } else { 00803 size_type k1 = m_bp_support.rank(m_bp_support.find_open(end-1))-1; 00804 if (m_lcp[k1] > v.l) {// last child interval 00805 right = v.right; 00806 } else { 00807 right = k1-1; 00808 } 00809 } 00810 } else { 00811 return root(); 00812 } 00813 } 00814 } 00815 found_child: 00816 if (left==right) { // child interval is a leaf 00817 assert(m_csa.size()-m_csa[left] > v.l); 00818 return node_type(m_csa.size()-m_csa[left],left,right); 00819 } else { 00820 // find l of the first child interval 00821 size_type k1pos, k1 = get_first_l_index(left, right, k1pos, k_find_close); 00822 assert(m_lcp[k1] > v.l); 00823 return node_type(m_lcp[k1], left, right); 00824 } 00825 } 00826 00828 // \sa child(node_type v, const unsigned char c, size_type &char_pos) 00829 node_type child(const node_type& v, const unsigned char c) { 00830 size_type char_pos; 00831 return child(v, c, char_pos); 00832 } 00833 00835 00842 unsigned char edge(const node_type& v, size_type d)const { 00843 #ifndef NDEBUG 00844 if (d < 1 or d > depth(v)) { 00845 throw std::out_of_range("OUT_OF_RANGE_ERROR: "+util::demangle(typeid(this).name())+"cst_sct2<>::edge(node_type v, size_type d). d == 0 or d > depth(v)!"); 00846 } 00847 #endif 00848 size_type order = m_csa(m_csa[v.left]+d-1); 00849 uint16_t c_begin = 1, c_end = m_csa.sigma+1, mid; 00850 while (c_begin < c_end) { 00851 mid = (c_begin+c_end)>>1; 00852 if (m_csa.C[mid] <= order) { 00853 c_begin = mid+1; 00854 } else { 00855 c_end = mid; 00856 } 00857 } 00858 return m_csa.comp2char[c_begin-1]; 00859 } 00860 00861 00863 00870 node_type lca(node_type v, node_type w)const { 00871 if (v.left > w.left or (v.left == w.left and v.right < w.right)) { 00872 std::swap(v, w); 00873 } 00874 assert(v.left < w.left or (v.left==w.left and v.right >= w.right)); 00875 assert(!(v.left < w.left and v.right > w.left and v.right < w.right)); // assert that v and w do not overlapp 00876 if (v.right >= w.right) { // v encloses w or v==w 00877 return v; 00878 } else { // v.left < v.right < w.left < w.right 00879 size_type min_index = rmq(v.left+1, w.right); 00880 size_type l = m_lcp[min_index]; 00881 if (l==0) 00882 return root(); 00883 else 00884 return node_type(l, psv(min_index, l), nsv(min_index)-1); 00885 } 00886 } 00887 00889 00895 size_type depth(const node_type& v)const { 00896 return v.l; 00897 } 00898 00900 00906 // TODO: can be implemented in O(1) with o(n) space. See Jansson, Sadakane, Sung, SODA 2007, "Ultra-succinct Representation of Ordered Trees" 00907 size_type node_depth(node_type v)const { 00908 size_type d = 0; 00909 while (v != root()) { 00910 ++d; 00911 v = parent(v); 00912 } 00913 return d; 00914 } 00915 00917 00923 node_type sl(const node_type& v)const { 00924 if (v == root()) 00925 return root(); 00926 // get interval with first char deleted 00927 size_type left = m_csa.psi[v.left]; 00928 if (is_leaf(v)) { 00929 if (v.l==1) 00930 return root(); 00931 else 00932 return node_type(v.l-1, left, left); 00933 } 00934 size_type right = m_csa.psi[v.right]; 00935 assert(left < right); 00936 // rmq 00937 size_type min_index = rmq(left+1, right); 00938 size_type l = m_lcp[min_index]; 00939 if (l==0) 00940 return root(); 00941 else 00942 return node_type(l, psv(min_index, l), nsv(min_index)-1); 00943 } 00944 00946 /* 00947 * \param v A valid not of a cst_sct2. 00948 * \param c The character which should be prepended to the string of the current node. 00949 * \return root() if the Weiner link of (v, c) does not exist, otherwise the Weiner link is returned. 00950 * \par Time complexity 00951 * \f$ \Order{ t_{rank\_bwt}\cdot t_{rmq} } \f$ 00952 */ 00953 node_type wl(const node_type& v, const unsigned char c) const { 00954 size_type c_left = m_csa.rank_bwt(v.left, c); 00955 size_type c_right = m_csa.rank_bwt(v.right+1, c); 00956 if (c_left == c_right) // there exists no Weiner link 00957 return root(); 00958 if (c_left+1 == c_right) 00959 return ith_leaf(m_csa.C[m_csa.char2comp[c]] + c_left + 1); 00960 else { 00961 size_type left = m_csa.C[m_csa.char2comp[c]] + c_left; 00962 size_type right = m_csa.C[m_csa.char2comp[c]] + c_right - 1; 00963 assert(left < right); 00964 // rmq to compute l-index 00965 size_type min_index = rmq(left+1, right); 00966 size_type l = m_lcp[min_index]; 00967 return node_type(l, left, right); 00968 } 00969 } 00970 00972 00977 size_type sn(const node_type& v)const { 00978 assert(is_leaf(v)); 00979 return m_csa[v.left]; 00980 } 00981 00983 00989 size_type id(const node_type& v)const { 00990 if (is_leaf(v)) { // return i in the range from 0..csa.size()-1 00991 return v.left; 00992 } 00993 size_type kpos, find_close_k; 00994 get_first_l_index(v.left, v.right, kpos, find_close_k); 00995 size_type r0clpos = find_close_k - m_bp_support.rank(find_close_k); 00996 00997 return size()+m_first_child_rank(r0clpos); 00998 } 00999 01001 size_type nodes()const { 01002 return m_nodes; 01003 } 01004 01006 /* \param lb Left bound of the lcp-interval [lb..rb] (inclusive). 01007 * \param rb Right bound of the lcp-interval [lb..rb] (inclusive). 01008 * \param l Depth of the lcp-interval. 01009 *\ return The node in the suffix tree corresponding lcp-interval [lb..rb] 01010 */ 01011 node_type node(size_type lb, size_type rb, size_type l=0) const { 01012 return node_type(l, lb, rb); 01013 } 01014 01016 void print_info()const { 01017 std::cout << "# size of cst in bytes per character" << std::endl; 01018 size_type cst_size = util::get_size_in_bytes(*this); 01019 std::cout << ((double)cst_size)/csa.size() << " # = "<< ((double)cst_size)/(1<<20) <<" MB"<<std::endl; 01020 std::cout<< ((double)util::get_size_in_bytes(csa))/cst_size << " # ratio of csa size" << std::endl; 01021 std::cout<< ((double)util::get_size_in_bytes(lcp))/cst_size << " # ratio of lcp size" << std::endl; 01022 std::cout<< ((double)util::get_size_in_bytes(bp))/cst_size << " # ratio of bp size" << std::endl; 01023 std::cout<< ((double)util::get_size_in_bytes(bp_support))/cst_size << " # ratio of bp_support size" << std::endl; 01024 std::cout<< 0 << " # ratio of bp_rank_10 size" << std::endl; 01025 std::cout<< 0 << " # ratio of bp_select_10 size" << std::endl; 01026 } 01027 01028 /* @} */ 01029 01030 }; 01031 01032 // == template functions == 01033 01034 template<class Csa, class Lcp, class Bp_support, class Rank_support> 01035 template<uint8_t int_width, class size_type_class, uint8_t int_width_1, class size_type_class_1, uint8_t int_width_2, class size_type_class_2> 01036 cst_sct2<Csa, Lcp, Bp_support, Rank_support>::cst_sct2(const std::string& csa_file_name, 01037 int_vector_file_buffer<int_width, size_type_class>& lcp_buf, 01038 int_vector_file_buffer<int_width_1, size_type_class_1>& sa_buf, 01039 int_vector_file_buffer<int_width_2, size_type_class_2>& isa_buf, 01040 std::string dir, 01041 bool build_only_bps 01042 ):csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support), first_child_bv(m_first_child) 01043 { 01044 std::string id = util::to_string(util::get_pid())+"_"+util::to_string(util::get_id()).c_str(); 01045 write_R_output("cst", "construct BPS", "begin", 1, 0); 01046 m_nodes = algorithm::construct_supercartesian_tree_bp_succinct_and_first_child(lcp_buf, m_bp, m_first_child) + m_bp.size()/2; 01047 write_R_output("cst", "construct BPS", "end", 1, 0); 01048 01049 write_R_output("cst", "construct CLCP", "begin", 1, 0); 01050 construct_lcp(m_lcp, *this, lcp_buf, isa_buf); 01051 write_R_output("cst", "construct CLCP", "end", 1, 0); 01052 01053 util::load_from_file(m_csa, csa_file_name.c_str()); 01054 01055 write_R_output("cst", "construct BPSS", "begin", 1, 0); 01056 m_bp_support = Bp_support(&m_bp); 01057 util::init_support(m_first_child_rank, &m_first_child); 01058 write_R_output("cst", "construct BPSS", "end",1,0); 01059 } 01060 01061 01062 template<class Csa, class Lcp, class Bp_support, class Rank_support> 01063 cst_sct2<Csa, Lcp, Bp_support, Rank_support>::cst_sct2(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps):csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support), first_child_bv(m_first_child) 01064 { 01065 construct(file_map, dir, id, build_only_bps); 01066 } 01067 01068 01069 template<class Csa, class Lcp, class Bp_support, class Rank_support> 01070 void cst_sct2<Csa, Lcp, Bp_support, Rank_support>::construct(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps) 01071 { 01072 write_R_output("cst", "construct BPS", "begin", 1, 0); 01073 int_vector_file_buffer<> lcp_buf(file_map["lcp"].c_str()); 01074 m_nodes = algorithm::construct_supercartesian_tree_bp_succinct_and_first_child(lcp_buf, m_bp, m_first_child) + m_bp.size()/2; 01075 write_R_output("cst", "construct BPS", "end", 1, 0); 01076 01077 write_R_output("cst", "construct BPSS", "begin", 1, 0); 01078 m_bp_support = Bp_support(&m_bp); 01079 write_R_output("cst", "construct BPSS", "end", 1, 0); 01080 01081 write_R_output("cst", "construct CLCP", "begin", 1, 0); 01082 construct_lcp(m_lcp, *this, file_map, dir, id); 01083 write_R_output("cst", "construct CLCP", "end", 1, 0); 01084 01085 util::load_from_file(m_csa, file_map["csa"].c_str()); 01086 } 01087 01088 01089 01090 template<class Csa, class Lcp, class Bp_support, class Rank_support> 01091 typename cst_sct2<Csa, Lcp, Bp_support, Rank_support>::size_type cst_sct2<Csa, Lcp, Bp_support, Rank_support>::serialize(std::ostream& out) const 01092 { 01093 size_type written_bytes = 0; 01094 written_bytes += m_csa.serialize(out); 01095 written_bytes += m_lcp.serialize(out); 01096 written_bytes += m_bp.serialize(out); 01097 written_bytes += m_bp_support.serialize(out); 01098 written_bytes += m_first_child.serialize(out); 01099 written_bytes += m_first_child_rank.serialize(out); 01100 out.write((char*) &m_nodes, sizeof(m_nodes)); 01101 written_bytes += sizeof(m_nodes); 01102 return written_bytes; 01103 } 01104 01105 template<class Csa, class Lcp, class Bp_support, class Rank_support> 01106 void cst_sct2<Csa, Lcp, Bp_support, Rank_support>::load(std::istream& in) 01107 { 01108 m_csa.load(in); 01109 load_lcp(m_lcp, in, *this); 01110 m_bp.load(in); 01111 m_bp_support.load(in, &m_bp); 01112 m_first_child.load(in); 01113 m_first_child_rank.load(in, &m_first_child); 01114 in.read((char*) &m_nodes, sizeof(m_nodes)); 01115 #ifdef SDSL_DEBUG 01116 assert(algorithm::check_bp_support(m_bp, m_bp_support)); 01117 std::cerr<<"checked bp_support"<<std::endl; 01118 #endif 01119 } 01120 01121 template<class Csa, class Lcp, class Bp_support, class Rank_support> 01122 cst_sct2<Csa, Lcp, Bp_support, Rank_support>& cst_sct2<Csa, Lcp, Bp_support, Rank_support>::operator=(const cst_sct2& cst) 01123 { 01124 if (this != &cst) { 01125 copy(cst); 01126 } 01127 return *this; 01128 } 01129 01130 01131 01132 01133 01134 } // end namespace sdsl 01135 01136 01137 #endif