SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2009 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00021 #ifndef INCLUDED_SDSL_CST_SCT 00022 #define INCLUDED_SDSL_CST_SCT 00023 00024 #include "int_vector.hpp" 00025 #include "algorithms.hpp" 00026 #include "iterators.hpp" 00027 #include "lcp_support_sada.hpp" 00028 #include "bp_support.hpp" 00029 #include "csa_wt.hpp" // for std initialization of cst_sct 00030 #include "csa_uncompressed.hpp" 00031 #include "csa_wt.hpp" 00032 #include "cst_iterators.hpp" 00033 #include "testutils.hpp" 00034 #include "util.hpp" 00035 #include <iostream> 00036 #include <algorithm> 00037 #include <cassert> 00038 #include <cstring> // for strlen 00039 #include <iomanip> 00040 #include <iterator> 00041 #include <stack> // for the calculation of the balanced parantheses sequence 00042 #include <ostream> 00043 00044 namespace sdsl 00045 { 00046 00047 struct cst_tag; // forward declaration 00048 00050 00064 template<class Int = size_t> 00065 struct lcp_interval { 00066 Int l; 00067 Int left; 00068 Int right; 00069 00071 00076 lcp_interval(Int l=0, Int left=0, Int right=0):l(l),left(left),right(right) {}; 00077 00078 00079 bool operator<(const lcp_interval& interval)const { 00080 if (l!=interval.l) 00081 return l<interval.l; 00082 if (left!=interval.left) 00083 return left<interval.left; 00084 return right<interval.right; 00085 00086 } 00087 00089 00091 bool operator==(const lcp_interval& interval)const { 00092 return l==interval.l and 00093 left==interval.left and 00094 right==interval.right; 00095 } 00096 00098 00100 bool operator!=(const lcp_interval& interval)const { 00101 return !(*this==interval); 00102 } 00103 00105 00107 lcp_interval& operator=(const lcp_interval& interval) { 00108 l = interval.l; 00109 left = interval.left; 00110 right = interval.right; 00111 return *this; 00112 } 00113 }; 00114 00115 template<class Int> 00116 inline std::ostream& operator<<(std::ostream& os, const lcp_interval<Int>& interval) 00117 { 00118 os<<interval.l<<"-["<<interval.left<<","<<interval.right<<"]"; 00119 return os; 00120 } 00121 00122 00123 00125 00147 template<class Csa = csa_wt<>, class Lcp = lcp_support_sada<Csa>, class Bp_support = bp_support_sada<> > 00148 class cst_sct 00149 { 00150 public: 00151 typedef typename Csa::value_type value_type; // STL Container requirement/TODO: ist das nicht gleich node type??? 00152 typedef cst_dfs_const_forward_iterator<cst_sct> const_iterator;// STL Container requirement 00153 typedef const_iterator iterator; // STL Container requirement 00154 typedef cst_bottom_up_const_forward_iterator<cst_sct> const_bottom_up_iterator; 00155 typedef const_bottom_up_iterator bottom_up_iterator; 00156 typedef const value_type const_reference; 00157 typedef const_reference reference; 00158 typedef const_reference* pointer; 00159 typedef const pointer const_pointer; 00160 typedef typename Csa::size_type size_type; // STL Container requirement 00161 typedef size_type cst_size_type; 00162 typedef ptrdiff_t difference_type; // STL Container requirement 00163 typedef Csa csa_type; 00164 typedef Lcp lcp_type; 00165 typedef Bp_support bp_support_type; 00166 typedef typename Csa::pattern_type pattern_type; 00167 typedef typename Csa::char_type char_type; 00168 typedef lcp_interval<size_type> node_type; 00169 00170 typedef cst_tag index_category; 00171 private: 00172 Csa m_csa; 00173 Lcp m_lcp; 00174 bit_vector m_bp; 00175 Bp_support m_bp_support; 00176 size_type m_nodes; 00177 00178 void copy(const cst_sct& cst) { 00179 m_csa = cst.m_csa; 00180 // lcp_trait<Csa, Lcp>::copy_lcp(m_lcp, cst.m_lcp, m_csa); 00181 copy_lcp(m_lcp, cst.m_lcp, *this); 00182 m_bp = cst.m_bp; 00183 m_bp_support = cst.m_bp_support; 00184 m_bp_support.set_vector(&m_bp); 00185 m_nodes = cst.m_nodes; 00186 } 00187 00188 // Get the first l index of a l-[i,j] interval. 00189 /* I.e. given an interval [i,j], the function returns the position of the smallest entry lcp[k] with \f$ i<k\leq j \f$ 00190 * Time complexity: O(1) + O(lcp access) 00191 */ // TODO: if interval is last child interval -> we don't need the m_lcp comparisson 00192 size_type get_first_l_index(size_type i, size_type j, size_type& kpos, size_type& find_close_k)const { 00193 if (j+1 != m_csa.size() and m_lcp[i] <= m_lcp[j+1]) { 00194 size_type jp1pos = m_bp_support.select(j+2); 00195 assert(m_bp[jp1pos] == 1); // opening parenthesis 00196 assert(jp1pos > 0 and m_bp[jp1pos-1] == 0); // closing parenthesis 00197 find_close_k = jp1pos-1; 00198 kpos = m_bp_support.find_open(find_close_k); 00199 return m_bp_support.rank(kpos)-1; 00200 } else { 00201 size_type ipos = m_bp_support.select(i+1); 00202 assert(m_bp[ipos] == 1); // opening parenthesis 00203 ipos = m_bp_support.find_close(ipos); 00204 assert(m_bp[ipos] == 0 and m_bp[ipos-1] == 0); // closing parenthesis 00205 find_close_k = ipos-1; 00206 kpos = m_bp_support.find_open(find_close_k); 00207 return m_bp_support.rank(kpos)-1; 00208 } 00209 } 00210 00211 // Get the postion of the first l index of a l-[i,j] interval in the balanced parentheses sequence. 00212 /* \par Time complexity 00213 * \f$ \Order{\lcpaccess} \f$ 00214 */ 00215 size_type get_pos_of_closing_parenthesis_of_the_first_l_index(size_type i, size_type j)const { 00216 if (j+1 != m_csa.size() and m_lcp[i] <= m_lcp[j+1]) { 00217 size_type jp1pos = m_bp_support.select(j+2); 00218 assert(m_bp[jp1pos] == 1); // opening parenthesis 00219 assert(jp1pos > 0 and m_bp[jp1pos-1] == 0); // closing parenthesis 00220 return jp1pos-1; 00221 } else { 00222 size_type ipos = m_bp_support.select(i+1); 00223 assert(m_bp[ipos] == 1); // opening parenthesis 00224 ipos = m_bp_support.find_close(ipos); 00225 assert(m_bp[ipos] == 0 and m_bp[ipos-1] == 0); // closing parenthesis 00226 return ipos-1; 00227 } 00228 } 00229 00230 // Get the next smaller value. 00231 /* \par Time complexity 00232 * \f$ \Order{1} \f$ 00233 */ 00234 size_type nsv(size_type i)const { 00235 size_type pos = m_bp_support.select(i+1); 00236 size_type rank = m_bp_support.find_close(pos); 00237 size_type result = m_bp_support.rank(rank); 00238 return result; 00239 } 00240 00241 // Get the previous smaller value. 00242 /* \pre lcp[i] > 0! 00243 * \par Time complexity 00244 * \f$ \Order{\sigma \cdot \lcpaccess} \f$ 00245 * \sa psv 00246 */ 00247 size_type psv_linear(size_type i, size_type lcpi)const { 00248 assert(lcpi>0); 00249 size_type pos = m_bp_support.select(i+1); 00250 size_type ec = m_bp_support.enclose(pos); 00251 size_type j = m_bp_support.rank(ec); 00252 while (lcpi == m_lcp[j-1]) { 00253 ec = m_bp_support.enclose(ec); 00254 j = m_bp_support.rank(ec); 00255 } 00256 return j-1; 00257 } 00258 00259 // Get the previous smaller value. 00260 /* \pre lcp[i] > 0! 00261 * \par Time complexity 00262 * \f$ \Order{\log\sigma \cdot \lcpaccess} \f$ 00263 */ 00264 size_type psv(size_type i, size_type lcpi)const { 00265 assert(lcpi!=0);// if(m_lcp[i]==0) return 0; 00266 size_type begin = m_bp_support.find_close(m_bp_support.select(i+1)); // calculate NSV 00267 size_type end = m_bp_support.rank(begin)+1; // " " // exclusive 00268 if (end > m_csa.size()) { 00269 assert(end == m_csa.size()+1); 00270 end = m_bp.size(); 00271 } else { 00272 end = m_bp_support.select(end); 00273 } 00274 if (++begin < end) { 00275 size_type result = m_bp_support.rank(m_bp_support.find_open(end-1))-1; 00276 if (m_lcp[result] < lcpi) { // TODO: explain this condition 00277 // binary search 00278 --end; 00279 while (begin!=end) { 00280 size_type mid, temp; 00281 mid = (begin+end)>>1; // begin <= mid < end 00282 if (m_lcp[temp = m_bp_support.rank(m_bp_support.find_open(mid))-1] >= lcpi) { 00283 begin = mid+1; 00284 } else { 00285 result = temp; 00286 end = mid; 00287 } 00288 } 00289 return result; 00290 } 00291 } 00292 00293 return m_bp_support.rank(m_bp_support.enclose(end))-1; 00294 } 00295 00296 // Range minimum query based on the rr_enclose method. 00297 /* \par Time complexity 00298 * \f$ \Order{\rrenclose} \f$ 00299 */ 00300 size_type rmq(size_type l, size_type r)const { 00301 size_type i = m_bp_support.select(l+1); 00302 size_type j = m_bp_support.select(r+1); 00303 size_type fc_i = m_bp_support.find_close(i); 00304 if (j < fc_i) { // i < j < find_close(j) < find_close(i) 00305 return l; 00306 } else { // i < find_close(i) < j < find_close(j) 00307 size_type ec = m_bp_support.rr_enclose(i,j); 00308 if (ec == m_bp_support.size()) {// no restricted enclosing pair found 00309 return r; 00310 } else { // found range restriced enclosing pair 00311 return m_bp_support.rank(ec)-1; // subtract 1, as the index is 0 based 00312 } 00313 } 00314 } 00315 00316 public: 00317 const csa_type& csa; 00318 const lcp_type& lcp; 00319 const bit_vector& bp; 00320 const bp_support_type& bp_support; 00321 00323 /* @{ */ 00324 00326 cst_sct(): csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support) {} 00327 00328 // Constructor for the cst_sct taking a string for that the compressed suffix tree should be calculated. 00329 /* 00330 * \param str Text for which the \f$ \CST \f$ should be constructed. The text should be terminated by a zero byte. 00331 * \pre The text has to be terminated by a zero byte. 00332 * \sa cst_sct::cst_sct(const Csa &csa, const unsigned char *str) 00333 */ 00334 // cst_sct(const unsigned char *str); 00335 00336 template<uint8_t int_width, class size_type_class, uint8_t int_width_1, class size_type_class_1, uint8_t int_width_2, class size_type_class_2> 00337 cst_sct(const std::string& cst_file_name, 00338 int_vector_file_buffer<int_width, size_type_class>& lcp_buf, 00339 int_vector_file_buffer<int_width_1, size_type_class_1>& sa_buf, 00340 int_vector_file_buffer<int_width_2, size_type_class_2>& isa_buf, 00341 std::string dir="./", 00342 bool build_only_bps=false); 00343 00344 cst_sct(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps=false); 00345 00347 00352 cst_sct(const cst_sct& cst):csa(m_csa),lcp(m_lcp),bp(m_bp),bp_support(m_bp_support) { 00353 copy(cst); 00354 } 00355 00356 /* @} */ 00357 00358 void construct(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps=false); 00359 00361 ~cst_sct() {} 00362 00364 00367 size_type size()const { 00368 return m_csa.size(); 00369 } 00370 00372 00375 static size_type max_size() { 00376 return Csa::max_size(); 00377 } 00378 00380 00383 bool empty()const { 00384 return m_csa.empty(); 00385 } 00386 00388 00396 void swap(cst_sct<Csa, Lcp>& cst); 00397 00399 00402 const_iterator begin()const { 00403 if (0 == m_bp.size()) // special case: tree is uninitialized 00404 return end(); 00405 else if (2 == m_bp.size()) { // special case: the root is a leaf 00406 return const_iterator(this, root(), true, true); 00407 } 00408 return const_iterator(this, root(), false, true); 00409 }; 00410 00412 00415 const_iterator end()const { 00416 return const_iterator(this, root(), true, false); 00417 } 00418 00420 const_bottom_up_iterator begin_bottom_up()const { 00421 if (0 == m_bp.size()) // special case: tree is uninitialized 00422 return end_bottom_up(); 00423 return const_bottom_up_iterator(this, leftmost_leaf_in_the_subtree(root())); 00424 } 00425 00427 const_bottom_up_iterator end_bottom_up()const { 00428 return const_bottom_up_iterator(this, root(), false); 00429 } 00430 00432 00435 cst_sct& operator=(const cst_sct& cst); 00436 00438 00443 bool operator==(const cst_sct& cst)const; 00444 00446 00451 bool operator!=(const cst_sct& cst)const; 00452 00454 00457 size_type serialize(std::ostream& out) const; 00458 00460 00462 void load(std::istream& in); 00463 00465 /* @{ */ 00466 00468 00472 node_type root() const { 00473 return node_type(0, 0, m_csa.size()-1); 00474 } 00475 00477 00483 bool is_leaf(const node_type& v)const { 00484 return v.right==v.left; 00485 } 00486 00488 00495 node_type ith_leaf(size_type i)const { 00496 assert(i > 0 and i <= m_csa.size()); 00497 return node_type(m_csa.size()-m_csa[i-1], i-1, i-1); 00498 } 00499 00501 00508 size_type leaves_in_the_subtree(const node_type& v)const { 00509 return v.right-v.left+1; 00510 } 00511 00513 00518 node_type leftmost_leaf_in_the_subtree(const node_type& v)const { 00519 return node_type(m_csa.size()-m_csa[v.left], v.left, v.left); 00520 } 00521 00523 00528 node_type rightmost_leaf_in_the_subtree(const node_type& v)const { 00529 return node_type(m_csa.size()-m_csa[v.right], v.right, v.right); 00530 } 00531 00533 00540 size_type lb(const node_type& v)const { 00541 return v.left; 00542 } 00543 00545 00552 size_type rb(const node_type& v)const { 00553 return v.right; 00554 } 00555 00557 00562 node_type parent(const node_type& v) const { 00563 #ifndef NDEBUG 00564 // std::cout<<"parent interval of "<<v.l<<"-["<<v.left<<" "<<v.right<<"]"<<std::endl; 00565 #endif 00566 if (v.l <= 1) {// if v.l < 1 => root() 00567 return root(); // return root node 00568 } else { // l > 1 00569 if (v.right+1 == m_csa.size()) { 00570 // search for the previous smaller value 00571 size_type lcpi = m_lcp[v.left]; 00572 if (lcpi==0) 00573 return root(); 00574 assert(lcpi!=0); 00575 return node_type(lcpi, psv(v.left, lcpi), v.right); 00576 } else if (v.left == 0) { 00577 // search for the next smaller value 00578 return node_type(m_lcp[v.right+1],0, nsv(v.right+1)-1); 00579 } else { 00580 // TODO: speed up, only two cases? 00581 size_type lcpl = m_lcp[v.left], lcpr = m_lcp[v.right+1]; 00582 if (lcpl < lcpr) { 00583 return node_type(lcpr, v.left, nsv(v.right+1)-1); 00584 } else if (lcpl > lcpr) { 00585 return node_type(lcpl, psv(v.left, lcpl), v.right); 00586 } else { 00587 if (lcpl==0) 00588 return root(); 00589 return node_type(lcpl, psv(v.left, lcpl), nsv(v.right+1)-1); 00590 } 00591 } 00592 } 00593 } 00594 00595 00597 00606 node_type ith_child(const node_type& v, size_type i)const { 00607 if (is_leaf(v)) // if v is a leave, v has no child 00608 return root(); 00609 size_type k = 0, kpos = 0, k_find_close = 0; 00610 // v is not a leave: v has at least two children 00611 k = get_first_l_index(v.left, v.right, kpos, k_find_close);// get first l-index k and the position of k 00612 assert(m_lcp[k]==v.l); 00613 size_type left = v.left, right = k-1; 00614 size_type j=1; 00615 while (j < i) { 00616 // get next l-index m and the position of m 00617 size_type m = m_bp_support.rank(m_bp_support.find_open(k_find_close-j))-1; 00618 ++j; 00619 if (m==k or v.l != m_lcp[m]) { // no next l-index exists 00620 if (i!=j) 00621 return root(); 00622 left = k; 00623 right = v.right; 00624 break; 00625 } 00626 left = k; 00627 right = m-1; 00628 k = m; 00629 } 00630 if (left==right) { // child interval is a leaf 00631 assert(m_csa.size()-m_csa[left] > v.l); 00632 return node_type(m_csa.size()-m_csa[left],left,right); 00633 } else { 00634 // find l of the first child interval 00635 size_type k1 = get_first_l_index(left, right, kpos, kpos); 00636 assert(m_lcp[k1] > v.l); 00637 return node_type(m_lcp[k1], left, right); 00638 } 00639 return root(); 00640 } 00641 00643 00649 size_type degree(const node_type& v)const { 00650 if (is_leaf(v)) // if v is a leave, v has no child 00651 return 0; 00652 // v is not a leave: v has at least two children 00653 size_type r = get_pos_of_closing_parenthesis_of_the_first_l_index(v.left, v.right); 00654 assert(m_lcp[m_bp_support.rank(m_bp_support.find_open(r))-1]==v.l); 00655 size_type begin = m_bp_support.preceding_closing_parentheses(r); 00656 assert(m_csa.sigma>0); 00657 if (begin > (size_type)m_csa.sigma-1) begin = m_csa.sigma-1; 00658 begin = r-begin; 00659 size_type end = r, mid; 00660 while (end != begin) { 00661 mid = (begin+end)>>1; // begin <= mid < end 00662 if (m_lcp[m_bp_support.rank(m_bp_support.find_open(mid))-1] > v.l) 00663 begin = mid+1; 00664 else 00665 end = mid; 00666 } 00667 return r-end+2; 00668 } 00669 00671 00677 node_type sibling(const node_type& v)const { 00678 if (v == root()) 00679 return root(); 00680 if (v.right == m_csa.size()-1) // there exists no next sibling if the interval touches the right border 00681 return root(); 00682 // now let [p..q] be the interval [v.left, v.right] 00683 size_type lcp_p = m_lcp[v.left]; 00684 size_type lcp_qp1 = m_lcp[v.right+1]; // is defined as v.right < m_csa.size()-1 00685 if (lcp_p > lcp_qp1) 00686 return root(); 00687 else { // search next l-index 00688 size_type new_q; 00689 size_type qp1_closing_pos = m_bp_support.find_close(m_bp_support.select(v.right+2)); 00690 if (m_bp[qp1_closing_pos-1]) {// if there exists no closing parenthesis left to qp1_closing_pos 00691 new_q = nsv(v.right+1); // i.e. we could not find a next l-index, only a smaller value: last sibling 00692 assert(new_q > v.right+1); 00693 } else { 00694 new_q = m_bp_support.rank(m_bp_support.find_open(qp1_closing_pos-1))-1; 00695 assert(new_q > v.right+1); 00696 if (m_lcp[new_q] > lcp_qp1) { // no next l-index exists: find next smaller value, last sibling 00697 new_q = nsv(v.right+1); 00698 } else { 00699 assert(m_lcp[new_q] == lcp_qp1); 00700 } 00701 } 00702 assert(new_q > v.right+1); 00703 if (new_q-1 == v.right+1) { // sibling is a singlton interval 00704 return ith_leaf(new_q/*v.right+2*/); 00705 } else { 00706 // get l-index m of the sibling 00707 size_type m_pos, m_close_pos; 00708 size_type m = get_first_l_index(v.right+1, new_q-1, m_pos, m_close_pos); 00709 return node_type(m_lcp[m], v.right+1, new_q-1); 00710 } 00711 00712 } 00713 } 00714 00715 // Returns the next sibling of node v. 00716 // Only for tests. 00717 node_type sibling_naive(const node_type& v)const { 00718 if (v==root()) 00719 return root(); 00720 node_type parent = this->parent(v); 00721 assert(parent != v); 00722 size_type nr = degree(parent); 00723 for (size_type i=1; i <= nr; ++i) 00724 if (ith_child(parent, i) == v and i!=nr) 00725 return ith_child(parent, i+1); 00726 return root(); 00727 } 00728 00730 00738 node_type child_linear(const node_type& v, unsigned char c, size_type& char_pos)const { 00739 if (is_leaf(v)) // if v is a leave, it has no child 00740 return root(); 00741 size_type k, kpos, k_find_close; 00742 // v is not a leave: v has at least two children 00743 k = get_first_l_index(v.left, v.right, kpos, k_find_close);// get first l-index k 00744 assert(m_lcp[k]==v.l); 00745 size_type left = v.left, right = k-1; 00746 char_pos = m_csa(m_csa[left]+v.l); // char_pos = invSA[SA[left]+v.l+1-1] 00747 uint16_t cc = m_csa.char2comp[c]; 00748 size_type char_ex_max_pos = m_csa.C[cc+1], char_inc_min_pos = m_csa.C[cc]; 00749 while (char_pos < char_ex_max_pos) { 00750 if (char_pos >= char_inc_min_pos) { 00751 goto found_child; 00752 } 00753 // get next l-index 00754 size_type mpos = m_bp_support.find_open((m_bp_support.find_close(kpos)-1)); 00755 size_type m = m_bp_support.rank(mpos)-1; 00756 if (m==k or v.l != m_lcp[m]) { // no next l-index exists 00757 left = k; 00758 right = v.right; 00759 char_pos = m_csa(m_csa[left]+v.l); 00760 if (char_pos < char_inc_min_pos or char_pos >= char_ex_max_pos) // check if character is two small 00761 return root(); 00762 goto found_child; 00763 } 00764 left = k; 00765 right = m-1; 00766 k = m; 00767 kpos = mpos; 00768 assert(m_csa(m_csa[left]+v.l) > char_pos); 00769 char_pos = m_csa(m_csa[left]+v.l); 00770 } 00771 return root(); 00772 found_child: 00773 if (left==right) { // child interval is a leaf 00774 assert(m_csa.size()-m_csa[left] > v.l); 00775 return node_type(m_csa.size()-m_csa[left],left,right); 00776 } else { 00777 // find l of the first child interval 00778 size_type k1pos, k1 = get_first_l_index(left, right, k1pos, k_find_close); 00779 assert(m_lcp[k1] > v.l); 00780 return node_type(m_lcp[k1], left, right); 00781 } 00782 } 00783 00784 00785 00787 00795 // TODO const unsigned char c durch char_type ersetzen 00796 node_type child(const node_type& v, const unsigned char c, size_type& char_pos)const { 00797 if (is_leaf(v)) // if v is a leaf = (), v has no child 00798 return root(); 00799 // else v = ( ( )) 00800 uint16_t cc = m_csa.char2comp[c]; 00801 if (cc==0 and c!=0) // TODO: aendere char2comp so ab, dass man diesen sonderfall nicht braucht 00802 return root(); 00803 size_type char_ex_max_pos = m_csa.C[cc+1], char_inc_min_pos = m_csa.C[cc]; 00804 // v is not a leave: v has at least two children 00805 size_type k, kpos, k_find_close; 00806 k = get_first_l_index(v.left, v.right, kpos, k_find_close); 00807 assert(m_lcp[k]==v.l); 00808 size_type left = v.left, right = k-1; 00809 char_pos = m_csa(m_csa[left]+v.l); 00810 if (char_pos >= char_ex_max_pos) {// c is lex. greater than the first character of the first child interval 00811 return root(); 00812 } else if (char_pos >= char_inc_min_pos) { // found interval [left, right] 00813 goto found_child; 00814 } 00815 { 00816 size_type k_char_pos = m_csa(m_csa[k]+v.l); 00817 { 00818 size_type begin = m_bp_support.preceding_closing_parentheses(k_find_close); 00819 assert(m_csa.sigma>0); 00820 if (begin > (size_type)m_csa.sigma-1) begin = m_csa.sigma-1; 00821 begin = k_find_close - begin; 00822 size_type end = k_find_close, m_find_close, m; 00823 while (end != begin) { 00824 m_find_close = (begin+end)>>1; // begin <= m_find_close < end 00825 m = m_bp_support.rank(m_bp_support.find_open(m_find_close))-1; 00826 if (m_lcp[m] > v.l or /*m_lcp[m]==v.l and */ (char_pos = m_csa(m_csa[m]+v.l)) >= char_ex_max_pos) { 00827 begin = m_find_close+1; 00828 } else { // m_lcp[m] == v.l and char_pos < char_ex_max_pos 00829 end = m_find_close; 00830 k = m; 00831 k_char_pos = char_pos; 00832 } 00833 } 00834 assert(m_lcp[k] == v.l); 00835 assert(m_bp_support.find_close(m_bp_support.select(k+1)) == end); 00836 if (k_char_pos < char_ex_max_pos and k_char_pos >= char_inc_min_pos) {// child interval exists 00837 left = k; 00838 char_pos = k_char_pos; 00839 assert(end > 0); 00840 if (m_bp[end-1] == 1) { // last child interval 00841 right = v.right; 00842 } else { 00843 size_type k1 = m_bp_support.rank(m_bp_support.find_open(end-1))-1; 00844 if (m_lcp[k1] > v.l) {// last child interval 00845 right = v.right; 00846 } else { 00847 right = k1-1; 00848 } 00849 } 00850 } else { 00851 return root(); 00852 } 00853 } 00854 } 00855 found_child: 00856 if (left==right) { // child interval is a leaf 00857 assert(m_csa.size()-m_csa[left] > v.l); 00858 return node_type(m_csa.size()-m_csa[left],left,right); 00859 } else { 00860 // find l of the first child interval 00861 size_type k1pos, k1 = get_first_l_index(left, right, k1pos, k_find_close); 00862 assert(m_lcp[k1] > v.l); 00863 return node_type(m_lcp[k1], left, right); 00864 } 00865 } 00866 00868 // \sa child(node_type v, const unsigned char c, size_type &char_pos) 00869 node_type child(const node_type& v, const unsigned char c) { 00870 size_type char_pos; 00871 return child(v, c, char_pos); 00872 } 00873 00875 00882 unsigned char edge(const node_type& v, size_type d)const { 00883 #ifndef NDEBUG 00884 if (d < 1 or d > depth(v)) { 00885 throw std::out_of_range("OUT_OF_RANGE_ERROR: "+util::demangle(typeid(this).name())+"cst_sct<>::edge(node_type v, size_type d). d == 0 or d > depth(v)!"); 00886 } 00887 #endif 00888 size_type order = m_csa(m_csa[v.left]+d-1); 00889 uint16_t c_begin = 1, c_end = m_csa.sigma+1, mid; 00890 while (c_begin < c_end) { 00891 mid = (c_begin+c_end)>>1; 00892 if (m_csa.C[mid] <= order) { 00893 c_begin = mid+1; 00894 } else { 00895 c_end = mid; 00896 } 00897 } 00898 return m_csa.comp2char[c_begin-1]; 00899 } 00900 00901 00903 00910 node_type lca(node_type v, node_type w)const { 00911 if (v.left > w.left or (v.left == w.left and v.right < w.right)) { 00912 std::swap(v, w); 00913 } 00914 assert(v.left < w.left or (v.left==w.left and v.right >= w.right)); 00915 assert(!(v.left < w.left and v.right > w.left and v.right < w.right)); // assert that v and w do not overlapp 00916 if (v.right >= w.right) { // v encloses w or v==w 00917 return v; 00918 } else { // v.left < v.right < w.left < w.right 00919 size_type min_index = rmq(v.left+1, w.right); 00920 size_type l = m_lcp[min_index]; 00921 if (l==0) 00922 return root(); 00923 else 00924 return node_type(l, psv(min_index, l), nsv(min_index)-1); 00925 } 00926 } 00927 00929 00935 size_type depth(const node_type& v)const { 00936 return v.l; 00937 } 00938 00940 00946 // TODO: can be implemented in O(1) with o(n) space. See Jansson, Sadakane, Sung, SODA 2007, "Ultra-succinct Representation of Ordered Trees" 00947 size_type node_depth(node_type v)const { 00948 size_type d = 0; 00949 while (v != root()) { 00950 ++d; 00951 v = parent(v); 00952 } 00953 return d; 00954 } 00955 00957 00963 node_type sl(const node_type& v)const { 00964 if (v == root()) 00965 return root(); 00966 // get interval with first char deleted 00967 size_type left = m_csa.psi[v.left]; 00968 if (is_leaf(v)) { 00969 if (v.l==1) 00970 return root(); 00971 else 00972 return node_type(v.l-1, left, left); 00973 } 00974 size_type right = m_csa.psi[v.right]; 00975 assert(left < right); 00976 // rmq 00977 size_type min_index = rmq(left+1, right); 00978 size_type l = m_lcp[min_index]; 00979 if (l==0) 00980 return root(); 00981 else 00982 return node_type(l, psv(min_index, l), nsv(min_index)-1); 00983 } 00984 00986 /* 00987 * \param v A valid not of a cst_sct. 00988 * \param c The character which should be prepended to the string of the current node. 00989 * \return root() if the Weiner link of (v, c) does not exist, otherwise the Weiner link is returned. 00990 * \par Time complexity 00991 * \f$ \Order{ t_{rank\_bwt}\cdot t_{rmq} } \f$ 00992 */ 00993 node_type wl(const node_type& v, const unsigned char c) const { 00994 size_type c_left = m_csa.rank_bwt(v.left, c); 00995 size_type c_right = m_csa.rank_bwt(v.right+1, c); 00996 if (c_left == c_right) // there exists no Weiner link 00997 return root(); 00998 if (c_left+1 == c_right) 00999 return ith_leaf(m_csa.C[m_csa.char2comp[c]] + c_left + 1); 01000 else { 01001 size_type left = m_csa.C[m_csa.char2comp[c]] + c_left; 01002 size_type right = m_csa.C[m_csa.char2comp[c]] + c_right - 1; 01003 assert(left < right); 01004 // rmq 01005 size_type min_index = rmq(left+1, right); 01006 size_type l = m_lcp[min_index]; 01007 return node_type(l, left, right); 01008 } 01009 } 01010 01012 01017 size_type sn(const node_type& v)const { 01018 assert(is_leaf(v)); 01019 return m_csa[v.left]; 01020 } 01021 01023 01031 size_type id(const node_type& v)const { 01032 if (is_leaf(v)) { // return i in the range from 0..csa.size()-1 01033 return v.left; 01034 } 01035 size_type kpos, find_close_k; 01036 return (m_bp.size()>>1) + get_first_l_index(v.left, v.right, kpos, find_close_k); 01037 } 01038 01040 size_type nodes()const { 01041 return m_nodes; 01042 } 01043 01045 /* \param lb Left bound of the lcp-interval [lb..rb] (inclusive). 01046 * \param rb Right bound of the lcp-interval [lb..rb] (inclusive). 01047 * \param l Depth of the lcp-interval. 01048 *\ return The node in the suffix tree corresponding lcp-interval [lb..rb] 01049 */ 01050 node_type node(size_type lb, size_type rb, size_type l=0) const { 01051 return node_type(l, lb, rb); 01052 } 01053 01055 void print_info()const { 01056 std::cout << "# size of cst in bytes per character" << std::endl; 01057 size_type cst_size = util::get_size_in_bytes(*this); 01058 std::cout << ((double)cst_size)/csa.size() << " # = "<< ((double)cst_size)/(1<<20) <<" MB"<<std::endl; 01059 std::cout<< ((double)util::get_size_in_bytes(csa))/cst_size << " # ratio of csa size" << std::endl; 01060 std::cout<< ((double)util::get_size_in_bytes(lcp))/cst_size << " # ratio of lcp size" << std::endl; 01061 std::cout<< ((double)util::get_size_in_bytes(bp))/cst_size << " # ratio of bp size" << std::endl; 01062 std::cout<< ((double)util::get_size_in_bytes(bp_support))/cst_size << " # ratio of bp_support size" << std::endl; 01063 std::cout<< 0 << " # ratio of bp_rank_10 size" << std::endl; 01064 std::cout<< 0 << " # ratio of bp_select_10 size" << std::endl; 01065 } 01066 01067 /* @} */ 01068 01069 }; 01070 01071 // == template functions == 01072 01073 template<class Csa, class Lcp, class Bp_support> 01074 template<uint8_t int_width, class size_type_class, uint8_t int_width_1, class size_type_class_1, uint8_t int_width_2, class size_type_class_2> 01075 cst_sct<Csa, Lcp, Bp_support>::cst_sct(const std::string& csa_file_name, 01076 int_vector_file_buffer<int_width, size_type_class>& lcp_buf, 01077 int_vector_file_buffer<int_width_1, size_type_class_1>& sa_buf, 01078 int_vector_file_buffer<int_width_2, size_type_class_2>& isa_buf, 01079 std::string dir, 01080 bool build_only_bps 01081 ):csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support) 01082 { 01083 std::string id = util::to_string(util::get_pid())+"_"+util::to_string(util::get_id()).c_str(); 01084 write_R_output("cst", "construct BPS", "begin", 1, 0); 01085 { 01086 bit_vector m_first_child; 01087 m_nodes = algorithm::construct_supercartesian_tree_bp_succinct_and_first_child(lcp_buf, m_bp, m_first_child) + m_bp.size()/2; 01088 } 01089 write_R_output("cst", "construct BPS", "end", 1, 0); 01090 01091 write_R_output("cst", "construct BPSS", "begin", 1, 0); 01092 m_bp_support = Bp_support(&m_bp); 01093 write_R_output("cst", "construct BPSS", "end", 1, 0); 01094 01095 write_R_output("cst", "construct CLCP", "begin", 1, 0); 01096 construct_lcp(m_lcp, *this, lcp_buf, isa_buf); 01097 write_R_output("cst", "construct CLCP", "end", 1, 0); 01098 01099 util::load_from_file(m_csa, csa_file_name.c_str()); 01100 } 01101 01102 template<class Csa, class Lcp, class Bp_support> 01103 cst_sct<Csa, Lcp, Bp_support>::cst_sct(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps):csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support) 01104 { 01105 construct(file_map, dir, id, build_only_bps); 01106 } 01107 01108 template<class Csa, class Lcp, class Bp_support> 01109 void cst_sct<Csa, Lcp, Bp_support>::construct(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps) 01110 { 01111 write_R_output("cst", "construct BPS", "begin", 1, 0); 01112 { 01113 bit_vector m_first_child; 01114 int_vector_file_buffer<> lcp_buf(file_map["lcp"].c_str()); 01115 m_nodes = algorithm::construct_supercartesian_tree_bp_succinct_and_first_child(lcp_buf, m_bp, m_first_child) + m_bp.size()/2; 01116 } 01117 write_R_output("cst", "construct BPS", "end", 1, 0); 01118 01119 write_R_output("cst", "construct CLCP", "begin", 1, 0); 01120 construct_lcp(m_lcp, *this, file_map, dir, id); 01121 write_R_output("cst", "construct CLCP", "end", 1, 0); 01122 01123 write_R_output("cst", "construct BPSS", "begin", 1, 0); 01124 m_bp_support = Bp_support(&m_bp); 01125 write_R_output("cst", "construct BPSS", "end", 1, 0); 01126 01127 util::load_from_file(m_csa, file_map["csa"].c_str()); 01128 std::cout<<"debug = "<<m_csa.size()<<std::endl; 01129 std::cout<<"csa file = "<<file_map["csa"]<<std::endl; 01130 } 01131 01132 01133 01134 template<class Csa, class Lcp, class Bp_support> 01135 typename cst_sct<Csa, Lcp, Bp_support>::size_type cst_sct<Csa, Lcp, Bp_support>::serialize(std::ostream& out) const 01136 { 01137 size_type written_bytes = 0; 01138 written_bytes += m_csa.serialize(out); 01139 written_bytes += m_lcp.serialize(out); 01140 written_bytes += m_bp.serialize(out); 01141 written_bytes += m_bp_support.serialize(out); 01142 out.write((char*) &m_nodes, sizeof(m_nodes)); 01143 written_bytes += sizeof(m_nodes); 01144 return written_bytes; 01145 } 01146 01147 template<class Csa, class Lcp, class Bp_support> 01148 void cst_sct<Csa, Lcp, Bp_support>::load(std::istream& in) 01149 { 01150 m_csa.load(in); 01151 // lcp_trait<Csa, Lcp>::load_lcp(m_lcp, in, m_csa); 01152 load_lcp(m_lcp, in, *this); 01153 m_bp.load(in); 01154 m_bp_support.load(in, &m_bp); 01155 in.read((char*) &m_nodes, sizeof(m_nodes)); 01156 #ifdef SDSL_DEBUG 01157 assert(algorithm::check_bp_support(m_bp, m_bp_support)); 01158 std::cerr<<"checked bp_support"<<std::endl; 01159 #endif 01160 } 01161 01162 template<class Csa, class Lcp, class Bp_support> 01163 cst_sct<Csa, Lcp, Bp_support>& cst_sct<Csa, Lcp, Bp_support>::operator=(const cst_sct& cst) 01164 { 01165 if (this != &cst) { 01166 copy(cst); 01167 } 01168 return *this; 01169 } 01170 01171 01172 01173 01174 01175 } // end namespace sdsl 01176 01177 01178 #endif