SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2010 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00021 #ifndef INCLUDED_SDSL_CST_SCT3 00022 #define INCLUDED_SDSL_CST_SCT3 00023 00024 #include "int_vector.hpp" 00025 #include "algorithms.hpp" 00026 #include "iterators.hpp" 00027 #include "lcp_support_tree.hpp" 00028 #include "lcp_wt.hpp" 00029 #include "bp_support.hpp" 00030 #include "csa_wt.hpp" // for std initialization of cst_sct3 00031 #include "csa_uncompressed.hpp" 00032 #include "cst_iterators.hpp" 00033 #include "cst_sct.hpp" // for lcp_interval 00034 #include "rank_support.hpp" 00035 #include "select_support.hpp" 00036 #include "testutils.hpp" 00037 #include "util.hpp" 00038 #include <iostream> 00039 #include <algorithm> 00040 #include <cassert> 00041 #include <cstring> // for strlen 00042 #include <iomanip> 00043 #include <iterator> 00044 #include <stack> // for the calculation of the balanced parantheses sequence 00045 #include <ostream> 00046 00047 namespace sdsl 00048 { 00049 00050 struct cst_tag; // forward declaration 00051 00052 template<class Int = int_vector<>::size_type> 00053 struct bp_interval { 00054 Int i; 00055 Int j; 00056 Int ipos; // position of the i+1th opening parenthesis in the balanced parentheses sequence 00057 Int cipos; // position of the matching closing parenthesis of the i+1th opening parenthesis in the balanced parentheses sequence 00058 Int jp1pos; // position of the j+2th opening parenthesis in the balanced parentheses sequence 00059 00061 00063 bp_interval(Int i=0, Int j=0, Int ipos=0, Int cipos=0, Int jp1pos=0):i(i),j(j),ipos(ipos),cipos(cipos),jp1pos(jp1pos) {}; 00064 00065 00066 bool operator<(const bp_interval& interval)const { 00067 if (i!=interval.i) 00068 return i<interval.i; 00069 return j<interval.j; 00070 } 00071 00073 00075 bool operator==(const bp_interval& interval)const { 00076 return i==interval.i and 00077 j==interval.j; 00078 } 00079 00081 00083 bool operator!=(const bp_interval& interval)const { 00084 return !(*this==interval); 00085 } 00086 00088 00090 bp_interval& operator=(const bp_interval& interval) { 00091 i = interval.i; 00092 j = interval.j; 00093 ipos = interval.ipos; 00094 cipos = interval.cipos; 00095 jp1pos = interval.jp1pos; 00096 return *this; 00097 } 00098 }; 00099 00100 00101 template<class Int> 00102 inline std::ostream& operator<<(std::ostream& os, const bp_interval<Int>& interval) 00103 { 00104 os<<"-["<<interval.i<<","<<interval.j<<"]("<<interval.ipos<<","<<interval.cipos<<","<<interval.jp1pos<<")"; 00105 return os; 00106 } 00107 00108 00110 00130 template<class Csa = csa_wt<>, // CSA type 00131 class Lcp = lcp_support_tree<lcp_wt<> >, // LCP type 00132 class Bp_support = bp_support_sada<>, // for the balanced parentheses 00133 class Rank_support = rank_support_v5<> // for the 'first child' bit_vector 00134 > 00135 class cst_sct3 00136 { 00137 public: 00138 typedef typename Csa::value_type value_type; // STL Container requirement/TODO: ist das nicht gleich node type??? 00139 typedef cst_dfs_const_forward_iterator<cst_sct3> const_iterator;// STL Container requirement 00140 // typedef const_iterator iterator; // STL Container requirement 00141 typedef cst_bottom_up_const_forward_iterator<cst_sct3> const_bottom_up_iterator; 00142 // typedef const_bottom_up_iterator bottom_up_iterator; 00143 typedef const value_type const_reference; 00144 typedef const_reference reference; 00145 typedef const_reference* pointer; 00146 typedef const pointer const_pointer; 00147 typedef typename Csa::size_type size_type; // STL Container requirement 00148 typedef size_type cst_size_type; 00149 typedef ptrdiff_t difference_type; // STL Container requirement 00150 typedef Csa csa_type; 00151 typedef typename Lcp::template type<cst_sct3>::lcp_type lcp_type; 00152 typedef Bp_support bp_support_type; 00153 typedef typename Csa::pattern_type pattern_type; 00154 typedef typename Csa::char_type char_type; 00155 typedef bp_interval<size_type> node_type; 00156 typedef Rank_support fc_rank_support_type; 00157 00158 typedef cst_tag index_category; 00159 private: 00160 Csa m_csa; 00161 lcp_type m_lcp; 00162 bit_vector m_bp; 00163 bp_support_type m_bp_support; 00164 bit_vector m_first_child; // implementation note: no rank structure is needed for the first_child bit_vector, except for id() 00165 fc_rank_support_type m_first_child_rank; 00166 uint8_t m_sigma; 00167 size_type m_nodes; 00168 00169 void copy(const cst_sct3& cst) { 00170 m_csa = cst.m_csa; 00171 copy_lcp(m_lcp, cst.m_lcp, *this); 00172 m_bp = cst.m_bp; 00173 m_bp_support = cst.m_bp_support; 00174 m_bp_support.set_vector(&m_bp); 00175 m_first_child = cst.m_first_child; 00176 m_first_child_rank = cst.m_first_child_rank; 00177 m_first_child_rank.set_vector(&m_first_child); 00178 m_sigma = cst.m_sigma; 00179 m_nodes = cst.m_nodes; 00180 } 00181 00182 // Get the first l index of a [i,j] interval. 00183 /* I.e. given an interval [i,j], the function returns the position of the smallest entry lcp[k] with \f$ i<k\leq j \f$ 00184 * \par Time complexity 00185 * \f$ \Order{1} \f$ 00186 */ 00187 inline size_type get_first_l_index(const node_type& node, size_type& kpos, size_type& ckpos)const { 00188 if (node.cipos > node.jp1pos) { // corresponds to m_lcp[i] <= m_lcp[j+1] 00189 ckpos = node.jp1pos-1; 00190 } else { // corresponds to m_lcp[i] > m_lcp[j+1] 00191 ckpos = node.cipos-1; 00192 } 00193 assert(m_bp[ckpos]==0); 00194 kpos = m_bp_support.find_open(ckpos); 00195 return m_bp_support.rank(kpos)-1; 00196 } 00197 00198 // Get the ith l index of a node 00199 // if there exists no ith l-index return node.j+1 00200 size_type get_ith_l_index(const node_type& node, size_type i, size_type& kpos, size_type& ckpos)const { 00201 uint64_t children = 0; 00202 00203 assert(i > 0); 00204 if (node.cipos > node.jp1pos) { // corresponds to m_lcp[i] <= m_lcp[j+1] 00205 ckpos = node.jp1pos-1; 00206 } else { // corresponds to m_lcp[i] > m_lcp[j+1] 00207 ckpos = node.cipos-1; 00208 } 00209 assert(m_bp[ckpos] == 0); // at least the first l-index should be present, i.e. node is not leaf 00210 if (1 == i) { 00211 kpos = m_bp_support.find_open(ckpos); 00212 return m_bp_support.rank(kpos)-1; 00213 } else { // i > 1 00214 size_type r = ckpos - m_bp_support.rank(ckpos); // numbers of closing parentheses - 1 = index of first child in m_first_child 00215 if (r+1 >= i) { // if there exist more than i l-indices 00216 // check if m_first_child[r-i+1..r-1] consists of zeros 00217 const uint64_t* p = m_first_child.data() + (r>>6); 00218 uint8_t offset = r&0x3F; 00219 00220 uint64_t w = (*p) & bit_magic::Li1Mask[offset]; 00221 if (w) { 00222 children = offset - bit_magic::l1BP(w) + 1; 00223 } else if (m_first_child.data() == p) { // w==0 and we are in the first word 00224 children = offset + 2; // da bit_magic::l1BP(w)=-1 sein muesste 00225 } else { 00226 children = offset + 2; 00227 while (p > m_first_child.data()) { 00228 w = *(--p); 00229 if (0==w) 00230 children += 64; 00231 else { 00232 children += (63-bit_magic::l1BP(w)); 00233 break; 00234 } 00235 } 00236 children += (w==0); 00237 } 00238 if (i < children) { // there exists an ith l-index 00239 ckpos -= (i-1); 00240 assert(m_bp[ckpos] == 0); 00241 kpos = m_bp_support.find_open(ckpos); 00242 return m_bp_support.rank(kpos)-1; 00243 } 00244 } 00245 // if i >= children 00246 kpos = node.jp1pos; 00247 if (kpos < m_bp.size()) 00248 ckpos = m_bp_support.find_close(kpos); 00249 else 00250 ckpos = m_bp.size(); 00251 return node.j+1; 00252 } 00253 } 00254 00255 // Get the postion of the first l index of a l-[i,j] interval in the balanced parentheses sequence. 00256 /* \par Time complexity 00257 * \f$ \Order{1} \f$ 00258 */ 00259 inline size_type get_pos_of_closing_parenthesis_of_the_first_l_index(const node_type& node)const { 00260 if (node.cipos > node.jp1pos) { // corresponds to m_lcp[i] <= m_lcp[j+1] 00261 return node.jp1pos-1; 00262 } else { // corresponds to m_lcp[i] > m_lcp[j+1] 00263 return node.cipos-1; 00264 } 00265 } 00266 00267 // Get the next smaller value. 00268 /* \par Time complexity 00269 * \f$ \Order{1} \f$ 00270 */ 00271 // Returns n if there is no next smaller value in [i+1..n-1] 00272 inline size_type nsv(size_type i, size_type ipos)const { // possible optimization: calculate also position of nsv, i.e. next ( following position cipos 00273 size_type cipos = m_bp_support.find_close(ipos); 00274 size_type result = m_bp_support.rank(cipos); 00275 return result; 00276 } 00277 00278 // Get the previous smaller value. 00279 /* 00280 * \par Time complexity 00281 * \f$ \Order{\frac{\sigma}{w}} \f$, where w=64 is the word size, can be implemented in \f$\Order{1}\f$ with rank and select 00282 */ 00283 inline size_type psv(size_type i, size_type ipos, size_type cipos, size_type& psvpos, size_type& psvcpos)const { 00284 if (cipos + m_sigma >= m_bp.size()) { // if lcp[i]==0 => psv is the 0th index by definition 00285 psvpos = 0; 00286 psvcpos = m_bp.size()-1; 00287 return 0; 00288 } 00289 // TODO einfach am anfang pruefen ob cipos+1 00290 if (m_bp[cipos+1]) { 00291 psvpos = m_bp_support.enclose(ipos); 00292 psvcpos = m_bp_support.find_close(psvpos); 00293 return m_bp_support.rank(psvpos)-1; 00294 } 00295 00296 size_type r0 = cipos - m_bp_support.rank(cipos); // index of clothing parenthesis in first_child bp 00297 size_type next_first_child = 0; 00298 const uint64_t* p = m_first_child.data() + (r0>>6); 00299 uint64_t w = (*p) >> (r0&0x3F); 00300 if (w) { // if w!=0 00301 next_first_child = cipos + bit_magic::r1BP(w); 00302 if (cipos == next_first_child and m_bp[next_first_child+1]) { 00303 psvpos = m_bp_support.enclose(ipos); 00304 psvcpos = m_bp_support.find_close(psvpos); 00305 return m_bp_support.rank(psvpos)-1; 00306 } 00307 } else { 00308 cipos += 64-(r0&0x3F); 00309 ++p; 00310 while (!(w=*p)) { // while w==0 00311 ++p; 00312 cipos += 64; 00313 } 00314 next_first_child = cipos + bit_magic::r1BP(w); 00315 } 00316 if (!m_bp[next_first_child+1]) { // if next parenthesis is a closing one 00317 psvcpos = next_first_child+1; 00318 psvpos = m_bp_support.find_open(psvcpos); 00319 return m_bp_support.rank(psvpos)-1; 00320 } else { 00321 psvpos = m_bp_support.enclose(m_bp_support.find_open(next_first_child)); 00322 psvcpos = m_bp_support.find_close(psvpos); 00323 return m_bp_support.rank(psvpos)-1; 00324 } 00325 } 00326 00327 // Range minimum query based on the rr_enclose method. 00328 /* \par Time complexity 00329 * \f$ \Order{\rrenclose} \f$ 00330 */ 00331 inline size_type rmq(size_type l, size_type r)const { 00332 size_type i = m_bp_support.select(l+1); 00333 size_type j = m_bp_support.select(r+1); 00334 size_type fc_i = m_bp_support.find_close(i); 00335 if (j < fc_i) { // i < j < find_close(j) < find_close(i) 00336 return l; 00337 } else { // i < find_close(i) < j < find_close(j) 00338 size_type ec = m_bp_support.rr_enclose(i,j); 00339 if (ec == m_bp_support.size()) {// no restricted enclosing pair found 00340 return r; 00341 } else { // found range restriced enclosing pair 00342 return m_bp_support.rank(ec)-1; // subtract 1, as the index is 0 based 00343 } 00344 } 00345 } 00346 00347 public: 00348 const Csa& csa; 00349 const lcp_type& lcp; 00350 const bit_vector& bp; 00351 const bp_support_type& bp_support; 00352 00353 const bit_vector& first_child_bv; 00354 const fc_rank_support_type& first_child_rank; 00355 00357 /* @{ */ 00358 00360 cst_sct3(): csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support), first_child_bv(m_first_child), 00361 first_child_rank(m_first_child_rank) {} 00362 00363 // Constructor for the cst_sct3 taking a string for that the compressed suffix tree should be calculated. 00364 /* 00365 * \param str Text for which the \f$ \CST \f$ should be constructed. The text should be terminated by a zero byte. 00366 * \pre The text has to be terminated by a zero byte. 00367 */ 00368 // cst_sct3(const unsigned char *str); 00369 00370 template<uint8_t int_width, class size_type_class, uint8_t int_width_1, class size_type_class_1, uint8_t int_width_2, class size_type_class_2> 00371 cst_sct3(const std::string& cst_file_name, 00372 int_vector_file_buffer<int_width, size_type_class>& lcp_buf, 00373 int_vector_file_buffer<int_width_1, size_type_class_1>& sa_buf, 00374 int_vector_file_buffer<int_width_2, size_type_class_2>& isa_buf, 00375 std::string dir="./", 00376 bool build_ony_bps=false); 00377 00378 cst_sct3(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps=false); 00379 00381 00386 cst_sct3(const cst_sct3& cst):csa(m_csa),lcp(m_lcp),bp(m_bp),bp_support(m_bp_support),first_child_bv(m_first_child), 00387 first_child_rank(m_first_child_rank) { 00388 copy(cst); 00389 } 00390 00391 /* @} */ 00392 00394 ~cst_sct3() {} 00395 00396 void construct(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps=false); 00397 00399 00402 size_type size()const { 00403 return m_bp.size()>>1; 00404 } 00405 00407 00410 static size_type max_size() { 00411 return Csa::max_size(); 00412 } 00413 00415 00418 bool empty()const { 00419 return m_csa.empty(); 00420 } 00421 00423 00431 void swap(cst_sct3& cst) { 00432 if (this != &cst) { 00433 m_csa.swap(cst.m_csa); 00434 m_bp.swap(cst.m_bp); 00435 util::swap_support(m_bp_support, cst.m_bp_support, &m_bp, &(cst.m_bp)); 00436 00437 m_first_child.swap(cst.m_first_child); 00438 util::swap_support(m_first_child_rank, cst.m_first_child_rank, &m_first_child, &(cst.m_first_child)); 00439 std::swap(m_sigma, cst.m_sigma); 00440 std::swap(m_nodes, cst.m_nodes); 00441 // anything else has to be swapped before swapping lcp 00442 swap_lcp(m_lcp, cst.m_lcp, *this, cst); 00443 } 00444 } 00445 00447 00450 const_iterator begin()const { 00451 if (0 == m_bp.size()) // special case: tree is uninitialized 00452 return end(); 00453 else if (2 == m_bp.size()) { // special case: the root is a leaf 00454 return const_iterator(this, root(), true, true); 00455 } 00456 return const_iterator(this, root(), false, true); 00457 }; 00458 00460 00463 const_iterator end()const { 00464 return const_iterator(this, root(), true, false); 00465 } 00466 00468 const_bottom_up_iterator begin_bottom_up()const { 00469 if (0 == m_bp.size()) // special case: tree is uninitialized 00470 return end_bottom_up(); 00471 return const_bottom_up_iterator(this, leftmost_leaf_in_the_subtree(root())); 00472 } 00473 00475 const_bottom_up_iterator end_bottom_up()const { 00476 return const_bottom_up_iterator(this, root(), false); 00477 } 00478 00480 00483 cst_sct3& operator=(const cst_sct3& cst); 00484 00486 00491 bool operator==(const cst_sct3& cst)const; 00492 00494 00499 bool operator!=(const cst_sct3& cst)const; 00500 00502 00505 size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const; 00506 00508 00510 void load(std::istream& in); 00511 00513 /* @{ */ 00514 00516 00520 node_type root() const { 00521 return node_type(0, size()-1, 0, m_bp.size()-1, m_bp.size()); 00522 } 00523 00525 00531 bool is_leaf(const node_type& v)const { 00532 return v.i==v.j; 00533 } 00534 00536 00543 node_type ith_leaf(size_type i)const { 00544 assert(i > 0 and i <= size()); 00545 size_type ipos = m_bp_support.select(i); 00546 size_type jp1pos; 00547 if (i == size()) 00548 jp1pos = m_bp.size(); 00549 else if (m_bp[ipos+1]) 00550 jp1pos = ipos+1; 00551 else 00552 jp1pos = m_bp_support.select(i+1); 00553 return node_type(i-1, i-1, ipos, m_bp_support.find_close(ipos), jp1pos); 00554 } 00555 00557 00564 size_type leaves_in_the_subtree(const node_type& v)const { 00565 return v.j-v.i+1; 00566 } 00567 00569 00574 node_type leftmost_leaf_in_the_subtree(const node_type& v)const { 00575 return ith_leaf(v.i+1); 00576 } 00577 00579 00584 node_type rightmost_leaf_in_the_subtree(const node_type& v)const { 00585 return ith_leaf(v.j+1); 00586 } 00587 00589 00596 size_type lb(const node_type& v)const { 00597 return v.i; 00598 } 00599 00601 00608 size_type rb(const node_type& v)const { 00609 return v.j; 00610 } 00611 /* 00612 bool lcp_value_equals_zero(size_type i)const{ 00613 size_type ipos = m_bp_support.find_close(m_bp_support.select(i+1)); 00614 return ipos >= (m_bp.size() - m_sigma); 00615 } 00616 */ 00617 00619 00624 node_type parent(const node_type& v) const { 00625 //std::cout<<"parent "<<depth(v)<<v<<std::endl; 00626 #ifndef NDEBUG 00627 // std::cout<<"parent interval of "<<v.l<<"-["<<v.left<<" "<<v.right<<"]"<<std::endl; 00628 #endif 00629 if (v.cipos > v.jp1pos) { // LCP[i] <= LCP[j+1] 00630 size_type psv_pos, psv_cpos, psv_v, nsv_v, nsv_p1pos; 00631 psv_v = psv(v.j+1, v.jp1pos, m_bp_support.find_close(v.jp1pos), psv_pos, psv_cpos); 00632 nsv_v = nsv(v.j+1, v.jp1pos)-1; 00633 if (nsv_v == size()-1) { 00634 nsv_p1pos = m_bp.size(); 00635 } else { // nsv_v < size()-1 00636 nsv_p1pos = m_bp_support.select(nsv_v+2); 00637 } 00638 return node_type(psv_v, nsv_v, psv_pos, psv_cpos, nsv_p1pos); 00639 } else { // LCP[i] > LCP[j+1] 00640 size_type psv_pos, psv_cpos, psv_v; 00641 psv_v = psv(v.i, v.ipos, v.cipos, psv_pos, psv_cpos); 00642 return node_type(psv_v, v.j, psv_pos, psv_cpos, v.jp1pos); 00643 } 00644 } 00645 00647 00653 node_type sibling(const node_type& v)const { 00654 //std::cout<<"sibling "<<depth(v)<<v<<std::endl; 00655 // Vorgehen:(1) Bestimmen, ob v noch einen rechten Bruder hat. Entsp. parent hat gleicht rechte Grenze wie v. Speziallfall rechter Rand ist hier schon behandelt! 00656 if (v.cipos < v.jp1pos) { // LCP[i] > LCP[j+1] => v hat die selbe rechte Grenze wie parent(v) => kein rechter Bruder 00657 return root(); 00658 } 00659 // (2) Es existiert ein rechter Bruder, LCP[j+1] >= LCP[i] und j>i 00660 // es gilt nun: v.cipos > v.jp1pos 00661 size_type cjp1posm1 = m_bp_support.find_close(v.jp1pos)-1; // v.cipos-2 ??? 00662 //std::cerr<<"cjp1posm1="<<cjp1posm1<<std::endl; 00663 // size_type cjp1posm1 = v.cipos-1; 00664 // if( cjp1posm1+1 != v.cipos ){ 00665 // std::cout<<v<<" "<<cjp1posm1+1<<" "<<v.cipos<<std::endl; 00666 // } 00667 // size_type cjp1posm1 = v.cipos-2; 00668 // an der stelle von cjp1posm1 steht 1, falls v das letzte Kind ist 00669 bool last_child = m_bp[cjp1posm1]; 00670 // an der stelle von cjp1posm1 steht 0 -> entweder nicht letztes Kind oder letztes Kind 00671 if (!last_child) { 00672 size_type first_child_idx = cjp1posm1 - m_bp_support.rank(cjp1posm1); 00673 last_child = m_first_child[first_child_idx]; // if first_child indicator is true => the new sibling is the rightmost sibling 00674 } 00675 if (last_child) { 00676 size_type nsv_v = nsv(v.j+1, v.jp1pos)-1, nsv_p1pos; 00677 if (nsv_v == size()-1) { 00678 nsv_p1pos = m_bp.size(); 00679 } else { 00680 nsv_p1pos = m_bp_support.select(nsv_v+2); 00681 } 00682 return node_type(v.j+1, nsv_v, v.jp1pos, m_bp_support.find_close(v.jp1pos), nsv_p1pos); 00683 } else { 00684 //std::cerr<<"not last child: "<<std::endl; 00685 /* 00686 if( cjp1posm1+1 != v.cipos-1 ){ 00687 std::cout<<v<<" "<<cjp1posm1+1<<" "<<v.cipos<<std::endl; 00688 std::cout<<"LCP["<<v.i<<"]="<<m_lcp[v.i]<<" "<<"LCP["<<v.j<<"]="<<m_lcp[v.j]; 00689 if(v.j<size()) 00690 std::cout<<" LCP[j+1]="<<m_lcp[v.j+1]<<std::endl; 00691 } 00692 */ 00693 //std::cerr<<"find_open(cjp1posm1) = "<< m_bp_support.find_open(cjp1posm1) << std::endl; 00694 size_type new_j = m_bp_support.rank(m_bp_support.find_open(cjp1posm1))-2; 00695 return node_type(v.j+1, new_j, v.jp1pos, m_bp_support.find_close(v.jp1pos), m_bp_support.select(new_j+2)); 00696 } 00697 } 00698 00700 00709 node_type ith_child(const node_type& v, size_type i)const { 00710 // std::cerr<<i<<"th child "<<depth(v)<<v<<" i="<<i<<std::endl; 00711 assert(i > 0); 00712 if (is_leaf(v)) // if v is a leave, v has no child 00713 return root(); 00714 if (1 == i) { 00715 size_type k = 0, kpos = 0, k_find_close = 0; 00716 // v is not a leave: v has at least two children 00717 k = get_first_l_index(v, kpos, k_find_close);// get first l-index k and the position of k 00718 return node_type(v.i, k-1, v.ipos, v.cipos, kpos); 00719 } else { // i > 1 00720 size_type k1, kpos1, k_find_close1; 00721 k1 = get_ith_l_index(v, i-1, kpos1, k_find_close1); 00722 if (k1 == v.j+1) 00723 return root(); 00724 size_type k2, kpos2, k_find_close2; 00725 k2 = get_ith_l_index(v, i, kpos2, k_find_close2); 00726 return node_type(k1, k2-1, kpos1, k_find_close1, kpos2); 00727 } 00728 } 00729 00731 00737 size_type degree(const node_type& v)const { 00738 if (is_leaf(v)) // if v is a leave, v has no child 00739 return 0; 00740 // v is not a leave: v has at least two children 00741 size_type r = get_pos_of_closing_parenthesis_of_the_first_l_index(v); 00742 /* if( m_bp[r-1] ){// if there exists no next l-index 00743 return 2; 00744 } 00745 */ 00746 size_type r0 = r - m_bp_support.rank(r); 00747 const uint64_t* p = m_first_child.data() + (r0>>6); 00748 uint8_t offset = r0&0x3F; 00749 00750 uint64_t w = (*p) & bit_magic::Li1Mask[offset]; 00751 if (w) { 00752 return offset-bit_magic::l1BP(w)+1; 00753 } else if (m_first_child.data() == p) { // w==0 and we are in the first word 00754 return offset+2; // da bit_magic::l1BP(w)=-1 sein muesste 00755 } else { 00756 size_type res = offset+2; 00757 while (p > m_first_child.data()) { 00758 w = *(--p); 00759 if (0==w) 00760 res += 64; 00761 else { 00762 return res + (63-bit_magic::l1BP(w)); 00763 } 00764 } 00765 return res + (w==0); 00766 } 00767 } 00768 00769 00770 00771 // Returns the next sibling of node v. 00772 // Only for tests. 00773 node_type sibling_naive(const node_type& v)const { 00774 if (v==root()) 00775 return root(); 00776 node_type parent = this->parent(v); 00777 assert(parent != v); 00778 size_type nr = degree(parent); 00779 for (size_type i=1; i <= nr; ++i) 00780 if (ith_child(parent, i) == v and i!=nr) 00781 return ith_child(parent, i+1); 00782 return root(); 00783 } 00784 00785 00787 00795 // TODO const unsigned char c durch char_type ersetzen 00796 node_type child(const node_type& v, const unsigned char c, size_type& char_pos)const { 00797 if (is_leaf(v)) // if v is a leaf = (), v has no child 00798 return root(); 00799 // else v = ( ( )) 00800 uint16_t cc = m_csa.char2comp[c]; 00801 if (cc==0 and c!=0) // TODO: aendere char2comp so ab, dass man diesen sonderfall nicht braucht 00802 return root(); 00803 size_type char_ex_max_pos = m_csa.C[cc+1], char_inc_min_pos = m_csa.C[cc]; 00804 00805 size_type d = depth(v); 00806 00807 // (1) check the first child 00808 // char_pos = m_csa(m_csa[v.i]+d); // replaced by the next line 00809 char_pos = get_char_pos(v.i, d, m_csa); 00810 if (char_pos >= char_ex_max_pos) {// the first character of the first child interval is lex. greater than c 00811 // => all other first characters of the child intervals are also greater than c => no solution 00812 return root(); 00813 } else if (char_pos >= char_inc_min_pos) { // i.e. char_pos < char_ex_max_pos and char_pos >= char_inc_min_pos 00814 return ith_child(v, 1); 00815 } 00816 00817 size_type child_cnt = degree(v); 00818 00819 // (2) check the last child 00820 // char_pos = m_csa(m_csa[v.j]+d); // replaced by the next line 00821 char_pos = get_char_pos(v.j, d, m_csa); 00822 if (char_pos < char_inc_min_pos) {// the first character of the last child interval is lex. smaller than c 00823 // => all other first characters of the child intervals are also smaller than c => no solution 00824 return root(); 00825 } else if (char_pos < char_ex_max_pos) { // i.e. char_pos < char_ex_max_pos and char_pos >= char_inc_min_pos 00826 return ith_child(v, child_cnt); 00827 } 00828 00829 // (3) binary search for c in the children [2..children) 00830 size_type l_bound = 2, r_bound = child_cnt, mid, kpos, ckpos, l_index; 00831 while (l_bound < r_bound) { 00832 mid = (l_bound + r_bound) >> 1; 00833 00834 l_index = get_ith_l_index(v, mid-1, kpos, ckpos); 00835 // char_pos = m_csa(m_csa[l_index]+d); // replaced by the next line 00836 char_pos = get_char_pos(l_index, d, m_csa); 00837 00838 if (char_inc_min_pos > char_pos) { 00839 l_bound = mid+1; 00840 } else if (char_ex_max_pos <= char_pos) { 00841 r_bound = mid; 00842 } else { // char_inc_min_pos <= char_pos < char_ex_max_pos => found child 00843 // we know that the child is not the last child, see (2) 00844 // find next l_index: we know that a new l_index exists: i.e. assert( 0 == m_bp[ckpos-1]); 00845 size_type lp1_index = m_bp_support.rank(m_bp_support.find_open(ckpos-1))-1; 00846 size_type jp1pos = m_bp.size(); 00847 if (lp1_index-1 < size()-1) { 00848 jp1pos = m_bp_support.select(lp1_index+1); 00849 } 00850 return node_type(l_index, lp1_index-1, kpos, ckpos, jp1pos); 00851 } 00852 } 00853 return root(); 00854 } 00855 00857 // \sa child(node_type v, const unsigned char c, size_type &char_pos) 00858 node_type child(const node_type& v, const unsigned char c) { 00859 size_type char_pos; 00860 return child(v, c, char_pos); 00861 } 00862 00864 00871 unsigned char edge(const node_type& v, size_type d)const { 00872 #ifndef NDEBUG 00873 if (d < 1 or d > depth(v)) { 00874 throw std::out_of_range("OUT_OF_RANGE_ERROR: "+util::demangle(typeid(this).name())+"cst_sct3<>::edge(node_type v, size_type d). d == 0 or d > depth(v)!"); 00875 } 00876 #endif 00877 // size_type order = m_csa(m_csa[v.i]+d-1); replaced by the next line 00878 size_type order = get_char_pos(v.i, d-1, m_csa); 00879 uint16_t c_begin = 1, c_end = m_sigma+1, mid; 00880 while (c_begin < c_end) { 00881 mid = (c_begin+c_end)>>1; 00882 if (m_csa.C[mid] <= order) { 00883 c_begin = mid+1; 00884 } else { 00885 c_end = mid; 00886 } 00887 } 00888 return m_csa.comp2char[c_begin-1]; 00889 } 00890 00892 00900 node_type lca(node_type v, node_type w)const { 00901 if (v.i > w.i or (v.i == w.i and v.j < w.j)) { 00902 std::swap(v, w); 00903 } 00904 // assert(v.i < w.i or (v.i==w.i and v.j >= j)); 00905 // assert( !(v.i < w.i and v.j > w.i and v.j < w.j) ); // assert that v and w do not overlapp 00906 if (v.j >= w.j) { // v encloses w or v==w 00907 return v; 00908 } else { // v.i < v.j < w.i < w.j 00909 size_type min_index = rmq(v.i+1, w.j); 00910 size_type min_index_pos = m_bp_support.select(min_index+1); 00911 size_type min_index_cpos = m_bp_support.find_close(min_index_pos); 00912 00913 if (min_index_cpos >= m_bp.size() - m_sigma) { // if lcp[min_index]==0 => return root 00914 return root(); 00915 } 00916 size_type new_j = nsv(min_index, min_index_pos)-1; 00917 size_type new_ipos, new_icpos; 00918 size_type new_i = psv(min_index, min_index_pos, min_index_cpos, new_ipos, new_icpos); 00919 size_type jp1pos = m_bp.size(); 00920 if (new_j < size()-1) { 00921 jp1pos = m_bp_support.select(new_j+2); 00922 } 00923 return node_type(new_i, new_j, new_ipos, new_icpos, jp1pos); 00924 } 00925 } 00926 00928 00934 size_type depth(const node_type& v)const { 00935 if (v.i == v.j) { 00936 return size()-m_csa[v.i]; 00937 } else if (v == root()) { 00938 return 0; 00939 } else { 00940 size_type kpos, ckpos; 00941 size_type l = get_first_l_index(v, kpos, ckpos); 00942 return m_lcp[l]; 00943 } 00944 } 00945 00947 00953 // TODO: can be implemented in O(1) with o(n) space. See Jansson, Sadakane, Sung, SODA 2007, "Ultra-succinct Representation of Ordered Trees" 00954 size_type node_depth(node_type v)const { 00955 size_type d = 0; 00956 while (v != root()) { 00957 ++d; 00958 v = parent(v); 00959 } 00960 return d; 00961 } 00962 00964 00970 node_type sl(const node_type& v)const { 00971 if (v == root()) 00972 return root(); 00973 // get interval with first char deleted 00974 size_type i = m_csa.psi[v.i]; 00975 if (is_leaf(v)) { 00976 if (v.i==0 and v.j==0) // if( v.l==1 ) 00977 return root(); 00978 else 00979 return ith_leaf(i+1); 00980 } 00981 size_type j = m_csa.psi[v.j]; 00982 assert(i < j); 00983 size_type min_index = rmq(i+1, j); // rmq 00984 size_type min_index_pos = m_bp_support.select(min_index+1); 00985 size_type min_index_cpos = m_bp_support.find_close(min_index_pos); 00986 if (min_index_cpos >= m_bp.size() - m_sigma) { // if lcp[min_index]==0 => return root 00987 return root(); 00988 } 00989 size_type new_j = nsv(min_index, min_index_pos)-1; 00990 size_type new_ipos, new_icpos; 00991 size_type new_i = psv(min_index, min_index_pos, min_index_cpos, new_ipos, new_icpos); 00992 size_type jp1pos = m_bp.size(); 00993 if (new_j < size()-1) { 00994 jp1pos = m_bp_support.select(new_j+2); 00995 } 00996 return node_type(new_i, new_j, new_ipos, new_icpos, jp1pos); 00997 } 00998 00999 01001 /* 01002 * \param v A valid not of a cst_sct3. 01003 * \param c The character which should be prepended to the string of the current node. 01004 * \return root() if the Weiner link of (v, c) does not exist, otherwise the Weiner link is returned. 01005 * \par Time complexity 01006 * \f$ \Order{ t_{rank\_bwt} } \f$ 01007 * 01008 */ 01009 node_type wl(const node_type& v, const unsigned char c) const { 01010 size_type c_left = m_csa.rank_bwt(v.i, c); 01011 size_type c_right = m_csa.rank_bwt(v.j+1, c); 01012 if (c_left == c_right) // there exists no Weiner link 01013 return root(); 01014 if (c_left+1 == c_right) 01015 return ith_leaf(m_csa.C[m_csa.char2comp[c]] + c_left + 1); 01016 else { 01017 size_type left = m_csa.C[m_csa.char2comp[c]] + c_left; 01018 size_type right = m_csa.C[m_csa.char2comp[c]] + c_right - 1; 01019 assert(left < right); 01020 01021 size_type ipos = m_bp_support.select(left+1); 01022 size_type jp1pos = m_bp.size(); 01023 if (right < size()-1) { 01024 jp1pos = m_bp_support.select(right+2); 01025 } 01026 return node_type(left, right, ipos, 01027 m_bp_support.find_close(ipos), jp1pos); 01028 } 01029 } 01030 01032 01037 size_type sn(const node_type& v)const { 01038 assert(is_leaf(v)); 01039 return m_csa[v.i]; 01040 } 01041 01043 01049 size_type id(const node_type& v)const { 01050 if (is_leaf(v)) { // return id in the range from 0..csa.size()-1 01051 return v.i; 01052 } 01053 size_type ckpos; // closing parentheses of the l-index 01054 if (v.cipos > v.jp1pos) { // corresponds to m_lcp[i] <= m_lcp[j+1] 01055 ckpos = v.jp1pos-1; 01056 } else { // corresponds to m_lcp[i] > m_lcp[j+1] 01057 ckpos = v.cipos-1; 01058 } 01059 assert(m_bp[ckpos]==0); 01060 size_type r0ckpos = ckpos-m_bp_support.rank(ckpos); // determine the rank of the closing parenthesis 01061 return size()+m_first_child_rank(r0ckpos); 01062 } 01063 01065 01072 node_type inv_id(size_type id) { 01073 if (id < size()) { // the corresponding node is a leaf 01074 return ith_leaf(id+1); 01075 } else { // the corresponding node is a inner node 01076 // (1) get index of the closing parenthesis in m_first_child 01077 size_type r0ckpos = 0; 01078 { 01079 //binary search for the position of the (id-size()+1)-th set bit in 01080 id = id-size()+1; 01081 size_type lb = 0, rb = m_bp.size(); // lb inclusive, rb exclusive 01082 // invariant: arg(lb) < id, arg(rb) >= id 01083 while (rb-lb > 1) { 01084 size_type mid = lb + (rb-lb)/2; 01085 size_type arg = m_first_child_rank(mid); // ones in the prefix [0..mid-1] 01086 if (arg < id) { 01087 lb = mid; 01088 } else { // arg >= id 01089 rb = mid; 01090 } 01091 } 01092 r0ckpos = lb; 01093 } 01094 // (2) determine position clpos of the r0clpos-th closing parentheses in the parentheses sequence 01095 size_type ckpos = 0; 01096 { 01097 // binary search for the position of the (r0ckpos+1)-th closing parenthesis 01098 size_type lb = 0, rb = m_bp.size(); // lb inclusive, rb exclusive 01099 // invariant: arg(lb) < r0ckpos+1, arg(rb) >= r0ckpos+1 01100 while (rb-lb > 1) { 01101 size_type mid = lb + (rb-lb)/2; 01102 size_type arg = mid - m_bp_support.rank(mid-1); // zeros in the prefix [0..mid-1] 01103 if (arg < r0ckpos+1) { 01104 lb = mid; 01105 } else { // arg >= x 01106 rb = mid; 01107 } 01108 } 01109 ckpos = lb; 01110 } 01111 // if ( m_bp[ckpos] ){ 01112 // std::cerr<<"m_bp[ckpos] should be zero! id=" << id << std::endl; 01113 // std::cerr<<"r0ckpos="<<r0ckpos<<" rank_0(ckpos)="<< ckpos - m_bp_support.rank(ckpos-1) << std::endl; 01114 // } 01115 if (ckpos == m_bp.size()-1) { 01116 return root(); 01117 } 01118 if (m_bp[ckpos+1]) { // jp1pos < cipos 01119 // std::cout<<"case1"<<std::endl; 01120 size_type jp1pos= ckpos+1; 01121 size_type j = m_bp_support.rank(jp1pos-1)-1; 01122 size_type kpos = m_bp_support.find_open(ckpos); 01123 size_type ipos = m_bp_support.enclose(kpos); 01124 size_type cipos = m_bp_support.find_close(ipos); 01125 size_type i = m_bp_support.rank(ipos-1); 01126 return node_type(i, j, ipos, cipos, jp1pos); 01127 } else { // 01128 // std::cout<<"case2"<<std::endl; 01129 size_type cipos = ckpos+1; 01130 size_type ipos = m_bp_support.find_open(cipos); 01131 size_type i = m_bp_support.rank(ipos-1); 01132 size_type j = nsv(i, ipos)-1; 01133 size_type jp1pos= m_bp.size(); 01134 if (j != size()-1) { 01135 jp1pos = m_bp_support.select(j+2); 01136 } 01137 return node_type(i, j, ipos, cipos, jp1pos); 01138 } 01139 } 01140 } 01141 01143 size_type nodes()const { 01144 return m_nodes; 01145 } 01146 01148 /* \param lb Left bound of the lcp-interval [lb..rb] (inclusive). 01149 * \param rb Right bound of the lcp-interval [lb..rb] (inclusive). 01150 * \param l Depth of the lcp-interval. 01151 *\ return The node in the suffix tree corresponding lcp-interval [lb..rb] 01152 */ 01153 node_type node(size_type lb, size_type rb, size_type l=0) const { 01154 size_type ipos = m_bp_support.select(lb+1); 01155 size_type jp1pos; 01156 if (rb == size()-1) { 01157 jp1pos = m_bp.size(); 01158 } else { 01159 jp1pos = m_bp_support.select(rb+2); 01160 } 01161 return node_type(lb, rb, ipos, m_bp_support.find_close(ipos), jp1pos); 01162 } 01163 01165 01169 size_type tlcp_idx(size_type i) const { 01170 size_type ipos = m_bp_support.select(i+1); 01171 size_type cipos = m_bp_support.find_close(ipos); 01172 return m_first_child_rank.rank(((ipos+cipos-1)>>1)-i); 01173 } 01174 /* @} */ 01175 }; 01176 01177 // == template functions == 01178 01179 01180 template<class Csa, class Lcp, class Bp_support, class Rank_support> 01181 template<uint8_t int_width, class size_type_class, uint8_t int_width_1, class size_type_class_1, uint8_t int_width_2, class size_type_class_2> 01182 cst_sct3<Csa, Lcp, Bp_support, Rank_support>::cst_sct3(const std::string& csa_file_name, 01183 int_vector_file_buffer<int_width, size_type_class>& lcp_buf, 01184 int_vector_file_buffer<int_width_1, size_type_class_1>& sa_buf, 01185 int_vector_file_buffer<int_width_2, size_type_class_2>& isa_buf, 01186 std::string dir, 01187 bool build_only_bps 01188 ):csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support), first_child_bv(m_first_child), first_child_rank(m_first_child_rank) 01189 { 01190 std::string id = util::to_string(util::get_pid())+"_"+util::to_string(util::get_id()).c_str(); 01191 01192 write_R_output("cst", "construct BPS", "begin", 1, 0); 01193 m_nodes = algorithm::construct_supercartesian_tree_bp_succinct_and_first_child(lcp_buf, m_bp, m_first_child) + m_bp.size()/2; 01194 write_R_output("cst", "construct BPS", "end", 1, 0); 01195 01196 if (!build_only_bps) { 01197 util::load_from_file(m_csa, csa_file_name.c_str()); 01198 01199 write_R_output("cst", "construct CLCP", "begin", 1, 0); 01200 construct_lcp(m_lcp, *this, lcp_buf, isa_buf); 01201 write_R_output("cst", "construct CLCP", "end", 1, 0); 01202 } 01203 write_R_output("cst", "construct BPSS", "begin", 1, 0); 01204 util::init_support(m_bp_support, &m_bp); 01205 util::init_support(m_first_child_rank, &m_first_child); 01206 write_R_output("cst", "construct BPSS", "end", 1, 0); 01207 m_sigma = degree(root()); 01208 } 01209 01210 01211 template<class Csa, class Lcp, class Bp_support, class Rank_support> 01212 cst_sct3<Csa, Lcp, Bp_support, Rank_support>::cst_sct3(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps):csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support), first_child_bv(m_first_child), first_child_rank(m_first_child_rank) 01213 { 01214 construct(file_map, dir, id, build_only_bps); 01215 } 01216 01217 01218 template<class Csa, class Lcp, class Bp_support, class Rank_support> 01219 void cst_sct3<Csa, Lcp, Bp_support, Rank_support>::construct(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps) 01220 { 01221 write_R_output("cst", "construct BPS", "begin", 1, 0); 01222 int_vector_file_buffer<> lcp_buf(file_map["lcp"].c_str()); 01223 m_nodes = algorithm::construct_supercartesian_tree_bp_succinct_and_first_child(lcp_buf, m_bp, m_first_child) + m_bp.size()/2; 01224 write_R_output("cst", "construct BPS", "end", 1, 0); 01225 write_R_output("cst", "construct BPSS", "begin", 1, 0); 01226 util::init_support(m_bp_support, &m_bp); 01227 util::init_support(m_first_child_rank, &m_first_child); 01228 write_R_output("cst", "construct BPSS", "end", 1, 0); 01229 01230 if (!build_only_bps) { 01231 write_R_output("cst", "construct CLCP", "begin", 1, 0); 01232 construct_lcp(m_lcp, *this, file_map, dir, id); 01233 write_R_output("cst", "construct CLCP", "end", 1, 0); 01234 } 01235 if (!build_only_bps) { 01236 util::load_from_file(m_csa, file_map["csa"].c_str()); 01237 } 01238 m_sigma = degree(root()); 01239 } 01240 01241 template<class Csa, class Lcp, class Bp_support, class Rank_support> 01242 typename cst_sct3<Csa, Lcp, Bp_support, Rank_support>::size_type cst_sct3<Csa, Lcp, Bp_support, Rank_support>::serialize(std::ostream& out, structure_tree_node* v, std::string name)const 01243 { 01244 structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); 01245 size_type written_bytes = 0; 01246 written_bytes += m_csa.serialize(out, child, "csa"); 01247 written_bytes += m_lcp.serialize(out, child, "lcp"); 01248 written_bytes += m_bp.serialize(out, child, "bp"); 01249 written_bytes += m_bp_support.serialize(out, child, "bp_support"); 01250 written_bytes += m_first_child.serialize(out, child, "mark_child"); 01251 written_bytes += m_first_child_rank.serialize(out, child, "mark_child_rank"); 01252 written_bytes += util::write_member(m_sigma, out, child, "sigma"); 01253 written_bytes += util::write_member(m_nodes, out, child, "node_cnt"); 01254 structure_tree::add_size(child, written_bytes); 01255 return written_bytes; 01256 } 01257 01258 template<class Csa, class Lcp, class Bp_support, class Rank_support> 01259 void cst_sct3<Csa, Lcp, Bp_support, Rank_support>::load(std::istream& in) 01260 { 01261 m_csa.load(in); 01262 load_lcp(m_lcp, in, *this); 01263 m_bp.load(in); 01264 m_bp_support.load(in, &m_bp); 01265 m_first_child.load(in); 01266 m_first_child_rank.load(in, &m_first_child); 01267 util::read_member(m_sigma, in); 01268 util::read_member(m_nodes, in); 01269 #ifdef SDSL_DEBUG 01270 assert(algorithm::check_bp_support(m_bp, m_bp_support)); 01271 std::cerr<<"checked bp_support"<<std::endl; 01272 #endif 01273 } 01274 01275 template<class Csa, class Lcp, class Bp_support, class Rank_support> 01276 cst_sct3<Csa, Lcp, Bp_support, Rank_support>& cst_sct3<Csa, Lcp, Bp_support, Rank_support>::operator=(const cst_sct3& cst) 01277 { 01278 if (this != &cst) { 01279 copy(cst); 01280 } 01281 return *this; 01282 } 01283 01284 01285 01286 01287 01288 } // end namespace sdsl 01289 01290 01291 #endif