SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2010 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00021 #ifndef INCLUDED_SDSL_CST_SCT3P 00022 #define INCLUDED_SDSL_CST_SCT3P 00023 00024 #include "int_vector.hpp" 00025 #include "algorithms.hpp" 00026 #include "iterators.hpp" 00027 #include "lcp_support_tree.hpp" 00028 #include "lcp_wt.hpp" 00029 #include "bp_support.hpp" 00030 #include "csa_wt.hpp" // for std initialization of cst_sct3p 00031 #include "csa_uncompressed.hpp" 00032 #include "cst_iterators.hpp" 00033 #include "cst_sct.hpp" // for lcp_interval 00034 #include "rank_support.hpp" 00035 #include "testutils.hpp" 00036 #include "util.hpp" 00037 #include <iostream> 00038 #include <algorithm> 00039 #include <cassert> 00040 #include <cstring> // for strlen 00041 #include <iomanip> 00042 #include <iterator> 00043 #include <stack> // for the calculation of the balanced parantheses sequence 00044 #include <ostream> 00045 00046 namespace sdsl 00047 { 00048 00049 struct cst_tag; // forward declaration 00050 00051 template<class Int = int_vector<>::size_type /*size_t*/> 00052 struct bp_interval_p 00053 { 00054 Int ipos; // position of the i+1th opening parenthesis in the balanced parenthese sequence 00055 Int cipos; // positon of the matching closing parenthesis of the i+1th openeing parenthesis in the balanced parentheses sequence 00056 Int jp1pos; // position of the j+2th opening parentheis in the balanced parentheses sequence 00057 00059 00061 bp_interval_p(Int ipos=0, Int cipos=0, Int jp1pos=0):ipos(ipos),cipos(cipos),jp1pos(jp1pos) {}; 00062 00063 00064 bool operator<(const bp_interval_p& interval)const { 00065 if (ipos!=interval.ipos) 00066 return ipos<interval.ipos; 00067 return jp1pos<interval.jp1pos; 00068 } 00069 00071 00073 bool operator==(const bp_interval_p& interval)const { 00074 return ipos==interval.ipos and 00075 jp1pos==interval.jp1pos; 00076 } 00077 00079 00081 bool operator!=(const bp_interval_p& interval)const { 00082 return !(*this==interval); 00083 } 00084 00086 00088 bp_interval_p& operator=(const bp_interval_p& interval) { 00089 ipos = interval.ipos; 00090 cipos = interval.cipos; 00091 jp1pos = interval.jp1pos; 00092 return *this; 00093 } 00094 }; 00095 00096 00097 template<class Int> 00098 inline std::ostream& operator<<(std::ostream& os, const bp_interval_p<Int>& interval) 00099 { 00100 os<<"-("<<interval.ipos<<","<<interval.cipos<<","<<interval.jp1pos<<")"; 00101 return os; 00102 } 00103 00104 00106 00126 template<class Csa = csa_wt<>, class Lcp = lcp_support_tree<lcp_wt<> >, class Bp_support = bp_support_sada<>, class Rank_support = rank_support_v5<> > 00127 class cst_sct3p 00128 { 00129 public: 00130 typedef typename Csa::value_type value_type; // STL Container requirement/TODO: ist das nicht gleich node type??? 00131 typedef cst_dfs_const_forward_iterator<cst_sct3p> const_iterator;// STL Container requirement 00132 // typedef const_iterator iterator; // STL Container requirement 00133 typedef cst_bottom_up_const_forward_iterator<cst_sct3p> const_bottom_up_iterator; 00134 // typedef const_bottom_up_iterator bottom_up_iterator; 00135 typedef const value_type const_reference; 00136 typedef const_reference reference; 00137 typedef const_reference* pointer; 00138 typedef const pointer const_pointer; 00139 typedef typename Csa::size_type size_type; // STL Container requirement 00140 typedef size_type cst_size_type; 00141 typedef ptrdiff_t difference_type; // STL Container requirement 00142 typedef Csa csa_type; 00143 typedef typename Lcp::template type<cst_sct3p>::lcp_type lcp_type; 00144 typedef Bp_support bp_support_type; 00145 typedef typename Csa::pattern_type pattern_type; 00146 typedef typename Csa::char_type char_type; 00147 typedef bp_interval_p<size_type> node_type; 00148 typedef Rank_support fc_rank_support_type; 00149 00150 typedef cst_tag index_category; 00151 private: 00152 Csa m_csa; 00153 lcp_type m_lcp; 00154 bit_vector m_bp; 00155 bp_support_type m_bp_support; 00156 bit_vector m_first_child; // implementation note: no rank structure is needed for the first_child bit_vector, except for id() 00157 fc_rank_support_type m_first_child_rank; 00158 uint8_t m_sigma; 00159 size_type m_nodes; 00160 00161 void copy(const cst_sct3p& cst) { 00162 m_csa = cst.m_csa; 00163 copy_lcp(m_lcp, cst.m_lcp, *this); 00164 m_bp = cst.m_bp; 00165 m_bp_support = cst.m_bp_support; 00166 m_bp_support.set_vector(&m_bp); 00167 m_first_child = cst.m_first_child; 00168 m_first_child_rank = cst.m_first_child_rank; 00169 m_first_child_rank.set_vector(&m_first_child); 00170 m_sigma = cst.m_sigma; 00171 m_nodes = cst.m_nodes; 00172 } 00173 00174 // Get the first l index of a [i,j] interval. 00175 /* I.e. given an interval [i,j], the function returns the position of the smallest entry lcp[k] with \f$ i<k\leq j \f$ 00176 * \par Time complexity 00177 * \f$ \Order{1} \f$ 00178 */ 00179 inline size_type get_first_l_index(const node_type& node, size_type& kpos, size_type& ckpos)const { 00180 if (node.cipos > node.jp1pos) { // corresponds to m_lcp[i] <= m_lcp[j+1] 00181 ckpos = node.jp1pos-1; 00182 } else { // corresponds to m_lcp[i] > m_lcp[j+1] 00183 ckpos = node.cipos-1; 00184 } 00185 assert(m_bp[ckpos]==0); 00186 kpos = m_bp_support.find_open(ckpos); 00187 return m_bp_support.rank(kpos)-1; 00188 } 00189 00190 // Get the ith l index of a node 00191 // if there exists no ith l-index return node.j+1 00192 size_type get_ith_l_index(const node_type& node, size_type i, size_type& kpos, size_type& ckpos)const { 00193 uint64_t children = 0; 00194 00195 assert(i > 0); 00196 if (node.cipos > node.jp1pos) { // corresponds to m_lcp[i] <= m_lcp[j+1] 00197 ckpos = node.jp1pos-1; 00198 } else { // corresponds to m_lcp[i] > m_lcp[j+1] 00199 ckpos = node.cipos-1; 00200 } 00201 assert(m_bp[ckpos] == 0); // at least the first l-index should be present, i.e. node is not leaf 00202 if (1 == i) { 00203 kpos = m_bp_support.find_open(ckpos); 00204 return m_bp_support.rank(kpos)-1; 00205 } else { // i > 1 00206 size_type r = ckpos - m_bp_support.rank(ckpos); // numbers of closing parentheses - 1 = index of first child in m_first_child 00207 if (r+1 >= i) { // if there exist more than i l-indices 00208 // check if m_first_child[r-i+1..r-1] consists of zeros 00209 const uint64_t* p = m_first_child.data() + (r>>6); 00210 uint8_t offset = r&0x3F; 00211 00212 uint64_t w = (*p) & bit_magic::Li1Mask[offset]; 00213 if (w) { 00214 children = offset - bit_magic::l1BP(w) + 1; 00215 } else if (m_first_child.data() == p) { // w==0 and we are in the first word 00216 children = offset + 2; // da bit_magic::l1BP(w)=-1 sein muesste 00217 } else { 00218 children = offset + 2; 00219 while (p > m_first_child.data()) { 00220 w = *(--p); 00221 if (0==w) 00222 children += 64; 00223 else { 00224 children += (63-bit_magic::l1BP(w)); 00225 break; 00226 } 00227 } 00228 children += (w==0); 00229 } 00230 if (i < children) { // there exists an ith l-index 00231 ckpos -= (i-1); 00232 assert(m_bp[ckpos] == 0); 00233 kpos = m_bp_support.find_open(ckpos); 00234 return m_bp_support.rank(kpos)-1; 00235 } 00236 } 00237 // if i >= children 00238 kpos = node.jp1pos; 00239 if (kpos < m_bp.size()) 00240 ckpos = m_bp_support.find_close(kpos); 00241 else 00242 ckpos = m_bp.size(); 00243 return node.j+1; 00244 } 00245 } 00246 00247 // Get the postion of the first l index of a l-[i,j] interval in the balanced parentheses sequence. 00248 /* \par Time complexity 00249 * \f$ \Order{1} \f$ 00250 */ 00251 inline size_type get_pos_of_closing_parenthesis_of_the_first_l_index(const node_type& node)const { 00252 if (node.cipos > node.jp1pos) { // corresponds to m_lcp[i] <= m_lcp[j+1] 00253 return node.jp1pos-1; 00254 } else { // corresponds to m_lcp[i] > m_lcp[j+1] 00255 return node.cipos-1; 00256 } 00257 } 00258 00259 // Get the next smaller value. 00260 /* \par Time complexity 00261 * \f$ \Order{1} \f$ 00262 */ 00263 // Returns n if there is no next smaller value in [i+1..n-1] 00264 inline size_type nsv(size_type i, size_type ipos)const { // possible optimization: calculate also position of nsv, i.e. next ( following position cipos 00265 size_type cipos = m_bp_support.find_close(ipos); 00266 size_type result = m_bp_support.rank(cipos); 00267 return result; 00268 } 00269 00270 // Get the previous smaller value. 00271 /* 00272 * \par Time complexity 00273 * \f$ \Order{\frac{\sigma}{w}} \f$, where w=64 is the word size, can be implemented in \f$\Order{1}\f$ with rank and select 00274 */ 00275 inline size_type psv(size_type i, size_type ipos, size_type cipos, size_type& psvpos, size_type& psvcpos)const { 00276 if (cipos >= m_bp.size() - m_sigma) { // if lcp[i]==0 => psv is the 0th index by definition 00277 psvpos = 0; 00278 psvcpos = m_bp.size()-1; 00279 return 0; 00280 } 00281 size_type r0 = cipos - m_bp_support.rank(cipos); // index of clothing parenthesis in first_child bp 00282 size_type next_first_child = 0; 00283 const uint64_t* p = m_first_child.data() + (r0>>6); 00284 uint64_t w = (*p) >> (r0&0x3F); 00285 if (w) { // if w!=0 00286 next_first_child = cipos + bit_magic::r1BP(w); 00287 if (cipos == next_first_child and m_bp[next_first_child+1]) { 00288 psvpos = m_bp_support.enclose(ipos); 00289 psvcpos = m_bp_support.find_close(psvpos); 00290 return m_bp_support.rank(psvpos)-1; 00291 } 00292 } else { 00293 cipos += 64-(r0&0x3F); 00294 ++p; 00295 while (!(w=*p)) { // while w==0 00296 ++p; 00297 cipos += 64; 00298 } 00299 next_first_child = cipos + bit_magic::r1BP(w); 00300 } 00301 if (!m_bp[next_first_child+1]) { // if next parenthesis is a closing one 00302 psvcpos = next_first_child+1; 00303 psvpos = m_bp_support.find_open(psvcpos); 00304 return m_bp_support.rank(psvpos)-1; 00305 } else { 00306 psvpos = m_bp_support.enclose(m_bp_support.find_open(next_first_child)); 00307 psvcpos = m_bp_support.find_close(psvpos); 00308 return m_bp_support.rank(psvpos)-1; 00309 } 00310 } 00311 00312 // Range minimum query based on the rr_enclose method. 00313 /* \par Time complexity 00314 * \f$ \Order{\rrenclose} \f$ 00315 */ 00316 inline size_type rmq(size_type l, size_type r)const { 00317 size_type i = m_bp_support.select(l+1); 00318 size_type j = m_bp_support.select(r+1); 00319 size_type fc_i = m_bp_support.find_close(i); 00320 if (j < fc_i) { // i < j < find_close(j) < find_close(i) 00321 return l; 00322 } else { // i < find_close(i) < j < find_close(j) 00323 size_type ec = m_bp_support.rr_enclose(i,j); 00324 if (ec == m_bp_support.size()) {// no restricted enclosing pair found 00325 return r; 00326 } else { // found range restriced enclosing pair 00327 return m_bp_support.rank(ec)-1; // subtract 1, as the index is 0 based 00328 } 00329 } 00330 } 00331 00332 public: 00333 const Csa& csa; 00334 const lcp_type& lcp; 00335 const bit_vector& bp; 00336 const bp_support_type& bp_support; 00337 00338 const bit_vector& first_child_bv; 00339 const fc_rank_support_type& first_child_rank; 00340 00342 /* @{ */ 00343 00345 cst_sct3p(): csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support), first_child_bv(m_first_child), first_child_rank(m_first_child_rank) {} 00346 00347 // Constructor for the cst_sct3p taking a string for that the compressed suffix tree should be calculated. 00348 /* 00349 * \param str Text for which the \f$ \CST \f$ should be constructed. The text should be terminated by a zero byte. 00350 * \pre The text has to be terminated by a zero byte. 00351 */ 00352 // cst_sct3p(const unsigned char *str); 00353 00354 template<uint8_t int_width, class size_type_class, uint8_t int_width_1, class size_type_class_1, uint8_t int_width_2, class size_type_class_2> 00355 cst_sct3p(const std::string& cst_file_name, 00356 int_vector_file_buffer<int_width, size_type_class>& lcp_buf, 00357 int_vector_file_buffer<int_width_1, size_type_class_1>& sa_buf, 00358 int_vector_file_buffer<int_width_2, size_type_class_2>& isa_buf, 00359 std::string dir, 00360 bool build_ony_bps); 00361 00362 cst_sct3p(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps); 00363 00365 00370 cst_sct3p(const cst_sct3p& cst):csa(m_csa),lcp(m_lcp),bp(m_bp),bp_support(m_bp_support),first_child_bv(m_first_child), first_child_rank(m_first_child_rank) { 00371 copy(cst); 00372 } 00373 00374 /* @} */ 00375 00377 ~cst_sct3p() {} 00378 00379 void construct(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps); 00380 00382 00385 size_type size()const { 00386 return m_bp.size()>>1; 00387 } 00388 00390 00393 static size_type max_size() { 00394 return Csa::max_size(); 00395 } 00396 00398 00401 bool empty()const { 00402 return m_csa.empty(); 00403 } 00404 00406 00414 void swap(cst_sct3p& cst); 00415 00417 00420 const_iterator begin()const { 00421 if (0 == m_bp.size()) // special case: tree is uninitialized 00422 return end(); 00423 else if (2 == m_bp.size()) { // special case: the root is a leaf 00424 return const_iterator(this, root(), true, true); 00425 } 00426 return const_iterator(this, root(), false, true); 00427 }; 00428 00430 00433 const_iterator end()const { 00434 return const_iterator(this, root(), true, false); 00435 } 00436 00438 const_bottom_up_iterator begin_bottom_up()const { 00439 if (0 == m_bp.size()) // special case: tree is uninitialized 00440 return end_bottom_up(); 00441 return const_bottom_up_iterator(this, leftmost_leaf_in_the_subtree(root())); 00442 } 00443 00445 const_bottom_up_iterator end_bottom_up()const { 00446 return const_bottom_up_iterator(this, root(), false); 00447 } 00448 00450 00453 cst_sct3p& operator=(const cst_sct3p& cst); 00454 00456 00461 bool operator==(const cst_sct3p& cst)const; 00462 00464 00469 bool operator!=(const cst_sct3p& cst)const; 00470 00472 00475 size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const; 00476 00478 00480 void load(std::istream& in); 00481 00483 /* @{ */ 00484 00486 00490 node_type root() const { 00491 return node_type(0, m_bp.size()-1, m_bp.size()); 00492 } 00493 00495 00501 bool is_leaf(const node_type& v)const { 00502 return v.i==v.j; 00503 } 00504 00506 00513 node_type ith_leaf(size_type i)const { 00514 assert(i > 0 and i <= size()); 00515 size_type ipos = m_bp_support.select(i); 00516 size_type jp1pos; 00517 if (i == size()) 00518 jp1pos = m_bp.size(); 00519 else if (m_bp[ipos+1]) 00520 jp1pos = ipos+1; 00521 else 00522 jp1pos = m_bp_support.select(i+1); 00523 return node_type(ipos, m_bp_support.find_close(ipos), jp1pos); 00524 } 00525 00527 00534 size_type leaves_in_the_subtree(const node_type& v)const { 00535 return v.j-v.i+1; 00536 } 00537 00539 00544 node_type leftmost_leaf_in_the_subtree(const node_type& v)const { 00545 return ith_leaf(v.i+1); 00546 } 00547 00549 00554 node_type rightmost_leaf_in_the_subtree(const node_type& v)const { 00555 return ith_leaf(v.j+1); 00556 } 00557 00559 00566 size_type lb(const node_type& v)const { 00567 return v.i; 00568 } 00569 00571 00578 size_type rb(const node_type& v)const { 00579 return v.j; 00580 } 00581 /* 00582 bool lcp_value_equals_zero(size_type i)const{ 00583 size_type ipos = m_bp_support.find_close(m_bp_support.select(i+1)); 00584 return ipos >= (m_bp.size() - m_sigma); 00585 } 00586 */ 00587 00589 00594 node_type parent(const node_type& v) const { 00595 //std::cout<<"parent "<<depth(v)<<v<<std::endl; 00596 #ifndef NDEBUG 00597 // std::cout<<"parent interval of "<<v.l<<"-["<<v.left<<" "<<v.right<<"]"<<std::endl; 00598 #endif 00599 if (v.cipos > v.jp1pos) { // LCP[i] <= LCP[j+1] 00600 size_type psv_pos, psv_cpos, psv_v, nsv_v, nsv_p1pos; 00601 psv_v = psv(v.j+1, v.jp1pos, m_bp_support.find_close(v.jp1pos), psv_pos, psv_cpos); 00602 nsv_v = nsv(v.j+1, v.jp1pos)-1; 00603 if (nsv_v == size()-1) { 00604 nsv_p1pos = m_bp.size(); 00605 } else { // nsv_v < size()-1 00606 nsv_p1pos = m_bp_support.select(nsv_v+2); 00607 } 00608 return node_type(psv_v, nsv_v, psv_pos, psv_cpos, nsv_p1pos); 00609 } else { // LCP[i] > LCP[j+1] 00610 size_type psv_pos, psv_cpos, psv_v; 00611 psv_v = psv(v.i, v.ipos, v.cipos, psv_pos, psv_cpos); 00612 return node_type(psv_v, v.j, psv_pos, psv_cpos, v.jp1pos); 00613 } 00614 } 00615 00617 00623 node_type sibling(const node_type& v)const { 00624 //std::cout<<"sibling "<<depth(v)<<v<<std::endl; 00625 // Vorgehen:(1) Bestimmen, ob v noch einen rechten Bruder hat. Entsp. parent hat gleicht rechte Grenze wie v. Speziallfall rechter Rand ist hier schon behandelt! 00626 if (v.cipos < v.jp1pos) { // LCP[i] > LCP[j+1] => v hat die selbe rechte Grenze wie parent(v) => kein rechter Bruder 00627 return root(); 00628 } 00629 // (2) Es existiert ein rechter Bruder, LCP[j+1] >= LCP[i] und j>i 00630 // es gilt nun: v.cipos > v.jp1pos 00631 size_type cjp1posm1 = m_bp_support.find_close(v.jp1pos)-1; // v.cipos-2 ??? 00632 //std::cerr<<"cjp1posm1="<<cjp1posm1<<std::endl; 00633 // size_type cjp1posm1 = v.cipos-1; 00634 // if( cjp1posm1+1 != v.cipos ){ 00635 // std::cout<<v<<" "<<cjp1posm1+1<<" "<<v.cipos<<std::endl; 00636 // } 00637 // size_type cjp1posm1 = v.cipos-2; 00638 // an der stelle von cjp1posm1 steht 1, falls v das letzte Kind ist 00639 bool last_child = m_bp[cjp1posm1]; 00640 // an der stelle von cjp1posm1 steht 0 -> entweder nicht letztes Kind oder letztes Kind 00641 if (!last_child) { 00642 size_type first_child_idx = cjp1posm1 - m_bp_support.rank(cjp1posm1); 00643 last_child = m_first_child[first_child_idx]; // if first_child indicator is true => the new sibling is the rightmost sibling 00644 } 00645 if (last_child) { 00646 size_type nsv_v = nsv(v.j+1, v.jp1pos)-1, nsv_p1pos; 00647 if (nsv_v == size()-1) { 00648 nsv_p1pos = m_bp.size(); 00649 } else { 00650 nsv_p1pos = m_bp_support.select(nsv_v+2); 00651 } 00652 return node_type(v.j+1, nsv_v, v.jp1pos, m_bp_support.find_close(v.jp1pos), nsv_p1pos); 00653 } else { 00654 //std::cerr<<"not last child: "<<std::endl; 00655 /* 00656 if( cjp1posm1+1 != v.cipos-1 ){ 00657 std::cout<<v<<" "<<cjp1posm1+1<<" "<<v.cipos<<std::endl; 00658 std::cout<<"LCP["<<v.i<<"]="<<m_lcp[v.i]<<" "<<"LCP["<<v.j<<"]="<<m_lcp[v.j]; 00659 if(v.j<size()) 00660 std::cout<<" LCP[j+1]="<<m_lcp[v.j+1]<<std::endl; 00661 } 00662 */ 00663 //std::cerr<<"find_open(cjp1posm1) = "<< m_bp_support.find_open(cjp1posm1) << std::endl; 00664 size_type new_j = m_bp_support.rank(m_bp_support.find_open(cjp1posm1))-2; 00665 return node_type(v.j+1, new_j, v.jp1pos, m_bp_support.find_close(v.jp1pos), m_bp_support.select(new_j+2)); 00666 } 00667 } 00668 00670 00679 node_type ith_child(const node_type& v, size_type i)const { 00680 // std::cerr<<i<<"th child "<<depth(v)<<v<<" i="<<i<<std::endl; 00681 assert(i > 0); 00682 if (is_leaf(v)) // if v is a leave, v has no child 00683 return root(); 00684 if (1 == i) { 00685 size_type k = 0, kpos = 0, k_find_close = 0; 00686 // v is not a leave: v has at least two children 00687 k = get_first_l_index(v, kpos, k_find_close);// get first l-index k and the position of k 00688 return node_type(v.i, k-1, v.ipos, v.cipos, kpos); 00689 } else { // i > 1 00690 size_type k1, kpos1, k_find_close1; 00691 k1 = get_ith_l_index(v, i-1, kpos1, k_find_close1); 00692 if (k1 == v.j+1) 00693 return root(); 00694 size_type k2, kpos2, k_find_close2; 00695 k2 = get_ith_l_index(v, i, kpos2, k_find_close2); 00696 return node_type(k1, k2-1, kpos1, k_find_close1, kpos2); 00697 } 00698 } 00699 00701 00707 size_type degree(const node_type& v)const { 00708 if (is_leaf(v)) // if v is a leave, v has no child 00709 return 0; 00710 // v is not a leave: v has at least two children 00711 size_type r = get_pos_of_closing_parenthesis_of_the_first_l_index(v); 00712 /* if( m_bp[r-1] ){// if there exists no next l-index 00713 return 2; 00714 } 00715 */ 00716 size_type r0 = r - m_bp_support.rank(r); 00717 const uint64_t* p = m_first_child.data() + (r0>>6); 00718 uint8_t offset = r0&0x3F; 00719 00720 uint64_t w = (*p) & bit_magic::Li1Mask[offset]; 00721 if (w) { 00722 return offset-bit_magic::l1BP(w)+1; 00723 } else if (m_first_child.data() == p) { // w==0 and we are in the first word 00724 return offset+2; // da bit_magic::l1BP(w)=-1 sein muesste 00725 } else { 00726 size_type res = offset+2; 00727 while (p > m_first_child.data()) { 00728 w = *(--p); 00729 if (0==w) 00730 res += 64; 00731 else { 00732 return res + (63-bit_magic::l1BP(w)); 00733 } 00734 } 00735 return res + (w==0); 00736 } 00737 } 00738 00739 00740 00741 // Returns the next sibling of node v. 00742 // Only for tests. 00743 node_type sibling_naive(const node_type& v)const { 00744 if (v==root()) 00745 return root(); 00746 node_type parent = this->parent(v); 00747 assert(parent != v); 00748 size_type nr = degree(parent); 00749 for (size_type i=1; i <= nr; ++i) 00750 if (ith_child(parent, i) == v and i!=nr) 00751 return ith_child(parent, i+1); 00752 return root(); 00753 } 00754 00755 00757 00765 // TODO const unsigned char c durch char_type ersetzen 00766 node_type child(const node_type& v, const unsigned char c, size_type& char_pos)const { 00767 if (is_leaf(v)) // if v is a leaf = (), v has no child 00768 return root(); 00769 // else v = ( ( )) 00770 uint16_t cc = m_csa.char2comp[c]; 00771 if (cc==0 and c!=0) // TODO: aendere char2comp so ab, dass man diesen sonderfall nicht braucht 00772 return root(); 00773 size_type char_ex_max_pos = m_csa.C[cc+1], char_inc_min_pos = m_csa.C[cc]; 00774 00775 size_type d = depth(v); 00776 00777 // (1) check the first child 00778 // char_pos = m_csa(m_csa[v.i]+d); // replaced by the next line 00779 char_pos = get_char_pos(v.i, d, m_csa); 00780 if (char_pos >= char_ex_max_pos) {// the first character of the first child interval is lex. greater than c 00781 // => all other first characters of the child intervals are also greater than c => no solution 00782 return root(); 00783 } else if (char_pos >= char_inc_min_pos) { // i.e. char_pos < char_ex_max_pos and char_pos >= char_inc_min_pos 00784 return ith_child(v, 1); 00785 } 00786 00787 size_type child_cnt = degree(v); 00788 00789 // (2) check the last child 00790 // char_pos = m_csa(m_csa[v.j]+d); // replaced by the next line 00791 char_pos = get_char_pos(v.j, d, m_csa); 00792 if (char_pos < char_inc_min_pos) {// the first character of the last child interval is lex. smaller than c 00793 // => all other first characters of the child intervals are also smaller than c => no solution 00794 return root(); 00795 } else if (char_pos < char_ex_max_pos) { // i.e. char_pos < char_ex_max_pos and char_pos >= char_inc_min_pos 00796 return ith_child(v, child_cnt); 00797 } 00798 00799 // (3) binary search for c in the children [2..children) 00800 size_type l_bound = 2, r_bound = child_cnt, mid, kpos, ckpos, l_index; 00801 while (l_bound < r_bound) { 00802 mid = (l_bound + r_bound) >> 1; 00803 00804 l_index = get_ith_l_index(v, mid-1, kpos, ckpos); 00805 // char_pos = m_csa(m_csa[l_index]+d); // replaced by the next line 00806 char_pos = get_char_pos(l_index, d, m_csa); 00807 00808 if (char_inc_min_pos > char_pos) { 00809 l_bound = mid+1; 00810 } else if (char_ex_max_pos <= char_pos) { 00811 r_bound = mid; 00812 } else { // char_inc_min_pos <= char_pos < char_ex_max_pos => found child 00813 // we know that the child is not the last child, see (2) 00814 // find next l_index: we know that a new l_index exists: i.e. assert( 0 == m_bp[ckpos-1]); 00815 size_type lp1_index = m_bp_support.rank(m_bp_support.find_open(ckpos-1))-1; 00816 size_type jp1pos = m_bp.size(); 00817 if (lp1_index-1 < size()-1) { 00818 jp1pos = m_bp_support.select(lp1_index+1); 00819 } 00820 return node_type(l_index, lp1_index-1, kpos, ckpos, jp1pos); 00821 } 00822 } 00823 return root(); 00824 } 00825 00827 // \sa child(node_type v, const unsigned char c, size_type &char_pos) 00828 node_type child(const node_type& v, const unsigned char c) { 00829 size_type char_pos; 00830 return child(v, c, char_pos); 00831 } 00832 00834 00841 unsigned char edge(const node_type& v, size_type d)const { 00842 #ifndef NDEBUG 00843 if (d < 1 or d > depth(v)) { 00844 throw std::out_of_range("OUT_OF_RANGE_ERROR: "+util::demangle(typeid(this).name())+"cst_sct3p<>::edge(node_type v, size_type d). d == 0 or d > depth(v)!"); 00845 } 00846 #endif 00847 // size_type order = m_csa(m_csa[v.i]+d-1); replaced by the next line 00848 size_type order = get_char_pos(v.i, d-1, m_csa); 00849 uint16_t c_begin = 1, c_end = m_sigma+1, mid; 00850 while (c_begin < c_end) { 00851 mid = (c_begin+c_end)>>1; 00852 if (m_csa.C[mid] <= order) { 00853 c_begin = mid+1; 00854 } else { 00855 c_end = mid; 00856 } 00857 } 00858 return m_csa.comp2char[c_begin-1]; 00859 } 00860 00862 00870 node_type lca(node_type v, node_type w)const { 00871 if (v.i > w.i or (v.i == w.i and v.j < w.j)) { 00872 std::swap(v, w); 00873 } 00874 assert(v.i < w.i or (v.i==w.i and v.j >= j)); 00875 assert(!(v.i < w.i and v.j > w.i and v.j < w.j)); // assert that v and w do not overlapp 00876 if (v.j >= w.j) { // v encloses w or v==w 00877 return v; 00878 } else { // v.i < v.j < w.i < w.j 00879 size_type min_index = rmq(v.i+1, w.j); 00880 size_type min_index_pos = m_bp_support.select(min_index+1); 00881 size_type min_index_cpos = m_bp_support.find_close(min_index_pos); 00882 00883 if (min_index_cpos >= m_bp.size() - m_sigma) { // if lcp[min_index]==0 => return root 00884 return root(); 00885 } 00886 size_type new_j = nsv(min_index, min_index_pos)-1; 00887 size_type new_ipos, new_icpos; 00888 size_type new_i = psv(min_index, min_index_pos, min_index_cpos, new_ipos, new_icpos); 00889 size_type jp1pos = m_bp.size(); 00890 if (new_j < size()-1) { 00891 jp1pos = m_bp_support.select(new_j+2); 00892 } 00893 return node_type(new_i, new_j, new_ipos, new_icpos, jp1pos); 00894 } 00895 } 00896 00898 00904 size_type depth(const node_type& v)const { 00905 if (v.i == v.j) { 00906 return size()-m_csa[v.i]; 00907 } else if (v == root()) { 00908 return 0; 00909 } else { 00910 size_type kpos, ckpos; 00911 size_type l = get_first_l_index(v, kpos, ckpos); 00912 return m_lcp[l]; 00913 } 00914 } 00915 00917 00923 // TODO: can be implemented in O(1) with o(n) space. See Jansson, Sadakane, Sung, SODA 2007, "Ultra-succinct Representation of Ordered Trees" 00924 size_type node_depth(node_type v)const { 00925 size_type d = 0; 00926 while (v != root()) { 00927 ++d; 00928 v = parent(v); 00929 } 00930 return d; 00931 } 00932 00934 00940 node_type sl(const node_type& v)const { 00941 if (v == root()) 00942 return root(); 00943 // get interval with first char deleted 00944 size_type i = m_csa.psi[v.i]; 00945 if (is_leaf(v)) { 00946 if (v.i==0 and v.j==0) // if( v.l==1 ) 00947 return root(); 00948 else 00949 return ith_leaf(i+1); 00950 } 00951 size_type j = m_csa.psi[v.j]; 00952 assert(i < j); 00953 size_type min_index = rmq(i+1, j); // rmq 00954 size_type min_index_pos = m_bp_support.select(min_index+1); 00955 size_type min_index_cpos = m_bp_support.find_close(min_index_pos); 00956 if (min_index_cpos >= m_bp.size() - m_sigma) { // if lcp[min_index]==0 => return root 00957 return root(); 00958 } 00959 size_type new_j = nsv(min_index, min_index_pos)-1; 00960 size_type new_ipos, new_icpos; 00961 size_type new_i = psv(min_index, min_index_pos, min_index_cpos, new_ipos, new_icpos); 00962 size_type jp1pos = m_bp.size(); 00963 if (new_j < size()-1) { 00964 jp1pos = m_bp_support.select(new_j+2); 00965 } 00966 return node_type(new_i, new_j, new_ipos, new_icpos, jp1pos); 00967 } 00968 00969 00971 /* 00972 * \param v A valid not of a cst_sct3p. 00973 * \param c The character which should be prepended to the string of the current node. 00974 * \return root() if the Weiner link of (v, c) does not exist, otherwise the Weiner link is returned. 00975 * \par Time complexity 00976 * \f$ \Order{ t_{rank\_bwt} } \f$ 00977 * 00978 */ 00979 node_type wl(const node_type& v, const unsigned char c) const { 00980 size_type c_left = m_csa.rank_bwt(v.i, c); 00981 size_type c_right = m_csa.rank_bwt(v.j+1, c); 00982 if (c_left == c_right) // there exists no Weiner link 00983 return root(); 00984 if (c_left+1 == c_right) 00985 return ith_leaf(m_csa.C[m_csa.char2comp[c]] + c_left + 1); 00986 else { 00987 size_type left = m_csa.char2comp[c] + c_left; 00988 size_type right = m_csa.char2comp[c] + c_right - 1; 00989 assert(left < right); 00990 00991 size_type ipos = m_bp_support.select(left+1); 00992 size_type jp1pos = m_bp.size(); 00993 if (right < size()-1) { 00994 jp1pos = m_bp_support.select(right+2); 00995 } 00996 return node_type(left, right, ipos, m_bp_support.find_close(ipos), jp1pos); 00997 } 00998 } 00999 01001 01006 size_type sn(const node_type& v)const { 01007 assert(is_leaf(v)); 01008 return m_csa[v.i]; 01009 } 01010 01012 01018 size_type id(const node_type& v)const { 01019 if (is_leaf(v)) { // return i in the range from 0..csa.size()-1 01020 return v.i; 01021 } 01022 size_type clpos; // closing parentheses of the l-index 01023 if (v.cipos > v.jp1pos) { // corresponds to m_lcp[i] <= m_lcp[j+1] 01024 clpos = v.jp1pos-1; 01025 } else { // corresponds to m_lcp[i] > m_lcp[j+1] 01026 clpos = v.cipos-1; 01027 } 01028 assert(m_bp[clpos]==0); 01029 size_type r0clpos = clpos-m_bp_support.rank(clpos); 01030 01031 return size()+m_first_child_rank(r0clpos); 01032 } 01033 01035 size_type nodes()const { 01036 return m_nodes; 01037 } 01038 01040 /* \param lb Left bound of the lcp-interval [lb..rb] (inclusive). 01041 * \param rb Right bound of the lcp-interval [lb..rb] (inclusive). 01042 * \param l Depth of the lcp-interval. 01043 *\ return The node in the suffix tree corresponding lcp-interval [lb..rb] 01044 */ 01045 node_type node(size_type lb, size_type rb, size_type l=0) const { 01046 size_type ipos = m_bp_support.select(lb+1); 01047 size_type jp1pos; 01048 if (rb == size()-1) { 01049 jp1pos = m_bp.size(); 01050 } else { 01051 jp1pos = m_bp_support.select(rb+2); 01052 } 01053 return node_type(ipos, m_bp_support.find_close(ipos), jp1pos); 01054 } 01055 01057 01061 size_type tlcp_idx(size_type i) const { 01062 size_type ipos = m_bp_support.select(i+1); 01063 size_type cipos = m_bp_support.find_close(ipos); 01064 return m_first_child_rank.rank(((ipos+cipos-1)>>1)-i); 01065 } 01066 /* @} */ 01067 01068 }; 01069 01070 // == template functions == 01071 01072 01073 template<class Csa, class Lcp, class Bp_support, class Rank_support> 01074 template<uint8_t int_width, class size_type_class, uint8_t int_width_1, class size_type_class_1, uint8_t int_width_2, class size_type_class_2> 01075 cst_sct3p<Csa, Lcp, Bp_support, Rank_support>::cst_sct3p(const std::string& csa_file_name, 01076 int_vector_file_buffer<int_width, size_type_class>& lcp_buf, 01077 int_vector_file_buffer<int_width_1, size_type_class_1>& sa_buf, 01078 int_vector_file_buffer<int_width_2, size_type_class_2>& isa_buf, 01079 std::string dir="./", 01080 bool build_only_bps=false 01081 ):csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support), first_child_bv(m_first_child), first_child_rank(m_first_child_rank) 01082 { 01083 std::string id = util::to_string(util::get_pid())+"_"+util::to_string(util::get_id()).c_str(); 01084 01085 write_R_output("cst", "construct BPS", "begin", 1, 0); 01086 m_nodes = algorithm::construct_supercartesian_tree_bp_succinct_and_first_child(lcp_buf, m_bp, m_first_child) + m_bp.size()/2; 01087 write_R_output("cst", "construct BPS", "end", 1, 0); 01088 01089 if (!build_only_bps) { 01090 util::load_from_file(m_csa, csa_file_name.c_str()); 01091 01092 write_R_output("cst", "construct CLCP", "begin", 1, 0); 01093 construct_lcp(m_lcp, *this, lcp_buf, isa_buf); 01094 write_R_output("cst", "construct CLCP", "end", 1, 0); 01095 } 01096 write_R_output("cst", "construct BPSS", "begin", 1, 0); 01097 m_bp_support = Bp_support(&m_bp); 01098 util::init_support(m_first_child_rank, &m_first_child); 01099 write_R_output("cst", "construct BPSS", "end", 1, 0); 01100 m_sigma = degree(root()); 01101 } 01102 01103 01104 template<class Csa, class Lcp, class Bp_support, class Rank_support> 01105 cst_sct3p<Csa, Lcp, Bp_support, Rank_support>::cst_sct3p(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps = false):csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support), first_child_bv(m_first_child), first_child_rank(m_first_child_rank) 01106 { 01107 construct(file_map, dir, id, build_only_bps); 01108 } 01109 01110 01111 template<class Csa, class Lcp, class Bp_support, class Rank_support> 01112 void cst_sct3p<Csa, Lcp, Bp_support, Rank_support>::construct(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps = false) 01113 { 01114 write_R_output("cst", "construct BPS", "begin", 1, 0); 01115 int_vector_file_buffer<> lcp_buf(file_map["lcp"].c_str()); 01116 m_nodes = algorithm::construct_supercartesian_tree_bp_succinct_and_first_child(lcp_buf, m_bp, m_first_child) + m_bp.size()/2; 01117 write_R_output("cst", "construct BPS", "end", 1, 0); 01118 write_R_output("cst", "construct BPSS", "begin", 1, 0); 01119 m_bp_support = Bp_support(&m_bp); 01120 util::init_support(m_first_child_rank, &m_first_child); 01121 write_R_output("cst", "construct BPSS", "end", 1, 0); 01122 01123 if (!build_only_bps) { 01124 write_R_output("cst", "construct CLCP", "begin", 1, 0); 01125 construct_lcp(m_lcp, *this, file_map, dir, id); 01126 write_R_output("cst", "construct CLCP", "end", 1, 0); 01127 } 01128 if (!build_only_bps) { 01129 util::load_from_file(m_csa, file_map["csa"].c_str()); 01130 } 01131 m_sigma = degree(root()); 01132 } 01133 01134 template<class Csa, class Lcp, class Bp_support, class Rank_support> 01135 typename cst_sct3p<Csa, Lcp, Bp_support, Rank_support>::size_type cst_sct3p<Csa, Lcp, Bp_support, Rank_support>::serialize(std::ostream& out, structure_tree_node* v, std::string name)const 01136 { 01137 structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); 01138 size_type written_bytes = 0; 01139 written_bytes += m_csa.serialize(out, child, "csa"); 01140 written_bytes += m_lcp.serialize(out, child, "lcp"); 01141 written_bytes += m_bp.serialize(out, child, "bp"); 01142 written_bytes += m_bp_support.serialize(out, child, "bp_support"); 01143 written_bytes += m_first_child.serialize(out, child, "mark_child"); 01144 written_bytes += m_first_child_rank.serialize(out, child, "mark_child_rank"); 01145 written_bytes += util::write_member(m_sigma, out, child, "sigma"); 01146 written_bytes += util::write_member(m_nodes, out, child, "node_cnt"); 01147 structure_tree::add_size(child, written_bytes); 01148 return written_bytes; 01149 } 01150 01151 template<class Csa, class Lcp, class Bp_support, class Rank_support> 01152 void cst_sct3p<Csa, Lcp, Bp_support, Rank_support>::load(std::istream& in) 01153 { 01154 m_csa.load(in); 01155 load_lcp(m_lcp, in, *this); 01156 m_bp.load(in); 01157 m_bp_support.load(in, &m_bp); 01158 m_first_child.load(in); 01159 m_first_child_rank.load(in, &m_first_child); 01160 util::read_member(m_sigma, in); 01161 util::read_member(m_nodes, in); 01162 #ifdef SDSL_DEBUG 01163 assert(algorithm::check_bp_support(m_bp, m_bp_support)); 01164 std::cerr<<"checked bp_support"<<std::endl; 01165 #endif 01166 } 01167 01168 template<class Csa, class Lcp, class Bp_support, class Rank_support> 01169 cst_sct3p<Csa, Lcp, Bp_support, Rank_support>& cst_sct3p<Csa, Lcp, Bp_support, Rank_support>::operator=(const cst_sct3p& cst) 01170 { 01171 if (this != &cst) { 01172 copy(cst); 01173 } 01174 return *this; 01175 } 01176 01177 01178 01179 01180 01181 } // end namespace sdsl 01182 01183 01184 #endif