SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
 All Classes Namespaces Files Functions Variables Typedefs Friends
sdsl/include/sdsl/cst_sct.hpp
Go to the documentation of this file.
00001 /* sdsl - succinct data structures library
00002     Copyright (C) 2009 Simon Gog
00003 
00004     This program is free software: you can redistribute it and/or modify
00005     it under the terms of the GNU General Public License as published by
00006     the Free Software Foundation, either version 3 of the License, or
00007     (at your option) any later version.
00008 
00009     This program is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012     GNU General Public License for more details.
00013 
00014     You should have received a copy of the GNU General Public License
00015     along with this program.  If not, see http://www.gnu.org/licenses/ .
00016 */
00021 #ifndef INCLUDED_SDSL_CST_SCT
00022 #define INCLUDED_SDSL_CST_SCT
00023 
00024 #include "int_vector.hpp"
00025 #include "algorithms.hpp"
00026 #include "iterators.hpp"
00027 #include "lcp_support_sada.hpp"
00028 #include "bp_support.hpp"
00029 #include "csa_wt.hpp" // for std initialization of cst_sct 
00030 #include "csa_uncompressed.hpp"
00031 #include "csa_wt.hpp"
00032 #include "cst_iterators.hpp"
00033 #include "testutils.hpp"
00034 #include "util.hpp"
00035 #include <iostream>
00036 #include <algorithm>
00037 #include <cassert>
00038 #include <cstring> // for strlen
00039 #include <iomanip>
00040 #include <iterator>
00041 #include <stack> // for the calculation of the balanced parantheses sequence
00042 #include <ostream>
00043 
00044 namespace sdsl
00045 {
00046 
00047 struct cst_tag; // forward declaration
00048 
00050 
00064 template<class Int = size_t>
00065 struct lcp_interval {
00066     Int l;              
00067     Int left;   
00068     Int right;  
00069 
00071 
00076     lcp_interval(Int l=0, Int left=0, Int right=0):l(l),left(left),right(right) {};
00077 
00078 
00079     bool operator<(const lcp_interval& interval)const {
00080         if (l!=interval.l)
00081             return l<interval.l;
00082         if (left!=interval.left)
00083             return left<interval.left;
00084         return right<interval.right;
00085 
00086     }
00087 
00089 
00091     bool operator==(const lcp_interval& interval)const {
00092         return l==interval.l and
00093                left==interval.left and
00094                right==interval.right;
00095     }
00096 
00098 
00100     bool operator!=(const lcp_interval& interval)const {
00101         return !(*this==interval);
00102     }
00103 
00105 
00107     lcp_interval& operator=(const lcp_interval& interval) {
00108         l = interval.l;
00109         left = interval.left;
00110         right = interval.right;
00111         return *this;
00112     }
00113 };
00114 
00115 template<class Int>
00116 inline std::ostream& operator<<(std::ostream& os, const lcp_interval<Int>& interval)
00117 {
00118     os<<interval.l<<"-["<<interval.left<<","<<interval.right<<"]";
00119     return os;
00120 }
00121 
00122 
00123 
00125 
00147 template<class Csa = csa_wt<>, class Lcp = lcp_support_sada<Csa>, class Bp_support = bp_support_sada<> >
00148 class cst_sct
00149 {
00150     public:
00151         typedef typename Csa::value_type                                                         value_type;    // STL Container requirement/TODO: ist das nicht gleich node type???
00152         typedef cst_dfs_const_forward_iterator<cst_sct>  const_iterator;// STL Container requirement
00153         typedef const_iterator                                                                           iterator;              // STL Container requirement
00154         typedef cst_bottom_up_const_forward_iterator<cst_sct>            const_bottom_up_iterator;
00155         typedef  const_bottom_up_iterator                                                        bottom_up_iterator;
00156         typedef const value_type                                                                         const_reference;
00157         typedef const_reference                                                                          reference;
00158         typedef const_reference*                                                                         pointer;
00159         typedef const pointer                                                                            const_pointer;
00160         typedef typename Csa::size_type                                                          size_type;             // STL Container requirement
00161         typedef size_type                                                                                        cst_size_type;
00162         typedef ptrdiff_t                                                                                        difference_type; // STL Container requirement
00163         typedef Csa                                                                                                      csa_type;
00164         typedef Lcp                                                                                                      lcp_type;
00165         typedef Bp_support                                                                                       bp_support_type;
00166         typedef typename Csa::pattern_type                                                       pattern_type;
00167         typedef typename Csa::char_type                                                          char_type;
00168         typedef lcp_interval<size_type>                                                          node_type; 
00169 
00170         typedef cst_tag                                                                                          index_category;
00171     private:
00172         Csa                     m_csa;
00173         Lcp                     m_lcp;
00174         bit_vector              m_bp;
00175         Bp_support              m_bp_support;
00176         size_type               m_nodes;
00177 
00178         void copy(const cst_sct& cst) {
00179             m_csa = cst.m_csa;
00180 //                      lcp_trait<Csa, Lcp>::copy_lcp(m_lcp, cst.m_lcp, m_csa);
00181             copy_lcp(m_lcp, cst.m_lcp, *this);
00182             m_bp = cst.m_bp;
00183             m_bp_support = cst.m_bp_support;
00184             m_bp_support.set_vector(&m_bp);
00185             m_nodes = cst.m_nodes;
00186         }
00187 
00188         // Get the first l index of a l-[i,j] interval.
00189         /* I.e. given an interval [i,j], the function returns the position of the smallest entry lcp[k] with \f$ i<k\leq j \f$
00190          * Time complexity: O(1) + O(lcp access)
00191          */ // TODO: if interval is last child interval -> we don't need the m_lcp comparisson
00192         size_type get_first_l_index(size_type i, size_type j, size_type& kpos, size_type& find_close_k)const {
00193             if (j+1 != m_csa.size() and m_lcp[i] <= m_lcp[j+1]) {
00194                 size_type jp1pos = m_bp_support.select(j+2);
00195                 assert(m_bp[jp1pos] == 1);  // opening parenthesis
00196                 assert(jp1pos > 0 and m_bp[jp1pos-1] == 0); // closing parenthesis
00197                 find_close_k = jp1pos-1;
00198                 kpos    = m_bp_support.find_open(find_close_k);
00199                 return m_bp_support.rank(kpos)-1;
00200             } else {
00201                 size_type ipos = m_bp_support.select(i+1);
00202                 assert(m_bp[ipos] == 1);  // opening parenthesis
00203                 ipos = m_bp_support.find_close(ipos);
00204                 assert(m_bp[ipos] == 0 and m_bp[ipos-1] == 0);   // closing parenthesis
00205                 find_close_k = ipos-1;
00206                 kpos    = m_bp_support.find_open(find_close_k);
00207                 return m_bp_support.rank(kpos)-1;
00208             }
00209         }
00210 
00211         // Get the postion of the first l index of a l-[i,j] interval in the balanced parentheses sequence.
00212         /* \par Time complexity
00213          *              \f$ \Order{\lcpaccess} \f$
00214          */
00215         size_type get_pos_of_closing_parenthesis_of_the_first_l_index(size_type i, size_type j)const {
00216             if (j+1 != m_csa.size() and m_lcp[i] <= m_lcp[j+1]) {
00217                 size_type jp1pos = m_bp_support.select(j+2);
00218                 assert(m_bp[jp1pos] == 1);  // opening parenthesis
00219                 assert(jp1pos > 0 and m_bp[jp1pos-1] == 0); // closing parenthesis
00220                 return jp1pos-1;
00221             } else {
00222                 size_type ipos = m_bp_support.select(i+1);
00223                 assert(m_bp[ipos] == 1);  // opening parenthesis
00224                 ipos = m_bp_support.find_close(ipos);
00225                 assert(m_bp[ipos] == 0 and m_bp[ipos-1] == 0);   // closing parenthesis
00226                 return ipos-1;
00227             }
00228         }
00229 
00230         // Get the next smaller value.
00231         /* \par Time complexity
00232          *      \f$ \Order{1} \f$
00233          */
00234         size_type nsv(size_type i)const {
00235             size_type pos =  m_bp_support.select(i+1);
00236             size_type rank = m_bp_support.find_close(pos);
00237             size_type result = m_bp_support.rank(rank);
00238             return result;
00239         }
00240 
00241         // Get the previous smaller value.
00242         /* \pre lcp[i] > 0!
00243          * \par Time complexity
00244          *     \f$ \Order{\sigma \cdot \lcpaccess} \f$
00245          * \sa psv
00246          */
00247         size_type psv_linear(size_type i, size_type lcpi)const {
00248             assert(lcpi>0);
00249             size_type pos =  m_bp_support.select(i+1);
00250             size_type ec = m_bp_support.enclose(pos);
00251             size_type j = m_bp_support.rank(ec);
00252             while (lcpi == m_lcp[j-1]) {
00253                 ec = m_bp_support.enclose(ec);
00254                 j = m_bp_support.rank(ec);
00255             }
00256             return j-1;
00257         }
00258 
00259         // Get the previous smaller value.
00260         /* \pre lcp[i] > 0!
00261          * \par Time complexity
00262          *    \f$ \Order{\log\sigma \cdot \lcpaccess} \f$
00263          */
00264         size_type psv(size_type i, size_type lcpi)const {
00265             assert(lcpi!=0);// if(m_lcp[i]==0) return 0;
00266             size_type begin = m_bp_support.find_close(m_bp_support.select(i+1)); // calculate NSV
00267             size_type end = m_bp_support.rank(begin)+1;                          //     "      " // exclusive
00268             if (end > m_csa.size()) {
00269                 assert(end == m_csa.size()+1);
00270                 end = m_bp.size();
00271             } else {
00272                 end = m_bp_support.select(end);
00273             }
00274             if (++begin < end) {
00275                 size_type result = m_bp_support.rank(m_bp_support.find_open(end-1))-1;
00276                 if (m_lcp[result] < lcpi) { // TODO: explain this condition
00277                     // binary search
00278                     --end;
00279                     while (begin!=end) {
00280                         size_type mid, temp;
00281                         mid = (begin+end)>>1; // begin <= mid < end
00282                         if (m_lcp[temp = m_bp_support.rank(m_bp_support.find_open(mid))-1] >= lcpi) {
00283                             begin = mid+1;
00284                         } else {
00285                             result = temp;
00286                             end = mid;
00287                         }
00288                     }
00289                     return result;
00290                 }
00291             }
00292 
00293             return m_bp_support.rank(m_bp_support.enclose(end))-1;
00294         }
00295 
00296         // Range minimum query based on the rr_enclose method.
00297         /* \par Time complexity
00298          *     \f$ \Order{\rrenclose} \f$
00299          */
00300         size_type rmq(size_type l, size_type r)const {
00301             size_type i         = m_bp_support.select(l+1);
00302             size_type j         = m_bp_support.select(r+1);
00303             size_type fc_i      = m_bp_support.find_close(i);
00304             if (j < fc_i) { // i < j < find_close(j) < find_close(i)
00305                 return l;
00306             } else { // i < find_close(i) < j < find_close(j)
00307                 size_type ec = m_bp_support.rr_enclose(i,j);
00308                 if (ec == m_bp_support.size()) {// no restricted enclosing pair found
00309                     return r;
00310                 } else { // found range restriced enclosing pair
00311                     return m_bp_support.rank(ec)-1; // subtract 1, as the index is 0 based
00312                 }
00313             }
00314         }
00315 
00316     public:
00317         const csa_type& csa;                            
00318         const lcp_type& lcp;                            
00319         const bit_vector& bp;                   
00320         const bp_support_type& bp_support;      
00321 
00323         /* @{ */
00324 
00326         cst_sct(): csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support) {}
00327 
00328         // Constructor for the cst_sct taking a string for that the compressed suffix tree should be calculated.
00329         /*
00330          * \param str Text for which the \f$ \CST \f$ should be constructed. The text should be terminated by a zero byte.
00331          * \pre The text has to be terminated by a zero byte.
00332          * \sa cst_sct::cst_sct(const Csa &csa, const unsigned char *str)
00333          */
00334 //              cst_sct(const unsigned char *str);
00335 
00336         template<uint8_t int_width, class size_type_class, uint8_t int_width_1, class size_type_class_1, uint8_t int_width_2, class size_type_class_2>
00337         cst_sct(const std::string& cst_file_name,
00338                 int_vector_file_buffer<int_width, size_type_class>& lcp_buf,
00339                 int_vector_file_buffer<int_width_1, size_type_class_1>& sa_buf,
00340                 int_vector_file_buffer<int_width_2, size_type_class_2>& isa_buf,
00341                 std::string dir="./",
00342                 bool build_only_bps=false);
00343 
00344         cst_sct(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps=false);
00345 
00347 
00352         cst_sct(const cst_sct& cst):csa(m_csa),lcp(m_lcp),bp(m_bp),bp_support(m_bp_support) {
00353             copy(cst);
00354         }
00355 
00356         /* @} */
00357 
00358         void construct(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps=false);
00359 
00361         ~cst_sct() {}
00362 
00364 
00367         size_type size()const {
00368             return m_csa.size();
00369         }
00370 
00372 
00375         static size_type max_size() {
00376             return Csa::max_size();
00377         }
00378 
00380 
00383         bool empty()const {
00384             return m_csa.empty();
00385         }
00386 
00388 
00396         void swap(cst_sct<Csa, Lcp>& cst);
00397 
00399 
00402         const_iterator begin()const {
00403             if (0 == m_bp.size())  // special case: tree is uninitialized
00404                 return end();
00405             else if (2 == m_bp.size()) { // special case: the root is a leaf
00406                 return const_iterator(this, root(), true, true);
00407             }
00408             return const_iterator(this, root(), false, true);
00409         };
00410 
00412 
00415         const_iterator end()const {
00416             return const_iterator(this, root(), true, false);
00417         }
00418 
00420         const_bottom_up_iterator begin_bottom_up()const {
00421             if (0 == m_bp.size())  // special case: tree is uninitialized
00422                 return end_bottom_up();
00423             return const_bottom_up_iterator(this, leftmost_leaf_in_the_subtree(root()));
00424         }
00425 
00427         const_bottom_up_iterator end_bottom_up()const {
00428             return const_bottom_up_iterator(this, root(), false);
00429         }
00430 
00432 
00435         cst_sct& operator=(const cst_sct& cst);
00436 
00438 
00443         bool operator==(const cst_sct& cst)const;
00444 
00446 
00451         bool operator!=(const cst_sct& cst)const;
00452 
00454 
00457         size_type serialize(std::ostream& out) const;
00458 
00460 
00462         void load(std::istream& in);
00463 
00465         /* @{ */
00466 
00468 
00472         node_type root() const {
00473             return node_type(0, 0, m_csa.size()-1);
00474         }
00475 
00477 
00483         bool is_leaf(const node_type& v)const {
00484             return v.right==v.left;
00485         }
00486 
00488 
00495         node_type ith_leaf(size_type i)const {
00496             assert(i > 0 and i <= m_csa.size());
00497             return node_type(m_csa.size()-m_csa[i-1], i-1, i-1);
00498         }
00499 
00501 
00508         size_type leaves_in_the_subtree(const node_type& v)const {
00509             return v.right-v.left+1;
00510         }
00511 
00513 
00518         node_type leftmost_leaf_in_the_subtree(const node_type& v)const {
00519             return node_type(m_csa.size()-m_csa[v.left], v.left, v.left);
00520         }
00521 
00523 
00528         node_type rightmost_leaf_in_the_subtree(const node_type& v)const {
00529             return node_type(m_csa.size()-m_csa[v.right], v.right, v.right);
00530         }
00531 
00533 
00540         size_type lb(const node_type& v)const {
00541             return v.left;
00542         }
00543 
00545 
00552         size_type rb(const node_type& v)const {
00553             return v.right;
00554         }
00555 
00557 
00562         node_type parent(const node_type& v) const {
00563 #ifndef NDEBUG
00564 //                      std::cout<<"parent interval of "<<v.l<<"-["<<v.left<<" "<<v.right<<"]"<<std::endl;
00565 #endif
00566             if (v.l <= 1) {// if v.l < 1 => root()
00567                 return root();  // return root node
00568             } else { // l > 1
00569                 if (v.right+1 == m_csa.size()) {
00570                     // search for the previous smaller value
00571                     size_type lcpi = m_lcp[v.left];
00572                     if (lcpi==0)
00573                         return root();
00574                     assert(lcpi!=0);
00575                     return node_type(lcpi, psv(v.left, lcpi), v.right);
00576                 } else if (v.left == 0) {
00577                     // search for the next smaller value
00578                     return node_type(m_lcp[v.right+1],0, nsv(v.right+1)-1);
00579                 } else {
00580                     // TODO: speed up, only two cases?
00581                     size_type lcpl = m_lcp[v.left], lcpr = m_lcp[v.right+1];
00582                     if (lcpl < lcpr) {
00583                         return node_type(lcpr, v.left, nsv(v.right+1)-1);
00584                     } else if (lcpl > lcpr) {
00585                         return node_type(lcpl, psv(v.left, lcpl), v.right);
00586                     } else {
00587                         if (lcpl==0)
00588                             return root();
00589                         return node_type(lcpl, psv(v.left, lcpl), nsv(v.right+1)-1);
00590                     }
00591                 }
00592             }
00593         }
00594 
00595 
00597 
00606         node_type ith_child(const node_type& v, size_type i)const {
00607             if (is_leaf(v))  // if v is a leave, v has no child
00608                 return root();
00609             size_type k = 0, kpos = 0, k_find_close = 0;
00610             // v is not a leave: v has at least two children
00611             k = get_first_l_index(v.left, v.right, kpos, k_find_close);// get first l-index k and the position of k
00612             assert(m_lcp[k]==v.l);
00613             size_type left = v.left, right = k-1;
00614             size_type j=1;
00615             while (j < i) {
00616                 // get next l-index m and the position of m
00617                 size_type m    = m_bp_support.rank(m_bp_support.find_open(k_find_close-j))-1;
00618                 ++j;
00619                 if (m==k or v.l != m_lcp[m]) { // no next l-index exists
00620                     if (i!=j)
00621                         return root();
00622                     left        = k;
00623                     right       = v.right;
00624                     break;
00625                 }
00626                 left  = k;
00627                 right = m-1;
00628                 k = m;
00629             }
00630             if (left==right) { // child interval is a leaf
00631                 assert(m_csa.size()-m_csa[left] > v.l);
00632                 return node_type(m_csa.size()-m_csa[left],left,right);
00633             } else {
00634                 // find l of the first child interval
00635                 size_type k1 = get_first_l_index(left, right, kpos, kpos);
00636                 assert(m_lcp[k1] > v.l);
00637                 return node_type(m_lcp[k1], left, right);
00638             }
00639             return root();
00640         }
00641 
00643 
00649         size_type degree(const node_type& v)const {
00650             if (is_leaf(v))  // if v is a leave, v has no child
00651                 return 0;
00652             // v is not a leave: v has at least two children
00653             size_type r = get_pos_of_closing_parenthesis_of_the_first_l_index(v.left, v.right);
00654             assert(m_lcp[m_bp_support.rank(m_bp_support.find_open(r))-1]==v.l);
00655             size_type begin = m_bp_support.preceding_closing_parentheses(r);
00656             assert(m_csa.sigma>0);
00657             if (begin > (size_type)m_csa.sigma-1) begin = m_csa.sigma-1;
00658             begin = r-begin;
00659             size_type end = r, mid;
00660             while (end != begin) {
00661                 mid = (begin+end)>>1;  // begin <= mid < end
00662                 if (m_lcp[m_bp_support.rank(m_bp_support.find_open(mid))-1] > v.l)
00663                     begin = mid+1;
00664                 else
00665                     end = mid;
00666             }
00667             return r-end+2;
00668         }
00669 
00671 
00677         node_type sibling(const node_type& v)const {
00678             if (v == root())
00679                 return root();
00680             if (v.right == m_csa.size()-1) // there exists no next sibling if the interval touches the right border
00681                 return root();
00682             // now let [p..q] be the interval [v.left, v.right]
00683             size_type lcp_p   = m_lcp[v.left];
00684             size_type lcp_qp1 = m_lcp[v.right+1]; // is defined as v.right < m_csa.size()-1
00685             if (lcp_p > lcp_qp1)
00686                 return root();
00687             else { // search next l-index
00688                 size_type new_q;
00689                 size_type qp1_closing_pos = m_bp_support.find_close(m_bp_support.select(v.right+2));
00690                 if (m_bp[qp1_closing_pos-1]) {// if there exists no closing parenthesis left to qp1_closing_pos
00691                     new_q = nsv(v.right+1);   // i.e. we could not find a next l-index, only a smaller value: last sibling
00692                     assert(new_q > v.right+1);
00693                 } else {
00694                     new_q = m_bp_support.rank(m_bp_support.find_open(qp1_closing_pos-1))-1;
00695                     assert(new_q > v.right+1);
00696                     if (m_lcp[new_q] > lcp_qp1) { // no next l-index exists: find next smaller value, last sibling
00697                         new_q = nsv(v.right+1);
00698                     } else {
00699                         assert(m_lcp[new_q] == lcp_qp1);
00700                     }
00701                 }
00702                 assert(new_q > v.right+1);
00703                 if (new_q-1 == v.right+1) { // sibling is a singlton interval
00704                     return ith_leaf(new_q/*v.right+2*/);
00705                 } else {
00706                     // get l-index m of the sibling
00707                     size_type m_pos, m_close_pos;
00708                     size_type m = get_first_l_index(v.right+1, new_q-1, m_pos, m_close_pos);
00709                     return node_type(m_lcp[m], v.right+1, new_q-1);
00710                 }
00711 
00712             }
00713         }
00714 
00715         // Returns the next sibling of node v.
00716         // Only for tests.
00717         node_type sibling_naive(const node_type& v)const {
00718             if (v==root())
00719                 return root();
00720             node_type parent = this->parent(v);
00721             assert(parent != v);
00722             size_type nr = degree(parent);
00723             for (size_type i=1; i <= nr; ++i)
00724                 if (ith_child(parent, i) == v and i!=nr)
00725                     return ith_child(parent, i+1);
00726             return root();
00727         }
00728 
00730 
00738         node_type child_linear(const node_type& v, unsigned char c, size_type& char_pos)const {
00739             if (is_leaf(v))  // if v is a leave, it has no child
00740                 return root();
00741             size_type k, kpos, k_find_close;
00742             // v is not a leave: v has at least two children
00743             k = get_first_l_index(v.left, v.right, kpos, k_find_close);// get first l-index k
00744             assert(m_lcp[k]==v.l);
00745             size_type left = v.left, right = k-1;
00746             char_pos = m_csa(m_csa[left]+v.l); // char_pos = invSA[SA[left]+v.l+1-1]
00747             uint16_t cc = m_csa.char2comp[c];
00748             size_type char_ex_max_pos = m_csa.C[cc+1], char_inc_min_pos = m_csa.C[cc];
00749             while (char_pos < char_ex_max_pos) {
00750                 if (char_pos >= char_inc_min_pos) {
00751                     goto found_child;
00752                 }
00753                 // get next l-index
00754                 size_type mpos = m_bp_support.find_open((m_bp_support.find_close(kpos)-1));
00755                 size_type m    = m_bp_support.rank(mpos)-1;
00756                 if (m==k or v.l != m_lcp[m]) { // no next l-index exists
00757                     left        = k;
00758                     right       = v.right;
00759                     char_pos = m_csa(m_csa[left]+v.l);
00760                     if (char_pos < char_inc_min_pos or char_pos >= char_ex_max_pos)  // check if character is two small
00761                         return root();
00762                     goto found_child;
00763                 }
00764                 left  = k;
00765                 right = m-1;
00766                 k = m;
00767                 kpos = mpos;
00768                 assert(m_csa(m_csa[left]+v.l) > char_pos);
00769                 char_pos = m_csa(m_csa[left]+v.l);
00770             }
00771             return root();
00772 found_child:
00773             if (left==right) { // child interval is a leaf
00774                 assert(m_csa.size()-m_csa[left] > v.l);
00775                 return node_type(m_csa.size()-m_csa[left],left,right);
00776             } else {
00777                 // find l of the first child interval
00778                 size_type k1pos, k1 = get_first_l_index(left, right, k1pos, k_find_close);
00779                 assert(m_lcp[k1] > v.l);
00780                 return node_type(m_lcp[k1], left, right);
00781             }
00782         }
00783 
00784 
00785 
00787 
00795         // TODO const unsigned char c durch char_type ersetzen
00796         node_type child(const node_type& v, const unsigned char c, size_type& char_pos)const {
00797             if (is_leaf(v))  // if v is a leaf = (), v has no child
00798                 return root();
00799             // else v = ( (     ))
00800             uint16_t cc = m_csa.char2comp[c];
00801             if (cc==0 and c!=0) // TODO: aendere char2comp so ab, dass man diesen sonderfall nicht braucht
00802                 return root();
00803             size_type char_ex_max_pos = m_csa.C[cc+1], char_inc_min_pos = m_csa.C[cc];
00804             // v is not a leave: v has at least two children
00805             size_type k, kpos, k_find_close;
00806             k = get_first_l_index(v.left, v.right, kpos, k_find_close);
00807             assert(m_lcp[k]==v.l);
00808             size_type left = v.left, right = k-1;
00809             char_pos = m_csa(m_csa[left]+v.l);
00810             if (char_pos >= char_ex_max_pos) {// c is lex. greater than the first character of the first child interval
00811                 return root();
00812             } else if (char_pos >= char_inc_min_pos) { // found interval [left, right]
00813                 goto found_child;
00814             }
00815             {
00816                 size_type k_char_pos = m_csa(m_csa[k]+v.l);
00817                 {
00818                     size_type begin = m_bp_support.preceding_closing_parentheses(k_find_close);
00819                     assert(m_csa.sigma>0);
00820                     if (begin > (size_type)m_csa.sigma-1) begin = m_csa.sigma-1;
00821                     begin = k_find_close - begin;
00822                     size_type end = k_find_close, m_find_close, m;
00823                     while (end != begin) {
00824                         m_find_close = (begin+end)>>1;  // begin <= m_find_close < end
00825                         m = m_bp_support.rank(m_bp_support.find_open(m_find_close))-1;
00826                         if (m_lcp[m] > v.l or /*m_lcp[m]==v.l and */ (char_pos = m_csa(m_csa[m]+v.l)) >= char_ex_max_pos) {
00827                             begin = m_find_close+1;
00828                         } else { // m_lcp[m] == v.l and char_pos < char_ex_max_pos
00829                             end = m_find_close;
00830                             k = m;
00831                             k_char_pos  = char_pos;
00832                         }
00833                     }
00834                     assert(m_lcp[k] == v.l);
00835                     assert(m_bp_support.find_close(m_bp_support.select(k+1)) == end);
00836                     if (k_char_pos < char_ex_max_pos and k_char_pos >= char_inc_min_pos) {// child interval exists
00837                         left = k;
00838                         char_pos = k_char_pos;
00839                         assert(end > 0);
00840                         if (m_bp[end-1] == 1) { // last child interval
00841                             right = v.right;
00842                         } else {
00843                             size_type k1 = m_bp_support.rank(m_bp_support.find_open(end-1))-1;
00844                             if (m_lcp[k1] > v.l) {// last child interval
00845                                 right = v.right;
00846                             } else {
00847                                 right = k1-1;
00848                             }
00849                         }
00850                     } else {
00851                         return root();
00852                     }
00853                 }
00854             }
00855 found_child:
00856             if (left==right) { // child interval is a leaf
00857                 assert(m_csa.size()-m_csa[left] > v.l);
00858                 return node_type(m_csa.size()-m_csa[left],left,right);
00859             } else {
00860                 // find l of the first child interval
00861                 size_type k1pos, k1 = get_first_l_index(left, right, k1pos, k_find_close);
00862                 assert(m_lcp[k1] > v.l);
00863                 return node_type(m_lcp[k1], left, right);
00864             }
00865         }
00866 
00868         // \sa child(node_type v, const unsigned char c, size_type &char_pos)
00869         node_type child(const node_type& v, const unsigned char c) {
00870             size_type char_pos;
00871             return child(v, c, char_pos);
00872         }
00873 
00875 
00882         unsigned char edge(const node_type& v, size_type d)const {
00883 #ifndef NDEBUG
00884             if (d < 1 or d > depth(v)) {
00885                 throw std::out_of_range("OUT_OF_RANGE_ERROR: "+util::demangle(typeid(this).name())+"cst_sct<>::edge(node_type v, size_type d). d == 0 or d > depth(v)!");
00886             }
00887 #endif
00888             size_type order = m_csa(m_csa[v.left]+d-1);
00889             uint16_t c_begin = 1, c_end = m_csa.sigma+1, mid;
00890             while (c_begin < c_end) {
00891                 mid = (c_begin+c_end)>>1;
00892                 if (m_csa.C[mid] <= order) {
00893                     c_begin = mid+1;
00894                 } else {
00895                     c_end = mid;
00896                 }
00897             }
00898             return m_csa.comp2char[c_begin-1];
00899         }
00900 
00901 
00903 
00910         node_type lca(node_type v, node_type w)const {
00911             if (v.left > w.left or (v.left == w.left and v.right < w.right)) {
00912                 std::swap(v, w);
00913             }
00914             assert(v.left < w.left or (v.left==w.left and v.right >= w.right));
00915             assert(!(v.left < w.left and v.right > w.left and v.right < w.right));   // assert that v and w do not overlapp
00916             if (v.right >= w.right) { // v encloses w or v==w
00917                 return v;
00918             } else { // v.left < v.right < w.left < w.right
00919                 size_type min_index = rmq(v.left+1, w.right);
00920                 size_type l = m_lcp[min_index];
00921                 if (l==0)
00922                     return root();
00923                 else
00924                     return node_type(l, psv(min_index, l), nsv(min_index)-1);
00925             }
00926         }
00927 
00929 
00935         size_type depth(const node_type& v)const {
00936             return v.l;
00937         }
00938 
00940 
00946         // TODO: can be implemented in O(1) with o(n) space. See Jansson, Sadakane, Sung, SODA 2007, "Ultra-succinct Representation of Ordered Trees"
00947         size_type node_depth(node_type v)const {
00948             size_type d = 0;
00949             while (v != root()) {
00950                 ++d;
00951                 v = parent(v);
00952             }
00953             return d;
00954         }
00955 
00957 
00963         node_type sl(const node_type& v)const {
00964             if (v == root())
00965                 return root();
00966             // get interval with first char deleted
00967             size_type left       = m_csa.psi[v.left];
00968             if (is_leaf(v)) {
00969                 if (v.l==1)
00970                     return root();
00971                 else
00972                     return node_type(v.l-1, left, left);
00973             }
00974             size_type right      = m_csa.psi[v.right];
00975             assert(left < right);
00976             // rmq
00977             size_type min_index = rmq(left+1, right);
00978             size_type l = m_lcp[min_index];
00979             if (l==0)
00980                 return root();
00981             else
00982                 return node_type(l, psv(min_index, l), nsv(min_index)-1);
00983         }
00984 
00986         /*
00987          *  \param v A valid not of a cst_sct.
00988          *  \param c The character which should be prepended to the string of the current node.
00989          *      \return root() if the Weiner link of (v, c) does not exist, otherwise the Weiner link is returned.
00990          *  \par Time complexity
00991          *              \f$ \Order{ t_{rank\_bwt}\cdot t_{rmq} } \f$
00992          */
00993         node_type wl(const node_type& v, const unsigned char c) const {
00994             size_type c_left    = m_csa.rank_bwt(v.left, c);
00995             size_type c_right   = m_csa.rank_bwt(v.right+1, c);
00996             if (c_left == c_right)  // there exists no Weiner link
00997                 return root();
00998             if (c_left+1 == c_right)
00999                 return ith_leaf(m_csa.C[m_csa.char2comp[c]] + c_left + 1);
01000             else {
01001                 size_type left  = m_csa.C[m_csa.char2comp[c]] + c_left;
01002                 size_type right = m_csa.C[m_csa.char2comp[c]] + c_right - 1;
01003                 assert(left < right);
01004                 // rmq
01005                 size_type min_index = rmq(left+1, right);
01006                 size_type l = m_lcp[min_index];
01007                 return node_type(l, left, right);
01008             }
01009         }
01010 
01012 
01017         size_type sn(const node_type& v)const {
01018             assert(is_leaf(v));
01019             return m_csa[v.left];
01020         }
01021 
01023 
01031         size_type id(const node_type& v)const {
01032             if (is_leaf(v)) { // return i in the range from 0..csa.size()-1
01033                 return v.left;
01034             }
01035             size_type kpos, find_close_k;
01036             return (m_bp.size()>>1) + get_first_l_index(v.left, v.right, kpos, find_close_k);
01037         }
01038 
01040         size_type nodes()const {
01041             return m_nodes;
01042         }
01043 
01045         /* \param lb Left bound of the lcp-interval [lb..rb] (inclusive).
01046          * \param rb Right bound of the lcp-interval [lb..rb] (inclusive).
01047          * \param l  Depth of the lcp-interval.
01048          *\ return The node in the suffix tree corresponding lcp-interval [lb..rb]
01049          */
01050         node_type node(size_type lb, size_type rb, size_type l=0) const {
01051             return node_type(l, lb, rb);
01052         }
01053 
01055         void print_info()const {
01056             std::cout << "# size of cst in bytes per character" << std::endl;
01057             size_type cst_size = util::get_size_in_bytes(*this);
01058             std::cout << ((double)cst_size)/csa.size() << " # = "<< ((double)cst_size)/(1<<20) <<" MB"<<std::endl;
01059             std::cout<< ((double)util::get_size_in_bytes(csa))/cst_size << " # ratio of csa size" << std::endl;
01060             std::cout<< ((double)util::get_size_in_bytes(lcp))/cst_size << " # ratio of lcp size" << std::endl;
01061             std::cout<< ((double)util::get_size_in_bytes(bp))/cst_size << " # ratio of bp size" << std::endl;
01062             std::cout<< ((double)util::get_size_in_bytes(bp_support))/cst_size << " # ratio of bp_support size" << std::endl;
01063             std::cout<< 0 << " # ratio of bp_rank_10 size" << std::endl;
01064             std::cout<< 0 << " # ratio of bp_select_10 size" << std::endl;
01065         }
01066 
01067         /* @} */
01068 
01069 };
01070 
01071 // == template functions ==
01072 
01073 template<class Csa, class Lcp, class Bp_support>
01074 template<uint8_t int_width, class size_type_class, uint8_t int_width_1, class size_type_class_1, uint8_t int_width_2, class size_type_class_2>
01075 cst_sct<Csa, Lcp, Bp_support>::cst_sct(const std::string& csa_file_name,
01076                                        int_vector_file_buffer<int_width, size_type_class>& lcp_buf,
01077                                        int_vector_file_buffer<int_width_1, size_type_class_1>& sa_buf,
01078                                        int_vector_file_buffer<int_width_2, size_type_class_2>& isa_buf,
01079                                        std::string dir,
01080                                        bool build_only_bps
01081                                       ):csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support)
01082 {
01083     std::string id =  util::to_string(util::get_pid())+"_"+util::to_string(util::get_id()).c_str();
01084     write_R_output("cst", "construct BPS", "begin", 1, 0);
01085     {
01086         bit_vector m_first_child;
01087         m_nodes = algorithm::construct_supercartesian_tree_bp_succinct_and_first_child(lcp_buf, m_bp, m_first_child) + m_bp.size()/2;
01088     }
01089     write_R_output("cst", "construct BPS", "end", 1, 0);
01090 
01091     write_R_output("cst", "construct BPSS", "begin", 1, 0);
01092     m_bp_support = Bp_support(&m_bp);
01093     write_R_output("cst", "construct BPSS", "end", 1, 0);
01094 
01095     write_R_output("cst", "construct CLCP", "begin", 1, 0);
01096     construct_lcp(m_lcp, *this, lcp_buf, isa_buf);
01097     write_R_output("cst", "construct CLCP", "end", 1, 0);
01098 
01099     util::load_from_file(m_csa, csa_file_name.c_str());
01100 }
01101 
01102 template<class Csa, class Lcp, class Bp_support>
01103 cst_sct<Csa, Lcp, Bp_support>::cst_sct(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps):csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support)
01104 {
01105     construct(file_map, dir, id, build_only_bps);
01106 }
01107 
01108 template<class Csa, class Lcp, class Bp_support>
01109 void cst_sct<Csa, Lcp, Bp_support>::construct(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps)
01110 {
01111     write_R_output("cst", "construct BPS", "begin", 1, 0);
01112     {
01113         bit_vector m_first_child;
01114         int_vector_file_buffer<> lcp_buf(file_map["lcp"].c_str());
01115         m_nodes = algorithm::construct_supercartesian_tree_bp_succinct_and_first_child(lcp_buf, m_bp, m_first_child) + m_bp.size()/2;
01116     }
01117     write_R_output("cst", "construct BPS", "end", 1, 0);
01118 
01119     write_R_output("cst", "construct CLCP", "begin", 1, 0);
01120     construct_lcp(m_lcp, *this, file_map, dir, id);
01121     write_R_output("cst", "construct CLCP", "end", 1, 0);
01122 
01123     write_R_output("cst", "construct BPSS", "begin", 1, 0);
01124     m_bp_support = Bp_support(&m_bp);
01125     write_R_output("cst", "construct BPSS", "end", 1, 0);
01126 
01127     util::load_from_file(m_csa, file_map["csa"].c_str());
01128     std::cout<<"debug = "<<m_csa.size()<<std::endl;
01129     std::cout<<"csa file = "<<file_map["csa"]<<std::endl;
01130 }
01131 
01132 
01133 
01134 template<class Csa, class Lcp, class Bp_support>
01135 typename cst_sct<Csa, Lcp, Bp_support>::size_type cst_sct<Csa, Lcp, Bp_support>::serialize(std::ostream& out) const
01136 {
01137     size_type written_bytes = 0;
01138     written_bytes += m_csa.serialize(out);
01139     written_bytes += m_lcp.serialize(out);
01140     written_bytes += m_bp.serialize(out);
01141     written_bytes += m_bp_support.serialize(out);
01142     out.write((char*) &m_nodes, sizeof(m_nodes));
01143     written_bytes += sizeof(m_nodes);
01144     return written_bytes;
01145 }
01146 
01147 template<class Csa, class Lcp, class Bp_support>
01148 void cst_sct<Csa, Lcp, Bp_support>::load(std::istream& in)
01149 {
01150     m_csa.load(in);
01151 //      lcp_trait<Csa, Lcp>::load_lcp(m_lcp, in, m_csa);
01152     load_lcp(m_lcp, in, *this);
01153     m_bp.load(in);
01154     m_bp_support.load(in, &m_bp);
01155     in.read((char*) &m_nodes, sizeof(m_nodes));
01156 #ifdef SDSL_DEBUG
01157     assert(algorithm::check_bp_support(m_bp, m_bp_support));
01158     std::cerr<<"checked bp_support"<<std::endl;
01159 #endif
01160 }
01161 
01162 template<class Csa, class Lcp, class Bp_support>
01163 cst_sct<Csa, Lcp, Bp_support>& cst_sct<Csa, Lcp, Bp_support>::operator=(const cst_sct& cst)
01164 {
01165     if (this != &cst) {
01166         copy(cst);
01167     }
01168     return *this;
01169 }
01170 
01171 
01172 
01173 
01174 
01175 } // end namespace sdsl
01176 
01177 
01178 #endif