SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
 All Classes Namespaces Files Functions Variables Typedefs Friends
sdsl/include/sdsl/cst_sct2.hpp
Go to the documentation of this file.
00001 /* sdsl - succinct data structures library
00002     Copyright (C) 2010 Simon Gog
00003 
00004     This program is free software: you can redistribute it and/or modify
00005     it under the terms of the GNU General Public License as published by
00006     the Free Software Foundation, either version 3 of the License, or
00007     (at your option) any later version.
00008 
00009     This program is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012     GNU General Public License for more details.
00013 
00014     You should have received a copy of the GNU General Public License
00015     along with this program.  If not, see http://www.gnu.org/licenses/ .
00016 */
00021 #ifndef INCLUDED_SDSL_CST_SCT2
00022 #define INCLUDED_SDSL_CST_SCT2
00023 
00024 #include "int_vector.hpp"
00025 #include "algorithms.hpp"
00026 #include "iterators.hpp"
00027 #include "lcp_support_sada.hpp"
00028 #include "bp_support.hpp"
00029 #include "csa_wt.hpp" // for std initialization of cst_sct2 
00030 #include "csa_uncompressed.hpp"
00031 #include "cst_iterators.hpp"
00032 #include "cst_sct.hpp" // for lcp_interval
00033 #include "rank_support.hpp"
00034 #include "testutils.hpp"
00035 #include "util.hpp"
00036 #include <iostream>
00037 #include <algorithm>
00038 #include <cassert>
00039 #include <cstring> // for strlen
00040 #include <iomanip>
00041 #include <iterator>
00042 #include <stack> // for the calculation of the balanced parantheses sequence
00043 #include <ostream>
00044 
00045 namespace sdsl
00046 {
00047 
00049 
00068 template<class Csa = csa_wt<>, class Lcp = lcp_support_sada<Csa>, class Bp_support = bp_support_sada<>, class Rank_support = rank_support_v5<> >
00069 class cst_sct2
00070 {
00071     public:
00072         typedef typename Csa::value_type                                                         value_type;    // STL Container requirement/TODO: ist das nicht gleich node type???
00073         typedef cst_dfs_const_forward_iterator<cst_sct2>         const_iterator;// STL Container requirement
00074         typedef const_iterator                                                                           iterator;              // STL Container requirement
00075         typedef cst_bottom_up_const_forward_iterator<cst_sct2>           const_bottom_up_iterator;
00076         typedef  const_bottom_up_iterator                                                        bottom_up_iterator;
00077         typedef const value_type                                                                         const_reference;
00078         typedef const_reference                                                                          reference;
00079         typedef const_reference*                                                                         pointer;
00080         typedef const pointer                                                                            const_pointer;
00081         typedef typename Csa::size_type                                                          size_type;             // STL Container requirement
00082         typedef size_type                                                                                        cst_size_type;
00083         typedef ptrdiff_t                                                                                        difference_type; // STL Container requirement
00084         typedef Csa                                                                                                      csa_type;
00085         typedef Lcp                                                                                                      lcp_type;
00086         typedef Bp_support                                                                                       bp_support_type;
00087         typedef typename Csa::pattern_type                                                       pattern_type;
00088         typedef typename Csa::char_type                                                          char_type;
00089         typedef lcp_interval<size_type>                                                          node_type; 
00090 
00091         typedef cst_tag                                                                                          index_category;
00092     private:
00093         Csa                             m_csa;
00094         Lcp                             m_lcp;
00095         bit_vector                      m_bp;
00096         Bp_support                      m_bp_support;
00097         bit_vector                      m_first_child;
00098         Rank_support            m_first_child_rank;
00099         size_type                       m_nodes;
00100 
00101         void copy(const cst_sct2& cst) {
00102             m_csa                       = cst.m_csa;
00103 //                      lcp_trait<Csa, Lcp>::copy_lcp(m_lcp, cst.m_lcp, m_csa);
00104             copy_lcp(m_lcp, cst.m_lcp, *this);
00105             m_bp                        = cst.m_bp;
00106             m_bp_support        = cst.m_bp_support;
00107             m_bp_support.set_vector(&m_bp);
00108             m_first_child       = cst.m_first_child;
00109             m_first_child_rank = cst.m_first_child_rank;
00110             m_first_child_rank.set_vector(&m_first_child);
00111             m_nodes                     = cst.m_nodes;
00112         }
00113 
00114         // Get the first l index of a l-[i,j] interval.
00115         /* I.e. given an interval [i,j], the function returns the position of the smallest entry lcp[k] with \f$ i<k\leq j \f$
00116          * Time complexity: O(1) + O(lcp access)
00117          */ // TODO: if interval is last child interval -> we don't need the m_lcp comparisson
00118         size_type get_first_l_index(size_type i, size_type j, size_type& kpos, size_type& find_close_k)const {
00119             if (j+1 != m_csa.size() and m_lcp[i] <= m_lcp[j+1]) {
00120                 size_type jp1pos = m_bp_support.select(j+2);
00121                 assert(m_bp[jp1pos] == 1);  // opening parenthesis
00122                 assert(jp1pos > 0 and m_bp[jp1pos-1] == 0); // closing parenthesis
00123                 find_close_k = jp1pos-1;
00124                 kpos    = m_bp_support.find_open(find_close_k);
00125                 return m_bp_support.rank(kpos)-1;
00126             } else {
00127                 size_type ipos = m_bp_support.select(i+1);
00128                 assert(m_bp[ipos] == 1);  // opening parenthesis
00129                 ipos = m_bp_support.find_close(ipos);
00130                 assert(m_bp[ipos] == 0 and m_bp[ipos-1] == 0);   // closing parenthesis
00131                 find_close_k = ipos-1;
00132                 kpos    = m_bp_support.find_open(find_close_k);
00133                 return m_bp_support.rank(kpos)-1;
00134             }
00135         }
00136 
00137         // Get the postion of the first l index of a l-[i,j] interval in the balanced parentheses sequence.
00138         /* \par Time complexity
00139          *              \f$ \Order{\lcpaccess} \f$
00140          */
00141         size_type get_pos_of_closing_parenthesis_of_the_first_l_index(size_type i, size_type j)const {
00142             if (j+1 != m_csa.size() and m_lcp[i] <= m_lcp[j+1]) {
00143                 size_type jp1pos = m_bp_support.select(j+2);
00144                 assert(m_bp[jp1pos] == 1);  // opening parenthesis
00145                 assert(jp1pos > 0 and m_bp[jp1pos-1] == 0); // closing parenthesis
00146                 return jp1pos-1;
00147             } else {
00148                 size_type ipos = m_bp_support.select(i+1);
00149                 assert(m_bp[ipos] == 1);  // opening parenthesis
00150                 ipos = m_bp_support.find_close(ipos);
00151                 assert(m_bp[ipos] == 0 and m_bp[ipos-1] == 0);   // closing parenthesis
00152                 return ipos-1;
00153             }
00154         }
00155 
00156         // Get the next smaller value.
00157         /* \par Time complexity
00158          *      \f$ \Order{1} \f$
00159          */
00160         size_type nsv(size_type i)const {
00161             size_type pos =  m_bp_support.select(i+1);
00162             size_type rank = m_bp_support.find_close(pos);
00163             size_type result = m_bp_support.rank(rank);
00164             return result;
00165         }
00166 
00167         // Get the previous smaller value.
00168         /* \pre lcp[i] > 0!
00169          * \par Time complexity
00170          *     \f$ \Order{\sigma \cdot \lcpaccess} \f$
00171          * \sa psv
00172          */
00173         size_type psv_linear(size_type i, size_type lcpi)const {
00174             assert(lcpi>0);
00175             size_type pos =  m_bp_support.select(i+1);
00176             size_type ec = m_bp_support.enclose(pos);
00177             size_type j = m_bp_support.rank(ec);
00178             while (lcpi == m_lcp[j-1]) {
00179                 ec = m_bp_support.enclose(ec);
00180                 j = m_bp_support.rank(ec);
00181             }
00182             return j-1;
00183         }
00184 
00185         // Get the previous smaller value.
00186         /* \pre lcp[i] > 0!
00187          * \par Time complexity
00188          *    \f$ \Order{\log\sigma \cdot \lcpaccess} \f$
00189          */
00190         size_type psv2(size_type i, size_type lcpi)const {
00191             assert(lcpi!=0);// if(m_lcp[i]==0) return 0;
00192             size_type begin = m_bp_support.find_close(m_bp_support.select(i+1)); // calculate NSV
00193             size_type end = m_bp_support.rank(begin)+1;                          //     "      " // exclusive
00194             if (end > m_csa.size()) {
00195                 assert(end == m_csa.size()+1);
00196                 end = m_bp.size();
00197             } else {
00198                 end = m_bp_support.select(end);
00199             }
00200             if (++begin < end) {
00201                 size_type result = m_bp_support.rank(m_bp_support.find_open(end-1))-1;
00202                 if (m_lcp[result] < lcpi) { // TODO: explain this condition
00203                     // binary search
00204                     --end;
00205                     while (begin!=end) {
00206                         size_type mid, temp;
00207                         mid = (begin+end)>>1; // begin <= mid < end
00208                         if (m_lcp[temp = m_bp_support.rank(m_bp_support.find_open(mid))-1] >= lcpi) {
00209                             begin = mid+1;
00210                         } else {
00211                             result = temp;
00212                             end = mid;
00213                         }
00214                     }
00215                     return result;
00216                 }
00217             }
00218 
00219             return m_bp_support.rank(m_bp_support.enclose(end))-1;
00220         }
00221 
00222         // Get the previous smaller value.
00223         /* \pre lcp[i] > 0!
00224          * \par Time complexity
00225          *    \f$ \Order{\frac{\sigma}{w}} \f$, where w=64 is the word size
00226          */
00227         size_type psv(size_type i, size_type lcpi)const {
00228             assert(lcpi!=0);// if(m_lcp[i]==0) return 0;
00229             size_type ipos = m_bp_support.select(i+1);
00230             size_type begin = m_bp_support.find_close(ipos); // calculate closing parenthesis
00231             size_type r0 = begin - m_bp_support.rank(begin); // index of clothing parenthesis in first_child bp
00232             size_type next_first_child = 0;
00233             const uint64_t* p = m_first_child.data() + (r0>>6);
00234             uint64_t w = (*p) >> (r0&0x3F);
00235             if (w) { // if w!=0
00236                 next_first_child = begin + bit_magic::r1BP(w);
00237                 if (begin == next_first_child and m_bp[next_first_child+1])
00238                     return m_bp_support.rank(m_bp_support.enclose(ipos))-1;
00239             } else {
00240                 begin += 64-(r0&0x3F);
00241                 ++p;
00242                 while (!(w=*p)) { // while w==0
00243                     ++p;
00244                     begin += 64;
00245                 }
00246                 next_first_child = begin + bit_magic::r1BP(w);
00247             }
00248             assert(next_first_child < m_bp.size());
00249             if (!m_bp[next_first_child+1]) { // if next parenthesis is a closing one
00250                 return m_bp_support.rank(m_bp_support.find_open(next_first_child+1))-1;
00251             } else {
00252                 return m_bp_support.rank(m_bp_support.enclose(m_bp_support.find_open(next_first_child)))-1;
00253             }
00254         }
00255 
00256         // Range minimum query based on the rr_enclose method.
00257         /* \par Time complexity
00258          *     \f$ \Order{\rrenclose} \f$
00259          */
00260         size_type rmq(size_type l, size_type r)const {
00261             size_type i         = m_bp_support.select(l+1);
00262             size_type j         = m_bp_support.select(r+1);
00263             size_type fc_i      = m_bp_support.find_close(i);
00264             if (j < fc_i) { // i < j < find_close(j) < find_close(i)
00265                 return l;
00266             } else { // i < find_close(i) < j < find_close(j)
00267                 size_type ec = m_bp_support.rr_enclose(i,j);
00268                 if (ec == m_bp_support.size()) {// no restricted enclosing pair found
00269                     return r;
00270                 } else { // found range restriced enclosing pair
00271                     return m_bp_support.rank(ec)-1; // subtract 1, as the index is 0 based
00272                 }
00273             }
00274         }
00275 
00276     public:
00277         const Csa& csa;                         
00278         const Lcp& lcp;                         
00279         const bit_vector& bp;                   
00280         const Bp_support& bp_support;   
00281 
00282         const bit_vector& first_child_bv;
00283 
00285         /* @{ */
00286 
00288         cst_sct2(): csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support), first_child_bv(m_first_child) {}
00289 
00290         // Constructor for the cst_sct2 taking a string for that the compressed suffix tree should be calculated.
00291         /*
00292          * \param str Text for which the \f$ \CST \f$ should be constructed. The text should be terminated by a zero byte.
00293          * \pre The text has to be terminated by a zero byte.
00294          */
00295 //              cst_sct2(const unsigned char *str);
00296 
00297         template<uint8_t int_width, class size_type_class, uint8_t int_width_1, class size_type_class_1, uint8_t int_width_2, class size_type_class_2>
00298         cst_sct2(const std::string& cst_file_name,
00299                  int_vector_file_buffer<int_width, size_type_class>& lcp_buf,
00300                  int_vector_file_buffer<int_width_1, size_type_class_1>& sa_buf,
00301                  int_vector_file_buffer<int_width_2, size_type_class_2>& isa_buf,
00302                  std::string dir="./", bool build_only_bps=false);
00303 
00304         cst_sct2(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps=false);
00305 
00307 
00312         cst_sct2(const cst_sct2& cst):csa(m_csa),lcp(m_lcp),bp(m_bp),bp_support(m_bp_support),first_child_bv(m_first_child) {
00313             copy(cst);
00314         }
00315 
00316         /* @} */
00317 
00318         void construct(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps=false);
00319 
00321         ~cst_sct2() {}
00322 
00324 
00327         size_type size()const {
00328             return m_csa.size();
00329         }
00330 
00332 
00335         static size_type max_size() {
00336             return Csa::max_size();
00337         }
00338 
00340 
00343         bool empty()const {
00344             return m_csa.empty();
00345         }
00346 
00348 
00356         void swap(cst_sct2<Csa, Lcp>& cst);
00357 
00359 
00362         const_iterator begin()const {
00363             if (0 == m_bp.size())  // special case: tree is uninitialized
00364                 return end();
00365             else if (2 == m_bp.size()) { // special case: the root is a leaf
00366                 return const_iterator(this, root(), true, true);
00367             }
00368             return const_iterator(this, root(), false, true);
00369         };
00370 
00372 
00375         const_iterator end()const {
00376             return const_iterator(this, root(), true, false);
00377         }
00378 
00380         const_bottom_up_iterator begin_bottom_up()const {
00381             if (0 == m_bp.size())  // special case: tree is uninitialized
00382                 return end_bottom_up();
00383             return const_bottom_up_iterator(this, leftmost_leaf_in_the_subtree(root()));
00384         }
00385 
00387         const_bottom_up_iterator end_bottom_up()const {
00388             return const_bottom_up_iterator(this, root(), false);
00389         }
00390 
00392 
00395         cst_sct2& operator=(const cst_sct2& cst);
00396 
00398 
00403         bool operator==(const cst_sct2& cst)const;
00404 
00406 
00411         bool operator!=(const cst_sct2& cst)const;
00412 
00414 
00417         size_type serialize(std::ostream& out) const;
00418 
00420 
00422         void load(std::istream& in);
00423 
00425         /* @{ */
00426 
00428 
00432         node_type root() const {
00433             return node_type(0, 0, m_csa.size()-1);
00434         }
00435 
00437 
00443         bool is_leaf(const node_type& v)const {
00444             return v.right==v.left;
00445         }
00446 
00448 
00455         node_type ith_leaf(size_type i)const {
00456             assert(i > 0 and i <= m_csa.size());
00457             return node_type(m_csa.size()-m_csa[i-1], i-1, i-1);
00458         }
00459 
00461 
00468         size_type leaves_in_the_subtree(const node_type& v)const {
00469             return v.right-v.left+1;
00470         }
00471 
00473 
00478         node_type leftmost_leaf_in_the_subtree(const node_type& v)const {
00479             return node_type(m_csa.size()-m_csa[v.left], v.left, v.left);
00480         }
00481 
00483 
00488         node_type rightmost_leaf_in_the_subtree(const node_type& v)const {
00489             return node_type(m_csa.size()-m_csa[v.right], v.right, v.right);
00490         }
00491 
00493 
00500         size_type lb(const node_type& v)const {
00501             return v.left;
00502         }
00503 
00505 
00512         size_type rb(const node_type& v)const {
00513             return v.right;
00514         }
00515 
00517 
00522         node_type parent(const node_type& v) const {
00523 #ifndef NDEBUG
00524 //                      std::cout<<"parent interval of "<<v.l<<"-["<<v.left<<" "<<v.right<<"]"<<std::endl;
00525 #endif
00526             if (v.l <= 1) {// if v.l < 1 => root()
00527                 return root();  // return root node
00528             } else { // l > 1
00529                 if (v.right+1 == m_csa.size()) {
00530                     // search for the previous smaller value
00531                     size_type lcpi = m_lcp[v.left];
00532                     if (lcpi==0)
00533                         return root();
00534                     assert(lcpi!=0);
00535                     return node_type(lcpi, psv(v.left, lcpi), v.right);
00536                 } else if (v.left == 0) {
00537                     // search for the next smaller value
00538                     return node_type(m_lcp[v.right+1],0, nsv(v.right+1)-1);
00539                 } else {
00540                     // TODO: speed up, only two cases?
00541                     size_type lcpl = m_lcp[v.left], lcpr = m_lcp[v.right+1];
00542                     if (lcpl < lcpr) {
00543                         return node_type(lcpr, v.left, nsv(v.right+1)-1);
00544                     } else if (lcpl > lcpr) {
00545                         return node_type(lcpl, psv(v.left, lcpl), v.right);
00546                     } else {
00547                         if (lcpl==0)
00548                             return root();
00549                         return node_type(lcpl, psv(v.left, lcpl), nsv(v.right+1)-1);
00550                     }
00551                 }
00552             }
00553         }
00554 
00555 
00557 
00566         node_type ith_child(const node_type& v, size_type i)const {
00567             if (is_leaf(v))  // if v is a leave, v has no child
00568                 return root();
00569             size_type k = 0, kpos = 0, k_find_close = 0;
00570             // v is not a leave: v has at least two children
00571             k = get_first_l_index(v.left, v.right, kpos, k_find_close);// get first l-index k and the position of k
00572             assert(m_lcp[k]==v.l);
00573             size_type left = v.left, right = k-1;
00574             size_type j=1;
00575             while (j < i) {
00576                 // get next l-index m and the position of m
00577                 size_type m    = m_bp_support.rank(m_bp_support.find_open(k_find_close-j))-1;
00578                 ++j;
00579                 if (m==k or v.l != m_lcp[m]) { // no next l-index exists
00580                     if (i!=j)
00581                         return root();
00582                     left        = k;
00583                     right       = v.right;
00584                     break;
00585                 }
00586                 left  = k;
00587                 right = m-1;
00588                 k = m;
00589             }
00590             if (left==right) { // child interval is a leaf
00591                 assert(m_csa.size()-m_csa[left] > v.l);
00592                 return node_type(m_csa.size()-m_csa[left],left,right);
00593             } else {
00594                 // find l of the first child interval
00595                 size_type k1 = get_first_l_index(left, right, kpos, kpos);
00596                 assert(m_lcp[k1] > v.l);
00597                 return node_type(m_lcp[k1], left, right);
00598             }
00599             return root();
00600         }
00601 
00603 
00609         size_type degree(const node_type& v)const {
00610             if (is_leaf(v))  // if v is a leave, v has no child
00611                 return 0;
00612             // v is not a leave: v has at least two children
00613             size_type r = get_pos_of_closing_parenthesis_of_the_first_l_index(v.left, v.right);
00614             assert(m_lcp[m_bp_support.rank(m_bp_support.find_open(r))-1]==v.l);
00615             size_type begin = m_bp_support.preceding_closing_parentheses(r);
00616             assert(m_csa.sigma>0);
00617             if (begin > (size_type)m_csa.sigma-1) begin = m_csa.sigma-1;
00618             begin = r-begin;
00619             size_type end = r, mid;
00620             while (end != begin) {
00621                 mid = (begin+end)>>1;  // begin <= mid < end
00622                 if (m_lcp[m_bp_support.rank(m_bp_support.find_open(mid))-1] > v.l)
00623                     begin = mid+1;
00624                 else
00625                     end = mid;
00626             }
00627             return r-end+2;
00628         }
00629 
00631 
00637         node_type sibling(const node_type& v)const {
00638             if (v == root())
00639                 return root();
00640             if (v.right == m_csa.size()-1) // there exists no next sibling if the interval touches the right border
00641                 return root();
00642             // now let [p..q] be the interval [v.left, v.right]
00643             size_type lcp_p   = m_lcp[v.left];
00644             size_type lcp_qp1 = m_lcp[v.right+1]; // is defined as v.right < m_csa.size()-1
00645             if (lcp_p > lcp_qp1)
00646                 return root();
00647             else { // search next l-index
00648                 size_type new_q;
00649                 size_type qp1_closing_pos = m_bp_support.find_close(m_bp_support.select(v.right+2));
00650                 if (m_bp[qp1_closing_pos-1]) {// if there exists no closing parenthesis left to qp1_closing_pos
00651                     new_q = nsv(v.right+1);   // i.e. we could not find a next l-index, only a smaller value: last sibling
00652                     assert(new_q > v.right+1);
00653                 } else {
00654                     new_q = m_bp_support.rank(m_bp_support.find_open(qp1_closing_pos-1))-1;
00655                     assert(new_q > v.right+1);
00656                     if (m_lcp[new_q] > lcp_qp1) { // no next l-index exists: find next smaller value, last sibling
00657                         new_q = nsv(v.right+1);
00658                     } else {
00659                         assert(m_lcp[new_q] == lcp_qp1);
00660                     }
00661                 }
00662                 assert(new_q > v.right+1);
00663                 if (new_q-1 == v.right+1) { // sibling is a singlton interval
00664                     return ith_leaf(new_q/*v.right+2*/);
00665                 } else {
00666                     // get l-index m of the sibling
00667                     size_type m_pos, m_close_pos;
00668                     size_type m = get_first_l_index(v.right+1, new_q-1, m_pos, m_close_pos);
00669                     return node_type(m_lcp[m], v.right+1, new_q-1);
00670                 }
00671 
00672             }
00673         }
00674 
00675         // Returns the next sibling of node v.
00676         // Only for tests.
00677         node_type sibling_naive(const node_type& v)const {
00678             if (v==root())
00679                 return root();
00680             node_type parent = this->parent(v);
00681             assert(parent != v);
00682             size_type nr = degree(parent);
00683             for (size_type i=1; i <= nr; ++i)
00684                 if (ith_child(parent, i) == v and i!=nr)
00685                     return ith_child(parent, i+1);
00686             return root();
00687         }
00688 
00690 
00698         node_type child_linear(const node_type& v, unsigned char c, size_type& char_pos)const {
00699             if (is_leaf(v))  // if v is a leave, it has no child
00700                 return root();
00701             size_type k, kpos, k_find_close;
00702             // v is not a leave: v has at least two children
00703             k = get_first_l_index(v.left, v.right, kpos, k_find_close);// get first l-index k
00704             assert(m_lcp[k]==v.l);
00705             size_type left = v.left, right = k-1;
00706             char_pos = m_csa(m_csa[left]+v.l); // char_pos = invSA[SA[left]+v.l+1-1]
00707             uint16_t cc = m_csa.char2comp[c];
00708             size_type char_ex_max_pos = m_csa.C[cc+1], char_inc_min_pos = m_csa.C[cc];
00709             while (char_pos < char_ex_max_pos) {
00710                 if (char_pos >= char_inc_min_pos) {
00711                     goto found_child;
00712                 }
00713                 // get next l-index
00714                 size_type mpos = m_bp_support.find_open((m_bp_support.find_close(kpos)-1));
00715                 size_type m    = m_bp_support.rank(mpos)-1;
00716                 if (m==k or v.l != m_lcp[m]) { // no next l-index exists
00717                     left        = k;
00718                     right       = v.right;
00719                     char_pos = m_csa(m_csa[left]+v.l);
00720                     if (char_pos < char_inc_min_pos or char_pos >= char_ex_max_pos)  // check if character is two small
00721                         return root();
00722                     goto found_child;
00723                 }
00724                 left  = k;
00725                 right = m-1;
00726                 k = m;
00727                 kpos = mpos;
00728                 assert(m_csa(m_csa[left]+v.l) > char_pos);
00729                 char_pos = m_csa(m_csa[left]+v.l);
00730             }
00731             return root();
00732 found_child:
00733             if (left==right) { // child interval is a leaf
00734                 assert(m_csa.size()-m_csa[left] > v.l);
00735                 return node_type(m_csa.size()-m_csa[left],left,right);
00736             } else {
00737                 // find l of the first child interval
00738                 size_type k1pos, k1 = get_first_l_index(left, right, k1pos, k_find_close);
00739                 assert(m_lcp[k1] > v.l);
00740                 return node_type(m_lcp[k1], left, right);
00741             }
00742         }
00743 
00744 
00745 
00747 
00755         // TODO const unsigned char c durch char_type ersetzen
00756         node_type child(node_type v, const unsigned char c, size_type& char_pos)const {
00757             if (is_leaf(v))  // if v is a leaf = (), v has no child
00758                 return root();
00759             // else v = ( (     ))
00760             uint16_t cc = m_csa.char2comp[c];
00761             if (cc==0 and c!=0) // TODO: aendere char2comp so ab, dass man diesen sonderfall nicht braucht
00762                 return root();
00763             size_type char_ex_max_pos = m_csa.C[cc+1], char_inc_min_pos = m_csa.C[cc];
00764             // v is not a leave: v has at least two children
00765             size_type k, kpos, k_find_close;
00766             k = get_first_l_index(v.left, v.right, kpos, k_find_close);
00767             assert(m_lcp[k]==v.l);
00768             size_type left = v.left, right = k-1;
00769             char_pos = m_csa(m_csa[left]+v.l);
00770             if (char_pos >= char_ex_max_pos) {// c is lex. greater than the first character of the first child interval
00771                 return root();
00772             } else if (char_pos >= char_inc_min_pos) { // found interval [left, right]
00773                 goto found_child;
00774             }
00775             {
00776                 size_type k_char_pos = m_csa(m_csa[k]+v.l);
00777                 {
00778                     size_type begin = m_bp_support.preceding_closing_parentheses(k_find_close);
00779                     assert(m_csa.sigma>0);
00780                     if (begin > (size_type)m_csa.sigma-1) begin = m_csa.sigma-1;
00781                     begin = k_find_close - begin;
00782                     size_type end = k_find_close, m_find_close, m;
00783                     while (end != begin) {
00784                         m_find_close = (begin+end)>>1;  // begin <= m_find_close < end
00785                         m = m_bp_support.rank(m_bp_support.find_open(m_find_close))-1;
00786                         if (m_lcp[m] > v.l or /*m_lcp[m]==v.l and */ (char_pos = m_csa(m_csa[m]+v.l)) >= char_ex_max_pos) {
00787                             begin = m_find_close+1;
00788                         } else { // m_lcp[m] == v.l and char_pos < char_ex_max_pos
00789                             end = m_find_close;
00790                             k = m;
00791                             k_char_pos  = char_pos;
00792                         }
00793                     }
00794                     assert(m_lcp[k] == v.l);
00795                     assert(m_bp_support.find_close(m_bp_support.select(k+1)) == end);
00796                     if (k_char_pos < char_ex_max_pos and k_char_pos >= char_inc_min_pos) {// child interval exists
00797                         left = k;
00798                         char_pos = k_char_pos;
00799                         assert(end > 0);
00800                         if (m_bp[end-1] == 1) { // last child interval
00801                             right = v.right;
00802                         } else {
00803                             size_type k1 = m_bp_support.rank(m_bp_support.find_open(end-1))-1;
00804                             if (m_lcp[k1] > v.l) {// last child interval
00805                                 right = v.right;
00806                             } else {
00807                                 right = k1-1;
00808                             }
00809                         }
00810                     } else {
00811                         return root();
00812                     }
00813                 }
00814             }
00815 found_child:
00816             if (left==right) { // child interval is a leaf
00817                 assert(m_csa.size()-m_csa[left] > v.l);
00818                 return node_type(m_csa.size()-m_csa[left],left,right);
00819             } else {
00820                 // find l of the first child interval
00821                 size_type k1pos, k1 = get_first_l_index(left, right, k1pos, k_find_close);
00822                 assert(m_lcp[k1] > v.l);
00823                 return node_type(m_lcp[k1], left, right);
00824             }
00825         }
00826 
00828         // \sa child(node_type v, const unsigned char c, size_type &char_pos)
00829         node_type child(const node_type& v, const unsigned char c) {
00830             size_type char_pos;
00831             return child(v, c, char_pos);
00832         }
00833 
00835 
00842         unsigned char edge(const node_type& v, size_type d)const {
00843 #ifndef NDEBUG
00844             if (d < 1 or d > depth(v)) {
00845                 throw std::out_of_range("OUT_OF_RANGE_ERROR: "+util::demangle(typeid(this).name())+"cst_sct2<>::edge(node_type v, size_type d). d == 0 or d > depth(v)!");
00846             }
00847 #endif
00848             size_type order = m_csa(m_csa[v.left]+d-1);
00849             uint16_t c_begin = 1, c_end = m_csa.sigma+1, mid;
00850             while (c_begin < c_end) {
00851                 mid = (c_begin+c_end)>>1;
00852                 if (m_csa.C[mid] <= order) {
00853                     c_begin = mid+1;
00854                 } else {
00855                     c_end = mid;
00856                 }
00857             }
00858             return m_csa.comp2char[c_begin-1];
00859         }
00860 
00861 
00863 
00870         node_type lca(node_type v, node_type w)const {
00871             if (v.left > w.left or (v.left == w.left and v.right < w.right)) {
00872                 std::swap(v, w);
00873             }
00874             assert(v.left < w.left or (v.left==w.left and v.right >= w.right));
00875             assert(!(v.left < w.left and v.right > w.left and v.right < w.right));   // assert that v and w do not overlapp
00876             if (v.right >= w.right) { // v encloses w or v==w
00877                 return v;
00878             } else { // v.left < v.right < w.left < w.right
00879                 size_type min_index = rmq(v.left+1, w.right);
00880                 size_type l = m_lcp[min_index];
00881                 if (l==0)
00882                     return root();
00883                 else
00884                     return node_type(l, psv(min_index, l), nsv(min_index)-1);
00885             }
00886         }
00887 
00889 
00895         size_type depth(const node_type& v)const {
00896             return v.l;
00897         }
00898 
00900 
00906         // TODO: can be implemented in O(1) with o(n) space. See Jansson, Sadakane, Sung, SODA 2007, "Ultra-succinct Representation of Ordered Trees"
00907         size_type node_depth(node_type v)const {
00908             size_type d = 0;
00909             while (v != root()) {
00910                 ++d;
00911                 v = parent(v);
00912             }
00913             return d;
00914         }
00915 
00917 
00923         node_type sl(const node_type& v)const {
00924             if (v == root())
00925                 return root();
00926             // get interval with first char deleted
00927             size_type left       = m_csa.psi[v.left];
00928             if (is_leaf(v)) {
00929                 if (v.l==1)
00930                     return root();
00931                 else
00932                     return node_type(v.l-1, left, left);
00933             }
00934             size_type right      = m_csa.psi[v.right];
00935             assert(left < right);
00936             // rmq
00937             size_type min_index = rmq(left+1, right);
00938             size_type l = m_lcp[min_index];
00939             if (l==0)
00940                 return root();
00941             else
00942                 return node_type(l, psv(min_index, l), nsv(min_index)-1);
00943         }
00944 
00946         /*
00947          *  \param v A valid not of a cst_sct2.
00948          *  \param c The character which should be prepended to the string of the current node.
00949          *      \return root() if the Weiner link of (v, c) does not exist, otherwise the Weiner link is returned.
00950          *  \par Time complexity
00951          *              \f$ \Order{ t_{rank\_bwt}\cdot t_{rmq} }  \f$
00952          */
00953         node_type wl(const node_type& v, const unsigned char c) const {
00954             size_type c_left    = m_csa.rank_bwt(v.left, c);
00955             size_type c_right   = m_csa.rank_bwt(v.right+1, c);
00956             if (c_left == c_right)  // there exists no Weiner link
00957                 return root();
00958             if (c_left+1 == c_right)
00959                 return ith_leaf(m_csa.C[m_csa.char2comp[c]] + c_left + 1);
00960             else {
00961                 size_type left  = m_csa.C[m_csa.char2comp[c]] + c_left;
00962                 size_type right = m_csa.C[m_csa.char2comp[c]] + c_right - 1;
00963                 assert(left < right);
00964                 // rmq to compute l-index
00965                 size_type min_index = rmq(left+1, right);
00966                 size_type l = m_lcp[min_index];
00967                 return node_type(l, left, right);
00968             }
00969         }
00970 
00972 
00977         size_type sn(const node_type& v)const {
00978             assert(is_leaf(v));
00979             return m_csa[v.left];
00980         }
00981 
00983 
00989         size_type id(const node_type& v)const {
00990             if (is_leaf(v)) { // return i in the range from 0..csa.size()-1
00991                 return v.left;
00992             }
00993             size_type kpos, find_close_k;
00994             get_first_l_index(v.left, v.right, kpos, find_close_k);
00995             size_type r0clpos = find_close_k - m_bp_support.rank(find_close_k);
00996 
00997             return size()+m_first_child_rank(r0clpos);
00998         }
00999 
01001         size_type nodes()const {
01002             return m_nodes;
01003         }
01004 
01006         /* \param lb Left bound of the lcp-interval [lb..rb] (inclusive).
01007          * \param rb Right bound of the lcp-interval [lb..rb] (inclusive).
01008          * \param l  Depth of the lcp-interval.
01009          *\ return The node in the suffix tree corresponding lcp-interval [lb..rb]
01010          */
01011         node_type node(size_type lb, size_type rb, size_type l=0) const {
01012             return node_type(l, lb, rb);
01013         }
01014 
01016         void print_info()const {
01017             std::cout << "# size of cst in bytes per character" << std::endl;
01018             size_type cst_size = util::get_size_in_bytes(*this);
01019             std::cout << ((double)cst_size)/csa.size() << " # = "<< ((double)cst_size)/(1<<20) <<" MB"<<std::endl;
01020             std::cout<< ((double)util::get_size_in_bytes(csa))/cst_size << " # ratio of csa size" << std::endl;
01021             std::cout<< ((double)util::get_size_in_bytes(lcp))/cst_size << " # ratio of lcp size" << std::endl;
01022             std::cout<< ((double)util::get_size_in_bytes(bp))/cst_size << " # ratio of bp size" << std::endl;
01023             std::cout<< ((double)util::get_size_in_bytes(bp_support))/cst_size << " # ratio of bp_support size" << std::endl;
01024             std::cout<< 0 << " # ratio of bp_rank_10 size" << std::endl;
01025             std::cout<< 0 << " # ratio of bp_select_10 size" << std::endl;
01026         }
01027 
01028         /* @} */
01029 
01030 };
01031 
01032 // == template functions ==
01033 
01034 template<class Csa, class Lcp, class Bp_support, class Rank_support>
01035 template<uint8_t int_width, class size_type_class, uint8_t int_width_1, class size_type_class_1, uint8_t int_width_2, class size_type_class_2>
01036 cst_sct2<Csa, Lcp, Bp_support, Rank_support>::cst_sct2(const std::string& csa_file_name,
01037         int_vector_file_buffer<int_width, size_type_class>& lcp_buf,
01038         int_vector_file_buffer<int_width_1, size_type_class_1>& sa_buf,
01039         int_vector_file_buffer<int_width_2, size_type_class_2>& isa_buf,
01040         std::string dir,
01041         bool build_only_bps
01042                                                       ):csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support), first_child_bv(m_first_child)
01043 {
01044     std::string id =  util::to_string(util::get_pid())+"_"+util::to_string(util::get_id()).c_str();
01045     write_R_output("cst", "construct BPS", "begin", 1, 0);
01046     m_nodes = algorithm::construct_supercartesian_tree_bp_succinct_and_first_child(lcp_buf, m_bp, m_first_child) + m_bp.size()/2;
01047     write_R_output("cst", "construct BPS", "end", 1, 0);
01048 
01049     write_R_output("cst", "construct CLCP", "begin", 1, 0);
01050     construct_lcp(m_lcp, *this, lcp_buf, isa_buf);
01051     write_R_output("cst", "construct CLCP", "end", 1, 0);
01052 
01053     util::load_from_file(m_csa, csa_file_name.c_str());
01054 
01055     write_R_output("cst", "construct BPSS", "begin", 1, 0);
01056     m_bp_support = Bp_support(&m_bp);
01057     util::init_support(m_first_child_rank, &m_first_child);
01058     write_R_output("cst", "construct BPSS", "end",1,0);
01059 }
01060 
01061 
01062 template<class Csa, class Lcp, class Bp_support, class Rank_support>
01063 cst_sct2<Csa, Lcp, Bp_support, Rank_support>::cst_sct2(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps):csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support), first_child_bv(m_first_child)
01064 {
01065     construct(file_map, dir, id, build_only_bps);
01066 }
01067 
01068 
01069 template<class Csa, class Lcp, class Bp_support, class Rank_support>
01070 void cst_sct2<Csa, Lcp, Bp_support, Rank_support>::construct(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps)
01071 {
01072     write_R_output("cst", "construct BPS", "begin", 1, 0);
01073     int_vector_file_buffer<> lcp_buf(file_map["lcp"].c_str());
01074     m_nodes = algorithm::construct_supercartesian_tree_bp_succinct_and_first_child(lcp_buf, m_bp, m_first_child) + m_bp.size()/2;
01075     write_R_output("cst", "construct BPS", "end", 1, 0);
01076 
01077     write_R_output("cst", "construct BPSS", "begin", 1, 0);
01078     m_bp_support = Bp_support(&m_bp);
01079     write_R_output("cst", "construct BPSS", "end", 1, 0);
01080 
01081     write_R_output("cst", "construct CLCP", "begin", 1, 0);
01082     construct_lcp(m_lcp, *this, file_map, dir, id);
01083     write_R_output("cst", "construct CLCP", "end", 1, 0);
01084 
01085     util::load_from_file(m_csa, file_map["csa"].c_str());
01086 }
01087 
01088 
01089 
01090 template<class Csa, class Lcp, class Bp_support, class Rank_support>
01091 typename cst_sct2<Csa, Lcp, Bp_support, Rank_support>::size_type cst_sct2<Csa, Lcp, Bp_support, Rank_support>::serialize(std::ostream& out) const
01092 {
01093     size_type written_bytes = 0;
01094     written_bytes += m_csa.serialize(out);
01095     written_bytes += m_lcp.serialize(out);
01096     written_bytes += m_bp.serialize(out);
01097     written_bytes += m_bp_support.serialize(out);
01098     written_bytes += m_first_child.serialize(out);
01099     written_bytes += m_first_child_rank.serialize(out);
01100     out.write((char*) &m_nodes, sizeof(m_nodes));
01101     written_bytes += sizeof(m_nodes);
01102     return written_bytes;
01103 }
01104 
01105 template<class Csa, class Lcp, class Bp_support, class Rank_support>
01106 void cst_sct2<Csa, Lcp, Bp_support, Rank_support>::load(std::istream& in)
01107 {
01108     m_csa.load(in);
01109     load_lcp(m_lcp, in, *this);
01110     m_bp.load(in);
01111     m_bp_support.load(in, &m_bp);
01112     m_first_child.load(in);
01113     m_first_child_rank.load(in, &m_first_child);
01114     in.read((char*) &m_nodes, sizeof(m_nodes));
01115 #ifdef SDSL_DEBUG
01116     assert(algorithm::check_bp_support(m_bp, m_bp_support));
01117     std::cerr<<"checked bp_support"<<std::endl;
01118 #endif
01119 }
01120 
01121 template<class Csa, class Lcp, class Bp_support, class Rank_support>
01122 cst_sct2<Csa, Lcp, Bp_support, Rank_support>& cst_sct2<Csa, Lcp, Bp_support, Rank_support>::operator=(const cst_sct2& cst)
01123 {
01124     if (this != &cst) {
01125         copy(cst);
01126     }
01127     return *this;
01128 }
01129 
01130 
01131 
01132 
01133 
01134 } // end namespace sdsl
01135 
01136 
01137 #endif