SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
 All Classes Namespaces Files Functions Variables Typedefs Friends
sdsl/include/sdsl/cst_sct3.hpp
Go to the documentation of this file.
00001 /* sdsl - succinct data structures library
00002     Copyright (C) 2010 Simon Gog
00003 
00004     This program is free software: you can redistribute it and/or modify
00005     it under the terms of the GNU General Public License as published by
00006     the Free Software Foundation, either version 3 of the License, or
00007     (at your option) any later version.
00008 
00009     This program is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012     GNU General Public License for more details.
00013 
00014     You should have received a copy of the GNU General Public License
00015     along with this program.  If not, see http://www.gnu.org/licenses/ .
00016 */
00021 #ifndef INCLUDED_SDSL_CST_SCT3
00022 #define INCLUDED_SDSL_CST_SCT3
00023 
00024 #include "int_vector.hpp"
00025 #include "algorithms.hpp"
00026 #include "iterators.hpp"
00027 #include "lcp_support_tree.hpp"
00028 #include "lcp_wt.hpp"
00029 #include "bp_support.hpp"
00030 #include "csa_wt.hpp" // for std initialization of cst_sct3 
00031 #include "csa_uncompressed.hpp"
00032 #include "cst_iterators.hpp"
00033 #include "cst_sct.hpp" // for lcp_interval
00034 #include "rank_support.hpp"
00035 #include "select_support.hpp"
00036 #include "testutils.hpp"
00037 #include "util.hpp"
00038 #include <iostream>
00039 #include <algorithm>
00040 #include <cassert>
00041 #include <cstring> // for strlen
00042 #include <iomanip>
00043 #include <iterator>
00044 #include <stack> // for the calculation of the balanced parantheses sequence
00045 #include <ostream>
00046 
00047 namespace sdsl
00048 {
00049 
00050 struct cst_tag; // forward declaration
00051 
00052 template<class Int = int_vector<>::size_type>
00053 struct bp_interval {
00054     Int i;      
00055     Int j;      
00056     Int ipos;  // position of the i+1th opening parenthesis in the balanced parentheses sequence
00057     Int cipos; // position of the matching closing parenthesis of the i+1th opening parenthesis in the balanced parentheses sequence
00058     Int jp1pos; // position of the j+2th opening parenthesis in the balanced parentheses sequence
00059 
00061 
00063     bp_interval(Int i=0, Int j=0, Int ipos=0, Int cipos=0, Int jp1pos=0):i(i),j(j),ipos(ipos),cipos(cipos),jp1pos(jp1pos) {};
00064 
00065 
00066     bool operator<(const bp_interval& interval)const {
00067         if (i!=interval.i)
00068             return i<interval.i;
00069         return j<interval.j;
00070     }
00071 
00073 
00075     bool operator==(const bp_interval& interval)const {
00076         return i==interval.i and
00077                j==interval.j;
00078     }
00079 
00081 
00083     bool operator!=(const bp_interval& interval)const {
00084         return !(*this==interval);
00085     }
00086 
00088 
00090     bp_interval& operator=(const bp_interval& interval) {
00091         i = interval.i;
00092         j = interval.j;
00093         ipos = interval.ipos;
00094         cipos = interval.cipos;
00095         jp1pos = interval.jp1pos;
00096         return *this;
00097     }
00098 };
00099 
00100 
00101 template<class Int>
00102 inline std::ostream& operator<<(std::ostream& os, const bp_interval<Int>& interval)
00103 {
00104     os<<"-["<<interval.i<<","<<interval.j<<"]("<<interval.ipos<<","<<interval.cipos<<","<<interval.jp1pos<<")";
00105     return os;
00106 }
00107 
00108 
00110 
00130 template<class Csa = csa_wt<>,                        // CSA type
00131          class Lcp = lcp_support_tree<lcp_wt<> >,     // LCP type
00132          class Bp_support = bp_support_sada<>,        // for the balanced parentheses
00133          class Rank_support = rank_support_v5<>       // for the 'first child' bit_vector
00134          >
00135 class cst_sct3
00136 {
00137     public:
00138         typedef typename Csa::value_type                                                         value_type;    // STL Container requirement/TODO: ist das nicht gleich node type???
00139         typedef cst_dfs_const_forward_iterator<cst_sct3>                         const_iterator;// STL Container requirement
00140 //      typedef const_iterator                                                                           iterator;              // STL Container requirement
00141         typedef cst_bottom_up_const_forward_iterator<cst_sct3>           const_bottom_up_iterator;
00142 //      typedef  const_bottom_up_iterator                                                        bottom_up_iterator;
00143         typedef const value_type                                                                         const_reference;
00144         typedef const_reference                                                                          reference;
00145         typedef const_reference*                                                                         pointer;
00146         typedef const pointer                                                                            const_pointer;
00147         typedef typename Csa::size_type                                                          size_type;             // STL Container requirement
00148         typedef size_type                                                                                        cst_size_type;
00149         typedef ptrdiff_t                                                                                        difference_type; // STL Container requirement
00150         typedef Csa                                                                                                      csa_type;
00151         typedef typename Lcp::template type<cst_sct3>::lcp_type          lcp_type;
00152         typedef Bp_support                                                                                       bp_support_type;
00153         typedef typename Csa::pattern_type                                                       pattern_type;
00154         typedef typename Csa::char_type                                                          char_type;
00155         typedef bp_interval<size_type>                                                           node_type; 
00156         typedef Rank_support                                                                             fc_rank_support_type;
00157 
00158         typedef cst_tag                                                                                          index_category;
00159     private:
00160         Csa                                     m_csa;
00161         lcp_type                                m_lcp;
00162         bit_vector                              m_bp;
00163         bp_support_type                 m_bp_support;
00164         bit_vector                              m_first_child; // implementation note: no rank structure is needed for the first_child bit_vector, except for id()
00165         fc_rank_support_type    m_first_child_rank;
00166         uint8_t                                 m_sigma;
00167         size_type                               m_nodes;
00168 
00169         void copy(const cst_sct3& cst) {
00170             m_csa                       = cst.m_csa;
00171             copy_lcp(m_lcp, cst.m_lcp, *this);
00172             m_bp                        = cst.m_bp;
00173             m_bp_support        = cst.m_bp_support;
00174             m_bp_support.set_vector(&m_bp);
00175             m_first_child       = cst.m_first_child;
00176             m_first_child_rank = cst.m_first_child_rank;
00177             m_first_child_rank.set_vector(&m_first_child);
00178             m_sigma                     = cst.m_sigma;
00179             m_nodes                     = cst.m_nodes;
00180         }
00181 
00182         // Get the first l index of a [i,j] interval.
00183         /* I.e. given an interval [i,j], the function returns the position of the smallest entry lcp[k] with \f$ i<k\leq j \f$
00184          * \par Time complexity
00185          *       \f$ \Order{1} \f$
00186          */
00187         inline size_type get_first_l_index(const node_type& node, size_type& kpos, size_type& ckpos)const {
00188             if (node.cipos > node.jp1pos) { // corresponds to m_lcp[i] <= m_lcp[j+1]
00189                 ckpos   = node.jp1pos-1;
00190             } else { // corresponds to m_lcp[i] > m_lcp[j+1]
00191                 ckpos   = node.cipos-1;
00192             }
00193             assert(m_bp[ckpos]==0);
00194             kpos        = m_bp_support.find_open(ckpos);
00195             return m_bp_support.rank(kpos)-1;
00196         }
00197 
00198         // Get the ith l index of a node
00199         // if there exists no ith l-index return node.j+1
00200         size_type get_ith_l_index(const node_type& node, size_type i, size_type& kpos, size_type& ckpos)const {
00201             uint64_t children = 0;
00202 
00203             assert(i > 0);
00204             if (node.cipos > node.jp1pos) { // corresponds to m_lcp[i] <= m_lcp[j+1]
00205                 ckpos   = node.jp1pos-1;
00206             } else { // corresponds to m_lcp[i] > m_lcp[j+1]
00207                 ckpos   = node.cipos-1;
00208             }
00209             assert(m_bp[ckpos] == 0);   // at least the first l-index should be present, i.e. node is not leaf
00210             if (1 == i) {
00211                 kpos    = m_bp_support.find_open(ckpos);
00212                 return m_bp_support.rank(kpos)-1;
00213             } else { // i > 1
00214                 size_type r = ckpos - m_bp_support.rank(ckpos); // numbers of closing parentheses - 1 = index of first child in m_first_child
00215                 if (r+1 >= i) { // if there exist more than i l-indices
00216                     // check if m_first_child[r-i+1..r-1] consists of zeros
00217                     const uint64_t* p = m_first_child.data() + (r>>6);
00218                     uint8_t offset = r&0x3F;
00219 
00220                     uint64_t w = (*p) & bit_magic::Li1Mask[offset];
00221                     if (w) {
00222                         children = offset - bit_magic::l1BP(w) + 1;
00223                     } else if (m_first_child.data() == p) { // w==0 and we are in the first word
00224                         children = offset + 2; // da bit_magic::l1BP(w)=-1 sein muesste
00225                     } else {
00226                         children = offset + 2;
00227                         while (p > m_first_child.data()) {
00228                             w = *(--p);
00229                             if (0==w)
00230                                 children += 64;
00231                             else {
00232                                 children += (63-bit_magic::l1BP(w));
00233                                 break;
00234                             }
00235                         }
00236                         children += (w==0);
00237                     }
00238                     if (i < children) {  // there exists an ith l-index
00239                         ckpos -= (i-1);
00240                         assert(m_bp[ckpos] == 0);
00241                         kpos   = m_bp_support.find_open(ckpos);
00242                         return m_bp_support.rank(kpos)-1;
00243                     }
00244                 }
00245                 // if i >= children
00246                 kpos = node.jp1pos;
00247                 if (kpos < m_bp.size())
00248                     ckpos = m_bp_support.find_close(kpos);
00249                 else
00250                     ckpos = m_bp.size();
00251                 return node.j+1;
00252             }
00253         }
00254 
00255         // Get the postion of the first l index of a l-[i,j] interval in the balanced parentheses sequence.
00256         /* \par Time complexity
00257          *              \f$ \Order{1} \f$
00258          */
00259         inline size_type get_pos_of_closing_parenthesis_of_the_first_l_index(const node_type& node)const {
00260             if (node.cipos > node.jp1pos) { // corresponds to m_lcp[i] <= m_lcp[j+1]
00261                 return node.jp1pos-1;
00262             } else { // corresponds to m_lcp[i] > m_lcp[j+1]
00263                 return node.cipos-1;
00264             }
00265         }
00266 
00267         // Get the next smaller value.
00268         /* \par Time complexity
00269          *      \f$ \Order{1} \f$
00270          */
00271         // Returns n if there is no next smaller value in [i+1..n-1]
00272         inline size_type nsv(size_type i, size_type ipos)const { // possible optimization: calculate also position of nsv, i.e. next ( following position cipos
00273             size_type cipos = m_bp_support.find_close(ipos);
00274             size_type result = m_bp_support.rank(cipos);
00275             return result;
00276         }
00277 
00278         // Get the previous smaller value.
00279         /*
00280          * \par Time complexity
00281          *    \f$ \Order{\frac{\sigma}{w}} \f$, where w=64 is the word size, can be implemented in \f$\Order{1}\f$ with rank and select
00282          */
00283         inline size_type psv(size_type i, size_type ipos, size_type cipos, size_type& psvpos, size_type& psvcpos)const {
00284             if (cipos + m_sigma >= m_bp.size()) {  // if lcp[i]==0 => psv is the 0th index by definition
00285                 psvpos = 0;
00286                 psvcpos = m_bp.size()-1;
00287                 return 0;
00288             }
00289             // TODO einfach am anfang pruefen ob cipos+1
00290             if (m_bp[cipos+1]) {
00291                 psvpos = m_bp_support.enclose(ipos);
00292                 psvcpos = m_bp_support.find_close(psvpos);
00293                 return m_bp_support.rank(psvpos)-1;
00294             }
00295 
00296             size_type r0 = cipos - m_bp_support.rank(cipos); // index of clothing parenthesis in first_child bp
00297             size_type next_first_child = 0;
00298             const uint64_t* p = m_first_child.data() + (r0>>6);
00299             uint64_t w = (*p) >> (r0&0x3F);
00300             if (w) { // if w!=0
00301                 next_first_child = cipos + bit_magic::r1BP(w);
00302                 if (cipos == next_first_child and m_bp[next_first_child+1]) {
00303                     psvpos = m_bp_support.enclose(ipos);
00304                     psvcpos = m_bp_support.find_close(psvpos);
00305                     return m_bp_support.rank(psvpos)-1;
00306                 }
00307             } else {
00308                 cipos += 64-(r0&0x3F);
00309                 ++p;
00310                 while (!(w=*p)) { // while w==0
00311                     ++p;
00312                     cipos += 64;
00313                 }
00314                 next_first_child = cipos + bit_magic::r1BP(w);
00315             }
00316             if (!m_bp[next_first_child+1]) { // if next parenthesis is a closing one
00317                 psvcpos = next_first_child+1;
00318                 psvpos = m_bp_support.find_open(psvcpos);
00319                 return m_bp_support.rank(psvpos)-1;
00320             } else {
00321                 psvpos = m_bp_support.enclose(m_bp_support.find_open(next_first_child));
00322                 psvcpos = m_bp_support.find_close(psvpos);
00323                 return m_bp_support.rank(psvpos)-1;
00324             }
00325         }
00326 
00327         // Range minimum query based on the rr_enclose method.
00328         /* \par Time complexity
00329          *     \f$ \Order{\rrenclose} \f$
00330          */
00331         inline size_type rmq(size_type l, size_type r)const {
00332             size_type i         = m_bp_support.select(l+1);
00333             size_type j         = m_bp_support.select(r+1);
00334             size_type fc_i      = m_bp_support.find_close(i);
00335             if (j < fc_i) { // i < j < find_close(j) < find_close(i)
00336                 return l;
00337             } else { // i < find_close(i) < j < find_close(j)
00338                 size_type ec = m_bp_support.rr_enclose(i,j);
00339                 if (ec == m_bp_support.size()) {// no restricted enclosing pair found
00340                     return r;
00341                 } else { // found range restriced enclosing pair
00342                     return m_bp_support.rank(ec)-1; // subtract 1, as the index is 0 based
00343                 }
00344             }
00345         }
00346 
00347     public:
00348         const Csa& csa;                         
00349         const lcp_type& lcp;                            
00350         const bit_vector& bp;                   
00351         const bp_support_type& bp_support;      
00352 
00353         const bit_vector& first_child_bv;
00354         const fc_rank_support_type& first_child_rank;
00355 
00357         /* @{ */
00358 
00360         cst_sct3(): csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support), first_child_bv(m_first_child),
00361             first_child_rank(m_first_child_rank) {}
00362 
00363         // Constructor for the cst_sct3 taking a string for that the compressed suffix tree should be calculated.
00364         /*
00365          * \param str Text for which the \f$ \CST \f$ should be constructed. The text should be terminated by a zero byte.
00366          * \pre The text has to be terminated by a zero byte.
00367          */
00368 //              cst_sct3(const unsigned char *str);
00369 
00370         template<uint8_t int_width, class size_type_class, uint8_t int_width_1, class size_type_class_1, uint8_t int_width_2, class size_type_class_2>
00371         cst_sct3(const std::string& cst_file_name,
00372                  int_vector_file_buffer<int_width, size_type_class>& lcp_buf,
00373                  int_vector_file_buffer<int_width_1, size_type_class_1>& sa_buf,
00374                  int_vector_file_buffer<int_width_2, size_type_class_2>& isa_buf,
00375                  std::string dir="./",
00376                  bool build_ony_bps=false);
00377 
00378         cst_sct3(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps=false);
00379 
00381 
00386         cst_sct3(const cst_sct3& cst):csa(m_csa),lcp(m_lcp),bp(m_bp),bp_support(m_bp_support),first_child_bv(m_first_child),
00387             first_child_rank(m_first_child_rank) {
00388             copy(cst);
00389         }
00390 
00391         /* @} */
00392 
00394         ~cst_sct3() {}
00395 
00396         void construct(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps=false);
00397 
00399 
00402         size_type size()const {
00403             return m_bp.size()>>1;
00404         }
00405 
00407 
00410         static size_type max_size() {
00411             return Csa::max_size();
00412         }
00413 
00415 
00418         bool empty()const {
00419             return m_csa.empty();
00420         }
00421 
00423 
00431         void swap(cst_sct3& cst) {
00432             if (this != &cst) {
00433                 m_csa.swap(cst.m_csa);
00434                 m_bp.swap(cst.m_bp);
00435                 util::swap_support(m_bp_support, cst.m_bp_support, &m_bp, &(cst.m_bp));
00436 
00437                 m_first_child.swap(cst.m_first_child);
00438                 util::swap_support(m_first_child_rank, cst.m_first_child_rank, &m_first_child, &(cst.m_first_child));
00439                 std::swap(m_sigma, cst.m_sigma);
00440                 std::swap(m_nodes, cst.m_nodes);
00441                 // anything else has to be swapped before swapping lcp
00442                 swap_lcp(m_lcp, cst.m_lcp, *this, cst);
00443             }
00444         }
00445 
00447 
00450         const_iterator begin()const {
00451             if (0 == m_bp.size())  // special case: tree is uninitialized
00452                 return end();
00453             else if (2 == m_bp.size()) { // special case: the root is a leaf
00454                 return const_iterator(this, root(), true, true);
00455             }
00456             return const_iterator(this, root(), false, true);
00457         };
00458 
00460 
00463         const_iterator end()const {
00464             return const_iterator(this, root(), true, false);
00465         }
00466 
00468         const_bottom_up_iterator begin_bottom_up()const {
00469             if (0 == m_bp.size())  // special case: tree is uninitialized
00470                 return end_bottom_up();
00471             return const_bottom_up_iterator(this, leftmost_leaf_in_the_subtree(root()));
00472         }
00473 
00475         const_bottom_up_iterator end_bottom_up()const {
00476             return const_bottom_up_iterator(this, root(), false);
00477         }
00478 
00480 
00483         cst_sct3& operator=(const cst_sct3& cst);
00484 
00486 
00491         bool operator==(const cst_sct3& cst)const;
00492 
00494 
00499         bool operator!=(const cst_sct3& cst)const;
00500 
00502 
00505         size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const;
00506 
00508 
00510         void load(std::istream& in);
00511 
00513         /* @{ */
00514 
00516 
00520         node_type root() const {
00521             return node_type(0, size()-1, 0, m_bp.size()-1, m_bp.size());
00522         }
00523 
00525 
00531         bool is_leaf(const node_type& v)const {
00532             return v.i==v.j;
00533         }
00534 
00536 
00543         node_type ith_leaf(size_type i)const {
00544             assert(i > 0 and i <= size());
00545             size_type ipos = m_bp_support.select(i);
00546             size_type jp1pos;
00547             if (i == size())
00548                 jp1pos = m_bp.size();
00549             else if (m_bp[ipos+1])
00550                 jp1pos = ipos+1;
00551             else
00552                 jp1pos = m_bp_support.select(i+1);
00553             return node_type(i-1, i-1, ipos, m_bp_support.find_close(ipos), jp1pos);
00554         }
00555 
00557 
00564         size_type leaves_in_the_subtree(const node_type& v)const {
00565             return v.j-v.i+1;
00566         }
00567 
00569 
00574         node_type leftmost_leaf_in_the_subtree(const node_type& v)const {
00575             return ith_leaf(v.i+1);
00576         }
00577 
00579 
00584         node_type rightmost_leaf_in_the_subtree(const node_type& v)const {
00585             return ith_leaf(v.j+1);
00586         }
00587 
00589 
00596         size_type lb(const node_type& v)const {
00597             return v.i;
00598         }
00599 
00601 
00608         size_type rb(const node_type& v)const {
00609             return v.j;
00610         }
00611         /*
00612                         bool lcp_value_equals_zero(size_type i)const{
00613                                 size_type ipos = m_bp_support.find_close(m_bp_support.select(i+1));
00614                                 return ipos >= (m_bp.size() - m_sigma);
00615                         }
00616         */
00617 
00619 
00624         node_type parent(const node_type& v) const {
00625 //std::cout<<"parent "<<depth(v)<<v<<std::endl;
00626 #ifndef NDEBUG
00627 //                      std::cout<<"parent interval of "<<v.l<<"-["<<v.left<<" "<<v.right<<"]"<<std::endl;
00628 #endif
00629             if (v.cipos > v.jp1pos) { // LCP[i] <= LCP[j+1]
00630                 size_type psv_pos, psv_cpos, psv_v, nsv_v, nsv_p1pos;
00631                 psv_v = psv(v.j+1, v.jp1pos, m_bp_support.find_close(v.jp1pos), psv_pos, psv_cpos);
00632                 nsv_v = nsv(v.j+1, v.jp1pos)-1;
00633                 if (nsv_v == size()-1) {
00634                     nsv_p1pos = m_bp.size();
00635                 } else { // nsv_v < size()-1
00636                     nsv_p1pos = m_bp_support.select(nsv_v+2);
00637                 }
00638                 return node_type(psv_v, nsv_v, psv_pos, psv_cpos, nsv_p1pos);
00639             } else { // LCP[i] > LCP[j+1]
00640                 size_type psv_pos, psv_cpos, psv_v;
00641                 psv_v = psv(v.i, v.ipos, v.cipos, psv_pos, psv_cpos);
00642                 return node_type(psv_v, v.j, psv_pos, psv_cpos, v.jp1pos);
00643             }
00644         }
00645 
00647 
00653         node_type sibling(const node_type& v)const {
00654 //std::cout<<"sibling "<<depth(v)<<v<<std::endl;
00655 // Vorgehen:(1) Bestimmen, ob v noch einen rechten Bruder hat. Entsp. parent hat gleicht rechte Grenze wie v. Speziallfall rechter Rand ist hier schon behandelt!
00656             if (v.cipos < v.jp1pos) { // LCP[i] > LCP[j+1] => v hat die selbe rechte Grenze wie parent(v) => kein rechter Bruder
00657                 return root();
00658             }
00659 //          (2) Es existiert ein rechter Bruder, LCP[j+1] >= LCP[i] und j>i
00660             // es gilt nun: v.cipos > v.jp1pos
00661             size_type cjp1posm1 = m_bp_support.find_close(v.jp1pos)-1; // v.cipos-2 ???
00662 //std::cerr<<"cjp1posm1="<<cjp1posm1<<std::endl;
00663 //                      size_type cjp1posm1 = v.cipos-1;
00664 //                      if( cjp1posm1+1 != v.cipos ){
00665 //                              std::cout<<v<<" "<<cjp1posm1+1<<" "<<v.cipos<<std::endl;
00666 //                      }
00667 //                      size_type cjp1posm1 = v.cipos-2;
00668             // an der stelle von cjp1posm1 steht 1, falls v das letzte Kind ist
00669             bool last_child = m_bp[cjp1posm1];
00670             // an der stelle von cjp1posm1 steht 0 -> entweder nicht letztes Kind oder letztes Kind
00671             if (!last_child) {
00672                 size_type first_child_idx = cjp1posm1 - m_bp_support.rank(cjp1posm1);
00673                 last_child = m_first_child[first_child_idx]; // if first_child indicator is true => the new sibling is the rightmost sibling
00674             }
00675             if (last_child) {
00676                 size_type nsv_v = nsv(v.j+1, v.jp1pos)-1, nsv_p1pos;
00677                 if (nsv_v == size()-1) {
00678                     nsv_p1pos = m_bp.size();
00679                 } else {
00680                     nsv_p1pos = m_bp_support.select(nsv_v+2);
00681                 }
00682                 return node_type(v.j+1, nsv_v, v.jp1pos, m_bp_support.find_close(v.jp1pos), nsv_p1pos);
00683             } else {
00684 //std::cerr<<"not last child: "<<std::endl;
00685                 /*
00686                                         if( cjp1posm1+1 != v.cipos-1 ){
00687                                                 std::cout<<v<<" "<<cjp1posm1+1<<" "<<v.cipos<<std::endl;
00688                                                 std::cout<<"LCP["<<v.i<<"]="<<m_lcp[v.i]<<" "<<"LCP["<<v.j<<"]="<<m_lcp[v.j];
00689                                                 if(v.j<size())
00690                                                         std::cout<<" LCP[j+1]="<<m_lcp[v.j+1]<<std::endl;
00691                                         }
00692                 */
00693 //std::cerr<<"find_open(cjp1posm1) = "<< m_bp_support.find_open(cjp1posm1) << std::endl;
00694                 size_type new_j = m_bp_support.rank(m_bp_support.find_open(cjp1posm1))-2;
00695                 return node_type(v.j+1, new_j, v.jp1pos, m_bp_support.find_close(v.jp1pos), m_bp_support.select(new_j+2));
00696             }
00697         }
00698 
00700 
00709         node_type ith_child(const node_type& v, size_type i)const {
00710 //                      std::cerr<<i<<"th child "<<depth(v)<<v<<" i="<<i<<std::endl;
00711             assert(i > 0);
00712             if (is_leaf(v))  // if v is a leave, v has no child
00713                 return root();
00714             if (1 == i) {
00715                 size_type k = 0, kpos = 0, k_find_close = 0;
00716                 // v is not a leave: v has at least two children
00717                 k = get_first_l_index(v, kpos, k_find_close);// get first l-index k and the position of k
00718                 return node_type(v.i, k-1, v.ipos, v.cipos, kpos);
00719             } else { // i > 1
00720                 size_type k1, kpos1, k_find_close1;
00721                 k1 = get_ith_l_index(v, i-1, kpos1, k_find_close1);
00722                 if (k1 == v.j+1)
00723                     return root();
00724                 size_type k2, kpos2, k_find_close2;
00725                 k2 = get_ith_l_index(v, i, kpos2, k_find_close2);
00726                 return node_type(k1, k2-1, kpos1, k_find_close1, kpos2);
00727             }
00728         }
00729 
00731 
00737         size_type degree(const node_type& v)const {
00738             if (is_leaf(v))  // if v is a leave, v has no child
00739                 return 0;
00740             // v is not a leave: v has at least two children
00741             size_type r = get_pos_of_closing_parenthesis_of_the_first_l_index(v);
00742             /*                  if( m_bp[r-1] ){// if there exists no next l-index
00743                                         return 2;
00744                                 }
00745             */
00746             size_type r0 = r - m_bp_support.rank(r);
00747             const uint64_t* p = m_first_child.data() + (r0>>6);
00748             uint8_t offset = r0&0x3F;
00749 
00750             uint64_t w = (*p) & bit_magic::Li1Mask[offset];
00751             if (w) {
00752                 return offset-bit_magic::l1BP(w)+1;
00753             } else if (m_first_child.data() == p) { // w==0 and we are in the first word
00754                 return offset+2; // da bit_magic::l1BP(w)=-1 sein muesste
00755             } else {
00756                 size_type res = offset+2;
00757                 while (p > m_first_child.data()) {
00758                     w = *(--p);
00759                     if (0==w)
00760                         res += 64;
00761                     else {
00762                         return res + (63-bit_magic::l1BP(w));
00763                     }
00764                 }
00765                 return res + (w==0);
00766             }
00767         }
00768 
00769 
00770 
00771         // Returns the next sibling of node v.
00772         // Only for tests.
00773         node_type sibling_naive(const node_type& v)const {
00774             if (v==root())
00775                 return root();
00776             node_type parent = this->parent(v);
00777             assert(parent != v);
00778             size_type nr = degree(parent);
00779             for (size_type i=1; i <= nr; ++i)
00780                 if (ith_child(parent, i) == v and i!=nr)
00781                     return ith_child(parent, i+1);
00782             return root();
00783         }
00784 
00785 
00787 
00795         // TODO const unsigned char c durch char_type ersetzen
00796         node_type child(const node_type& v, const unsigned char c, size_type& char_pos)const {
00797             if (is_leaf(v))  // if v is a leaf = (), v has no child
00798                 return root();
00799             // else v = ( (     ))
00800             uint16_t cc = m_csa.char2comp[c];
00801             if (cc==0 and c!=0) // TODO: aendere char2comp so ab, dass man diesen sonderfall nicht braucht
00802                 return root();
00803             size_type char_ex_max_pos = m_csa.C[cc+1], char_inc_min_pos = m_csa.C[cc];
00804 
00805             size_type d                 = depth(v);
00806 
00807 //                      (1) check the first child
00808 //                      char_pos = m_csa(m_csa[v.i]+d);  // replaced by the next line
00809             char_pos = get_char_pos(v.i, d, m_csa);
00810             if (char_pos >= char_ex_max_pos) {// the first character of the first child interval is lex. greater than c
00811                 // => all other first characters of the child intervals are also greater than c => no solution
00812                 return root();
00813             } else if (char_pos >= char_inc_min_pos) { // i.e. char_pos < char_ex_max_pos and char_pos >= char_inc_min_pos
00814                 return ith_child(v, 1);
00815             }
00816 
00817             size_type child_cnt         = degree(v);
00818 
00819 //                      (2) check the last child
00820 //                      char_pos = m_csa(m_csa[v.j]+d); // replaced by the next line
00821             char_pos = get_char_pos(v.j, d, m_csa);
00822             if (char_pos < char_inc_min_pos) {// the first character of the last child interval is lex. smaller than c
00823                 // =>   all other first characters of the child intervals are also smaller than c => no solution
00824                 return root();
00825             } else if (char_pos < char_ex_max_pos) { // i.e. char_pos < char_ex_max_pos and char_pos >= char_inc_min_pos
00826                 return ith_child(v, child_cnt);
00827             }
00828 
00829 //                      (3) binary search for c in the children [2..children)
00830             size_type l_bound = 2, r_bound = child_cnt, mid, kpos, ckpos, l_index;
00831             while (l_bound < r_bound) {
00832                 mid = (l_bound + r_bound) >> 1;
00833 
00834                 l_index = get_ith_l_index(v, mid-1, kpos, ckpos);
00835 //                              char_pos = m_csa(m_csa[l_index]+d); // replaced by the next line
00836                 char_pos = get_char_pos(l_index, d, m_csa);
00837 
00838                 if (char_inc_min_pos > char_pos) {
00839                     l_bound = mid+1;
00840                 } else if (char_ex_max_pos <= char_pos) {
00841                     r_bound = mid;
00842                 } else { // char_inc_min_pos <= char_pos < char_ex_max_pos => found child
00843                     // we know that the child is not the last child, see (2)
00844                     // find next l_index: we know that a new l_index exists: i.e. assert( 0 == m_bp[ckpos-1]);
00845                     size_type lp1_index = m_bp_support.rank(m_bp_support.find_open(ckpos-1))-1;
00846                     size_type jp1pos = m_bp.size();
00847                     if (lp1_index-1 < size()-1) {
00848                         jp1pos = m_bp_support.select(lp1_index+1);
00849                     }
00850                     return node_type(l_index, lp1_index-1, kpos, ckpos, jp1pos);
00851                 }
00852             }
00853             return root();
00854         }
00855 
00857         // \sa child(node_type v, const unsigned char c, size_type &char_pos)
00858         node_type child(const node_type& v, const unsigned char c) {
00859             size_type char_pos;
00860             return child(v, c, char_pos);
00861         }
00862 
00864 
00871         unsigned char edge(const node_type& v, size_type d)const {
00872 #ifndef NDEBUG
00873             if (d < 1 or d > depth(v)) {
00874                 throw std::out_of_range("OUT_OF_RANGE_ERROR: "+util::demangle(typeid(this).name())+"cst_sct3<>::edge(node_type v, size_type d). d == 0 or d > depth(v)!");
00875             }
00876 #endif
00877 //                      size_type order = m_csa(m_csa[v.i]+d-1); replaced by the next line
00878             size_type order = get_char_pos(v.i, d-1, m_csa);
00879             uint16_t c_begin = 1, c_end = m_sigma+1, mid;
00880             while (c_begin < c_end) {
00881                 mid = (c_begin+c_end)>>1;
00882                 if (m_csa.C[mid] <= order) {
00883                     c_begin = mid+1;
00884                 } else {
00885                     c_end = mid;
00886                 }
00887             }
00888             return m_csa.comp2char[c_begin-1];
00889         }
00890 
00892 
00900         node_type lca(node_type v, node_type w)const {
00901             if (v.i > w.i or (v.i == w.i and v.j < w.j)) {
00902                 std::swap(v, w);
00903             }
00904 //                      assert(v.i < w.i or (v.i==w.i and v.j >= j));
00905 //                      assert( !(v.i < w.i and v.j > w.i and v.j < w.j) ); // assert that v and w do not overlapp
00906             if (v.j >= w.j) { // v encloses w or v==w
00907                 return v;
00908             } else { // v.i < v.j < w.i < w.j
00909                 size_type min_index = rmq(v.i+1, w.j);
00910                 size_type min_index_pos         = m_bp_support.select(min_index+1);
00911                 size_type min_index_cpos        = m_bp_support.find_close(min_index_pos);
00912 
00913                 if (min_index_cpos >= m_bp.size() - m_sigma) { // if lcp[min_index]==0 => return root
00914                     return root();
00915                 }
00916                 size_type new_j = nsv(min_index, min_index_pos)-1;
00917                 size_type new_ipos, new_icpos;
00918                 size_type new_i = psv(min_index, min_index_pos, min_index_cpos, new_ipos, new_icpos);
00919                 size_type jp1pos = m_bp.size();
00920                 if (new_j < size()-1) {
00921                     jp1pos = m_bp_support.select(new_j+2);
00922                 }
00923                 return node_type(new_i, new_j, new_ipos, new_icpos, jp1pos);
00924             }
00925         }
00926 
00928 
00934         size_type depth(const node_type& v)const {
00935             if (v.i == v.j) {
00936                 return size()-m_csa[v.i];
00937             } else if (v == root()) {
00938                 return 0;
00939             } else {
00940                 size_type kpos, ckpos;
00941                 size_type l = get_first_l_index(v, kpos, ckpos);
00942                 return m_lcp[l];
00943             }
00944         }
00945 
00947 
00953         // TODO: can be implemented in O(1) with o(n) space. See Jansson, Sadakane, Sung, SODA 2007, "Ultra-succinct Representation of Ordered Trees"
00954         size_type node_depth(node_type v)const {
00955             size_type d = 0;
00956             while (v != root()) {
00957                 ++d;
00958                 v = parent(v);
00959             }
00960             return d;
00961         }
00962 
00964 
00970         node_type sl(const node_type& v)const {
00971             if (v == root())
00972                 return root();
00973             // get interval with first char deleted
00974             size_type i  = m_csa.psi[v.i];
00975             if (is_leaf(v)) {
00976                 if (v.i==0 and v.j==0) // if( v.l==1 )
00977                     return root();
00978                 else
00979                     return ith_leaf(i+1);
00980             }
00981             size_type j  = m_csa.psi[v.j];
00982             assert(i < j);
00983             size_type min_index = rmq(i+1, j); // rmq
00984             size_type min_index_pos     = m_bp_support.select(min_index+1);
00985             size_type min_index_cpos    = m_bp_support.find_close(min_index_pos);
00986             if (min_index_cpos >= m_bp.size() - m_sigma) { // if lcp[min_index]==0 => return root
00987                 return root();
00988             }
00989             size_type new_j = nsv(min_index, min_index_pos)-1;
00990             size_type new_ipos, new_icpos;
00991             size_type new_i = psv(min_index, min_index_pos, min_index_cpos, new_ipos, new_icpos);
00992             size_type jp1pos = m_bp.size();
00993             if (new_j < size()-1) {
00994                 jp1pos = m_bp_support.select(new_j+2);
00995             }
00996             return node_type(new_i, new_j, new_ipos, new_icpos, jp1pos);
00997         }
00998 
00999 
01001         /*
01002          *  \param v A valid not of a cst_sct3.
01003          *  \param c The character which should be prepended to the string of the current node.
01004          *      \return root() if the Weiner link of (v, c) does not exist, otherwise the Weiner link is returned.
01005          *  \par Time complexity
01006          *              \f$ \Order{ t_{rank\_bwt} } \f$
01007          *
01008          */
01009         node_type wl(const node_type& v, const unsigned char c) const {
01010             size_type c_left    = m_csa.rank_bwt(v.i, c);
01011             size_type c_right   = m_csa.rank_bwt(v.j+1, c);
01012             if (c_left == c_right)  // there exists no Weiner link
01013                 return root();
01014             if (c_left+1 == c_right)
01015                 return ith_leaf(m_csa.C[m_csa.char2comp[c]] + c_left + 1);
01016             else {
01017                 size_type left  = m_csa.C[m_csa.char2comp[c]] + c_left;
01018                 size_type right = m_csa.C[m_csa.char2comp[c]] + c_right - 1;
01019                 assert(left < right);
01020 
01021                 size_type ipos = m_bp_support.select(left+1);
01022                 size_type jp1pos = m_bp.size();
01023                 if (right < size()-1) {
01024                     jp1pos = m_bp_support.select(right+2);
01025                 }
01026                 return node_type(left, right, ipos,
01027                                  m_bp_support.find_close(ipos), jp1pos);
01028             }
01029         }
01030 
01032 
01037         size_type sn(const node_type& v)const {
01038             assert(is_leaf(v));
01039             return m_csa[v.i];
01040         }
01041 
01043 
01049         size_type id(const node_type& v)const {
01050             if (is_leaf(v)) { // return id in the range from 0..csa.size()-1
01051                 return v.i;
01052             }
01053             size_type ckpos; // closing parentheses of the l-index
01054             if (v.cipos > v.jp1pos) { // corresponds to m_lcp[i] <= m_lcp[j+1]
01055                 ckpos   = v.jp1pos-1;
01056             } else { // corresponds to m_lcp[i] > m_lcp[j+1]
01057                 ckpos   = v.cipos-1;
01058             }
01059             assert(m_bp[ckpos]==0);
01060             size_type r0ckpos = ckpos-m_bp_support.rank(ckpos); // determine the rank of the closing parenthesis
01061             return size()+m_first_child_rank(r0ckpos);
01062         }
01063 
01065 
01072         node_type inv_id(size_type id) {
01073             if (id < size()) {  // the corresponding node is a leaf
01074                 return ith_leaf(id+1);
01075             } else { // the corresponding node is a inner node
01076                 // (1) get index of the closing parenthesis in m_first_child
01077                 size_type r0ckpos = 0;
01078                 {
01079                     //binary search for the position of the (id-size()+1)-th set bit in
01080                     id = id-size()+1;
01081                     size_type lb = 0, rb = m_bp.size(); // lb inclusive, rb exclusive
01082                     // invariant: arg(lb) < id, arg(rb) >= id
01083                     while (rb-lb > 1) {
01084                         size_type mid = lb + (rb-lb)/2;
01085                         size_type arg = m_first_child_rank(mid); // ones in the prefix [0..mid-1]
01086                         if (arg < id) {
01087                             lb = mid;
01088                         } else { // arg >= id
01089                             rb = mid;
01090                         }
01091                     }
01092                     r0ckpos = lb;
01093                 }
01094                 // (2) determine position clpos of the r0clpos-th closing parentheses in the parentheses sequence
01095                 size_type ckpos = 0;
01096                 {
01097                     // binary search for the position of the (r0ckpos+1)-th closing parenthesis
01098                     size_type lb = 0, rb = m_bp.size(); // lb inclusive, rb exclusive
01099                     // invariant: arg(lb) < r0ckpos+1,  arg(rb) >= r0ckpos+1
01100                     while (rb-lb > 1) {
01101                         size_type mid = lb + (rb-lb)/2;
01102                         size_type arg = mid - m_bp_support.rank(mid-1);  // zeros in the prefix [0..mid-1]
01103                         if (arg < r0ckpos+1) {
01104                             lb = mid;
01105                         } else { // arg >= x
01106                             rb = mid;
01107                         }
01108                     }
01109                     ckpos = lb;
01110                 }
01111 //                              if ( m_bp[ckpos] ){
01112 //                                      std::cerr<<"m_bp[ckpos] should be zero! id=" << id << std::endl;
01113 //                                      std::cerr<<"r0ckpos="<<r0ckpos<<" rank_0(ckpos)="<< ckpos - m_bp_support.rank(ckpos-1)  << std::endl;
01114 //                              }
01115                 if (ckpos == m_bp.size()-1) {
01116                     return root();
01117                 }
01118                 if (m_bp[ckpos+1]) {  // jp1pos < cipos
01119 //                                      std::cout<<"case1"<<std::endl;
01120                     size_type jp1pos= ckpos+1;
01121                     size_type j         = m_bp_support.rank(jp1pos-1)-1;
01122                     size_type kpos  = m_bp_support.find_open(ckpos);
01123                     size_type ipos      = m_bp_support.enclose(kpos);
01124                     size_type cipos = m_bp_support.find_close(ipos);
01125                     size_type i         = m_bp_support.rank(ipos-1);
01126                     return node_type(i, j, ipos, cipos, jp1pos);
01127                 } else { //
01128 //                                      std::cout<<"case2"<<std::endl;
01129                     size_type cipos = ckpos+1;
01130                     size_type ipos  = m_bp_support.find_open(cipos);
01131                     size_type i     = m_bp_support.rank(ipos-1);
01132                     size_type j     = nsv(i, ipos)-1;
01133                     size_type jp1pos= m_bp.size();
01134                     if (j != size()-1) {
01135                         jp1pos = m_bp_support.select(j+2);
01136                     }
01137                     return node_type(i, j, ipos, cipos, jp1pos);
01138                 }
01139             }
01140         }
01141 
01143         size_type nodes()const {
01144             return m_nodes;
01145         }
01146 
01148         /* \param lb Left bound of the lcp-interval [lb..rb] (inclusive).
01149          * \param rb Right bound of the lcp-interval [lb..rb] (inclusive).
01150          * \param l  Depth of the lcp-interval.
01151          *\ return The node in the suffix tree corresponding lcp-interval [lb..rb]
01152          */
01153         node_type node(size_type lb, size_type rb, size_type l=0) const {
01154             size_type ipos = m_bp_support.select(lb+1);
01155             size_type jp1pos;
01156             if (rb == size()-1) {
01157                 jp1pos = m_bp.size();
01158             } else {
01159                 jp1pos = m_bp_support.select(rb+2);
01160             }
01161             return node_type(lb, rb, ipos, m_bp_support.find_close(ipos), jp1pos);
01162         }
01163 
01165 
01169         size_type tlcp_idx(size_type i) const {
01170             size_type ipos      = m_bp_support.select(i+1);
01171             size_type cipos = m_bp_support.find_close(ipos);
01172             return m_first_child_rank.rank(((ipos+cipos-1)>>1)-i);
01173         }
01174         /* @} */
01175 };
01176 
01177 // == template functions ==
01178 
01179 
01180 template<class Csa, class Lcp, class Bp_support, class Rank_support>
01181 template<uint8_t int_width, class size_type_class, uint8_t int_width_1, class size_type_class_1, uint8_t int_width_2, class size_type_class_2>
01182 cst_sct3<Csa, Lcp, Bp_support, Rank_support>::cst_sct3(const std::string& csa_file_name,
01183         int_vector_file_buffer<int_width, size_type_class>& lcp_buf,
01184         int_vector_file_buffer<int_width_1, size_type_class_1>& sa_buf,
01185         int_vector_file_buffer<int_width_2, size_type_class_2>& isa_buf,
01186         std::string dir,
01187         bool build_only_bps
01188                                                       ):csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support), first_child_bv(m_first_child), first_child_rank(m_first_child_rank)
01189 {
01190     std::string id =  util::to_string(util::get_pid())+"_"+util::to_string(util::get_id()).c_str();
01191 
01192     write_R_output("cst", "construct BPS", "begin", 1, 0);
01193     m_nodes = algorithm::construct_supercartesian_tree_bp_succinct_and_first_child(lcp_buf, m_bp, m_first_child) + m_bp.size()/2;
01194     write_R_output("cst", "construct BPS", "end", 1, 0);
01195 
01196     if (!build_only_bps) {
01197         util::load_from_file(m_csa, csa_file_name.c_str());
01198 
01199         write_R_output("cst", "construct CLCP", "begin", 1, 0);
01200         construct_lcp(m_lcp, *this, lcp_buf, isa_buf);
01201         write_R_output("cst", "construct CLCP", "end", 1, 0);
01202     }
01203     write_R_output("cst", "construct BPSS", "begin", 1, 0);
01204     util::init_support(m_bp_support, &m_bp);
01205     util::init_support(m_first_child_rank, &m_first_child);
01206     write_R_output("cst", "construct BPSS", "end", 1, 0);
01207     m_sigma = degree(root());
01208 }
01209 
01210 
01211 template<class Csa, class Lcp, class Bp_support, class Rank_support>
01212 cst_sct3<Csa, Lcp, Bp_support, Rank_support>::cst_sct3(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps):csa(m_csa), lcp(m_lcp), bp(m_bp), bp_support(m_bp_support), first_child_bv(m_first_child), first_child_rank(m_first_child_rank)
01213 {
01214     construct(file_map, dir, id, build_only_bps);
01215 }
01216 
01217 
01218 template<class Csa, class Lcp, class Bp_support, class Rank_support>
01219 void cst_sct3<Csa, Lcp, Bp_support, Rank_support>::construct(tMSS& file_map, const std::string& dir, const std::string& id, bool build_only_bps)
01220 {
01221     write_R_output("cst", "construct BPS", "begin", 1, 0);
01222     int_vector_file_buffer<> lcp_buf(file_map["lcp"].c_str());
01223     m_nodes = algorithm::construct_supercartesian_tree_bp_succinct_and_first_child(lcp_buf, m_bp, m_first_child) + m_bp.size()/2;
01224     write_R_output("cst", "construct BPS", "end", 1, 0);
01225     write_R_output("cst", "construct BPSS", "begin", 1, 0);
01226     util::init_support(m_bp_support, &m_bp);
01227     util::init_support(m_first_child_rank, &m_first_child);
01228     write_R_output("cst", "construct BPSS", "end", 1, 0);
01229 
01230     if (!build_only_bps) {
01231         write_R_output("cst", "construct CLCP", "begin", 1, 0);
01232         construct_lcp(m_lcp, *this, file_map, dir, id);
01233         write_R_output("cst", "construct CLCP", "end", 1, 0);
01234     }
01235     if (!build_only_bps) {
01236         util::load_from_file(m_csa, file_map["csa"].c_str());
01237     }
01238     m_sigma = degree(root());
01239 }
01240 
01241 template<class Csa, class Lcp, class Bp_support, class Rank_support>
01242 typename cst_sct3<Csa, Lcp, Bp_support, Rank_support>::size_type cst_sct3<Csa, Lcp, Bp_support, Rank_support>::serialize(std::ostream& out, structure_tree_node* v, std::string name)const
01243 {
01244     structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
01245     size_type written_bytes = 0;
01246     written_bytes += m_csa.serialize(out, child, "csa");
01247     written_bytes += m_lcp.serialize(out, child, "lcp");
01248     written_bytes += m_bp.serialize(out, child, "bp");
01249     written_bytes += m_bp_support.serialize(out, child, "bp_support");
01250     written_bytes += m_first_child.serialize(out, child, "mark_child");
01251     written_bytes += m_first_child_rank.serialize(out, child, "mark_child_rank");
01252     written_bytes += util::write_member(m_sigma, out, child, "sigma");
01253     written_bytes += util::write_member(m_nodes, out, child, "node_cnt");
01254     structure_tree::add_size(child, written_bytes);
01255     return written_bytes;
01256 }
01257 
01258 template<class Csa, class Lcp, class Bp_support, class Rank_support>
01259 void cst_sct3<Csa, Lcp, Bp_support, Rank_support>::load(std::istream& in)
01260 {
01261     m_csa.load(in);
01262     load_lcp(m_lcp, in, *this);
01263     m_bp.load(in);
01264     m_bp_support.load(in, &m_bp);
01265     m_first_child.load(in);
01266     m_first_child_rank.load(in, &m_first_child);
01267     util::read_member(m_sigma, in);
01268     util::read_member(m_nodes, in);
01269 #ifdef SDSL_DEBUG
01270     assert(algorithm::check_bp_support(m_bp, m_bp_support));
01271     std::cerr<<"checked bp_support"<<std::endl;
01272 #endif
01273 }
01274 
01275 template<class Csa, class Lcp, class Bp_support, class Rank_support>
01276 cst_sct3<Csa, Lcp, Bp_support, Rank_support>& cst_sct3<Csa, Lcp, Bp_support, Rank_support>::operator=(const cst_sct3& cst)
01277 {
01278     if (this != &cst) {
01279         copy(cst);
01280     }
01281     return *this;
01282 }
01283 
01284 
01285 
01286 
01287 
01288 } // end namespace sdsl
01289 
01290 
01291 #endif