SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
 All Classes Namespaces Files Functions Variables Typedefs Friends
sdsl/include/sdsl/algorithms_for_string_matching.hpp
Go to the documentation of this file.
00001 /* sdsl - succinct data structures library
00002     Copyright (C) 2010 Simon Gog
00003 
00004     This program is free software: you can redistribute it and/or modify
00005     it under the terms of the GNU General Public License as published by
00006     the Free Software Foundation, either version 3 of the License, or
00007     (at your option) any later version.
00008 
00009     This program is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012     GNU General Public License for more details.
00013 
00014     You should have received a copy of the GNU General Public License
00015     along with this program.  If not, see http://www.gnu.org/licenses/ .
00016 */
00022 #ifndef INCLUDED_SDSL_ALGORITHMS_FOR_STRING_MATCHING
00023 #define INCLUDED_SDSL_ALGORITHMS_FOR_STRING_MATCHING
00024 
00025 #include "int_vector.hpp"
00026 
00027 #include <stdexcept> // for exceptions
00028 #include <iostream>
00029 #include <cassert>
00030 #include <stack>
00031 #include <utility>
00032 
00033 
00034 namespace sdsl
00035 {
00036 
00040 namespace algorithm
00041 {
00042 
00044 
00059 template<class Csa>
00060 static typename Csa::csa_size_type backward_search(
00061     const Csa& csa,
00062     typename Csa::size_type l,
00063     typename Csa::size_type r,
00064     typename Csa::char_type c,
00065     typename Csa::size_type& l_res,
00066     typename Csa::size_type& r_res
00067 )
00068 {
00069     assert(l <= r); assert(r < csa.size());
00070     typename Csa::size_type c_begin = csa.C[csa.char2comp[c]];
00071     l_res = c_begin + csa.rank_bwt(l, c); // count c in bwt[0..l-1]
00072     r_res = c_begin + csa.rank_bwt(r+1, c) - 1; // count c in bwt[0..r]
00073     assert(r_res+1-l_res >= 0);
00074     return r_res+1-l_res;
00075 }
00076 
00078 
00094 template<class Csa>
00095 static typename Csa::csa_size_type backward_search(const Csa& csa, typename Csa::size_type l,
00096         typename Csa::size_type r, typename Csa::pattern_type pat,
00097         typename Csa::size_type len, typename Csa::size_type& l_res, typename Csa::size_type& r_res)
00098 {
00099     typename Csa::size_type i = 0;
00100     l_res = l; r_res = r;
00101     while (i < len and backward_search(csa, l_res, r_res, pat[len-i-1], l_res, r_res)) {
00102         ++i;
00103     }
00104     return r_res+1-l_res;
00105 }
00106 
00107 // TODO: forward search. Include original text???
00108 
00110 
00119 template<class Csa>
00120 static typename Csa::csa_size_type count(const Csa& csa, typename Csa::pattern_type pat, typename Csa::size_type len)
00121 {
00122     if (len > csa.size())
00123         return 0;
00124     typename Csa::size_type t1,t2; t1=t2=0; // dummy varaiable for the backward_search call
00125     return backward_search(csa, 0, csa.size()-1, pat, len, t1, t2);
00126 }
00127 
00129 
00140 template<class Csa, class RandomAccessContainer>
00141 static typename Csa::csa_size_type locate(const Csa&  csa, typename Csa::pattern_type pattern,
00142         typename Csa::size_type pattern_len, RandomAccessContainer& occ)
00143 {
00144     typename Csa::size_type occ_begin, occ_end, occs;
00145     occs = backward_search(csa, 0, csa.size()-1, pattern, pattern_len, occ_begin, occ_end);
00146     occ.resize(occs);
00147     for (typename Csa::size_type i=0; i < occs; ++i) {
00148         occ[i] = csa[occ_begin+i];
00149     }
00150     return occs;
00151 }
00152 
00154 
00164 // Is it cheaper to call T[i] = BWT[iSA[i+1]]??? Additional ranks but H_0 average access
00165 // TODO: extract backward!!! is faster in most cases!
00166 template<class Csa>
00167 static void extract(const Csa& csa, typename Csa::size_type begin, typename Csa::size_type end, unsigned char* text)
00168 {
00169     assert(end <= csa.size());
00170     assert(begin <= end);
00171     for (typename Csa::size_type i=begin, order = csa(begin); i<=end; ++i, order =  csa.psi[order]) {
00172         uint16_t c_begin = 1, c_end = 257, mid;
00173         while (c_begin < c_end) {
00174             mid = (c_begin+c_end)>>1;
00175             if (csa.C[mid] <= order) {
00176                 c_begin = mid+1;
00177             } else {
00178                 c_end = mid;
00179             }
00180         }
00181         text[i-begin] = csa.comp2char[c_begin-1];
00182     }
00183     if (text[end-begin]!=0)
00184         text[end-begin+1] = 0; // set terminal character
00185 }
00186 
00188 
00197 // TODO: use extract with four parameters to implement this method
00198 template<class Csa>
00199 static std::string extract(const Csa& csa, typename Csa::size_type begin, typename Csa::size_type end)
00200 {
00201     assert(end <= csa.size());
00202     assert(begin <= end);
00203     std::string result(end-begin+1,' ');
00204     for (typename Csa::size_type i=begin, order = csa(begin); i<=end; ++i, order =  csa.psi[order]) {
00205         uint16_t c_begin = 1, c_end = 257, mid;
00206         while (c_begin < c_end) {
00207             mid = (c_begin+c_end)>>1;
00208             if (csa.C[mid] <= order) {
00209                 c_begin = mid+1;
00210             } else {
00211                 c_end = mid;
00212             }
00213         }
00214         result[i-begin] = csa.comp2char[c_begin-1];
00215     }
00216     return result;
00217 }
00218 
00220 
00230 template<class Cst>
00231 typename Cst::cst_size_type forward_search(const Cst& cst, typename Cst::node_type& v,
00232         const typename Cst::size_type d, const typename Cst::char_type c,
00233         typename Cst::size_type& char_pos)
00234 {
00235     unsigned char cc = cst.csa.char2comp[c]; // check if c occures in the text of the csa
00236     if (cc==0 and cc!=c)                                                 //   "    " "    "     "  "    "   "  "   "
00237         return 0;
00238     typename Cst::size_type depth_node = cst.depth(v);
00239     if (d < depth_node) {  //
00240         char_pos = cst.csa.psi[char_pos];
00241         if (char_pos < cst.csa.C[cc] or char_pos >= cst.csa.C[cc+1])
00242             return 0;
00243         return cst.leaves_in_the_subtree(v);
00244     } else if (d == depth_node) { //
00245         v = cst.child(v, c, char_pos);
00246         if (v == cst.root())
00247             return 0;
00248         else
00249             return cst.leaves_in_the_subtree(v);
00250     } else {
00251         throw std::invalid_argument("depth d should be smaller or equal than node depth of v!");
00252         return 0;
00253     }
00254 }
00255 
00257 
00268 template<class Cst>
00269 typename Cst::cst_size_type forward_search(const Cst& cst, typename Cst::node_type& v,
00270         typename Cst::size_type d, typename Cst::pattern_type pat, typename Cst::size_type len,
00271         typename Cst::size_type& char_pos
00272                                           )
00273 {
00274     if (len==0)
00275         return cst.leaves_in_the_subtree(v);
00276     typename Cst::size_type i=0, size=0;
00277     while (i < len and (size=forward_search(cst, v, d, pat[i], char_pos))) {
00278         ++d;
00279         ++i;
00280     }
00281     return size;
00282 }
00283 
00285 
00290 template<class Cst>
00291 typename Cst::cst_size_type count(const Cst& cst, typename Cst::pattern_type pat, typename Cst::size_type len)
00292 {
00293     typename Cst::node_type v = cst.root();
00294     typename Cst::size_type char_pos = 0;
00295     return forward_search(cst, v, 0, pat, len, char_pos);
00296 }
00297 
00299 
00305 template<class Cst, class RandomAccessContainer>
00306 typename Cst::cst_size_type locate(const Cst& cst, typename Cst::pattern_type pat,
00307                                    typename Cst::size_type len, RandomAccessContainer& occ)
00308 {
00309     typedef typename Cst::size_type size_type;
00310     typename Cst::node_type v = cst.root();
00311     size_type char_pos = 0;
00312     typename Cst::cst_size_type occs = forward_search(cst, v, 0, pat, len, char_pos);
00313     occ.resize(occs);
00314     if (occs == 0)
00315         return 0; // because v equals cst.root()
00316     size_type left      = cst.lb(v);   // get range in the
00317     size_type right = cst.rb(v);
00318     for (size_type i=left; i <= right; ++i)
00319         occ[i-left] = cst.csa[i];
00320     return occs;
00321 }
00322 
00324 
00330 template<class Cst>
00331 void extract(const Cst& cst, const typename Cst::node_type& v, unsigned char* text)
00332 {
00333     if (v == cst.root()) {
00334         text[0] = 0;
00335         return;
00336     }
00337     // first get the suffix array entry of the leftmost leaf in the subtree rooted at v
00338     typename Cst::size_type begin = cst.csa[cst.lb(v)];
00339     // then call the extract method on the compressed suffix array
00340     extract(cst.csa, begin, begin + cst.depth(v) - 1, text);
00341 }
00342 
00344 
00349 template<class Cst>
00350 std::string extract(const Cst& cst, const typename Cst::node_type& v)
00351 {
00352     if (v==cst.root()) {
00353         return std::string();
00354     }
00355     // first get the suffix array entry of the leftmost leaf in the subtree rooted at v
00356     typename Cst::size_type begin = cst.csa[cst.lb(v)];
00357     // then call the extract method on the compressed suffix array
00358     return extract(cst.csa, begin, begin + cst.depth(v) - 1);
00359 }
00360 
00361 
00362 /*
00363 template<class Cst>
00364 typename Cst::size_type count(
00365                         const Cst &cst,
00366                         typename Cst::pattern_type pattern,
00367                     typename Cst::size_type pattern_len){
00368         if(pattern_len==0){
00369                 return 0;
00370         }
00371         typedef typename Cst::size_type size_type;
00372         typedef typename Cst::node_type node_type;
00373         node_type node = cst.root();
00374         for(size_type i=0, char_pos=0; cst.depth(node) < pattern_len; ++i){
00375                 node_type newnode = cst.child(node, (unsigned char)pattern[cst.depth(node)], char_pos);
00376                 if( newnode == cst.root() )// root node, no match found
00377                         return 0;
00378                 // else the first character of the newnode matches the pattern at position depth(node)
00379                 for(size_type j=cst.depth(node)+1; j < cst.depth(newnode) and j < pattern_len; ++j){
00380                         char_pos = cst.csa.psi[char_pos];
00381                         size_type cc = cst.csa.char2comp[pattern[j]];
00382                         if(char_pos < cst.csa.C[cc] or char_pos >= cst.csa.C[cc+1] )
00383                                 return 0;
00384                 }
00385                 node = newnode;
00386         }
00387         return cst.leaves_in_the_subtree(node);
00388 }
00389 */
00390 /*
00391 template<class Cst, class RandomAccessContainer>
00392 typename Cst::size_type locate(
00393                 const Cst &cst,
00394                 typename Cst::pattern_type pattern,
00395                 typename Cst::size_type pattern_len,
00396                 RandomAccessContainer &occ){
00397         occ.resize(0);
00398         typedef typename Cst::size_type size_type;
00399         typedef typename Cst::node_type node_type;
00400         node_type node = cst.root();
00401         for(size_type i=0, char_pos=0; cst.depth(node) < pattern_len; ++i){
00402                 node_type newnode = cst.child(node, (unsigned char)pattern[cst.depth(node)], char_pos);
00403                 if( newnode == cst.root() )// root node, no match found
00404                         return 0;
00405                 // else the first character of the newnode matches the pattern at position depth(node)
00406                 for(size_type j=cst.depth(node)+1; j < cst.depth(newnode) and j < pattern_len; ++j){
00407                         char_pos = cst.csa.psi[char_pos];
00408                         size_type cc = cst.csa.char2comp[pattern[j]];
00409                         if(char_pos < cst.csa.C[cc] or char_pos >= cst.csa.C[cc+1] )
00410                                 return 0;
00411                 }
00412                 node = newnode;
00413         }
00414         size_type occs = cst.leaves_in_the_subtree(node);
00415         occ.resize(occs);
00416         size_type left  = cst.leftmost_suffix_array_index_in_the_subtree(node);
00417         size_type right = cst.rightmost_suffix_array_index_in_the_subtree(node);
00418         for(size_type i=left; i <= right; ++i)
00419                 occ[i-left] = cst.csa[i];
00420         return occs;
00421 }
00422 */
00423 
00424 
00425 /*
00426 template<class Csa, class Lcp, class Bp_support>
00427 typename cst_sct<Csa, Lcp, Bp_support>::size_type count(
00428                         const cst_sct<Csa, Lcp, Bp_support> &cst,
00429                         typename cst_sct<Csa, Lcp, Bp_support>::pattern_type pattern,
00430                     typename cst_sct<Csa, Lcp, Bp_support>::size_type pattern_len){
00431         if(pattern_len==0){
00432                 return 0;
00433         }
00434         typedef typename cst_sct<Csa, Lcp, Bp_support>::size_type size_type;
00435         typedef typename cst_sct<Csa, Lcp, Bp_support>::node_type node_type;
00436         node_type node = cst.root();
00437         for(size_type i=0, char_pos=0; node.l < pattern_len; ++i){
00438                 node_type newnode = cst.child(node, (unsigned char)pattern[node.l], char_pos);
00439                 if( newnode.l == 0 )// root node, no match found
00440                         return 0;
00441                 // else the first character of the newnode matches the pattern at position node.l
00442                 for(size_type j=node.l+1; j < newnode.l and j< pattern_len; ++j){
00443                         char_pos = cst.csa.psi[char_pos];
00444                         size_type cc = cst.csa.char2comp[pattern[j]];
00445                         if(char_pos < cst.csa.C[cc] or char_pos >= cst.csa.C[cc+1] )
00446                                 return 0;
00447                 }
00448                 node = newnode;
00449         }
00450         return cst.leaves_in_the_subtree(node);
00451 }
00452 */
00453 
00454 } // end namespace algorithm
00455 
00456 } // end namespace sdsl
00457 
00458 #endif