SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2010 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00022 #ifndef INCLUDED_SDSL_ALGORITHMS_FOR_STRING_MATCHING 00023 #define INCLUDED_SDSL_ALGORITHMS_FOR_STRING_MATCHING 00024 00025 #include "int_vector.hpp" 00026 00027 #include <stdexcept> // for exceptions 00028 #include <iostream> 00029 #include <cassert> 00030 #include <stack> 00031 #include <utility> 00032 00033 00034 namespace sdsl 00035 { 00036 00040 namespace algorithm 00041 { 00042 00044 00059 template<class Csa> 00060 static typename Csa::csa_size_type backward_search( 00061 const Csa& csa, 00062 typename Csa::size_type l, 00063 typename Csa::size_type r, 00064 typename Csa::char_type c, 00065 typename Csa::size_type& l_res, 00066 typename Csa::size_type& r_res 00067 ) 00068 { 00069 assert(l <= r); assert(r < csa.size()); 00070 typename Csa::size_type c_begin = csa.C[csa.char2comp[c]]; 00071 l_res = c_begin + csa.rank_bwt(l, c); // count c in bwt[0..l-1] 00072 r_res = c_begin + csa.rank_bwt(r+1, c) - 1; // count c in bwt[0..r] 00073 assert(r_res+1-l_res >= 0); 00074 return r_res+1-l_res; 00075 } 00076 00078 00094 template<class Csa> 00095 static typename Csa::csa_size_type backward_search(const Csa& csa, typename Csa::size_type l, 00096 typename Csa::size_type r, typename Csa::pattern_type pat, 00097 typename Csa::size_type len, typename Csa::size_type& l_res, typename Csa::size_type& r_res) 00098 { 00099 typename Csa::size_type i = 0; 00100 l_res = l; r_res = r; 00101 while (i < len and backward_search(csa, l_res, r_res, pat[len-i-1], l_res, r_res)) { 00102 ++i; 00103 } 00104 return r_res+1-l_res; 00105 } 00106 00107 // TODO: forward search. Include original text??? 00108 00110 00119 template<class Csa> 00120 static typename Csa::csa_size_type count(const Csa& csa, typename Csa::pattern_type pat, typename Csa::size_type len) 00121 { 00122 if (len > csa.size()) 00123 return 0; 00124 typename Csa::size_type t1,t2; t1=t2=0; // dummy varaiable for the backward_search call 00125 return backward_search(csa, 0, csa.size()-1, pat, len, t1, t2); 00126 } 00127 00129 00140 template<class Csa, class RandomAccessContainer> 00141 static typename Csa::csa_size_type locate(const Csa& csa, typename Csa::pattern_type pattern, 00142 typename Csa::size_type pattern_len, RandomAccessContainer& occ) 00143 { 00144 typename Csa::size_type occ_begin, occ_end, occs; 00145 occs = backward_search(csa, 0, csa.size()-1, pattern, pattern_len, occ_begin, occ_end); 00146 occ.resize(occs); 00147 for (typename Csa::size_type i=0; i < occs; ++i) { 00148 occ[i] = csa[occ_begin+i]; 00149 } 00150 return occs; 00151 } 00152 00154 00164 // Is it cheaper to call T[i] = BWT[iSA[i+1]]??? Additional ranks but H_0 average access 00165 // TODO: extract backward!!! is faster in most cases! 00166 template<class Csa> 00167 static void extract(const Csa& csa, typename Csa::size_type begin, typename Csa::size_type end, unsigned char* text) 00168 { 00169 assert(end <= csa.size()); 00170 assert(begin <= end); 00171 for (typename Csa::size_type i=begin, order = csa(begin); i<=end; ++i, order = csa.psi[order]) { 00172 uint16_t c_begin = 1, c_end = 257, mid; 00173 while (c_begin < c_end) { 00174 mid = (c_begin+c_end)>>1; 00175 if (csa.C[mid] <= order) { 00176 c_begin = mid+1; 00177 } else { 00178 c_end = mid; 00179 } 00180 } 00181 text[i-begin] = csa.comp2char[c_begin-1]; 00182 } 00183 if (text[end-begin]!=0) 00184 text[end-begin+1] = 0; // set terminal character 00185 } 00186 00188 00197 // TODO: use extract with four parameters to implement this method 00198 template<class Csa> 00199 static std::string extract(const Csa& csa, typename Csa::size_type begin, typename Csa::size_type end) 00200 { 00201 assert(end <= csa.size()); 00202 assert(begin <= end); 00203 std::string result(end-begin+1,' '); 00204 for (typename Csa::size_type i=begin, order = csa(begin); i<=end; ++i, order = csa.psi[order]) { 00205 uint16_t c_begin = 1, c_end = 257, mid; 00206 while (c_begin < c_end) { 00207 mid = (c_begin+c_end)>>1; 00208 if (csa.C[mid] <= order) { 00209 c_begin = mid+1; 00210 } else { 00211 c_end = mid; 00212 } 00213 } 00214 result[i-begin] = csa.comp2char[c_begin-1]; 00215 } 00216 return result; 00217 } 00218 00220 00230 template<class Cst> 00231 typename Cst::cst_size_type forward_search(const Cst& cst, typename Cst::node_type& v, 00232 const typename Cst::size_type d, const typename Cst::char_type c, 00233 typename Cst::size_type& char_pos) 00234 { 00235 unsigned char cc = cst.csa.char2comp[c]; // check if c occures in the text of the csa 00236 if (cc==0 and cc!=c) // " " " " " " " " " " 00237 return 0; 00238 typename Cst::size_type depth_node = cst.depth(v); 00239 if (d < depth_node) { // 00240 char_pos = cst.csa.psi[char_pos]; 00241 if (char_pos < cst.csa.C[cc] or char_pos >= cst.csa.C[cc+1]) 00242 return 0; 00243 return cst.leaves_in_the_subtree(v); 00244 } else if (d == depth_node) { // 00245 v = cst.child(v, c, char_pos); 00246 if (v == cst.root()) 00247 return 0; 00248 else 00249 return cst.leaves_in_the_subtree(v); 00250 } else { 00251 throw std::invalid_argument("depth d should be smaller or equal than node depth of v!"); 00252 return 0; 00253 } 00254 } 00255 00257 00268 template<class Cst> 00269 typename Cst::cst_size_type forward_search(const Cst& cst, typename Cst::node_type& v, 00270 typename Cst::size_type d, typename Cst::pattern_type pat, typename Cst::size_type len, 00271 typename Cst::size_type& char_pos 00272 ) 00273 { 00274 if (len==0) 00275 return cst.leaves_in_the_subtree(v); 00276 typename Cst::size_type i=0, size=0; 00277 while (i < len and (size=forward_search(cst, v, d, pat[i], char_pos))) { 00278 ++d; 00279 ++i; 00280 } 00281 return size; 00282 } 00283 00285 00290 template<class Cst> 00291 typename Cst::cst_size_type count(const Cst& cst, typename Cst::pattern_type pat, typename Cst::size_type len) 00292 { 00293 typename Cst::node_type v = cst.root(); 00294 typename Cst::size_type char_pos = 0; 00295 return forward_search(cst, v, 0, pat, len, char_pos); 00296 } 00297 00299 00305 template<class Cst, class RandomAccessContainer> 00306 typename Cst::cst_size_type locate(const Cst& cst, typename Cst::pattern_type pat, 00307 typename Cst::size_type len, RandomAccessContainer& occ) 00308 { 00309 typedef typename Cst::size_type size_type; 00310 typename Cst::node_type v = cst.root(); 00311 size_type char_pos = 0; 00312 typename Cst::cst_size_type occs = forward_search(cst, v, 0, pat, len, char_pos); 00313 occ.resize(occs); 00314 if (occs == 0) 00315 return 0; // because v equals cst.root() 00316 size_type left = cst.lb(v); // get range in the 00317 size_type right = cst.rb(v); 00318 for (size_type i=left; i <= right; ++i) 00319 occ[i-left] = cst.csa[i]; 00320 return occs; 00321 } 00322 00324 00330 template<class Cst> 00331 void extract(const Cst& cst, const typename Cst::node_type& v, unsigned char* text) 00332 { 00333 if (v == cst.root()) { 00334 text[0] = 0; 00335 return; 00336 } 00337 // first get the suffix array entry of the leftmost leaf in the subtree rooted at v 00338 typename Cst::size_type begin = cst.csa[cst.lb(v)]; 00339 // then call the extract method on the compressed suffix array 00340 extract(cst.csa, begin, begin + cst.depth(v) - 1, text); 00341 } 00342 00344 00349 template<class Cst> 00350 std::string extract(const Cst& cst, const typename Cst::node_type& v) 00351 { 00352 if (v==cst.root()) { 00353 return std::string(); 00354 } 00355 // first get the suffix array entry of the leftmost leaf in the subtree rooted at v 00356 typename Cst::size_type begin = cst.csa[cst.lb(v)]; 00357 // then call the extract method on the compressed suffix array 00358 return extract(cst.csa, begin, begin + cst.depth(v) - 1); 00359 } 00360 00361 00362 /* 00363 template<class Cst> 00364 typename Cst::size_type count( 00365 const Cst &cst, 00366 typename Cst::pattern_type pattern, 00367 typename Cst::size_type pattern_len){ 00368 if(pattern_len==0){ 00369 return 0; 00370 } 00371 typedef typename Cst::size_type size_type; 00372 typedef typename Cst::node_type node_type; 00373 node_type node = cst.root(); 00374 for(size_type i=0, char_pos=0; cst.depth(node) < pattern_len; ++i){ 00375 node_type newnode = cst.child(node, (unsigned char)pattern[cst.depth(node)], char_pos); 00376 if( newnode == cst.root() )// root node, no match found 00377 return 0; 00378 // else the first character of the newnode matches the pattern at position depth(node) 00379 for(size_type j=cst.depth(node)+1; j < cst.depth(newnode) and j < pattern_len; ++j){ 00380 char_pos = cst.csa.psi[char_pos]; 00381 size_type cc = cst.csa.char2comp[pattern[j]]; 00382 if(char_pos < cst.csa.C[cc] or char_pos >= cst.csa.C[cc+1] ) 00383 return 0; 00384 } 00385 node = newnode; 00386 } 00387 return cst.leaves_in_the_subtree(node); 00388 } 00389 */ 00390 /* 00391 template<class Cst, class RandomAccessContainer> 00392 typename Cst::size_type locate( 00393 const Cst &cst, 00394 typename Cst::pattern_type pattern, 00395 typename Cst::size_type pattern_len, 00396 RandomAccessContainer &occ){ 00397 occ.resize(0); 00398 typedef typename Cst::size_type size_type; 00399 typedef typename Cst::node_type node_type; 00400 node_type node = cst.root(); 00401 for(size_type i=0, char_pos=0; cst.depth(node) < pattern_len; ++i){ 00402 node_type newnode = cst.child(node, (unsigned char)pattern[cst.depth(node)], char_pos); 00403 if( newnode == cst.root() )// root node, no match found 00404 return 0; 00405 // else the first character of the newnode matches the pattern at position depth(node) 00406 for(size_type j=cst.depth(node)+1; j < cst.depth(newnode) and j < pattern_len; ++j){ 00407 char_pos = cst.csa.psi[char_pos]; 00408 size_type cc = cst.csa.char2comp[pattern[j]]; 00409 if(char_pos < cst.csa.C[cc] or char_pos >= cst.csa.C[cc+1] ) 00410 return 0; 00411 } 00412 node = newnode; 00413 } 00414 size_type occs = cst.leaves_in_the_subtree(node); 00415 occ.resize(occs); 00416 size_type left = cst.leftmost_suffix_array_index_in_the_subtree(node); 00417 size_type right = cst.rightmost_suffix_array_index_in_the_subtree(node); 00418 for(size_type i=left; i <= right; ++i) 00419 occ[i-left] = cst.csa[i]; 00420 return occs; 00421 } 00422 */ 00423 00424 00425 /* 00426 template<class Csa, class Lcp, class Bp_support> 00427 typename cst_sct<Csa, Lcp, Bp_support>::size_type count( 00428 const cst_sct<Csa, Lcp, Bp_support> &cst, 00429 typename cst_sct<Csa, Lcp, Bp_support>::pattern_type pattern, 00430 typename cst_sct<Csa, Lcp, Bp_support>::size_type pattern_len){ 00431 if(pattern_len==0){ 00432 return 0; 00433 } 00434 typedef typename cst_sct<Csa, Lcp, Bp_support>::size_type size_type; 00435 typedef typename cst_sct<Csa, Lcp, Bp_support>::node_type node_type; 00436 node_type node = cst.root(); 00437 for(size_type i=0, char_pos=0; node.l < pattern_len; ++i){ 00438 node_type newnode = cst.child(node, (unsigned char)pattern[node.l], char_pos); 00439 if( newnode.l == 0 )// root node, no match found 00440 return 0; 00441 // else the first character of the newnode matches the pattern at position node.l 00442 for(size_type j=node.l+1; j < newnode.l and j< pattern_len; ++j){ 00443 char_pos = cst.csa.psi[char_pos]; 00444 size_type cc = cst.csa.char2comp[pattern[j]]; 00445 if(char_pos < cst.csa.C[cc] or char_pos >= cst.csa.C[cc+1] ) 00446 return 0; 00447 } 00448 node = newnode; 00449 } 00450 return cst.leaves_in_the_subtree(node); 00451 } 00452 */ 00453 00454 } // end namespace algorithm 00455 00456 } // end namespace sdsl 00457 00458 #endif