SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2009 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00021 #ifndef INCLUDED_SDSL_LCP_KURTZ 00022 #define INCLUDED_SDSL_LCP_KURTZ 00023 00024 #include "lcp.hpp" 00025 #include "int_vector.hpp" 00026 #include "algorithms.hpp" 00027 #include "iterators.hpp" 00028 #include "bitmagic.hpp" 00029 #include <iostream> 00030 #include <algorithm> // for lower_bound 00031 #include <cassert> 00032 #include <cstring> // for strlen 00033 #include <iomanip> 00034 #include <iterator> 00035 #include <vector> 00036 #include <utility> // for pair 00037 00038 namespace sdsl 00039 { 00040 00042 00047 template<uint8_t width=0> 00048 class lcp_kurtz 00049 { 00050 public: 00051 typedef typename int_vector<width>::value_type value_type; // STL Container requirement 00052 typedef random_access_const_iterator<lcp_kurtz> const_iterator;// STL Container requirement 00053 typedef const_iterator iterator; // STL Container requirement 00054 typedef const value_type const_reference; 00055 typedef const_reference reference; 00056 typedef const_reference* pointer; 00057 typedef const pointer const_pointer; 00058 typedef int_vector<>::size_type size_type; // STL Container requirement 00059 typedef ptrdiff_t difference_type; // STL Container requirement 00060 00061 typedef lcp_plain_tag lcp_category; 00062 00063 enum { fast_access = 0, 00064 text_order = 0, 00065 sa_order = 1 00066 }; // as the lcp_kurtz is not fast for texts with long repetition 00067 00068 template<class Cst> // template inner class which is used in CSTs to parametrize lcp classes 00069 class type // with information about the CST. Thanks Stefan Arnold! (2011-03-02) 00070 { 00071 public: 00072 typedef lcp_kurtz lcp_type; 00073 }; 00074 00075 private: 00076 00077 int_vector<8> m_small_lcp; // vector for lcp values < 255 00078 int_vector<width> m_big_lcp; // vector for lcp values > 254 00079 int_vector<width> m_big_lcp_idx; // index of the lcp entries in the lcp array 00080 00081 typedef std::pair<size_type, size_type> tPII; 00082 typedef std::vector<tPII> tVPII; 00083 00084 void copy(const lcp_kurtz& lcp_c) { 00085 m_small_lcp = lcp_c.m_small_lcp; 00086 m_big_lcp = lcp_c.m_big_lcp; 00087 m_big_lcp_idx = lcp_c.m_big_lcp_idx; 00088 } 00089 00090 public: 00092 lcp_kurtz() {} 00094 ~lcp_kurtz() {} 00096 lcp_kurtz(const lcp_kurtz& lcp_c) { 00097 copy(lcp_c); 00098 } 00099 00101 template<class Text, class Sa> 00102 lcp_kurtz(const Text& text, const Sa& sa); 00103 00105 template<uint8_t int_width, class size_type_class> 00106 lcp_kurtz(int_vector_file_buffer<int_width, size_type_class>& lcp_buf); 00107 00108 template<class Text, class Sa> 00109 void construct(const Text& text, const Sa& sa); 00110 00111 template<uint8_t int_width, class size_type_class> 00112 void construct(int_vector_file_buffer<int_width, size_type_class>& lcp_buf); 00113 00115 00118 size_type size()const { 00119 return m_small_lcp.size(); 00120 } 00121 00123 00126 static size_type max_size() { 00127 return int_vector<8>::max_size(); 00128 } 00129 00131 00134 bool empty()const { 00135 return m_small_lcp.empty(); 00136 } 00137 00139 00147 void swap(lcp_kurtz& lcp_c); 00148 00150 00153 const_iterator begin()const; 00154 00156 00159 const_iterator end()const; 00160 00162 00166 inline value_type operator[](size_type i)const; 00167 00169 00172 lcp_kurtz& operator=(const lcp_kurtz& lcp_c); 00173 00175 00180 bool operator==(const lcp_kurtz& lcp_c)const; 00181 00183 00188 bool operator!=(const lcp_kurtz& lcp_c)const; 00189 00191 00194 size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const; 00195 00197 00199 void load(std::istream& in); 00200 }; 00201 00202 // == template functions == 00203 00204 template<uint8_t width> 00205 template<class Text, class Sa> 00206 lcp_kurtz<width>::lcp_kurtz(const Text& text, const Sa& sa) 00207 { 00208 construct(text, sa); 00209 } 00210 00211 template<uint8_t width> 00212 template<class Text, class Sa> 00213 void lcp_kurtz<width>::construct(const Text& text, const Sa& sa) 00214 { 00215 if (sa.size() == 0) { 00216 return; 00217 } 00218 m_small_lcp = int_vector<8>(sa.size()); 00219 tVPII big_values; 00220 00221 // use Kasai algorithm to compute the lcp values 00222 typename Sa::size_type sa_1 = sa(0), l=0, max_l=0, max_big_idx=0; 00223 for (typename Sa::size_type i=0,j=0; i < sa.size(); ++i) { 00224 if (l) --l; 00225 if (sa_1) { 00226 j = sa[sa_1-1]; 00227 while (i+l < sa.size() and j+l < sa.size() and text[i+l]==text[j+l]) ++l; 00228 } else { 00229 l = 0; 00230 } 00231 if (l < 255) { 00232 m_small_lcp[ sa_1 ] = l; 00233 } else { 00234 m_small_lcp[ sa_1 ] = 255; 00235 big_values.push_back(tPII(sa_1, l)); 00236 if (l > max_l) max_l = l; 00237 if (sa_1 > max_big_idx) max_big_idx = sa_1; 00238 } 00239 sa_1 = sa.psi[sa_1]; 00240 } 00241 00242 sort(big_values.begin(), big_values.end()); 00243 m_big_lcp = int_vector<width>(big_values.size(), 0, bit_magic::l1BP(max_l)+1); 00244 m_big_lcp_idx = int_vector<width>(big_values.size(), 0, bit_magic::l1BP(max_big_idx)+1); 00245 for (tVPII::size_type i=0; i<big_values.size(); ++i) { 00246 m_big_lcp[i] = big_values[i].second; 00247 m_big_lcp_idx[i] = big_values[i].first; 00248 } 00249 } 00250 00251 00252 template<uint8_t width> 00253 template<uint8_t int_width, class size_type_class> 00254 lcp_kurtz<width>::lcp_kurtz(int_vector_file_buffer<int_width, size_type_class>& lcp_buf) 00255 { 00256 construct(lcp_buf); 00257 } 00258 00259 template<uint8_t width> 00260 template<uint8_t int_width, class size_type_class> 00261 void lcp_kurtz<width>::construct(int_vector_file_buffer<int_width, size_type_class>& lcp_buf) 00262 { 00263 m_small_lcp = int_vector<8>(lcp_buf.int_vector_size); 00264 typename int_vector<>::size_type l=0, max_l=0, max_big_idx=0, big_sum=0; 00265 lcp_buf.reset(); 00266 00267 for (size_type i=0, r_sum=0, r = lcp_buf.load_next_block(); r_sum < m_small_lcp.size();) { 00268 for (; i < r_sum+r; ++i) { 00269 if ((l=lcp_buf[i-r_sum]) < 255) { 00270 m_small_lcp[i] = l; 00271 } else { 00272 m_small_lcp[i] = 255; 00273 if (l > max_l) max_l = l; 00274 max_big_idx = i; 00275 ++big_sum; 00276 } 00277 } 00278 r_sum += r; 00279 r = lcp_buf.load_next_block(); 00280 } 00281 m_big_lcp = int_vector<>(big_sum, 0, bit_magic::l1BP(max_l)+1); 00282 m_big_lcp_idx = int_vector<>(big_sum, 0, bit_magic::l1BP(max_big_idx)+1); 00283 00284 lcp_buf.reset(); 00285 for (size_type i=0, r_sum=0, r = lcp_buf.load_next_block(),ii=0; r_sum<m_small_lcp.size();) { 00286 for (; i < r_sum+r; ++i) { 00287 if ((l=lcp_buf[i-r_sum]) >= 255) { 00288 m_big_lcp[ii] = l; 00289 m_big_lcp_idx[ii] = i; 00290 ++ii; 00291 } 00292 } 00293 r_sum += r; 00294 r = lcp_buf.load_next_block(); 00295 } 00296 } 00297 00298 template<uint8_t width> 00299 void lcp_kurtz<width>::swap(lcp_kurtz& lcp_c) 00300 { 00301 m_small_lcp.swap(lcp_c.m_small_lcp); 00302 m_big_lcp.swap(lcp_c.m_big_lcp); 00303 m_big_lcp_idx.swap(lcp_c.m_big_lcp_idx); 00304 } 00305 00306 template<uint8_t width> 00307 inline typename lcp_kurtz<width>::value_type lcp_kurtz<width>::operator[](size_type i)const 00308 { 00309 if (m_small_lcp[i]!=255) { 00310 return m_small_lcp[i]; 00311 } else { 00312 size_type idx = lower_bound(m_big_lcp_idx.begin(), m_big_lcp_idx.end(), i) - m_big_lcp_idx.begin(); 00313 return m_big_lcp[idx]; 00314 } 00315 } 00316 00317 00318 template<uint8_t width> 00319 typename lcp_kurtz<width>::size_type lcp_kurtz<width>::serialize(std::ostream& out, structure_tree_node* v, std::string name)const 00320 { 00321 structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); 00322 size_type written_bytes = 0; 00323 written_bytes += m_small_lcp.serialize(out, child, "small_lcp"); 00324 written_bytes += m_big_lcp.serialize(out, child, "large_lcp"); 00325 written_bytes += m_big_lcp_idx.serialize(out, child, "large_lcp_idx"); 00326 structure_tree::add_size(child, written_bytes); 00327 return written_bytes; 00328 } 00329 00330 template<uint8_t width> 00331 void lcp_kurtz<width>::load(std::istream& in) 00332 { 00333 m_small_lcp.load(in); 00334 m_big_lcp.load(in); 00335 m_big_lcp_idx.load(in); 00336 } 00337 00338 00339 template<uint8_t width> 00340 lcp_kurtz<width>& lcp_kurtz<width>::operator=(const lcp_kurtz& lcp_c) 00341 { 00342 if (this != &lcp_c) { 00343 copy(lcp_c); 00344 } 00345 return *this; 00346 } 00347 00348 00349 template<uint8_t width> 00350 bool lcp_kurtz<width>::operator==(const lcp_kurtz& lcp_c)const 00351 { 00352 if (this == &lcp_c) 00353 return true; 00354 return m_small_lcp == lcp_c.m_small_lcp and m_big_lcp == lcp_c.m_big_lcp and m_big_lcp_idx == lcp_c.m_big_lcp_idx; 00355 } 00356 00357 template<uint8_t width> 00358 bool lcp_kurtz<width>::operator!=(const lcp_kurtz& lcp_c)const 00359 { 00360 return !(*this == lcp_c); 00361 } 00362 00363 template<uint8_t width> 00364 typename lcp_kurtz<width>::const_iterator lcp_kurtz<width>::begin()const 00365 { 00366 return const_iterator(this, 0); 00367 } 00368 00369 template<uint8_t width> 00370 typename lcp_kurtz<width>::const_iterator lcp_kurtz<width>::end()const 00371 { 00372 return const_iterator(this, size()); 00373 } 00374 00375 00376 00377 } // end namespace sdsl 00378 00379 #endif