SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
 All Classes Namespaces Files Functions Variables Typedefs Friends
sdsl/include/sdsl/lcp_kurtz.hpp
Go to the documentation of this file.
00001 /* sdsl - succinct data structures library
00002     Copyright (C) 2009 Simon Gog
00003 
00004     This program is free software: you can redistribute it and/or modify
00005     it under the terms of the GNU General Public License as published by
00006     the Free Software Foundation, either version 3 of the License, or
00007     (at your option) any later version.
00008 
00009     This program is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012     GNU General Public License for more details.
00013 
00014     You should have received a copy of the GNU General Public License
00015     along with this program.  If not, see http://www.gnu.org/licenses/ .
00016 */
00021 #ifndef INCLUDED_SDSL_LCP_KURTZ
00022 #define INCLUDED_SDSL_LCP_KURTZ
00023 
00024 #include "lcp.hpp"
00025 #include "int_vector.hpp"
00026 #include "algorithms.hpp"
00027 #include "iterators.hpp"
00028 #include "bitmagic.hpp"
00029 #include <iostream>
00030 #include <algorithm> // for lower_bound
00031 #include <cassert>
00032 #include <cstring> // for strlen
00033 #include <iomanip>
00034 #include <iterator>
00035 #include <vector>
00036 #include <utility> // for pair
00037 
00038 namespace sdsl
00039 {
00040 
00042 
00047 template<uint8_t width=0>
00048 class lcp_kurtz
00049 {
00050     public:
00051         typedef typename int_vector<width>::value_type           value_type;    // STL Container requirement
00052         typedef random_access_const_iterator<lcp_kurtz>          const_iterator;// STL Container requirement
00053         typedef const_iterator                                                           iterator;              // STL Container requirement
00054         typedef const value_type                                                         const_reference;
00055         typedef const_reference                                                          reference;
00056         typedef const_reference*                                                         pointer;
00057         typedef const pointer                                                            const_pointer;
00058         typedef int_vector<>::size_type                                          size_type;             // STL Container requirement
00059         typedef ptrdiff_t                                                                        difference_type; // STL Container requirement
00060 
00061         typedef lcp_plain_tag                                                            lcp_category;
00062 
00063         enum { fast_access = 0,
00064                text_order  = 0,
00065                sa_order   = 1
00066              }; // as the lcp_kurtz is not fast for texts with long repetition
00067 
00068         template<class Cst>  // template inner class which is used in CSTs to parametrize lcp classes
00069         class type           // with information about the CST. Thanks Stefan Arnold! (2011-03-02)
00070         {
00071             public:
00072                 typedef lcp_kurtz lcp_type;
00073         };
00074 
00075     private:
00076 
00077         int_vector<8>      m_small_lcp;         // vector for lcp values < 255
00078         int_vector<width>   m_big_lcp;          // vector for lcp values > 254
00079         int_vector<width>   m_big_lcp_idx;   // index of the lcp entries in the lcp array
00080 
00081         typedef std::pair<size_type, size_type> tPII;
00082         typedef std::vector<tPII> tVPII;
00083 
00084         void copy(const lcp_kurtz& lcp_c) {
00085             m_small_lcp         = lcp_c.m_small_lcp;
00086             m_big_lcp           = lcp_c.m_big_lcp;
00087             m_big_lcp_idx       = lcp_c.m_big_lcp_idx;
00088         }
00089 
00090     public:
00092         lcp_kurtz() {}
00094         ~lcp_kurtz() {}
00096         lcp_kurtz(const lcp_kurtz& lcp_c) {
00097             copy(lcp_c);
00098         }
00099 
00101         template<class Text, class Sa>
00102         lcp_kurtz(const Text& text, const Sa& sa);
00103 
00105         template<uint8_t int_width, class size_type_class>
00106         lcp_kurtz(int_vector_file_buffer<int_width, size_type_class>& lcp_buf);
00107 
00108         template<class Text, class Sa>
00109         void construct(const Text& text, const Sa& sa);
00110 
00111         template<uint8_t int_width, class size_type_class>
00112         void construct(int_vector_file_buffer<int_width, size_type_class>& lcp_buf);
00113 
00115 
00118         size_type size()const {
00119             return m_small_lcp.size();
00120         }
00121 
00123 
00126         static size_type max_size() {
00127             return int_vector<8>::max_size();
00128         }
00129 
00131 
00134         bool empty()const {
00135             return m_small_lcp.empty();
00136         }
00137 
00139 
00147         void swap(lcp_kurtz& lcp_c);
00148 
00150 
00153         const_iterator begin()const;
00154 
00156 
00159         const_iterator end()const;
00160 
00162 
00166         inline value_type operator[](size_type i)const;
00167 
00169 
00172         lcp_kurtz& operator=(const lcp_kurtz& lcp_c);
00173 
00175 
00180         bool operator==(const lcp_kurtz& lcp_c)const;
00181 
00183 
00188         bool operator!=(const lcp_kurtz& lcp_c)const;
00189 
00191 
00194         size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const;
00195 
00197 
00199         void load(std::istream& in);
00200 };
00201 
00202 // == template functions ==
00203 
00204 template<uint8_t width>
00205 template<class Text, class Sa>
00206 lcp_kurtz<width>::lcp_kurtz(const Text& text, const Sa& sa)
00207 {
00208     construct(text, sa);
00209 }
00210 
00211 template<uint8_t width>
00212 template<class Text, class Sa>
00213 void lcp_kurtz<width>::construct(const Text& text, const Sa& sa)
00214 {
00215     if (sa.size() == 0) {
00216         return;
00217     }
00218     m_small_lcp = int_vector<8>(sa.size());
00219     tVPII big_values;
00220 
00221     // use Kasai algorithm to compute the lcp values
00222     typename Sa::size_type sa_1 = sa(0), l=0, max_l=0, max_big_idx=0;
00223     for (typename Sa::size_type i=0,j=0; i < sa.size(); ++i) {
00224         if (l) --l;
00225         if (sa_1) {
00226             j = sa[sa_1-1];
00227             while (i+l < sa.size() and j+l < sa.size() and text[i+l]==text[j+l]) ++l;
00228         } else {
00229             l = 0;
00230         }
00231         if (l < 255) {
00232             m_small_lcp[ sa_1 ] = l;
00233         } else {
00234             m_small_lcp[ sa_1 ] = 255;
00235             big_values.push_back(tPII(sa_1, l));
00236             if (l > max_l) max_l = l;
00237             if (sa_1 > max_big_idx) max_big_idx = sa_1;
00238         }
00239         sa_1 = sa.psi[sa_1];
00240     }
00241 
00242     sort(big_values.begin(), big_values.end());
00243     m_big_lcp           = int_vector<width>(big_values.size(), 0, bit_magic::l1BP(max_l)+1);
00244     m_big_lcp_idx       = int_vector<width>(big_values.size(), 0, bit_magic::l1BP(max_big_idx)+1);
00245     for (tVPII::size_type i=0; i<big_values.size(); ++i) {
00246         m_big_lcp[i]     = big_values[i].second;
00247         m_big_lcp_idx[i] = big_values[i].first;
00248     }
00249 }
00250 
00251 
00252 template<uint8_t width>
00253 template<uint8_t int_width, class size_type_class>
00254 lcp_kurtz<width>::lcp_kurtz(int_vector_file_buffer<int_width, size_type_class>& lcp_buf)
00255 {
00256     construct(lcp_buf);
00257 }
00258 
00259 template<uint8_t width>
00260 template<uint8_t int_width, class size_type_class>
00261 void lcp_kurtz<width>::construct(int_vector_file_buffer<int_width, size_type_class>& lcp_buf)
00262 {
00263     m_small_lcp = int_vector<8>(lcp_buf.int_vector_size);
00264     typename int_vector<>::size_type l=0, max_l=0, max_big_idx=0, big_sum=0;
00265     lcp_buf.reset();
00266 
00267     for (size_type i=0, r_sum=0, r = lcp_buf.load_next_block(); r_sum < m_small_lcp.size();) {
00268         for (; i < r_sum+r; ++i) {
00269             if ((l=lcp_buf[i-r_sum]) < 255) {
00270                 m_small_lcp[i] = l;
00271             } else {
00272                 m_small_lcp[i] = 255;
00273                 if (l > max_l) max_l = l;
00274                 max_big_idx = i;
00275                 ++big_sum;
00276             }
00277         }
00278         r_sum   +=      r;
00279         r               =       lcp_buf.load_next_block();
00280     }
00281     m_big_lcp           = int_vector<>(big_sum, 0, bit_magic::l1BP(max_l)+1);
00282     m_big_lcp_idx       = int_vector<>(big_sum, 0, bit_magic::l1BP(max_big_idx)+1);
00283 
00284     lcp_buf.reset();
00285     for (size_type i=0, r_sum=0, r = lcp_buf.load_next_block(),ii=0; r_sum<m_small_lcp.size();) {
00286         for (; i < r_sum+r; ++i) {
00287             if ((l=lcp_buf[i-r_sum]) >= 255) {
00288                 m_big_lcp[ii] = l;
00289                 m_big_lcp_idx[ii] = i;
00290                 ++ii;
00291             }
00292         }
00293         r_sum   +=      r;
00294         r               =       lcp_buf.load_next_block();
00295     }
00296 }
00297 
00298 template<uint8_t width>
00299 void lcp_kurtz<width>::swap(lcp_kurtz& lcp_c)
00300 {
00301     m_small_lcp.swap(lcp_c.m_small_lcp);
00302     m_big_lcp.swap(lcp_c.m_big_lcp);
00303     m_big_lcp_idx.swap(lcp_c.m_big_lcp_idx);
00304 }
00305 
00306 template<uint8_t width>
00307 inline typename lcp_kurtz<width>::value_type lcp_kurtz<width>::operator[](size_type i)const
00308 {
00309     if (m_small_lcp[i]!=255) {
00310         return m_small_lcp[i];
00311     } else {
00312         size_type idx = lower_bound(m_big_lcp_idx.begin(), m_big_lcp_idx.end(), i) - m_big_lcp_idx.begin();
00313         return m_big_lcp[idx];
00314     }
00315 }
00316 
00317 
00318 template<uint8_t width>
00319 typename lcp_kurtz<width>::size_type lcp_kurtz<width>::serialize(std::ostream& out, structure_tree_node* v, std::string name)const
00320 {
00321     structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
00322     size_type written_bytes = 0;
00323     written_bytes += m_small_lcp.serialize(out, child, "small_lcp");
00324     written_bytes += m_big_lcp.serialize(out, child, "large_lcp");
00325     written_bytes += m_big_lcp_idx.serialize(out, child, "large_lcp_idx");
00326     structure_tree::add_size(child, written_bytes);
00327     return written_bytes;
00328 }
00329 
00330 template<uint8_t width>
00331 void lcp_kurtz<width>::load(std::istream& in)
00332 {
00333     m_small_lcp.load(in);
00334     m_big_lcp.load(in);
00335     m_big_lcp_idx.load(in);
00336 }
00337 
00338 
00339 template<uint8_t width>
00340 lcp_kurtz<width>& lcp_kurtz<width>::operator=(const lcp_kurtz& lcp_c)
00341 {
00342     if (this != &lcp_c) {
00343         copy(lcp_c);
00344     }
00345     return *this;
00346 }
00347 
00348 
00349 template<uint8_t width>
00350 bool lcp_kurtz<width>::operator==(const lcp_kurtz& lcp_c)const
00351 {
00352     if (this == &lcp_c)
00353         return true;
00354     return m_small_lcp == lcp_c.m_small_lcp and m_big_lcp == lcp_c.m_big_lcp and m_big_lcp_idx == lcp_c.m_big_lcp_idx;
00355 }
00356 
00357 template<uint8_t width>
00358 bool lcp_kurtz<width>::operator!=(const lcp_kurtz& lcp_c)const
00359 {
00360     return !(*this == lcp_c);
00361 }
00362 
00363 template<uint8_t width>
00364 typename lcp_kurtz<width>::const_iterator lcp_kurtz<width>::begin()const
00365 {
00366     return const_iterator(this, 0);
00367 }
00368 
00369 template<uint8_t width>
00370 typename lcp_kurtz<width>::const_iterator lcp_kurtz<width>::end()const
00371 {
00372     return const_iterator(this, size());
00373 }
00374 
00375 
00376 
00377 } // end namespace sdsl
00378 
00379 #endif