SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2010 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00021 #ifndef INCLUDED_SDSL_LCP_WT 00022 #define INCLUDED_SDSL_LCP_WT 00023 00024 #include "lcp.hpp" 00025 #include "int_vector.hpp" 00026 #include "algorithms.hpp" 00027 #include "iterators.hpp" 00028 #include "bitmagic.hpp" 00029 #include "wt_huff.hpp" 00030 #include "select_support_bs.hpp" // dummy select support for wavelet tree, as we don't use select in this application 00031 #include "util.hpp" 00032 #include "testutils.hpp" 00033 #include <iostream> 00034 #include <algorithm> // for lower_bound 00035 #include <cassert> 00036 #include <cstring> // for strlen 00037 #include <iomanip> 00038 #include <iterator> 00039 #include <vector> 00040 #include <utility> // for pair 00041 #include <stdexcept> 00042 00043 namespace sdsl 00044 { 00045 00047 00052 template<uint8_t width=0> 00053 class lcp_wt 00054 { 00055 public: 00056 typedef typename int_vector<width>::value_type value_type; // STL Container requirement 00057 typedef random_access_const_iterator<lcp_wt> const_iterator;// STL Container requirement 00058 typedef const_iterator iterator; // STL Container requirement 00059 typedef const value_type const_reference; 00060 typedef const_reference reference; 00061 typedef const_reference* pointer; 00062 typedef const pointer const_pointer; 00063 typedef int_vector<>::size_type size_type; // STL Container requirement 00064 typedef ptrdiff_t difference_type; // STL Container requirement 00065 00066 typedef lcp_plain_tag lcp_category; 00067 00068 enum { fast_access = 0, 00069 text_order = 0, 00070 sa_order = 1 00071 }; // as the lcp_wt is not fast for texts with long repetition 00072 00073 template<class Cst> // template inner class which is used in CSTs to parametrize lcp classes 00074 class type // with information about the CST. Thanks Stefan Arnold! (2011-03-02) 00075 { 00076 public: 00077 typedef lcp_wt lcp_type; 00078 }; 00079 00080 private: 00081 typedef select_support_bs< rank_support_v<> > tDummySS; 00082 wt_huff<bit_vector, rank_support_v<>, tDummySS, tDummySS > m_small_lcp; // vector for lcp values < 255 00083 int_vector<width> m_big_lcp; // vector for lcp values > 254 00084 00085 typedef std::pair<size_type, size_type> tPII; 00086 typedef std::vector<tPII> tVPII; 00087 00088 void copy(const lcp_wt& lcp_c) { 00089 m_small_lcp = lcp_c.m_small_lcp; 00090 m_big_lcp = lcp_c.m_big_lcp; 00091 } 00092 00093 public: 00095 lcp_wt() {} 00097 ~lcp_wt() {} 00099 lcp_wt(const lcp_wt& lcp_c) { 00100 copy(lcp_c); 00101 } 00102 00104 template<class Text, class Sa> 00105 lcp_wt(const Text& text, const Sa& sa); 00106 00108 template<uint8_t int_width, class size_type_class> 00109 lcp_wt(int_vector_file_buffer<int_width, size_type_class>& lcp_buf); 00110 00111 template<class Text, class Sa> 00112 void construct(const Text& text, const Sa& sa); 00113 00114 template<uint8_t int_width, class size_type_class> 00115 void construct(int_vector_file_buffer<int_width, size_type_class>& lcp_buf); 00116 00118 00121 size_type size()const { 00122 return m_small_lcp.size(); 00123 } 00124 00126 00129 static size_type max_size() { 00130 return int_vector<8>::max_size(); 00131 } 00132 00134 00137 bool empty()const { 00138 return m_small_lcp.size()==0; 00139 } 00140 00142 00150 void swap(lcp_wt& lcp_c); 00151 00153 00156 const_iterator begin()const; 00157 00159 00162 const_iterator end()const; 00163 00165 00169 inline value_type operator[](size_type i)const; 00170 00172 00175 lcp_wt& operator=(const lcp_wt& lcp_c); 00176 00178 00183 bool operator==(const lcp_wt& lcp_c)const; 00184 00186 00191 bool operator!=(const lcp_wt& lcp_c)const; 00192 00194 00197 size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const; 00198 00200 00202 void load(std::istream& in); 00203 }; 00204 00205 // == template functions == 00206 00207 template<uint8_t width> 00208 template<class Text, class Sa> 00209 lcp_wt<width>::lcp_wt(const Text& text, const Sa& sa) 00210 { 00211 construct(text, sa); 00212 } 00213 00214 template<uint8_t width> 00215 template<class Text, class Sa> 00216 void lcp_wt<width>::construct(const Text& text, const Sa& sa) 00217 { 00218 if (sa.size() == 0) { 00219 return; 00220 } 00221 throw std::logic_error("This constructor of lcp_wt is not yet implemented!"); 00222 /* 00223 */ 00224 } 00225 00226 00227 template<uint8_t width> 00228 template<uint8_t int_width, class size_type_class> 00229 lcp_wt<width>::lcp_wt(int_vector_file_buffer<int_width, size_type_class>& lcp_buf) 00230 { 00231 construct(lcp_buf); 00232 } 00233 00234 template<uint8_t width> 00235 template<uint8_t int_width, class size_type_class> 00236 void lcp_wt<width>::construct(int_vector_file_buffer<int_width, size_type_class>& lcp_buf) 00237 { 00238 std::string temp_file = "/tmp/lcp_sml" + util::to_string(util::get_pid()) + "_" + util::to_string(util::get_id()) ;// TODO: remove absolute file name 00239 // write_R_output("lcp","construct sml","begin"); 00240 typename int_vector<>::size_type l=0, max_l=0, big_sum=0, n = lcp_buf.int_vector_size; 00241 { 00242 int_vector<8> small_lcp = int_vector<8>(n); 00243 lcp_buf.reset(); 00244 for (size_type i=0, r_sum=0, r = lcp_buf.load_next_block(); r_sum < n;) { 00245 for (; i < r_sum+r; ++i) { 00246 if ((l=lcp_buf[i-r_sum]) < 255) { 00247 small_lcp[i] = l; 00248 } else { 00249 small_lcp[i] = 255; 00250 if (l > max_l) max_l = l; 00251 ++big_sum; 00252 } 00253 } 00254 r_sum += r; r = lcp_buf.load_next_block(); 00255 } 00256 util::store_to_file(small_lcp, temp_file.c_str()); 00257 } 00258 // write_R_output("lcp","construct sml","end"); 00259 int_vector_file_buffer<8> lcp_sml_buf(temp_file.c_str()); 00260 m_small_lcp.construct(lcp_sml_buf, lcp_sml_buf.int_vector_size); 00261 std::remove(temp_file.c_str()); 00262 // write_R_output("lcp","construct big","begin"); 00263 m_big_lcp = int_vector<>(big_sum, 0, bit_magic::l1BP(max_l)+1); 00264 { 00265 lcp_buf.reset(); 00266 for (size_type i=0, ii=0, r_sum=0, r = lcp_buf.load_next_block(); r_sum < n;) { 00267 for (; i < r_sum+r; ++i) { 00268 if (lcp_buf[i-r_sum] >= 255) { 00269 m_big_lcp[ ii++ ] = lcp_buf[i-r_sum]; 00270 } 00271 } 00272 r_sum += r; r = lcp_buf.load_next_block(); 00273 } 00274 } 00275 // write_R_output("lcp","construct big","end"); 00276 } 00277 00278 template<uint8_t width> 00279 void lcp_wt<width>::swap(lcp_wt& lcp_c) 00280 { 00281 m_small_lcp.swap(lcp_c.m_small_lcp); 00282 m_big_lcp.swap(lcp_c.m_big_lcp); 00283 } 00284 00285 template<uint8_t width> 00286 inline typename lcp_wt<width>::value_type lcp_wt<width>::operator[](size_type i)const 00287 { 00288 if (m_small_lcp[i]!=255) { 00289 return m_small_lcp[i]; 00290 } else { 00291 return m_big_lcp[ m_small_lcp.rank(i, 255) ]; 00292 } 00293 } 00294 00295 00296 template<uint8_t width> 00297 typename lcp_wt<width>::size_type lcp_wt<width>::serialize(std::ostream& out, structure_tree_node* v, std::string name)const 00298 { 00299 structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); 00300 size_type written_bytes = 0; 00301 written_bytes += m_small_lcp.serialize(out, child, "small_lcp"); 00302 written_bytes += m_big_lcp.serialize(out, child, "large_lcp"); 00303 structure_tree::add_size(child, written_bytes); 00304 return written_bytes; 00305 } 00306 00307 template<uint8_t width> 00308 void lcp_wt<width>::load(std::istream& in) 00309 { 00310 m_small_lcp.load(in); 00311 m_big_lcp.load(in); 00312 } 00313 00314 00315 template<uint8_t width> 00316 lcp_wt<width>& lcp_wt<width>::operator=(const lcp_wt& lcp_c) 00317 { 00318 if (this != &lcp_c) { 00319 copy(lcp_c); 00320 } 00321 return *this; 00322 } 00323 00324 00325 template<uint8_t width> 00326 bool lcp_wt<width>::operator==(const lcp_wt& lcp_c)const 00327 { 00328 if (this == &lcp_c) 00329 return true; 00330 return m_small_lcp == lcp_c.m_small_lcp and m_big_lcp == lcp_c.m_big_lcp; 00331 } 00332 00333 template<uint8_t width> 00334 bool lcp_wt<width>::operator!=(const lcp_wt& lcp_c)const 00335 { 00336 return !(*this == lcp_c); 00337 } 00338 00339 template<uint8_t width> 00340 typename lcp_wt<width>::const_iterator lcp_wt<width>::begin()const 00341 { 00342 return const_iterator(this, 0); 00343 } 00344 00345 template<uint8_t width> 00346 typename lcp_wt<width>::const_iterator lcp_wt<width>::end()const 00347 { 00348 return const_iterator(this, size()); 00349 } 00350 00351 00352 00353 } // end namespace sdsl 00354 00355 #endif