SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2008 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00021 #include "int_vector.hpp" 00022 #include "elias_delta_coder.hpp" 00023 #include "iterators.hpp" 00024 00025 #ifndef SDSL_ENC_VECTOR 00026 #define SDSL_ENC_VECTOR 00027 00029 namespace sdsl 00030 { 00031 00032 template<uint8_t fixedIntWidth> 00033 struct enc_vector_trait { 00034 typedef int_vector<0> int_vector_type; 00035 }; 00036 00037 template<> 00038 struct enc_vector_trait<32> { 00039 typedef int_vector<32> int_vector_type; 00040 }; 00041 00043 00050 template<class Coder=coder::elias_delta, 00051 uint32_t SampleDens = 8, uint8_t fixedIntWidth=0> 00052 class enc_vector 00053 { 00054 public: 00055 typedef uint64_t value_type; // STL Container requirement 00056 typedef random_access_const_iterator<enc_vector> iterator;// STL Container requirement 00057 typedef iterator const_iterator; // STL Container requirement 00058 typedef const value_type reference; 00059 typedef const value_type const_reference; 00060 typedef const value_type* const_pointer; 00061 typedef ptrdiff_t difference_type;// STL Container requirement 00062 typedef int_vector<>::size_type size_type; // STL Container requirement 00063 typedef Coder coder; 00064 typedef typename enc_vector_trait<fixedIntWidth>::int_vector_type int_vector_type; 00065 static const uint32_t sample_dens = SampleDens; // Required member 00066 00067 int_vector<0> m_z; // compressed bit stream 00068 private: 00069 int_vector_type m_sample_vals_and_pointer; 00070 size_type m_elements; // number of elements 00071 00072 // workaround function for the constructor 00073 void construct() { 00074 m_elements = 0; 00075 } 00076 void copy(const enc_vector& v); 00077 00078 void clear() { 00079 m_z.resize(0); 00080 m_elements = 0; 00081 m_sample_vals_and_pointer.resize(0); 00082 } 00083 00084 public: 00086 enc_vector() { 00087 construct(); 00088 }; 00090 00093 enc_vector(const enc_vector& v); 00094 00096 00099 template<class Container> 00100 enc_vector(const Container& c) { 00101 construct(); 00102 init(c); 00103 } 00104 00106 /* 00107 \param v_buf A int_vector_file_buf. 00108 \pre No two adjacent values should be equal. 00109 */ 00110 template<uint8_t int_width, class size_type_class> 00111 enc_vector(int_vector_file_buffer<int_width, size_type_class>& v_buf) { 00112 construct(); 00113 init(v_buf); 00114 } 00115 00116 template<class Container> 00117 void init(const Container& c); 00118 00119 template<uint8_t int_width, class size_type_class> 00120 void init(int_vector_file_buffer<int_width, size_type_class>& v_buf); 00121 00123 ~enc_vector() { 00124 } 00125 00127 00131 size_type size()const; 00132 00134 00136 static size_type max_size(); 00137 00139 00144 bool empty() const; 00145 00147 00155 void swap(enc_vector& v); 00156 00158 00162 const const_iterator begin()const; 00163 00165 00169 const const_iterator end()const; 00170 00171 // Iterator that points to the last element of the enc_vector. 00172 /* 00173 * Required for the Container Concept of the STL. 00174 * \sa rend() 00175 */ 00176 // reverse_iterator rbegin()const; 00177 00178 // Iterator that points to the position before the first element of the enc_vector. 00179 /* 00180 * Required for the Container Concept of the STL 00181 * \sa rbegin() 00182 */ 00183 // reverse_iterator rend()const; 00184 00186 00190 value_type operator[](size_type i)const; 00191 00193 00196 enc_vector& operator=(const enc_vector& v); 00197 00199 00207 bool operator==(const enc_vector& v)const; 00208 00210 00216 bool operator!=(const enc_vector& v)const; 00217 00219 00222 size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const; 00223 00225 void load(std::istream& in); 00226 00228 00231 value_type sample(const size_type i) const; 00232 00233 uint32_t get_sample_dens() const; 00234 00239 // template<class Iterator> 00240 void get_inter_sampled_values(const size_type i, uint64_t* it)const { 00241 *(it++) = 0; 00242 if (i*SampleDens + SampleDens - 1 < size()) { 00243 Coder::template decode<true, true>(m_z.data(), m_sample_vals_and_pointer[(i<<1)+1], SampleDens - 1, it); 00244 } else { 00245 assert(i*SampleDens < size()); 00246 Coder::template decode<true, true>(m_z.data(), m_sample_vals_and_pointer[(i<<1)+1], size()-i*SampleDens - 1, it); 00247 } 00248 }; 00249 }; 00250 00251 00252 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00253 inline uint32_t enc_vector<Coder, SampleDens, fixedIntWidth>::get_sample_dens() const 00254 { 00255 return SampleDens; 00256 } 00257 00258 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00259 inline typename enc_vector<Coder, SampleDens,fixedIntWidth>::value_type enc_vector<Coder, SampleDens,fixedIntWidth>::operator[](const size_type i)const 00260 { 00261 assert(i+1 != 0); 00262 #ifdef SDSL_DEBUG 00263 if (i >= m_elements) { 00264 throw std::out_of_range("OUT_OF_RANGE_ERROR: enc_vector::operator[](size_type); i >= size()!"); 00265 return 0; 00266 } 00267 #endif 00268 size_type idx = i/get_sample_dens(); 00269 return m_sample_vals_and_pointer[idx<<1] + Coder::decode_prefix_sum(m_z.data(), m_sample_vals_and_pointer[(idx<<1)+1], i-SampleDens*idx); 00270 } 00271 00272 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00273 inline typename enc_vector<Coder, SampleDens,fixedIntWidth>::value_type enc_vector<Coder, SampleDens,fixedIntWidth>::sample(const size_type i)const 00274 { 00275 assert(i*get_sample_dens()+1 != 0); 00276 #ifdef SDSL_DEBUG 00277 if (i*get_sample_dens() >= m_elements) { 00278 throw std::out_of_range("OUT_OF_RANGE_ERROR: enc_vector::sample(size_type); i*get_sample_dens() >= size()!"); 00279 return 0; 00280 } 00281 #endif 00282 return m_sample_vals_and_pointer[i<<1]; 00283 } 00284 00285 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00286 inline enc_vector<>::size_type enc_vector<Coder, SampleDens,fixedIntWidth>::size()const 00287 { 00288 return m_elements; 00289 } 00290 00291 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00292 inline enc_vector<>::size_type enc_vector<Coder, SampleDens,fixedIntWidth>::max_size() 00293 { 00294 return int_vector<>::max_size()/2; // each element could possible occupy double space with selfdelimiting codes 00295 } 00296 00297 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00298 inline bool enc_vector<Coder, SampleDens,fixedIntWidth>::empty()const 00299 { 00300 return 0==m_elements; 00301 } 00302 00303 00304 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00305 void enc_vector<Coder, SampleDens,fixedIntWidth>::copy(const enc_vector<Coder, SampleDens,fixedIntWidth>& v) 00306 { 00307 m_z = v.m_z; // copy compressed bit stream 00308 m_sample_vals_and_pointer = v.m_sample_vals_and_pointer; // copy sample values 00309 m_elements = v.m_elements; // copy number of stored elements 00310 } 00311 00312 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00313 enc_vector<Coder, SampleDens,fixedIntWidth>::enc_vector(const enc_vector& v) 00314 { 00315 copy(v); 00316 } 00317 00318 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00319 enc_vector<Coder, SampleDens,fixedIntWidth>& enc_vector<Coder, SampleDens,fixedIntWidth>::operator=(const enc_vector<Coder, SampleDens,fixedIntWidth>& v) 00320 { 00321 if (this != &v) {// if v and _this_ are not the same object 00322 copy(v); 00323 } 00324 return *this; 00325 } 00326 00327 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00328 bool enc_vector<Coder, SampleDens,fixedIntWidth>::operator==(const enc_vector<Coder, SampleDens,fixedIntWidth>& v)const 00329 { 00330 if (this == &v) 00331 return true; 00332 return m_elements == v.m_elements 00333 and m_z == v.m_z 00334 and m_sample_vals_and_pointer == v.m_sample_vals_and_pointer; 00335 } 00336 00337 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00338 bool enc_vector<Coder, SampleDens,fixedIntWidth>::operator!=(const enc_vector<Coder, SampleDens,fixedIntWidth>& v)const 00339 { 00340 return !(*this == v); 00341 } 00342 00343 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00344 void enc_vector<Coder, SampleDens,fixedIntWidth>::swap(enc_vector<Coder, SampleDens,fixedIntWidth>& v) 00345 { 00346 if (this != &v) { // if v and _this_ are not the same object 00347 m_z.swap(v.m_z); // swap compressed bit streams 00348 m_sample_vals_and_pointer.swap(v.m_sample_vals_and_pointer); 00349 std::swap(m_elements, v.m_elements);// swap the number of elements 00350 } 00351 } 00352 00353 00354 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00355 template<class Container> 00356 void enc_vector<Coder, SampleDens,fixedIntWidth>::init(const Container& c) 00357 { 00358 // clear bit_vectors 00359 clear(); 00360 00361 if (c.empty()) // if c is empty there is nothing to do... 00362 return; 00363 typename Container::const_iterator it = c.begin(), end = c.end(); 00364 typename Container::value_type v1 = *it, v2, max_sample_value=0, x; 00365 size_type samples=0; 00366 size_type z_size = 0; 00367 // (1) Calculate maximal value of samples and of deltas 00368 for (size_type i=0, no_sample=0; it != end; ++it,++i, --no_sample) { 00369 v2 = *it; 00370 if (!no_sample) { // add a sample 00371 no_sample = get_sample_dens(); 00372 if (max_sample_value < v2) max_sample_value = v2; 00373 ++samples; 00374 } else { 00375 z_size += Coder::encoding_length(v2-v1); 00376 } 00377 v1=v2; 00378 } 00379 // (2) Write sample values and deltas 00380 { 00381 if (max_sample_value > z_size+1) 00382 m_sample_vals_and_pointer.set_int_width(bit_magic::l1BP(max_sample_value) + 1); 00383 else 00384 m_sample_vals_and_pointer.set_int_width(bit_magic::l1BP(z_size+1) + 1); 00385 m_sample_vals_and_pointer.resize(2*samples+2); // add 2 for last entry 00386 util::set_zero_bits(m_sample_vals_and_pointer); 00387 typename int_vector_type::iterator sv_it = m_sample_vals_and_pointer.begin(); 00388 z_size = 0; 00389 size_type no_sample=0; 00390 for (it = c.begin(); it != end; ++it, --no_sample) { 00391 v2 = *it; 00392 if (!no_sample) { // add a sample 00393 no_sample = get_sample_dens(); 00394 *sv_it = v2; ++sv_it; 00395 *sv_it = z_size; ++sv_it; 00396 } else { 00397 x = v2-v1; 00398 // if (v2 == v1) { 00399 // throw std::logic_error("enc_vector cannot decode adjacent equal values!"); 00400 // } 00401 z_size += Coder::encoding_length(x); 00402 } 00403 v1=v2; 00404 } 00405 *sv_it = 0; ++sv_it; // initialize 00406 *sv_it = z_size+1; ++sv_it; // last entry 00407 00408 util::assign(m_z, int_vector<>(z_size, 0, 1)); 00409 uint64_t* z_data = Coder::raw_data(m_z); 00410 uint8_t offset = 0; 00411 no_sample = 0; 00412 for (it = c.begin(); it != end; ++it, --no_sample) { 00413 v2 = *it; 00414 if (!no_sample) { // add a sample 00415 no_sample = get_sample_dens(); 00416 } else { 00417 Coder::encode(v2-v1, z_data, offset); 00418 } 00419 v1=v2; 00420 } 00421 // Coder::encode(delta_c, m_z); // encode delta_c to m_z 00422 } 00423 // delta_c.resize(0); 00424 //std::cerr<<"Finished "<<std::endl;, 00425 m_elements = c.size(); 00426 } 00427 00428 00429 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00430 template<uint8_t int_width, class size_type_class> 00431 void enc_vector<Coder, SampleDens,fixedIntWidth>::init(int_vector_file_buffer<int_width, size_type_class>& v_buf) 00432 { 00433 // clear bit_vectors 00434 clear(); 00435 size_type n = v_buf.int_vector_size; 00436 if (n == 0) // if c is empty there is nothing to do... 00437 return; 00438 v_buf.reset(); 00439 value_type v1=0, v2=0, max_sample_value=0; 00440 size_type samples=0, z_size=0; 00441 const size_type sd = get_sample_dens(); 00442 // (1) Calculate maximal value of samples and of deltas 00443 for (size_type i=0, r_sum=0, r = v_buf.load_next_block(), no_sample = 0; r_sum < n;) { 00444 for (; i < r_sum+r; ++i, --no_sample) { 00445 v2 = v_buf[i-r_sum]; 00446 if (!no_sample) { // is sample 00447 no_sample = sd; 00448 if (max_sample_value < v2) max_sample_value = v2; 00449 ++samples; 00450 } else { 00451 // if (v2 == v1) { 00452 // throw std::logic_error("enc_vector cannot decode adjacent equal values!"); 00453 // } 00454 z_size += Coder::encoding_length(v2-v1); 00455 } 00456 v1 = v2; 00457 } 00458 r_sum += r; r = v_buf.load_next_block(); 00459 } 00460 00461 // (2) Write sample values and deltas 00462 // (a) Initialize array for sample values and pointers 00463 if (max_sample_value > z_size+1) 00464 m_sample_vals_and_pointer.set_int_width(bit_magic::l1BP(max_sample_value) + 1); 00465 else 00466 m_sample_vals_and_pointer.set_int_width(bit_magic::l1BP(z_size+1) + 1); 00467 m_sample_vals_and_pointer.resize(2*samples+2); // add 2 for last entry 00468 util::set_zero_bits(m_sample_vals_and_pointer); 00469 00470 // (b) Initilize bit_vector for encoded data 00471 util::assign(m_z, int_vector<>(z_size, 0, 1)); 00472 uint64_t* z_data = Coder::raw_data(m_z); 00473 uint8_t offset = 0; 00474 00475 // (c) Write sample values and deltas 00476 v_buf.reset(); 00477 z_size = 0; 00478 for (size_type i=0, j=0, r_sum=0, r = v_buf.load_next_block(), no_sample = 0; r_sum < n;) { 00479 for (; i < r_sum+r; ++i, --no_sample) { 00480 v2 = v_buf[i-r_sum]; 00481 if (!no_sample) { // is sample 00482 no_sample = sd; 00483 m_sample_vals_and_pointer[j++] = v2; // write samples 00484 m_sample_vals_and_pointer[j++] = z_size;// write pointers 00485 } else { 00486 z_size += Coder::encoding_length(v2-v1); 00487 Coder::encode(v2-v1, z_data, offset); // write encoded values 00488 } 00489 v1 = v2; 00490 } 00491 r_sum += r; r = v_buf.load_next_block(); 00492 } 00493 m_elements = n; 00494 } 00495 00496 00497 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00498 enc_vector<>::size_type enc_vector<Coder, SampleDens,fixedIntWidth>::serialize(std::ostream& out, structure_tree_node* v, std::string name)const 00499 { 00500 structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); 00501 size_type written_bytes = 0; 00502 written_bytes += util::write_member(m_elements, out, child, "elements"); 00503 written_bytes += m_z.serialize(out, child, "compressed differences"); 00504 written_bytes += m_sample_vals_and_pointer.serialize(out, child, "samples_and_pointers"); 00505 structure_tree::add_size(child, written_bytes); 00506 return written_bytes; 00507 } 00508 00509 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00510 void enc_vector<Coder, SampleDens,fixedIntWidth>::load(std::istream& in) 00511 { 00512 util::read_member(m_elements, in); 00513 m_z.load(in); 00514 m_sample_vals_and_pointer.load(in); 00515 } 00516 00517 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00518 const typename enc_vector<Coder,SampleDens,fixedIntWidth>::const_iterator enc_vector<Coder, SampleDens,fixedIntWidth>::begin()const 00519 { 00520 return const_iterator(this, 0); 00521 } 00522 00523 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00524 const typename enc_vector<Coder,SampleDens,fixedIntWidth>::const_iterator enc_vector<Coder, SampleDens,fixedIntWidth>::end()const 00525 { 00526 return const_iterator(this, this->m_elements); 00527 } 00528 00529 00530 } // end namespace sdsl 00531 00532 #endif