SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 20011 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00021 #include "int_vector.hpp" 00022 #include "elias_delta_coder.hpp" 00023 #include "iterators.hpp" 00024 00025 #ifndef SDSL_ENC_VECTOR_PRAC2 00026 #define SDSL_ENC_VECTOR_PRAC2 00027 00029 namespace sdsl 00030 { 00031 00032 template<uint8_t fixedIntWidth> 00033 struct enc_vector_prac2_trait { 00034 typedef int_vector<0> int_vector_type; 00035 }; 00036 00037 template<> 00038 struct enc_vector_prac2_trait<32> { 00039 typedef int_vector<32> int_vector_type; 00040 }; 00041 00043 00049 template<class Coder=coder::elias_delta, 00050 uint32_t SampleDens = 8, uint8_t fixedIntWidth=0> 00051 class enc_vector_prac2 00052 { 00053 public: 00054 typedef uint64_t value_type; // STL Container requirement 00055 typedef random_access_const_iterator<enc_vector_prac2> iterator;// STL Container requirement 00056 typedef iterator const_iterator; // STL Container requirement 00057 typedef const value_type reference; 00058 typedef const value_type const_reference; 00059 typedef const value_type* const_pointer; 00060 typedef ptrdiff_t difference_type;// STL Container requirement 00061 typedef int_vector<>::size_type size_type; // STL Container requirement 00062 typedef Coder coder; 00063 typedef typename enc_vector_prac2_trait<fixedIntWidth>::int_vector_type int_vector_type; 00064 static const uint32_t sample_dens = SampleDens; 00065 00066 int_vector<0> m_z; // compressed bit stream 00067 private: 00068 int_vector_type m_sample_vals_and_pointer; 00069 size_type m_elements; // number of elements 00070 uint32_t m_sample_dens; 00071 00072 // workaround function for the constructor 00073 void construct() { 00074 m_elements = 0; 00075 m_sample_dens = 8; 00076 } 00077 void copy(const enc_vector_prac2& v); 00078 00079 void clear() { 00080 m_z.resize(0); 00081 m_elements = 0; 00082 m_sample_vals_and_pointer.resize(0); 00083 } 00084 00085 public: 00087 enc_vector_prac2() { 00088 construct(); 00089 }; 00091 00094 enc_vector_prac2(const enc_vector_prac2& v); 00095 00097 00100 template<class Container> 00101 enc_vector_prac2(const Container& c) { 00102 construct(); 00103 init(c); 00104 }; 00105 00107 /* 00108 \param v_buf A int_vector_file_buf. 00109 \pre No two adjacent values should be equal. 00110 */ 00111 template<uint8_t int_width, class size_type_class> 00112 enc_vector_prac2(int_vector_file_buffer<int_width, size_type_class>& v_buf) { 00113 construct(); 00114 init(v_buf); 00115 } 00116 00117 template<class Container> 00118 void init(const Container& c); 00119 00120 template<uint8_t int_width, class size_type_class> 00121 void init(int_vector_file_buffer<int_width, size_type_class>& v_buf); 00122 00124 ~enc_vector_prac2() { 00125 }; 00126 00128 00132 size_type size()const; 00133 00135 00137 static size_type max_size(); 00138 00140 00145 bool empty() const; 00146 00148 00156 void swap(enc_vector_prac2& v); 00157 00159 00163 const const_iterator begin()const; 00164 00166 00170 const const_iterator end()const; 00171 00172 // Iterator that points to the last element of the enc_vector_prac2. 00173 /* 00174 * Required for the Container Concept of the STL. 00175 * \sa rend() 00176 */ 00177 // reverse_iterator rbegin()const; 00178 00179 // Iterator that points to the position before the first element of the enc_vector_prac2. 00180 /* 00181 * Required for the Container Concept of the STL 00182 * \sa rbegin() 00183 */ 00184 // reverse_iterator rend()const; 00185 00187 00191 value_type operator[](size_type i)const; 00192 00194 00197 enc_vector_prac2& operator=(const enc_vector_prac2& v); 00198 00200 00208 bool operator==(const enc_vector_prac2& v)const; 00209 00211 00219 bool operator!=(const enc_vector_prac2& v)const; 00220 00222 00225 size_type serialize(std::ostream& out) const; 00226 00228 void load(std::istream& in); 00229 00231 00234 value_type sample(const size_type i) const; 00235 00236 uint32_t get_sample_dens() const; 00237 void set_sample_dens(const uint32_t sample_dens); 00238 00243 // template<class Iterator> 00244 void get_inter_sampled_values(const size_type i, uint64_t*& it)const { 00245 *(it++) = 0; 00246 if (i*SampleDens + SampleDens - 1 < size()) { 00247 Coder::template decode<true, true>(m_z.data(), m_sample_vals_and_pointer[(i<<1)+1], SampleDens - 1, it); 00248 } else { 00249 assert(i*SampleDens < size()); 00250 Coder::template decode<true, true>(m_z.data(), m_sample_vals_and_pointer[(i<<1)+1], size()-i*SampleDens - 1, it); 00251 } 00252 }; 00253 }; 00254 00255 00256 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00257 inline uint32_t enc_vector_prac2<Coder, SampleDens, fixedIntWidth>::get_sample_dens() const 00258 { 00259 if (SampleDens == 0) 00260 return m_sample_dens; 00261 else 00262 return SampleDens; 00263 } 00264 00265 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00266 inline void enc_vector_prac2<Coder, SampleDens, fixedIntWidth>::set_sample_dens(const uint32_t sample_dens) 00267 { 00268 m_sample_dens = sample_dens; 00269 } 00270 00271 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00272 inline typename enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::value_type enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::operator[](const size_type i)const 00273 { 00274 assert(i+1 != 0); 00275 #ifdef SDSL_DEBUG 00276 if (i >= m_elements) { 00277 throw std::out_of_range("OUT_OF_RANGE_ERROR: enc_vector_prac2::operator[](size_type); i >= size()!"); 00278 return 0; 00279 } 00280 #endif 00281 size_type idx = i/get_sample_dens(); 00282 return m_sample_vals_and_pointer[idx<<1] + Coder::decode_prefix_sum(m_z.data(), m_sample_vals_and_pointer[(idx<<1)+1], i-SampleDens*idx); 00283 } 00284 00285 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00286 inline typename enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::value_type enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::sample(const size_type i)const 00287 { 00288 assert(i*get_sample_dens()+1 != 0); 00289 #ifdef SDSL_DEBUG 00290 if (i*get_sample_dens() >= m_elements) { 00291 throw std::out_of_range("OUT_OF_RANGE_ERROR: enc_vector_prac2::sample(size_type); i*get_sample_dens() >= size()!"); 00292 return 0; 00293 } 00294 #endif 00295 return m_sample_vals_and_pointer[i<<1]; 00296 } 00297 00298 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00299 inline enc_vector_prac2<>::size_type enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::size()const 00300 { 00301 return m_elements; 00302 } 00303 00304 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00305 inline enc_vector_prac2<>::size_type enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::max_size() 00306 { 00307 return int_vector<>::max_size()/2; // each element could possible occupy double space with selfdelimiting codes 00308 } 00309 00310 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00311 inline bool enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::empty()const 00312 { 00313 return 0==m_elements; 00314 } 00315 00316 00317 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00318 void enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::copy(const enc_vector_prac2<Coder, SampleDens,fixedIntWidth>& v) 00319 { 00320 m_z = v.m_z; // copy compressed bit stream 00321 m_sample_vals_and_pointer = v.m_sample_vals_and_pointer; // copy sample values 00322 m_elements = v.m_elements; // copy number of stored elements 00323 } 00324 00325 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00326 enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::enc_vector_prac2(const enc_vector_prac2& v) 00327 { 00328 copy(v); 00329 } 00330 00331 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00332 enc_vector_prac2<Coder, SampleDens,fixedIntWidth>& enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::operator=(const enc_vector_prac2<Coder, SampleDens,fixedIntWidth>& v) 00333 { 00334 if (this != &v) {// if v and _this_ are not the same object 00335 copy(v); 00336 } 00337 return *this; 00338 } 00339 00340 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00341 bool enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::operator==(const enc_vector_prac2<Coder, SampleDens,fixedIntWidth>& v)const 00342 { 00343 if (this == &v) 00344 return true; 00345 return m_elements == v.m_elements 00346 and m_z == v.m_z 00347 and m_sample_vals_and_pointer == v.m_sample_vals_and_pointer; 00348 } 00349 00350 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00351 bool enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::operator!=(const enc_vector_prac2<Coder, SampleDens,fixedIntWidth>& v)const 00352 { 00353 return !(*this == v); 00354 } 00355 00356 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00357 void enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::swap(enc_vector_prac2<Coder, SampleDens,fixedIntWidth>& v) 00358 { 00359 if (this != &v) { // if v and _this_ are not the same object 00360 m_z.swap(v.m_z); // swap compressed bit streams 00361 m_sample_vals_and_pointer.swap(v.m_sample_vals_and_pointer); 00362 std::swap(m_elements, v.m_elements);// swap the number of elements 00363 } 00364 } 00365 00366 00367 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00368 template<class Container> 00369 void enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::init(const Container& c) 00370 { 00371 // clear bit_vectors 00372 clear(); 00373 00374 if (c.empty()) // if c is empty there is nothing to do... 00375 return; 00376 typename Container::const_iterator it = c.begin(), end = c.end(); 00377 typename Container::value_type v1 = *it, v2, max_value=0, max_sample_value=0, x; 00378 size_type samples=0; 00379 size_type z_size = 0; 00380 // (1) Calculate maximal value of samples and of deltas 00381 for (size_type i=0, no_sample=0; it != end; ++it,++i, --no_sample) { 00382 v2 = *it; 00383 if (!no_sample) { // add a sample 00384 no_sample = get_sample_dens(); 00385 if (max_sample_value < v2) max_sample_value = v2; 00386 ++samples; 00387 } else { 00388 if (max_value < v2-v1) max_value = v2 - v1; 00389 z_size += Coder::encoding_length(v2-v1); 00390 } 00391 v1=v2; 00392 } 00393 // (2) Write sample values and deltas 00394 { 00395 if (max_sample_value > z_size+1) 00396 m_sample_vals_and_pointer.set_int_width(bit_magic::l1BP(max_sample_value) + 1); 00397 else 00398 m_sample_vals_and_pointer.set_int_width(bit_magic::l1BP(z_size+1) + 1); 00399 m_sample_vals_and_pointer.resize(2*samples+2); // add 2 for last entry 00400 typename int_vector_type::iterator sv_it = m_sample_vals_and_pointer.begin(); 00401 z_size = 0; 00402 size_type no_sample=0; 00403 for (it = c.begin(); it != end; ++it, --no_sample) { 00404 v2 = *it; 00405 if (!no_sample) { // add a sample 00406 no_sample = get_sample_dens(); 00407 *sv_it = v2; ++sv_it; 00408 *sv_it = z_size; ++sv_it; 00409 } else { 00410 x = v2-v1; 00411 if (v2 == v1) { 00412 throw std::logic_error("enc_vector_prac2 cannot decode adjacent equal values!"); 00413 } 00414 z_size += Coder::encoding_length(x); 00415 } 00416 v1=v2; 00417 } 00418 *sv_it = 0; ++sv_it; // initialize 00419 *sv_it = z_size+1; ++sv_it; // last entry 00420 00421 m_z.bit_resize(z_size); 00422 uint64_t* z_data = Coder::raw_data(m_z); 00423 uint8_t offset = 0; 00424 no_sample = 0; 00425 for (it = c.begin(); it != end; ++it, --no_sample) { 00426 v2 = *it; 00427 if (!no_sample) { // add a sample 00428 no_sample = get_sample_dens(); 00429 } else { 00430 Coder::encode(v2-v1, z_data, offset); 00431 } 00432 v1=v2; 00433 } 00434 // Coder::encode(delta_c, m_z); // encode delta_c to m_z 00435 } 00436 // delta_c.resize(0); 00437 //std::cerr<<"Finished "<<std::endl;, 00438 m_elements = c.size(); 00439 } 00440 00441 00442 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00443 template<uint8_t int_width, class size_type_class> 00444 void enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::init(int_vector_file_buffer<int_width, size_type_class>& v_buf) 00445 { 00446 // clear bit_vectors 00447 clear(); 00448 size_type n = v_buf.int_vector_size; 00449 if (n == 0) // if c is empty there is nothing to do... 00450 return; 00451 v_buf.reset(); 00452 value_type v1=0, v2=0, max_sample_value=0; 00453 size_type samples=0, z_size=0; 00454 const size_type sd = get_sample_dens(); 00455 size_type z_size2 = 0, big_val_cnt=0, old_z_size=0, old_i=0; 00456 size_type big_val[sd+1], big_val_pos[sd+1]; 00457 uint8_t log_sd=bit_magic::l1BP(sd)+1; 00458 size_type saved_bits=0; 00459 // (1) Calculate maximal value of samples and of deltas 00460 for (size_type i=0, r_sum=0, r = v_buf.load_next_block(), no_sample = 0; r_sum < n;) { 00461 for (; i < r_sum+r; ++i, --no_sample) { 00462 v2 = v_buf[i-r_sum]; 00463 if (!no_sample) { // is sample 00464 no_sample = sd; 00465 if (max_sample_value < v2) max_sample_value = v2; 00466 ++samples; 00467 00468 size_type bits=0; 00469 bits += log_sd; // bits for big_val_cnt 00470 for (size_type j=0; j<big_val_cnt; ++j) { 00471 bits += Coder::encoding_length(big_val[j]); 00472 if (j == 0) 00473 bits += Coder::encoding_length(big_val_pos[j]+1); 00474 else 00475 bits += Coder::encoding_length(big_val_pos[j]-big_val_pos[j-1]); 00476 } 00477 big_val_cnt = 0; 00478 if (z_size - old_z_size > bits) { 00479 z_size2 += bits; 00480 } else { 00481 z_size2 += (z_size-old_z_size); 00482 } 00483 old_z_size = z_size; 00484 old_i = i; 00485 } else { 00486 if (v2 == v1) { 00487 throw std::logic_error("enc_vector_prac2 cannot decode adjacent equal values!"); 00488 } 00489 z_size += Coder::encoding_length(v2-v1); 00490 if (v2-v1 > 1) { 00491 big_val[big_val_cnt] = v2-v1; 00492 big_val_pos[big_val_cnt] = i-old_i; 00493 ++big_val_cnt; 00494 } 00495 } 00496 v1 = v2; 00497 } 00498 r_sum += r; r = v_buf.load_next_block(); 00499 } 00500 std::cerr<<"z in MB = " << z_size/(8.0*1024.0*1024.0) << std::endl; 00501 std::cerr<<"z2 in MB = " << z_size2/(8.0*1024.0*1024.0) << std::endl; 00502 std::cerr<<"saving in MB = " << (z_size-z_size2)/(8.0*1024.0*1024.0) << std::endl; 00503 00504 // (2) Write sample values and deltas 00505 // (a) Initialize array for sample values and pointers 00506 if (max_sample_value > z_size+1) 00507 m_sample_vals_and_pointer.set_int_width(bit_magic::l1BP(max_sample_value) + 1); 00508 else 00509 m_sample_vals_and_pointer.set_int_width(bit_magic::l1BP(z_size+1) + 1); 00510 m_sample_vals_and_pointer.resize(2*samples+2); // add 2 for last entry 00511 00512 // (b) Initilize bit_vector for encoded data 00513 m_z.bit_resize(z_size); 00514 uint64_t* z_data = Coder::raw_data(m_z); 00515 uint8_t offset = 0; 00516 00517 // (c) Write sample values and deltas 00518 v_buf.reset(); 00519 z_size = 0; 00520 for (size_type i=0, j=0, r_sum=0, r = v_buf.load_next_block(), no_sample = 0; r_sum < n;) { 00521 for (; i < r_sum+r; ++i, --no_sample) { 00522 v2 = v_buf[i-r_sum]; 00523 if (!no_sample) { // is sample 00524 no_sample = sd; 00525 m_sample_vals_and_pointer[j++] = v2; // write samples 00526 m_sample_vals_and_pointer[j++] = z_size;// write pointers 00527 } else { 00528 z_size += Coder::encoding_length(v2-v1); 00529 Coder::encode(v2-v1, z_data, offset); // write encoded values 00530 } 00531 v1 = v2; 00532 } 00533 r_sum += r; r = v_buf.load_next_block(); 00534 } 00535 m_elements = n; 00536 } 00537 00538 00539 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00540 enc_vector_prac2<>::size_type enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::serialize(std::ostream& out) const 00541 { 00542 size_type written_bytes = 0; 00543 out.write((char*) &m_elements, sizeof(m_elements)); 00544 written_bytes += sizeof(m_elements); 00545 written_bytes += m_z.serialize(out); 00546 written_bytes += m_sample_vals_and_pointer.serialize(out); 00547 return written_bytes; 00548 } 00549 00550 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00551 void enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::load(std::istream& in) 00552 { 00553 in.read((char*) &m_elements, sizeof(m_elements)); 00554 m_z.load(in); 00555 m_sample_vals_and_pointer.load(in); 00556 } 00557 00558 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00559 const typename enc_vector_prac2<Coder,SampleDens,fixedIntWidth>::const_iterator enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::begin()const 00560 { 00561 return const_iterator(this, 0); 00562 } 00563 00564 template<class Coder, uint32_t SampleDens, uint8_t fixedIntWidth> 00565 const typename enc_vector_prac2<Coder,SampleDens,fixedIntWidth>::const_iterator enc_vector_prac2<Coder, SampleDens,fixedIntWidth>::end()const 00566 { 00567 return const_iterator(this, this->m_elements); 00568 } 00569 00570 00571 } // end namespace sdsl 00572 00573 #endif