SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2011 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00022 #ifndef INCLUDED_SDSL_WT_RLG8 00023 #define INCLUDED_SDSL_WT_RLG8 00024 00025 #include "int_vector.hpp" 00026 #include "rank_support_v.hpp" 00027 #include "rank_support_v5.hpp" 00028 #include "select_support_mcl.hpp" 00029 #include "select_support_bs.hpp" 00030 #include "bitmagic.hpp" 00031 #include "util.hpp" 00032 #include "wt_huff.hpp" 00033 #include <algorithm> // for std::swap 00034 #include <stdexcept> 00035 #include <vector> 00036 #include <utility> // for pair 00037 #include <queue> 00038 00039 00040 #ifdef SDSL_DEBUG 00041 #define SDSL_DEBUG_WAVELET_TREE_HUFFMAN_RLN4 00042 #endif 00043 00044 00046 namespace sdsl 00047 { 00048 00049 typedef wt_huff<bit_vector, rank_support_v5<>, select_support_bs<>, select_support_bs<> > wt_without_select; 00050 00052 00066 template<class RankSupport = rank_support_v5<>, class WaveletTree = wt_without_select > 00067 class wt_rlg8 00068 { 00069 public: 00070 typedef int_vector<>::size_type size_type; 00071 typedef unsigned char value_type; 00072 typedef RankSupport rank_support_type; 00073 typedef WaveletTree wt_type; 00074 00075 private: 00076 size_type m_size; // size of the original input sequence 00077 wt_type m_wt; // wavelet tree for all levels 00078 bit_vector m_b; // bit vector which indicates if a pair consists of 00079 // two equal chars 00080 rank_support_type m_b_rank; // rank support for vector b 00081 int_vector<64> m_b_border_rank;// Vector in which we store the rank values of m_b at the 00082 // border positions. 00083 int_vector<64> m_b_border; // Vector in which we store the borders of the different levels 00084 // Takes \f$\Order{\max(1, \log L)\log n}\f$ bits. 00085 int_vector<64> m_wt_rank; // Vector in which we store the rank value for each character 00086 // and each border. 00087 // Takes \f$\Order{\sigma\max(1, \log L)\log n}\f bits 00088 int_vector<8> m_char2comp; // 00089 int_vector<64> m_char_occ; // 00090 uint16_t m_sigma; 00091 00092 void copy(const wt_rlg8& wt) { 00093 m_size = wt.m_size; 00094 m_wt = wt.m_wt; 00095 m_b = wt.m_b; 00096 m_b_rank = wt.m_b_rank; 00097 m_b_rank.set_vector(&m_b); 00098 m_b_border_rank = wt.m_b_border_rank; 00099 m_b_border = wt.m_b_border; 00100 m_wt_rank = wt.m_wt_rank; 00101 m_char2comp = wt.m_char2comp; 00102 m_char_occ = wt.m_char_occ; 00103 m_size = wt.m_size; 00104 } 00105 00106 public: 00107 00108 const uint16_t& sigma; 00109 00110 // Default constructor 00111 wt_rlg8():m_size(0), m_sigma(0), sigma(m_sigma) {}; 00112 00113 00114 00116 00122 wt_rlg8(const unsigned char* rac, size_type size):m_size(size), m_sigma(0), sigma(m_sigma) { 00123 std::cerr << "ERROR: Constructor of wt_rlg8 not implemented yet!!!" << std::endl; 00124 throw std::logic_error("This constructor of wt_rlg8 is not yet implemented!"); 00125 } 00126 00127 template<class size_type_class> 00128 wt_rlg8(int_vector_file_buffer<8, size_type_class>& rac, size_type size):m_size(size), m_sigma(0), sigma(m_sigma) { 00129 construct(rac, size); 00130 } 00131 00133 00136 template<class size_type_class> 00137 void construct(int_vector_file_buffer<8, size_type_class>& rac, size_type size) { 00138 m_size = size; 00139 typedef size_type_class size_type; 00140 // TODO: remove absolute file name 00141 std::string temp_file = "wt_rlg8_" + util::to_string(util::get_pid()) + "_" + util::to_string(util::get_id()); 00142 std::ofstream wt_out(temp_file.c_str(), std::ios::binary | std::ios::trunc); 00143 size_type bit_cnt=0; 00144 wt_out.write((char*)&bit_cnt, sizeof(bit_cnt)); // initial dummy write 00145 00146 m_b = bit_vector(size/4+1,0); 00147 uint8_t* next_bwt = new uint8_t[size/4+4]; 00148 // bit_vector same_prev_char(size/2+1,0); // 00149 00150 m_b_border.resize(bit_magic::l1BP(size) + 1); 00151 m_b_border[0] = 0; 00152 00153 int m=0; 00154 00155 rac.reset(); 00156 bit_vector b_sigma(256, 0); 00157 uint8_t last_c[9] = {0,1,0,1,0,1,0,1}; 00158 uint8_t c = '\0'; 00159 size_type b_cnt = 0, pair1cnt=0, pair0cnt=0; 00160 for (size_type i=0, r=0, r_sum=0; r_sum < size;) { 00161 if (r_sum + r > size) { // read not more than size chars in the next loop 00162 r = size-r_sum; 00163 } 00164 for (; i < r+r_sum; ++i) { 00165 c = rac[i-r_sum]; 00166 b_sigma[c] = 1; 00167 last_c[i&7ULL] = c; 00168 if ((i & 7ULL)==7) { 00169 if (last_c[0] == last_c[1] and last_c[2] == last_c[3] and last_c[0] == last_c[2] and 00170 last_c[4] == last_c[5] and last_c[6] == last_c[7] and last_c[4] == last_c[6] and 00171 last_c[0] == last_c[4] 00172 ) { // join octtrupel 00173 m_b[b_cnt] = 1; 00174 next_bwt[pair1cnt] = c; 00175 ++pair1cnt; 00176 } else { // write pair to stream 00177 //m_b[b_cnt] = 0; // since m_b is initialized to zero, this is not necessary 00178 wt_out.write((char*)last_c, 8*sizeof(c)); 00179 ++pair0cnt; 00180 } 00181 ++b_cnt; 00182 } 00183 } 00184 r_sum += r; 00185 r = rac.load_next_block(); 00186 } 00187 if (size & 7ULL) { 00188 wt_out.write((char*)&last_c, 8*sizeof(c)); 00189 ++pair0cnt; 00190 ++b_cnt; 00191 } 00192 m_sigma = 0; 00193 for (size_type i=0; i<b_sigma.size(); ++i) { 00194 m_sigma += b_sigma[i]; 00195 } 00196 00197 uint32_t level = 0; 00198 // handle remaining levels 00199 while (pair1cnt > 0) { 00200 ++m; 00201 m_b_border[++level] = b_cnt; 00202 size_type level_size = pair1cnt; 00203 pair1cnt = 0; 00204 for (size_type i=7; i < level_size; i+=8) { 00205 if (next_bwt[i] == next_bwt[i-1] and next_bwt[i-2] == next_bwt[i-3] and next_bwt[i] == next_bwt[i-2] and 00206 next_bwt[i-4] == next_bwt[i-5] and next_bwt[i-6] == next_bwt[i-7] and next_bwt[i-4] == next_bwt[i-6] and 00207 next_bwt[i] == next_bwt[i-4]) { 00208 m_b[b_cnt] = 1; 00209 next_bwt[pair1cnt] = next_bwt[i]; 00210 ++pair1cnt; 00211 } else { 00212 //m_b[b_cnt] = 0; // since m_b is initialized to zero, this is not necessary 00213 wt_out.write((char*)next_bwt+i-7, 8*sizeof(c)); 00214 ++pair0cnt; 00215 } 00216 ++b_cnt; 00217 } 00218 if (level_size & 7ULL) { // handle last element 00219 wt_out.write((char*)next_bwt + (level_size/8)*8, (level_size&7ULL)*sizeof(c)); 00220 wt_out.write((char*)"\0\0\0\0\0\0\0\0", 8-(level_size&7ULL)); 00221 ++pair0cnt; 00222 ++b_cnt; 00223 } 00224 } 00225 delete [] next_bwt; 00226 m_b.resize(b_cnt); 00227 m_b_border.resize(level+1); 00228 00229 wt_out.seekp(0, std::ios::beg); 00230 bit_cnt = (8*pair0cnt)*8; 00231 wt_out.write((char*)&bit_cnt, sizeof(bit_cnt)); 00232 wt_out.close(); 00233 00234 { 00235 int_vector_file_buffer<8, size_type> temp_bwt_buf(temp_file.c_str()); 00236 m_wt.construct(temp_bwt_buf, temp_bwt_buf.int_vector_size); 00237 } 00238 00239 util::init_support(m_b_rank, &m_b); 00240 m_b_border_rank.resize(m_b_border.size()); 00241 00242 for (size_type i=0; i<m_b_border.size(); ++i) { 00243 m_b_border_rank[i] = m_b_rank.rank(m_b_border[i]); 00244 } 00245 00246 m_char2comp = int_vector<8>(256,255); 00247 for (uint16_t c=0, cnt=0; c<256; ++c) { 00248 if (b_sigma[c]) { 00249 m_char2comp[c] = cnt++; 00250 } 00251 } 00252 00253 00254 m_wt_rank.resize(m_sigma * m_b_border.size()); 00255 m_char_occ.resize(m_sigma); 00256 for (size_type c=0; c < 256; ++c) { 00257 uint16_t cc = m_char2comp[c]; 00258 if (cc < m_sigma) { 00259 for (size_type i=0; i < m_b_border.size(); ++i) { 00260 size_type zeros = m_b_border[i] - m_b_border_rank[i]; 00261 m_wt_rank[cc * m_b_border.size() + i] = m_wt.rank(8*zeros, c); 00262 } 00263 m_char_occ[cc] = m_wt.rank(m_wt.size(), c); 00264 } 00265 } 00266 std::remove(temp_file.c_str()); 00267 } 00268 00270 wt_rlg8(const wt_rlg8& wt):sigma(m_sigma) { 00271 copy(wt); 00272 } 00273 00275 wt_rlg8& operator=(const wt_rlg8& wt) { 00276 if (this != &wt) { 00277 copy(wt); 00278 } 00279 return *this; 00280 } 00281 00283 void swap(wt_rlg8& wt) { 00284 if (this != &wt) { 00285 std::swap(m_size, wt.m_size); 00286 m_wt.swap(wt.m_wt); 00287 m_b.swap(wt.m_b); 00288 util::swap_support(m_b_rank, wt.m_b_rank, &m_b, &(wt.m_b)); 00289 m_b_border.swap(wt.m_b_border); 00290 m_b_border_rank.swap(wt.m_b_border_rank); 00291 m_wt_rank.swap(wt.m_wt_rank); 00292 m_char2comp.swap(wt.m_char2comp); 00293 m_char_occ.swap(wt.m_char_occ); 00294 std::swap(m_sigma, wt.m_sigma); 00295 } 00296 } 00297 00299 size_type size()const { 00300 return m_size; 00301 } 00302 00304 bool empty()const { 00305 return m_size == 0; 00306 } 00307 00309 00315 value_type operator[](size_type i)const { 00316 size_type level = 0; 00317 while (m_b[(i>>3) + m_b_border[level]]) { 00318 i = m_b_rank((i>>3) + m_b_border[level]) - m_b_border_rank[level]; 00319 ++level; 00320 } 00321 size_type zeros = (i>>3) + m_b_border[level] - m_b_rank((i>>3) + m_b_border[level]); 00322 return m_wt[(zeros<<3) + (i&7ULL)]; 00323 }; 00324 00326 00334 size_type rank(size_type i, value_type c)const { 00335 value_type cc = m_char2comp[c]; 00336 if (((size_type)cc) >= m_char_occ.size()) { // char does not occur 00337 return 0; 00338 } 00339 size_type res = 0; 00340 size_type level = 0; 00341 size_type added = 0; 00342 size_type cs = 0; 00343 while (i>0 and cs != m_char_occ[cc]) { 00344 size_type ones = m_b_rank((i>>3) + m_b_border[level]); // # of ones till this position 00345 size_type zeros = m_b_border[level] + (i>>3) - ones; // # of zeros till this position 00346 res += ((cs=m_wt.rank((zeros<<3), c)) - (m_wt_rank[cc*m_b_border.size() + level])) << ((level<<1)+level); 00347 if (i & 7ULL) {// i not a multiple of 8 00348 if (m_b[(i>>3) + m_b_border[level]]) { 00349 added += (1<<((level<<1)+level)) * (8-(i & 7ULL)); 00350 i += (8-(i & 7ULL)); 00351 ++ones; 00352 } else { 00353 if (m_wt[(zeros<<3) + (i&7ULL) - 1] == c) { 00354 res += (1<<((level<<1)+level)) - added; 00355 } 00356 if ((i&7ULL) > 1) { 00357 size_type cnt = m_wt.rank((zeros<<3) + (i&7ULL) - 1, c) - cs; 00358 res += (cnt<<((level<<1)+level)); 00359 } 00360 added = 0; 00361 } 00362 } else { // i is a multiple of 8 00363 if (added > 0 and m_b[(i>>3) + m_b_border[level] - 1] == 0) { 00364 if (m_wt[(zeros<<3)-1] == c) { 00365 res -= added; 00366 } 00367 added = 0; 00368 } 00369 } 00370 i = ones - m_b_border_rank[level]; 00371 ++level; 00372 } 00373 return res; 00374 }; 00375 00377 00384 size_type rank_ith_symbol(size_type i, value_type& c)const { 00385 return rank(i, c=(*this)[i]); 00386 } 00387 00389 00397 size_type select(size_type i, value_type c)const { 00398 if (((size_type)m_char2comp[c]) >= m_char_occ.size()) { // char does not occur 00399 return size(); 00400 } 00401 size_type lb = 0, rb = m_size; // lb inclusive, rb exclusive 00402 while (rb > lb) { 00403 size_type m = (lb+rb)>>1; 00404 if (rank(m+1, c) < i) { 00405 lb = m+1; 00406 } else { // rank(m+1,c) >= i 00407 rb = m; 00408 } 00409 } 00410 return lb; 00411 }; 00412 00414 size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const { 00415 structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); 00416 size_type written_bytes = 0; 00417 written_bytes += util::write_member(m_size, out, child, "size"); 00418 written_bytes += m_wt.serialize(out, child, "wt"); 00419 written_bytes += m_b.serialize(out, child, "b"); 00420 written_bytes += m_b_rank.serialize(out, child, "b_rank"); 00421 written_bytes += m_b_border.serialize(out, child, "b_border"); 00422 written_bytes += m_b_border_rank.serialize(out, child, "b_border_rank"); 00423 written_bytes += m_wt_rank.serialize(out, child, "wt_rank"); 00424 written_bytes += m_char2comp.serialize(out, child, "char2comp"); 00425 written_bytes += m_char_occ.serialize(out, child, "char_occ"); 00426 written_bytes += util::write_member(m_sigma, out, child, "sigma"); 00427 structure_tree::add_size(child, written_bytes); 00428 return written_bytes; 00429 } 00430 00432 void load(std::istream& in) { 00433 util::read_member(m_size, in); 00434 m_wt.load(in); 00435 m_b.load(in); 00436 m_b_rank.load(in, &m_b); 00437 m_b_border.load(in); 00438 m_b_border_rank.load(in); 00439 m_wt_rank.load(in); 00440 m_char2comp.load(in); 00441 m_char_occ.load(in); 00442 util::read_member(m_sigma, in); 00443 } 00444 }; 00445 00446 00447 }// end namespace sdsl 00448 00449 #endif // end file