SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2011 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00022 #ifndef INCLUDED_SDSL_WT_RLG 00023 #define INCLUDED_SDSL_WT_RLG 00024 00025 #include "int_vector.hpp" 00026 #include "rank_support_v.hpp" 00027 #include "rank_support_v5.hpp" 00028 #include "select_support_mcl.hpp" 00029 #include "select_support_bs.hpp" 00030 #include "bitmagic.hpp" 00031 #include "util.hpp" 00032 #include "wt_huff.hpp" 00033 #include <algorithm> // for std::swap 00034 #include <stdexcept> 00035 #include <vector> 00036 #include <utility> // for pair 00037 #include <queue> 00038 00039 00040 #ifdef SDSL_DEBUG 00041 #define SDSL_DEBUG_WT_RLG 00042 #endif 00043 00044 00046 namespace sdsl 00047 { 00048 00049 typedef wt_huff<bit_vector, rank_support_v5<>, select_support_bs<>, select_support_bs<> > wt_without_select; 00050 00052 00067 template<class RankSupport = rank_support_v5<>, class WaveletTree = wt_without_select > 00068 class wt_rlg 00069 { 00070 public: 00071 typedef int_vector<>::size_type size_type; 00072 typedef unsigned char value_type; 00073 typedef RankSupport rank_support_type; 00074 typedef WaveletTree wt_type; 00075 00076 private: 00077 size_type m_size; // size of the original input sequence 00078 wt_type m_wt; // wavelet tree for all levels 00079 bit_vector m_b; // bit vector which indicates if a pair consists of 00080 // two equal chars 00081 rank_support_type m_b_rank; // rank support for vector b 00082 int_vector<64> m_b_border_rank; // Vector in which we store the rank values of m_b at the 00083 // border positions. 00084 int_vector<64> m_b_border; // Vector in which we store the borders of the different levels 00085 // Takes \f$\Order{\max(1, \log L)\log n}\f$ bits. 00086 int_vector<64> m_wt_rank; // Vector in which we store the rank value for each character 00087 // and each border. 00088 // Takes \f$\Order{\sigma\max(1, \log L)\log n}\f bits 00089 int_vector<8> m_char2comp; // 00090 int_vector<64> m_char_occ; // 00091 uint16_t m_sigma; 00092 00093 void copy(const wt_rlg& wt) { 00094 m_size = wt.m_size; 00095 m_wt = wt.m_wt; 00096 m_b = wt.m_b; 00097 m_b_rank = wt.m_b_rank; 00098 m_b_rank.set_vector(&m_b); 00099 m_b_border_rank = wt.m_b_border_rank; 00100 m_b_border = wt.m_b_border; 00101 m_wt_rank = wt.m_wt_rank; 00102 m_char2comp = wt.m_char2comp; 00103 m_char_occ = wt.m_char_occ; 00104 m_size = wt.m_size; 00105 } 00106 00107 public: 00108 00109 const uint16_t& sigma; 00110 00111 // Default constructor 00112 wt_rlg():m_size(0), m_sigma(0), sigma(m_sigma) {}; 00113 00114 00115 00117 00123 wt_rlg(const unsigned char* rac, size_type size):m_size(size), m_sigma(0), sigma(m_sigma) { 00124 // TODO 00125 std::cerr << "ERROR: Constructor of wt_rlg not implemented yet!!!" << std::endl; 00126 throw std::logic_error("This constructor of wt_rlg is not yet implemented!"); 00127 } 00128 00129 template<class size_type_class> 00130 wt_rlg(int_vector_file_buffer<8, size_type_class>& rac, size_type size):m_size(size), sigma(m_sigma) { 00131 construct(rac, size); 00132 } 00133 00135 00138 template<class size_type_class> 00139 void construct(int_vector_file_buffer<8, size_type_class>& rac, size_type size) { 00140 m_size = size; 00141 typedef size_type_class size_type; 00142 // TODO: remove absolute file name 00143 std::string temp_file = "wt_rlg_" + util::to_string(util::get_pid()) + "_" + util::to_string(util::get_id()); 00144 std::ofstream wt_out(temp_file.c_str(), std::ios::binary | std::ios::trunc); 00145 size_type bit_cnt=0; 00146 wt_out.write((char*)&bit_cnt, sizeof(bit_cnt)); // initial dummy write 00147 00148 m_b = bit_vector(size,0); 00149 int_vector<8> next_bwt(size/2+1); // space for the bwt of the next level 00150 // bit_vector same_prev_char(size/2+1,0); // 00151 00152 m_b_border.resize(bit_magic::l1BP(size) + 1); 00153 m_b_border[0] = 0; 00154 00155 typedef std::pair<int, char> tPIC; 00156 int m=0; 00157 00158 rac.reset(); 00159 bit_vector b_sigma(256, 0); 00160 uint8_t last_c = '\0', c = '\0'; 00161 size_type b_cnt = 0, pair1cnt=0, pair0cnt=0; 00162 for (size_type i=0, r=0, r_sum=0; r_sum < size;) { 00163 if (r_sum + r > size) { // read not more than size chars in the next loop 00164 r = size-r_sum; 00165 } 00166 for (; i < r+r_sum; ++i) { 00167 c = rac[i-r_sum]; 00168 b_sigma[c] = 1; 00169 if (i & 1) { // if position is odd 00170 if (c == last_c) { // join pair 00171 m_b[b_cnt] = 1; 00172 /* same_prev_char[pair1cnt] = (pair1cnt>0) && m_b[b_cnt-1] && next_bwt[pair1cnt-1]==c;*/ 00173 next_bwt[pair1cnt] = c; 00174 ++pair1cnt; 00175 } else { // write pair to stream 00176 //m_b[b_cnt] = 0; // since m_b is initialized to zero, this is not necessary 00177 wt_out.write((char*)&last_c, sizeof(last_c)); 00178 wt_out.write((char*)&c, sizeof(c)); 00179 ++pair0cnt; 00180 } 00181 ++b_cnt; 00182 } 00183 last_c = c; 00184 } 00185 r_sum += r; 00186 r = rac.load_next_block(); 00187 } 00188 if (size%2) { // handle last element if size is odd 00189 wt_out.write((char*)&c, sizeof(c)); 00190 wt_out.write("\0", sizeof(c)); 00191 ++pair0cnt; 00192 ++b_cnt; 00193 } 00194 m_sigma = 0; 00195 for (size_type i=0; i<b_sigma.size(); ++i) { 00196 m_sigma += b_sigma[i]; 00197 } 00198 00199 uint32_t level = 0; 00200 // handle remaining levels 00201 while (pair1cnt > 0) { 00202 ++m; 00203 m_b_border[++level] = b_cnt; 00204 size_type level_size = pair1cnt; 00205 pair1cnt = 0; 00206 for (size_type i=1; i < level_size; i+=2) { 00207 if (next_bwt[i] == next_bwt[i-1] /*and same_prev_char[i]*/) { // pair can be joined 00208 m_b[b_cnt] = 1; 00209 // same_prev_char[pair1cnt] = (pair1cnt>0) && m_b[b_cnt-1] && next_bwt[pair1cnt-1]==next_bwt[i] && same_prev_char[i-1]; 00210 next_bwt[pair1cnt] = next_bwt[i]; 00211 ++pair1cnt; 00212 } else { 00213 //m_b[b_cnt] = 0; // since m_b is initialized to zero, this is not necessary 00214 wt_out.write((char*)&next_bwt[i-1], sizeof(c)); 00215 wt_out.write((char*)&next_bwt[i], sizeof(c)); 00216 ++pair0cnt; 00217 } 00218 ++b_cnt; 00219 } 00220 if (level_size%2) { // handle last element 00221 wt_out.write((char*)&next_bwt[level_size-1], sizeof(c)); 00222 wt_out.write("\0", sizeof(c)); 00223 ++pair0cnt; 00224 ++b_cnt; 00225 } 00226 } 00227 m_b.resize(b_cnt); 00228 m_b_border.resize(level+1); 00229 00230 wt_out.seekp(0, std::ios::beg); 00231 bit_cnt = (2*pair0cnt)*8; 00232 wt_out.write((char*)&bit_cnt, sizeof(bit_cnt)); 00233 wt_out.close(); 00234 00235 { 00236 int_vector_file_buffer<8, size_type> temp_bwt_buf(temp_file.c_str()); 00237 m_wt.construct(temp_bwt_buf, temp_bwt_buf.int_vector_size); 00238 } 00239 00240 util::init_support(m_b_rank, &m_b); 00241 m_b_border_rank.resize(m_b_border.size()); 00242 00243 for (size_type i=0; i<m_b_border.size(); ++i) { 00244 m_b_border_rank[i] = m_b_rank.rank(m_b_border[i]); 00245 } 00246 00247 m_char2comp = int_vector<8>(256,255); 00248 for (uint16_t c=0, cnt=0; c<256; ++c) { 00249 if (b_sigma[c]) { 00250 m_char2comp[c] = cnt++; 00251 } 00252 } 00253 00254 m_wt_rank.resize(m_sigma * m_b_border.size()); 00255 m_char_occ.resize(m_sigma); 00256 for (size_type c=0; c < 256; ++c) { 00257 uint16_t cc = m_char2comp[c]; 00258 if (cc < m_sigma) { 00259 for (size_type i=0; i < m_b_border.size(); ++i) { 00260 size_type zeros = m_b_border[i] - m_b_border_rank[i]; 00261 m_wt_rank[cc * m_b_border.size() + i] = m_wt.rank(2*zeros, c); 00262 } 00263 m_char_occ[cc] = m_wt.rank(m_wt.size(), c); 00264 } 00265 } 00266 std::remove(temp_file.c_str()); 00267 } 00268 00270 wt_rlg(const wt_rlg& wt):sigma(m_sigma) { 00271 copy(wt); 00272 } 00273 00275 wt_rlg& operator=(const wt_rlg& wt) { 00276 if (this != &wt) { 00277 copy(wt); 00278 } 00279 return *this; 00280 } 00281 00283 void swap(wt_rlg& wt) { 00284 if (this != &wt) { 00285 std::swap(m_size, wt.m_size); 00286 m_wt.swap(wt.m_wt); 00287 m_b.swap(wt.m_b); 00288 util::swap_support(m_b_rank, wt.m_b_rank, &m_b, &(wt.m_b)); 00289 m_b_border.swap(wt.m_b_border); 00290 m_b_border_rank.swap(wt.m_b_border_rank); 00291 m_wt_rank.swap(wt.m_wt_rank); 00292 m_char2comp.swap(wt.m_char2comp); 00293 m_char_occ.swap(wt.m_char_occ); 00294 std::swap(m_sigma, wt.m_sigma); 00295 } 00296 } 00297 00299 size_type size()const { 00300 return m_size; 00301 } 00302 00304 bool empty()const { 00305 return m_size == 0; 00306 } 00307 00309 00315 value_type operator[](size_type i)const { 00316 size_type level = 0; 00317 while (m_b[(i>>1) + m_b_border[level]]) { 00318 i = m_b_rank((i>>1) + m_b_border[level]) - m_b_border_rank[level]; 00319 ++level; 00320 } 00321 size_type zeros = (i>>1) + m_b_border[level] - m_b_rank((i>>1) + m_b_border[level]); 00322 return m_wt[(zeros<<1) + (i&1)]; 00323 }; 00324 00326 00334 size_type rank(size_type i, value_type c)const { 00335 value_type cc = m_char2comp[c]; 00336 if (((size_type)cc) >= m_char_occ.size()) { // char does not occur 00337 return 0; 00338 } 00339 size_type res = 0; 00340 size_type level = 0; 00341 size_type added = 0; 00342 size_type cs = 0; 00343 while (i>0 and cs != m_char_occ[cc]) { 00344 size_type ones = m_b_rank((i>>1) + m_b_border[level]); // # of ones till this position 00345 size_type zeros = m_b_border[level] + (i>>1) - ones; // # of zeros till this position 00346 res += ((cs=m_wt.rank(zeros<<1, c)) - (m_wt_rank[cc*m_b_border.size() + level])) << level; 00347 if (i & 1) {// i is odd 00348 if (m_b[(i>>1) + m_b_border[level]]) { 00349 ++i; 00350 added += (1<<level); 00351 ++ones; 00352 } else { 00353 if (m_wt[zeros<<1] == c) { 00354 res += (1<<level) - added; 00355 } 00356 added = 0; 00357 } 00358 } else { // i is even 00359 if (added > 0 and m_b[(i>>1) + m_b_border[level] - 1] == 0) { 00360 if (m_wt[(zeros<<1)-1] == c) { 00361 res -= added; 00362 } 00363 added = 0; 00364 } 00365 } 00366 i = ones - m_b_border_rank[level]; 00367 ++level; 00368 } 00369 return res; 00370 }; 00371 00373 00380 size_type rank_ith_symbol(size_type i, value_type& c)const { 00381 return rank(i, c=(*this)[i]); 00382 } 00383 00385 00393 size_type select(size_type i, value_type c)const { 00394 if (((size_type)m_char2comp[c]) >= m_char_occ.size()) { // char does not occur 00395 return size(); 00396 } 00397 size_type lb = 0, rb = m_size; // lb inclusive, rb exclusive 00398 while (rb > lb) { 00399 size_type m = (lb+rb)>>1; 00400 if (rank(m+1, c) < i) { 00401 lb = m+1; 00402 } else { // rank(m+1,c) >= i 00403 rb = m; 00404 } 00405 } 00406 return lb; 00407 }; 00408 00410 size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const { 00411 structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); 00412 size_type written_bytes = 0; 00413 written_bytes += util::write_member(m_size, out, child, "size"); 00414 written_bytes += m_wt.serialize(out, child, "wt"); 00415 written_bytes += m_b.serialize(out, child, "b"); 00416 written_bytes += m_b_rank.serialize(out, child, "rank"); 00417 written_bytes += m_b_border.serialize(out, child, "b_border"); 00418 written_bytes += m_b_border_rank.serialize(out, child, "b_border_rank"); 00419 written_bytes += m_wt_rank.serialize(out, child, "wt_rank"); 00420 written_bytes += m_char2comp.serialize(out, child, "char2comp"); 00421 written_bytes += m_char_occ.serialize(out, child, "char_occ"); 00422 written_bytes += util::write_member(m_sigma, out, child, "sigma"); 00423 structure_tree::add_size(child, written_bytes); 00424 return written_bytes; 00425 } 00426 00428 void load(std::istream& in) { 00429 util::read_member(m_size, in); 00430 m_wt.load(in); 00431 m_b.load(in); 00432 m_b_rank.load(in, &m_b); 00433 m_b_border.load(in); 00434 m_b_border_rank.load(in); 00435 m_wt_rank.load(in); 00436 m_char2comp.load(in); 00437 m_char_occ.load(in); 00438 util::read_member(m_sigma, in); 00439 } 00440 00441 }; 00442 00443 00444 }// end namespace sdsl 00445 00446 #endif // end file