SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2011 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00021 #ifndef INCLUDED_SDSL_WT_RLMN 00022 #define INCLUDED_SDSL_WT_RLMN 00023 00024 #include "int_vector.hpp" 00025 #include "sd_vector.hpp" // for standard initialisation of template parameters 00026 #include "bitmagic.hpp" 00027 #include "util.hpp" 00028 #include "wt_huff.hpp" 00029 #include <algorithm> // for std::swap 00030 #include <stdexcept> 00031 #include <vector> 00032 #include <utility> // for pair 00033 #include <queue> 00034 #include <iostream> 00035 00036 00037 #ifdef SDSL_DEBUG 00038 #define SDSL_DEBUG_WAVELET_TREE_HUFFMAN_RL 00039 #endif 00040 00041 00043 namespace sdsl 00044 { 00045 00047 00069 template<class BitVector = sd_vector<>, 00070 class RankSupport = typename BitVector::rank_1_type, 00071 class SelectSupport = typename BitVector::select_1_type, 00072 class WaveletTree = wt_huff<> > 00073 class wt_rlmn 00074 { 00075 public: 00076 typedef int_vector<>::size_type size_type; 00077 typedef unsigned char value_type; 00078 typedef BitVector bit_vector_type; 00079 typedef RankSupport rank_support_type; 00080 typedef SelectSupport select_support_type; 00081 typedef WaveletTree wt_type; 00082 00083 private: 00084 size_type m_size; // size of the original input sequence 00085 bit_vector_type m_bl; // bit vector which indicates the starts of runs in 00086 // the BWT (or last column), i.e. _b_ _l_ast 00087 bit_vector_type m_bf; // bit vector which indicates the starts of runs in 00088 // the first column of the sorted suffixes, i.e _b_ _f_irst 00089 wt_type m_wt; // wavelet tree for all levels 00090 // two equal chars 00091 rank_support_type m_bl_rank; // rank support for bit vector bl 00092 rank_support_type m_bf_rank; // rank support for bit vector bf 00093 select_support_type m_bl_select; // select support for bit vector bl 00094 select_support_type m_bf_select; // select support for bit vector bf 00095 int_vector<64> m_C; // 00096 int_vector<64> m_C_bf_rank; // stores the number of 1s in m_bf for the prefixes 00097 // m_bf[0..m_C[0]],m_bf[0..m_C[1]],....,m_bf[0..m_C[255]]; 00098 // named C_s in the original paper 00099 00100 void copy(const wt_rlmn& wt) { 00101 m_size = wt.m_size; 00102 m_bl = wt.m_bl; 00103 m_bf = wt.m_bf; 00104 m_wt = wt.m_wt; 00105 m_bl_rank = wt.m_bl_rank; 00106 m_bl_rank.set_vector(&m_bl); 00107 m_bf_rank = wt.m_bf_rank; 00108 m_bf_rank.set_vector(&m_bf); 00109 m_bl_select = wt.m_bl_select; 00110 m_bl_select.set_vector(&m_bl); 00111 m_bf_select = wt.m_bf_select; 00112 m_bf_select.set_vector(&m_bf); 00113 m_C = wt.m_C; 00114 m_C_bf_rank = wt.m_C_bf_rank; 00115 } 00116 00117 public: 00118 00119 const size_type& sigma; 00120 00121 // Default constructor 00122 wt_rlmn():m_size(0), sigma(m_wt.sigma) {}; 00123 00124 00125 00127 00133 wt_rlmn(const unsigned char* rac, size_type size):m_size(size), sigma(m_wt.sigma) { 00134 // TODO 00135 std::cerr << "ERROR: Constructor of wt_rlmn not implemented yet!!!" << std::endl; 00136 throw std::logic_error("This constructor of wt_rlmn is not yet implemented!"); 00137 } 00138 00139 template<class size_type_class> 00140 wt_rlmn(int_vector_file_buffer<8, size_type_class>& rac, size_type size):m_size(size), sigma(m_wt.sigma) { 00141 construct(rac, size); 00142 } 00143 00145 00148 template<class size_type_class> 00149 void construct(int_vector_file_buffer<8, size_type_class>& rac, size_type size) { 00150 m_size = size; 00151 typedef size_type_class size_type; 00152 std::string temp_file = "tmp_wt_rlmn_" + util::to_string(util::get_pid()) + "_" + util::to_string(util::get_id()); 00153 std::ofstream wt_out(temp_file.c_str(), std::ios::binary | std::ios::trunc); 00154 size_type bit_cnt=0; 00155 wt_out.write((char*)&bit_cnt, sizeof(bit_cnt)); // initial dummy write 00156 { 00157 // scope for bl and bf 00158 bit_vector bl = bit_vector(size, 0); 00159 m_C = int_vector<64>(256, 0); 00160 rac.reset(); 00161 uint8_t last_c = '\0'; 00162 for (size_type i=0, r=0, r_sum=0; r_sum < size;) { 00163 if (r_sum + r > size) { // read not more than size chars in the next loop 00164 r = size-r_sum; 00165 } 00166 for (; i < r+r_sum; ++i) { 00167 uint8_t c = rac[i-r_sum]; 00168 if (last_c != c or i==0) { 00169 bl[i] = 1; 00170 wt_out.write((char*)&c, sizeof(c)); 00171 bit_cnt += 8; 00172 } 00173 ++m_C[c]; 00174 last_c = c; 00175 } 00176 r_sum += r; 00177 r = rac.load_next_block(); 00178 } 00179 00180 wt_out.seekp(0, std::ios::beg); 00181 wt_out.write((char*)&bit_cnt, sizeof(bit_cnt)); 00182 wt_out.close(); 00183 00184 for (size_type i=0, prefix_sum=0, t=0; i<256; ++i) { 00185 t = m_C[i]; 00186 m_C[i] = prefix_sum; 00187 prefix_sum += t; 00188 } 00189 00190 int_vector<64> lf_map = m_C; 00191 bit_vector bf = bit_vector(size+1, 0); 00192 bf[size] = 1; // initialize last element 00193 rac.reset(); 00194 for (size_type i=0, r=0, r_sum=0; r_sum < size;) { 00195 if (r_sum + r > size) { // read not more than size chars in the next loop 00196 r = size-r_sum; 00197 } 00198 for (; i < r+r_sum; ++i) { 00199 uint8_t c = rac[i-r_sum]; 00200 if (bl[i]) { 00201 bf[lf_map[c]] = 1; 00202 } 00203 ++lf_map[c]; 00204 } 00205 r_sum += r; 00206 r = rac.load_next_block(); 00207 } 00208 { 00209 int_vector_file_buffer<8, size_type> temp_bwt_buf(temp_file.c_str()); 00210 m_wt.construct(temp_bwt_buf, temp_bwt_buf.int_vector_size); 00211 00212 } 00213 util::assign(m_bl, bl); 00214 util::assign(m_bf, bf); 00215 } 00216 00217 util::init_support(m_bl_rank, &m_bl); 00218 util::init_support(m_bf_rank, &m_bf); 00219 util::init_support(m_bf_select, &m_bf); 00220 util::init_support(m_bl_select, &m_bl); 00221 m_C_bf_rank = int_vector<64>(256,0); 00222 for (size_type i=0; i<256; ++i) { 00223 m_C_bf_rank[i] = m_bf_rank(m_C[i]); 00224 } 00225 std::remove(temp_file.c_str()); 00226 } 00227 00229 wt_rlmn(const wt_rlmn& wt):sigma(wt.sigma) { 00230 copy(wt); 00231 } 00232 00234 wt_rlmn& operator=(const wt_rlmn& wt) { 00235 if (this != &wt) { 00236 copy(wt); 00237 } 00238 return *this; 00239 } 00240 00242 void swap(wt_rlmn& wt) { 00243 if (this != &wt) { 00244 std::swap(m_size, wt.m_size); 00245 m_bl.swap(wt.m_bl); 00246 m_bf.swap(wt.m_bf); 00247 m_wt.swap(wt.m_wt); 00248 00249 m_bl_rank.swap(wt.m_bl_rank); 00250 m_bl_rank.set_vector(&m_bl); 00251 wt.m_bl_rank.set_vector(&(wt.m_bl)); 00252 m_bf_rank.swap(wt.m_bf_rank); 00253 m_bf_rank.set_vector(&m_bf); 00254 wt.m_bf_rank.set_vector(&(wt.m_bf)); 00255 00256 m_bl_select.swap(wt.m_bl_select); 00257 m_bl_select.set_vector(&m_bl); 00258 wt.m_bl_select.set_vector(&(wt.m_bl)); 00259 m_bf_select.swap(wt.m_bf_select); 00260 m_bf_select.set_vector(&m_bf); 00261 wt.m_bf_select.set_vector(&(wt.m_bf)); 00262 00263 m_C.swap(wt.m_C); 00264 m_C_bf_rank.swap(wt.m_C_bf_rank); 00265 } 00266 } 00267 00269 size_type size()const { 00270 return m_size; 00271 } 00272 00274 bool empty()const { 00275 return m_size == 0; 00276 } 00277 00279 00285 value_type operator[](size_type i)const { 00286 return m_wt[m_bl_rank(i+1)-1]; 00287 }; 00288 00290 00298 size_type rank(size_type i, value_type c)const { 00299 if (i == 0) 00300 return 0; 00301 size_type wt_ex_pos = m_bl_rank(i); 00302 size_type c_runs = m_wt.rank(wt_ex_pos, c); 00303 if (c_runs == 0) 00304 return 0; 00305 if (m_wt[wt_ex_pos-1] == c) { 00306 size_type c_run_begin = m_bl_select(wt_ex_pos); 00307 return m_bf_select(m_C_bf_rank[c] + c_runs) - m_C[c] + i - c_run_begin; 00308 } else { 00309 return m_bf_select(m_C_bf_rank[c] + c_runs + 1) - m_C[c]; 00310 } 00311 }; 00312 00314 00321 size_type rank_ith_symbol(size_type i, value_type& c)const { 00322 if (i == 0) { 00323 c = m_wt[0]; 00324 return 0; 00325 } 00326 size_type wt_ex_pos = m_bl_rank(i+1); 00327 size_type c_runs = m_wt.rank_ith_symbol(wt_ex_pos-1, c)+1; 00328 if (c_runs == 0) 00329 return 0; 00330 if (m_wt[wt_ex_pos-1] == c) { 00331 size_type c_run_begin = m_bl_select(wt_ex_pos); 00332 return m_bf_select(m_C_bf_rank[c] + c_runs) - m_C[c] + i - c_run_begin; 00333 } else { 00334 return m_bf_select(m_C_bf_rank[c] + c_runs + 1) - m_C[c]; 00335 } 00336 } 00337 00339 00346 size_type select(size_type i, value_type c)const { 00347 size_type c_runs = m_bf_rank(m_C[c]+i) - m_C_bf_rank[c]; 00348 size_type offset = m_C[c] + i - 1 - m_bf_select(c_runs + m_C_bf_rank[c]); 00349 return m_bl_select(m_wt.select(c_runs, c)+1) + offset; 00350 }; 00351 00353 size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const { 00354 structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); 00355 size_type written_bytes = 0; 00356 written_bytes += util::write_member(m_size, out, child, "size"); 00357 written_bytes += m_bl.serialize(out, child, "bl"); 00358 written_bytes += m_bf.serialize(out, child, "bf"); 00359 written_bytes += m_wt.serialize(out, child, "wt"); 00360 written_bytes += m_bl_rank.serialize(out, child, "bl_rank"); 00361 written_bytes += m_bf_rank.serialize(out, child, "bf_rank"); 00362 written_bytes += m_bl_select.serialize(out, child, "bl_select"); 00363 written_bytes += m_bf_select.serialize(out, child, "bf_select"); 00364 written_bytes += m_C.serialize(out, child, "C"); 00365 written_bytes += m_C_bf_rank.serialize(out, child, "C_bf_rank"); 00366 structure_tree::add_size(child, written_bytes); 00367 return written_bytes; 00368 } 00369 00371 void load(std::istream& in) { 00372 util::read_member(m_size, in); 00373 m_bl.load(in); 00374 m_bf.load(in); 00375 m_wt.load(in); 00376 m_bl_rank.load(in, &m_bl); 00377 m_bf_rank.load(in, &m_bf); 00378 m_bl_select.load(in, &m_bl); 00379 m_bf_select.load(in, &m_bf); 00380 m_C.load(in); 00381 m_C_bf_rank.load(in); 00382 } 00383 00384 void print_info()const { 00385 std::cout<<"# m_wt.size in MB="<<util::get_size_in_bytes(m_wt)/(1024.0*1024.0)<<std::endl; 00386 std::cout<<"# m_bf.size in MB="<<util::get_size_in_bytes(m_bf)/(1024.0*1024.0)<<std::endl; 00387 std::cout<<"# m_bl.size in MB="<<util::get_size_in_bytes(m_bl)/(1024.0*1024.0)<<std::endl; 00388 } 00389 }; 00390 00391 00392 }// end namespace sdsl 00393 00394 #endif // end file