SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
 All Classes Namespaces Files Functions Variables Typedefs Friends
sdsl/include/sdsl/wt_rlmn.hpp
Go to the documentation of this file.
00001 /* sdsl - succinct data structures library
00002     Copyright (C) 2011 Simon Gog
00003 
00004     This program is free software: you can redistribute it and/or modify
00005     it under the terms of the GNU General Public License as published by
00006     the Free Software Foundation, either version 3 of the License, or
00007     (at your option) any later version.
00008 
00009     This program is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012     GNU General Public License for more details.
00013 
00014     You should have received a copy of the GNU General Public License
00015     along with this program.  If not, see http://www.gnu.org/licenses/ .
00016 */
00021 #ifndef INCLUDED_SDSL_WT_RLMN
00022 #define INCLUDED_SDSL_WT_RLMN
00023 
00024 #include "int_vector.hpp"
00025 #include "sd_vector.hpp"  // for standard initialisation of template parameters 
00026 #include "bitmagic.hpp"
00027 #include "util.hpp"
00028 #include "wt_huff.hpp"
00029 #include <algorithm> // for std::swap
00030 #include <stdexcept>
00031 #include <vector>
00032 #include <utility> // for pair
00033 #include <queue>
00034 #include <iostream>
00035 
00036 
00037 #ifdef SDSL_DEBUG
00038 #define SDSL_DEBUG_WAVELET_TREE_HUFFMAN_RL
00039 #endif
00040 
00041 
00043 namespace sdsl
00044 {
00045 
00047 
00069 template<class BitVector = sd_vector<>,
00070          class RankSupport = typename BitVector::rank_1_type,
00071          class SelectSupport = typename BitVector::select_1_type,
00072          class WaveletTree = wt_huff<> >
00073 class wt_rlmn
00074 {
00075     public:
00076         typedef int_vector<>::size_type size_type;
00077         typedef unsigned char                   value_type;
00078         typedef BitVector                               bit_vector_type;
00079         typedef RankSupport                             rank_support_type;
00080         typedef SelectSupport           select_support_type;
00081         typedef WaveletTree             wt_type;
00082 
00083     private:
00084         size_type                               m_size;         // size of the original input sequence
00085         bit_vector_type                 m_bl;           // bit vector which indicates the starts of runs in
00086         // the BWT (or last column), i.e. _b_ _l_ast
00087         bit_vector_type         m_bf;           // bit vector which indicates the starts of runs in
00088         // the first column of the sorted suffixes, i.e _b_ _f_irst
00089         wt_type                                 m_wt;           // wavelet tree for all levels
00090         // two equal chars
00091         rank_support_type               m_bl_rank;      // rank support for bit vector bl
00092         rank_support_type               m_bf_rank;          // rank support for bit vector bf
00093         select_support_type     m_bl_select;    // select support for bit vector bl
00094         select_support_type     m_bf_select;    // select support for bit vector bf
00095         int_vector<64>          m_C;                    //
00096         int_vector<64>          m_C_bf_rank;    // stores the number of 1s in m_bf for the prefixes
00097         // m_bf[0..m_C[0]],m_bf[0..m_C[1]],....,m_bf[0..m_C[255]];
00098         // named C_s in the original paper
00099 
00100         void copy(const wt_rlmn& wt) {
00101             m_size          = wt.m_size;
00102             m_bl            = wt.m_bl;
00103             m_bf            = wt.m_bf;
00104             m_wt            = wt.m_wt;
00105             m_bl_rank       = wt.m_bl_rank;
00106             m_bl_rank.set_vector(&m_bl);
00107             m_bf_rank       = wt.m_bf_rank;
00108             m_bf_rank.set_vector(&m_bf);
00109             m_bl_select     = wt.m_bl_select;
00110             m_bl_select.set_vector(&m_bl);
00111             m_bf_select     = wt.m_bf_select;
00112             m_bf_select.set_vector(&m_bf);
00113             m_C             = wt.m_C;
00114             m_C_bf_rank     = wt.m_C_bf_rank;
00115         }
00116 
00117     public:
00118 
00119         const size_type& sigma;
00120 
00121         // Default constructor
00122         wt_rlmn():m_size(0), sigma(m_wt.sigma) {};
00123 
00124 
00125 
00127 
00133         wt_rlmn(const unsigned char* rac, size_type size):m_size(size), sigma(m_wt.sigma) {
00134             // TODO
00135             std::cerr << "ERROR: Constructor of wt_rlmn not implemented yet!!!" << std::endl;
00136             throw std::logic_error("This constructor of wt_rlmn is not yet implemented!");
00137         }
00138 
00139         template<class size_type_class>
00140         wt_rlmn(int_vector_file_buffer<8, size_type_class>& rac, size_type size):m_size(size), sigma(m_wt.sigma) {
00141             construct(rac, size);
00142         }
00143 
00145 
00148         template<class size_type_class>
00149         void construct(int_vector_file_buffer<8, size_type_class>& rac, size_type size) {
00150             m_size = size;
00151             typedef size_type_class size_type;
00152             std::string temp_file = "tmp_wt_rlmn_" + util::to_string(util::get_pid()) + "_" + util::to_string(util::get_id());
00153             std::ofstream wt_out(temp_file.c_str(), std::ios::binary | std::ios::trunc);
00154             size_type bit_cnt=0;
00155             wt_out.write((char*)&bit_cnt, sizeof(bit_cnt)); // initial dummy write
00156             {
00157                 // scope for bl and bf
00158                 bit_vector bl = bit_vector(size, 0);
00159                 m_C  = int_vector<64>(256, 0);
00160                 rac.reset();
00161                 uint8_t last_c = '\0';
00162                 for (size_type i=0, r=0, r_sum=0; r_sum < size;) {
00163                     if (r_sum + r > size) {  // read not more than size chars in the next loop
00164                         r = size-r_sum;
00165                     }
00166                     for (; i < r+r_sum; ++i) {
00167                         uint8_t c = rac[i-r_sum];
00168                         if (last_c != c or i==0) {
00169                             bl[i] = 1;
00170                             wt_out.write((char*)&c, sizeof(c));
00171                             bit_cnt += 8;
00172                         }
00173                         ++m_C[c];
00174                         last_c = c;
00175                     }
00176                     r_sum += r;
00177                     r = rac.load_next_block();
00178                 }
00179 
00180                 wt_out.seekp(0, std::ios::beg);
00181                 wt_out.write((char*)&bit_cnt, sizeof(bit_cnt));
00182                 wt_out.close();
00183 
00184                 for (size_type i=0, prefix_sum=0, t=0; i<256; ++i) {
00185                     t = m_C[i];
00186                     m_C[i] = prefix_sum;
00187                     prefix_sum += t;
00188                 }
00189 
00190                 int_vector<64> lf_map = m_C;
00191                 bit_vector bf = bit_vector(size+1, 0);
00192                 bf[size] = 1; // initialize last element
00193                 rac.reset();
00194                 for (size_type i=0, r=0, r_sum=0; r_sum < size;) {
00195                     if (r_sum + r > size) {  // read not more than size chars in the next loop
00196                         r = size-r_sum;
00197                     }
00198                     for (; i < r+r_sum; ++i) {
00199                         uint8_t c = rac[i-r_sum];
00200                         if (bl[i]) {
00201                             bf[lf_map[c]] = 1;
00202                         }
00203                         ++lf_map[c];
00204                     }
00205                     r_sum += r;
00206                     r = rac.load_next_block();
00207                 }
00208                 {
00209                     int_vector_file_buffer<8, size_type> temp_bwt_buf(temp_file.c_str());
00210                     m_wt.construct(temp_bwt_buf, temp_bwt_buf.int_vector_size);
00211 
00212                 }
00213                 util::assign(m_bl, bl);
00214                 util::assign(m_bf, bf);
00215             }
00216 
00217             util::init_support(m_bl_rank, &m_bl);
00218             util::init_support(m_bf_rank, &m_bf);
00219             util::init_support(m_bf_select, &m_bf);
00220             util::init_support(m_bl_select, &m_bl);
00221             m_C_bf_rank = int_vector<64>(256,0);
00222             for (size_type i=0; i<256; ++i) {
00223                 m_C_bf_rank[i] = m_bf_rank(m_C[i]);
00224             }
00225             std::remove(temp_file.c_str());
00226         }
00227 
00229         wt_rlmn(const wt_rlmn& wt):sigma(wt.sigma) {
00230             copy(wt);
00231         }
00232 
00234         wt_rlmn& operator=(const wt_rlmn& wt) {
00235             if (this != &wt) {
00236                 copy(wt);
00237             }
00238             return *this;
00239         }
00240 
00242         void swap(wt_rlmn& wt) {
00243             if (this != &wt) {
00244                 std::swap(m_size, wt.m_size);
00245                 m_bl.swap(wt.m_bl);
00246                 m_bf.swap(wt.m_bf);
00247                 m_wt.swap(wt.m_wt);
00248 
00249                 m_bl_rank.swap(wt.m_bl_rank);
00250                 m_bl_rank.set_vector(&m_bl);
00251                 wt.m_bl_rank.set_vector(&(wt.m_bl));
00252                 m_bf_rank.swap(wt.m_bf_rank);
00253                 m_bf_rank.set_vector(&m_bf);
00254                 wt.m_bf_rank.set_vector(&(wt.m_bf));
00255 
00256                 m_bl_select.swap(wt.m_bl_select);
00257                 m_bl_select.set_vector(&m_bl);
00258                 wt.m_bl_select.set_vector(&(wt.m_bl));
00259                 m_bf_select.swap(wt.m_bf_select);
00260                 m_bf_select.set_vector(&m_bf);
00261                 wt.m_bf_select.set_vector(&(wt.m_bf));
00262 
00263                 m_C.swap(wt.m_C);
00264                 m_C_bf_rank.swap(wt.m_C_bf_rank);
00265             }
00266         }
00267 
00269         size_type size()const {
00270             return m_size;
00271         }
00272 
00274         bool empty()const {
00275             return m_size == 0;
00276         }
00277 
00279 
00285         value_type operator[](size_type i)const {
00286             return m_wt[m_bl_rank(i+1)-1];
00287         };
00288 
00290 
00298         size_type rank(size_type i, value_type c)const {
00299             if (i == 0)
00300                 return 0;
00301             size_type wt_ex_pos = m_bl_rank(i);
00302             size_type c_runs = m_wt.rank(wt_ex_pos, c);
00303             if (c_runs == 0)
00304                 return 0;
00305             if (m_wt[wt_ex_pos-1] == c) {
00306                 size_type c_run_begin = m_bl_select(wt_ex_pos);
00307                 return m_bf_select(m_C_bf_rank[c] + c_runs) - m_C[c] + i - c_run_begin;
00308             } else {
00309                 return m_bf_select(m_C_bf_rank[c] + c_runs + 1) - m_C[c];
00310             }
00311         };
00312 
00314 
00321         size_type rank_ith_symbol(size_type i, value_type& c)const {
00322             if (i == 0) {
00323                 c = m_wt[0];
00324                 return 0;
00325             }
00326             size_type wt_ex_pos = m_bl_rank(i+1);
00327             size_type c_runs = m_wt.rank_ith_symbol(wt_ex_pos-1, c)+1;
00328             if (c_runs == 0)
00329                 return 0;
00330             if (m_wt[wt_ex_pos-1] == c) {
00331                 size_type c_run_begin = m_bl_select(wt_ex_pos);
00332                 return m_bf_select(m_C_bf_rank[c] + c_runs) - m_C[c] + i - c_run_begin;
00333             } else {
00334                 return m_bf_select(m_C_bf_rank[c] + c_runs + 1) - m_C[c];
00335             }
00336         }
00337 
00339 
00346         size_type select(size_type i, value_type c)const {
00347             size_type c_runs = m_bf_rank(m_C[c]+i) - m_C_bf_rank[c];
00348             size_type offset = m_C[c] + i - 1 - m_bf_select(c_runs + m_C_bf_rank[c]);
00349             return m_bl_select(m_wt.select(c_runs, c)+1) + offset;
00350         };
00351 
00353         size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const {
00354             structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
00355             size_type written_bytes = 0;
00356             written_bytes += util::write_member(m_size, out, child, "size");
00357             written_bytes += m_bl.serialize(out, child, "bl");
00358             written_bytes += m_bf.serialize(out, child, "bf");
00359             written_bytes += m_wt.serialize(out, child, "wt");
00360             written_bytes += m_bl_rank.serialize(out, child, "bl_rank");
00361             written_bytes += m_bf_rank.serialize(out, child, "bf_rank");
00362             written_bytes += m_bl_select.serialize(out, child, "bl_select");
00363             written_bytes += m_bf_select.serialize(out, child, "bf_select");
00364             written_bytes += m_C.serialize(out, child, "C");
00365             written_bytes += m_C_bf_rank.serialize(out, child, "C_bf_rank");
00366             structure_tree::add_size(child, written_bytes);
00367             return written_bytes;
00368         }
00369 
00371         void load(std::istream& in) {
00372             util::read_member(m_size, in);
00373             m_bl.load(in);
00374             m_bf.load(in);
00375             m_wt.load(in);
00376             m_bl_rank.load(in, &m_bl);
00377             m_bf_rank.load(in, &m_bf);
00378             m_bl_select.load(in, &m_bl);
00379             m_bf_select.load(in, &m_bf);
00380             m_C.load(in);
00381             m_C_bf_rank.load(in);
00382         }
00383 
00384         void print_info()const {
00385             std::cout<<"# m_wt.size in MB="<<util::get_size_in_bytes(m_wt)/(1024.0*1024.0)<<std::endl;
00386             std::cout<<"# m_bf.size in MB="<<util::get_size_in_bytes(m_bf)/(1024.0*1024.0)<<std::endl;
00387             std::cout<<"# m_bl.size in MB="<<util::get_size_in_bytes(m_bl)/(1024.0*1024.0)<<std::endl;
00388         }
00389 };
00390 
00391 
00392 }// end namespace sdsl
00393 
00394 #endif // end file