SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
 All Classes Namespaces Files Functions Variables Typedefs Friends
sdsl/include/sdsl/wt_rlg.hpp
Go to the documentation of this file.
00001 /* sdsl - succinct data structures library
00002     Copyright (C) 2011 Simon Gog
00003 
00004     This program is free software: you can redistribute it and/or modify
00005     it under the terms of the GNU General Public License as published by
00006     the Free Software Foundation, either version 3 of the License, or
00007     (at your option) any later version.
00008 
00009     This program is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012     GNU General Public License for more details.
00013 
00014     You should have received a copy of the GNU General Public License
00015     along with this program.  If not, see http://www.gnu.org/licenses/ .
00016 */
00022 #ifndef INCLUDED_SDSL_WT_RLG
00023 #define INCLUDED_SDSL_WT_RLG
00024 
00025 #include "int_vector.hpp"
00026 #include "rank_support_v.hpp"
00027 #include "rank_support_v5.hpp"
00028 #include "select_support_mcl.hpp"
00029 #include "select_support_bs.hpp"
00030 #include "bitmagic.hpp"
00031 #include "util.hpp"
00032 #include "wt_huff.hpp"
00033 #include <algorithm> // for std::swap
00034 #include <stdexcept>
00035 #include <vector>
00036 #include <utility> // for pair
00037 #include <queue>
00038 
00039 
00040 #ifdef SDSL_DEBUG
00041 #define SDSL_DEBUG_WT_RLG
00042 #endif
00043 
00044 
00046 namespace sdsl
00047 {
00048 
00049 typedef wt_huff<bit_vector, rank_support_v5<>, select_support_bs<>, select_support_bs<> > wt_without_select;
00050 
00052 
00067 template<class RankSupport = rank_support_v5<>, class WaveletTree = wt_without_select >
00068 class wt_rlg
00069 {
00070     public:
00071         typedef int_vector<>::size_type size_type;
00072         typedef unsigned char                   value_type;
00073         typedef RankSupport                             rank_support_type;
00074         typedef WaveletTree             wt_type;
00075 
00076     private:
00077         size_type                               m_size;           // size of the original input sequence
00078         wt_type                                 m_wt;             // wavelet tree for all levels
00079         bit_vector                              m_b;              // bit vector which indicates if a pair consists of
00080         // two equal chars
00081         rank_support_type               m_b_rank;         // rank support for vector b
00082         int_vector<64>          m_b_border_rank;  // Vector in which we store the rank values of m_b at the
00083         // border positions.
00084         int_vector<64>          m_b_border;       // Vector in which we store the borders of the different levels
00085         // Takes \f$\Order{\max(1, \log L)\log n}\f$ bits.
00086         int_vector<64>          m_wt_rank;        // Vector in which we store the rank value for each character
00087         // and each border.
00088         // Takes \f$\Order{\sigma\max(1, \log L)\log n}\f bits
00089         int_vector<8>                   m_char2comp;      //
00090         int_vector<64>          m_char_occ;       //
00091         uint16_t                                m_sigma;
00092 
00093         void copy(const wt_rlg& wt) {
00094             m_size          = wt.m_size;
00095             m_wt            = wt.m_wt;
00096             m_b             = wt.m_b;
00097             m_b_rank        = wt.m_b_rank;
00098             m_b_rank.set_vector(&m_b);
00099             m_b_border_rank = wt.m_b_border_rank;
00100             m_b_border      = wt.m_b_border;
00101             m_wt_rank       = wt.m_wt_rank;
00102             m_char2comp     = wt.m_char2comp;
00103             m_char_occ      = wt.m_char_occ;
00104             m_size                      = wt.m_size;
00105         }
00106 
00107     public:
00108 
00109         const uint16_t& sigma;
00110 
00111         // Default constructor
00112         wt_rlg():m_size(0), m_sigma(0), sigma(m_sigma) {};
00113 
00114 
00115 
00117 
00123         wt_rlg(const unsigned char* rac, size_type size):m_size(size), m_sigma(0), sigma(m_sigma) {
00124             // TODO
00125             std::cerr << "ERROR: Constructor of wt_rlg not implemented yet!!!" << std::endl;
00126             throw std::logic_error("This constructor of wt_rlg is not yet implemented!");
00127         }
00128 
00129         template<class size_type_class>
00130         wt_rlg(int_vector_file_buffer<8, size_type_class>& rac, size_type size):m_size(size), sigma(m_sigma) {
00131             construct(rac, size);
00132         }
00133 
00135 
00138         template<class size_type_class>
00139         void construct(int_vector_file_buffer<8, size_type_class>& rac, size_type size) {
00140             m_size = size;
00141             typedef size_type_class size_type;
00142             // TODO: remove absolute file name
00143             std::string temp_file = "wt_rlg_" + util::to_string(util::get_pid()) + "_" + util::to_string(util::get_id());
00144             std::ofstream wt_out(temp_file.c_str(), std::ios::binary | std::ios::trunc);
00145             size_type bit_cnt=0;
00146             wt_out.write((char*)&bit_cnt, sizeof(bit_cnt)); // initial dummy write
00147 
00148             m_b = bit_vector(size,0);
00149             int_vector<8> next_bwt(size/2+1); // space for the bwt of the next level
00150 //              bit_vector same_prev_char(size/2+1,0); //
00151 
00152             m_b_border.resize(bit_magic::l1BP(size) + 1);
00153             m_b_border[0] = 0;
00154 
00155             typedef std::pair<int, char> tPIC;
00156             int m=0;
00157 
00158             rac.reset();
00159             bit_vector b_sigma(256, 0);
00160             uint8_t last_c = '\0', c = '\0';
00161             size_type b_cnt = 0, pair1cnt=0, pair0cnt=0;
00162             for (size_type i=0, r=0, r_sum=0; r_sum < size;) {
00163                 if (r_sum + r > size) {  // read not more than size chars in the next loop
00164                     r = size-r_sum;
00165                 }
00166                 for (; i < r+r_sum; ++i) {
00167                     c = rac[i-r_sum];
00168                     b_sigma[c] = 1;
00169                     if (i & 1) { // if position is odd
00170                         if (c == last_c) { // join pair
00171                             m_b[b_cnt] = 1;
00172                             /*                                          same_prev_char[pair1cnt] = (pair1cnt>0) && m_b[b_cnt-1] && next_bwt[pair1cnt-1]==c;*/
00173                             next_bwt[pair1cnt] = c;
00174                             ++pair1cnt;
00175                         } else { // write pair to stream
00176                             //m_b[b_cnt] = 0; // since m_b is initialized to zero, this is not necessary
00177                             wt_out.write((char*)&last_c, sizeof(last_c));
00178                             wt_out.write((char*)&c, sizeof(c));
00179                             ++pair0cnt;
00180                         }
00181                         ++b_cnt;
00182                     }
00183                     last_c = c;
00184                 }
00185                 r_sum += r;
00186                 r = rac.load_next_block();
00187             }
00188             if (size%2) { // handle last element if size is odd
00189                 wt_out.write((char*)&c, sizeof(c));
00190                 wt_out.write("\0", sizeof(c));
00191                 ++pair0cnt;
00192                 ++b_cnt;
00193             }
00194             m_sigma = 0;
00195             for (size_type i=0; i<b_sigma.size(); ++i) {
00196                 m_sigma += b_sigma[i];
00197             }
00198 
00199             uint32_t level = 0;
00200             //  handle remaining levels
00201             while (pair1cnt > 0) {
00202                 ++m;
00203                 m_b_border[++level] = b_cnt;
00204                 size_type level_size = pair1cnt;
00205                 pair1cnt = 0;
00206                 for (size_type i=1; i < level_size; i+=2) {
00207                     if (next_bwt[i] == next_bwt[i-1] /*and same_prev_char[i]*/) { // pair can be joined
00208                         m_b[b_cnt] = 1;
00209 //                                 same_prev_char[pair1cnt] = (pair1cnt>0) && m_b[b_cnt-1] && next_bwt[pair1cnt-1]==next_bwt[i] && same_prev_char[i-1];
00210                         next_bwt[pair1cnt] = next_bwt[i];
00211                         ++pair1cnt;
00212                     } else {
00213                         //m_b[b_cnt] = 0; // since m_b is initialized to zero, this is not necessary
00214                         wt_out.write((char*)&next_bwt[i-1], sizeof(c));
00215                         wt_out.write((char*)&next_bwt[i], sizeof(c));
00216                         ++pair0cnt;
00217                     }
00218                     ++b_cnt;
00219                 }
00220                 if (level_size%2) { // handle last element
00221                     wt_out.write((char*)&next_bwt[level_size-1], sizeof(c));
00222                     wt_out.write("\0", sizeof(c));
00223                     ++pair0cnt;
00224                     ++b_cnt;
00225                 }
00226             }
00227             m_b.resize(b_cnt);
00228             m_b_border.resize(level+1);
00229 
00230             wt_out.seekp(0, std::ios::beg);
00231             bit_cnt = (2*pair0cnt)*8;
00232             wt_out.write((char*)&bit_cnt, sizeof(bit_cnt));
00233             wt_out.close();
00234 
00235             {
00236                 int_vector_file_buffer<8, size_type> temp_bwt_buf(temp_file.c_str());
00237                 m_wt.construct(temp_bwt_buf, temp_bwt_buf.int_vector_size);
00238             }
00239 
00240             util::init_support(m_b_rank, &m_b);
00241             m_b_border_rank.resize(m_b_border.size());
00242 
00243             for (size_type i=0; i<m_b_border.size(); ++i) {
00244                 m_b_border_rank[i] = m_b_rank.rank(m_b_border[i]);
00245             }
00246 
00247             m_char2comp = int_vector<8>(256,255);
00248             for (uint16_t c=0, cnt=0; c<256; ++c) {
00249                 if (b_sigma[c]) {
00250                     m_char2comp[c] = cnt++;
00251                 }
00252             }
00253 
00254             m_wt_rank.resize(m_sigma * m_b_border.size());
00255             m_char_occ.resize(m_sigma);
00256             for (size_type c=0; c < 256; ++c) {
00257                 uint16_t cc = m_char2comp[c];
00258                 if (cc < m_sigma) {
00259                     for (size_type i=0; i < m_b_border.size(); ++i) {
00260                         size_type zeros  = m_b_border[i] - m_b_border_rank[i];
00261                         m_wt_rank[cc * m_b_border.size() + i] = m_wt.rank(2*zeros, c);
00262                     }
00263                     m_char_occ[cc] = m_wt.rank(m_wt.size(), c);
00264                 }
00265             }
00266             std::remove(temp_file.c_str());
00267         }
00268 
00270         wt_rlg(const wt_rlg& wt):sigma(m_sigma) {
00271             copy(wt);
00272         }
00273 
00275         wt_rlg& operator=(const wt_rlg& wt) {
00276             if (this != &wt) {
00277                 copy(wt);
00278             }
00279             return *this;
00280         }
00281 
00283         void swap(wt_rlg& wt) {
00284             if (this != &wt) {
00285                 std::swap(m_size, wt.m_size);
00286                 m_wt.swap(wt.m_wt);
00287                 m_b.swap(wt.m_b);
00288                 util::swap_support(m_b_rank, wt.m_b_rank, &m_b, &(wt.m_b));
00289                 m_b_border.swap(wt.m_b_border);
00290                 m_b_border_rank.swap(wt.m_b_border_rank);
00291                 m_wt_rank.swap(wt.m_wt_rank);
00292                 m_char2comp.swap(wt.m_char2comp);
00293                 m_char_occ.swap(wt.m_char_occ);
00294                 std::swap(m_sigma, wt.m_sigma);
00295             }
00296         }
00297 
00299         size_type size()const {
00300             return m_size;
00301         }
00302 
00304         bool empty()const {
00305             return m_size == 0;
00306         }
00307 
00309 
00315         value_type operator[](size_type i)const {
00316             size_type level = 0;
00317             while (m_b[(i>>1) + m_b_border[level]]) {
00318                 i = m_b_rank((i>>1) + m_b_border[level]) - m_b_border_rank[level];
00319                 ++level;
00320             }
00321             size_type zeros = (i>>1) + m_b_border[level] - m_b_rank((i>>1) + m_b_border[level]);
00322             return m_wt[(zeros<<1) + (i&1)];
00323         };
00324 
00326 
00334         size_type rank(size_type i, value_type c)const {
00335             value_type cc    = m_char2comp[c];
00336             if (((size_type)cc) >= m_char_occ.size()) { // char does not occur
00337                 return 0;
00338             }
00339             size_type  res   = 0;
00340             size_type  level = 0;
00341             size_type  added = 0;
00342             size_type cs     = 0;
00343             while (i>0 and cs != m_char_occ[cc]) {
00344                 size_type ones  = m_b_rank((i>>1) + m_b_border[level]); // # of ones till this position
00345                 size_type zeros = m_b_border[level] + (i>>1) - ones;  // # of zeros till this position
00346                 res += ((cs=m_wt.rank(zeros<<1, c)) - (m_wt_rank[cc*m_b_border.size() + level])) << level;
00347                 if (i & 1) {//  i is odd
00348                     if (m_b[(i>>1) + m_b_border[level]]) {
00349                         ++i;
00350                         added += (1<<level);
00351                         ++ones;
00352                     } else {
00353                         if (m_wt[zeros<<1] == c) {
00354                             res += (1<<level) - added;
00355                         }
00356                         added = 0;
00357                     }
00358                 } else { // i is even
00359                     if (added > 0 and m_b[(i>>1) + m_b_border[level] - 1] == 0) {
00360                         if (m_wt[(zeros<<1)-1] == c) {
00361                             res -= added;
00362                         }
00363                         added = 0;
00364                     }
00365                 }
00366                 i = ones - m_b_border_rank[level];
00367                 ++level;
00368             }
00369             return res;
00370         };
00371 
00373 
00380         size_type rank_ith_symbol(size_type i, value_type& c)const {
00381             return rank(i, c=(*this)[i]);
00382         }
00383 
00385 
00393         size_type select(size_type i, value_type c)const {
00394             if (((size_type)m_char2comp[c]) >= m_char_occ.size()) { // char does not occur
00395                 return size();
00396             }
00397             size_type lb = 0, rb = m_size;  // lb inclusive, rb exclusive
00398             while (rb > lb) {
00399                 size_type m = (lb+rb)>>1;
00400                 if (rank(m+1, c) < i) {
00401                     lb = m+1;
00402                 } else { // rank(m+1,c) >= i
00403                     rb = m;
00404                 }
00405             }
00406             return lb;
00407         };
00408 
00410         size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const {
00411             structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
00412             size_type written_bytes = 0;
00413             written_bytes += util::write_member(m_size, out, child, "size");
00414             written_bytes += m_wt.serialize(out, child, "wt");
00415             written_bytes += m_b.serialize(out, child, "b");
00416             written_bytes += m_b_rank.serialize(out, child, "rank");
00417             written_bytes += m_b_border.serialize(out, child, "b_border");
00418             written_bytes += m_b_border_rank.serialize(out, child, "b_border_rank");
00419             written_bytes += m_wt_rank.serialize(out, child, "wt_rank");
00420             written_bytes += m_char2comp.serialize(out, child, "char2comp");
00421             written_bytes += m_char_occ.serialize(out, child, "char_occ");
00422             written_bytes += util::write_member(m_sigma, out, child, "sigma");
00423             structure_tree::add_size(child, written_bytes);
00424             return written_bytes;
00425         }
00426 
00428         void load(std::istream& in) {
00429             util::read_member(m_size, in);
00430             m_wt.load(in);
00431             m_b.load(in);
00432             m_b_rank.load(in, &m_b);
00433             m_b_border.load(in);
00434             m_b_border_rank.load(in);
00435             m_wt_rank.load(in);
00436             m_char2comp.load(in);
00437             m_char_occ.load(in);
00438             util::read_member(m_sigma, in);
00439         }
00440 
00441 };
00442 
00443 
00444 }// end namespace sdsl
00445 
00446 #endif // end file