SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
 All Classes Namespaces Files Functions Variables Typedefs Friends
sdsl/include/sdsl/wt_rlg8.hpp
Go to the documentation of this file.
00001 /* sdsl - succinct data structures library
00002     Copyright (C) 2011 Simon Gog
00003 
00004     This program is free software: you can redistribute it and/or modify
00005     it under the terms of the GNU General Public License as published by
00006     the Free Software Foundation, either version 3 of the License, or
00007     (at your option) any later version.
00008 
00009     This program is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012     GNU General Public License for more details.
00013 
00014     You should have received a copy of the GNU General Public License
00015     along with this program.  If not, see http://www.gnu.org/licenses/ .
00016 */
00022 #ifndef INCLUDED_SDSL_WT_RLG8
00023 #define INCLUDED_SDSL_WT_RLG8
00024 
00025 #include "int_vector.hpp"
00026 #include "rank_support_v.hpp"
00027 #include "rank_support_v5.hpp"
00028 #include "select_support_mcl.hpp"
00029 #include "select_support_bs.hpp"
00030 #include "bitmagic.hpp"
00031 #include "util.hpp"
00032 #include "wt_huff.hpp"
00033 #include <algorithm> // for std::swap
00034 #include <stdexcept>
00035 #include <vector>
00036 #include <utility> // for pair
00037 #include <queue>
00038 
00039 
00040 #ifdef SDSL_DEBUG
00041 #define SDSL_DEBUG_WAVELET_TREE_HUFFMAN_RLN4
00042 #endif
00043 
00044 
00046 namespace sdsl
00047 {
00048 
00049 typedef wt_huff<bit_vector, rank_support_v5<>, select_support_bs<>, select_support_bs<> > wt_without_select;
00050 
00052 
00066 template<class RankSupport = rank_support_v5<>, class WaveletTree = wt_without_select >
00067 class wt_rlg8
00068 {
00069     public:
00070         typedef int_vector<>::size_type size_type;
00071         typedef unsigned char                   value_type;
00072         typedef RankSupport                             rank_support_type;
00073         typedef WaveletTree             wt_type;
00074 
00075     private:
00076         size_type                               m_size;         // size of the original input sequence
00077         wt_type                                 m_wt;           // wavelet tree for all levels
00078         bit_vector                              m_b;            // bit vector which indicates if a pair consists of
00079         // two equal chars
00080         rank_support_type               m_b_rank;       // rank support for vector b
00081         int_vector<64>          m_b_border_rank;// Vector in which we store the rank values of m_b at the
00082         // border positions.
00083         int_vector<64>          m_b_border;       // Vector in which we store the borders of the different levels
00084         // Takes \f$\Order{\max(1, \log L)\log n}\f$ bits.
00085         int_vector<64>          m_wt_rank;  // Vector in which we store the rank value for each character
00086         // and each border.
00087         // Takes \f$\Order{\sigma\max(1, \log L)\log n}\f bits
00088         int_vector<8>                   m_char2comp;    //
00089         int_vector<64>          m_char_occ;     //
00090         uint16_t                                m_sigma;
00091 
00092         void copy(const wt_rlg8& wt) {
00093             m_size          = wt.m_size;
00094             m_wt            = wt.m_wt;
00095             m_b             = wt.m_b;
00096             m_b_rank        = wt.m_b_rank;
00097             m_b_rank.set_vector(&m_b);
00098             m_b_border_rank = wt.m_b_border_rank;
00099             m_b_border      = wt.m_b_border;
00100             m_wt_rank       = wt.m_wt_rank;
00101             m_char2comp     = wt.m_char2comp;
00102             m_char_occ      = wt.m_char_occ;
00103             m_size                      = wt.m_size;
00104         }
00105 
00106     public:
00107 
00108         const uint16_t& sigma;
00109 
00110         // Default constructor
00111         wt_rlg8():m_size(0), m_sigma(0), sigma(m_sigma) {};
00112 
00113 
00114 
00116 
00122         wt_rlg8(const unsigned char* rac, size_type size):m_size(size), m_sigma(0), sigma(m_sigma) {
00123             std::cerr << "ERROR: Constructor of wt_rlg8 not implemented yet!!!" << std::endl;
00124             throw std::logic_error("This constructor of wt_rlg8 is not yet implemented!");
00125         }
00126 
00127         template<class size_type_class>
00128         wt_rlg8(int_vector_file_buffer<8, size_type_class>& rac, size_type size):m_size(size), m_sigma(0), sigma(m_sigma) {
00129             construct(rac, size);
00130         }
00131 
00133 
00136         template<class size_type_class>
00137         void construct(int_vector_file_buffer<8, size_type_class>& rac, size_type size) {
00138             m_size = size;
00139             typedef size_type_class size_type;
00140             // TODO: remove absolute file name
00141             std::string temp_file = "wt_rlg8_" + util::to_string(util::get_pid()) + "_" + util::to_string(util::get_id());
00142             std::ofstream wt_out(temp_file.c_str(), std::ios::binary | std::ios::trunc);
00143             size_type bit_cnt=0;
00144             wt_out.write((char*)&bit_cnt, sizeof(bit_cnt)); // initial dummy write
00145 
00146             m_b = bit_vector(size/4+1,0);
00147             uint8_t* next_bwt = new uint8_t[size/4+4];
00148 //              bit_vector same_prev_char(size/2+1,0); //
00149 
00150             m_b_border.resize(bit_magic::l1BP(size) + 1);
00151             m_b_border[0] = 0;
00152 
00153             int m=0;
00154 
00155             rac.reset();
00156             bit_vector b_sigma(256, 0);
00157             uint8_t last_c[9] = {0,1,0,1,0,1,0,1};
00158             uint8_t c = '\0';
00159             size_type b_cnt = 0, pair1cnt=0, pair0cnt=0;
00160             for (size_type i=0, r=0, r_sum=0; r_sum < size;) {
00161                 if (r_sum + r > size) {  // read not more than size chars in the next loop
00162                     r = size-r_sum;
00163                 }
00164                 for (; i < r+r_sum; ++i) {
00165                     c = rac[i-r_sum];
00166                     b_sigma[c] = 1;
00167                     last_c[i&7ULL] = c;
00168                     if ((i & 7ULL)==7) {
00169                         if (last_c[0] == last_c[1] and last_c[2] == last_c[3] and last_c[0] == last_c[2] and
00170                             last_c[4] == last_c[5] and last_c[6] == last_c[7] and last_c[4] == last_c[6]  and
00171                             last_c[0] == last_c[4]
00172                            ) { // join octtrupel
00173                             m_b[b_cnt] = 1;
00174                             next_bwt[pair1cnt] = c;
00175                             ++pair1cnt;
00176                         } else { // write pair to stream
00177                             //m_b[b_cnt] = 0; // since m_b is initialized to zero, this is not necessary
00178                             wt_out.write((char*)last_c, 8*sizeof(c));
00179                             ++pair0cnt;
00180                         }
00181                         ++b_cnt;
00182                     }
00183                 }
00184                 r_sum += r;
00185                 r = rac.load_next_block();
00186             }
00187             if (size & 7ULL) {
00188                 wt_out.write((char*)&last_c, 8*sizeof(c));
00189                 ++pair0cnt;
00190                 ++b_cnt;
00191             }
00192             m_sigma = 0;
00193             for (size_type i=0; i<b_sigma.size(); ++i) {
00194                 m_sigma += b_sigma[i];
00195             }
00196 
00197             uint32_t level = 0;
00198             //  handle remaining levels
00199             while (pair1cnt > 0) {
00200                 ++m;
00201                 m_b_border[++level] = b_cnt;
00202                 size_type level_size = pair1cnt;
00203                 pair1cnt = 0;
00204                 for (size_type i=7; i < level_size; i+=8) {
00205                     if (next_bwt[i] == next_bwt[i-1] and next_bwt[i-2] == next_bwt[i-3] and next_bwt[i] == next_bwt[i-2] and
00206                         next_bwt[i-4] == next_bwt[i-5] and next_bwt[i-6] == next_bwt[i-7] and next_bwt[i-4] == next_bwt[i-6] and
00207                         next_bwt[i] == next_bwt[i-4]) {
00208                         m_b[b_cnt] = 1;
00209                         next_bwt[pair1cnt] = next_bwt[i];
00210                         ++pair1cnt;
00211                     } else {
00212                         //m_b[b_cnt] = 0; // since m_b is initialized to zero, this is not necessary
00213                         wt_out.write((char*)next_bwt+i-7, 8*sizeof(c));
00214                         ++pair0cnt;
00215                     }
00216                     ++b_cnt;
00217                 }
00218                 if (level_size & 7ULL) { // handle last element
00219                     wt_out.write((char*)next_bwt + (level_size/8)*8, (level_size&7ULL)*sizeof(c));
00220                     wt_out.write((char*)"\0\0\0\0\0\0\0\0", 8-(level_size&7ULL));
00221                     ++pair0cnt;
00222                     ++b_cnt;
00223                 }
00224             }
00225             delete [] next_bwt;
00226             m_b.resize(b_cnt);
00227             m_b_border.resize(level+1);
00228 
00229             wt_out.seekp(0, std::ios::beg);
00230             bit_cnt = (8*pair0cnt)*8;
00231             wt_out.write((char*)&bit_cnt, sizeof(bit_cnt));
00232             wt_out.close();
00233 
00234             {
00235                 int_vector_file_buffer<8, size_type> temp_bwt_buf(temp_file.c_str());
00236                 m_wt.construct(temp_bwt_buf, temp_bwt_buf.int_vector_size);
00237             }
00238 
00239             util::init_support(m_b_rank, &m_b);
00240             m_b_border_rank.resize(m_b_border.size());
00241 
00242             for (size_type i=0; i<m_b_border.size(); ++i) {
00243                 m_b_border_rank[i] = m_b_rank.rank(m_b_border[i]);
00244             }
00245 
00246             m_char2comp = int_vector<8>(256,255);
00247             for (uint16_t c=0, cnt=0; c<256; ++c) {
00248                 if (b_sigma[c]) {
00249                     m_char2comp[c] = cnt++;
00250                 }
00251             }
00252 
00253 
00254             m_wt_rank.resize(m_sigma * m_b_border.size());
00255             m_char_occ.resize(m_sigma);
00256             for (size_type c=0; c < 256; ++c) {
00257                 uint16_t cc = m_char2comp[c];
00258                 if (cc < m_sigma) {
00259                     for (size_type i=0; i < m_b_border.size(); ++i) {
00260                         size_type zeros  = m_b_border[i] - m_b_border_rank[i];
00261                         m_wt_rank[cc * m_b_border.size() + i] = m_wt.rank(8*zeros, c);
00262                     }
00263                     m_char_occ[cc] = m_wt.rank(m_wt.size(), c);
00264                 }
00265             }
00266             std::remove(temp_file.c_str());
00267         }
00268 
00270         wt_rlg8(const wt_rlg8& wt):sigma(m_sigma) {
00271             copy(wt);
00272         }
00273 
00275         wt_rlg8& operator=(const wt_rlg8& wt) {
00276             if (this != &wt) {
00277                 copy(wt);
00278             }
00279             return *this;
00280         }
00281 
00283         void swap(wt_rlg8& wt) {
00284             if (this != &wt) {
00285                 std::swap(m_size, wt.m_size);
00286                 m_wt.swap(wt.m_wt);
00287                 m_b.swap(wt.m_b);
00288                 util::swap_support(m_b_rank, wt.m_b_rank, &m_b, &(wt.m_b));
00289                 m_b_border.swap(wt.m_b_border);
00290                 m_b_border_rank.swap(wt.m_b_border_rank);
00291                 m_wt_rank.swap(wt.m_wt_rank);
00292                 m_char2comp.swap(wt.m_char2comp);
00293                 m_char_occ.swap(wt.m_char_occ);
00294                 std::swap(m_sigma, wt.m_sigma);
00295             }
00296         }
00297 
00299         size_type size()const {
00300             return m_size;
00301         }
00302 
00304         bool empty()const {
00305             return m_size == 0;
00306         }
00307 
00309 
00315         value_type operator[](size_type i)const {
00316             size_type level = 0;
00317             while (m_b[(i>>3) + m_b_border[level]]) {
00318                 i = m_b_rank((i>>3) + m_b_border[level]) - m_b_border_rank[level];
00319                 ++level;
00320             }
00321             size_type zeros = (i>>3) + m_b_border[level] - m_b_rank((i>>3) + m_b_border[level]);
00322             return m_wt[(zeros<<3) + (i&7ULL)];
00323         };
00324 
00326 
00334         size_type rank(size_type i, value_type c)const {
00335             value_type cc    = m_char2comp[c];
00336             if (((size_type)cc) >= m_char_occ.size()) { // char does not occur
00337                 return 0;
00338             }
00339             size_type  res   = 0;
00340             size_type  level = 0;
00341             size_type  added = 0;
00342             size_type cs     = 0;
00343             while (i>0 and cs != m_char_occ[cc]) {
00344                 size_type ones  = m_b_rank((i>>3) + m_b_border[level]); // # of ones till this position
00345                 size_type zeros = m_b_border[level] + (i>>3) - ones;  // # of zeros till this position
00346                 res += ((cs=m_wt.rank((zeros<<3), c)) - (m_wt_rank[cc*m_b_border.size() + level])) << ((level<<1)+level);
00347                 if (i & 7ULL) {//  i not a multiple of 8
00348                     if (m_b[(i>>3) + m_b_border[level]]) {
00349                         added += (1<<((level<<1)+level)) * (8-(i & 7ULL));
00350                         i += (8-(i & 7ULL));
00351                         ++ones;
00352                     } else {
00353                         if (m_wt[(zeros<<3) + (i&7ULL) - 1] == c) {
00354                             res += (1<<((level<<1)+level)) - added;
00355                         }
00356                         if ((i&7ULL) > 1) {
00357                             size_type cnt = m_wt.rank((zeros<<3) + (i&7ULL) - 1, c) - cs;
00358                             res += (cnt<<((level<<1)+level));
00359                         }
00360                         added = 0;
00361                     }
00362                 } else { // i is a multiple of 8
00363                     if (added > 0 and m_b[(i>>3) + m_b_border[level] - 1] == 0) {
00364                         if (m_wt[(zeros<<3)-1] == c) {
00365                             res -= added;
00366                         }
00367                         added = 0;
00368                     }
00369                 }
00370                 i = ones - m_b_border_rank[level];
00371                 ++level;
00372             }
00373             return res;
00374         };
00375 
00377 
00384         size_type rank_ith_symbol(size_type i, value_type& c)const {
00385             return rank(i, c=(*this)[i]);
00386         }
00387 
00389 
00397         size_type select(size_type i, value_type c)const {
00398             if (((size_type)m_char2comp[c]) >= m_char_occ.size()) { // char does not occur
00399                 return size();
00400             }
00401             size_type lb = 0, rb = m_size;  // lb inclusive, rb exclusive
00402             while (rb > lb) {
00403                 size_type m = (lb+rb)>>1;
00404                 if (rank(m+1, c) < i) {
00405                     lb = m+1;
00406                 } else { // rank(m+1,c) >= i
00407                     rb = m;
00408                 }
00409             }
00410             return lb;
00411         };
00412 
00414         size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const {
00415             structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
00416             size_type written_bytes = 0;
00417             written_bytes += util::write_member(m_size, out, child, "size");
00418             written_bytes += m_wt.serialize(out, child, "wt");
00419             written_bytes += m_b.serialize(out, child, "b");
00420             written_bytes += m_b_rank.serialize(out, child, "b_rank");
00421             written_bytes += m_b_border.serialize(out, child, "b_border");
00422             written_bytes += m_b_border_rank.serialize(out, child, "b_border_rank");
00423             written_bytes += m_wt_rank.serialize(out, child, "wt_rank");
00424             written_bytes += m_char2comp.serialize(out, child, "char2comp");
00425             written_bytes += m_char_occ.serialize(out, child, "char_occ");
00426             written_bytes += util::write_member(m_sigma, out, child, "sigma");
00427             structure_tree::add_size(child, written_bytes);
00428             return written_bytes;
00429         }
00430 
00432         void load(std::istream& in) {
00433             util::read_member(m_size, in);
00434             m_wt.load(in);
00435             m_b.load(in);
00436             m_b_rank.load(in, &m_b);
00437             m_b_border.load(in);
00438             m_b_border_rank.load(in);
00439             m_wt_rank.load(in);
00440             m_char2comp.load(in);
00441             m_char_occ.load(in);
00442             util::read_member(m_sigma, in);
00443         }
00444 };
00445 
00446 
00447 }// end namespace sdsl
00448 
00449 #endif // end file