SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
 All Classes Namespaces Files Functions Variables Typedefs Friends
sdsl/include/sdsl/wt_fixed_block.hpp
00001 
00002 
00003 /*
00004  *   template parameter block size
00005  *
00006  * average access time \f$ \Order{H_k} \f$
00007  * References:
00008  *    Juha K"arkk"ainen and Simon J. Puglisi "Fixed Block Compression Boosting in FM-Indexes", SPIRE 2011
00009  *
00010  * Implementation for small alphabet size (<=256)
00011 
00012 
00013  *
00014  */
00015 
00016 #ifndef INCLUDED_SDSL_WT_FIXED_BLOCK
00017 #define INCLUDED_SDSL_WT_FIXED_BLOCK
00018 
00019 #include "int_vector.hpp"
00020 #include "wt_helper.hpp"
00021 #include "bp_support.hpp"
00022 
00023 namespace sdsl
00024 {
00025 
00026 template<uint32_t block_size = 1<<15, class bp_support_type = bp_support_sada<> >
00027 class wt_fixed_block
00028 {
00029     public:
00030         typedef bit_vector::size_type size_type;
00031     private:
00032         size_type                       m_size; // stores the size of the sequence
00033         size_type                       m_blk_cnt; // stores the number of blocks
00034         uint16_t                        m_sigma; // size of the effective alphabet
00035         uint16_t                        m_c_to_nr; // map symbol c to a unqiue number in the range [0..m_sigma-1]
00036         bit_vector                      m_occ; // indicator bit vector for occurences of a character c
00037         // in a block i; if m_occ[m_c_to_nr[c]*(m_blk_cnt+1) + i +1] = 1
00038         bit_vector                      m_bp;  // the concatenated balanced parentheses sequences for the
00039         // Huffman trees of the blocks
00040         bp_support_type         m_bp_support; // supporting data structure for m_bp
00041         int_vector<>        m_bp_pointers; // for each block, we store a pointer to the start
00042         // of the corrsponding start of the tree representation in m_bp
00043         bit_vector                      m_tree; // store the concatenated bit vectors of the wavelet trees of
00044         // the blocks
00045         int_vector<>        m_tree_pointers; // for each block, we store a pointer to the start
00046         // of the
00047 
00048         // TODO: * for each node in a huffman shaped wt, we have to save the size of the right nodes
00049         //       * for each occurring character we have to store the number of its leaf
00050         //           or the position ... number of leaf is more space efficient but requires an additional select
00051         int_vector<>            m_leaf_nr;
00052 
00053         // if alphabet size is <= 4,text almost random, and block size not very larg,
00054         // use scanning with broadword methods b1Cnt, b11Cnt,... to answer questions
00055 
00056         // if alphabet size equals 1, do not store m_tree, all answers can be
00057         // calculated easily
00058 
00059         // if the alphabet size is 2, we can decide with m_leaf_nr, which char is encoded as 1 or 0.
00060     public:
00061 
00063         template<class size_type_class>
00064         wt_fixed_block(int_vector_file_buffer<8, size_type_class>& text, size_type size) {
00065             m_size = size;
00066             if (m_size == 0)
00067                 return;
00068             // calculate number of blocks
00069             size_type m_blk_cnt = (m_size + block_size - 1) / block_size;
00070             // caculate the number of occurences of each character in the text
00071             size_type C[256] = {0};
00072             calculate_character_occurences(text, m_size, C);
00073             // calculate the size of the effective alphabet
00074             calculate_effective_alphabet_size(C, m_sigma);
00075 
00076             m_occ.resize(m_sigma * (m_blk_cnt+1));   // +1 since we also store a dummy element for block[-1]
00077 
00078             // initialize m_c_to_nr
00079             uint16_t nr=0;
00080             for (size_type i=0; i<256; ++i) {
00081                 if (C[i]>0) {
00082                     m_c_to_nr[i] = nr++;
00083                 } else {
00084                     m_c_to_nr[i] = _undef_node;
00085                 }
00086             }
00087 
00088 
00089         }
00090 }
00091 
00092 } // end namespace sdsl
00093 
00094 #endif