SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 00002 00003 /* 00004 * template parameter block size 00005 * 00006 * average access time \f$ \Order{H_k} \f$ 00007 * References: 00008 * Juha K"arkk"ainen and Simon J. Puglisi "Fixed Block Compression Boosting in FM-Indexes", SPIRE 2011 00009 * 00010 * Implementation for small alphabet size (<=256) 00011 00012 00013 * 00014 */ 00015 00016 #ifndef INCLUDED_SDSL_WT_FIXED_BLOCK 00017 #define INCLUDED_SDSL_WT_FIXED_BLOCK 00018 00019 #include "int_vector.hpp" 00020 #include "wt_helper.hpp" 00021 #include "bp_support.hpp" 00022 00023 namespace sdsl 00024 { 00025 00026 template<uint32_t block_size = 1<<15, class bp_support_type = bp_support_sada<> > 00027 class wt_fixed_block 00028 { 00029 public: 00030 typedef bit_vector::size_type size_type; 00031 private: 00032 size_type m_size; // stores the size of the sequence 00033 size_type m_blk_cnt; // stores the number of blocks 00034 uint16_t m_sigma; // size of the effective alphabet 00035 uint16_t m_c_to_nr; // map symbol c to a unqiue number in the range [0..m_sigma-1] 00036 bit_vector m_occ; // indicator bit vector for occurences of a character c 00037 // in a block i; if m_occ[m_c_to_nr[c]*(m_blk_cnt+1) + i +1] = 1 00038 bit_vector m_bp; // the concatenated balanced parentheses sequences for the 00039 // Huffman trees of the blocks 00040 bp_support_type m_bp_support; // supporting data structure for m_bp 00041 int_vector<> m_bp_pointers; // for each block, we store a pointer to the start 00042 // of the corrsponding start of the tree representation in m_bp 00043 bit_vector m_tree; // store the concatenated bit vectors of the wavelet trees of 00044 // the blocks 00045 int_vector<> m_tree_pointers; // for each block, we store a pointer to the start 00046 // of the 00047 00048 // TODO: * for each node in a huffman shaped wt, we have to save the size of the right nodes 00049 // * for each occurring character we have to store the number of its leaf 00050 // or the position ... number of leaf is more space efficient but requires an additional select 00051 int_vector<> m_leaf_nr; 00052 00053 // if alphabet size is <= 4,text almost random, and block size not very larg, 00054 // use scanning with broadword methods b1Cnt, b11Cnt,... to answer questions 00055 00056 // if alphabet size equals 1, do not store m_tree, all answers can be 00057 // calculated easily 00058 00059 // if the alphabet size is 2, we can decide with m_leaf_nr, which char is encoded as 1 or 0. 00060 public: 00061 00063 template<class size_type_class> 00064 wt_fixed_block(int_vector_file_buffer<8, size_type_class>& text, size_type size) { 00065 m_size = size; 00066 if (m_size == 0) 00067 return; 00068 // calculate number of blocks 00069 size_type m_blk_cnt = (m_size + block_size - 1) / block_size; 00070 // caculate the number of occurences of each character in the text 00071 size_type C[256] = {0}; 00072 calculate_character_occurences(text, m_size, C); 00073 // calculate the size of the effective alphabet 00074 calculate_effective_alphabet_size(C, m_sigma); 00075 00076 m_occ.resize(m_sigma * (m_blk_cnt+1)); // +1 since we also store a dummy element for block[-1] 00077 00078 // initialize m_c_to_nr 00079 uint16_t nr=0; 00080 for (size_type i=0; i<256; ++i) { 00081 if (C[i]>0) { 00082 m_c_to_nr[i] = nr++; 00083 } else { 00084 m_c_to_nr[i] = _undef_node; 00085 } 00086 } 00087 00088 00089 } 00090 } 00091 00092 } // end namespace sdsl 00093 00094 #endif