SDSL: Succinct Data Structure Library: sdsl/include/sdsl/bitmagic.hpp Source File

Go to the documentation of this file.
00001 /* sdsl - succinct data structures library
00002     Copyright (C) 2008 Simon Gog
00003 
00004     This program is free software: you can redistribute it and/or modify
00005     it under the terms of the GNU General Public License as published by
00006     the Free Software Foundation, either version 3 of the License, or
00007     (at your option) any later version.
00008 
00009     This program is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012     GNU General Public License for more details.
00013 
00014     You should have received a copy of the GNU General Public License
00015     along with this program.  If not, see http://www.gnu.org/licenses/ .
00016 */
00021 #ifndef INCLUDED_SDSL_BITMAGIC
00022 #define INCLUDED_SDSL_BITMAGIC
00023 
00024 #include <stdint.h> // for uint64_t uint32_t declaration
00025 #include <iostream>// for cerr
00026 #include <cassert>
00027 #ifdef __SSE4_2__
00028 #include <xmmintrin.h>
00029 #endif
00030 
00032 namespace sdsl
00033 {
00034 
00036 
00050 class bit_magic
00051 {
00052     private:
00053         bit_magic(); // This helper class can not be instantiated
00054     public:
00056         static const int64_t  All1Mask = -1LL;
00057 
00059 
00064         static const uint64_t DeBruijn64 = 0x0218A392CD3D5DBFULL;
00065 
00067 
00069         static const uint32_t DeBruijn64ToIndex[64];
00070 
00072         static const uint64_t Fib[92];
00073 
00075         static const uint8_t B1CntBytes[256];
00076 
00078 
00080         static const uint32_t L1BP[256];
00081 
00083         static const uint64_t Li1Mask[65];
00084 
00086         static const uint64_t Li0Mask[65];
00087 
00088         static const uint8_t lookupr1BP[256];
00089 
00091 
00094         static const uint8_t Select256[256*8];
00095 
00096         static const uint64_t PsOverflow[65];
00097 
00098         static const uint32_t powerOf3[9];
00099 
00100         static const uint8_t max_excess_8bit[256];
00101 
00103         static const uint64_t _8_x_the_byte[65];
00104 
00105         static const uint8_t cover0[1];
00106         static const uint8_t cover1[2];
00107         static const uint8_t cover2[3];
00108         static const uint8_t cover3[4];
00109         static const uint8_t cover4[5];
00110         static const uint8_t cover5[7];
00111         static const uint8_t cover6[9];
00112         static const uint8_t cover7[13];
00113         static const uint8_t cover8[20];
00114 
00115         static const uint8_t* covers[9];
00116 
00117         static const uint32_t cover_sizes[9];
00118 
00119 
00120         static void generate_first_pos_of_excess_val() {
00121             for (int i=0; i<9; ++i) {
00122                 for (int j=0; j<256; ++j) {
00123                     int x = j;
00124                     int excess = 0;
00125                     int found=0;
00126                     for (int k=0; k<8 and !found; ++k, x>>=1) {
00127                         if (x&1)
00128                             excess++;
00129                         else
00130                             excess--;
00131                         if (excess == i) {
00132                             std::cout<<k<<",";
00133                             found=1;
00134                         }
00135                     }
00136                     if (!found)
00137                         std::cout<<0<<",";
00138                     if ((j+1)%16==0)
00139                         std::cout<<std::endl;
00140                 }
00141                 std::cout<<std::endl;
00142             }
00143         }
00144 
00145         static const uint8_t first_pos_of_excess_val[256*9];
00146 
00147         static void generate_last_pos_of_excess_val() {
00148             std::cout<<"const uint8_t bit_magic::last_pos_of_excess_val[]={"<<std::endl;
00149             for (int i=-8; i<9; ++i) {
00150                 for (int j=0; j<256; ++j) {
00151                     int x = j;
00152                     int excess = 0;
00153                     int idx=0;
00154                     for (int k=0; k<8; ++k, x>>=1) {
00155                         if (x&1)
00156                             excess++;
00157                         else
00158                             excess--;
00159                         if (excess == i) {
00160                             idx=k;
00161                         }
00162                     }
00163                     std::cout<<idx;
00164                     if (i* j!=8*255)
00165                         std::cout<<",";
00166                     if ((j+1)%16==0)
00167                         std::cout<<std::endl;
00168                 }
00169                 std::cout<<std::endl;
00170             }
00171             std::cout<<"};"<<std::endl;
00172         }
00173 
00174         static const uint8_t last_pos_of_excess_val[256*17];
00175 
00176         static void generate_very_near_find_open() {
00177             std::cout<<"const uint8_t bit_magic::very_near_find_open[]={"<<std::endl;
00178             for (int w=0; w<256; ++w) {
00179                 int excess_val=0;
00180                 for (int i=0; i<8; ++i) {
00181                     if ((w>>i)&1)
00182                         ++excess_val;
00183                     else
00184                         --excess_val;
00185                 }
00186                 int res=0;
00187                 int cur_excess_val=0;
00188                 for (int i=0; i<8; ++i) {
00189                     if ((w>>i)&1)
00190                         ++cur_excess_val;
00191                     else
00192                         --cur_excess_val;
00193                     if (cur_excess_val==excess_val-1)
00194                         res = 7-i;
00195                 }
00196                 std::cout<<res;
00197                 if (w!=255)
00198                     std::cout<<",";
00199                 if ((w+1)%16==0)
00200                     std::cout<<std::endl;
00201             }
00202             std::cout<<"};"<<std::endl;
00203         }
00204 
00206 
00210         static const uint8_t very_near_find_open[256];
00211 
00212         static void generate_very_near_enclose() {
00213             std::cout<<"const uint8_t bit_magic::very_near_enclose[]={"<<std::endl;
00214             for (int w=0; w<256; ++w) {
00215                 int excess_val = 0, res=0;
00216                 for (int i=7; i>=0 and !res; --i) {
00217                     if (w&(1<<i))
00218                         ++excess_val;
00219                     else
00220                         --excess_val;
00221                     if (excess_val==1)
00222                         res = 8-i;
00223                 }
00224                 std::cout<<res;
00225                 if (w!=255)
00226                     std::cout<<",";
00227                 if ((w+1)%16==0)
00228                     std::cout<<std::endl;
00229             }
00230             std::cout<<"};"<<std::endl;
00231         }
00232 
00233         static const uint8_t very_near_enclose[256];
00235 
00240         static uint64_t b1Cnt(uint64_t x);
00241 
00243 
00249         static uint32_t b1Cnt32(uint32_t x);
00250 
00252         static uint32_t b1CntNaive(uint64_t x);
00253 
00255 
00259         static uint32_t b11Cnt(uint64_t x, uint64_t& c);
00260 
00261         static uint32_t b11CntS(uint64_t x, uint64_t& c);
00262         static uint32_t b11CntS(uint64_t x);
00263 
00265 
00268         static uint32_t b11Cnt(uint64_t x);
00269 
00271         static uint32_t b11CntNaive(uint64_t x);
00272 
00274         static uint32_t eB11Cnt(uint64_t x);
00275 
00277 
00281         static uint32_t b10Cnt(uint64_t x, uint64_t& c);
00282 
00284 
00288         static uint32_t b01Cnt(uint64_t x, uint64_t& c);
00289 
00290         static uint32_t b10CntNaive(uint64_t x, uint64_t& c);
00291 
00293         static uint64_t b10Map(uint64_t x, uint64_t c=0);
00294 
00296         static uint64_t b01Map(uint64_t x, uint64_t c=1);
00297 
00299 
00305         static uint32_t i1BP(uint64_t x, uint32_t i);
00306         static uint32_t i1BP2(uint64_t x, uint32_t i);
00307 
00309 
00311         static uint32_t j1BP(uint64_t x, uint32_t j);
00312 
00314 
00316         static uint32_t k1BP(uint64_t x, uint32_t j);
00317 
00318 
00319 
00321         static uint32_t i1BPNaive(uint64_t x, uint32_t i);
00322 
00324 
00329         static uint32_t l1BP(uint64_t x);
00330 
00332         static uint32_t l1BPNaive(uint64_t x);
00333 
00335 
00341         static uint32_t r1BP(uint64_t x);
00342 
00344         static uint32_t r1BPNaive(uint64_t x);
00345 
00347 
00355         static uint32_t i11BP(uint64_t x, uint32_t i, uint32_t c=0);
00356 
00358         static uint32_t i11BPNaive(uint64_t x, uint32_t i) {
00359             for (uint32_t j=0; j<63; ++j) {
00360                 if ((x&3)==3) {
00361                     i--;
00362                     if (!i) return j+1;
00363                     x>>=1;
00364                     ++j;
00365                 }
00366                 x>>=1;
00367             }
00368             return 63;
00369         }
00370 
00371         static uint64_t all11BPs(uint64_t x, bool& c);
00372 
00374         static uint32_t eI11BP(uint64_t x, uint32_t i);
00375 
00377 
00383         static uint32_t l11BP(uint64_t x);
00384 
00386         static void write_int(uint64_t* word, uint64_t x, const uint8_t offset=0, const uint8_t len=64);
00387 //              static void writeInt2(uint64_t *word, uint64_t x, const uint8_t offset=0, const uint8_t len=64);
00388 
00390         static void write_int_and_move(uint64_t*& word, uint64_t x, uint8_t& offset, const uint8_t len);
00391 
00393         static uint64_t read_int(const uint64_t* word, uint8_t offset=0, const uint8_t len=64);
00394 
00396         static uint64_t read_int_and_move(const uint64_t*& word, uint8_t& offset, const uint8_t len=64);
00397 
00399         static uint64_t readUnaryInt(const uint64_t* word, uint8_t offset=0);
00400 
00402         static uint64_t readUnaryIntAndMove(const uint64_t*& word, uint8_t& offset);
00403 
00405 
00411         static void move_right(const uint64_t*& word, uint8_t& offset, const uint8_t len);
00413 
00419         static void move_left(const uint64_t*& word, uint8_t& offset, const uint8_t len);
00420 
00422         static uint64_t next(const uint64_t* word, uint64_t idx);
00423 
00425         static uint64_t prev(const uint64_t* word, uint64_t idx);
00426 
00427         /*              // precalc the deBruijn64ToIndex table
00428                         static const void calcDeBruijnHash(uint64_t h[]){
00429                                 h[0] = 0;
00430                                 for(int i=1; i<64; ++i){
00431                                         h[((1ULL<<i)*DeBruijn64)>>58] = i;
00432                                 }
00433                         }
00434         */
00435         /*              // precalc the lookup for select256
00436                         static const void select256_table(uint64_t table[]){
00437                                 for(int ones=1;ones<=8;++ones){
00438                                         for(uint16_t i=0;i<256;++i){
00439                                                 uint16_t tmp = i, res=0, one_occs=0;
00440                                                 while(tmp){
00441                                                         if(tmp&1)
00442                                                                 if( ++one_occs==ones)
00443                                                                         break;
00444                                                         tmp>>=1;
00445                                                         ++res;
00446                                                 }
00447                                                 if(one_occs!=ones)
00448                                                         res = 0;
00449                                                 table[((ones-1)<<8) + i] = res;
00450                                         }
00451                                 }
00452                         }
00453         */
00454         // precalac for PSoverflow
00455         /*              static const void psoverflow_table(uint64_t table[]){
00456                                 for(int i=0; i<=64; ++i){
00457                                         table[i]= 128-i;
00458                                         for(int j=1;j<8;++j){
00459                                                 table[i]<<=8;
00460                                                 table[i] += (128-i);
00461                                         }
00462                                 }
00463                         }
00464         */
00465         static uint16_t max_excess(uint64_t x, uint16_t& b1Cnt);
00466         static uint16_t max_excess2(uint64_t x, uint16_t& b1Cnt);
00467         static uint16_t max_excess3(uint64_t x, uint16_t& b1Cnt);
00474         static void max_byte_excesses(uint64_t w, uint64_t& max_byte_excesses, uint64_t& byte_prefix_sums_x_2);
00475         static void max_byte_excesses2(uint64_t w, uint64_t& max_byte_excesses, uint64_t& byte_prefix_sums_x_2);
00476 
00477         static void min_max_byte_excesses(uint64_t max2, uint64_t& min_byte_excesses, uint64_t& max_byte_excesses, uint64_t& byte_prefix_sums_x_2);
00478 
00488         static uint8_t first_excess_position(uint64_t w, uint8_t excess_val, uint64_t& byte_prefix_sums_x_2);
00489 
00493         static uint8_t first_excess_position_naive(uint64_t w, uint8_t excess_val, uint64_t& byte_prefix_sums_x_2);
00494 
00495         static uint8_t last_excess_position(uint64_t w, uint8_t excess_val, uint64_t& byte_prefix_sums_x_2);
00496         static uint8_t last_excess_position_naive(uint64_t w, uint8_t excess_val, uint64_t& byte_prefix_sums_x_2);
00497         /*
00498          * \param close_parenthesis_index Index of the closing parenthesis. \f$0 \leq close_parenthesis_index \leq 64\f$
00499          */
00500         static uint8_t find_open(uint64_t w, uint8_t close_parenthesis_index);
00501         static uint8_t find_open_naive(uint64_t w, uint8_t close_parenthesis_index);
00502 
00503         static uint8_t find_enclose(uint64_t w, uint8_t open_parenthesis_index);
00504         static uint8_t find_enclose_naive(uint64_t w, uint8_t open_parenthesis_index);
00505 };
00506 
00507 
00508 // ============= inline - implementations ================
00509 
00510 inline uint16_t bit_magic::max_excess(uint64_t max2, uint16_t& b1Cnt)
00511 {
00512     uint64_t sum = max2-((max2>>1)&0x5555555555555555ULL);
00513     uint64_t max = sum&max2;//(~(x&0x5555555555555555ULL&sum))&sum;
00514     // Maxima fuer 4bit Bloecke
00515     max2 = ((max>>2)&0x3333333333333333ULL) + 0x6666666666666666ULL + ((sum&0x3333333333333333ULL)<<1);
00516 //      cout<<"max2 ";
00517 //      printDebug(max2);
00518     uint64_t mask = (max2 - (max = max&0x3333333333333333ULL))&0x8888888888888888ULL;
00519     mask = (mask | (((mask>>3))^0x1111111111111111ULL))-0x1111111111111111ULL;
00520 //      max2 = ((max2 & mask) | (max & ~mask));
00521     max2 = max ^((max^max2) & mask);
00522     // Summen fuer 4bit Bloecke
00523     sum = ((sum & 0x3333333333333333ULL) + ((sum>>2) & 0x3333333333333333ULL))<<1;
00524     // Maxima fuer 8bit Bloecke
00525     max  = ((max2>>4)&0x0707070707070707ULL) + 0x0C0C0C0C0C0C0C0CULL + ((sum&0x0F0F0F0F0F0F0F0FULL));
00526     mask = (max - (max2 = max2&0x0707070707070707ULL)) & 0x1010101010101010ULL;
00527     mask = (mask | ((mask>>4)^0x0101010101010101ULL))-0x0101010101010101ULL;
00528 //      max  = ((max & mask) | (max2 & ~mask));
00529     max  = max2 ^((max2^max) & mask);
00530     // Summe fuer 8 bit Bloecke
00531     sum = ((sum + (sum>>4)) & 0x1E1E1E1E1E1E1E1EULL);
00532     // Maxima fuer 16bit Bloecke
00533     max2 = ((max>>8)&0x000F000F000F000FULL) + 0x0018001800180018ULL + ((sum&0x00FF00FF00FF00FFULL));
00534     mask = (max2 - (max = max&0x000F000F000F000FULL)) & 0x0020002000200020ULL;
00535     mask = (mask | ((mask>>5)^0x0001000100010001ULL))-0x0001000100010001ULL;
00536 //      max2 = ((max2 & mask) | (max & ~mask));
00537     max2 = max ^((max^max2) & mask);
00538     // Summe fuer 16 bit Bloecke
00539     sum = (sum + (sum>>8)) & 0x00FF00FF00FF00FFULL;
00540     // Maxima fuer 32bit Bloecke
00541     max = ((max2>>16)&0x0000001F0000001FULL) + 0x0000003000000030ULL + ((sum&0x0000FFFF0000FFFFULL));
00542     mask = (max - (max2 = max2&0x0000001F0000001FULL)) & 0x0000004000000040ULL;
00543     mask = (mask | ((mask>>6)^0x0000000100000001ULL))-0x0000000100000001ULL;
00544 //      max = ((max & mask) | (max2 & ~mask)) & 0x0000003F0000003FULL;
00545     max  = (max2 ^((max2^max) & mask)) & 0x0000003F0000003FULL;
00546     // Summe fuer 32 bit Bloecke
00547     sum = (sum + (sum>>16)) & 0x0000FFFF0000FFFFULL;
00548     b1Cnt = (sum+(sum>>32))>>1;
00549     // Maxima fuer 64bit Bloecke
00550     max2 = (max>>32) + 0x0000000000000060ULL + (sum&0x00000000FFFFFFFFULL);
00551     return ((max2 - (max&0x00000000FFFFFFFFULL)) & 0x0000000000000080ULL) ? max2 & 0x000000000000007FULL : max & 0x00000000FFFFFFFFULL;
00552     /*  mask = (max2 - (max = max&0x00000000FFFFFFFFULL)) & 0x0000000000000080ULL;
00553         return max2;*/
00554 }
00555 
00556 inline uint16_t bit_magic::max_excess2(uint64_t x, uint16_t& b1Cnt)
00557 {
00558     uint64_t _max_byte_excesses, byte_prefix_sums_x_2;
00559     max_byte_excesses(x, _max_byte_excesses, byte_prefix_sums_x_2);
00560     b1Cnt = byte_prefix_sums_x_2 >> 57;
00561     uint8_t maxi1 = _max_byte_excesses, maxi2 = _max_byte_excesses>>32;
00562     for (uint8_t i=1, m; i<4; ++i) {
00563         if ((m = (_max_byte_excesses >>= 8)) > maxi1) maxi1 = m;
00564     }
00565     _max_byte_excesses >>= 8;
00566     for (uint8_t i=5, m; i<8; ++i) {
00567         if ((m = (_max_byte_excesses >>= 8)) > maxi2) maxi2 = m;
00568     }
00569     return (maxi1>maxi2)?maxi1&0x7F:maxi2&0x7F;
00570 
00571     /*
00572         uint8_t maxi[8] = {sum, sum>>8, sum>>16, sum>>24, sum>>32, sum>>40, sum>>48,sum>>56};
00573         if(maxi[0]<maxi[1])maxi[0]=maxi[1];
00574         if(maxi[2]<maxi[3])maxi[2]=maxi[3];
00575         if(maxi[4]<maxi[5])maxi[4]=maxi[5];
00576         if(maxi[6]<maxi[7])maxi[6]=maxi[7];
00577         if(maxi[0]<maxi[2])maxi[0]=maxi[2];
00578         if(maxi[4]<maxi[6])maxi[4]=maxi[6];
00579         return (maxi[0]<maxi[4])?maxi[4]&0x7F:maxi[0]&0x7F;
00580     */
00581 
00582     /*  max = (sum&0x00FF00FF00FF00FFULL)|0x0100010001000100ULL;
00583         mask = max-(max2 = ((sum>>4)&0x00FF00FF00FF00FFULL));
00584         mask = (mask | ((mask>>5)^0x0001000100010001ULL))-0x0001000100010001ULL;
00585         max  = max2 ^ (max2^max) & mask;
00586         uint8_t maxi1 = max, maxi2 = max>>32;
00587         if( ((uint8_t)(max>>16)) > maxi1 ) maxi1 = max>>16;
00588         if( ((uint8_t)(max>>48)) > maxi2 ) maxi2 = max>>48;
00589         return maxi2 > maxi1 ? maxi2 : maxi1;
00590     */
00591 
00592     /*  for(uint8_t i=1, m;i<4;++i){
00593                 if( (m = (max >>= 16)) > maxi ) maxi = m;
00594         }
00595         return maxi&0x7F;
00596     */
00597 }
00598 
00599 inline void bit_magic::max_byte_excesses(uint64_t max2, uint64_t& max_byte_excesses, uint64_t& byte_prefix_sums_x_2)
00600 {
00601     byte_prefix_sums_x_2 = max2-((max2>>1)&0x5555555555555555ULL);
00602     uint64_t max = byte_prefix_sums_x_2&max2;
00603     // Maxima fuer 4bit Bloecke
00604     max2 = ((max>>2)&0x3333333333333333ULL) + 0x6666666666666666ULL + ((byte_prefix_sums_x_2&0x3333333333333333ULL)<<1);
00605     uint64_t mask = (max2 - (max = max&0x3333333333333333ULL))&0x8888888888888888ULL;
00606     mask = (mask | ((mask>>3)^0x1111111111111111ULL))-0x1111111111111111ULL;
00607     max2 = max ^((max^max2) & mask);
00608     // Summen fuer 4bit Bloecke
00609     byte_prefix_sums_x_2 = ((byte_prefix_sums_x_2 & 0x3333333333333333ULL) + ((byte_prefix_sums_x_2>>2) & 0x3333333333333333ULL))<<1;
00610     // Maxima fuer 8bit Bloecke
00611     max  = ((max2>>4)&0x0707070707070707ULL) + 0x0C0C0C0C0C0C0C0CULL + ((byte_prefix_sums_x_2&0x0F0F0F0F0F0F0F0FULL));
00612     mask = (max - (max2 = max2&0x0707070707070707ULL)) & 0x1010101010101010ULL;
00613     mask = (mask | ((mask>>4)^0x0101010101010101ULL))-0x0101010101010101ULL;
00614     max  = max2 ^((max2^max) & mask);
00615     // Summe fuer 8 bit Bloecke
00616     byte_prefix_sums_x_2 = ((byte_prefix_sums_x_2 + (byte_prefix_sums_x_2>>4)) & 0x1E1E1E1E1E1E1E1EULL);
00617     byte_prefix_sums_x_2 = 0x0101010101010101ULL*byte_prefix_sums_x_2;// partialsummen der 2*summe der einsen im praefix
00618     max_byte_excesses = (((byte_prefix_sums_x_2<<8) | 0x8080808080808080ULL) - 0x3830282018100800ULL) + max;
00619 }
00620 
00621 inline void bit_magic::max_byte_excesses2(uint64_t max2, uint64_t& max_byte_excesses, uint64_t& byte_prefix_sums_x_2)
00622 {
00623     byte_prefix_sums_x_2 = max2-((max2>>1)&0x5555555555555555ULL);
00624 //      uint64_t max = (byte_prefix_sums_x_2&max2)+((max2|(max2>>1))&0x5555555555555555ULL);// add one to each max if max != 0
00625     uint64_t max = ((max2&0x5555555555555555ULL)<<1)|((max2&0x5555555555555555ULL)>>1);
00626     // Maxima fuer 4bit Bloecke
00627     max2 = ((max>>2)&0x3333333333333333ULL) + 0x6666666666666666ULL + ((byte_prefix_sums_x_2&0x3333333333333333ULL)<<1);
00628     uint64_t mask = (max2 - (max = max&0x3333333333333333ULL))&0x8888888888888888ULL;
00629     mask = (mask | ((mask>>3)^0x1111111111111111ULL))-0x1111111111111111ULL;
00630     max2 = max ^((max^max2) & mask);
00631     // Summen fuer 4bit Bloecke
00632     byte_prefix_sums_x_2 = ((byte_prefix_sums_x_2 & 0x3333333333333333ULL) + ((byte_prefix_sums_x_2>>2) & 0x3333333333333333ULL))<<1;
00633     // Maxima fuer 8bit Bloecke
00634     max  = ((max2>>4)&0x0707070707070707ULL) + 0x0C0C0C0C0C0C0C0CULL + ((byte_prefix_sums_x_2&0x0F0F0F0F0F0F0F0FULL));
00635     mask = (max - (max2 = max2&0x0707070707070707ULL)) & 0x1010101010101010ULL;
00636     mask = (mask | ((mask>>4)^0x0101010101010101ULL))-0x0101010101010101ULL;
00637     max  = max2 ^((max2^max) & mask);
00638     // Summe fuer 8 bit Bloecke
00639     byte_prefix_sums_x_2 = ((byte_prefix_sums_x_2 + (byte_prefix_sums_x_2>>4)) & 0x1E1E1E1E1E1E1E1EULL);
00640     byte_prefix_sums_x_2 = 0x0101010101010101ULL*byte_prefix_sums_x_2;// partialsummen der 2*summe der einsen im praefix
00641     max_byte_excesses = (byte_prefix_sums_x_2<<8) + 0x474f575f676F777FULL + max;
00642 }
00643 
00644 inline void bit_magic::min_max_byte_excesses(uint64_t max2, uint64_t& min_byte_excesses, uint64_t& max_byte_excesses, uint64_t& byte_prefix_sums_x_2)
00645 {
00646     byte_prefix_sums_x_2 = max2-((max2>>1)&0x5555555555555555ULL);
00647 //      uint64_t max = (byte_prefix_sums_x_2&max2)+((max2|(max2>>1))&0x5555555555555555ULL);// add one to each max if max != 0
00648     uint64_t max = ((max2&0x5555555555555555ULL)<<1)|((max2>>1)&0x5555555555555555ULL);
00649     uint64_t min = ~max, min2,mask,mask2;
00650 //std::cerr<<"min = "<<min<<std::endl;
00651     // Maxima and minima fuer 4bit Bloecke
00652     max2 = ((max>>2)&0x3333333333333333ULL) + 0x6666666666666666ULL + ((byte_prefix_sums_x_2&0x3333333333333333ULL)<<1);
00653     min2 = ((min>>2)&0x3333333333333333ULL) + 0x6666666666666666ULL + (0x4444444444444444ULL-(((byte_prefix_sums_x_2)&0x3333333333333333ULL)<<1));
00654 //      min2 = ((min>>2)&0x3333333333333333ULL) + mask;
00655     mask = (max2 - (max = max&0x3333333333333333ULL))&0x8888888888888888ULL;
00656     mask2 = (min2 - (min = min&0x3333333333333333ULL))&0x8888888888888888ULL;
00657     mask = (mask | ((mask>>3)^0x1111111111111111ULL))-0x1111111111111111ULL;
00658     mask2 = (mask2 | ((mask2>>3)^0x1111111111111111ULL))-0x1111111111111111ULL;
00659     max2 = max ^((max^max2) & mask);
00660     min2 = min ^((min^min2) & mask2);
00661 //std::cerr<<"min2 = "<<min2<<std::endl;
00662     // Minima fuer 4bit Bloecke
00663     // Summen fuer 4bit Bloecke
00664     byte_prefix_sums_x_2 = ((byte_prefix_sums_x_2 & 0x3333333333333333ULL) + ((byte_prefix_sums_x_2>>2) & 0x3333333333333333ULL))<<1;
00665     // Maxima and minima fuer 8bit Bloecke
00666     max  = ((max2>>4)&0x0707070707070707ULL) + 0x0C0C0C0C0C0C0C0CULL + ((byte_prefix_sums_x_2&0x0F0F0F0F0F0F0F0FULL));
00667     min  = ((min2>>4)&0x0707070707070707ULL) + 0x0C0C0C0C0C0C0C0CULL + (0x0808080808080808ULL-((byte_prefix_sums_x_2)&0x0F0F0F0F0F0F0F0FULL));
00668     //  min  = ((min2>>4)&0x0707070707070707ULL) + mask;
00669     mask = (max - (max2 = max2&0x0707070707070707ULL)) & 0x1010101010101010ULL;
00670     mask2 = (min - (min2 = min2&0x0707070707070707ULL)) & 0x1010101010101010ULL;
00671     mask = (mask | ((mask>>4)^0x0101010101010101ULL))-0x0101010101010101ULL;
00672     mask2 = (mask2 | ((mask2>>4)^0x0101010101010101ULL))-0x0101010101010101ULL;
00673     max  = max2 ^((max2^max) & mask);
00674     min  = min2 ^((min2^min) & mask2);
00675 //std::cerr<<"min = "<<min<<std::endl;
00676     // Summe fuer 8 bit Bloecke
00677     byte_prefix_sums_x_2 = ((byte_prefix_sums_x_2 + (byte_prefix_sums_x_2>>4)) & 0x1E1E1E1E1E1E1E1EULL);
00678     byte_prefix_sums_x_2 = 0x0101010101010101ULL*byte_prefix_sums_x_2;// partialsummen der 2*summe der einsen im praefix
00679 //std::cerr<<"max = "<<max<<std::endl;
00680 //std::cerr<<"min = "<<min<<std::endl;
00681 //std::cerr<<"byte_prefix_sums_x = "<<(byte_prefix_sums_x_2) << std::endl;
00682     max_byte_excesses = (byte_prefix_sums_x_2<<8) + 0x474f575f676F777FULL + max;
00683     min_byte_excesses = (byte_prefix_sums_x_2<<8) + 0x4951596169717981ULL - min;
00684 }
00685 
00686 
00687 inline uint8_t bit_magic::first_excess_position(uint64_t w, uint8_t excess_val, uint64_t& byte_prefix_sums_x_2)
00688 {
00689     uint64_t max_byte_excesses;
00690     bit_magic::max_byte_excesses(w, max_byte_excesses, byte_prefix_sums_x_2);
00691     assert(excess_val <= 64);
00692     max_byte_excesses = (max_byte_excesses-_8_x_the_byte[excess_val])&0x8080808080808080ULL;
00693     if (max_byte_excesses) {
00694         uint8_t block_of_occurence_x_8 = DeBruijn64ToIndex[((max_byte_excesses&-max_byte_excesses)*DeBruijn64)>>58]&0xF8;
00695         assert(block_of_occurence_x_8<64);
00696         uint64_t sums = byte_prefix_sums_x_2<<8;
00697         excess_val = (excess_val + (block_of_occurence_x_8)) - ((sums >> (block_of_occurence_x_8))&0xFF);
00698         assert(excess_val <= 8);
00699         return first_pos_of_excess_val[((w>>(block_of_occurence_x_8))&0xFF)+(excess_val<<8)]+block_of_occurence_x_8;
00700     } else {
00701         return 64;
00702     }
00703 }
00704 
00705 inline uint8_t bit_magic::first_excess_position_naive(uint64_t w, uint8_t excess_val, uint64_t& byte_prefix_sums_x_2)
00706 {
00707     byte_prefix_sums_x_2 = w-((w>>1) & 0x5555555555555555ull);
00708     byte_prefix_sums_x_2 = (byte_prefix_sums_x_2 & 0x3333333333333333ull) + ((byte_prefix_sums_x_2 >> 2) & 0x3333333333333333ull);
00709     byte_prefix_sums_x_2 = (byte_prefix_sums_x_2 + (byte_prefix_sums_x_2 >> 4)) & 0x0f0f0f0f0f0f0f0full;
00710     byte_prefix_sums_x_2 *= 0x0101010101010101ull;
00711     byte_prefix_sums_x_2 <<= 1;
00712     for (uint8_t i=0; i<64; ++i, w>>=1) {
00713         excess_val += (1-((w&1)<<1));
00714         if (excess_val==0)
00715             return i;
00716     }
00717     return 64;
00718 }
00719 
00720 inline uint8_t bit_magic::find_open(uint64_t w, uint8_t close_parenthesis_index)
00721 {
00722     if (!close_parenthesis_index)
00723         return 64;
00724     assert(close_parenthesis_index <= 64);
00725     w <<= (64-close_parenthesis_index);
00726     uint8_t near_match = very_near_find_open[(w>>56)&0xFF];// find near matches
00727 //      std::cerr<<"near_match="<<(int)near_match<<std::endl;
00728     if (near_match) { // the matching opening parenthesis is at most 7 positions before the closing paranthesis
00729         return close_parenthesis_index-near_match;
00730     } else { // the matching opening parenthesis is not among the first 8 preceding bits/parentheses
00731         // calculate
00732         uint64_t min_byte_excesses, max_byte_excesses, byte_prefix_sums_x_2;
00733 //std::cerr<<"w="<<w<<" excess_val="<<(int)excess_val<<std::endl;
00734         bit_magic::min_max_byte_excesses(w, min_byte_excesses, max_byte_excesses, byte_prefix_sums_x_2);
00735         int8_t desired_excess = (byte_prefix_sums_x_2 >> 56) - 65;
00736 //              std::cerr<<"desired_excess="<<(int)desired_excess<<"  "<<(int)(64-close_parenthesis_index)<<std::endl;
00737         if (desired_excess >= 0) {
00738             max_byte_excesses = (max_byte_excesses-_8_x_the_byte[desired_excess])&0x8080808080808080ULL;
00739             max_byte_excesses &= ~(min_byte_excesses-_8_x_the_byte[desired_excess]-_8_x_the_byte[1]);
00740         } else { // desire_excess <0
00741 //                      std::cerr<<"max_byte_excesses="<<max_byte_excesses<<std::endl;
00742 //                      std::cerr<<"min_byte_excesses="<<min_byte_excesses<<std::endl;
00743             max_byte_excesses = (max_byte_excesses+_8_x_the_byte[-desired_excess])&0x8080808080808080ULL;
00744             max_byte_excesses &= ~(min_byte_excesses+_8_x_the_byte[-desired_excess]-_8_x_the_byte[1]);
00745         }
00746 //              std::cerr<<"max_byte_excesses="<<max_byte_excesses<<std::endl;
00747         if (max_byte_excesses) {
00748             max_byte_excesses |= (max_byte_excesses>>8);
00749             max_byte_excesses |= (max_byte_excesses>>16);
00750             max_byte_excesses |= (max_byte_excesses>>32);
00751             max_byte_excesses -= (max_byte_excesses>>8);
00752             assert(max_byte_excesses = (max_byte_excesses&-max_byte_excesses));
00753             uint8_t block_of_occurence_x_8 = DeBruijn64ToIndex[(max_byte_excesses*DeBruijn64)>>58]-7;
00754 //std::cerr<<"block_of_occurence_x_8="<<(int)block_of_occurence_x_8<<std::endl;
00755 //std::cerr<<"result+(64-close_parenthesis_index)="<<result+(64-close_parenthesis_index)<<std::endl;
00756             assert(block_of_occurence_x_8 <= 56);
00757             byte_prefix_sums_x_2 <<= 8;
00758             int excess_val = (int)(desired_excess + block_of_occurence_x_8 + 8) - ((byte_prefix_sums_x_2 >> block_of_occurence_x_8)&0xFF);
00759 //std::cerr<<"desired_excess+block_of_occurence_x_8="<< (int)(desired_excess + block_of_occurence_x_8) <<std::endl;
00760 //std::cerr<<"excess_so_far ="<< ((int)((byte_prefix_sums_x_2 >> block_of_occurence_x_8)&0xFF))-(block_of_occurence_x_8) <<std::endl;
00761 //std::cerr<<"byte_prefix_sums ="<< byte_prefix_sums_x_2 <<std::endl;
00762 //std::cerr<<"excess_val="<<(int)excess_val<<std::endl;
00763             assert(excess_val<=16);
00764             return last_pos_of_excess_val[((w>>(block_of_occurence_x_8))&0xFF)+(excess_val<<8)]+block_of_occurence_x_8-63+close_parenthesis_index;
00765         } else // no near parenthesis found
00766             return 64;
00767     }
00768 }
00769 
00770 inline uint8_t bit_magic::find_open_naive(uint64_t w, uint8_t close_parenthesis_index)
00771 {
00772     for (int i=close_parenthesis_index-1, excess_val=-1; i>=0; --i) {
00773         if ((w>>i)&1)
00774             ++excess_val;
00775         else
00776             --excess_val;
00777         if (excess_val==0)
00778             return i;
00779     }
00780     return 64;
00781 }
00782 
00783 inline uint8_t bit_magic::find_enclose_naive(uint64_t w, uint8_t open_parenthesis_index)
00784 {
00785     for (int i=open_parenthesis_index-1, excess_val=0; i>=0; --i) {
00786         if ((w>>i)&1)
00787             ++excess_val;
00788         else
00789             --excess_val;
00790         if (excess_val==1)
00791             return i;
00792     }
00793     return 64;
00794 }
00795 
00796 inline uint8_t bit_magic::find_enclose(uint64_t w, uint8_t open_parenthesis_index)
00797 {
00798     return find_open(w, open_parenthesis_index);
00799 }
00800 
00801 inline uint8_t bit_magic::last_excess_position(uint64_t w, uint8_t excess_val, uint64_t& byte_prefix_sums_x_2)
00802 {
00803     uint64_t min_byte_excesses, max_byte_excesses;
00804 //std::cerr<<"w="<<w<<" excess_val="<<(int)excess_val<<std::endl;
00805     bit_magic::min_max_byte_excesses(w, min_byte_excesses, max_byte_excesses, byte_prefix_sums_x_2);
00806 //std::cerr<<"min_byte_accesses="<<min_byte_excesses<<std::endl;
00807 //std::cerr<<"max_byte_accesses="<<max_byte_excesses<<std::endl;
00808     assert(excess_val <= 64);
00809     max_byte_excesses = (max_byte_excesses-_8_x_the_byte[excess_val])&0x8080808080808080ULL;
00810     max_byte_excesses &= ~(min_byte_excesses-_8_x_the_byte[excess_val]-_8_x_the_byte[1]);
00811 //std::cerr<<"max_byte_excesses="<<max_byte_excesses<<std::endl;
00812     if (max_byte_excesses) {
00813 //              std::cerr<<"max_byte_accesses="<<max_byte_excesses<<std::endl;
00814         uint8_t block_of_occurence_x_8;
00815         if (max_byte_excesses&0x8080808000000000ULL) { // 4..7
00816             if (max_byte_excesses&0x8080000000000000ULL) { // 6..7
00817                 block_of_occurence_x_8 = 48 + ((max_byte_excesses>>60)&0x8);
00818             } else { // 4..5
00819                 block_of_occurence_x_8 = 32 + ((max_byte_excesses>>44)&0x8);
00820             }
00821         } else { // 0..3
00822             if (max_byte_excesses&0x8080808080800000ULL) { //2..3
00823                 block_of_occurence_x_8 = 16 + ((max_byte_excesses>>28)&0x8);
00824             } else { //0..1
00825                 block_of_occurence_x_8 = (max_byte_excesses>>12)&0x8;
00826             }
00827         }
00828 //                      std::cerr<<" block_of_oc...="<<(int)block_of_occurence_x_8<<std::endl;
00829 #ifndef NDEBUG
00830 #endif
00831         assert(block_of_occurence_x_8<64);
00832         uint64_t sums = byte_prefix_sums_x_2<<8;
00833         excess_val = (excess_val + (block_of_occurence_x_8)+8) - ((sums >> (block_of_occurence_x_8))&0xFF);
00834 //              if(excess_val>8+8){
00835 //                      std::cerr<<"block_of_occurence_x_8="<< (int)block_of_occurence_x_8  <<" excess_val="<<(int)excess_val<<std::endl;
00836 //                      std::cerr<<"(sums >> (block_of_occurence_x_8))&0xFF = "<< ((sums >> (block_of_occurence_x_8))&0xFF) << std::endl;
00837 //              }
00838         assert(excess_val <= 16);
00839 //              std::cerr<<"excess_val="<<(int)((int)excess_val)-8<<std::endl;
00840 //              std::cerr<<"pos_inside_8_bit_block "<<(int)last_pos_of_excess_val[((w>>(block_of_occurence_x_8))&0xFF)+(excess_val<<8)]<<std::endl;
00841 //              std::cerr<<"pos_inside_8_bit_block+block_of... "<<(int)last_pos_of_excess_val[((w>>(block_of_occurence_x_8))&0xFF)+(excess_val<<8)]+block_of_occurence_x_8<<std::endl;
00842         return last_pos_of_excess_val[((w>>(block_of_occurence_x_8))&0xFF)+(excess_val<<8)]+block_of_occurence_x_8;
00843     } else {
00844         return 64;
00845     }
00846 }
00847 
00848 
00849 inline uint8_t bit_magic::last_excess_position_naive(uint64_t w, uint8_t excess_val, uint64_t& byte_prefix_sums_x_2)
00850 {
00851     byte_prefix_sums_x_2 = w-((w>>1) & 0x5555555555555555ull);
00852     byte_prefix_sums_x_2 = (byte_prefix_sums_x_2 & 0x3333333333333333ull) + ((byte_prefix_sums_x_2 >> 2) & 0x3333333333333333ull);
00853     byte_prefix_sums_x_2 = (byte_prefix_sums_x_2 + (byte_prefix_sums_x_2 >> 4)) & 0x0f0f0f0f0f0f0f0full;
00854     byte_prefix_sums_x_2 *= 0x0101010101010101ull;
00855     byte_prefix_sums_x_2 <<= 1;
00856     uint8_t res = 64;
00857     uint8_t my_excess_val=0;
00858     for (uint8_t i=0; i<64; ++i, w>>=1) {
00859         if (w&1)
00860             ++my_excess_val;
00861         else
00862             --my_excess_val;
00863 //if(i%8==0) std::cerr<<std::endl;
00864 //std::cerr<<"("<<(int)i<<","<<(int)my_excess_val<<")";
00865         if (excess_val==my_excess_val)
00866             res = i;
00867     }
00868 //std::cerr<<std::endl;
00869     return res;
00870 }
00871 
00872 
00873 inline uint16_t bit_magic::max_excess3(uint64_t x, uint16_t& b1Cnt)
00874 {
00875     uint64_t sum = x-((x>>1) & 0x5555555555555555ull);
00876     sum = (sum & 0x3333333333333333ull) + ((sum >> 2) & 0x3333333333333333ull);
00877     sum = (sum + (sum >> 4)) & 0x0f0f0f0f0f0f0f0full;
00878     sum *= 0x0101010101010101ull;
00879     b1Cnt = sum>>56;
00880     sum = (((sum<<1) | 0x8080808080808080ULL) - 0x4038302820181008ULL);
00881     /*  uint8_t maxi = max_excess_8bit[(uint8_t)x] | 0x80;
00882         x>>=8;
00883         for(uint8_t i=1,m;i<8;++i, sum>>=8, x>>=8){
00884                 if( (m = (((uint8_t)sum)+max_excess_8bit[(uint8_t)x])) > maxi ) maxi = m;
00885     //          std::cerr<<"m="<<(int)(m&0x7F)<<" "<<(int)max_excess_8bit[(uint8_t)x]<<" sum="<<(int)((uint8_t)sum)<<std::endl;
00886         }
00887         return maxi&0x7F;
00888     */  uint8_t maxi[8] = {     static_cast<uint8_t>(max_excess_8bit[(uint8_t)x]|0x80),
00889                             static_cast<uint8_t>((sum)+         max_excess_8bit[(uint8_t)(x>>8)]),
00890                             static_cast<uint8_t>((sum>>8)+      max_excess_8bit[(uint8_t)(x>>16)]),
00891                             static_cast<uint8_t>((sum>>16)+     max_excess_8bit[(uint8_t)(x>>24)]),
00892                             static_cast<uint8_t>((sum>>24)+     max_excess_8bit[(uint8_t)(x>>32)]),
00893                             static_cast<uint8_t>((sum>>32)+     max_excess_8bit[(uint8_t)(x>>40)]),
00894                             static_cast<uint8_t>((sum>>40)+     max_excess_8bit[(uint8_t)(x>>48)]),
00895                             static_cast<uint8_t>((sum>>48)+     max_excess_8bit[(uint8_t)(x>>56)])
00896                          };
00897     if (maxi[0]<maxi[1])maxi[0]=maxi[1];
00898     if (maxi[2]<maxi[3])maxi[2]=maxi[3];
00899     if (maxi[4]<maxi[5])maxi[4]=maxi[5];
00900     if (maxi[6]<maxi[7])maxi[6]=maxi[7];
00901     if (maxi[0]<maxi[2])maxi[0]=maxi[2];
00902     if (maxi[4]<maxi[6])maxi[4]=maxi[6];
00903     return (maxi[0]<maxi[4])?maxi[4]&0x7F:maxi[0]&0x7F;
00904 }
00905 
00906 // see page 11, Knuth TAOCP Vol 4 F1A
00907 inline uint64_t bit_magic::b1Cnt(uint64_t x)
00908 {
00909 #ifdef __SSE4_2__
00910     return __builtin_popcountll(x);
00911 #else
00912 #ifdef POPCOUNT_TL
00913     return B1CntBytes[x&0xFFULL] + B1CntBytes[(x>>8)&0xFFULL] +
00914            B1CntBytes[(x>>16)&0xFFULL] + B1CntBytes[(x>>24)&0xFFULL] +
00915            B1CntBytes[(x>>32)&0xFFULL] + B1CntBytes[(x>>40)&0xFFULL] +
00916            B1CntBytes[(x>>48)&0xFFULL] + B1CntBytes[(x>>56)&0xFFULL];
00917 #else
00918     x = x-((x>>1) & 0x5555555555555555ull);
00919     x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull);
00920     x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0full;
00921     return (0x0101010101010101ull*x >> 56);
00922 #endif
00923 #endif
00924 }
00925 
00926 inline uint32_t bit_magic::b1Cnt32(uint32_t x)
00927 {
00928     x = x-((x>>1) & 0x55555555);
00929     x = (x & 0x33333333) + ((x>>2) & 0x33333333);
00930     return (0x10101010*x >>28)+(0x01010101*x >>28);
00931 }
00932 
00933 inline uint32_t bit_magic::b1CntNaive(uint64_t x)
00934 {
00935     uint32_t res = 0;
00936     for (int i=0; i<64; ++i) {
00937         res += x&1;
00938         x>>=1;
00939     }
00940     return res;
00941 }
00942 
00943 /*
00944 inline uint32_t bit_magic::b11Cnt(uint64_t x){
00945         // extract "10" 2bit blocks
00946         uint64_t ex10 = ((x^(x<<1))&0xAAAAAAAAAAAAAAAAULL)&x;
00947         // extract "10" 2bit blocks
00948         uint64_t ex01 = ((x^(x>>1))&0x5555555555555555ULL)&x;
00949         // extract "11" 2bit blocks
00950         uint64_t ex11 =  (x&(x<<1))&0xAAAAAAAAAAAAAAAAULL;
00951         ex11 |= (ex11>>1);
00952         x = (((ex11)+(ex10<<1))&(ex01))|(ex11&0x5555555555555555ULL);
00953         x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
00954         x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
00955         return  (0x0101010101010101ULL*x >> 56);
00956 }*/
00957 
00958 inline uint32_t bit_magic::b11Cnt(uint64_t x, uint64_t& c)
00959 {
00960     // extract "11" 2bit blocks
00961     uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL, t;
00962     // extract "10" 2bit blocks
00963     uint64_t ex10or01 = (ex11|(ex11<<1))^x;
00964 
00965     x = ex11 | ((t=(ex11|(ex11<<1))+(((ex10or01<<1)&0x5555555555555555ULL)|c))&(ex10or01&0x5555555555555555ULL));
00966     c = (ex10or01>>63) or(t < (ex11|(ex11<<1)));
00967 
00968     x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
00969     x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
00970     return (0x0101010101010101ULL*x >> 56);
00971 }
00972 
00973 inline uint32_t bit_magic::b11Cnt(uint64_t x)
00974 {
00975     // extract "11" 2bit blocks
00976     uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL;
00977     // extract "10" 2bit blocks
00978     uint64_t ex10or01 = (ex11|(ex11<<1))^x;
00979 
00980     x = ex11 | (((ex11|(ex11<<1))+((ex10or01<<1)&0x5555555555555555ULL))&(ex10or01&0x5555555555555555ULL));
00981 
00982     x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
00983     x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
00984     return (0x0101010101010101ULL*x >> 56);
00985 }
00986 
00987 inline uint32_t bit_magic::b11CntS(uint64_t x, uint64_t& c)
00988 {
00989     uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL;
00990     uint64_t ex10or01 = (ex11|(ex11<<1))^x;
00991     ex11 += (((ex11|(ex11<<1))+(((ex10or01<<1)&0x5555555555555555ULL)|c)) & ((ex10or01&0x5555555555555555ULL)|ex11));
00992     c = (x>>63) and(ex11>>62);
00993     x = ex11-((ex11>>1) & 0x5555555555555555ULL);
00994     x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
00995     x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
00996     return (0x0101010101010101ULL*x >> 56);
00997 }
00998 
00999 inline uint32_t bit_magic::b11CntS(uint64_t x)
01000 {
01001     uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL;
01002     uint64_t ex10or01 = (ex11|(ex11<<1))^x;
01003     ex11 += (((ex11|(ex11<<1))+((ex10or01<<1)&0x5555555555555555ULL)) & ((ex10or01&0x5555555555555555ULL)|ex11));
01004     x = ex11-((ex11>>1) & 0x5555555555555555ULL);
01005     x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
01006     x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
01007     return (0x0101010101010101ULL*x >> 56);
01008 }
01009 
01010 inline uint32_t bit_magic::b11CntNaive(uint64_t x)
01011 {
01012     uint32_t res = 0;
01013     for (int i=0; i<64; ++i) {
01014         if ((x&3) == 3) {
01015             ++res;
01016             ++i;
01017             x>>=1;
01018         }
01019         x>>=1;
01020     }
01021     return res;
01022 }
01023 
01024 inline uint32_t bit_magic::eB11Cnt(uint64_t x)
01025 {
01026     x = (x&(x>>1))&0x5555555555555555ULL;
01027     x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
01028     x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
01029     return (0x0101010101010101ULL*x >> 56);
01030 }
01031 
01032 inline uint32_t bit_magic::b10Cnt(uint64_t x, uint64_t& c)
01033 {
01034     uint32_t res = b1Cnt((x ^((x<<1) | c)) & (~x));
01035     c = (x >> 63);
01036     return res;
01037 }
01038 
01039 inline uint64_t bit_magic::b10Map(uint64_t x, uint64_t c)
01040 {
01041     return ((x ^((x << 1) | c)) & (~x));
01042 }
01043 
01044 inline uint32_t bit_magic::b01Cnt(uint64_t x, uint64_t& c)
01045 {
01046     uint32_t res = b1Cnt((x ^((x<<1) | c)) & x);
01047     c = (x >> 63);
01048     return res;
01049 }
01050 inline uint64_t bit_magic::b01Map(uint64_t x, uint64_t c)
01051 {
01052     return ((x ^((x << 1) | c)) &  x);
01053 }
01054 
01055 inline uint32_t bit_magic::b10CntNaive(uint64_t x, uint64_t& c)
01056 {
01057     uint32_t sum = 0, lastbit = c;
01058     for (uint32_t i=0; i<64; ++i) {
01059         if ((x&1) == 0 and lastbit == 1) {
01060             ++sum;
01061         }
01062         lastbit = (x&1);
01063         x >>= 1;
01064     }
01065     c = lastbit;
01066     return sum;
01067 }
01068 
01069 inline uint32_t bit_magic::k1BP(uint64_t x, uint32_t i)
01070 {
01071 //      if ( i == 1 ){
01072 //              return __builtin_ctzll(x);
01073 //      }
01074 #ifdef __SSE4_2__
01075     uint64_t s = x, b;
01076     s = s-((s>>1) & 0x5555555555555555ULL);
01077     s = (s & 0x3333333333333333ULL) + ((s >> 2) & 0x3333333333333333ULL);
01078     s = (s + (s >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
01079     s = 0x0101010101010101ULL*s;
01080     // now s contains 8 bytes s[7],...,s[0], s[i] contains the cumulative sum
01081     // off (i+1)*8 least significant bits of s
01082     b = (s+PsOverflow[i]) & 0x8080808080808080ULL;
01083     // PsOverflow contains a bit mask x consisting of 8 bytes
01084     // x[7],...,x[0] and x[i] is set to 128-i
01085     // => a byte b[i] in b is >= 128 if cum sum >= i
01086 
01087     // __builtin_ctzll returns the number of trailing zeros, if b!=0
01088     int  byte_nr = __builtin_ctzll(b) >> 3;   // byte nr in [0..7]
01089 //      b = _mm_movemask_pi8( (__m64)~b );
01090 //      std::cout << "byte_nr = " << byte_nr << " ";
01094     s <<= 8;
01095     i -= ((uint8_t*)&s)[byte_nr];
01096 //      std::cout << "i = " << i << std::endl;
01097     return (byte_nr << 3) + Select256[((i-1) << 8) + ((uint8_t*)&x)[byte_nr] ];
01098 #endif
01099     return i1BP(x, i);
01100 }
01101 
01102 
01103 inline uint32_t bit_magic::i1BP(uint64_t x, uint32_t i)
01104 {
01105 #ifdef __SSE4_2__
01106 //__m64 v;
01107 //v = (__m64)x;
01108 //const __m64 mask_
01109     uint64_t s = x, b;
01110     s = s-((s>>1) & 0x5555555555555555ULL);
01111     s = (s & 0x3333333333333333ULL) + ((s >> 2) & 0x3333333333333333ULL);
01112     s = (s + (s >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
01113     s = 0x0101010101010101ULL*s;
01114 // now s contains 8 bytes s[7],...,s[0], s[i] contains the cumulative sum
01115 // off (i+1)*8 least significant bits of s
01116     b = (s+PsOverflow[i]) & 0x8080808080808080ULL;
01117 // PsOverflow contains a bit mask x consisting of 8 bytes
01118 // x[7],...,x[0] and x[i] is set to 128-i
01119 // => a byte b[i] in b is >= 128 if cum sum >= i
01120 
01121 // __builtin_ctzll returns the number of trailing zeros, if b!=0
01122     int  byte_nr = __builtin_ctzll(b) >> 3;   // byte nr in [0..7]
01123 //      b = _mm_movemask_pi8( (__m64)~b );
01124 //      std::cout << "byte_nr = " << byte_nr << " ";
01128     s <<= 8;
01129     i -= (s >> (byte_nr<<3)) & 0xFFULL;
01130 //i -= ((uint8_t*)&s)[byte_nr];
01131 //      std::cout << "i = " << i << std::endl;
01132     return (byte_nr << 3) + Select256[((i-1) << 8) + ((x>>(byte_nr<<3))&0xFFULL) ];
01133 #endif
01134     return i1BP2(x, i);
01135 }
01136 
01137 
01138 inline uint32_t bit_magic::i1BP2(uint64_t x, uint32_t i)
01139 {
01140 // TODO: Maybe case i<16 as a special case
01141 //      assert(i>0 && i<=b1Cnt(x));
01142     /*register*/ uint64_t s = x, b;  // s = sum
01143     s = s-((s>>1) & 0x5555555555555555ULL);
01144     s = (s & 0x3333333333333333ULL) + ((s >> 2) & 0x3333333333333333ULL);
01145     s = (s + (s >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
01146     s = 0x0101010101010101ULL*s;
01147     b = (s+PsOverflow[i]);//&0x8080808080808080ULL;// add something to the partial sums to cause overflow
01148 //      b =DeBruijn64ToIndex[((b&-b)*DeBruijn64)>>58];
01149     i = (i-1)<<8;
01150     if (b&0x0000000080000000ULL) // byte <=3
01151         if (b&0x0000000000008000ULL) //byte <= 1
01152             if (b&0x0000000000000080ULL)
01153                 return    Select256[(x&0xFFULL) + i];
01154             else
01155                 return 8 +Select256[(((x>>8)&0xFFULL)  + i - ((s&0xFFULL)<<8))&0x7FFULL];//byte 1;
01156         else//byte >1
01157             if (b&0x0000000000800000ULL) //byte <=2
01158                 return 16+Select256[(((x>>16)&0xFFULL) + i - (s&0xFF00ULL))&0x7FFULL];//byte 2;
01159             else
01160                 return 24+Select256[(((x>>24)&0xFFULL) + i - ((s>>8)&0xFF00ULL))&0x7FFULL];//byte 3;
01161     else//  byte > 3
01162         if (b&0x0000800000000000ULL) // byte <=5
01163             if (b&0x0000008000000000ULL) //byte <=4
01164                 return 32+Select256[(((x>>32)&0xFFULL) + i - ((s>>16)&0xFF00ULL))&0x7FFULL];//byte 4;
01165             else
01166                 return 40+Select256[(((x>>40)&0xFFULL) + i - ((s>>24)&0xFF00ULL))&0x7FFULL];//byte 5;
01167         else// byte >5
01168             if (b&0x0080000000000000ULL) //byte<=6
01169                 return 48+Select256[(((x>>48)&0xFFULL) + i - ((s>>32)&0xFF00ULL))&0x7FFULL];//byte 6;
01170             else
01171                 return 56+Select256[(((x>>56)&0xFFULL) + i - ((s>>40)&0xFF00ULL))&0x7FFULL];//byte 7;
01172     return 0;
01173 }
01174 
01175 inline uint32_t bit_magic::j1BP(uint64_t x, uint32_t j)
01176 {
01177     register uint64_t s = x, b, l;  // s = sum   b = byte [1..8] multiplied by 8 in which the j 1 is located
01178     s = s-((s>>1) & 0x5555555555555555ULL);
01179     s = (s & 0x3333333333333333ULL) + ((s >> 2) & 0x3333333333333333ULL);
01180     s = (s + (s >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
01181     s = 0x0101010101010101ULL*s;
01182     b = ((((((((j-1)*0x0101010101010101ULL)|0x8080808080808080ULL)-s)&0x8080808080808080ULL)>>7)*0x0101010101010101ULL)>>53)&0xFFFFFFFFFFFFFFF8ULL; // s <=_8 (j*L_8)
01183     // b>0 if there are at least j 1s in x
01184     l = j - (((s<<8)>>b)&0xFF);
01185     s = (((((((((x>>b)&0xFF)*0x0101010101010101ULL)&0x8040201008040201ULL)|0x8080808080808080ULL)-0x0101010101010101ULL)|(((x>>b)&0x80ULL)<<56))&0x8080808080808080ULL)>>7)*0x0101010101010101ULL;
01186 //      std::cerr<<"j="<<j<<" in byte "<<b<<" l="<<l<<" s="<<s<<std::endl;
01187     return b + ((((((((l-1)*0x0101010101010101ULL)|0x8080808080808080ULL)-s)&0x8080808080808080ULL)>>7)*0x0101010101010101ULL)>>56);
01188 }
01189 
01190 inline uint32_t bit_magic::i1BPNaive(uint64_t x, uint32_t i)
01191 {
01192     uint32_t pos = 0;
01193     while (x) {
01194         i -= x&1;
01195         x>>=1;
01196         if (!i) break;
01197         ++pos;
01198     }
01199     return pos;
01200 }
01201 
01202 // builtin version of sse version or
01203 // 64 bit version of 32 bit proposial of
01204 // http://www-graphics.stanford.edu/~seander/bithacks.html
01205 inline uint32_t bit_magic::l1BP(uint64_t x)
01206 {
01207 #ifdef __SSE4_2__
01208     if (x == 0)
01209         return 0;
01210     return 63 - __builtin_clzll(x);
01211 #else
01212     register uint64_t t,tt; // temporaries
01213     if ((tt = x >> 32)) { // l1BP >= 32
01214         if ((t = tt >> 16)) { // l1BP >= 48
01215             return (tt = t >> 8) ? 56 + L1BP[tt] : 48 + L1BP[t];
01216         } else { // l1BP < 48
01217             return (t = tt >> 8) ? 40 + L1BP[t] : 32 + L1BP[tt];
01218         }
01219     } else { // l1BP < 32
01220         if ((t = x >> 16)) { // l1BP >= 16
01221             return (tt = t >> 8) ? 24 + L1BP[tt] : 16 + L1BP[t];
01222         } else { // l1BP < 16
01223             return (tt = x >> 8) ?  8 + L1BP[tt] : L1BP[x];
01224         }
01225     }
01226 #endif
01227 }
01228 
01229 inline uint32_t bit_magic::l1BPNaive(uint64_t x)
01230 {
01231     if (x&0x8000000000000000ULL)
01232         return 63;
01233     x<<=1;
01234     for (int i=62; i>=0; ++i)
01235         if (x&0x8000000000000000ULL)
01236             return i;
01237         else
01238             x<<=1;
01239     return 0;
01240 }
01241 
01242 // details see: http://citeseer.ist.psu.edu/leiserson98using.html
01243 // or page 10, Knuth TAOCP Vol 4 F1A
01244 inline uint32_t bit_magic::r1BP(uint64_t x)
01245 {
01246 #ifdef __SSE4_2__
01247     if (x==0)
01248         return 0;
01249     return __builtin_ctzll(x);
01250 #else
01251     if (x&1) return 0;
01252     if (x&3) return 1;
01253     if (x&7) return 2;
01254     if (x&0x7F) { // in average every second random number x can be answered this way
01255 //              return 0;
01256         return lookupr1BP[(x&0x7F)>>3]+3;
01257     }
01258     // x&-x equals x with only the lsb set
01259     //default:
01260     return DeBruijn64ToIndex[((x&-x)*DeBruijn64)>>58];
01261 #endif
01262 }
01263 
01264 inline uint32_t bit_magic::r1BPNaive(uint64_t x)
01265 {
01266     if (x&1)
01267         return 0;
01268     x>>=1;
01269     for (int i=1; i<64; ++i)
01270         if (x&1)
01271             return i;
01272         else
01273             x>>=1;
01274     return 63;
01275 }
01276 
01277 inline uint32_t bit_magic::l11BP(uint64_t x)
01278 {
01279     // extract "11" 2bit blocks
01280     uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL;
01281 //      std::cerr<<"ex11="<<ex11<<std::endl;
01282     // extract "10" 2bit blocks
01283     uint64_t ex10or01 = (ex11|(ex11<<1))^x;
01284 //      uint64_t ex10or01 = (x^(x<<1))&0xAAAAAAAAAAAAAAAAULL;
01285 //      ex10or01 = (ex10or01|(ex10or01>>1))&x;
01286     // extract "10" 2bit blocks
01287     ex11 += (((ex11|(ex11<<1))+((ex10or01<<1)&0x5555555555555555ULL)) & ((ex10or01&0x5555555555555555ULL)|ex11));
01288     //
01289 //      std::cerr<<"ex11="<<ex11<<std::endl;
01290 //      uint32_t p = l1BP(ex11);// p % 2 == 0  and (x&(1<<p))==1
01291 //      std::cerr<<"p="<<p<<std::endl;
01292     return l1BP(ex11);/*
01293         if( p == 0 and (x&1) == 0 )
01294                 return 0;
01295         return p + (x>>(p+1)&1);
01296         */
01297 }
01298 /*
01299 inline uint64_t bit_magic::all11BPs(uint64_t x, uint32_t c){
01300         uint64_t ex11 =  (x&(x>>1))&0x5555555555555555ULL;
01301         uint64_t ex10or01 = (ex11|(ex11<<1))^x;
01302         ex11 += ( ((ex11|(ex11<<1))+(((ex10or01<<1)&0x5555555555555555ULL)|c)) & ((ex10or01&0x5555555555555555ULL)|ex11) );
01303         return ex11;
01304 }
01305 */
01306 
01307 inline uint64_t bit_magic::all11BPs(uint64_t x, bool& c)
01308 {
01309     uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL;
01310     uint64_t ex10or01 = (ex11|(ex11<<1))^x;
01311     ex11 += (((ex11|(ex11<<1))+(((ex10or01<<1)&0x5555555555555555ULL)|c)) & ((ex10or01&0x5555555555555555ULL)|ex11));
01312     c = (x&(~ex11))>>63;
01313     return ex11;
01314 }
01315 
01316 inline uint32_t bit_magic::i11BP(uint64_t x, uint32_t i, uint32_t c)
01317 {
01318     uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL;
01319     uint64_t ex10or01 = (ex11|(ex11<<1))^x;
01320     ex11 += (((ex11|(ex11<<1))+(((ex10or01<<1)&0x5555555555555555ULL)|c)) & ((ex10or01&0x5555555555555555ULL)|ex11));
01321     return i1BP(ex11,i);
01322 }
01323 
01324 inline uint32_t bit_magic::eI11BP(uint64_t x, uint32_t i)
01325 {
01326     return i1BP((x&(x<<1))&0xAAAAAAAAAAAAAAAAULL, i);
01327 }
01328 
01329 inline void bit_magic::write_int(uint64_t* word, uint64_t x, uint8_t offset, const uint8_t len)
01330 {
01331     x &= bit_magic::Li1Mask[len];
01332     if (offset + len < 64) {
01333         *word &=
01334             ((bit_magic::All1Mask << (offset+len)) | bit_magic::Li1Mask[offset]); // mask 1..10..01..1
01335         *word |= (x << offset);
01336 //              *word ^= ((*word ^ x) & (bit_magic::Li1Mask[len] << offset) );
01337 //      surprisingly the above line is slower than the lines above
01338     } else {
01339         *word &=
01340             ((bit_magic::Li1Mask[offset]));  // mask 0....01..1
01341         *word |= (x << offset);
01342         if ((offset = (offset+len)&0x3F)) { // offset+len > 64
01343             *(word+1) &= (~bit_magic::Li1Mask[offset]); // mask 1...10..0
01344 //                      *(word+1) &= bit_magic::Li0Mask[offset]; // mask 1...10..0
01345 //          surprisingly the above line is slower than the line above
01346             *(word+1) |= (x >> (len-offset));
01347         }
01348     }
01349 }
01350 
01351 //inline void bit_magic::writeInt2(uint64_t *word, uint64_t x, uint8_t offset, const uint8_t len){
01352 //      *word ^= ((x)& 111ULL << offset);
01353 //      *word ^= ((*word ^ x)& bit_magic::Li1Mask[len] << offset);
01354 //      if( offset + len > 64 ){
01355 //              offset = offset + len - 64;
01356 //              *(word+1) ^= (( *(word+1) ^ (x >> (len-offset)) ) & bit_magic::Li1Mask[offset]);
01357 //      }
01358 //}
01359 
01360 inline void bit_magic::write_int_and_move(uint64_t*& word, uint64_t x, uint8_t& offset, const uint8_t len)
01361 {
01362     x &= bit_magic::Li1Mask[len];
01363     if (offset + len < 64) {
01364         *word &=
01365             ((bit_magic::All1Mask << (offset+len)) | bit_magic::Li1Mask[offset]); // mask 1..10..01..1
01366         *word |= (x << offset);
01367         offset += len;
01368     } else {
01369         *word &=
01370             ((bit_magic::Li1Mask[offset]));  // mask 0....01..1
01371         *word |= (x << offset);
01372         if ((offset= (offset+len))>64) {// offset+len >= 64
01373             offset &= 0x3F;
01374             *(++word) &= (~bit_magic::Li1Mask[offset]); // mask 1...10..0
01375             *word |= (x >> (len-offset));
01376         } else {
01377             offset = 0;
01378             ++word;
01379         }
01380     }
01381 }
01382 
01383 inline uint64_t bit_magic::read_int(const uint64_t* word, uint8_t offset, const uint8_t len)
01384 {
01385     uint64_t w1 = (*word)>>offset;
01386     if ((offset+len) > 64) { // if offset+len > 64
01387         return w1 |  // w1 or w2 adepted:
01388                ((*(word+1) & bit_magic::Li1Mask[(offset+len)&0x3F])   // set higher bits zero
01389                 << (64-offset));  // move bits to the left
01390     } else {
01391         return w1 & bit_magic::Li1Mask[len];
01392     }
01393 }
01394 
01395 inline uint64_t bit_magic::read_int_and_move(const uint64_t*& word, uint8_t& offset, const uint8_t len)
01396 {
01397     uint64_t w1 = (*word)>>offset;
01398     if ((offset = (offset+len))>=64) {  // if offset+len > 64
01399         if (offset==64) {
01400             offset &= 0x3F;
01401             ++word;
01402             return w1;
01403         } else {
01404             offset &= 0x3F;
01405             return w1 |
01406                    (((*(++word)) & bit_magic::Li1Mask[offset]) << (len-offset));
01407         }
01408     } else {
01409         return w1 & bit_magic::Li1Mask[len];
01410     }
01411 }
01412 
01413 inline uint64_t bit_magic::readUnaryInt(const uint64_t* word, uint8_t offset)
01414 {
01415     register uint64_t w = *word >> offset;
01416     if (w) {
01417         return bit_magic::r1BP(w)/*+1*/;
01418     } else {
01419         if (0!=(w=*(++word)))
01420             return bit_magic::r1BP(w)+64-offset/*+1*/;
01421         uint64_t cnt=2;
01422         while (0==(w=*(++word)))
01423             ++cnt;
01424         return bit_magic::r1BP(w)+(cnt<<6)/*cnt*64*/-offset/*+1*/;//+(64-offset)
01425     }
01426     return 0;
01427 }
01428 
01429 inline uint64_t bit_magic::readUnaryIntAndMove(const uint64_t*& word, uint8_t& offset)
01430 {
01431     register uint64_t w = (*word) >> offset; // temporary variable is good for the performance
01432     if (w) {
01433         uint8_t r = bit_magic::r1BP(w);
01434         offset = (offset + r+1)&0x3F;
01435         // we know that offset + r +1 <= 64, so if the new offset equals 0 increase word
01436         word += (offset==0);
01437         return r;
01438     } else {
01439         uint8_t rr=0;
01440         if (0!=(w=*(++word))) {
01441             rr = bit_magic::r1BP(w)+64-offset;
01442             offset = (offset+rr+1)&0x3F;
01443             word += (offset==0);
01444             return rr;
01445         } else {
01446             uint64_t cnt_1=1;
01447             while (0==(w=*(++word)))
01448                 ++cnt_1;
01449             rr = bit_magic::r1BP(w)+64-offset;
01450             offset = (offset+rr+1)&0x3F;
01451             word += (offset==0);
01452             return ((cnt_1)<<6) + rr;
01453         }
01454     }
01455     return 0;
01456 }
01457 
01458 inline void bit_magic::move_right(const uint64_t*& word, uint8_t& offset, const uint8_t len)
01459 {
01460     if ((offset+=len)&0xC0) { // if offset >= 65
01461         offset&=0x3F;
01462         ++word;
01463     }
01464 }
01465 
01466 inline void bit_magic::move_left(const uint64_t*& word, uint8_t& offset, const uint8_t len)
01467 {
01468     if ((offset-=len)&0xC0) {  // if offset-len<0
01469         offset&=0x3F;
01470         --word;
01471     }
01472 }
01473 
01474 inline uint64_t bit_magic::next(const uint64_t* word, uint64_t idx)
01475 {
01476     word += (idx>>6);
01477     if (*word & ~Li1Mask[idx&0x3F]) {
01478         return (idx & ~((size_t)0x3F)) + r1BP(*word & ~Li1Mask[idx&0x3F]);
01479     }
01480     idx = (idx & ~((size_t)0x3F)) + 64;
01481     ++word;
01482     while (*word==0) {
01483         idx += 64;
01484         ++word;
01485     }
01486     return idx + r1BP(*word);
01487 }
01488 
01489 inline uint64_t bit_magic::prev(const uint64_t* word, uint64_t idx)
01490 {
01491     word += (idx>>6);
01492     if (*word & Li1Mask[(idx&0x3F)+1]) {
01493         return (idx & ~((size_t)0x3F)) + l1BP(*word & Li1Mask[(idx&0x3F)+1]);
01494     }
01495     idx = (idx & ~((size_t)0x3F)) - 64;
01496     --word;
01497     while (*word==0) {
01498         idx -= 64;
01499         --word;
01500     }
01501     return idx + l1BP(*word);
01502 }
01503 
01504 } // end namespace sdsl
01505 
01506 #endif