SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
 All Classes Namespaces Files Functions Variables Typedefs Friends
sdsl/include/sdsl/csa_construct.hpp
Go to the documentation of this file.
00001 /* sdsl - succinct data structures library
00002     Copyright (C) 2010 Simon Gog
00003 
00004     This program is free software: you can redistribute it and/or modify
00005     it under the terms of the GNU General Public License as published by
00006     the Free Software Foundation, either version 3 of the License, or
00007     (at your option) any later version.
00008 
00009     This program is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012     GNU General Public License for more details.
00013 
00014     You should have received a copy of the GNU General Public License
00015     along with this program.  If not, see http://www.gnu.org/licenses/ .
00016 */
00021 #ifndef INCLUDED_SDSL_CSA_CONSTRUCT
00022 #define INCLUDED_SDSL_CSA_CONSTRUCT
00023 
00024 #include "typedefs.hpp"
00025 #include "int_vector.hpp"
00026 #include "algorithms_for_suffix_array_construction.hpp"
00027 #include "algorithms_for_compressed_suffix_arrays.hpp"
00028 #include "util.hpp"
00029 #include "testutils.hpp"
00030 #include "suffixarrays.hpp"
00031 #include "bwt_construct.hpp"
00032 
00033 #include <iostream>
00034 #include <fstream>
00035 #include <string>
00036 #include <stdexcept>
00037 
00038 namespace sdsl
00039 {
00040 
00041 template<class Csa>
00042 static bool construct_csa(std::string file_name, Csa& csa)
00043 {
00044     tMSS file_map;
00045     return construct_csa(file_name, csa, file_map, true, "./","");
00046 }
00047 
00048 template<class Csa>
00049 static bool construct_csa(std::string file_name, Csa& csa, tMSS& file_map, bool delete_files=true, std::string dir="./", std::string id="")
00050 {
00051     uint64_t fs = 0;
00052     char* ccc = NULL;
00053     if ((fs=file::read_text((const char*)file_name.c_str(), ccc))>0) {
00054         typename Csa::size_type n = strlen((const char*)ccc);
00055         if (id == "")
00056             id =  util::to_string(util::get_pid())+"_"+util::to_string(util::get_id()).c_str();
00057         if (fs != n + 1) {
00058             std::cerr << "# WARNING: file \"" << file_name << "\" contains 0-bytes." << std::endl;
00059             algorithm::shift_text((char*)ccc, fs, true);
00060             n = fs-1;
00061         }
00062 
00063         if (!util::store_to_file(char_array_serialize_wrapper<>((unsigned char*)ccc,n+1), (dir+"text_"+id).c_str())) {
00064             throw std::ios_base::failure("#csa_construct: Cannot store text to file system!");
00065             delete [] ccc;
00066             return false;
00067         } else {
00068             file_map["text"] = (dir+"text_"+id).c_str();
00069         }
00070         delete [] ccc;
00071         return construct_csa(csa, file_map, delete_files, dir, id);
00072     }
00073     return false;
00074 }
00075 
00076 template<class Csa>
00077 static bool construct_csa(Csa& csa, tMSS& file_map, bool delete_files=true, std::string dir="./", std::string id="")
00078 {
00079     write_R_output("csa", "construct CSA", "begin", 1, 0);
00080     int_vector_file_buffer<8> text_buf(file_map["text"].c_str());
00081     typedef int_vector<>::size_type size_type;
00082     if (id=="")
00083         id =  util::to_string(util::get_pid())+"_"+util::to_string(util::get_id()).c_str();
00084     text_buf.reset();
00085     size_type n = text_buf.int_vector_size;
00086 
00087     std::string sa_file_name = dir+"sa_"+id;
00088     // if sa file already exists
00089     std::ifstream in(sa_file_name.c_str());
00090     if (!in) {
00091         unsigned char* text = NULL;
00092         util::load_from_int_vector_buffer(text, text_buf);
00093 
00094         typename Csa::size_type nn = strlen((const char*) text);
00095         if (nn+1 != n) {
00096             std::cerr << "# WARNING: text contains 0-bytes. nn=" << nn << " n=" << n << std::endl;
00097             algorithm::shift_text((char*)text, n);
00098         }
00099 
00100         write_R_output("csa", "construct SA", "begin", 1, 0);
00101         int_vector<> sa = int_vector<>(n, 0, bit_magic::l1BP(n+1)+1);
00102         algorithm::calculate_sa(text,n, sa);     // calculate the suffix array sa of str
00103         delete [] text;
00104 
00105         assert(sa.size() == n);
00106 
00107         write_R_output("csa", "construct SA", "end", 1, 0);
00108         write_R_output("csa", "store SA", "begin", 1, 0);
00109 
00110         if (!util::store_to_file(sa, sa_file_name.c_str())) {
00111             throw std::ios_base::failure("#csa_construct: Cannot store SA to file system!");
00112             return false;
00113         } else {
00114             file_map["sa"] = sa_file_name;
00115         }
00116         write_R_output("csa", "store SA", "end", 1, 0);
00117         {
00118             sa.resize(0);
00119             int_vector<> temp;
00120             temp.swap(sa);
00121         }
00122     } else {
00123         file_map["sa"] = sa_file_name;
00124         in.close();
00125     }
00126 
00127 
00128     write_R_output("csa", "encode CSA", "begin", 1, 0);
00129     util::assign(csa, Csa(file_map, dir, id));
00130     write_R_output("csa", "encode CSA", "end", 1, 0);
00131 
00132     if (delete_files) {
00133         util::delete_all_files(file_map);
00134     }
00135     write_R_output("csa", "construct CSA", "end", 1, 0);
00136     return true;
00137 }
00138 
00139 template<class Csa>
00140 static bool construct_csa_of_reversed_text(std::string file_name, Csa& csa)
00141 {
00142     tMSS file_map;
00143     return construct_csa_of_reversed_text(file_name, csa, file_map, true, "./","");
00144 }
00145 
00146 
00147 
00148 template<class Csa>
00149 static bool construct_csa_of_reversed_text(std::string file_name, Csa& csa, tMSS& file_map, bool delete_files=true,
00150         std::string dir="./", std::string id="")
00151 {
00152     typedef int_vector<>::size_type size_type;
00153 
00154     if (id=="")
00155         id =  util::to_string(util::get_pid())+"_"+util::to_string(util::get_id()).c_str();
00156 
00157     std::string tmp_rev_file_name = dir+"text_rev_"+id;
00158     std::ifstream in(tmp_rev_file_name.c_str());
00159     if (!in) {
00160         char* text = NULL;
00161         size_type n = 0;
00162         if ((n=file::read_text((const char*)file_name.c_str(), text)) > 0) {
00163             --n; // since read_text appends a 0 byte
00164             for (size_type i=0; i < n/2; ++i) {
00165                 std::swap(text[i], text[n-1-i]);
00166             }
00167             file::write_text((const char*)tmp_rev_file_name.c_str(),text, n);
00168             file_map["text_rev"] = tmp_rev_file_name;
00169             delete [] text;
00170         } else {
00171             std::cout<<"ERROR: text cannot be read from file "<<file_name<<std::endl;
00172             return false;
00173         }
00174     } else {
00175         file_map["text_rev"] = tmp_rev_file_name;
00176         in.close();
00177     }
00178     bool res = construct_csa(tmp_rev_file_name, csa, file_map, delete_files, dir, id);
00179     if (delete_files) {
00180         util::delete_all_files(file_map);
00181     }
00182     return res;
00183 }
00184 
00185 
00186 }// end namespace
00187 
00188 #endif