SDSL: Succinct Data Structure Library
A C++ template library for succinct data structures
|
00001 /* sdsl - succinct data structures library 00002 Copyright (C) 2010 Simon Gog 00003 00004 This program is free software: you can redistribute it and/or modify 00005 it under the terms of the GNU General Public License as published by 00006 the Free Software Foundation, either version 3 of the License, or 00007 (at your option) any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU General Public License for more details. 00013 00014 You should have received a copy of the GNU General Public License 00015 along with this program. If not, see http://www.gnu.org/licenses/ . 00016 */ 00021 #ifndef INCLUDED_SDSL_CSA_CONSTRUCT 00022 #define INCLUDED_SDSL_CSA_CONSTRUCT 00023 00024 #include "typedefs.hpp" 00025 #include "int_vector.hpp" 00026 #include "algorithms_for_suffix_array_construction.hpp" 00027 #include "algorithms_for_compressed_suffix_arrays.hpp" 00028 #include "util.hpp" 00029 #include "testutils.hpp" 00030 #include "suffixarrays.hpp" 00031 #include "bwt_construct.hpp" 00032 00033 #include <iostream> 00034 #include <fstream> 00035 #include <string> 00036 #include <stdexcept> 00037 00038 namespace sdsl 00039 { 00040 00041 template<class Csa> 00042 static bool construct_csa(std::string file_name, Csa& csa) 00043 { 00044 tMSS file_map; 00045 return construct_csa(file_name, csa, file_map, true, "./",""); 00046 } 00047 00048 template<class Csa> 00049 static bool construct_csa(std::string file_name, Csa& csa, tMSS& file_map, bool delete_files=true, std::string dir="./", std::string id="") 00050 { 00051 uint64_t fs = 0; 00052 char* ccc = NULL; 00053 if ((fs=file::read_text((const char*)file_name.c_str(), ccc))>0) { 00054 typename Csa::size_type n = strlen((const char*)ccc); 00055 if (id == "") 00056 id = util::to_string(util::get_pid())+"_"+util::to_string(util::get_id()).c_str(); 00057 if (fs != n + 1) { 00058 std::cerr << "# WARNING: file \"" << file_name << "\" contains 0-bytes." << std::endl; 00059 algorithm::shift_text((char*)ccc, fs, true); 00060 n = fs-1; 00061 } 00062 00063 if (!util::store_to_file(char_array_serialize_wrapper<>((unsigned char*)ccc,n+1), (dir+"text_"+id).c_str())) { 00064 throw std::ios_base::failure("#csa_construct: Cannot store text to file system!"); 00065 delete [] ccc; 00066 return false; 00067 } else { 00068 file_map["text"] = (dir+"text_"+id).c_str(); 00069 } 00070 delete [] ccc; 00071 return construct_csa(csa, file_map, delete_files, dir, id); 00072 } 00073 return false; 00074 } 00075 00076 template<class Csa> 00077 static bool construct_csa(Csa& csa, tMSS& file_map, bool delete_files=true, std::string dir="./", std::string id="") 00078 { 00079 write_R_output("csa", "construct CSA", "begin", 1, 0); 00080 int_vector_file_buffer<8> text_buf(file_map["text"].c_str()); 00081 typedef int_vector<>::size_type size_type; 00082 if (id=="") 00083 id = util::to_string(util::get_pid())+"_"+util::to_string(util::get_id()).c_str(); 00084 text_buf.reset(); 00085 size_type n = text_buf.int_vector_size; 00086 00087 std::string sa_file_name = dir+"sa_"+id; 00088 // if sa file already exists 00089 std::ifstream in(sa_file_name.c_str()); 00090 if (!in) { 00091 unsigned char* text = NULL; 00092 util::load_from_int_vector_buffer(text, text_buf); 00093 00094 typename Csa::size_type nn = strlen((const char*) text); 00095 if (nn+1 != n) { 00096 std::cerr << "# WARNING: text contains 0-bytes. nn=" << nn << " n=" << n << std::endl; 00097 algorithm::shift_text((char*)text, n); 00098 } 00099 00100 write_R_output("csa", "construct SA", "begin", 1, 0); 00101 int_vector<> sa = int_vector<>(n, 0, bit_magic::l1BP(n+1)+1); 00102 algorithm::calculate_sa(text,n, sa); // calculate the suffix array sa of str 00103 delete [] text; 00104 00105 assert(sa.size() == n); 00106 00107 write_R_output("csa", "construct SA", "end", 1, 0); 00108 write_R_output("csa", "store SA", "begin", 1, 0); 00109 00110 if (!util::store_to_file(sa, sa_file_name.c_str())) { 00111 throw std::ios_base::failure("#csa_construct: Cannot store SA to file system!"); 00112 return false; 00113 } else { 00114 file_map["sa"] = sa_file_name; 00115 } 00116 write_R_output("csa", "store SA", "end", 1, 0); 00117 { 00118 sa.resize(0); 00119 int_vector<> temp; 00120 temp.swap(sa); 00121 } 00122 } else { 00123 file_map["sa"] = sa_file_name; 00124 in.close(); 00125 } 00126 00127 00128 write_R_output("csa", "encode CSA", "begin", 1, 0); 00129 util::assign(csa, Csa(file_map, dir, id)); 00130 write_R_output("csa", "encode CSA", "end", 1, 0); 00131 00132 if (delete_files) { 00133 util::delete_all_files(file_map); 00134 } 00135 write_R_output("csa", "construct CSA", "end", 1, 0); 00136 return true; 00137 } 00138 00139 template<class Csa> 00140 static bool construct_csa_of_reversed_text(std::string file_name, Csa& csa) 00141 { 00142 tMSS file_map; 00143 return construct_csa_of_reversed_text(file_name, csa, file_map, true, "./",""); 00144 } 00145 00146 00147 00148 template<class Csa> 00149 static bool construct_csa_of_reversed_text(std::string file_name, Csa& csa, tMSS& file_map, bool delete_files=true, 00150 std::string dir="./", std::string id="") 00151 { 00152 typedef int_vector<>::size_type size_type; 00153 00154 if (id=="") 00155 id = util::to_string(util::get_pid())+"_"+util::to_string(util::get_id()).c_str(); 00156 00157 std::string tmp_rev_file_name = dir+"text_rev_"+id; 00158 std::ifstream in(tmp_rev_file_name.c_str()); 00159 if (!in) { 00160 char* text = NULL; 00161 size_type n = 0; 00162 if ((n=file::read_text((const char*)file_name.c_str(), text)) > 0) { 00163 --n; // since read_text appends a 0 byte 00164 for (size_type i=0; i < n/2; ++i) { 00165 std::swap(text[i], text[n-1-i]); 00166 } 00167 file::write_text((const char*)tmp_rev_file_name.c_str(),text, n); 00168 file_map["text_rev"] = tmp_rev_file_name; 00169 delete [] text; 00170 } else { 00171 std::cout<<"ERROR: text cannot be read from file "<<file_name<<std::endl; 00172 return false; 00173 } 00174 } else { 00175 file_map["text_rev"] = tmp_rev_file_name; 00176 in.close(); 00177 } 00178 bool res = construct_csa(tmp_rev_file_name, csa, file_map, delete_files, dir, id); 00179 if (delete_files) { 00180 util::delete_all_files(file_map); 00181 } 00182 return res; 00183 } 00184 00185 00186 }// end namespace 00187 00188 #endif