|
bpp-seq
2.1.0
|
00001 // 00002 // File: WordAlphabet.h 00003 // Authors: Laurent Gueguen 00004 // Sylvain Gaillard 00005 // Created on: Sun Dec 28 2008 00006 // 00007 00008 /* 00009 Copyright or © or Copr. CNRS, (November 17, 2004) 00010 00011 This software is a computer program whose purpose is to provide classes 00012 for sequences analysis. 00013 00014 This software is governed by the CeCILL license under French law and 00015 abiding by the rules of distribution of free software. You can use, 00016 modify and/ or redistribute the software under the terms of the CeCILL 00017 license as circulated by CEA, CNRS and INRIA at the following URL 00018 "http://www.cecill.info". 00019 00020 As a counterpart to the access to the source code and rights to copy, 00021 modify and redistribute granted by the license, users are provided only 00022 with a limited warranty and the software's author, the holder of the 00023 economic rights, and the successive licensors have only limited 00024 liability. 00025 00026 In this respect, the user's attention is drawn to the risks associated 00027 with loading, using, modifying and/or developing or reproducing the 00028 software by the user in light of its specific status of free software, 00029 that may mean that it is complicated to manipulate, and that also 00030 therefore means that it is reserved for developers and experienced 00031 professionals having in-depth computer knowledge. Users are therefore 00032 encouraged to load and test the software's suitability as regards their 00033 requirements in conditions enabling the security of their systems and/or 00034 data to be ensured and, more generally, to use and operate it in the 00035 same conditions as regards security. 00036 00037 The fact that you are presently reading this means that you have had 00038 knowledge of the CeCILL license and that you accept its terms. 00039 */ 00040 00041 #include "WordAlphabet.h" 00042 #include <Bpp/Text/TextTools.h> 00043 00044 using namespace bpp; 00045 00046 // From the STL: 00047 #include <iostream> 00048 00049 using namespace std; 00050 00051 WordAlphabet::WordAlphabet(const vector<const Alphabet*>& vAlpha) : 00052 AbstractAlphabet(), 00053 vAbsAlph_(vAlpha) 00054 { 00055 build_(); 00056 } 00057 00058 WordAlphabet::WordAlphabet(const Alphabet* pAlpha, unsigned int num) : 00059 AbstractAlphabet(), 00060 vAbsAlph_(0) 00061 { 00062 for (unsigned int i = 0; i < num; i++) 00063 { 00064 vAbsAlph_.push_back(pAlpha); 00065 } 00066 00067 build_(); 00068 } 00069 00070 void WordAlphabet::build_() 00071 { 00072 unsigned int size = 1; 00073 00074 for (unsigned int i = 0; i < vAbsAlph_.size(); i++) 00075 { 00076 size *= vAbsAlph_[i]->getSize(); 00077 } 00078 00079 resize(size + 2); 00080 00081 string s = ""; 00082 for (unsigned int i = 0; i < vAbsAlph_.size(); i++) 00083 { 00084 s += "-"; 00085 } 00086 00087 setState(0, AlphabetState(-1, s, "gap")); 00088 00089 for (unsigned int i = 0; i < size; i++) 00090 { 00091 setState(i + 1, AlphabetState(i, "", "")); 00092 } 00093 00094 unsigned lr = size; 00095 char c; 00096 for (unsigned int na = 0; na < vAbsAlph_.size(); na++) 00097 { 00098 lr /= vAbsAlph_[na]->getSize(); 00099 unsigned int j = 1; 00100 unsigned int i = 0; 00101 while (j <= size) 00102 { 00103 c = vAbsAlph_[na]->intToChar(i)[0]; 00104 00105 for (unsigned int k = 0; k < lr; k++) 00106 { 00107 getStateAt(j).setLetter(getStateAt(j).getLetter() + c); 00108 j++; 00109 // alphabet[j++].letter += c; 00110 } 00111 00112 if (++i == vAbsAlph_[na]->getSize()) 00113 i = 0; 00114 } 00115 } 00116 00117 s = ""; 00118 for (unsigned i = 0; i < vAbsAlph_.size(); i++) 00119 { 00120 s += "N"; 00121 } 00122 00123 setState(size + 1, AlphabetState(size, s, "Unresolved")); 00124 remap(); 00125 } 00126 00127 /******************************************************************************/ 00128 00129 std::string WordAlphabet::getAlphabetType() const 00130 { 00131 string s = "Word alphabet:"; 00132 for (unsigned int i = 0; i < vAbsAlph_.size(); i++) 00133 { 00134 s += " " + vAbsAlph_[i]->getAlphabetType(); 00135 } 00136 00137 return s; 00138 } 00139 00140 bool WordAlphabet::hasUniqueAlphabet() const 00141 { 00142 string s = vAbsAlph_[0]->getAlphabetType(); 00143 for (unsigned int i = 1; i < vAbsAlph_.size(); i++) 00144 { 00145 if (vAbsAlph_[i]->getAlphabetType() != s) 00146 return false; 00147 } 00148 return true; 00149 } 00150 00151 bool WordAlphabet::containsUnresolved(const std::string& state) const throw (BadCharException) 00152 { 00153 size_t s = vAbsAlph_.size(); 00154 if (state.length() != s) 00155 throw BadCharException(state, "WordAlphabet::containsUnresolved", this); 00156 00157 for (size_t i = 0; i < vAbsAlph_.size(); i++) 00158 { 00159 if (vAbsAlph_[i]->isUnresolved(state.substr(i, 1))) 00160 { 00161 return true; 00162 } 00163 } 00164 return false; 00165 } 00166 00167 /******************************************************************************/ 00168 00169 bool WordAlphabet::containsGap(const std::string& state) const throw (BadCharException) 00170 { 00171 size_t s = vAbsAlph_.size(); 00172 if (state.length() != s) 00173 throw BadCharException(state, "WordAlphabet::containsGap", this); 00174 00175 for (size_t i = 0; i < vAbsAlph_.size(); i++) 00176 { 00177 if (vAbsAlph_[i]->isGap(state.substr(i, 1))) 00178 return true; 00179 } 00180 00181 return false; 00182 } 00183 00184 /******************************************************************************/ 00185 00186 std::string WordAlphabet::getName(const std::string& state) const throw (BadCharException) 00187 { 00188 if (state.size() != vAbsAlph_.size()) 00189 throw BadCharException(state, "WordAlphabet::getName", this); 00190 if (containsUnresolved(state)) 00191 return getStateAt(getSize() + 1).getName(); 00192 if (containsGap(state)) 00193 return getStateAt(0).getName(); 00194 else 00195 return AbstractAlphabet::getName(state); 00196 } 00197 00198 /******************************************************************************/ 00199 00200 std::vector<int> WordAlphabet::getAlias(int state) const throw (BadIntException) 00201 { 00202 if (!isIntInAlphabet(state)) 00203 throw BadIntException(state, "WordAlphabet::getAlias(int): Specified base unknown."); 00204 vector<int> v; 00205 int i, s = getSize(); 00206 00207 if (state == s) 00208 { 00209 v.resize(s); 00210 for (i = 0; i < s; i++) 00211 { 00212 v[i] = i; 00213 } 00214 } 00215 else 00216 { 00217 v.resize(1); v[0] = state; 00218 } 00219 return v; 00220 } 00221 00222 /******************************************************************************/ 00223 00224 std::vector<std::string> WordAlphabet::getAlias(const std::string& state) const throw (BadCharException) 00225 { 00226 string locstate = TextTools::toUpper(state); 00227 if (!isCharInAlphabet(locstate)) 00228 throw BadCharException(locstate, "WordAlphabet::getAlias(string): Specified base unknown."); 00229 vector<string> v; 00230 00231 unsigned int i, s = getSize(); 00232 00233 string st = ""; 00234 for (i = 0; i < vAbsAlph_.size(); i++) 00235 { 00236 st += "N"; 00237 } 00238 00239 if (locstate == st) 00240 { 00241 v.resize(s); 00242 for (i = 0; i < s; i++) 00243 { 00244 v[i] = intToChar(i); 00245 } 00246 } 00247 else 00248 { 00249 v.resize(1); v[0] = state; 00250 } 00251 return v; 00252 } 00253 00254 /******************************************************************************/ 00255 00256 int WordAlphabet::getGeneric(const std::vector<int>& states) const throw (BadIntException) 00257 { 00258 return states[0]; 00259 } 00260 00261 /******************************************************************************/ 00262 00263 std::string WordAlphabet::getGeneric(const std::vector<std::string>& states) const throw (BadCharException) 00264 { 00265 return states[0]; 00266 } 00267 00268 /******************************************************************************/ 00269 00270 int WordAlphabet::getWord(const std::vector<int>& vint, size_t pos) const throw (IndexOutOfBoundsException) 00271 { 00272 if (vint.size() < pos + vAbsAlph_.size()) 00273 throw IndexOutOfBoundsException("WordAlphabet::getWord", pos, 0, vint.size() - vAbsAlph_.size()); 00274 00275 vector<string> vs; 00276 for (size_t i = 0; i < vAbsAlph_.size(); i++) 00277 { 00278 vs.push_back(vAbsAlph_[i]->intToChar(vint[i + pos])); 00279 } 00280 00281 return charToInt(getWord(vs)); // This can't throw a BadCharException! 00282 } 00283 00284 /****************************************************************************************/ 00285 00286 std::string WordAlphabet::getWord(const std::vector<string>& vpos, size_t pos) const throw (IndexOutOfBoundsException, BadCharException) 00287 { 00288 if (vpos.size() < pos + vAbsAlph_.size()) 00289 throw IndexOutOfBoundsException("WordAlphabet::getWord", pos, 0, vpos.size() - vAbsAlph_.size()); 00290 00291 string s = ""; 00292 for (size_t i = 0; i < vAbsAlph_.size(); i++) 00293 { 00294 s += vpos[pos + i]; 00295 } 00296 // test 00297 charToInt(s); 00298 return s; 00299 } 00300 00301 /****************************************************************************************/ 00302 00303 Sequence* WordAlphabet::translate(const Sequence& sequence, size_t pos) const throw (AlphabetMismatchException, Exception) 00304 { 00305 if ((!hasUniqueAlphabet()) or 00306 (sequence.getAlphabet()->getAlphabetType() != vAbsAlph_[0]->getAlphabetType())) 00307 throw AlphabetMismatchException("No matching alphabets", sequence.getAlphabet(), vAbsAlph_[0]); 00308 00309 vector<int> v1 = sequence.getContent(); 00310 vector<int> v2; 00311 00312 size_t s = sequence.size(); 00313 unsigned int l = getLength(); 00314 size_t i = pos; 00315 00316 while (i + l < s) 00317 { 00318 v2.push_back(getWord(v1, i)); 00319 i += l; 00320 } 00321 00322 return new BasicSequence(sequence.getName(), v2, this); 00323 } 00324 00325 /****************************************************************************************/ 00326 00327 Sequence* WordAlphabet::reverse(const Sequence& sequence) const throw (AlphabetMismatchException, Exception) 00328 { 00329 if ((!hasUniqueAlphabet()) or 00330 (sequence.getAlphabet()->getAlphabetType() != getAlphabetType())) 00331 throw AlphabetMismatchException("No matching alphabets"); 00332 00333 Sequence* pseq = new BasicSequence(sequence.getName(), "", getNAlphabet(0)); 00334 00335 size_t s = sequence.size(); 00336 for (size_t i = 0; i < s; i++) 00337 { 00338 pseq->append(getPositions(sequence[i])); 00339 } 00340 00341 return pseq; 00342 } 00343 00344 /****************************************************************************************/ 00345