|
bpp-seq
2.1.0
|
00001 // 00002 // File: SequenceTools.cpp 00003 // Authors: Guillaume Deuchst 00004 // Julien Dutheil 00005 // Sylvain Gaillard 00006 // Created on: Tue Aug 21 2003 00007 // 00008 00009 /* 00010 Copyright or © or Copr. Bio++ Development Team, (November 17, 2004) 00011 00012 This software is a computer program whose purpose is to provide classes 00013 for sequences analysis. 00014 00015 This software is governed by the CeCILL license under French law and 00016 abiding by the rules of distribution of free software. You can use, 00017 modify and/ or redistribute the software under the terms of the CeCILL 00018 license as circulated by CEA, CNRS and INRIA at the following URL 00019 "http://www.cecill.info". 00020 00021 As a counterpart to the access to the source code and rights to copy, 00022 modify and redistribute granted by the license, users are provided only 00023 with a limited warranty and the software's author, the holder of the 00024 economic rights, and the successive licensors have only limited 00025 liability. 00026 00027 In this respect, the user's attention is drawn to the risks associated 00028 with loading, using, modifying and/or developing or reproducing the 00029 software by the user in light of its specific status of free software, 00030 that may mean that it is complicated to manipulate, and that also 00031 therefore means that it is reserved for developers and experienced 00032 professionals having in-depth computer knowledge. Users are therefore 00033 encouraged to load and test the software's suitability as regards their 00034 requirements in conditions enabling the security of their systems and/or 00035 data to be ensured and, more generally, to use and operate it in the 00036 same conditions as regards security. 00037 00038 The fact that you are presently reading this means that you have had 00039 knowledge of the CeCILL license and that you accept its terms. 00040 */ 00041 00042 #include "SequenceTools.h" 00043 00044 #include "Alphabet/AlphabetTools.h" 00045 #include "StringSequenceTools.h" 00046 #include <Bpp/Numeric/Matrix/Matrix.h> 00047 #include <Bpp/Numeric/VectorTools.h> 00048 00049 using namespace bpp; 00050 00051 // From the STL: 00052 #include <ctype.h> 00053 #include <cmath> 00054 #include <list> 00055 #include <iostream> 00056 00057 using namespace std; 00058 00059 DNA SequenceTools::_DNA; 00060 RNA SequenceTools::_RNA; 00061 RNY SequenceTools::_RNY(_DNA); 00062 NucleicAcidsReplication SequenceTools::_DNARep(&_DNA, &_DNA); 00063 NucleicAcidsReplication SequenceTools::_RNARep(&_RNA, &_RNA); 00064 NucleicAcidsReplication SequenceTools::_transc(&_DNA, &_RNA); 00065 00066 /******************************************************************************/ 00067 00068 Sequence* SequenceTools::subseq(const Sequence& sequence, size_t begin, size_t end) throw (IndexOutOfBoundsException, Exception) 00069 { 00070 // Checking interval 00071 if (end >= sequence.size()) 00072 throw IndexOutOfBoundsException ("SequenceTools::subseq : Invalid upper bound", end, 0, sequence.size()); 00073 if (end < begin) 00074 throw Exception ("SequenceTools::subseq : Invalid interval"); 00075 00076 // Copy sequence 00077 vector<int> temp(sequence.getContent()); 00078 00079 // Truncate sequence 00080 temp.erase(temp.begin() + end + 1, temp.end()); 00081 temp.erase(temp.begin(), temp.begin() + begin); 00082 00083 // New sequence creation 00084 return new BasicSequence(sequence.getName(), temp, sequence.getComments(), sequence.getAlphabet()); 00085 } 00086 00087 /******************************************************************************/ 00088 00089 Sequence* SequenceTools::concatenate(const Sequence& seq1, const Sequence& seq2) throw (AlphabetMismatchException, Exception) 00090 { 00091 // Sequence's alphabets matching verification 00092 if ((seq1.getAlphabet()->getAlphabetType()) != (seq2.getAlphabet()->getAlphabetType())) 00093 throw AlphabetMismatchException("SequenceTools::concatenate : Sequence's alphabets don't match ", seq1.getAlphabet(), seq2.getAlphabet()); 00094 00095 // Sequence's names matching verification 00096 if (seq1.getName() != seq2.getName()) 00097 throw Exception ("SequenceTools::concatenate : Sequence's names don't match"); 00098 00099 // Concatenate sequences and send result 00100 vector<int> sequence = seq1.getContent(); 00101 vector<int> sequence2 = seq2.getContent(); 00102 sequence.insert(sequence.end(), sequence2.begin(), sequence2.end()); 00103 return new BasicSequence(seq1.getName(), sequence, seq1.getComments(), seq1.getAlphabet()); 00104 } 00105 00106 /******************************************************************************/ 00107 00108 Sequence& SequenceTools::complement(Sequence& seq) throw (AlphabetException) 00109 { 00110 // Alphabet type checking 00111 NucleicAcidsReplication* NAR; 00112 if (seq.getAlphabet()->getAlphabetType() == "DNA alphabet") 00113 { 00114 NAR = &_DNARep; 00115 } 00116 else if (seq.getAlphabet()->getAlphabetType() == "RNA alphabet") 00117 { 00118 NAR = &_RNARep; 00119 } 00120 else 00121 { 00122 throw AlphabetException("SequenceTools::complement: Sequence must be nucleic.", seq.getAlphabet()); 00123 } 00124 for (size_t i = 0; i < seq.size(); i++) 00125 { 00126 seq.setElement(i, NAR->translate(seq.getValue(i))); 00127 } 00128 return seq; 00129 } 00130 00131 /******************************************************************************/ 00132 00133 Sequence* SequenceTools::getComplement(const Sequence& sequence) throw (AlphabetException) 00134 { 00135 // Alphabet type checking 00136 NucleicAcidsReplication* NAR; 00137 if (sequence.getAlphabet()->getAlphabetType() == "DNA alphabet") 00138 { 00139 NAR = &_DNARep; 00140 } 00141 else if (sequence.getAlphabet()->getAlphabetType() == "RNA alphabet") 00142 { 00143 NAR = &_RNARep; 00144 } 00145 else 00146 { 00147 throw AlphabetException ("SequenceTools::getComplement: Sequence must be nucleic.", sequence.getAlphabet()); 00148 } 00149 00150 return NAR->translate(sequence); 00151 } 00152 00153 /******************************************************************************/ 00154 00155 Sequence* SequenceTools::transcript(const Sequence& sequence) throw (AlphabetException) 00156 { 00157 // Alphabet type checking 00158 if (sequence.getAlphabet()->getAlphabetType() != "DNA alphabet") 00159 { 00160 throw AlphabetException ("SequenceTools::transcript : Sequence must be DNA", sequence.getAlphabet()); 00161 } 00162 00163 return _transc.translate(sequence); 00164 } 00165 00166 /******************************************************************************/ 00167 00168 Sequence* SequenceTools::reverseTranscript(const Sequence& sequence) throw (AlphabetException) 00169 { 00170 // Alphabet type checking 00171 if (sequence.getAlphabet()->getAlphabetType() != "RNA alphabet") 00172 { 00173 throw AlphabetException ("SequenceTools::reverseTranscript : Sequence must be RNA", sequence.getAlphabet()); 00174 } 00175 00176 return _transc.reverse(sequence); 00177 } 00178 00179 /******************************************************************************/ 00180 00181 Sequence& SequenceTools::invert(Sequence& seq) 00182 { 00183 size_t seq_size = seq.size(); // store seq size for efficiency 00184 unsigned int tmp_state = 0; // to store one state when swapping positions 00185 size_t j = seq_size; // symetric position iterator from sequence end 00186 for (size_t i = 0; i < seq_size / 2; i++) 00187 { 00188 j = seq_size - 1 - i; 00189 tmp_state = seq.getValue(i); 00190 seq.setElement(i, seq.getValue(j)); 00191 seq.setElement(j, tmp_state); 00192 } 00193 return seq; 00194 } 00195 00196 /******************************************************************************/ 00197 00198 Sequence* SequenceTools::getInvert(const Sequence& sequence) 00199 { 00200 Sequence* iSeq = sequence.clone(); 00201 invert(*iSeq); 00202 return iSeq; 00203 } 00204 00205 /******************************************************************************/ 00206 00207 Sequence& SequenceTools::invertComplement(Sequence& seq) 00208 { 00209 // Alphabet type checking 00210 NucleicAcidsReplication* NAR; 00211 if (seq.getAlphabet()->getAlphabetType() == "DNA alphabet") 00212 { 00213 NAR = &_DNARep; 00214 } 00215 else if (seq.getAlphabet()->getAlphabetType() == "RNA alphabet") 00216 { 00217 NAR = &_RNARep; 00218 } 00219 else 00220 { 00221 throw AlphabetException("SequenceTools::complement: Sequence must be nucleic.", seq.getAlphabet()); 00222 } 00223 // for (size_t i = 0 ; i < seq.size() ; i++) { 00224 // seq.setElement(i, NAR->translate(seq.getValue(i))); 00225 // } 00226 size_t seq_size = seq.size(); // store seq size for efficiency 00227 int tmp_state = 0; // to store one state when swapping positions 00228 size_t j = seq_size; // symetric position iterator from sequence end 00229 for (size_t i = 0; i < seq_size / 2; i++) 00230 { 00231 j = seq_size - 1 - i; 00232 tmp_state = seq.getValue(i); 00233 seq.setElement(i, NAR->translate(seq.getValue(j))); 00234 seq.setElement(j, NAR->translate(tmp_state)); 00235 } 00236 if (seq_size % 2) // treate the state in the middle of odd sequences 00237 { 00238 seq.setElement(seq_size / 2, NAR->translate(seq.getValue(seq_size / 2))); 00239 } 00240 return seq; 00241 } 00242 00243 /******************************************************************************/ 00244 00245 double SequenceTools::getPercentIdentity(const Sequence& seq1, const Sequence& seq2, bool ignoreGaps) throw (AlphabetMismatchException, SequenceNotAlignedException) 00246 { 00247 if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType()) 00248 throw AlphabetMismatchException("SequenceTools::getPercentIdentity", seq1.getAlphabet(), seq2.getAlphabet()); 00249 if (seq1.size() != seq2.size()) 00250 throw SequenceNotAlignedException("SequenceTools::getPercentIdentity", &seq2); 00251 int gap = seq1.getAlphabet()->getGapCharacterCode(); 00252 size_t id = 0; 00253 size_t tot = 0; 00254 for (size_t i = 0; i < seq1.size(); i++) 00255 { 00256 int x = seq1.getValue(i); 00257 int y = seq2.getValue(i); 00258 if (ignoreGaps) 00259 { 00260 if (x != gap && y != gap) 00261 { 00262 tot++; 00263 if (x == y) 00264 id++; 00265 } 00266 } 00267 else 00268 { 00269 tot++; 00270 if (x == y) 00271 id++; 00272 } 00273 } 00274 return static_cast<double>(id) / static_cast<double>(tot) * 100.; 00275 } 00276 00277 /******************************************************************************/ 00278 00279 size_t SequenceTools::getNumberOfSites(const Sequence& seq) 00280 { 00281 size_t count = 0; 00282 const Alphabet* alpha = seq.getAlphabet(); 00283 for (size_t i = 0; i < seq.size(); i++) 00284 { 00285 if (!alpha->isGap(seq[i])) 00286 count++; 00287 } 00288 return count; 00289 } 00290 00291 /******************************************************************************/ 00292 00293 size_t SequenceTools::getNumberOfCompleteSites(const Sequence& seq) 00294 { 00295 size_t count = 0; 00296 const Alphabet* alpha = seq.getAlphabet(); 00297 for (size_t i = 0; i < seq.size(); i++) 00298 { 00299 if (!alpha->isGap(seq[i]) && !alpha->isUnresolved(seq[i])) 00300 count++; 00301 } 00302 return count; 00303 } 00304 00305 /******************************************************************************/ 00306 00307 size_t SequenceTools::getNumberOfUnresolvedSites(const Sequence& seq) 00308 { 00309 size_t count = 0; 00310 const Alphabet* alpha = seq.getAlphabet(); 00311 for (size_t i = 0; i < seq.size(); i++) 00312 { 00313 if (alpha->isUnresolved(seq[i])) 00314 count++; 00315 } 00316 return count; 00317 } 00318 00319 /******************************************************************************/ 00320 00321 Sequence* SequenceTools::getSequenceWithoutGaps(const Sequence& seq) 00322 { 00323 const Alphabet* alpha = seq.getAlphabet(); 00324 vector<int> content; 00325 for (size_t i = 0; i < seq.size(); i++) 00326 { 00327 if (!alpha->isGap(seq[i])) 00328 content.push_back(seq[i]); 00329 } 00330 Sequence* newSeq = dynamic_cast<Sequence*>(seq.clone()); 00331 newSeq->setContent(content); 00332 return newSeq; 00333 } 00334 00335 /******************************************************************************/ 00336 00337 void SequenceTools::removeGaps(Sequence& seq) 00338 { 00339 const Alphabet* alpha = seq.getAlphabet(); 00340 for (size_t i = seq.size(); i > 0; --i) 00341 { 00342 if (alpha->isGap(seq[i - 1])) 00343 seq.deleteElement(i - 1); 00344 } 00345 } 00346 00347 /******************************************************************************/ 00348 00349 Sequence* SequenceTools::getSequenceWithoutStops(const Sequence& seq) throw (Exception) 00350 { 00351 const CodonAlphabet* calpha = dynamic_cast<const CodonAlphabet*>(seq.getAlphabet()); 00352 if (!calpha) 00353 throw Exception("SequenceTools::getSequenceWithoutStops. Input sequence should have a codon alphabet."); 00354 vector<int> content; 00355 for (size_t i = 0; i < seq.size(); i++) 00356 { 00357 if (!calpha->isStop(seq[i])) 00358 content.push_back(seq[i]); 00359 } 00360 Sequence* newSeq = dynamic_cast<Sequence*>(seq.clone()); 00361 newSeq->setContent(content); 00362 return newSeq; 00363 } 00364 00365 /******************************************************************************/ 00366 00367 void SequenceTools::removeStops(Sequence& seq) throw (Exception) 00368 { 00369 const CodonAlphabet* calpha = dynamic_cast<const CodonAlphabet*>(seq.getAlphabet()); 00370 if (!calpha) 00371 throw Exception("SequenceTools::removeStops. Input sequence should have a codon alphabet."); 00372 for (size_t i = seq.size(); i > 0; --i) 00373 { 00374 if (calpha->isStop(seq[i - 1])) 00375 seq.deleteElement(i - 1); 00376 } 00377 } 00378 00379 /******************************************************************************/ 00380 00381 void SequenceTools::replaceStopsWithGaps(Sequence& seq) throw (Exception) 00382 { 00383 const CodonAlphabet* calpha = dynamic_cast<const CodonAlphabet*>(seq.getAlphabet()); 00384 if (!calpha) 00385 throw Exception("SequenceTools::replaceStopsWithGaps. Input sequence should have a codon alphabet."); 00386 int gap = calpha->getGapCharacterCode(); 00387 for (size_t i = 0; i < seq.size(); ++i) 00388 { 00389 if (calpha->isStop(seq[i])) 00390 seq.setElement(i, gap); 00391 } 00392 } 00393 00394 /******************************************************************************/ 00395 00396 BowkerTest* SequenceTools::bowkerTest(const Sequence& seq1, const Sequence& seq2) 00397 throw (SequenceNotAlignedException) 00398 { 00399 if (seq1.size() != seq2.size()) 00400 throw SequenceNotAlignedException("SequenceTools::bowkerTest.", &seq2); 00401 size_t n = seq1.size(); 00402 const Alphabet* alpha = seq1.getAlphabet(); 00403 unsigned int r = alpha->getSize(); 00404 00405 // Compute contingency table: 00406 RowMatrix<double> array(r, r); 00407 int x, y; 00408 for (size_t i = 0; i < n; i++) 00409 { 00410 x = seq1[i]; 00411 y = seq2[i]; 00412 if (!alpha->isGap(x) && !alpha->isUnresolved(x) 00413 && !alpha->isGap(y) && !alpha->isUnresolved(y)) 00414 { 00415 array(static_cast<size_t>(x), static_cast<size_t>(y))++; 00416 } 00417 } 00418 00419 // Compute Bowker's statistic: 00420 double sb2 = 0, nij, nji; 00421 for (unsigned int i = 1; i < r; ++i) 00422 { 00423 for (unsigned int j = 0; j < i; ++j) 00424 { 00425 nij = array(i, j); 00426 nji = array(j, i); 00427 if (nij != 0 || nji != 0) 00428 { 00429 sb2 += pow(nij - nji, 2) / (nij + nji); 00430 } 00431 // Else: we should display a warning there. 00432 } 00433 } 00434 00435 // Compute p-value: 00436 double pvalue = 1. - RandomTools::pChisq(sb2, (r - 1) * r / 2); 00437 00438 // Return results: 00439 BowkerTest* test = new BowkerTest(); 00440 test->setStatistic(sb2); 00441 test->setPValue(pvalue); 00442 return test; 00443 } 00444 00445 /******************************************************************************/ 00446 00447 void SequenceTools::getPutativeHaplotypes(const Sequence& seq, std::vector<Sequence*>& hap, unsigned int level) 00448 { 00449 vector< vector< int > > states(seq.size()); 00450 list<Sequence*> t_hap; 00451 const Alphabet* alpha = seq.getAlphabet(); 00452 unsigned int hap_count = 1; 00453 // Vector of available states at each position 00454 for (size_t i = 0; i < seq.size(); i++) 00455 { 00456 vector<int> st = alpha->getAlias(seq[i]); 00457 if (!st.size()) 00458 { 00459 st.push_back(alpha->getGapCharacterCode()); 00460 } 00461 if (st.size() <= level) 00462 { 00463 states[i] = st; 00464 } 00465 else 00466 { 00467 states[i] = vector<int>(1, seq[i]); 00468 } 00469 } 00470 // Combinatorial haplotypes building (the use of tree may be more accurate) 00471 t_hap.push_back(new BasicSequence(seq.getName() + "_hap" + TextTools::toString(hap_count++), "", alpha)); 00472 for (size_t i = 0; i < states.size(); i++) 00473 { 00474 for (list<Sequence*>::iterator it = t_hap.begin(); it != t_hap.end(); it++) 00475 { 00476 for (unsigned int j = 0; j < states[i].size(); j++) 00477 { 00478 Sequence* tmp_seq = new BasicSequence(seq.getName() + "_hap", (**it).getContent(), alpha); 00479 if (j < states[i].size() - 1) 00480 { 00481 tmp_seq->setName(tmp_seq->getName() + TextTools::toString(hap_count++)); 00482 tmp_seq->addElement(states[i][j]); 00483 t_hap.insert(it, tmp_seq); 00484 } 00485 else 00486 { 00487 (**it).addElement(states[i][j]); 00488 } 00489 } 00490 } 00491 } 00492 for (list<Sequence*>::reverse_iterator it = t_hap.rbegin(); it != t_hap.rend(); it++) 00493 { 00494 hap.push_back(*it); 00495 } 00496 } 00497 00498 /******************************************************************************/ 00499 00500 Sequence* SequenceTools::combineSequences(const Sequence& s1, const Sequence& s2) throw (AlphabetMismatchException) 00501 { 00502 if (s1.getAlphabet()->getAlphabetType() != s2.getAlphabet()->getAlphabetType()) 00503 { 00504 throw AlphabetMismatchException("SequenceTools::combineSequences(const Sequence& s1, const Sequence& s2): s1 and s2 don't have same Alphabet.", s1.getAlphabet(), s2.getAlphabet()); 00505 } 00506 const Alphabet* alpha = s1.getAlphabet(); 00507 vector<int> st; 00508 vector<int> seq; 00509 size_t length = NumTools::max(s1.size(), s2.size()); 00510 for (size_t i = 0; i < length; i++) 00511 { 00512 if (i < s1.size()) 00513 st.push_back(s1.getValue(i)); 00514 if (i < s2.size()) 00515 st.push_back(s2.getValue(i)); 00516 seq.push_back(alpha->getGeneric(st)); 00517 st.clear(); 00518 } 00519 Sequence* s = new BasicSequence(s1.getName() + "+" + s2.getName(), seq, alpha); 00520 return s; 00521 } 00522 00523 /******************************************************************************/ 00524 00525 Sequence* SequenceTools::subtractHaplotype(const Sequence& s, const Sequence& h, string name, unsigned int level) throw (SequenceNotAlignedException) 00526 { 00527 const Alphabet* alpha = s.getAlphabet(); 00528 if (name.size() == 0) 00529 name = s.getName() + "_haplotype"; 00530 string seq; 00531 if (s.size() != h.size()) 00532 throw SequenceNotAlignedException("SequenceTools::subtractHaplotype: haplotype must be aligned with the sequence.", &h); 00533 for (unsigned int i = 0; i < s.size(); ++i) 00534 { 00535 string c; 00536 vector<int> nucs = alpha->getAlias(s.getValue(i)); 00537 if (nucs.size() > 1) 00538 { 00539 remove(nucs.begin(), nucs.end(), h.getValue(i)); 00540 nucs = vector<int>(nucs.begin(), nucs.end() - 1); 00541 } 00542 else 00543 { 00544 nucs = vector<int>(nucs.begin(), nucs.end()); 00545 } 00546 c = alpha->intToChar(alpha->getGeneric(nucs)); 00547 if (level <= nucs.size() && (alpha->isUnresolved(s.getValue(i)) || alpha->isUnresolved(h.getValue(i)))) 00548 { 00549 c = alpha->intToChar(alpha->getUnknownCharacterCode()); 00550 } 00551 seq += c; 00552 } 00553 Sequence* hap = new BasicSequence(name, seq, alpha); 00554 return hap; 00555 } 00556 00557 /******************************************************************************/ 00558 00559 Sequence* SequenceTools::RNYslice(const Sequence& seq, int ph) throw (AlphabetException) 00560 { 00561 // Alphabet type checking 00562 if (seq.getAlphabet()->getAlphabetType() != "DNA alphabet") 00563 { 00564 throw AlphabetException ("SequenceTools::transcript : Sequence must be DNA", seq.getAlphabet()); 00565 } 00566 00567 if ((ph < 1) || (ph > 3)) 00568 throw Exception("Bad phase for RNYSlice: " + TextTools::toString(ph) + ". Should be between 1 and 3."); 00569 00570 size_t s = seq.size(); 00571 size_t n = (s - ph + 3) / 3; 00572 00573 vector<int> content(n); 00574 00575 int tir = seq.getAlphabet()->getGapCharacterCode(); 00576 size_t j; 00577 00578 for (size_t i = 0; i < n; i++) 00579 { 00580 j = i * 3 + ph - 1; 00581 00582 if (j == 0) 00583 content[i] = _RNY.getRNY(tir, seq[0], seq[1], *seq.getAlphabet()); 00584 else 00585 { 00586 if (j == s - 1) 00587 content[i] = _RNY.getRNY(seq[j - 1], seq[j], tir, *seq.getAlphabet()); 00588 else 00589 content[i] = _RNY.getRNY(seq[j - 1], seq[j], seq[j + 1], *seq.getAlphabet()); 00590 } 00591 } 00592 00593 // New sequence creating, and sense reversing 00594 Sequence* sq = new BasicSequence(seq.getName(), content, 00595 seq.getComments(), &_RNY); 00596 00597 // Send result 00598 return sq; 00599 } 00600 00601 Sequence* SequenceTools::RNYslice(const Sequence& seq) throw (AlphabetException) 00602 { 00603 // Alphabet type checking 00604 if (seq.getAlphabet()->getAlphabetType() != "DNA alphabet") 00605 { 00606 throw AlphabetException ("SequenceTools::transcript : Sequence must be DNA", seq.getAlphabet()); 00607 } 00608 00609 size_t n = seq.size(); 00610 00611 vector<int> content(n); 00612 00613 int tir = seq.getAlphabet()->getGapCharacterCode(); 00614 00615 if (seq.size() >= 2) 00616 { 00617 content[0] = _RNY.getRNY(tir, seq[0], seq[1], *seq.getAlphabet()); 00618 00619 for (unsigned int i = 1; i < n - 1; i++) 00620 { 00621 content[i] = _RNY.getRNY(seq[i - 1], seq[i], seq[i + 1], 00622 *seq.getAlphabet()); 00623 } 00624 00625 content[n - 1] = _RNY.getRNY(seq[n - 2], seq[n - 1], tir, *seq.getAlphabet()); 00626 } 00627 00628 // New sequence creating, and sense reversing 00629 Sequence* s = new BasicSequence(seq.getName(), content, 00630 seq.getComments(), &_RNY); 00631 00632 // Send result 00633 return s; 00634 } 00635 00636 /******************************************************************************/ 00637 00638 00639 void SequenceTools::getCDS(Sequence& sequence, bool checkInit, bool checkStop, bool includeInit, bool includeStop) 00640 { 00641 const CodonAlphabet* alphabet = dynamic_cast<const CodonAlphabet*>(sequence.getAlphabet()); 00642 if (!alphabet) 00643 throw AlphabetException("SequenceTools::getCDS. Sequence is not a codon sequence."); 00644 if (checkInit) 00645 { 00646 unsigned int i; 00647 for (i = 0; i < sequence.size() && !alphabet->isInit(sequence[i]); ++i) 00648 {} 00649 for (unsigned int j = 0; includeInit ? j < i : j <= i; ++j) 00650 { 00651 sequence.deleteElement(j); 00652 } 00653 } 00654 if (checkStop) 00655 { 00656 unsigned int i; 00657 for (i = 0; i < sequence.size() && !alphabet->isStop(sequence[i]); ++i) 00658 {} 00659 for (unsigned int j = includeStop ? i + 1 : i; j < sequence.size(); ++j) 00660 { 00661 sequence.deleteElement(j); 00662 } 00663 } 00664 } 00665 00666 /******************************************************************************/ 00667 00668 size_t SequenceTools::findFirstOf(const Sequence& seq, const Sequence& motif, bool strict) 00669 { 00670 if (motif.size() > seq.size()) 00671 return seq.size(); 00672 for (size_t seqi = 0; seqi < seq.size() - motif.size() + 1; seqi++) 00673 { 00674 bool match = false; 00675 for (size_t moti = 0; moti < motif.size(); moti++) 00676 { 00677 if (strict) 00678 { 00679 match = seq.getValue(seqi + moti) == motif.getValue(moti); 00680 } 00681 else 00682 { 00683 match = AlphabetTools::match(seq.getAlphabet(), seq.getValue(seqi + moti), motif.getValue(moti)); 00684 } 00685 if (!match) 00686 { 00687 break; 00688 } 00689 } 00690 if (match) 00691 { 00692 return seqi; 00693 } 00694 } 00695 return seq.size(); 00696 } 00697 00698 /******************************************************************************/ 00699