bpp-seq  2.1.0
Bpp/Seq/SequenceTools.cpp
Go to the documentation of this file.
00001 //
00002 // File: SequenceTools.cpp
00003 // Authors: Guillaume Deuchst
00004 //          Julien Dutheil
00005 //          Sylvain Gaillard
00006 // Created on: Tue Aug 21 2003
00007 //
00008 
00009 /*
00010    Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
00011 
00012    This software is a computer program whose purpose is to provide classes
00013    for sequences analysis.
00014 
00015    This software is governed by the CeCILL  license under French law and
00016    abiding by the rules of distribution of free software.  You can  use,
00017    modify and/ or redistribute the software under the terms of the CeCILL
00018    license as circulated by CEA, CNRS and INRIA at the following URL
00019    "http://www.cecill.info".
00020 
00021    As a counterpart to the access to the source code and  rights to copy,
00022    modify and redistribute granted by the license, users are provided only
00023    with a limited warranty  and the software's author,  the holder of the
00024    economic rights,  and the successive licensors  have only  limited
00025    liability.
00026 
00027    In this respect, the user's attention is drawn to the risks associated
00028    with loading,  using,  modifying and/or developing or reproducing the
00029    software by the user in light of its specific status of free software,
00030    that may mean  that it is complicated to manipulate,  and  that  also
00031    therefore means  that it is reserved for developers  and  experienced
00032    professionals having in-depth computer knowledge. Users are therefore
00033    encouraged to load and test the software's suitability as regards their
00034    requirements in conditions enabling the security of their systems and/or
00035    data to be ensured and,  more generally, to use and operate it in the
00036    same conditions as regards security.
00037 
00038    The fact that you are presently reading this means that you have had
00039    knowledge of the CeCILL license and that you accept its terms.
00040  */
00041 
00042 #include "SequenceTools.h"
00043 
00044 #include "Alphabet/AlphabetTools.h"
00045 #include "StringSequenceTools.h"
00046 #include <Bpp/Numeric/Matrix/Matrix.h>
00047 #include <Bpp/Numeric/VectorTools.h>
00048 
00049 using namespace bpp;
00050 
00051 // From the STL:
00052 #include <ctype.h>
00053 #include <cmath>
00054 #include <list>
00055 #include <iostream>
00056 
00057 using namespace std;
00058 
00059 DNA SequenceTools::_DNA;
00060 RNA SequenceTools::_RNA;
00061 RNY SequenceTools::_RNY(_DNA);
00062 NucleicAcidsReplication SequenceTools::_DNARep(&_DNA, &_DNA);
00063 NucleicAcidsReplication SequenceTools::_RNARep(&_RNA, &_RNA);
00064 NucleicAcidsReplication SequenceTools::_transc(&_DNA, &_RNA);
00065 
00066 /******************************************************************************/
00067 
00068 Sequence* SequenceTools::subseq(const Sequence& sequence, size_t begin, size_t end) throw (IndexOutOfBoundsException, Exception)
00069 {
00070   // Checking interval
00071   if (end >= sequence.size())
00072     throw IndexOutOfBoundsException ("SequenceTools::subseq : Invalid upper bound", end, 0, sequence.size());
00073   if (end < begin)
00074     throw Exception ("SequenceTools::subseq : Invalid interval");
00075 
00076   // Copy sequence
00077   vector<int> temp(sequence.getContent());
00078 
00079   // Truncate sequence
00080   temp.erase(temp.begin() + end + 1, temp.end());
00081   temp.erase(temp.begin(), temp.begin() + begin);
00082 
00083   // New sequence creation
00084   return new BasicSequence(sequence.getName(), temp, sequence.getComments(), sequence.getAlphabet());
00085 }
00086 
00087 /******************************************************************************/
00088 
00089 Sequence* SequenceTools::concatenate(const Sequence& seq1, const Sequence& seq2) throw (AlphabetMismatchException, Exception)
00090 {
00091   // Sequence's alphabets matching verification
00092   if ((seq1.getAlphabet()->getAlphabetType()) != (seq2.getAlphabet()->getAlphabetType()))
00093     throw AlphabetMismatchException("SequenceTools::concatenate : Sequence's alphabets don't match ", seq1.getAlphabet(), seq2.getAlphabet());
00094 
00095   // Sequence's names matching verification
00096   if (seq1.getName() != seq2.getName())
00097     throw Exception ("SequenceTools::concatenate : Sequence's names don't match");
00098 
00099   // Concatenate sequences and send result
00100   vector<int> sequence  = seq1.getContent();
00101   vector<int> sequence2 = seq2.getContent();
00102   sequence.insert(sequence.end(), sequence2.begin(), sequence2.end());
00103   return new BasicSequence(seq1.getName(), sequence, seq1.getComments(), seq1.getAlphabet());
00104 }
00105 
00106 /******************************************************************************/
00107 
00108 Sequence& SequenceTools::complement(Sequence& seq) throw (AlphabetException)
00109 {
00110   // Alphabet type checking
00111   NucleicAcidsReplication* NAR;
00112   if (seq.getAlphabet()->getAlphabetType() == "DNA alphabet")
00113   {
00114     NAR = &_DNARep;
00115   }
00116   else if (seq.getAlphabet()->getAlphabetType() == "RNA alphabet")
00117   {
00118     NAR = &_RNARep;
00119   }
00120   else
00121   {
00122     throw AlphabetException("SequenceTools::complement: Sequence must be nucleic.", seq.getAlphabet());
00123   }
00124   for (size_t i = 0; i < seq.size(); i++)
00125   {
00126     seq.setElement(i, NAR->translate(seq.getValue(i)));
00127   }
00128   return seq;
00129 }
00130 
00131 /******************************************************************************/
00132 
00133 Sequence* SequenceTools::getComplement(const Sequence& sequence) throw (AlphabetException)
00134 {
00135   // Alphabet type checking
00136   NucleicAcidsReplication* NAR;
00137   if (sequence.getAlphabet()->getAlphabetType() == "DNA alphabet")
00138   {
00139     NAR = &_DNARep;
00140   }
00141   else if (sequence.getAlphabet()->getAlphabetType() == "RNA alphabet")
00142   {
00143     NAR = &_RNARep;
00144   }
00145   else
00146   {
00147     throw AlphabetException ("SequenceTools::getComplement: Sequence must be nucleic.", sequence.getAlphabet());
00148   }
00149 
00150   return NAR->translate(sequence);
00151 }
00152 
00153 /******************************************************************************/
00154 
00155 Sequence* SequenceTools::transcript(const Sequence& sequence) throw (AlphabetException)
00156 {
00157   // Alphabet type checking
00158   if (sequence.getAlphabet()->getAlphabetType() != "DNA alphabet")
00159   {
00160     throw AlphabetException ("SequenceTools::transcript : Sequence must be DNA", sequence.getAlphabet());
00161   }
00162 
00163   return _transc.translate(sequence);
00164 }
00165 
00166 /******************************************************************************/
00167 
00168 Sequence* SequenceTools::reverseTranscript(const Sequence& sequence) throw (AlphabetException)
00169 {
00170   // Alphabet type checking
00171   if (sequence.getAlphabet()->getAlphabetType() != "RNA alphabet")
00172   {
00173     throw AlphabetException ("SequenceTools::reverseTranscript : Sequence must be RNA", sequence.getAlphabet());
00174   }
00175 
00176   return _transc.reverse(sequence);
00177 }
00178 
00179 /******************************************************************************/
00180 
00181 Sequence& SequenceTools::invert(Sequence& seq)
00182 {
00183   size_t seq_size = seq.size(); // store seq size for efficiency
00184   unsigned int tmp_state = 0; // to store one state when swapping positions
00185   size_t j = seq_size; // symetric position iterator from sequence end
00186   for (size_t i = 0; i < seq_size / 2; i++)
00187   {
00188     j = seq_size - 1 - i;
00189     tmp_state = seq.getValue(i);
00190     seq.setElement(i, seq.getValue(j));
00191     seq.setElement(j, tmp_state);
00192   }
00193   return seq;
00194 }
00195 
00196 /******************************************************************************/
00197 
00198 Sequence* SequenceTools::getInvert(const Sequence& sequence)
00199 {
00200   Sequence* iSeq = sequence.clone();
00201   invert(*iSeq);
00202   return iSeq;
00203 }
00204 
00205 /******************************************************************************/
00206 
00207 Sequence& SequenceTools::invertComplement(Sequence& seq)
00208 {
00209   // Alphabet type checking
00210   NucleicAcidsReplication* NAR;
00211   if (seq.getAlphabet()->getAlphabetType() == "DNA alphabet")
00212   {
00213     NAR = &_DNARep;
00214   }
00215   else if (seq.getAlphabet()->getAlphabetType() == "RNA alphabet")
00216   {
00217     NAR = &_RNARep;
00218   }
00219   else
00220   {
00221     throw AlphabetException("SequenceTools::complement: Sequence must be nucleic.", seq.getAlphabet());
00222   }
00223   // for (size_t i = 0 ; i < seq.size() ; i++) {
00224   //  seq.setElement(i, NAR->translate(seq.getValue(i)));
00225   // }
00226   size_t seq_size = seq.size(); // store seq size for efficiency
00227   int tmp_state = 0; // to store one state when swapping positions
00228   size_t j = seq_size; // symetric position iterator from sequence end
00229   for (size_t i = 0; i < seq_size / 2; i++)
00230   {
00231     j = seq_size - 1 - i;
00232     tmp_state = seq.getValue(i);
00233     seq.setElement(i, NAR->translate(seq.getValue(j)));
00234     seq.setElement(j, NAR->translate(tmp_state));
00235   }
00236   if (seq_size % 2)   // treate the state in the middle of odd sequences
00237   {
00238     seq.setElement(seq_size / 2, NAR->translate(seq.getValue(seq_size / 2)));
00239   }
00240   return seq;
00241 }
00242 
00243 /******************************************************************************/
00244 
00245 double SequenceTools::getPercentIdentity(const Sequence& seq1, const Sequence& seq2, bool ignoreGaps) throw (AlphabetMismatchException, SequenceNotAlignedException)
00246 {
00247   if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
00248     throw AlphabetMismatchException("SequenceTools::getPercentIdentity", seq1.getAlphabet(), seq2.getAlphabet());
00249   if (seq1.size() != seq2.size())
00250     throw SequenceNotAlignedException("SequenceTools::getPercentIdentity", &seq2);
00251   int gap = seq1.getAlphabet()->getGapCharacterCode();
00252   size_t id = 0;
00253   size_t tot = 0;
00254   for (size_t i = 0; i < seq1.size(); i++)
00255   {
00256     int x = seq1.getValue(i);
00257     int y = seq2.getValue(i);
00258     if (ignoreGaps)
00259     {
00260       if (x != gap && y != gap)
00261       {
00262         tot++;
00263         if (x == y)
00264           id++;
00265       }
00266     }
00267     else
00268     {
00269       tot++;
00270       if (x == y)
00271         id++;
00272     }
00273   }
00274   return static_cast<double>(id) / static_cast<double>(tot) * 100.;
00275 }
00276 
00277 /******************************************************************************/
00278 
00279 size_t SequenceTools::getNumberOfSites(const Sequence& seq)
00280 {
00281   size_t count = 0;
00282   const Alphabet* alpha = seq.getAlphabet();
00283   for (size_t i = 0; i < seq.size(); i++)
00284   {
00285     if (!alpha->isGap(seq[i]))
00286       count++;
00287   }
00288   return count;
00289 }
00290 
00291 /******************************************************************************/
00292 
00293 size_t SequenceTools::getNumberOfCompleteSites(const Sequence& seq)
00294 {
00295   size_t count = 0;
00296   const Alphabet* alpha = seq.getAlphabet();
00297   for (size_t i = 0; i < seq.size(); i++)
00298   {
00299     if (!alpha->isGap(seq[i]) && !alpha->isUnresolved(seq[i]))
00300       count++;
00301   }
00302   return count;
00303 }
00304 
00305 /******************************************************************************/
00306 
00307 size_t SequenceTools::getNumberOfUnresolvedSites(const Sequence& seq)
00308 {
00309   size_t count = 0;
00310   const Alphabet* alpha = seq.getAlphabet();
00311   for (size_t i = 0; i < seq.size(); i++)
00312   {
00313     if (alpha->isUnresolved(seq[i]))
00314       count++;
00315   }
00316   return count;
00317 }
00318 
00319 /******************************************************************************/
00320 
00321 Sequence* SequenceTools::getSequenceWithoutGaps(const Sequence& seq)
00322 {
00323   const Alphabet* alpha = seq.getAlphabet();
00324   vector<int> content;
00325   for (size_t i = 0; i < seq.size(); i++)
00326   {
00327     if (!alpha->isGap(seq[i]))
00328       content.push_back(seq[i]);
00329   }
00330   Sequence* newSeq = dynamic_cast<Sequence*>(seq.clone());
00331   newSeq->setContent(content);
00332   return newSeq;
00333 }
00334 
00335 /******************************************************************************/
00336 
00337 void SequenceTools::removeGaps(Sequence& seq)
00338 {
00339   const Alphabet* alpha = seq.getAlphabet();
00340   for (size_t i = seq.size(); i > 0; --i)
00341   {
00342     if (alpha->isGap(seq[i - 1]))
00343       seq.deleteElement(i - 1);
00344   }
00345 }
00346 
00347 /******************************************************************************/
00348 
00349 Sequence* SequenceTools::getSequenceWithoutStops(const Sequence& seq) throw (Exception)
00350 {
00351   const CodonAlphabet* calpha = dynamic_cast<const CodonAlphabet*>(seq.getAlphabet());
00352   if (!calpha)
00353     throw Exception("SequenceTools::getSequenceWithoutStops. Input sequence should have a codon alphabet.");
00354   vector<int> content;
00355   for (size_t i = 0; i < seq.size(); i++)
00356   {
00357     if (!calpha->isStop(seq[i]))
00358       content.push_back(seq[i]);
00359   }
00360   Sequence* newSeq = dynamic_cast<Sequence*>(seq.clone());
00361   newSeq->setContent(content);
00362   return newSeq;
00363 }
00364 
00365 /******************************************************************************/
00366 
00367 void SequenceTools::removeStops(Sequence& seq) throw (Exception)
00368 {
00369   const CodonAlphabet* calpha = dynamic_cast<const CodonAlphabet*>(seq.getAlphabet());
00370   if (!calpha)
00371     throw Exception("SequenceTools::removeStops. Input sequence should have a codon alphabet.");
00372   for (size_t i = seq.size(); i > 0; --i)
00373   {
00374     if (calpha->isStop(seq[i - 1]))
00375       seq.deleteElement(i - 1);
00376   }
00377 }
00378 
00379 /******************************************************************************/
00380 
00381 void SequenceTools::replaceStopsWithGaps(Sequence& seq) throw (Exception)
00382 {
00383   const CodonAlphabet* calpha = dynamic_cast<const CodonAlphabet*>(seq.getAlphabet());
00384   if (!calpha)
00385     throw Exception("SequenceTools::replaceStopsWithGaps. Input sequence should have a codon alphabet.");
00386   int gap = calpha->getGapCharacterCode();
00387   for (size_t i = 0; i < seq.size(); ++i)
00388   {
00389     if (calpha->isStop(seq[i]))
00390       seq.setElement(i, gap);
00391   }
00392 }
00393 
00394 /******************************************************************************/
00395 
00396 BowkerTest* SequenceTools::bowkerTest(const Sequence& seq1, const Sequence& seq2)
00397 throw (SequenceNotAlignedException)
00398 {
00399   if (seq1.size() != seq2.size())
00400     throw SequenceNotAlignedException("SequenceTools::bowkerTest.", &seq2);
00401   size_t n = seq1.size();
00402   const Alphabet* alpha = seq1.getAlphabet();
00403   unsigned int r = alpha->getSize();
00404 
00405   // Compute contingency table:
00406   RowMatrix<double> array(r, r);
00407   int x, y;
00408   for (size_t i = 0; i < n; i++)
00409   {
00410     x = seq1[i];
00411     y = seq2[i];
00412     if (!alpha->isGap(x) && !alpha->isUnresolved(x)
00413         && !alpha->isGap(y) && !alpha->isUnresolved(y))
00414     {
00415       array(static_cast<size_t>(x), static_cast<size_t>(y))++;
00416     }
00417   }
00418 
00419   // Compute Bowker's statistic:
00420   double sb2 = 0, nij, nji;
00421   for (unsigned int i = 1; i < r; ++i)
00422   {
00423     for (unsigned int j = 0; j < i; ++j)
00424     {
00425       nij = array(i, j);
00426       nji = array(j, i);
00427       if (nij != 0 || nji != 0)
00428       {
00429         sb2 += pow(nij - nji, 2) / (nij + nji);
00430       }
00431       // Else: we should display a warning there.
00432     }
00433   }
00434 
00435   // Compute p-value:
00436   double pvalue = 1. - RandomTools::pChisq(sb2, (r - 1) * r / 2);
00437 
00438   // Return results:
00439   BowkerTest* test = new BowkerTest();
00440   test->setStatistic(sb2);
00441   test->setPValue(pvalue);
00442   return test;
00443 }
00444 
00445 /******************************************************************************/
00446 
00447 void SequenceTools::getPutativeHaplotypes(const Sequence& seq, std::vector<Sequence*>& hap, unsigned int level)
00448 {
00449   vector< vector< int > > states(seq.size());
00450   list<Sequence*> t_hap;
00451   const Alphabet* alpha = seq.getAlphabet();
00452   unsigned int hap_count = 1;
00453   // Vector of available states at each position
00454   for (size_t i = 0; i < seq.size(); i++)
00455   {
00456     vector<int> st = alpha->getAlias(seq[i]);
00457     if (!st.size())
00458     {
00459       st.push_back(alpha->getGapCharacterCode());
00460     }
00461     if (st.size() <= level)
00462     {
00463       states[i] = st;
00464     }
00465     else
00466     {
00467       states[i] = vector<int>(1, seq[i]);
00468     }
00469   }
00470   // Combinatorial haplotypes building (the use of tree may be more accurate)
00471   t_hap.push_back(new BasicSequence(seq.getName() + "_hap" + TextTools::toString(hap_count++), "", alpha));
00472   for (size_t i = 0; i < states.size(); i++)
00473   {
00474     for (list<Sequence*>::iterator it = t_hap.begin(); it != t_hap.end(); it++)
00475     {
00476       for (unsigned int j = 0; j < states[i].size(); j++)
00477       {
00478         Sequence* tmp_seq = new BasicSequence(seq.getName() + "_hap", (**it).getContent(), alpha);
00479         if (j < states[i].size() - 1)
00480         {
00481           tmp_seq->setName(tmp_seq->getName() + TextTools::toString(hap_count++));
00482           tmp_seq->addElement(states[i][j]);
00483           t_hap.insert(it, tmp_seq);
00484         }
00485         else
00486         {
00487           (**it).addElement(states[i][j]);
00488         }
00489       }
00490     }
00491   }
00492   for (list<Sequence*>::reverse_iterator it = t_hap.rbegin(); it != t_hap.rend(); it++)
00493   {
00494     hap.push_back(*it);
00495   }
00496 }
00497 
00498 /******************************************************************************/
00499 
00500 Sequence* SequenceTools::combineSequences(const Sequence& s1, const Sequence& s2) throw (AlphabetMismatchException)
00501 {
00502   if (s1.getAlphabet()->getAlphabetType() != s2.getAlphabet()->getAlphabetType())
00503   {
00504     throw AlphabetMismatchException("SequenceTools::combineSequences(const Sequence& s1, const Sequence& s2): s1 and s2 don't have same Alphabet.", s1.getAlphabet(), s2.getAlphabet());
00505   }
00506   const Alphabet* alpha = s1.getAlphabet();
00507   vector<int> st;
00508   vector<int> seq;
00509   size_t length = NumTools::max(s1.size(), s2.size());
00510   for (size_t i = 0; i < length; i++)
00511   {
00512     if (i < s1.size())
00513       st.push_back(s1.getValue(i));
00514     if (i < s2.size())
00515       st.push_back(s2.getValue(i));
00516     seq.push_back(alpha->getGeneric(st));
00517     st.clear();
00518   }
00519   Sequence* s = new BasicSequence(s1.getName() + "+" + s2.getName(), seq, alpha);
00520   return s;
00521 }
00522 
00523 /******************************************************************************/
00524 
00525 Sequence* SequenceTools::subtractHaplotype(const Sequence& s, const Sequence& h, string name, unsigned int level) throw (SequenceNotAlignedException)
00526 {
00527   const Alphabet* alpha = s.getAlphabet();
00528   if (name.size() == 0)
00529     name = s.getName() + "_haplotype";
00530   string seq;
00531   if (s.size() != h.size())
00532     throw SequenceNotAlignedException("SequenceTools::subtractHaplotype: haplotype must be aligned with the sequence.", &h);
00533   for (unsigned int i = 0; i < s.size(); ++i)
00534   {
00535     string c;
00536     vector<int> nucs = alpha->getAlias(s.getValue(i));
00537     if (nucs.size() > 1)
00538     {
00539       remove(nucs.begin(), nucs.end(), h.getValue(i));
00540       nucs = vector<int>(nucs.begin(), nucs.end() - 1);
00541     }
00542     else
00543     {
00544       nucs = vector<int>(nucs.begin(), nucs.end());
00545     }
00546     c = alpha->intToChar(alpha->getGeneric(nucs));
00547     if (level <= nucs.size() && (alpha->isUnresolved(s.getValue(i)) || alpha->isUnresolved(h.getValue(i))))
00548     {
00549       c = alpha->intToChar(alpha->getUnknownCharacterCode());
00550     }
00551     seq += c;
00552   }
00553   Sequence* hap = new BasicSequence(name, seq, alpha);
00554   return hap;
00555 }
00556 
00557 /******************************************************************************/
00558 
00559 Sequence* SequenceTools::RNYslice(const Sequence& seq, int ph) throw (AlphabetException)
00560 {
00561   // Alphabet type checking
00562   if (seq.getAlphabet()->getAlphabetType() != "DNA alphabet")
00563   {
00564     throw AlphabetException ("SequenceTools::transcript : Sequence must be DNA", seq.getAlphabet());
00565   }
00566 
00567   if ((ph < 1) || (ph > 3))
00568     throw Exception("Bad phase for RNYSlice: " + TextTools::toString(ph) + ". Should be between 1 and 3.");
00569 
00570   size_t s = seq.size();
00571   size_t n = (s - ph + 3) / 3;
00572 
00573   vector<int> content(n);
00574 
00575   int tir = seq.getAlphabet()->getGapCharacterCode();
00576   size_t j;
00577 
00578   for (size_t i = 0; i < n; i++)
00579   {
00580     j = i * 3 + ph - 1;
00581 
00582     if (j == 0)
00583       content[i] = _RNY.getRNY(tir, seq[0], seq[1], *seq.getAlphabet());
00584     else
00585     {
00586       if (j == s - 1)
00587         content[i] = _RNY.getRNY(seq[j - 1], seq[j], tir, *seq.getAlphabet());
00588       else
00589         content[i] = _RNY.getRNY(seq[j - 1], seq[j], seq[j + 1], *seq.getAlphabet());
00590     }
00591   }
00592 
00593   // New sequence creating, and sense reversing
00594   Sequence* sq = new BasicSequence(seq.getName(), content,
00595                                    seq.getComments(), &_RNY);
00596 
00597   // Send result
00598   return sq;
00599 }
00600 
00601 Sequence* SequenceTools::RNYslice(const Sequence& seq) throw (AlphabetException)
00602 {
00603   // Alphabet type checking
00604   if (seq.getAlphabet()->getAlphabetType() != "DNA alphabet")
00605   {
00606     throw AlphabetException ("SequenceTools::transcript : Sequence must be DNA", seq.getAlphabet());
00607   }
00608 
00609   size_t n = seq.size();
00610 
00611   vector<int> content(n);
00612 
00613   int tir = seq.getAlphabet()->getGapCharacterCode();
00614 
00615   if (seq.size() >= 2)
00616   {
00617     content[0] = _RNY.getRNY(tir, seq[0], seq[1], *seq.getAlphabet());
00618 
00619     for (unsigned int i = 1; i < n - 1; i++)
00620     {
00621       content[i] = _RNY.getRNY(seq[i - 1], seq[i], seq[i + 1],
00622                                *seq.getAlphabet());
00623     }
00624 
00625     content[n - 1] = _RNY.getRNY(seq[n - 2], seq[n - 1], tir, *seq.getAlphabet());
00626   }
00627 
00628   // New sequence creating, and sense reversing
00629   Sequence* s = new BasicSequence(seq.getName(), content,
00630                                   seq.getComments(), &_RNY);
00631 
00632   // Send result
00633   return s;
00634 }
00635 
00636 /******************************************************************************/
00637 
00638 
00639 void SequenceTools::getCDS(Sequence& sequence, bool checkInit, bool checkStop, bool includeInit, bool includeStop)
00640 {
00641   const CodonAlphabet* alphabet = dynamic_cast<const CodonAlphabet*>(sequence.getAlphabet());
00642   if (!alphabet)
00643     throw AlphabetException("SequenceTools::getCDS. Sequence is not a codon sequence.");
00644   if (checkInit)
00645   {
00646     unsigned int i;
00647     for (i = 0; i < sequence.size() && !alphabet->isInit(sequence[i]); ++i)
00648     {}
00649     for (unsigned int j = 0; includeInit ? j < i : j <= i; ++j)
00650     {
00651       sequence.deleteElement(j);
00652     }
00653   }
00654   if (checkStop)
00655   {
00656     unsigned int i;
00657     for (i = 0; i < sequence.size() && !alphabet->isStop(sequence[i]); ++i)
00658     {}
00659     for (unsigned int j = includeStop ? i + 1 : i; j < sequence.size(); ++j)
00660     {
00661       sequence.deleteElement(j);
00662     }
00663   }
00664 }
00665 
00666 /******************************************************************************/
00667 
00668 size_t SequenceTools::findFirstOf(const Sequence& seq, const Sequence& motif, bool strict)
00669 {
00670   if (motif.size() > seq.size())
00671     return seq.size();
00672   for (size_t seqi = 0; seqi < seq.size() - motif.size() + 1; seqi++)
00673   {
00674     bool match = false;
00675     for (size_t moti = 0; moti < motif.size(); moti++)
00676     {
00677       if (strict)
00678       {
00679         match = seq.getValue(seqi + moti) == motif.getValue(moti);
00680       }
00681       else
00682       {
00683         match = AlphabetTools::match(seq.getAlphabet(), seq.getValue(seqi + moti), motif.getValue(moti));
00684       }
00685       if (!match)
00686       {
00687         break;
00688       }
00689     }
00690     if (match)
00691     {
00692       return seqi;
00693     }
00694   }
00695   return seq.size();
00696 }
00697 
00698 /******************************************************************************/
00699 
 All Classes Namespaces Files Functions Variables Typedefs Friends