bpp-seq  2.1.0
Bpp/Seq/Container/SiteContainerTools.cpp
Go to the documentation of this file.
00001 //
00002 // File: SiteContainerTools.cpp
00003 // Created by: Julien Dutheil
00004 //             Sylvain Glémin
00005 // Created on: Fri Dec 12 18:55:06 2003
00006 //
00007 
00008 /*
00009    Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
00010 
00011    This software is a computer program whose purpose is to provide classes
00012    for sequences analysis.
00013 
00014    This software is governed by the CeCILL  license under French law and
00015    abiding by the rules of distribution of free software.  You can  use,
00016    modify and/ or redistribute the software under the terms of the CeCILL
00017    license as circulated by CEA, CNRS and INRIA at the following URL
00018    "http://www.cecill.info".
00019 
00020    As a counterpart to the access to the source code and  rights to copy,
00021    modify and redistribute granted by the license, users are provided only
00022    with a limited warranty  and the software's author,  the holder of the
00023    economic rights,  and the successive licensors  have only  limited
00024    liability.
00025 
00026    In this respect, the user's attention is drawn to the risks associated
00027    with loading,  using,  modifying and/or developing or reproducing the
00028    software by the user in light of its specific status of free software,
00029    that may mean  that it is complicated to manipulate,  and  that  also
00030    therefore means  that it is reserved for developers  and  experienced
00031    professionals having in-depth computer knowledge. Users are therefore
00032    encouraged to load and test the software's suitability as regards their
00033    requirements in conditions enabling the security of their systems and/or
00034    data to be ensured and,  more generally, to use and operate it in the
00035    same conditions as regards security.
00036 
00037    The fact that you are presently reading this means that you have had
00038    knowledge of the CeCILL license and that you accept its terms.
00039  */
00040 
00041 #include "SiteContainerTools.h"
00042 #include "SequenceContainerTools.h"
00043 #include "VectorSiteContainer.h"
00044 #include "SiteContainerIterator.h"
00045 #include "../SiteTools.h"
00046 #include "../Alphabet/AlphabetTools.h"
00047 #include "../SequenceTools.h"
00048 #include <Bpp/App/ApplicationTools.h>
00049 
00050 using namespace bpp;
00051 
00052 // From the STL:
00053 #include <vector>
00054 #include <deque>
00055 #include <string>
00056 
00057 using namespace std;
00058 
00059 /******************************************************************************/
00060 
00061 SiteContainer* SiteContainerTools::getSitesWithoutGaps(const SiteContainer& sites)
00062 {
00063   vector<string> seqNames = sites.getSequencesNames();
00064   VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
00065   noGapCont->setSequencesNames(seqNames, false);
00066   NoGapSiteContainerIterator ngsi(sites);
00067   while (ngsi.hasMoreSites())
00068   {
00069     noGapCont->addSite(*ngsi.nextSite());
00070   }
00071   return noGapCont;
00072 }
00073 
00074 /******************************************************************************/
00075 
00076 SiteContainer* SiteContainerTools::getCompleteSites(const SiteContainer& sites)
00077 {
00078   vector<string> seqNames = sites.getSequencesNames();
00079   VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
00080   noGapCont->setSequencesNames(seqNames, false);
00081   CompleteSiteContainerIterator csi(sites);
00082   while (csi.hasMoreSites())
00083   {
00084     noGapCont->addSite(*csi.nextSite());
00085   }
00086   return noGapCont;
00087 }
00088 
00089 /******************************************************************************/
00090 
00091 SiteContainer* SiteContainerTools::getSelectedSites(
00092   const SiteContainer& sequences,
00093   const SiteSelection& selection)
00094 {
00095   vector<string> seqNames = sequences.getSequencesNames();
00096   VectorSiteContainer* sc = new VectorSiteContainer(seqNames.size(), sequences.getAlphabet());
00097   sc->setSequencesNames(seqNames, false);
00098   for (unsigned int i = 0; i < selection.size(); i++)
00099   {
00100     sc->addSite(sequences.getSite(selection[i]), false);
00101     // We do not check names, we suppose that the container passed as an argument is correct.
00102     // WARNING: what if selection contains many times the same indice? ...
00103   }
00104   sc->setGeneralComments(sequences.getGeneralComments());
00105   return sc;
00106 }
00107 
00108 /******************************************************************************/
00109 
00110 const Sequence* SiteContainerTools::getConsensus(const SiteContainer& sc, const std::string& name, bool ignoreGap, bool resolveUnknown)
00111 {
00112   Vint consensus;
00113   SimpleSiteContainerIterator ssi(sc);
00114   const Site* site;
00115   while (ssi.hasMoreSites())
00116   {
00117     site = ssi.nextSite();
00118     map<int, double> freq;
00119     SiteTools::getFrequencies(*site, freq, resolveUnknown);
00120     double max = 0;
00121     int cons = -1; // default result
00122     if (ignoreGap)
00123     {
00124       for (map<int, double>::iterator it = freq.begin(); it != freq.end(); it++)
00125       {
00126         if (it->second > max && it->first != -1)
00127         {
00128           max = it->second;
00129           cons = it->first;
00130         }
00131       }
00132     }
00133     else
00134     {
00135       for (map<int, double>::iterator it = freq.begin(); it != freq.end(); it++)
00136       {
00137         if (it->second > max)
00138         {
00139           max = it->second;
00140           cons = it->first;
00141         }
00142       }
00143     }
00144     consensus.push_back(cons);
00145   }
00146   const Sequence* seqConsensus = new BasicSequence(name, consensus, sc.getAlphabet());
00147   return seqConsensus;
00148 }
00149 
00150 /******************************************************************************/
00151 
00152 void SiteContainerTools::changeGapsToUnknownCharacters(SiteContainer& sites)
00153 {
00154   // NB: use iterators for a better algorithm?
00155   int unknownCode = sites.getAlphabet()->getUnknownCharacterCode();
00156   for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
00157   {
00158     for (unsigned int j = 0; j < sites.getNumberOfSequences(); j++)
00159     {
00160       int* element = &sites(j, i);
00161       if (sites.getAlphabet()->isGap(*element))
00162         *element = unknownCode;
00163     }
00164   }
00165 }
00166 
00167 /******************************************************************************/
00168 
00169 void SiteContainerTools::changeUnresolvedCharactersToGaps(SiteContainer& sites)
00170 {
00171   // NB: use iterators for a better algorithm?
00172   int gapCode = sites.getAlphabet()->getGapCharacterCode();
00173   for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
00174   {
00175     for (unsigned int j = 0; j < sites.getNumberOfSequences(); j++)
00176     {
00177       int* element = &sites(j, i);
00178       if (sites.getAlphabet()->isUnresolved(*element))
00179         *element = gapCode;
00180     }
00181   }
00182 }
00183 
00184 /******************************************************************************/
00185 
00186 SiteContainer* SiteContainerTools::removeGapOnlySites(const SiteContainer& sites)
00187 {
00188   vector<string> seqNames = sites.getSequencesNames();
00189   VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
00190   noGapCont->setSequencesNames(seqNames, false);
00191   for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
00192   {
00193     const Site* site = &sites.getSite(i);
00194     if (!SiteTools::isGapOnly(*site))
00195       noGapCont->addSite(*site);
00196   }
00197   return noGapCont;
00198 }
00199 
00200 /******************************************************************************/
00201 
00202 void SiteContainerTools::removeGapOnlySites(SiteContainer& sites)
00203 {
00204   size_t n = sites.getNumberOfSites();
00205   size_t i = n;
00206   while (i > 1)
00207   {
00208     ApplicationTools::displayGauge(n - i + 1, n);
00209     const Site* site = &sites.getSite(i - 1);
00210     if (SiteTools::isGapOnly(*site))
00211     {
00212       size_t end = i;
00213       while (SiteTools::isGapOnly(*site) && i > 1)
00214       {
00215         --i;
00216         site = &sites.getSite(i - 1);
00217       }
00218       sites.deleteSites(i, end - i);
00219     }
00220     else
00221     {
00222       --i;
00223     }
00224   }
00225   ApplicationTools::displayGauge(n, n);
00226   const Site* site = &sites.getSite(0);
00227   if (SiteTools::isGapOnly(*site))
00228     sites.deleteSite(0);
00229 }
00230 
00231 /******************************************************************************/
00232 
00233 SiteContainer* SiteContainerTools::removeGapOrUnresolvedOnlySites(const SiteContainer& sites)
00234 {
00235   vector<string> seqNames = sites.getSequencesNames();
00236   VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
00237   noGapCont->setSequencesNames(seqNames, false);
00238   for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
00239   {
00240     const Site* site = &sites.getSite(i);
00241     if (!SiteTools::isGapOrUnresolvedOnly(*site))
00242       noGapCont->addSite(*site, false);
00243   }
00244   return noGapCont;
00245 }
00246 
00247 /******************************************************************************/
00248 
00249 void SiteContainerTools::removeGapOrUnresolvedOnlySites(SiteContainer& sites)
00250 {
00251   size_t n = sites.getNumberOfSites();
00252   size_t i = n;
00253   while (i > 1)
00254   {
00255     ApplicationTools::displayGauge(n - i + 1, n);
00256     const Site* site = &sites.getSite(i - 1);
00257     if (SiteTools::isGapOnly(*site))
00258     {
00259       size_t end = i;
00260       while (SiteTools::isGapOrUnresolvedOnly(*site) && i > 1)
00261       {
00262         --i;
00263         site = &sites.getSite(i - 1);
00264       }
00265       sites.deleteSites(i, end - i);
00266     }
00267     else
00268     {
00269       --i;
00270     }
00271   }
00272   ApplicationTools::displayGauge(n, n);
00273   const Site* site = &sites.getSite(0);
00274   if (SiteTools::isGapOrUnresolvedOnly(*site))
00275     sites.deleteSite(0);
00276 }
00277 
00278 /******************************************************************************/
00279 
00280 SiteContainer* SiteContainerTools::removeGapSites(const SiteContainer& sites, double maxFreqGaps)
00281 {
00282   vector<string> seqNames = sites.getSequencesNames();
00283   VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
00284   noGapCont->setSequencesNames(seqNames, false);
00285   for (unsigned int i = 0; i < sites.getNumberOfSites(); ++i) {
00286     map<int, double> freq;
00287     SiteTools::getFrequencies(sites.getSite(i), freq);
00288     if (freq[-1] <= maxFreqGaps)
00289       noGapCont->addSite(sites.getSite(i), false);
00290   }
00291   return noGapCont;
00292 }
00293 
00294 /******************************************************************************/
00295 
00296 void SiteContainerTools::removeGapSites(SiteContainer& sites, double maxFreqGaps)
00297 {
00298   for (size_t i = sites.getNumberOfSites(); i > 0; i--) {
00299     map<int, double> freq;
00300     SiteTools::getFrequencies(sites.getSite(i - 1), freq);
00301     if (freq[-1] > maxFreqGaps)
00302       sites.deleteSite(i - 1);
00303   }
00304 }
00305 
00306 /******************************************************************************/
00307 
00308 SiteContainer* SiteContainerTools::removeStopCodonSites(const SiteContainer& sites)  throw (AlphabetException)
00309 {
00310   const CodonAlphabet* pca = dynamic_cast<const CodonAlphabet*>(sites.getAlphabet());
00311   if (!pca)
00312     throw AlphabetException("Not a Codon Alphabet", sites.getAlphabet());
00313   vector<string> seqNames = sites.getSequencesNames();
00314   VectorSiteContainer* noStopCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
00315   noStopCont->setSequencesNames(seqNames, false);
00316   for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
00317   {
00318     const Site* site = &sites.getSite(i);
00319     if (!SiteTools::hasStopCodon(*site))
00320       noStopCont->addSite(*site, false);
00321   }
00322   return noStopCont;
00323 }
00324 
00325 /******************************************************************************/
00326 
00327 SiteContainer* SiteContainerTools::resolveDottedAlignment(
00328   const SiteContainer& dottedAln,
00329   const Alphabet* resolvedAlphabet) throw (AlphabetException, Exception)
00330 {
00331   if (!AlphabetTools::isDefaultAlphabet(dottedAln.getAlphabet()))
00332     throw AlphabetException("SiteContainerTools::resolveDottedAlignment. Alignment alphabet should of class 'DefaultAlphabet'.", dottedAln.getAlphabet());
00333 
00334   // First we look for the reference sequence:
00335   size_t n = dottedAln.getNumberOfSequences();
00336   if (n == 0)
00337     throw Exception("SiteContainerTools::resolveDottedAlignment. Input alignment contains no sequence.");
00338 
00339   const Sequence* refSeq = 0;
00340   for (size_t i = 0; i < n; ++i) // Test each sequence
00341   {
00342     const Sequence* seq = &dottedAln.getSequence(i);
00343     bool isRef = true;
00344     for (unsigned int j = 0; isRef && j < seq->size(); ++j) // For each site in the sequence
00345     {
00346       if (seq->getChar(j) == ".")
00347         isRef = false;
00348     }
00349     if (isRef) // We found the reference sequence!
00350     {
00351       refSeq = new BasicSequence(*seq);
00352     }
00353   }
00354   if (!refSeq)
00355     throw Exception("SiteContainerTools::resolveDottedAlignment. No reference sequence was found in the input alignment.");
00356 
00357   // Now we build a new VectorSiteContainer:
00358   VectorSiteContainer* sites = new VectorSiteContainer(n, resolvedAlphabet);
00359 
00360   // We add each site one by one:
00361   size_t m = dottedAln.getNumberOfSites();
00362   string state;
00363   for (unsigned int i = 0; i < m; ++i)
00364   {
00365     string resolved = refSeq->getChar(i);
00366     const Site* site = &dottedAln.getSite(i);
00367     Site resolvedSite(resolvedAlphabet, site->getPosition());
00368     for (unsigned int j = 0; j < n; j++)
00369     {
00370       state = site->getChar(j);
00371       if (state == ".")
00372       {
00373         state = resolved;
00374       }
00375       resolvedSite.addElement(state);
00376     }
00377     // Add the new site:
00378     sites->addSite(resolvedSite);
00379   }
00380 
00381   // Seq sequence names:
00382   sites->setSequencesNames(dottedAln.getSequencesNames());
00383 
00384   // Delete the copied sequence:
00385   delete refSeq;
00386 
00387   // Return result:
00388   return sites;
00389 }
00390 
00391 /******************************************************************************/
00392 
00393 std::map<size_t, size_t> SiteContainerTools::getSequencePositions(const Sequence& seq)
00394 {
00395   map<size_t, size_t> tln;
00396   if (seq.size() == 0)
00397     return tln;
00398   unsigned int count = 0;
00399   for (size_t i = 0; i < seq.size(); i++)
00400   {
00401     if (seq[i] != -1)
00402     {
00403       count++;
00404       tln[i + 1] = count;
00405     }
00406   }
00407   return tln;
00408 }
00409 
00410 /******************************************************************************/
00411 
00412 std::map<size_t, size_t> SiteContainerTools::getAlignmentPositions(const Sequence& seq)
00413 {
00414   map<size_t, size_t> tln;
00415   if (seq.size() == 0)
00416     return tln;
00417   unsigned int count = 0;
00418   for (size_t i = 0; i < seq.size(); i++)
00419   {
00420     if (seq[i] != -1)
00421     {
00422       count++;
00423       tln[count] = i + 1;
00424     }
00425   }
00426   return tln;
00427 }
00428 
00429 /******************************************************************************/
00430 
00431 std::map<size_t, size_t> SiteContainerTools::translateAlignment(const Sequence& seq1, const Sequence& seq2)
00432 throw (AlphabetMismatchException, Exception)
00433 {
00434   if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
00435     throw AlphabetMismatchException("SiteContainerTools::translateAlignment", seq1.getAlphabet(), seq2.getAlphabet());
00436   map<size_t, size_t> tln;
00437   if (seq1.size() == 0)
00438     return tln;
00439   unsigned int count1 = 0;
00440   unsigned int count2 = 0;
00441   if (seq2.size() == 0)
00442     throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + ".");
00443   int state1 = seq1[count1];
00444   int state2 = seq2[count2];
00445   bool end = false;
00446   while (!end)
00447   {
00448     while (state1 == -1)
00449     {
00450       count1++;
00451       if (count1 < seq1.size())
00452         state1 = seq1[count1];
00453       else
00454         break;
00455     }
00456     while (state2 == -1)
00457     {
00458       count2++;
00459       if (count2 < seq2.size())
00460         state2 = seq2[count2];
00461       else
00462         break;
00463     }
00464     if (state1 != state2)
00465       throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + ".");
00466     tln[count1 + 1] = count2 + 1; // Count start at 1
00467     if (count1 == seq1.size() - 1)
00468       end = true;
00469     else
00470     {
00471       if (count2 == seq2.size() - 1)
00472       {
00473         state1 = seq1[++count1];
00474         while (state1 == -1)
00475         {
00476           count1++;
00477           if (count1 < seq1.size())
00478             state1 = seq1[count1];
00479           else
00480             break;
00481         }
00482         if (state1 == -1)
00483           end = true;
00484         else
00485           throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + ".");
00486       }
00487       else
00488       {
00489         state1 = seq1[++count1];
00490         state2 = seq2[++count2];
00491       }
00492     }
00493   }
00494   return tln;
00495 }
00496 
00497 /******************************************************************************/
00498 
00499 std::map<size_t, size_t> SiteContainerTools::translateSequence(const SiteContainer& sequences, size_t i1, size_t i2)
00500 {
00501   const Sequence* seq1 = &sequences.getSequence(i1);
00502   const Sequence* seq2 = &sequences.getSequence(i2);
00503   map<size_t, size_t> tln;
00504   size_t count1 = 0; // Sequence 1 counter
00505   size_t count2 = 0; // Sequence 2 counter
00506   int state1;
00507   int state2;
00508   for (size_t i = 0; i <  sequences.getNumberOfSites(); i++)
00509   {
00510     state1 = (*seq1)[i];
00511     if (state1 != -1)
00512       count1++;
00513     state2 = (*seq2)[i];
00514     if (state2 != -1)
00515       count2++;
00516     if (state1 != -1)
00517     {
00518       tln[count1] = (state2 == -1 ? 0 : count2);
00519     }
00520   }
00521   return tln;
00522 }
00523 
00524 /******************************************************************************/
00525 
00526 AlignedSequenceContainer* SiteContainerTools::alignNW(
00527   const Sequence& seq1,
00528   const Sequence& seq2,
00529   const AlphabetIndex2& s,
00530   double gap)
00531 throw (AlphabetMismatchException)
00532 {
00533   if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
00534     throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), seq2.getAlphabet());
00535   if (seq1.getAlphabet()->getAlphabetType() != s.getAlphabet()->getAlphabetType())
00536     throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), s.getAlphabet());
00537   // Check that sequences have no gap!
00538   auto_ptr<Sequence> s1(seq1.clone());
00539   SequenceTools::removeGaps(*s1);
00540   auto_ptr<Sequence> s2(seq2.clone());
00541   SequenceTools::removeGaps(*s2);
00542 
00543   // 1) Initialize matrix:
00544   RowMatrix<double> m(s1->size() + 1, s2->size() + 1);
00545   RowMatrix<char>   p(s1->size(), s2->size());
00546   double choice1, choice2, choice3, mx;
00547   char px;
00548   for (size_t i = 0; i <= s1->size(); i++)
00549   {
00550     m(i, 0) = static_cast<double>(i) * gap;
00551   }
00552   for (size_t j = 0; j <= s2->size(); j++)
00553   {
00554     m(0, j) = static_cast<double>(j) * gap;
00555   }
00556   for (size_t i = 1; i <= s1->size(); i++)
00557   {
00558     for (size_t j = 1; j <= s2->size(); j++)
00559     {
00560       choice1 = m(i - 1, j - 1) + static_cast<double>(s.getIndex((*s1)[i - 1], (*s2)[j - 1]));
00561       choice2 = m(i - 1, j) + gap;
00562       choice3 = m(i, j - 1) + gap;
00563       mx = choice1; px = 'd'; // Default in case of equality of scores.
00564       if (choice2 > mx)
00565       {
00566         mx = choice2; px = 'u';
00567       }
00568       if (choice3 > mx)
00569       {
00570         mx = choice3; px = 'l';
00571       }
00572       m(i, j) = mx;
00573       p(i - 1, j - 1) = px;
00574     }
00575   }
00576 
00577   // 2) Get alignment:
00578   deque<int> a1, a2;
00579   size_t i = s1->size(), j = s2->size();
00580   char c;
00581   while (i > 0 && j > 0)
00582   {
00583     c = p(i - 1, j - 1);
00584     if (c == 'd')
00585     {
00586       a1.push_front((*s1)[i - 1]);
00587       a2.push_front((*s2)[j - 1]);
00588       i--;
00589       j--;
00590     }
00591     else if (c == 'u')
00592     {
00593       a1.push_front((*s1)[i - 1]);
00594       a2.push_front(-1);
00595       i--;
00596     }
00597     else
00598     {
00599       a1.push_front(-1);
00600       a2.push_front((*s2)[j - 1]);
00601       j--;
00602     }
00603   }
00604   while (i > 0)
00605   {
00606     a1.push_front((*s1)[i - 1]);
00607     a2.push_front(-1);
00608     i--;
00609   }
00610   while (j > 0)
00611   {
00612     a1.push_front(-1);
00613     a2.push_front((*s2)[j - 1]);
00614     j--;
00615   }
00616   s1->setContent(vector<int>(a1.begin(), a1.end()));
00617   s2->setContent(vector<int>(a2.begin(), a2.end()));
00618   AlignedSequenceContainer* asc = new AlignedSequenceContainer(s1->getAlphabet());
00619   asc->addSequence(*s1, false);
00620   asc->addSequence(*s2, false); // Do not check for sequence names.
00621   return asc;
00622 }
00623 
00624 /******************************************************************************/
00625 
00626 AlignedSequenceContainer* SiteContainerTools::alignNW(
00627   const Sequence& seq1,
00628   const Sequence& seq2,
00629   const AlphabetIndex2& s,
00630   double opening,
00631   double extending)
00632 throw (AlphabetMismatchException)
00633 {
00634   if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
00635     throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), seq2.getAlphabet());
00636   if (seq1.getAlphabet()->getAlphabetType() != s.getAlphabet()->getAlphabetType())
00637     throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), s.getAlphabet());
00638   // Check that sequences have no gap!
00639   auto_ptr<Sequence> s1(seq1.clone());
00640   SequenceTools::removeGaps(*s1);
00641   auto_ptr<Sequence> s2(seq2.clone());
00642   SequenceTools::removeGaps(*s2);
00643 
00644   // 1) Initialize matrix:
00645   RowMatrix<double> m(s1->size() + 1, s2->size() + 1);
00646   RowMatrix<double> v(s1->size() + 1, s2->size() + 1);
00647   RowMatrix<double> h(s1->size() + 1, s2->size() + 1);
00648   RowMatrix<char>   p(s1->size(), s2->size());
00649 
00650   double choice1, choice2, choice3, mx;
00651   char px;
00652   m(0, 0) = 0.;
00653   for (size_t i = 0; i <= s1->size(); i++)
00654   {
00655     v(i, 0) = log(0.);
00656   }
00657   for (size_t j = 0; j <= s2->size(); j++)
00658   {
00659     h(0, j) = log(0.);
00660   }
00661   for (size_t i = 1; i <= s1->size(); i++)
00662   {
00663     m(i, 0) = h(i, 0) = opening + static_cast<double>(i) * extending;
00664   }
00665   for (size_t j = 1; j <= s2->size(); j++)
00666   {
00667     m(0, j) = v(0, j) = opening + static_cast<double>(j) * extending;
00668   }
00669 
00670   for (size_t i = 1; i <= s1->size(); i++)
00671   {
00672     for (size_t j = 1; j <= s2->size(); j++)
00673     {
00674       choice1 = m(i - 1, j - 1) + s.getIndex((*s1)[i - 1], (*s2)[j - 1]);
00675       choice2 = h(i - 1, j - 1) + opening + extending;
00676       choice3 = v(i - 1, j - 1) + opening + extending;
00677       mx = choice1; // Default in case of equality of scores.
00678       if (choice2 > mx)
00679       {
00680         mx = choice2;
00681       }
00682       if (choice3 > mx)
00683       {
00684         mx = choice3;
00685       }
00686       m(i, j) = mx;
00687 
00688       choice1 = m(i, j - 1) + opening + extending;
00689       choice2 = h(i, j - 1) + extending;
00690       mx = choice1; // Default in case of equality of scores.
00691       if (choice2 > mx)
00692       {
00693         mx = choice2;
00694       }
00695       h(i, j) = mx;
00696 
00697       choice1 = m(i - 1, j) + opening + extending;
00698       choice2 = v(i - 1, j) + extending;
00699       mx = choice1; // Default in case of equality of scores.
00700       if (choice2 > mx)
00701       {
00702         mx = choice2;
00703       }
00704       v(i, j) = mx;
00705 
00706       px = 'd';
00707       if (v(i, j) > m(i, j))
00708         px = 'u';
00709       if (h(i, j) > m(i, j))
00710         px = 'l';
00711       p(i - 1, j - 1) = px;
00712     }
00713   }
00714 
00715   // 2) Get alignment:
00716   deque<int> a1, a2;
00717   size_t i = s1->size(), j = s2->size();
00718   char c;
00719   while (i > 0 && j > 0)
00720   {
00721     c = p(i - 1, j - 1);
00722     if (c == 'd')
00723     {
00724       a1.push_front((*s1)[i - 1]);
00725       a2.push_front((*s2)[j - 1]);
00726       i--;
00727       j--;
00728     }
00729     else if (c == 'u')
00730     {
00731       a1.push_front((*s1)[i - 1]);
00732       a2.push_front(-1);
00733       i--;
00734     }
00735     else
00736     {
00737       a1.push_front(-1);
00738       a2.push_front((*s2)[j - 1]);
00739       j--;
00740     }
00741   }
00742   while (i > 0)
00743   {
00744     a1.push_front((*s1)[i - 1]);
00745     a2.push_front(-1);
00746     i--;
00747   }
00748   while (j > 0)
00749   {
00750     a1.push_front(-1);
00751     a2.push_front((*s2)[j - 1]);
00752     j--;
00753   }
00754   s1->setContent(vector<int>(a1.begin(), a1.end()));
00755   s2->setContent(vector<int>(a2.begin(), a2.end()));
00756   AlignedSequenceContainer* asc = new AlignedSequenceContainer(s1->getAlphabet());
00757   asc->addSequence(*s1, false);
00758   asc->addSequence(*s2, false); // Do not check for sequence names.
00759   return asc;
00760 }
00761 
00762 /******************************************************************************/
00763 
00764 VectorSiteContainer* SiteContainerTools::sampleSites(const SiteContainer& sites, size_t nbSites, vector<size_t>* index)
00765 {
00766   VectorSiteContainer* sample = new VectorSiteContainer(sites.getSequencesNames(), sites.getAlphabet());
00767   for (size_t i = 0; i < nbSites; i++)
00768   {
00769     size_t pos = static_cast<size_t>(RandomTools::giveIntRandomNumberBetweenZeroAndEntry(static_cast<int>(sites.getNumberOfSites())));
00770     sample->addSite(sites.getSite(pos), false);
00771     if (index)
00772       index->push_back(pos);
00773   }
00774   return sample;
00775 }
00776 
00777 /******************************************************************************/
00778 
00779 VectorSiteContainer* SiteContainerTools::bootstrapSites(const SiteContainer& sites)
00780 {
00781   return sampleSites(sites, sites.getNumberOfSites());
00782 }
00783 
00784 /******************************************************************************/
00785 
00786 const string SiteContainerTools::SIMILARITY_ALL         = "all sites";
00787 const string SiteContainerTools::SIMILARITY_NOFULLGAP   = "no full gap";
00788 const string SiteContainerTools::SIMILARITY_NODOUBLEGAP = "no double gap";
00789 const string SiteContainerTools::SIMILARITY_NOGAP       = "no gap";
00790 
00791 /******************************************************************************/
00792 
00793 double SiteContainerTools::computeSimilarity(const Sequence& seq1, const Sequence& seq2, bool dist, const std::string& gapOption, bool unresolvedAsGap) throw (SequenceNotAlignedException, AlphabetMismatchException, Exception)
00794 {
00795   if (seq1.size() != seq2.size())
00796     throw SequenceNotAlignedException("SiteContainerTools::computeSimilarity.", &seq2);
00797   if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
00798     throw AlphabetMismatchException("SiteContainerTools::computeSimilarity.", seq1.getAlphabet(), seq2.getAlphabet());
00799 
00800   const Alphabet* alpha = seq1.getAlphabet();
00801   unsigned int s = 0;
00802   unsigned int t = 0;
00803   for (size_t i = 0; i < seq1.size(); i++)
00804   {
00805     int x = seq1[i];
00806     int y = seq2[i];
00807     int gapCode = alpha->getGapCharacterCode();
00808     if (unresolvedAsGap)
00809     {
00810       if (alpha->isUnresolved(x))
00811         x = gapCode;
00812       if (alpha->isUnresolved(y))
00813         y = gapCode;
00814     }
00815     if (gapOption == SIMILARITY_ALL)
00816     {
00817       t++;
00818       if (x == y && !alpha->isGap(x) && !alpha->isGap(y))
00819         s++;
00820     }
00821     else if (gapOption == SIMILARITY_NODOUBLEGAP)
00822     {
00823       if (!alpha->isGap(x) || !alpha->isGap(y))
00824       {
00825         t++;
00826         if (x == y)
00827           s++;
00828       }
00829     }
00830     else if (gapOption == SIMILARITY_NOGAP)
00831     {
00832       if (!alpha->isGap(x) && !alpha->isGap(y))
00833       {
00834         t++;
00835         if (x == y)
00836           s++;
00837       }
00838     }
00839     else
00840       throw Exception("SiteContainerTools::computeSimilarity. Invalid gap option: " + gapOption);
00841   }
00842   double r = (t == 0 ? 0. : static_cast<double>(s) / static_cast<double>(t));
00843   return dist ? 1 - r : r;
00844 }
00845 
00846 /******************************************************************************/
00847 
00848 DistanceMatrix* SiteContainerTools::computeSimilarityMatrix(const SiteContainer& sites, bool dist, const std::string& gapOption, bool unresolvedAsGap)
00849 {
00850   size_t n = sites.getNumberOfSequences();
00851   DistanceMatrix* mat = new DistanceMatrix(sites.getSequencesNames());
00852   string pairwiseGapOption = gapOption;
00853   SiteContainer* sites2;
00854   if (gapOption == SIMILARITY_NOFULLGAP)
00855   {
00856     if (unresolvedAsGap)
00857     {
00858       SiteContainer* tmp = removeGapOrUnresolvedOnlySites(sites);
00859       sites2 = new AlignedSequenceContainer(*tmp);
00860       delete tmp;
00861     }
00862     else
00863     {
00864       SiteContainer* tmp = removeGapOnlySites(sites);
00865       sites2 = new AlignedSequenceContainer(*tmp);
00866       delete tmp;
00867     }
00868     pairwiseGapOption = SIMILARITY_ALL;
00869   }
00870   else
00871   {
00872     sites2 = new AlignedSequenceContainer(sites);
00873   }
00874 
00875   for (size_t i = 0; i < n; i++)
00876   {
00877     (*mat)(i, i) = dist ? 0. : 1.;
00878     const Sequence* seq1 = &sites2->getSequence(i);
00879     for (size_t j = i + 1; j < n; j++)
00880     {
00881       const Sequence* seq2 = &sites2->getSequence(j);
00882       (*mat)(i, j) = (*mat)(j, i) = computeSimilarity(*seq1, *seq2, dist, pairwiseGapOption, unresolvedAsGap);
00883     }
00884   }
00885   delete sites2;
00886   return mat;
00887 }
00888 
00889 /******************************************************************************/
00890 
00891 void SiteContainerTools::merge(SiteContainer& seqCont1, const SiteContainer& seqCont2, bool leavePositionAsIs)
00892 throw (AlphabetMismatchException, Exception)
00893 {
00894   if (seqCont1.getAlphabet()->getAlphabetType() != seqCont2.getAlphabet()->getAlphabetType())
00895     throw AlphabetMismatchException("SiteContainerTools::merge.", seqCont1.getAlphabet(), seqCont2.getAlphabet());
00896 
00897 
00898   vector<string> seqNames1 = seqCont1.getSequencesNames();
00899   vector<string> seqNames2 = seqCont2.getSequencesNames();
00900   const SiteContainer* seqCont2bis = 0;
00901   bool del = false;
00902   if (seqNames1 == seqNames2)
00903   {
00904     seqCont2bis = &seqCont2;
00905   }
00906   else
00907   {
00908     // We shall reorder sequences first:
00909     SiteContainer* seqCont2ter = new VectorSiteContainer(seqCont2.getAlphabet());
00910     SequenceContainerTools::getSelectedSequences(seqCont2, seqNames1, *seqCont2ter);
00911     seqCont2bis = seqCont2ter;
00912     del = true;
00913   }
00914 
00915   if (leavePositionAsIs)
00916   {
00917     for (size_t i = 0; i < seqCont2bis->getNumberOfSites(); i++)
00918     {
00919       seqCont1.addSite(seqCont2bis->getSite(i), false);
00920     }
00921   }
00922   else
00923   {
00924     int offset = static_cast<int>(seqCont1.getNumberOfSites());
00925     for (size_t i = 0; i < seqCont2bis->getNumberOfSites(); i++)
00926     {
00927       seqCont1.addSite(seqCont2bis->getSite(i), offset + seqCont2bis->getSite(i).getPosition(), false);
00928     }
00929   }
00930 
00931   if (del)
00932     delete seqCont2bis;
00933 }
00934 
00935 /******************************************************************************/
00936 
00937 void SiteContainerTools::getSequencePositions(const SiteContainer& sites, Matrix<size_t>& positions)
00938 {
00939   positions.resize(sites.getNumberOfSequences(), sites.getNumberOfSites());
00940   int gap = sites.getAlphabet()->getGapCharacterCode();
00941   for (size_t i = 0; i < sites.getNumberOfSequences(); ++i) {
00942     const Sequence& seq = sites.getSequence(i);
00943     unsigned int pos = 0;
00944     for (size_t j = 0; j < sites.getNumberOfSites(); ++j) {
00945       if (seq[j] != gap) {
00946         ++pos;
00947         positions(i, j) = pos;
00948       } else {
00949         positions(i, j) = 0;
00950       }
00951     }
00952   }
00953 }
00954 
00955 /******************************************************************************/
00956 
00957 vector<int> SiteContainerTools::getColumnScores(const Matrix<size_t>& positions1, const Matrix<size_t>& positions2, int na)
00958 {
00959   if (positions1.getNumberOfRows() != positions2.getNumberOfRows())
00960     throw Exception("SiteContainerTools::getColumnScores. The two input alignments must have the same number of sequences!");
00961   vector<int> scores(positions1.getNumberOfColumns());
00962   for (size_t i = 0; i < positions1.getNumberOfColumns(); ++i) {
00963     //Find an anchor point:
00964     size_t whichSeq = 0;
00965     size_t whichPos = 0;
00966     for (size_t j = 0; j < positions1.getNumberOfRows(); ++j) {
00967       if (positions1(j, i) > 0) {
00968         whichSeq = j;
00969         whichPos = positions1(j, i);
00970         break;
00971       }
00972     }
00973     if (whichPos == 0) {
00974       //No anchor found, this alignment column is only made of gaps. We assign a score of 'na' and move to the next column.
00975       scores[i] = na;
00976       continue;
00977     }
00978     //We look for the anchor in the reference alignment:
00979     size_t i2 = 0;
00980     bool found = false;
00981     for (size_t j = 0; !found && j < positions2.getNumberOfColumns(); ++j) {
00982       if (positions2(whichSeq, j) == whichPos) {
00983         i2 = j;
00984         found = true;
00985       }
00986     }
00987     if (!found) {
00988       throw Exception("SiteContainerTools::getColumnScores(). Position " + TextTools::toString(whichPos) + " of sequence " + TextTools::toString(whichSeq) + " not found in reference alignment. Please make sure the two indexes are built from the same data!");
00989     }
00990     //Now we compare all pairs of sequences between the two positions:
00991     bool test = true;
00992     for (size_t j = 0; test && j < positions1.getNumberOfRows(); ++j) {
00993       test = (positions1(j, i) == positions2(j, i2));
00994     }
00995     scores[i] = test ? 1 : 0;
00996   }
00997   return scores;
00998 }
00999 
01000 /******************************************************************************/
01001 
01002 vector<double> SiteContainerTools::getSumOfPairsScores(const Matrix<size_t>& positions1, const Matrix<size_t>& positions2, double na)
01003 {
01004   if (positions1.getNumberOfRows() != positions2.getNumberOfRows())
01005     throw Exception("SiteContainerTools::getColumnScores. The two input alignments must have the same number of sequences!");
01006   vector<double> scores(positions1.getNumberOfColumns());
01007   for (size_t i = 0; i < positions1.getNumberOfColumns(); ++i) {
01008     //For all positions in alignment 1...
01009     size_t countAlignable = 0;
01010     size_t countAligned   = 0;
01011     for (size_t j = 0; j < positions1.getNumberOfRows(); ++j) {
01012       //Get the corresponding column in alignment 2:
01013       size_t whichPos = positions1(j, i);
01014       if (whichPos == 0) {
01015         //No position for this sequence here.
01016         continue;
01017       }
01018       //We look for the position in the second alignment:
01019       size_t i2 = 0;
01020       bool found = false;
01021       for (size_t k = 0; !found && k < positions2.getNumberOfColumns(); ++k) {
01022         if (positions2(j, k) == whichPos) {
01023           i2 = k;
01024           found = true;
01025         }
01026       }
01027       if (!found) {
01028         throw Exception("SiteContainerTools::getColumnScores(). Position " + TextTools::toString(whichPos) + " of sequence " + TextTools::toString(j) + " not found in reference alignment. Please make sure the two indexes are built from the same data!");
01029       }
01030 
01031       //Now we check all other positions and see if they are aligned with this one:
01032       for (size_t k = j + 1; k < positions1.getNumberOfRows(); ++k) {
01033         size_t whichPos2 = positions1(k, i);
01034         if (whichPos2 == 0) {
01035           //Empty position
01036           continue;
01037         }
01038         countAlignable++;
01039         //check position in alignment 2:
01040         if (positions2(k, i2) == whichPos2)
01041           countAligned++;
01042       }
01043     }
01044     scores[i] = countAlignable == 0 ? na : static_cast<double>(countAligned) / static_cast<double>(countAlignable);
01045   }
01046   return scores;
01047 }
01048 
01049 /******************************************************************************/
01050 
 All Classes Namespaces Files Functions Variables Typedefs Friends