bpp-popgen  2.1.0
Bpp/PopGen/SequenceStatistics.cpp
Go to the documentation of this file.
00001 //
00002 // File SequenceStatistics.cpp
00003 // Authors: Eric Bazin
00004 //          Sylvain Gailard
00005 //          Khalid Belkhir
00006 //          Benoit Nabholz
00007 // Created on: Wed Aug 04 2004
00008 //
00009 
00010 /*
00011    Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
00012 
00013    This software is a computer program whose purpose is to provide classes
00014    for population genetics analysis.
00015 
00016    This software is governed by the CeCILL  license under French law and
00017    abiding by the rules of distribution of free software.  You can  use,
00018    modify and/ or redistribute the software under the terms of the CeCILL
00019    license as circulated by CEA, CNRS and INRIA at the following URL
00020    "http://www.cecill.info".
00021 
00022    As a counterpart to the access to the source code and  rights to copy,
00023    modify and redistribute granted by the license, users are provided only
00024    with a limited warranty  and the software's author,  the holder of the
00025    economic rights,  and the successive licensors  have only  limited
00026    liability.
00027 
00028    In this respect, the user's attention is drawn to the risks associated
00029    with loading,  using,  modifying and/or developing or reproducing the
00030    software by the user in light of its specific status of free software,
00031    that may mean  that it is complicated to manipulate,  and  that  also
00032    therefore means  that it is reserved for developers  and  experienced
00033    professionals having in-depth computer knowledge. Users are therefore
00034    encouraged to load and test the software's suitability as regards their
00035    requirements in conditions enabling the security of their systems and/or
00036    data to be ensured and,  more generally, to use and operate it in the
00037    same conditions as regards security.
00038 
00039    The fact that you are presently reading this means that you have had
00040    knowledge of the CeCILL license and that you accept its terms.
00041  */
00042 
00043 #include "SequenceStatistics.h" // class's header file
00044 #include "PolymorphismSequenceContainerTools.h"
00045 #include "PolymorphismSequenceContainer.h"
00046 
00047 // From the STL:
00048 #include <ctype.h>
00049 #include <cmath>
00050 #include <iostream>
00051 #include <vector>
00052 
00053 using namespace std;
00054 
00055 // From SeqLib:
00056 #include <Bpp/Seq/Site.h>
00057 #include <Bpp/Seq/SiteTools.h>
00058 #include <Bpp/Seq/StringSequenceTools.h>
00059 #include <Bpp/Seq/CodonSiteTools.h>
00060 #include <Bpp/Seq/Alphabet/DNA.h>
00061 #include <Bpp/Seq/Alphabet/AlphabetTools.h>
00062 
00063 #include <Bpp/Numeric/VectorTools.h>
00064 #include <Bpp/Numeric/VectorExceptions.h>
00065 
00066 using namespace bpp;
00067 
00068 // ******************************************************************************
00069 // Basic statistics
00070 // ******************************************************************************
00071 
00072 size_t SequenceStatistics::polymorphicSiteNumber(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown)
00073 {
00074   size_t S = 0;
00075   const Site* site = 0;
00076   ConstSiteIterator* si = 0;
00077   if (gapflag)
00078     si = new CompleteSiteContainerIterator(psc);
00079   else
00080     si = new SimpleSiteContainerIterator(psc);
00081   while (si->hasMoreSites())
00082   {
00083     site = si->nextSite();
00084     if (!SiteTools::isConstant(*site, ignoreUnknown))
00085     {
00086       S++;
00087     }
00088   }
00089   delete si;
00090   return S;
00091 }
00092 
00093 size_t SequenceStatistics::parsimonyInformativeSiteNumber(const PolymorphismSequenceContainer& psc, bool gapflag)
00094 {
00095   ConstSiteIterator* si = 0;
00096   if (gapflag)
00097     si = new CompleteSiteContainerIterator(psc);
00098   else
00099     si = new SimpleSiteContainerIterator(psc);
00100   size_t S = 0;
00101   const Site* site = 0;
00102   while (si->hasMoreSites())
00103   {
00104     site = si->nextSite();
00105     if (SiteTools::isParsimonyInformativeSite(*site))
00106     {
00107       S++;
00108     }
00109   }
00110   delete si;
00111   return S;
00112 }
00113 
00114 size_t SequenceStatistics::countSingleton(const PolymorphismSequenceContainer& psc, bool gapflag)
00115 {
00116   size_t nus = 0;
00117   const Site* site = 0;
00118   ConstSiteIterator* si = 0;
00119   if (gapflag)
00120     si = new CompleteSiteContainerIterator(psc);
00121   else
00122     si = new SimpleSiteContainerIterator(psc);
00123   while (si->hasMoreSites())
00124   {
00125     site = si->nextSite();
00126     nus += getSingletonNumber_(*site);
00127   }
00128   delete si;
00129   return nus;
00130 }
00131 
00132 size_t SequenceStatistics::tripletNumber(const PolymorphismSequenceContainer& psc, bool gapflag)
00133 {
00134   ConstSiteIterator* si = 0;
00135   if (gapflag)
00136     si = new CompleteSiteContainerIterator(psc);
00137   else
00138     si = new SimpleSiteContainerIterator(psc);
00139   int S = 0;
00140   const Site* site = 0;
00141   while (si->hasMoreSites())
00142   {
00143     site = si->nextSite();
00144     if (SiteTools::isTriplet(*site))
00145     {
00146       S++;
00147     }
00148   }
00149 
00150   delete si;
00151   return S;
00152 }
00153 
00154 size_t SequenceStatistics::totNumberMutations(const PolymorphismSequenceContainer& psc, bool gapflag)
00155 {
00156   size_t tnm = 0;
00157   const Site* site = 0;
00158   ConstSiteIterator* si = 0;
00159   if (gapflag)
00160     si = new CompleteSiteContainerIterator(psc);
00161   else
00162     si = new SimpleSiteContainerIterator(psc);
00163   while (si->hasMoreSites())
00164   {
00165     site = si->nextSite();
00166     tnm += getMutationNumber_(*site);
00167   }
00168   delete si;
00169   return tnm;
00170 }
00171 
00172 size_t SequenceStatistics::totMutationsExternalBranchs(
00173   const PolymorphismSequenceContainer& ing,
00174   const PolymorphismSequenceContainer& outg) throw (Exception)
00175 {
00176   if (ing.getNumberOfSites() != outg.getNumberOfSites())
00177     throw Exception("ing and outg must have the same size");
00178   size_t nmuts = 0;
00179   const Site* site_in = 0;
00180   const Site* site_out = 0;
00181   ConstSiteIterator* si = 0;
00182   ConstSiteIterator* so = 0;
00183   si = new SimpleSiteContainerIterator(ing);
00184   so = new SimpleSiteContainerIterator(outg);
00185   while (si->hasMoreSites())
00186   {
00187     site_in = si->nextSite();
00188     site_out = so->nextSite();
00189     // use fully resolved sites
00190     if (SiteTools::isComplete(*site_in) &&  SiteTools::isComplete(*site_out))
00191       nmuts += getDerivedSingletonNumber_(*site_in, *site_out);                                                                   // singletons that are not in outgroup
00192   }
00193   delete si;
00194   delete so;
00195   return nmuts;
00196 }
00197 
00198 double SequenceStatistics::heterozygosity(const PolymorphismSequenceContainer& psc, bool gapflag)
00199 {
00200   ConstSiteIterator* si = 0;
00201   const Site* site = 0;
00202   if (gapflag)
00203     si = new CompleteSiteContainerIterator(psc);
00204   else
00205     si = new SimpleSiteContainerIterator(psc);
00206   double S = 0;
00207   while (si->hasMoreSites())
00208   {
00209     site = si->nextSite();
00210     S += SiteTools::heterozygosity(*site);
00211   }
00212   delete si;
00213   return S;
00214 }
00215 
00216 double SequenceStatistics::squaredHeterozygosity(const PolymorphismSequenceContainer& psc, bool gapflag)
00217 {
00218   ConstSiteIterator* si = 0;
00219   const Site* site = 0;
00220   if (gapflag)
00221     si = new CompleteSiteContainerIterator(psc);
00222   else
00223     si = new SimpleSiteContainerIterator(psc);
00224   double S = 0;
00225   while (si->hasMoreSites())
00226   {
00227     site = si->nextSite();
00228     double h = SiteTools::heterozygosity(*site);
00229     S += h * h;
00230   }
00231   delete si;
00232   return S;
00233 }
00234 
00235 // ******************************************************************************
00236 // GC statistics
00237 // ******************************************************************************
00238 
00239 double SequenceStatistics::gcContent(const PolymorphismSequenceContainer& psc)
00240 {
00241   map<int, double> freqs;
00242   SequenceContainerTools::getFrequencies(psc, freqs);
00243   const Alphabet* alpha = psc.getAlphabet();
00244   return (freqs[alpha->charToInt("C")] + freqs[alpha->charToInt("G")]) / (freqs[alpha->charToInt("A")] + freqs[alpha->charToInt("C")] + freqs[alpha->charToInt("G")] + freqs[alpha->charToInt("T")]);
00245 }
00246 
00247 std::vector<size_t> SequenceStatistics::gcPolymorphism(const PolymorphismSequenceContainer& psc, bool gapflag)
00248 {
00249   size_t nbMut = 0;
00250   size_t nbGC = 0;
00251   const size_t nbSeq = psc.getNumberOfSequences();
00252   vector<size_t> vect(2);
00253   const Site* site = 0;
00254   ConstSiteIterator* si = 0;
00255   if (gapflag)
00256     si = new CompleteSiteContainerIterator(psc);
00257   else
00258     si = new NoGapSiteContainerIterator(psc);
00259   while (si->hasMoreSites())
00260   {
00261     site = si->nextSite();
00262     if (!SiteTools::isConstant(*site))
00263     {
00264       long double freqGC = SymbolListTools::getGCContent(*site);
00265       /*
00266        * Sylvain Gaillard 15/03/2010: realy unclear ...
00267        *   freqGC is always in [0,1] then why testing it ?
00268        *   why casting double into size_t ?
00269        *   is that method used by someone ?
00270        */
00271       if (freqGC > 0 && freqGC < 1)
00272       {
00273         nbMut += static_cast<size_t>(nbSeq);
00274         long double adGC = freqGC * nbSeq;
00275         nbGC += static_cast<size_t>(adGC);
00276       }
00277     }
00278   }
00279   vect[0] = nbMut;
00280   vect[1] = nbGC;
00281   delete si;
00282   return vect;
00283 }
00284 
00285 // ******************************************************************************
00286 // Diversity statistics
00287 // ******************************************************************************
00288 
00289 double SequenceStatistics::watterson75(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown)
00290 {
00291   double ThetaW;
00292   size_t n = psc.getNumberOfSequences();
00293   size_t S = polymorphicSiteNumber(psc, gapflag, ignoreUnknown);
00294   map<string, double> values = getUsefullValues_(n);
00295   ThetaW = (double) S / values["a1"];
00296   return ThetaW;
00297 }
00298 
00299 double SequenceStatistics::tajima83(const PolymorphismSequenceContainer& psc, bool gapflag)
00300 {
00301   size_t alphabet_size = (psc.getAlphabet())->getSize();
00302   const Site* site = 0;
00303   ConstSiteIterator* si = 0;
00304   double value2 = 0.;
00305   if (gapflag)
00306     si = new CompleteSiteContainerIterator(psc);
00307   else
00308     si = new SimpleSiteContainerIterator(psc);
00309   while (si->hasMoreSites())
00310   {
00311     site = si->nextSite();
00312     if (!SiteTools::isConstant(*site))
00313     {
00314       double value = 0.;
00315       map<int, size_t> count;
00316       SymbolListTools::getCounts(*site, count);
00317       map<int, size_t> tmp_k;
00318       size_t tmp_n = 0;
00319       for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++)
00320       {
00321         if (it->first >= 0 && it->first < static_cast<int>(alphabet_size))
00322         {
00323           tmp_k[it->first] = it->second * (it->second - 1);
00324           tmp_n += it->second;
00325         }
00326       }
00327       if (tmp_n == 0 || tmp_n == 1)
00328         continue;
00329       for (map<int, size_t>::iterator it = tmp_k.begin(); it != tmp_k.end(); it++)
00330       {
00331         value += static_cast<double>(it->second) / static_cast<double>(tmp_n * (tmp_n - 1));
00332       }
00333       value2 += 1. - value;
00334     }
00335   }
00336   delete si;
00337   return value2;
00338 }
00339 
00340 double SequenceStatistics::FayWu2000(const PolymorphismSequenceContainer& psc, const Sequence& ancestralSites)
00341 {
00342   if (psc.getNumberOfSites() != ancestralSites.size())
00343     throw Exception("SequenceStatistics::FayWu2000: ancestralSites and psc don't have the same size!!!'" );
00344 
00345   const Sequence& tmps = psc.getSequence(0);
00346 
00347   size_t alphabet_size = (psc.getAlphabet())->getSize();
00348   double value = 0.;
00349   for (size_t i = 0; i < psc.getNumberOfSites(); i++)
00350   {
00351     const Site& site = psc.getSite(i);
00352     string ancB = ancestralSites.getChar(i);
00353     int ancV = ancestralSites.getValue(i);
00354 
00355     if (!SiteTools::isConstant(site) || tmps.getChar(i) != ancB)
00356     {
00357       if (ancV < 0)
00358         continue;
00359 
00360       map<int, size_t> count;
00361       SymbolListTools::getCounts(site, count);
00362       map<int, size_t> tmp_k;
00363       size_t tmp_n = 0;
00364       for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++)
00365       {
00366         if (it->first >= 0 && it->first < static_cast<int>(alphabet_size))
00367         {
00368           /* if derived allele */
00369           if (it->first != ancV)
00370           {
00371             tmp_k[it->first] = 2 * it->second * it->second;
00372           }
00373           tmp_n += it->second;
00374         }
00375       }
00376       if (tmp_n == 0 || tmp_n == 1)
00377         continue;
00378       for (map<int, size_t>::iterator it = tmp_k.begin(); it != tmp_k.end(); it++)
00379       {
00380         value += static_cast<double>(it->second) / static_cast<double>(tmp_n * (tmp_n - 1));
00381       }
00382     }
00383   }
00384   return value;
00385 }
00386 
00387 size_t SequenceStatistics::DVK(const PolymorphismSequenceContainer& psc, bool gapflag)
00388 {
00389   /*
00390    * Sylvain Gaillard 17/03/2010:
00391    * This implementation uses unneeded SequenceContainer recopy and works on
00392    * string. It needs to be improved.
00393    */
00394   PolymorphismSequenceContainer* sc = 0;
00395   if (gapflag)
00396     sc = PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc);
00397   else
00398     sc = new PolymorphismSequenceContainer(psc);
00399   // int K = 0;
00400   vector<string> pscvector;
00401   pscvector.push_back(sc->toString(0));
00402   // K++;
00403   for (size_t i = 1; i < sc->getNumberOfSequences(); i++)
00404   {
00405     bool uniq = true;
00406     string query = sc->toString(i);
00407     for (vector<string>::iterator it = pscvector.begin(); it != pscvector.end(); it++)
00408     {
00409       if (query.compare(*it) == 0)
00410       {
00411         uniq = false;
00412         break;
00413       }
00414     }
00415     if (uniq)
00416     {
00417       // K++;
00418       pscvector.push_back(query);
00419     }
00420   }
00421   delete sc;
00422   // return K;
00423   return pscvector.size();
00424 }
00425 
00426 double SequenceStatistics::DVH(const PolymorphismSequenceContainer& psc, bool gapflag)
00427 {
00428   /*
00429    * Sylvain Gaillard 17/03/2010:
00430    * This implementation uses unneeded SequenceContainer recopy and works on
00431    * string. It needs to be improved.
00432    */
00433   PolymorphismSequenceContainer* sc = 0;
00434   if (gapflag)
00435     sc = PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc);
00436   else
00437     sc = new PolymorphismSequenceContainer(psc);
00438   double H = 0.;
00439   size_t nbSeq;
00440   vector<string> pscvector;
00441   vector<size_t> effvector;
00442   pscvector.push_back(sc->toString(0));
00443   effvector.push_back(sc->getSequenceCount(0));
00444   nbSeq = sc->getSequenceCount(0);
00445   for (size_t i = 1; i < sc->getNumberOfSequences(); i++)
00446   {
00447     nbSeq += sc->getSequenceCount(i);
00448     bool uniq = true;
00449     string query = sc->toString(i);
00450     for (size_t j = 0; j < pscvector.size(); j++)
00451     {
00452       if (query.compare(pscvector[j]) == 0)
00453       {
00454         effvector[j] += sc->getSequenceCount(i);
00455         uniq = false;
00456         break;
00457       }
00458     }
00459     if (uniq)
00460     {
00461       pscvector.push_back(query);
00462       effvector.push_back(sc->getSequenceCount(i));
00463     }
00464   }
00465   for (size_t i = 0; i < effvector.size(); i++)
00466   {
00467     H -= (static_cast<double>(effvector[i]) / static_cast<double>(nbSeq)) * ( static_cast<double>(effvector[i]) / static_cast<double>(nbSeq));
00468   }
00469   H += 1.;
00470   delete sc;
00471   return H;
00472 }
00473 
00474 size_t SequenceStatistics::getNumberOfTransitions(const PolymorphismSequenceContainer& psc)
00475 {
00476   size_t nbT = 0;
00477   ConstSiteIterator* si = new CompleteSiteContainerIterator(psc);
00478   const Site* site = 0;
00479   while (si->hasMoreSites())
00480   {
00481     site = si->nextSite();
00482     // if (SiteTools::isConstant(*site) || SiteTools::isTriplet(*site)) continue;
00483     if (SiteTools::getNumberOfDistinctCharacters(*site) != 2)
00484       continue;
00485     vector<int> seq = site->getContent();
00486     int state1 = seq[0];
00487     int state2 = seq[0];
00488     for (size_t i = 1; i < seq.size(); i++)
00489     {
00490       if (state1 != seq[i])
00491       {
00492         state2 = seq[i];
00493         break;
00494       }
00495     }
00496     if (((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) ||
00497         ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1)))
00498     {
00499       nbT++;
00500     }
00501   }
00502   delete si;
00503   return nbT;
00504 }
00505 
00506 size_t SequenceStatistics::getNumberOfTransversions(const PolymorphismSequenceContainer& psc)
00507 {
00508   size_t nbTv = 0;
00509   ConstSiteIterator* si = new CompleteSiteContainerIterator(psc);
00510   const Site* site = 0;
00511   while (si->hasMoreSites())
00512   {
00513     site = si->nextSite();
00514     // if (SiteTools::isConstant(*site) || SiteTools::isTriplet(*site)) continue;
00515     if (SiteTools::getNumberOfDistinctCharacters(*site) != 2)
00516       continue;
00517     vector<int> seq = site->getContent();
00518     int state1 = seq[0];
00519     int state2 = seq[0];
00520     for (size_t i = 1; i < seq.size(); i++)
00521     {
00522       if (state1 != seq[i])
00523       {
00524         state2 = seq[i];
00525         break;
00526       }
00527     }
00528     if (!(((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) ||
00529           ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1))))
00530     {
00531       nbTv++;
00532     }
00533   }
00534   delete si;
00535   return nbTv;
00536 }
00537 
00538 double SequenceStatistics::getTransitionsTransversionsRatio(const PolymorphismSequenceContainer& psc) throw (Exception)
00539 {
00540   // return (double) getNumberOfTransitions(psc)/getNumberOfTransversions(psc);
00541   size_t nbT = 0;
00542   size_t nbTv = 0;
00543   ConstSiteIterator* si = new CompleteSiteContainerIterator(psc);
00544   const Site* site = 0;
00545   vector< int > state(2);
00546   while (si->hasMoreSites())
00547   {
00548     map<int, size_t> count;
00549     site = si->nextSite();
00550     SymbolListTools::getCounts(*site, count);
00551     if (count.size() != 2)
00552       continue;
00553     int i = 0;
00554     for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++)
00555     {
00556       state[i] = it->first;
00557       i++;
00558     }
00559     if (((state[0] == 0 && state[1] == 2) || (state[0] == 2 && state[1] == 0)) ||
00560         ((state[0] == 1 && state[1] == 3) || (state[0] == 3 && state[1] == 1)))
00561     {
00562       nbT++; // transitions
00563     }
00564     else
00565     {
00566       nbTv++; // transversion
00567     }
00568   }
00569   delete si;
00570   if (nbTv == 0)
00571     throw ZeroDivisionException("SequenceStatistics::getTransitionsTransversionsRatio.");
00572   return static_cast<double>(nbT) /  static_cast<double>(nbTv);
00573 }
00574 
00575 // ******************************************************************************
00576 // Synonymous and non-synonymous polymorphism
00577 // ******************************************************************************
00578 
00579 size_t SequenceStatistics::stopCodonSiteNumber(const PolymorphismSequenceContainer& psc, const GeneticCode& gCode, bool gapflag)
00580 {
00581   /*
00582    * Sylvain Gaillard 17/03/2010
00583    * What if the Alphabet is not a codon alphabet?
00584    */
00585   if (!AlphabetTools::isCodonAlphabet(psc.getAlphabet()))
00586     throw AlphabetMismatchException("SequenceStatistics::stopCodonSiteNumber(). PolymorphismSequenceContainer must be with a codon alphabet.", psc.getAlphabet());
00587 
00588   ConstSiteIterator* si = 0;
00589   if (gapflag)
00590     si = new NoGapSiteContainerIterator(psc);
00591   else
00592     si = new SimpleSiteContainerIterator(psc);
00593   size_t S = 0;
00594   const Site* site = 0;
00595   while (si->hasMoreSites())
00596   {
00597     site = si->nextSite();
00598     if (CodonSiteTools::hasStop(*site, gCode))
00599       S++;
00600   }
00601   delete si;
00602   return S;
00603 }
00604 
00605 size_t SequenceStatistics::monoSitePolymorphicCodonNumber(const PolymorphismSequenceContainer& psc, bool stopflag, bool gapflag)
00606 {
00607   ConstSiteIterator* si = 0;
00608   if (stopflag)
00609     si = new CompleteSiteContainerIterator(psc);
00610   else
00611   {
00612     if (gapflag)
00613       si = new NoGapSiteContainerIterator(psc);
00614     else
00615       si = new SimpleSiteContainerIterator(psc);
00616   }
00617   size_t S = 0;
00618   const Site* site;
00619   while (si->hasMoreSites())
00620   {
00621     site = si->nextSite();
00622     if (CodonSiteTools::isMonoSitePolymorphic(*site))
00623       S++;
00624   }
00625   delete si;
00626   return S;
00627 }
00628 
00629 size_t SequenceStatistics::synonymousPolymorphicCodonNumber(const PolymorphismSequenceContainer& psc, const GeneticCode& gc)
00630 {
00631   ConstSiteIterator* si = new CompleteSiteContainerIterator(psc);
00632   size_t S = 0;
00633   const Site* site;
00634   while (si->hasMoreSites())
00635   {
00636     site = si->nextSite();
00637     if (CodonSiteTools::isSynonymousPolymorphic(*site, gc))
00638       S++;
00639   }
00640   delete si;
00641   return S;
00642 }
00643 
00644 double SequenceStatistics::watterson75Synonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc)
00645 {
00646   double ThetaW = 0.;
00647   size_t n = psc.getNumberOfSequences();
00648   size_t S = synonymousSubstitutionsNumber(psc, gc);
00649   map<string, double> values = getUsefullValues_(n);
00650   ThetaW = static_cast<double>(S) / values["a1"];
00651   return ThetaW;
00652 }
00653 
00654 double SequenceStatistics::watterson75NonSynonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc)
00655 {
00656   double ThetaW;
00657   size_t n = psc.getNumberOfSequences();
00658   size_t S = nonSynonymousSubstitutionsNumber(psc, gc);
00659   map<string, double> values = getUsefullValues_(n);
00660   ThetaW = static_cast<double>(S) / values["a1"];
00661   return ThetaW;
00662 }
00663 
00664 double SequenceStatistics::piSynonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, bool minchange)
00665 {
00666   double S = 0.;
00667   ConstSiteIterator* si = new CompleteSiteContainerIterator(psc);
00668   const Site* site = 0;
00669   while (si->hasMoreSites())
00670   {
00671     site = si->nextSite();
00672     S += CodonSiteTools::piSynonymous(*site, gc, minchange);
00673   }
00674   delete si;
00675   return S;
00676 }
00677 
00678 double SequenceStatistics::piNonSynonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, bool minchange)
00679 {
00680   double S = 0.;
00681   ConstSiteIterator* si = new CompleteSiteContainerIterator(psc);
00682   const Site* site = 0;
00683   while (si->hasMoreSites())
00684   {
00685     site = si->nextSite();
00686     S += CodonSiteTools::piNonSynonymous(*site, gc, minchange);
00687   }
00688   delete si;
00689   return S;
00690 }
00691 
00692 double SequenceStatistics::meanSynonymousSitesNumber(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double ratio)
00693 {
00694   double S = 0.;
00695   ConstSiteIterator* si = new CompleteSiteContainerIterator(psc);
00696   const Site* site = 0;
00697   while (si->hasMoreSites())
00698   {
00699     site = si->nextSite();
00700     S += CodonSiteTools::meanNumberOfSynonymousPositions(*site, gc, ratio);
00701   }
00702   delete si;
00703   return S;
00704 }
00705 
00706 double SequenceStatistics::meanNonSynonymousSitesNumber(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double ratio)
00707 {
00708   double S = 0.;
00709   int n = 0;
00710   ConstSiteIterator* si = new CompleteSiteContainerIterator(psc);
00711   const Site* site = 0;
00712   while (si->hasMoreSites())
00713   {
00714     site = si->nextSite();
00715     n = n + 3;
00716     S += CodonSiteTools::meanNumberOfSynonymousPositions(*site, gc, ratio);
00717   }
00718   delete si;
00719   return static_cast<double>(n - S);
00720 }
00721 
00722 size_t SequenceStatistics::synonymousSubstitutionsNumber(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double freqmin)
00723 {
00724   size_t St = 0, Sns = 0;
00725   ConstSiteIterator* si = new CompleteSiteContainerIterator(psc);
00726   const Site* site = 0;
00727   while (si->hasMoreSites())
00728   {
00729     site = si->nextSite();
00730     St += CodonSiteTools::numberOfSubsitutions(*site, gc, freqmin);
00731     Sns += CodonSiteTools::numberOfNonSynonymousSubstitutions(*site, gc, freqmin);
00732   }
00733   delete si;
00734   return St - Sns;
00735 }
00736 
00737 size_t SequenceStatistics::nonSynonymousSubstitutionsNumber(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double freqmin)
00738 {
00739   size_t Sns = 0;
00740   ConstSiteIterator* si = new CompleteSiteContainerIterator(psc);
00741   const Site* site = 0;
00742   while (si->hasMoreSites())
00743   {
00744     site = si->nextSite();
00745     Sns += CodonSiteTools::numberOfNonSynonymousSubstitutions(*site, gc, freqmin);
00746   }
00747   delete si;
00748   return Sns;
00749 }
00750 
00751 vector<size_t> SequenceStatistics::fixedDifferences(const PolymorphismSequenceContainer& pscin, const PolymorphismSequenceContainer& pscout, PolymorphismSequenceContainer& psccons, const GeneticCode& gc)
00752 {
00753   ConstSiteIterator* siIn = new CompleteSiteContainerIterator(pscin);
00754   ConstSiteIterator* siOut = new CompleteSiteContainerIterator(pscout);
00755   ConstSiteIterator* siCons = new CompleteSiteContainerIterator(psccons);
00756   const Site* siteIn = 0;
00757   const Site* siteOut = 0;
00758   const Site* siteCons = 0;
00759   size_t NfixS = 0;
00760   size_t NfixA = 0;
00761   while (siIn->hasMoreSites())
00762   {
00763     siteIn = siIn->nextSite();
00764     siteOut = siOut->nextSite();
00765     siteCons = siCons->nextSite();
00766     vector<size_t> v = CodonSiteTools::fixedDifferences(*siteIn, *siteOut, siteCons->getValue(0), siteCons->getValue(1), gc);
00767     NfixS += v[0];
00768     NfixA += v[1];
00769   }
00770   vector<size_t> v(2);
00771   v[0] = NfixS;
00772   v[1] = NfixA;
00773   delete siIn;
00774   delete siOut;
00775   delete siCons;
00776   return v;
00777 }
00778 
00779 vector<size_t> SequenceStatistics::MKtable(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, const GeneticCode& gc, double freqmin)
00780 {
00781   PolymorphismSequenceContainer psctot(ingroup);
00782   for (size_t i = 0; i < outgroup.getNumberOfSequences(); i++)
00783   {
00784     psctot.addSequence(outgroup.getSequence(i));
00785     psctot.setAsOutgroupMember(i + ingroup.getNumberOfSequences());
00786   }
00787   const PolymorphismSequenceContainer* psccomplet = PolymorphismSequenceContainerTools::getCompleteSites(psctot);
00788   const PolymorphismSequenceContainer* pscin = PolymorphismSequenceContainerTools::extractIngroup(*psccomplet);
00789   const PolymorphismSequenceContainer* pscout = PolymorphismSequenceContainerTools::extractOutgroup(*psccomplet);
00790   const Sequence* consensusIn = SiteContainerTools::getConsensus(*pscin, "consensusIn");
00791   const Sequence* consensusOut = SiteContainerTools::getConsensus(*pscout, "consensusOut");
00792   PolymorphismSequenceContainer* consensus = new PolymorphismSequenceContainer(ingroup.getAlphabet());
00793   consensus->addSequence(*consensusIn);
00794   consensus->addSequence(*consensusOut);
00795   vector<size_t> u = SequenceStatistics::fixedDifferences(*pscin, *pscout, *consensus, gc);
00796   vector<size_t> v(4);
00797   v[0] = SequenceStatistics::nonSynonymousSubstitutionsNumber(*pscin, gc, freqmin);
00798   v[1] = SequenceStatistics::synonymousSubstitutionsNumber(*pscin, gc, freqmin);
00799   v[2] = u[1];
00800   v[3] = u[0];
00801   delete consensus;
00802   if (psccomplet)
00803   {
00804     delete psccomplet;
00805   }
00806   if (pscin)
00807   {
00808     delete pscin;
00809   }
00810   if (pscout)
00811   {
00812     delete pscout;
00813   }
00814   if (consensusIn)
00815   {
00816     delete consensusIn;
00817   }
00818   if (consensusOut)
00819   {
00820     delete consensusOut;
00821   }
00822   return v;
00823 }
00824 
00825 double SequenceStatistics::neutralityIndex(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, const GeneticCode& gc, double freqmin)
00826 {
00827   vector<size_t> v = SequenceStatistics::MKtable(ingroup, outgroup, gc, freqmin);
00828   if (v[1] != 0 && v[2] != 0)
00829     return static_cast<double>(v[0] * v[3]) / static_cast<double>(v[1] * v[2]);
00830   else
00831     return -1;
00832 }
00833 
00834 // ******************************************************************************
00835 // Statistical tests
00836 // ******************************************************************************
00837 
00838 double SequenceStatistics::tajimaDSS(const PolymorphismSequenceContainer& psc, bool gapflag) throw (ZeroDivisionException)
00839 {
00840   double S = static_cast<double>(polymorphicSiteNumber(psc, gapflag));
00841   if (!S)
00842     throw ZeroDivisionException("S should not be null");
00843   double tajima = tajima83(psc, gapflag);
00844   double watterson = watterson75(psc, gapflag);
00845   size_t n = psc.getNumberOfSequences();
00846   map<string, double> values = getUsefullValues_(n);
00847   // if (S == 0)
00848   //  cout << "ARG S == 0" << endl;
00849   return (tajima - watterson) / sqrt((values["e1"] * S) + (values["e2"] * S * (S - 1)));
00850 }
00851 
00852 double SequenceStatistics::tajimaDTNM(const PolymorphismSequenceContainer& psc, bool gapflag) throw (ZeroDivisionException)
00853 {
00854   double eta =  static_cast<double>(totNumberMutations(psc, gapflag));
00855   if (!eta)
00856     throw ZeroDivisionException("eta should not be null");
00857   double tajima = tajima83(psc, gapflag);
00858   size_t n = psc.getNumberOfSequences();
00859   map<string, double> values = getUsefullValues_(n);
00860   double eta_a1 = static_cast<double>(eta) / values["a1"];
00861   return (tajima - eta_a1) / sqrt((values["e1"] * eta) + (values["e2"] * eta * (eta - 1)));
00862 }
00863 
00864 double SequenceStatistics::fuliD(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, bool original) throw (ZeroDivisionException)
00865 {
00866   size_t n = ingroup.getNumberOfSequences();
00867   map<string, double> values = getUsefullValues_(n);
00868   double vD = getVD_(n, values["a1"], values["a2"], values["cn"]);
00869   double uD = getUD_(values["a1"], vD);
00870   double eta = static_cast<double>(totNumberMutations(ingroup));
00871   if (eta == 0.)
00872     throw ZeroDivisionException("eta should not be null");
00873   double etae = 0.;
00874   if (original)
00875     etae = static_cast<double>(countSingleton(outgroup));
00876   else
00877     etae = static_cast<double>(totMutationsExternalBranchs(ingroup, outgroup));  // added by Khalid 13/07/2005
00878   return (eta - (values["a1"] * etae)) / sqrt((uD * eta) + (vD * eta * eta));
00879 }
00880 
00881 double SequenceStatistics::fuliDstar(const PolymorphismSequenceContainer& group) throw (ZeroDivisionException)
00882 {
00883   size_t n = group.getNumberOfSequences();
00884   double nn = static_cast<double>(n);
00885   double _n = nn / (nn - 1.);
00886   map<string, double> values = getUsefullValues_(n);
00887   double vDs = getVDstar_(n, values["a1"], values["a2"], values["dn"]);
00888   double uDs = getUDstar_(n, values["a1"], vDs);
00889   double eta = static_cast<double>(totNumberMutations(group));
00890   if (eta == 0.)
00891     throw ZeroDivisionException("eta should not be null");
00892   double etas = static_cast<double>(countSingleton(group));
00893 
00894   // Fu & Li 1993
00895   return ((_n * eta) - (values["a1"] * etas)) / sqrt(uDs * eta + vDs * eta * eta);
00896 
00897   // Simonsen et al. 1995
00898   /*
00899      return ((eta / values["a1"]) - (etas * ((n - 1) / n))) / sqrt(uDs * eta + vDs * eta * eta);
00900    */
00901 }
00902 
00903 double SequenceStatistics::fuliF(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, bool original) throw (ZeroDivisionException)
00904 {
00905   size_t n = ingroup.getNumberOfSequences();
00906   double nn = static_cast<double>(n);
00907   map<string, double> values = getUsefullValues_(n);
00908   double pi = tajima83(ingroup, true);
00909   double vF = (values["cn"] + values["b2"] - 2. / (nn - 1.)) / (pow(values["a1"], 2) + values["a2"]);
00910   double uF = ((1. + values["b1"] - (4. * ((nn + 1.) / ((nn - 1.) * (nn - 1.)))) * (values["a1n"] - (2. * nn) / (nn + 1.))) / values["a1"]) - vF;
00911   double eta = static_cast<double>(totNumberMutations(ingroup));
00912   if (eta == 0.)
00913     throw ZeroDivisionException("eta should not be null");
00914   double etae = 0.;
00915   if (original)
00916     etae = static_cast<double>(countSingleton(outgroup));
00917   else
00918     etae = static_cast<double>(totMutationsExternalBranchs(ingroup, outgroup));  // added by Khalid 13/07/2005
00919   return (pi - etae) / sqrt(uF * eta + vF * eta * eta);
00920 }
00921 
00922 double SequenceStatistics::fuliFstar(const PolymorphismSequenceContainer& group) throw (ZeroDivisionException)
00923 {
00924   double n = static_cast<double>(group.getNumberOfSequences());
00925   map<string, double> values = getUsefullValues_(group.getNumberOfSequences());
00926   double pi = tajima83(group, true);
00927 
00928   // Fu & Li 1993
00929   //  double vFs = (values["dn"] + values["b2"] - (2. / (nn - 1.)) * (4. * values["a2"] - 6. + 8. / nn)) / (pow(values["a1"], 2) + values["a2"]);
00930   //  double uFs = (((nn / (nn - 1.)) + values["b1"] - (4. / (nn * (nn - 1.))) + 2. * ((nn + 1.) / (pow((nn - 1.), 2))) * (values["a1n"] - 2. * nn / (nn + 1.))) / values["a1"]) - vFs;
00931 
00932   // Simonsen et al. 1995
00933   double vFs = (((2 * n * n * n + 110 * n * n - 255 * n + 153) / (9 * n * n * (n - 1))) + ((2 * (n - 1) * values["a1"]) / (n * n)) - 8 * values["a2"] / n) / (pow(values["a1"], 2) + values["a2"]);
00934   double uFs = (((4 * n * n + 19 * n + 3 - 12 * (n + 1) * values["a1n"]) / (3 * n * (n - 1))) / values["a1"]) - vFs;
00935   double eta = static_cast<double>(totNumberMutations(group));
00936   if (eta == 0.)
00937     throw ZeroDivisionException("eta should not be null");
00938   double etas = static_cast<double>(countSingleton(group));
00939   // Fu & Li 1993
00940   // Simonsen et al. 1995
00941   return (pi - ((n - 1.) / n * etas)) / sqrt(uFs * eta + vFs * eta * eta);
00942 }
00943 
00944 double SequenceStatistics::FstHudson92(const PolymorphismSequenceContainer& psc, size_t id1, size_t id2)
00945 {
00946   vector<double> vdiff;
00947   double piIntra1, piIntra2, meanPiIntra, piInter, Fst;
00948 
00949   PolymorphismSequenceContainer* Pop1 = PolymorphismSequenceContainerTools::extractGroup(psc, id1);
00950   PolymorphismSequenceContainer* Pop2 = PolymorphismSequenceContainerTools::extractGroup(psc, id2);
00951 
00952   piIntra1 = SequenceStatistics::tajima83(*Pop1, false);
00953   piIntra2 = SequenceStatistics::tajima83(*Pop2, false);
00954 
00955   meanPiIntra = (piIntra1 + piIntra2) / 2;
00956 
00957   double n = 0;
00958   for (size_t i = 0; i < Pop1->getNumberOfSequences(); i++)
00959   {
00960     const Sequence& s1 = Pop1->getSequence(i);
00961     for (size_t j = 0; j < Pop2->getNumberOfSequences(); j++)
00962     {
00963       n++;
00964       const Sequence& s2 = Pop2->getSequence(j);
00965       vdiff.push_back(SiteContainerTools::computeSimilarity(s1, s2, true, "no gap", true));
00966     }
00967   }
00968   piInter = (VectorTools::sum(vdiff) / n) * static_cast<double>(psc.getNumberOfSites());
00969 
00970 
00971   Fst = 1.0 - meanPiIntra / piInter;
00972 
00973   delete Pop1;
00974   delete Pop2;
00975 
00976   return Fst;
00977 }
00978 
00979 // ******************************************************************************
00980 // Linkage disequilibrium statistics
00981 // ******************************************************************************
00982 
00983 /**********************/
00984 /* Preliminary method */
00985 /**********************/
00986 
00987 PolymorphismSequenceContainer* SequenceStatistics::generateLDContainer(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
00988 {
00989   SiteSelection ss;
00990   // Extract polymorphic site with only two alleles
00991   for (size_t i = 0; i < psc.getNumberOfSites(); i++)
00992   {
00993     if (keepsingleton)
00994     {
00995       if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)))
00996       {
00997         ss.push_back(i);
00998       }
00999     }
01000     else
01001     {
01002       if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i)))
01003       {
01004         ss.push_back(i);
01005       }
01006     }
01007   }
01008 
01009   const SiteContainer* sc = SiteContainerTools::getSelectedSites(psc, ss);
01010   Alphabet* alpha = new DNA(); // Sylvain Gaillard 17/03/2010: What if psc's Alphabet is not DNA
01011   PolymorphismSequenceContainer* ldpsc = new PolymorphismSequenceContainer(sc->getNumberOfSequences(), alpha);
01012   // Assign 1 to the more frequent and 0 to the less frequent alleles
01013   for (size_t i = 0; i < sc->getNumberOfSites(); i++)
01014   {
01015     const Site& site = sc->getSite(i);
01016     Site siteclone(site);
01017     bool deletesite = false;
01018     map<int, double> freqs;
01019     SymbolListTools::getFrequencies(siteclone, freqs);
01020     int first = 0;
01021     for (map<int, double>::iterator it = freqs.begin(); it != freqs.end(); it++)
01022     {
01023       if (it->second >= 0.5)
01024         first = it->first;
01025     }
01026     for (size_t j = 0; j < sc->getNumberOfSequences(); j++)
01027     {
01028       if (freqs[site.getValue(j)] >= 0.5 && site.getValue(j) == first)
01029       {
01030         if (freqs[site.getValue(j)] <= 1 - freqmin)
01031         {
01032           siteclone.setElement(j, 1);
01033           first = site.getValue(j);
01034         }
01035         else
01036           deletesite = true;
01037       }
01038       else
01039       {
01040         if (freqs[site.getValue(j)] >= freqmin)
01041           siteclone.setElement(j, 0);
01042         else
01043           deletesite = true;
01044       }
01045     }
01046     if (!deletesite)
01047       ldpsc->addSite(siteclone);
01048   }
01049   delete alpha;
01050   return ldpsc;
01051 }
01052 
01053 /*************************************/
01054 /* Pairwise LD and distance measures */
01055 /*************************************/
01056 
01057 Vdouble SequenceStatistics::pairwiseDistances1(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
01058 {
01059   // get Positions with sites of interest
01060   SiteSelection ss;
01061   for (size_t i = 0; i < psc.getNumberOfSites(); i++)
01062   {
01063     if (keepsingleton)
01064     {
01065       if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)))
01066       {
01067         const Site& site = psc.getSite(i);
01068         bool deletesite = false;
01069         map<int, double> freqs;
01070         SymbolListTools::getFrequencies(site, freqs);
01071         for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
01072         {
01073           if (freqs[j] >= 1 - freqmin)
01074             deletesite = true;
01075         }
01076         if (!deletesite)
01077           ss.push_back(i);
01078       }
01079     }
01080     else
01081     {
01082       if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i)))
01083       {
01084         ss.push_back(i);
01085         const Site& site = psc.getSite(i);
01086         bool deletesite = false;
01087         map<int, double> freqs;
01088         SymbolListTools::getFrequencies(site, freqs);
01089         for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
01090         {
01091           if (freqs[j] >= 1 - freqmin)
01092             deletesite = true;
01093         }
01094         if (!deletesite)
01095           ss.push_back(i);
01096       }
01097     }
01098   }
01099   // compute pairwise distances
01100   if (ss.size() < 2)
01101     throw DimensionException("SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2);
01102   Vdouble dist;
01103   for (size_t i = 0; i < ss.size() - 1; i++)
01104   {
01105     for (size_t j = i + 1; j < ss.size(); j++)
01106     {
01107       dist.push_back(static_cast<double>(ss[j] - ss[i]));
01108     }
01109   }
01110   return dist;
01111 }
01112 
01113 Vdouble SequenceStatistics::pairwiseDistances2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
01114 {
01115   SiteSelection ss;
01116   for (size_t i = 0; i < psc.getNumberOfSites(); i++)
01117   {
01118     if (keepsingleton)
01119     {
01120       if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)))
01121       {
01122         const Site& site = psc.getSite(i);
01123         bool deletesite = false;
01124         map<int, double> freqs;
01125         SymbolListTools::getFrequencies(site, freqs);
01126         for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
01127         {
01128           if (freqs[j] >= 1 - freqmin)
01129             deletesite = true;
01130         }
01131         if (!deletesite)
01132           ss.push_back(i);
01133       }
01134     }
01135     else
01136     {
01137       if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i)))
01138       {
01139         ss.push_back(i);
01140         const Site& site = psc.getSite(i);
01141         bool deletesite = false;
01142         map<int, double> freqs;
01143         SymbolListTools::getFrequencies(site, freqs);
01144         for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
01145         {
01146           if (freqs[j] >= 1 - freqmin)
01147             deletesite = true;
01148         }
01149         if (!deletesite)
01150           ss.push_back(i);
01151       }
01152     }
01153   }
01154   size_t n = ss.size();
01155   if (n < 2)
01156     throw DimensionException("SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2);
01157   Vdouble distance(n * (n - 1) / 2, 0);
01158   size_t nbsite = psc.getNumberOfSites();
01159   for (size_t k = 0; k < psc.getNumberOfSequences(); k++)
01160   {
01161     const Sequence& seq = psc.getSequence(k);
01162     SiteSelection gap, newss = ss;
01163     Vdouble dist;
01164     for (size_t i = 0; i < nbsite; i++)
01165     {
01166       if (seq.getValue(i) == -1)
01167         gap.push_back(i);
01168     }
01169     // Site positions are re-numbered to take gaps into account
01170     for (size_t i = 0; i < gap.size(); i++)
01171     {
01172       for (size_t j = 0; j < ss.size(); j++)
01173       {
01174         if (ss[j] > gap[i])
01175           newss[j]--;
01176       }
01177     }
01178     for (size_t i = 0; i < n - 1; i++)
01179     {
01180       for (size_t j = i + 1; j < n; j++)
01181       {
01182         dist.push_back(static_cast<double>(newss[j] - newss[i]));
01183       }
01184     }
01185     distance += dist;
01186   }
01187   distance = distance / static_cast<double>(psc.getNumberOfSequences());
01188   return distance;
01189 }
01190 
01191 Vdouble SequenceStatistics::pairwiseD(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
01192 {
01193   PolymorphismSequenceContainer* newpsc = SequenceStatistics::generateLDContainer(psc, keepsingleton, freqmin);
01194   Vdouble D;
01195   size_t nbsite = newpsc->getNumberOfSites();
01196   size_t nbseq = newpsc->getNumberOfSequences();
01197   if (nbsite < 2)
01198     throw DimensionException("SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
01199   if (nbseq < 2)
01200     throw DimensionException("SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
01201   for (size_t i = 0; i < nbsite - 1; i++)
01202   {
01203     for (size_t j = i + 1; j < nbsite; j++)
01204     {
01205       double haplo = 0;
01206       const Site& site1 = newpsc->getSite(i);
01207       const Site& site2 = newpsc->getSite(j);
01208       map<int, double> freq1;
01209       map<int, double> freq2;
01210       SymbolListTools::getFrequencies(site1, freq1);
01211       SymbolListTools::getFrequencies(site2, freq2);
01212       for (size_t k = 0; k < nbseq; k++)
01213       {
01214         if (site1.getValue(k) + site2.getValue(k) == 2)
01215           haplo++;
01216       }
01217       haplo = haplo / static_cast<double>(nbseq);
01218       D.push_back(std::abs(haplo - freq1[1] * freq2[1]));
01219     }
01220   }
01221   return D;
01222 }
01223 
01224 Vdouble SequenceStatistics::pairwiseDprime(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
01225 {
01226   PolymorphismSequenceContainer* newpsc = SequenceStatistics::generateLDContainer(psc, keepsingleton, freqmin);
01227   Vdouble Dprime;
01228   size_t nbsite = newpsc->getNumberOfSites();
01229   size_t nbseq = newpsc->getNumberOfSequences();
01230   if (nbsite < 2)
01231     throw DimensionException("SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
01232   if (nbseq < 2)
01233     throw DimensionException("SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
01234   for (size_t i = 0; i < nbsite - 1; i++)
01235   {
01236     for (size_t j = i + 1; j < nbsite; j++)
01237     {
01238       double haplo = 0;
01239       const Site& site1 = newpsc->getSite(i);
01240       const Site& site2 = newpsc->getSite(j);
01241       map<int, double> freq1;
01242       map<int, double> freq2;
01243       SymbolListTools::getFrequencies(site1, freq1);
01244       SymbolListTools::getFrequencies(site2, freq2);
01245       for (size_t k = 0; k < nbseq; k++)
01246       {
01247         if (site1.getValue(k) + site2.getValue(k) == 2)
01248           haplo++;
01249       }
01250       haplo = haplo / static_cast<double>(nbseq);
01251       double d, D = (haplo - freq1[1] * freq2[1]);
01252       if (D > 0)
01253       {
01254         if (freq1[1] * freq2[0] <= freq1[0] * freq2[1])
01255         {
01256           d = std::abs(D) / (freq1[1] * freq2[0]);
01257         }
01258         else
01259         {
01260           d = std::abs(D) / (freq1[0] * freq2[1]);
01261         }
01262       }
01263       else
01264       {
01265         if (freq1[1] * freq2[1] <= freq1[0] * freq2[0])
01266         {
01267           d = std::abs(D) / (freq1[1] * freq2[1]);
01268         }
01269         else
01270         {
01271           d = std::abs(D) / (freq1[0] * freq2[0]);
01272         }
01273       }
01274       Dprime.push_back(d);
01275     }
01276   }
01277   return Dprime;
01278 }
01279 
01280 Vdouble SequenceStatistics::pairwiseR2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
01281 {
01282   PolymorphismSequenceContainer* newpsc = SequenceStatistics::generateLDContainer(psc, keepsingleton, freqmin);
01283   Vdouble R2;
01284   size_t nbsite = newpsc->getNumberOfSites();
01285   size_t nbseq = newpsc->getNumberOfSequences();
01286   if (nbsite < 2)
01287     throw DimensionException("SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
01288   if (nbseq < 2)
01289     throw DimensionException("SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
01290   for (size_t i = 0; i < nbsite - 1; i++)
01291   {
01292     for (size_t j = i + 1; j < nbsite; j++)
01293     {
01294       double haplo = 0;
01295       const Site& site1 = newpsc->getSite(i);
01296       const Site& site2 = newpsc->getSite(j);
01297       map<int, double> freq1;
01298       map<int, double> freq2;
01299       SymbolListTools::getFrequencies(site1, freq1);
01300       SymbolListTools::getFrequencies(site2, freq2);
01301       for (size_t k = 0; k < nbseq; k++)
01302       {
01303         if (site1.getValue(k) + site2.getValue(k) == 2)
01304           haplo++;
01305       }
01306       haplo = haplo / static_cast<double>(nbseq);
01307       double r = ((haplo - freq1[1] * freq2[1]) * (haplo - freq1[1] * freq2[1])) / (freq1[0] * freq1[1] * freq2[0] * freq2[1]);
01308       R2.push_back(r);
01309     }
01310   }
01311   return R2;
01312 }
01313 
01314 /***********************************/
01315 /* Global LD and distance measures */
01316 /***********************************/
01317 
01318 double SequenceStatistics::meanD(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
01319 {
01320   Vdouble D = pairwiseD(psc, keepsingleton, freqmin);
01321   return VectorTools::mean<double, double>(D);
01322 }
01323 
01324 double SequenceStatistics::meanDprime(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
01325 {
01326   try
01327   {
01328     Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin);
01329     return VectorTools::mean<double, double>(Dprime);
01330   }
01331   catch (DimensionException& e)
01332   {
01333     throw e;
01334   }
01335 }
01336 
01337 double SequenceStatistics::meanR2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
01338 {
01339   try
01340   {
01341     Vdouble R2 = SequenceStatistics::pairwiseR2(psc, keepsingleton, freqmin);
01342     return VectorTools::mean<double, double>(R2);
01343   }
01344   catch (DimensionException& e)
01345   {
01346     throw e;
01347   }
01348 }
01349 
01350 double SequenceStatistics::meanDistance1(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
01351 {
01352   try
01353   {
01354     Vdouble dist = pairwiseDistances1(psc, keepsingleton, freqmin);
01355     return VectorTools::mean<double, double>(dist);
01356   }
01357   catch (DimensionException& e)
01358   {
01359     throw e;
01360   }
01361 }
01362 
01363 double SequenceStatistics::meanDistance2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
01364 {
01365   try
01366   {
01367     Vdouble dist = pairwiseDistances2(psc, keepsingleton, freqmin);
01368     return VectorTools::mean<double, double>(dist);
01369   }
01370   catch (DimensionException& e)
01371   {
01372     throw e;
01373   }
01374 }
01375 
01376 /**********************/
01377 /* Regression methods */
01378 /**********************/
01379 
01380 double SequenceStatistics::originRegressionD(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
01381 {
01382   try
01383   {
01384     Vdouble D = pairwiseD(psc, keepsingleton, freqmin) - 1;
01385     Vdouble dist;
01386     if (distance1)
01387       dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
01388     else
01389       dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
01390     return VectorTools::sum(D * dist) / VectorTools::sum(dist * dist);
01391   }
01392   catch (DimensionException& e)
01393   {
01394     throw e;
01395   }
01396 }
01397 
01398 double SequenceStatistics::originRegressionDprime(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
01399 {
01400   try
01401   {
01402     Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin) - 1;
01403     Vdouble dist;
01404     if (distance1)
01405       dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
01406     else
01407       dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
01408     return VectorTools::sum(Dprime * dist) / VectorTools::sum(dist * dist);
01409   }
01410   catch (DimensionException& e)
01411   {
01412     throw e;
01413   }
01414 }
01415 
01416 double SequenceStatistics::originRegressionR2(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
01417 {
01418   try
01419   {
01420     Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin) - 1;
01421     Vdouble dist;
01422     if (distance1)
01423       dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
01424     else
01425       dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
01426     return VectorTools::sum(R2 * dist) / VectorTools::sum(dist * dist);
01427   }
01428   catch (DimensionException& e)
01429   {
01430     throw e;
01431   }
01432 }
01433 
01434 Vdouble SequenceStatistics::linearRegressionD(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
01435 {
01436   try
01437   {
01438     Vdouble D = pairwiseD(psc, keepsingleton, freqmin);
01439     Vdouble dist;
01440     Vdouble reg(2);
01441     if (distance1)
01442       dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
01443     else
01444       dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
01445     reg[0] = VectorTools::cov<double, double>(dist, D) / VectorTools::var<double, double>(dist);
01446     reg[1] = VectorTools::mean<double, double>(D) - reg[0] * VectorTools::mean<double, double>(dist);
01447     return reg;
01448   }
01449   catch (DimensionException& e)
01450   {
01451     throw e;
01452   }
01453 }
01454 
01455 Vdouble SequenceStatistics::linearRegressionDprime(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
01456 {
01457   try
01458   {
01459     Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin);
01460     Vdouble dist;
01461     Vdouble reg(2);
01462     if (distance1)
01463       dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
01464     else
01465       dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
01466     reg[0] = VectorTools::cov<double, double>(dist, Dprime) / VectorTools::var<double, double>(dist);
01467     reg[1] = VectorTools::mean<double, double>(Dprime) - reg[0] * VectorTools::mean<double, double>(dist);
01468     return reg;
01469   }
01470   catch (DimensionException& e)
01471   {
01472     throw e;
01473   }
01474 }
01475 
01476 Vdouble SequenceStatistics::linearRegressionR2(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
01477 {
01478   try
01479   {
01480     Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin);
01481     Vdouble dist;
01482     Vdouble reg(2);
01483     if (distance1)
01484       dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
01485     else
01486       dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
01487     reg[0] = VectorTools::cov<double, double>(dist, R2) / VectorTools::var<double, double>(dist);
01488     reg[1] = VectorTools::mean<double, double>(R2) - reg[0] * VectorTools::mean<double, double>(dist);
01489     return reg;
01490   }
01491   catch (DimensionException& e)
01492   {
01493     throw e;
01494   }
01495 }
01496 
01497 double SequenceStatistics::inverseRegressionR2(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
01498 {
01499   try
01500   {
01501     Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin);
01502     Vdouble unit(R2.size(), 1);
01503     Vdouble R2transformed = unit / R2 - 1;
01504     Vdouble dist;
01505     if (distance1)
01506       dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
01507     else
01508       dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
01509     return VectorTools::sum(R2transformed * dist) / VectorTools::sum(dist * dist);
01510   }
01511   catch (DimensionException& e)
01512   {
01513     throw e;
01514   }
01515 }
01516 
01517 /**********************/
01518 /*   Hudson method    */
01519 /**********************/
01520 
01521 double SequenceStatistics::hudson87(const PolymorphismSequenceContainer& psc, double precision, double cinf, double csup)
01522 {
01523   double left = leftHandHudson_(psc);
01524   size_t n = psc.getNumberOfSequences();
01525   double dif = 1;
01526   double c1 = cinf;
01527   double c2 = csup;
01528   if (SequenceStatistics::polymorphicSiteNumber(psc) < 2)
01529     return -1;
01530   if (rightHandHudson_(c1, n) < left)
01531     return cinf;
01532   if (rightHandHudson_(c2, n) > left)
01533     return csup;
01534   while (dif > precision)
01535   {
01536     if (rightHandHudson_((c1 + c2) / 2, n) > left)
01537       c1 = (c1 + c2) / 2;
01538     else
01539       c2 = (c1 + c2) / 2;
01540     dif = std::abs(2 * (c1 - c2) / (c1 + c2));
01541   }
01542   return (c1 + c2) / 2;
01543 }
01544 
01545 /*****************/
01546 /* Tests methods */
01547 /*****************/
01548 
01549 void SequenceStatistics::testUsefullValues(std::ostream& s, size_t n)
01550 {
01551   map<string, double> v = getUsefullValues_(n);
01552   double vD = getVD_(n, v["a1"], v["a2"], v["cn"]);
01553   double uD = getUD_(v["a1"], vD);
01554   double vDs = getVDstar_(n, v["a1"], v["a2"], v["dn"]);
01555   double uDs = getUDstar_(n, v["a1"], vDs);
01556 
01557   s << n << "\t";
01558   s << v["a1"] << "\t";
01559   s << v["a2"] << "\t";
01560   s << v["a1n"] << "\t";
01561   s << v["b1"] << "\t";
01562   s << v["b2"] << "\t";
01563   s << v["c1"] << "\t";
01564   s << v["c2"] << "\t";
01565   s << v["cn"] << "\t";
01566   s << v["dn"] << "\t";
01567   s << v["e1"] << "\t";
01568   s << v["e2"] << "\t";
01569   s << uD << "\t";
01570   s << vD << "\t";
01571   s << uDs << "\t";
01572   s << vDs << endl;
01573 }
01574 
01575 // ******************************************************************************
01576 // Private methods
01577 // ******************************************************************************
01578 
01579 size_t SequenceStatistics::getMutationNumber_(const Site& site)
01580 {
01581   size_t tmp_count = 0;
01582   map<int, size_t> states_count;
01583   SymbolListTools::getCounts(site, states_count);
01584 
01585   for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
01586   {
01587     if (it->first >= 0)
01588       tmp_count++;
01589   }
01590   if (tmp_count > 0)
01591     tmp_count--;
01592   return tmp_count;
01593 }
01594 
01595 size_t SequenceStatistics::getSingletonNumber_(const Site& site)
01596 {
01597   size_t nus = 0;
01598   map<int, size_t> states_count;
01599   SymbolListTools::getCounts(site, states_count);
01600   for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
01601   {
01602     if (it->second == 1)
01603       nus++;
01604   }
01605   return nus;
01606 }
01607 
01608 size_t SequenceStatistics::getDerivedSingletonNumber_(const Site& site_in, const Site& site_out)
01609 {
01610   size_t nus = 0;
01611   map<int, size_t> states_count;
01612   map<int, size_t> outgroup_states_count;
01613   SymbolListTools::getCounts(site_in, states_count);
01614   SymbolListTools::getCounts(site_out, outgroup_states_count);
01615   // if there is more than one variant in the outgroup we will not be able to recover the ancestral state
01616   if (outgroup_states_count.size() == 1)
01617   {
01618     for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
01619     {
01620       if (it->second == 1)
01621       {
01622         if (outgroup_states_count.find(it->first) == outgroup_states_count.end())
01623           nus++;
01624       }
01625     }
01626   }
01627   return nus;
01628 }
01629 
01630 std::map<std::string, double> SequenceStatistics::getUsefullValues_(size_t n)
01631 {
01632   double nn = static_cast<double>(n);
01633   map<string, double> values;
01634   values["a1"] = 0.;
01635   values["a2"] = 0.;
01636   values["a1n"] = 0.;
01637   values["b1"] = 0.;
01638   values["b2"] = 0.;
01639   values["c1"] = 0.;
01640   values["c2"] = 0.;
01641   values["cn"] = 0.;
01642   values["dn"] = 0.;
01643   values["e1"] = 0.;
01644   values["e2"] = 0.;
01645   if (n > 1)
01646   {
01647     for (double i = 1; i < nn; i++)
01648     {
01649       values["a1"] += 1. / i;
01650       values["a2"] += 1. / (i * i);
01651     }
01652     values["a1n"] = values["a1"] + (1. / nn);
01653     values["b1"] = (nn + 1.) / (3. * (nn - 1.));
01654     values["b2"] = 2. * ((nn * nn) + nn + 3.) / (9. * nn * (nn - 1.));
01655     values["c1"] = values["b1"] - (1. / values["a1"]);
01656     values["c2"] = values["b2"] - ((nn + 2.) / (values["a1"] * nn)) + (values["a2"] / (values["a1"] * values["a1"]));
01657     if (n == 2)
01658     {
01659       values["cn"] = 1.;
01660       values["dn"] = 2.;
01661     }
01662     else
01663     {
01664       values["cn"] = 2. * ((nn * values["a1"]) - (2. * (nn - 1.))) / ((nn - 1.) * (nn - 2.));
01665       values["dn"] =
01666         values["cn"]
01667         + ((nn - 2.) / ((nn - 1.) * (nn - 1.)))
01668         + (2. / (nn - 1.))
01669         * ((3. / 2.) - (((2. * values["a1n"]) - 3.) / (nn - 2.)) - (1. / nn));
01670     }
01671     values["e1"] = values["c1"] / values["a1"];
01672     values["e2"] = values["c2"] / ((values["a1"] * values["a1"]) + values["a2"]);
01673   }
01674   return values;
01675 }
01676 
01677 double SequenceStatistics::getVD_(size_t n, double a1, double a2, double cn)
01678 {
01679   double nn = static_cast<double>(n);
01680   if (n < 3)
01681     return 0.;
01682   double vD = 1. + ((a1 * a1) / (a2 + (a1 * a1))) * (cn - ((nn + 1.) / (nn - 1.)));
01683   return vD;
01684 }
01685 
01686 double SequenceStatistics::getUD_(double a1, double vD)
01687 {
01688   return a1 - 1. - vD;
01689 }
01690 
01691 double SequenceStatistics::getVDstar_(size_t n, double a1, double a2, double dn)
01692 {
01693   double denom = (a1 * a1) + a2;
01694   if (n < 3 || denom == 0.)
01695     return 0.;
01696   double nn = static_cast<double>(n);
01697   double nnn = nn / (nn - 1.);
01698   // Fu & Li 1993
01699   double vDs = (
01700     (nnn * nnn * a2)
01701     + (a1 * a1 * dn)
01702     - (2. * (nn * a1 * (a1 + 1)) / ((nn - 1.) * (nn - 1.)))
01703     )
01704                /
01705                denom;
01706   // Simonsen et al. 1995
01707   /*
01708      double vDs = (
01709       (values["a2"] / pow(values["a1"], 2))
01710       - (2./nn) * (1. + 1./values["a1"] - values["a1"] + values["a1"]/nn)
01711       - 1./(nn*nn)
01712       )
01713       /
01714       (pow(values["a1"], 2) + values["a2"]);
01715    */
01716   return vDs;
01717 }
01718 
01719 double SequenceStatistics::getUDstar_(size_t n, double a1, double vDs)
01720 {
01721   if (n < 3)
01722     return 0.;
01723   double nn = static_cast<double>(n);
01724   double nnn = nn / (nn - 1.);
01725   // Fu & Li 1993
01726   double uDs = (nnn * (a1 - nnn)) - vDs;
01727   // Simonsen et al. 1995
01728   /*
01729      double uDs = (((nn - 1.)/nn - 1./values["a1"]) / values["a1"]) - vDs;
01730    */
01731   return uDs;
01732 }
01733 
01734 double SequenceStatistics::leftHandHudson_(const PolymorphismSequenceContainer& psc)
01735 {
01736   PolymorphismSequenceContainer* newpsc = PolymorphismSequenceContainerTools::getCompleteSites(psc);
01737   size_t nbseq = newpsc->getNumberOfSequences();
01738   double S1 = 0;
01739   double S2 = 0;
01740   for (size_t i = 0; i < nbseq - 1; i++)
01741   {
01742     for (size_t j = i + 1; j < nbseq; j++)
01743     {
01744       SequenceSelection ss(2);
01745       ss[0] = i;
01746       ss[1] = j;
01747       PolymorphismSequenceContainer* psc2 = PolymorphismSequenceContainerTools::getSelectedSequences(*newpsc, ss);
01748       S1 += SequenceStatistics::watterson75(*psc2, true);
01749       S2 += SequenceStatistics::watterson75(*psc2, true) * SequenceStatistics::watterson75(*psc2, true);
01750       delete psc2;
01751     }
01752   }
01753   double Sk = (2 * S2 - pow(2 * S1 / static_cast<double>(nbseq), 2.)) / pow(nbseq, 2.);
01754   double H = SequenceStatistics::heterozygosity(*newpsc);
01755   double H2 = SequenceStatistics::squaredHeterozygosity(*newpsc);
01756   delete newpsc;
01757   return static_cast<double>(Sk - H + H2) / pow(H * static_cast<double>(nbseq) / static_cast<double>(nbseq - 1), 2.);
01758 }
01759 
01760 double SequenceStatistics::rightHandHudson_(double c, size_t n)
01761 {
01762   double nn = static_cast<double>(n);
01763   return 1. / (97. * pow(c, 2.) * pow(nn, 3.)) * ((nn - 1.) * (97. * (c * (4. + (c - 2. * nn) * nn) + (-2. * (7. + c) + 4. * nn + (c - 1.) * pow(nn, 2.)) * log((18. + c * (13. + c)) / 18.)) + sqrt(97.) * (110. + nn * (49. * nn - 52.) + c * (2. + nn * (15. * nn - 8.))) * log(-1. + (72. + 26. * c) / (36. + 13. * c - c * sqrt(97.)))));
01764 }
01765 
 All Classes Namespaces Files Functions Variables Typedefs Friends