|
bpp-popgen
2.1.0
|
00001 // 00002 // File SequenceStatistics.cpp 00003 // Authors: Eric Bazin 00004 // Sylvain Gailard 00005 // Khalid Belkhir 00006 // Benoit Nabholz 00007 // Created on: Wed Aug 04 2004 00008 // 00009 00010 /* 00011 Copyright or © or Copr. Bio++ Development Team, (November 17, 2004) 00012 00013 This software is a computer program whose purpose is to provide classes 00014 for population genetics analysis. 00015 00016 This software is governed by the CeCILL license under French law and 00017 abiding by the rules of distribution of free software. You can use, 00018 modify and/ or redistribute the software under the terms of the CeCILL 00019 license as circulated by CEA, CNRS and INRIA at the following URL 00020 "http://www.cecill.info". 00021 00022 As a counterpart to the access to the source code and rights to copy, 00023 modify and redistribute granted by the license, users are provided only 00024 with a limited warranty and the software's author, the holder of the 00025 economic rights, and the successive licensors have only limited 00026 liability. 00027 00028 In this respect, the user's attention is drawn to the risks associated 00029 with loading, using, modifying and/or developing or reproducing the 00030 software by the user in light of its specific status of free software, 00031 that may mean that it is complicated to manipulate, and that also 00032 therefore means that it is reserved for developers and experienced 00033 professionals having in-depth computer knowledge. Users are therefore 00034 encouraged to load and test the software's suitability as regards their 00035 requirements in conditions enabling the security of their systems and/or 00036 data to be ensured and, more generally, to use and operate it in the 00037 same conditions as regards security. 00038 00039 The fact that you are presently reading this means that you have had 00040 knowledge of the CeCILL license and that you accept its terms. 00041 */ 00042 00043 #include "SequenceStatistics.h" // class's header file 00044 #include "PolymorphismSequenceContainerTools.h" 00045 #include "PolymorphismSequenceContainer.h" 00046 00047 // From the STL: 00048 #include <ctype.h> 00049 #include <cmath> 00050 #include <iostream> 00051 #include <vector> 00052 00053 using namespace std; 00054 00055 // From SeqLib: 00056 #include <Bpp/Seq/Site.h> 00057 #include <Bpp/Seq/SiteTools.h> 00058 #include <Bpp/Seq/StringSequenceTools.h> 00059 #include <Bpp/Seq/CodonSiteTools.h> 00060 #include <Bpp/Seq/Alphabet/DNA.h> 00061 #include <Bpp/Seq/Alphabet/AlphabetTools.h> 00062 00063 #include <Bpp/Numeric/VectorTools.h> 00064 #include <Bpp/Numeric/VectorExceptions.h> 00065 00066 using namespace bpp; 00067 00068 // ****************************************************************************** 00069 // Basic statistics 00070 // ****************************************************************************** 00071 00072 size_t SequenceStatistics::polymorphicSiteNumber(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown) 00073 { 00074 size_t S = 0; 00075 const Site* site = 0; 00076 ConstSiteIterator* si = 0; 00077 if (gapflag) 00078 si = new CompleteSiteContainerIterator(psc); 00079 else 00080 si = new SimpleSiteContainerIterator(psc); 00081 while (si->hasMoreSites()) 00082 { 00083 site = si->nextSite(); 00084 if (!SiteTools::isConstant(*site, ignoreUnknown)) 00085 { 00086 S++; 00087 } 00088 } 00089 delete si; 00090 return S; 00091 } 00092 00093 size_t SequenceStatistics::parsimonyInformativeSiteNumber(const PolymorphismSequenceContainer& psc, bool gapflag) 00094 { 00095 ConstSiteIterator* si = 0; 00096 if (gapflag) 00097 si = new CompleteSiteContainerIterator(psc); 00098 else 00099 si = new SimpleSiteContainerIterator(psc); 00100 size_t S = 0; 00101 const Site* site = 0; 00102 while (si->hasMoreSites()) 00103 { 00104 site = si->nextSite(); 00105 if (SiteTools::isParsimonyInformativeSite(*site)) 00106 { 00107 S++; 00108 } 00109 } 00110 delete si; 00111 return S; 00112 } 00113 00114 size_t SequenceStatistics::countSingleton(const PolymorphismSequenceContainer& psc, bool gapflag) 00115 { 00116 size_t nus = 0; 00117 const Site* site = 0; 00118 ConstSiteIterator* si = 0; 00119 if (gapflag) 00120 si = new CompleteSiteContainerIterator(psc); 00121 else 00122 si = new SimpleSiteContainerIterator(psc); 00123 while (si->hasMoreSites()) 00124 { 00125 site = si->nextSite(); 00126 nus += getSingletonNumber_(*site); 00127 } 00128 delete si; 00129 return nus; 00130 } 00131 00132 size_t SequenceStatistics::tripletNumber(const PolymorphismSequenceContainer& psc, bool gapflag) 00133 { 00134 ConstSiteIterator* si = 0; 00135 if (gapflag) 00136 si = new CompleteSiteContainerIterator(psc); 00137 else 00138 si = new SimpleSiteContainerIterator(psc); 00139 int S = 0; 00140 const Site* site = 0; 00141 while (si->hasMoreSites()) 00142 { 00143 site = si->nextSite(); 00144 if (SiteTools::isTriplet(*site)) 00145 { 00146 S++; 00147 } 00148 } 00149 00150 delete si; 00151 return S; 00152 } 00153 00154 size_t SequenceStatistics::totNumberMutations(const PolymorphismSequenceContainer& psc, bool gapflag) 00155 { 00156 size_t tnm = 0; 00157 const Site* site = 0; 00158 ConstSiteIterator* si = 0; 00159 if (gapflag) 00160 si = new CompleteSiteContainerIterator(psc); 00161 else 00162 si = new SimpleSiteContainerIterator(psc); 00163 while (si->hasMoreSites()) 00164 { 00165 site = si->nextSite(); 00166 tnm += getMutationNumber_(*site); 00167 } 00168 delete si; 00169 return tnm; 00170 } 00171 00172 size_t SequenceStatistics::totMutationsExternalBranchs( 00173 const PolymorphismSequenceContainer& ing, 00174 const PolymorphismSequenceContainer& outg) throw (Exception) 00175 { 00176 if (ing.getNumberOfSites() != outg.getNumberOfSites()) 00177 throw Exception("ing and outg must have the same size"); 00178 size_t nmuts = 0; 00179 const Site* site_in = 0; 00180 const Site* site_out = 0; 00181 ConstSiteIterator* si = 0; 00182 ConstSiteIterator* so = 0; 00183 si = new SimpleSiteContainerIterator(ing); 00184 so = new SimpleSiteContainerIterator(outg); 00185 while (si->hasMoreSites()) 00186 { 00187 site_in = si->nextSite(); 00188 site_out = so->nextSite(); 00189 // use fully resolved sites 00190 if (SiteTools::isComplete(*site_in) && SiteTools::isComplete(*site_out)) 00191 nmuts += getDerivedSingletonNumber_(*site_in, *site_out); // singletons that are not in outgroup 00192 } 00193 delete si; 00194 delete so; 00195 return nmuts; 00196 } 00197 00198 double SequenceStatistics::heterozygosity(const PolymorphismSequenceContainer& psc, bool gapflag) 00199 { 00200 ConstSiteIterator* si = 0; 00201 const Site* site = 0; 00202 if (gapflag) 00203 si = new CompleteSiteContainerIterator(psc); 00204 else 00205 si = new SimpleSiteContainerIterator(psc); 00206 double S = 0; 00207 while (si->hasMoreSites()) 00208 { 00209 site = si->nextSite(); 00210 S += SiteTools::heterozygosity(*site); 00211 } 00212 delete si; 00213 return S; 00214 } 00215 00216 double SequenceStatistics::squaredHeterozygosity(const PolymorphismSequenceContainer& psc, bool gapflag) 00217 { 00218 ConstSiteIterator* si = 0; 00219 const Site* site = 0; 00220 if (gapflag) 00221 si = new CompleteSiteContainerIterator(psc); 00222 else 00223 si = new SimpleSiteContainerIterator(psc); 00224 double S = 0; 00225 while (si->hasMoreSites()) 00226 { 00227 site = si->nextSite(); 00228 double h = SiteTools::heterozygosity(*site); 00229 S += h * h; 00230 } 00231 delete si; 00232 return S; 00233 } 00234 00235 // ****************************************************************************** 00236 // GC statistics 00237 // ****************************************************************************** 00238 00239 double SequenceStatistics::gcContent(const PolymorphismSequenceContainer& psc) 00240 { 00241 map<int, double> freqs; 00242 SequenceContainerTools::getFrequencies(psc, freqs); 00243 const Alphabet* alpha = psc.getAlphabet(); 00244 return (freqs[alpha->charToInt("C")] + freqs[alpha->charToInt("G")]) / (freqs[alpha->charToInt("A")] + freqs[alpha->charToInt("C")] + freqs[alpha->charToInt("G")] + freqs[alpha->charToInt("T")]); 00245 } 00246 00247 std::vector<size_t> SequenceStatistics::gcPolymorphism(const PolymorphismSequenceContainer& psc, bool gapflag) 00248 { 00249 size_t nbMut = 0; 00250 size_t nbGC = 0; 00251 const size_t nbSeq = psc.getNumberOfSequences(); 00252 vector<size_t> vect(2); 00253 const Site* site = 0; 00254 ConstSiteIterator* si = 0; 00255 if (gapflag) 00256 si = new CompleteSiteContainerIterator(psc); 00257 else 00258 si = new NoGapSiteContainerIterator(psc); 00259 while (si->hasMoreSites()) 00260 { 00261 site = si->nextSite(); 00262 if (!SiteTools::isConstant(*site)) 00263 { 00264 long double freqGC = SymbolListTools::getGCContent(*site); 00265 /* 00266 * Sylvain Gaillard 15/03/2010: realy unclear ... 00267 * freqGC is always in [0,1] then why testing it ? 00268 * why casting double into size_t ? 00269 * is that method used by someone ? 00270 */ 00271 if (freqGC > 0 && freqGC < 1) 00272 { 00273 nbMut += static_cast<size_t>(nbSeq); 00274 long double adGC = freqGC * nbSeq; 00275 nbGC += static_cast<size_t>(adGC); 00276 } 00277 } 00278 } 00279 vect[0] = nbMut; 00280 vect[1] = nbGC; 00281 delete si; 00282 return vect; 00283 } 00284 00285 // ****************************************************************************** 00286 // Diversity statistics 00287 // ****************************************************************************** 00288 00289 double SequenceStatistics::watterson75(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown) 00290 { 00291 double ThetaW; 00292 size_t n = psc.getNumberOfSequences(); 00293 size_t S = polymorphicSiteNumber(psc, gapflag, ignoreUnknown); 00294 map<string, double> values = getUsefullValues_(n); 00295 ThetaW = (double) S / values["a1"]; 00296 return ThetaW; 00297 } 00298 00299 double SequenceStatistics::tajima83(const PolymorphismSequenceContainer& psc, bool gapflag) 00300 { 00301 size_t alphabet_size = (psc.getAlphabet())->getSize(); 00302 const Site* site = 0; 00303 ConstSiteIterator* si = 0; 00304 double value2 = 0.; 00305 if (gapflag) 00306 si = new CompleteSiteContainerIterator(psc); 00307 else 00308 si = new SimpleSiteContainerIterator(psc); 00309 while (si->hasMoreSites()) 00310 { 00311 site = si->nextSite(); 00312 if (!SiteTools::isConstant(*site)) 00313 { 00314 double value = 0.; 00315 map<int, size_t> count; 00316 SymbolListTools::getCounts(*site, count); 00317 map<int, size_t> tmp_k; 00318 size_t tmp_n = 0; 00319 for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++) 00320 { 00321 if (it->first >= 0 && it->first < static_cast<int>(alphabet_size)) 00322 { 00323 tmp_k[it->first] = it->second * (it->second - 1); 00324 tmp_n += it->second; 00325 } 00326 } 00327 if (tmp_n == 0 || tmp_n == 1) 00328 continue; 00329 for (map<int, size_t>::iterator it = tmp_k.begin(); it != tmp_k.end(); it++) 00330 { 00331 value += static_cast<double>(it->second) / static_cast<double>(tmp_n * (tmp_n - 1)); 00332 } 00333 value2 += 1. - value; 00334 } 00335 } 00336 delete si; 00337 return value2; 00338 } 00339 00340 double SequenceStatistics::FayWu2000(const PolymorphismSequenceContainer& psc, const Sequence& ancestralSites) 00341 { 00342 if (psc.getNumberOfSites() != ancestralSites.size()) 00343 throw Exception("SequenceStatistics::FayWu2000: ancestralSites and psc don't have the same size!!!'" ); 00344 00345 const Sequence& tmps = psc.getSequence(0); 00346 00347 size_t alphabet_size = (psc.getAlphabet())->getSize(); 00348 double value = 0.; 00349 for (size_t i = 0; i < psc.getNumberOfSites(); i++) 00350 { 00351 const Site& site = psc.getSite(i); 00352 string ancB = ancestralSites.getChar(i); 00353 int ancV = ancestralSites.getValue(i); 00354 00355 if (!SiteTools::isConstant(site) || tmps.getChar(i) != ancB) 00356 { 00357 if (ancV < 0) 00358 continue; 00359 00360 map<int, size_t> count; 00361 SymbolListTools::getCounts(site, count); 00362 map<int, size_t> tmp_k; 00363 size_t tmp_n = 0; 00364 for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++) 00365 { 00366 if (it->first >= 0 && it->first < static_cast<int>(alphabet_size)) 00367 { 00368 /* if derived allele */ 00369 if (it->first != ancV) 00370 { 00371 tmp_k[it->first] = 2 * it->second * it->second; 00372 } 00373 tmp_n += it->second; 00374 } 00375 } 00376 if (tmp_n == 0 || tmp_n == 1) 00377 continue; 00378 for (map<int, size_t>::iterator it = tmp_k.begin(); it != tmp_k.end(); it++) 00379 { 00380 value += static_cast<double>(it->second) / static_cast<double>(tmp_n * (tmp_n - 1)); 00381 } 00382 } 00383 } 00384 return value; 00385 } 00386 00387 size_t SequenceStatistics::DVK(const PolymorphismSequenceContainer& psc, bool gapflag) 00388 { 00389 /* 00390 * Sylvain Gaillard 17/03/2010: 00391 * This implementation uses unneeded SequenceContainer recopy and works on 00392 * string. It needs to be improved. 00393 */ 00394 PolymorphismSequenceContainer* sc = 0; 00395 if (gapflag) 00396 sc = PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc); 00397 else 00398 sc = new PolymorphismSequenceContainer(psc); 00399 // int K = 0; 00400 vector<string> pscvector; 00401 pscvector.push_back(sc->toString(0)); 00402 // K++; 00403 for (size_t i = 1; i < sc->getNumberOfSequences(); i++) 00404 { 00405 bool uniq = true; 00406 string query = sc->toString(i); 00407 for (vector<string>::iterator it = pscvector.begin(); it != pscvector.end(); it++) 00408 { 00409 if (query.compare(*it) == 0) 00410 { 00411 uniq = false; 00412 break; 00413 } 00414 } 00415 if (uniq) 00416 { 00417 // K++; 00418 pscvector.push_back(query); 00419 } 00420 } 00421 delete sc; 00422 // return K; 00423 return pscvector.size(); 00424 } 00425 00426 double SequenceStatistics::DVH(const PolymorphismSequenceContainer& psc, bool gapflag) 00427 { 00428 /* 00429 * Sylvain Gaillard 17/03/2010: 00430 * This implementation uses unneeded SequenceContainer recopy and works on 00431 * string. It needs to be improved. 00432 */ 00433 PolymorphismSequenceContainer* sc = 0; 00434 if (gapflag) 00435 sc = PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc); 00436 else 00437 sc = new PolymorphismSequenceContainer(psc); 00438 double H = 0.; 00439 size_t nbSeq; 00440 vector<string> pscvector; 00441 vector<size_t> effvector; 00442 pscvector.push_back(sc->toString(0)); 00443 effvector.push_back(sc->getSequenceCount(0)); 00444 nbSeq = sc->getSequenceCount(0); 00445 for (size_t i = 1; i < sc->getNumberOfSequences(); i++) 00446 { 00447 nbSeq += sc->getSequenceCount(i); 00448 bool uniq = true; 00449 string query = sc->toString(i); 00450 for (size_t j = 0; j < pscvector.size(); j++) 00451 { 00452 if (query.compare(pscvector[j]) == 0) 00453 { 00454 effvector[j] += sc->getSequenceCount(i); 00455 uniq = false; 00456 break; 00457 } 00458 } 00459 if (uniq) 00460 { 00461 pscvector.push_back(query); 00462 effvector.push_back(sc->getSequenceCount(i)); 00463 } 00464 } 00465 for (size_t i = 0; i < effvector.size(); i++) 00466 { 00467 H -= (static_cast<double>(effvector[i]) / static_cast<double>(nbSeq)) * ( static_cast<double>(effvector[i]) / static_cast<double>(nbSeq)); 00468 } 00469 H += 1.; 00470 delete sc; 00471 return H; 00472 } 00473 00474 size_t SequenceStatistics::getNumberOfTransitions(const PolymorphismSequenceContainer& psc) 00475 { 00476 size_t nbT = 0; 00477 ConstSiteIterator* si = new CompleteSiteContainerIterator(psc); 00478 const Site* site = 0; 00479 while (si->hasMoreSites()) 00480 { 00481 site = si->nextSite(); 00482 // if (SiteTools::isConstant(*site) || SiteTools::isTriplet(*site)) continue; 00483 if (SiteTools::getNumberOfDistinctCharacters(*site) != 2) 00484 continue; 00485 vector<int> seq = site->getContent(); 00486 int state1 = seq[0]; 00487 int state2 = seq[0]; 00488 for (size_t i = 1; i < seq.size(); i++) 00489 { 00490 if (state1 != seq[i]) 00491 { 00492 state2 = seq[i]; 00493 break; 00494 } 00495 } 00496 if (((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) || 00497 ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1))) 00498 { 00499 nbT++; 00500 } 00501 } 00502 delete si; 00503 return nbT; 00504 } 00505 00506 size_t SequenceStatistics::getNumberOfTransversions(const PolymorphismSequenceContainer& psc) 00507 { 00508 size_t nbTv = 0; 00509 ConstSiteIterator* si = new CompleteSiteContainerIterator(psc); 00510 const Site* site = 0; 00511 while (si->hasMoreSites()) 00512 { 00513 site = si->nextSite(); 00514 // if (SiteTools::isConstant(*site) || SiteTools::isTriplet(*site)) continue; 00515 if (SiteTools::getNumberOfDistinctCharacters(*site) != 2) 00516 continue; 00517 vector<int> seq = site->getContent(); 00518 int state1 = seq[0]; 00519 int state2 = seq[0]; 00520 for (size_t i = 1; i < seq.size(); i++) 00521 { 00522 if (state1 != seq[i]) 00523 { 00524 state2 = seq[i]; 00525 break; 00526 } 00527 } 00528 if (!(((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) || 00529 ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1)))) 00530 { 00531 nbTv++; 00532 } 00533 } 00534 delete si; 00535 return nbTv; 00536 } 00537 00538 double SequenceStatistics::getTransitionsTransversionsRatio(const PolymorphismSequenceContainer& psc) throw (Exception) 00539 { 00540 // return (double) getNumberOfTransitions(psc)/getNumberOfTransversions(psc); 00541 size_t nbT = 0; 00542 size_t nbTv = 0; 00543 ConstSiteIterator* si = new CompleteSiteContainerIterator(psc); 00544 const Site* site = 0; 00545 vector< int > state(2); 00546 while (si->hasMoreSites()) 00547 { 00548 map<int, size_t> count; 00549 site = si->nextSite(); 00550 SymbolListTools::getCounts(*site, count); 00551 if (count.size() != 2) 00552 continue; 00553 int i = 0; 00554 for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++) 00555 { 00556 state[i] = it->first; 00557 i++; 00558 } 00559 if (((state[0] == 0 && state[1] == 2) || (state[0] == 2 && state[1] == 0)) || 00560 ((state[0] == 1 && state[1] == 3) || (state[0] == 3 && state[1] == 1))) 00561 { 00562 nbT++; // transitions 00563 } 00564 else 00565 { 00566 nbTv++; // transversion 00567 } 00568 } 00569 delete si; 00570 if (nbTv == 0) 00571 throw ZeroDivisionException("SequenceStatistics::getTransitionsTransversionsRatio."); 00572 return static_cast<double>(nbT) / static_cast<double>(nbTv); 00573 } 00574 00575 // ****************************************************************************** 00576 // Synonymous and non-synonymous polymorphism 00577 // ****************************************************************************** 00578 00579 size_t SequenceStatistics::stopCodonSiteNumber(const PolymorphismSequenceContainer& psc, const GeneticCode& gCode, bool gapflag) 00580 { 00581 /* 00582 * Sylvain Gaillard 17/03/2010 00583 * What if the Alphabet is not a codon alphabet? 00584 */ 00585 if (!AlphabetTools::isCodonAlphabet(psc.getAlphabet())) 00586 throw AlphabetMismatchException("SequenceStatistics::stopCodonSiteNumber(). PolymorphismSequenceContainer must be with a codon alphabet.", psc.getAlphabet()); 00587 00588 ConstSiteIterator* si = 0; 00589 if (gapflag) 00590 si = new NoGapSiteContainerIterator(psc); 00591 else 00592 si = new SimpleSiteContainerIterator(psc); 00593 size_t S = 0; 00594 const Site* site = 0; 00595 while (si->hasMoreSites()) 00596 { 00597 site = si->nextSite(); 00598 if (CodonSiteTools::hasStop(*site, gCode)) 00599 S++; 00600 } 00601 delete si; 00602 return S; 00603 } 00604 00605 size_t SequenceStatistics::monoSitePolymorphicCodonNumber(const PolymorphismSequenceContainer& psc, bool stopflag, bool gapflag) 00606 { 00607 ConstSiteIterator* si = 0; 00608 if (stopflag) 00609 si = new CompleteSiteContainerIterator(psc); 00610 else 00611 { 00612 if (gapflag) 00613 si = new NoGapSiteContainerIterator(psc); 00614 else 00615 si = new SimpleSiteContainerIterator(psc); 00616 } 00617 size_t S = 0; 00618 const Site* site; 00619 while (si->hasMoreSites()) 00620 { 00621 site = si->nextSite(); 00622 if (CodonSiteTools::isMonoSitePolymorphic(*site)) 00623 S++; 00624 } 00625 delete si; 00626 return S; 00627 } 00628 00629 size_t SequenceStatistics::synonymousPolymorphicCodonNumber(const PolymorphismSequenceContainer& psc, const GeneticCode& gc) 00630 { 00631 ConstSiteIterator* si = new CompleteSiteContainerIterator(psc); 00632 size_t S = 0; 00633 const Site* site; 00634 while (si->hasMoreSites()) 00635 { 00636 site = si->nextSite(); 00637 if (CodonSiteTools::isSynonymousPolymorphic(*site, gc)) 00638 S++; 00639 } 00640 delete si; 00641 return S; 00642 } 00643 00644 double SequenceStatistics::watterson75Synonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc) 00645 { 00646 double ThetaW = 0.; 00647 size_t n = psc.getNumberOfSequences(); 00648 size_t S = synonymousSubstitutionsNumber(psc, gc); 00649 map<string, double> values = getUsefullValues_(n); 00650 ThetaW = static_cast<double>(S) / values["a1"]; 00651 return ThetaW; 00652 } 00653 00654 double SequenceStatistics::watterson75NonSynonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc) 00655 { 00656 double ThetaW; 00657 size_t n = psc.getNumberOfSequences(); 00658 size_t S = nonSynonymousSubstitutionsNumber(psc, gc); 00659 map<string, double> values = getUsefullValues_(n); 00660 ThetaW = static_cast<double>(S) / values["a1"]; 00661 return ThetaW; 00662 } 00663 00664 double SequenceStatistics::piSynonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, bool minchange) 00665 { 00666 double S = 0.; 00667 ConstSiteIterator* si = new CompleteSiteContainerIterator(psc); 00668 const Site* site = 0; 00669 while (si->hasMoreSites()) 00670 { 00671 site = si->nextSite(); 00672 S += CodonSiteTools::piSynonymous(*site, gc, minchange); 00673 } 00674 delete si; 00675 return S; 00676 } 00677 00678 double SequenceStatistics::piNonSynonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, bool minchange) 00679 { 00680 double S = 0.; 00681 ConstSiteIterator* si = new CompleteSiteContainerIterator(psc); 00682 const Site* site = 0; 00683 while (si->hasMoreSites()) 00684 { 00685 site = si->nextSite(); 00686 S += CodonSiteTools::piNonSynonymous(*site, gc, minchange); 00687 } 00688 delete si; 00689 return S; 00690 } 00691 00692 double SequenceStatistics::meanSynonymousSitesNumber(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double ratio) 00693 { 00694 double S = 0.; 00695 ConstSiteIterator* si = new CompleteSiteContainerIterator(psc); 00696 const Site* site = 0; 00697 while (si->hasMoreSites()) 00698 { 00699 site = si->nextSite(); 00700 S += CodonSiteTools::meanNumberOfSynonymousPositions(*site, gc, ratio); 00701 } 00702 delete si; 00703 return S; 00704 } 00705 00706 double SequenceStatistics::meanNonSynonymousSitesNumber(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double ratio) 00707 { 00708 double S = 0.; 00709 int n = 0; 00710 ConstSiteIterator* si = new CompleteSiteContainerIterator(psc); 00711 const Site* site = 0; 00712 while (si->hasMoreSites()) 00713 { 00714 site = si->nextSite(); 00715 n = n + 3; 00716 S += CodonSiteTools::meanNumberOfSynonymousPositions(*site, gc, ratio); 00717 } 00718 delete si; 00719 return static_cast<double>(n - S); 00720 } 00721 00722 size_t SequenceStatistics::synonymousSubstitutionsNumber(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double freqmin) 00723 { 00724 size_t St = 0, Sns = 0; 00725 ConstSiteIterator* si = new CompleteSiteContainerIterator(psc); 00726 const Site* site = 0; 00727 while (si->hasMoreSites()) 00728 { 00729 site = si->nextSite(); 00730 St += CodonSiteTools::numberOfSubsitutions(*site, gc, freqmin); 00731 Sns += CodonSiteTools::numberOfNonSynonymousSubstitutions(*site, gc, freqmin); 00732 } 00733 delete si; 00734 return St - Sns; 00735 } 00736 00737 size_t SequenceStatistics::nonSynonymousSubstitutionsNumber(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double freqmin) 00738 { 00739 size_t Sns = 0; 00740 ConstSiteIterator* si = new CompleteSiteContainerIterator(psc); 00741 const Site* site = 0; 00742 while (si->hasMoreSites()) 00743 { 00744 site = si->nextSite(); 00745 Sns += CodonSiteTools::numberOfNonSynonymousSubstitutions(*site, gc, freqmin); 00746 } 00747 delete si; 00748 return Sns; 00749 } 00750 00751 vector<size_t> SequenceStatistics::fixedDifferences(const PolymorphismSequenceContainer& pscin, const PolymorphismSequenceContainer& pscout, PolymorphismSequenceContainer& psccons, const GeneticCode& gc) 00752 { 00753 ConstSiteIterator* siIn = new CompleteSiteContainerIterator(pscin); 00754 ConstSiteIterator* siOut = new CompleteSiteContainerIterator(pscout); 00755 ConstSiteIterator* siCons = new CompleteSiteContainerIterator(psccons); 00756 const Site* siteIn = 0; 00757 const Site* siteOut = 0; 00758 const Site* siteCons = 0; 00759 size_t NfixS = 0; 00760 size_t NfixA = 0; 00761 while (siIn->hasMoreSites()) 00762 { 00763 siteIn = siIn->nextSite(); 00764 siteOut = siOut->nextSite(); 00765 siteCons = siCons->nextSite(); 00766 vector<size_t> v = CodonSiteTools::fixedDifferences(*siteIn, *siteOut, siteCons->getValue(0), siteCons->getValue(1), gc); 00767 NfixS += v[0]; 00768 NfixA += v[1]; 00769 } 00770 vector<size_t> v(2); 00771 v[0] = NfixS; 00772 v[1] = NfixA; 00773 delete siIn; 00774 delete siOut; 00775 delete siCons; 00776 return v; 00777 } 00778 00779 vector<size_t> SequenceStatistics::MKtable(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, const GeneticCode& gc, double freqmin) 00780 { 00781 PolymorphismSequenceContainer psctot(ingroup); 00782 for (size_t i = 0; i < outgroup.getNumberOfSequences(); i++) 00783 { 00784 psctot.addSequence(outgroup.getSequence(i)); 00785 psctot.setAsOutgroupMember(i + ingroup.getNumberOfSequences()); 00786 } 00787 const PolymorphismSequenceContainer* psccomplet = PolymorphismSequenceContainerTools::getCompleteSites(psctot); 00788 const PolymorphismSequenceContainer* pscin = PolymorphismSequenceContainerTools::extractIngroup(*psccomplet); 00789 const PolymorphismSequenceContainer* pscout = PolymorphismSequenceContainerTools::extractOutgroup(*psccomplet); 00790 const Sequence* consensusIn = SiteContainerTools::getConsensus(*pscin, "consensusIn"); 00791 const Sequence* consensusOut = SiteContainerTools::getConsensus(*pscout, "consensusOut"); 00792 PolymorphismSequenceContainer* consensus = new PolymorphismSequenceContainer(ingroup.getAlphabet()); 00793 consensus->addSequence(*consensusIn); 00794 consensus->addSequence(*consensusOut); 00795 vector<size_t> u = SequenceStatistics::fixedDifferences(*pscin, *pscout, *consensus, gc); 00796 vector<size_t> v(4); 00797 v[0] = SequenceStatistics::nonSynonymousSubstitutionsNumber(*pscin, gc, freqmin); 00798 v[1] = SequenceStatistics::synonymousSubstitutionsNumber(*pscin, gc, freqmin); 00799 v[2] = u[1]; 00800 v[3] = u[0]; 00801 delete consensus; 00802 if (psccomplet) 00803 { 00804 delete psccomplet; 00805 } 00806 if (pscin) 00807 { 00808 delete pscin; 00809 } 00810 if (pscout) 00811 { 00812 delete pscout; 00813 } 00814 if (consensusIn) 00815 { 00816 delete consensusIn; 00817 } 00818 if (consensusOut) 00819 { 00820 delete consensusOut; 00821 } 00822 return v; 00823 } 00824 00825 double SequenceStatistics::neutralityIndex(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, const GeneticCode& gc, double freqmin) 00826 { 00827 vector<size_t> v = SequenceStatistics::MKtable(ingroup, outgroup, gc, freqmin); 00828 if (v[1] != 0 && v[2] != 0) 00829 return static_cast<double>(v[0] * v[3]) / static_cast<double>(v[1] * v[2]); 00830 else 00831 return -1; 00832 } 00833 00834 // ****************************************************************************** 00835 // Statistical tests 00836 // ****************************************************************************** 00837 00838 double SequenceStatistics::tajimaDSS(const PolymorphismSequenceContainer& psc, bool gapflag) throw (ZeroDivisionException) 00839 { 00840 double S = static_cast<double>(polymorphicSiteNumber(psc, gapflag)); 00841 if (!S) 00842 throw ZeroDivisionException("S should not be null"); 00843 double tajima = tajima83(psc, gapflag); 00844 double watterson = watterson75(psc, gapflag); 00845 size_t n = psc.getNumberOfSequences(); 00846 map<string, double> values = getUsefullValues_(n); 00847 // if (S == 0) 00848 // cout << "ARG S == 0" << endl; 00849 return (tajima - watterson) / sqrt((values["e1"] * S) + (values["e2"] * S * (S - 1))); 00850 } 00851 00852 double SequenceStatistics::tajimaDTNM(const PolymorphismSequenceContainer& psc, bool gapflag) throw (ZeroDivisionException) 00853 { 00854 double eta = static_cast<double>(totNumberMutations(psc, gapflag)); 00855 if (!eta) 00856 throw ZeroDivisionException("eta should not be null"); 00857 double tajima = tajima83(psc, gapflag); 00858 size_t n = psc.getNumberOfSequences(); 00859 map<string, double> values = getUsefullValues_(n); 00860 double eta_a1 = static_cast<double>(eta) / values["a1"]; 00861 return (tajima - eta_a1) / sqrt((values["e1"] * eta) + (values["e2"] * eta * (eta - 1))); 00862 } 00863 00864 double SequenceStatistics::fuliD(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, bool original) throw (ZeroDivisionException) 00865 { 00866 size_t n = ingroup.getNumberOfSequences(); 00867 map<string, double> values = getUsefullValues_(n); 00868 double vD = getVD_(n, values["a1"], values["a2"], values["cn"]); 00869 double uD = getUD_(values["a1"], vD); 00870 double eta = static_cast<double>(totNumberMutations(ingroup)); 00871 if (eta == 0.) 00872 throw ZeroDivisionException("eta should not be null"); 00873 double etae = 0.; 00874 if (original) 00875 etae = static_cast<double>(countSingleton(outgroup)); 00876 else 00877 etae = static_cast<double>(totMutationsExternalBranchs(ingroup, outgroup)); // added by Khalid 13/07/2005 00878 return (eta - (values["a1"] * etae)) / sqrt((uD * eta) + (vD * eta * eta)); 00879 } 00880 00881 double SequenceStatistics::fuliDstar(const PolymorphismSequenceContainer& group) throw (ZeroDivisionException) 00882 { 00883 size_t n = group.getNumberOfSequences(); 00884 double nn = static_cast<double>(n); 00885 double _n = nn / (nn - 1.); 00886 map<string, double> values = getUsefullValues_(n); 00887 double vDs = getVDstar_(n, values["a1"], values["a2"], values["dn"]); 00888 double uDs = getUDstar_(n, values["a1"], vDs); 00889 double eta = static_cast<double>(totNumberMutations(group)); 00890 if (eta == 0.) 00891 throw ZeroDivisionException("eta should not be null"); 00892 double etas = static_cast<double>(countSingleton(group)); 00893 00894 // Fu & Li 1993 00895 return ((_n * eta) - (values["a1"] * etas)) / sqrt(uDs * eta + vDs * eta * eta); 00896 00897 // Simonsen et al. 1995 00898 /* 00899 return ((eta / values["a1"]) - (etas * ((n - 1) / n))) / sqrt(uDs * eta + vDs * eta * eta); 00900 */ 00901 } 00902 00903 double SequenceStatistics::fuliF(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, bool original) throw (ZeroDivisionException) 00904 { 00905 size_t n = ingroup.getNumberOfSequences(); 00906 double nn = static_cast<double>(n); 00907 map<string, double> values = getUsefullValues_(n); 00908 double pi = tajima83(ingroup, true); 00909 double vF = (values["cn"] + values["b2"] - 2. / (nn - 1.)) / (pow(values["a1"], 2) + values["a2"]); 00910 double uF = ((1. + values["b1"] - (4. * ((nn + 1.) / ((nn - 1.) * (nn - 1.)))) * (values["a1n"] - (2. * nn) / (nn + 1.))) / values["a1"]) - vF; 00911 double eta = static_cast<double>(totNumberMutations(ingroup)); 00912 if (eta == 0.) 00913 throw ZeroDivisionException("eta should not be null"); 00914 double etae = 0.; 00915 if (original) 00916 etae = static_cast<double>(countSingleton(outgroup)); 00917 else 00918 etae = static_cast<double>(totMutationsExternalBranchs(ingroup, outgroup)); // added by Khalid 13/07/2005 00919 return (pi - etae) / sqrt(uF * eta + vF * eta * eta); 00920 } 00921 00922 double SequenceStatistics::fuliFstar(const PolymorphismSequenceContainer& group) throw (ZeroDivisionException) 00923 { 00924 double n = static_cast<double>(group.getNumberOfSequences()); 00925 map<string, double> values = getUsefullValues_(group.getNumberOfSequences()); 00926 double pi = tajima83(group, true); 00927 00928 // Fu & Li 1993 00929 // double vFs = (values["dn"] + values["b2"] - (2. / (nn - 1.)) * (4. * values["a2"] - 6. + 8. / nn)) / (pow(values["a1"], 2) + values["a2"]); 00930 // double uFs = (((nn / (nn - 1.)) + values["b1"] - (4. / (nn * (nn - 1.))) + 2. * ((nn + 1.) / (pow((nn - 1.), 2))) * (values["a1n"] - 2. * nn / (nn + 1.))) / values["a1"]) - vFs; 00931 00932 // Simonsen et al. 1995 00933 double vFs = (((2 * n * n * n + 110 * n * n - 255 * n + 153) / (9 * n * n * (n - 1))) + ((2 * (n - 1) * values["a1"]) / (n * n)) - 8 * values["a2"] / n) / (pow(values["a1"], 2) + values["a2"]); 00934 double uFs = (((4 * n * n + 19 * n + 3 - 12 * (n + 1) * values["a1n"]) / (3 * n * (n - 1))) / values["a1"]) - vFs; 00935 double eta = static_cast<double>(totNumberMutations(group)); 00936 if (eta == 0.) 00937 throw ZeroDivisionException("eta should not be null"); 00938 double etas = static_cast<double>(countSingleton(group)); 00939 // Fu & Li 1993 00940 // Simonsen et al. 1995 00941 return (pi - ((n - 1.) / n * etas)) / sqrt(uFs * eta + vFs * eta * eta); 00942 } 00943 00944 double SequenceStatistics::FstHudson92(const PolymorphismSequenceContainer& psc, size_t id1, size_t id2) 00945 { 00946 vector<double> vdiff; 00947 double piIntra1, piIntra2, meanPiIntra, piInter, Fst; 00948 00949 PolymorphismSequenceContainer* Pop1 = PolymorphismSequenceContainerTools::extractGroup(psc, id1); 00950 PolymorphismSequenceContainer* Pop2 = PolymorphismSequenceContainerTools::extractGroup(psc, id2); 00951 00952 piIntra1 = SequenceStatistics::tajima83(*Pop1, false); 00953 piIntra2 = SequenceStatistics::tajima83(*Pop2, false); 00954 00955 meanPiIntra = (piIntra1 + piIntra2) / 2; 00956 00957 double n = 0; 00958 for (size_t i = 0; i < Pop1->getNumberOfSequences(); i++) 00959 { 00960 const Sequence& s1 = Pop1->getSequence(i); 00961 for (size_t j = 0; j < Pop2->getNumberOfSequences(); j++) 00962 { 00963 n++; 00964 const Sequence& s2 = Pop2->getSequence(j); 00965 vdiff.push_back(SiteContainerTools::computeSimilarity(s1, s2, true, "no gap", true)); 00966 } 00967 } 00968 piInter = (VectorTools::sum(vdiff) / n) * static_cast<double>(psc.getNumberOfSites()); 00969 00970 00971 Fst = 1.0 - meanPiIntra / piInter; 00972 00973 delete Pop1; 00974 delete Pop2; 00975 00976 return Fst; 00977 } 00978 00979 // ****************************************************************************** 00980 // Linkage disequilibrium statistics 00981 // ****************************************************************************** 00982 00983 /**********************/ 00984 /* Preliminary method */ 00985 /**********************/ 00986 00987 PolymorphismSequenceContainer* SequenceStatistics::generateLDContainer(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) 00988 { 00989 SiteSelection ss; 00990 // Extract polymorphic site with only two alleles 00991 for (size_t i = 0; i < psc.getNumberOfSites(); i++) 00992 { 00993 if (keepsingleton) 00994 { 00995 if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i))) 00996 { 00997 ss.push_back(i); 00998 } 00999 } 01000 else 01001 { 01002 if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i))) 01003 { 01004 ss.push_back(i); 01005 } 01006 } 01007 } 01008 01009 const SiteContainer* sc = SiteContainerTools::getSelectedSites(psc, ss); 01010 Alphabet* alpha = new DNA(); // Sylvain Gaillard 17/03/2010: What if psc's Alphabet is not DNA 01011 PolymorphismSequenceContainer* ldpsc = new PolymorphismSequenceContainer(sc->getNumberOfSequences(), alpha); 01012 // Assign 1 to the more frequent and 0 to the less frequent alleles 01013 for (size_t i = 0; i < sc->getNumberOfSites(); i++) 01014 { 01015 const Site& site = sc->getSite(i); 01016 Site siteclone(site); 01017 bool deletesite = false; 01018 map<int, double> freqs; 01019 SymbolListTools::getFrequencies(siteclone, freqs); 01020 int first = 0; 01021 for (map<int, double>::iterator it = freqs.begin(); it != freqs.end(); it++) 01022 { 01023 if (it->second >= 0.5) 01024 first = it->first; 01025 } 01026 for (size_t j = 0; j < sc->getNumberOfSequences(); j++) 01027 { 01028 if (freqs[site.getValue(j)] >= 0.5 && site.getValue(j) == first) 01029 { 01030 if (freqs[site.getValue(j)] <= 1 - freqmin) 01031 { 01032 siteclone.setElement(j, 1); 01033 first = site.getValue(j); 01034 } 01035 else 01036 deletesite = true; 01037 } 01038 else 01039 { 01040 if (freqs[site.getValue(j)] >= freqmin) 01041 siteclone.setElement(j, 0); 01042 else 01043 deletesite = true; 01044 } 01045 } 01046 if (!deletesite) 01047 ldpsc->addSite(siteclone); 01048 } 01049 delete alpha; 01050 return ldpsc; 01051 } 01052 01053 /*************************************/ 01054 /* Pairwise LD and distance measures */ 01055 /*************************************/ 01056 01057 Vdouble SequenceStatistics::pairwiseDistances1(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException) 01058 { 01059 // get Positions with sites of interest 01060 SiteSelection ss; 01061 for (size_t i = 0; i < psc.getNumberOfSites(); i++) 01062 { 01063 if (keepsingleton) 01064 { 01065 if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i))) 01066 { 01067 const Site& site = psc.getSite(i); 01068 bool deletesite = false; 01069 map<int, double> freqs; 01070 SymbolListTools::getFrequencies(site, freqs); 01071 for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++) 01072 { 01073 if (freqs[j] >= 1 - freqmin) 01074 deletesite = true; 01075 } 01076 if (!deletesite) 01077 ss.push_back(i); 01078 } 01079 } 01080 else 01081 { 01082 if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i))) 01083 { 01084 ss.push_back(i); 01085 const Site& site = psc.getSite(i); 01086 bool deletesite = false; 01087 map<int, double> freqs; 01088 SymbolListTools::getFrequencies(site, freqs); 01089 for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++) 01090 { 01091 if (freqs[j] >= 1 - freqmin) 01092 deletesite = true; 01093 } 01094 if (!deletesite) 01095 ss.push_back(i); 01096 } 01097 } 01098 } 01099 // compute pairwise distances 01100 if (ss.size() < 2) 01101 throw DimensionException("SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2); 01102 Vdouble dist; 01103 for (size_t i = 0; i < ss.size() - 1; i++) 01104 { 01105 for (size_t j = i + 1; j < ss.size(); j++) 01106 { 01107 dist.push_back(static_cast<double>(ss[j] - ss[i])); 01108 } 01109 } 01110 return dist; 01111 } 01112 01113 Vdouble SequenceStatistics::pairwiseDistances2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException) 01114 { 01115 SiteSelection ss; 01116 for (size_t i = 0; i < psc.getNumberOfSites(); i++) 01117 { 01118 if (keepsingleton) 01119 { 01120 if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i))) 01121 { 01122 const Site& site = psc.getSite(i); 01123 bool deletesite = false; 01124 map<int, double> freqs; 01125 SymbolListTools::getFrequencies(site, freqs); 01126 for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++) 01127 { 01128 if (freqs[j] >= 1 - freqmin) 01129 deletesite = true; 01130 } 01131 if (!deletesite) 01132 ss.push_back(i); 01133 } 01134 } 01135 else 01136 { 01137 if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i))) 01138 { 01139 ss.push_back(i); 01140 const Site& site = psc.getSite(i); 01141 bool deletesite = false; 01142 map<int, double> freqs; 01143 SymbolListTools::getFrequencies(site, freqs); 01144 for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++) 01145 { 01146 if (freqs[j] >= 1 - freqmin) 01147 deletesite = true; 01148 } 01149 if (!deletesite) 01150 ss.push_back(i); 01151 } 01152 } 01153 } 01154 size_t n = ss.size(); 01155 if (n < 2) 01156 throw DimensionException("SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2); 01157 Vdouble distance(n * (n - 1) / 2, 0); 01158 size_t nbsite = psc.getNumberOfSites(); 01159 for (size_t k = 0; k < psc.getNumberOfSequences(); k++) 01160 { 01161 const Sequence& seq = psc.getSequence(k); 01162 SiteSelection gap, newss = ss; 01163 Vdouble dist; 01164 for (size_t i = 0; i < nbsite; i++) 01165 { 01166 if (seq.getValue(i) == -1) 01167 gap.push_back(i); 01168 } 01169 // Site positions are re-numbered to take gaps into account 01170 for (size_t i = 0; i < gap.size(); i++) 01171 { 01172 for (size_t j = 0; j < ss.size(); j++) 01173 { 01174 if (ss[j] > gap[i]) 01175 newss[j]--; 01176 } 01177 } 01178 for (size_t i = 0; i < n - 1; i++) 01179 { 01180 for (size_t j = i + 1; j < n; j++) 01181 { 01182 dist.push_back(static_cast<double>(newss[j] - newss[i])); 01183 } 01184 } 01185 distance += dist; 01186 } 01187 distance = distance / static_cast<double>(psc.getNumberOfSequences()); 01188 return distance; 01189 } 01190 01191 Vdouble SequenceStatistics::pairwiseD(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException) 01192 { 01193 PolymorphismSequenceContainer* newpsc = SequenceStatistics::generateLDContainer(psc, keepsingleton, freqmin); 01194 Vdouble D; 01195 size_t nbsite = newpsc->getNumberOfSites(); 01196 size_t nbseq = newpsc->getNumberOfSequences(); 01197 if (nbsite < 2) 01198 throw DimensionException("SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2); 01199 if (nbseq < 2) 01200 throw DimensionException("SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2); 01201 for (size_t i = 0; i < nbsite - 1; i++) 01202 { 01203 for (size_t j = i + 1; j < nbsite; j++) 01204 { 01205 double haplo = 0; 01206 const Site& site1 = newpsc->getSite(i); 01207 const Site& site2 = newpsc->getSite(j); 01208 map<int, double> freq1; 01209 map<int, double> freq2; 01210 SymbolListTools::getFrequencies(site1, freq1); 01211 SymbolListTools::getFrequencies(site2, freq2); 01212 for (size_t k = 0; k < nbseq; k++) 01213 { 01214 if (site1.getValue(k) + site2.getValue(k) == 2) 01215 haplo++; 01216 } 01217 haplo = haplo / static_cast<double>(nbseq); 01218 D.push_back(std::abs(haplo - freq1[1] * freq2[1])); 01219 } 01220 } 01221 return D; 01222 } 01223 01224 Vdouble SequenceStatistics::pairwiseDprime(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException) 01225 { 01226 PolymorphismSequenceContainer* newpsc = SequenceStatistics::generateLDContainer(psc, keepsingleton, freqmin); 01227 Vdouble Dprime; 01228 size_t nbsite = newpsc->getNumberOfSites(); 01229 size_t nbseq = newpsc->getNumberOfSequences(); 01230 if (nbsite < 2) 01231 throw DimensionException("SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2); 01232 if (nbseq < 2) 01233 throw DimensionException("SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2); 01234 for (size_t i = 0; i < nbsite - 1; i++) 01235 { 01236 for (size_t j = i + 1; j < nbsite; j++) 01237 { 01238 double haplo = 0; 01239 const Site& site1 = newpsc->getSite(i); 01240 const Site& site2 = newpsc->getSite(j); 01241 map<int, double> freq1; 01242 map<int, double> freq2; 01243 SymbolListTools::getFrequencies(site1, freq1); 01244 SymbolListTools::getFrequencies(site2, freq2); 01245 for (size_t k = 0; k < nbseq; k++) 01246 { 01247 if (site1.getValue(k) + site2.getValue(k) == 2) 01248 haplo++; 01249 } 01250 haplo = haplo / static_cast<double>(nbseq); 01251 double d, D = (haplo - freq1[1] * freq2[1]); 01252 if (D > 0) 01253 { 01254 if (freq1[1] * freq2[0] <= freq1[0] * freq2[1]) 01255 { 01256 d = std::abs(D) / (freq1[1] * freq2[0]); 01257 } 01258 else 01259 { 01260 d = std::abs(D) / (freq1[0] * freq2[1]); 01261 } 01262 } 01263 else 01264 { 01265 if (freq1[1] * freq2[1] <= freq1[0] * freq2[0]) 01266 { 01267 d = std::abs(D) / (freq1[1] * freq2[1]); 01268 } 01269 else 01270 { 01271 d = std::abs(D) / (freq1[0] * freq2[0]); 01272 } 01273 } 01274 Dprime.push_back(d); 01275 } 01276 } 01277 return Dprime; 01278 } 01279 01280 Vdouble SequenceStatistics::pairwiseR2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException) 01281 { 01282 PolymorphismSequenceContainer* newpsc = SequenceStatistics::generateLDContainer(psc, keepsingleton, freqmin); 01283 Vdouble R2; 01284 size_t nbsite = newpsc->getNumberOfSites(); 01285 size_t nbseq = newpsc->getNumberOfSequences(); 01286 if (nbsite < 2) 01287 throw DimensionException("SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2); 01288 if (nbseq < 2) 01289 throw DimensionException("SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2); 01290 for (size_t i = 0; i < nbsite - 1; i++) 01291 { 01292 for (size_t j = i + 1; j < nbsite; j++) 01293 { 01294 double haplo = 0; 01295 const Site& site1 = newpsc->getSite(i); 01296 const Site& site2 = newpsc->getSite(j); 01297 map<int, double> freq1; 01298 map<int, double> freq2; 01299 SymbolListTools::getFrequencies(site1, freq1); 01300 SymbolListTools::getFrequencies(site2, freq2); 01301 for (size_t k = 0; k < nbseq; k++) 01302 { 01303 if (site1.getValue(k) + site2.getValue(k) == 2) 01304 haplo++; 01305 } 01306 haplo = haplo / static_cast<double>(nbseq); 01307 double r = ((haplo - freq1[1] * freq2[1]) * (haplo - freq1[1] * freq2[1])) / (freq1[0] * freq1[1] * freq2[0] * freq2[1]); 01308 R2.push_back(r); 01309 } 01310 } 01311 return R2; 01312 } 01313 01314 /***********************************/ 01315 /* Global LD and distance measures */ 01316 /***********************************/ 01317 01318 double SequenceStatistics::meanD(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException) 01319 { 01320 Vdouble D = pairwiseD(psc, keepsingleton, freqmin); 01321 return VectorTools::mean<double, double>(D); 01322 } 01323 01324 double SequenceStatistics::meanDprime(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException) 01325 { 01326 try 01327 { 01328 Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin); 01329 return VectorTools::mean<double, double>(Dprime); 01330 } 01331 catch (DimensionException& e) 01332 { 01333 throw e; 01334 } 01335 } 01336 01337 double SequenceStatistics::meanR2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException) 01338 { 01339 try 01340 { 01341 Vdouble R2 = SequenceStatistics::pairwiseR2(psc, keepsingleton, freqmin); 01342 return VectorTools::mean<double, double>(R2); 01343 } 01344 catch (DimensionException& e) 01345 { 01346 throw e; 01347 } 01348 } 01349 01350 double SequenceStatistics::meanDistance1(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException) 01351 { 01352 try 01353 { 01354 Vdouble dist = pairwiseDistances1(psc, keepsingleton, freqmin); 01355 return VectorTools::mean<double, double>(dist); 01356 } 01357 catch (DimensionException& e) 01358 { 01359 throw e; 01360 } 01361 } 01362 01363 double SequenceStatistics::meanDistance2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException) 01364 { 01365 try 01366 { 01367 Vdouble dist = pairwiseDistances2(psc, keepsingleton, freqmin); 01368 return VectorTools::mean<double, double>(dist); 01369 } 01370 catch (DimensionException& e) 01371 { 01372 throw e; 01373 } 01374 } 01375 01376 /**********************/ 01377 /* Regression methods */ 01378 /**********************/ 01379 01380 double SequenceStatistics::originRegressionD(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException) 01381 { 01382 try 01383 { 01384 Vdouble D = pairwiseD(psc, keepsingleton, freqmin) - 1; 01385 Vdouble dist; 01386 if (distance1) 01387 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000; 01388 else 01389 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000; 01390 return VectorTools::sum(D * dist) / VectorTools::sum(dist * dist); 01391 } 01392 catch (DimensionException& e) 01393 { 01394 throw e; 01395 } 01396 } 01397 01398 double SequenceStatistics::originRegressionDprime(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException) 01399 { 01400 try 01401 { 01402 Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin) - 1; 01403 Vdouble dist; 01404 if (distance1) 01405 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000; 01406 else 01407 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000; 01408 return VectorTools::sum(Dprime * dist) / VectorTools::sum(dist * dist); 01409 } 01410 catch (DimensionException& e) 01411 { 01412 throw e; 01413 } 01414 } 01415 01416 double SequenceStatistics::originRegressionR2(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException) 01417 { 01418 try 01419 { 01420 Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin) - 1; 01421 Vdouble dist; 01422 if (distance1) 01423 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000; 01424 else 01425 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000; 01426 return VectorTools::sum(R2 * dist) / VectorTools::sum(dist * dist); 01427 } 01428 catch (DimensionException& e) 01429 { 01430 throw e; 01431 } 01432 } 01433 01434 Vdouble SequenceStatistics::linearRegressionD(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException) 01435 { 01436 try 01437 { 01438 Vdouble D = pairwiseD(psc, keepsingleton, freqmin); 01439 Vdouble dist; 01440 Vdouble reg(2); 01441 if (distance1) 01442 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000; 01443 else 01444 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000; 01445 reg[0] = VectorTools::cov<double, double>(dist, D) / VectorTools::var<double, double>(dist); 01446 reg[1] = VectorTools::mean<double, double>(D) - reg[0] * VectorTools::mean<double, double>(dist); 01447 return reg; 01448 } 01449 catch (DimensionException& e) 01450 { 01451 throw e; 01452 } 01453 } 01454 01455 Vdouble SequenceStatistics::linearRegressionDprime(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException) 01456 { 01457 try 01458 { 01459 Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin); 01460 Vdouble dist; 01461 Vdouble reg(2); 01462 if (distance1) 01463 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000; 01464 else 01465 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000; 01466 reg[0] = VectorTools::cov<double, double>(dist, Dprime) / VectorTools::var<double, double>(dist); 01467 reg[1] = VectorTools::mean<double, double>(Dprime) - reg[0] * VectorTools::mean<double, double>(dist); 01468 return reg; 01469 } 01470 catch (DimensionException& e) 01471 { 01472 throw e; 01473 } 01474 } 01475 01476 Vdouble SequenceStatistics::linearRegressionR2(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException) 01477 { 01478 try 01479 { 01480 Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin); 01481 Vdouble dist; 01482 Vdouble reg(2); 01483 if (distance1) 01484 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000; 01485 else 01486 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000; 01487 reg[0] = VectorTools::cov<double, double>(dist, R2) / VectorTools::var<double, double>(dist); 01488 reg[1] = VectorTools::mean<double, double>(R2) - reg[0] * VectorTools::mean<double, double>(dist); 01489 return reg; 01490 } 01491 catch (DimensionException& e) 01492 { 01493 throw e; 01494 } 01495 } 01496 01497 double SequenceStatistics::inverseRegressionR2(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException) 01498 { 01499 try 01500 { 01501 Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin); 01502 Vdouble unit(R2.size(), 1); 01503 Vdouble R2transformed = unit / R2 - 1; 01504 Vdouble dist; 01505 if (distance1) 01506 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000; 01507 else 01508 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000; 01509 return VectorTools::sum(R2transformed * dist) / VectorTools::sum(dist * dist); 01510 } 01511 catch (DimensionException& e) 01512 { 01513 throw e; 01514 } 01515 } 01516 01517 /**********************/ 01518 /* Hudson method */ 01519 /**********************/ 01520 01521 double SequenceStatistics::hudson87(const PolymorphismSequenceContainer& psc, double precision, double cinf, double csup) 01522 { 01523 double left = leftHandHudson_(psc); 01524 size_t n = psc.getNumberOfSequences(); 01525 double dif = 1; 01526 double c1 = cinf; 01527 double c2 = csup; 01528 if (SequenceStatistics::polymorphicSiteNumber(psc) < 2) 01529 return -1; 01530 if (rightHandHudson_(c1, n) < left) 01531 return cinf; 01532 if (rightHandHudson_(c2, n) > left) 01533 return csup; 01534 while (dif > precision) 01535 { 01536 if (rightHandHudson_((c1 + c2) / 2, n) > left) 01537 c1 = (c1 + c2) / 2; 01538 else 01539 c2 = (c1 + c2) / 2; 01540 dif = std::abs(2 * (c1 - c2) / (c1 + c2)); 01541 } 01542 return (c1 + c2) / 2; 01543 } 01544 01545 /*****************/ 01546 /* Tests methods */ 01547 /*****************/ 01548 01549 void SequenceStatistics::testUsefullValues(std::ostream& s, size_t n) 01550 { 01551 map<string, double> v = getUsefullValues_(n); 01552 double vD = getVD_(n, v["a1"], v["a2"], v["cn"]); 01553 double uD = getUD_(v["a1"], vD); 01554 double vDs = getVDstar_(n, v["a1"], v["a2"], v["dn"]); 01555 double uDs = getUDstar_(n, v["a1"], vDs); 01556 01557 s << n << "\t"; 01558 s << v["a1"] << "\t"; 01559 s << v["a2"] << "\t"; 01560 s << v["a1n"] << "\t"; 01561 s << v["b1"] << "\t"; 01562 s << v["b2"] << "\t"; 01563 s << v["c1"] << "\t"; 01564 s << v["c2"] << "\t"; 01565 s << v["cn"] << "\t"; 01566 s << v["dn"] << "\t"; 01567 s << v["e1"] << "\t"; 01568 s << v["e2"] << "\t"; 01569 s << uD << "\t"; 01570 s << vD << "\t"; 01571 s << uDs << "\t"; 01572 s << vDs << endl; 01573 } 01574 01575 // ****************************************************************************** 01576 // Private methods 01577 // ****************************************************************************** 01578 01579 size_t SequenceStatistics::getMutationNumber_(const Site& site) 01580 { 01581 size_t tmp_count = 0; 01582 map<int, size_t> states_count; 01583 SymbolListTools::getCounts(site, states_count); 01584 01585 for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++) 01586 { 01587 if (it->first >= 0) 01588 tmp_count++; 01589 } 01590 if (tmp_count > 0) 01591 tmp_count--; 01592 return tmp_count; 01593 } 01594 01595 size_t SequenceStatistics::getSingletonNumber_(const Site& site) 01596 { 01597 size_t nus = 0; 01598 map<int, size_t> states_count; 01599 SymbolListTools::getCounts(site, states_count); 01600 for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++) 01601 { 01602 if (it->second == 1) 01603 nus++; 01604 } 01605 return nus; 01606 } 01607 01608 size_t SequenceStatistics::getDerivedSingletonNumber_(const Site& site_in, const Site& site_out) 01609 { 01610 size_t nus = 0; 01611 map<int, size_t> states_count; 01612 map<int, size_t> outgroup_states_count; 01613 SymbolListTools::getCounts(site_in, states_count); 01614 SymbolListTools::getCounts(site_out, outgroup_states_count); 01615 // if there is more than one variant in the outgroup we will not be able to recover the ancestral state 01616 if (outgroup_states_count.size() == 1) 01617 { 01618 for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++) 01619 { 01620 if (it->second == 1) 01621 { 01622 if (outgroup_states_count.find(it->first) == outgroup_states_count.end()) 01623 nus++; 01624 } 01625 } 01626 } 01627 return nus; 01628 } 01629 01630 std::map<std::string, double> SequenceStatistics::getUsefullValues_(size_t n) 01631 { 01632 double nn = static_cast<double>(n); 01633 map<string, double> values; 01634 values["a1"] = 0.; 01635 values["a2"] = 0.; 01636 values["a1n"] = 0.; 01637 values["b1"] = 0.; 01638 values["b2"] = 0.; 01639 values["c1"] = 0.; 01640 values["c2"] = 0.; 01641 values["cn"] = 0.; 01642 values["dn"] = 0.; 01643 values["e1"] = 0.; 01644 values["e2"] = 0.; 01645 if (n > 1) 01646 { 01647 for (double i = 1; i < nn; i++) 01648 { 01649 values["a1"] += 1. / i; 01650 values["a2"] += 1. / (i * i); 01651 } 01652 values["a1n"] = values["a1"] + (1. / nn); 01653 values["b1"] = (nn + 1.) / (3. * (nn - 1.)); 01654 values["b2"] = 2. * ((nn * nn) + nn + 3.) / (9. * nn * (nn - 1.)); 01655 values["c1"] = values["b1"] - (1. / values["a1"]); 01656 values["c2"] = values["b2"] - ((nn + 2.) / (values["a1"] * nn)) + (values["a2"] / (values["a1"] * values["a1"])); 01657 if (n == 2) 01658 { 01659 values["cn"] = 1.; 01660 values["dn"] = 2.; 01661 } 01662 else 01663 { 01664 values["cn"] = 2. * ((nn * values["a1"]) - (2. * (nn - 1.))) / ((nn - 1.) * (nn - 2.)); 01665 values["dn"] = 01666 values["cn"] 01667 + ((nn - 2.) / ((nn - 1.) * (nn - 1.))) 01668 + (2. / (nn - 1.)) 01669 * ((3. / 2.) - (((2. * values["a1n"]) - 3.) / (nn - 2.)) - (1. / nn)); 01670 } 01671 values["e1"] = values["c1"] / values["a1"]; 01672 values["e2"] = values["c2"] / ((values["a1"] * values["a1"]) + values["a2"]); 01673 } 01674 return values; 01675 } 01676 01677 double SequenceStatistics::getVD_(size_t n, double a1, double a2, double cn) 01678 { 01679 double nn = static_cast<double>(n); 01680 if (n < 3) 01681 return 0.; 01682 double vD = 1. + ((a1 * a1) / (a2 + (a1 * a1))) * (cn - ((nn + 1.) / (nn - 1.))); 01683 return vD; 01684 } 01685 01686 double SequenceStatistics::getUD_(double a1, double vD) 01687 { 01688 return a1 - 1. - vD; 01689 } 01690 01691 double SequenceStatistics::getVDstar_(size_t n, double a1, double a2, double dn) 01692 { 01693 double denom = (a1 * a1) + a2; 01694 if (n < 3 || denom == 0.) 01695 return 0.; 01696 double nn = static_cast<double>(n); 01697 double nnn = nn / (nn - 1.); 01698 // Fu & Li 1993 01699 double vDs = ( 01700 (nnn * nnn * a2) 01701 + (a1 * a1 * dn) 01702 - (2. * (nn * a1 * (a1 + 1)) / ((nn - 1.) * (nn - 1.))) 01703 ) 01704 / 01705 denom; 01706 // Simonsen et al. 1995 01707 /* 01708 double vDs = ( 01709 (values["a2"] / pow(values["a1"], 2)) 01710 - (2./nn) * (1. + 1./values["a1"] - values["a1"] + values["a1"]/nn) 01711 - 1./(nn*nn) 01712 ) 01713 / 01714 (pow(values["a1"], 2) + values["a2"]); 01715 */ 01716 return vDs; 01717 } 01718 01719 double SequenceStatistics::getUDstar_(size_t n, double a1, double vDs) 01720 { 01721 if (n < 3) 01722 return 0.; 01723 double nn = static_cast<double>(n); 01724 double nnn = nn / (nn - 1.); 01725 // Fu & Li 1993 01726 double uDs = (nnn * (a1 - nnn)) - vDs; 01727 // Simonsen et al. 1995 01728 /* 01729 double uDs = (((nn - 1.)/nn - 1./values["a1"]) / values["a1"]) - vDs; 01730 */ 01731 return uDs; 01732 } 01733 01734 double SequenceStatistics::leftHandHudson_(const PolymorphismSequenceContainer& psc) 01735 { 01736 PolymorphismSequenceContainer* newpsc = PolymorphismSequenceContainerTools::getCompleteSites(psc); 01737 size_t nbseq = newpsc->getNumberOfSequences(); 01738 double S1 = 0; 01739 double S2 = 0; 01740 for (size_t i = 0; i < nbseq - 1; i++) 01741 { 01742 for (size_t j = i + 1; j < nbseq; j++) 01743 { 01744 SequenceSelection ss(2); 01745 ss[0] = i; 01746 ss[1] = j; 01747 PolymorphismSequenceContainer* psc2 = PolymorphismSequenceContainerTools::getSelectedSequences(*newpsc, ss); 01748 S1 += SequenceStatistics::watterson75(*psc2, true); 01749 S2 += SequenceStatistics::watterson75(*psc2, true) * SequenceStatistics::watterson75(*psc2, true); 01750 delete psc2; 01751 } 01752 } 01753 double Sk = (2 * S2 - pow(2 * S1 / static_cast<double>(nbseq), 2.)) / pow(nbseq, 2.); 01754 double H = SequenceStatistics::heterozygosity(*newpsc); 01755 double H2 = SequenceStatistics::squaredHeterozygosity(*newpsc); 01756 delete newpsc; 01757 return static_cast<double>(Sk - H + H2) / pow(H * static_cast<double>(nbseq) / static_cast<double>(nbseq - 1), 2.); 01758 } 01759 01760 double SequenceStatistics::rightHandHudson_(double c, size_t n) 01761 { 01762 double nn = static_cast<double>(n); 01763 return 1. / (97. * pow(c, 2.) * pow(nn, 3.)) * ((nn - 1.) * (97. * (c * (4. + (c - 2. * nn) * nn) + (-2. * (7. + c) + 4. * nn + (c - 1.) * pow(nn, 2.)) * log((18. + c * (13. + c)) / 18.)) + sqrt(97.) * (110. + nn * (49. * nn - 52.) + c * (2. + nn * (15. * nn - 8.))) * log(-1. + (72. + 26. * c) / (36. + 13. * c - c * sqrt(97.))))); 01764 } 01765