82 while (si->hasMoreSites())
84 site = si->nextSite();
85 if (!SiteTools::isConstant(*site, ignoreUnknown))
102 const Site* site = 0;
103 while (si->hasMoreSites())
105 site = si->nextSite();
106 if (SiteTools::isParsimonyInformativeSite(*site))
118 const Site* site = 0;
124 while (si->hasMoreSites())
126 site = si->nextSite();
127 nus += getSingletonNumber_(*site);
141 const Site* site = 0;
142 while (si->hasMoreSites())
144 site = si->nextSite();
145 if (SiteTools::isTriplet(*site))
158 const Site* site = 0;
164 while (si->hasMoreSites())
166 site = si->nextSite();
167 tnm += getMutationNumber_(*site);
173 size_t SequenceStatistics::totMutationsExternalBranchs(
177 if (ing.getNumberOfSites() != outg.getNumberOfSites())
178 throw Exception(
"ing and outg must have the same size");
180 const Site* site_in = 0;
181 const Site* site_out = 0;
186 while (si->hasMoreSites())
188 site_in = si->nextSite();
189 site_out = so->nextSite();
191 if (SiteTools::isComplete(*site_in) && SiteTools::isComplete(*site_out))
192 nmuts += getDerivedSingletonNumber_(*site_in, *site_out);
202 const Site* site = 0;
208 while (si->hasMoreSites())
210 site = si->nextSite();
211 S += SiteTools::heterozygosity(*site);
220 const Site* site = 0;
226 while (si->hasMoreSites())
228 site = si->nextSite();
229 double h = SiteTools::heterozygosity(*site);
242 map<int, double> freqs;
243 SequenceContainerTools::getFrequencies(psc, freqs);
253 vector<size_t> vect(2);
254 const Site* site = 0;
260 while (si->hasMoreSites())
262 site = si->nextSite();
263 if (!SiteTools::isConstant(*site))
265 long double freqGC = SymbolListTools::getGCContent(*site);
272 if (freqGC > 0 && freqGC < 1)
274 nbMut +=
static_cast<size_t>(nbSeq);
275 long double adGC = freqGC * nbSeq;
276 nbGC +=
static_cast<size_t>(adGC);
294 size_t S = polymorphicSiteNumber(psc, gapflag, ignoreUnknown);
295 map<string, double> values = getUsefullValues_(n);
296 ThetaW = (double) S / values[
"a1"];
302 size_t alphabet_size = (psc.
getAlphabet())->getSize();
303 const Site* site = 0;
310 while (si->hasMoreSites())
312 site = si->nextSite();
313 if (!SiteTools::isConstant(*site))
316 map<int, size_t> count;
317 SymbolListTools::getCounts(*site, count);
318 map<int, size_t> tmp_k;
320 for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++)
322 if (it->first >= 0 && it->first < static_cast<int>(alphabet_size))
324 tmp_k[it->first] = it->second * (it->second - 1);
328 if (tmp_n == 0 || tmp_n == 1)
330 for (map<int, size_t>::iterator it = tmp_k.begin(); it != tmp_k.end(); it++)
332 value +=
static_cast<double>(it->second) / static_cast<double>(tmp_n * (tmp_n - 1));
334 value2 += 1. - value;
344 throw Exception(
"SequenceStatistics::FayWu2000: ancestralSites and psc don't have the same size!!!'" );
348 size_t alphabet_size = (psc.
getAlphabet())->getSize();
353 string ancB = ancestralSites.
getChar(i);
354 int ancV = ancestralSites.
getValue(i);
356 if (!SiteTools::isConstant(site) || tmps.
getChar(i) != ancB)
361 map<int, size_t> count;
362 SymbolListTools::getCounts(site, count);
363 map<int, size_t> tmp_k;
365 for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++)
367 if (it->first >= 0 && it->first < static_cast<int>(alphabet_size))
370 if (it->first != ancV)
372 tmp_k[it->first] = 2 * it->second * it->second;
377 if (tmp_n == 0 || tmp_n == 1)
379 for (map<int, size_t>::iterator it = tmp_k.begin(); it != tmp_k.end(); it++)
381 value +=
static_cast<double>(it->second) / static_cast<double>(tmp_n * (tmp_n - 1));
397 sc = PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc);
401 vector<string> pscvector;
402 pscvector.push_back(sc->
toString(0));
408 for (vector<string>::iterator it = pscvector.begin(); it != pscvector.end(); it++)
410 if (query.compare(*it) == 0)
419 pscvector.push_back(query);
424 return pscvector.size();
436 sc = PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc);
441 vector<string> pscvector;
442 vector<size_t> effvector;
443 pscvector.push_back(sc->
toString(0));
451 for (
size_t j = 0; j < pscvector.size(); j++)
453 if (query.compare(pscvector[j]) == 0)
462 pscvector.push_back(query);
466 for (
size_t i = 0; i < effvector.size(); i++)
468 H -= (
static_cast<double>(effvector[i]) / static_cast<double>(nbSeq)) * ( static_cast<double>(effvector[i]) /
static_cast<double>(nbSeq));
479 const Site* site = 0;
480 while (si->hasMoreSites())
482 site = si->nextSite();
484 if (SiteTools::getNumberOfDistinctCharacters(*site) != 2)
486 vector<int> seq = site->getContent();
489 for (
size_t i = 1; i < seq.size(); i++)
491 if (state1 != seq[i])
497 if (((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) ||
498 ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1)))
511 const Site* site = 0;
512 while (si->hasMoreSites())
514 site = si->nextSite();
516 if (SiteTools::getNumberOfDistinctCharacters(*site) != 2)
518 vector<int> seq = site->getContent();
521 for (
size_t i = 1; i < seq.size(); i++)
523 if (state1 != seq[i])
529 if (!(((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) ||
530 ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1))))
545 const Site* site = 0;
546 vector< int > state(2);
547 while (si->hasMoreSites())
549 map<int, size_t> count;
550 site = si->nextSite();
551 SymbolListTools::getCounts(*site, count);
552 if (count.size() != 2)
555 for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++)
557 state[i] = it->first;
560 if (((state[0] == 0 && state[1] == 2) || (state[0] == 2 && state[1] == 0)) ||
561 ((state[0] == 1 && state[1] == 3) || (state[0] == 3 && state[1] == 1)))
573 return static_cast<double>(nbT) / static_cast<double>(nbTv);
592 const Site* site = 0;
593 while (si->hasMoreSites())
595 site = si->nextSite();
596 if (CodonSiteTools::hasStop(*site))
617 while (si->hasMoreSites())
619 site = si->nextSite();
620 if (CodonSiteTools::isMonoSitePolymorphic(*site))
632 while (si->hasMoreSites())
634 site = si->nextSite();
635 if (CodonSiteTools::isSynonymousPolymorphic(*site, gc))
646 size_t S = synonymousSubstitutionsNumber(psc, gc);
647 map<string, double> values = getUsefullValues_(n);
648 ThetaW =
static_cast<double>(S) / values[
"a1"];
656 size_t S = nonSynonymousSubstitutionsNumber(psc, gc);
657 map<string, double> values = getUsefullValues_(n);
658 ThetaW =
static_cast<double>(S) / values[
"a1"];
666 const Site* site = 0;
667 while (si->hasMoreSites())
669 site = si->nextSite();
670 S += CodonSiteTools::piSynonymous(*site, gc, minchange);
680 const Site* site = 0;
681 while (si->hasMoreSites())
683 site = si->nextSite();
684 S += CodonSiteTools::piNonSynonymous(*site, gc, minchange);
694 const Site* site = 0;
695 while (si->hasMoreSites())
697 site = si->nextSite();
698 S += CodonSiteTools::meanNumberOfSynonymousPositions(*site, gc, ratio);
709 const Site* site = 0;
710 while (si->hasMoreSites())
712 site = si->nextSite();
714 S += CodonSiteTools::meanNumberOfSynonymousPositions(*site, gc, ratio);
717 return static_cast<double>(n - S);
722 size_t St = 0, Sns = 0;
724 const Site* site = 0;
725 while (si->hasMoreSites())
727 site = si->nextSite();
728 St += CodonSiteTools::numberOfSubsitutions(*site, freqmin);
729 Sns += CodonSiteTools::numberOfNonSynonymousSubstitutions(*site, gc, freqmin);
739 const Site* site = 0;
740 while (si->hasMoreSites())
742 site = si->nextSite();
743 Sns += CodonSiteTools::numberOfNonSynonymousSubstitutions(*site, gc, freqmin);
754 const Site* siteIn = 0;
755 const Site* siteOut = 0;
756 const Site* siteCons = 0;
759 while (siIn->hasMoreSites())
761 siteIn = siIn->nextSite();
762 siteOut = siOut->nextSite();
763 siteCons = siCons->nextSite();
764 vector<size_t> v = CodonSiteTools::fixedDifferences(*siteIn, *siteOut, siteCons->getValue(0), siteCons->getValue(1), gc);
788 const Sequence* consensusIn = SiteContainerTools::getConsensus(*pscin,
"consensusIn");
789 const Sequence* consensusOut = SiteContainerTools::getConsensus(*pscout,
"consensusOut");
793 vector<size_t> u = SequenceStatistics::fixedDifferences(*pscin, *pscout, *consensus, gc);
795 v[0] = SequenceStatistics::nonSynonymousSubstitutionsNumber(*pscin, gc, freqmin);
796 v[1] = SequenceStatistics::synonymousSubstitutionsNumber(*pscin, gc, freqmin);
825 vector<size_t> v = SequenceStatistics::MKtable(ingroup, outgroup, gc, freqmin);
826 if (v[1] != 0 && v[2] != 0)
827 return static_cast<double>(v[0] * v[3]) / static_cast<double>(v[1] * v[2]);
838 double S =
static_cast<double>(polymorphicSiteNumber(psc, gapflag));
841 double tajima = tajima83(psc, gapflag);
842 double watterson = watterson75(psc, gapflag);
843 size_t n = psc.getNumberOfSequences();
844 map<string, double> values = getUsefullValues_(n);
847 return (tajima - watterson) / sqrt((values[
"e1"] * S) + (values[
"e2"] * S * (S - 1)));
852 double eta =
static_cast<double>(totNumberMutations(psc, gapflag));
855 double tajima = tajima83(psc, gapflag);
856 size_t n = psc.getNumberOfSequences();
857 map<string, double> values = getUsefullValues_(n);
858 double eta_a1 =
static_cast<double>(eta) / values[
"a1"];
859 return (tajima - eta_a1) / sqrt((values[
"e1"] * eta) + (values[
"e2"] * eta * (eta - 1)));
864 size_t n = ingroup.getNumberOfSequences();
865 map<string, double> values = getUsefullValues_(n);
866 double vD = getVD_(n, values[
"a1"], values[
"a2"], values[
"cn"]);
867 double uD = getUD_(values[
"a1"], vD);
868 double eta =
static_cast<double>(totNumberMutations(ingroup));
873 etae =
static_cast<double>(countSingleton(outgroup));
875 etae =
static_cast<double>(totMutationsExternalBranchs(ingroup, outgroup));
876 return (eta - (values[
"a1"] * etae)) / sqrt((uD * eta) + (vD * eta * eta));
881 size_t n = group.getNumberOfSequences();
882 double nn =
static_cast<double>(n);
883 double _n = nn / (nn - 1.);
884 map<string, double> values = getUsefullValues_(n);
885 double vDs = getVDstar_(n, values[
"a1"], values[
"a2"], values[
"dn"]);
886 double uDs = getUDstar_(n, values[
"a1"], vDs);
887 double eta =
static_cast<double>(totNumberMutations(group));
890 double etas =
static_cast<double>(countSingleton(group));
893 return ((_n * eta) - (values[
"a1"] * etas)) / sqrt(uDs * eta + vDs * eta * eta);
903 size_t n = ingroup.getNumberOfSequences();
904 double nn =
static_cast<double>(n);
905 map<string, double> values = getUsefullValues_(n);
906 double pi = tajima83(ingroup,
true);
907 double vF = (values[
"cn"] + values[
"b2"] - 2. / (nn - 1.)) / (pow(values[
"a1"], 2) + values[
"a2"]);
908 double uF = ((1. + values[
"b1"] - (4. * ((nn + 1.) / ((nn - 1.) * (nn - 1.)))) * (values[
"a1n"] - (2. * nn) / (nn + 1.))) / values[
"a1"]) - vF;
909 double eta =
static_cast<double>(totNumberMutations(ingroup));
914 etae =
static_cast<double>(countSingleton(outgroup));
916 etae =
static_cast<double>(totMutationsExternalBranchs(ingroup, outgroup));
917 return (pi - etae) / sqrt(uF * eta + vF * eta * eta);
922 double n =
static_cast<double>(group.getNumberOfSequences());
923 map<string, double> values = getUsefullValues_(group.getNumberOfSequences());
924 double pi = tajima83(group,
true);
931 double vFs = (((2 * n * n * n + 110 * n * n - 255 * n + 153) / (9 * n * n * (n - 1))) + ((2 * (n - 1) * values[
"a1"]) / (n * n)) - 8 * values[
"a2"] / n) / (pow(values[
"a1"], 2) + values[
"a2"]);
932 double uFs = (((4 * n * n + 19 * n + 3 - 12 * (n + 1) * values[
"a1n"]) / (3 * n * (n - 1))) / values[
"a1"]) - vFs;
933 double eta =
static_cast<double>(totNumberMutations(group));
936 double etas =
static_cast<double>(countSingleton(group));
939 return (pi - ((n - 1.) / n * etas)) / sqrt(uFs * eta + vFs * eta * eta);
944 vector<double> vdiff;
945 double piIntra1, piIntra2, meanPiIntra, piInter, Fst;
950 piIntra1 = SequenceStatistics::tajima83(*Pop1,
false);
951 piIntra2 = SequenceStatistics::tajima83(*Pop2,
false);
953 meanPiIntra = (piIntra1 + piIntra2) / 2;
963 vdiff.push_back(SiteContainerTools::computeSimilarity(s1, s2,
true,
"no gap",
true));
966 piInter = (VectorTools::sum(vdiff) / n) * static_cast<double>(psc.
getNumberOfSites());
969 Fst = 1.0 - meanPiIntra / piInter;
993 if (SiteTools::isComplete(psc.
getSite(i)) && !SiteTools::isConstant(psc.
getSite(i)) && !SiteTools::isTriplet(psc.
getSite(i)))
1000 if (SiteTools::isComplete(psc.
getSite(i)) && !SiteTools::isConstant(psc.
getSite(i)) && !SiteTools::isTriplet(psc.
getSite(i)) && !SiteTools::hasSingleton(psc.
getSite(i)))
1007 const SiteContainer* sc = SiteContainerTools::getSelectedSites(psc, ss);
1014 Site siteclone(site);
1015 bool deletesite =
false;
1016 map<int, double> freqs;
1017 SymbolListTools::getFrequencies(siteclone, freqs);
1019 for (map<int, double>::iterator it = freqs.begin(); it != freqs.end(); it++)
1021 if (it->second >= 0.5)
1026 if (freqs[site.getValue(j)] >= 0.5 && site.getValue(j) == first)
1028 if (freqs[site.getValue(j)] <= 1 - freqmin)
1030 siteclone.setElement(j, 1);
1031 first = site.getValue(j);
1038 if (freqs[site.getValue(j)] >= freqmin)
1039 siteclone.setElement(j, 0);
1059 for (
size_t i = 0; i < psc.getNumberOfSites(); i++)
1063 if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)))
1065 const Site& site = psc.getSite(i);
1066 bool deletesite =
false;
1067 map<int, double> freqs;
1068 SymbolListTools::getFrequencies(site, freqs);
1069 for (
int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1071 if (freqs[j] >= 1 - freqmin)
1080 if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i)))
1083 const Site& site = psc.getSite(i);
1084 bool deletesite =
false;
1085 map<int, double> freqs;
1086 SymbolListTools::getFrequencies(site, freqs);
1087 for (
int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1089 if (freqs[j] >= 1 - freqmin)
1099 throw DimensionException(
"SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2);
1101 for (
size_t i = 0; i < ss.size() - 1; i++)
1103 for (
size_t j = i + 1; j < ss.size(); j++)
1105 dist.push_back(static_cast<double>(ss[j] - ss[i]));
1114 for (
size_t i = 0; i < psc.getNumberOfSites(); i++)
1118 if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)))
1120 const Site& site = psc.getSite(i);
1121 bool deletesite =
false;
1122 map<int, double> freqs;
1123 SymbolListTools::getFrequencies(site, freqs);
1124 for (
int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1126 if (freqs[j] >= 1 - freqmin)
1135 if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i)))
1138 const Site& site = psc.getSite(i);
1139 bool deletesite =
false;
1140 map<int, double> freqs;
1141 SymbolListTools::getFrequencies(site, freqs);
1142 for (
int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1144 if (freqs[j] >= 1 - freqmin)
1152 size_t n = ss.size();
1154 throw DimensionException(
"SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2);
1155 Vdouble distance(n * (n - 1) / 2, 0);
1156 size_t nbsite = psc.getNumberOfSites();
1157 for (
size_t k = 0; k < psc.getNumberOfSequences(); k++)
1159 const Sequence& seq = psc.getSequence(k);
1160 SiteSelection gap, newss = ss;
1162 for (
size_t i = 0; i < nbsite; i++)
1168 for (
size_t i = 0; i < gap.size(); i++)
1170 for (
size_t j = 0; j < ss.size(); j++)
1176 for (
size_t i = 0; i < n - 1; i++)
1178 for (
size_t j = i + 1; j < n; j++)
1180 dist.push_back(static_cast<double>(newss[j] - newss[i]));
1185 distance = distance /
static_cast<double>(psc.getNumberOfSequences());
1196 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1198 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1199 for (
size_t i = 0; i < nbsite - 1; i++)
1201 for (
size_t j = i + 1; j < nbsite; j++)
1206 map<int, double> freq1;
1207 map<int, double> freq2;
1208 SymbolListTools::getFrequencies(site1, freq1);
1209 SymbolListTools::getFrequencies(site2, freq2);
1210 for (
size_t k = 0; k < nbseq; k++)
1212 if (site1.getValue(k) + site2.getValue(k) == 2)
1215 haplo = haplo /
static_cast<double>(nbseq);
1216 D.push_back(std::abs(haplo - freq1[1] * freq2[1]));
1229 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1231 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1232 for (
size_t i = 0; i < nbsite - 1; i++)
1234 for (
size_t j = i + 1; j < nbsite; j++)
1239 map<int, double> freq1;
1240 map<int, double> freq2;
1241 SymbolListTools::getFrequencies(site1, freq1);
1242 SymbolListTools::getFrequencies(site2, freq2);
1243 for (
size_t k = 0; k < nbseq; k++)
1245 if (site1.getValue(k) + site2.getValue(k) == 2)
1248 haplo = haplo /
static_cast<double>(nbseq);
1249 double d, D = (haplo - freq1[1] * freq2[1]);
1252 if (freq1[1] * freq2[0] <= freq1[0] * freq2[1])
1254 d = std::abs(D) / (freq1[1] * freq2[0]);
1258 d = std::abs(D) / (freq1[0] * freq2[1]);
1263 if (freq1[1] * freq2[1] <= freq1[0] * freq2[0])
1265 d = std::abs(D) / (freq1[1] * freq2[1]);
1269 d = std::abs(D) / (freq1[0] * freq2[0]);
1272 Dprime.push_back(d);
1285 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1287 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1288 for (
size_t i = 0; i < nbsite - 1; i++)
1290 for (
size_t j = i + 1; j < nbsite; j++)
1295 map<int, double> freq1;
1296 map<int, double> freq2;
1297 SymbolListTools::getFrequencies(site1, freq1);
1298 SymbolListTools::getFrequencies(site2, freq2);
1299 for (
size_t k = 0; k < nbseq; k++)
1301 if (site1.getValue(k) + site2.getValue(k) == 2)
1304 haplo = haplo /
static_cast<double>(nbseq);
1305 double r = ((haplo - freq1[1] * freq2[1]) * (haplo - freq1[1] * freq2[1])) / (freq1[0] * freq1[1] * freq2[0] * freq2[1]);
1318 Vdouble D = pairwiseD(psc, keepsingleton, freqmin);
1319 return VectorTools::mean<double, double>(D);
1326 Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin);
1327 return VectorTools::mean<double, double>(Dprime);
1339 Vdouble R2 = SequenceStatistics::pairwiseR2(psc, keepsingleton, freqmin);
1340 return VectorTools::mean<double, double>(R2);
1352 Vdouble dist = pairwiseDistances1(psc, keepsingleton, freqmin);
1353 return VectorTools::mean<double, double>(dist);
1365 Vdouble dist = pairwiseDistances2(psc, keepsingleton, freqmin);
1366 return VectorTools::mean<double, double>(dist);
1382 Vdouble D = pairwiseD(psc, keepsingleton, freqmin) - 1;
1385 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1387 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1388 return VectorTools::sum(D * dist) / VectorTools::sum(dist * dist);
1400 Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin) - 1;
1403 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1405 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1406 return VectorTools::sum(Dprime * dist) / VectorTools::sum(dist * dist);
1418 Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin) - 1;
1421 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1423 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1424 return VectorTools::sum(R2 * dist) / VectorTools::sum(dist * dist);
1436 Vdouble D = pairwiseD(psc, keepsingleton, freqmin);
1440 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1442 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1443 reg[0] = VectorTools::cov<double, double>(dist, D) / VectorTools::var<double, double>(dist);
1444 reg[1] = VectorTools::mean<double, double>(D) - reg[0] * VectorTools::mean<double, double>(dist);
1457 Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin);
1461 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1463 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1464 reg[0] = VectorTools::cov<double, double>(dist, Dprime) / VectorTools::var<double, double>(dist);
1465 reg[1] = VectorTools::mean<double, double>(Dprime) - reg[0] * VectorTools::mean<double, double>(dist);
1478 Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin);
1482 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1484 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1485 reg[0] = VectorTools::cov<double, double>(dist, R2) / VectorTools::var<double, double>(dist);
1486 reg[1] = VectorTools::mean<double, double>(R2) - reg[0] * VectorTools::mean<double, double>(dist);
1499 Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin);
1501 Vdouble R2transformed = unit / R2 - 1;
1504 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1506 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1507 return VectorTools::sum(R2transformed * dist) / VectorTools::sum(dist * dist);
1521 double left = leftHandHudson_(psc);
1526 if (SequenceStatistics::polymorphicSiteNumber(psc) < 2)
1528 if (rightHandHudson_(c1, n) < left)
1530 if (rightHandHudson_(c2, n) > left)
1532 while (dif > precision)
1534 if (rightHandHudson_((c1 + c2) / 2, n) > left)
1538 dif = std::abs(2 * (c1 - c2) / (c1 + c2));
1540 return (c1 + c2) / 2;
1547 void SequenceStatistics::testUsefullValues(std::ostream& s,
size_t n)
1549 map<string, double> v = getUsefullValues_(n);
1550 double vD = getVD_(n, v[
"a1"], v[
"a2"], v[
"cn"]);
1551 double uD = getUD_(v[
"a1"], vD);
1552 double vDs = getVDstar_(n, v[
"a1"], v[
"a2"], v[
"dn"]);
1553 double uDs = getUDstar_(n, v[
"a1"], vDs);
1556 s << v[
"a1"] <<
"\t";
1557 s << v[
"a2"] <<
"\t";
1558 s << v[
"a1n"] <<
"\t";
1559 s << v[
"b1"] <<
"\t";
1560 s << v[
"b2"] <<
"\t";
1561 s << v[
"c1"] <<
"\t";
1562 s << v[
"c2"] <<
"\t";
1563 s << v[
"cn"] <<
"\t";
1564 s << v[
"dn"] <<
"\t";
1565 s << v[
"e1"] <<
"\t";
1566 s << v[
"e2"] <<
"\t";
1577 size_t SequenceStatistics::getMutationNumber_(
const Site& site)
1579 size_t tmp_count = 0;
1580 map<int, size_t> states_count;
1581 SymbolListTools::getCounts(site, states_count);
1583 for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1593 size_t SequenceStatistics::getSingletonNumber_(
const Site& site)
1596 map<int, size_t> states_count;
1597 SymbolListTools::getCounts(site, states_count);
1598 for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1600 if (it->second == 1)
1606 size_t SequenceStatistics::getDerivedSingletonNumber_(
const Site& site_in,
const Site& site_out)
1609 map<int, size_t> states_count;
1610 map<int, size_t> outgroup_states_count;
1611 SymbolListTools::getCounts(site_in, states_count);
1612 SymbolListTools::getCounts(site_out, outgroup_states_count);
1614 if (outgroup_states_count.size() == 1)
1616 for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1618 if (it->second == 1)
1620 if (outgroup_states_count.find(it->first) == outgroup_states_count.end())
1628 std::map<std::string, double> SequenceStatistics::getUsefullValues_(
size_t n)
1630 double nn =
static_cast<double>(n);
1631 map<string, double> values;
1645 for (
double i = 1; i < nn; i++)
1647 values[
"a1"] += 1. / i;
1648 values[
"a2"] += 1. / (i * i);
1650 values[
"a1n"] = values[
"a1"] + (1. / nn);
1651 values[
"b1"] = (nn + 1.) / (3. * (nn - 1.));
1652 values[
"b2"] = 2. * ((nn * nn) + nn + 3.) / (9. * nn * (nn - 1.));
1653 values[
"c1"] = values[
"b1"] - (1. / values[
"a1"]);
1654 values[
"c2"] = values[
"b2"] - ((nn + 2.) / (values[
"a1"] * nn)) + (values[
"a2"] / (values[
"a1"] * values[
"a1"]));
1662 values[
"cn"] = 2. * ((nn * values[
"a1"]) - (2. * (nn - 1.))) / ((nn - 1.) * (nn - 2.));
1665 + ((nn - 2.) / ((nn - 1.) * (nn - 1.)))
1667 * ((3. / 2.) - (((2. * values[
"a1n"]) - 3.) / (nn - 2.)) - (1. / nn));
1669 values[
"e1"] = values[
"c1"] / values[
"a1"];
1670 values[
"e2"] = values[
"c2"] / ((values[
"a1"] * values[
"a1"]) + values[
"a2"]);
1675 double SequenceStatistics::getVD_(
size_t n,
double a1,
double a2,
double cn)
1677 double nn =
static_cast<double>(n);
1680 double vD = 1. + ((a1 * a1) / (a2 + (a1 * a1))) * (cn - ((nn + 1.) / (nn - 1.)));
1684 double SequenceStatistics::getUD_(
double a1,
double vD)
1686 return a1 - 1. - vD;
1689 double SequenceStatistics::getVDstar_(
size_t n,
double a1,
double a2,
double dn)
1691 double denom = (a1 * a1) + a2;
1692 if (n < 3 || denom == 0.)
1694 double nn =
static_cast<double>(n);
1695 double nnn = nn / (nn - 1.);
1700 - (2. * (nn * a1 * (a1 + 1)) / ((nn - 1.) * (nn - 1.)))
1717 double SequenceStatistics::getUDstar_(
size_t n,
double a1,
double vDs)
1721 double nn =
static_cast<double>(n);
1722 double nnn = nn / (nn - 1.);
1724 double uDs = (nnn * (a1 - nnn)) - vDs;
1738 for (
size_t i = 0; i < nbseq - 1; i++)
1740 for (
size_t j = i + 1; j < nbseq; j++)
1742 SequenceSelection ss(2);
1746 S1 += SequenceStatistics::watterson75(*psc2,
true);
1747 S2 += SequenceStatistics::watterson75(*psc2,
true) * SequenceStatistics::watterson75(*psc2,
true);
1751 double Sk = (2 * S2 - pow(2 * S1 / static_cast<double>(nbseq), 2.)) / pow(nbseq, 2.);
1752 double H = SequenceStatistics::heterozygosity(*newpsc);
1753 double H2 = SequenceStatistics::squaredHeterozygosity(*newpsc);
1755 return static_cast<double>(Sk - H + H2) / pow(H * static_cast<double>(nbseq) /
static_cast<double>(nbseq - 1), 2.);
1758 double SequenceStatistics::rightHandHudson_(
double c,
size_t n)
1760 double nn =
static_cast<double>(n);
1761 return 1. / (97. * pow(c, 2.) * pow(nn, 3.)) * ((nn - 1.) * (97. * (c * (4. + (c - 2. * nn) * nn) + (-2. * (7. + c) + 4. * nn + (c - 1.) * pow(nn, 2.)) * log((18. + c * (13. + c)) / 18.)) + sqrt(97.) * (110. + nn * (49. * nn - 52.) + c * (2. + nn * (15. * nn - 8.))) * log(-1. + (72. + 26. * c) / (36. + 13. * c - c * sqrt(97.)))));