bpp-popgen  2.1.0
 All Classes Namespaces Files Functions Variables Friends Pages
SequenceStatistics.cpp
Go to the documentation of this file.
1 //
2 // File SequenceStatistics.cpp
3 // Authors: Eric Bazin
4 // Sylvain Gailard
5 // Khalid Belkhir
6 // Benoit Nabholz
7 // Created on: Wed Aug 04 2004
8 //
9 
10 /*
11  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
12 
13  This software is a computer program whose purpose is to provide classes
14  for population genetics analysis.
15 
16  This software is governed by the CeCILL license under French law and
17  abiding by the rules of distribution of free software. You can use,
18  modify and/ or redistribute the software under the terms of the CeCILL
19  license as circulated by CEA, CNRS and INRIA at the following URL
20  "http://www.cecill.info".
21 
22  As a counterpart to the access to the source code and rights to copy,
23  modify and redistribute granted by the license, users are provided only
24  with a limited warranty and the software's author, the holder of the
25  economic rights, and the successive licensors have only limited
26  liability.
27 
28  In this respect, the user's attention is drawn to the risks associated
29  with loading, using, modifying and/or developing or reproducing the
30  software by the user in light of its specific status of free software,
31  that may mean that it is complicated to manipulate, and that also
32  therefore means that it is reserved for developers and experienced
33  professionals having in-depth computer knowledge. Users are therefore
34  encouraged to load and test the software's suitability as regards their
35  requirements in conditions enabling the security of their systems and/or
36  data to be ensured and, more generally, to use and operate it in the
37  same conditions as regards security.
38 
39  The fact that you are presently reading this means that you have had
40  knowledge of the CeCILL license and that you accept its terms.
41  */
42 
43 #include "SequenceStatistics.h" // class's header file
46 
47 // From the STL:
48 #include <ctype.h>
49 #include <cmath>
50 #include <iostream>
51 #include <vector>
52 
53 using namespace std;
54 
55 // From SeqLib:
56 #include <Bpp/Seq/Site.h>
57 #include <Bpp/Seq/SiteTools.h>
59 #include <Bpp/Seq/CodonSiteTools.h>
60 #include <Bpp/Seq/Alphabet/DNA.h>
63 
66 
67 using namespace bpp;
68 
69 // ******************************************************************************
70 // Basic statistics
71 // ******************************************************************************
72 
73 size_t SequenceStatistics::polymorphicSiteNumber(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown)
74 {
75  size_t S = 0;
76  const Site* site = 0;
77  ConstSiteIterator* si = 0;
78  if (gapflag)
79  si = new CompleteSiteContainerIterator(psc);
80  else
81  si = new SimpleSiteContainerIterator(psc);
82  while (si->hasMoreSites())
83  {
84  site = si->nextSite();
85  if (!SiteTools::isConstant(*site, ignoreUnknown))
86  {
87  S++;
88  }
89  }
90  delete si;
91  return S;
92 }
93 
94 size_t SequenceStatistics::parsimonyInformativeSiteNumber(const PolymorphismSequenceContainer& psc, bool gapflag)
95 {
96  ConstSiteIterator* si = 0;
97  if (gapflag)
98  si = new CompleteSiteContainerIterator(psc);
99  else
100  si = new SimpleSiteContainerIterator(psc);
101  size_t S = 0;
102  const Site* site = 0;
103  while (si->hasMoreSites())
104  {
105  site = si->nextSite();
106  if (SiteTools::isParsimonyInformativeSite(*site))
107  {
108  S++;
109  }
110  }
111  delete si;
112  return S;
113 }
114 
115 size_t SequenceStatistics::countSingleton(const PolymorphismSequenceContainer& psc, bool gapflag)
116 {
117  size_t nus = 0;
118  const Site* site = 0;
119  ConstSiteIterator* si = 0;
120  if (gapflag)
121  si = new CompleteSiteContainerIterator(psc);
122  else
123  si = new SimpleSiteContainerIterator(psc);
124  while (si->hasMoreSites())
125  {
126  site = si->nextSite();
127  nus += getSingletonNumber_(*site);
128  }
129  delete si;
130  return nus;
131 }
132 
133 size_t SequenceStatistics::tripletNumber(const PolymorphismSequenceContainer& psc, bool gapflag)
134 {
135  ConstSiteIterator* si = 0;
136  if (gapflag)
137  si = new CompleteSiteContainerIterator(psc);
138  else
139  si = new SimpleSiteContainerIterator(psc);
140  int S = 0;
141  const Site* site = 0;
142  while (si->hasMoreSites())
143  {
144  site = si->nextSite();
145  if (SiteTools::isTriplet(*site))
146  {
147  S++;
148  }
149  }
150 
151  delete si;
152  return S;
153 }
154 
155 size_t SequenceStatistics::totNumberMutations(const PolymorphismSequenceContainer& psc, bool gapflag)
156 {
157  size_t tnm = 0;
158  const Site* site = 0;
159  ConstSiteIterator* si = 0;
160  if (gapflag)
161  si = new CompleteSiteContainerIterator(psc);
162  else
163  si = new SimpleSiteContainerIterator(psc);
164  while (si->hasMoreSites())
165  {
166  site = si->nextSite();
167  tnm += getMutationNumber_(*site);
168  }
169  delete si;
170  return tnm;
171 }
172 
173 size_t SequenceStatistics::totMutationsExternalBranchs(
175  const PolymorphismSequenceContainer& outg) throw (Exception)
176 {
177  if (ing.getNumberOfSites() != outg.getNumberOfSites())
178  throw Exception("ing and outg must have the same size");
179  size_t nmuts = 0;
180  const Site* site_in = 0;
181  const Site* site_out = 0;
182  ConstSiteIterator* si = 0;
183  ConstSiteIterator* so = 0;
184  si = new SimpleSiteContainerIterator(ing);
185  so = new SimpleSiteContainerIterator(outg);
186  while (si->hasMoreSites())
187  {
188  site_in = si->nextSite();
189  site_out = so->nextSite();
190  // use fully resolved sites
191  if (SiteTools::isComplete(*site_in) && SiteTools::isComplete(*site_out))
192  nmuts += getDerivedSingletonNumber_(*site_in, *site_out); // singletons that are not in outgroup
193  }
194  delete si;
195  delete so;
196  return nmuts;
197 }
198 
199 double SequenceStatistics::heterozygosity(const PolymorphismSequenceContainer& psc, bool gapflag)
200 {
201  ConstSiteIterator* si = 0;
202  const Site* site = 0;
203  if (gapflag)
204  si = new CompleteSiteContainerIterator(psc);
205  else
206  si = new SimpleSiteContainerIterator(psc);
207  double S = 0;
208  while (si->hasMoreSites())
209  {
210  site = si->nextSite();
211  S += SiteTools::heterozygosity(*site);
212  }
213  delete si;
214  return S;
215 }
216 
217 double SequenceStatistics::squaredHeterozygosity(const PolymorphismSequenceContainer& psc, bool gapflag)
218 {
219  ConstSiteIterator* si = 0;
220  const Site* site = 0;
221  if (gapflag)
222  si = new CompleteSiteContainerIterator(psc);
223  else
224  si = new SimpleSiteContainerIterator(psc);
225  double S = 0;
226  while (si->hasMoreSites())
227  {
228  site = si->nextSite();
229  double h = SiteTools::heterozygosity(*site);
230  S += h * h;
231  }
232  delete si;
233  return S;
234 }
235 
236 // ******************************************************************************
237 // GC statistics
238 // ******************************************************************************
239 
240 double SequenceStatistics::gcContent(const PolymorphismSequenceContainer& psc)
241 {
242  map<int, double> freqs;
243  SequenceContainerTools::getFrequencies(psc, freqs);
244  const Alphabet* alpha = psc.getAlphabet();
245  return (freqs[alpha->charToInt("C")] + freqs[alpha->charToInt("G")]) / (freqs[alpha->charToInt("A")] + freqs[alpha->charToInt("C")] + freqs[alpha->charToInt("G")] + freqs[alpha->charToInt("T")]);
246 }
247 
248 std::vector<size_t> SequenceStatistics::gcPolymorphism(const PolymorphismSequenceContainer& psc, bool gapflag)
249 {
250  size_t nbMut = 0;
251  size_t nbGC = 0;
252  const size_t nbSeq = psc.getNumberOfSequences();
253  vector<size_t> vect(2);
254  const Site* site = 0;
255  ConstSiteIterator* si = 0;
256  if (gapflag)
257  si = new CompleteSiteContainerIterator(psc);
258  else
259  si = new NoGapSiteContainerIterator(psc);
260  while (si->hasMoreSites())
261  {
262  site = si->nextSite();
263  if (!SiteTools::isConstant(*site))
264  {
265  long double freqGC = SymbolListTools::getGCContent(*site);
266  /*
267  * Sylvain Gaillard 15/03/2010: realy unclear ...
268  * freqGC is always in [0,1] then why testing it ?
269  * why casting double into size_t ?
270  * is that method used by someone ?
271  */
272  if (freqGC > 0 && freqGC < 1)
273  {
274  nbMut += static_cast<size_t>(nbSeq);
275  long double adGC = freqGC * nbSeq;
276  nbGC += static_cast<size_t>(adGC);
277  }
278  }
279  }
280  vect[0] = nbMut;
281  vect[1] = nbGC;
282  delete si;
283  return vect;
284 }
285 
286 // ******************************************************************************
287 // Diversity statistics
288 // ******************************************************************************
289 
290 double SequenceStatistics::watterson75(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown)
291 {
292  double ThetaW;
293  size_t n = psc.getNumberOfSequences();
294  size_t S = polymorphicSiteNumber(psc, gapflag, ignoreUnknown);
295  map<string, double> values = getUsefullValues_(n);
296  ThetaW = (double) S / values["a1"];
297  return ThetaW;
298 }
299 
300 double SequenceStatistics::tajima83(const PolymorphismSequenceContainer& psc, bool gapflag)
301 {
302  size_t alphabet_size = (psc.getAlphabet())->getSize();
303  const Site* site = 0;
304  ConstSiteIterator* si = 0;
305  double value2 = 0.;
306  if (gapflag)
307  si = new CompleteSiteContainerIterator(psc);
308  else
309  si = new SimpleSiteContainerIterator(psc);
310  while (si->hasMoreSites())
311  {
312  site = si->nextSite();
313  if (!SiteTools::isConstant(*site))
314  {
315  double value = 0.;
316  map<int, size_t> count;
317  SymbolListTools::getCounts(*site, count);
318  map<int, size_t> tmp_k;
319  size_t tmp_n = 0;
320  for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++)
321  {
322  if (it->first >= 0 && it->first < static_cast<int>(alphabet_size))
323  {
324  tmp_k[it->first] = it->second * (it->second - 1);
325  tmp_n += it->second;
326  }
327  }
328  if (tmp_n == 0 || tmp_n == 1)
329  continue;
330  for (map<int, size_t>::iterator it = tmp_k.begin(); it != tmp_k.end(); it++)
331  {
332  value += static_cast<double>(it->second) / static_cast<double>(tmp_n * (tmp_n - 1));
333  }
334  value2 += 1. - value;
335  }
336  }
337  delete si;
338  return value2;
339 }
340 
341 double SequenceStatistics::FayWu2000(const PolymorphismSequenceContainer& psc, const Sequence& ancestralSites)
342 {
343  if (psc.getNumberOfSites() != ancestralSites.size())
344  throw Exception("SequenceStatistics::FayWu2000: ancestralSites and psc don't have the same size!!!'" );
345 
346  const Sequence& tmps = psc.getSequence(0);
347 
348  size_t alphabet_size = (psc.getAlphabet())->getSize();
349  double value = 0.;
350  for (size_t i = 0; i < psc.getNumberOfSites(); i++)
351  {
352  const Site& site = psc.getSite(i);
353  string ancB = ancestralSites.getChar(i);
354  int ancV = ancestralSites.getValue(i);
355 
356  if (!SiteTools::isConstant(site) || tmps.getChar(i) != ancB)
357  {
358  if (ancV < 0)
359  continue;
360 
361  map<int, size_t> count;
362  SymbolListTools::getCounts(site, count);
363  map<int, size_t> tmp_k;
364  size_t tmp_n = 0;
365  for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++)
366  {
367  if (it->first >= 0 && it->first < static_cast<int>(alphabet_size))
368  {
369  /* if derived allele */
370  if (it->first != ancV)
371  {
372  tmp_k[it->first] = 2 * it->second * it->second;
373  }
374  tmp_n += it->second;
375  }
376  }
377  if (tmp_n == 0 || tmp_n == 1)
378  continue;
379  for (map<int, size_t>::iterator it = tmp_k.begin(); it != tmp_k.end(); it++)
380  {
381  value += static_cast<double>(it->second) / static_cast<double>(tmp_n * (tmp_n - 1));
382  }
383  }
384  }
385  return value;
386 }
387 
388 size_t SequenceStatistics::DVK(const PolymorphismSequenceContainer& psc, bool gapflag)
389 {
390  /*
391  * Sylvain Gaillard 17/03/2010:
392  * This implementation uses unneeded SequenceContainer recopy and works on
393  * string. It needs to be improved.
394  */
396  if (gapflag)
397  sc = PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc);
398  else
399  sc = new PolymorphismSequenceContainer(psc);
400  // int K = 0;
401  vector<string> pscvector;
402  pscvector.push_back(sc->toString(0));
403  // K++;
404  for (size_t i = 1; i < sc->getNumberOfSequences(); i++)
405  {
406  bool uniq = true;
407  string query = sc->toString(i);
408  for (vector<string>::iterator it = pscvector.begin(); it != pscvector.end(); it++)
409  {
410  if (query.compare(*it) == 0)
411  {
412  uniq = false;
413  break;
414  }
415  }
416  if (uniq)
417  {
418  // K++;
419  pscvector.push_back(query);
420  }
421  }
422  delete sc;
423  // return K;
424  return pscvector.size();
425 }
426 
427 double SequenceStatistics::DVH(const PolymorphismSequenceContainer& psc, bool gapflag)
428 {
429  /*
430  * Sylvain Gaillard 17/03/2010:
431  * This implementation uses unneeded SequenceContainer recopy and works on
432  * string. It needs to be improved.
433  */
435  if (gapflag)
436  sc = PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc);
437  else
438  sc = new PolymorphismSequenceContainer(psc);
439  double H = 0.;
440  size_t nbSeq;
441  vector<string> pscvector;
442  vector<size_t> effvector;
443  pscvector.push_back(sc->toString(0));
444  effvector.push_back(sc->getSequenceCount(0));
445  nbSeq = sc->getSequenceCount(0);
446  for (size_t i = 1; i < sc->getNumberOfSequences(); i++)
447  {
448  nbSeq += sc->getSequenceCount(i);
449  bool uniq = true;
450  string query = sc->toString(i);
451  for (size_t j = 0; j < pscvector.size(); j++)
452  {
453  if (query.compare(pscvector[j]) == 0)
454  {
455  effvector[j] += sc->getSequenceCount(i);
456  uniq = false;
457  break;
458  }
459  }
460  if (uniq)
461  {
462  pscvector.push_back(query);
463  effvector.push_back(sc->getSequenceCount(i));
464  }
465  }
466  for (size_t i = 0; i < effvector.size(); i++)
467  {
468  H -= (static_cast<double>(effvector[i]) / static_cast<double>(nbSeq)) * ( static_cast<double>(effvector[i]) / static_cast<double>(nbSeq));
469  }
470  H += 1.;
471  delete sc;
472  return H;
473 }
474 
475 size_t SequenceStatistics::getNumberOfTransitions(const PolymorphismSequenceContainer& psc)
476 {
477  size_t nbT = 0;
479  const Site* site = 0;
480  while (si->hasMoreSites())
481  {
482  site = si->nextSite();
483  // if (SiteTools::isConstant(*site) || SiteTools::isTriplet(*site)) continue;
484  if (SiteTools::getNumberOfDistinctCharacters(*site) != 2)
485  continue;
486  vector<int> seq = site->getContent();
487  int state1 = seq[0];
488  int state2 = seq[0];
489  for (size_t i = 1; i < seq.size(); i++)
490  {
491  if (state1 != seq[i])
492  {
493  state2 = seq[i];
494  break;
495  }
496  }
497  if (((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) ||
498  ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1)))
499  {
500  nbT++;
501  }
502  }
503  delete si;
504  return nbT;
505 }
506 
507 size_t SequenceStatistics::getNumberOfTransversions(const PolymorphismSequenceContainer& psc)
508 {
509  size_t nbTv = 0;
511  const Site* site = 0;
512  while (si->hasMoreSites())
513  {
514  site = si->nextSite();
515  // if (SiteTools::isConstant(*site) || SiteTools::isTriplet(*site)) continue;
516  if (SiteTools::getNumberOfDistinctCharacters(*site) != 2)
517  continue;
518  vector<int> seq = site->getContent();
519  int state1 = seq[0];
520  int state2 = seq[0];
521  for (size_t i = 1; i < seq.size(); i++)
522  {
523  if (state1 != seq[i])
524  {
525  state2 = seq[i];
526  break;
527  }
528  }
529  if (!(((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) ||
530  ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1))))
531  {
532  nbTv++;
533  }
534  }
535  delete si;
536  return nbTv;
537 }
538 
539 double SequenceStatistics::getTransitionsTransversionsRatio(const PolymorphismSequenceContainer& psc) throw (Exception)
540 {
541  // return (double) getNumberOfTransitions(psc)/getNumberOfTransversions(psc);
542  size_t nbT = 0;
543  size_t nbTv = 0;
545  const Site* site = 0;
546  vector< int > state(2);
547  while (si->hasMoreSites())
548  {
549  map<int, size_t> count;
550  site = si->nextSite();
551  SymbolListTools::getCounts(*site, count);
552  if (count.size() != 2)
553  continue;
554  int i = 0;
555  for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++)
556  {
557  state[i] = it->first;
558  i++;
559  }
560  if (((state[0] == 0 && state[1] == 2) || (state[0] == 2 && state[1] == 0)) ||
561  ((state[0] == 1 && state[1] == 3) || (state[0] == 3 && state[1] == 1)))
562  {
563  nbT++; // transitions
564  }
565  else
566  {
567  nbTv++; // transversion
568  }
569  }
570  delete si;
571  if (nbTv == 0)
572  throw ZeroDivisionException("SequenceStatistics::getTransitionsTransversionsRatio.");
573  return static_cast<double>(nbT) / static_cast<double>(nbTv);
574 }
575 
576 // ******************************************************************************
577 // Synonymous and non-synonymous polymorphism
578 // ******************************************************************************
579 
580 size_t SequenceStatistics::stopCodonSiteNumber(const PolymorphismSequenceContainer& psc, bool gapflag)
581 {
582  /*
583  * Sylvain Gaillard 17/03/2010
584  * What if the Alphabet is not a codon alphabet?
585  */
586  ConstSiteIterator* si = 0;
587  if (gapflag)
588  si = new NoGapSiteContainerIterator(psc);
589  else
590  si = new SimpleSiteContainerIterator(psc);
591  size_t S = 0;
592  const Site* site = 0;
593  while (si->hasMoreSites())
594  {
595  site = si->nextSite();
596  if (CodonSiteTools::hasStop(*site))
597  S++;
598  }
599  delete si;
600  return S;
601 }
602 
603 size_t SequenceStatistics::monoSitePolymorphicCodonNumber(const PolymorphismSequenceContainer& psc, bool stopflag, bool gapflag)
604 {
605  ConstSiteIterator* si = 0;
606  if (stopflag)
607  si = new CompleteSiteContainerIterator(psc);
608  else
609  {
610  if (gapflag)
611  si = new NoGapSiteContainerIterator(psc);
612  else
613  si = new SimpleSiteContainerIterator(psc);
614  }
615  size_t S = 0;
616  const Site* site;
617  while (si->hasMoreSites())
618  {
619  site = si->nextSite();
620  if (CodonSiteTools::isMonoSitePolymorphic(*site))
621  S++;
622  }
623  delete si;
624  return S;
625 }
626 
627 size_t SequenceStatistics::synonymousPolymorphicCodonNumber(const PolymorphismSequenceContainer& psc, const GeneticCode& gc)
628 {
630  size_t S = 0;
631  const Site* site;
632  while (si->hasMoreSites())
633  {
634  site = si->nextSite();
635  if (CodonSiteTools::isSynonymousPolymorphic(*site, gc))
636  S++;
637  }
638  delete si;
639  return S;
640 }
641 
642 double SequenceStatistics::watterson75Synonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc)
643 {
644  double ThetaW = 0.;
645  size_t n = psc.getNumberOfSequences();
646  size_t S = synonymousSubstitutionsNumber(psc, gc);
647  map<string, double> values = getUsefullValues_(n);
648  ThetaW = static_cast<double>(S) / values["a1"];
649  return ThetaW;
650 }
651 
652 double SequenceStatistics::watterson75NonSynonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc)
653 {
654  double ThetaW;
655  size_t n = psc.getNumberOfSequences();
656  size_t S = nonSynonymousSubstitutionsNumber(psc, gc);
657  map<string, double> values = getUsefullValues_(n);
658  ThetaW = static_cast<double>(S) / values["a1"];
659  return ThetaW;
660 }
661 
662 double SequenceStatistics::piSynonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, bool minchange)
663 {
664  double S = 0.;
666  const Site* site = 0;
667  while (si->hasMoreSites())
668  {
669  site = si->nextSite();
670  S += CodonSiteTools::piSynonymous(*site, gc, minchange);
671  }
672  delete si;
673  return S;
674 }
675 
676 double SequenceStatistics::piNonSynonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, bool minchange)
677 {
678  double S = 0.;
680  const Site* site = 0;
681  while (si->hasMoreSites())
682  {
683  site = si->nextSite();
684  S += CodonSiteTools::piNonSynonymous(*site, gc, minchange);
685  }
686  delete si;
687  return S;
688 }
689 
690 double SequenceStatistics::meanSynonymousSitesNumber(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double ratio)
691 {
692  double S = 0.;
694  const Site* site = 0;
695  while (si->hasMoreSites())
696  {
697  site = si->nextSite();
698  S += CodonSiteTools::meanNumberOfSynonymousPositions(*site, gc, ratio);
699  }
700  delete si;
701  return S;
702 }
703 
704 double SequenceStatistics::meanNonSynonymousSitesNumber(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double ratio)
705 {
706  double S = 0.;
707  int n = 0;
709  const Site* site = 0;
710  while (si->hasMoreSites())
711  {
712  site = si->nextSite();
713  n = n + 3;
714  S += CodonSiteTools::meanNumberOfSynonymousPositions(*site, gc, ratio);
715  }
716  delete si;
717  return static_cast<double>(n - S);
718 }
719 
720 size_t SequenceStatistics::synonymousSubstitutionsNumber(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double freqmin)
721 {
722  size_t St = 0, Sns = 0;
724  const Site* site = 0;
725  while (si->hasMoreSites())
726  {
727  site = si->nextSite();
728  St += CodonSiteTools::numberOfSubsitutions(*site, freqmin);
729  Sns += CodonSiteTools::numberOfNonSynonymousSubstitutions(*site, gc, freqmin);
730  }
731  delete si;
732  return St - Sns;
733 }
734 
735 size_t SequenceStatistics::nonSynonymousSubstitutionsNumber(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double freqmin)
736 {
737  size_t Sns = 0;
739  const Site* site = 0;
740  while (si->hasMoreSites())
741  {
742  site = si->nextSite();
743  Sns += CodonSiteTools::numberOfNonSynonymousSubstitutions(*site, gc, freqmin);
744  }
745  delete si;
746  return Sns;
747 }
748 
749 vector<size_t> SequenceStatistics::fixedDifferences(const PolymorphismSequenceContainer& pscin, const PolymorphismSequenceContainer& pscout, PolymorphismSequenceContainer& psccons, const GeneticCode& gc)
750 {
753  ConstSiteIterator* siCons = new CompleteSiteContainerIterator(psccons);
754  const Site* siteIn = 0;
755  const Site* siteOut = 0;
756  const Site* siteCons = 0;
757  size_t NfixS = 0;
758  size_t NfixA = 0;
759  while (siIn->hasMoreSites())
760  {
761  siteIn = siIn->nextSite();
762  siteOut = siOut->nextSite();
763  siteCons = siCons->nextSite();
764  vector<size_t> v = CodonSiteTools::fixedDifferences(*siteIn, *siteOut, siteCons->getValue(0), siteCons->getValue(1), gc);
765  NfixS += v[0];
766  NfixA += v[1];
767  }
768  vector<size_t> v(2);
769  v[0] = NfixS;
770  v[1] = NfixA;
771  delete siIn;
772  delete siOut;
773  delete siCons;
774  return v;
775 }
776 
777 vector<size_t> SequenceStatistics::MKtable(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, const GeneticCode& gc, double freqmin)
778 {
779  PolymorphismSequenceContainer psctot(ingroup);
780  for (size_t i = 0; i < outgroup.getNumberOfSequences(); i++)
781  {
782  psctot.addSequence(outgroup.getSequence(i));
783  psctot.setAsOutgroupMember(i + ingroup.getNumberOfSequences());
784  }
785  const PolymorphismSequenceContainer* psccomplet = PolymorphismSequenceContainerTools::getCompleteSites(psctot);
786  const PolymorphismSequenceContainer* pscin = PolymorphismSequenceContainerTools::extractIngroup(*psccomplet);
787  const PolymorphismSequenceContainer* pscout = PolymorphismSequenceContainerTools::extractOutgroup(*psccomplet);
788  const Sequence* consensusIn = SiteContainerTools::getConsensus(*pscin, "consensusIn");
789  const Sequence* consensusOut = SiteContainerTools::getConsensus(*pscout, "consensusOut");
791  consensus->addSequence(*consensusIn);
792  consensus->addSequence(*consensusOut);
793  vector<size_t> u = SequenceStatistics::fixedDifferences(*pscin, *pscout, *consensus, gc);
794  vector<size_t> v(4);
795  v[0] = SequenceStatistics::nonSynonymousSubstitutionsNumber(*pscin, gc, freqmin);
796  v[1] = SequenceStatistics::synonymousSubstitutionsNumber(*pscin, gc, freqmin);
797  v[2] = u[1];
798  v[3] = u[0];
799  delete consensus;
800  if (psccomplet)
801  {
802  delete psccomplet;
803  }
804  if (pscin)
805  {
806  delete pscin;
807  }
808  if (pscout)
809  {
810  delete pscout;
811  }
812  if (consensusIn)
813  {
814  delete consensusIn;
815  }
816  if (consensusOut)
817  {
818  delete consensusOut;
819  }
820  return v;
821 }
822 
823 double SequenceStatistics::neutralityIndex(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, const GeneticCode& gc, double freqmin)
824 {
825  vector<size_t> v = SequenceStatistics::MKtable(ingroup, outgroup, gc, freqmin);
826  if (v[1] != 0 && v[2] != 0)
827  return static_cast<double>(v[0] * v[3]) / static_cast<double>(v[1] * v[2]);
828  else
829  return -1;
830 }
831 
832 // ******************************************************************************
833 // Statistical tests
834 // ******************************************************************************
835 
836 double SequenceStatistics::tajimaDSS(const PolymorphismSequenceContainer& psc, bool gapflag) throw (ZeroDivisionException)
837 {
838  double S = static_cast<double>(polymorphicSiteNumber(psc, gapflag));
839  if (!S)
840  throw ZeroDivisionException("S should not be null");
841  double tajima = tajima83(psc, gapflag);
842  double watterson = watterson75(psc, gapflag);
843  size_t n = psc.getNumberOfSequences();
844  map<string, double> values = getUsefullValues_(n);
845  // if (S == 0)
846  // cout << "ARG S == 0" << endl;
847  return (tajima - watterson) / sqrt((values["e1"] * S) + (values["e2"] * S * (S - 1)));
848 }
849 
850 double SequenceStatistics::tajimaDTNM(const PolymorphismSequenceContainer& psc, bool gapflag) throw (ZeroDivisionException)
851 {
852  double eta = static_cast<double>(totNumberMutations(psc, gapflag));
853  if (!eta)
854  throw ZeroDivisionException("eta should not be null");
855  double tajima = tajima83(psc, gapflag);
856  size_t n = psc.getNumberOfSequences();
857  map<string, double> values = getUsefullValues_(n);
858  double eta_a1 = static_cast<double>(eta) / values["a1"];
859  return (tajima - eta_a1) / sqrt((values["e1"] * eta) + (values["e2"] * eta * (eta - 1)));
860 }
861 
862 double SequenceStatistics::fuliD(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, bool original) throw (ZeroDivisionException)
863 {
864  size_t n = ingroup.getNumberOfSequences();
865  map<string, double> values = getUsefullValues_(n);
866  double vD = getVD_(n, values["a1"], values["a2"], values["cn"]);
867  double uD = getUD_(values["a1"], vD);
868  double eta = static_cast<double>(totNumberMutations(ingroup));
869  if (eta == 0.)
870  throw ZeroDivisionException("eta should not be null");
871  double etae = 0.;
872  if (original)
873  etae = static_cast<double>(countSingleton(outgroup));
874  else
875  etae = static_cast<double>(totMutationsExternalBranchs(ingroup, outgroup)); // added by Khalid 13/07/2005
876  return (eta - (values["a1"] * etae)) / sqrt((uD * eta) + (vD * eta * eta));
877 }
878 
879 double SequenceStatistics::fuliDstar(const PolymorphismSequenceContainer& group) throw (ZeroDivisionException)
880 {
881  size_t n = group.getNumberOfSequences();
882  double nn = static_cast<double>(n);
883  double _n = nn / (nn - 1.);
884  map<string, double> values = getUsefullValues_(n);
885  double vDs = getVDstar_(n, values["a1"], values["a2"], values["dn"]);
886  double uDs = getUDstar_(n, values["a1"], vDs);
887  double eta = static_cast<double>(totNumberMutations(group));
888  if (eta == 0.)
889  throw ZeroDivisionException("eta should not be null");
890  double etas = static_cast<double>(countSingleton(group));
891 
892  // Fu & Li 1993
893  return ((_n * eta) - (values["a1"] * etas)) / sqrt(uDs * eta + vDs * eta * eta);
894 
895  // Simonsen et al. 1995
896  /*
897  return ((eta / values["a1"]) - (etas * ((n - 1) / n))) / sqrt(uDs * eta + vDs * eta * eta);
898  */
899 }
900 
901 double SequenceStatistics::fuliF(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, bool original) throw (ZeroDivisionException)
902 {
903  size_t n = ingroup.getNumberOfSequences();
904  double nn = static_cast<double>(n);
905  map<string, double> values = getUsefullValues_(n);
906  double pi = tajima83(ingroup, true);
907  double vF = (values["cn"] + values["b2"] - 2. / (nn - 1.)) / (pow(values["a1"], 2) + values["a2"]);
908  double uF = ((1. + values["b1"] - (4. * ((nn + 1.) / ((nn - 1.) * (nn - 1.)))) * (values["a1n"] - (2. * nn) / (nn + 1.))) / values["a1"]) - vF;
909  double eta = static_cast<double>(totNumberMutations(ingroup));
910  if (eta == 0.)
911  throw ZeroDivisionException("eta should not be null");
912  double etae = 0.;
913  if (original)
914  etae = static_cast<double>(countSingleton(outgroup));
915  else
916  etae = static_cast<double>(totMutationsExternalBranchs(ingroup, outgroup)); // added by Khalid 13/07/2005
917  return (pi - etae) / sqrt(uF * eta + vF * eta * eta);
918 }
919 
920 double SequenceStatistics::fuliFstar(const PolymorphismSequenceContainer& group) throw (ZeroDivisionException)
921 {
922  double n = static_cast<double>(group.getNumberOfSequences());
923  map<string, double> values = getUsefullValues_(group.getNumberOfSequences());
924  double pi = tajima83(group, true);
925 
926  // Fu & Li 1993
927  // double vFs = (values["dn"] + values["b2"] - (2. / (nn - 1.)) * (4. * values["a2"] - 6. + 8. / nn)) / (pow(values["a1"], 2) + values["a2"]);
928  // double uFs = (((nn / (nn - 1.)) + values["b1"] - (4. / (nn * (nn - 1.))) + 2. * ((nn + 1.) / (pow((nn - 1.), 2))) * (values["a1n"] - 2. * nn / (nn + 1.))) / values["a1"]) - vFs;
929 
930  // Simonsen et al. 1995
931  double vFs = (((2 * n * n * n + 110 * n * n - 255 * n + 153) / (9 * n * n * (n - 1))) + ((2 * (n - 1) * values["a1"]) / (n * n)) - 8 * values["a2"] / n) / (pow(values["a1"], 2) + values["a2"]);
932  double uFs = (((4 * n * n + 19 * n + 3 - 12 * (n + 1) * values["a1n"]) / (3 * n * (n - 1))) / values["a1"]) - vFs;
933  double eta = static_cast<double>(totNumberMutations(group));
934  if (eta == 0.)
935  throw ZeroDivisionException("eta should not be null");
936  double etas = static_cast<double>(countSingleton(group));
937  // Fu & Li 1993
938  // Simonsen et al. 1995
939  return (pi - ((n - 1.) / n * etas)) / sqrt(uFs * eta + vFs * eta * eta);
940 }
941 
942 double SequenceStatistics::FstHudson92(const PolymorphismSequenceContainer& psc, size_t id1, size_t id2)
943 {
944  vector<double> vdiff;
945  double piIntra1, piIntra2, meanPiIntra, piInter, Fst;
946 
947  PolymorphismSequenceContainer* Pop1 = PolymorphismSequenceContainerTools::extractGroup(psc, id1);
948  PolymorphismSequenceContainer* Pop2 = PolymorphismSequenceContainerTools::extractGroup(psc, id2);
949 
950  piIntra1 = SequenceStatistics::tajima83(*Pop1, false);
951  piIntra2 = SequenceStatistics::tajima83(*Pop2, false);
952 
953  meanPiIntra = (piIntra1 + piIntra2) / 2;
954 
955  double n = 0;
956  for (size_t i = 0; i < Pop1->getNumberOfSequences(); i++)
957  {
958  const Sequence& s1 = Pop1->getSequence(i);
959  for (size_t j = 0; j < Pop2->getNumberOfSequences(); j++)
960  {
961  n++;
962  const Sequence& s2 = Pop2->getSequence(j);
963  vdiff.push_back(SiteContainerTools::computeSimilarity(s1, s2, true, "no gap", true));
964  }
965  }
966  piInter = (VectorTools::sum(vdiff) / n) * static_cast<double>(psc.getNumberOfSites());
967 
968 
969  Fst = 1.0 - meanPiIntra / piInter;
970 
971  delete Pop1;
972  delete Pop2;
973 
974  return Fst;
975 }
976 
977 // ******************************************************************************
978 // Linkage disequilibrium statistics
979 // ******************************************************************************
980 
981 /**********************/
982 /* Preliminary method */
983 /**********************/
984 
985 PolymorphismSequenceContainer* SequenceStatistics::generateLDContainer(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
986 {
987  SiteSelection ss;
988  // Extract polymorphic site with only two alleles
989  for (size_t i = 0; i < psc.getNumberOfSites(); i++)
990  {
991  if (keepsingleton)
992  {
993  if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)))
994  {
995  ss.push_back(i);
996  }
997  }
998  else
999  {
1000  if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i)))
1001  {
1002  ss.push_back(i);
1003  }
1004  }
1005  }
1006 
1007  const SiteContainer* sc = SiteContainerTools::getSelectedSites(psc, ss);
1008  Alphabet* alpha = new DNA(); // Sylvain Gaillard 17/03/2010: What if psc's Alphabet is not DNA
1010  // Assign 1 to the more frequent and 0 to the less frequent alleles
1011  for (size_t i = 0; i < sc->getNumberOfSites(); i++)
1012  {
1013  const Site& site = sc->getSite(i);
1014  Site siteclone(site);
1015  bool deletesite = false;
1016  map<int, double> freqs;
1017  SymbolListTools::getFrequencies(siteclone, freqs);
1018  int first = 0;
1019  for (map<int, double>::iterator it = freqs.begin(); it != freqs.end(); it++)
1020  {
1021  if (it->second >= 0.5)
1022  first = it->first;
1023  }
1024  for (size_t j = 0; j < sc->getNumberOfSequences(); j++)
1025  {
1026  if (freqs[site.getValue(j)] >= 0.5 && site.getValue(j) == first)
1027  {
1028  if (freqs[site.getValue(j)] <= 1 - freqmin)
1029  {
1030  siteclone.setElement(j, 1);
1031  first = site.getValue(j);
1032  }
1033  else
1034  deletesite = true;
1035  }
1036  else
1037  {
1038  if (freqs[site.getValue(j)] >= freqmin)
1039  siteclone.setElement(j, 0);
1040  else
1041  deletesite = true;
1042  }
1043  }
1044  if (!deletesite)
1045  ldpsc->addSite(siteclone);
1046  }
1047  delete alpha;
1048  return ldpsc;
1049 }
1050 
1051 /*************************************/
1052 /* Pairwise LD and distance measures */
1053 /*************************************/
1054 
1055 Vdouble SequenceStatistics::pairwiseDistances1(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1056 {
1057  // get Positions with sites of interest
1058  SiteSelection ss;
1059  for (size_t i = 0; i < psc.getNumberOfSites(); i++)
1060  {
1061  if (keepsingleton)
1062  {
1063  if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)))
1064  {
1065  const Site& site = psc.getSite(i);
1066  bool deletesite = false;
1067  map<int, double> freqs;
1068  SymbolListTools::getFrequencies(site, freqs);
1069  for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1070  {
1071  if (freqs[j] >= 1 - freqmin)
1072  deletesite = true;
1073  }
1074  if (!deletesite)
1075  ss.push_back(i);
1076  }
1077  }
1078  else
1079  {
1080  if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i)))
1081  {
1082  ss.push_back(i);
1083  const Site& site = psc.getSite(i);
1084  bool deletesite = false;
1085  map<int, double> freqs;
1086  SymbolListTools::getFrequencies(site, freqs);
1087  for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1088  {
1089  if (freqs[j] >= 1 - freqmin)
1090  deletesite = true;
1091  }
1092  if (!deletesite)
1093  ss.push_back(i);
1094  }
1095  }
1096  }
1097  // compute pairwise distances
1098  if (ss.size() < 2)
1099  throw DimensionException("SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2);
1100  Vdouble dist;
1101  for (size_t i = 0; i < ss.size() - 1; i++)
1102  {
1103  for (size_t j = i + 1; j < ss.size(); j++)
1104  {
1105  dist.push_back(static_cast<double>(ss[j] - ss[i]));
1106  }
1107  }
1108  return dist;
1109 }
1110 
1111 Vdouble SequenceStatistics::pairwiseDistances2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1112 {
1113  SiteSelection ss;
1114  for (size_t i = 0; i < psc.getNumberOfSites(); i++)
1115  {
1116  if (keepsingleton)
1117  {
1118  if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)))
1119  {
1120  const Site& site = psc.getSite(i);
1121  bool deletesite = false;
1122  map<int, double> freqs;
1123  SymbolListTools::getFrequencies(site, freqs);
1124  for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1125  {
1126  if (freqs[j] >= 1 - freqmin)
1127  deletesite = true;
1128  }
1129  if (!deletesite)
1130  ss.push_back(i);
1131  }
1132  }
1133  else
1134  {
1135  if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i)))
1136  {
1137  ss.push_back(i);
1138  const Site& site = psc.getSite(i);
1139  bool deletesite = false;
1140  map<int, double> freqs;
1141  SymbolListTools::getFrequencies(site, freqs);
1142  for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1143  {
1144  if (freqs[j] >= 1 - freqmin)
1145  deletesite = true;
1146  }
1147  if (!deletesite)
1148  ss.push_back(i);
1149  }
1150  }
1151  }
1152  size_t n = ss.size();
1153  if (n < 2)
1154  throw DimensionException("SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2);
1155  Vdouble distance(n * (n - 1) / 2, 0);
1156  size_t nbsite = psc.getNumberOfSites();
1157  for (size_t k = 0; k < psc.getNumberOfSequences(); k++)
1158  {
1159  const Sequence& seq = psc.getSequence(k);
1160  SiteSelection gap, newss = ss;
1161  Vdouble dist;
1162  for (size_t i = 0; i < nbsite; i++)
1163  {
1164  if (seq.getValue(i) == -1)
1165  gap.push_back(i);
1166  }
1167  // Site positions are re-numbered to take gaps into account
1168  for (size_t i = 0; i < gap.size(); i++)
1169  {
1170  for (size_t j = 0; j < ss.size(); j++)
1171  {
1172  if (ss[j] > gap[i])
1173  newss[j]--;
1174  }
1175  }
1176  for (size_t i = 0; i < n - 1; i++)
1177  {
1178  for (size_t j = i + 1; j < n; j++)
1179  {
1180  dist.push_back(static_cast<double>(newss[j] - newss[i]));
1181  }
1182  }
1183  distance += dist;
1184  }
1185  distance = distance / static_cast<double>(psc.getNumberOfSequences());
1186  return distance;
1187 }
1188 
1189 Vdouble SequenceStatistics::pairwiseD(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1190 {
1191  PolymorphismSequenceContainer* newpsc = SequenceStatistics::generateLDContainer(psc, keepsingleton, freqmin);
1192  Vdouble D;
1193  size_t nbsite = newpsc->getNumberOfSites();
1194  size_t nbseq = newpsc->getNumberOfSequences();
1195  if (nbsite < 2)
1196  throw DimensionException("SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1197  if (nbseq < 2)
1198  throw DimensionException("SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1199  for (size_t i = 0; i < nbsite - 1; i++)
1200  {
1201  for (size_t j = i + 1; j < nbsite; j++)
1202  {
1203  double haplo = 0;
1204  const Site& site1 = newpsc->getSite(i);
1205  const Site& site2 = newpsc->getSite(j);
1206  map<int, double> freq1;
1207  map<int, double> freq2;
1208  SymbolListTools::getFrequencies(site1, freq1);
1209  SymbolListTools::getFrequencies(site2, freq2);
1210  for (size_t k = 0; k < nbseq; k++)
1211  {
1212  if (site1.getValue(k) + site2.getValue(k) == 2)
1213  haplo++;
1214  }
1215  haplo = haplo / static_cast<double>(nbseq);
1216  D.push_back(std::abs(haplo - freq1[1] * freq2[1]));
1217  }
1218  }
1219  return D;
1220 }
1221 
1222 Vdouble SequenceStatistics::pairwiseDprime(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1223 {
1224  PolymorphismSequenceContainer* newpsc = SequenceStatistics::generateLDContainer(psc, keepsingleton, freqmin);
1225  Vdouble Dprime;
1226  size_t nbsite = newpsc->getNumberOfSites();
1227  size_t nbseq = newpsc->getNumberOfSequences();
1228  if (nbsite < 2)
1229  throw DimensionException("SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1230  if (nbseq < 2)
1231  throw DimensionException("SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1232  for (size_t i = 0; i < nbsite - 1; i++)
1233  {
1234  for (size_t j = i + 1; j < nbsite; j++)
1235  {
1236  double haplo = 0;
1237  const Site& site1 = newpsc->getSite(i);
1238  const Site& site2 = newpsc->getSite(j);
1239  map<int, double> freq1;
1240  map<int, double> freq2;
1241  SymbolListTools::getFrequencies(site1, freq1);
1242  SymbolListTools::getFrequencies(site2, freq2);
1243  for (size_t k = 0; k < nbseq; k++)
1244  {
1245  if (site1.getValue(k) + site2.getValue(k) == 2)
1246  haplo++;
1247  }
1248  haplo = haplo / static_cast<double>(nbseq);
1249  double d, D = (haplo - freq1[1] * freq2[1]);
1250  if (D > 0)
1251  {
1252  if (freq1[1] * freq2[0] <= freq1[0] * freq2[1])
1253  {
1254  d = std::abs(D) / (freq1[1] * freq2[0]);
1255  }
1256  else
1257  {
1258  d = std::abs(D) / (freq1[0] * freq2[1]);
1259  }
1260  }
1261  else
1262  {
1263  if (freq1[1] * freq2[1] <= freq1[0] * freq2[0])
1264  {
1265  d = std::abs(D) / (freq1[1] * freq2[1]);
1266  }
1267  else
1268  {
1269  d = std::abs(D) / (freq1[0] * freq2[0]);
1270  }
1271  }
1272  Dprime.push_back(d);
1273  }
1274  }
1275  return Dprime;
1276 }
1277 
1278 Vdouble SequenceStatistics::pairwiseR2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1279 {
1280  PolymorphismSequenceContainer* newpsc = SequenceStatistics::generateLDContainer(psc, keepsingleton, freqmin);
1281  Vdouble R2;
1282  size_t nbsite = newpsc->getNumberOfSites();
1283  size_t nbseq = newpsc->getNumberOfSequences();
1284  if (nbsite < 2)
1285  throw DimensionException("SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1286  if (nbseq < 2)
1287  throw DimensionException("SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1288  for (size_t i = 0; i < nbsite - 1; i++)
1289  {
1290  for (size_t j = i + 1; j < nbsite; j++)
1291  {
1292  double haplo = 0;
1293  const Site& site1 = newpsc->getSite(i);
1294  const Site& site2 = newpsc->getSite(j);
1295  map<int, double> freq1;
1296  map<int, double> freq2;
1297  SymbolListTools::getFrequencies(site1, freq1);
1298  SymbolListTools::getFrequencies(site2, freq2);
1299  for (size_t k = 0; k < nbseq; k++)
1300  {
1301  if (site1.getValue(k) + site2.getValue(k) == 2)
1302  haplo++;
1303  }
1304  haplo = haplo / static_cast<double>(nbseq);
1305  double r = ((haplo - freq1[1] * freq2[1]) * (haplo - freq1[1] * freq2[1])) / (freq1[0] * freq1[1] * freq2[0] * freq2[1]);
1306  R2.push_back(r);
1307  }
1308  }
1309  return R2;
1310 }
1311 
1312 /***********************************/
1313 /* Global LD and distance measures */
1314 /***********************************/
1315 
1316 double SequenceStatistics::meanD(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1317 {
1318  Vdouble D = pairwiseD(psc, keepsingleton, freqmin);
1319  return VectorTools::mean<double, double>(D);
1320 }
1321 
1322 double SequenceStatistics::meanDprime(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1323 {
1324  try
1325  {
1326  Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin);
1327  return VectorTools::mean<double, double>(Dprime);
1328  }
1329  catch (DimensionException& e)
1330  {
1331  throw e;
1332  }
1333 }
1334 
1335 double SequenceStatistics::meanR2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1336 {
1337  try
1338  {
1339  Vdouble R2 = SequenceStatistics::pairwiseR2(psc, keepsingleton, freqmin);
1340  return VectorTools::mean<double, double>(R2);
1341  }
1342  catch (DimensionException& e)
1343  {
1344  throw e;
1345  }
1346 }
1347 
1348 double SequenceStatistics::meanDistance1(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1349 {
1350  try
1351  {
1352  Vdouble dist = pairwiseDistances1(psc, keepsingleton, freqmin);
1353  return VectorTools::mean<double, double>(dist);
1354  }
1355  catch (DimensionException& e)
1356  {
1357  throw e;
1358  }
1359 }
1360 
1361 double SequenceStatistics::meanDistance2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1362 {
1363  try
1364  {
1365  Vdouble dist = pairwiseDistances2(psc, keepsingleton, freqmin);
1366  return VectorTools::mean<double, double>(dist);
1367  }
1368  catch (DimensionException& e)
1369  {
1370  throw e;
1371  }
1372 }
1373 
1374 /**********************/
1375 /* Regression methods */
1376 /**********************/
1377 
1378 double SequenceStatistics::originRegressionD(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
1379 {
1380  try
1381  {
1382  Vdouble D = pairwiseD(psc, keepsingleton, freqmin) - 1;
1383  Vdouble dist;
1384  if (distance1)
1385  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1386  else
1387  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1388  return VectorTools::sum(D * dist) / VectorTools::sum(dist * dist);
1389  }
1390  catch (DimensionException& e)
1391  {
1392  throw e;
1393  }
1394 }
1395 
1396 double SequenceStatistics::originRegressionDprime(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
1397 {
1398  try
1399  {
1400  Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin) - 1;
1401  Vdouble dist;
1402  if (distance1)
1403  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1404  else
1405  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1406  return VectorTools::sum(Dprime * dist) / VectorTools::sum(dist * dist);
1407  }
1408  catch (DimensionException& e)
1409  {
1410  throw e;
1411  }
1412 }
1413 
1414 double SequenceStatistics::originRegressionR2(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
1415 {
1416  try
1417  {
1418  Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin) - 1;
1419  Vdouble dist;
1420  if (distance1)
1421  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1422  else
1423  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1424  return VectorTools::sum(R2 * dist) / VectorTools::sum(dist * dist);
1425  }
1426  catch (DimensionException& e)
1427  {
1428  throw e;
1429  }
1430 }
1431 
1432 Vdouble SequenceStatistics::linearRegressionD(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
1433 {
1434  try
1435  {
1436  Vdouble D = pairwiseD(psc, keepsingleton, freqmin);
1437  Vdouble dist;
1438  Vdouble reg(2);
1439  if (distance1)
1440  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1441  else
1442  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1443  reg[0] = VectorTools::cov<double, double>(dist, D) / VectorTools::var<double, double>(dist);
1444  reg[1] = VectorTools::mean<double, double>(D) - reg[0] * VectorTools::mean<double, double>(dist);
1445  return reg;
1446  }
1447  catch (DimensionException& e)
1448  {
1449  throw e;
1450  }
1451 }
1452 
1453 Vdouble SequenceStatistics::linearRegressionDprime(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
1454 {
1455  try
1456  {
1457  Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin);
1458  Vdouble dist;
1459  Vdouble reg(2);
1460  if (distance1)
1461  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1462  else
1463  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1464  reg[0] = VectorTools::cov<double, double>(dist, Dprime) / VectorTools::var<double, double>(dist);
1465  reg[1] = VectorTools::mean<double, double>(Dprime) - reg[0] * VectorTools::mean<double, double>(dist);
1466  return reg;
1467  }
1468  catch (DimensionException& e)
1469  {
1470  throw e;
1471  }
1472 }
1473 
1474 Vdouble SequenceStatistics::linearRegressionR2(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
1475 {
1476  try
1477  {
1478  Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin);
1479  Vdouble dist;
1480  Vdouble reg(2);
1481  if (distance1)
1482  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1483  else
1484  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1485  reg[0] = VectorTools::cov<double, double>(dist, R2) / VectorTools::var<double, double>(dist);
1486  reg[1] = VectorTools::mean<double, double>(R2) - reg[0] * VectorTools::mean<double, double>(dist);
1487  return reg;
1488  }
1489  catch (DimensionException& e)
1490  {
1491  throw e;
1492  }
1493 }
1494 
1495 double SequenceStatistics::inverseRegressionR2(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
1496 {
1497  try
1498  {
1499  Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin);
1500  Vdouble unit(R2.size(), 1);
1501  Vdouble R2transformed = unit / R2 - 1;
1502  Vdouble dist;
1503  if (distance1)
1504  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1505  else
1506  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1507  return VectorTools::sum(R2transformed * dist) / VectorTools::sum(dist * dist);
1508  }
1509  catch (DimensionException& e)
1510  {
1511  throw e;
1512  }
1513 }
1514 
1515 /**********************/
1516 /* Hudson method */
1517 /**********************/
1518 
1519 double SequenceStatistics::hudson87(const PolymorphismSequenceContainer& psc, double precision, double cinf, double csup)
1520 {
1521  double left = leftHandHudson_(psc);
1522  size_t n = psc.getNumberOfSequences();
1523  double dif = 1;
1524  double c1 = cinf;
1525  double c2 = csup;
1526  if (SequenceStatistics::polymorphicSiteNumber(psc) < 2)
1527  return -1;
1528  if (rightHandHudson_(c1, n) < left)
1529  return cinf;
1530  if (rightHandHudson_(c2, n) > left)
1531  return csup;
1532  while (dif > precision)
1533  {
1534  if (rightHandHudson_((c1 + c2) / 2, n) > left)
1535  c1 = (c1 + c2) / 2;
1536  else
1537  c2 = (c1 + c2) / 2;
1538  dif = std::abs(2 * (c1 - c2) / (c1 + c2));
1539  }
1540  return (c1 + c2) / 2;
1541 }
1542 
1543 /*****************/
1544 /* Tests methods */
1545 /*****************/
1546 
1547 void SequenceStatistics::testUsefullValues(std::ostream& s, size_t n)
1548 {
1549  map<string, double> v = getUsefullValues_(n);
1550  double vD = getVD_(n, v["a1"], v["a2"], v["cn"]);
1551  double uD = getUD_(v["a1"], vD);
1552  double vDs = getVDstar_(n, v["a1"], v["a2"], v["dn"]);
1553  double uDs = getUDstar_(n, v["a1"], vDs);
1554 
1555  s << n << "\t";
1556  s << v["a1"] << "\t";
1557  s << v["a2"] << "\t";
1558  s << v["a1n"] << "\t";
1559  s << v["b1"] << "\t";
1560  s << v["b2"] << "\t";
1561  s << v["c1"] << "\t";
1562  s << v["c2"] << "\t";
1563  s << v["cn"] << "\t";
1564  s << v["dn"] << "\t";
1565  s << v["e1"] << "\t";
1566  s << v["e2"] << "\t";
1567  s << uD << "\t";
1568  s << vD << "\t";
1569  s << uDs << "\t";
1570  s << vDs << endl;
1571 }
1572 
1573 // ******************************************************************************
1574 // Private methods
1575 // ******************************************************************************
1576 
1577 size_t SequenceStatistics::getMutationNumber_(const Site& site)
1578 {
1579  size_t tmp_count = 0;
1580  map<int, size_t> states_count;
1581  SymbolListTools::getCounts(site, states_count);
1582 
1583  for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1584  {
1585  if (it->first >= 0)
1586  tmp_count++;
1587  }
1588  if (tmp_count > 0)
1589  tmp_count--;
1590  return tmp_count;
1591 }
1592 
1593 size_t SequenceStatistics::getSingletonNumber_(const Site& site)
1594 {
1595  size_t nus = 0;
1596  map<int, size_t> states_count;
1597  SymbolListTools::getCounts(site, states_count);
1598  for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1599  {
1600  if (it->second == 1)
1601  nus++;
1602  }
1603  return nus;
1604 }
1605 
1606 size_t SequenceStatistics::getDerivedSingletonNumber_(const Site& site_in, const Site& site_out)
1607 {
1608  size_t nus = 0;
1609  map<int, size_t> states_count;
1610  map<int, size_t> outgroup_states_count;
1611  SymbolListTools::getCounts(site_in, states_count);
1612  SymbolListTools::getCounts(site_out, outgroup_states_count);
1613  // if there is more than one variant in the outgroup we will not be able to recover the ancestral state
1614  if (outgroup_states_count.size() == 1)
1615  {
1616  for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1617  {
1618  if (it->second == 1)
1619  {
1620  if (outgroup_states_count.find(it->first) == outgroup_states_count.end())
1621  nus++;
1622  }
1623  }
1624  }
1625  return nus;
1626 }
1627 
1628 std::map<std::string, double> SequenceStatistics::getUsefullValues_(size_t n)
1629 {
1630  double nn = static_cast<double>(n);
1631  map<string, double> values;
1632  values["a1"] = 0.;
1633  values["a2"] = 0.;
1634  values["a1n"] = 0.;
1635  values["b1"] = 0.;
1636  values["b2"] = 0.;
1637  values["c1"] = 0.;
1638  values["c2"] = 0.;
1639  values["cn"] = 0.;
1640  values["dn"] = 0.;
1641  values["e1"] = 0.;
1642  values["e2"] = 0.;
1643  if (n > 1)
1644  {
1645  for (double i = 1; i < nn; i++)
1646  {
1647  values["a1"] += 1. / i;
1648  values["a2"] += 1. / (i * i);
1649  }
1650  values["a1n"] = values["a1"] + (1. / nn);
1651  values["b1"] = (nn + 1.) / (3. * (nn - 1.));
1652  values["b2"] = 2. * ((nn * nn) + nn + 3.) / (9. * nn * (nn - 1.));
1653  values["c1"] = values["b1"] - (1. / values["a1"]);
1654  values["c2"] = values["b2"] - ((nn + 2.) / (values["a1"] * nn)) + (values["a2"] / (values["a1"] * values["a1"]));
1655  if (n == 2)
1656  {
1657  values["cn"] = 1.;
1658  values["dn"] = 2.;
1659  }
1660  else
1661  {
1662  values["cn"] = 2. * ((nn * values["a1"]) - (2. * (nn - 1.))) / ((nn - 1.) * (nn - 2.));
1663  values["dn"] =
1664  values["cn"]
1665  + ((nn - 2.) / ((nn - 1.) * (nn - 1.)))
1666  + (2. / (nn - 1.))
1667  * ((3. / 2.) - (((2. * values["a1n"]) - 3.) / (nn - 2.)) - (1. / nn));
1668  }
1669  values["e1"] = values["c1"] / values["a1"];
1670  values["e2"] = values["c2"] / ((values["a1"] * values["a1"]) + values["a2"]);
1671  }
1672  return values;
1673 }
1674 
1675 double SequenceStatistics::getVD_(size_t n, double a1, double a2, double cn)
1676 {
1677  double nn = static_cast<double>(n);
1678  if (n < 3)
1679  return 0.;
1680  double vD = 1. + ((a1 * a1) / (a2 + (a1 * a1))) * (cn - ((nn + 1.) / (nn - 1.)));
1681  return vD;
1682 }
1683 
1684 double SequenceStatistics::getUD_(double a1, double vD)
1685 {
1686  return a1 - 1. - vD;
1687 }
1688 
1689 double SequenceStatistics::getVDstar_(size_t n, double a1, double a2, double dn)
1690 {
1691  double denom = (a1 * a1) + a2;
1692  if (n < 3 || denom == 0.)
1693  return 0.;
1694  double nn = static_cast<double>(n);
1695  double nnn = nn / (nn - 1.);
1696  // Fu & Li 1993
1697  double vDs = (
1698  (nnn * nnn * a2)
1699  + (a1 * a1 * dn)
1700  - (2. * (nn * a1 * (a1 + 1)) / ((nn - 1.) * (nn - 1.)))
1701  )
1702  /
1703  denom;
1704  // Simonsen et al. 1995
1705  /*
1706  double vDs = (
1707  (values["a2"] / pow(values["a1"], 2))
1708  - (2./nn) * (1. + 1./values["a1"] - values["a1"] + values["a1"]/nn)
1709  - 1./(nn*nn)
1710  )
1711  /
1712  (pow(values["a1"], 2) + values["a2"]);
1713  */
1714  return vDs;
1715 }
1716 
1717 double SequenceStatistics::getUDstar_(size_t n, double a1, double vDs)
1718 {
1719  if (n < 3)
1720  return 0.;
1721  double nn = static_cast<double>(n);
1722  double nnn = nn / (nn - 1.);
1723  // Fu & Li 1993
1724  double uDs = (nnn * (a1 - nnn)) - vDs;
1725  // Simonsen et al. 1995
1726  /*
1727  double uDs = (((nn - 1.)/nn - 1./values["a1"]) / values["a1"]) - vDs;
1728  */
1729  return uDs;
1730 }
1731 
1732 double SequenceStatistics::leftHandHudson_(const PolymorphismSequenceContainer& psc)
1733 {
1734  PolymorphismSequenceContainer* newpsc = PolymorphismSequenceContainerTools::getCompleteSites(psc);
1735  size_t nbseq = newpsc->getNumberOfSequences();
1736  double S1 = 0;
1737  double S2 = 0;
1738  for (size_t i = 0; i < nbseq - 1; i++)
1739  {
1740  for (size_t j = i + 1; j < nbseq; j++)
1741  {
1742  SequenceSelection ss(2);
1743  ss[0] = i;
1744  ss[1] = j;
1745  PolymorphismSequenceContainer* psc2 = PolymorphismSequenceContainerTools::getSelectedSequences(*newpsc, ss);
1746  S1 += SequenceStatistics::watterson75(*psc2, true);
1747  S2 += SequenceStatistics::watterson75(*psc2, true) * SequenceStatistics::watterson75(*psc2, true);
1748  delete psc2;
1749  }
1750  }
1751  double Sk = (2 * S2 - pow(2 * S1 / static_cast<double>(nbseq), 2.)) / pow(nbseq, 2.);
1752  double H = SequenceStatistics::heterozygosity(*newpsc);
1753  double H2 = SequenceStatistics::squaredHeterozygosity(*newpsc);
1754  delete newpsc;
1755  return static_cast<double>(Sk - H + H2) / pow(H * static_cast<double>(nbseq) / static_cast<double>(nbseq - 1), 2.);
1756 }
1757 
1758 double SequenceStatistics::rightHandHudson_(double c, size_t n)
1759 {
1760  double nn = static_cast<double>(n);
1761  return 1. / (97. * pow(c, 2.) * pow(nn, 3.)) * ((nn - 1.) * (97. * (c * (4. + (c - 2. * nn) * nn) + (-2. * (7. + c) + 4. * nn + (c - 1.) * pow(nn, 2.)) * log((18. + c * (13. + c)) / 18.)) + sqrt(97.) * (110. + nn * (49. * nn - 52.) + c * (2. + nn * (15. * nn - 8.))) * log(-1. + (72. + 26. * c) / (36. + 13. * c - c * sqrt(97.)))));
1762 }
1763