bpp-seq  2.1.0
 All Classes Namespaces Files Functions Variables Friends Pages
SiteTools.cpp
Go to the documentation of this file.
1 //
2 // File SiteTools.cpp
3 // Author : Julien Dutheil
4 // Guillaume Deuchst
5 // Created on: Friday August 8 2003
6 //
7 
8 /*
9  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
10 
11  This software is a computer program whose purpose is to provide classes
12  for sequences analysis.
13 
14  This software is governed by the CeCILL license under French law and
15  abiding by the rules of distribution of free software. You can use,
16  modify and/ or redistribute the software under the terms of the CeCILL
17  license as circulated by CEA, CNRS and INRIA at the following URL
18  "http://www.cecill.info".
19 
20  As a counterpart to the access to the source code and rights to copy,
21  modify and redistribute granted by the license, users are provided only
22  with a limited warranty and the software's author, the holder of the
23  economic rights, and the successive licensors have only limited
24  liability.
25 
26  In this respect, the user's attention is drawn to the risks associated
27  with loading, using, modifying and/or developing or reproducing the
28  software by the user in light of its specific status of free software,
29  that may mean that it is complicated to manipulate, and that also
30  therefore means that it is reserved for developers and experienced
31  professionals having in-depth computer knowledge. Users are therefore
32  encouraged to load and test the software's suitability as regards their
33  requirements in conditions enabling the security of their systems and/or
34  data to be ensured and, more generally, to use and operate it in the
35  same conditions as regards security.
36 
37  The fact that you are presently reading this means that you have had
38  knowledge of the CeCILL license and that you accept its terms.
39  */
40 
41 #include "SiteTools.h"
42 #include "Alphabet/CodonAlphabet.h"
43 #include <Bpp/Utils/MapTools.h>
44 #include <Bpp/Numeric/NumTools.h>
46 
47 using namespace bpp;
48 
49 // From the STL:
50 #include <cmath>
51 
52 using namespace std;
53 
54 /******************************************************************************/
55 
56 bool SiteTools::hasGap(const Site& site)
57 {
58  // Main loop : for all characters in site
59  for (size_t i = 0; i < site.size(); i++)
60  {
61  if (site.getAlphabet()->isGap(site[i]))
62  return true;
63  }
64  return false;
65 }
66 
67 /******************************************************************************/
68 
69 bool SiteTools::isGapOnly(const Site& site)
70 {
71  // Main loop : for all characters in site
72  for (size_t i = 0; i < site.size(); i++)
73  {
74  if (!site.getAlphabet()->isGap(site[i]))
75  return false;
76  }
77  return true;
78 }
79 
80 /******************************************************************************/
81 
83 {
84  // Main loop : for all characters in site
85  for (size_t i = 0; i < site.size(); i++)
86  {
87  if (!site.getAlphabet()->isGap(site[i]) && !site.getAlphabet()->isUnresolved(site[i]))
88  return false;
89  }
90  return true;
91 }
92 
93 /******************************************************************************/
94 
95 bool SiteTools::hasUnknown(const Site& site)
96 {
97  // Main loop : for all characters in site
98  for (size_t i = 0; i < site.size(); i++)
99  {
100  if (site[i] == site.getAlphabet()->getUnknownCharacterCode())
101  return true;
102  }
103  return false;
104 }
105 
106 /******************************************************************************/
107 
108 bool SiteTools::hasStopCodon(const Site& site)
109 {
110  // Main loop : for all characters in site
111  const CodonAlphabet* pca = dynamic_cast<const CodonAlphabet*>(site.getAlphabet());
112  if (pca == 0)
113  return false;
114  for (size_t i = 0; i < site.size(); i++)
115  {
116  if (pca->isStop(site[i]))
117  return true;
118  }
119  return false;
120 }
121 
122 /******************************************************************************/
123 
124 bool SiteTools::isComplete(const Site& site)
125 {
126  // Main loop : for all characters in site
127  for (size_t i = 0; i < site.size(); i++)
128  {
129  if (site.getAlphabet()->isGap(site[i]) || site.getAlphabet()->isUnresolved(site[i]))
130  return false;
131  }
132  return true;
133 }
134 
135 /******************************************************************************/
136 
137 bool SiteTools::areSitesIdentical(const Site& site1, const Site& site2)
138 {
139  // Site's size and content checking
140  if (site1.getAlphabet()->getAlphabetType() != site2.getAlphabet()->getAlphabetType())
141  return false;
142  if (site1.size() != site2.size())
143  return false;
144  else
145  {
146  for (size_t i = 0; i < site1.size(); i++)
147  {
148  if (site1[i] != site2[i])
149  return false;
150  }
151  return true;
152  }
153 }
154 
155 /******************************************************************************/
156 
157 bool SiteTools::isConstant(const Site& site, bool ignoreUnknown, bool unresolvedRaisesException) throw (EmptySiteException)
158 {
159  // Empty site checking
160  if (site.size() == 0)
161  throw EmptySiteException("SiteTools::isConstant: Incorrect specified site, size must be > 0", &site);
162 
163  // For all site's characters
164  int gap = site.getAlphabet()->getGapCharacterCode();
165  if (ignoreUnknown)
166  {
167  int s = site[0];
168  int unknown = site.getAlphabet()->getUnknownCharacterCode();
169  size_t i = 0;
170  while (i < site.size() && (s == gap || s == unknown))
171  {
172  s = site[i];
173  i++;
174  }
175  if (s == unknown || s == gap)
176  {
177  if (unresolvedRaisesException)
178  throw EmptySiteException("SiteTools::isConstant: Site is only made of gaps or generic characters.");
179  else
180  return false;
181  }
182  while (i < site.size())
183  {
184  if (site[i] != s && site[i] != gap && site[i] != unknown)
185  return false;
186  i++;
187  }
188  }
189  else
190  {
191  int s = site[0];
192  size_t i = 0;
193  while (i < site.size() && s == gap)
194  {
195  s = site[i];
196  i++;
197  }
198  if (s == gap)
199  {
200  if (unresolvedRaisesException)
201  throw EmptySiteException("SiteTools::isConstant: Site is only made of gaps.");
202  else
203  return false;
204  }
205  while (i < site.size())
206  {
207  if (site[i] != s && site[i] != gap)
208  return false;
209  i++;
210  }
211  }
212 
213  return true;
214 }
215 
216 /******************************************************************************/
217 
218 double SiteTools::variabilityShannon(const Site& site, bool resolveUnknown) throw (EmptySiteException)
219 {
220  // Empty site checking
221  if (site.size() == 0)
222  throw EmptySiteException("SiteTools::variabilityShannon: Incorrect specified site, size must be > 0", &site);
223  map<int, double> p;
224  getFrequencies(site, p, resolveUnknown);
225  // We need to correct frequencies for gaps:
226  double s = 0.;
227  for (int i = 0; i < static_cast<int>(site.getAlphabet()->getSize()); i++)
228  {
229  double f = p[i];
230  if (f > 0)
231  s += f * log(f);
232  }
233  return -s;
234 }
235 
236 /******************************************************************************/
237 
238 double SiteTools::mutualInformation(const Site& site1, const Site& site2, bool resolveUnknown) throw (DimensionException, EmptySiteException)
239 {
240  // Empty site checking
241  if (site1.size() == 0)
242  throw EmptySiteException("SiteTools::mutualInformation: Incorrect specified site, size must be > 0", &site1);
243  if (site2.size() == 0)
244  throw EmptySiteException("SiteTools::mutualInformation: Incorrect specified site, size must be > 0", &site2);
245  if (site1.size() != site2.size())
246  throw DimensionException("SiteTools::mutualInformation: sites must have the same size!", site1.size(), site2.size());
247  vector<double> p1(site1.getAlphabet()->getSize());
248  vector<double> p2(site2.getAlphabet()->getSize());
249  map<int, map<int, double> > p12;
250  getCounts(site1, site2, p12, resolveUnknown);
251  double mi = 0, tot = 0, pxy;
252  // We need to correct frequencies for gaps:
253  for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++)
254  {
255  for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++)
256  {
257  pxy = p12[static_cast<int>(i)][static_cast<int>(j)];
258  tot += pxy;
259  p1[i] += pxy;
260  p2[j] += pxy;
261  }
262  }
263  for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++)
264  {
265  p1[i] /= tot;
266  }
267  for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++)
268  {
269  p2[j] /= tot;
270  }
271  for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++)
272  {
273  for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++)
274  {
275  pxy = p12[static_cast<int>(i)][static_cast<int>(j)] / tot;
276  if (pxy > 0)
277  mi += pxy * log(pxy / (p1[i] * p2[j]));
278  }
279  }
280  return mi;
281 }
282 
283 /******************************************************************************/
284 
285 double SiteTools::jointEntropy(const Site& site1, const Site& site2, bool resolveUnknown) throw (DimensionException, EmptySiteException)
286 {
287  // Empty site checking
288  if (site1.size() == 0)
289  throw EmptySiteException("SiteTools::jointEntropy: Incorrect specified site, size must be > 0", &site1);
290  if (site2.size() == 0)
291  throw EmptySiteException("SiteTools::jointEntropy: Incorrect specified site, size must be > 0", &site2);
292  if (site1.size() != site2.size())
293  throw DimensionException("SiteTools::jointEntropy: sites must have the same size!", site1.size(), site2.size());
294  map<int, map<int, double> > p12;
295  getCounts(site1, site2, p12, resolveUnknown);
296  double tot = 0, pxy, h = 0;
297  // We need to correct frequencies for gaps:
298  for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++)
299  {
300  for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++)
301  {
302  pxy = p12[static_cast<int>(i)][static_cast<int>(j)];
303  tot += pxy;
304  }
305  }
306  for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++)
307  {
308  for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++)
309  {
310  pxy = p12[static_cast<int>(i)][static_cast<int>(j)] / tot;
311  if (pxy > 0)
312  h += pxy * log(pxy);
313  }
314  }
315  return -h;
316 }
317 
318 /******************************************************************************/
319 
321 {
322  // Empty site checking
323  if (site.size() == 0)
324  throw EmptySiteException("SiteTools::variabilityFactorial: Incorrect specified site, size must be > 0", &site);
325  map<int, size_t> p;
326  getCounts(site, p);
327  vector<size_t> c = MapTools::getValues(p);
328  size_t s = VectorTools::sum(c);
329  long double l = static_cast<long double>(NumTools::fact(s)) / static_cast<long double>(VectorTools::sum(VectorTools::fact(c)));
330  return (static_cast<double>(std::log(l)));
331 }
332 
333 /******************************************************************************/
334 
336 {
337  // Empty site checking
338  if (site.size() == 0)
339  throw EmptySiteException("SiteTools::heterozygosity: Incorrect specified site, size must be > 0", &site);
340  map<int, double> p;
341  getFrequencies(site, p);
342  vector<double> c = MapTools::getValues(p);
343  double n = VectorTools::norm<double, double>(MapTools::getValues(p));
344  return 1. - n * n;
345 }
346 
347 /******************************************************************************/
348 
350 {
351  // Empty site checking
352  if (site.size() == 0)
353  throw EmptySiteException("SiteTools::getNumberOfDistinctCharacters(): Incorrect specified site, size must be > 0", &site);
354  // For all site's characters
355  if (SiteTools::isConstant(site))
356  return 1;
357  map<int, size_t> counts;
358  SymbolListTools::getCounts(site, counts);
359  int s = 0;
360  for (map<int, size_t>::iterator it = counts.begin(); it != counts.end(); it++)
361  {
362  if (it->second != 0)
363  s++;
364  }
365  return s;
366 }
367 
368 /******************************************************************************/
369 
371 {
372  // Empty site checking
373  if (site.size() == 0)
374  throw EmptySiteException("SiteTools::hasSingleton: Incorrect specified site, size must be > 0", &site);
375  // For all site's characters
376  if (SiteTools::isConstant(site))
377  return false;
378  map<int, size_t> counts;
379  getCounts(site, counts);
380  for (map<int, size_t>::iterator it = counts.begin(); it != counts.end(); it++)
381  {
382  if (it->second == 1)
383  return true;
384  }
385  return false;
386 }
387 
388 /******************************************************************************/
389 
391 {
392  // Empty site checking
393  if (site.size() == 0)
394  throw EmptySiteException("SiteTools::isParsimonyInformativeSite: Incorrect specified site, size must be > 0", &site);
395  // For all site's characters
396  if (SiteTools::isConstant(site, false, false))
397  return false;
398  map<int, size_t> counts;
399  SymbolListTools::getCounts(site, counts);
400  size_t npars = 0;
401  for (map<int, size_t>::iterator it = counts.begin(); it != counts.end(); it++)
402  {
403  if (it->second > 1)
404  npars++;
405  }
406  if (npars > 1)
407  return true;
408  return false;
409 }
410 
411 /******************************************************************************/
412 
414 {
415  // Empty site checking
416  if (site.size() == 0)
417  throw EmptySiteException("SiteTools::isTriplet: Incorrect specified site, size must be > 0", &site);
418  // For all site's characters
420  return true;
421  else
422  return false;
423 }
424 
425 /******************************************************************************/
426