bpp-seq  2.1.0
Bpp/Seq/SiteTools.cpp
Go to the documentation of this file.
00001 //
00002 // File SiteTools.cpp
00003 // Author : Julien Dutheil
00004 //          Guillaume Deuchst
00005 // Created on: Friday August 8 2003
00006 //
00007 
00008 /*
00009    Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
00010 
00011    This software is a computer program whose purpose is to provide classes
00012    for sequences analysis.
00013 
00014    This software is governed by the CeCILL  license under French law and
00015    abiding by the rules of distribution of free software.  You can  use,
00016    modify and/ or redistribute the software under the terms of the CeCILL
00017    license as circulated by CEA, CNRS and INRIA at the following URL
00018    "http://www.cecill.info".
00019 
00020    As a counterpart to the access to the source code and  rights to copy,
00021    modify and redistribute granted by the license, users are provided only
00022    with a limited warranty  and the software's author,  the holder of the
00023    economic rights,  and the successive licensors  have only  limited
00024    liability.
00025 
00026    In this respect, the user's attention is drawn to the risks associated
00027    with loading,  using,  modifying and/or developing or reproducing the
00028    software by the user in light of its specific status of free software,
00029    that may mean  that it is complicated to manipulate,  and  that  also
00030    therefore means  that it is reserved for developers  and  experienced
00031    professionals having in-depth computer knowledge. Users are therefore
00032    encouraged to load and test the software's suitability as regards their
00033    requirements in conditions enabling the security of their systems and/or
00034    data to be ensured and,  more generally, to use and operate it in the
00035    same conditions as regards security.
00036 
00037    The fact that you are presently reading this means that you have had
00038    knowledge of the CeCILL license and that you accept its terms.
00039  */
00040 
00041 #include "SiteTools.h"
00042 #include "Alphabet/CodonAlphabet.h"
00043 #include <Bpp/Utils/MapTools.h>
00044 #include <Bpp/Numeric/NumTools.h>
00045 #include <Bpp/Numeric/VectorTools.h>
00046 
00047 using namespace bpp;
00048 
00049 // From the STL:
00050 #include <cmath>
00051 
00052 using namespace std;
00053 
00054 /******************************************************************************/
00055 
00056 bool SiteTools::hasGap(const Site& site)
00057 {
00058   // Main loop : for all characters in site
00059   for (size_t i = 0; i < site.size(); i++)
00060   {
00061     if (site.getAlphabet()->isGap(site[i]))
00062       return true;
00063   }
00064   return false;
00065 }
00066 
00067 /******************************************************************************/
00068 
00069 bool SiteTools::isGapOnly(const Site& site)
00070 {
00071   // Main loop : for all characters in site
00072   for (size_t i = 0; i < site.size(); i++)
00073   {
00074     if (!site.getAlphabet()->isGap(site[i]))
00075       return false;
00076   }
00077   return true;
00078 }
00079 
00080 /******************************************************************************/
00081 
00082 bool SiteTools::isGapOrUnresolvedOnly(const Site& site)
00083 {
00084   // Main loop : for all characters in site
00085   for (size_t i = 0; i < site.size(); i++)
00086   {
00087     if (!site.getAlphabet()->isGap(site[i]) && !site.getAlphabet()->isUnresolved(site[i]))
00088       return false;
00089   }
00090   return true;
00091 }
00092 
00093 /******************************************************************************/
00094 
00095 bool SiteTools::hasUnknown(const Site& site)
00096 {
00097   // Main loop : for all characters in site
00098   for (size_t i = 0; i < site.size(); i++)
00099   {
00100     if (site[i] == site.getAlphabet()->getUnknownCharacterCode())
00101       return true;
00102   }
00103   return false;
00104 }
00105 
00106 /******************************************************************************/
00107 
00108 bool SiteTools::isComplete(const Site& site)
00109 {
00110   // Main loop : for all characters in site
00111   for (size_t i = 0; i < site.size(); i++)
00112   {
00113     if (site.getAlphabet()->isGap(site[i]) || site.getAlphabet()->isUnresolved(site[i]))
00114       return false;
00115   }
00116   return true;
00117 }
00118 
00119 /******************************************************************************/
00120 
00121 bool SiteTools::areSitesIdentical(const Site& site1, const Site& site2)
00122 {
00123   // Site's size and content checking
00124   if (site1.getAlphabet()->getAlphabetType() != site2.getAlphabet()->getAlphabetType())
00125     return false;
00126   if (site1.size() != site2.size())
00127     return false;
00128   else
00129   {
00130     for (size_t i = 0; i < site1.size(); i++)
00131     {
00132       if (site1[i] != site2[i])
00133         return false;
00134     }
00135     return true;
00136   }
00137 }
00138 
00139 /******************************************************************************/
00140 
00141 bool SiteTools::isConstant(const Site& site, bool ignoreUnknown, bool unresolvedRaisesException) throw (EmptySiteException)
00142 {
00143   // Empty site checking
00144   if (site.size() == 0)
00145     throw EmptySiteException("SiteTools::isConstant: Incorrect specified site, size must be > 0", &site);
00146 
00147   // For all site's characters
00148   int gap = site.getAlphabet()->getGapCharacterCode();
00149   if (ignoreUnknown)
00150   {
00151     int s = site[0];
00152     int unknown = site.getAlphabet()->getUnknownCharacterCode();
00153     size_t i = 0;
00154     while (i < site.size() && (s == gap || s == unknown))
00155     {
00156       s = site[i];
00157       i++;
00158     }
00159     if (s == unknown || s == gap)
00160     {
00161       if (unresolvedRaisesException)
00162         throw EmptySiteException("SiteTools::isConstant: Site is only made of gaps or generic characters.");
00163       else
00164         return false;
00165     }
00166     while (i < site.size())
00167     {
00168       if (site[i] != s && site[i] != gap && site[i] != unknown)
00169         return false;
00170       i++;
00171     }
00172   }
00173   else
00174   {
00175     int s = site[0];
00176     size_t i = 0;
00177     while  (i < site.size() && s == gap)
00178     {
00179       s = site[i];
00180       i++;
00181     }
00182     if (s == gap)
00183     {
00184       if (unresolvedRaisesException)
00185         throw EmptySiteException("SiteTools::isConstant: Site is only made of gaps.");
00186       else
00187         return false;
00188     }
00189     while (i < site.size())
00190     {
00191       if (site[i] != s && site[i] != gap)
00192         return false;
00193       i++;
00194     }
00195   }
00196 
00197   return true;
00198 }
00199 
00200 /******************************************************************************/
00201 
00202 double SiteTools::variabilityShannon(const Site& site, bool resolveUnknown) throw (EmptySiteException)
00203 {
00204   // Empty site checking
00205   if (site.size() == 0)
00206     throw EmptySiteException("SiteTools::variabilityShannon: Incorrect specified site, size must be > 0", &site);
00207   map<int, double> p;
00208   getFrequencies(site, p, resolveUnknown);
00209   // We need to correct frequencies for gaps:
00210   double s = 0.;
00211   for (int i = 0; i < static_cast<int>(site.getAlphabet()->getSize()); i++)
00212   {
00213     double f = p[i];
00214     if (f > 0)
00215       s += f * log(f);
00216   }
00217   return -s;
00218 }
00219 
00220 /******************************************************************************/
00221 
00222 double SiteTools::mutualInformation(const Site& site1, const Site& site2, bool resolveUnknown) throw (DimensionException, EmptySiteException)
00223 {
00224   // Empty site checking
00225   if (site1.size() == 0)
00226     throw EmptySiteException("SiteTools::mutualInformation: Incorrect specified site, size must be > 0", &site1);
00227   if (site2.size() == 0)
00228     throw EmptySiteException("SiteTools::mutualInformation: Incorrect specified site, size must be > 0", &site2);
00229   if (site1.size() != site2.size())
00230     throw DimensionException("SiteTools::mutualInformation: sites must have the same size!", site1.size(), site2.size());
00231   vector<double> p1(site1.getAlphabet()->getSize());
00232   vector<double> p2(site2.getAlphabet()->getSize());
00233   map<int, map<int, double> > p12;
00234   getCounts(site1, site2, p12, resolveUnknown);
00235   double mi = 0, tot = 0, pxy;
00236   // We need to correct frequencies for gaps:
00237   for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++)
00238   {
00239     for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++)
00240     {
00241       pxy = p12[static_cast<int>(i)][static_cast<int>(j)];
00242       tot += pxy;
00243       p1[i] += pxy;
00244       p2[j] += pxy;
00245     }
00246   }
00247   for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++)
00248   {
00249     p1[i] /= tot;
00250   }
00251   for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++)
00252   {
00253     p2[j] /= tot;
00254   }
00255   for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++)
00256   {
00257     for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++)
00258     {
00259       pxy = p12[static_cast<int>(i)][static_cast<int>(j)] / tot;
00260       if (pxy > 0)
00261         mi += pxy * log(pxy / (p1[i] * p2[j]));
00262     }
00263   }
00264   return mi;
00265 }
00266 
00267 /******************************************************************************/
00268 
00269 double SiteTools::jointEntropy(const Site& site1, const Site& site2, bool resolveUnknown) throw (DimensionException, EmptySiteException)
00270 {
00271   // Empty site checking
00272   if (site1.size() == 0)
00273     throw EmptySiteException("SiteTools::jointEntropy: Incorrect specified site, size must be > 0", &site1);
00274   if (site2.size() == 0)
00275     throw EmptySiteException("SiteTools::jointEntropy: Incorrect specified site, size must be > 0", &site2);
00276   if (site1.size() != site2.size())
00277     throw DimensionException("SiteTools::jointEntropy: sites must have the same size!", site1.size(), site2.size());
00278   map<int, map<int, double> > p12;
00279   getCounts(site1, site2, p12, resolveUnknown);
00280   double tot = 0, pxy, h = 0;
00281   // We need to correct frequencies for gaps:
00282   for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++)
00283   {
00284     for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++)
00285     {
00286       pxy = p12[static_cast<int>(i)][static_cast<int>(j)];
00287       tot += pxy;
00288     }
00289   }
00290   for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++)
00291   {
00292     for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++)
00293     {
00294       pxy = p12[static_cast<int>(i)][static_cast<int>(j)] / tot;
00295       if (pxy > 0)
00296         h += pxy * log(pxy);
00297     }
00298   }
00299   return -h;
00300 }
00301 
00302 /******************************************************************************/
00303 
00304 double SiteTools::variabilityFactorial(const Site& site) throw (EmptySiteException)
00305 {
00306   // Empty site checking
00307   if (site.size() == 0)
00308     throw EmptySiteException("SiteTools::variabilityFactorial: Incorrect specified site, size must be > 0", &site);
00309   map<int, size_t> p;
00310   getCounts(site, p);
00311   vector<size_t> c = MapTools::getValues(p);
00312   size_t s = VectorTools::sum(c);
00313   long double l = static_cast<long double>(NumTools::fact(s)) / static_cast<long double>(VectorTools::sum(VectorTools::fact(c)));
00314   return (static_cast<double>(std::log(l)));
00315 }
00316 
00317 /******************************************************************************/
00318 
00319 double SiteTools::heterozygosity(const Site& site) throw (EmptySiteException)
00320 {
00321   // Empty site checking
00322   if (site.size() == 0)
00323     throw EmptySiteException("SiteTools::heterozygosity: Incorrect specified site, size must be > 0", &site);
00324   map<int, double> p;
00325   getFrequencies(site, p);
00326   vector<double> c = MapTools::getValues(p);
00327   double n = VectorTools::norm<double, double>(MapTools::getValues(p));
00328   return 1. - n * n;
00329 }
00330 
00331 /******************************************************************************/
00332 
00333 size_t SiteTools::getNumberOfDistinctCharacters(const Site& site) throw (EmptySiteException)
00334 {
00335   // Empty site checking
00336   if (site.size() == 0)
00337     throw EmptySiteException("SiteTools::getNumberOfDistinctCharacters(): Incorrect specified site, size must be > 0", &site);
00338   // For all site's characters
00339   if (SiteTools::isConstant(site))
00340     return 1;
00341   map<int, size_t> counts;
00342   SymbolListTools::getCounts(site, counts);
00343   int s = 0;
00344   for (map<int, size_t>::iterator it = counts.begin(); it != counts.end(); it++)
00345   {
00346     if (it->second != 0)
00347       s++;
00348   }
00349   return s;
00350 }
00351 
00352 /******************************************************************************/
00353 
00354 bool SiteTools::hasSingleton(const Site& site) throw (EmptySiteException)
00355 {
00356   // Empty site checking
00357   if (site.size() == 0)
00358     throw EmptySiteException("SiteTools::hasSingleton: Incorrect specified site, size must be > 0", &site);
00359   // For all site's characters
00360   if (SiteTools::isConstant(site))
00361     return false;
00362   map<int, size_t> counts;
00363   getCounts(site, counts);
00364   for (map<int, size_t>::iterator it = counts.begin(); it != counts.end(); it++)
00365   {
00366     if (it->second == 1)
00367       return true;
00368   }
00369   return false;
00370 }
00371 
00372 /******************************************************************************/
00373 
00374 bool SiteTools::isParsimonyInformativeSite(const Site& site) throw (EmptySiteException)
00375 {
00376   // Empty site checking
00377   if (site.size() == 0)
00378     throw EmptySiteException("SiteTools::isParsimonyInformativeSite: Incorrect specified site, size must be > 0", &site);
00379   // For all site's characters
00380   if (SiteTools::isConstant(site, false, false))
00381     return false;
00382   map<int, size_t> counts;
00383   SymbolListTools::getCounts(site, counts);
00384   size_t npars = 0;
00385   for (map<int, size_t>::iterator it = counts.begin(); it != counts.end(); it++)
00386   {
00387     if (it->second > 1)
00388       npars++;
00389   }
00390   if (npars > 1)
00391     return true;
00392   return false;
00393 }
00394 
00395 /******************************************************************************/
00396 
00397 bool SiteTools::isTriplet(const Site& site) throw (EmptySiteException)
00398 {
00399   // Empty site checking
00400   if (site.size() == 0)
00401     throw EmptySiteException("SiteTools::isTriplet: Incorrect specified site, size must be > 0", &site);
00402   // For all site's characters
00403   if (SiteTools::getNumberOfDistinctCharacters(site) >= 3)
00404     return true;
00405   else
00406     return false;
00407 }
00408 
00409 /******************************************************************************/
00410 
 All Classes Namespaces Files Functions Variables Typedefs Friends