|
bpp-seq
2.1.0
|
00001 // 00002 // File SiteTools.cpp 00003 // Author : Julien Dutheil 00004 // Guillaume Deuchst 00005 // Created on: Friday August 8 2003 00006 // 00007 00008 /* 00009 Copyright or © or Copr. Bio++ Development Team, (November 17, 2004) 00010 00011 This software is a computer program whose purpose is to provide classes 00012 for sequences analysis. 00013 00014 This software is governed by the CeCILL license under French law and 00015 abiding by the rules of distribution of free software. You can use, 00016 modify and/ or redistribute the software under the terms of the CeCILL 00017 license as circulated by CEA, CNRS and INRIA at the following URL 00018 "http://www.cecill.info". 00019 00020 As a counterpart to the access to the source code and rights to copy, 00021 modify and redistribute granted by the license, users are provided only 00022 with a limited warranty and the software's author, the holder of the 00023 economic rights, and the successive licensors have only limited 00024 liability. 00025 00026 In this respect, the user's attention is drawn to the risks associated 00027 with loading, using, modifying and/or developing or reproducing the 00028 software by the user in light of its specific status of free software, 00029 that may mean that it is complicated to manipulate, and that also 00030 therefore means that it is reserved for developers and experienced 00031 professionals having in-depth computer knowledge. Users are therefore 00032 encouraged to load and test the software's suitability as regards their 00033 requirements in conditions enabling the security of their systems and/or 00034 data to be ensured and, more generally, to use and operate it in the 00035 same conditions as regards security. 00036 00037 The fact that you are presently reading this means that you have had 00038 knowledge of the CeCILL license and that you accept its terms. 00039 */ 00040 00041 #include "SiteTools.h" 00042 #include "Alphabet/CodonAlphabet.h" 00043 #include <Bpp/Utils/MapTools.h> 00044 #include <Bpp/Numeric/NumTools.h> 00045 #include <Bpp/Numeric/VectorTools.h> 00046 00047 using namespace bpp; 00048 00049 // From the STL: 00050 #include <cmath> 00051 00052 using namespace std; 00053 00054 /******************************************************************************/ 00055 00056 bool SiteTools::hasGap(const Site& site) 00057 { 00058 // Main loop : for all characters in site 00059 for (size_t i = 0; i < site.size(); i++) 00060 { 00061 if (site.getAlphabet()->isGap(site[i])) 00062 return true; 00063 } 00064 return false; 00065 } 00066 00067 /******************************************************************************/ 00068 00069 bool SiteTools::isGapOnly(const Site& site) 00070 { 00071 // Main loop : for all characters in site 00072 for (size_t i = 0; i < site.size(); i++) 00073 { 00074 if (!site.getAlphabet()->isGap(site[i])) 00075 return false; 00076 } 00077 return true; 00078 } 00079 00080 /******************************************************************************/ 00081 00082 bool SiteTools::isGapOrUnresolvedOnly(const Site& site) 00083 { 00084 // Main loop : for all characters in site 00085 for (size_t i = 0; i < site.size(); i++) 00086 { 00087 if (!site.getAlphabet()->isGap(site[i]) && !site.getAlphabet()->isUnresolved(site[i])) 00088 return false; 00089 } 00090 return true; 00091 } 00092 00093 /******************************************************************************/ 00094 00095 bool SiteTools::hasUnknown(const Site& site) 00096 { 00097 // Main loop : for all characters in site 00098 for (size_t i = 0; i < site.size(); i++) 00099 { 00100 if (site[i] == site.getAlphabet()->getUnknownCharacterCode()) 00101 return true; 00102 } 00103 return false; 00104 } 00105 00106 /******************************************************************************/ 00107 00108 bool SiteTools::isComplete(const Site& site) 00109 { 00110 // Main loop : for all characters in site 00111 for (size_t i = 0; i < site.size(); i++) 00112 { 00113 if (site.getAlphabet()->isGap(site[i]) || site.getAlphabet()->isUnresolved(site[i])) 00114 return false; 00115 } 00116 return true; 00117 } 00118 00119 /******************************************************************************/ 00120 00121 bool SiteTools::areSitesIdentical(const Site& site1, const Site& site2) 00122 { 00123 // Site's size and content checking 00124 if (site1.getAlphabet()->getAlphabetType() != site2.getAlphabet()->getAlphabetType()) 00125 return false; 00126 if (site1.size() != site2.size()) 00127 return false; 00128 else 00129 { 00130 for (size_t i = 0; i < site1.size(); i++) 00131 { 00132 if (site1[i] != site2[i]) 00133 return false; 00134 } 00135 return true; 00136 } 00137 } 00138 00139 /******************************************************************************/ 00140 00141 bool SiteTools::isConstant(const Site& site, bool ignoreUnknown, bool unresolvedRaisesException) throw (EmptySiteException) 00142 { 00143 // Empty site checking 00144 if (site.size() == 0) 00145 throw EmptySiteException("SiteTools::isConstant: Incorrect specified site, size must be > 0", &site); 00146 00147 // For all site's characters 00148 int gap = site.getAlphabet()->getGapCharacterCode(); 00149 if (ignoreUnknown) 00150 { 00151 int s = site[0]; 00152 int unknown = site.getAlphabet()->getUnknownCharacterCode(); 00153 size_t i = 0; 00154 while (i < site.size() && (s == gap || s == unknown)) 00155 { 00156 s = site[i]; 00157 i++; 00158 } 00159 if (s == unknown || s == gap) 00160 { 00161 if (unresolvedRaisesException) 00162 throw EmptySiteException("SiteTools::isConstant: Site is only made of gaps or generic characters."); 00163 else 00164 return false; 00165 } 00166 while (i < site.size()) 00167 { 00168 if (site[i] != s && site[i] != gap && site[i] != unknown) 00169 return false; 00170 i++; 00171 } 00172 } 00173 else 00174 { 00175 int s = site[0]; 00176 size_t i = 0; 00177 while (i < site.size() && s == gap) 00178 { 00179 s = site[i]; 00180 i++; 00181 } 00182 if (s == gap) 00183 { 00184 if (unresolvedRaisesException) 00185 throw EmptySiteException("SiteTools::isConstant: Site is only made of gaps."); 00186 else 00187 return false; 00188 } 00189 while (i < site.size()) 00190 { 00191 if (site[i] != s && site[i] != gap) 00192 return false; 00193 i++; 00194 } 00195 } 00196 00197 return true; 00198 } 00199 00200 /******************************************************************************/ 00201 00202 double SiteTools::variabilityShannon(const Site& site, bool resolveUnknown) throw (EmptySiteException) 00203 { 00204 // Empty site checking 00205 if (site.size() == 0) 00206 throw EmptySiteException("SiteTools::variabilityShannon: Incorrect specified site, size must be > 0", &site); 00207 map<int, double> p; 00208 getFrequencies(site, p, resolveUnknown); 00209 // We need to correct frequencies for gaps: 00210 double s = 0.; 00211 for (int i = 0; i < static_cast<int>(site.getAlphabet()->getSize()); i++) 00212 { 00213 double f = p[i]; 00214 if (f > 0) 00215 s += f * log(f); 00216 } 00217 return -s; 00218 } 00219 00220 /******************************************************************************/ 00221 00222 double SiteTools::mutualInformation(const Site& site1, const Site& site2, bool resolveUnknown) throw (DimensionException, EmptySiteException) 00223 { 00224 // Empty site checking 00225 if (site1.size() == 0) 00226 throw EmptySiteException("SiteTools::mutualInformation: Incorrect specified site, size must be > 0", &site1); 00227 if (site2.size() == 0) 00228 throw EmptySiteException("SiteTools::mutualInformation: Incorrect specified site, size must be > 0", &site2); 00229 if (site1.size() != site2.size()) 00230 throw DimensionException("SiteTools::mutualInformation: sites must have the same size!", site1.size(), site2.size()); 00231 vector<double> p1(site1.getAlphabet()->getSize()); 00232 vector<double> p2(site2.getAlphabet()->getSize()); 00233 map<int, map<int, double> > p12; 00234 getCounts(site1, site2, p12, resolveUnknown); 00235 double mi = 0, tot = 0, pxy; 00236 // We need to correct frequencies for gaps: 00237 for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++) 00238 { 00239 for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++) 00240 { 00241 pxy = p12[static_cast<int>(i)][static_cast<int>(j)]; 00242 tot += pxy; 00243 p1[i] += pxy; 00244 p2[j] += pxy; 00245 } 00246 } 00247 for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++) 00248 { 00249 p1[i] /= tot; 00250 } 00251 for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++) 00252 { 00253 p2[j] /= tot; 00254 } 00255 for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++) 00256 { 00257 for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++) 00258 { 00259 pxy = p12[static_cast<int>(i)][static_cast<int>(j)] / tot; 00260 if (pxy > 0) 00261 mi += pxy * log(pxy / (p1[i] * p2[j])); 00262 } 00263 } 00264 return mi; 00265 } 00266 00267 /******************************************************************************/ 00268 00269 double SiteTools::jointEntropy(const Site& site1, const Site& site2, bool resolveUnknown) throw (DimensionException, EmptySiteException) 00270 { 00271 // Empty site checking 00272 if (site1.size() == 0) 00273 throw EmptySiteException("SiteTools::jointEntropy: Incorrect specified site, size must be > 0", &site1); 00274 if (site2.size() == 0) 00275 throw EmptySiteException("SiteTools::jointEntropy: Incorrect specified site, size must be > 0", &site2); 00276 if (site1.size() != site2.size()) 00277 throw DimensionException("SiteTools::jointEntropy: sites must have the same size!", site1.size(), site2.size()); 00278 map<int, map<int, double> > p12; 00279 getCounts(site1, site2, p12, resolveUnknown); 00280 double tot = 0, pxy, h = 0; 00281 // We need to correct frequencies for gaps: 00282 for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++) 00283 { 00284 for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++) 00285 { 00286 pxy = p12[static_cast<int>(i)][static_cast<int>(j)]; 00287 tot += pxy; 00288 } 00289 } 00290 for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++) 00291 { 00292 for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++) 00293 { 00294 pxy = p12[static_cast<int>(i)][static_cast<int>(j)] / tot; 00295 if (pxy > 0) 00296 h += pxy * log(pxy); 00297 } 00298 } 00299 return -h; 00300 } 00301 00302 /******************************************************************************/ 00303 00304 double SiteTools::variabilityFactorial(const Site& site) throw (EmptySiteException) 00305 { 00306 // Empty site checking 00307 if (site.size() == 0) 00308 throw EmptySiteException("SiteTools::variabilityFactorial: Incorrect specified site, size must be > 0", &site); 00309 map<int, size_t> p; 00310 getCounts(site, p); 00311 vector<size_t> c = MapTools::getValues(p); 00312 size_t s = VectorTools::sum(c); 00313 long double l = static_cast<long double>(NumTools::fact(s)) / static_cast<long double>(VectorTools::sum(VectorTools::fact(c))); 00314 return (static_cast<double>(std::log(l))); 00315 } 00316 00317 /******************************************************************************/ 00318 00319 double SiteTools::heterozygosity(const Site& site) throw (EmptySiteException) 00320 { 00321 // Empty site checking 00322 if (site.size() == 0) 00323 throw EmptySiteException("SiteTools::heterozygosity: Incorrect specified site, size must be > 0", &site); 00324 map<int, double> p; 00325 getFrequencies(site, p); 00326 vector<double> c = MapTools::getValues(p); 00327 double n = VectorTools::norm<double, double>(MapTools::getValues(p)); 00328 return 1. - n * n; 00329 } 00330 00331 /******************************************************************************/ 00332 00333 size_t SiteTools::getNumberOfDistinctCharacters(const Site& site) throw (EmptySiteException) 00334 { 00335 // Empty site checking 00336 if (site.size() == 0) 00337 throw EmptySiteException("SiteTools::getNumberOfDistinctCharacters(): Incorrect specified site, size must be > 0", &site); 00338 // For all site's characters 00339 if (SiteTools::isConstant(site)) 00340 return 1; 00341 map<int, size_t> counts; 00342 SymbolListTools::getCounts(site, counts); 00343 int s = 0; 00344 for (map<int, size_t>::iterator it = counts.begin(); it != counts.end(); it++) 00345 { 00346 if (it->second != 0) 00347 s++; 00348 } 00349 return s; 00350 } 00351 00352 /******************************************************************************/ 00353 00354 bool SiteTools::hasSingleton(const Site& site) throw (EmptySiteException) 00355 { 00356 // Empty site checking 00357 if (site.size() == 0) 00358 throw EmptySiteException("SiteTools::hasSingleton: Incorrect specified site, size must be > 0", &site); 00359 // For all site's characters 00360 if (SiteTools::isConstant(site)) 00361 return false; 00362 map<int, size_t> counts; 00363 getCounts(site, counts); 00364 for (map<int, size_t>::iterator it = counts.begin(); it != counts.end(); it++) 00365 { 00366 if (it->second == 1) 00367 return true; 00368 } 00369 return false; 00370 } 00371 00372 /******************************************************************************/ 00373 00374 bool SiteTools::isParsimonyInformativeSite(const Site& site) throw (EmptySiteException) 00375 { 00376 // Empty site checking 00377 if (site.size() == 0) 00378 throw EmptySiteException("SiteTools::isParsimonyInformativeSite: Incorrect specified site, size must be > 0", &site); 00379 // For all site's characters 00380 if (SiteTools::isConstant(site, false, false)) 00381 return false; 00382 map<int, size_t> counts; 00383 SymbolListTools::getCounts(site, counts); 00384 size_t npars = 0; 00385 for (map<int, size_t>::iterator it = counts.begin(); it != counts.end(); it++) 00386 { 00387 if (it->second > 1) 00388 npars++; 00389 } 00390 if (npars > 1) 00391 return true; 00392 return false; 00393 } 00394 00395 /******************************************************************************/ 00396 00397 bool SiteTools::isTriplet(const Site& site) throw (EmptySiteException) 00398 { 00399 // Empty site checking 00400 if (site.size() == 0) 00401 throw EmptySiteException("SiteTools::isTriplet: Incorrect specified site, size must be > 0", &site); 00402 // For all site's characters 00403 if (SiteTools::getNumberOfDistinctCharacters(site) >= 3) 00404 return true; 00405 else 00406 return false; 00407 } 00408 00409 /******************************************************************************/ 00410