|
bpp-phyl
2.1.0
|
00001 // 00002 // File: SubstitutionMappingTools.cpp 00003 // Created by: Julien Dutheil 00004 // Created on: Wed Apr 5 13:04 2006 00005 // 00006 00007 /* 00008 Copyright or © or Copr. Bio++ Development Team, (November 16, 2004, 2005, 2006) 00009 00010 This software is a computer program whose purpose is to provide classes 00011 for phylogenetic data analysis. 00012 00013 This software is governed by the CeCILL license under French law and 00014 abiding by the rules of distribution of free software. You can use, 00015 modify and/ or redistribute the software under the terms of the CeCILL 00016 license as circulated by CEA, CNRS and INRIA at the following URL 00017 "http://www.cecill.info". 00018 00019 As a counterpart to the access to the source code and rights to copy, 00020 modify and redistribute granted by the license, users are provided only 00021 with a limited warranty and the software's author, the holder of the 00022 economic rights, and the successive licensors have only limited 00023 liability. 00024 00025 In this respect, the user's attention is drawn to the risks associated 00026 with loading, using, modifying and/or developing or reproducing the 00027 software by the user in light of its specific status of free software, 00028 that may mean that it is complicated to manipulate, and that also 00029 therefore means that it is reserved for developers and experienced 00030 professionals having in-depth computer knowledge. Users are therefore 00031 encouraged to load and test the software's suitability as regards their 00032 requirements in conditions enabling the security of their systems and/or 00033 data to be ensured and, more generally, to use and operate it in the 00034 same conditions as regards security. 00035 00036 The fact that you are presently reading this means that you have had 00037 knowledge of the CeCILL license and that you accept its terms. 00038 */ 00039 00040 #include "SubstitutionMappingTools.h" 00041 #include "../Likelihood/DRTreeLikelihoodTools.h" 00042 #include "../Likelihood/MarginalAncestralStateReconstruction.h" 00043 00044 #include <Bpp/Text/TextTools.h> 00045 #include <Bpp/App/ApplicationTools.h> 00046 #include <Bpp/Numeric/Matrix/MatrixTools.h> 00047 #include <Bpp/Numeric/DataTable.h> 00048 00049 using namespace bpp; 00050 00051 // From the STL: 00052 #include <iomanip> 00053 00054 using namespace std; 00055 00056 /******************************************************************************/ 00057 00058 ProbabilisticSubstitutionMapping* SubstitutionMappingTools::computeSubstitutionVectors( 00059 const DRTreeLikelihood& drtl, 00060 SubstitutionCount& substitutionCount, 00061 bool verbose) throw (Exception) 00062 { 00063 //Preamble: 00064 if (!drtl.isInitialized()) throw Exception("SubstitutionMappingTools::computeSubstitutionVectors(). Likelihood object is not initialized."); 00065 00066 //A few variables we'll need: 00067 00068 const TreeTemplate<Node> tree(drtl.getTree()); 00069 const SiteContainer* sequences = drtl.getData(); 00070 const DiscreteDistribution* rDist = drtl.getRateDistribution(); 00071 00072 size_t nbSites = sequences->getNumberOfSites(); 00073 size_t nbDistinctSites = drtl.getLikelihoodData()->getNumberOfDistinctSites(); 00074 size_t nbStates = sequences->getAlphabet()->getSize(); 00075 size_t nbClasses = rDist->getNumberOfCategories(); 00076 size_t nbTypes = substitutionCount.getNumberOfSubstitutionTypes(); 00077 vector<const Node*> nodes = tree.getNodes(); 00078 const vector<size_t>* rootPatternLinks 00079 = &drtl.getLikelihoodData()->getRootArrayPositions(); 00080 nodes.pop_back(); // Remove root node. 00081 size_t nbNodes = nodes.size(); 00082 00083 // We create a new ProbabilisticSubstitutionMapping object: 00084 ProbabilisticSubstitutionMapping* substitutions = new ProbabilisticSubstitutionMapping(tree, &substitutionCount, nbSites); 00085 00086 // Store likelihood for each rate for each site: 00087 VVVdouble lik; 00088 drtl.computeLikelihoodAtNode(tree.getRootId(), lik); 00089 Vdouble Lr(nbDistinctSites, 0); 00090 Vdouble rcProbs = rDist->getProbabilities(); 00091 Vdouble rcRates = rDist->getCategories(); 00092 for (size_t i = 0; i < nbDistinctSites; i++) 00093 { 00094 VVdouble* lik_i = &lik[i]; 00095 for (size_t c = 0; c < nbClasses; c++) 00096 { 00097 Vdouble* lik_i_c = &(*lik_i)[c]; 00098 double rc = rDist->getProbability(c); 00099 for (size_t s = 0; s < nbStates; s++) 00100 { 00101 Lr[i] += (*lik_i_c)[s] * rc; 00102 } 00103 } 00104 } 00105 00106 // Compute the number of substitutions for each class and each branch in the tree: 00107 if (verbose) ApplicationTools::displayTask("Compute joint node-pairs likelihood", true); 00108 00109 for (size_t l = 0; l < nbNodes; l++) 00110 { 00111 //For each node, 00112 const Node* currentNode = nodes[l]; 00113 00114 const Node* father = currentNode->getFather(); 00115 00116 double d = currentNode->getDistanceToFather(); 00117 00118 if (verbose) ApplicationTools::displayGauge(l, nbNodes-1, '>'); 00119 VVdouble substitutionsForCurrentNode(nbDistinctSites); 00120 for (size_t i = 0; i < nbDistinctSites; ++i) 00121 substitutionsForCurrentNode[i].resize(nbTypes); 00122 00123 // Now we've got to compute likelihoods in a smart manner... ;) 00124 VVVdouble likelihoodsFatherConstantPart(nbDistinctSites); 00125 for (size_t i = 0; i < nbDistinctSites; i++) 00126 { 00127 VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i]; 00128 likelihoodsFatherConstantPart_i->resize(nbClasses); 00129 for (size_t c = 0; c < nbClasses; c++) 00130 { 00131 Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c]; 00132 likelihoodsFatherConstantPart_i_c->resize(nbStates); 00133 double rc = rDist->getProbability(c); 00134 for (size_t s = 0; s < nbStates; s++) 00135 { 00136 //(* likelihoodsFatherConstantPart_i_c)[s] = rc * model->freq(s); 00137 //freq is already accounted in the array 00138 (* likelihoodsFatherConstantPart_i_c)[s] = rc; 00139 } 00140 } 00141 } 00142 00143 // First, what will remain constant: 00144 size_t nbSons = father->getNumberOfSons(); 00145 for (size_t n = 0; n < nbSons; n++) 00146 { 00147 const Node* currentSon = father->getSon(n); 00148 if (currentSon->getId() != currentNode->getId()) 00149 { 00150 const VVVdouble* likelihoodsFather_son = &drtl.getLikelihoodData()->getLikelihoodArray(father->getId(), currentSon->getId()); 00151 00152 //Now iterate over all site partitions: 00153 auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(currentSon->getId())); 00154 VVVdouble pxy; 00155 bool first; 00156 while (mit->hasNext()) 00157 { 00158 TreeLikelihood::ConstBranchModelDescription* bmd = mit->next(); 00159 auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator()); 00160 first = true; 00161 while (sit->hasNext()) 00162 { 00163 size_t i = sit->next(); 00164 //We retrieve the transition probabilities for this site partition: 00165 if (first) 00166 { 00167 pxy = drtl.getTransitionProbabilitiesPerRateClass(currentSon->getId(), i); 00168 first = false; 00169 } 00170 const VVdouble* likelihoodsFather_son_i = &(*likelihoodsFather_son)[i]; 00171 VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i]; 00172 for (size_t c = 0; c < nbClasses; c++) 00173 { 00174 const Vdouble* likelihoodsFather_son_i_c = &(*likelihoodsFather_son_i)[c]; 00175 Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c]; 00176 VVdouble* pxy_c = &pxy[c]; 00177 for (size_t x = 0; x < nbStates; x++) 00178 { 00179 Vdouble* pxy_c_x = &(*pxy_c)[x]; 00180 double likelihood = 0.; 00181 for (size_t y = 0; y < nbStates; y++) 00182 { 00183 likelihood += (*pxy_c_x)[y] * (*likelihoodsFather_son_i_c)[y]; 00184 } 00185 (*likelihoodsFatherConstantPart_i_c)[x] *= likelihood; 00186 } 00187 } 00188 } 00189 } 00190 } 00191 } 00192 if (father->hasFather()) 00193 { 00194 const Node* currentSon = father->getFather(); 00195 const VVVdouble* likelihoodsFather_son = &drtl.getLikelihoodData()->getLikelihoodArray(father->getId(), currentSon->getId()); 00196 //Now iterate over all site partitions: 00197 auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(father->getId())); 00198 VVVdouble pxy; 00199 bool first; 00200 while (mit->hasNext()) 00201 { 00202 TreeLikelihood::ConstBranchModelDescription* bmd = mit->next(); 00203 auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator()); 00204 first = true; 00205 while (sit->hasNext()) 00206 { 00207 size_t i = sit->next(); 00208 //We retrieve the transition probabilities for this site partition: 00209 if (first) 00210 { 00211 pxy = drtl.getTransitionProbabilitiesPerRateClass(father->getId(), i); 00212 first = false; 00213 } 00214 const VVdouble* likelihoodsFather_son_i = &(*likelihoodsFather_son)[i]; 00215 VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i]; 00216 for (size_t c = 0; c < nbClasses; c++) 00217 { 00218 const Vdouble* likelihoodsFather_son_i_c = &(*likelihoodsFather_son_i)[c]; 00219 Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c]; 00220 VVdouble* pxy_c = &pxy[c]; 00221 for (size_t x = 0; x < nbStates; x++) 00222 { 00223 double likelihood = 0.; 00224 for (size_t y = 0; y < nbStates; y++) 00225 { 00226 Vdouble* pxy_c_x = &(*pxy_c)[y]; 00227 likelihood += (*pxy_c_x)[x] * (*likelihoodsFather_son_i_c)[y]; 00228 } 00229 (*likelihoodsFatherConstantPart_i_c)[x] *= likelihood; 00230 } 00231 } 00232 } 00233 } 00234 } 00235 else 00236 { 00237 //Account for root frequencies: 00238 for (size_t i = 0; i < nbDistinctSites; i++) 00239 { 00240 vector<double> freqs = drtl.getRootFrequencies(i); 00241 VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i]; 00242 for (size_t c = 0; c < nbClasses; c++) 00243 { 00244 Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c]; 00245 for (size_t x = 0; x < nbStates; x++) 00246 { 00247 (*likelihoodsFatherConstantPart_i_c)[x] *= freqs[x]; 00248 } 00249 } 00250 } 00251 } 00252 00253 // Then, we deal with the node of interest. 00254 // We first average uppon 'y' to save computations, and then uppon 'x'. 00255 // ('y' is the state at 'node' and 'x' the state at 'father'.) 00256 00257 //Iterate over all site partitions: 00258 const VVVdouble* likelihoodsFather_node = &(drtl.getLikelihoodData()->getLikelihoodArray(father->getId(), currentNode->getId())); 00259 auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(currentNode->getId())); 00260 VVVdouble pxy; 00261 bool first; 00262 while (mit->hasNext()) 00263 { 00264 TreeLikelihood::ConstBranchModelDescription* bmd = mit->next(); 00265 substitutionCount.setSubstitutionModel(bmd->getModel()); 00266 //compute all nxy first: 00267 VVVVdouble nxy(nbClasses); 00268 for (size_t c = 0; c < nbClasses; ++c) 00269 { 00270 VVVdouble* nxy_c = &nxy[c]; 00271 double rc = rcRates[c]; 00272 nxy_c->resize(nbTypes); 00273 for (size_t t = 0; t < nbTypes; ++t) 00274 { 00275 VVdouble* nxy_c_t = &(*nxy_c)[t]; 00276 Matrix<double>* nijt = substitutionCount.getAllNumbersOfSubstitutions(d * rc, t + 1); 00277 nxy_c_t->resize(nbStates); 00278 for (size_t x = 0; x < nbStates; ++x) 00279 { 00280 Vdouble* nxy_c_t_x = &(*nxy_c_t)[x]; 00281 nxy_c_t_x->resize(nbStates); 00282 for (size_t y = 0; y < nbStates; ++y) 00283 (*nxy_c_t_x)[y] = (*nijt)(x, y); 00284 } 00285 delete nijt; 00286 } 00287 } 00288 00289 //Now loop over sites: 00290 auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator()); 00291 first = true; 00292 while (sit->hasNext()) 00293 { 00294 size_t i = sit->next(); 00295 //We retrieve the transition probabilities and substitution counts for this site partition: 00296 if (first) 00297 { 00298 pxy = drtl.getTransitionProbabilitiesPerRateClass(currentNode->getId(), i); 00299 first = false; 00300 } 00301 const VVdouble* likelihoodsFather_node_i = &(*likelihoodsFather_node)[i]; 00302 VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i]; 00303 for (size_t c = 0; c < nbClasses; ++c) 00304 { 00305 const Vdouble* likelihoodsFather_node_i_c = &(*likelihoodsFather_node_i)[c]; 00306 Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c]; 00307 const VVdouble* pxy_c = &pxy[c]; 00308 VVVdouble* nxy_c = &nxy[c]; 00309 for (size_t x = 0; x < nbStates; ++x) 00310 { 00311 double* likelihoodsFatherConstantPart_i_c_x = &(*likelihoodsFatherConstantPart_i_c)[x]; 00312 const Vdouble* pxy_c_x = &(*pxy_c)[x]; 00313 for (size_t y = 0; y < nbStates; ++y) 00314 { 00315 double likelihood_cxy = (*likelihoodsFatherConstantPart_i_c_x) 00316 *(*pxy_c_x)[y] 00317 *(*likelihoodsFather_node_i_c)[y]; 00318 00319 for (size_t t = 0; t < nbTypes; ++t) { 00320 // Now the vector computation: 00321 substitutionsForCurrentNode[i][t] += likelihood_cxy * (*nxy_c)[t][x][y]; 00322 // <------------> <---------------> 00323 // Posterior probability | | 00324 // for site i and rate class c * | | 00325 // likelihood for this site----------------+ | 00326 // | 00327 //Substitution function for site i and rate class c----------+ 00328 } 00329 } 00330 } 00331 } 00332 } 00333 } 00334 00335 //Now we just have to copy the substitutions into the result vector: 00336 for (size_t i = 0; i < nbSites; ++i) 00337 for (size_t t = 0; t < nbTypes; ++t) 00338 (*substitutions)(l, i, t) = substitutionsForCurrentNode[(* rootPatternLinks)[i]][t] / Lr[(* rootPatternLinks)[i]]; 00339 } 00340 if (verbose) 00341 { 00342 if (ApplicationTools::message) *ApplicationTools::message << " "; 00343 ApplicationTools::displayTaskDone(); 00344 } 00345 return substitutions; 00346 } 00347 00348 /**************************************************************************************************/ 00349 00350 ProbabilisticSubstitutionMapping* SubstitutionMappingTools::computeSubstitutionVectorsNoAveraging( 00351 const DRTreeLikelihood& drtl, 00352 SubstitutionCount& substitutionCount, 00353 bool verbose) throw (Exception) 00354 { 00355 //Preamble: 00356 if (!drtl.isInitialized()) throw Exception("SubstitutionMappingTools::computeSubstitutionVectorsNoAveraging(). Likelihood object is not initialized."); 00357 00358 //A few variables we'll need: 00359 const TreeTemplate<Node> tree(drtl.getTree()); 00360 const SiteContainer* sequences = drtl.getData(); 00361 const DiscreteDistribution* rDist = drtl.getRateDistribution(); 00362 00363 size_t nbSites = sequences->getNumberOfSites(); 00364 size_t nbDistinctSites = drtl.getLikelihoodData()->getNumberOfDistinctSites(); 00365 size_t nbStates = sequences->getAlphabet()->getSize(); 00366 size_t nbClasses = rDist->getNumberOfCategories(); 00367 size_t nbTypes = substitutionCount.getNumberOfSubstitutionTypes(); 00368 vector<const Node *> nodes = tree.getNodes(); 00369 const vector<size_t> * rootPatternLinks 00370 = &drtl.getLikelihoodData()->getRootArrayPositions(); 00371 nodes.pop_back(); // Remove root node. 00372 size_t nbNodes = nodes.size(); 00373 00374 // We create a new ProbabilisticSubstitutionMapping object: 00375 ProbabilisticSubstitutionMapping* substitutions = new ProbabilisticSubstitutionMapping(tree, &substitutionCount, nbSites); 00376 00377 Vdouble rcRates = rDist->getCategories(); 00378 00379 // Compute the number of substitutions for each class and each branch in the tree: 00380 if (verbose) ApplicationTools::displayTask("Compute joint node-pairs likelihood", true); 00381 00382 for (size_t l = 0; l < nbNodes; ++l) 00383 { 00384 // For each node, 00385 const Node* currentNode = nodes[l]; 00386 00387 const Node* father = currentNode->getFather(); 00388 00389 double d = currentNode->getDistanceToFather(); 00390 00391 if (verbose) ApplicationTools::displayGauge(l, nbNodes-1, '>'); 00392 VVdouble substitutionsForCurrentNode(nbDistinctSites); 00393 for (size_t i = 0; i < nbDistinctSites; ++i) 00394 substitutionsForCurrentNode[i].resize(nbTypes); 00395 00396 // Now we've got to compute likelihoods in a smart manner... ;) 00397 VVVdouble likelihoodsFatherConstantPart(nbDistinctSites); 00398 for (size_t i = 0; i < nbDistinctSites; ++i) 00399 { 00400 VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i]; 00401 likelihoodsFatherConstantPart_i->resize(nbClasses); 00402 for (size_t c = 0; c < nbClasses; ++c) 00403 { 00404 Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c]; 00405 likelihoodsFatherConstantPart_i_c->resize(nbStates); 00406 double rc = rDist->getProbability(c); 00407 for (size_t s = 0; s < nbStates; ++s) 00408 { 00409 //(* likelihoodsFatherConstantPart_i_c)[s] = rc * model->freq(s); 00410 //freq is already accounted in the array 00411 (* likelihoodsFatherConstantPart_i_c)[s] = rc; 00412 } 00413 } 00414 } 00415 00416 // First, what will remain constant: 00417 size_t nbSons = father->getNumberOfSons(); 00418 for (size_t n = 0; n < nbSons; ++n) 00419 { 00420 const Node* currentSon = father->getSon(n); 00421 if (currentSon->getId() != currentNode->getId()) 00422 { 00423 const VVVdouble* likelihoodsFather_son = &drtl.getLikelihoodData()->getLikelihoodArray(father->getId(), currentSon->getId()); 00424 00425 //Now iterate over all site partitions: 00426 auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(currentSon->getId())); 00427 VVVdouble pxy; 00428 bool first; 00429 while (mit->hasNext()) 00430 { 00431 TreeLikelihood::ConstBranchModelDescription* bmd = mit->next(); 00432 auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator()); 00433 first = true; 00434 while (sit->hasNext()) 00435 { 00436 size_t i = sit->next(); 00437 //We retrieve the transition probabilities for this site partition: 00438 if (first) 00439 { 00440 pxy = drtl.getTransitionProbabilitiesPerRateClass(currentSon->getId(), i); 00441 first = false; 00442 } 00443 const VVdouble* likelihoodsFather_son_i = &(*likelihoodsFather_son)[i]; 00444 VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i]; 00445 for (size_t c = 0; c < nbClasses; ++c) 00446 { 00447 const Vdouble* likelihoodsFather_son_i_c = &(*likelihoodsFather_son_i)[c]; 00448 Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c]; 00449 VVdouble* pxy_c = & pxy[c]; 00450 for (size_t x = 0; x < nbStates; ++x) 00451 { 00452 Vdouble* pxy_c_x = &(*pxy_c)[x]; 00453 double likelihood = 0.; 00454 for (size_t y = 0; y < nbStates; ++y) 00455 { 00456 likelihood += (*pxy_c_x)[y] * (*likelihoodsFather_son_i_c)[y]; 00457 } 00458 (*likelihoodsFatherConstantPart_i_c)[x] *= likelihood; 00459 } 00460 } 00461 } 00462 } 00463 } 00464 } 00465 if (father->hasFather()) 00466 { 00467 const Node* currentSon = father->getFather(); 00468 const VVVdouble* likelihoodsFather_son = &drtl.getLikelihoodData()->getLikelihoodArray(father->getId(), currentSon->getId()); 00469 //Now iterate over all site partitions: 00470 auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(father->getId())); 00471 VVVdouble pxy; 00472 bool first; 00473 while (mit->hasNext()) 00474 { 00475 TreeLikelihood::ConstBranchModelDescription* bmd = mit->next(); 00476 auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator()); 00477 first = true; 00478 while (sit->hasNext()) 00479 { 00480 size_t i = sit->next(); 00481 //We retrieve the transition probabilities for this site partition: 00482 if (first) 00483 { 00484 pxy = drtl.getTransitionProbabilitiesPerRateClass(father->getId(), i); 00485 first = false; 00486 } 00487 const VVdouble* likelihoodsFather_son_i = &(*likelihoodsFather_son)[i]; 00488 VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i]; 00489 for (size_t c = 0; c < nbClasses; ++c) 00490 { 00491 const Vdouble* likelihoodsFather_son_i_c = &(*likelihoodsFather_son_i)[c]; 00492 Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c]; 00493 VVdouble* pxy_c = &pxy[c]; 00494 for (size_t x = 0; x < nbStates; ++x) 00495 { 00496 double likelihood = 0.; 00497 for (size_t y = 0; y < nbStates; ++y) 00498 { 00499 Vdouble* pxy_c_x = &(*pxy_c)[y]; 00500 likelihood += (*pxy_c_x)[x] * (*likelihoodsFather_son_i_c)[y]; 00501 } 00502 (*likelihoodsFatherConstantPart_i_c)[x] *= likelihood; 00503 } 00504 } 00505 } 00506 } 00507 } 00508 else 00509 { 00510 //Account for root frequencies: 00511 for (size_t i = 0; i < nbDistinctSites; ++i) 00512 { 00513 vector<double> freqs = drtl.getRootFrequencies(i); 00514 VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i]; 00515 for (size_t c = 0; c < nbClasses; ++c) 00516 { 00517 Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c]; 00518 for (size_t x = 0; x < nbStates; ++x) 00519 { 00520 (*likelihoodsFatherConstantPart_i_c)[x] *= freqs[x]; 00521 } 00522 } 00523 } 00524 } 00525 00526 // Then, we deal with the node of interest. 00527 // We first average uppon 'y' to save computations, and then uppon 'x'. 00528 // ('y' is the state at 'node' and 'x' the state at 'father'.) 00529 00530 //Iterate over all site partitions: 00531 const VVVdouble* likelihoodsFather_node = &drtl.getLikelihoodData()->getLikelihoodArray(father->getId(), currentNode->getId()); 00532 auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(currentNode->getId())); 00533 VVVdouble pxy; 00534 bool first; 00535 while (mit->hasNext()) 00536 { 00537 TreeLikelihood::ConstBranchModelDescription* bmd = mit->next(); 00538 substitutionCount.setSubstitutionModel(bmd->getModel()); 00539 //compute all nxy first: 00540 VVVVdouble nxy(nbClasses); 00541 for (size_t c = 0; c < nbClasses; ++c) 00542 { 00543 double rc = rcRates[c]; 00544 VVVdouble* nxy_c = &nxy[c]; 00545 nxy_c->resize(nbTypes); 00546 for (size_t t = 0; t < nbTypes; ++t) 00547 { 00548 VVdouble* nxy_c_t = &(*nxy_c)[t]; 00549 nxy_c_t->resize(nbStates); 00550 Matrix<double>* nijt = substitutionCount.getAllNumbersOfSubstitutions(d * rc, t + 1); 00551 for (size_t x = 0; x < nbStates; ++x) 00552 { 00553 Vdouble* nxy_c_t_x = &(*nxy_c_t)[x]; 00554 nxy_c_t_x->resize(nbStates); 00555 for (size_t y = 0; y < nbStates; ++y) 00556 { 00557 (*nxy_c_t_x)[y] = (*nijt)(x, y); 00558 } 00559 } 00560 delete nijt; 00561 } 00562 } 00563 00564 //Now loop over sites: 00565 auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator()); 00566 first = true; 00567 while (sit->hasNext()) 00568 { 00569 size_t i = sit->next(); 00570 //We retrieve the transition probabilities and substitution counts for this site partition: 00571 if (first) 00572 { 00573 pxy = drtl.getTransitionProbabilitiesPerRateClass(currentNode->getId(), i); 00574 first = false; 00575 } 00576 const VVdouble* likelihoodsFather_node_i = &(*likelihoodsFather_node)[i]; 00577 VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i]; 00578 RowMatrix<double> pairProbabilities(nbStates, nbStates); 00579 MatrixTools::fill(pairProbabilities, 0.); 00580 VVVdouble subsCounts(nbStates); 00581 for (size_t j = 0; j < nbStates; ++j) { 00582 subsCounts[j].resize(nbStates); 00583 for (size_t k = 0; k < nbStates; ++k) { 00584 subsCounts[j][k].resize(nbTypes); 00585 } 00586 } 00587 for (size_t c = 0; c < nbClasses; ++c) 00588 { 00589 const Vdouble* likelihoodsFather_node_i_c = &(*likelihoodsFather_node_i)[c]; 00590 Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c]; 00591 const VVdouble* pxy_c = &pxy[c]; 00592 VVVdouble* nxy_c = &nxy[c]; 00593 for (size_t x = 0; x < nbStates; ++x) 00594 { 00595 double* likelihoodsFatherConstantPart_i_c_x = &(*likelihoodsFatherConstantPart_i_c)[x]; 00596 const Vdouble* pxy_c_x = &(*pxy_c)[x]; 00597 for (size_t y = 0; y < nbStates; ++y) 00598 { 00599 double likelihood_cxy = (*likelihoodsFatherConstantPart_i_c_x) 00600 *(*pxy_c_x)[y] 00601 *(*likelihoodsFather_node_i_c)[y]; 00602 pairProbabilities(x, y) += likelihood_cxy; // Sum over all rate classes. 00603 for (size_t t = 0; t < nbTypes; ++t) { 00604 subsCounts[x][y][t] += likelihood_cxy * (* nxy_c)[t][x][y]; 00605 } 00606 } 00607 } 00608 } 00609 // Now the vector computation: 00610 // Here we do not average over all possible pair of ancestral states, 00611 // We only consider the one with max likelihood: 00612 vector<size_t> xy = MatrixTools::whichMax(pairProbabilities); 00613 for (size_t t = 0; t < nbTypes; ++t) { 00614 substitutionsForCurrentNode[i][t] += subsCounts[xy[0]][xy[1]][t] / pairProbabilities(xy[0], xy[1]); 00615 } 00616 } 00617 } 00618 //Now we just have to copy the substitutions into the result vector: 00619 for (size_t i = 0; i < nbSites; i++) 00620 for (size_t t = 0; t < nbTypes; t++) 00621 (*substitutions)(l, i, t) = substitutionsForCurrentNode[(*rootPatternLinks)[i]][t]; 00622 } 00623 if (verbose) 00624 { 00625 if (ApplicationTools::message) *ApplicationTools::message << " "; 00626 ApplicationTools::displayTaskDone(); 00627 } 00628 return substitutions; 00629 } 00630 00631 /**************************************************************************************************/ 00632 00633 ProbabilisticSubstitutionMapping* SubstitutionMappingTools::computeSubstitutionVectorsNoAveragingMarginal( 00634 const DRTreeLikelihood& drtl, 00635 SubstitutionCount& substitutionCount, 00636 bool verbose) throw (Exception) 00637 { 00638 //Preamble: 00639 if (!drtl.isInitialized()) throw Exception("SubstitutionMappingTools::computeSubstitutionVectorsNoAveragingMarginal(). Likelihood object is not initialized."); 00640 00641 //A few variables we'll need: 00642 00643 const TreeTemplate<Node> tree(drtl.getTree()); 00644 const SiteContainer* sequences = drtl.getData(); 00645 const DiscreteDistribution* rDist = drtl.getRateDistribution(); 00646 const Alphabet* alpha = sequences->getAlphabet(); 00647 00648 size_t nbSites = sequences->getNumberOfSites(); 00649 size_t nbDistinctSites = drtl.getLikelihoodData()->getNumberOfDistinctSites(); 00650 size_t nbStates = alpha->getSize(); 00651 size_t nbTypes = substitutionCount.getNumberOfSubstitutionTypes(); 00652 vector<const Node*> nodes = tree.getNodes(); 00653 const vector<size_t>* rootPatternLinks 00654 = &drtl.getLikelihoodData()->getRootArrayPositions(); 00655 nodes.pop_back(); // Remove root node. 00656 size_t nbNodes = nodes.size(); 00657 00658 // We create a new ProbabilisticSubstitutionMapping object: 00659 ProbabilisticSubstitutionMapping* substitutions = new ProbabilisticSubstitutionMapping(tree, &substitutionCount, nbSites); 00660 00661 // Compute the whole likelihood of the tree according to the specified model: 00662 00663 Vdouble rcRates = rDist->getCategories(); 00664 00665 // Compute the number of substitutions for each class and each branch in the tree: 00666 if (verbose) ApplicationTools::displayTask("Compute marginal ancestral states"); 00667 MarginalAncestralStateReconstruction masr(&drtl); 00668 map<int, vector<size_t> > ancestors = masr.getAllAncestralStates(); 00669 if (verbose) ApplicationTools::displayTaskDone(); 00670 00671 // Now we just have to compute the substitution vectors: 00672 if (verbose) ApplicationTools::displayTask("Compute substitution vectors", true); 00673 00674 for (size_t l = 0; l < nbNodes; l++) 00675 { 00676 const Node* currentNode = nodes[l]; 00677 00678 const Node* father = currentNode->getFather(); 00679 00680 double d = currentNode->getDistanceToFather(); 00681 00682 vector<size_t> nodeStates = ancestors[currentNode->getId()]; //These are not 'true' ancestors ;) 00683 vector<size_t> fatherStates = ancestors[father->getId()]; 00684 00685 //For each node, 00686 if (verbose) ApplicationTools::displayGauge(l, nbNodes-1, '>'); 00687 VVdouble substitutionsForCurrentNode(nbDistinctSites); 00688 for (size_t i = 0; i < nbDistinctSites; ++i) 00689 substitutionsForCurrentNode[i].resize(nbTypes); 00690 00691 // Here, we have no likelihood computation to do! 00692 00693 // Then, we deal with the node of interest. 00694 // ('y' is the state at 'node' and 'x' the state at 'father'.) 00695 // Iterate over all site partitions: 00696 auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(currentNode->getId())); 00697 while (mit->hasNext()) 00698 { 00699 TreeLikelihood::ConstBranchModelDescription* bmd = mit->next(); 00700 substitutionCount.setSubstitutionModel(bmd->getModel()); 00701 //compute all nxy first: 00702 VVVdouble nxyt(nbTypes); 00703 for (size_t t = 0; t < nbTypes; ++t) 00704 { 00705 nxyt[t].resize(nbStates); 00706 Matrix<double>* nxy = substitutionCount.getAllNumbersOfSubstitutions(d, t + 1); 00707 for (size_t x = 0; x < nbStates; ++x) { 00708 nxyt[t][x].resize(nbStates); 00709 for (size_t y = 0; y < nbStates; ++y) { 00710 nxyt[t][x][y] = (*nxy)(x, y); 00711 } 00712 } 00713 delete nxy; 00714 } 00715 //Now loop over sites: 00716 auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator()); 00717 while (sit->hasNext()) 00718 { 00719 size_t i = sit->next(); 00720 size_t fatherState = fatherStates[i]; 00721 size_t nodeState = nodeStates[i]; 00722 if (fatherState >= nbStates || nodeState >= nbStates) 00723 for (size_t t = 0; t < nbTypes; ++t) 00724 substitutionsForCurrentNode[i][t] = 0; // To be conservative! Only in case there are generic characters. 00725 else 00726 for (size_t t = 0; t < nbTypes; ++t) 00727 substitutionsForCurrentNode[i][t] = nxyt[t][fatherState][nodeState]; 00728 } 00729 } 00730 00731 //Now we just have to copy the substitutions into the result vector: 00732 for (size_t i = 0; i < nbSites; i++) 00733 for (size_t t = 0; t < nbTypes; t++) 00734 (*substitutions)(l, i, t) = substitutionsForCurrentNode[(*rootPatternLinks)[i]][t]; 00735 } 00736 if (verbose) 00737 { 00738 if (ApplicationTools::message) *ApplicationTools::message << " "; 00739 ApplicationTools::displayTaskDone(); 00740 } 00741 return substitutions; 00742 } 00743 00744 /**************************************************************************************************/ 00745 00746 ProbabilisticSubstitutionMapping* SubstitutionMappingTools::computeSubstitutionVectorsMarginal( 00747 const DRTreeLikelihood& drtl, 00748 SubstitutionCount& substitutionCount, 00749 bool verbose) throw (Exception) 00750 { 00751 //Preamble: 00752 if (!drtl.isInitialized()) throw Exception("SubstitutionMappingTools::computeSubstitutionVectorsMarginal(). Likelihood object is not initialized."); 00753 00754 //A few variables we'll need: 00755 00756 const TreeTemplate<Node>tree(drtl.getTree()); 00757 const SiteContainer* sequences = drtl.getData(); 00758 const DiscreteDistribution* rDist = drtl.getRateDistribution(); 00759 00760 size_t nbSites = sequences->getNumberOfSites(); 00761 size_t nbDistinctSites = drtl.getLikelihoodData()->getNumberOfDistinctSites(); 00762 size_t nbStates = sequences->getAlphabet()->getSize(); 00763 size_t nbClasses = rDist->getNumberOfCategories(); 00764 size_t nbTypes = substitutionCount.getNumberOfSubstitutionTypes(); 00765 vector<const Node*> nodes = tree.getNodes(); 00766 const vector<size_t>* rootPatternLinks 00767 = &drtl.getLikelihoodData()->getRootArrayPositions(); 00768 nodes.pop_back(); // Remove root node. 00769 size_t nbNodes = nodes.size(); 00770 00771 // We create a new ProbabilisticSubstitutionMapping object: 00772 ProbabilisticSubstitutionMapping* substitutions = new ProbabilisticSubstitutionMapping(tree, &substitutionCount, nbSites); 00773 00774 // Compute the whole likelihood of the tree according to the specified model: 00775 00776 Vdouble rcProbs = rDist->getProbabilities(); 00777 Vdouble rcRates = rDist->getCategories(); 00778 00779 //II) Compute the number of substitutions for each class and each branch in the tree: 00780 if (verbose) ApplicationTools::displayTask("Compute marginal node-pairs likelihoods", true); 00781 00782 for (size_t l = 0; l < nbNodes; l++) 00783 { 00784 const Node* currentNode = nodes[l]; 00785 00786 const Node* father = currentNode->getFather(); 00787 00788 double d = currentNode->getDistanceToFather(); 00789 00790 //For each node, 00791 if (verbose) ApplicationTools::displayGauge(l, nbNodes-1, '>'); 00792 VVdouble substitutionsForCurrentNode(nbDistinctSites); 00793 for (size_t i = 0; i < nbDistinctSites; ++i) 00794 substitutionsForCurrentNode[i].resize(nbTypes); 00795 00796 // Then, we deal with the node of interest. 00797 // ('y' is the state at 'node' and 'x' the state at 'father'.) 00798 VVVdouble probsNode = DRTreeLikelihoodTools::getPosteriorProbabilitiesForEachStateForEachRate(drtl, currentNode->getId()); 00799 VVVdouble probsFather = DRTreeLikelihoodTools::getPosteriorProbabilitiesForEachStateForEachRate(drtl, father->getId()); 00800 00801 //Iterate over all site partitions: 00802 auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(currentNode->getId())); 00803 while (mit->hasNext()) 00804 { 00805 TreeLikelihood::ConstBranchModelDescription* bmd = mit->next(); 00806 substitutionCount.setSubstitutionModel(bmd->getModel()); 00807 //compute all nxy first: 00808 VVVVdouble nxy(nbClasses); 00809 for (size_t c = 0; c < nbClasses; ++c) 00810 { 00811 VVVdouble* nxy_c = &nxy[c]; 00812 double rc = rcRates[c]; 00813 nxy_c->resize(nbTypes); 00814 for (size_t t = 0; t < nbTypes; ++t) 00815 { 00816 VVdouble* nxy_c_t = &(*nxy_c)[t]; 00817 Matrix<double>* nijt = substitutionCount.getAllNumbersOfSubstitutions(d * rc, t + 1); 00818 nxy_c_t->resize(nbStates); 00819 for (size_t x = 0; x < nbStates; ++x) 00820 { 00821 Vdouble* nxy_c_t_x = &(*nxy_c_t)[x]; 00822 nxy_c_t_x->resize(nbStates); 00823 for (size_t y = 0; y < nbStates; ++y) 00824 { 00825 (*nxy_c_t_x)[y] = (*nijt)(x, y); 00826 } 00827 } 00828 delete nijt; 00829 } 00830 } 00831 00832 //Now loop over sites: 00833 auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator()); 00834 while (sit->hasNext()) 00835 { 00836 size_t i = sit->next(); 00837 VVdouble* probsNode_i = & probsNode[i]; 00838 VVdouble* probsFather_i = & probsFather[i]; 00839 for (size_t c = 0; c < nbClasses; ++c) 00840 { 00841 Vdouble* probsNode_i_c = &(*probsNode_i)[c]; 00842 Vdouble* probsFather_i_c = &(*probsFather_i)[c]; 00843 VVVdouble* nxy_c = &nxy[c]; 00844 for (size_t x = 0; x < nbStates; ++x) 00845 { 00846 for (size_t y = 0; y < nbStates; ++y) 00847 { 00848 double prob_cxy = (*probsFather_i_c)[x] * (*probsNode_i_c)[y]; 00849 // Now the vector computation: 00850 for (size_t t = 0; t < nbTypes; ++t) { 00851 substitutionsForCurrentNode[i][t] += prob_cxy * (*nxy_c)[t][x][y]; 00852 // <------> <---------------> 00853 // Posterior probability | | 00854 // for site i and rate class c * | | 00855 // likelihood for this site--------------+ | 00856 // | 00857 //Substitution function for site i and rate class c-------+ 00858 } 00859 } 00860 } 00861 } 00862 } 00863 } 00864 00865 //Now we just have to copy the substitutions into the result vector: 00866 for (size_t i = 0; i < nbSites; ++i) 00867 for (size_t t = 0; t < nbTypes; ++t) 00868 (*substitutions)(l, i, t) = substitutionsForCurrentNode[(* rootPatternLinks)[i]][t]; 00869 } 00870 if (verbose) 00871 { 00872 if (ApplicationTools::message) *ApplicationTools::message << " "; 00873 ApplicationTools::displayTaskDone(); 00874 } 00875 return substitutions; 00876 } 00877 00878 /**************************************************************************************************/ 00879 00880 void SubstitutionMappingTools::writeToStream( 00881 const ProbabilisticSubstitutionMapping& substitutions, 00882 const SiteContainer& sites, 00883 size_t type, 00884 ostream& out) 00885 throw (IOException) 00886 { 00887 if (!out) throw IOException("SubstitutionMappingTools::writeToFile. Can't write to stream."); 00888 out << "Branches"; 00889 out << "\tMean"; 00890 for (size_t i = 0; i < substitutions.getNumberOfSites(); i++) 00891 { 00892 out << "\tSite" << sites.getSite(i).getPosition(); 00893 } 00894 out << endl; 00895 00896 for (size_t j = 0; j < substitutions.getNumberOfBranches(); j++) 00897 { 00898 out << substitutions.getNode(j)->getId() << "\t" << substitutions.getNode(j)->getDistanceToFather(); 00899 for (size_t i = 0; i < substitutions.getNumberOfSites(); i++) 00900 { 00901 out << "\t" << substitutions(j, i, type); 00902 } 00903 out << endl; 00904 } 00905 } 00906 00907 /**************************************************************************************************/ 00908 00909 void SubstitutionMappingTools::readFromStream(istream& in, ProbabilisticSubstitutionMapping& substitutions, size_t type) 00910 throw (IOException) 00911 { 00912 try 00913 { 00914 DataTable* data = DataTable::read(in, "\t", true, -1); 00915 vector<string> ids = data->getColumn(0); 00916 data->deleteColumn(0);//Remove ids 00917 data->deleteColumn(0);//Remove means 00918 //Now parse the table: 00919 size_t nbSites = data->getNumberOfColumns(); 00920 substitutions.setNumberOfSites(nbSites); 00921 size_t nbBranches = data->getNumberOfRows(); 00922 for (size_t i = 0; i < nbBranches; i++) 00923 { 00924 int id = TextTools::toInt(ids[i]); 00925 size_t br = substitutions.getNodeIndex(id); 00926 for (size_t j = 0; j < nbSites; j++) 00927 { 00928 substitutions(br, j, type) = TextTools::toDouble((*data)(i, j)); 00929 } 00930 } 00931 //Parse the header: 00932 for (size_t i = 0; i < nbSites; i++) 00933 { 00934 string siteTxt = data->getColumnName(i); 00935 int site = 0; 00936 if (siteTxt.substr(0,4) == "Site") site = TextTools::to<int>(siteTxt.substr(4)); 00937 else site = TextTools::to<int>(siteTxt); 00938 substitutions.setSitePosition(i, site); 00939 } 00940 00941 delete data; 00942 } 00943 catch (Exception& e) 00944 { 00945 throw IOException(string("Bad input file. ") + e.what()); 00946 } 00947 } 00948 00949 /**************************************************************************************************/ 00950 00951 vector<double> SubstitutionMappingTools::computeTotalSubstitutionVectorForSite(const SubstitutionMapping& smap, size_t siteIndex) 00952 { 00953 size_t nbBranches = smap.getNumberOfBranches(); 00954 size_t nbTypes = smap.getNumberOfSubstitutionTypes(); 00955 Vdouble v(nbBranches); 00956 for (size_t l = 0; l < nbBranches; ++l) { 00957 v[l] = 0; 00958 for (size_t t = 0; t < nbTypes; ++t) { 00959 v[l] += smap(l, siteIndex, t); 00960 } 00961 } 00962 return v; 00963 } 00964 00965 /**************************************************************************************************/ 00966 00967 double SubstitutionMappingTools::computeNormForSite(const SubstitutionMapping& smap, size_t siteIndex) 00968 { 00969 double sumSquare = 0; 00970 for (size_t l = 0; l < smap.getNumberOfBranches(); ++l) { 00971 double sum = 0; 00972 for (size_t t = 0; t < smap.getNumberOfSubstitutionTypes(); ++t) { 00973 sum += smap(l, siteIndex, t); 00974 } 00975 sumSquare += sum * sum; 00976 } 00977 return sqrt(sumSquare); 00978 } 00979 00980 /**************************************************************************************************/ 00981 00982 vector<double> SubstitutionMappingTools::computeSumForBranch(const SubstitutionMapping& smap, size_t branchIndex) 00983 { 00984 size_t nbSites = smap.getNumberOfSites(); 00985 size_t nbTypes = smap.getNumberOfSubstitutionTypes(); 00986 Vdouble v(nbTypes, 0); 00987 for (size_t i = 0; i < nbSites; ++i) { 00988 for (size_t t = 0; t < nbTypes; ++t) { 00989 v[t] += smap(branchIndex, i, t); 00990 } 00991 } 00992 return v; 00993 } 00994 00995 /**************************************************************************************************/ 00996 00997 vector<double> SubstitutionMappingTools::computeSumForSite(const SubstitutionMapping& smap, size_t siteIndex) 00998 { 00999 size_t nbBranches = smap.getNumberOfBranches(); 01000 size_t nbTypes = smap.getNumberOfSubstitutionTypes(); 01001 Vdouble v(nbTypes, 0); 01002 for (size_t i = 0; i < nbBranches; ++i) { 01003 for (size_t t = 0; t < nbTypes; ++t) { 01004 v[t] += smap(i, siteIndex, t); 01005 } 01006 } 01007 return v; 01008 } 01009 01010 /**************************************************************************************************/ 01011