bpp-phyl  2.1.0
Bpp/Phyl/Mapping/SubstitutionMappingTools.cpp
Go to the documentation of this file.
00001 //
00002 // File: SubstitutionMappingTools.cpp
00003 // Created by: Julien Dutheil
00004 // Created on: Wed Apr 5 13:04 2006
00005 //
00006 
00007 /*
00008 Copyright or © or Copr. Bio++ Development Team, (November 16, 2004, 2005, 2006)
00009 
00010 This software is a computer program whose purpose is to provide classes
00011 for phylogenetic data analysis.
00012 
00013 This software is governed by the CeCILL  license under French law and
00014 abiding by the rules of distribution of free software.  You can  use, 
00015 modify and/ or redistribute the software under the terms of the CeCILL
00016 license as circulated by CEA, CNRS and INRIA at the following URL
00017 "http://www.cecill.info". 
00018 
00019 As a counterpart to the access to the source code and  rights to copy,
00020 modify and redistribute granted by the license, users are provided only
00021 with a limited warranty  and the software's author,  the holder of the
00022 economic rights,  and the successive licensors  have only  limited
00023 liability. 
00024 
00025 In this respect, the user's attention is drawn to the risks associated
00026 with loading,  using,  modifying and/or developing or reproducing the
00027 software by the user in light of its specific status of free software,
00028 that may mean  that it is complicated to manipulate,  and  that  also
00029 therefore means  that it is reserved for developers  and  experienced
00030 professionals having in-depth computer knowledge. Users are therefore
00031 encouraged to load and test the software's suitability as regards their
00032 requirements in conditions enabling the security of their systems and/or 
00033 data to be ensured and,  more generally, to use and operate it in the 
00034 same conditions as regards security. 
00035 
00036 The fact that you are presently reading this means that you have had
00037 knowledge of the CeCILL license and that you accept its terms.
00038 */
00039 
00040 #include "SubstitutionMappingTools.h"
00041 #include "../Likelihood/DRTreeLikelihoodTools.h"
00042 #include "../Likelihood/MarginalAncestralStateReconstruction.h"
00043 
00044 #include <Bpp/Text/TextTools.h>
00045 #include <Bpp/App/ApplicationTools.h>
00046 #include <Bpp/Numeric/Matrix/MatrixTools.h>
00047 #include <Bpp/Numeric/DataTable.h>
00048 
00049 using namespace bpp;
00050 
00051 // From the STL:
00052 #include <iomanip>
00053 
00054 using namespace std;
00055 
00056 /******************************************************************************/
00057 
00058 ProbabilisticSubstitutionMapping* SubstitutionMappingTools::computeSubstitutionVectors(
00059   const DRTreeLikelihood& drtl,
00060   SubstitutionCount& substitutionCount,
00061   bool verbose) throw (Exception)
00062 {
00063   //Preamble:
00064   if (!drtl.isInitialized()) throw Exception("SubstitutionMappingTools::computeSubstitutionVectors(). Likelihood object is not initialized.");
00065                                    
00066   //A few variables we'll need:
00067   
00068   const TreeTemplate<Node> tree(drtl.getTree());
00069   const SiteContainer*    sequences = drtl.getData();
00070   const DiscreteDistribution* rDist = drtl.getRateDistribution();
00071     
00072   size_t nbSites         = sequences->getNumberOfSites();
00073   size_t nbDistinctSites = drtl.getLikelihoodData()->getNumberOfDistinctSites();
00074   size_t nbStates        = sequences->getAlphabet()->getSize();
00075   size_t nbClasses       = rDist->getNumberOfCategories();
00076   size_t nbTypes         = substitutionCount.getNumberOfSubstitutionTypes();
00077   vector<const Node*> nodes    = tree.getNodes();
00078   const vector<size_t>* rootPatternLinks
00079                                = &drtl.getLikelihoodData()->getRootArrayPositions();
00080   nodes.pop_back(); // Remove root node.
00081   size_t nbNodes         = nodes.size();
00082   
00083   // We create a new ProbabilisticSubstitutionMapping object:
00084   ProbabilisticSubstitutionMapping* substitutions = new ProbabilisticSubstitutionMapping(tree, &substitutionCount, nbSites);
00085                                    
00086   // Store likelihood for each rate for each site:
00087   VVVdouble lik;
00088   drtl.computeLikelihoodAtNode(tree.getRootId(), lik);
00089   Vdouble Lr(nbDistinctSites, 0);
00090   Vdouble rcProbs = rDist->getProbabilities();
00091   Vdouble rcRates = rDist->getCategories();
00092   for (size_t i = 0; i < nbDistinctSites; i++)
00093   {
00094     VVdouble* lik_i = &lik[i];
00095     for (size_t c = 0; c < nbClasses; c++)
00096     {
00097       Vdouble* lik_i_c = &(*lik_i)[c];
00098       double rc = rDist->getProbability(c);
00099       for (size_t s = 0; s < nbStates; s++)
00100       {
00101         Lr[i] += (*lik_i_c)[s] * rc;
00102       }
00103     }
00104   }
00105 
00106   // Compute the number of substitutions for each class and each branch in the tree:
00107   if (verbose) ApplicationTools::displayTask("Compute joint node-pairs likelihood", true);
00108   
00109   for (size_t l = 0; l < nbNodes; l++)
00110   {
00111     //For each node,
00112     const Node* currentNode = nodes[l];
00113 
00114     const Node* father = currentNode->getFather();
00115 
00116     double d = currentNode->getDistanceToFather();
00117  
00118     if (verbose) ApplicationTools::displayGauge(l, nbNodes-1, '>');
00119     VVdouble substitutionsForCurrentNode(nbDistinctSites);
00120     for (size_t i = 0; i < nbDistinctSites; ++i)
00121       substitutionsForCurrentNode[i].resize(nbTypes);
00122 
00123     // Now we've got to compute likelihoods in a smart manner... ;)
00124     VVVdouble likelihoodsFatherConstantPart(nbDistinctSites);
00125     for (size_t i = 0; i < nbDistinctSites; i++)
00126     {
00127       VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
00128       likelihoodsFatherConstantPart_i->resize(nbClasses);
00129       for (size_t c = 0; c < nbClasses; c++)
00130       {
00131         Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
00132         likelihoodsFatherConstantPart_i_c->resize(nbStates);
00133         double rc = rDist->getProbability(c);
00134         for (size_t s = 0; s < nbStates; s++)
00135         {
00136           //(* likelihoodsFatherConstantPart_i_c)[s] = rc * model->freq(s);
00137           //freq is already accounted in the array
00138           (* likelihoodsFatherConstantPart_i_c)[s] = rc;
00139         }
00140       }
00141     }
00142     
00143     // First, what will remain constant:
00144     size_t nbSons =  father->getNumberOfSons();
00145     for (size_t n = 0; n < nbSons; n++)
00146     {
00147       const Node* currentSon = father->getSon(n);
00148       if (currentSon->getId() != currentNode->getId())
00149       {
00150         const VVVdouble* likelihoodsFather_son = &drtl.getLikelihoodData()->getLikelihoodArray(father->getId(), currentSon->getId());
00151 
00152         //Now iterate over all site partitions:
00153         auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(currentSon->getId()));
00154         VVVdouble pxy;
00155         bool first;
00156         while (mit->hasNext())
00157         {
00158           TreeLikelihood::ConstBranchModelDescription* bmd = mit->next();
00159           auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator());
00160           first = true;
00161           while (sit->hasNext())
00162           {
00163             size_t i = sit->next();
00164             //We retrieve the transition probabilities for this site partition:
00165             if (first)
00166             {
00167               pxy = drtl.getTransitionProbabilitiesPerRateClass(currentSon->getId(), i);
00168               first = false;
00169             }
00170             const VVdouble* likelihoodsFather_son_i = &(*likelihoodsFather_son)[i];
00171             VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
00172             for (size_t c = 0; c < nbClasses; c++)
00173             {
00174               const Vdouble* likelihoodsFather_son_i_c = &(*likelihoodsFather_son_i)[c];
00175               Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
00176               VVdouble* pxy_c = &pxy[c];
00177               for (size_t x = 0; x < nbStates; x++)
00178               {
00179                 Vdouble* pxy_c_x = &(*pxy_c)[x];
00180                 double likelihood = 0.;
00181                 for (size_t y = 0; y < nbStates; y++)
00182                 {
00183                   likelihood += (*pxy_c_x)[y] * (*likelihoodsFather_son_i_c)[y];
00184                 }
00185                 (*likelihoodsFatherConstantPart_i_c)[x] *= likelihood;
00186               }
00187             }
00188           }
00189         }      
00190       }
00191     }
00192     if (father->hasFather())
00193     {
00194       const Node* currentSon = father->getFather();
00195       const VVVdouble* likelihoodsFather_son = &drtl.getLikelihoodData()->getLikelihoodArray(father->getId(), currentSon->getId());
00196       //Now iterate over all site partitions:
00197       auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(father->getId()));
00198       VVVdouble pxy;
00199       bool first;
00200       while (mit->hasNext())
00201       {
00202         TreeLikelihood::ConstBranchModelDescription* bmd = mit->next();
00203         auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator());
00204         first = true;
00205         while (sit->hasNext())
00206         {
00207           size_t i = sit->next();
00208           //We retrieve the transition probabilities for this site partition:
00209           if (first)
00210           {
00211             pxy = drtl.getTransitionProbabilitiesPerRateClass(father->getId(), i);
00212             first = false;
00213           }
00214           const VVdouble* likelihoodsFather_son_i = &(*likelihoodsFather_son)[i];
00215           VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
00216           for (size_t c = 0; c < nbClasses; c++)
00217           {
00218             const Vdouble* likelihoodsFather_son_i_c = &(*likelihoodsFather_son_i)[c];
00219             Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
00220             VVdouble* pxy_c = &pxy[c]; 
00221             for (size_t x = 0; x < nbStates; x++)
00222             {
00223               double likelihood = 0.;
00224               for (size_t y = 0; y < nbStates; y++)
00225               {
00226                 Vdouble* pxy_c_x = &(*pxy_c)[y];
00227                 likelihood += (*pxy_c_x)[x] * (*likelihoodsFather_son_i_c)[y];
00228               }
00229               (*likelihoodsFatherConstantPart_i_c)[x] *= likelihood;
00230             }
00231           }
00232         }
00233       }      
00234     }
00235     else
00236     {
00237       //Account for root frequencies:
00238       for (size_t i = 0; i < nbDistinctSites; i++)
00239       {
00240         vector<double> freqs = drtl.getRootFrequencies(i);
00241         VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
00242         for (size_t c = 0; c < nbClasses; c++)
00243         {
00244           Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
00245           for (size_t x = 0; x < nbStates; x++)
00246           {
00247             (*likelihoodsFatherConstantPart_i_c)[x] *= freqs[x];
00248           }
00249         }
00250       }      
00251     }
00252 
00253     // Then, we deal with the node of interest.
00254     // We first average uppon 'y' to save computations, and then uppon 'x'.
00255     // ('y' is the state at 'node' and 'x' the state at 'father'.)
00256 
00257     //Iterate over all site partitions:
00258     const VVVdouble* likelihoodsFather_node = &(drtl.getLikelihoodData()->getLikelihoodArray(father->getId(), currentNode->getId()));
00259     auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(currentNode->getId()));
00260     VVVdouble pxy;
00261     bool first;
00262     while (mit->hasNext())
00263     {
00264       TreeLikelihood::ConstBranchModelDescription* bmd = mit->next();
00265       substitutionCount.setSubstitutionModel(bmd->getModel());
00266       //compute all nxy first:
00267       VVVVdouble nxy(nbClasses);
00268       for (size_t c = 0; c < nbClasses; ++c)
00269       {
00270         VVVdouble* nxy_c = &nxy[c];
00271         double rc = rcRates[c];
00272         nxy_c->resize(nbTypes);
00273         for (size_t t = 0; t < nbTypes; ++t)
00274         {
00275           VVdouble* nxy_c_t = &(*nxy_c)[t];
00276           Matrix<double>* nijt = substitutionCount.getAllNumbersOfSubstitutions(d * rc, t + 1);
00277           nxy_c_t->resize(nbStates);
00278           for (size_t x = 0; x < nbStates; ++x)
00279           {
00280             Vdouble* nxy_c_t_x = &(*nxy_c_t)[x];
00281             nxy_c_t_x->resize(nbStates);
00282             for (size_t y = 0; y < nbStates; ++y)
00283               (*nxy_c_t_x)[y] = (*nijt)(x, y);
00284           }
00285           delete nijt;
00286         }
00287       }
00288 
00289       //Now loop over sites:
00290       auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator());
00291       first = true;
00292       while (sit->hasNext())
00293       {
00294         size_t i = sit->next();
00295         //We retrieve the transition probabilities and substitution counts for this site partition:
00296         if (first)
00297         {
00298           pxy = drtl.getTransitionProbabilitiesPerRateClass(currentNode->getId(), i);  
00299           first = false;
00300         }
00301         const VVdouble* likelihoodsFather_node_i = &(*likelihoodsFather_node)[i];
00302         VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
00303         for (size_t c = 0; c < nbClasses; ++c)
00304         {
00305           const Vdouble* likelihoodsFather_node_i_c = &(*likelihoodsFather_node_i)[c];
00306           Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
00307           const VVdouble* pxy_c = &pxy[c];
00308           VVVdouble* nxy_c = &nxy[c];
00309           for (size_t x = 0; x < nbStates; ++x)
00310           {
00311             double* likelihoodsFatherConstantPart_i_c_x = &(*likelihoodsFatherConstantPart_i_c)[x];
00312             const Vdouble* pxy_c_x = &(*pxy_c)[x];
00313             for (size_t y = 0; y < nbStates; ++y)
00314             {
00315               double likelihood_cxy = (*likelihoodsFatherConstantPart_i_c_x)
00316                  *(*pxy_c_x)[y]
00317                  *(*likelihoodsFather_node_i_c)[y];
00318 
00319               for (size_t t = 0; t < nbTypes; ++t) {
00320                 // Now the vector computation:
00321                 substitutionsForCurrentNode[i][t] += likelihood_cxy * (*nxy_c)[t][x][y];
00322                 //                                   <------------>   <--------------->
00323                 // Posterior probability                   |                 | 
00324                 // for site i and rate class c *           |                 |
00325                 // likelihood for this site----------------+                 |
00326                 //                                                           |
00327                 //Substitution function for site i and rate class c----------+
00328               }
00329             }          
00330           }
00331         }
00332       }
00333     }
00334     
00335     //Now we just have to copy the substitutions into the result vector:
00336     for (size_t i = 0; i < nbSites; ++i)
00337       for (size_t t = 0; t < nbTypes; ++t)
00338         (*substitutions)(l, i, t) = substitutionsForCurrentNode[(* rootPatternLinks)[i]][t] / Lr[(* rootPatternLinks)[i]];
00339   }
00340   if (verbose)
00341   {
00342     if (ApplicationTools::message) *ApplicationTools::message << " ";
00343     ApplicationTools::displayTaskDone();
00344   }
00345   return substitutions;
00346 }
00347 
00348 /**************************************************************************************************/
00349 
00350 ProbabilisticSubstitutionMapping* SubstitutionMappingTools::computeSubstitutionVectorsNoAveraging(
00351   const DRTreeLikelihood& drtl,
00352   SubstitutionCount& substitutionCount,
00353   bool verbose) throw (Exception)
00354 {
00355   //Preamble:
00356   if (!drtl.isInitialized()) throw Exception("SubstitutionMappingTools::computeSubstitutionVectorsNoAveraging(). Likelihood object is not initialized.");
00357                                    
00358   //A few variables we'll need:
00359   const TreeTemplate<Node> tree(drtl.getTree());
00360   const SiteContainer*    sequences = drtl.getData();
00361   const DiscreteDistribution* rDist = drtl.getRateDistribution();
00362     
00363   size_t nbSites         = sequences->getNumberOfSites();
00364   size_t nbDistinctSites = drtl.getLikelihoodData()->getNumberOfDistinctSites();
00365   size_t nbStates        = sequences->getAlphabet()->getSize();
00366   size_t nbClasses       = rDist->getNumberOfCategories();
00367   size_t nbTypes         = substitutionCount.getNumberOfSubstitutionTypes();
00368   vector<const Node *> nodes   = tree.getNodes();
00369   const vector<size_t> * rootPatternLinks
00370                                = &drtl.getLikelihoodData()->getRootArrayPositions();
00371   nodes.pop_back(); // Remove root node.
00372   size_t nbNodes = nodes.size();
00373   
00374   // We create a new ProbabilisticSubstitutionMapping object:
00375   ProbabilisticSubstitutionMapping* substitutions = new ProbabilisticSubstitutionMapping(tree, &substitutionCount, nbSites);
00376                                    
00377   Vdouble rcRates = rDist->getCategories();
00378 
00379   // Compute the number of substitutions for each class and each branch in the tree:
00380   if (verbose) ApplicationTools::displayTask("Compute joint node-pairs likelihood", true);
00381   
00382   for (size_t l = 0; l < nbNodes; ++l)
00383   {
00384     // For each node,
00385     const Node* currentNode = nodes[l];
00386 
00387     const Node* father = currentNode->getFather();
00388 
00389     double d = currentNode->getDistanceToFather();
00390     
00391     if (verbose) ApplicationTools::displayGauge(l, nbNodes-1, '>');
00392     VVdouble substitutionsForCurrentNode(nbDistinctSites);
00393     for (size_t i = 0; i < nbDistinctSites; ++i)
00394       substitutionsForCurrentNode[i].resize(nbTypes);
00395 
00396     // Now we've got to compute likelihoods in a smart manner... ;)
00397     VVVdouble likelihoodsFatherConstantPart(nbDistinctSites);
00398     for (size_t i = 0; i < nbDistinctSites; ++i)
00399     {
00400       VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
00401       likelihoodsFatherConstantPart_i->resize(nbClasses);
00402       for (size_t c = 0; c < nbClasses; ++c)
00403       {
00404         Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
00405         likelihoodsFatherConstantPart_i_c->resize(nbStates);
00406         double rc = rDist->getProbability(c);
00407         for (size_t s = 0; s < nbStates; ++s)
00408         {
00409           //(* likelihoodsFatherConstantPart_i_c)[s] = rc * model->freq(s);
00410           //freq is already accounted in the array
00411           (* likelihoodsFatherConstantPart_i_c)[s] = rc;
00412         }
00413       }
00414     }
00415     
00416     // First, what will remain constant:
00417     size_t nbSons =  father->getNumberOfSons();
00418     for (size_t n = 0; n < nbSons; ++n)
00419     {
00420       const Node* currentSon = father->getSon(n);    
00421       if (currentSon->getId() != currentNode->getId())
00422       {
00423         const VVVdouble* likelihoodsFather_son = &drtl.getLikelihoodData()->getLikelihoodArray(father->getId(), currentSon->getId());
00424 
00425         //Now iterate over all site partitions:
00426         auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(currentSon->getId()));
00427         VVVdouble pxy;
00428         bool first;
00429         while (mit->hasNext())
00430         {
00431           TreeLikelihood::ConstBranchModelDescription* bmd = mit->next();
00432           auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator());
00433           first = true;
00434           while (sit->hasNext())
00435           {
00436             size_t i = sit->next();
00437             //We retrieve the transition probabilities for this site partition:
00438             if (first)
00439             {
00440               pxy = drtl.getTransitionProbabilitiesPerRateClass(currentSon->getId(), i);
00441               first = false;
00442             }
00443             const VVdouble* likelihoodsFather_son_i = &(*likelihoodsFather_son)[i];
00444             VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
00445             for (size_t c = 0; c < nbClasses; ++c)
00446             {
00447               const Vdouble* likelihoodsFather_son_i_c = &(*likelihoodsFather_son_i)[c];
00448               Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
00449               VVdouble* pxy_c = & pxy[c]; 
00450               for (size_t x = 0; x < nbStates; ++x)
00451               {
00452                 Vdouble* pxy_c_x = &(*pxy_c)[x];
00453                 double likelihood = 0.;
00454                 for (size_t y = 0; y < nbStates; ++y)
00455                 {
00456                   likelihood += (*pxy_c_x)[y] * (*likelihoodsFather_son_i_c)[y];
00457                 }
00458                 (*likelihoodsFatherConstantPart_i_c)[x] *= likelihood;
00459               }
00460             }
00461           }
00462         }      
00463       }
00464     }
00465     if (father->hasFather())
00466     {
00467       const Node* currentSon = father->getFather();
00468       const VVVdouble* likelihoodsFather_son = &drtl.getLikelihoodData()->getLikelihoodArray(father->getId(), currentSon->getId());
00469       //Now iterate over all site partitions:
00470       auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(father->getId()));
00471       VVVdouble pxy;
00472       bool first;
00473       while (mit->hasNext())
00474       {
00475         TreeLikelihood::ConstBranchModelDescription* bmd = mit->next();
00476         auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator());
00477         first = true;
00478         while (sit->hasNext())
00479         {
00480           size_t i = sit->next();
00481           //We retrieve the transition probabilities for this site partition:
00482           if (first)
00483           {
00484             pxy = drtl.getTransitionProbabilitiesPerRateClass(father->getId(), i);
00485             first = false;
00486           }
00487           const VVdouble* likelihoodsFather_son_i = &(*likelihoodsFather_son)[i];
00488           VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
00489           for (size_t c = 0; c < nbClasses; ++c)
00490           {
00491             const Vdouble* likelihoodsFather_son_i_c = &(*likelihoodsFather_son_i)[c];
00492             Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
00493             VVdouble* pxy_c = &pxy[c]; 
00494             for (size_t x = 0; x < nbStates; ++x)
00495             {
00496               double likelihood = 0.;
00497               for (size_t y = 0; y < nbStates; ++y)
00498               {
00499                 Vdouble* pxy_c_x = &(*pxy_c)[y];
00500                 likelihood += (*pxy_c_x)[x] * (*likelihoodsFather_son_i_c)[y];
00501               }
00502               (*likelihoodsFatherConstantPart_i_c)[x] *= likelihood;
00503             }
00504           }
00505         }
00506       }      
00507     }
00508     else
00509     {
00510       //Account for root frequencies:
00511       for (size_t i = 0; i < nbDistinctSites; ++i)
00512       {
00513         vector<double> freqs = drtl.getRootFrequencies(i);
00514         VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
00515         for (size_t c = 0; c < nbClasses; ++c)
00516         {
00517           Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
00518           for (size_t x = 0; x < nbStates; ++x)
00519           {
00520             (*likelihoodsFatherConstantPart_i_c)[x] *= freqs[x]; 
00521           }
00522         }
00523       }      
00524     }
00525 
00526     // Then, we deal with the node of interest.
00527     // We first average uppon 'y' to save computations, and then uppon 'x'.
00528     // ('y' is the state at 'node' and 'x' the state at 'father'.)
00529 
00530     //Iterate over all site partitions:
00531     const VVVdouble* likelihoodsFather_node = &drtl.getLikelihoodData()->getLikelihoodArray(father->getId(), currentNode->getId());
00532     auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(currentNode->getId()));
00533     VVVdouble pxy;
00534     bool first;
00535     while (mit->hasNext())
00536     {
00537       TreeLikelihood::ConstBranchModelDescription* bmd = mit->next();
00538       substitutionCount.setSubstitutionModel(bmd->getModel());
00539       //compute all nxy first:
00540       VVVVdouble nxy(nbClasses);
00541       for (size_t c = 0; c < nbClasses; ++c)
00542       {
00543         double rc = rcRates[c];
00544         VVVdouble* nxy_c = &nxy[c];
00545         nxy_c->resize(nbTypes);
00546         for (size_t t = 0; t < nbTypes; ++t)
00547         {
00548           VVdouble* nxy_c_t = &(*nxy_c)[t];
00549           nxy_c_t->resize(nbStates);
00550           Matrix<double>* nijt = substitutionCount.getAllNumbersOfSubstitutions(d * rc, t + 1);
00551           for (size_t x = 0; x < nbStates; ++x)
00552           {
00553             Vdouble* nxy_c_t_x = &(*nxy_c_t)[x];
00554             nxy_c_t_x->resize(nbStates);
00555             for (size_t y = 0; y < nbStates; ++y)
00556             {
00557               (*nxy_c_t_x)[y] = (*nijt)(x, y);
00558             }
00559           }
00560           delete nijt;
00561         }
00562       }
00563 
00564       //Now loop over sites:
00565       auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator());
00566       first = true;
00567       while (sit->hasNext())
00568       {
00569         size_t i = sit->next();
00570         //We retrieve the transition probabilities and substitution counts for this site partition:
00571         if (first)
00572         {
00573           pxy = drtl.getTransitionProbabilitiesPerRateClass(currentNode->getId(), i);  
00574           first = false;
00575         }
00576         const VVdouble* likelihoodsFather_node_i = &(*likelihoodsFather_node)[i];
00577         VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
00578         RowMatrix<double> pairProbabilities(nbStates, nbStates);
00579         MatrixTools::fill(pairProbabilities, 0.);
00580         VVVdouble subsCounts(nbStates);
00581         for (size_t j = 0; j < nbStates; ++j) {
00582           subsCounts[j].resize(nbStates);
00583           for (size_t k = 0; k < nbStates; ++k) {
00584             subsCounts[j][k].resize(nbTypes);
00585           }
00586         }
00587         for (size_t c = 0; c < nbClasses; ++c)
00588         {
00589           const Vdouble* likelihoodsFather_node_i_c = &(*likelihoodsFather_node_i)[c];
00590           Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
00591           const VVdouble* pxy_c = &pxy[c];
00592           VVVdouble* nxy_c = &nxy[c];
00593           for (size_t x = 0; x < nbStates; ++x)
00594           {
00595             double* likelihoodsFatherConstantPart_i_c_x = &(*likelihoodsFatherConstantPart_i_c)[x];
00596             const Vdouble* pxy_c_x = &(*pxy_c)[x];
00597             for (size_t y = 0; y < nbStates; ++y)
00598             {
00599               double likelihood_cxy = (*likelihoodsFatherConstantPart_i_c_x)
00600                  *(*pxy_c_x)[y]
00601                  *(*likelihoodsFather_node_i_c)[y];
00602               pairProbabilities(x, y) += likelihood_cxy; // Sum over all rate classes.
00603               for (size_t t = 0; t < nbTypes; ++t) {
00604                 subsCounts[x][y][t] += likelihood_cxy * (* nxy_c)[t][x][y];
00605               }
00606             }
00607           }
00608         }
00609         // Now the vector computation:
00610         // Here we do not average over all possible pair of ancestral states,
00611         // We only consider the one with max likelihood:
00612         vector<size_t> xy = MatrixTools::whichMax(pairProbabilities);
00613         for (size_t t = 0; t < nbTypes; ++t) {
00614           substitutionsForCurrentNode[i][t] += subsCounts[xy[0]][xy[1]][t] / pairProbabilities(xy[0], xy[1]);
00615         }
00616       }
00617     }
00618     //Now we just have to copy the substitutions into the result vector:
00619     for (size_t i = 0; i < nbSites; i++)
00620       for (size_t t = 0; t < nbTypes; t++)
00621         (*substitutions)(l, i, t) = substitutionsForCurrentNode[(*rootPatternLinks)[i]][t];
00622   }
00623   if (verbose)
00624   {
00625     if (ApplicationTools::message) *ApplicationTools::message << " ";
00626     ApplicationTools::displayTaskDone();
00627   }
00628   return substitutions;
00629 }
00630 
00631 /**************************************************************************************************/
00632 
00633 ProbabilisticSubstitutionMapping* SubstitutionMappingTools::computeSubstitutionVectorsNoAveragingMarginal(
00634   const DRTreeLikelihood& drtl,
00635   SubstitutionCount& substitutionCount,
00636   bool verbose) throw (Exception)
00637 {
00638   //Preamble:
00639   if (!drtl.isInitialized()) throw Exception("SubstitutionMappingTools::computeSubstitutionVectorsNoAveragingMarginal(). Likelihood object is not initialized.");
00640                                    
00641   //A few variables we'll need:
00642   
00643   const TreeTemplate<Node> tree(drtl.getTree());
00644   const SiteContainer*    sequences = drtl.getData();
00645   const DiscreteDistribution* rDist = drtl.getRateDistribution();
00646   const Alphabet*             alpha = sequences->getAlphabet();
00647     
00648   size_t nbSites         = sequences->getNumberOfSites();
00649   size_t nbDistinctSites = drtl.getLikelihoodData()->getNumberOfDistinctSites();
00650   size_t nbStates        = alpha->getSize();
00651   size_t nbTypes         = substitutionCount.getNumberOfSubstitutionTypes();
00652   vector<const Node*> nodes    = tree.getNodes();
00653   const vector<size_t>* rootPatternLinks
00654                                = &drtl.getLikelihoodData()->getRootArrayPositions();
00655   nodes.pop_back(); // Remove root node.
00656   size_t nbNodes = nodes.size();
00657   
00658   // We create a new ProbabilisticSubstitutionMapping object:
00659   ProbabilisticSubstitutionMapping* substitutions = new ProbabilisticSubstitutionMapping(tree, &substitutionCount, nbSites);
00660   
00661   // Compute the whole likelihood of the tree according to the specified model:
00662   
00663   Vdouble rcRates = rDist->getCategories();
00664 
00665   // Compute the number of substitutions for each class and each branch in the tree:
00666   if (verbose) ApplicationTools::displayTask("Compute marginal ancestral states");
00667   MarginalAncestralStateReconstruction masr(&drtl);
00668   map<int, vector<size_t> > ancestors = masr.getAllAncestralStates();
00669   if (verbose) ApplicationTools::displayTaskDone();
00670 
00671   // Now we just have to compute the substitution vectors:
00672   if (verbose) ApplicationTools::displayTask("Compute substitution vectors", true);
00673   
00674   for (size_t l = 0; l < nbNodes; l++)
00675   {
00676     const Node* currentNode = nodes[l];
00677     
00678     const Node* father = currentNode->getFather();
00679 
00680     double d = currentNode->getDistanceToFather();
00681 
00682     vector<size_t> nodeStates = ancestors[currentNode->getId()]; //These are not 'true' ancestors ;)
00683     vector<size_t> fatherStates = ancestors[father->getId()];
00684     
00685     //For each node,
00686     if (verbose) ApplicationTools::displayGauge(l, nbNodes-1, '>');
00687     VVdouble substitutionsForCurrentNode(nbDistinctSites);
00688     for (size_t i = 0; i < nbDistinctSites; ++i)
00689       substitutionsForCurrentNode[i].resize(nbTypes);
00690     
00691     // Here, we have no likelihood computation to do!
00692 
00693     // Then, we deal with the node of interest.
00694     // ('y' is the state at 'node' and 'x' the state at 'father'.)
00695     // Iterate over all site partitions:
00696     auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(currentNode->getId()));
00697     while (mit->hasNext())
00698     {
00699       TreeLikelihood::ConstBranchModelDescription* bmd = mit->next();
00700       substitutionCount.setSubstitutionModel(bmd->getModel());
00701       //compute all nxy first:
00702       VVVdouble nxyt(nbTypes);
00703       for (size_t t = 0; t < nbTypes; ++t)
00704       {
00705         nxyt[t].resize(nbStates);
00706         Matrix<double>* nxy = substitutionCount.getAllNumbersOfSubstitutions(d, t + 1);
00707         for (size_t x = 0; x < nbStates; ++x) {
00708           nxyt[t][x].resize(nbStates);
00709           for (size_t y = 0; y < nbStates; ++y) {
00710             nxyt[t][x][y] = (*nxy)(x, y);
00711           }
00712         }
00713         delete nxy;
00714       }
00715       //Now loop over sites:
00716       auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator());
00717       while (sit->hasNext())
00718       {
00719         size_t i = sit->next();
00720         size_t fatherState = fatherStates[i];
00721         size_t nodeState   = nodeStates[i];
00722         if (fatherState >= nbStates || nodeState >= nbStates)
00723           for (size_t t = 0; t < nbTypes; ++t)
00724             substitutionsForCurrentNode[i][t] = 0; // To be conservative! Only in case there are generic characters.
00725         else
00726           for (size_t t = 0; t < nbTypes; ++t)
00727             substitutionsForCurrentNode[i][t] = nxyt[t][fatherState][nodeState];
00728       }
00729     }
00730     
00731     //Now we just have to copy the substitutions into the result vector:
00732     for (size_t i = 0; i < nbSites; i++)
00733       for (size_t t = 0; t < nbTypes; t++)
00734         (*substitutions)(l, i, t) = substitutionsForCurrentNode[(*rootPatternLinks)[i]][t];
00735   }
00736   if (verbose)
00737   {
00738     if (ApplicationTools::message) *ApplicationTools::message << " ";
00739     ApplicationTools::displayTaskDone();
00740   }
00741   return substitutions;
00742 }
00743 
00744 /**************************************************************************************************/
00745 
00746 ProbabilisticSubstitutionMapping* SubstitutionMappingTools::computeSubstitutionVectorsMarginal(
00747   const DRTreeLikelihood& drtl,
00748   SubstitutionCount& substitutionCount,
00749   bool verbose) throw (Exception)
00750 {
00751   //Preamble:
00752   if (!drtl.isInitialized()) throw Exception("SubstitutionMappingTools::computeSubstitutionVectorsMarginal(). Likelihood object is not initialized.");
00753                                    
00754   //A few variables we'll need:
00755   
00756   const TreeTemplate<Node>tree(drtl.getTree());
00757   const SiteContainer*    sequences = drtl.getData();
00758   const DiscreteDistribution* rDist = drtl.getRateDistribution();
00759     
00760   size_t nbSites         = sequences->getNumberOfSites();
00761   size_t nbDistinctSites = drtl.getLikelihoodData()->getNumberOfDistinctSites();
00762   size_t nbStates        = sequences->getAlphabet()->getSize();
00763   size_t nbClasses       = rDist->getNumberOfCategories();
00764   size_t nbTypes         = substitutionCount.getNumberOfSubstitutionTypes();
00765   vector<const Node*> nodes    = tree.getNodes();
00766   const vector<size_t>* rootPatternLinks
00767                                = &drtl.getLikelihoodData()->getRootArrayPositions();
00768   nodes.pop_back(); // Remove root node.
00769   size_t nbNodes = nodes.size();
00770   
00771   // We create a new ProbabilisticSubstitutionMapping object:
00772   ProbabilisticSubstitutionMapping* substitutions = new ProbabilisticSubstitutionMapping(tree, &substitutionCount, nbSites);
00773                                      
00774   // Compute the whole likelihood of the tree according to the specified model:
00775   
00776   Vdouble rcProbs = rDist->getProbabilities();
00777   Vdouble rcRates = rDist->getCategories();
00778 
00779   //II) Compute the number of substitutions for each class and each branch in the tree:
00780   if (verbose) ApplicationTools::displayTask("Compute marginal node-pairs likelihoods", true);
00781   
00782   for (size_t l = 0; l < nbNodes; l++)
00783   {
00784     const Node* currentNode = nodes[l];
00785     
00786     const Node* father = currentNode->getFather();
00787 
00788     double d = currentNode->getDistanceToFather();
00789     
00790     //For each node,
00791     if (verbose) ApplicationTools::displayGauge(l, nbNodes-1, '>');
00792     VVdouble substitutionsForCurrentNode(nbDistinctSites);
00793     for (size_t i = 0; i < nbDistinctSites; ++i)
00794       substitutionsForCurrentNode[i].resize(nbTypes);
00795 
00796     // Then, we deal with the node of interest.
00797     // ('y' is the state at 'node' and 'x' the state at 'father'.)
00798     VVVdouble probsNode   = DRTreeLikelihoodTools::getPosteriorProbabilitiesForEachStateForEachRate(drtl, currentNode->getId());
00799     VVVdouble probsFather = DRTreeLikelihoodTools::getPosteriorProbabilitiesForEachStateForEachRate(drtl, father->getId());
00800 
00801     //Iterate over all site partitions:
00802     auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(currentNode->getId()));
00803     while (mit->hasNext())
00804     {
00805       TreeLikelihood::ConstBranchModelDescription* bmd = mit->next();
00806       substitutionCount.setSubstitutionModel(bmd->getModel());
00807       //compute all nxy first:
00808       VVVVdouble nxy(nbClasses);
00809       for (size_t c = 0; c < nbClasses; ++c)
00810       {
00811         VVVdouble* nxy_c = &nxy[c];
00812         double rc = rcRates[c];
00813         nxy_c->resize(nbTypes);
00814         for (size_t t = 0; t < nbTypes; ++t)
00815         {
00816           VVdouble* nxy_c_t = &(*nxy_c)[t];
00817           Matrix<double>* nijt = substitutionCount.getAllNumbersOfSubstitutions(d * rc, t + 1);
00818           nxy_c_t->resize(nbStates);
00819           for (size_t x = 0; x < nbStates; ++x)
00820           {
00821             Vdouble* nxy_c_t_x = &(*nxy_c_t)[x];
00822             nxy_c_t_x->resize(nbStates);
00823             for (size_t y = 0; y < nbStates; ++y)
00824             {
00825               (*nxy_c_t_x)[y] = (*nijt)(x, y);
00826             }
00827           }
00828           delete nijt;
00829         }
00830       }
00831 
00832       //Now loop over sites:
00833       auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator());
00834       while (sit->hasNext())
00835       {
00836         size_t i = sit->next();
00837         VVdouble* probsNode_i   = & probsNode[i];
00838         VVdouble* probsFather_i = & probsFather[i];
00839         for (size_t c = 0; c < nbClasses; ++c)
00840         {
00841           Vdouble* probsNode_i_c   = &(*probsNode_i)[c];
00842           Vdouble* probsFather_i_c = &(*probsFather_i)[c];
00843           VVVdouble* nxy_c = &nxy[c];
00844           for (size_t x = 0; x < nbStates; ++x)
00845           {
00846             for (size_t y = 0; y < nbStates; ++y)
00847             {
00848               double prob_cxy = (*probsFather_i_c)[x] * (*probsNode_i_c)[y];
00849               // Now the vector computation:
00850               for (size_t t = 0; t < nbTypes; ++t) {
00851                 substitutionsForCurrentNode[i][t] += prob_cxy * (*nxy_c)[t][x][y];
00852                 //                                   <------>   <--------------->
00853                 // Posterior probability                 |                | 
00854                 // for site i and rate class c *         |                |
00855                 // likelihood for this site--------------+                |
00856                 //                                                        |
00857                 //Substitution function for site i and rate class c-------+
00858               }
00859             }
00860           }
00861         }
00862       }
00863     }
00864     
00865     //Now we just have to copy the substitutions into the result vector:
00866     for (size_t i = 0; i < nbSites; ++i)
00867       for (size_t t = 0; t < nbTypes; ++t)
00868         (*substitutions)(l, i, t) = substitutionsForCurrentNode[(* rootPatternLinks)[i]][t];
00869   }
00870   if (verbose)
00871   {
00872     if (ApplicationTools::message) *ApplicationTools::message << " ";
00873     ApplicationTools::displayTaskDone();
00874   }
00875   return substitutions;
00876 }
00877 
00878 /**************************************************************************************************/
00879 
00880 void SubstitutionMappingTools::writeToStream(
00881   const ProbabilisticSubstitutionMapping& substitutions,
00882   const SiteContainer& sites,
00883   size_t type,
00884   ostream& out)
00885   throw (IOException) 
00886 {
00887   if (!out) throw IOException("SubstitutionMappingTools::writeToFile. Can't write to stream.");
00888   out << "Branches";
00889   out << "\tMean";
00890   for (size_t i = 0; i < substitutions.getNumberOfSites(); i++)
00891   {
00892     out << "\tSite" << sites.getSite(i).getPosition();
00893   }
00894   out << endl;
00895   
00896   for (size_t j = 0; j < substitutions.getNumberOfBranches(); j++)
00897   {
00898     out << substitutions.getNode(j)->getId() << "\t" << substitutions.getNode(j)->getDistanceToFather();
00899     for (size_t i = 0; i < substitutions.getNumberOfSites(); i++)
00900     {
00901       out << "\t" << substitutions(j, i, type);
00902     }
00903     out << endl;
00904   }
00905 }
00906 
00907 /**************************************************************************************************/
00908 
00909 void SubstitutionMappingTools::readFromStream(istream& in, ProbabilisticSubstitutionMapping& substitutions, size_t type)
00910   throw (IOException)
00911 {
00912   try
00913   {
00914     DataTable* data = DataTable::read(in, "\t", true, -1);
00915     vector<string> ids = data->getColumn(0);
00916     data->deleteColumn(0);//Remove ids
00917     data->deleteColumn(0);//Remove means
00918     //Now parse the table:
00919     size_t nbSites = data->getNumberOfColumns();
00920     substitutions.setNumberOfSites(nbSites);
00921     size_t nbBranches = data->getNumberOfRows();
00922     for (size_t i = 0; i < nbBranches; i++)
00923     {
00924       int id = TextTools::toInt(ids[i]);
00925       size_t br = substitutions.getNodeIndex(id);
00926       for (size_t j = 0; j < nbSites; j++)
00927       {
00928         substitutions(br, j, type) = TextTools::toDouble((*data)(i, j));
00929       }
00930     }
00931     //Parse the header:
00932     for (size_t i = 0; i < nbSites; i++)
00933     {
00934       string siteTxt = data->getColumnName(i);
00935       int site = 0;
00936       if (siteTxt.substr(0,4) == "Site") site = TextTools::to<int>(siteTxt.substr(4));
00937       else site = TextTools::to<int>(siteTxt);
00938       substitutions.setSitePosition(i, site);
00939     }
00940     
00941     delete data;
00942   }
00943   catch (Exception& e)
00944   {
00945     throw IOException(string("Bad input file. ") + e.what());
00946   }
00947 }
00948 
00949 /**************************************************************************************************/
00950     
00951 vector<double> SubstitutionMappingTools::computeTotalSubstitutionVectorForSite(const SubstitutionMapping& smap, size_t siteIndex)
00952 {
00953   size_t nbBranches = smap.getNumberOfBranches();
00954   size_t nbTypes    = smap.getNumberOfSubstitutionTypes();
00955   Vdouble v(nbBranches);
00956   for (size_t l = 0; l < nbBranches; ++l) {
00957     v[l] = 0;
00958     for (size_t t = 0; t < nbTypes; ++t) {
00959       v[l] += smap(l, siteIndex, t);
00960     }
00961   }
00962   return v;
00963 }
00964 
00965 /**************************************************************************************************/
00966 
00967 double SubstitutionMappingTools::computeNormForSite(const SubstitutionMapping& smap, size_t siteIndex)
00968 {
00969   double sumSquare = 0;
00970   for (size_t l = 0; l < smap.getNumberOfBranches(); ++l) {
00971     double sum = 0;
00972     for (size_t t = 0; t < smap.getNumberOfSubstitutionTypes(); ++t) {
00973       sum += smap(l, siteIndex, t);
00974     }
00975     sumSquare += sum * sum;
00976   }
00977   return sqrt(sumSquare);
00978 }
00979 
00980 /**************************************************************************************************/
00981     
00982 vector<double> SubstitutionMappingTools::computeSumForBranch(const SubstitutionMapping& smap, size_t branchIndex)
00983 {
00984   size_t nbSites = smap.getNumberOfSites();
00985   size_t nbTypes = smap.getNumberOfSubstitutionTypes();
00986   Vdouble v(nbTypes, 0);
00987   for (size_t i = 0; i < nbSites; ++i) {
00988     for (size_t t = 0; t < nbTypes; ++t) {
00989       v[t] += smap(branchIndex, i, t);
00990     }
00991   }
00992   return v;
00993 }
00994 
00995 /**************************************************************************************************/
00996 
00997 vector<double> SubstitutionMappingTools::computeSumForSite(const SubstitutionMapping& smap, size_t siteIndex)
00998 {
00999   size_t nbBranches = smap.getNumberOfBranches();
01000   size_t nbTypes = smap.getNumberOfSubstitutionTypes();
01001   Vdouble v(nbTypes, 0);
01002   for (size_t i = 0; i < nbBranches; ++i) {
01003     for (size_t t = 0; t < nbTypes; ++t) {
01004       v[t] += smap(i, siteIndex, t);
01005     }
01006   }
01007   return v;
01008 }
01009 
01010 /**************************************************************************************************/
01011 
 All Classes Namespaces Files Functions Variables Typedefs Friends