bpp-phyl  2.4.0
RewardMappingTools.cpp
Go to the documentation of this file.
1 //
2 // File: RewardMappingTools.cpp
3 // Created by: Laurent Guéguen
4 // Created on: vendredi 29 mars 2013, à 15h 01
5 //
6 
7 /*
8  Copyright or © or Copr. Bio++ Development Team, (November 16, 2004, 2005, 2006)
9 
10  This software is a computer program whose purpose is to provide classes
11  for phylogenetic data analysis.
12 
13  This software is governed by the CeCILL license under French law and
14  abiding by the rules of distribution of free software. You can use,
15  modify and/ or redistribute the software under the terms of the CeCILL
16  license as circulated by CEA, CNRS and INRIA at the following URL
17  "http://www.cecill.info".
18 
19  As a counterpart to the access to the source code and rights to copy,
20  modify and redistribute granted by the license, users are provided only
21  with a limited warranty and the software's author, the holder of the
22  economic rights, and the successive licensors have only limited
23  liability.
24 
25  In this respect, the user's attention is drawn to the risks associated
26  with loading, using, modifying and/or developing or reproducing the
27  software by the user in light of its specific status of free software,
28  that may mean that it is complicated to manipulate, and that also
29  therefore means that it is reserved for developers and experienced
30  professionals having in-depth computer knowledge. Users are therefore
31  encouraged to load and test the software's suitability as regards their
32  requirements in conditions enabling the security of their systems and/or
33  data to be ensured and, more generally, to use and operate it in the
34  same conditions as regards security.
35 
36  The fact that you are presently reading this means that you have had
37  knowledge of the CeCILL license and that you accept its terms.
38  */
39 
40 #include "RewardMappingTools.h"
41 #include "../Likelihood/DRTreeLikelihoodTools.h"
42 #include "../Likelihood/MarginalAncestralStateReconstruction.h"
43 
44 #include <Bpp/Text/TextTools.h>
47 #include <Bpp/Numeric/DataTable.h>
48 
49 using namespace bpp;
50 
51 // From the STL:
52 #include <iomanip>
53 
54 using namespace std;
55 
56 /******************************************************************************/
57 
59  const DRTreeLikelihood& drtl,
60  const vector<int>& nodeIds,
61  Reward& reward,
62  bool verbose)
63 {
64  // Preamble:
65  if (!drtl.isInitialized())
66  throw Exception("RewardMappingTools::computeRewardVectors(). Likelihood object is not initialized.");
67 
68  // A few variables we'll need:
69 
70  const TreeTemplate<Node> tree(drtl.getTree());
71  const SiteContainer* sequences = drtl.getData();
72  const DiscreteDistribution* rDist = drtl.getRateDistribution();
73 
74  size_t nbSites = sequences->getNumberOfSites();
75  size_t nbDistinctSites = drtl.getLikelihoodData()->getNumberOfDistinctSites();
76  size_t nbStates = sequences->getAlphabet()->getSize();
77  size_t nbClasses = rDist->getNumberOfCategories();
78  vector<const Node*> nodes = tree.getNodes();
79  const vector<size_t>* rootPatternLinks
81  nodes.pop_back(); // Remove root node.
82  size_t nbNodes = nodes.size();
83 
84  // We create a new ProbabilisticRewardMapping object:
85  ProbabilisticRewardMapping* rewards = new ProbabilisticRewardMapping(tree, &reward, nbSites);
86 
87  // Store likelihood for each rate for each site:
88  VVVdouble lik;
89  drtl.computeLikelihoodAtNode(tree.getRootId(), lik);
90  Vdouble Lr(nbDistinctSites, 0);
91  Vdouble rcProbs = rDist->getProbabilities();
92  Vdouble rcRates = rDist->getCategories();
93  for (size_t i = 0; i < nbDistinctSites; i++)
94  {
95  VVdouble* lik_i = &lik[i];
96  for (size_t c = 0; c < nbClasses; c++)
97  {
98  Vdouble* lik_i_c = &(*lik_i)[c];
99  double rc = rDist->getProbability(c);
100  for (size_t s = 0; s < nbStates; s++)
101  {
102  Lr[i] += (*lik_i_c)[s] * rc;
103  }
104  }
105  }
106 
107  // Compute the reward for each class and each branch in the tree:
108  if (verbose)
109  ApplicationTools::displayTask("Compute joint node-pairs likelihood", true);
110 
111  for (size_t l = 0; l < nbNodes; ++l)
112  {
113  // For each node,
114  const Node* currentNode = nodes[l];
115  if (nodeIds.size() > 0 && !VectorTools::contains(nodeIds, currentNode->getId()))
116  continue;
117 
118  const Node* father = currentNode->getFather();
119 
120  double d = currentNode->getDistanceToFather();
121 
122  if (verbose)
123  ApplicationTools::displayGauge(l, nbNodes - 1);
124  Vdouble rewardsForCurrentNode(nbDistinctSites);
125 
126  // Now we've got to compute likelihoods in a smart manner... ;)
127  VVVdouble likelihoodsFatherConstantPart(nbDistinctSites);
128  for (size_t i = 0; i < nbDistinctSites; i++)
129  {
130  VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
131  likelihoodsFatherConstantPart_i->resize(nbClasses);
132  for (size_t c = 0; c < nbClasses; c++)
133  {
134  Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
135  likelihoodsFatherConstantPart_i_c->resize(nbStates);
136  double rc = rDist->getProbability(c);
137  for (size_t s = 0; s < nbStates; s++)
138  {
139  // (* likelihoodsFatherConstantPart_i_c)[s] = rc * model->freq(s);
140  // freq is already accounted in the array
141  (*likelihoodsFatherConstantPart_i_c)[s] = rc;
142  }
143  }
144  }
145 
146  // First, what will remain constant:
147  size_t nbSons = father->getNumberOfSons();
148  for (size_t n = 0; n < nbSons; n++)
149  {
150  const Node* currentSon = father->getSon(n);
151  if (currentSon->getId() != currentNode->getId())
152  {
153  const VVVdouble* likelihoodsFather_son = &drtl.getLikelihoodData()->getLikelihoodArray(father->getId(), currentSon->getId());
154 
155  // Now iterate over all site partitions:
156  unique_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(currentSon->getId()));
157  VVVdouble pxy;
158  bool first;
159  while (mit->hasNext())
160  {
162  unique_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator());
163  first = true;
164  while (sit->hasNext())
165  {
166  size_t i = sit->next();
167  // We retrieve the transition probabilities for this site partition:
168  if (first)
169  {
170  pxy = drtl.getTransitionProbabilitiesPerRateClass(currentSon->getId(), i);
171  first = false;
172  }
173  const VVdouble* likelihoodsFather_son_i = &(*likelihoodsFather_son)[i];
174  VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
175  for (size_t c = 0; c < nbClasses; c++)
176  {
177  const Vdouble* likelihoodsFather_son_i_c = &(*likelihoodsFather_son_i)[c];
178  Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
179  VVdouble* pxy_c = &pxy[c];
180  for (size_t x = 0; x < nbStates; x++)
181  {
182  Vdouble* pxy_c_x = &(*pxy_c)[x];
183  double likelihood = 0.;
184  for (size_t y = 0; y < nbStates; y++)
185  {
186  likelihood += (*pxy_c_x)[y] * (*likelihoodsFather_son_i_c)[y];
187  }
188  (*likelihoodsFatherConstantPart_i_c)[x] *= likelihood;
189  }
190  }
191  }
192  }
193  }
194  }
195  if (father->hasFather())
196  {
197  const Node* currentSon = father->getFather();
198  const VVVdouble* likelihoodsFather_son = &drtl.getLikelihoodData()->getLikelihoodArray(father->getId(), currentSon->getId());
199  // Now iterate over all site partitions:
200  unique_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(father->getId()));
201  VVVdouble pxy;
202  bool first;
203  while (mit->hasNext())
204  {
206  unique_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator());
207  first = true;
208  while (sit->hasNext())
209  {
210  size_t i = sit->next();
211  // We retrieve the transition probabilities for this site partition:
212  if (first)
213  {
214  pxy = drtl.getTransitionProbabilitiesPerRateClass(father->getId(), i);
215  first = false;
216  }
217  const VVdouble* likelihoodsFather_son_i = &(*likelihoodsFather_son)[i];
218  VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
219  for (size_t c = 0; c < nbClasses; c++)
220  {
221  const Vdouble* likelihoodsFather_son_i_c = &(*likelihoodsFather_son_i)[c];
222  Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
223  VVdouble* pxy_c = &pxy[c];
224  for (size_t x = 0; x < nbStates; x++)
225  {
226  double likelihood = 0.;
227  for (size_t y = 0; y < nbStates; y++)
228  {
229  Vdouble* pxy_c_x = &(*pxy_c)[y];
230  likelihood += (*pxy_c_x)[x] * (*likelihoodsFather_son_i_c)[y];
231  }
232  (*likelihoodsFatherConstantPart_i_c)[x] *= likelihood;
233  }
234  }
235  }
236  }
237  }
238  else
239  {
240  // Account for root frequencies:
241  for (size_t i = 0; i < nbDistinctSites; i++)
242  {
243  vector<double> freqs = drtl.getRootFrequencies(i);
244  VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
245  for (size_t c = 0; c < nbClasses; c++)
246  {
247  Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
248  for (size_t x = 0; x < nbStates; x++)
249  {
250  (*likelihoodsFatherConstantPart_i_c)[x] *= freqs[x];
251  }
252  }
253  }
254  }
255 
256  // Then, we deal with the node of interest.
257  // We first average upon 'y' to save computations, and then upon 'x'.
258  // ('y' is the state at 'node' and 'x' the state at 'father'.)
259 
260  // Iterate over all site partitions:
261  const VVVdouble* likelihoodsFather_node = &(drtl.getLikelihoodData()->getLikelihoodArray(father->getId(), currentNode->getId()));
262  unique_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(currentNode->getId()));
263  VVVdouble pxy;
264  bool first;
265  while (mit->hasNext())
266  {
269  // compute all nxy first:
270  VVVdouble nxy(nbClasses);
271  for (size_t c = 0; c < nbClasses; ++c)
272  {
273  VVdouble* nxy_c = &nxy[c];
274  double rc = rcRates[c];
275  Matrix<double>* nij = reward.getAllRewards(d * rc);
276  nxy_c->resize(nbStates);
277  for (size_t x = 0; x < nbStates; ++x)
278  {
279  Vdouble* nxy_c_x = &(*nxy_c)[x];
280  nxy_c_x->resize(nbStates);
281  for (size_t y = 0; y < nbStates; ++y)
282  {
283  (*nxy_c_x)[y] = (*nij)(x, y);
284  }
285  }
286  delete nij;
287  }
288 
289  // Now loop over sites:
290  unique_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator());
291  first = true;
292  while (sit->hasNext())
293  {
294  size_t i = sit->next();
295  // We retrieve the transition probabilities and substitution counts for this site partition:
296  if (first)
297  {
298  pxy = drtl.getTransitionProbabilitiesPerRateClass(currentNode->getId(), i);
299  first = false;
300  }
301  const VVdouble* likelihoodsFather_node_i = &(*likelihoodsFather_node)[i];
302  VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
303  for (size_t c = 0; c < nbClasses; ++c)
304  {
305  const Vdouble* likelihoodsFather_node_i_c = &(*likelihoodsFather_node_i)[c];
306  Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
307  const VVdouble* pxy_c = &pxy[c];
308  VVdouble* nxy_c = &nxy[c];
309  for (size_t x = 0; x < nbStates; ++x)
310  {
311  double* likelihoodsFatherConstantPart_i_c_x = &(*likelihoodsFatherConstantPart_i_c)[x];
312  const Vdouble* pxy_c_x = &(*pxy_c)[x];
313  for (size_t y = 0; y < nbStates; ++y)
314  {
315  double likelihood_cxy = (*likelihoodsFatherConstantPart_i_c_x)
316  * (*pxy_c_x)[y]
317  * (*likelihoodsFather_node_i_c)[y];
318 
319  // Now the vector computation:
320  rewardsForCurrentNode[i] += likelihood_cxy * (*nxy_c)[x][y];
321  // <------------> <--------------->
322  // Posterior probability | |
323  // for site i and rate class c * | |
324  // likelihood for this site------+ |
325  // |
326  // Reward function for site i and rate class c------+
327  }
328  }
329  }
330  }
331  }
332 
333  // Now we just have to copy the substitutions into the result vector:
334  for (size_t i = 0; i < nbSites; ++i)
335  (*rewards)(l, i) = rewardsForCurrentNode[(*rootPatternLinks)[i]] / Lr[(*rootPatternLinks)[i]];
336 
337  }
338  if (verbose)
339  {
343  }
344  return rewards;
345 }
346 
347 /**************************************************************************************************/
348 
350  const ProbabilisticRewardMapping& rewards,
351  const SiteContainer& sites,
352  ostream& out)
353 {
354  if (!out)
355  throw IOException("RewardMappingTools::writeToFile. Can't write to stream.");
356  out << "Branches";
357  out << "\tMean";
358  for (size_t i = 0; i < rewards.getNumberOfSites(); i++)
359  {
360  out << "\tSite" << sites.getSite(i).getPosition();
361  }
362  out << endl;
363 
364  for (size_t j = 0; j < rewards.getNumberOfBranches(); j++)
365  {
366  out << rewards.getNode(j)->getId() << "\t" << rewards.getNode(j)->getDistanceToFather();
367  for (size_t i = 0; i < rewards.getNumberOfSites(); i++)
368  {
369  out << "\t" << rewards(j, i);
370  }
371  out << endl;
372  }
373 }
374 
375 /**************************************************************************************************/
376 
378 {
379  try
380  {
381  DataTable* data = DataTable::read(in, "\t", true, -1);
382  vector<string> ids = data->getColumn(0);
383  data->deleteColumn(0); // Remove ids
384  data->deleteColumn(0); // Remove means
385  // Now parse the table:
386  size_t nbSites = data->getNumberOfColumns();
387  rewards.setNumberOfSites(nbSites);
388  size_t nbBranches = data->getNumberOfRows();
389  for (size_t i = 0; i < nbBranches; i++)
390  {
391  int id = TextTools::toInt(ids[i]);
392  size_t br = rewards.getNodeIndex(id);
393  for (size_t j = 0; j < nbSites; j++)
394  {
395  rewards(br, j) = TextTools::toDouble((*data)(i, j));
396  }
397  }
398  // Parse the header:
399  for (size_t i = 0; i < nbSites; i++)
400  {
401  string siteTxt = data->getColumnName(i);
402  int site = 0;
403  if (siteTxt.substr(0, 4) == "Site")
404  site = TextTools::to<int>(siteTxt.substr(4));
405  else
406  site = TextTools::to<int>(siteTxt);
407  rewards.setSitePosition(i, site);
408  }
409 
410  delete data;
411  }
412  catch (Exception& e)
413  {
414  throw IOException(string("Bad input file. ") + e.what());
415  }
416 }
417 
418 /**************************************************************************************************/
419 
420 double RewardMappingTools::computeSumForBranch(const RewardMapping& smap, size_t branchIndex)
421 {
422  size_t nbSites = smap.getNumberOfSites();
423  double v = 0;
424  for (size_t i = 0; i < nbSites; ++i)
425  {
426  v += smap(branchIndex, i);
427  }
428  return v;
429 }
430 
431 /**************************************************************************************************/
432 
433 double RewardMappingTools::computeSumForSite(const RewardMapping& smap, size_t siteIndex)
434 {
435  size_t nbBranches = smap.getNumberOfBranches();
436  double v = 0;
437  for (size_t i = 0; i < nbBranches; ++i)
438  {
439  v += smap(i, siteIndex);
440  }
441  return v;
442 }
443 
444 /**************************************************************************************************/
A pair of SubstitutionModel / SiteIterator.
std::string getColumnName(size_t index) const
size_t getNumberOfRows() const
static void displayTaskDone()
virtual double getDistanceToFather() const
Get the distance to the father node is there is one, otherwise throw a NodeException.
Definition: Node.h:283
virtual const SiteContainer * getData() const =0
Get the dataset for which the likelihood must be evaluated.
virtual size_t getNumberOfCategories() const =0
virtual SiteIterator * getNewSiteIterator() const =0
virtual double getProbability(size_t categoryIndex) const =0
virtual void setSubstitutionModel(const SubstitutionModel *model)=0
Set the substitution model associated with this reward, if relevant.
static double computeSumForSite(const RewardMapping &smap, size_t siteIndex)
Sum all substitutions for each type of a given site (specified by its index).
static void displayTask(const std::string &text, bool eof=false)
static void readFromStream(std::istream &in, ProbabilisticRewardMapping &rewards)
Read the reward vectors from a stream.
STL namespace.
The phylogenetic tree class.
virtual const std::vector< double > & getRootFrequencies(size_t siteIndex) const =0
Get the values of the frequencies for each state in the alphabet at the root node.
double toDouble(const std::string &s, char dec= '.', char scientificNotation= 'e')
General interface for storing reward mapping data.
Definition: RewardMapping.h:62
static bool contains(const std::vector< T > &vec, T el)
static ProbabilisticRewardMapping * computeRewardVectors(const DRTreeLikelihood &drtl, const std::vector< int > &nodeIds, Reward &reward, bool verbose=true)
Compute the reward vectors for a particular dataset using the double-recursive likelihood computation...
virtual const Node * getNode(size_t nodeIndex) const
Definition: Mapping.h:208
The Reward interface.
Definition: Reward.h:70
virtual int getId() const
Get the node&#39;s id.
Definition: Node.h:203
static double computeSumForBranch(const RewardMapping &smap, size_t branchIndex)
Sum all rewards of a given branch (specified by its index).
virtual size_t getNumberOfBranches() const =0
virtual Vdouble getCategories() const =0
virtual ConstBranchModelIterator * getNewBranchModelIterator(int nodeId) const =0
const char * what() const noexceptoverride
std::vector< std::string > & getColumn(size_t index)
virtual const SubstitutionModel * getSubstitutionModel() const =0
std::vector< size_t > & getRootArrayPositions()
virtual const Node * getFather() const
Get the father of this node is there is one.
Definition: Node.h:339
virtual const Tree & getTree() const =0
Get the tree (topology and branch lengths).
std::vector< double > Vdouble
static std::shared_ptr< OutputStream > message
virtual ConstBranchModelDescription * next()=0
VVVdouble & getLikelihoodArray(int parentId, int neighborId)
virtual const Site & getSite(size_t siteIndex) const =0
virtual const Node * getSon(size_t pos) const
Definition: Node.h:395
void setSitePosition(size_t index, int position)
Set the position of a given site.
Definition: Mapping.h:198
virtual DRASDRTreeLikelihoodData * getLikelihoodData()=0
virtual bool isInitialized() const =0
virtual int getPosition() const
virtual const DiscreteDistribution * getRateDistribution() const =0
Get the rate distribution used for the computation.
The phylogenetic node class.
Definition: Node.h:90
size_t getNumberOfBranches() const
Definition: Mapping.h:206
Data storage class for probabilistic rewards mappings.
virtual Matrix< double > * getAllRewards(double length) const =0
Get the rewards on a branch, for each initial and final states, and given the branch length...
virtual bool hasFather() const
Tell if this node has a father node.
Definition: Node.h:379
virtual VVVdouble getTransitionProbabilitiesPerRateClass(int nodeId, size_t siteIndex) const =0
Retrieves all Pij(t) for a particular branch, defined by the upper node.
virtual size_t getNumberOfSons() const
Definition: Node.h:388
static void displayGauge(size_t iter, size_t total, char symbol='>', const std::string &mes="")
void deleteColumn(size_t index)
virtual void computeLikelihoodAtNode(int nodeId, VVVdouble &likelihoodArray) const =0
Compute the likelihood array at a given node.
static DataTable * read(std::istream &in, const std::string &sep="\t", bool header=true, int rowNames=-1)
Interface for double-recursive (DR) implementation of the likelihood computation. ...
virtual size_t getNumberOfSites() const =0
virtual Vdouble getProbabilities() const =0
std::vector< VVdouble > VVVdouble
std::vector< Vdouble > VVdouble
size_t getNumberOfSites() const
Definition: Mapping.h:204
size_t getNumberOfColumns() const
static void writeToStream(const ProbabilisticRewardMapping &rewards, const SiteContainer &sites, std::ostream &out)
Write the reward vectors to a stream.
int toInt(const std::string &s, char scientificNotation= 'e')
virtual void setNumberOfSites(size_t numberOfSites)
virtual size_t getNodeIndex(int nodeId) const
Definition: Mapping.h:226