bpp-seq  2.1.0
 All Classes Namespaces Files Functions Variables Friends Pages
SiteContainerTools.cpp
Go to the documentation of this file.
1 //
2 // File: SiteContainerTools.cpp
3 // Created by: Julien Dutheil
4 // Sylvain Glémin
5 // Created on: Fri Dec 12 18:55:06 2003
6 //
7 
8 /*
9  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
10 
11  This software is a computer program whose purpose is to provide classes
12  for sequences analysis.
13 
14  This software is governed by the CeCILL license under French law and
15  abiding by the rules of distribution of free software. You can use,
16  modify and/ or redistribute the software under the terms of the CeCILL
17  license as circulated by CEA, CNRS and INRIA at the following URL
18  "http://www.cecill.info".
19 
20  As a counterpart to the access to the source code and rights to copy,
21  modify and redistribute granted by the license, users are provided only
22  with a limited warranty and the software's author, the holder of the
23  economic rights, and the successive licensors have only limited
24  liability.
25 
26  In this respect, the user's attention is drawn to the risks associated
27  with loading, using, modifying and/or developing or reproducing the
28  software by the user in light of its specific status of free software,
29  that may mean that it is complicated to manipulate, and that also
30  therefore means that it is reserved for developers and experienced
31  professionals having in-depth computer knowledge. Users are therefore
32  encouraged to load and test the software's suitability as regards their
33  requirements in conditions enabling the security of their systems and/or
34  data to be ensured and, more generally, to use and operate it in the
35  same conditions as regards security.
36 
37  The fact that you are presently reading this means that you have had
38  knowledge of the CeCILL license and that you accept its terms.
39  */
40 
41 #include "SiteContainerTools.h"
42 #include "SequenceContainerTools.h"
43 #include "VectorSiteContainer.h"
44 #include "SiteContainerIterator.h"
45 #include "../SiteTools.h"
46 #include "../Alphabet/AlphabetTools.h"
47 #include "../SequenceTools.h"
49 
50 using namespace bpp;
51 
52 // From the STL:
53 #include <vector>
54 #include <deque>
55 #include <string>
56 
57 using namespace std;
58 
59 /******************************************************************************/
60 
62 {
63  vector<string> seqNames = sites.getSequencesNames();
64  VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
65  noGapCont->setSequencesNames(seqNames, false);
66  NoGapSiteContainerIterator ngsi(sites);
67  while (ngsi.hasMoreSites())
68  {
69  noGapCont->addSite(*ngsi.nextSite());
70  }
71  return noGapCont;
72 }
73 
74 /******************************************************************************/
75 
77 {
78  vector<string> seqNames = sites.getSequencesNames();
79  VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
80  noGapCont->setSequencesNames(seqNames, false);
82  while (csi.hasMoreSites())
83  {
84  noGapCont->addSite(*csi.nextSite());
85  }
86  return noGapCont;
87 }
88 
89 /******************************************************************************/
90 
92  const SiteContainer& sequences,
93  const SiteSelection& selection)
94 {
95  vector<string> seqNames = sequences.getSequencesNames();
96  VectorSiteContainer* sc = new VectorSiteContainer(seqNames.size(), sequences.getAlphabet());
97  sc->setSequencesNames(seqNames, false);
98  for (unsigned int i = 0; i < selection.size(); i++)
99  {
100  sc->addSite(sequences.getSite(selection[i]), false);
101  // We do not check names, we suppose that the container passed as an argument is correct.
102  // WARNING: what if selection contains many times the same indice? ...
103  }
104  sc->setGeneralComments(sequences.getGeneralComments());
105  return sc;
106 }
107 
108 /******************************************************************************/
109 
110 const Sequence* SiteContainerTools::getConsensus(const SiteContainer& sc, const std::string& name, bool ignoreGap, bool resolveUnknown)
111 {
112  Vint consensus;
114  const Site* site;
115  while (ssi.hasMoreSites())
116  {
117  site = ssi.nextSite();
118  map<int, double> freq;
119  SiteTools::getFrequencies(*site, freq, resolveUnknown);
120  double max = 0;
121  int cons = -1; // default result
122  if (ignoreGap)
123  {
124  for (map<int, double>::iterator it = freq.begin(); it != freq.end(); it++)
125  {
126  if (it->second > max && it->first != -1)
127  {
128  max = it->second;
129  cons = it->first;
130  }
131  }
132  }
133  else
134  {
135  for (map<int, double>::iterator it = freq.begin(); it != freq.end(); it++)
136  {
137  if (it->second > max)
138  {
139  max = it->second;
140  cons = it->first;
141  }
142  }
143  }
144  consensus.push_back(cons);
145  }
146  const Sequence* seqConsensus = new BasicSequence(name, consensus, sc.getAlphabet());
147  return seqConsensus;
148 }
149 
150 /******************************************************************************/
151 
153 {
154  // NB: use iterators for a better algorithm?
155  int unknownCode = sites.getAlphabet()->getUnknownCharacterCode();
156  for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
157  {
158  for (unsigned int j = 0; j < sites.getNumberOfSequences(); j++)
159  {
160  int* element = &sites(j, i);
161  if (sites.getAlphabet()->isGap(*element))
162  *element = unknownCode;
163  }
164  }
165 }
166 
167 /******************************************************************************/
168 
170 {
171  // NB: use iterators for a better algorithm?
172  int gapCode = sites.getAlphabet()->getGapCharacterCode();
173  for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
174  {
175  for (unsigned int j = 0; j < sites.getNumberOfSequences(); j++)
176  {
177  int* element = &sites(j, i);
178  if (sites.getAlphabet()->isUnresolved(*element))
179  *element = gapCode;
180  }
181  }
182 }
183 
184 /******************************************************************************/
185 
187 {
188  vector<string> seqNames = sites.getSequencesNames();
189  VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
190  noGapCont->setSequencesNames(seqNames, false);
191  for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
192  {
193  const Site* site = &sites.getSite(i);
194  if (!SiteTools::isGapOnly(*site))
195  noGapCont->addSite(*site);
196  }
197  return noGapCont;
198 }
199 
200 /******************************************************************************/
201 
203 {
204  size_t n = sites.getNumberOfSites();
205  size_t i = n;
206  while (i > 1)
207  {
208  ApplicationTools::displayGauge(n - i + 1, n);
209  const Site* site = &sites.getSite(i - 1);
210  if (SiteTools::isGapOnly(*site))
211  {
212  size_t end = i;
213  while (SiteTools::isGapOnly(*site) && i > 1)
214  {
215  --i;
216  site = &sites.getSite(i - 1);
217  }
218  sites.deleteSites(i, end - i);
219  }
220  else
221  {
222  --i;
223  }
224  }
226  const Site* site = &sites.getSite(0);
227  if (SiteTools::isGapOnly(*site))
228  sites.deleteSite(0);
229 }
230 
231 /******************************************************************************/
232 
234 {
235  vector<string> seqNames = sites.getSequencesNames();
236  VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
237  noGapCont->setSequencesNames(seqNames, false);
238  for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
239  {
240  const Site* site = &sites.getSite(i);
242  noGapCont->addSite(*site, false);
243  }
244  return noGapCont;
245 }
246 
247 /******************************************************************************/
248 
250 {
251  size_t n = sites.getNumberOfSites();
252  size_t i = n;
253  while (i > 1)
254  {
255  ApplicationTools::displayGauge(n - i + 1, n);
256  const Site* site = &sites.getSite(i - 1);
257  if (SiteTools::isGapOnly(*site))
258  {
259  size_t end = i;
260  while (SiteTools::isGapOrUnresolvedOnly(*site) && i > 1)
261  {
262  --i;
263  site = &sites.getSite(i - 1);
264  }
265  sites.deleteSites(i, end - i);
266  }
267  else
268  {
269  --i;
270  }
271  }
273  const Site* site = &sites.getSite(0);
275  sites.deleteSite(0);
276 }
277 
278 /******************************************************************************/
279 
281 {
282  vector<string> seqNames = sites.getSequencesNames();
283  VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
284  noGapCont->setSequencesNames(seqNames, false);
285  for (unsigned int i = 0; i < sites.getNumberOfSites(); ++i) {
286  map<int, double> freq;
287  SiteTools::getFrequencies(sites.getSite(i), freq);
288  if (freq[-1] <= maxFreqGaps)
289  noGapCont->addSite(sites.getSite(i), false);
290  }
291  return noGapCont;
292 }
293 
294 /******************************************************************************/
295 
296 void SiteContainerTools::removeGapSites(SiteContainer& sites, double maxFreqGaps)
297 {
298  for (size_t i = sites.getNumberOfSites(); i > 0; i--) {
299  map<int, double> freq;
300  SiteTools::getFrequencies(sites.getSite(i - 1), freq);
301  if (freq[-1] > maxFreqGaps)
302  sites.deleteSite(i - 1);
303  }
304 }
305 
306 /******************************************************************************/
307 
309 {
310  const CodonAlphabet* pca = dynamic_cast<const CodonAlphabet*>(sites.getAlphabet());
311  if (!pca)
312  throw AlphabetException("Not a Codon Alphabet", sites.getAlphabet());
313  vector<string> seqNames = sites.getSequencesNames();
314  VectorSiteContainer* noStopCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
315  noStopCont->setSequencesNames(seqNames, false);
316  for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
317  {
318  const Site* site = &sites.getSite(i);
319  if (!SiteTools::hasStopCodon(*site))
320  noStopCont->addSite(*site, false);
321  }
322  return noStopCont;
323 }
324 
325 /******************************************************************************/
326 
328  const SiteContainer& dottedAln,
329  const Alphabet* resolvedAlphabet) throw (AlphabetException, Exception)
330 {
331  if (!AlphabetTools::isDefaultAlphabet(dottedAln.getAlphabet()))
332  throw AlphabetException("SiteContainerTools::resolveDottedAlignment. Alignment alphabet should of class 'DefaultAlphabet'.", dottedAln.getAlphabet());
333 
334  // First we look for the reference sequence:
335  size_t n = dottedAln.getNumberOfSequences();
336  if (n == 0)
337  throw Exception("SiteContainerTools::resolveDottedAlignment. Input alignment contains no sequence.");
338 
339  const Sequence* refSeq = 0;
340  for (size_t i = 0; i < n; ++i) // Test each sequence
341  {
342  const Sequence* seq = &dottedAln.getSequence(i);
343  bool isRef = true;
344  for (unsigned int j = 0; isRef && j < seq->size(); ++j) // For each site in the sequence
345  {
346  if (seq->getChar(j) == ".")
347  isRef = false;
348  }
349  if (isRef) // We found the reference sequence!
350  {
351  refSeq = new BasicSequence(*seq);
352  }
353  }
354  if (!refSeq)
355  throw Exception("SiteContainerTools::resolveDottedAlignment. No reference sequence was found in the input alignment.");
356 
357  // Now we build a new VectorSiteContainer:
358  VectorSiteContainer* sites = new VectorSiteContainer(n, resolvedAlphabet);
359 
360  // We add each site one by one:
361  size_t m = dottedAln.getNumberOfSites();
362  string state;
363  for (unsigned int i = 0; i < m; ++i)
364  {
365  string resolved = refSeq->getChar(i);
366  const Site* site = &dottedAln.getSite(i);
367  Site resolvedSite(resolvedAlphabet, site->getPosition());
368  for (unsigned int j = 0; j < n; j++)
369  {
370  state = site->getChar(j);
371  if (state == ".")
372  {
373  state = resolved;
374  }
375  resolvedSite.addElement(state);
376  }
377  // Add the new site:
378  sites->addSite(resolvedSite);
379  }
380 
381  // Seq sequence names:
382  sites->setSequencesNames(dottedAln.getSequencesNames());
383 
384  // Delete the copied sequence:
385  delete refSeq;
386 
387  // Return result:
388  return sites;
389 }
390 
391 /******************************************************************************/
392 
393 std::map<size_t, size_t> SiteContainerTools::getSequencePositions(const Sequence& seq)
394 {
395  map<size_t, size_t> tln;
396  if (seq.size() == 0)
397  return tln;
398  unsigned int count = 0;
399  for (size_t i = 0; i < seq.size(); i++)
400  {
401  if (seq[i] != -1)
402  {
403  count++;
404  tln[i + 1] = count;
405  }
406  }
407  return tln;
408 }
409 
410 /******************************************************************************/
411 
412 std::map<size_t, size_t> SiteContainerTools::getAlignmentPositions(const Sequence& seq)
413 {
414  map<size_t, size_t> tln;
415  if (seq.size() == 0)
416  return tln;
417  unsigned int count = 0;
418  for (size_t i = 0; i < seq.size(); i++)
419  {
420  if (seq[i] != -1)
421  {
422  count++;
423  tln[count] = i + 1;
424  }
425  }
426  return tln;
427 }
428 
429 /******************************************************************************/
430 
431 std::map<size_t, size_t> SiteContainerTools::translateAlignment(const Sequence& seq1, const Sequence& seq2)
433 {
434  if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
435  throw AlphabetMismatchException("SiteContainerTools::translateAlignment", seq1.getAlphabet(), seq2.getAlphabet());
436  map<size_t, size_t> tln;
437  if (seq1.size() == 0)
438  return tln;
439  unsigned int count1 = 0;
440  unsigned int count2 = 0;
441  if (seq2.size() == 0)
442  throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + ".");
443  int state1 = seq1[count1];
444  int state2 = seq2[count2];
445  bool end = false;
446  while (!end)
447  {
448  while (state1 == -1)
449  {
450  count1++;
451  if (count1 < seq1.size())
452  state1 = seq1[count1];
453  else
454  break;
455  }
456  while (state2 == -1)
457  {
458  count2++;
459  if (count2 < seq2.size())
460  state2 = seq2[count2];
461  else
462  break;
463  }
464  if (state1 != state2)
465  throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + ".");
466  tln[count1 + 1] = count2 + 1; // Count start at 1
467  if (count1 == seq1.size() - 1)
468  end = true;
469  else
470  {
471  if (count2 == seq2.size() - 1)
472  {
473  state1 = seq1[++count1];
474  while (state1 == -1)
475  {
476  count1++;
477  if (count1 < seq1.size())
478  state1 = seq1[count1];
479  else
480  break;
481  }
482  if (state1 == -1)
483  end = true;
484  else
485  throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + ".");
486  }
487  else
488  {
489  state1 = seq1[++count1];
490  state2 = seq2[++count2];
491  }
492  }
493  }
494  return tln;
495 }
496 
497 /******************************************************************************/
498 
499 std::map<size_t, size_t> SiteContainerTools::translateSequence(const SiteContainer& sequences, size_t i1, size_t i2)
500 {
501  const Sequence* seq1 = &sequences.getSequence(i1);
502  const Sequence* seq2 = &sequences.getSequence(i2);
503  map<size_t, size_t> tln;
504  size_t count1 = 0; // Sequence 1 counter
505  size_t count2 = 0; // Sequence 2 counter
506  int state1;
507  int state2;
508  for (size_t i = 0; i < sequences.getNumberOfSites(); i++)
509  {
510  state1 = (*seq1)[i];
511  if (state1 != -1)
512  count1++;
513  state2 = (*seq2)[i];
514  if (state2 != -1)
515  count2++;
516  if (state1 != -1)
517  {
518  tln[count1] = (state2 == -1 ? 0 : count2);
519  }
520  }
521  return tln;
522 }
523 
524 /******************************************************************************/
525 
527  const Sequence& seq1,
528  const Sequence& seq2,
529  const AlphabetIndex2& s,
530  double gap)
532 {
533  if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
534  throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), seq2.getAlphabet());
535  if (seq1.getAlphabet()->getAlphabetType() != s.getAlphabet()->getAlphabetType())
536  throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), s.getAlphabet());
537  // Check that sequences have no gap!
538  auto_ptr<Sequence> s1(seq1.clone());
540  auto_ptr<Sequence> s2(seq2.clone());
542 
543  // 1) Initialize matrix:
544  RowMatrix<double> m(s1->size() + 1, s2->size() + 1);
545  RowMatrix<char> p(s1->size(), s2->size());
546  double choice1, choice2, choice3, mx;
547  char px;
548  for (size_t i = 0; i <= s1->size(); i++)
549  {
550  m(i, 0) = static_cast<double>(i) * gap;
551  }
552  for (size_t j = 0; j <= s2->size(); j++)
553  {
554  m(0, j) = static_cast<double>(j) * gap;
555  }
556  for (size_t i = 1; i <= s1->size(); i++)
557  {
558  for (size_t j = 1; j <= s2->size(); j++)
559  {
560  choice1 = m(i - 1, j - 1) + static_cast<double>(s.getIndex((*s1)[i - 1], (*s2)[j - 1]));
561  choice2 = m(i - 1, j) + gap;
562  choice3 = m(i, j - 1) + gap;
563  mx = choice1; px = 'd'; // Default in case of equality of scores.
564  if (choice2 > mx)
565  {
566  mx = choice2; px = 'u';
567  }
568  if (choice3 > mx)
569  {
570  mx = choice3; px = 'l';
571  }
572  m(i, j) = mx;
573  p(i - 1, j - 1) = px;
574  }
575  }
576 
577  // 2) Get alignment:
578  deque<int> a1, a2;
579  size_t i = s1->size(), j = s2->size();
580  char c;
581  while (i > 0 && j > 0)
582  {
583  c = p(i - 1, j - 1);
584  if (c == 'd')
585  {
586  a1.push_front((*s1)[i - 1]);
587  a2.push_front((*s2)[j - 1]);
588  i--;
589  j--;
590  }
591  else if (c == 'u')
592  {
593  a1.push_front((*s1)[i - 1]);
594  a2.push_front(-1);
595  i--;
596  }
597  else
598  {
599  a1.push_front(-1);
600  a2.push_front((*s2)[j - 1]);
601  j--;
602  }
603  }
604  while (i > 0)
605  {
606  a1.push_front((*s1)[i - 1]);
607  a2.push_front(-1);
608  i--;
609  }
610  while (j > 0)
611  {
612  a1.push_front(-1);
613  a2.push_front((*s2)[j - 1]);
614  j--;
615  }
616  s1->setContent(vector<int>(a1.begin(), a1.end()));
617  s2->setContent(vector<int>(a2.begin(), a2.end()));
618  AlignedSequenceContainer* asc = new AlignedSequenceContainer(s1->getAlphabet());
619  asc->addSequence(*s1, false);
620  asc->addSequence(*s2, false); // Do not check for sequence names.
621  return asc;
622 }
623 
624 /******************************************************************************/
625 
627  const Sequence& seq1,
628  const Sequence& seq2,
629  const AlphabetIndex2& s,
630  double opening,
631  double extending)
633 {
634  if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
635  throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), seq2.getAlphabet());
636  if (seq1.getAlphabet()->getAlphabetType() != s.getAlphabet()->getAlphabetType())
637  throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), s.getAlphabet());
638  // Check that sequences have no gap!
639  auto_ptr<Sequence> s1(seq1.clone());
641  auto_ptr<Sequence> s2(seq2.clone());
643 
644  // 1) Initialize matrix:
645  RowMatrix<double> m(s1->size() + 1, s2->size() + 1);
646  RowMatrix<double> v(s1->size() + 1, s2->size() + 1);
647  RowMatrix<double> h(s1->size() + 1, s2->size() + 1);
648  RowMatrix<char> p(s1->size(), s2->size());
649 
650  double choice1, choice2, choice3, mx;
651  char px;
652  m(0, 0) = 0.;
653  for (size_t i = 0; i <= s1->size(); i++)
654  {
655  v(i, 0) = log(0.);
656  }
657  for (size_t j = 0; j <= s2->size(); j++)
658  {
659  h(0, j) = log(0.);
660  }
661  for (size_t i = 1; i <= s1->size(); i++)
662  {
663  m(i, 0) = h(i, 0) = opening + static_cast<double>(i) * extending;
664  }
665  for (size_t j = 1; j <= s2->size(); j++)
666  {
667  m(0, j) = v(0, j) = opening + static_cast<double>(j) * extending;
668  }
669 
670  for (size_t i = 1; i <= s1->size(); i++)
671  {
672  for (size_t j = 1; j <= s2->size(); j++)
673  {
674  choice1 = m(i - 1, j - 1) + s.getIndex((*s1)[i - 1], (*s2)[j - 1]);
675  choice2 = h(i - 1, j - 1) + opening + extending;
676  choice3 = v(i - 1, j - 1) + opening + extending;
677  mx = choice1; // Default in case of equality of scores.
678  if (choice2 > mx)
679  {
680  mx = choice2;
681  }
682  if (choice3 > mx)
683  {
684  mx = choice3;
685  }
686  m(i, j) = mx;
687 
688  choice1 = m(i, j - 1) + opening + extending;
689  choice2 = h(i, j - 1) + extending;
690  mx = choice1; // Default in case of equality of scores.
691  if (choice2 > mx)
692  {
693  mx = choice2;
694  }
695  h(i, j) = mx;
696 
697  choice1 = m(i - 1, j) + opening + extending;
698  choice2 = v(i - 1, j) + extending;
699  mx = choice1; // Default in case of equality of scores.
700  if (choice2 > mx)
701  {
702  mx = choice2;
703  }
704  v(i, j) = mx;
705 
706  px = 'd';
707  if (v(i, j) > m(i, j))
708  px = 'u';
709  if (h(i, j) > m(i, j))
710  px = 'l';
711  p(i - 1, j - 1) = px;
712  }
713  }
714 
715  // 2) Get alignment:
716  deque<int> a1, a2;
717  size_t i = s1->size(), j = s2->size();
718  char c;
719  while (i > 0 && j > 0)
720  {
721  c = p(i - 1, j - 1);
722  if (c == 'd')
723  {
724  a1.push_front((*s1)[i - 1]);
725  a2.push_front((*s2)[j - 1]);
726  i--;
727  j--;
728  }
729  else if (c == 'u')
730  {
731  a1.push_front((*s1)[i - 1]);
732  a2.push_front(-1);
733  i--;
734  }
735  else
736  {
737  a1.push_front(-1);
738  a2.push_front((*s2)[j - 1]);
739  j--;
740  }
741  }
742  while (i > 0)
743  {
744  a1.push_front((*s1)[i - 1]);
745  a2.push_front(-1);
746  i--;
747  }
748  while (j > 0)
749  {
750  a1.push_front(-1);
751  a2.push_front((*s2)[j - 1]);
752  j--;
753  }
754  s1->setContent(vector<int>(a1.begin(), a1.end()));
755  s2->setContent(vector<int>(a2.begin(), a2.end()));
756  AlignedSequenceContainer* asc = new AlignedSequenceContainer(s1->getAlphabet());
757  asc->addSequence(*s1, false);
758  asc->addSequence(*s2, false); // Do not check for sequence names.
759  return asc;
760 }
761 
762 /******************************************************************************/
763 
764 VectorSiteContainer* SiteContainerTools::sampleSites(const SiteContainer& sites, size_t nbSites, vector<size_t>* index)
765 {
767  for (size_t i = 0; i < nbSites; i++)
768  {
769  size_t pos = static_cast<size_t>(RandomTools::giveIntRandomNumberBetweenZeroAndEntry(static_cast<int>(sites.getNumberOfSites())));
770  sample->addSite(sites.getSite(pos), false);
771  if (index)
772  index->push_back(pos);
773  }
774  return sample;
775 }
776 
777 /******************************************************************************/
778 
780 {
781  return sampleSites(sites, sites.getNumberOfSites());
782 }
783 
784 /******************************************************************************/
785 
786 const string SiteContainerTools::SIMILARITY_ALL = "all sites";
787 const string SiteContainerTools::SIMILARITY_NOFULLGAP = "no full gap";
788 const string SiteContainerTools::SIMILARITY_NODOUBLEGAP = "no double gap";
789 const string SiteContainerTools::SIMILARITY_NOGAP = "no gap";
790 
791 /******************************************************************************/
792 
793 double SiteContainerTools::computeSimilarity(const Sequence& seq1, const Sequence& seq2, bool dist, const std::string& gapOption, bool unresolvedAsGap) throw (SequenceNotAlignedException, AlphabetMismatchException, Exception)
794 {
795  if (seq1.size() != seq2.size())
796  throw SequenceNotAlignedException("SiteContainerTools::computeSimilarity.", &seq2);
797  if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
798  throw AlphabetMismatchException("SiteContainerTools::computeSimilarity.", seq1.getAlphabet(), seq2.getAlphabet());
799 
800  const Alphabet* alpha = seq1.getAlphabet();
801  unsigned int s = 0;
802  unsigned int t = 0;
803  for (size_t i = 0; i < seq1.size(); i++)
804  {
805  int x = seq1[i];
806  int y = seq2[i];
807  int gapCode = alpha->getGapCharacterCode();
808  if (unresolvedAsGap)
809  {
810  if (alpha->isUnresolved(x))
811  x = gapCode;
812  if (alpha->isUnresolved(y))
813  y = gapCode;
814  }
815  if (gapOption == SIMILARITY_ALL)
816  {
817  t++;
818  if (x == y && !alpha->isGap(x) && !alpha->isGap(y))
819  s++;
820  }
821  else if (gapOption == SIMILARITY_NODOUBLEGAP)
822  {
823  if (!alpha->isGap(x) || !alpha->isGap(y))
824  {
825  t++;
826  if (x == y)
827  s++;
828  }
829  }
830  else if (gapOption == SIMILARITY_NOGAP)
831  {
832  if (!alpha->isGap(x) && !alpha->isGap(y))
833  {
834  t++;
835  if (x == y)
836  s++;
837  }
838  }
839  else
840  throw Exception("SiteContainerTools::computeSimilarity. Invalid gap option: " + gapOption);
841  }
842  double r = (t == 0 ? 0. : static_cast<double>(s) / static_cast<double>(t));
843  return dist ? 1 - r : r;
844 }
845 
846 /******************************************************************************/
847 
848 DistanceMatrix* SiteContainerTools::computeSimilarityMatrix(const SiteContainer& sites, bool dist, const std::string& gapOption, bool unresolvedAsGap)
849 {
850  size_t n = sites.getNumberOfSequences();
851  DistanceMatrix* mat = new DistanceMatrix(sites.getSequencesNames());
852  string pairwiseGapOption = gapOption;
853  SiteContainer* sites2;
854  if (gapOption == SIMILARITY_NOFULLGAP)
855  {
856  if (unresolvedAsGap)
857  {
858  SiteContainer* tmp = removeGapOrUnresolvedOnlySites(sites);
859  sites2 = new AlignedSequenceContainer(*tmp);
860  delete tmp;
861  }
862  else
863  {
864  SiteContainer* tmp = removeGapOnlySites(sites);
865  sites2 = new AlignedSequenceContainer(*tmp);
866  delete tmp;
867  }
868  pairwiseGapOption = SIMILARITY_ALL;
869  }
870  else
871  {
872  sites2 = new AlignedSequenceContainer(sites);
873  }
874 
875  for (size_t i = 0; i < n; i++)
876  {
877  (*mat)(i, i) = dist ? 0. : 1.;
878  const Sequence* seq1 = &sites2->getSequence(i);
879  for (size_t j = i + 1; j < n; j++)
880  {
881  const Sequence* seq2 = &sites2->getSequence(j);
882  (*mat)(i, j) = (*mat)(j, i) = computeSimilarity(*seq1, *seq2, dist, pairwiseGapOption, unresolvedAsGap);
883  }
884  }
885  delete sites2;
886  return mat;
887 }
888 
889 /******************************************************************************/
890 
891 void SiteContainerTools::merge(SiteContainer& seqCont1, const SiteContainer& seqCont2, bool leavePositionAsIs)
893 {
894  if (seqCont1.getAlphabet()->getAlphabetType() != seqCont2.getAlphabet()->getAlphabetType())
895  throw AlphabetMismatchException("SiteContainerTools::merge.", seqCont1.getAlphabet(), seqCont2.getAlphabet());
896 
897 
898  vector<string> seqNames1 = seqCont1.getSequencesNames();
899  vector<string> seqNames2 = seqCont2.getSequencesNames();
900  const SiteContainer* seqCont2bis = 0;
901  bool del = false;
902  if (seqNames1 == seqNames2)
903  {
904  seqCont2bis = &seqCont2;
905  }
906  else
907  {
908  // We shall reorder sequences first:
909  SiteContainer* seqCont2ter = new VectorSiteContainer(seqCont2.getAlphabet());
910  SequenceContainerTools::getSelectedSequences(seqCont2, seqNames1, *seqCont2ter);
911  seqCont2bis = seqCont2ter;
912  del = true;
913  }
914 
915  if (leavePositionAsIs)
916  {
917  for (size_t i = 0; i < seqCont2bis->getNumberOfSites(); i++)
918  {
919  seqCont1.addSite(seqCont2bis->getSite(i), false);
920  }
921  }
922  else
923  {
924  int offset = static_cast<int>(seqCont1.getNumberOfSites());
925  for (size_t i = 0; i < seqCont2bis->getNumberOfSites(); i++)
926  {
927  seqCont1.addSite(seqCont2bis->getSite(i), offset + seqCont2bis->getSite(i).getPosition(), false);
928  }
929  }
930 
931  if (del)
932  delete seqCont2bis;
933 }
934 
935 /******************************************************************************/
936 
938 {
939  positions.resize(sites.getNumberOfSequences(), sites.getNumberOfSites());
940  int gap = sites.getAlphabet()->getGapCharacterCode();
941  for (size_t i = 0; i < sites.getNumberOfSequences(); ++i) {
942  const Sequence& seq = sites.getSequence(i);
943  unsigned int pos = 0;
944  for (size_t j = 0; j < sites.getNumberOfSites(); ++j) {
945  if (seq[j] != gap) {
946  ++pos;
947  positions(i, j) = pos;
948  } else {
949  positions(i, j) = 0;
950  }
951  }
952  }
953 }
954 
955 /******************************************************************************/
956 
957 vector<int> SiteContainerTools::getColumnScores(const Matrix<size_t>& positions1, const Matrix<size_t>& positions2, int na)
958 {
959  if (positions1.getNumberOfRows() != positions2.getNumberOfRows())
960  throw Exception("SiteContainerTools::getColumnScores. The two input alignments must have the same number of sequences!");
961  vector<int> scores(positions1.getNumberOfColumns());
962  for (size_t i = 0; i < positions1.getNumberOfColumns(); ++i) {
963  //Find an anchor point:
964  size_t whichSeq = 0;
965  size_t whichPos = 0;
966  for (size_t j = 0; j < positions1.getNumberOfRows(); ++j) {
967  if (positions1(j, i) > 0) {
968  whichSeq = j;
969  whichPos = positions1(j, i);
970  break;
971  }
972  }
973  if (whichPos == 0) {
974  //No anchor found, this alignment column is only made of gaps. We assign a score of 'na' and move to the next column.
975  scores[i] = na;
976  continue;
977  }
978  //We look for the anchor in the reference alignment:
979  size_t i2 = 0;
980  bool found = false;
981  for (size_t j = 0; !found && j < positions2.getNumberOfColumns(); ++j) {
982  if (positions2(whichSeq, j) == whichPos) {
983  i2 = j;
984  found = true;
985  }
986  }
987  if (!found) {
988  throw Exception("SiteContainerTools::getColumnScores(). Position " + TextTools::toString(whichPos) + " of sequence " + TextTools::toString(whichSeq) + " not found in reference alignment. Please make sure the two indexes are built from the same data!");
989  }
990  //Now we compare all pairs of sequences between the two positions:
991  bool test = true;
992  for (size_t j = 0; test && j < positions1.getNumberOfRows(); ++j) {
993  test = (positions1(j, i) == positions2(j, i2));
994  }
995  scores[i] = test ? 1 : 0;
996  }
997  return scores;
998 }
999 
1000 /******************************************************************************/
1001 
1002 vector<double> SiteContainerTools::getSumOfPairsScores(const Matrix<size_t>& positions1, const Matrix<size_t>& positions2, double na)
1003 {
1004  if (positions1.getNumberOfRows() != positions2.getNumberOfRows())
1005  throw Exception("SiteContainerTools::getColumnScores. The two input alignments must have the same number of sequences!");
1006  vector<double> scores(positions1.getNumberOfColumns());
1007  for (size_t i = 0; i < positions1.getNumberOfColumns(); ++i) {
1008  //For all positions in alignment 1...
1009  size_t countAlignable = 0;
1010  size_t countAligned = 0;
1011  for (size_t j = 0; j < positions1.getNumberOfRows(); ++j) {
1012  //Get the corresponding column in alignment 2:
1013  size_t whichPos = positions1(j, i);
1014  if (whichPos == 0) {
1015  //No position for this sequence here.
1016  continue;
1017  }
1018  //We look for the position in the second alignment:
1019  size_t i2 = 0;
1020  bool found = false;
1021  for (size_t k = 0; !found && k < positions2.getNumberOfColumns(); ++k) {
1022  if (positions2(j, k) == whichPos) {
1023  i2 = k;
1024  found = true;
1025  }
1026  }
1027  if (!found) {
1028  throw Exception("SiteContainerTools::getColumnScores(). Position " + TextTools::toString(whichPos) + " of sequence " + TextTools::toString(j) + " not found in reference alignment. Please make sure the two indexes are built from the same data!");
1029  }
1030 
1031  //Now we check all other positions and see if they are aligned with this one:
1032  for (size_t k = j + 1; k < positions1.getNumberOfRows(); ++k) {
1033  size_t whichPos2 = positions1(k, i);
1034  if (whichPos2 == 0) {
1035  //Empty position
1036  continue;
1037  }
1038  countAlignable++;
1039  //check position in alignment 2:
1040  if (positions2(k, i2) == whichPos2)
1041  countAligned++;
1042  }
1043  }
1044  scores[i] = countAlignable == 0 ? na : static_cast<double>(countAligned) / static_cast<double>(countAlignable);
1045  }
1046  return scores;
1047 }
1048 
1049 /******************************************************************************/
1050