bpp-seq  2.1.0
 All Classes Namespaces Files Functions Variables Friends Pages
SequenceTools.cpp
Go to the documentation of this file.
1 //
2 // File: SequenceTools.cpp
3 // Authors: Guillaume Deuchst
4 // Julien Dutheil
5 // Sylvain Gaillard
6 // Created on: Tue Aug 21 2003
7 //
8 
9 /*
10  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
11 
12  This software is a computer program whose purpose is to provide classes
13  for sequences analysis.
14 
15  This software is governed by the CeCILL license under French law and
16  abiding by the rules of distribution of free software. You can use,
17  modify and/ or redistribute the software under the terms of the CeCILL
18  license as circulated by CEA, CNRS and INRIA at the following URL
19  "http://www.cecill.info".
20 
21  As a counterpart to the access to the source code and rights to copy,
22  modify and redistribute granted by the license, users are provided only
23  with a limited warranty and the software's author, the holder of the
24  economic rights, and the successive licensors have only limited
25  liability.
26 
27  In this respect, the user's attention is drawn to the risks associated
28  with loading, using, modifying and/or developing or reproducing the
29  software by the user in light of its specific status of free software,
30  that may mean that it is complicated to manipulate, and that also
31  therefore means that it is reserved for developers and experienced
32  professionals having in-depth computer knowledge. Users are therefore
33  encouraged to load and test the software's suitability as regards their
34  requirements in conditions enabling the security of their systems and/or
35  data to be ensured and, more generally, to use and operate it in the
36  same conditions as regards security.
37 
38  The fact that you are presently reading this means that you have had
39  knowledge of the CeCILL license and that you accept its terms.
40  */
41 
42 #include "SequenceTools.h"
43 
44 #include "Alphabet/AlphabetTools.h"
45 #include "StringSequenceTools.h"
48 
49 using namespace bpp;
50 
51 // From the STL:
52 #include <ctype.h>
53 #include <cmath>
54 #include <list>
55 #include <iostream>
56 
57 using namespace std;
58 
65 
66 /******************************************************************************/
67 
68 Sequence* SequenceTools::subseq(const Sequence& sequence, size_t begin, size_t end) throw (IndexOutOfBoundsException, Exception)
69 {
70  // Checking interval
71  if (end >= sequence.size())
72  throw IndexOutOfBoundsException ("SequenceTools::subseq : Invalid upper bound", end, 0, sequence.size());
73  if (end < begin)
74  throw Exception ("SequenceTools::subseq : Invalid interval");
75 
76  // Copy sequence
77  vector<int> temp(sequence.getContent());
78 
79  // Truncate sequence
80  temp.erase(temp.begin() + end + 1, temp.end());
81  temp.erase(temp.begin(), temp.begin() + begin);
82 
83  // New sequence creation
84  return new BasicSequence(sequence.getName(), temp, sequence.getComments(), sequence.getAlphabet());
85 }
86 
87 /******************************************************************************/
88 
90 {
91  // Sequence's alphabets matching verification
92  if ((seq1.getAlphabet()->getAlphabetType()) != (seq2.getAlphabet()->getAlphabetType()))
93  throw AlphabetMismatchException("SequenceTools::concatenate : Sequence's alphabets don't match ", seq1.getAlphabet(), seq2.getAlphabet());
94 
95  // Sequence's names matching verification
96  if (seq1.getName() != seq2.getName())
97  throw Exception ("SequenceTools::concatenate : Sequence's names don't match");
98 
99  // Concatenate sequences and send result
100  vector<int> sequence = seq1.getContent();
101  vector<int> sequence2 = seq2.getContent();
102  sequence.insert(sequence.end(), sequence2.begin(), sequence2.end());
103  return new BasicSequence(seq1.getName(), sequence, seq1.getComments(), seq1.getAlphabet());
104 }
105 
106 /******************************************************************************/
107 
109 {
110  // Alphabet type checking
112  if (seq.getAlphabet()->getAlphabetType() == "DNA alphabet")
113  {
114  NAR = &_DNARep;
115  }
116  else if (seq.getAlphabet()->getAlphabetType() == "RNA alphabet")
117  {
118  NAR = &_RNARep;
119  }
120  else
121  {
122  throw AlphabetException("SequenceTools::complement: Sequence must be nucleic.", seq.getAlphabet());
123  }
124  for (size_t i = 0; i < seq.size(); i++)
125  {
126  seq.setElement(i, NAR->translate(seq.getValue(i)));
127  }
128  return seq;
129 }
130 
131 /******************************************************************************/
132 
134 {
135  // Alphabet type checking
137  if (sequence.getAlphabet()->getAlphabetType() == "DNA alphabet")
138  {
139  NAR = &_DNARep;
140  }
141  else if (sequence.getAlphabet()->getAlphabetType() == "RNA alphabet")
142  {
143  NAR = &_RNARep;
144  }
145  else
146  {
147  throw AlphabetException ("SequenceTools::getComplement: Sequence must be nucleic.", sequence.getAlphabet());
148  }
149 
150  return NAR->translate(sequence);
151 }
152 
153 /******************************************************************************/
154 
156 {
157  // Alphabet type checking
158  if (sequence.getAlphabet()->getAlphabetType() != "DNA alphabet")
159  {
160  throw AlphabetException ("SequenceTools::transcript : Sequence must be DNA", sequence.getAlphabet());
161  }
162 
163  return _transc.translate(sequence);
164 }
165 
166 /******************************************************************************/
167 
169 {
170  // Alphabet type checking
171  if (sequence.getAlphabet()->getAlphabetType() != "RNA alphabet")
172  {
173  throw AlphabetException ("SequenceTools::reverseTranscript : Sequence must be RNA", sequence.getAlphabet());
174  }
175 
176  return _transc.reverse(sequence);
177 }
178 
179 /******************************************************************************/
180 
182 {
183  size_t seq_size = seq.size(); // store seq size for efficiency
184  unsigned int tmp_state = 0; // to store one state when swapping positions
185  size_t j = seq_size; // symetric position iterator from sequence end
186  for (size_t i = 0; i < seq_size / 2; i++)
187  {
188  j = seq_size - 1 - i;
189  tmp_state = seq.getValue(i);
190  seq.setElement(i, seq.getValue(j));
191  seq.setElement(j, tmp_state);
192  }
193  return seq;
194 }
195 
196 /******************************************************************************/
197 
199 {
200  Sequence* iSeq = sequence.clone();
201  invert(*iSeq);
202  return iSeq;
203 }
204 
205 /******************************************************************************/
206 
208 {
209  // Alphabet type checking
211  if (seq.getAlphabet()->getAlphabetType() == "DNA alphabet")
212  {
213  NAR = &_DNARep;
214  }
215  else if (seq.getAlphabet()->getAlphabetType() == "RNA alphabet")
216  {
217  NAR = &_RNARep;
218  }
219  else
220  {
221  throw AlphabetException("SequenceTools::complement: Sequence must be nucleic.", seq.getAlphabet());
222  }
223  // for (size_t i = 0 ; i < seq.size() ; i++) {
224  // seq.setElement(i, NAR->translate(seq.getValue(i)));
225  // }
226  size_t seq_size = seq.size(); // store seq size for efficiency
227  int tmp_state = 0; // to store one state when swapping positions
228  size_t j = seq_size; // symetric position iterator from sequence end
229  for (size_t i = 0; i < seq_size / 2; i++)
230  {
231  j = seq_size - 1 - i;
232  tmp_state = seq.getValue(i);
233  seq.setElement(i, NAR->translate(seq.getValue(j)));
234  seq.setElement(j, NAR->translate(tmp_state));
235  }
236  if (seq_size % 2) // treate the state in the middle of odd sequences
237  {
238  seq.setElement(seq_size / 2, NAR->translate(seq.getValue(seq_size / 2)));
239  }
240  return seq;
241 }
242 
243 /******************************************************************************/
244 
246 {
247  if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
248  throw AlphabetMismatchException("SequenceTools::getPercentIdentity", seq1.getAlphabet(), seq2.getAlphabet());
249  if (seq1.size() != seq2.size())
250  throw SequenceNotAlignedException("SequenceTools::getPercentIdentity", &seq2);
251  int gap = seq1.getAlphabet()->getGapCharacterCode();
252  size_t id = 0;
253  size_t tot = 0;
254  for (size_t i = 0; i < seq1.size(); i++)
255  {
256  int x = seq1.getValue(i);
257  int y = seq2.getValue(i);
258  if (ignoreGaps)
259  {
260  if (x != gap && y != gap)
261  {
262  tot++;
263  if (x == y)
264  id++;
265  }
266  }
267  else
268  {
269  tot++;
270  if (x == y)
271  id++;
272  }
273  }
274  return static_cast<double>(id) / static_cast<double>(tot) * 100.;
275 }
276 
277 /******************************************************************************/
278 
280 {
281  size_t count = 0;
282  const Alphabet* alpha = seq.getAlphabet();
283  for (size_t i = 0; i < seq.size(); i++)
284  {
285  if (!alpha->isGap(seq[i]))
286  count++;
287  }
288  return count;
289 }
290 
291 /******************************************************************************/
292 
294 {
295  size_t count = 0;
296  const Alphabet* alpha = seq.getAlphabet();
297  for (size_t i = 0; i < seq.size(); i++)
298  {
299  if (!alpha->isGap(seq[i]) && !alpha->isUnresolved(seq[i]))
300  count++;
301  }
302  return count;
303 }
304 
305 /******************************************************************************/
306 
308 {
309  size_t count = 0;
310  const Alphabet* alpha = seq.getAlphabet();
311  for (size_t i = 0; i < seq.size(); i++)
312  {
313  if (alpha->isUnresolved(seq[i]))
314  count++;
315  }
316  return count;
317 }
318 
319 /******************************************************************************/
320 
322 {
323  const Alphabet* alpha = seq.getAlphabet();
324  vector<int> content;
325  for (size_t i = 0; i < seq.size(); i++)
326  {
327  if (!alpha->isGap(seq[i]))
328  content.push_back(seq[i]);
329  }
330  Sequence* newSeq = dynamic_cast<Sequence*>(seq.clone());
331  newSeq->setContent(content);
332  return newSeq;
333 }
334 
335 /******************************************************************************/
336 
338 {
339  const Alphabet* alpha = seq.getAlphabet();
340  for (size_t i = seq.size(); i > 0; --i)
341  {
342  if (alpha->isGap(seq[i - 1]))
343  seq.deleteElement(i - 1);
344  }
345 }
346 
347 /******************************************************************************/
348 
350 {
351  const CodonAlphabet* calpha = dynamic_cast<const CodonAlphabet*>(seq.getAlphabet());
352  if (!calpha)
353  throw Exception("SequenceTools::getSequenceWithoutStops. Input sequence should have a codon alphabet.");
354  vector<int> content;
355  for (size_t i = 0; i < seq.size(); i++)
356  {
357  if (!calpha->isStop(seq[i]))
358  content.push_back(seq[i]);
359  }
360  Sequence* newSeq = dynamic_cast<Sequence*>(seq.clone());
361  newSeq->setContent(content);
362  return newSeq;
363 }
364 
365 /******************************************************************************/
366 
368 {
369  const CodonAlphabet* calpha = dynamic_cast<const CodonAlphabet*>(seq.getAlphabet());
370  if (!calpha)
371  throw Exception("SequenceTools::removeStops. Input sequence should have a codon alphabet.");
372  for (size_t i = seq.size(); i > 0; --i)
373  {
374  if (calpha->isStop(seq[i - 1]))
375  seq.deleteElement(i - 1);
376  }
377 }
378 
379 /******************************************************************************/
380 
382 {
383  const CodonAlphabet* calpha = dynamic_cast<const CodonAlphabet*>(seq.getAlphabet());
384  if (!calpha)
385  throw Exception("SequenceTools::replaceStopsWithGaps. Input sequence should have a codon alphabet.");
386  int gap = calpha->getGapCharacterCode();
387  for (size_t i = 0; i < seq.size(); ++i)
388  {
389  if (calpha->isStop(seq[i]))
390  seq.setElement(i, gap);
391  }
392 }
393 
394 /******************************************************************************/
395 
398 {
399  if (seq1.size() != seq2.size())
400  throw SequenceNotAlignedException("SequenceTools::bowkerTest.", &seq2);
401  size_t n = seq1.size();
402  const Alphabet* alpha = seq1.getAlphabet();
403  unsigned int r = alpha->getSize();
404 
405  // Compute contingency table:
406  RowMatrix<double> array(r, r);
407  int x, y;
408  for (size_t i = 0; i < n; i++)
409  {
410  x = seq1[i];
411  y = seq2[i];
412  if (!alpha->isGap(x) && !alpha->isUnresolved(x)
413  && !alpha->isGap(y) && !alpha->isUnresolved(y))
414  {
415  array(static_cast<size_t>(x), static_cast<size_t>(y))++;
416  }
417  }
418 
419  // Compute Bowker's statistic:
420  double sb2 = 0, nij, nji;
421  for (unsigned int i = 1; i < r; ++i)
422  {
423  for (unsigned int j = 0; j < i; ++j)
424  {
425  nij = array(i, j);
426  nji = array(j, i);
427  if (nij != 0 || nji != 0)
428  {
429  sb2 += pow(nij - nji, 2) / (nij + nji);
430  }
431  // Else: we should display a warning there.
432  }
433  }
434 
435  // Compute p-value:
436  double pvalue = 1. - RandomTools::pChisq(sb2, (r - 1) * r / 2);
437 
438  // Return results:
439  BowkerTest* test = new BowkerTest();
440  test->setStatistic(sb2);
441  test->setPValue(pvalue);
442  return test;
443 }
444 
445 /******************************************************************************/
446 
447 void SequenceTools::getPutativeHaplotypes(const Sequence& seq, std::vector<Sequence*>& hap, unsigned int level)
448 {
449  vector< vector< int > > states(seq.size());
450  list<Sequence*> t_hap;
451  const Alphabet* alpha = seq.getAlphabet();
452  unsigned int hap_count = 1;
453  // Vector of available states at each position
454  for (size_t i = 0; i < seq.size(); i++)
455  {
456  vector<int> st = alpha->getAlias(seq[i]);
457  if (!st.size())
458  {
459  st.push_back(alpha->getGapCharacterCode());
460  }
461  if (st.size() <= level)
462  {
463  states[i] = st;
464  }
465  else
466  {
467  states[i] = vector<int>(1, seq[i]);
468  }
469  }
470  // Combinatorial haplotypes building (the use of tree may be more accurate)
471  t_hap.push_back(new BasicSequence(seq.getName() + "_hap" + TextTools::toString(hap_count++), "", alpha));
472  for (size_t i = 0; i < states.size(); i++)
473  {
474  for (list<Sequence*>::iterator it = t_hap.begin(); it != t_hap.end(); it++)
475  {
476  for (unsigned int j = 0; j < states[i].size(); j++)
477  {
478  Sequence* tmp_seq = new BasicSequence(seq.getName() + "_hap", (**it).getContent(), alpha);
479  if (j < states[i].size() - 1)
480  {
481  tmp_seq->setName(tmp_seq->getName() + TextTools::toString(hap_count++));
482  tmp_seq->addElement(states[i][j]);
483  t_hap.insert(it, tmp_seq);
484  }
485  else
486  {
487  (**it).addElement(states[i][j]);
488  }
489  }
490  }
491  }
492  for (list<Sequence*>::reverse_iterator it = t_hap.rbegin(); it != t_hap.rend(); it++)
493  {
494  hap.push_back(*it);
495  }
496 }
497 
498 /******************************************************************************/
499 
501 {
502  if (s1.getAlphabet()->getAlphabetType() != s2.getAlphabet()->getAlphabetType())
503  {
504  throw AlphabetMismatchException("SequenceTools::combineSequences(const Sequence& s1, const Sequence& s2): s1 and s2 don't have same Alphabet.", s1.getAlphabet(), s2.getAlphabet());
505  }
506  const Alphabet* alpha = s1.getAlphabet();
507  vector<int> st;
508  vector<int> seq;
509  size_t length = NumTools::max(s1.size(), s2.size());
510  for (size_t i = 0; i < length; i++)
511  {
512  if (i < s1.size())
513  st.push_back(s1.getValue(i));
514  if (i < s2.size())
515  st.push_back(s2.getValue(i));
516  seq.push_back(alpha->getGeneric(st));
517  st.clear();
518  }
519  Sequence* s = new BasicSequence(s1.getName() + "+" + s2.getName(), seq, alpha);
520  return s;
521 }
522 
523 /******************************************************************************/
524 
525 Sequence* SequenceTools::subtractHaplotype(const Sequence& s, const Sequence& h, string name, unsigned int level) throw (SequenceNotAlignedException)
526 {
527  const Alphabet* alpha = s.getAlphabet();
528  if (name.size() == 0)
529  name = s.getName() + "_haplotype";
530  string seq;
531  if (s.size() != h.size())
532  throw SequenceNotAlignedException("SequenceTools::subtractHaplotype: haplotype must be aligned with the sequence.", &h);
533  for (unsigned int i = 0; i < s.size(); ++i)
534  {
535  string c;
536  vector<int> nucs = alpha->getAlias(s.getValue(i));
537  if (nucs.size() > 1)
538  {
539  remove(nucs.begin(), nucs.end(), h.getValue(i));
540  nucs = vector<int>(nucs.begin(), nucs.end() - 1);
541  }
542  else
543  {
544  nucs = vector<int>(nucs.begin(), nucs.end());
545  }
546  c = alpha->intToChar(alpha->getGeneric(nucs));
547  if (level <= nucs.size() && (alpha->isUnresolved(s.getValue(i)) || alpha->isUnresolved(h.getValue(i))))
548  {
549  c = alpha->intToChar(alpha->getUnknownCharacterCode());
550  }
551  seq += c;
552  }
553  Sequence* hap = new BasicSequence(name, seq, alpha);
554  return hap;
555 }
556 
557 /******************************************************************************/
558 
560 {
561  // Alphabet type checking
562  if (seq.getAlphabet()->getAlphabetType() != "DNA alphabet")
563  {
564  throw AlphabetException ("SequenceTools::transcript : Sequence must be DNA", seq.getAlphabet());
565  }
566 
567  if ((ph < 1) || (ph > 3))
568  throw Exception("Bad phase for RNYSlice: " + TextTools::toString(ph) + ". Should be between 1 and 3.");
569 
570  size_t s = seq.size();
571  size_t n = (s - ph + 3) / 3;
572 
573  vector<int> content(n);
574 
575  int tir = seq.getAlphabet()->getGapCharacterCode();
576  size_t j;
577 
578  for (size_t i = 0; i < n; i++)
579  {
580  j = i * 3 + ph - 1;
581 
582  if (j == 0)
583  content[i] = _RNY.getRNY(tir, seq[0], seq[1], *seq.getAlphabet());
584  else
585  {
586  if (j == s - 1)
587  content[i] = _RNY.getRNY(seq[j - 1], seq[j], tir, *seq.getAlphabet());
588  else
589  content[i] = _RNY.getRNY(seq[j - 1], seq[j], seq[j + 1], *seq.getAlphabet());
590  }
591  }
592 
593  // New sequence creating, and sense reversing
594  Sequence* sq = new BasicSequence(seq.getName(), content,
595  seq.getComments(), &_RNY);
596 
597  // Send result
598  return sq;
599 }
600 
602 {
603  // Alphabet type checking
604  if (seq.getAlphabet()->getAlphabetType() != "DNA alphabet")
605  {
606  throw AlphabetException ("SequenceTools::transcript : Sequence must be DNA", seq.getAlphabet());
607  }
608 
609  size_t n = seq.size();
610 
611  vector<int> content(n);
612 
613  int tir = seq.getAlphabet()->getGapCharacterCode();
614 
615  if (seq.size() >= 2)
616  {
617  content[0] = _RNY.getRNY(tir, seq[0], seq[1], *seq.getAlphabet());
618 
619  for (unsigned int i = 1; i < n - 1; i++)
620  {
621  content[i] = _RNY.getRNY(seq[i - 1], seq[i], seq[i + 1],
622  *seq.getAlphabet());
623  }
624 
625  content[n - 1] = _RNY.getRNY(seq[n - 2], seq[n - 1], tir, *seq.getAlphabet());
626  }
627 
628  // New sequence creating, and sense reversing
629  Sequence* s = new BasicSequence(seq.getName(), content,
630  seq.getComments(), &_RNY);
631 
632  // Send result
633  return s;
634 }
635 
636 /******************************************************************************/
637 
638 
639 void SequenceTools::getCDS(Sequence& sequence, bool checkInit, bool checkStop, bool includeInit, bool includeStop)
640 {
641  const CodonAlphabet* alphabet = dynamic_cast<const CodonAlphabet*>(sequence.getAlphabet());
642  if (!alphabet)
643  throw AlphabetException("SequenceTools::getCDS. Sequence is not a codon sequence.");
644  if (checkInit)
645  {
646  unsigned int i;
647  for (i = 0; i < sequence.size() && !alphabet->isInit(sequence[i]); ++i)
648  {}
649  for (unsigned int j = 0; includeInit ? j < i : j <= i; ++j)
650  {
651  sequence.deleteElement(j);
652  }
653  }
654  if (checkStop)
655  {
656  unsigned int i;
657  for (i = 0; i < sequence.size() && !alphabet->isStop(sequence[i]); ++i)
658  {}
659  for (unsigned int j = includeStop ? i + 1 : i; j < sequence.size(); ++j)
660  {
661  sequence.deleteElement(j);
662  }
663  }
664 }
665 
666 /******************************************************************************/
667 
668 size_t SequenceTools::findFirstOf(const Sequence& seq, const Sequence& motif, bool strict)
669 {
670  if (motif.size() > seq.size())
671  return seq.size();
672  for (size_t seqi = 0; seqi < seq.size() - motif.size() + 1; seqi++)
673  {
674  bool match = false;
675  for (size_t moti = 0; moti < motif.size(); moti++)
676  {
677  if (strict)
678  {
679  match = seq.getValue(seqi + moti) == motif.getValue(moti);
680  }
681  else
682  {
683  match = AlphabetTools::match(seq.getAlphabet(), seq.getValue(seqi + moti), motif.getValue(moti));
684  }
685  if (!match)
686  {
687  break;
688  }
689  }
690  if (match)
691  {
692  return seqi;
693  }
694  }
695  return seq.size();
696 }
697 
698 /******************************************************************************/
699