|
bpp-seq
2.1.0
|
00001 // 00002 // File: SiteContainerTools.cpp 00003 // Created by: Julien Dutheil 00004 // Sylvain Glémin 00005 // Created on: Fri Dec 12 18:55:06 2003 00006 // 00007 00008 /* 00009 Copyright or © or Copr. Bio++ Development Team, (November 17, 2004) 00010 00011 This software is a computer program whose purpose is to provide classes 00012 for sequences analysis. 00013 00014 This software is governed by the CeCILL license under French law and 00015 abiding by the rules of distribution of free software. You can use, 00016 modify and/ or redistribute the software under the terms of the CeCILL 00017 license as circulated by CEA, CNRS and INRIA at the following URL 00018 "http://www.cecill.info". 00019 00020 As a counterpart to the access to the source code and rights to copy, 00021 modify and redistribute granted by the license, users are provided only 00022 with a limited warranty and the software's author, the holder of the 00023 economic rights, and the successive licensors have only limited 00024 liability. 00025 00026 In this respect, the user's attention is drawn to the risks associated 00027 with loading, using, modifying and/or developing or reproducing the 00028 software by the user in light of its specific status of free software, 00029 that may mean that it is complicated to manipulate, and that also 00030 therefore means that it is reserved for developers and experienced 00031 professionals having in-depth computer knowledge. Users are therefore 00032 encouraged to load and test the software's suitability as regards their 00033 requirements in conditions enabling the security of their systems and/or 00034 data to be ensured and, more generally, to use and operate it in the 00035 same conditions as regards security. 00036 00037 The fact that you are presently reading this means that you have had 00038 knowledge of the CeCILL license and that you accept its terms. 00039 */ 00040 00041 #include "SiteContainerTools.h" 00042 #include "SequenceContainerTools.h" 00043 #include "VectorSiteContainer.h" 00044 #include "SiteContainerIterator.h" 00045 #include "../SiteTools.h" 00046 #include "../Alphabet/AlphabetTools.h" 00047 #include "../SequenceTools.h" 00048 #include <Bpp/App/ApplicationTools.h> 00049 00050 using namespace bpp; 00051 00052 // From the STL: 00053 #include <vector> 00054 #include <deque> 00055 #include <string> 00056 00057 using namespace std; 00058 00059 /******************************************************************************/ 00060 00061 SiteContainer* SiteContainerTools::getSitesWithoutGaps(const SiteContainer& sites) 00062 { 00063 vector<string> seqNames = sites.getSequencesNames(); 00064 VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet()); 00065 noGapCont->setSequencesNames(seqNames, false); 00066 NoGapSiteContainerIterator ngsi(sites); 00067 while (ngsi.hasMoreSites()) 00068 { 00069 noGapCont->addSite(*ngsi.nextSite()); 00070 } 00071 return noGapCont; 00072 } 00073 00074 /******************************************************************************/ 00075 00076 SiteContainer* SiteContainerTools::getCompleteSites(const SiteContainer& sites) 00077 { 00078 vector<string> seqNames = sites.getSequencesNames(); 00079 VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet()); 00080 noGapCont->setSequencesNames(seqNames, false); 00081 CompleteSiteContainerIterator csi(sites); 00082 while (csi.hasMoreSites()) 00083 { 00084 noGapCont->addSite(*csi.nextSite()); 00085 } 00086 return noGapCont; 00087 } 00088 00089 /******************************************************************************/ 00090 00091 SiteContainer* SiteContainerTools::getSelectedSites( 00092 const SiteContainer& sequences, 00093 const SiteSelection& selection) 00094 { 00095 vector<string> seqNames = sequences.getSequencesNames(); 00096 VectorSiteContainer* sc = new VectorSiteContainer(seqNames.size(), sequences.getAlphabet()); 00097 sc->setSequencesNames(seqNames, false); 00098 for (unsigned int i = 0; i < selection.size(); i++) 00099 { 00100 sc->addSite(sequences.getSite(selection[i]), false); 00101 // We do not check names, we suppose that the container passed as an argument is correct. 00102 // WARNING: what if selection contains many times the same indice? ... 00103 } 00104 sc->setGeneralComments(sequences.getGeneralComments()); 00105 return sc; 00106 } 00107 00108 /******************************************************************************/ 00109 00110 const Sequence* SiteContainerTools::getConsensus(const SiteContainer& sc, const std::string& name, bool ignoreGap, bool resolveUnknown) 00111 { 00112 Vint consensus; 00113 SimpleSiteContainerIterator ssi(sc); 00114 const Site* site; 00115 while (ssi.hasMoreSites()) 00116 { 00117 site = ssi.nextSite(); 00118 map<int, double> freq; 00119 SiteTools::getFrequencies(*site, freq, resolveUnknown); 00120 double max = 0; 00121 int cons = -1; // default result 00122 if (ignoreGap) 00123 { 00124 for (map<int, double>::iterator it = freq.begin(); it != freq.end(); it++) 00125 { 00126 if (it->second > max && it->first != -1) 00127 { 00128 max = it->second; 00129 cons = it->first; 00130 } 00131 } 00132 } 00133 else 00134 { 00135 for (map<int, double>::iterator it = freq.begin(); it != freq.end(); it++) 00136 { 00137 if (it->second > max) 00138 { 00139 max = it->second; 00140 cons = it->first; 00141 } 00142 } 00143 } 00144 consensus.push_back(cons); 00145 } 00146 const Sequence* seqConsensus = new BasicSequence(name, consensus, sc.getAlphabet()); 00147 return seqConsensus; 00148 } 00149 00150 /******************************************************************************/ 00151 00152 void SiteContainerTools::changeGapsToUnknownCharacters(SiteContainer& sites) 00153 { 00154 // NB: use iterators for a better algorithm? 00155 int unknownCode = sites.getAlphabet()->getUnknownCharacterCode(); 00156 for (unsigned int i = 0; i < sites.getNumberOfSites(); i++) 00157 { 00158 for (unsigned int j = 0; j < sites.getNumberOfSequences(); j++) 00159 { 00160 int* element = &sites(j, i); 00161 if (sites.getAlphabet()->isGap(*element)) 00162 *element = unknownCode; 00163 } 00164 } 00165 } 00166 00167 /******************************************************************************/ 00168 00169 void SiteContainerTools::changeUnresolvedCharactersToGaps(SiteContainer& sites) 00170 { 00171 // NB: use iterators for a better algorithm? 00172 int gapCode = sites.getAlphabet()->getGapCharacterCode(); 00173 for (unsigned int i = 0; i < sites.getNumberOfSites(); i++) 00174 { 00175 for (unsigned int j = 0; j < sites.getNumberOfSequences(); j++) 00176 { 00177 int* element = &sites(j, i); 00178 if (sites.getAlphabet()->isUnresolved(*element)) 00179 *element = gapCode; 00180 } 00181 } 00182 } 00183 00184 /******************************************************************************/ 00185 00186 SiteContainer* SiteContainerTools::removeGapOnlySites(const SiteContainer& sites) 00187 { 00188 vector<string> seqNames = sites.getSequencesNames(); 00189 VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet()); 00190 noGapCont->setSequencesNames(seqNames, false); 00191 for (unsigned int i = 0; i < sites.getNumberOfSites(); i++) 00192 { 00193 const Site* site = &sites.getSite(i); 00194 if (!SiteTools::isGapOnly(*site)) 00195 noGapCont->addSite(*site); 00196 } 00197 return noGapCont; 00198 } 00199 00200 /******************************************************************************/ 00201 00202 void SiteContainerTools::removeGapOnlySites(SiteContainer& sites) 00203 { 00204 size_t n = sites.getNumberOfSites(); 00205 size_t i = n; 00206 while (i > 1) 00207 { 00208 ApplicationTools::displayGauge(n - i + 1, n); 00209 const Site* site = &sites.getSite(i - 1); 00210 if (SiteTools::isGapOnly(*site)) 00211 { 00212 size_t end = i; 00213 while (SiteTools::isGapOnly(*site) && i > 1) 00214 { 00215 --i; 00216 site = &sites.getSite(i - 1); 00217 } 00218 sites.deleteSites(i, end - i); 00219 } 00220 else 00221 { 00222 --i; 00223 } 00224 } 00225 ApplicationTools::displayGauge(n, n); 00226 const Site* site = &sites.getSite(0); 00227 if (SiteTools::isGapOnly(*site)) 00228 sites.deleteSite(0); 00229 } 00230 00231 /******************************************************************************/ 00232 00233 SiteContainer* SiteContainerTools::removeGapOrUnresolvedOnlySites(const SiteContainer& sites) 00234 { 00235 vector<string> seqNames = sites.getSequencesNames(); 00236 VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet()); 00237 noGapCont->setSequencesNames(seqNames, false); 00238 for (unsigned int i = 0; i < sites.getNumberOfSites(); i++) 00239 { 00240 const Site* site = &sites.getSite(i); 00241 if (!SiteTools::isGapOrUnresolvedOnly(*site)) 00242 noGapCont->addSite(*site, false); 00243 } 00244 return noGapCont; 00245 } 00246 00247 /******************************************************************************/ 00248 00249 void SiteContainerTools::removeGapOrUnresolvedOnlySites(SiteContainer& sites) 00250 { 00251 size_t n = sites.getNumberOfSites(); 00252 size_t i = n; 00253 while (i > 1) 00254 { 00255 ApplicationTools::displayGauge(n - i + 1, n); 00256 const Site* site = &sites.getSite(i - 1); 00257 if (SiteTools::isGapOnly(*site)) 00258 { 00259 size_t end = i; 00260 while (SiteTools::isGapOrUnresolvedOnly(*site) && i > 1) 00261 { 00262 --i; 00263 site = &sites.getSite(i - 1); 00264 } 00265 sites.deleteSites(i, end - i); 00266 } 00267 else 00268 { 00269 --i; 00270 } 00271 } 00272 ApplicationTools::displayGauge(n, n); 00273 const Site* site = &sites.getSite(0); 00274 if (SiteTools::isGapOrUnresolvedOnly(*site)) 00275 sites.deleteSite(0); 00276 } 00277 00278 /******************************************************************************/ 00279 00280 SiteContainer* SiteContainerTools::removeGapSites(const SiteContainer& sites, double maxFreqGaps) 00281 { 00282 vector<string> seqNames = sites.getSequencesNames(); 00283 VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet()); 00284 noGapCont->setSequencesNames(seqNames, false); 00285 for (unsigned int i = 0; i < sites.getNumberOfSites(); ++i) { 00286 map<int, double> freq; 00287 SiteTools::getFrequencies(sites.getSite(i), freq); 00288 if (freq[-1] <= maxFreqGaps) 00289 noGapCont->addSite(sites.getSite(i), false); 00290 } 00291 return noGapCont; 00292 } 00293 00294 /******************************************************************************/ 00295 00296 void SiteContainerTools::removeGapSites(SiteContainer& sites, double maxFreqGaps) 00297 { 00298 for (size_t i = sites.getNumberOfSites(); i > 0; i--) { 00299 map<int, double> freq; 00300 SiteTools::getFrequencies(sites.getSite(i - 1), freq); 00301 if (freq[-1] > maxFreqGaps) 00302 sites.deleteSite(i - 1); 00303 } 00304 } 00305 00306 /******************************************************************************/ 00307 00308 SiteContainer* SiteContainerTools::removeStopCodonSites(const SiteContainer& sites) throw (AlphabetException) 00309 { 00310 const CodonAlphabet* pca = dynamic_cast<const CodonAlphabet*>(sites.getAlphabet()); 00311 if (!pca) 00312 throw AlphabetException("Not a Codon Alphabet", sites.getAlphabet()); 00313 vector<string> seqNames = sites.getSequencesNames(); 00314 VectorSiteContainer* noStopCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet()); 00315 noStopCont->setSequencesNames(seqNames, false); 00316 for (unsigned int i = 0; i < sites.getNumberOfSites(); i++) 00317 { 00318 const Site* site = &sites.getSite(i); 00319 if (!SiteTools::hasStopCodon(*site)) 00320 noStopCont->addSite(*site, false); 00321 } 00322 return noStopCont; 00323 } 00324 00325 /******************************************************************************/ 00326 00327 SiteContainer* SiteContainerTools::resolveDottedAlignment( 00328 const SiteContainer& dottedAln, 00329 const Alphabet* resolvedAlphabet) throw (AlphabetException, Exception) 00330 { 00331 if (!AlphabetTools::isDefaultAlphabet(dottedAln.getAlphabet())) 00332 throw AlphabetException("SiteContainerTools::resolveDottedAlignment. Alignment alphabet should of class 'DefaultAlphabet'.", dottedAln.getAlphabet()); 00333 00334 // First we look for the reference sequence: 00335 size_t n = dottedAln.getNumberOfSequences(); 00336 if (n == 0) 00337 throw Exception("SiteContainerTools::resolveDottedAlignment. Input alignment contains no sequence."); 00338 00339 const Sequence* refSeq = 0; 00340 for (size_t i = 0; i < n; ++i) // Test each sequence 00341 { 00342 const Sequence* seq = &dottedAln.getSequence(i); 00343 bool isRef = true; 00344 for (unsigned int j = 0; isRef && j < seq->size(); ++j) // For each site in the sequence 00345 { 00346 if (seq->getChar(j) == ".") 00347 isRef = false; 00348 } 00349 if (isRef) // We found the reference sequence! 00350 { 00351 refSeq = new BasicSequence(*seq); 00352 } 00353 } 00354 if (!refSeq) 00355 throw Exception("SiteContainerTools::resolveDottedAlignment. No reference sequence was found in the input alignment."); 00356 00357 // Now we build a new VectorSiteContainer: 00358 VectorSiteContainer* sites = new VectorSiteContainer(n, resolvedAlphabet); 00359 00360 // We add each site one by one: 00361 size_t m = dottedAln.getNumberOfSites(); 00362 string state; 00363 for (unsigned int i = 0; i < m; ++i) 00364 { 00365 string resolved = refSeq->getChar(i); 00366 const Site* site = &dottedAln.getSite(i); 00367 Site resolvedSite(resolvedAlphabet, site->getPosition()); 00368 for (unsigned int j = 0; j < n; j++) 00369 { 00370 state = site->getChar(j); 00371 if (state == ".") 00372 { 00373 state = resolved; 00374 } 00375 resolvedSite.addElement(state); 00376 } 00377 // Add the new site: 00378 sites->addSite(resolvedSite); 00379 } 00380 00381 // Seq sequence names: 00382 sites->setSequencesNames(dottedAln.getSequencesNames()); 00383 00384 // Delete the copied sequence: 00385 delete refSeq; 00386 00387 // Return result: 00388 return sites; 00389 } 00390 00391 /******************************************************************************/ 00392 00393 std::map<size_t, size_t> SiteContainerTools::getSequencePositions(const Sequence& seq) 00394 { 00395 map<size_t, size_t> tln; 00396 if (seq.size() == 0) 00397 return tln; 00398 unsigned int count = 0; 00399 for (size_t i = 0; i < seq.size(); i++) 00400 { 00401 if (seq[i] != -1) 00402 { 00403 count++; 00404 tln[i + 1] = count; 00405 } 00406 } 00407 return tln; 00408 } 00409 00410 /******************************************************************************/ 00411 00412 std::map<size_t, size_t> SiteContainerTools::getAlignmentPositions(const Sequence& seq) 00413 { 00414 map<size_t, size_t> tln; 00415 if (seq.size() == 0) 00416 return tln; 00417 unsigned int count = 0; 00418 for (size_t i = 0; i < seq.size(); i++) 00419 { 00420 if (seq[i] != -1) 00421 { 00422 count++; 00423 tln[count] = i + 1; 00424 } 00425 } 00426 return tln; 00427 } 00428 00429 /******************************************************************************/ 00430 00431 std::map<size_t, size_t> SiteContainerTools::translateAlignment(const Sequence& seq1, const Sequence& seq2) 00432 throw (AlphabetMismatchException, Exception) 00433 { 00434 if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType()) 00435 throw AlphabetMismatchException("SiteContainerTools::translateAlignment", seq1.getAlphabet(), seq2.getAlphabet()); 00436 map<size_t, size_t> tln; 00437 if (seq1.size() == 0) 00438 return tln; 00439 unsigned int count1 = 0; 00440 unsigned int count2 = 0; 00441 if (seq2.size() == 0) 00442 throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + "."); 00443 int state1 = seq1[count1]; 00444 int state2 = seq2[count2]; 00445 bool end = false; 00446 while (!end) 00447 { 00448 while (state1 == -1) 00449 { 00450 count1++; 00451 if (count1 < seq1.size()) 00452 state1 = seq1[count1]; 00453 else 00454 break; 00455 } 00456 while (state2 == -1) 00457 { 00458 count2++; 00459 if (count2 < seq2.size()) 00460 state2 = seq2[count2]; 00461 else 00462 break; 00463 } 00464 if (state1 != state2) 00465 throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + "."); 00466 tln[count1 + 1] = count2 + 1; // Count start at 1 00467 if (count1 == seq1.size() - 1) 00468 end = true; 00469 else 00470 { 00471 if (count2 == seq2.size() - 1) 00472 { 00473 state1 = seq1[++count1]; 00474 while (state1 == -1) 00475 { 00476 count1++; 00477 if (count1 < seq1.size()) 00478 state1 = seq1[count1]; 00479 else 00480 break; 00481 } 00482 if (state1 == -1) 00483 end = true; 00484 else 00485 throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + "."); 00486 } 00487 else 00488 { 00489 state1 = seq1[++count1]; 00490 state2 = seq2[++count2]; 00491 } 00492 } 00493 } 00494 return tln; 00495 } 00496 00497 /******************************************************************************/ 00498 00499 std::map<size_t, size_t> SiteContainerTools::translateSequence(const SiteContainer& sequences, size_t i1, size_t i2) 00500 { 00501 const Sequence* seq1 = &sequences.getSequence(i1); 00502 const Sequence* seq2 = &sequences.getSequence(i2); 00503 map<size_t, size_t> tln; 00504 size_t count1 = 0; // Sequence 1 counter 00505 size_t count2 = 0; // Sequence 2 counter 00506 int state1; 00507 int state2; 00508 for (size_t i = 0; i < sequences.getNumberOfSites(); i++) 00509 { 00510 state1 = (*seq1)[i]; 00511 if (state1 != -1) 00512 count1++; 00513 state2 = (*seq2)[i]; 00514 if (state2 != -1) 00515 count2++; 00516 if (state1 != -1) 00517 { 00518 tln[count1] = (state2 == -1 ? 0 : count2); 00519 } 00520 } 00521 return tln; 00522 } 00523 00524 /******************************************************************************/ 00525 00526 AlignedSequenceContainer* SiteContainerTools::alignNW( 00527 const Sequence& seq1, 00528 const Sequence& seq2, 00529 const AlphabetIndex2& s, 00530 double gap) 00531 throw (AlphabetMismatchException) 00532 { 00533 if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType()) 00534 throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), seq2.getAlphabet()); 00535 if (seq1.getAlphabet()->getAlphabetType() != s.getAlphabet()->getAlphabetType()) 00536 throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), s.getAlphabet()); 00537 // Check that sequences have no gap! 00538 auto_ptr<Sequence> s1(seq1.clone()); 00539 SequenceTools::removeGaps(*s1); 00540 auto_ptr<Sequence> s2(seq2.clone()); 00541 SequenceTools::removeGaps(*s2); 00542 00543 // 1) Initialize matrix: 00544 RowMatrix<double> m(s1->size() + 1, s2->size() + 1); 00545 RowMatrix<char> p(s1->size(), s2->size()); 00546 double choice1, choice2, choice3, mx; 00547 char px; 00548 for (size_t i = 0; i <= s1->size(); i++) 00549 { 00550 m(i, 0) = static_cast<double>(i) * gap; 00551 } 00552 for (size_t j = 0; j <= s2->size(); j++) 00553 { 00554 m(0, j) = static_cast<double>(j) * gap; 00555 } 00556 for (size_t i = 1; i <= s1->size(); i++) 00557 { 00558 for (size_t j = 1; j <= s2->size(); j++) 00559 { 00560 choice1 = m(i - 1, j - 1) + static_cast<double>(s.getIndex((*s1)[i - 1], (*s2)[j - 1])); 00561 choice2 = m(i - 1, j) + gap; 00562 choice3 = m(i, j - 1) + gap; 00563 mx = choice1; px = 'd'; // Default in case of equality of scores. 00564 if (choice2 > mx) 00565 { 00566 mx = choice2; px = 'u'; 00567 } 00568 if (choice3 > mx) 00569 { 00570 mx = choice3; px = 'l'; 00571 } 00572 m(i, j) = mx; 00573 p(i - 1, j - 1) = px; 00574 } 00575 } 00576 00577 // 2) Get alignment: 00578 deque<int> a1, a2; 00579 size_t i = s1->size(), j = s2->size(); 00580 char c; 00581 while (i > 0 && j > 0) 00582 { 00583 c = p(i - 1, j - 1); 00584 if (c == 'd') 00585 { 00586 a1.push_front((*s1)[i - 1]); 00587 a2.push_front((*s2)[j - 1]); 00588 i--; 00589 j--; 00590 } 00591 else if (c == 'u') 00592 { 00593 a1.push_front((*s1)[i - 1]); 00594 a2.push_front(-1); 00595 i--; 00596 } 00597 else 00598 { 00599 a1.push_front(-1); 00600 a2.push_front((*s2)[j - 1]); 00601 j--; 00602 } 00603 } 00604 while (i > 0) 00605 { 00606 a1.push_front((*s1)[i - 1]); 00607 a2.push_front(-1); 00608 i--; 00609 } 00610 while (j > 0) 00611 { 00612 a1.push_front(-1); 00613 a2.push_front((*s2)[j - 1]); 00614 j--; 00615 } 00616 s1->setContent(vector<int>(a1.begin(), a1.end())); 00617 s2->setContent(vector<int>(a2.begin(), a2.end())); 00618 AlignedSequenceContainer* asc = new AlignedSequenceContainer(s1->getAlphabet()); 00619 asc->addSequence(*s1, false); 00620 asc->addSequence(*s2, false); // Do not check for sequence names. 00621 return asc; 00622 } 00623 00624 /******************************************************************************/ 00625 00626 AlignedSequenceContainer* SiteContainerTools::alignNW( 00627 const Sequence& seq1, 00628 const Sequence& seq2, 00629 const AlphabetIndex2& s, 00630 double opening, 00631 double extending) 00632 throw (AlphabetMismatchException) 00633 { 00634 if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType()) 00635 throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), seq2.getAlphabet()); 00636 if (seq1.getAlphabet()->getAlphabetType() != s.getAlphabet()->getAlphabetType()) 00637 throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), s.getAlphabet()); 00638 // Check that sequences have no gap! 00639 auto_ptr<Sequence> s1(seq1.clone()); 00640 SequenceTools::removeGaps(*s1); 00641 auto_ptr<Sequence> s2(seq2.clone()); 00642 SequenceTools::removeGaps(*s2); 00643 00644 // 1) Initialize matrix: 00645 RowMatrix<double> m(s1->size() + 1, s2->size() + 1); 00646 RowMatrix<double> v(s1->size() + 1, s2->size() + 1); 00647 RowMatrix<double> h(s1->size() + 1, s2->size() + 1); 00648 RowMatrix<char> p(s1->size(), s2->size()); 00649 00650 double choice1, choice2, choice3, mx; 00651 char px; 00652 m(0, 0) = 0.; 00653 for (size_t i = 0; i <= s1->size(); i++) 00654 { 00655 v(i, 0) = log(0.); 00656 } 00657 for (size_t j = 0; j <= s2->size(); j++) 00658 { 00659 h(0, j) = log(0.); 00660 } 00661 for (size_t i = 1; i <= s1->size(); i++) 00662 { 00663 m(i, 0) = h(i, 0) = opening + static_cast<double>(i) * extending; 00664 } 00665 for (size_t j = 1; j <= s2->size(); j++) 00666 { 00667 m(0, j) = v(0, j) = opening + static_cast<double>(j) * extending; 00668 } 00669 00670 for (size_t i = 1; i <= s1->size(); i++) 00671 { 00672 for (size_t j = 1; j <= s2->size(); j++) 00673 { 00674 choice1 = m(i - 1, j - 1) + s.getIndex((*s1)[i - 1], (*s2)[j - 1]); 00675 choice2 = h(i - 1, j - 1) + opening + extending; 00676 choice3 = v(i - 1, j - 1) + opening + extending; 00677 mx = choice1; // Default in case of equality of scores. 00678 if (choice2 > mx) 00679 { 00680 mx = choice2; 00681 } 00682 if (choice3 > mx) 00683 { 00684 mx = choice3; 00685 } 00686 m(i, j) = mx; 00687 00688 choice1 = m(i, j - 1) + opening + extending; 00689 choice2 = h(i, j - 1) + extending; 00690 mx = choice1; // Default in case of equality of scores. 00691 if (choice2 > mx) 00692 { 00693 mx = choice2; 00694 } 00695 h(i, j) = mx; 00696 00697 choice1 = m(i - 1, j) + opening + extending; 00698 choice2 = v(i - 1, j) + extending; 00699 mx = choice1; // Default in case of equality of scores. 00700 if (choice2 > mx) 00701 { 00702 mx = choice2; 00703 } 00704 v(i, j) = mx; 00705 00706 px = 'd'; 00707 if (v(i, j) > m(i, j)) 00708 px = 'u'; 00709 if (h(i, j) > m(i, j)) 00710 px = 'l'; 00711 p(i - 1, j - 1) = px; 00712 } 00713 } 00714 00715 // 2) Get alignment: 00716 deque<int> a1, a2; 00717 size_t i = s1->size(), j = s2->size(); 00718 char c; 00719 while (i > 0 && j > 0) 00720 { 00721 c = p(i - 1, j - 1); 00722 if (c == 'd') 00723 { 00724 a1.push_front((*s1)[i - 1]); 00725 a2.push_front((*s2)[j - 1]); 00726 i--; 00727 j--; 00728 } 00729 else if (c == 'u') 00730 { 00731 a1.push_front((*s1)[i - 1]); 00732 a2.push_front(-1); 00733 i--; 00734 } 00735 else 00736 { 00737 a1.push_front(-1); 00738 a2.push_front((*s2)[j - 1]); 00739 j--; 00740 } 00741 } 00742 while (i > 0) 00743 { 00744 a1.push_front((*s1)[i - 1]); 00745 a2.push_front(-1); 00746 i--; 00747 } 00748 while (j > 0) 00749 { 00750 a1.push_front(-1); 00751 a2.push_front((*s2)[j - 1]); 00752 j--; 00753 } 00754 s1->setContent(vector<int>(a1.begin(), a1.end())); 00755 s2->setContent(vector<int>(a2.begin(), a2.end())); 00756 AlignedSequenceContainer* asc = new AlignedSequenceContainer(s1->getAlphabet()); 00757 asc->addSequence(*s1, false); 00758 asc->addSequence(*s2, false); // Do not check for sequence names. 00759 return asc; 00760 } 00761 00762 /******************************************************************************/ 00763 00764 VectorSiteContainer* SiteContainerTools::sampleSites(const SiteContainer& sites, size_t nbSites, vector<size_t>* index) 00765 { 00766 VectorSiteContainer* sample = new VectorSiteContainer(sites.getSequencesNames(), sites.getAlphabet()); 00767 for (size_t i = 0; i < nbSites; i++) 00768 { 00769 size_t pos = static_cast<size_t>(RandomTools::giveIntRandomNumberBetweenZeroAndEntry(static_cast<int>(sites.getNumberOfSites()))); 00770 sample->addSite(sites.getSite(pos), false); 00771 if (index) 00772 index->push_back(pos); 00773 } 00774 return sample; 00775 } 00776 00777 /******************************************************************************/ 00778 00779 VectorSiteContainer* SiteContainerTools::bootstrapSites(const SiteContainer& sites) 00780 { 00781 return sampleSites(sites, sites.getNumberOfSites()); 00782 } 00783 00784 /******************************************************************************/ 00785 00786 const string SiteContainerTools::SIMILARITY_ALL = "all sites"; 00787 const string SiteContainerTools::SIMILARITY_NOFULLGAP = "no full gap"; 00788 const string SiteContainerTools::SIMILARITY_NODOUBLEGAP = "no double gap"; 00789 const string SiteContainerTools::SIMILARITY_NOGAP = "no gap"; 00790 00791 /******************************************************************************/ 00792 00793 double SiteContainerTools::computeSimilarity(const Sequence& seq1, const Sequence& seq2, bool dist, const std::string& gapOption, bool unresolvedAsGap) throw (SequenceNotAlignedException, AlphabetMismatchException, Exception) 00794 { 00795 if (seq1.size() != seq2.size()) 00796 throw SequenceNotAlignedException("SiteContainerTools::computeSimilarity.", &seq2); 00797 if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType()) 00798 throw AlphabetMismatchException("SiteContainerTools::computeSimilarity.", seq1.getAlphabet(), seq2.getAlphabet()); 00799 00800 const Alphabet* alpha = seq1.getAlphabet(); 00801 unsigned int s = 0; 00802 unsigned int t = 0; 00803 for (size_t i = 0; i < seq1.size(); i++) 00804 { 00805 int x = seq1[i]; 00806 int y = seq2[i]; 00807 int gapCode = alpha->getGapCharacterCode(); 00808 if (unresolvedAsGap) 00809 { 00810 if (alpha->isUnresolved(x)) 00811 x = gapCode; 00812 if (alpha->isUnresolved(y)) 00813 y = gapCode; 00814 } 00815 if (gapOption == SIMILARITY_ALL) 00816 { 00817 t++; 00818 if (x == y && !alpha->isGap(x) && !alpha->isGap(y)) 00819 s++; 00820 } 00821 else if (gapOption == SIMILARITY_NODOUBLEGAP) 00822 { 00823 if (!alpha->isGap(x) || !alpha->isGap(y)) 00824 { 00825 t++; 00826 if (x == y) 00827 s++; 00828 } 00829 } 00830 else if (gapOption == SIMILARITY_NOGAP) 00831 { 00832 if (!alpha->isGap(x) && !alpha->isGap(y)) 00833 { 00834 t++; 00835 if (x == y) 00836 s++; 00837 } 00838 } 00839 else 00840 throw Exception("SiteContainerTools::computeSimilarity. Invalid gap option: " + gapOption); 00841 } 00842 double r = (t == 0 ? 0. : static_cast<double>(s) / static_cast<double>(t)); 00843 return dist ? 1 - r : r; 00844 } 00845 00846 /******************************************************************************/ 00847 00848 DistanceMatrix* SiteContainerTools::computeSimilarityMatrix(const SiteContainer& sites, bool dist, const std::string& gapOption, bool unresolvedAsGap) 00849 { 00850 size_t n = sites.getNumberOfSequences(); 00851 DistanceMatrix* mat = new DistanceMatrix(sites.getSequencesNames()); 00852 string pairwiseGapOption = gapOption; 00853 SiteContainer* sites2; 00854 if (gapOption == SIMILARITY_NOFULLGAP) 00855 { 00856 if (unresolvedAsGap) 00857 { 00858 SiteContainer* tmp = removeGapOrUnresolvedOnlySites(sites); 00859 sites2 = new AlignedSequenceContainer(*tmp); 00860 delete tmp; 00861 } 00862 else 00863 { 00864 SiteContainer* tmp = removeGapOnlySites(sites); 00865 sites2 = new AlignedSequenceContainer(*tmp); 00866 delete tmp; 00867 } 00868 pairwiseGapOption = SIMILARITY_ALL; 00869 } 00870 else 00871 { 00872 sites2 = new AlignedSequenceContainer(sites); 00873 } 00874 00875 for (size_t i = 0; i < n; i++) 00876 { 00877 (*mat)(i, i) = dist ? 0. : 1.; 00878 const Sequence* seq1 = &sites2->getSequence(i); 00879 for (size_t j = i + 1; j < n; j++) 00880 { 00881 const Sequence* seq2 = &sites2->getSequence(j); 00882 (*mat)(i, j) = (*mat)(j, i) = computeSimilarity(*seq1, *seq2, dist, pairwiseGapOption, unresolvedAsGap); 00883 } 00884 } 00885 delete sites2; 00886 return mat; 00887 } 00888 00889 /******************************************************************************/ 00890 00891 void SiteContainerTools::merge(SiteContainer& seqCont1, const SiteContainer& seqCont2, bool leavePositionAsIs) 00892 throw (AlphabetMismatchException, Exception) 00893 { 00894 if (seqCont1.getAlphabet()->getAlphabetType() != seqCont2.getAlphabet()->getAlphabetType()) 00895 throw AlphabetMismatchException("SiteContainerTools::merge.", seqCont1.getAlphabet(), seqCont2.getAlphabet()); 00896 00897 00898 vector<string> seqNames1 = seqCont1.getSequencesNames(); 00899 vector<string> seqNames2 = seqCont2.getSequencesNames(); 00900 const SiteContainer* seqCont2bis = 0; 00901 bool del = false; 00902 if (seqNames1 == seqNames2) 00903 { 00904 seqCont2bis = &seqCont2; 00905 } 00906 else 00907 { 00908 // We shall reorder sequences first: 00909 SiteContainer* seqCont2ter = new VectorSiteContainer(seqCont2.getAlphabet()); 00910 SequenceContainerTools::getSelectedSequences(seqCont2, seqNames1, *seqCont2ter); 00911 seqCont2bis = seqCont2ter; 00912 del = true; 00913 } 00914 00915 if (leavePositionAsIs) 00916 { 00917 for (size_t i = 0; i < seqCont2bis->getNumberOfSites(); i++) 00918 { 00919 seqCont1.addSite(seqCont2bis->getSite(i), false); 00920 } 00921 } 00922 else 00923 { 00924 int offset = static_cast<int>(seqCont1.getNumberOfSites()); 00925 for (size_t i = 0; i < seqCont2bis->getNumberOfSites(); i++) 00926 { 00927 seqCont1.addSite(seqCont2bis->getSite(i), offset + seqCont2bis->getSite(i).getPosition(), false); 00928 } 00929 } 00930 00931 if (del) 00932 delete seqCont2bis; 00933 } 00934 00935 /******************************************************************************/ 00936 00937 void SiteContainerTools::getSequencePositions(const SiteContainer& sites, Matrix<size_t>& positions) 00938 { 00939 positions.resize(sites.getNumberOfSequences(), sites.getNumberOfSites()); 00940 int gap = sites.getAlphabet()->getGapCharacterCode(); 00941 for (size_t i = 0; i < sites.getNumberOfSequences(); ++i) { 00942 const Sequence& seq = sites.getSequence(i); 00943 unsigned int pos = 0; 00944 for (size_t j = 0; j < sites.getNumberOfSites(); ++j) { 00945 if (seq[j] != gap) { 00946 ++pos; 00947 positions(i, j) = pos; 00948 } else { 00949 positions(i, j) = 0; 00950 } 00951 } 00952 } 00953 } 00954 00955 /******************************************************************************/ 00956 00957 vector<int> SiteContainerTools::getColumnScores(const Matrix<size_t>& positions1, const Matrix<size_t>& positions2, int na) 00958 { 00959 if (positions1.getNumberOfRows() != positions2.getNumberOfRows()) 00960 throw Exception("SiteContainerTools::getColumnScores. The two input alignments must have the same number of sequences!"); 00961 vector<int> scores(positions1.getNumberOfColumns()); 00962 for (size_t i = 0; i < positions1.getNumberOfColumns(); ++i) { 00963 //Find an anchor point: 00964 size_t whichSeq = 0; 00965 size_t whichPos = 0; 00966 for (size_t j = 0; j < positions1.getNumberOfRows(); ++j) { 00967 if (positions1(j, i) > 0) { 00968 whichSeq = j; 00969 whichPos = positions1(j, i); 00970 break; 00971 } 00972 } 00973 if (whichPos == 0) { 00974 //No anchor found, this alignment column is only made of gaps. We assign a score of 'na' and move to the next column. 00975 scores[i] = na; 00976 continue; 00977 } 00978 //We look for the anchor in the reference alignment: 00979 size_t i2 = 0; 00980 bool found = false; 00981 for (size_t j = 0; !found && j < positions2.getNumberOfColumns(); ++j) { 00982 if (positions2(whichSeq, j) == whichPos) { 00983 i2 = j; 00984 found = true; 00985 } 00986 } 00987 if (!found) { 00988 throw Exception("SiteContainerTools::getColumnScores(). Position " + TextTools::toString(whichPos) + " of sequence " + TextTools::toString(whichSeq) + " not found in reference alignment. Please make sure the two indexes are built from the same data!"); 00989 } 00990 //Now we compare all pairs of sequences between the two positions: 00991 bool test = true; 00992 for (size_t j = 0; test && j < positions1.getNumberOfRows(); ++j) { 00993 test = (positions1(j, i) == positions2(j, i2)); 00994 } 00995 scores[i] = test ? 1 : 0; 00996 } 00997 return scores; 00998 } 00999 01000 /******************************************************************************/ 01001 01002 vector<double> SiteContainerTools::getSumOfPairsScores(const Matrix<size_t>& positions1, const Matrix<size_t>& positions2, double na) 01003 { 01004 if (positions1.getNumberOfRows() != positions2.getNumberOfRows()) 01005 throw Exception("SiteContainerTools::getColumnScores. The two input alignments must have the same number of sequences!"); 01006 vector<double> scores(positions1.getNumberOfColumns()); 01007 for (size_t i = 0; i < positions1.getNumberOfColumns(); ++i) { 01008 //For all positions in alignment 1... 01009 size_t countAlignable = 0; 01010 size_t countAligned = 0; 01011 for (size_t j = 0; j < positions1.getNumberOfRows(); ++j) { 01012 //Get the corresponding column in alignment 2: 01013 size_t whichPos = positions1(j, i); 01014 if (whichPos == 0) { 01015 //No position for this sequence here. 01016 continue; 01017 } 01018 //We look for the position in the second alignment: 01019 size_t i2 = 0; 01020 bool found = false; 01021 for (size_t k = 0; !found && k < positions2.getNumberOfColumns(); ++k) { 01022 if (positions2(j, k) == whichPos) { 01023 i2 = k; 01024 found = true; 01025 } 01026 } 01027 if (!found) { 01028 throw Exception("SiteContainerTools::getColumnScores(). Position " + TextTools::toString(whichPos) + " of sequence " + TextTools::toString(j) + " not found in reference alignment. Please make sure the two indexes are built from the same data!"); 01029 } 01030 01031 //Now we check all other positions and see if they are aligned with this one: 01032 for (size_t k = j + 1; k < positions1.getNumberOfRows(); ++k) { 01033 size_t whichPos2 = positions1(k, i); 01034 if (whichPos2 == 0) { 01035 //Empty position 01036 continue; 01037 } 01038 countAlignable++; 01039 //check position in alignment 2: 01040 if (positions2(k, i2) == whichPos2) 01041 countAligned++; 01042 } 01043 } 01044 scores[i] = countAlignable == 0 ? na : static_cast<double>(countAligned) / static_cast<double>(countAlignable); 01045 } 01046 return scores; 01047 } 01048 01049 /******************************************************************************/ 01050