|
bpp-seq 2.0.3
|
00001 // 00002 // File: MafIterator.cpp 00003 // Authors: Julien Dutheil 00004 // Created: Tue Sep 07 2010 00005 // 00006 00007 /* 00008 Copyright or © or Copr. Bio++ Development Team, (2010) 00009 00010 This software is a computer program whose purpose is to provide classes 00011 for sequences analysis. 00012 00013 This software is governed by the CeCILL license under French law and 00014 abiding by the rules of distribution of free software. You can use, 00015 modify and/ or redistribute the software under the terms of the CeCILL 00016 license as circulated by CEA, CNRS and INRIA at the following URL 00017 "http://www.cecill.info". 00018 00019 As a counterpart to the access to the source code and rights to copy, 00020 modify and redistribute granted by the license, users are provided only 00021 with a limited warranty and the software's author, the holder of the 00022 economic rights, and the successive licensors have only limited 00023 liability. 00024 00025 In this respect, the user's attention is drawn to the risks associated 00026 with loading, using, modifying and/or developing or reproducing the 00027 software by the user in light of its specific status of free software, 00028 that may mean that it is complicated to manipulate, and that also 00029 therefore means that it is reserved for developers and experienced 00030 professionals having in-depth computer knowledge. Users are therefore 00031 encouraged to load and test the software's suitability as regards their 00032 requirements in conditions enabling the security of their systems and/or 00033 data to be ensured and, more generally, to use and operate it in the 00034 same conditions as regards security. 00035 00036 The fact that you are presently reading this means that you have had 00037 knowledge of the CeCILL license and that you accept its terms. 00038 */ 00039 00040 #include "MafIterator.h" 00041 #include "IterationListener.h" 00042 #include "../../SequenceWithQuality.h" 00043 #include "../../SequenceWithAnnotationTools.h" 00044 #include "../../SequenceWalker.h" 00045 #include "../../Alphabet/AlphabetTools.h" 00046 #include "../../Container/VectorSiteContainer.h" 00047 #include "../../SiteTools.h" 00048 00049 using namespace bpp; 00050 00051 //From the STL: 00052 #include <string> 00053 00054 using namespace std; 00055 00056 void AbstractMafIterator::fireIterationStartSignal_() { 00057 for (std::vector<IterationListener*>::iterator it = iterationListeners_.begin(); it != iterationListeners_.end(); ++it) { 00058 (*it)->iterationStarts(); 00059 } 00060 } 00061 00062 void AbstractMafIterator::fireIterationMoveSignal_(const MafBlock& currentBlock) { 00063 for (std::vector<IterationListener*>::iterator it = iterationListeners_.begin(); it != iterationListeners_.end(); ++it) { 00064 (*it)->iterationMoves(currentBlock); 00065 } 00066 } 00067 00068 void AbstractMafIterator::fireIterationStopSignal_() { 00069 for (std::vector<IterationListener*>::iterator it = iterationListeners_.begin(); it != iterationListeners_.end(); ++it) { 00070 (*it)->iterationStops(); 00071 } 00072 } 00073 00074 MafBlock* SequenceFilterMafIterator::analyseCurrentBlock_() throw (Exception) 00075 { 00076 currentBlock_ = iterator_->nextBlock(); 00077 while (currentBlock_) { 00078 map<string, unsigned int> counts; 00079 for (size_t i = currentBlock_->getNumberOfSequences(); i > 0; --i) { 00080 string species = currentBlock_->getSequence(i-1).getSpecies(); 00081 if (!VectorTools::contains(species_, species)) { 00082 if (logstream_) { 00083 (*logstream_ << "SEQUENCE FILTER: remove sequence '" << species << "' from current block " << currentBlock_->getDescription() << ".").endLine(); 00084 } 00085 currentBlock_->getAlignment().deleteSequence(i - 1); 00086 } else { 00087 counts[species]++; 00088 } 00089 } 00090 bool test = currentBlock_->getNumberOfSequences() == 0; 00091 //Avoid a memory leak: 00092 if (test) { 00093 if (logstream_) { 00094 (*logstream_ << "SEQUENCE FILTER: block " << currentBlock_->getDescription() << " is now empty. Try to get the next one.").endLine(); 00095 } 00096 delete currentBlock_; 00097 } else { 00098 test = strict_ && (counts.size() != species_.size()); 00099 if (test) { 00100 if (logstream_) { 00101 (*logstream_ << "SEQUENCE FILTER: block " << currentBlock_->getDescription() << " does not contain all species and will be ignored. Try to get the next one.").endLine(); 00102 } 00103 delete currentBlock_; 00104 } else { 00105 if (rmDuplicates_) { 00106 test = false; 00107 map<string, unsigned int>::iterator it; 00108 for (it = counts.begin(); it != counts.end() && !(test = it->second > 1); it++) {} 00109 if (test) { 00110 if (logstream_) { 00111 (*logstream_ << "SEQUENCE FILTER: block " << currentBlock_->getDescription() << " has two sequences for species '" << it->first << "' and will be ignored. Try to get the next one.").endLine(); 00112 } 00113 delete currentBlock_; 00114 } else { 00115 return currentBlock_; 00116 } 00117 } else { 00118 return currentBlock_; 00119 } 00120 } 00121 } 00122 00123 //Look for the next block: 00124 currentBlock_ = iterator_->nextBlock(); 00125 } 00126 00127 return currentBlock_; 00128 } 00129 00130 MafBlock* ChromosomeMafIterator::analyseCurrentBlock_() throw (Exception) 00131 { 00132 currentBlock_ = iterator_->nextBlock(); 00133 while (currentBlock_) { 00134 bool foundRef = false; 00135 string chr = ""; 00136 for (size_t i = 0; i < currentBlock_->getNumberOfSequences() && !foundRef; ++i) { 00137 string species = currentBlock_->getSequence(i).getSpecies(); 00138 if (species == ref_) { 00139 foundRef = true; 00140 chr = currentBlock_->getSequence(i).getChromosome(); 00141 } 00142 } 00143 if (!foundRef) { 00144 if (logstream_) { 00145 (*logstream_ << "CHROMOSOME FILTER: block does not contain reference species and was removed.").endLine(); 00146 } 00147 delete currentBlock_; 00148 } else if (chr != chr_) { 00149 if (logstream_) { 00150 (*logstream_ << "CHROMOSOME FILTER: reference species without queried chromosome was removed.").endLine(); 00151 } 00152 delete currentBlock_; 00153 } else { 00154 return currentBlock_; 00155 } 00156 00157 //Look for the next block: 00158 currentBlock_ = iterator_->nextBlock(); 00159 } 00160 00161 return currentBlock_; 00162 } 00163 00164 MafBlock* DuplicateFilterMafIterator::analyseCurrentBlock_() throw (Exception) 00165 { 00166 currentBlock_ = iterator_->nextBlock(); 00167 while (currentBlock_) { 00168 bool foundRef = false; 00169 string chr = ""; 00170 char strand = '+'; 00171 unsigned int start = 0; 00172 unsigned int stop = 0; 00173 for (size_t i = 0; i < currentBlock_->getNumberOfSequences() && !foundRef; ++i) { 00174 string species = currentBlock_->getSequence(i).getSpecies(); 00175 if (species == ref_) { 00176 foundRef = true; 00177 chr = currentBlock_->getSequence(i).getChromosome(); 00178 strand = currentBlock_->getSequence(i).getStrand(); 00179 start = currentBlock_->getSequence(i).start(); 00180 stop = currentBlock_->getSequence(i).stop(); 00181 } 00182 } 00183 if (!foundRef) { 00184 if (logstream_) { 00185 (*logstream_ << "DUPLICATE FILTER: block does not contain reference species and was removed.").endLine(); 00186 } 00187 delete currentBlock_; 00188 } else { 00189 unsigned int occurrence = blocks_[chr][strand][start][stop]++; 00190 if (occurrence > 0) { 00191 if (logstream_) { 00192 (*logstream_ << "DUPLICATE FILTER: sequence in reference species was found in a previous block. New block was removed.").endLine(); 00193 } 00194 delete currentBlock_; 00195 } else { 00196 return currentBlock_; 00197 } 00198 } 00199 00200 //Look for the next block: 00201 currentBlock_ = iterator_->nextBlock(); 00202 } 00203 00204 return currentBlock_; 00205 } 00206 00207 MafBlock* BlockMergerMafIterator::analyseCurrentBlock_() throw (Exception) 00208 { 00209 if (!incomingBlock_) return 0; 00210 currentBlock_ = incomingBlock_; 00211 incomingBlock_ = iterator_->nextBlock(); 00212 while (incomingBlock_) { 00213 unsigned int globalSpace = 0; 00214 for (unsigned int i = 0; i < species_.size(); ++i) { 00215 try { 00216 const MafSequence* seq1 = ¤tBlock_->getSequenceForSpecies(species_[i]); 00217 const MafSequence* seq2 = &incomingBlock_->getSequenceForSpecies(species_[i]); 00218 if (!seq1->hasCoordinates() || !seq2->hasCoordinates()) 00219 throw Exception("BlockMergerMafIterator::nextBlock. Species '" + species_[i] + "' is missing coordinates in at least one block."); 00220 00221 if (seq1->stop() >= seq2->start()) 00222 return currentBlock_; 00223 unsigned int space = seq2->start() - seq1->stop() - 1; 00224 if (space > maxDist_) 00225 return currentBlock_; 00226 if (i == 0) 00227 globalSpace = space; 00228 else { 00229 if (space != globalSpace) 00230 return currentBlock_; 00231 } 00232 if (seq1->getChromosome() != seq2->getChromosome() 00233 || VectorTools::contains(ignoreChrs_, seq1->getChromosome()) 00234 || VectorTools::contains(ignoreChrs_, seq2->getChromosome()) 00235 || seq1->getStrand() != seq2->getStrand() 00236 || seq1->getSrcSize() != seq2->getSrcSize()) 00237 { 00238 //There is a syntheny break in this sequence, so we do not merge the blocks. 00239 return currentBlock_; 00240 } 00241 } catch (SequenceNotFoundException& snfe) { 00242 //At least one block does not contain the sequence. 00243 //We don't merge the blocks: 00244 return currentBlock_; 00245 } 00246 } 00247 //We merge the two blocks: 00248 if (logstream_) { 00249 (*logstream_ << "BLOCK MERGER: merging two consecutive blocks.").endLine(); 00250 } 00251 vector<string> sp1 = currentBlock_->getSpeciesList(); 00252 vector<string> sp2 = incomingBlock_->getSpeciesList(); 00253 vector<string> allSp = VectorTools::unique(VectorTools::vectorUnion(sp1, sp2)); 00254 //We need to create a new MafBlock: 00255 MafBlock* mergedBlock = new MafBlock(); 00256 //We average the score and pass values: 00257 unsigned int p1 = currentBlock_->getPass(); 00258 unsigned int p2 = incomingBlock_->getPass(); 00259 if (p1 == p2) mergedBlock->setPass(p1); 00260 double s1 = currentBlock_->getScore(); 00261 double n1 = static_cast<double>(currentBlock_->getNumberOfSites()); 00262 double s2 = incomingBlock_->getScore(); 00263 double n2 = static_cast<double>(incomingBlock_->getNumberOfSites()); 00264 mergedBlock->setScore((s1 * n1 + s2 * n2) / (n1 + n2)); 00265 00266 //Now fill the new block: 00267 for (size_t i = 0; i < allSp.size(); ++i) { 00268 auto_ptr<MafSequence> seq; 00269 try { 00270 seq.reset(new MafSequence(currentBlock_->getSequenceForSpecies(allSp[i]))); 00271 00272 //Check is there is a second sequence: 00273 try { 00274 auto_ptr<MafSequence> tmp(new MafSequence(incomingBlock_->getSequenceForSpecies(allSp[i]))); 00275 string ref1 = seq->getDescription(), ref2 = tmp->getDescription(); 00276 //Add spacer if needed: 00277 if (globalSpace > 0) { 00278 if (logstream_) { 00279 (*logstream_ << "BLOCK MERGER: a spacer of size " << globalSpace <<" is inserted in sequence for species " << allSp[i] << ".").endLine(); 00280 } 00281 seq->append(vector<int>(globalSpace, AlphabetTools::DNA_ALPHABET.getUnknownCharacterCode())); 00282 } 00283 if (seq->getChromosome() != tmp->getChromosome()) { 00284 seq->setChromosome(seq->getChromosome() + "-" + tmp->getChromosome()); 00285 seq->removeCoordinates(); 00286 } 00287 if (seq->getStrand() != tmp->getStrand()) { 00288 seq->setStrand('?'); 00289 seq->removeCoordinates(); 00290 } 00291 if (seq->getName() != tmp->getName()) 00292 tmp->setName(seq->getName()); //force name conversion to prevent exception in 'merge'. 00293 seq->merge(*tmp); 00294 if (logstream_) { 00295 (*logstream_ << "BLOCK MERGER: merging " << ref1 << " with " << ref2 << " into " << seq->getDescription()).endLine(); 00296 } 00297 } catch (SequenceNotFoundException& snfe2) { 00298 //There was a first sequence, we just extend it: 00299 string ref1 = seq->getDescription(); 00300 seq->setToSizeR(seq->size() + incomingBlock_->getNumberOfSites() + globalSpace); 00301 if (logstream_) { 00302 (*logstream_ << "BLOCK MERGER: extending " << ref1 << " with " << incomingBlock_->getNumberOfSites() << " gaps on the right.").endLine(); 00303 } 00304 } 00305 } catch (SequenceNotFoundException& snfe1) { 00306 //There must be a second sequence then: 00307 seq.reset(new MafSequence(incomingBlock_->getSequenceForSpecies(allSp[i]))); 00308 string ref2 = seq->getDescription(); 00309 seq->setToSizeL(seq->size() + currentBlock_->getNumberOfSites() + globalSpace); 00310 if (logstream_) { 00311 (*logstream_ << "BLOCK MERGER: adding " << ref2 << " and extend it with " << currentBlock_->getNumberOfSites() << " gaps on the left.").endLine(); 00312 } 00313 } 00314 mergedBlock->addSequence(*seq); 00315 } 00316 //Cleaning stuff: 00317 delete currentBlock_; 00318 delete incomingBlock_; 00319 currentBlock_ = mergedBlock; 00320 //We check if we can also merge the next block: 00321 incomingBlock_ = iterator_->nextBlock(); 00322 } 00323 return currentBlock_; 00324 } 00325 00326 MafBlock* FullGapFilterMafIterator::analyseCurrentBlock_() throw (Exception) 00327 { 00328 MafBlock* block = iterator_->nextBlock(); 00329 if (!block) return 0; 00330 00331 //We create a copy of the ingroup alignement for better efficiency: 00332 VectorSiteContainer vsc(&AlphabetTools::DNA_ALPHABET); 00333 for (size_t i = 0; i < species_.size(); ++i) { 00334 vsc.addSequence(block->getSequence(i)); 00335 } 00336 //Now check the positions that are only made of gaps: 00337 if (verbose_) { 00338 ApplicationTools::message->endLine(); 00339 ApplicationTools::displayTask("Cleaning block for gap sites", true); 00340 } 00341 unsigned int n = block->getNumberOfSites(); 00342 vector <unsigned int> start; 00343 vector <unsigned int> count; 00344 bool test = false; 00345 for(unsigned int i = 0; i < n; ++i) { 00346 const Site* site = &vsc.getSite(i); 00347 if (SiteTools::isGapOnly(*site)) { 00348 if (test) { 00349 count[count.size() - 1]++; 00350 } else { 00351 start.push_back(i); 00352 count.push_back(1); 00353 test = true; 00354 } 00355 } else { 00356 test = false; 00357 } 00358 } 00359 //Now remove blocks: 00360 unsigned int totalRemoved = 0; 00361 for(size_t i = start.size(); i > 0; --i) { 00362 if (verbose_) 00363 ApplicationTools::displayGauge(start.size() - i, start.size() - 1, '='); 00364 block->getAlignment().deleteSites(start[i - 1], count[i - 1]); 00365 totalRemoved += count[i - 1]; 00366 } 00367 if (verbose_) 00368 ApplicationTools::displayTaskDone(); 00369 00370 //Correct coordinates: 00371 if (totalRemoved > 0) { 00372 for (unsigned int i = 0; i < block->getNumberOfSequences(); ++i) { 00373 const MafSequence* seq = &block->getSequence(i); 00374 if (!VectorTools::contains(species_, seq->getSpecies())) { 00375 block->removeCoordinatesFromSequence(i); 00376 } 00377 } 00378 } 00379 if (logstream_) { 00380 (*logstream_ << "FULL GAP CLEANER: " << totalRemoved << " positions have been removed.").endLine(); 00381 } 00382 return block; 00383 } 00384 00385 MafBlock* AlignmentFilterMafIterator::analyseCurrentBlock_() throw (Exception) 00386 { 00387 if (blockBuffer_.size() == 0) { 00388 //Else there is no more block in the buffer, we need to parse more: 00389 do { 00390 MafBlock* block = iterator_->nextBlock(); 00391 if (!block) return 0; //No more block. 00392 00393 //Parse block. 00394 int gap = AlphabetTools::DNA_ALPHABET.getGapCharacterCode(); 00395 int unk = AlphabetTools::DNA_ALPHABET.getUnknownCharacterCode(); 00396 size_t nr = species_.size(); 00397 vector< vector<int> > aln(nr); 00398 for (size_t i = 0; i < nr; ++i) { 00399 aln[i] = block->getSequenceForSpecies(species_[i]).getContent(); 00400 } 00401 size_t nc = block->getNumberOfSites(); 00402 //First we create a mask: 00403 vector<unsigned int> pos; 00404 vector<bool> col(nr); 00405 //Reset window: 00406 window_.clear(); 00407 //Init window: 00408 size_t i; 00409 for (i = 0; i < windowSize_; ++i) { 00410 for (size_t j = 0; j < nr; ++j) { 00411 col[j] = (aln[j][i] == gap || aln[j][i] == unk); 00412 } 00413 window_.push_back(col); 00414 } 00415 //Slide window: 00416 if (verbose_) { 00417 ApplicationTools::message->endLine(); 00418 ApplicationTools::displayTask("Sliding window for alignment filter", true); 00419 } 00420 while (i + step_ < nc) { 00421 if (verbose_) 00422 ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1, '>'); 00423 //Evaluate current window: 00424 unsigned int sum = 0; 00425 for (size_t u = 0; u < window_.size(); ++u) 00426 for (size_t v = 0; v < window_[u].size(); ++v) 00427 if (window_[u][v]) sum++; 00428 if (sum > maxGap_) { 00429 if (pos.size() == 0) { 00430 pos.push_back(i - windowSize_); 00431 pos.push_back(i); 00432 } else { 00433 if (i - windowSize_ < pos[pos.size() - 1]) { 00434 pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region 00435 } else { //This is a new region 00436 pos.push_back(i - windowSize_); 00437 pos.push_back(i); 00438 } 00439 } 00440 } 00441 00442 //Move forward: 00443 for (unsigned int k = 0; k < step_; ++k) { 00444 for (size_t j = 0; j < nr; ++j) { 00445 col[j] = (aln[j][i] == gap || aln[j][i] == unk); 00446 } 00447 window_.push_back(col); 00448 window_.pop_front(); 00449 ++i; 00450 } 00451 } 00452 00453 //Evaluate last window: 00454 unsigned int sum = 0; 00455 for (size_t u = 0; u < window_.size(); ++u) 00456 for (size_t v = 0; v < window_[u].size(); ++v) 00457 if (window_[u][v]) sum++; 00458 if (sum > maxGap_) { 00459 if (pos.size() == 0) { 00460 pos.push_back(i - windowSize_); 00461 pos.push_back(i); 00462 } else { 00463 if (i - windowSize_ <= pos[pos.size() - 1]) { 00464 pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region 00465 } else { //This is a new region 00466 pos.push_back(i - windowSize_); 00467 pos.push_back(i); 00468 } 00469 } 00470 } 00471 if (verbose_) 00472 ApplicationTools::displayTaskDone(); 00473 00474 //Now we remove regions with two many gaps, using a sliding window: 00475 if (pos.size() == 0) { 00476 blockBuffer_.push_back(block); 00477 if (logstream_) { 00478 (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " is clean and kept as is.").endLine(); 00479 } 00480 } else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) { 00481 //Everything is removed: 00482 if (logstream_) { 00483 (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " was entirely removed. Tried to get the next one.").endLine(); 00484 } 00485 } else { 00486 if (logstream_) { 00487 (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine(); 00488 } 00489 if (verbose_) { 00490 ApplicationTools::message->endLine(); 00491 ApplicationTools::displayTask("Spliting block", true); 00492 } 00493 for (i = 0; i < pos.size(); i+=2) { 00494 if (verbose_) 00495 ApplicationTools::displayGauge(i, pos.size() - 2, '='); 00496 if (logstream_) { 00497 (*logstream_ << "ALN CLEANER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block " << block->getDescription() << ".").endLine(); 00498 } 00499 if (pos[i] > 0) { 00500 MafBlock* newBlock = new MafBlock(); 00501 newBlock->setScore(block->getScore()); 00502 newBlock->setPass(block->getPass()); 00503 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) { 00504 MafSequence* subseq; 00505 if (i == 0) { 00506 subseq = block->getSequence(j).subSequence(0, pos[i]); 00507 } else { 00508 subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]); 00509 } 00510 newBlock->addSequence(*subseq); 00511 delete subseq; 00512 } 00513 blockBuffer_.push_back(newBlock); 00514 } 00515 00516 if (keepTrashedBlocks_) { 00517 MafBlock* outBlock = new MafBlock(); 00518 outBlock->setScore(block->getScore()); 00519 outBlock->setPass(block->getPass()); 00520 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) { 00521 MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]); 00522 outBlock->addSequence(*outseq); 00523 delete outseq; 00524 } 00525 trashBuffer_.push_back(outBlock); 00526 } 00527 } 00528 //Add last block: 00529 if (pos[pos.size() - 1] < block->getNumberOfSites()) { 00530 MafBlock* newBlock = new MafBlock(); 00531 newBlock->setScore(block->getScore()); 00532 newBlock->setPass(block->getPass()); 00533 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) { 00534 MafSequence* subseq; 00535 subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]); 00536 newBlock->addSequence(*subseq); 00537 delete subseq; 00538 } 00539 blockBuffer_.push_back(newBlock); 00540 } 00541 if (verbose_) 00542 ApplicationTools::displayTaskDone(); 00543 00544 delete block; 00545 } 00546 } while (blockBuffer_.size() == 0); 00547 } 00548 00549 MafBlock* block = blockBuffer_.front(); 00550 blockBuffer_.pop_front(); 00551 return block; 00552 } 00553 00554 MafBlock* AlignmentFilter2MafIterator::analyseCurrentBlock_() throw (Exception) 00555 { 00556 if (blockBuffer_.size() == 0) { 00557 //Else there is no more block in the buffer, we need to parse more: 00558 do { 00559 MafBlock* block = iterator_->nextBlock(); 00560 if (!block) return 0; //No more block. 00561 00562 //Parse block. 00563 int gap = AlphabetTools::DNA_ALPHABET.getGapCharacterCode(); 00564 int unk = AlphabetTools::DNA_ALPHABET.getUnknownCharacterCode(); 00565 size_t nr = species_.size(); 00566 vector< vector<int> > aln(nr); 00567 for (size_t i = 0; i < nr; ++i) { 00568 aln[i] = block->getSequenceForSpecies(species_[i]).getContent(); 00569 } 00570 size_t nc = block->getNumberOfSites(); 00571 //First we create a mask: 00572 vector<unsigned int> pos; 00573 vector<bool> col(nr); 00574 //Reset window: 00575 window_.clear(); 00576 //Init window: 00577 size_t i; 00578 for (i = 0; i < windowSize_; ++i) { 00579 for (size_t j = 0; j < nr; ++j) { 00580 col[j] = (aln[j][i] == gap || aln[j][i] == unk); 00581 } 00582 window_.push_back(col); 00583 } 00584 //Slide window: 00585 if (verbose_) { 00586 ApplicationTools::message->endLine(); 00587 ApplicationTools::displayTask("Sliding window for alignment filter", true); 00588 } 00589 while (i + step_ < nc) { 00590 if (verbose_) 00591 ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1, '>'); 00592 //Evaluate current window: 00593 unsigned int count = 0; 00594 for (size_t u = 0; u < window_.size(); ++u) { 00595 unsigned int partialCount = 0; 00596 if (u > 0 && window_[u] != window_[u - 1]) { 00597 for (size_t v = 0; v < window_[u].size(); ++v) 00598 if (window_[u][v]) partialCount++; 00599 if (partialCount > maxGap_) 00600 count++; 00601 } 00602 } 00603 if (count > maxPos_) { 00604 if (pos.size() == 0) { 00605 pos.push_back(i - windowSize_); 00606 pos.push_back(i); 00607 } else { 00608 if (i - windowSize_ < pos[pos.size() - 1]) { 00609 pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region 00610 } else { //This is a new region 00611 pos.push_back(i - windowSize_); 00612 pos.push_back(i); 00613 } 00614 } 00615 } 00616 00617 //Move forward: 00618 for (unsigned int k = 0; k < step_; ++k) { 00619 for (size_t j = 0; j < nr; ++j) { 00620 col[j] = (aln[j][i] == gap || aln[j][i] == unk); 00621 } 00622 window_.push_back(col); 00623 window_.pop_front(); 00624 ++i; 00625 } 00626 } 00627 00628 //Evaluate last window: 00629 unsigned int count = 0; 00630 for (size_t u = 0; u < window_.size(); ++u) { 00631 unsigned int partialCount = 0; 00632 if (u > 0 && window_[u] != window_[u - 1]) { 00633 for (size_t v = 0; v < window_[u].size(); ++v) 00634 if (window_[u][v]) partialCount++; 00635 if (partialCount > maxGap_) 00636 count++; 00637 } 00638 } 00639 if (count > maxPos_) { 00640 if (pos.size() == 0) { 00641 pos.push_back(i - windowSize_); 00642 pos.push_back(i); 00643 } else { 00644 if (i - windowSize_ <= pos[pos.size() - 1]) { 00645 pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region 00646 } else { //This is a new region 00647 pos.push_back(i - windowSize_); 00648 pos.push_back(i); 00649 } 00650 } 00651 } 00652 if (verbose_) 00653 ApplicationTools::displayTaskDone(); 00654 00655 //Now we remove regions with two many gaps, using a sliding window: 00656 if (pos.size() == 0) { 00657 blockBuffer_.push_back(block); 00658 if (logstream_) { 00659 (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " is clean and kept as is.").endLine(); 00660 } 00661 } else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) { 00662 //Everything is removed: 00663 if (logstream_) { 00664 (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " was entirely removed. Tried to get the next one.").endLine(); 00665 } 00666 } else { 00667 if (logstream_) { 00668 (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine(); 00669 } 00670 if (verbose_) { 00671 ApplicationTools::message->endLine(); 00672 ApplicationTools::displayTask("Spliting block", true); 00673 } 00674 for (i = 0; i < pos.size(); i+=2) { 00675 if (verbose_) 00676 ApplicationTools::displayGauge(i, pos.size() - 2, '='); 00677 if (logstream_) { 00678 (*logstream_ << "ALN CLEANER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block " << block->getDescription() << ".").endLine(); 00679 } 00680 if (pos[i] > 0) { 00681 MafBlock* newBlock = new MafBlock(); 00682 newBlock->setScore(block->getScore()); 00683 newBlock->setPass(block->getPass()); 00684 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) { 00685 MafSequence* subseq; 00686 if (i == 0) { 00687 subseq = block->getSequence(j).subSequence(0, pos[i]); 00688 } else { 00689 subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]); 00690 } 00691 newBlock->addSequence(*subseq); 00692 delete subseq; 00693 } 00694 blockBuffer_.push_back(newBlock); 00695 } 00696 00697 if (keepTrashedBlocks_) { 00698 MafBlock* outBlock = new MafBlock(); 00699 outBlock->setScore(block->getScore()); 00700 outBlock->setPass(block->getPass()); 00701 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) { 00702 MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]); 00703 outBlock->addSequence(*outseq); 00704 delete outseq; 00705 } 00706 trashBuffer_.push_back(outBlock); 00707 } 00708 } 00709 //Add last block: 00710 if (pos[pos.size() - 1] < block->getNumberOfSites()) { 00711 MafBlock* newBlock = new MafBlock(); 00712 newBlock->setScore(block->getScore()); 00713 newBlock->setPass(block->getPass()); 00714 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) { 00715 MafSequence* subseq; 00716 subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]); 00717 newBlock->addSequence(*subseq); 00718 delete subseq; 00719 } 00720 blockBuffer_.push_back(newBlock); 00721 } 00722 if (verbose_) 00723 ApplicationTools::displayTaskDone(); 00724 00725 delete block; 00726 } 00727 } while (blockBuffer_.size() == 0); 00728 } 00729 00730 MafBlock* block = blockBuffer_.front(); 00731 blockBuffer_.pop_front(); 00732 return block; 00733 } 00734 00735 MafBlock* MaskFilterMafIterator::analyseCurrentBlock_() throw (Exception) 00736 { 00737 if (blockBuffer_.size() == 0) { 00738 do { 00739 //Else there is no more block in the buffer, we need parse more: 00740 MafBlock* block = iterator_->nextBlock(); 00741 if (!block) return 0; //No more block. 00742 00743 //Parse block. 00744 vector< vector<bool> > aln; 00745 for (size_t i = 0; i < species_.size(); ++i) { 00746 const MafSequence* seq = &block->getSequenceForSpecies(species_[i]); 00747 if (seq->hasAnnotation(SequenceMask::MASK)) { 00748 aln.push_back(dynamic_cast<const SequenceMask&>(seq->getAnnotation(SequenceMask::MASK)).getMask()); 00749 } 00750 } 00751 size_t nr = aln.size(); 00752 size_t nc = block->getNumberOfSites(); 00753 //First we create a mask: 00754 vector<unsigned int> pos; 00755 vector<bool> col(nr); 00756 //Reset window: 00757 window_.clear(); 00758 //Init window: 00759 size_t i; 00760 for (i = 0; i < windowSize_; ++i) { 00761 for (size_t j = 0; j < nr; ++j) { 00762 col[j] = aln[j][i]; 00763 } 00764 window_.push_back(col); 00765 } 00766 //Slide window: 00767 if (verbose_) { 00768 ApplicationTools::message->endLine(); 00769 ApplicationTools::displayTask("Sliding window for mask filter", true); 00770 } 00771 while (i + step_ < nc) { 00772 if (verbose_) 00773 ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1, '>'); 00774 //Evaluate current window: 00775 unsigned int sum = 0; 00776 for (size_t u = 0; u < window_.size(); ++u) 00777 for (size_t v = 0; v < window_[u].size(); ++v) 00778 if (window_[u][v]) sum++; 00779 if (sum > maxMasked_) { 00780 if (pos.size() == 0) { 00781 pos.push_back(i - windowSize_); 00782 pos.push_back(i); 00783 } else { 00784 if (i - windowSize_ <= pos[pos.size() - 1]) { 00785 pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region 00786 } else { //This is a new region 00787 pos.push_back(i - windowSize_); 00788 pos.push_back(i); 00789 } 00790 } 00791 } 00792 00793 //Move forward: 00794 for (unsigned int k = 0; k < step_; ++k) { 00795 for (size_t j = 0; j < nr; ++j) { 00796 col[j] = aln[j][i]; 00797 } 00798 window_.push_back(col); 00799 window_.pop_front(); 00800 ++i; 00801 } 00802 } 00803 00804 //Evaluate last window: 00805 unsigned int sum = 0; 00806 for (size_t u = 0; u < window_.size(); ++u) 00807 for (size_t v = 0; v < window_[u].size(); ++v) 00808 if (window_[u][v]) sum++; 00809 if (sum > maxMasked_) { 00810 if (pos.size() == 0) { 00811 pos.push_back(i - windowSize_); 00812 pos.push_back(i); 00813 } else { 00814 if (i - windowSize_ < pos[pos.size() - 1]) { 00815 pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region 00816 } else { //This is a new region 00817 pos.push_back(i - windowSize_); 00818 pos.push_back(i); 00819 } 00820 } 00821 } 00822 if (verbose_) 00823 ApplicationTools::displayTaskDone(); 00824 00825 //Now we remove regions with two many gaps, using a sliding window: 00826 if (pos.size() == 0) { 00827 blockBuffer_.push_back(block); 00828 if (logstream_) { 00829 (*logstream_ << "MASK CLEANER: block is clean and kept as is.").endLine(); 00830 } 00831 } else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) { 00832 //Everything is removed: 00833 if (logstream_) { 00834 (*logstream_ << "MASK CLEANER: block was entirely removed. Tried to get the next one.").endLine(); 00835 } 00836 } else { 00837 if (logstream_) { 00838 (*logstream_ << "MASK CLEANER: block with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine(); 00839 } 00840 if (verbose_) { 00841 ApplicationTools::message->endLine(); 00842 ApplicationTools::displayTask("Spliting block", true); 00843 } 00844 for (i = 0; i < pos.size(); i+=2) { 00845 if (verbose_) 00846 ApplicationTools::displayGauge(i, pos.size() - 2, '='); 00847 if (logstream_) { 00848 (*logstream_ << "MASK CLEANER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block.").endLine(); 00849 } 00850 if (pos[i] > 0) { 00851 MafBlock* newBlock = new MafBlock(); 00852 newBlock->setScore(block->getScore()); 00853 newBlock->setPass(block->getPass()); 00854 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) { 00855 MafSequence* subseq; 00856 if (i == 0) { 00857 subseq = block->getSequence(j).subSequence(0, pos[i]); 00858 } else { 00859 subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]); 00860 } 00861 newBlock->addSequence(*subseq); 00862 delete subseq; 00863 } 00864 blockBuffer_.push_back(newBlock); 00865 } 00866 00867 if (keepTrashedBlocks_) { 00868 MafBlock* outBlock = new MafBlock(); 00869 outBlock->setScore(block->getScore()); 00870 outBlock->setPass(block->getPass()); 00871 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) { 00872 MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]); 00873 outBlock->addSequence(*outseq); 00874 delete outseq; 00875 } 00876 trashBuffer_.push_back(outBlock); 00877 } 00878 } 00879 //Add last block: 00880 if (pos[pos.size() - 1] < block->getNumberOfSites()) { 00881 MafBlock* newBlock = new MafBlock(); 00882 newBlock->setScore(block->getScore()); 00883 newBlock->setPass(block->getPass()); 00884 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) { 00885 MafSequence* subseq; 00886 subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]); 00887 newBlock->addSequence(*subseq); 00888 delete subseq; 00889 } 00890 blockBuffer_.push_back(newBlock); 00891 } 00892 if (verbose_) 00893 ApplicationTools::displayTaskDone(); 00894 00895 delete block; 00896 } 00897 } while (blockBuffer_.size() == 0); 00898 } 00899 00900 MafBlock* block = blockBuffer_.front(); 00901 blockBuffer_.pop_front(); 00902 return block; 00903 } 00904 00905 MafBlock* QualityFilterMafIterator::analyseCurrentBlock_() throw (Exception) 00906 { 00907 if (blockBuffer_.size() == 0) { 00908 do { 00909 //Else there is no more block in the buffer, we need parse more: 00910 MafBlock* block = iterator_->nextBlock(); 00911 if (!block) return 0; //No more block. 00912 00913 //Parse block. 00914 vector< vector<int> > aln; 00915 for (size_t i = 0; i < species_.size(); ++i) { 00916 const MafSequence* seq = &block->getSequenceForSpecies(species_[i]); 00917 if (seq->hasAnnotation(SequenceQuality::QUALITY_SCORE)) { 00918 aln.push_back(dynamic_cast<const SequenceQuality&>(seq->getAnnotation(SequenceQuality::QUALITY_SCORE)).getScores()); 00919 } 00920 } 00921 if (aln.size() != species_.size()) { 00922 blockBuffer_.push_back(block); 00923 if (logstream_) { 00924 (*logstream_ << "QUAL CLEANER: block is missing quality score for at least one species and will therefore not be filtered.").endLine(); 00925 } 00926 //NB here we could decide to discard the block instead! 00927 } else { 00928 size_t nr = aln.size(); 00929 size_t nc = block->getNumberOfSites(); 00930 //First we create a mask: 00931 vector<unsigned int> pos; 00932 vector<int> col(nr); 00933 //Reset window: 00934 window_.clear(); 00935 //Init window: 00936 size_t i; 00937 for (i = 0; i < windowSize_; ++i) { 00938 for (size_t j = 0; j < nr; ++j) { 00939 col[j] = aln[j][i]; 00940 } 00941 window_.push_back(col); 00942 } 00943 //Slide window: 00944 if (verbose_) { 00945 ApplicationTools::message->endLine(); 00946 ApplicationTools::displayTask("Sliding window for quality filter", true); 00947 } 00948 while (i + step_ < nc) { 00949 if (verbose_) 00950 ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1, '>'); 00951 //Evaluate current window: 00952 double mean = 0; 00953 double n = static_cast<double>(aln.size() * windowSize_); 00954 for (size_t u = 0; u < window_.size(); ++u) 00955 for (size_t v = 0; v < window_[u].size(); ++v) { 00956 mean += (window_[u][v] > 0 ? static_cast<double>(window_[u][v]) : 0.); 00957 if (window_[u][v] == -1) n--; 00958 } 00959 if (n > 0 && (mean / n) < minQual_) { 00960 if (pos.size() == 0) { 00961 pos.push_back(i - windowSize_); 00962 pos.push_back(i); 00963 } else { 00964 if (i - windowSize_ <= pos[pos.size() - 1]) { 00965 pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region 00966 } else { //This is a new region 00967 pos.push_back(i - windowSize_); 00968 pos.push_back(i); 00969 } 00970 } 00971 } 00972 00973 //Move forward: 00974 for (unsigned int k = 0; k < step_; ++k) { 00975 for (size_t j = 0; j < nr; ++j) { 00976 col[j] = aln[j][i]; 00977 } 00978 window_.push_back(col); 00979 window_.pop_front(); 00980 ++i; 00981 } 00982 } 00983 00984 //Evaluate last window: 00985 double mean = 0; 00986 double n = static_cast<double>(aln.size() * windowSize_); 00987 for (size_t u = 0; u < window_.size(); ++u) 00988 for (size_t v = 0; v < window_[u].size(); ++v) { 00989 mean += (window_[u][v] > 0 ? static_cast<double>(window_[u][v]) : 0.); 00990 if (window_[u][v] == -1) n--; 00991 } 00992 if (n > 0 && (mean / n) < minQual_) { 00993 if (pos.size() == 0) { 00994 pos.push_back(i - windowSize_); 00995 pos.push_back(i); 00996 } else { 00997 if (i - windowSize_ < pos[pos.size() - 1]) { 00998 pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region 00999 } else { //This is a new region 01000 pos.push_back(i - windowSize_); 01001 pos.push_back(i); 01002 } 01003 } 01004 } 01005 if (verbose_) 01006 ApplicationTools::displayTaskDone(); 01007 01008 //Now we remove regions with two many gaps, using a sliding window: 01009 if (pos.size() == 0) { 01010 blockBuffer_.push_back(block); 01011 if (logstream_) { 01012 (*logstream_ << "QUAL CLEANER: block is clean and kept as is.").endLine(); 01013 } 01014 } else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) { 01015 //Everything is removed: 01016 if (logstream_) { 01017 (*logstream_ << "QUAL CLEANER: block was entirely removed. Tried to get the next one.").endLine(); 01018 } 01019 } else { 01020 if (logstream_) { 01021 (*logstream_ << "QUAL CLEANER: block with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine(); 01022 } 01023 if (verbose_) { 01024 ApplicationTools::message->endLine(); 01025 ApplicationTools::displayTask("Spliting block", true); 01026 } 01027 for (i = 0; i < pos.size(); i+=2) { 01028 if (verbose_) 01029 ApplicationTools::displayGauge(i, pos.size() - 2, '='); 01030 if (logstream_) { 01031 (*logstream_ << "QUAL CLEANER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block.").endLine(); 01032 } 01033 if (pos[i] > 0) { 01034 MafBlock* newBlock = new MafBlock(); 01035 newBlock->setScore(block->getScore()); 01036 newBlock->setPass(block->getPass()); 01037 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) { 01038 MafSequence* subseq; 01039 if (i == 0) { 01040 subseq = block->getSequence(j).subSequence(0, pos[i]); 01041 } else { 01042 subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]); 01043 } 01044 newBlock->addSequence(*subseq); 01045 delete subseq; 01046 } 01047 blockBuffer_.push_back(newBlock); 01048 } 01049 01050 if (keepTrashedBlocks_) { 01051 MafBlock* outBlock = new MafBlock(); 01052 outBlock->setScore(block->getScore()); 01053 outBlock->setPass(block->getPass()); 01054 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) { 01055 MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]); 01056 outBlock->addSequence(*outseq); 01057 delete outseq; 01058 } 01059 trashBuffer_.push_back(outBlock); 01060 } 01061 } 01062 //Add last block: 01063 if (pos[pos.size() - 1] < block->getNumberOfSites()) { 01064 MafBlock* newBlock = new MafBlock(); 01065 newBlock->setScore(block->getScore()); 01066 newBlock->setPass(block->getPass()); 01067 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) { 01068 MafSequence* subseq; 01069 subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]); 01070 newBlock->addSequence(*subseq); 01071 delete subseq; 01072 } 01073 blockBuffer_.push_back(newBlock); 01074 } 01075 if (verbose_) 01076 ApplicationTools::displayTaskDone(); 01077 01078 delete block; 01079 } 01080 } 01081 } while (blockBuffer_.size() == 0); 01082 } 01083 01084 MafBlock* block = blockBuffer_.front(); 01085 blockBuffer_.pop_front(); 01086 return block; 01087 } 01088 01089 SequenceStatisticsMafIterator::SequenceStatisticsMafIterator(MafIterator* iterator, const std::vector<MafStatistics*> statistics) : 01090 AbstractFilterMafIterator(iterator), 01091 statistics_(statistics), 01092 results_(), 01093 names_() 01094 { 01095 string name; 01096 for (size_t i = 0; i < statistics_.size(); ++i) { 01097 name = statistics_[i]->getShortName(); 01098 vector<string> tags = statistics_[i]->getSupportedTags(); 01099 if (tags.size() > 1) { 01100 for (size_t j = 0; j < tags.size(); ++j) { 01101 names_.push_back(name + "." + tags[j]); 01102 } 01103 } else { 01104 names_.push_back(name); 01105 } 01106 } 01107 results_.resize(names_.size()); 01108 } 01109 01110 MafBlock* SequenceStatisticsMafIterator::analyseCurrentBlock_() throw (Exception) 01111 { 01112 vector<string> tags; 01113 currentBlock_ = iterator_->nextBlock(); 01114 if (currentBlock_) { 01115 size_t k = 0; 01116 for (size_t i = 0; i < statistics_.size(); ++i) { 01117 statistics_[i]->compute(*currentBlock_); 01118 const MafStatisticsResult& result = statistics_[i]->getResult(); 01119 tags = statistics_[i]->getSupportedTags(); 01120 for (size_t j = 0; j < tags.size(); ++j) { 01121 if (result.hasValue(tags[j])) { 01122 results_[k] = result.getValue(tags[j]); 01123 } else { 01124 results_[k] = NumConstants::NaN; 01125 } 01126 k++; 01127 } 01128 } 01129 } 01130 return currentBlock_; 01131 } 01132 01133 MafBlock* FeatureFilterMafIterator::analyseCurrentBlock_() throw (Exception) 01134 { 01135 if (blockBuffer_.size() == 0) { 01136 //Unless there is no more block in the buffer, we need to parse more: 01137 do { 01138 MafBlock* block = iterator_->nextBlock(); 01139 if (!block) return 0; //No more block. 01140 01141 //Check if the block contains the reference species: 01142 if (!block->hasSequenceForSpecies(refSpecies_)) { 01143 if (logstream_) { 01144 (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " does not contain the reference species and was kept as is.").endLine(); 01145 } 01146 return block; 01147 } 01148 01149 //Get the feature ranges for this block: 01150 const MafSequence& refSeq = block->getSequenceForSpecies(refSpecies_); 01151 //first check if there is one (for now we assume that features refer to the chromosome or contig name, with implicit species): 01152 std::map<std::string, MultiRange<unsigned int> >::iterator mr = ranges_.find(refSeq.getChromosome()); 01153 if (mr == ranges_.end()) { 01154 if (logstream_) { 01155 (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " does not contain any feature and was kept as is.").endLine(); 01156 } 01157 return block; 01158 } 01159 //else 01160 MultiRange<unsigned int> mRange = mr->second; 01161 mRange.restrictTo(Range<unsigned int>(refSeq.start(), refSeq.stop() + 1)); 01162 if (mRange.isEmpty()) { 01163 if (logstream_) { 01164 (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " does not contain any feature and was kept as is.").endLine(); 01165 } 01166 return block; 01167 } 01168 std::vector<unsigned int> tmp = mRange.getBounds(); 01169 std::deque<unsigned int> refBounds(tmp.begin(), tmp.end()); 01170 01171 //Now extract corresponding alignments. We use the range to split the original block. 01172 //Only thing to watch out is the coordinates, refering to the ref species... 01173 //A good idea is then to convert those with respect to the given block: 01174 01175 int gap = refSeq.getAlphabet()->getGapCharacterCode(); 01176 long int refPos = static_cast<long int>(refSeq.start()) - 1; 01177 std::vector<size_t> pos; 01178 for (size_t alnPos = 0; alnPos < refSeq.size() && refBounds.size() > 0; ++alnPos) { 01179 if (refSeq[alnPos] != gap) { 01180 refPos++; 01181 //check if this position is a bound: 01182 if (refBounds.front() == static_cast<unsigned int>(refPos)) { 01183 pos.push_back(alnPos); 01184 refBounds.pop_front(); 01185 } 01186 } 01187 } 01188 //Check if the last bound matches the end of the alignment: 01189 if (refBounds.size() > 0 && refBounds.front() == refSeq.stop() + 1) { 01190 pos.push_back(refSeq.size()); 01191 refBounds.pop_front(); 01192 } 01193 01194 if (refBounds.size() > 0) { 01195 VectorTools::print(vector<unsigned int>(refBounds.begin(), refBounds.end())); 01196 throw Exception("FeatureFilterMafIterator::nextBlock(). An error occurred here, " + TextTools::toString(refBounds.size()) + " coordinates are left... this is most likely a bug, please report!"); 01197 } 01198 01199 //Next step is simply to split the black according to the translated coordinates: 01200 if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) { 01201 //Everything is removed: 01202 if (logstream_) { 01203 (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " was entirely removed. Tried to get the next one.").endLine(); 01204 } 01205 } else { 01206 if (logstream_) { 01207 (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine(); 01208 } 01209 if (verbose_) { 01210 ApplicationTools::message->endLine(); 01211 ApplicationTools::displayTask("Spliting block", true); 01212 } 01213 for (size_t i = 0; i < pos.size(); i+=2) { 01214 if (verbose_) 01215 ApplicationTools::displayGauge(i, pos.size() - 2, '='); 01216 if (logstream_) { 01217 (*logstream_ << "FEATURE FILTER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block " << block->getDescription() << ".").endLine(); 01218 } 01219 if (pos[i] > 0) { 01220 MafBlock* newBlock = new MafBlock(); 01221 newBlock->setScore(block->getScore()); 01222 newBlock->setPass(block->getPass()); 01223 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) { 01224 MafSequence* subseq; 01225 if (i == 0) { 01226 subseq = block->getSequence(j).subSequence(0, pos[i]); 01227 } else { 01228 subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]); 01229 } 01230 newBlock->addSequence(*subseq); 01231 delete subseq; 01232 } 01233 blockBuffer_.push_back(newBlock); 01234 } 01235 01236 if (keepTrashedBlocks_) { 01237 MafBlock* outBlock = new MafBlock(); 01238 outBlock->setScore(block->getScore()); 01239 outBlock->setPass(block->getPass()); 01240 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) { 01241 MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]); 01242 outBlock->addSequence(*outseq); 01243 delete outseq; 01244 } 01245 trashBuffer_.push_back(outBlock); 01246 } 01247 } 01248 //Add last block: 01249 if (pos.back() < block->getNumberOfSites()) { 01250 MafBlock* newBlock = new MafBlock(); 01251 newBlock->setScore(block->getScore()); 01252 newBlock->setPass(block->getPass()); 01253 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) { 01254 MafSequence* subseq; 01255 subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]); 01256 newBlock->addSequence(*subseq); 01257 delete subseq; 01258 } 01259 blockBuffer_.push_back(newBlock); 01260 } 01261 if (verbose_) 01262 ApplicationTools::displayTaskDone(); 01263 01264 delete block; 01265 } 01266 } while (blockBuffer_.size() == 0); 01267 } 01268 01269 MafBlock* nxtBlock = blockBuffer_.front(); 01270 blockBuffer_.pop_front(); 01271 return nxtBlock; 01272 } 01273 01274 MafBlock* FeatureExtractor::analyseCurrentBlock_() throw (Exception) 01275 { 01276 if (blockBuffer_.size() == 0) { 01277 //Unless there is no more block in the buffer, we need to parse more: 01278 START: 01279 MafBlock* block = iterator_->nextBlock(); 01280 if (!block) return 0; //No more block. 01281 01282 //Check if the block contains the reference species: 01283 if (!block->hasSequenceForSpecies(refSpecies_)) 01284 goto START; 01285 01286 //Get the feature ranges for this block: 01287 const MafSequence& refSeq = block->getSequenceForSpecies(refSpecies_); 01288 //first check if there is one (for now we assume that features refer to the chromosome or contig name, with implicit species): 01289 std::map<std::string, RangeSet<unsigned int> >::iterator mr = ranges_.find(refSeq.getChromosome()); 01290 if (mr == ranges_.end()) 01291 goto START; 01292 01293 RangeSet<unsigned int> ranges = mr->second; 01294 ranges.restrictTo(Range<unsigned int>(refSeq.start(), refSeq.stop())); 01295 if (ranges.isEmpty()) 01296 goto START; 01297 01298 //We will need to convert to alignment positions, using a sequence walker: 01299 SequenceWalker walker(refSeq); 01300 01301 //Now creates all blocks for all ranges: 01302 if (verbose_) { 01303 ApplicationTools::message->endLine(); 01304 ApplicationTools::displayTask("Extracting annotations", true); 01305 } 01306 if (logstream_) { 01307 (*logstream_ << "FEATURE EXTRACTOR: extracting " << ranges.getSet().size() << " features from block " << block->getDescription() << ".").endLine(); 01308 } 01309 01310 unsigned int i = 0; 01311 for (set<Range<unsigned int>*>::iterator it = ranges.getSet().begin(); 01312 it != ranges.getSet().end(); 01313 ++it) 01314 { 01315 if (verbose_) { 01316 ApplicationTools::displayGauge(i++, ranges.getSet().size() - 1, '='); 01317 } 01318 MafBlock* newBlock = new MafBlock(); 01319 newBlock->setScore(block->getScore()); 01320 newBlock->setPass(block->getPass()); 01321 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) { 01322 auto_ptr<MafSequence> subseq; 01323 unsigned int a = walker.getAlignmentPosition((**it).begin() - refSeq.start()); 01324 unsigned int b = walker.getAlignmentPosition((**it).end() - refSeq.start() - 1); 01325 subseq.reset(block->getSequence(j).subSequence(a, b - a + 1)); 01326 if (!ignoreStrand_) { 01327 if (dynamic_cast<SeqRange*>(*it)->isNegativeStrand()) { 01328 SequenceTools::invertComplement(*subseq); 01329 } 01330 } 01331 newBlock->addSequence(*subseq); 01332 } 01333 blockBuffer_.push_back(newBlock); 01334 } 01335 01336 if (verbose_) 01337 ApplicationTools::displayTaskDone(); 01338 01339 delete block; 01340 } 01341 01342 MafBlock* nxtBlock = blockBuffer_.front(); 01343 blockBuffer_.pop_front(); 01344 return nxtBlock; 01345 } 01346 01347 01348 void OutputMafIterator::writeHeader(std::ostream& out) const 01349 { 01350 out << "##maf version=1 program=Bio++" << endl << "#" << endl; 01351 //There are more options in the header that we may want to support... 01352 } 01353 01354 void OutputMafIterator::writeBlock(std::ostream& out, const MafBlock& block) const 01355 { 01356 out << "a"; 01357 if (! isinf(block.getScore())) 01358 out << " score=" << block.getScore(); 01359 if (block.getPass() > 0) 01360 out << " pass=" << block.getPass(); 01361 out << endl; 01362 01363 //Now we write sequences. First need to count characters for aligning blocks: 01364 size_t mxcSrc = 0, mxcStart = 0, mxcSize = 0, mxcSrcSize = 0; 01365 for (unsigned int i = 0; i < block.getNumberOfSequences(); i++) { 01366 const MafSequence* seq = &block.getSequence(i); 01367 unsigned int start = 0; //Maybe we should output sthg else here? 01368 if (seq->hasCoordinates()) 01369 start = seq->start(); 01370 mxcSrc = max(mxcSrc , seq->getName().size()); 01371 mxcStart = max(mxcStart , TextTools::toString(start).size()); 01372 mxcSize = max(mxcSize , TextTools::toString(seq->getGenomicSize()).size()); 01373 mxcSrcSize = max(mxcSrcSize, TextTools::toString(seq->getSrcSize()).size()); 01374 } 01375 //Now print each sequence: 01376 for (unsigned int i = 0; i < block.getNumberOfSequences(); i++) { 01377 const MafSequence* seq = &block.getSequence(i); 01378 out << "s "; 01379 out << TextTools::resizeRight(seq->getName(), mxcSrc, ' ') << " "; 01380 unsigned int start = 0; //Maybe we should output sthg else here? 01381 if (seq->hasCoordinates()) 01382 start = seq->start(); 01383 out << TextTools::resizeLeft(TextTools::toString(start), mxcStart, ' ') << " "; 01384 out << TextTools::resizeLeft(TextTools::toString(seq->getGenomicSize()), mxcSize, ' ') << " "; 01385 out << seq->getStrand() << " "; 01386 out << TextTools::resizeLeft(TextTools::toString(seq->getSrcSize()), mxcSrcSize, ' ') << " "; 01387 //Shall we write the sequence as masked? 01388 string seqstr = seq->toString(); 01389 if (mask_ && seq->hasAnnotation(SequenceMask::MASK)) { 01390 const SequenceMask* mask = &dynamic_cast<const SequenceMask&>(seq->getAnnotation(SequenceMask::MASK)); 01391 for (unsigned int j = 0; j < seqstr.size(); ++j) { 01392 char c = ((*mask)[j] ? tolower(seqstr[j]) : seqstr[j]); 01393 out << c; 01394 } 01395 } else { 01396 out << seqstr; 01397 } 01398 out << endl; 01399 //Write quality scores if any: 01400 if (mask_ && seq->hasAnnotation(SequenceQuality::QUALITY_SCORE)) { 01401 const SequenceQuality* qual = &dynamic_cast<const SequenceQuality&>(seq->getAnnotation(SequenceQuality::QUALITY_SCORE)); 01402 out << "q "; 01403 out << TextTools::resizeRight(seq->getName(), mxcSrc + mxcStart + mxcSize + mxcSrcSize + 5, ' ') << " "; 01404 string qualStr; 01405 for (unsigned int j = 0; j < seq->size(); ++j) { 01406 int s = (*qual)[j]; 01407 if (s == -1) { 01408 qualStr += "-"; 01409 } else if (s == -2) { 01410 qualStr += "?"; 01411 } else if (s >=0 && s < 10) { 01412 qualStr += TextTools::toString(s); 01413 } else if (s == 10) { 01414 qualStr += "F"; 01415 } else { 01416 throw Exception("MafAlignmentParser::writeBlock. Unsuported score value: " + TextTools::toString(s)); 01417 } 01418 } 01419 out << qualStr << endl; 01420 } 01421 } 01422 out << endl; 01423 } 01424 01425 void OutputAlignmentMafIterator::writeBlock(std::ostream& out, const MafBlock& block) const { 01426 //First get alignment: 01427 AlignedSequenceContainer aln(block.getAlignment()); 01428 //Format sequence names: 01429 vector<string> names(aln.getNumberOfSequences()); 01430 for (unsigned int i = 0; i < aln.getNumberOfSequences(); ++i) { 01431 const MafSequence& mafseq = block.getSequence(i); 01432 names[i] = mafseq.getSpecies() + "-" + mafseq.getChromosome() + "(" + mafseq.getStrand() + ")/" + TextTools::toString(mafseq.start() + 1) + "-" + TextTools::toString(mafseq.stop() + 1); 01433 } 01434 aln.setSequencesNames(names); 01435 writer_.write(out, aln); 01436 } 01437 01438 const short WindowSplitMafIterator::RAGGED_LEFT = 0; 01439 const short WindowSplitMafIterator::RAGGED_RIGHT = 1; 01440 const short WindowSplitMafIterator::CENTER = 2; 01441 const short WindowSplitMafIterator::ADJUST= 3; 01442 01443 MafBlock* WindowSplitMafIterator::analyseCurrentBlock_() throw (Exception) 01444 { 01445 while (blockBuffer_.size() == 0) { 01446 //Build a new series of windows: 01447 MafBlock* block = iterator_->nextBlock(); 01448 if (!block) return 0; //No more block. 01449 01450 unsigned int pos = 0; 01451 unsigned int size = windowSize_; 01452 unsigned int bSize = block->getNumberOfSites(); 01453 01454 switch (align_) { 01455 case (RAGGED_RIGHT) : { pos = bSize % windowSize_; break; } 01456 case (CENTER) : { pos = (bSize % windowSize_) / 2; break; } 01457 case (ADJUST) : { 01458 unsigned int x = bSize / windowSize_; 01459 if (x == 0) size = bSize; 01460 else size = bSize / x; 01461 break; 01462 } 01463 default : { } 01464 } 01465 //cout << "Effective size: " << size << endl; 01466 for(unsigned int i = pos; i < bSize; i += size) { 01467 MafBlock* newBlock = new MafBlock(); 01468 newBlock->setScore(block->getScore()); 01469 newBlock->setPass(block->getPass()); 01470 if (align_ == ADJUST) { 01471 if (bSize - (i + size) > 0 && bSize - (i + size) < size) { 01472 //cout << "Old size: " << size; 01473 size = bSize - i; //Adjust for last block because of rounding. 01474 //this should not increase size by more than 1! 01475 //cout << " => new size: " << size << endl; 01476 } 01477 } 01478 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) { 01479 auto_ptr<MafSequence> subseq(block->getSequence(j).subSequence(i, size)); 01480 newBlock->addSequence(*subseq); 01481 } 01482 blockBuffer_.push_back(newBlock); 01483 } 01484 } 01485 01486 MafBlock* nxtBlock = blockBuffer_.front(); 01487 blockBuffer_.pop_front(); 01488 return nxtBlock; 01489 } 01490