00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040 #include "MafIterator.h"
00041 #include "../SequenceWithQuality.h"
00042 #include "../SequenceWithAnnotationTools.h"
00043 #include "../SequenceWalker.h"
00044 #include "../Alphabet/AlphabetTools.h"
00045 #include "../Container/VectorSiteContainer.h"
00046 #include "../SiteTools.h"
00047
00048 using namespace bpp;
00049
00050
00051 #include <string>
00052
00053 using namespace std;
00054
00055 MafSequence* MafSequence::subSequence(unsigned int startAt, unsigned int length) const
00056 {
00057 string subseq = toString().substr(startAt, length);
00058 unsigned int begin = begin_;
00059 if (hasCoordinates_) {
00060 for (unsigned int i = 0; i < startAt; ++i) {
00061 if (! getAlphabet()->isGap(operator[](i))) begin++;
00062 }
00063 }
00064 MafSequence* newSeq = new MafSequence(getName(), subseq, begin, strand_, srcSize_);
00065 if (!hasCoordinates_)
00066 newSeq->removeCoordinates();
00067 vector<string> anno = getAnnotationTypes();
00068 for (size_t i = 0; i < anno.size(); ++i) {
00069 newSeq->addAnnotation(getAnnotation(anno[i]).getPartAnnotation(startAt, length));
00070 }
00071 return newSeq;
00072 }
00073
00074 MafBlock* SequenceFilterMafIterator::nextBlock() throw (Exception)
00075 {
00076 currentBlock_ = iterator_->nextBlock();
00077 while (currentBlock_) {
00078 map<string, unsigned int> counts;
00079 for (size_t i = currentBlock_->getNumberOfSequences(); i > 0; --i) {
00080 string species = currentBlock_->getSequence(i-1).getSpecies();
00081 if (!VectorTools::contains(species_, species)) {
00082 if (logstream_) {
00083 (*logstream_ << "SEQUENCE FILTER: remove sequence '" << species << "' from current block " << currentBlock_->getDescription() << ".").endLine();
00084 }
00085 currentBlock_->getAlignment().deleteSequence(i - 1);
00086 } else {
00087 counts[species]++;
00088 }
00089 }
00090 bool test = currentBlock_->getNumberOfSequences() == 0;
00091
00092 if (test) {
00093 if (logstream_) {
00094 (*logstream_ << "SEQUENCE FILTER: block " << currentBlock_->getDescription() << " is now empty. Try to get the next one.").endLine();
00095 }
00096 delete currentBlock_;
00097 } else {
00098 test = strict_ && (counts.size() != species_.size());
00099 if (test) {
00100 if (logstream_) {
00101 (*logstream_ << "SEQUENCE FILTER: block " << currentBlock_->getDescription() << " does not contain all species and will be ignored. Try to get the next one.").endLine();
00102 }
00103 delete currentBlock_;
00104 } else {
00105 if (rmDuplicates_) {
00106 test = false;
00107 map<string, unsigned int>::iterator it;
00108 for (it = counts.begin(); it != counts.end() && !(test = it->second > 1); it++) {}
00109 if (test) {
00110 if (logstream_) {
00111 (*logstream_ << "SEQUENCE FILTER: block " << currentBlock_->getDescription() << " has two sequences for species '" << it->first << "' and will be ignored. Try to get the next one.").endLine();
00112 }
00113 delete currentBlock_;
00114 } else {
00115 return currentBlock_;
00116 }
00117 } else {
00118 return currentBlock_;
00119 }
00120 }
00121 }
00122
00123
00124 currentBlock_ = iterator_->nextBlock();
00125 }
00126
00127 return currentBlock_;
00128 }
00129
00130 MafBlock* ChromosomeMafIterator::nextBlock() throw (Exception)
00131 {
00132 currentBlock_ = iterator_->nextBlock();
00133 while (currentBlock_) {
00134 bool foundRef = false;
00135 string chr = "";
00136 for (size_t i = 0; i < currentBlock_->getNumberOfSequences() && !foundRef; ++i) {
00137 string species = currentBlock_->getSequence(i).getSpecies();
00138 if (species == ref_) {
00139 foundRef = true;
00140 chr = currentBlock_->getSequence(i).getChromosome();
00141 }
00142 }
00143 if (!foundRef) {
00144 if (logstream_) {
00145 (*logstream_ << "CHROMOSOME FILTER: block does not contain reference species and was removed.").endLine();
00146 }
00147 delete currentBlock_;
00148 } else if (chr != chr_) {
00149 if (logstream_) {
00150 (*logstream_ << "CHROMOSOME FILTER: reference species without queried chromosome was removed.").endLine();
00151 }
00152 delete currentBlock_;
00153 } else {
00154 return currentBlock_;
00155 }
00156
00157
00158 currentBlock_ = iterator_->nextBlock();
00159 }
00160
00161 return currentBlock_;
00162 }
00163
00164 MafBlock* DuplicateFilterMafIterator::nextBlock() throw (Exception)
00165 {
00166 currentBlock_ = iterator_->nextBlock();
00167 while (currentBlock_) {
00168 bool foundRef = false;
00169 string chr = "";
00170 char strand = '+';
00171 unsigned int start = 0;
00172 unsigned int stop = 0;
00173 for (size_t i = 0; i < currentBlock_->getNumberOfSequences() && !foundRef; ++i) {
00174 string species = currentBlock_->getSequence(i).getSpecies();
00175 if (species == ref_) {
00176 foundRef = true;
00177 chr = currentBlock_->getSequence(i).getChromosome();
00178 strand = currentBlock_->getSequence(i).getStrand();
00179 start = currentBlock_->getSequence(i).start();
00180 stop = currentBlock_->getSequence(i).stop();
00181 }
00182 }
00183 if (!foundRef) {
00184 if (logstream_) {
00185 (*logstream_ << "DUPLICATE FILTER: block does not contain reference species and was removed.").endLine();
00186 }
00187 delete currentBlock_;
00188 } else {
00189 unsigned int occurrence = blocks_[chr][strand][start][stop]++;
00190 if (occurrence > 0) {
00191 if (logstream_) {
00192 (*logstream_ << "DUPLICATE FILTER: sequence in reference species was found in a previous block. New block was removed.").endLine();
00193 }
00194 delete currentBlock_;
00195 } else {
00196 return currentBlock_;
00197 }
00198 }
00199
00200
00201 currentBlock_ = iterator_->nextBlock();
00202 }
00203
00204 return currentBlock_;
00205 }
00206
00207 MafBlock* BlockMergerMafIterator::nextBlock() throw (Exception)
00208 {
00209 if (!incomingBlock_) return 0;
00210 currentBlock_ = incomingBlock_;
00211 incomingBlock_ = iterator_->nextBlock();
00212 while (incomingBlock_) {
00213 unsigned int globalSpace = 0;
00214 for (unsigned int i = 0; i < species_.size(); ++i) {
00215 try {
00216 const MafSequence* seq1 = ¤tBlock_->getSequenceForSpecies(species_[i]);
00217 const MafSequence* seq2 = &incomingBlock_->getSequenceForSpecies(species_[i]);
00218 if (!seq1->hasCoordinates() || !seq2->hasCoordinates())
00219 throw Exception("BlockMergerMafIterator::nextBlock. Species '" + species_[i] + "' is missing coordinates in at least one block.");
00220
00221 if (seq1->stop() >= seq2->start())
00222 return currentBlock_;
00223 unsigned int space = seq2->start() - seq1->stop() - 1;
00224 if (space > maxDist_)
00225 return currentBlock_;
00226 if (i == 0)
00227 globalSpace = space;
00228 else {
00229 if (space != globalSpace)
00230 return currentBlock_;
00231 }
00232 if (seq1->getChromosome() != seq2->getChromosome()
00233 || VectorTools::contains(ignoreChrs_, seq1->getChromosome())
00234 || VectorTools::contains(ignoreChrs_, seq2->getChromosome())
00235 || seq1->getStrand() != seq2->getStrand())
00236 {
00237
00238 return currentBlock_;
00239 }
00240 } catch (SequenceNotFoundException& snfe) {
00241
00242
00243 return currentBlock_;
00244 }
00245 }
00246
00247 if (logstream_) {
00248 (*logstream_ << "BLOCK MERGER: merging two consecutive blocks.").endLine();
00249 }
00250 vector<string> sp1 = currentBlock_->getSpeciesList();
00251 vector<string> sp2 = incomingBlock_->getSpeciesList();
00252 vector<string> allSp = VectorTools::unique(VectorTools::vectorUnion(sp1, sp2));
00253
00254 MafBlock* mergedBlock = new MafBlock();
00255
00256 unsigned int p1 = currentBlock_->getPass();
00257 unsigned int p2 = incomingBlock_->getPass();
00258 if (p1 == p2) mergedBlock->setPass(p1);
00259 double s1 = currentBlock_->getScore();
00260 double n1 = static_cast<double>(currentBlock_->getNumberOfSites());
00261 double s2 = incomingBlock_->getScore();
00262 double n2 = static_cast<double>(incomingBlock_->getNumberOfSites());
00263 mergedBlock->setScore((s1 * n1 + s2 * n2) / (n1 + n2));
00264
00265
00266 for (size_t i = 0; i < allSp.size(); ++i) {
00267 auto_ptr<MafSequence> seq;
00268 try {
00269 seq.reset(new MafSequence(currentBlock_->getSequenceForSpecies(allSp[i])));
00270
00271
00272 try {
00273 auto_ptr<MafSequence> tmp(new MafSequence(incomingBlock_->getSequenceForSpecies(allSp[i])));
00274 string ref1 = seq->getDescription(), ref2 = tmp->getDescription();
00275
00276 if (globalSpace > 0) {
00277 if (logstream_) {
00278 (*logstream_ << "BLOCK MERGER: a spacer of size " << globalSpace <<" is inserted in sequence for species " << allSp[i] << ".").endLine();
00279 }
00280 seq->append(vector<int>(globalSpace, AlphabetTools::DNA_ALPHABET.getUnknownCharacterCode()));
00281 }
00282 if (seq->getChromosome() != tmp->getChromosome()) {
00283 seq->setChromosome(seq->getChromosome() + "-" + tmp->getChromosome());
00284 seq->removeCoordinates();
00285 }
00286 if (seq->getStrand() != tmp->getStrand()) {
00287 seq->setStrand('?');
00288 seq->removeCoordinates();
00289 }
00290 if (seq->getName() != tmp->getName())
00291 tmp->setName(seq->getName());
00292 seq->merge(*tmp);
00293 seq->setSrcSize(0);
00294 if (logstream_) {
00295 (*logstream_ << "BLOCK MERGER: merging " << ref1 << " with " << ref2 << " into " << seq->getDescription()).endLine();
00296 }
00297 } catch (SequenceNotFoundException& snfe2) {
00298
00299 string ref1 = seq->getDescription();
00300 seq->setToSizeR(incomingBlock_->getNumberOfSites() + globalSpace);
00301 if (logstream_) {
00302 (*logstream_ << "BLOCK MERGER: extending " << ref1 << " with " << incomingBlock_->getNumberOfSites() << " gaps on the right.").endLine();
00303 }
00304 }
00305 } catch (SequenceNotFoundException& snfe1) {
00306
00307 seq.reset(new MafSequence(incomingBlock_->getSequenceForSpecies(allSp[i])));
00308 string ref2 = seq->getDescription();
00309 seq->setToSizeL(currentBlock_->getNumberOfSites() + globalSpace);
00310 if (logstream_) {
00311 (*logstream_ << "BLOCK MERGER: adding " << ref2 << " and extend it with " << currentBlock_->getNumberOfSites() << " gaps on the left.").endLine();
00312 }
00313 }
00314 mergedBlock->addSequence(*seq);
00315 }
00316
00317 delete currentBlock_;
00318 delete incomingBlock_;
00319 currentBlock_ = mergedBlock;
00320
00321 incomingBlock_ = iterator_->nextBlock();
00322 }
00323 return currentBlock_;
00324 }
00325
00326 MafBlock* FullGapFilterMafIterator::nextBlock() throw (Exception)
00327 {
00328 MafBlock* block = iterator_->nextBlock();
00329 if (!block) return 0;
00330
00331
00332 VectorSiteContainer vsc(&AlphabetTools::DNA_ALPHABET);
00333 for (size_t i = 0; i < species_.size(); ++i) {
00334 vsc.addSequence(block->getSequence(i));
00335 }
00336
00337 if (verbose_) {
00338 ApplicationTools::message->endLine();
00339 ApplicationTools::displayTask("Cleaning block for gap sites", true);
00340 }
00341 unsigned int n = block->getNumberOfSites();
00342 vector <unsigned int> start;
00343 vector <unsigned int> count;
00344 bool test = false;
00345 for(unsigned int i = 0; i < n; ++i) {
00346 const Site* site = &vsc.getSite(i);
00347 if (SiteTools::isGapOnly(*site)) {
00348 if (test) {
00349 count[count.size() - 1]++;
00350 } else {
00351 start.push_back(i);
00352 count.push_back(1);
00353 test = true;
00354 }
00355 } else {
00356 test = false;
00357 }
00358 }
00359
00360 unsigned int totalRemoved = 0;
00361 for(size_t i = start.size(); i > 0; --i) {
00362 if (verbose_)
00363 ApplicationTools::displayGauge(start.size() - i, start.size() - 1, '=');
00364 block->getAlignment().deleteSites(start[i - 1], count[i - 1]);
00365 totalRemoved += count[i - 1];
00366 }
00367 if (verbose_)
00368 ApplicationTools::displayTaskDone();
00369
00370
00371 if (totalRemoved > 0) {
00372 for (unsigned int i = 0; i < block->getNumberOfSequences(); ++i) {
00373 const MafSequence* seq = &block->getSequence(i);
00374 if (!VectorTools::contains(species_, seq->getSpecies())) {
00375 block->removeCoordinatesFromSequence(i);
00376 }
00377 }
00378 }
00379 if (logstream_) {
00380 (*logstream_ << "FULL GAP CLEANER: " << totalRemoved << " positions have been removed.").endLine();
00381 }
00382 return block;
00383 }
00384
00385 MafBlock* AlignmentFilterMafIterator::nextBlock() throw (Exception)
00386 {
00387 if (blockBuffer_.size() == 0) {
00388
00389 do {
00390 MafBlock* block = iterator_->nextBlock();
00391 if (!block) return 0;
00392
00393
00394 int gap = AlphabetTools::DNA_ALPHABET.getGapCharacterCode();
00395 int unk = AlphabetTools::DNA_ALPHABET.getUnknownCharacterCode();
00396 size_t nr = species_.size();
00397 vector< vector<int> > aln(nr);
00398 for (size_t i = 0; i < nr; ++i) {
00399 aln[i] = block->getSequenceForSpecies(species_[i]).getContent();
00400 }
00401 size_t nc = block->getNumberOfSites();
00402
00403 vector<unsigned int> pos;
00404 vector<bool> col(nr);
00405
00406 window_.clear();
00407
00408 size_t i;
00409 for (i = 0; i < windowSize_; ++i) {
00410 for (size_t j = 0; j < nr; ++j) {
00411 col[j] = (aln[j][i] == gap || aln[j][i] == unk);
00412 }
00413 window_.push_back(col);
00414 }
00415
00416 if (verbose_) {
00417 ApplicationTools::message->endLine();
00418 ApplicationTools::displayTask("Sliding window for alignment filter", true);
00419 }
00420 while (i + step_ < nc) {
00421 if (verbose_)
00422 ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1, '>');
00423
00424 unsigned int sum = 0;
00425 for (size_t u = 0; u < window_.size(); ++u)
00426 for (size_t v = 0; v < window_[u].size(); ++v)
00427 if (window_[u][v]) sum++;
00428 if (sum > maxGap_) {
00429 if (pos.size() == 0) {
00430 pos.push_back(i - windowSize_);
00431 pos.push_back(i);
00432 } else {
00433 if (i - windowSize_ < pos[pos.size() - 1]) {
00434 pos[pos.size() - 1] = i;
00435 } else {
00436 pos.push_back(i - windowSize_);
00437 pos.push_back(i);
00438 }
00439 }
00440 }
00441
00442
00443 for (unsigned int k = 0; k < step_; ++k) {
00444 for (size_t j = 0; j < nr; ++j) {
00445 col[j] = (aln[j][i] == gap || aln[j][i] == unk);
00446 }
00447 window_.push_back(col);
00448 window_.pop_front();
00449 ++i;
00450 }
00451 }
00452
00453
00454 unsigned int sum = 0;
00455 for (size_t u = 0; u < window_.size(); ++u)
00456 for (size_t v = 0; v < window_[u].size(); ++v)
00457 if (window_[u][v]) sum++;
00458 if (sum > maxGap_) {
00459 if (pos.size() == 0) {
00460 pos.push_back(i - windowSize_);
00461 pos.push_back(i);
00462 } else {
00463 if (i - windowSize_ <= pos[pos.size() - 1]) {
00464 pos[pos.size() - 1] = i;
00465 } else {
00466 pos.push_back(i - windowSize_);
00467 pos.push_back(i);
00468 }
00469 }
00470 }
00471 if (verbose_)
00472 ApplicationTools::displayTaskDone();
00473
00474
00475 if (pos.size() == 0) {
00476 blockBuffer_.push_back(block);
00477 if (logstream_) {
00478 (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " is clean and kept as is.").endLine();
00479 }
00480 } else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) {
00481
00482 if (logstream_) {
00483 (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " was entirely removed. Tried to get the next one.").endLine();
00484 }
00485 } else {
00486 if (logstream_) {
00487 (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine();
00488 }
00489 if (verbose_) {
00490 ApplicationTools::message->endLine();
00491 ApplicationTools::displayTask("Spliting block", true);
00492 }
00493 for (i = 0; i < pos.size(); i+=2) {
00494 if (verbose_)
00495 ApplicationTools::displayGauge(i, pos.size() - 2, '=');
00496 if (logstream_) {
00497 (*logstream_ << "ALN CLEANER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block " << block->getDescription() << ".").endLine();
00498 }
00499 if (pos[i] > 0) {
00500 MafBlock* newBlock = new MafBlock();
00501 newBlock->setScore(block->getScore());
00502 newBlock->setPass(block->getPass());
00503 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
00504 MafSequence* subseq;
00505 if (i == 0) {
00506 subseq = block->getSequence(j).subSequence(0, pos[i]);
00507 } else {
00508 subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]);
00509 }
00510 newBlock->addSequence(*subseq);
00511 delete subseq;
00512 }
00513 blockBuffer_.push_back(newBlock);
00514 }
00515
00516 if (keepTrashedBlocks_) {
00517 MafBlock* outBlock = new MafBlock();
00518 outBlock->setScore(block->getScore());
00519 outBlock->setPass(block->getPass());
00520 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
00521 MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]);
00522 outBlock->addSequence(*outseq);
00523 delete outseq;
00524 }
00525 trashBuffer_.push_back(outBlock);
00526 }
00527 }
00528
00529 if (pos[pos.size() - 1] < block->getNumberOfSites()) {
00530 MafBlock* newBlock = new MafBlock();
00531 newBlock->setScore(block->getScore());
00532 newBlock->setPass(block->getPass());
00533 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
00534 MafSequence* subseq;
00535 subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]);
00536 newBlock->addSequence(*subseq);
00537 delete subseq;
00538 }
00539 blockBuffer_.push_back(newBlock);
00540 }
00541 if (verbose_)
00542 ApplicationTools::displayTaskDone();
00543
00544 delete block;
00545 }
00546 } while (blockBuffer_.size() == 0);
00547 }
00548
00549 MafBlock* block = blockBuffer_.front();
00550 blockBuffer_.pop_front();
00551 return block;
00552 }
00553
00554 MafBlock* AlignmentFilter2MafIterator::nextBlock() throw (Exception)
00555 {
00556 if (blockBuffer_.size() == 0) {
00557
00558 do {
00559 MafBlock* block = iterator_->nextBlock();
00560 if (!block) return 0;
00561
00562
00563 int gap = AlphabetTools::DNA_ALPHABET.getGapCharacterCode();
00564 int unk = AlphabetTools::DNA_ALPHABET.getUnknownCharacterCode();
00565 size_t nr = species_.size();
00566 vector< vector<int> > aln(nr);
00567 for (size_t i = 0; i < nr; ++i) {
00568 aln[i] = block->getSequenceForSpecies(species_[i]).getContent();
00569 }
00570 size_t nc = block->getNumberOfSites();
00571
00572 vector<unsigned int> pos;
00573 vector<bool> col(nr);
00574
00575 window_.clear();
00576
00577 size_t i;
00578 for (i = 0; i < windowSize_; ++i) {
00579 for (size_t j = 0; j < nr; ++j) {
00580 col[j] = (aln[j][i] == gap || aln[j][i] == unk);
00581 }
00582 window_.push_back(col);
00583 }
00584
00585 if (verbose_) {
00586 ApplicationTools::message->endLine();
00587 ApplicationTools::displayTask("Sliding window for alignment filter", true);
00588 }
00589 while (i + step_ < nc) {
00590 if (verbose_)
00591 ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1, '>');
00592
00593 unsigned int count = 0;
00594 for (size_t u = 0; u < window_.size(); ++u) {
00595 unsigned int partialCount = 0;
00596 if (u > 0 && window_[u] != window_[u - 1]) {
00597 for (size_t v = 0; v < window_[u].size(); ++v)
00598 if (window_[u][v]) partialCount++;
00599 if (partialCount > maxGap_)
00600 count++;
00601 }
00602 }
00603 if (count > maxPos_) {
00604 if (pos.size() == 0) {
00605 pos.push_back(i - windowSize_);
00606 pos.push_back(i);
00607 } else {
00608 if (i - windowSize_ < pos[pos.size() - 1]) {
00609 pos[pos.size() - 1] = i;
00610 } else {
00611 pos.push_back(i - windowSize_);
00612 pos.push_back(i);
00613 }
00614 }
00615 }
00616
00617
00618 for (unsigned int k = 0; k < step_; ++k) {
00619 for (size_t j = 0; j < nr; ++j) {
00620 col[j] = (aln[j][i] == gap || aln[j][i] == unk);
00621 }
00622 window_.push_back(col);
00623 window_.pop_front();
00624 ++i;
00625 }
00626 }
00627
00628
00629 unsigned int count = 0;
00630 for (size_t u = 0; u < window_.size(); ++u) {
00631 unsigned int partialCount = 0;
00632 if (u > 0 && window_[u] != window_[u - 1]) {
00633 for (size_t v = 0; v < window_[u].size(); ++v)
00634 if (window_[u][v]) partialCount++;
00635 if (partialCount > maxGap_)
00636 count++;
00637 }
00638 }
00639 if (count > maxPos_) {
00640 if (pos.size() == 0) {
00641 pos.push_back(i - windowSize_);
00642 pos.push_back(i);
00643 } else {
00644 if (i - windowSize_ <= pos[pos.size() - 1]) {
00645 pos[pos.size() - 1] = i;
00646 } else {
00647 pos.push_back(i - windowSize_);
00648 pos.push_back(i);
00649 }
00650 }
00651 }
00652 if (verbose_)
00653 ApplicationTools::displayTaskDone();
00654
00655
00656 if (pos.size() == 0) {
00657 blockBuffer_.push_back(block);
00658 if (logstream_) {
00659 (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " is clean and kept as is.").endLine();
00660 }
00661 } else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) {
00662
00663 if (logstream_) {
00664 (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " was entirely removed. Tried to get the next one.").endLine();
00665 }
00666 } else {
00667 if (logstream_) {
00668 (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine();
00669 }
00670 if (verbose_) {
00671 ApplicationTools::message->endLine();
00672 ApplicationTools::displayTask("Spliting block", true);
00673 }
00674 for (i = 0; i < pos.size(); i+=2) {
00675 if (verbose_)
00676 ApplicationTools::displayGauge(i, pos.size() - 2, '=');
00677 if (logstream_) {
00678 (*logstream_ << "ALN CLEANER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block " << block->getDescription() << ".").endLine();
00679 }
00680 if (pos[i] > 0) {
00681 MafBlock* newBlock = new MafBlock();
00682 newBlock->setScore(block->getScore());
00683 newBlock->setPass(block->getPass());
00684 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
00685 MafSequence* subseq;
00686 if (i == 0) {
00687 subseq = block->getSequence(j).subSequence(0, pos[i]);
00688 } else {
00689 subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]);
00690 }
00691 newBlock->addSequence(*subseq);
00692 delete subseq;
00693 }
00694 blockBuffer_.push_back(newBlock);
00695 }
00696
00697 if (keepTrashedBlocks_) {
00698 MafBlock* outBlock = new MafBlock();
00699 outBlock->setScore(block->getScore());
00700 outBlock->setPass(block->getPass());
00701 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
00702 MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]);
00703 outBlock->addSequence(*outseq);
00704 delete outseq;
00705 }
00706 trashBuffer_.push_back(outBlock);
00707 }
00708 }
00709
00710 if (pos[pos.size() - 1] < block->getNumberOfSites()) {
00711 MafBlock* newBlock = new MafBlock();
00712 newBlock->setScore(block->getScore());
00713 newBlock->setPass(block->getPass());
00714 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
00715 MafSequence* subseq;
00716 subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]);
00717 newBlock->addSequence(*subseq);
00718 delete subseq;
00719 }
00720 blockBuffer_.push_back(newBlock);
00721 }
00722 if (verbose_)
00723 ApplicationTools::displayTaskDone();
00724
00725 delete block;
00726 }
00727 } while (blockBuffer_.size() == 0);
00728 }
00729
00730 MafBlock* block = blockBuffer_.front();
00731 blockBuffer_.pop_front();
00732 return block;
00733 }
00734
00735 MafBlock* MaskFilterMafIterator::nextBlock() throw (Exception)
00736 {
00737 if (blockBuffer_.size() == 0) {
00738 do {
00739
00740 MafBlock* block = iterator_->nextBlock();
00741 if (!block) return 0;
00742
00743
00744 vector< vector<bool> > aln;
00745 for (size_t i = 0; i < species_.size(); ++i) {
00746 const MafSequence* seq = &block->getSequenceForSpecies(species_[i]);
00747 if (seq->hasAnnotation(SequenceMask::MASK)) {
00748 aln.push_back(dynamic_cast<const SequenceMask&>(seq->getAnnotation(SequenceMask::MASK)).getMask());
00749 }
00750 }
00751 size_t nr = aln.size();
00752 size_t nc = block->getNumberOfSites();
00753
00754 vector<unsigned int> pos;
00755 vector<bool> col(nr);
00756
00757 window_.clear();
00758
00759 size_t i;
00760 for (i = 0; i < windowSize_; ++i) {
00761 for (size_t j = 0; j < nr; ++j) {
00762 col[j] = aln[j][i];
00763 }
00764 window_.push_back(col);
00765 }
00766
00767 if (verbose_) {
00768 ApplicationTools::message->endLine();
00769 ApplicationTools::displayTask("Sliding window for mask filter", true);
00770 }
00771 while (i + step_ < nc) {
00772 if (verbose_)
00773 ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1, '>');
00774
00775 unsigned int sum = 0;
00776 for (size_t u = 0; u < window_.size(); ++u)
00777 for (size_t v = 0; v < window_[u].size(); ++v)
00778 if (window_[u][v]) sum++;
00779 if (sum > maxMasked_) {
00780 if (pos.size() == 0) {
00781 pos.push_back(i - windowSize_);
00782 pos.push_back(i);
00783 } else {
00784 if (i - windowSize_ <= pos[pos.size() - 1]) {
00785 pos[pos.size() - 1] = i;
00786 } else {
00787 pos.push_back(i - windowSize_);
00788 pos.push_back(i);
00789 }
00790 }
00791 }
00792
00793
00794 for (unsigned int k = 0; k < step_; ++k) {
00795 for (size_t j = 0; j < nr; ++j) {
00796 col[j] = aln[j][i];
00797 }
00798 window_.push_back(col);
00799 window_.pop_front();
00800 ++i;
00801 }
00802 }
00803
00804
00805 unsigned int sum = 0;
00806 for (size_t u = 0; u < window_.size(); ++u)
00807 for (size_t v = 0; v < window_[u].size(); ++v)
00808 if (window_[u][v]) sum++;
00809 if (sum > maxMasked_) {
00810 if (pos.size() == 0) {
00811 pos.push_back(i - windowSize_);
00812 pos.push_back(i);
00813 } else {
00814 if (i - windowSize_ < pos[pos.size() - 1]) {
00815 pos[pos.size() - 1] = i;
00816 } else {
00817 pos.push_back(i - windowSize_);
00818 pos.push_back(i);
00819 }
00820 }
00821 }
00822 if (verbose_)
00823 ApplicationTools::displayTaskDone();
00824
00825
00826 if (pos.size() == 0) {
00827 blockBuffer_.push_back(block);
00828 if (logstream_) {
00829 (*logstream_ << "MASK CLEANER: block is clean and kept as is.").endLine();
00830 }
00831 } else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) {
00832
00833 if (logstream_) {
00834 (*logstream_ << "MASK CLEANER: block was entirely removed. Tried to get the next one.").endLine();
00835 }
00836 } else {
00837 if (logstream_) {
00838 (*logstream_ << "MASK CLEANER: block with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine();
00839 }
00840 if (verbose_) {
00841 ApplicationTools::message->endLine();
00842 ApplicationTools::displayTask("Spliting block", true);
00843 }
00844 for (i = 0; i < pos.size(); i+=2) {
00845 if (verbose_)
00846 ApplicationTools::displayGauge(i, pos.size() - 2, '=');
00847 if (logstream_) {
00848 (*logstream_ << "MASK CLEANER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block.").endLine();
00849 }
00850 if (pos[i] > 0) {
00851 MafBlock* newBlock = new MafBlock();
00852 newBlock->setScore(block->getScore());
00853 newBlock->setPass(block->getPass());
00854 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
00855 MafSequence* subseq;
00856 if (i == 0) {
00857 subseq = block->getSequence(j).subSequence(0, pos[i]);
00858 } else {
00859 subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]);
00860 }
00861 newBlock->addSequence(*subseq);
00862 delete subseq;
00863 }
00864 blockBuffer_.push_back(newBlock);
00865 }
00866
00867 if (keepTrashedBlocks_) {
00868 MafBlock* outBlock = new MafBlock();
00869 outBlock->setScore(block->getScore());
00870 outBlock->setPass(block->getPass());
00871 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
00872 MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]);
00873 outBlock->addSequence(*outseq);
00874 delete outseq;
00875 }
00876 trashBuffer_.push_back(outBlock);
00877 }
00878 }
00879
00880 if (pos[pos.size() - 1] < block->getNumberOfSites()) {
00881 MafBlock* newBlock = new MafBlock();
00882 newBlock->setScore(block->getScore());
00883 newBlock->setPass(block->getPass());
00884 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
00885 MafSequence* subseq;
00886 subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]);
00887 newBlock->addSequence(*subseq);
00888 delete subseq;
00889 }
00890 blockBuffer_.push_back(newBlock);
00891 }
00892 if (verbose_)
00893 ApplicationTools::displayTaskDone();
00894
00895 delete block;
00896 }
00897 } while (blockBuffer_.size() == 0);
00898 }
00899
00900 MafBlock* block = blockBuffer_.front();
00901 blockBuffer_.pop_front();
00902 return block;
00903 }
00904
00905 MafBlock* QualityFilterMafIterator::nextBlock() throw (Exception)
00906 {
00907 if (blockBuffer_.size() == 0) {
00908 do {
00909
00910 MafBlock* block = iterator_->nextBlock();
00911 if (!block) return 0;
00912
00913
00914 vector< vector<int> > aln;
00915 for (size_t i = 0; i < species_.size(); ++i) {
00916 const MafSequence* seq = &block->getSequenceForSpecies(species_[i]);
00917 if (seq->hasAnnotation(SequenceQuality::QUALITY_SCORE)) {
00918 aln.push_back(dynamic_cast<const SequenceQuality&>(seq->getAnnotation(SequenceQuality::QUALITY_SCORE)).getScores());
00919 }
00920 }
00921 if (aln.size() != species_.size()) {
00922 blockBuffer_.push_back(block);
00923 if (logstream_) {
00924 (*logstream_ << "QUAL CLEANER: block is missing quality score for at least one species and will therefore not be filtered.").endLine();
00925 }
00926
00927 } else {
00928 size_t nr = aln.size();
00929 size_t nc = block->getNumberOfSites();
00930
00931 vector<unsigned int> pos;
00932 vector<int> col(nr);
00933
00934 window_.clear();
00935
00936 size_t i;
00937 for (i = 0; i < windowSize_; ++i) {
00938 for (size_t j = 0; j < nr; ++j) {
00939 col[j] = aln[j][i];
00940 }
00941 window_.push_back(col);
00942 }
00943
00944 if (verbose_) {
00945 ApplicationTools::message->endLine();
00946 ApplicationTools::displayTask("Sliding window for quality filter", true);
00947 }
00948 while (i + step_ < nc) {
00949 if (verbose_)
00950 ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1, '>');
00951
00952 double mean = 0;
00953 double n = static_cast<double>(aln.size() * windowSize_);
00954 for (size_t u = 0; u < window_.size(); ++u)
00955 for (size_t v = 0; v < window_[u].size(); ++v) {
00956 mean += (window_[u][v] > 0 ? static_cast<double>(window_[u][v]) : 0.);
00957 if (window_[u][v] == -1) n--;
00958 }
00959 if (n > 0 && (mean / n) < minQual_) {
00960 if (pos.size() == 0) {
00961 pos.push_back(i - windowSize_);
00962 pos.push_back(i);
00963 } else {
00964 if (i - windowSize_ <= pos[pos.size() - 1]) {
00965 pos[pos.size() - 1] = i;
00966 } else {
00967 pos.push_back(i - windowSize_);
00968 pos.push_back(i);
00969 }
00970 }
00971 }
00972
00973
00974 for (unsigned int k = 0; k < step_; ++k) {
00975 for (size_t j = 0; j < nr; ++j) {
00976 col[j] = aln[j][i];
00977 }
00978 window_.push_back(col);
00979 window_.pop_front();
00980 ++i;
00981 }
00982 }
00983
00984
00985 double mean = 0;
00986 double n = static_cast<double>(aln.size() * windowSize_);
00987 for (size_t u = 0; u < window_.size(); ++u)
00988 for (size_t v = 0; v < window_[u].size(); ++v) {
00989 mean += (window_[u][v] > 0 ? static_cast<double>(window_[u][v]) : 0.);
00990 if (window_[u][v] == -1) n--;
00991 }
00992 if (n > 0 && (mean / n) < minQual_) {
00993 if (pos.size() == 0) {
00994 pos.push_back(i - windowSize_);
00995 pos.push_back(i);
00996 } else {
00997 if (i - windowSize_ < pos[pos.size() - 1]) {
00998 pos[pos.size() - 1] = i;
00999 } else {
01000 pos.push_back(i - windowSize_);
01001 pos.push_back(i);
01002 }
01003 }
01004 }
01005 if (verbose_)
01006 ApplicationTools::displayTaskDone();
01007
01008
01009 if (pos.size() == 0) {
01010 blockBuffer_.push_back(block);
01011 if (logstream_) {
01012 (*logstream_ << "QUAL CLEANER: block is clean and kept as is.").endLine();
01013 }
01014 } else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) {
01015
01016 if (logstream_) {
01017 (*logstream_ << "QUAL CLEANER: block was entirely removed. Tried to get the next one.").endLine();
01018 }
01019 } else {
01020 if (logstream_) {
01021 (*logstream_ << "QUAL CLEANER: block with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine();
01022 }
01023 if (verbose_) {
01024 ApplicationTools::message->endLine();
01025 ApplicationTools::displayTask("Spliting block", true);
01026 }
01027 for (i = 0; i < pos.size(); i+=2) {
01028 if (verbose_)
01029 ApplicationTools::displayGauge(i, pos.size() - 2, '=');
01030 if (logstream_) {
01031 (*logstream_ << "QUAL CLEANER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block.").endLine();
01032 }
01033 if (pos[i] > 0) {
01034 MafBlock* newBlock = new MafBlock();
01035 newBlock->setScore(block->getScore());
01036 newBlock->setPass(block->getPass());
01037 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
01038 MafSequence* subseq;
01039 if (i == 0) {
01040 subseq = block->getSequence(j).subSequence(0, pos[i]);
01041 } else {
01042 subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]);
01043 }
01044 newBlock->addSequence(*subseq);
01045 delete subseq;
01046 }
01047 blockBuffer_.push_back(newBlock);
01048 }
01049
01050 if (keepTrashedBlocks_) {
01051 MafBlock* outBlock = new MafBlock();
01052 outBlock->setScore(block->getScore());
01053 outBlock->setPass(block->getPass());
01054 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
01055 MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]);
01056 outBlock->addSequence(*outseq);
01057 delete outseq;
01058 }
01059 trashBuffer_.push_back(outBlock);
01060 }
01061 }
01062
01063 if (pos[pos.size() - 1] < block->getNumberOfSites()) {
01064 MafBlock* newBlock = new MafBlock();
01065 newBlock->setScore(block->getScore());
01066 newBlock->setPass(block->getPass());
01067 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
01068 MafSequence* subseq;
01069 subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]);
01070 newBlock->addSequence(*subseq);
01071 delete subseq;
01072 }
01073 blockBuffer_.push_back(newBlock);
01074 }
01075 if (verbose_)
01076 ApplicationTools::displayTaskDone();
01077
01078 delete block;
01079 }
01080 }
01081 } while (blockBuffer_.size() == 0);
01082 }
01083
01084 MafBlock* block = blockBuffer_.front();
01085 blockBuffer_.pop_front();
01086 return block;
01087 }
01088
01089 MafBlock* SequenceStatisticsMafIterator::nextBlock() throw (Exception)
01090 {
01091 currentBlock_ = iterator_->nextBlock();
01092 if (currentBlock_) {
01093 for (vector<string>::iterator sp = species_.begin(); sp != species_.end(); ++sp) {
01094 if (sp > species_.begin())
01095 *output_ << "\t";
01096 if (currentBlock_->hasSequenceForSpecies(*sp)) {
01097 map<int, double> counts;
01098 const Sequence& seq = currentBlock_->getSequenceForSpecies(*sp);
01099 SequenceTools::getCounts(seq, counts, false);
01100 *output_ << counts[0] << "\t" << counts[1] << "\t" << counts[2] << "\t" << counts[3] << "\t" << counts[currentBlock_->getAlignment().getAlphabet()->getGapCharacterCode()];
01101 *output_ << "\t" << SequenceTools::getNumberOfSites(seq);
01102 *output_ << "\t" << SequenceTools::getNumberOfCompleteSites(seq);
01103 *output_ << "\t" << SequenceTools::getNumberOfUnresolvedSites(seq);
01104 } else {
01105 *output_ << "NA\tNA\tNA\tNA\tNA\tNA\tNA\tNA";
01106 }
01107 }
01108 output_->endLine();
01109 }
01110 return currentBlock_;
01111 }
01112
01113 MafBlock* PairwiseSequenceStatisticsMafIterator::nextBlock() throw (Exception)
01114 {
01115 currentBlock_ = iterator_->nextBlock();
01116 if (currentBlock_) {
01117 if (currentBlock_->hasSequenceForSpecies(species1_) && currentBlock_->hasSequenceForSpecies(species2_)) {
01118 *output_ << SequenceTools::getPercentIdentity(currentBlock_->getSequenceForSpecies(species1_), currentBlock_->getSequenceForSpecies(species2_), true);
01119 } else {
01120 *output_ << "NA";
01121 }
01122 output_->endLine();
01123 }
01124 return currentBlock_;
01125 }
01126
01127 MafBlock* FeatureFilterMafIterator::nextBlock() throw (Exception)
01128 {
01129 if (blockBuffer_.size() == 0) {
01130
01131 do {
01132 MafBlock* block = iterator_->nextBlock();
01133 if (!block) return 0;
01134
01135
01136 if (!block->hasSequenceForSpecies(refSpecies_)) {
01137 if (logstream_) {
01138 (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " does not contain the reference species and was kept as is.").endLine();
01139 }
01140 return block;
01141 }
01142
01143
01144 const MafSequence& refSeq = block->getSequenceForSpecies(refSpecies_);
01145
01146 std::map<std::string, MultiRange<unsigned int> >::iterator mr = ranges_.find(refSeq.getChromosome());
01147 if (mr == ranges_.end()) {
01148 if (logstream_) {
01149 (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " does not contain any feature and was kept as is.").endLine();
01150 }
01151 return block;
01152 }
01153
01154 MultiRange<unsigned int> mRange = mr->second;
01155 mRange.restrictTo(Range<unsigned int>(refSeq.start(), refSeq.stop() + 1));
01156 if (mRange.isEmpty()) {
01157 if (logstream_) {
01158 (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " does not contain any feature and was kept as is.").endLine();
01159 }
01160 return block;
01161 }
01162 std::vector<unsigned int> tmp = mRange.getBounds();
01163 std::deque<unsigned int> refBounds(tmp.begin(), tmp.end());
01164
01165
01166
01167
01168
01169 int gap = refSeq.getAlphabet()->getGapCharacterCode();
01170 long int refPos = static_cast<long int>(refSeq.start()) - 1;
01171 std::vector<size_t> pos;
01172 for (size_t alnPos = 0; alnPos < refSeq.size() && refBounds.size() > 0; ++alnPos) {
01173 if (refSeq[alnPos] != gap) {
01174 refPos++;
01175
01176 if (refBounds.front() == static_cast<unsigned int>(refPos)) {
01177 pos.push_back(alnPos);
01178 refBounds.pop_front();
01179 }
01180 }
01181 }
01182
01183 if (refBounds.size() > 0 && refBounds.front() == refSeq.stop() + 1) {
01184 pos.push_back(refSeq.size());
01185 refBounds.pop_front();
01186 }
01187
01188 if (refBounds.size() > 0) {
01189 VectorTools::print(vector<unsigned int>(refBounds.begin(), refBounds.end()));
01190 throw Exception("FeatureFilterMafIterator::nextBlock(). An error occurred here, " + TextTools::toString(refBounds.size()) + " coordinates are left... this is most likely a bug, please report!");
01191 }
01192
01193
01194 if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) {
01195
01196 if (logstream_) {
01197 (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " was entirely removed. Tried to get the next one.").endLine();
01198 }
01199 } else {
01200 if (logstream_) {
01201 (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine();
01202 }
01203 if (verbose_) {
01204 ApplicationTools::message->endLine();
01205 ApplicationTools::displayTask("Spliting block", true);
01206 }
01207 for (size_t i = 0; i < pos.size(); i+=2) {
01208 if (verbose_)
01209 ApplicationTools::displayGauge(i, pos.size() - 2, '=');
01210 if (logstream_) {
01211 (*logstream_ << "FEATURE FILTER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block " << block->getDescription() << ".").endLine();
01212 }
01213 if (pos[i] > 0) {
01214 MafBlock* newBlock = new MafBlock();
01215 newBlock->setScore(block->getScore());
01216 newBlock->setPass(block->getPass());
01217 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
01218 MafSequence* subseq;
01219 if (i == 0) {
01220 subseq = block->getSequence(j).subSequence(0, pos[i]);
01221 } else {
01222 subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]);
01223 }
01224 newBlock->addSequence(*subseq);
01225 delete subseq;
01226 }
01227 blockBuffer_.push_back(newBlock);
01228 }
01229
01230 if (keepTrashedBlocks_) {
01231 MafBlock* outBlock = new MafBlock();
01232 outBlock->setScore(block->getScore());
01233 outBlock->setPass(block->getPass());
01234 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
01235 MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]);
01236 outBlock->addSequence(*outseq);
01237 delete outseq;
01238 }
01239 trashBuffer_.push_back(outBlock);
01240 }
01241 }
01242
01243 if (pos.back() < block->getNumberOfSites()) {
01244 MafBlock* newBlock = new MafBlock();
01245 newBlock->setScore(block->getScore());
01246 newBlock->setPass(block->getPass());
01247 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
01248 MafSequence* subseq;
01249 subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]);
01250 newBlock->addSequence(*subseq);
01251 delete subseq;
01252 }
01253 blockBuffer_.push_back(newBlock);
01254 }
01255 if (verbose_)
01256 ApplicationTools::displayTaskDone();
01257
01258 delete block;
01259 }
01260 } while (blockBuffer_.size() == 0);
01261 }
01262
01263 MafBlock* nxtBlock = blockBuffer_.front();
01264 blockBuffer_.pop_front();
01265 return nxtBlock;
01266 }
01267
01268 MafBlock* FeatureExtractor::nextBlock() throw (Exception)
01269 {
01270 if (blockBuffer_.size() == 0) {
01271
01272 START:
01273 MafBlock* block = iterator_->nextBlock();
01274 if (!block) return 0;
01275
01276
01277 if (!block->hasSequenceForSpecies(refSpecies_))
01278 goto START;
01279
01280
01281 const MafSequence& refSeq = block->getSequenceForSpecies(refSpecies_);
01282
01283 std::map<std::string, RangeSet<unsigned int> >::iterator mr = ranges_.find(refSeq.getChromosome());
01284 if (mr == ranges_.end())
01285 goto START;
01286
01287 RangeSet<unsigned int> ranges = mr->second;
01288 ranges.restrictTo(Range<unsigned int>(refSeq.start(), refSeq.stop()));
01289 if (ranges.isEmpty())
01290 goto START;
01291
01292
01293 SequenceWalker walker(refSeq);
01294
01295
01296 if (verbose_) {
01297 ApplicationTools::message->endLine();
01298 ApplicationTools::displayTask("Extracting annotations", true);
01299 }
01300 if (logstream_) {
01301 (*logstream_ << "FEATURE EXTRACTOR: extracting " << ranges.getSet().size() << " features from block " << block->getDescription() << ".").endLine();
01302 }
01303
01304 unsigned int i = 0;
01305 for (set<Range<unsigned int>*>::iterator it = ranges.getSet().begin();
01306 it != ranges.getSet().end();
01307 ++it)
01308 {
01309 if (verbose_) {
01310 ApplicationTools::displayGauge(i++, ranges.getSet().size() - 1, '=');
01311 }
01312 MafBlock* newBlock = new MafBlock();
01313 newBlock->setScore(block->getScore());
01314 newBlock->setPass(block->getPass());
01315 for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
01316 auto_ptr<MafSequence> subseq;
01317 unsigned int a = walker.getAlignmentPosition((**it).begin() - refSeq.start());
01318 unsigned int b = walker.getAlignmentPosition((**it).end() - refSeq.start() - 1);
01319 subseq.reset(block->getSequence(j).subSequence(a, b - a + 1));
01320 if (!ignoreStrand_) {
01321 if (dynamic_cast<SeqRange*>(*it)->isNegativeStrand()) {
01322 SequenceTools::invertComplement(*subseq);
01323 }
01324 }
01325 newBlock->addSequence(*subseq);
01326 }
01327 blockBuffer_.push_back(newBlock);
01328 }
01329
01330 if (verbose_)
01331 ApplicationTools::displayTaskDone();
01332
01333 delete block;
01334 }
01335
01336 MafBlock* nxtBlock = blockBuffer_.front();
01337 blockBuffer_.pop_front();
01338 return nxtBlock;
01339 }
01340
01341
01342 void OutputMafIterator::writeHeader(std::ostream& out) const
01343 {
01344 out << "##maf version=1 program=Bio++" << endl << "#" << endl;
01345
01346 }
01347
01348 void OutputMafIterator::writeBlock(std::ostream& out, const MafBlock& block) const
01349 {
01350 out << "a";
01351 if (block.getScore() > -1.)
01352 out << " score=" << block.getScore();
01353 if (block.getPass() > 0)
01354 out << " pass=" << block.getPass();
01355 out << endl;
01356
01357
01358 size_t mxcSrc = 0, mxcStart = 0, mxcSize = 0, mxcSrcSize = 0;
01359 for (unsigned int i = 0; i < block.getNumberOfSequences(); i++) {
01360 const MafSequence* seq = &block.getSequence(i);
01361 unsigned int start = 0;
01362 if (seq->hasCoordinates())
01363 start = seq->start();
01364 mxcSrc = max(mxcSrc , seq->getName().size());
01365 mxcStart = max(mxcStart , TextTools::toString(start).size());
01366 mxcSize = max(mxcSize , TextTools::toString(seq->getGenomicSize()).size());
01367 mxcSrcSize = max(mxcSrcSize, TextTools::toString(seq->getSrcSize()).size());
01368 }
01369
01370 for (unsigned int i = 0; i < block.getNumberOfSequences(); i++) {
01371 const MafSequence* seq = &block.getSequence(i);
01372 out << "s ";
01373 out << TextTools::resizeRight(seq->getName(), mxcSrc, ' ') << " ";
01374 unsigned int start = 0;
01375 if (seq->hasCoordinates())
01376 start = seq->start();
01377 out << TextTools::resizeLeft(TextTools::toString(start), mxcStart, ' ') << " ";
01378 out << TextTools::resizeLeft(TextTools::toString(seq->getGenomicSize()), mxcSize, ' ') << " ";
01379 out << seq->getStrand() << " ";
01380 out << TextTools::resizeLeft(TextTools::toString(seq->getSrcSize()), mxcSrcSize, ' ') << " ";
01381
01382 string seqstr = seq->toString();
01383 if (mask_ && seq->hasAnnotation(SequenceMask::MASK)) {
01384 const SequenceMask* mask = &dynamic_cast<const SequenceMask&>(seq->getAnnotation(SequenceMask::MASK));
01385 for (unsigned int j = 0; j < seqstr.size(); ++j) {
01386 char c = ((*mask)[j] ? tolower(seqstr[j]) : seqstr[j]);
01387 out << c;
01388 }
01389 } else {
01390 out << seqstr;
01391 }
01392 out << endl;
01393
01394 if (mask_ && seq->hasAnnotation(SequenceQuality::QUALITY_SCORE)) {
01395 const SequenceQuality* qual = &dynamic_cast<const SequenceQuality&>(seq->getAnnotation(SequenceQuality::QUALITY_SCORE));
01396 out << "q ";
01397 out << TextTools::resizeRight(seq->getName(), mxcSrc + mxcStart + mxcSize + mxcSrcSize + 5, ' ') << " ";
01398 string qualStr;
01399 for (unsigned int j = 0; j < seq->size(); ++j) {
01400 int s = (*qual)[j];
01401 if (s == -1) {
01402 qualStr += "-";
01403 } else if (s == -2) {
01404 qualStr += "?";
01405 } else if (s >=0 && s < 10) {
01406 qualStr += TextTools::toString(s);
01407 } else if (s == 10) {
01408 qualStr += "F";
01409 } else {
01410 throw Exception("MafAlignmentParser::writeBlock. Unsuported score value: " + TextTools::toString(s));
01411 }
01412 }
01413 out << qualStr << endl;
01414 }
01415 }
01416 out << endl;
01417 }
01418