bpp-seq 2.0.3

MafIterator.cpp

Go to the documentation of this file.
00001 //
00002 // File: MafIterator.cpp
00003 // Authors: Julien Dutheil
00004 // Created: Tue Sep 07 2010
00005 //
00006 
00007 /*
00008 Copyright or © or Copr. Bio++ Development Team, (2010)
00009 
00010 This software is a computer program whose purpose is to provide classes
00011 for sequences analysis.
00012 
00013 This software is governed by the CeCILL  license under French law and
00014 abiding by the rules of distribution of free software.  You can  use, 
00015 modify and/ or redistribute the software under the terms of the CeCILL
00016 license as circulated by CEA, CNRS and INRIA at the following URL
00017 "http://www.cecill.info". 
00018 
00019 As a counterpart to the access to the source code and  rights to copy,
00020 modify and redistribute granted by the license, users are provided only
00021 with a limited warranty  and the software's author,  the holder of the
00022 economic rights,  and the successive licensors  have only  limited
00023 liability. 
00024 
00025 In this respect, the user's attention is drawn to the risks associated
00026 with loading,  using,  modifying and/or developing or reproducing the
00027 software by the user in light of its specific status of free software,
00028 that may mean  that it is complicated to manipulate,  and  that  also
00029 therefore means  that it is reserved for developers  and  experienced
00030 professionals having in-depth computer knowledge. Users are therefore
00031 encouraged to load and test the software's suitability as regards their
00032 requirements in conditions enabling the security of their systems and/or 
00033 data to be ensured and,  more generally, to use and operate it in the 
00034 same conditions as regards security. 
00035 
00036 The fact that you are presently reading this means that you have had
00037 knowledge of the CeCILL license and that you accept its terms.
00038 */
00039 
00040 #include "MafIterator.h"
00041 #include "IterationListener.h"
00042 #include "../../SequenceWithQuality.h"
00043 #include "../../SequenceWithAnnotationTools.h"
00044 #include "../../SequenceWalker.h"
00045 #include "../../Alphabet/AlphabetTools.h"
00046 #include "../../Container/VectorSiteContainer.h"
00047 #include "../../SiteTools.h"
00048 
00049 using namespace bpp;
00050 
00051 //From the STL:
00052 #include <string>
00053 
00054 using namespace std;
00055 
00056 void AbstractMafIterator::fireIterationStartSignal_() {
00057   for (std::vector<IterationListener*>::iterator it = iterationListeners_.begin(); it != iterationListeners_.end(); ++it) {
00058     (*it)->iterationStarts();
00059   }
00060 }
00061 
00062 void AbstractMafIterator::fireIterationMoveSignal_(const MafBlock& currentBlock) {
00063   for (std::vector<IterationListener*>::iterator it = iterationListeners_.begin(); it != iterationListeners_.end(); ++it) {
00064     (*it)->iterationMoves(currentBlock);
00065   }
00066 }
00067 
00068 void AbstractMafIterator::fireIterationStopSignal_() {
00069   for (std::vector<IterationListener*>::iterator it = iterationListeners_.begin(); it != iterationListeners_.end(); ++it) {
00070     (*it)->iterationStops();
00071   }
00072 }
00073 
00074 MafBlock* SequenceFilterMafIterator::analyseCurrentBlock_() throw (Exception)
00075 {
00076   currentBlock_ = iterator_->nextBlock();
00077   while (currentBlock_) {
00078     map<string, unsigned int> counts;
00079     for (size_t i = currentBlock_->getNumberOfSequences(); i > 0; --i) {
00080       string species = currentBlock_->getSequence(i-1).getSpecies(); 
00081       if (!VectorTools::contains(species_, species)) {
00082         if (logstream_) {
00083           (*logstream_ << "SEQUENCE FILTER: remove sequence '" << species << "' from current block " << currentBlock_->getDescription() << ".").endLine();
00084         }
00085         currentBlock_->getAlignment().deleteSequence(i - 1);
00086       } else {
00087         counts[species]++;
00088       }
00089     }
00090     bool test = currentBlock_->getNumberOfSequences() == 0;
00091     //Avoid a memory leak:
00092     if (test) {
00093       if (logstream_) {
00094         (*logstream_ << "SEQUENCE FILTER: block " << currentBlock_->getDescription() << " is now empty. Try to get the next one.").endLine();
00095       }
00096       delete currentBlock_;
00097     } else {
00098       test = strict_ && (counts.size() != species_.size());
00099       if (test) {
00100         if (logstream_) {
00101           (*logstream_ << "SEQUENCE FILTER: block " << currentBlock_->getDescription() << " does not contain all species and will be ignored. Try to get the next one.").endLine();
00102         }
00103         delete currentBlock_;
00104       } else {
00105         if (rmDuplicates_) {
00106           test = false;
00107           map<string, unsigned int>::iterator it;
00108           for (it = counts.begin(); it != counts.end() && !(test = it->second > 1); it++) {}
00109           if (test) {
00110             if (logstream_) {
00111               (*logstream_ << "SEQUENCE FILTER: block " << currentBlock_->getDescription() << " has two sequences for species '" << it->first << "' and will be ignored. Try to get the next one.").endLine();
00112             }
00113             delete currentBlock_;
00114           } else {
00115             return currentBlock_;
00116           }
00117         } else {
00118           return currentBlock_;
00119         }
00120       }
00121     }
00122 
00123     //Look for the next block:
00124     currentBlock_ = iterator_->nextBlock();
00125   }
00126   
00127   return currentBlock_;
00128 }
00129 
00130 MafBlock* ChromosomeMafIterator::analyseCurrentBlock_() throw (Exception)
00131 {
00132   currentBlock_ = iterator_->nextBlock();
00133   while (currentBlock_) {
00134     bool foundRef = false;
00135     string chr = "";
00136     for (size_t i = 0; i < currentBlock_->getNumberOfSequences() && !foundRef; ++i) {
00137       string species = currentBlock_->getSequence(i).getSpecies(); 
00138       if (species == ref_) {
00139         foundRef = true;
00140         chr = currentBlock_->getSequence(i).getChromosome();
00141       }
00142     }
00143     if (!foundRef) {
00144       if (logstream_) {
00145         (*logstream_ << "CHROMOSOME FILTER: block does not contain reference species and was removed.").endLine();
00146       }
00147       delete currentBlock_;
00148     } else if (chr != chr_) {
00149       if (logstream_) {
00150         (*logstream_ << "CHROMOSOME FILTER: reference species without queried chromosome was removed.").endLine();
00151       }
00152       delete currentBlock_;
00153     } else {
00154       return currentBlock_;
00155     }
00156 
00157     //Look for the next block:
00158     currentBlock_ = iterator_->nextBlock();
00159   }
00160   
00161   return currentBlock_;
00162 }
00163 
00164 MafBlock* DuplicateFilterMafIterator::analyseCurrentBlock_() throw (Exception)
00165 {
00166   currentBlock_ = iterator_->nextBlock();
00167   while (currentBlock_) {
00168     bool foundRef = false;
00169     string chr = "";
00170     char strand = '+';
00171     unsigned int start = 0;
00172     unsigned int stop  = 0;
00173     for (size_t i = 0; i < currentBlock_->getNumberOfSequences() && !foundRef; ++i) {
00174       string species = currentBlock_->getSequence(i).getSpecies(); 
00175       if (species == ref_) {
00176         foundRef = true;
00177         chr    = currentBlock_->getSequence(i).getChromosome();
00178         strand = currentBlock_->getSequence(i).getStrand();
00179         start  = currentBlock_->getSequence(i).start();
00180         stop   = currentBlock_->getSequence(i).stop();
00181       }
00182     }
00183     if (!foundRef) {
00184       if (logstream_) {
00185         (*logstream_ << "DUPLICATE FILTER: block does not contain reference species and was removed.").endLine();
00186       }
00187       delete currentBlock_;
00188     } else {
00189       unsigned int occurrence = blocks_[chr][strand][start][stop]++;
00190       if (occurrence > 0) {
00191         if (logstream_) {
00192           (*logstream_ << "DUPLICATE FILTER: sequence in reference species was found in a previous block. New block was removed.").endLine();
00193         }
00194         delete currentBlock_;
00195       } else {
00196         return currentBlock_;
00197       }
00198     }
00199 
00200     //Look for the next block:
00201     currentBlock_ = iterator_->nextBlock();
00202   }
00203   
00204   return currentBlock_;
00205 }
00206 
00207 MafBlock* BlockMergerMafIterator::analyseCurrentBlock_() throw (Exception)
00208 {
00209   if (!incomingBlock_) return 0;
00210   currentBlock_  = incomingBlock_;
00211   incomingBlock_ = iterator_->nextBlock();
00212   while (incomingBlock_) {
00213     unsigned int globalSpace = 0;
00214     for (unsigned int i = 0; i < species_.size(); ++i) {
00215       try {
00216         const MafSequence* seq1 = &currentBlock_->getSequenceForSpecies(species_[i]); 
00217         const MafSequence* seq2 = &incomingBlock_->getSequenceForSpecies(species_[i]);
00218         if (!seq1->hasCoordinates() || !seq2->hasCoordinates())
00219           throw Exception("BlockMergerMafIterator::nextBlock. Species '" + species_[i] + "' is missing coordinates in at least one block.");
00220 
00221         if (seq1->stop() >= seq2->start())
00222           return currentBlock_;
00223         unsigned int space = seq2->start() - seq1->stop() - 1;
00224         if (space > maxDist_)
00225           return currentBlock_;
00226         if (i == 0)
00227           globalSpace = space;
00228         else {
00229           if (space != globalSpace)
00230             return currentBlock_;
00231         }
00232         if (seq1->getChromosome() != seq2->getChromosome()
00233          || VectorTools::contains(ignoreChrs_, seq1->getChromosome())
00234          || VectorTools::contains(ignoreChrs_, seq2->getChromosome())
00235          || seq1->getStrand() != seq2->getStrand()
00236          || seq1->getSrcSize() != seq2->getSrcSize())
00237         {
00238           //There is a syntheny break in this sequence, so we do not merge the blocks.
00239           return currentBlock_;
00240         }
00241       } catch (SequenceNotFoundException& snfe) {
00242         //At least one block does not contain the sequence.
00243         //We don't merge the blocks:
00244         return currentBlock_;
00245       }
00246     }
00247     //We merge the two blocks:
00248     if (logstream_) {
00249       (*logstream_ << "BLOCK MERGER: merging two consecutive blocks.").endLine();
00250     }
00251     vector<string> sp1 = currentBlock_->getSpeciesList();
00252     vector<string> sp2 = incomingBlock_->getSpeciesList();
00253     vector<string> allSp = VectorTools::unique(VectorTools::vectorUnion(sp1, sp2));
00254     //We need to create a new MafBlock:
00255     MafBlock* mergedBlock = new MafBlock();
00256     //We average the score and pass values:
00257     unsigned int p1 = currentBlock_->getPass();
00258     unsigned int p2 = incomingBlock_->getPass();
00259     if (p1 == p2) mergedBlock->setPass(p1);
00260     double s1 = currentBlock_->getScore();
00261     double n1 = static_cast<double>(currentBlock_->getNumberOfSites());
00262     double s2 = incomingBlock_->getScore();
00263     double n2 = static_cast<double>(incomingBlock_->getNumberOfSites());
00264     mergedBlock->setScore((s1 * n1 + s2 * n2) / (n1 + n2));
00265 
00266     //Now fill the new block:
00267     for (size_t i = 0; i < allSp.size(); ++i) {
00268       auto_ptr<MafSequence> seq;
00269       try {
00270         seq.reset(new MafSequence(currentBlock_->getSequenceForSpecies(allSp[i])));
00271 
00272         //Check is there is a second sequence:
00273         try {
00274           auto_ptr<MafSequence> tmp(new MafSequence(incomingBlock_->getSequenceForSpecies(allSp[i])));
00275           string ref1 = seq->getDescription(), ref2 = tmp->getDescription();
00276           //Add spacer if needed:
00277           if (globalSpace > 0) {
00278             if (logstream_) {
00279               (*logstream_ << "BLOCK MERGER: a spacer of size " << globalSpace <<" is inserted in sequence for species " << allSp[i] << ".").endLine();
00280             }
00281             seq->append(vector<int>(globalSpace, AlphabetTools::DNA_ALPHABET.getUnknownCharacterCode()));
00282           }
00283           if (seq->getChromosome() != tmp->getChromosome()) {
00284             seq->setChromosome(seq->getChromosome() + "-" + tmp->getChromosome());
00285             seq->removeCoordinates();
00286           }
00287           if (seq->getStrand() != tmp->getStrand()) {
00288             seq->setStrand('?');
00289             seq->removeCoordinates();
00290           }
00291           if (seq->getName() != tmp->getName())
00292             tmp->setName(seq->getName()); //force name conversion to prevent exception in 'merge'.
00293           seq->merge(*tmp);
00294           if (logstream_) {
00295             (*logstream_ << "BLOCK MERGER: merging " << ref1 << " with " << ref2 << " into " << seq->getDescription()).endLine();
00296           }
00297         } catch (SequenceNotFoundException& snfe2) {
00298           //There was a first sequence, we just extend it:
00299           string ref1 = seq->getDescription();
00300           seq->setToSizeR(seq->size() + incomingBlock_->getNumberOfSites() + globalSpace);
00301           if (logstream_) {
00302             (*logstream_ << "BLOCK MERGER: extending " << ref1 << " with " << incomingBlock_->getNumberOfSites() << " gaps on the right.").endLine();
00303           }
00304         }
00305       } catch (SequenceNotFoundException& snfe1) {
00306         //There must be a second sequence then:
00307         seq.reset(new MafSequence(incomingBlock_->getSequenceForSpecies(allSp[i])));
00308         string ref2 = seq->getDescription();
00309         seq->setToSizeL(seq->size() + currentBlock_->getNumberOfSites() + globalSpace);
00310         if (logstream_) {
00311           (*logstream_ << "BLOCK MERGER: adding " << ref2 << " and extend it with " << currentBlock_->getNumberOfSites() << " gaps on the left.").endLine();
00312         }
00313       }
00314       mergedBlock->addSequence(*seq);
00315     }
00316     //Cleaning stuff:
00317     delete currentBlock_;
00318     delete incomingBlock_;
00319     currentBlock_ = mergedBlock;
00320     //We check if we can also merge the next block:
00321     incomingBlock_ = iterator_->nextBlock();
00322   }
00323   return currentBlock_;
00324 }
00325 
00326 MafBlock* FullGapFilterMafIterator::analyseCurrentBlock_() throw (Exception)
00327 {
00328   MafBlock* block = iterator_->nextBlock();
00329   if (!block) return 0;
00330 
00331   //We create a copy of the ingroup alignement for better efficiency:
00332   VectorSiteContainer vsc(&AlphabetTools::DNA_ALPHABET);
00333   for (size_t i = 0; i < species_.size(); ++i) {
00334     vsc.addSequence(block->getSequence(i));
00335   }
00336   //Now check the positions that are only made of gaps:
00337   if (verbose_) {
00338     ApplicationTools::message->endLine();
00339     ApplicationTools::displayTask("Cleaning block for gap sites", true);
00340   }
00341   unsigned int n = block->getNumberOfSites();
00342   vector <unsigned int> start;
00343   vector <unsigned int> count;
00344   bool test = false;
00345   for(unsigned int i = 0; i < n; ++i) {
00346     const Site* site = &vsc.getSite(i);
00347     if (SiteTools::isGapOnly(*site)) {
00348       if (test) {
00349         count[count.size() - 1]++;
00350       } else {
00351         start.push_back(i);
00352         count.push_back(1);
00353         test = true;
00354       }
00355     } else {
00356       test = false;
00357     }
00358   }
00359   //Now remove blocks:
00360   unsigned int totalRemoved = 0;
00361   for(size_t i = start.size(); i > 0; --i) {
00362     if (verbose_)
00363       ApplicationTools::displayGauge(start.size() - i, start.size() - 1, '=');
00364     block->getAlignment().deleteSites(start[i - 1], count[i - 1]);
00365     totalRemoved += count[i - 1];
00366   }
00367   if (verbose_)
00368     ApplicationTools::displayTaskDone();
00369   
00370   //Correct coordinates:
00371   if (totalRemoved > 0) {
00372     for (unsigned int i = 0; i < block->getNumberOfSequences(); ++i) {
00373       const MafSequence* seq = &block->getSequence(i);
00374       if (!VectorTools::contains(species_, seq->getSpecies())) {
00375         block->removeCoordinatesFromSequence(i);
00376       }
00377     }
00378   }
00379   if (logstream_) {
00380     (*logstream_ << "FULL GAP CLEANER: " << totalRemoved << " positions have been removed.").endLine();
00381   }
00382   return block;
00383 }
00384 
00385 MafBlock* AlignmentFilterMafIterator::analyseCurrentBlock_() throw (Exception)
00386 {
00387   if (blockBuffer_.size() == 0) {
00388     //Else there is no more block in the buffer, we need to parse more:
00389     do {
00390       MafBlock* block = iterator_->nextBlock();
00391       if (!block) return 0; //No more block.
00392     
00393       //Parse block.
00394       int gap = AlphabetTools::DNA_ALPHABET.getGapCharacterCode();
00395       int unk = AlphabetTools::DNA_ALPHABET.getUnknownCharacterCode();
00396       size_t nr = species_.size();
00397       vector< vector<int> > aln(nr);
00398       for (size_t i = 0; i < nr; ++i) {
00399         aln[i] = block->getSequenceForSpecies(species_[i]).getContent();
00400       }
00401       size_t nc = block->getNumberOfSites();
00402       //First we create a mask:
00403       vector<unsigned int> pos;
00404       vector<bool> col(nr);
00405       //Reset window:
00406       window_.clear();
00407       //Init window:
00408       size_t i;
00409       for (i = 0; i < windowSize_; ++i) {
00410         for (size_t j = 0; j < nr; ++j) {
00411           col[j] = (aln[j][i] == gap || aln[j][i] == unk);
00412         }
00413         window_.push_back(col);
00414       }
00415       //Slide window:
00416       if (verbose_) {
00417         ApplicationTools::message->endLine();
00418         ApplicationTools::displayTask("Sliding window for alignment filter", true);
00419       }
00420       while (i + step_ < nc) {
00421         if (verbose_)
00422           ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1, '>');
00423         //Evaluate current window:
00424         unsigned int sum = 0;
00425         for (size_t u = 0; u < window_.size(); ++u)
00426           for (size_t v = 0; v < window_[u].size(); ++v)
00427             if (window_[u][v]) sum++;
00428         if (sum > maxGap_) {
00429           if (pos.size() == 0) {
00430             pos.push_back(i - windowSize_);
00431             pos.push_back(i);
00432           } else {
00433             if (i - windowSize_ < pos[pos.size() - 1]) {
00434               pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region
00435             } else { //This is a new region
00436               pos.push_back(i - windowSize_);
00437               pos.push_back(i);
00438             }
00439           }
00440         }
00441       
00442         //Move forward:
00443         for (unsigned int k = 0; k < step_; ++k) {
00444           for (size_t j = 0; j < nr; ++j) {
00445             col[j] = (aln[j][i] == gap || aln[j][i] == unk);
00446           }
00447           window_.push_back(col);
00448           window_.pop_front();
00449           ++i;
00450         }
00451       }
00452       
00453       //Evaluate last window:
00454       unsigned int sum = 0;
00455       for (size_t u = 0; u < window_.size(); ++u)
00456         for (size_t v = 0; v < window_[u].size(); ++v)
00457           if (window_[u][v]) sum++;
00458       if (sum > maxGap_) {
00459         if (pos.size() == 0) {
00460           pos.push_back(i - windowSize_);
00461           pos.push_back(i);
00462         } else {
00463           if (i - windowSize_ <= pos[pos.size() - 1]) {
00464             pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region
00465           } else { //This is a new region
00466             pos.push_back(i - windowSize_);
00467             pos.push_back(i);
00468           }
00469         }
00470       } 
00471       if (verbose_)
00472         ApplicationTools::displayTaskDone();
00473     
00474       //Now we remove regions with two many gaps, using a sliding window:
00475       if (pos.size() == 0) {
00476         blockBuffer_.push_back(block);
00477         if (logstream_) {
00478           (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " is clean and kept as is.").endLine();
00479         }
00480       } else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) {
00481         //Everything is removed:
00482         if (logstream_) {
00483           (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " was entirely removed. Tried to get the next one.").endLine();
00484         }
00485       } else {
00486         if (logstream_) {
00487           (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine();
00488         }
00489         if (verbose_) {
00490           ApplicationTools::message->endLine();
00491           ApplicationTools::displayTask("Spliting block", true);
00492         }
00493         for (i = 0; i < pos.size(); i+=2) {
00494           if (verbose_)
00495             ApplicationTools::displayGauge(i, pos.size() - 2, '=');
00496           if (logstream_) {
00497             (*logstream_ << "ALN CLEANER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block " << block->getDescription() << ".").endLine();
00498           }
00499           if (pos[i] > 0) {
00500             MafBlock* newBlock = new MafBlock();
00501             newBlock->setScore(block->getScore());
00502             newBlock->setPass(block->getPass());
00503             for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
00504               MafSequence* subseq;
00505               if (i == 0) {
00506                 subseq = block->getSequence(j).subSequence(0, pos[i]);
00507               } else {
00508                 subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]);
00509               }
00510               newBlock->addSequence(*subseq);
00511               delete subseq;
00512             }
00513             blockBuffer_.push_back(newBlock);
00514           }
00515         
00516           if (keepTrashedBlocks_) {
00517             MafBlock* outBlock = new MafBlock();
00518             outBlock->setScore(block->getScore());
00519             outBlock->setPass(block->getPass());
00520             for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
00521               MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]);
00522               outBlock->addSequence(*outseq);
00523               delete outseq;
00524             } 
00525             trashBuffer_.push_back(outBlock);
00526           }
00527         }
00528         //Add last block:
00529         if (pos[pos.size() - 1] < block->getNumberOfSites()) {
00530           MafBlock* newBlock = new MafBlock();
00531           newBlock->setScore(block->getScore());
00532           newBlock->setPass(block->getPass());
00533           for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
00534             MafSequence* subseq;
00535             subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]);
00536             newBlock->addSequence(*subseq);
00537             delete subseq;
00538           }
00539           blockBuffer_.push_back(newBlock);
00540         }
00541         if (verbose_)
00542           ApplicationTools::displayTaskDone();
00543 
00544         delete block;
00545       }
00546     } while (blockBuffer_.size() == 0);
00547   }
00548 
00549   MafBlock* block = blockBuffer_.front();
00550   blockBuffer_.pop_front();
00551   return block;
00552 }
00553 
00554 MafBlock* AlignmentFilter2MafIterator::analyseCurrentBlock_() throw (Exception)
00555 {
00556   if (blockBuffer_.size() == 0) {
00557     //Else there is no more block in the buffer, we need to parse more:
00558     do {
00559       MafBlock* block = iterator_->nextBlock();
00560       if (!block) return 0; //No more block.
00561     
00562       //Parse block.
00563       int gap = AlphabetTools::DNA_ALPHABET.getGapCharacterCode();
00564       int unk = AlphabetTools::DNA_ALPHABET.getUnknownCharacterCode();
00565       size_t nr = species_.size();
00566       vector< vector<int> > aln(nr);
00567       for (size_t i = 0; i < nr; ++i) {
00568         aln[i] = block->getSequenceForSpecies(species_[i]).getContent();
00569       }
00570       size_t nc = block->getNumberOfSites();
00571       //First we create a mask:
00572       vector<unsigned int> pos;
00573       vector<bool> col(nr);
00574       //Reset window:
00575       window_.clear();
00576       //Init window:
00577       size_t i;
00578       for (i = 0; i < windowSize_; ++i) {
00579         for (size_t j = 0; j < nr; ++j) {
00580           col[j] = (aln[j][i] == gap || aln[j][i] == unk);
00581         }
00582         window_.push_back(col);
00583       }
00584       //Slide window:
00585       if (verbose_) {
00586         ApplicationTools::message->endLine();
00587         ApplicationTools::displayTask("Sliding window for alignment filter", true);
00588       }
00589       while (i + step_ < nc) {
00590         if (verbose_)
00591           ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1, '>');
00592         //Evaluate current window:
00593         unsigned int count = 0;
00594         for (size_t u = 0; u < window_.size(); ++u) {
00595           unsigned int partialCount = 0;
00596           if (u > 0 && window_[u] != window_[u - 1]) {
00597             for (size_t v = 0; v < window_[u].size(); ++v)
00598               if (window_[u][v]) partialCount++;
00599             if (partialCount > maxGap_)
00600               count++;
00601           }
00602         }
00603         if (count > maxPos_) {
00604           if (pos.size() == 0) {
00605             pos.push_back(i - windowSize_);
00606             pos.push_back(i);
00607           } else {
00608             if (i - windowSize_ < pos[pos.size() - 1]) {
00609               pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region
00610             } else { //This is a new region
00611               pos.push_back(i - windowSize_);
00612               pos.push_back(i);
00613             }
00614           }
00615         }
00616       
00617         //Move forward:
00618         for (unsigned int k = 0; k < step_; ++k) {
00619           for (size_t j = 0; j < nr; ++j) {
00620             col[j] = (aln[j][i] == gap || aln[j][i] == unk);
00621           }
00622           window_.push_back(col);
00623           window_.pop_front();
00624           ++i;
00625         }
00626       }
00627 
00628       //Evaluate last window:
00629       unsigned int count = 0;
00630       for (size_t u = 0; u < window_.size(); ++u) {
00631         unsigned int partialCount = 0;
00632         if (u > 0 && window_[u] != window_[u - 1]) {
00633           for (size_t v = 0; v < window_[u].size(); ++v)
00634             if (window_[u][v]) partialCount++;
00635           if (partialCount > maxGap_)
00636             count++;
00637         }
00638       }
00639       if (count > maxPos_) {
00640         if (pos.size() == 0) {
00641           pos.push_back(i - windowSize_);
00642           pos.push_back(i);
00643         } else {
00644           if (i - windowSize_ <= pos[pos.size() - 1]) {
00645             pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region
00646           } else { //This is a new region
00647             pos.push_back(i - windowSize_);
00648             pos.push_back(i);
00649           }
00650         }
00651       } 
00652       if (verbose_)
00653         ApplicationTools::displayTaskDone();
00654     
00655       //Now we remove regions with two many gaps, using a sliding window:
00656       if (pos.size() == 0) {
00657         blockBuffer_.push_back(block);
00658         if (logstream_) {
00659           (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " is clean and kept as is.").endLine();
00660         }
00661       } else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) {
00662         //Everything is removed:
00663         if (logstream_) {
00664           (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " was entirely removed. Tried to get the next one.").endLine();
00665         }
00666       } else {
00667         if (logstream_) {
00668           (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine();
00669         }
00670         if (verbose_) {
00671           ApplicationTools::message->endLine();
00672           ApplicationTools::displayTask("Spliting block", true);
00673         }
00674         for (i = 0; i < pos.size(); i+=2) {
00675           if (verbose_)
00676             ApplicationTools::displayGauge(i, pos.size() - 2, '=');
00677           if (logstream_) {
00678             (*logstream_ << "ALN CLEANER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block " << block->getDescription() << ".").endLine();
00679           }
00680           if (pos[i] > 0) {
00681             MafBlock* newBlock = new MafBlock();
00682             newBlock->setScore(block->getScore());
00683             newBlock->setPass(block->getPass());
00684             for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
00685               MafSequence* subseq;
00686               if (i == 0) {
00687                 subseq = block->getSequence(j).subSequence(0, pos[i]);
00688               } else {
00689                 subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]);
00690               }
00691               newBlock->addSequence(*subseq);
00692               delete subseq;
00693             }
00694             blockBuffer_.push_back(newBlock);
00695           }
00696         
00697           if (keepTrashedBlocks_) {
00698             MafBlock* outBlock = new MafBlock();
00699             outBlock->setScore(block->getScore());
00700             outBlock->setPass(block->getPass());
00701             for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
00702               MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]);
00703               outBlock->addSequence(*outseq);
00704               delete outseq;
00705             } 
00706             trashBuffer_.push_back(outBlock);
00707           }
00708         }
00709         //Add last block:
00710         if (pos[pos.size() - 1] < block->getNumberOfSites()) {
00711           MafBlock* newBlock = new MafBlock();
00712           newBlock->setScore(block->getScore());
00713           newBlock->setPass(block->getPass());
00714           for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
00715             MafSequence* subseq;
00716             subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]);
00717             newBlock->addSequence(*subseq);
00718             delete subseq;
00719           }
00720           blockBuffer_.push_back(newBlock);
00721         }
00722         if (verbose_)
00723           ApplicationTools::displayTaskDone();
00724 
00725         delete block;
00726       }
00727     } while (blockBuffer_.size() == 0);
00728   }
00729 
00730   MafBlock* block = blockBuffer_.front();
00731   blockBuffer_.pop_front();
00732   return block;
00733 }
00734 
00735 MafBlock* MaskFilterMafIterator::analyseCurrentBlock_() throw (Exception)
00736 {
00737   if (blockBuffer_.size() == 0) {
00738     do {
00739       //Else there is no more block in the buffer, we need parse more:
00740       MafBlock* block = iterator_->nextBlock();
00741       if (!block) return 0; //No more block.
00742     
00743       //Parse block.
00744       vector< vector<bool> > aln;
00745       for (size_t i = 0; i < species_.size(); ++i) {
00746         const MafSequence* seq = &block->getSequenceForSpecies(species_[i]);
00747         if (seq->hasAnnotation(SequenceMask::MASK)) {
00748           aln.push_back(dynamic_cast<const SequenceMask&>(seq->getAnnotation(SequenceMask::MASK)).getMask());
00749         }
00750       }
00751       size_t nr = aln.size();
00752       size_t nc = block->getNumberOfSites();
00753       //First we create a mask:
00754       vector<unsigned int> pos;
00755       vector<bool> col(nr);
00756       //Reset window:
00757       window_.clear();
00758       //Init window:
00759       size_t i;
00760       for (i = 0; i < windowSize_; ++i) {
00761         for (size_t j = 0; j < nr; ++j) {
00762           col[j] = aln[j][i];
00763         }
00764         window_.push_back(col);
00765       }
00766       //Slide window:
00767       if (verbose_) {
00768         ApplicationTools::message->endLine();
00769         ApplicationTools::displayTask("Sliding window for mask filter", true);
00770       }
00771       while (i + step_ < nc) {
00772         if (verbose_)
00773           ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1, '>');
00774         //Evaluate current window:
00775         unsigned int sum = 0;
00776         for (size_t u = 0; u < window_.size(); ++u)
00777           for (size_t v = 0; v < window_[u].size(); ++v)
00778             if (window_[u][v]) sum++;
00779         if (sum > maxMasked_) {
00780           if (pos.size() == 0) {
00781             pos.push_back(i - windowSize_);
00782             pos.push_back(i);
00783           } else {
00784             if (i - windowSize_ <= pos[pos.size() - 1]) {
00785               pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region
00786             } else { //This is a new region
00787               pos.push_back(i - windowSize_);
00788               pos.push_back(i);
00789             }
00790           }
00791         }
00792       
00793         //Move forward:
00794         for (unsigned int k = 0; k < step_; ++k) {
00795           for (size_t j = 0; j < nr; ++j) {
00796             col[j] = aln[j][i];
00797           }  
00798           window_.push_back(col);
00799           window_.pop_front();
00800           ++i;
00801         }
00802       }
00803 
00804       //Evaluate last window:
00805       unsigned int sum = 0;
00806       for (size_t u = 0; u < window_.size(); ++u)
00807         for (size_t v = 0; v < window_[u].size(); ++v)
00808           if (window_[u][v]) sum++;
00809       if (sum > maxMasked_) {
00810         if (pos.size() == 0) {
00811           pos.push_back(i - windowSize_);
00812           pos.push_back(i);
00813         } else {
00814           if (i - windowSize_ < pos[pos.size() - 1]) {
00815             pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region
00816           } else { //This is a new region
00817             pos.push_back(i - windowSize_);
00818             pos.push_back(i);
00819           }
00820         }
00821       }
00822       if (verbose_)
00823         ApplicationTools::displayTaskDone();
00824     
00825       //Now we remove regions with two many gaps, using a sliding window:
00826       if (pos.size() == 0) {
00827         blockBuffer_.push_back(block);
00828         if (logstream_) {
00829           (*logstream_ << "MASK CLEANER: block is clean and kept as is.").endLine();
00830         }
00831       } else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) {
00832         //Everything is removed:
00833         if (logstream_) {
00834           (*logstream_ << "MASK CLEANER: block was entirely removed. Tried to get the next one.").endLine();
00835         }
00836       } else {
00837         if (logstream_) {
00838           (*logstream_ << "MASK CLEANER: block with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine();
00839         }
00840         if (verbose_) {
00841           ApplicationTools::message->endLine();
00842           ApplicationTools::displayTask("Spliting block", true);
00843         }
00844         for (i = 0; i < pos.size(); i+=2) {
00845           if (verbose_)
00846             ApplicationTools::displayGauge(i, pos.size() - 2, '=');
00847           if (logstream_) {
00848             (*logstream_ << "MASK CLEANER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block.").endLine();
00849           }
00850           if (pos[i] > 0) {
00851             MafBlock* newBlock = new MafBlock();
00852             newBlock->setScore(block->getScore());
00853             newBlock->setPass(block->getPass());
00854             for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
00855               MafSequence* subseq;
00856               if (i == 0) {
00857                 subseq = block->getSequence(j).subSequence(0, pos[i]);
00858               } else {
00859                 subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]);
00860               }
00861               newBlock->addSequence(*subseq);
00862               delete subseq;
00863             }
00864             blockBuffer_.push_back(newBlock);
00865           }
00866           
00867           if (keepTrashedBlocks_) {
00868             MafBlock* outBlock = new MafBlock();
00869             outBlock->setScore(block->getScore());
00870             outBlock->setPass(block->getPass());
00871             for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
00872               MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]);
00873               outBlock->addSequence(*outseq);
00874               delete outseq;
00875             } 
00876             trashBuffer_.push_back(outBlock);
00877           }
00878         }
00879         //Add last block:
00880         if (pos[pos.size() - 1] < block->getNumberOfSites()) {
00881           MafBlock* newBlock = new MafBlock();
00882           newBlock->setScore(block->getScore());
00883           newBlock->setPass(block->getPass());
00884           for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
00885             MafSequence* subseq;
00886             subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]);
00887             newBlock->addSequence(*subseq);
00888             delete subseq;
00889           }
00890           blockBuffer_.push_back(newBlock);
00891         }
00892         if (verbose_)
00893           ApplicationTools::displayTaskDone();
00894 
00895         delete block;
00896       }  
00897     } while (blockBuffer_.size() == 0);
00898   }
00899 
00900   MafBlock* block = blockBuffer_.front();
00901   blockBuffer_.pop_front();
00902   return block;
00903 }
00904 
00905 MafBlock* QualityFilterMafIterator::analyseCurrentBlock_() throw (Exception)
00906 {
00907   if (blockBuffer_.size() == 0) {
00908     do {
00909       //Else there is no more block in the buffer, we need parse more:
00910       MafBlock* block = iterator_->nextBlock();
00911       if (!block) return 0; //No more block.
00912     
00913       //Parse block.
00914       vector< vector<int> > aln;
00915       for (size_t i = 0; i < species_.size(); ++i) {
00916         const MafSequence* seq = &block->getSequenceForSpecies(species_[i]);
00917         if (seq->hasAnnotation(SequenceQuality::QUALITY_SCORE)) {
00918           aln.push_back(dynamic_cast<const SequenceQuality&>(seq->getAnnotation(SequenceQuality::QUALITY_SCORE)).getScores());
00919         }
00920       }
00921       if (aln.size() != species_.size()) {
00922         blockBuffer_.push_back(block);
00923         if (logstream_) {
00924           (*logstream_ << "QUAL CLEANER: block is missing quality score for at least one species and will therefore not be filtered.").endLine();
00925         }
00926         //NB here we could decide to discard the block instead!
00927       } else {
00928         size_t nr = aln.size();
00929         size_t nc = block->getNumberOfSites();
00930         //First we create a mask:
00931         vector<unsigned int> pos;
00932         vector<int> col(nr);
00933         //Reset window:
00934         window_.clear();
00935         //Init window:
00936         size_t i;
00937         for (i = 0; i < windowSize_; ++i) {
00938           for (size_t j = 0; j < nr; ++j) {
00939             col[j] = aln[j][i];
00940           }
00941           window_.push_back(col);
00942         }
00943         //Slide window:
00944         if (verbose_) {
00945           ApplicationTools::message->endLine();
00946           ApplicationTools::displayTask("Sliding window for quality filter", true);
00947         }
00948         while (i + step_ < nc) {
00949           if (verbose_)
00950             ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1, '>');
00951           //Evaluate current window:
00952           double mean = 0;
00953           double n = static_cast<double>(aln.size() * windowSize_);
00954           for (size_t u = 0; u < window_.size(); ++u)
00955             for (size_t v = 0; v < window_[u].size(); ++v) {
00956               mean += (window_[u][v] > 0 ? static_cast<double>(window_[u][v]) : 0.);
00957               if (window_[u][v] == -1) n--;
00958             }
00959           if (n > 0 && (mean / n) < minQual_) {
00960             if (pos.size() == 0) {
00961               pos.push_back(i - windowSize_);
00962               pos.push_back(i);
00963             } else {
00964               if (i - windowSize_ <= pos[pos.size() - 1]) {
00965                 pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region
00966               } else { //This is a new region
00967                 pos.push_back(i - windowSize_);
00968                 pos.push_back(i);
00969               }
00970             }
00971           }
00972       
00973           //Move forward:
00974           for (unsigned int k = 0; k < step_; ++k) {
00975             for (size_t j = 0; j < nr; ++j) {
00976               col[j] = aln[j][i];
00977             }  
00978             window_.push_back(col);
00979             window_.pop_front();
00980             ++i;
00981           }
00982         }
00983 
00984         //Evaluate last window:
00985         double mean = 0;
00986         double n = static_cast<double>(aln.size() * windowSize_);
00987         for (size_t u = 0; u < window_.size(); ++u)
00988           for (size_t v = 0; v < window_[u].size(); ++v) {
00989             mean += (window_[u][v] > 0 ? static_cast<double>(window_[u][v]) : 0.);
00990             if (window_[u][v] == -1) n--;
00991           }
00992         if (n > 0 && (mean / n) < minQual_) {
00993           if (pos.size() == 0) {
00994             pos.push_back(i - windowSize_);
00995             pos.push_back(i);
00996           } else {
00997             if (i - windowSize_ < pos[pos.size() - 1]) {
00998               pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region
00999             } else { //This is a new region
01000               pos.push_back(i - windowSize_);
01001               pos.push_back(i);
01002             }
01003           }
01004         }
01005         if (verbose_)
01006           ApplicationTools::displayTaskDone();
01007     
01008         //Now we remove regions with two many gaps, using a sliding window:
01009         if (pos.size() == 0) {
01010           blockBuffer_.push_back(block);
01011           if (logstream_) {
01012             (*logstream_ << "QUAL CLEANER: block is clean and kept as is.").endLine();
01013           }
01014         } else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) {
01015           //Everything is removed:
01016           if (logstream_) {
01017             (*logstream_ << "QUAL CLEANER: block was entirely removed. Tried to get the next one.").endLine();
01018           }
01019         } else {
01020           if (logstream_) {
01021             (*logstream_ << "QUAL CLEANER: block with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine();
01022           }
01023           if (verbose_) {
01024             ApplicationTools::message->endLine();
01025             ApplicationTools::displayTask("Spliting block", true);
01026           }
01027           for (i = 0; i < pos.size(); i+=2) {
01028             if (verbose_)
01029               ApplicationTools::displayGauge(i, pos.size() - 2, '=');
01030             if (logstream_) {
01031               (*logstream_ << "QUAL CLEANER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block.").endLine();
01032             }
01033             if (pos[i] > 0) {
01034               MafBlock* newBlock = new MafBlock();
01035               newBlock->setScore(block->getScore());
01036               newBlock->setPass(block->getPass());
01037               for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
01038                 MafSequence* subseq;
01039                 if (i == 0) {
01040                   subseq = block->getSequence(j).subSequence(0, pos[i]);
01041                 } else {
01042                   subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]);
01043                 }
01044                 newBlock->addSequence(*subseq);
01045                 delete subseq;
01046               }
01047               blockBuffer_.push_back(newBlock);
01048             }
01049            
01050             if (keepTrashedBlocks_) {
01051               MafBlock* outBlock = new MafBlock();
01052               outBlock->setScore(block->getScore());
01053               outBlock->setPass(block->getPass());
01054               for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
01055                 MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]);
01056                 outBlock->addSequence(*outseq);
01057                 delete outseq;
01058               } 
01059               trashBuffer_.push_back(outBlock);
01060             }
01061           }
01062           //Add last block:
01063           if (pos[pos.size() - 1] < block->getNumberOfSites()) {
01064             MafBlock* newBlock = new MafBlock();
01065             newBlock->setScore(block->getScore());
01066             newBlock->setPass(block->getPass());
01067             for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
01068               MafSequence* subseq;
01069               subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]);
01070               newBlock->addSequence(*subseq);
01071               delete subseq;
01072             }
01073             blockBuffer_.push_back(newBlock);
01074           }
01075           if (verbose_)
01076             ApplicationTools::displayTaskDone();
01077 
01078           delete block;
01079         }
01080       }
01081     } while (blockBuffer_.size() == 0);
01082   }
01083 
01084   MafBlock* block = blockBuffer_.front();
01085   blockBuffer_.pop_front();
01086   return block;
01087 }
01088 
01089 SequenceStatisticsMafIterator::SequenceStatisticsMafIterator(MafIterator* iterator, const std::vector<MafStatistics*> statistics) :
01090   AbstractFilterMafIterator(iterator),
01091   statistics_(statistics),
01092   results_(),
01093   names_()
01094 {
01095   string name;
01096   for (size_t i = 0; i < statistics_.size(); ++i) {
01097     name = statistics_[i]->getShortName();
01098     vector<string> tags = statistics_[i]->getSupportedTags();
01099     if (tags.size() > 1) {
01100       for (size_t j = 0; j < tags.size(); ++j) {
01101         names_.push_back(name + "." + tags[j]);
01102       }
01103     } else {
01104       names_.push_back(name);
01105     }
01106   }
01107   results_.resize(names_.size());
01108 }
01109 
01110 MafBlock* SequenceStatisticsMafIterator::analyseCurrentBlock_() throw (Exception)
01111 {
01112   vector<string> tags;
01113   currentBlock_ = iterator_->nextBlock();
01114   if (currentBlock_) {
01115     size_t k = 0;
01116     for (size_t i = 0; i < statistics_.size(); ++i) {
01117       statistics_[i]->compute(*currentBlock_);
01118       const MafStatisticsResult& result = statistics_[i]->getResult();
01119       tags = statistics_[i]->getSupportedTags();
01120       for (size_t j = 0; j < tags.size(); ++j) {
01121         if (result.hasValue(tags[j])) {
01122           results_[k] = result.getValue(tags[j]);
01123         } else {
01124           results_[k] = NumConstants::NaN;
01125         }
01126         k++;
01127       }
01128     }
01129   }
01130   return currentBlock_;
01131 }
01132 
01133 MafBlock* FeatureFilterMafIterator::analyseCurrentBlock_() throw (Exception)
01134 {
01135   if (blockBuffer_.size() == 0) {
01136     //Unless there is no more block in the buffer, we need to parse more:
01137     do {
01138       MafBlock* block = iterator_->nextBlock();
01139       if (!block) return 0; //No more block.
01140 
01141       //Check if the block contains the reference species:
01142       if (!block->hasSequenceForSpecies(refSpecies_)) {
01143         if (logstream_) {
01144           (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " does not contain the reference species and was kept as is.").endLine(); 
01145         }
01146         return block;
01147       }
01148 
01149       //Get the feature ranges for this block:
01150       const MafSequence& refSeq = block->getSequenceForSpecies(refSpecies_);
01151       //first check if there is one (for now we assume that features refer to the chromosome or contig name, with implicit species):
01152       std::map<std::string, MultiRange<unsigned int> >::iterator mr = ranges_.find(refSeq.getChromosome());
01153       if (mr == ranges_.end()) {
01154         if (logstream_) {
01155           (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " does not contain any feature and was kept as is.").endLine(); 
01156         }
01157         return block;
01158       }
01159       //else
01160       MultiRange<unsigned int> mRange = mr->second;
01161       mRange.restrictTo(Range<unsigned int>(refSeq.start(), refSeq.stop() + 1));
01162       if (mRange.isEmpty()) {
01163         if (logstream_) {
01164           (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " does not contain any feature and was kept as is.").endLine(); 
01165         }
01166         return block;
01167       }
01168       std::vector<unsigned int> tmp = mRange.getBounds(); 
01169       std::deque<unsigned int> refBounds(tmp.begin(), tmp.end()); 
01170 
01171       //Now extract corresponding alignments. We use the range to split the original block.
01172       //Only thing to watch out is the coordinates, refering to the ref species...
01173       //A good idea is then to convert those with respect to the given block:
01174 
01175       int gap = refSeq.getAlphabet()->getGapCharacterCode();
01176       long int refPos = static_cast<long int>(refSeq.start()) - 1;
01177       std::vector<size_t> pos;
01178       for (size_t alnPos = 0; alnPos < refSeq.size() && refBounds.size() > 0; ++alnPos) {
01179         if (refSeq[alnPos] != gap) {
01180           refPos++;
01181           //check if this position is a bound:
01182           if (refBounds.front() == static_cast<unsigned int>(refPos)) {
01183             pos.push_back(alnPos);
01184             refBounds.pop_front();
01185           }
01186         }
01187       }
01188       //Check if the last bound matches the end of the alignment:
01189       if (refBounds.size() > 0 && refBounds.front() == refSeq.stop() + 1) {
01190         pos.push_back(refSeq.size());
01191         refBounds.pop_front();
01192       }
01193 
01194       if (refBounds.size() > 0) {
01195         VectorTools::print(vector<unsigned int>(refBounds.begin(), refBounds.end()));
01196         throw Exception("FeatureFilterMafIterator::nextBlock(). An error occurred here, " + TextTools::toString(refBounds.size()) + " coordinates are left... this is most likely a bug, please report!");
01197       }
01198 
01199       //Next step is simply to split the black according to the translated coordinates:
01200       if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) {
01201         //Everything is removed:
01202         if (logstream_) {
01203           (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " was entirely removed. Tried to get the next one.").endLine();
01204         }
01205       } else {
01206         if (logstream_) {
01207           (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine();
01208         }
01209         if (verbose_) {
01210           ApplicationTools::message->endLine();
01211           ApplicationTools::displayTask("Spliting block", true);
01212         }
01213         for (size_t i = 0; i < pos.size(); i+=2) {
01214           if (verbose_)
01215             ApplicationTools::displayGauge(i, pos.size() - 2, '=');
01216           if (logstream_) {
01217             (*logstream_ << "FEATURE FILTER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block " << block->getDescription() << ".").endLine();
01218           }
01219           if (pos[i] > 0) {
01220             MafBlock* newBlock = new MafBlock();
01221             newBlock->setScore(block->getScore());
01222             newBlock->setPass(block->getPass());
01223             for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
01224               MafSequence* subseq;
01225               if (i == 0) {
01226                 subseq = block->getSequence(j).subSequence(0, pos[i]);
01227               } else {
01228                 subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]);
01229               }
01230               newBlock->addSequence(*subseq);
01231               delete subseq;
01232             }
01233             blockBuffer_.push_back(newBlock);
01234           }
01235         
01236           if (keepTrashedBlocks_) {
01237             MafBlock* outBlock = new MafBlock();
01238             outBlock->setScore(block->getScore());
01239             outBlock->setPass(block->getPass());
01240             for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
01241               MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]);
01242               outBlock->addSequence(*outseq);
01243               delete outseq;
01244             } 
01245             trashBuffer_.push_back(outBlock);
01246           }
01247         }
01248         //Add last block:
01249         if (pos.back() < block->getNumberOfSites()) {
01250           MafBlock* newBlock = new MafBlock();
01251           newBlock->setScore(block->getScore());
01252           newBlock->setPass(block->getPass());
01253           for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
01254             MafSequence* subseq;
01255             subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]);
01256             newBlock->addSequence(*subseq);
01257             delete subseq;
01258           }
01259           blockBuffer_.push_back(newBlock);
01260         }
01261         if (verbose_)
01262           ApplicationTools::displayTaskDone();
01263 
01264         delete block;
01265       }
01266     } while (blockBuffer_.size() == 0);
01267   }
01268 
01269   MafBlock* nxtBlock = blockBuffer_.front();
01270   blockBuffer_.pop_front();
01271   return nxtBlock;
01272 }
01273 
01274 MafBlock* FeatureExtractor::analyseCurrentBlock_() throw (Exception)
01275 {
01276   if (blockBuffer_.size() == 0) {
01277     //Unless there is no more block in the buffer, we need to parse more:
01278     START:
01279     MafBlock* block = iterator_->nextBlock();
01280     if (!block) return 0; //No more block.
01281 
01282     //Check if the block contains the reference species:
01283     if (!block->hasSequenceForSpecies(refSpecies_))
01284       goto START;
01285 
01286     //Get the feature ranges for this block:
01287     const MafSequence& refSeq = block->getSequenceForSpecies(refSpecies_);
01288     //first check if there is one (for now we assume that features refer to the chromosome or contig name, with implicit species):
01289     std::map<std::string, RangeSet<unsigned int> >::iterator mr = ranges_.find(refSeq.getChromosome());
01290     if (mr == ranges_.end())
01291       goto START;
01292         
01293     RangeSet<unsigned int> ranges = mr->second;
01294     ranges.restrictTo(Range<unsigned int>(refSeq.start(), refSeq.stop()));
01295     if (ranges.isEmpty())
01296       goto START;
01297 
01298     //We will need to convert to alignment positions, using a sequence walker:
01299     SequenceWalker walker(refSeq);
01300 
01301     //Now creates all blocks for all ranges:
01302     if (verbose_) {
01303       ApplicationTools::message->endLine();
01304       ApplicationTools::displayTask("Extracting annotations", true);
01305     }
01306     if (logstream_) {
01307       (*logstream_ << "FEATURE EXTRACTOR: extracting " << ranges.getSet().size() << " features from block " << block->getDescription() << ".").endLine();
01308     }
01309 
01310     unsigned int i = 0;
01311     for (set<Range<unsigned int>*>::iterator it = ranges.getSet().begin();
01312         it !=  ranges.getSet().end();
01313         ++it)
01314     {
01315       if (verbose_) {
01316         ApplicationTools::displayGauge(i++, ranges.getSet().size() - 1, '=');
01317       }
01318       MafBlock* newBlock = new MafBlock();
01319       newBlock->setScore(block->getScore());
01320       newBlock->setPass(block->getPass());
01321       for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
01322         auto_ptr<MafSequence> subseq;
01323         unsigned int a = walker.getAlignmentPosition((**it).begin() - refSeq.start());
01324         unsigned int b = walker.getAlignmentPosition((**it).end() - refSeq.start() - 1);
01325         subseq.reset(block->getSequence(j).subSequence(a, b - a + 1));
01326         if (!ignoreStrand_) {
01327           if (dynamic_cast<SeqRange*>(*it)->isNegativeStrand()) {
01328             SequenceTools::invertComplement(*subseq);
01329           }
01330         }
01331         newBlock->addSequence(*subseq);
01332       }
01333       blockBuffer_.push_back(newBlock);
01334     }
01335         
01336     if (verbose_)
01337       ApplicationTools::displayTaskDone();
01338 
01339     delete block;
01340   }
01341 
01342   MafBlock* nxtBlock = blockBuffer_.front();
01343   blockBuffer_.pop_front();
01344   return nxtBlock;
01345 }
01346 
01347 
01348 void OutputMafIterator::writeHeader(std::ostream& out) const
01349 {
01350   out << "##maf version=1 program=Bio++" << endl << "#" << endl;
01351   //There are more options in the header that we may want to support...
01352 }
01353 
01354 void OutputMafIterator::writeBlock(std::ostream& out, const MafBlock& block) const
01355 {
01356   out << "a";
01357   if (! isinf(block.getScore()))
01358     out << " score=" << block.getScore();
01359   if (block.getPass() > 0)
01360     out << " pass=" << block.getPass();
01361   out << endl;
01362   
01363   //Now we write sequences. First need to count characters for aligning blocks:
01364   size_t mxcSrc = 0, mxcStart = 0, mxcSize = 0, mxcSrcSize = 0;
01365   for (unsigned int i = 0; i < block.getNumberOfSequences(); i++) {
01366     const MafSequence* seq = &block.getSequence(i);
01367     unsigned int start = 0; //Maybe we should output sthg else here?
01368     if (seq->hasCoordinates())
01369       start = seq->start();
01370     mxcSrc     = max(mxcSrc    , seq->getName().size());
01371     mxcStart   = max(mxcStart  , TextTools::toString(start).size());
01372     mxcSize    = max(mxcSize   , TextTools::toString(seq->getGenomicSize()).size());
01373     mxcSrcSize = max(mxcSrcSize, TextTools::toString(seq->getSrcSize()).size());
01374   }
01375   //Now print each sequence:
01376   for (unsigned int i = 0; i < block.getNumberOfSequences(); i++) {
01377     const MafSequence* seq = &block.getSequence(i);
01378     out << "s ";
01379     out << TextTools::resizeRight(seq->getName(), mxcSrc, ' ') << " ";
01380     unsigned int start = 0; //Maybe we should output sthg else here?
01381     if (seq->hasCoordinates())
01382       start = seq->start();
01383     out << TextTools::resizeLeft(TextTools::toString(start), mxcStart, ' ') << " ";
01384     out << TextTools::resizeLeft(TextTools::toString(seq->getGenomicSize()), mxcSize, ' ') << " ";
01385     out << seq->getStrand() << " ";
01386     out << TextTools::resizeLeft(TextTools::toString(seq->getSrcSize()), mxcSrcSize, ' ') << " ";
01387     //Shall we write the sequence as masked?
01388     string seqstr = seq->toString();
01389     if (mask_ && seq->hasAnnotation(SequenceMask::MASK)) {
01390       const SequenceMask* mask = &dynamic_cast<const SequenceMask&>(seq->getAnnotation(SequenceMask::MASK));
01391       for (unsigned int j = 0; j < seqstr.size(); ++j) {
01392         char c = ((*mask)[j] ? tolower(seqstr[j]) : seqstr[j]);
01393         out << c;
01394       }
01395     } else {
01396       out << seqstr;
01397     }
01398     out << endl;
01399     //Write quality scores if any:
01400     if (mask_ && seq->hasAnnotation(SequenceQuality::QUALITY_SCORE)) {
01401       const SequenceQuality* qual = &dynamic_cast<const SequenceQuality&>(seq->getAnnotation(SequenceQuality::QUALITY_SCORE));
01402       out << "q ";
01403       out << TextTools::resizeRight(seq->getName(), mxcSrc + mxcStart + mxcSize + mxcSrcSize + 5, ' ') << " ";
01404       string qualStr;
01405       for (unsigned int j = 0; j < seq->size(); ++j) {
01406         int s = (*qual)[j];
01407         if (s == -1) {
01408           qualStr += "-";
01409         } else if (s == -2) {
01410           qualStr += "?";
01411         } else if (s >=0 && s < 10) {
01412           qualStr += TextTools::toString(s);
01413         } else if (s == 10) {
01414           qualStr += "F";
01415         } else {
01416           throw Exception("MafAlignmentParser::writeBlock. Unsuported score value: " + TextTools::toString(s));
01417         }
01418       }
01419       out << qualStr << endl;
01420     }
01421   }
01422   out << endl;
01423 }
01424 
01425 void OutputAlignmentMafIterator::writeBlock(std::ostream& out, const MafBlock& block) const {
01426   //First get alignment:
01427   AlignedSequenceContainer aln(block.getAlignment());
01428   //Format sequence names:
01429   vector<string> names(aln.getNumberOfSequences());
01430   for (unsigned int i = 0; i < aln.getNumberOfSequences(); ++i) {
01431     const MafSequence& mafseq = block.getSequence(i);
01432     names[i] = mafseq.getSpecies() + "-" + mafseq.getChromosome() + "(" + mafseq.getStrand() + ")/" + TextTools::toString(mafseq.start() + 1) + "-" + TextTools::toString(mafseq.stop() + 1);
01433   }
01434   aln.setSequencesNames(names);
01435   writer_.write(out, aln);
01436 }
01437 
01438 const short WindowSplitMafIterator::RAGGED_LEFT = 0;
01439 const short WindowSplitMafIterator::RAGGED_RIGHT = 1;
01440 const short WindowSplitMafIterator::CENTER = 2;
01441 const short WindowSplitMafIterator::ADJUST= 3;
01442 
01443 MafBlock* WindowSplitMafIterator::analyseCurrentBlock_() throw (Exception)
01444 {
01445   while (blockBuffer_.size() == 0) {
01446     //Build a new series of windows:
01447     MafBlock* block = iterator_->nextBlock();
01448     if (!block) return 0; //No more block.
01449 
01450     unsigned int pos = 0;
01451     unsigned int size = windowSize_;
01452     unsigned int bSize = block->getNumberOfSites();
01453 
01454     switch (align_) {
01455       case (RAGGED_RIGHT) : { pos = bSize % windowSize_; break; }
01456       case (CENTER)       : { pos = (bSize % windowSize_) / 2; break; }
01457       case (ADJUST)       : {
01458           unsigned int x = bSize / windowSize_;
01459           if (x == 0) size = bSize;
01460           else        size = bSize / x;
01461           break;
01462         }               
01463       default             : { }
01464     }
01465     //cout << "Effective size: " << size << endl;
01466     for(unsigned int i = pos; i < bSize; i += size) {
01467       MafBlock* newBlock = new MafBlock();
01468       newBlock->setScore(block->getScore());
01469       newBlock->setPass(block->getPass());
01470       if (align_ == ADJUST) {
01471         if (bSize - (i + size) > 0 && bSize - (i + size) < size) {
01472           //cout << "Old size: " << size;
01473           size = bSize - i; //Adjust for last block because of rounding.
01474                             //this should not increase size by more than 1!
01475           //cout << " => new size: " << size << endl;
01476         }
01477       }
01478       for (unsigned int j = 0; j < block->getNumberOfSequences(); ++j) {
01479         auto_ptr<MafSequence> subseq(block->getSequence(j).subSequence(i, size));
01480         newBlock->addSequence(*subseq);
01481       }
01482       blockBuffer_.push_back(newBlock);
01483     }
01484   }
01485   
01486   MafBlock* nxtBlock = blockBuffer_.front();
01487   blockBuffer_.pop_front();
01488   return nxtBlock;
01489 }
01490 
 All Classes Namespaces Files Functions Variables Typedefs Friends