bpp-seq  2.1.0
Bpp/Seq/Io/Phylip.cpp
Go to the documentation of this file.
00001 //
00002 // File: Phylip.cpp
00003 // Created by: Julien Dutheil
00004 // Created on: Mon Oct 27 12:22:56 2003
00005 //
00006 
00007 /*
00008 Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
00009 
00010 This software is a computer program whose purpose is to provide classes
00011 for sequences analysis.
00012 
00013 This software is governed by the CeCILL  license under French law and
00014 abiding by the rules of distribution of free software.  You can  use, 
00015 modify and/ or redistribute the software under the terms of the CeCILL
00016 license as circulated by CEA, CNRS and INRIA at the following URL
00017 "http://www.cecill.info". 
00018 
00019 As a counterpart to the access to the source code and  rights to copy,
00020 modify and redistribute granted by the license, users are provided only
00021 with a limited warranty  and the software's author,  the holder of the
00022 economic rights,  and the successive licensors  have only  limited
00023 liability. 
00024 
00025 In this respect, the user's attention is drawn to the risks associated
00026 with loading,  using,  modifying and/or developing or reproducing the
00027 software by the user in light of its specific status of free software,
00028 that may mean  that it is complicated to manipulate,  and  that  also
00029 therefore means  that it is reserved for developers  and  experienced
00030 professionals having in-depth computer knowledge. Users are therefore
00031 encouraged to load and test the software's suitability as regards their
00032 requirements in conditions enabling the security of their systems and/or 
00033 data to be ensured and,  more generally, to use and operate it in the 
00034 same conditions as regards security. 
00035 
00036 The fact that you are presently reading this means that you have had
00037 knowledge of the CeCILL license and that you accept its terms.
00038 */
00039 
00040 #include "Phylip.h"
00041 #include "../Container/SequenceContainerTools.h"
00042 #include <Bpp/Text/TextTools.h>
00043 #include <Bpp/Text/StringTokenizer.h>
00044 #include <Bpp/Io/FileTools.h>
00045 
00046 using namespace bpp;
00047 
00048 // From the STL:
00049 #include <sstream>
00050 
00051 using namespace std;
00052 
00053 /******************************************************************************/
00054 
00055 const std::vector<std::string> Phylip::splitNameAndSequence(const std::string& s) const throw (Exception)
00056 {
00057   vector<string> v(2);
00058   if (extended_)
00059   {
00060     string::size_type index = s.find(namesSplit_);
00061     if(index == string::npos) throw Exception("No sequence name found.");
00062     v[0] = TextTools::removeSurroundingWhiteSpaces(s.substr(0, index));
00063     v[1] = TextTools::removeFirstWhiteSpaces      (s.substr(index + namesSplit_.size())); //There may be more than 2 white spaces.
00064   }
00065   else
00066   {
00067     v[0] = TextTools::removeSurroundingWhiteSpaces(s.substr(0, 10));
00068     v[1] = s.substr(10);
00069   }
00070   return v;
00071 }  
00072 
00073 /******************************************************************************/
00074 
00075 void Phylip::readSequential(std::istream& in, SiteContainer& asc) const throw (Exception)
00076 {
00077   string temp;
00078   
00079   //Ignore first line:
00080   getline(in, temp, '\n');  // Copy current line in temporary string
00081   temp = TextTools::removeSurroundingWhiteSpaces(FileTools::getNextLine(in));
00082   string name = "";
00083   string seq  = "";
00084   
00085   while (!in.eof())
00086   {
00087     // Read each sequence:
00088     vector<string> v;
00089     bool hasName = true;
00090     try
00091     { 
00092       v = splitNameAndSequence(temp);
00093     }
00094     catch (Exception & e)
00095     {
00096       hasName = false;
00097     }
00098     if (hasName)
00099     {
00100       // a new sequence is found:
00101       if (!TextTools::isEmpty(name)) //If this is not the first sequence!
00102       {
00103         // Add the previous sequence to the container:
00104         asc.addSequence(BasicSequence(name, seq, asc.getAlphabet()), checkNames_);
00105       }
00106       name = v[0];
00107       seq  = v[1];
00108     }
00109     else
00110     {
00111       //No sequence name found.
00112       if (TextTools::isEmpty(name))
00113         throw Exception("First sequence in file has no name!");
00114       seq += TextTools::removeWhiteSpaces(temp);
00115     }
00116     //while(!TextTools::isEmpty(temp))
00117     //{
00118     //  //Sequences are separated by at least one blank line:
00119     //  getline(in, temp, '\n');  // read next line in file.
00120     //  seq += TextTools::removeWhiteSpaces(temp);      
00121     //}
00122     //end of this sequence:
00123     temp = TextTools::removeSurroundingWhiteSpaces(FileTools::getNextLine(in));
00124 
00125   }
00126   // Add last sequence:
00127   asc.addSequence(BasicSequence(name, seq, asc.getAlphabet()), checkNames_);
00128 }
00129 
00130 /******************************************************************************/
00131 
00132 void Phylip::readInterleaved(std::istream& in, SiteContainer& asc) const throw (Exception)
00133 {
00134   string temp;
00135   
00136   //Read first line:
00137   getline(in, temp, '\n'); // Copy current line in temporary string
00138   StringTokenizer st(temp);
00139   unsigned int nbSequences = TextTools::to<unsigned int>(st.nextToken());
00140   //int nbSites     = TextTools::toInt(st.nextToken());
00141   temp = FileTools::getNextLine(in);
00142   
00143   vector<string> names, seqs;
00144   // Read first block:
00145   for (unsigned int i = 0; i < nbSequences && !in.eof() && !TextTools::isEmpty(temp); i++)
00146   {
00147     vector<string> v = splitNameAndSequence(temp);
00148     names.push_back(v[0]);
00149     seqs.push_back(v[1]);
00150     getline(in, temp, '\n');  // read next line in file.
00151   }
00152   
00153   //Then read all other blocks:
00154   temp = FileTools::getNextLine(in);
00155   while (!in.eof())
00156   {
00157     for (unsigned int i = 0; i < names.size(); i++)
00158     {
00159       if (TextTools::isEmpty(temp))
00160         throw IOException("Phylip::readInterleaved. Bad file,there are not the same number of sequence in each block.");
00161       seqs[i] += TextTools::removeWhiteSpaces(temp);      
00162       getline(in, temp, '\n');  // read next line in file.
00163     }
00164     temp = FileTools::getNextLine(in);
00165   }
00166   for (unsigned int i = 0; i < names.size(); i++)
00167   {
00168     asc.addSequence(BasicSequence(names[i], seqs[i], asc.getAlphabet()), checkNames_);
00169   }
00170 }
00171   
00172 /******************************************************************************/
00173 
00174 void Phylip::appendAlignmentFromStream(std::istream& input, SiteContainer& vsc) const throw (Exception)
00175 {
00176   // Checking the existence of specified file
00177   if (!input) { throw IOException ("Phylip::read: fail to open file"); }
00178   
00179   if(sequential_) readSequential (input, vsc);
00180   else            readInterleaved(input, vsc);
00181 }
00182 
00183 /******************************************************************************/
00184 
00185 unsigned int Phylip::getNumberOfSequences(const std::string& path) const throw (IOException)
00186 {
00187   // Checking the existence of specified file
00188   ifstream file (path.c_str(), ios::in);
00189   if (! file) { throw IOException ("Phylip::getNumberOfSequences: failed to open file"); }
00190   string firstLine = FileTools::getNextLine(file);
00191   StringTokenizer st(firstLine, " \t");
00192   istringstream iss(st.nextToken());
00193   int nb;
00194   iss >> nb;
00195   file.close();
00196   return nb;
00197 }
00198  
00199 /******************************************************************************/
00200 
00201 std::vector<std::string> Phylip::getSizedNames(const std::vector<std::string>& names) const
00202 {
00203   vector<string> sizedNames(names.size());
00204   if (extended_)
00205   {
00206     //Add 6 white spaces to the larger name and align other names.
00207     //First, determine the size of the wider name:
00208     size_t sizeMax = 0;
00209     for (size_t i = 0; i < names.size(); i++)
00210       if (names[i].size() > sizeMax) sizeMax = names[i].size();
00211     //Quite easy ;-) Now update all lengths:
00212     for (size_t i = 0; i < names.size(); i++)
00213       sizedNames[i] = TextTools::resizeRight(names[i], sizeMax) + namesSplit_;  
00214   }
00215   else
00216   {
00217     //We trunc all names to ten characters:
00218     for(unsigned int i = 0; i < names.size(); i++) sizedNames[i] = TextTools::resizeRight(names[i], 10);
00219     cout << "Warning: names have been truncated to 10 characters. They may be ambiguous sequence names then." << endl;
00220   }
00221   return sizedNames;
00222 }
00223 
00224 /******************************************************************************/
00225 
00226 void Phylip::writeSequential(std::ostream& out, const SequenceContainer& sc, int charsByLine) const
00227 {
00228   //cout << "Write sequential" << endl;
00229   size_t numberOfSites = sc.getSequence(sc.getSequencesNames()[0]).size() * sc.getAlphabet()->getStateCodingSize();
00230   out << sc.getNumberOfSequences() << " " << numberOfSites << endl;
00231   
00232   vector<string> seqNames = sc.getSequencesNames();
00233   vector<string> names = getSizedNames(seqNames);
00234   for (size_t i = 0; i < seqNames.size(); i++)
00235   {
00236     vector<string> seq = TextTools::split(sc.toString(seqNames[i]), charsByLine);
00237     out << names[i] << seq[0] << endl;
00238     for (unsigned int j = 1; j < seq.size(); j++)
00239     {
00240       out << string(names[i].size(), ' ') << seq[j] << endl;
00241     }
00242     out << endl;
00243   }
00244 }
00245 
00246 void Phylip::writeInterleaved(std::ostream& out, const SequenceContainer& sc, int charsByLine) const
00247 {
00248   //cout << "Write interleaved;" << endl;
00249   size_t numberOfSites = sc.getSequence(sc.getSequencesNames()[0]).size() * sc.getAlphabet()->getStateCodingSize();
00250   out << sc.getNumberOfSequences() << " " << numberOfSites << endl;
00251   
00252   vector<string> seqNames = sc.getSequencesNames();
00253   vector<string> names = getSizedNames(seqNames);
00254   //Split sequences:
00255   vector< vector<string> > seqs(sc.getNumberOfSequences());
00256   for (size_t i = 0; i < seqNames.size(); i++)
00257   {
00258     seqs[i] = TextTools::split(sc.toString(seqNames[i]), charsByLine);
00259   }
00260   //Write first block:
00261   for (size_t i = 0; i < names.size(); i++)
00262   {
00263     out << names[i] << seqs[i][0] << endl;
00264   }
00265   out << endl;
00266   //Write other blocks:
00267   for (size_t j = 1; j < seqs[0].size(); j++)
00268   {
00269     for (unsigned int i = 0; i < sc.getNumberOfSequences(); i++)
00270     {
00271       out << seqs[i][j] << endl;
00272     }
00273     out << endl;
00274   }
00275 }
00276 
00277 /******************************************************************************/
00278 
00279 void Phylip::writeAlignment(std::ostream& output, const SiteContainer& sc) const throw (Exception)
00280 {
00281   //First must check if all sequences are aligned:
00282   if (sc.getNumberOfSequences() == 0)
00283     throw Exception("Phylip::write. SequenceContainer appear to contain no sequence.");
00284   
00285   // Checking the existence of specified file, and possibility to open it in write mode
00286   if (!output) { throw IOException ("Phylip::write : failed to open file"); }
00287 
00288   if (sequential_) writeSequential (output, sc, charsByLine_);
00289   else             writeInterleaved(output, sc, charsByLine_);
00290 }
00291 
00292 /******************************************************************************/
00293 
00294 const std::string Phylip::getFormatName() const { return "Phylip file, " + string(extended_ ? "extended," : "") + string(sequential_ ? "sequential" : "interleaved"); }
00295 
00296 /******************************************************************************/
00297 
00298 const std::string Phylip::getFormatDescription() const
00299 {
00300   return "Phylip file format, sequential and interleaved. PAML extension also supported.";
00301 }
00302 
00303 /******************************************************************************/
00304 
 All Classes Namespaces Files Functions Variables Typedefs Friends