|
bpp-seq
2.1.0
|
00001 // 00002 // File: Phylip.cpp 00003 // Created by: Julien Dutheil 00004 // Created on: Mon Oct 27 12:22:56 2003 00005 // 00006 00007 /* 00008 Copyright or © or Copr. Bio++ Development Team, (November 17, 2004) 00009 00010 This software is a computer program whose purpose is to provide classes 00011 for sequences analysis. 00012 00013 This software is governed by the CeCILL license under French law and 00014 abiding by the rules of distribution of free software. You can use, 00015 modify and/ or redistribute the software under the terms of the CeCILL 00016 license as circulated by CEA, CNRS and INRIA at the following URL 00017 "http://www.cecill.info". 00018 00019 As a counterpart to the access to the source code and rights to copy, 00020 modify and redistribute granted by the license, users are provided only 00021 with a limited warranty and the software's author, the holder of the 00022 economic rights, and the successive licensors have only limited 00023 liability. 00024 00025 In this respect, the user's attention is drawn to the risks associated 00026 with loading, using, modifying and/or developing or reproducing the 00027 software by the user in light of its specific status of free software, 00028 that may mean that it is complicated to manipulate, and that also 00029 therefore means that it is reserved for developers and experienced 00030 professionals having in-depth computer knowledge. Users are therefore 00031 encouraged to load and test the software's suitability as regards their 00032 requirements in conditions enabling the security of their systems and/or 00033 data to be ensured and, more generally, to use and operate it in the 00034 same conditions as regards security. 00035 00036 The fact that you are presently reading this means that you have had 00037 knowledge of the CeCILL license and that you accept its terms. 00038 */ 00039 00040 #include "Phylip.h" 00041 #include "../Container/SequenceContainerTools.h" 00042 #include <Bpp/Text/TextTools.h> 00043 #include <Bpp/Text/StringTokenizer.h> 00044 #include <Bpp/Io/FileTools.h> 00045 00046 using namespace bpp; 00047 00048 // From the STL: 00049 #include <sstream> 00050 00051 using namespace std; 00052 00053 /******************************************************************************/ 00054 00055 const std::vector<std::string> Phylip::splitNameAndSequence(const std::string& s) const throw (Exception) 00056 { 00057 vector<string> v(2); 00058 if (extended_) 00059 { 00060 string::size_type index = s.find(namesSplit_); 00061 if(index == string::npos) throw Exception("No sequence name found."); 00062 v[0] = TextTools::removeSurroundingWhiteSpaces(s.substr(0, index)); 00063 v[1] = TextTools::removeFirstWhiteSpaces (s.substr(index + namesSplit_.size())); //There may be more than 2 white spaces. 00064 } 00065 else 00066 { 00067 v[0] = TextTools::removeSurroundingWhiteSpaces(s.substr(0, 10)); 00068 v[1] = s.substr(10); 00069 } 00070 return v; 00071 } 00072 00073 /******************************************************************************/ 00074 00075 void Phylip::readSequential(std::istream& in, SiteContainer& asc) const throw (Exception) 00076 { 00077 string temp; 00078 00079 //Ignore first line: 00080 getline(in, temp, '\n'); // Copy current line in temporary string 00081 temp = TextTools::removeSurroundingWhiteSpaces(FileTools::getNextLine(in)); 00082 string name = ""; 00083 string seq = ""; 00084 00085 while (!in.eof()) 00086 { 00087 // Read each sequence: 00088 vector<string> v; 00089 bool hasName = true; 00090 try 00091 { 00092 v = splitNameAndSequence(temp); 00093 } 00094 catch (Exception & e) 00095 { 00096 hasName = false; 00097 } 00098 if (hasName) 00099 { 00100 // a new sequence is found: 00101 if (!TextTools::isEmpty(name)) //If this is not the first sequence! 00102 { 00103 // Add the previous sequence to the container: 00104 asc.addSequence(BasicSequence(name, seq, asc.getAlphabet()), checkNames_); 00105 } 00106 name = v[0]; 00107 seq = v[1]; 00108 } 00109 else 00110 { 00111 //No sequence name found. 00112 if (TextTools::isEmpty(name)) 00113 throw Exception("First sequence in file has no name!"); 00114 seq += TextTools::removeWhiteSpaces(temp); 00115 } 00116 //while(!TextTools::isEmpty(temp)) 00117 //{ 00118 // //Sequences are separated by at least one blank line: 00119 // getline(in, temp, '\n'); // read next line in file. 00120 // seq += TextTools::removeWhiteSpaces(temp); 00121 //} 00122 //end of this sequence: 00123 temp = TextTools::removeSurroundingWhiteSpaces(FileTools::getNextLine(in)); 00124 00125 } 00126 // Add last sequence: 00127 asc.addSequence(BasicSequence(name, seq, asc.getAlphabet()), checkNames_); 00128 } 00129 00130 /******************************************************************************/ 00131 00132 void Phylip::readInterleaved(std::istream& in, SiteContainer& asc) const throw (Exception) 00133 { 00134 string temp; 00135 00136 //Read first line: 00137 getline(in, temp, '\n'); // Copy current line in temporary string 00138 StringTokenizer st(temp); 00139 unsigned int nbSequences = TextTools::to<unsigned int>(st.nextToken()); 00140 //int nbSites = TextTools::toInt(st.nextToken()); 00141 temp = FileTools::getNextLine(in); 00142 00143 vector<string> names, seqs; 00144 // Read first block: 00145 for (unsigned int i = 0; i < nbSequences && !in.eof() && !TextTools::isEmpty(temp); i++) 00146 { 00147 vector<string> v = splitNameAndSequence(temp); 00148 names.push_back(v[0]); 00149 seqs.push_back(v[1]); 00150 getline(in, temp, '\n'); // read next line in file. 00151 } 00152 00153 //Then read all other blocks: 00154 temp = FileTools::getNextLine(in); 00155 while (!in.eof()) 00156 { 00157 for (unsigned int i = 0; i < names.size(); i++) 00158 { 00159 if (TextTools::isEmpty(temp)) 00160 throw IOException("Phylip::readInterleaved. Bad file,there are not the same number of sequence in each block."); 00161 seqs[i] += TextTools::removeWhiteSpaces(temp); 00162 getline(in, temp, '\n'); // read next line in file. 00163 } 00164 temp = FileTools::getNextLine(in); 00165 } 00166 for (unsigned int i = 0; i < names.size(); i++) 00167 { 00168 asc.addSequence(BasicSequence(names[i], seqs[i], asc.getAlphabet()), checkNames_); 00169 } 00170 } 00171 00172 /******************************************************************************/ 00173 00174 void Phylip::appendAlignmentFromStream(std::istream& input, SiteContainer& vsc) const throw (Exception) 00175 { 00176 // Checking the existence of specified file 00177 if (!input) { throw IOException ("Phylip::read: fail to open file"); } 00178 00179 if(sequential_) readSequential (input, vsc); 00180 else readInterleaved(input, vsc); 00181 } 00182 00183 /******************************************************************************/ 00184 00185 unsigned int Phylip::getNumberOfSequences(const std::string& path) const throw (IOException) 00186 { 00187 // Checking the existence of specified file 00188 ifstream file (path.c_str(), ios::in); 00189 if (! file) { throw IOException ("Phylip::getNumberOfSequences: failed to open file"); } 00190 string firstLine = FileTools::getNextLine(file); 00191 StringTokenizer st(firstLine, " \t"); 00192 istringstream iss(st.nextToken()); 00193 int nb; 00194 iss >> nb; 00195 file.close(); 00196 return nb; 00197 } 00198 00199 /******************************************************************************/ 00200 00201 std::vector<std::string> Phylip::getSizedNames(const std::vector<std::string>& names) const 00202 { 00203 vector<string> sizedNames(names.size()); 00204 if (extended_) 00205 { 00206 //Add 6 white spaces to the larger name and align other names. 00207 //First, determine the size of the wider name: 00208 size_t sizeMax = 0; 00209 for (size_t i = 0; i < names.size(); i++) 00210 if (names[i].size() > sizeMax) sizeMax = names[i].size(); 00211 //Quite easy ;-) Now update all lengths: 00212 for (size_t i = 0; i < names.size(); i++) 00213 sizedNames[i] = TextTools::resizeRight(names[i], sizeMax) + namesSplit_; 00214 } 00215 else 00216 { 00217 //We trunc all names to ten characters: 00218 for(unsigned int i = 0; i < names.size(); i++) sizedNames[i] = TextTools::resizeRight(names[i], 10); 00219 cout << "Warning: names have been truncated to 10 characters. They may be ambiguous sequence names then." << endl; 00220 } 00221 return sizedNames; 00222 } 00223 00224 /******************************************************************************/ 00225 00226 void Phylip::writeSequential(std::ostream& out, const SequenceContainer& sc, int charsByLine) const 00227 { 00228 //cout << "Write sequential" << endl; 00229 size_t numberOfSites = sc.getSequence(sc.getSequencesNames()[0]).size() * sc.getAlphabet()->getStateCodingSize(); 00230 out << sc.getNumberOfSequences() << " " << numberOfSites << endl; 00231 00232 vector<string> seqNames = sc.getSequencesNames(); 00233 vector<string> names = getSizedNames(seqNames); 00234 for (size_t i = 0; i < seqNames.size(); i++) 00235 { 00236 vector<string> seq = TextTools::split(sc.toString(seqNames[i]), charsByLine); 00237 out << names[i] << seq[0] << endl; 00238 for (unsigned int j = 1; j < seq.size(); j++) 00239 { 00240 out << string(names[i].size(), ' ') << seq[j] << endl; 00241 } 00242 out << endl; 00243 } 00244 } 00245 00246 void Phylip::writeInterleaved(std::ostream& out, const SequenceContainer& sc, int charsByLine) const 00247 { 00248 //cout << "Write interleaved;" << endl; 00249 size_t numberOfSites = sc.getSequence(sc.getSequencesNames()[0]).size() * sc.getAlphabet()->getStateCodingSize(); 00250 out << sc.getNumberOfSequences() << " " << numberOfSites << endl; 00251 00252 vector<string> seqNames = sc.getSequencesNames(); 00253 vector<string> names = getSizedNames(seqNames); 00254 //Split sequences: 00255 vector< vector<string> > seqs(sc.getNumberOfSequences()); 00256 for (size_t i = 0; i < seqNames.size(); i++) 00257 { 00258 seqs[i] = TextTools::split(sc.toString(seqNames[i]), charsByLine); 00259 } 00260 //Write first block: 00261 for (size_t i = 0; i < names.size(); i++) 00262 { 00263 out << names[i] << seqs[i][0] << endl; 00264 } 00265 out << endl; 00266 //Write other blocks: 00267 for (size_t j = 1; j < seqs[0].size(); j++) 00268 { 00269 for (unsigned int i = 0; i < sc.getNumberOfSequences(); i++) 00270 { 00271 out << seqs[i][j] << endl; 00272 } 00273 out << endl; 00274 } 00275 } 00276 00277 /******************************************************************************/ 00278 00279 void Phylip::writeAlignment(std::ostream& output, const SiteContainer& sc) const throw (Exception) 00280 { 00281 //First must check if all sequences are aligned: 00282 if (sc.getNumberOfSequences() == 0) 00283 throw Exception("Phylip::write. SequenceContainer appear to contain no sequence."); 00284 00285 // Checking the existence of specified file, and possibility to open it in write mode 00286 if (!output) { throw IOException ("Phylip::write : failed to open file"); } 00287 00288 if (sequential_) writeSequential (output, sc, charsByLine_); 00289 else writeInterleaved(output, sc, charsByLine_); 00290 } 00291 00292 /******************************************************************************/ 00293 00294 const std::string Phylip::getFormatName() const { return "Phylip file, " + string(extended_ ? "extended," : "") + string(sequential_ ? "sequential" : "interleaved"); } 00295 00296 /******************************************************************************/ 00297 00298 const std::string Phylip::getFormatDescription() const 00299 { 00300 return "Phylip file format, sequential and interleaved. PAML extension also supported."; 00301 } 00302 00303 /******************************************************************************/ 00304