Computing GC third codon position

From Bio++ Wiki
Revision as of 19:22, 9 February 2013 by Benoit.nabholz (talk | contribs)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to: navigation, search

Problem

You want to compute the GC content at the third codon position of several protein-coding sequences already aligned. You also want to evaluate the GC-content variability of each sequence.

Solution

This is a simple problem that will help you to explore the VectorSiteContainer class of SeqLib. You can assess the GC-content variability using a bootstrap analysis. You have to bootstrap the whole matrix at once and store the GC content of each sequence in a matrix. Finally, you can print a summary of your analysis, for example, the mean and standard deviation of the bootstrapped GC content distribution.

This code works with bio++ version 2.1.0.

<source lang="cpp">

  1. include <Bpp/Seq/Alphabet.all>
  2. include <Bpp/Seq/Container.all>
  3. include <Bpp/Seq/Sequence.h>
  4. include <Bpp/Seq/GeneticCode.all>
  5. include <Bpp/Seq/SequenceTools.h>
  6. include <Bpp/Seq/CodonSiteTools.h>
  7. include <Bpp/Seq/Container/VectorSequenceContainer.h>
  8. include <Bpp/Seq/Io/Fasta.h>
  9. include <Bpp/Seq/Io/Phylip.h>
  1. include <Bpp/Utils.all>
  2. include <Bpp/Numeric/VectorTools.h>
  1. include <unistd.h>
  2. include <cstdlib>
  3. include <cmath>
  4. include <iostream>

using namespace std; using namespace bpp;

int main (int argc, char* argv[]){

string nameSeq = "Your_alignment.fasta"; Fasta Fst;

const NucleicAlphabet *alpha = new DNA(); SequenceContainer * seqCont = Fst.readSequences(nameSeq , alpha); VectorSiteContainer * sequences = new VectorSiteContainer( *seqCont); VectorSiteContainer * vs3pos = new VectorSiteContainer(sequences->getSequencesNames(),sequences->getAlphabet());

delete seqCont;

unsigned int numberSites = sequences->getNumberOfSites();

if((numberSites % 3) != 0){

 throw Exception("Sequence size is not a multiple of three!!!");

}

/** select third codon position **/ int compt = 0; for(unsigned int i = 0; i < numberSites; i++){

 compt++;
 if(compt == 3){
   compt = 0;
   const Site site = sequences->getSite(i);
   vs3pos->addSite(site);
 }

}

/** bootstrap the sites **/ unsigned int numBoot = 1000; unsigned int numSP = sequences->getNumberOfSequences();

LinearMatrix<double> bootGC(numBoot, numSP); for(unsigned int i = 0; i < numBoot; i++){

 ApplicationTools::displayGauge(i, numBoot, '=', "Bootstrap : ");

 /** Bootstrap the whole matrix at once **/
 VectorSiteContainer* BootSites =  SiteContainerTools::bootstrapSites(* vs3pos);
 BootSites->reindexSites();

 for(unsigned int j = 0; j < numSP; j++){
   const Sequence & seq = BootSites->getSequence(j);
   bootGC(i, j) = SequenceTools::getGCContent(seq);
 }

}

for(unsigned int j = 0; j < numSP; j++){

 vector<double> GC;
 for(unsigned int i = 0; i < bootGC.getNumberOfRows(); i++){
   GC.push_back(bootGC(i, j));
 }
 const Sequence & seq = sequences->getSequence(j);
 double meanGC = VectorTools::mean<double, double>(GC);
 double sdGC   = VectorTools::sd<double, double>(GC);
 cout << seq.getName() << "  " << meanGC << "  " << sdGC << endl;

}

delete alpha; delete vs3pos;

return 0; }

</source>