# ================================================================================================== # Evolutionary Genetics / UniBas HS 2011: LV 25600-01 # Bioinformatics - Help and Exercises # Jean-Claude Walser (jean-claude.walser@unibas.ch) & Anne Roulin (anne.roulin@unibas.ch) # ================================================================================================== # ------------------------------------------------------------------------------------------------- # The R Project for Statistical Computing (http://www.r-project.org/) # ------------------------------------------------------------------------------------------------- ## Download (R Sources or R Binaries) http://cran.r-project.org/ # What version am I using version ## FAQ http://cran.r-project.org/faqs.html ## HELP Install http://cran.r-project.org/doc/manuals/R-admin.html ## Manuals & Books # An Introduction to R (http://cran.r-project.org/doc/manuals/R-intro.pdf) # Using R for Data Analysis and Graphics (http://cran.r-project.org/doc/contrib/usingR.pdf) # The R Guide (http://cran.r-project.org/doc/contrib/Owen-TheRGuide.pdf) # The R Book (ISBN-10: 0470510242) # Data Analysis and Graphics Using R: An Example-based Approach (ISBN-10: 0521861160) # R in a Nutshell: A Desktop Quick Reference (ISBN-10: 059680170X) ## Possible Error Message: # In case you get the following error message after starting the R terminal # # “You're using a non-UTF8 locale, therefore only ASCII characters will work. # Please read R for Mac OS X FAQ (see Help) section 9 and adjust your system # preferences accordingly.“ type the following: system("defaults write org.R-project.R force.LANG en_US.UTF-8") # ------------------------------------------------------------------------------------------------- # Packages needed for the following exercises # ------------------------------------------------------------------------------------------------- # Package: seqinr # Title: Biological Sequences Retrieval and Analysis # Version: 3.0-1 # Date: 2010-10-18 # Depends: R (>= 2.6.0) # Suggests: ade4, segmented library(seqinr) # Package: ape # Title: Analyses of Phylogenetics and Evolution # Version: 2.6-1 # Date: 2010-11-01 # Depends: R (>= 2.6.0) # Suggests: gee # Imports: gee, nlme, lattice library(ape) # ------------------------------------------------------------------------------------------------- # General Information # ------------------------------------------------------------------------------------------------- # Everything that starts with “#“ is comment and not part R code # For help type: help() e.g. help(plot) # Redundancies - don't get confused: e.g. a<-5 or a=5 # ------------------------------------------------------------------------------------------------- # Bioconductor Packages (not needed but "interesting" to have in the future) # ------------------------------------------------------------------------------------------------- # Use the biocLite.R script to install Bioconductor packages. source("http://bioconductor.org/biocLite.R") # To install a particular package, e.g., limma, type the following in an R command window: biocLite("limma") # Packages and their dependencies installed by this usage are: # affy, affydata, affyPLM, affyQCReport, annaffy, annotate, Biobase, biomaRt, Biostrings, # DynDoc, gcrma, genefilter, geneplotter, GenomicRanges, hgu95av2.db, limma, marray, multtest, # vsn, and xtable. # ------------------------------------------------------------------------------------------------- # Start Exercises # ------------------------------------------------------------------------------------------------- # --------------------------------------------------------------------- # R Basics # --------------------------------------------------------------------- # To find out which additional packages are available on your system, type library() # You can “load” the installed package by typing library(seqinr) # You can then find out the functions the package provides by typing library(help = seqinr) help(package = seqinr) # Get Help help(read.table) # Import data data=read.table("{File}",header={T/F},sep={,}) # or data=read.csv("{File}",header={T/F},sep={,}) # Quick look at the data daten names(data) str(data) summary(data) ## First steps: # Dataset: women {datasets} # This data set gives the average heights and weights for American women aged 30–39. women str(women) women$height women$weight # Plot data plot(women) # A more detailed plot plot(women, xlab = "Height (in)", ylab = "Weight (lb)", main = "women data: Women aged 30-39") # Transfer data from the US metric system into the iso system: # 1in = 2.54cm # 1lb = 453.6g women.height.iso=(women$height)*2.54 women.weight.iso=(women$weight)*0.4536 plot(women.height.iso,women.weight.iso, xlab = "Height (cm)", ylab = "Weight (kg)", main = "women data: Women aged 30-39") # Dataset: Titanic # This data set provides information on the fate of passengers on the fatal maiden voyage # of the ocean liner ‘Titanic’, summarized according to economic status (class), # sex, age and survival. # Data structure str(Titanic) # Plot the data mosaicplot(Titanic, main = "Survival on the Titanic") # Explore data - Is there an higher survival rates in children? apply(Titanic, c(3, 4), sum) # Explore data - Is there an higher survival rates in females? apply(Titanic, c(2, 4), sum) # Dataset: eurodist eurodist # The data give the road distances (in km) between 21 cities in Europe. # The data are taken from a table in The Cambridge Encyclopaedia. loc <- cmdscale(eurodist) rx <- range(x <- loc[,1]) ry <- range(y <- -loc[,2]) plot(x, y, type="n", asp=1, xlab="", ylab="") abline(h = pretty(rx, 10), v = pretty(ry, 10), col = "light gray") text(x, y, labels(eurodist), cex=0.8) # --------------------------------------------------------------------- # seqinr - Working with DNA Sequences in R # --------------------------------------------------------------------- library(seqinr) # Create and save 4 nucleotide sequences in fasta format: cat(">SEQ1", "TACGATGGATCAGTCTGGGTGGGACGTGCTCCATTTATACCCTGCGCAGGCTGGACCGAG", file = "/SEQ1.fasta", sep = "\n") cat(">SEQ2", "TACGATGGATCATTCTGGGTGGCCCGTGCTGCATATATACCCTGCGCAGGCTGGACCGAG",file = "/SEQ2.fasta", sep = "\n") cat(">SEQ3", "TACGATGGATCATTCTCCCTGGCCCGTGCTGCATATCCCTGCGCAGGCTGGACCGAG",file = "/SEQ3.fasta", sep = "\n") cat(">SEQ4", "TACGATGGATCATTCTGGGTGGCCCGTGCTGC-------CCCTGCGCAGGCTGGACCGAG",file = "/SEQ4.afasta", sep = "\n") # Load the files: dna1 = read.fasta("/SEQ1.fasta", seqtype = "DNA") # load sequence 1 dna1 # display sequence 1 dna2 = read.fasta("/SEQ2.fasta", seqtype = "DNA") # load sequence 2 dna2 # display sequence 2 dna3 = read.fasta("/SEQ3.fasta", seqtype = "DNA") # load sequence 3 dna3 # display sequence 3 dna4 = read.fasta("/SEQ4.afasta", seqtype = "DNA") # load sequence 4 dna4 # display sequence 4 # Calculate sequence Length getLength(dna1) getLength(dna2) getLength(dna3) getLength(dna4) # Calculate %-conservation: sum((dna1$SEQ1)==(dna1$SEQ1))/(getLength(dna1))*100 sum((dna1$SEQ1)==(dna2$SEQ2))/((getLength(dna1)+getLength(dna2))/2)*100 sum((dna1$SEQ1)==(dna3$SEQ3))/((getLength(dna1)+getLength(dna3))/2)*100 # !!! sum((dna1$SEQ1)==(dna4$SEQ4))/((getLength(dna1)+getLength(dna4))/2)*100 # ! In order to compare two sequences both need to be aligned ! # Simple plot for sequence length: plot(c(getLength(dna1),getLength(dna2),getLength(dna3),getLength(dna4)),type = "h", col = "red", lwd=10,main="Sequence Length") # Calculates some codon usage uco(dna1$SEQ1) uco(dna2$SEQ2) uco(dna3$SEQ3) uco(dna4$SEQ4) par(mfrow=c(2,2)) plot(uco(dna1$SEQ1)) plot(uco(dna2$SEQ2)) plot(uco(dna3$SEQ3)) plot(uco(dna4$SEQ4)) par(mfrow=c(1,1)) # ? Try to adjust font size of the x axis # Statistical dinculeotide over- and under-representation rho(dna1$SEQ1, wordsize = 2, alphabet = s2c("acgt")) # More plots plot(rho(dna1$SEQ1, wordsize = 2, alphabet = s2c("acgt"))) plot(sort((rho(dna1$SEQ1, wordsize = 5, alphabet = s2c("acgt"))))) # Dot Plot dotPlot(dna1$SEQ1,dna1$SEQ1, wsize = 5, wstep = 3, nmatch = 5) dotPlot(dna1$SEQ1,dna4$SEQ4, wsize = 5, wstep = 3, nmatch = 5) # Split a sequence into sub-sequences of 3 splitseq(dna1$SEQ1, frame = 0, word = 3) #Returns all synonymous codons for each codon given syncodons("ggg") syncodons((splitseq(dna1$SEQ1, frame = 0, word = 3)), numcode = 1) # Translate DNA into Protein translate(dna1$SEQ1) # Protein Statistics AAstat((translate(dna1$SEQ1)), plot = TRUE) # AA in three-letter code aaa(translate(dna1$SEQ1)) # Fractional G+C content of nucleic acid sequences GC(dna1$SEQ1) # ? Create an spreadsheet data file - save it as txt or csv and import it # Find out more about the function "read.alignment" and try to import fasta and mase files (download from the website) ### Simple utility function # Conversion of a string into a vector of chars s2c("Watson & Crick") # Numerical encoding of a DNA sequence # This is one way (n > s) nseq <- sample(x = 0:3, size = 100, replace = TRUE) n2s(nseq) # and this the other (s > n) s2n(dna1$SEQ1, levels = s2c("acgt"), base4 = TRUE, forceToLower = TRUE) # is it really the same? n2s(s2n(dna1$SEQ1, levels = s2c("acgt"), base4 = TRUE, forceToLower = TRUE))==dna1$SEQ1 # Reverse alignment - from protein sequence alignment to nucleic sequence alignment reverse.align # --------------------------------------------------- # (A1) ape - Read DNA Sequences from GenBank via Internet # --------------------------------------------------- # Get help: help(ape) Dro_adh <- read.GenBank(ref) # Test installation: data(bird.orders) plot(bird.orders) # read.GenBank(access.nb, seq.names = access.nb, # species.names = TRUE, as.character = FALSE) ## This won't work if your computer is not connected ## to the Internet!!! ## ## Get the 4 sequences of Adh from different Drosophila species as used in O'Grady & Kidwell (2002) ref <- c("AF502402S1", "AF502404S1", "AF502406S1","AF502408S2") Dro_adh <- read.GenBank(ref) ### get the species names: attr(Dro_adh, "species") ### get a summary of the sequences Dro_adh ### print the # sequence Dro_adh[1] Dro_adh[2] Dro_adh[3] Dro_adh[4] ### ways to manipulate the data as.alignment(Dro_adh) as.character(Dro_adh) # ------------------------------------------- # (A2) Read DNA Sequences from a text file # ------------------------------------------- dna <- read.dna("path/text.fasta", format = "fasta") # ------------------------------------------- # (A3) Create DNA Sequences # ------------------------------------------- # Create, safe, and load sequences in fasta format cat(">SEQ_A", "AAAAAAAAAAAAAAAAAAAA", ">SEQ_B", "AAAAAAAAAAAAAAATTTTT", ">SEQ_C", "AAAAAAAAAATTTTTTTTTT", ">SEQ_D", "AAAAAAAAAAAAAAAAAAAG", ">SEQ_E", "AAAAAAAAAAAAAAAAAAAC", file = "/SEQ_A-E.fasta", sep = "\n") dna5 <- read.dna("/SEQ_A-E.fasta", format = "fasta") # ------------------------------------------- # (A4) Sequences manipulation # ------------------------------------------- # Base frequencies from DNA Sequences base.freq(Dro_adh[[1]], freq = FALSE) # Content in GC from DNA Sequences GC.content(Dro_adh[[1]]) # Simple compsrison of two sequences identical(dna, dna2) # ------------------------------------------- # (B1) Pairwise Distances from Genetic Data # ------------------------------------------- dist.gene(dna5, method = "pairwise", variance = FALSE) # -------------------------------------------- # (B1) Pairwise Distances from DNA Sequences # -------------------------------------------- # dist.dna(x, model = "K80", variance = FALSE, # gamma = FALSE, pairwise.deletion = FALSE, # base.freq = NULL, as.matrix = FALSE) # molecular evolutionary models available: # raw # JC69 # K80 # F81 # K81 # F84 # BH87 # T92 # TN93 # GG95 # logdet # paralin dist.dna(dna5, model = "raw") dist.dna(dna5, model = "JC69") # ------------------------------------------- # (C1) Phylogenetic Trees # ------------------------------------------- # Paradis, E. (2008) Definition of Formats for Coding Phylogenetic Trees in R. # Read Tree File in Parenthetic Format (Newick or New Hampshire format) ape_tr <- read.tree(text = "(( Human, Chimp), Gorilla);") plot(ape_tr) # Add some label to the tree nodelabels("7.3 Ma", 4, frame = "r", bg = "yellow", adj = 0) nodelabels("5.4 Ma", 5, frame = "c", bg = "tomato", font = 3) # Some info about the tree ape_tr str(ape_tr) names(ape_tr) # Root or re-roots a phylogenetic tree # The dataset data(bird.orders) bird.orders$tip.label # plot(root(bird.orders, 1)) plot(root(bird.orders, 7)) plot(root(bird.orders, 1:5)) # create a random tree: tre <- rtree(25) # visualize labels of internal nodes: plot.phylo(tre, use.edge.length=FALSE) nodelabels() # rotate clades around node 30: tre.new1 <- rotate(tre, 30) # compare the results: par(mfrow=c(1,2)) # devide graphical device plot(tre) # plot old tre nodelabels() plot(tre.new1) # plot new tree #1 nodelabels() par(mfrow=c(1,1)) # set graphical device back # rotate clades containing nodes 12 and 20: tre.new2 <- rotate(tre, c(12, 21)) # or you migth just specify tiplabel names: tre.new3 <- rotate(tre, c("t3", "t14")) # Plot and compare them all together par(mfrow=c(2,2)) # devide graphical device plot(tre) # plot old tre plot(tre.new1) # plot new tree #1 plot(tre.new2) # plot new tree #2 plot(tre.new3) # plot new tree #3 par(mfrow=c(1,1)) # set graphical device back