210 likes | 331 Views
Running an R application - GeoSSE (Geographic State Speciation and Extinction) analyses run using R and parallel functions. Download this file http://goo.gl/rQL8Hd. Victor Soria-Carrasco v.soria-carrasco@shef.ac.uk. GeoSSE.
E N D
Running an R application - GeoSSE (Geographic State Speciation and Extinction) analyses run using R and parallel functions Download this file http://goo.gl/rQL8Hd Victor Soria-Carrasco v.soria-carrasco@shef.ac.uk
GeoSSE • Belongs to a family of likelihood-based methods for studying diversification and character evolution using phylogenetic trees (BiSSE - Binary State Speciation and Extinction) • GeoSSE models the reciprocal effects between geographic range evolution and diversification (speciation and extinction) • It estimates region-dependent rates of speciation, extinction and dispersal. • R package diversitree
A shell script is used to call the R script R_job.sh myscript.R #!/bin/bash #$ -l h_rt=08:00:00 #$ -l mem=2G #$ -m bea #$ -M user@sheffield.ac.uk R CMD BATCH --vanilla myscript.R # Load diversitree package library (diversitree) # Load phylogenetic trees trees<-read.tree(“mytrees.new”) # Load states states<-read.table(“mytable.txt”, header=T) ... $ qsub R_job.sh -j y -o rjob.log
Example 1 Running R using multiple cores of 1 node (shared memory)
Example 1. Run GeoSSE using multiple cores of 1 node (shared memory) # Launch an interactive session $ qrsh # Copy the input files (trees and table with geographic states) to your home folder $cp /usr/local/extras/Genomics/HPC_course/R_GeoSSE/example_trees_10.new $HOME/ $cp /usr/local/extras/Genomics/HPC_course/R_GeoSSE/example_states.txt $HOME/ # Copy the R script to your home folder $cp /usr/local/extras/Genomics/submit_scripts/geosse_openmp.R $HOME/ # Copy the shell script to call the R script to your home folder $ cp /usr/local/extras/Genomics/submit_scripts/R_job_openmp.sh $HOME/
Example 1. Run GeoSSE using multiple cores of 1 node (shared memory) # If you experience problems, you can download all the files from the web: $ wget http://goo.gl/9GVykX $ tar -xvf R_GeoSSE.tar.gz $ rm R_GeoSSE.tar.gz
Example 1. Run GeoSSE using multiple cores of 1 node (shared memory) # Check you have the following 4 files in your home directory: example_states.txt example_trees_10.new geosse_openmp.R R_job_openmp.sh # Edit the shell script to change number of processors, e-mail address, and path to R script $ nano $HOME/R_job_openmp.sh
R_job_openmp.sh #!/bin/bash #$ -l h_rt=08:00:00 #$ -l mem=2G #$ -pe openmp 2 #$ -m bea #$ -M user@sheffield.ac.uk # Number of cores to be used export OMP_NUM_THREADS=2 # Add path to software repository libraries export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/extras/Genomics/lib/lib R='/usr/local/extras/Genomics/apps/R/current/bin/R' # Change R script path RSCRIPT="$HOME/geosse_openmp.R" # Some options # # --slave # Don't print input commands # # --vanilla (highly recommended) # Don't load anything from previous sessions, # and don't save data at the end of the current session $R CMD BATCH --vanilla $RSCRIPT
Example 1. Run GeoSSE using multiple cores of 1 node (shared memory) # Edit the R script to change input and output files $ nano $HOME/geosse_openmp.R
geosse_openmp.R (1) # Load parallel package library(parallel) # Number of cores ncores<-2 # Load diversitree package library(diversitree) # Load phylogenetic trees trees<-read.tree(“/home/user/example_trees_10.new”) # Load states states<-read.table(”/home/user/example_states.txt”, header=T) tip.states<-states$state names(tip.states)<-states$taxon # Fit birth-death model to get an extinction/speciation ratio (eps) # and use it as starting value for GeoSSE bd.rates<-mclapply(trees, birthdeath)
geosse_openmp.R (2) # Fit GeoSSE model by maximum likelihood to the trees ML.est<-mclapply(1:length(trees), function(x){ model<-make.geosse( tree=trees[[x]], states=tip.states) start.values<-starting.point.geosse(trees[[x]], eps=bd.rates[[x]]$par[1]) res<-find.mle(model, x.init=start.values, method="subplex") cat("Job # ", x, " finished\n", sep="") return(res) }, mc.cores=ncores) # Format results into a table ML.est.pars.table<-as.data.frame(do.call("rbind", lapply(ML.est, function(x) c(x$par, npar=length(x$par), lnl=x$lnLik)))) # Write the table to a file output<-paste(Sys.getenv("HOME"), "/example_openmp_output.txt", sep='') write.table(ML.est.pars.table, file=output, row.names=F, quote=F, sep="\t") # Save image of this session image<-paste(Sys.getenv("HOME"), "/example_openmp_image.RData", sep='') save.image(file=image, compress="bzip2")
Example 1. Run GeoSSE using multiple cores of 1 node (shared memory) # Submit the job to the cluster $ qsub -j y -o geosse_openmp.log R_job_openmp.sh # Check if the job is running $ Qstat # When the job finishes, look at the logs and the output $ less -S geosse_openmp.log $ less -S geosse_openmp.Rout $ less -S example_openmp_output.txt
Example 2 Running R using multiple nodes (distributed memory)
Example 2. Run GeoSSE using multiple nodes (distributed memory) # Copy the R script to your home folder $cp /usr/local/extras/Genomics/submit_scripts/geosse_mpi.R $HOME/ # Copy the shell script to call the R script to your home folder $ cp /usr/local/extras/Genomics/submit_scripts/R_job_mpi.sh $HOME/ # Check you have the following 4 files in your home directory: example_states.txt example_trees_10.new geosse_mpi.R R_job_mpi.sh # Edit the shell script to change number of nodes, e-mail address, and path to R script $ nano $HOME/R_job_mpi.sh
R_job_mpi.sh #!/bin/bash #$ -l h_rt=08:00:00 #$ -l mem=2G #$ -pe ompigige 4 #$ -m bea #$ -M user@sheffield.ac.uk # Required for running Rmpi module add mpi/gcc/openmpi/1.4.4 export MPI_ROOT='/usr/local/mpi/gcc/openmpi/1.4.4/' export OMPI_MCA_mtl=^psm # Add path to software repository libraries export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/extras/Genomics/lib/lib R='/usr/local/extras/Genomics/apps/R/current/bin/R' # Change R script path RSCRIPT=”$HOME/geosse_mpi.R" # Some options # # --slave # Don't print input commands # # --vanilla (highly recommended) # Don't load anything from previous sessions, # and don't save data at the end of the current session $R CMD BATCH --vanilla $RSCRIPT
Example 2. Run GeoSSE using multiple nodes (distributed memory) # Edit the R script to change input and output files $ nano $HOME/geosse_mpi.R
geosse_mpi.R (1) # This R script uses multiple nodes to fit a GeoSSE model # to multiple phylogenetic trees # Ignore Infiniband interface Sys.setenv(OMPI_MCA_mtl='^psm') # Echo commands options(echo=T) # Load Rmpi package library (Rmpi) # Load diversitree package library (diversitree) # Load phylogenetic trees trees<-read.tree(“/home/user/example_trees_10.new”) # Load states states<-read.table(”/home/user/example_states.txt”, header=T) tip.states<-states$state names(tip.states)<-states$taxon
geosse_mpi.R (2) # Number of nodes nnodes<-4 # spawn mpi slaves mpi.spawn.Rslaves(nslaves=nnodes, needlog=T) # load diversitree on slaves mpi.bcast.cmd(library(diversitree)) # Broadcast trees to slaves mpi.bcast.Robj2slave(trees) # Broadcast tip.states to slaves mpi.bcast.Robj2slave(tip.states) # Fit birth-death model to get an extinction/speciation ratio (eps) # and use it as starting value for GeoSSE bd.rates<-mpi.parLapply(trees, birthdeath) # Broadcast bd.rates to slaves mpi.bcast.Robj2slave(bd.rates)
geosse_mpi.R (3) # Fit GeoSSE model by maximum likelihood ML.est<-mpi.parLapply(1:length(trees), function(x){ model<-make.geosse( tree=trees[[x]], states=tip.states) start.values<-starting.point.geosse(trees[[x]], eps=bd.rates[[x]]$par[1]) res<-find.mle(model, x.init=start.values, method="subplex") cat("Job # ", x, " finished\n", sep="") return(res) }, job.num=nnodes) # Format results into a table ML.est.pars.table<-as.data.frame(do.call("rbind", lapply(ML.est, function(x) c(x$par, npar=length(x$par), lnl=x$lnLik)))) # Write the table to a file output<-paste(Sys.getenv("HOME"), "/example_mpi_output.txt", sep='') write.table(ML.est.pars.table, file=output, row.names=F, quote=F, sep="\t") # Save an image of this session image<-paste(Sys.getenv("HOME"), "/example_mpi_image.RData", sep='') save.image(file=image, compress="bzip2")
Example 2. Run GeoSSE using multiple nodes (distributed memory) # Submit the job to the cluster $ qsub -j y -o geosse_mpi.log R_job_mpi.sh # Check if the job is running $ Qstat # When the job finishes, look at the logs $ less -S geosse_mpi.log $ less -S geosse_mpi.Rout # look at the log for each node $ find . -name "*node*.log" -exec sh -c "echo {}; cat {} | sed 's/^/\t/g'" \; # Output (results) $ less -S example_mpi_output.txt
https://myapps.shef.ac.uk bo4cm14 / gen0mics