diff --git a/doc/GATK_Hello_World.pdf b/doc/GATK_Hello_World.pdf new file mode 100755 index 000000000..2747cc374 Binary files /dev/null and b/doc/GATK_Hello_World.pdf differ diff --git a/doc/GATK_Hello_World.tex b/doc/GATK_Hello_World.tex new file mode 100755 index 000000000..4041c250b --- /dev/null +++ b/doc/GATK_Hello_World.tex @@ -0,0 +1,93 @@ +\documentclass[11pt,fullpage]{article} +\usepackage[urlcolor=blue,colorlinks=true]{hyperref} + +\oddsidemargin 0.0in +\textwidth 6.5in + +\begin{document} + +\title{Getting Started with the Genome Analysis Toolkit (GATK)} +\author{Matt Hanna} +\date{16 Mar 2009} +\maketitle + +\section{Build Prerequisites} +GATK requires JDK 1.6 and Ant 1.7.1 to compile. + +\section{Getting and Building the Source} +GATK is located in the Sting svn repository, and +compiles using a build.xml in the root directory. + +Download and build the source as follows: +\begin{verbatim} + svn co https://svnrepos/Sting/trunk Sting + cd Sting + ant +\end{verbatim} + +\section{Getting Started} +The core concept behind GATK is the walker, a class that implements the +three core operations: filtering, mapping, and reducing. + +\begin{description} + \item [filter] Reduces the size of the dataset by applying a predicate. + \item [map] Applies a function to each individual element in a dataset, + effectively `mapping' it to a new element. + \item [reduce] Inductively combines the elements of a list. The base + case is supplied by the reduceInit() function, and the inductive step + is performed by the reduce() function. +\end{description} +Users of the GATK will provide a walker to run their analyses. The engine +will produce a result by first filtering the dataset, running a map operation, +and finally reducing the output of the map operation to a single result. + +\section{Example} +This walker will print output for each read it sees, and will compute the +total number of reads by mapping every read to 1 and then summing all the 1s +during the reduce step. 
+ +\begin{verbatim} +import net.sf.samtools.SAMRecord; + +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.walkers.BasicReadWalker; + +/** + * Define a class extending from BasicReadWalker with types + * <Integer, Long> (the map result type and the reduce result type). + */ +public class HelloWalker extends BasicReadWalker<Integer, Long> { + private Long currentRead = 0L; + + // Maps each read to the value 1. + public Integer map(LocusContext context, SAMRecord read) { + System.out.printf("Hello read %d%n", ++currentRead ); + return 1; + } + + // Provides an initial value for the reduce function. + public Long reduceInit() { return 0L; } + + // Defines how to compute the reduction given a value in the list. + public Long reduce(Integer value, Long sum) { + return sum + value; + } +} +\end{verbatim} +To compile the walker: +\begin{verbatim} +setenv CLASSPATH $STING_HOME/dist/GenomeAnalysisTK.jar:$STING_HOME/dist/sam-1.0.jar +javac HelloWalker.java +\end{verbatim} +To run the walker: +\begin{verbatim} +mkdir $STING_HOME/dist/walkers +cp HelloWalker.class $STING_HOME/dist/walkers +java -Xmx4096m -jar $STING_HOME/dist/GenomeAnalysisTK.jar \ + INPUT_FILE=/broad/1KG/legacy_data/trio/na12878.bam \ + ANALYSIS_NAME=Hello L=chr1:10000000-10000100 +\end{verbatim} +This command will run the walker across a subsection of chromosome 1, operating on +reads that align to that subsection. + +\end{document}