Added loads of documentation.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@777 348d0f76-0448-11de-a6fe-93d51630548a
2009-05-21 19:40:47 +00:00 · 2009-05-21 19:40:47 +00:00 · 5f67914b08
parent 1a9d5cea29
commit 5f67914b08
10 changed files with 320 additions and 94 deletions
--- a/java/src/org/broadinstitute/sting/secondarybase/AddFourProbsToSAM.java
+++ b/java/src/org/broadinstitute/sting/secondarybase/AddFourProbsToSAM.java
@ -1,4 +1,4 @@
-package org.broadinstitute.sting.playground.fourbasecaller;
+package org.broadinstitute.sting.secondarybase;

 import net.sf.samtools.*;
 import org.broadinstitute.sting.utils.QualityUtils;
--- a/java/src/org/broadinstitute/sting/secondarybase/AnnotateSecondaryBase.java
+++ b/java/src/org/broadinstitute/sting/secondarybase/AnnotateSecondaryBase.java
@ -2,18 +2,18 @@ package org.broadinstitute.sting.secondarybase;

 import net.sf.samtools.*;
 import net.sf.samtools.util.CloseableIterator;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.Pair;
 import org.broadinstitute.sting.utils.QualityUtils;
 import org.broadinstitute.sting.utils.StingException;
-import org.broadinstitute.sting.utils.Pair;
 import org.broadinstitute.sting.utils.cmdLine.Argument;
 import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;

 import java.io.File;
 import java.io.IOException;
-import java.util.HashMap;
 import java.util.ArrayList;
-import java.util.regex.Pattern;
 import java.util.regex.Matcher;
+import java.util.regex.Pattern;

 /**
 * AnnotateSecondaryBase computes the second best base for every base in an Illumina lane.
@ -23,24 +23,35 @@ import java.util.regex.Matcher;
 * base observation.
 *
 * Approximately 95% of the time, this method and Illumina's basecalling package, "Bustard",
- * agree on the identity of the best base.  In these cases, we simply annotate the
- * second-best base.  In cases where this method and Bustard disagree, we annotate the
- * secondary base as this method's primary base.
+ * agree on the identity of the best base.  In these cases, we simply annotate our estimate
+ * of the second-best base.  In cases where this method and Bustard disagree, we annotate
+ * the secondary base as this method's primary base.
 *
 * @author Kiran Garimella
 */
+
+/*
+ An example invocation:
+   java -Xmx2048m -Djava.io.tmpdir=/broad/hptmp/ -jar /path/to/AnnotateSecondaryBase.jar \
+    -D /seq/solexaproc2/SL-XAX/analyzed/090217_SL-XAX_0003_FC30R47AAXX/Data/C1-152_Firecrest1.3.2_25-02-2009_prodinfo/Bustard1.3.2_25-02-2009_prodinfo/ \
+    -L 5 \
+    -B 30R47AAXX090217
+    -CR '0-75,76-151' \
+    -SO ~/test.sam \
+    -SI /seq/picard/30R47AAXX/C1-152_2009-02-17_2009-04-02/5/Solexa-10265/30R47AAXX.5.aligned.bam \
+ */
+    
 public class AnnotateSecondaryBase extends CommandLineProgram {
    public static AnnotateSecondaryBase Instance = null;

-    @Argument(fullName="dir", shortName="D", doc="Illumina Bustard directory") public File BUSTARD_DIR;
+    @Argument(fullName="bustard_dir", shortName="D", doc="Illumina Bustard directory") public File BUSTARD_DIR;
    @Argument(fullName="lane", shortName="L", doc="Illumina flowcell lane") public int LANE;
-    @Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN;
+    @Argument(fullName="run_barcode", shortName="B", doc="Run barcode (embedded as part of the read name; i.e. 30R47AAXX090217)") public String RUN_BARCODE;
+    @Argument(fullName="cycle_ranges", shortName="CR", doc="Cycle ranges for single-end or paired reads (i.e. '0-50,56-106') (0-based, inclusive)") public String CYCLE_RANGES;
    @Argument(fullName="sam_out", shortName="SO", doc="Output path for sam file") public File SAM_OUT;
-    @Argument(fullName="reference", shortName="R", doc="Reference sequence to which sam_in is aligned (in fasta format)") public File REFERENCE;
-    @Argument(fullName="cycle_ranges", shortName="CR", doc="Cycle ranges for single-end or paired reads (i.e. '0-50,56-106') (0-based inclusive)") public String CYCLE_RANGES;
-    @Argument(fullName="tlim", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 100000;
-    @Argument(fullName="clim", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE;
-    @Argument(fullName="runbarcode", shortName="B", doc="Run barcode (embedded as part of the read name)") public String RUN_BARCODE;
+    @Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN;
+    @Argument(fullName="training_limit", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 100000;
+    @Argument(fullName="calling_limit", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE;

    public static void main(String[] argv) {
        Instance = new AnnotateSecondaryBase();
@ -52,7 +63,7 @@ public class AnnotateSecondaryBase extends CommandLineProgram {

        BasecallingTrainer trainer = new BasecallingTrainer(BUSTARD_DIR, LANE, TRAINING_LIMIT);

-        // Iterate through raw Firecrest data and the first N reads up to TRAINING_LIMIT
+        // Iterate through raw Firecrest data and store the first unambiguous N reads up to TRAINING_LIMIT
        System.out.println("Loading training set from the first " + TRAINING_LIMIT + " unambiguous reads in the raw data...");
        trainer.loadFirstNUnambiguousReadsTrainingSet();

@ -65,7 +76,9 @@ public class AnnotateSecondaryBase extends CommandLineProgram {

        SAMFileHeader sfh = new SAMFileHeader();
        sfh.setSortOrder(SAMFileHeader.SortOrder.queryname);
-        SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, SAM_OUT);
+        
+        File unalignedSam = (canAnnotate(SAM_IN)) ? getTempSAMFile("unaligned") : SAM_OUT;
+        SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, unalignedSam);

        IlluminaParser iparser = new IlluminaParser(BUSTARD_DIR, LANE);

@ -97,21 +110,15 @@ public class AnnotateSecondaryBase extends CommandLineProgram {

        if (canAnnotate(SAM_IN)) {
            // If we're in annotation mode, annotate the aligned BAM file with the SQ tag
-            System.out.println("Annotating aligned BAM file...");
+            System.out.println("Annotating aligned SAM file...");
            
-            try {
-                File sortedSam = File.createTempFile("sorted", ".sam", SAM_OUT.getParentFile());
-                //System.out.println("  sorted file: " + sortedSam.getAbsolutePath());
-                
-                sortBAMByReadName(SAM_IN, sortedSam);
+            System.out.println("  sorting aligned SAM file by read name...");
+            File alignedSam = getTempSAMFile("aligned");
+            sortBAMByReadName(SAM_IN, alignedSam);

-                File mergedSam = File.createTempFile("merged", ".sam", SAM_OUT.getParentFile());
-                //System.out.println("  merged file: " + mergedSam.getAbsolutePath());
-
-                mergeUnalignedAndAlignedBams(SAM_OUT, sortedSam, mergedSam);
-            } catch (IOException e) {
-                throw new StingException("There was a problem in trying to merge the unaligned and aligned BAM files.");
-            }
+            System.out.println("  merging unaligned and aligned SAM files...");
+            File mergedSam = SAM_OUT;
+            mergeUnalignedAndAlignedBams(unalignedSam, alignedSam, mergedSam);
        }

        System.out.println("Done.");
@ -119,6 +126,23 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
        return 0;
    }

+    /**
+     * Return a tempfile.  This is a laziness method so that I don't have to litter my code with try/catch blocks for IOExceptions.
+     *
+     * @param prefix  the prefix for the temp file
+     * @return  the temp file
+     */
+    private File getTempSAMFile(String prefix) {
+        try {
+            File tempFile = File.createTempFile(prefix, ".sam", SAM_OUT.getParentFile());
+            tempFile.deleteOnExit();
+
+            return tempFile;
+        } catch (IOException e) {
+            throw new StingException("Unable to create tempfile in directory " + SAM_OUT.getParent());
+        }
+    }
+
    /**
     * Parse the cycle_ranges string that defines the cycle where a read starts and stops.
     * Comma-separated ranges are interpreted to be the first and second end of a pair.
@ -133,8 +157,8 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
        
        Pattern p = Pattern.compile("(\\d+)-(\\d+)");

-        for (int pieceIndex = 0; pieceIndex < pieces.length; pieceIndex++) {
-            Matcher m = p.matcher(pieces[pieceIndex]);
+        for (String piece : pieces) {
+            Matcher m = p.matcher(piece);

            if (m.find()) {
                Integer cycleStart = new Integer(m.group(1));
@ -169,25 +193,27 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
     * Construct a SAMRecord object with the specified information.  The secondary bases
     * will be annotated suchthat they will not conflict with the primary base.
     *
-     * @param rr          the raw Illumina read
-     * @param fpr         the four-base distributions for every base in the read
-     * @param sfh         the SAM header
-     * @param runBarcode  the run barcode of the lane (used to prefix the reads)
+     * @param rr                 the raw Illumina read
+     * @param fpr                the four-base distributions for every base in the read
+     * @param sfh                the SAM header
+     * @param runBarcode         the run barcode of the lane (used to prefix the reads)
+     * @param isPaired           is this a paired-end lane?
+     * @param isSecondEndOfPair  is this the second end of the pair?
     *
     * @return a fully-constructed SAM record
     */
-    private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode, boolean isPaired, boolean secondEndOfPair) {
+    private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode, boolean isPaired, boolean isSecondEndOfPair) {
        SAMRecord sr = new SAMRecord(sfh);

        sr.setReadName(runBarcode + ":" + rr.getReadKey() + "#0");
-        sr.setMateUnmappedFlag(true);
        sr.setReadUmappedFlag(true);
        sr.setReadString(rr.getSequenceAsString());
        sr.setBaseQualities(rr.getQuals());

        sr.setReadPairedFlag(isPaired);
        if (isPaired) {
-            sr.setFirstOfPairFlag(!secondEndOfPair);
+            sr.setMateUnmappedFlag(true);
+            sr.setFirstOfPairFlag(!isSecondEndOfPair);
        }
        
        sr.setAttribute("SQ", fpr.getSQTag(rr));
@ -220,9 +246,9 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
    /**
     * Merges two SAM files that have been sorted in queryname order
     * 
-     * @param queryNameSortedUnalignedSam
-     * @param queryNameSortedAlignedSam
-     * @param mergedSam
+     * @param queryNameSortedUnalignedSam  the sorted unaligned SAM file
+     * @param queryNameSortedAlignedSam    the sorted aligned SAM file
+     * @param mergedSam the output file where the merged results should be stored
     */
    private void mergeUnalignedAndAlignedBams(File queryNameSortedUnalignedSam, File queryNameSortedAlignedSam, File mergedSam) {
        SAMFileReader usam = new SAMFileReader(queryNameSortedUnalignedSam);
@ -240,36 +266,54 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
        SAMRecord asr = asamIt.next();

        do {
-            while (usamIt.hasNext() && asamIt.hasNext() && !usr.getReadName().matches(asr.getReadName()) && usr.getReadNegativeStrandFlag() == asr.getReadNegativeStrandFlag()) {
-                int comp = usr.getReadName().compareTo(asr.getReadName());
-
-                if (comp < 0) {
-                    usr = usamIt.next();
-                } else if (comp > 0) {
-                    asr = asamIt.next();
-                }
-            }
-
-            if (usr.getReadName().matches(asr.getReadName()) && usr.getReadNegativeStrandFlag() == asr.getReadNegativeStrandFlag()) {
+            // Pull a record from the unaligned file and advance the aligned file until we find the matching record.  We
+            // don't have to advance the unaligned file until we find our record because we assume every record we generate
+            // will be in the aligned file (which also contains unaligned reads).
+            //
+            // If Picard ever stops storing the unaligned reads, this logic will need to be rewritten.
+            
+            if (usr.getReadName().equals(asr.getReadName()) && usr.getFirstOfPairFlag() == asr.getFirstOfPairFlag()) {
                byte[] sqtag = (byte[]) usr.getAttribute("SQ");
+                String usrread = usr.getReadString();
+                String asrread = asr.getReadString();

-                if (sqtag != null) {
-                    if (asr.getReadNegativeStrandFlag()) {
-                        QualityUtils.reverseComplementCompressedQualityArray(sqtag);
-
-                        asr.setAttribute("SQ", sqtag);
-                    }
+                if (asr.getReadNegativeStrandFlag()) {
+                    sqtag = QualityUtils.reverseComplementCompressedQualityArray(sqtag);
+                    asrread = BaseUtils.simpleReverseComplement(asrread);
                }

-                samOut.addAlignment(asr);
+                if (usrread != null && asrread != null && !usrread.equals(asrread)) {
+                    throw new StingException(
+                        String.format("Purportedly identical unaligned and aligned reads have different read sequences.  Perhaps this lane was reanalyzed by the Illumina software but not the production pipeline?\n '%s:%b:%s'\n '%s:%b:%s'",
+                                      usr.getReadName(), usr.getFirstOfPairFlag(), usrread,
+                                      asr.getReadName(), asr.getFirstOfPairFlag(), asrread));
+                }
+
+                asr.setAttribute("SQ", sqtag);

                usr = usamIt.next();
+            } else {
                asr = asamIt.next();
            }
+
+            samOut.addAlignment(asr);
        } while (usamIt.hasNext() && asamIt.hasNext());

        usam.close();
        asam.close();
        samOut.close();
    }
+
+    /**
+     * For debugging purposes.  Spits out relevant information for two SAMRecords.
+     * 
+     * @param sra  first SAMRecord
+     * @param srb  second SAMRecord
+     */
+    private void printRecords(SAMRecord sra, SAMRecord srb) {
+        System.out.println("a: " + sra.getReadName() + " " + sra.getFirstOfPairFlag());
+        System.out.println("b: " + srb.getReadName() + " " + srb.getFirstOfPairFlag());
+        System.out.println();
+
+    }
 }
--- a/java/src/org/broadinstitute/sting/secondarybase/BasecallingBaseModel.java
+++ b/java/src/org/broadinstitute/sting/secondarybase/BasecallingBaseModel.java
@ -39,7 +39,9 @@ public class BasecallingBaseModel {
    private boolean readyToCall = false;

    /**
-     * Constructor for BasecallingBaseModel
+     * Constructor for BasecallingBaseModel.
+     *
+     * @param correctForContext  should we attempt to correct for contextual sequence effects?
     */
    public BasecallingBaseModel(boolean correctForContext) {
        this.correctForContext = correctForContext;
@ -67,8 +69,12 @@ public class BasecallingBaseModel {
        alg = new Algebra();
    }

+
    /**
     * Add a single training point to the model to estimate the means.
+     *
+     * @param probMatrix     the matrix of probabilities for the base
+     * @param fourIntensity  the four raw intensities for the base
     */
    public void addMeanPoint(double[][] probMatrix, double[] fourIntensity) {
        for (int basePrevIndex = 0; basePrevIndex < numTheories; basePrevIndex++) {
@ -88,6 +94,9 @@ public class BasecallingBaseModel {

    /**
     * Add a single training point to the model to estimate the covariances.
+     *
+     * @param probMatrix     the matrix of probabilities for the base
+     * @param fourIntensity  the four raw intensities for the base
     */
    public void addCovariancePoint(double[][] probMatrix, double[] fourIntensity) {
        for (int basePrevIndex = 0; basePrevIndex < numTheories; basePrevIndex++) {
@ -133,7 +142,7 @@ public class BasecallingBaseModel {
    }

    /**
-     * Compute the likelihood matrix for a base
+     * Compute the likelihood matrix for a base.
     *
     * @param cycle         the cycle we're calling right now
     * @param fourintensity the four intensities of the current cycle's base
@ -163,6 +172,11 @@ public class BasecallingBaseModel {
        return likedist;
    }

+    /**
+     * Write the model parameters to disk.
+     *
+     * @param outparam  the file in which the output parameters should be stored
+     */
    public void write(File outparam) {
        try {
            PrintWriter writer = new PrintWriter(outparam);
--- a/java/src/org/broadinstitute/sting/secondarybase/BasecallingReadModel.java
+++ b/java/src/org/broadinstitute/sting/secondarybase/BasecallingReadModel.java
@ -20,16 +20,31 @@ public class BasecallingReadModel {
    private BasecallingBaseModel[] basemodels = null;
    private boolean correctForContext = true;

+    /**
+     * Constructs a BasecallingReadModel with space for a given read length.
+     * 
+     * @param readLength  the length of the reads to which this model will apply.
+     */
    public BasecallingReadModel(int readLength) {
        initialize(readLength);
    }

+    /**
+     * Constructs a BasecallingReadModel and trains it using the specified training data.
+     *
+     * @param trainingData  a set of RawReads from which the model will be trained.
+     */
    public BasecallingReadModel(ArrayList<RawRead> trainingData) {
        initialize(trainingData.get(0).getReadLength());

        train(trainingData);
    }

+    /**
+     * Initialize the model and set default parameters for each cycle appropriately.
+     *
+     * @param readLength  the length of the reads to which this model will apply.
+     */
    public void initialize(int readLength) {
        basemodels = new BasecallingBaseModel[readLength];

@ -38,8 +53,12 @@ public class BasecallingReadModel {
        }
    }

+    /**
+     * Train the model using the specified training data.
+     *
+     * @param trainingData  a set of RawReads from which the model will be trained.
+     */
    public void train(ArrayList<RawRead> trainingData) {
-        //for (int readIndex = 0; readIndex < trainingData.size(); readIndex++) {
        for ( RawRead read : trainingData ) {
            addMeanPoints(read);
        }
@ -49,10 +68,22 @@ public class BasecallingReadModel {
        }
    }

+    /**
+     * Add a training point for the mean intensity values per base and per cycle.
+     *
+     * @param cycle          the cycle number (0-based)
+     * @param probMatrix     the probability matrix for the base
+     * @param fourintensity  the four raw intensities for the base
+     */
    public void addMeanPoint(int cycle, double[][] probMatrix, double[] fourintensity) {
        basemodels[cycle].addMeanPoint(probMatrix, fourintensity);
    }

+    /**
+     * Add a training point for the mean intensity values per base in all cycles.
+     *
+     * @param read  the raw read
+     */
    public void addMeanPoints(RawRead read) {
        byte[] seqs = read.getSequence();
        byte[] quals = read.getQuals();
@ -74,10 +105,22 @@ public class BasecallingReadModel {
        }
    }

+    /**
+     * Add a training point for the intensity covariance matrix per base and per cycle.
+     *
+     * @param cycle          the cycle number (0-based)
+     * @param probMatrix     the probability matrix for the base
+     * @param fourintensity  the four raw intensities for the base
+     */
    public void addCovariancePoint(int cycle, double[][] probMatrix, double[] fourintensity) {
        basemodels[cycle].addCovariancePoint(probMatrix, fourintensity);
    }

+    /**
+     * Add a training point for the intensity covariance matrix per base in all cycles.
+     *
+     * @param read  the raw read
+     */
    public void addCovariancePoints(RawRead read) {
        byte[] seqs = read.getSequence();
        byte[] quals = read.getQuals();
@ -99,10 +142,26 @@ public class BasecallingReadModel {
        }
    }

+    /**
+     * Compute the likelihoods that a given set of intensities yields each possible base.
+     *
+     * @param cycle          the cycle number (0-based)
+     * @param fourintensity  the four raw intensities for the base
+     * @return               the matrix of likelihoods
+     */
    public double[][] computeLikelihoods(int cycle, double[] fourintensity) {
        return basemodels[cycle].computeLikelihoods(cycle, fourintensity);
    }

+    /**
+     * Compute the probabilities that a given set of intensities yields each possible base.
+     *
+     * @param cycle          the cycle number (0-based)
+     * @param basePrev       the previous base
+     * @param qualPrev       the previous base's quality score
+     * @param fourintensity  the four raw intensities for the base
+     * @return the probability distribution over the four base possibilities
+     */
    public FourProb computeProbabilities(int cycle, char basePrev, byte qualPrev, double[] fourintensity) {
        double[][] likes = computeLikelihoods(cycle, fourintensity);

@ -133,6 +192,12 @@ public class BasecallingReadModel {
        return new FourProb(likes);
    }

+    /**
+     * Call the bases in the given RawRead.
+     *
+     * @param read  the RawRead
+     * @return  the basecalled read
+     */
    public FourProbRead call(RawRead read) {
        FourProbRead fpr = new FourProbRead(read.getReadLength());
        
@ -152,6 +217,15 @@ public class BasecallingReadModel {
        return fpr;
    }

+    /**
+     * Return the probability matrix given the previous cycle's base, the current cycle's base, and the current base's probability.
+     *
+     * @param cycle     the cycle number (0-based)
+     * @param basePrev  the previous base
+     * @param baseCur   the current base
+     * @param probCur   the probability of the current base
+     * @return the probability matrix of the base
+     */
    public double[][] getBaseProbabilityMatrix(int cycle, char basePrev, char baseCur, double probCur) {
        double[][] dist = new double[(correctForContext && cycle > 0) ? 4 : 1][4];

@ -169,6 +243,11 @@ public class BasecallingReadModel {
        return dist;
    }

+    /**
+     * Write model parameters to disk.
+     *
+     * @param dir  the directory in which model parameters should be stored.
+     */
    public void write(File dir) {
        for (int cycle = 0; cycle < basemodels.length; cycle++) {
            File outparam = new File(dir.getPath() + "/param." + cycle + ".r");
--- a/java/src/org/broadinstitute/sting/secondarybase/BasecallingStats.java
+++ b/java/src/org/broadinstitute/sting/secondarybase/BasecallingStats.java
@ -78,7 +78,7 @@ public class BasecallingStats {
    }

    /**
-     * Returns basecalling stats info in a nicely formatted string
+     * Returns basecalling stats info in a nicely formatted string.
     *
     * @return nicely formatted string containing basecalling stats
     */
@ -87,7 +87,7 @@ public class BasecallingStats {
    }

    /**
-     * Periodically print a line containing basecalling stats
+     * Periodically print a line containing basecalling stats.
     *
     * @param interval  the periodicity of the messages given in number of bases observed
     */
@ -98,7 +98,7 @@ public class BasecallingStats {
    }

    /**
-     * Immediately print a line containing basecalling stats
+     * Immediately print a line containing basecalling stats.
     */
    public void notifyNow() {
        System.out.printf("%s\n", toString());
--- a/java/src/org/broadinstitute/sting/secondarybase/BasecallingTrainer.java
+++ b/java/src/org/broadinstitute/sting/secondarybase/BasecallingTrainer.java
@ -40,7 +40,7 @@ public class BasecallingTrainer {
    }

    /**
-     * Get the training data array list
+     * Get the training data array list.
     *
     * @return  the arraylist of raw training reads
     */
@ -49,7 +49,7 @@ public class BasecallingTrainer {
    }

    /**
-     * Set the training data array list
+     * Set the training data array list.
     *
     * @param trainingData  the arraylist of raw training reads
     */
@ -88,7 +88,7 @@ public class BasecallingTrainer {
    }

    /**
-     * Load a training set from perfect reads in an already-aligned bam file
+     * Load a training set from perfect reads in an already-aligned bam file.
     *
     * @param samIn      the SAM/BAM file to load the reads from
     * @param reference  the reference to which the reads should be compared
@ -167,6 +167,12 @@ public class BasecallingTrainer {
        return trainingReads;
    }

+    /**
+     * Correlate the perfect reads with their intensities (at least, theoretically).  This doesn't work right now...
+     *
+     * @param trainingReads  the set of training reads, hashed by tile
+     * @return the final training set with intensities added in
+     */
    private ArrayList<RawRead> correlateReadsAndIntensities(Vector<HashMap<String, SAMRecord>> trainingReads) {
        ArrayList<RawRead> newTrainingData = new ArrayList<RawRead>(trainingLimit);

--- a/java/src/org/broadinstitute/sting/secondarybase/FourProb.java
+++ b/java/src/org/broadinstitute/sting/secondarybase/FourProb.java
@ -50,6 +50,7 @@ public class FourProb {

    /**
     * Returns the base label of the base at the specified rank.
+     *
     * @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned
     * @return the base label (A, C, G, T).
     */
@ -57,6 +58,7 @@ public class FourProb {

    /**
     * Returns the probability of the base at the specified rank.
+     *
     * @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned
     * @return the probability of the base (0.0-1.0)
     */
@ -64,6 +66,7 @@ public class FourProb {

    /**
     * Returns the quality score of the base at the specified rank.
+     *
     * @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned
     * @return the quality score of the base (0-40)
     */
@ -71,6 +74,7 @@ public class FourProb {

    /**
     * A utility method to convert a base index into a base label.
+     *
     * @param baseIndex the index of the base (0, 1, 2, 3).
     * @return A, C, G, T, or '.' if the base index can't be understood.
     */
--- a/java/src/org/broadinstitute/sting/secondarybase/FourProbRead.java
+++ b/java/src/org/broadinstitute/sting/secondarybase/FourProbRead.java
@ -10,7 +10,7 @@ import java.util.ArrayList;
 */
 public class FourProbRead extends ArrayList<FourProb> {
    /**
-     * Initialize the container with the specified capacity
+     * Initialize the container with the specified capacity.
     *
     * @param initialCapacity  the number of bases in the read
     */
@ -23,7 +23,7 @@ public class FourProbRead extends ArrayList<FourProb> {
     *
     * @param cycleStart  the starting cycle (0-based, inclusive)
     * @param cycleStop   the ending cycle (0-based, inclusive)
-     * @return
+     * @return a FourProbRead that is a subset of this FourProbRead
     */
    public FourProbRead getSubset(int cycleStart, int cycleStop) {
        FourProbRead subset = new FourProbRead(cycleStop - cycleStart + 1);
@ -36,7 +36,7 @@ public class FourProbRead extends ArrayList<FourProb> {
    }

    /**
-     * Get the read sequence at a specified rank
+     * Get the read sequence at a specified rank.
     *
     * @param rank  the rank of the sequence to return (0=best, 1=second-best, 2=third-best, 3=fourth-best)
     * @return the read sequence at the specified rank
@ -44,9 +44,7 @@ public class FourProbRead extends ArrayList<FourProb> {
    public String getBaseSequenceAtGivenRank(int rank) {
        String pseq = "";

-        for (int cycle = 0; cycle < this.size(); cycle++) {
-            FourProb fp = this.get(cycle);
-
+        for ( FourProb fp : this ) {
            pseq += fp.baseAtRank(rank);
        }

@ -54,7 +52,8 @@ public class FourProbRead extends ArrayList<FourProb> {
    }

    /**
-     * Get the primary read sequence
+     * Get the primary read sequence.
+     *
     * @return  the primary read sequence
     */
    public String getPrimaryBaseSequence() {
@ -62,7 +61,8 @@ public class FourProbRead extends ArrayList<FourProb> {
    }

    /**
-     * Get the secondary read sequence
+     * Get the secondary read sequence.
+     *
     * @return  the secondary read sequence
     */
    public String getSecondaryBaseSequence() {
@ -73,7 +73,7 @@ public class FourProbRead extends ArrayList<FourProb> {
     * Get the SAM spec-conformant SQ tag that will be written to the SAM/BAM file.
     *
     * @param rr  the raw read
-     * @return the byte array for the SQ tag (first two bits: the base identity, the last six bits: -10*log10(p3/p2).
+     * @return the byte array for the SQ tag (first two bits: the base identity, the last six bits: -10*log10(p3/p2)
     */
    public byte[] getSQTag(RawRead rr) {
        byte[] sqtag = new byte[this.size()];
@ -89,8 +89,6 @@ public class FourProbRead extends ArrayList<FourProb> {
            double qualdiff = -10.0*Math.log10(fp.probAtRank(2)/fp.probAtRank(1));

            sqtag[cycle] = QualityUtils.baseAndProbDiffToCompressedQuality(fpSecondaryBaseIndex, qualdiff);
-
-            //System.out.println("SQTAG: " + qualdiff + " " + sqtag[cycle] + " " + QualityUtils.compressedQualityToProbDiff(sqtag[cycle]));
        }

        return sqtag;
--- a/java/src/org/broadinstitute/sting/secondarybase/IlluminaParser.java
+++ b/java/src/org/broadinstitute/sting/secondarybase/IlluminaParser.java
@ -9,10 +9,20 @@ import java.io.File;
 import java.io.FilenameFilter;
 import java.util.Arrays;
 import java.util.Comparator;
-import java.util.Iterator;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

+/**
+ * IlluminaParser parses raw Illumina data (raw intensities, basecalled read sequences and quality scores)
+ * and presents it to the developer in an easy-to-use form.  It also permits random tile jumping.
+ *
+ * WARNING: This parser does not understand newer GAPipeline data formats, and instead relies on the older
+ * data formats that may have been generated with older Illumina tools.  As a result, this may return
+ * suboptimal data.  This parser only exists temporarily until the Picard team writes a much more sensible
+ * version.  Proceed with caution.
+ *
+ * @author Kiran Garimella
+ */
 public class IlluminaParser implements Closeable {
    private File bustardDir;
    private File firecrestDir;
@ -26,6 +36,12 @@ public class IlluminaParser implements Closeable {
    private PasteParser currentTileParser;
    private String[][] currentParseResult;

+    /**
+     * Construct an IlluminaParser given the Bustard directory and lane.  Infer the Firecrest directory.
+     *
+     * @param bustardDir  the Illumina Bustard directory
+     * @param lane        the Illumina lane
+     */
    public IlluminaParser(File bustardDir, int lane) {
        this.bustardDir = bustardDir;
        this.firecrestDir = bustardDir.getParentFile();
@ -34,6 +50,13 @@ public class IlluminaParser implements Closeable {
        initializeParser();
    }

+    /**
+     * Construct an IlluminaParser given the Bustard directory, Firecrest directory and lane.
+     *
+     * @param bustardDir    the Illumina Bustard directory
+     * @param firecrestDir  the Illumina Firecrest directory
+     * @param lane          the Illumina lane
+     */
    public IlluminaParser(File bustardDir, File firecrestDir, int lane) {
        this.bustardDir = bustardDir;
        this.firecrestDir = firecrestDir;
@ -42,6 +65,9 @@ public class IlluminaParser implements Closeable {
        initializeParser();
    }

+    /**
+     * Initialize the parser and seek to the first tile.
+     */
    private void initializeParser() {
        intfiles = firecrestDir.listFiles(getFilenameFilter("int"));
        seqfiles = bustardDir.listFiles(getFilenameFilter("seq"));
@ -65,6 +91,12 @@ public class IlluminaParser implements Closeable {
        // Todo: put some more consistency checks here
    }

+    /**
+     * Get the filename filter for files of a given type.
+     *
+     * @param suffix  the type (i.e. 'int', 'seq', 'prb').
+     * @return the filename filter
+     */
    private FilenameFilter getFilenameFilter(final String suffix) {
        return new FilenameFilter() {
            public boolean accept(File file, String s) {
@ -76,6 +108,11 @@ public class IlluminaParser implements Closeable {
        };
    }

+    /**
+     * Get a comparator that sorts by tile.
+     *
+     * @return the comparator that sorts by tile.
+     */
    private Comparator<File> getTileSortingComparator() {
        return new Comparator<File>() {
            public int compare(File file1, File file2) {
@ -99,8 +136,19 @@ public class IlluminaParser implements Closeable {
        };
    }

+    /**
+     * Return the number of tiles.
+     *
+     * @return  the number of tiles.
+     */
    public int numTiles() { return intfiles.length; }

+    /**
+     * Seek to a specified tile.
+     *
+     * @param tile  the tile to which we should seek
+     * @return true if we were able to seek to the tile, false if otherwise
+     */
    public boolean seekToTile(int tile) {
        if (tile < intfiles.length - 1) {
            currentTileIndex = tile - 1;
@ -117,10 +165,20 @@ public class IlluminaParser implements Closeable {
        return false;
    }

+    /**
+     * Returns whether the parser has any more data to go through.
+     *
+     * @return  true if there's data left, false if otherwise
+     */
    public boolean hasNext() {
        return (currentTileParser.hasNext() || seekToTile(currentTileIndex + 1));
    }

+    /**
+     * Advance the parser to the next read.
+     *
+     * @return true if successful, false if otherwise
+     */
    public boolean next() {
        if (hasNext()) {
            currentParseResult = currentTileParser.next();
@ -131,22 +189,45 @@ public class IlluminaParser implements Closeable {
        return false;
    }

+    /**
+     * Returns the result from the current parse as an matrix of Strings.
+     *
+     * @return the matrix of Strings containing the current parse result
+     */
    public String[][] getCurrentParseResult() {
        return currentParseResult;
    }

+    /**
+     * Removes, um, something, but in reality, does nothing.
+     */
    public void remove() {
        throw new UnsupportedOperationException("IlluminaParser.remove() method is not supported.");
    }

+    /**
+     * Close the current tile.
+     */
    public void close() {
        currentTileParser.close();
    }

+    /**
+     * Returns a raw read containing the raw intensities, read sequence, and quality scores.
+     *
+     * @return the raw read
+     */
    public RawRead getRawRead() {
        return getSubset(0, currentParseResult[1][4].length() - 1);
    }

+    /**
+     * Returns a subset of the current parse result as a raw read.
+     *
+     * @param cycleStart  the starting cycle for the desired subset
+     * @param cycleStop   the ending cycle for the desired subset
+     * @return the subset of the current parse result as a raw read
+     */
    public RawRead getSubset(int cycleStart, int cycleStop) {
        return new RawRead(currentParseResult, cycleStart, cycleStop);
    }
--- a/java/src/org/broadinstitute/sting/secondarybase/RawRead.java
+++ b/java/src/org/broadinstitute/sting/secondarybase/RawRead.java
@ -55,93 +55,93 @@ public class RawRead {
    }

    /**
-     * Get lane number of read
+     * Get lane number of read.
     *
     * @return lane number of read
     */
    public byte getLane() { return lane; }

    /**
-     * Get tile number of read
+     * Get tile number of read.
     *
     * @return tile number of read
     */
    public int getTile() { return tile; }

    /**
-     * Get x-coordinate of read
+     * Get x-coordinate of read.
     *
     * @return x-coordinate of read
     */
    public int getXCoordinate() { return x; }

    /**
-     * Get y-coordinate of read
+     * Get y-coordinate of read.
     *
     * @return y-coordinate of read
     */
    public int getYCoordinate() { return y; }

    /**
-     * Get read key (lane:tile:x:y)
+     * Get read key (lane:tile:x:y).
     *
     * @return read key (lane:tile:x:y)
     */
    public String getReadKey() { return String.format("%d:%d:%d:%d", lane, tile, x, y); }

    /**
-     * Get the read sequence between the cycles specified in the constructor as a byte array
+     * Get the read sequence between the cycles specified in the constructor as a byte array.
     *
     * @return read sequence
     */
    public byte[] getSequence() { return sequence; }

    /**
-     * Set the read sequence from a byte array
+     * Set the read sequence from a byte array.
     *
     * @param sequence  the read sequence in byte array form
     */
    public void setSequence(byte[] sequence) { this.sequence = sequence; }

    /**
-     * Get the read sequence as a string
+     * Get the read sequence as a String.
     *
-     * @return  the read sequence in string form
+     * @return  the read sequence in String form
     */
    public String getSequenceAsString() {
        return new String(getSequence());
    }

    /**
-     * Get the quals
+     * Get the quals.
     *
     * @return a byte array of quals
     */
    public byte[] getQuals() { return quals; }

    /**
-     * Set the quals
+     * Set the quals.
     *
     * @param quals  a byte array of quals
     */
    public void setQuals(byte[] quals) { this.quals = quals; }

    /**
-     * Get the raw read intensities
+     * Get the raw read intensities.
     *
     * @return the (readLength)x(numChannels) array of raw intensities
     */
    public short[][] getIntensities() { return intensities; }

    /**
-     * Set the raw intensities
+     * Set the raw intensities.
     *
     * @param intensities  the (readLength)x(numChannels) array of raw intensities
     */
    public void setIntensities(short[][] intensities) { this.intensities = intensities; }

    /**
-     * Get the read length
+     * Get the read length.
     *
     * @return the read length
     */