Cleaned up a bit. Added some documentation.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@728 348d0f76-0448-11de-a6fe-93d51630548a
2009-05-15 21:22:24 +00:00 · 2009-05-15 21:22:24 +00:00 · 6f1559bd77
parent 2c4de7b5c5
commit 6f1559bd77
1 changed files with 35 additions and 9 deletions
--- a/java/src/org/broadinstitute/sting/secondarybase/AnnotateSecondaryBase.java
+++ b/java/src/org/broadinstitute/sting/secondarybase/AnnotateSecondaryBase.java
@@ -10,6 +10,20 @@ import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;

 import java.io.File;

+/**
+ * AnnotateSecondaryBase computes the second best base for every base in an Illumina lane.
+ * First, a statistical model is fit to a subset of the raw Illumina intensities (i.e. those
+ * generated by Illumina's "Firecrest" package).  Then, every read's set of raw intensities
+ * is evaluated against this model to determine the base probability distribution of a given
+ * base observation.
+ *
+ * Approximately 95% of the time, this method and Illumina's basecalling package, "Bustard",
+ * agree on the identity of the best base.  In these cases, we simply annotate the
+ * second-best base.  In cases where this method and Bustard disagree, we annotate the
+ * secondary base as this method's primary base.
+ *
+ * @author Kiran Garimella
+ */
 public class AnnotateSecondaryBase extends CommandLineProgram {
    public static AnnotateSecondaryBase Instance = null;

@@ -33,15 +47,16 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
    protected int execute() {
        BasecallingTrainingSet trainingSet = new BasecallingTrainingSet(BUSTARD_DIR, LANE, CYCLE_BEGIN, CYCLE_END, TRAINING_LIMIT);

-        if (SAM_IN == null || !SAM_IN.exists()) {
-            // Iterate through raw Firecrest data and store the first N reads up to TRAINING_LIMIT
-            System.out.println("Loading training set from the first " + TRAINING_LIMIT + " reads in the raw data...");
-            trainingSet.loadFirstNUnambiguousReadsTrainingSet();
-        } else {
-            // Find alignments with zero mismatches and store them until we've picked up TRAINING_LIMIT alignments
-            System.out.println("Loading training set from the first " + TRAINING_LIMIT + " perfect reads in the aligned data...");
-            trainingSet.loadPreAlignedTrainingSet(SAM_IN, REFERENCE);
-        }
+        /*
+        // This doesn't work right now...
+        // Find alignments with zero mismatches and store them until we've picked up TRAINING_LIMIT alignments
+        System.out.println("Loading training set from the first " + TRAINING_LIMIT + " perfect reads in the aligned data...");
+        trainingSet.loadPreAlignedTrainingSet(SAM_IN, REFERENCE);
+        */
+
+        // Iterate through raw Firecrest data and store the first N reads up to TRAINING_LIMIT
+        System.out.println("Loading training set from the first " + TRAINING_LIMIT + " reads in the raw data...");
+        trainingSet.loadFirstNUnambiguousReadsTrainingSet();

        // Iterate through the stored training data and add the info to the BasecallingReadModel
        System.out.println("Applying training set...");
@@ -93,6 +108,17 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
        return 0;
    }

+    /**
+     * Construct a SAMRecord object with the specified information.  The secondary bases
+     * will be annotated such that they will not conflict with the primary base.
+     *
+     * @param rr          the raw Illumina read
+     * @param fpr         the four-base distributions for every base in the read
+     * @param sfh         the SAM header
+     * @param runBarcode  the run barcode of the lane (used to prefix the reads)
+     *
+     * @return a fully-constructed SAM record
+     */
    private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode) {
        SAMRecord sr = new SAMRecord(sfh);