From 6f1559bd7798681229926e46697c65598f32961c Mon Sep 17 00:00:00 2001 From: kiran Date: Fri, 15 May 2009 21:22:24 +0000 Subject: [PATCH] Cleaned up a bit. Added some documentation. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@728 348d0f76-0448-11de-a6fe-93d51630548a --- .../secondarybase/AnnotateSecondaryBase.java | 44 +++++++++++++++---- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/java/src/org/broadinstitute/sting/secondarybase/AnnotateSecondaryBase.java b/java/src/org/broadinstitute/sting/secondarybase/AnnotateSecondaryBase.java index 537dd105f..ba7494074 100755 --- a/java/src/org/broadinstitute/sting/secondarybase/AnnotateSecondaryBase.java +++ b/java/src/org/broadinstitute/sting/secondarybase/AnnotateSecondaryBase.java @@ -10,6 +10,20 @@ import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; import java.io.File; +/** + * AnnotateSecondaryBase computes the second best base for every base in an Illumina lane. + * First, a statistical model is fit to a subset of the raw Illumina intensities (i.e. those + * generated by Illumina's "Firecrest" package). Then, every read's set of raw intensities + * is evaluated against this model to determine the base probability distribution of a given + * base observation. + * + * Approximately 95% of the time, this method and Illumina's basecalling package, "Bustard", + * agree on the identity of the best base. In these cases, we simply annotate the + * second-best base. In cases where this method and Bustard disagree, we annotate the + * secondary base as this method's primary base. 
+ * + * @author Kiran Garimella + */ public class AnnotateSecondaryBase extends CommandLineProgram { public static AnnotateSecondaryBase Instance = null; @@ -33,15 +47,16 @@ public class AnnotateSecondaryBase extends CommandLineProgram { protected int execute() { BasecallingTrainingSet trainingSet = new BasecallingTrainingSet(BUSTARD_DIR, LANE, CYCLE_BEGIN, CYCLE_END, TRAINING_LIMIT); - if (SAM_IN == null || !SAM_IN.exists()) { - // Iterate through raw Firecrest data and store the first N reads up to TRAINING_LIMIT - System.out.println("Loading training set from the first " + TRAINING_LIMIT + " reads in the raw data..."); - trainingSet.loadFirstNUnambiguousReadsTrainingSet(); - } else { - // Find alignments with zero mismatches and store them until we've picked up TRAINING_LIMIT alignments - System.out.println("Loading training set from the first " + TRAINING_LIMIT + " perfect reads in the aligned data..."); - trainingSet.loadPreAlignedTrainingSet(SAM_IN, REFERENCE); - } + /* + // This doesn't work right now... + // Find alignments with zero mismatches and store them until we've picked up TRAINING_LIMIT alignments + System.out.println("Loading training set from the first " + TRAINING_LIMIT + " perfect reads in the aligned data..."); + trainingSet.loadPreAlignedTrainingSet(SAM_IN, REFERENCE); + */ + + // Iterate through raw Firecrest data and store the first N reads up to TRAINING_LIMIT + System.out.println("Loading training set from the first " + TRAINING_LIMIT + " reads in the raw data..."); + trainingSet.loadFirstNUnambiguousReadsTrainingSet(); // Iterate through the stored training data and add the info to the BasecallingReadModel System.out.println("Applying training set..."); @@ -93,6 +108,17 @@ public class AnnotateSecondaryBase extends CommandLineProgram { return 0; } + /** + * Construct a SAMRecord object with the specified information. The secondary bases + * will be annotated such that they will not conflict with the primary base. 
+ * + * @param rr the raw Illumina read + * @param fpr the four-base distributions for every base in the read + * @param sfh the SAM header + * @param runBarcode the run barcode of the lane (used to prefix the reads) + * + * @return a fully-constructed SAM record + */ private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode) { SAMRecord sr = new SAMRecord(sfh);