Cleaned up a bit. Added some documentation.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@728 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2009-05-15 21:22:24 +00:00
parent 2c4de7b5c5
commit 6f1559bd77
1 changed files with 35 additions and 9 deletions

View File

@ -10,6 +10,20 @@ import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import java.io.File;
/**
* AnnotateSecondaryBase computes the second-best base for every base in an Illumina lane.
* First, a statistical model is fit to a subset of the raw Illumina intensities (i.e. those
* generated by Illumina's "Firecrest" package). Then, every read's set of raw intensities
* is evaluated against this model to determine the base probability distribution of a given
* base observation.
*
* Approximately 95% of the time, this method and Illumina's basecalling package, "Bustard",
* agree on the identity of the best base. In these cases, we simply annotate the
* second-best base. In cases where this method and Bustard disagree, we annotate the
* secondary base as this method's primary base.
*
* @author Kiran Garimella
*/
public class AnnotateSecondaryBase extends CommandLineProgram {
public static AnnotateSecondaryBase Instance = null;
@ -33,15 +47,16 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
protected int execute() {
BasecallingTrainingSet trainingSet = new BasecallingTrainingSet(BUSTARD_DIR, LANE, CYCLE_BEGIN, CYCLE_END, TRAINING_LIMIT);
if (SAM_IN == null || !SAM_IN.exists()) {
// Iterate through raw Firecrest data and store the first N reads up to TRAINING_LIMIT
System.out.println("Loading training set from the first " + TRAINING_LIMIT + " reads in the raw data...");
trainingSet.loadFirstNUnambiguousReadsTrainingSet();
} else {
// Find alignments with zero mismatches and store them until we've picked up TRAINING_LIMIT alignments
System.out.println("Loading training set from the first " + TRAINING_LIMIT + " perfect reads in the aligned data...");
trainingSet.loadPreAlignedTrainingSet(SAM_IN, REFERENCE);
}
/*
// This doesn't work right now...
// Find alignments with zero mismatches and store them until we've picked up TRAINING_LIMIT alignments
System.out.println("Loading training set from the first " + TRAINING_LIMIT + " perfect reads in the aligned data...");
trainingSet.loadPreAlignedTrainingSet(SAM_IN, REFERENCE);
*/
// Iterate through raw Firecrest data and store the first N reads up to TRAINING_LIMIT
System.out.println("Loading training set from the first " + TRAINING_LIMIT + " reads in the raw data...");
trainingSet.loadFirstNUnambiguousReadsTrainingSet();
// Iterate through the stored training data and add the info to the BasecallingReadModel
System.out.println("Applying training set...");
@ -93,6 +108,17 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
return 0;
}
/**
* Construct a SAMRecord object with the specified information. The secondary bases
* will be annotated such that they will not conflict with the primary base.
*
* @param rr the raw Illumina read
* @param fpr the four-base distributions for every base in the read
* @param sfh the SAM header
* @param runBarcode the run barcode of the lane (used to prefix the reads)
*
* @return a fully-constructed SAM record
*/
private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode) {
SAMRecord sr = new SAMRecord(sfh);