Cleaned up a bit. Added some documentation.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@728 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2c4de7b5c5
commit
6f1559bd77
|
|
@ -10,6 +10,20 @@ import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
|||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* AnnotateSecondaryBase computes the second-best base for every base in an Illumina lane.
|
||||
* First, a statistical model is fit to a subset of the raw Illumina intensities (i.e. those
|
||||
* generated by Illumina's "Firecrest" package). Then, every read's set of raw intensities
|
||||
* is evaluated against this model to determine the base probability distribution of a given
|
||||
* base observation.
|
||||
*
|
||||
* Approximately 95% of the time, this method and Illumina's basecalling package, "Bustard",
|
||||
* agree on the identity of the best base. In these cases, we simply annotate the
|
||||
* second-best base. In cases where this method and Bustard disagree, we annotate the
|
||||
* secondary base as this method's primary base.
|
||||
*
|
||||
* @author Kiran Garimella
|
||||
*/
|
||||
public class AnnotateSecondaryBase extends CommandLineProgram {
|
||||
public static AnnotateSecondaryBase Instance = null;
|
||||
|
||||
|
|
@ -33,15 +47,16 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
|
|||
protected int execute() {
|
||||
BasecallingTrainingSet trainingSet = new BasecallingTrainingSet(BUSTARD_DIR, LANE, CYCLE_BEGIN, CYCLE_END, TRAINING_LIMIT);
|
||||
|
||||
if (SAM_IN == null || !SAM_IN.exists()) {
|
||||
// Iterate through raw Firecrest data and store the first N reads up to TRAINING_LIMIT
|
||||
System.out.println("Loading training set from the first " + TRAINING_LIMIT + " reads in the raw data...");
|
||||
trainingSet.loadFirstNUnambiguousReadsTrainingSet();
|
||||
} else {
|
||||
// Find alignments with zero mismatches and store them until we've picked up TRAINING_LIMIT alignments
|
||||
System.out.println("Loading training set from the first " + TRAINING_LIMIT + " perfect reads in the aligned data...");
|
||||
trainingSet.loadPreAlignedTrainingSet(SAM_IN, REFERENCE);
|
||||
}
|
||||
/*
|
||||
// This doesn't work right now...
|
||||
// Find alignments with zero mismatches and store them until we've picked up TRAINING_LIMIT alignments
|
||||
System.out.println("Loading training set from the first " + TRAINING_LIMIT + " perfect reads in the aligned data...");
|
||||
trainingSet.loadPreAlignedTrainingSet(SAM_IN, REFERENCE);
|
||||
*/
|
||||
|
||||
// Iterate through raw Firecrest data and store the first N reads up to TRAINING_LIMIT
|
||||
System.out.println("Loading training set from the first " + TRAINING_LIMIT + " reads in the raw data...");
|
||||
trainingSet.loadFirstNUnambiguousReadsTrainingSet();
|
||||
|
||||
// Iterate through the stored training data and add the info to the BasecallingReadModel
|
||||
System.out.println("Applying training set...");
|
||||
|
|
@ -93,6 +108,17 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
|
|||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a SAMRecord object with the specified information. The secondary bases
|
||||
* will be annotated such that they will not conflict with the primary base.
|
||||
*
|
||||
* @param rr the raw Illumina read
|
||||
* @param fpr the four-base distributions for every base in the read
|
||||
* @param sfh the SAM header
|
||||
* @param runBarcode the run barcode of the lane (used to prefix the reads)
|
||||
*
|
||||
* @return a fully-constructed SAM record
|
||||
*/
|
||||
private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode) {
|
||||
SAMRecord sr = new SAMRecord(sfh);
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue