Added loads of documentation.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@777 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
1a9d5cea29
commit
5f67914b08
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.playground.fourbasecaller;
|
||||
package org.broadinstitute.sting.secondarybase;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
|
|
|||
|
|
@ -2,18 +2,18 @@ package org.broadinstitute.sting.secondarybase;
|
|||
|
||||
import net.sf.samtools.*;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.ArrayList;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* AnnotateSecondaryBase computes the second best base for every base in an Illumina lane.
|
||||
|
|
@ -23,24 +23,35 @@ import java.util.regex.Matcher;
|
|||
* base observation.
|
||||
*
|
||||
* Approximately 95% of the time, this method and Illumina's basecalling package, "Bustard",
|
||||
* agree on the identity of the best base. In these cases, we simply annotate the
|
||||
* second-best base. In cases where this method and Bustard disagree, we annotate the
|
||||
* secondary base as this method's primary base.
|
||||
* agree on the identity of the best base. In these cases, we simply annotate our estimate
|
||||
* of the second-best base. In cases where this method and Bustard disagree, we annotate
|
||||
* the secondary base as this method's primary base.
|
||||
*
|
||||
* @author Kiran Garimella
|
||||
*/
|
||||
|
||||
/*
|
||||
An example invocation:
|
||||
java -Xmx2048m -Djava.io.tmpdir=/broad/hptmp/ -jar /path/to/AnnotateSecondaryBase.jar \
|
||||
-D /seq/solexaproc2/SL-XAX/analyzed/090217_SL-XAX_0003_FC30R47AAXX/Data/C1-152_Firecrest1.3.2_25-02-2009_prodinfo/Bustard1.3.2_25-02-2009_prodinfo/ \
|
||||
-L 5 \
|
||||
-B 30R47AAXX090217
|
||||
-CR '0-75,76-151' \
|
||||
-SO ~/test.sam \
|
||||
-SI /seq/picard/30R47AAXX/C1-152_2009-02-17_2009-04-02/5/Solexa-10265/30R47AAXX.5.aligned.bam \
|
||||
*/
|
||||
|
||||
public class AnnotateSecondaryBase extends CommandLineProgram {
|
||||
public static AnnotateSecondaryBase Instance = null;
|
||||
|
||||
@Argument(fullName="dir", shortName="D", doc="Illumina Bustard directory") public File BUSTARD_DIR;
|
||||
@Argument(fullName="bustard_dir", shortName="D", doc="Illumina Bustard directory") public File BUSTARD_DIR;
|
||||
@Argument(fullName="lane", shortName="L", doc="Illumina flowcell lane") public int LANE;
|
||||
@Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN;
|
||||
@Argument(fullName="run_barcode", shortName="B", doc="Run barcode (embedded as part of the read name; i.e. 30R47AAXX090217)") public String RUN_BARCODE;
|
||||
@Argument(fullName="cycle_ranges", shortName="CR", doc="Cycle ranges for single-end or paired reads (i.e. '0-50,56-106') (0-based, inclusive)") public String CYCLE_RANGES;
|
||||
@Argument(fullName="sam_out", shortName="SO", doc="Output path for sam file") public File SAM_OUT;
|
||||
@Argument(fullName="reference", shortName="R", doc="Reference sequence to which sam_in is aligned (in fasta format)") public File REFERENCE;
|
||||
@Argument(fullName="cycle_ranges", shortName="CR", doc="Cycle ranges for single-end or paired reads (i.e. '0-50,56-106') (0-based inclusive)") public String CYCLE_RANGES;
|
||||
@Argument(fullName="tlim", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 100000;
|
||||
@Argument(fullName="clim", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE;
|
||||
@Argument(fullName="runbarcode", shortName="B", doc="Run barcode (embedded as part of the read name)") public String RUN_BARCODE;
|
||||
@Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN;
|
||||
@Argument(fullName="training_limit", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 100000;
|
||||
@Argument(fullName="calling_limit", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE;
|
||||
|
||||
public static void main(String[] argv) {
|
||||
Instance = new AnnotateSecondaryBase();
|
||||
|
|
@ -52,7 +63,7 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
|
|||
|
||||
BasecallingTrainer trainer = new BasecallingTrainer(BUSTARD_DIR, LANE, TRAINING_LIMIT);
|
||||
|
||||
// Iterate through raw Firecrest data and the first N reads up to TRAINING_LIMIT
|
||||
// Iterate through raw Firecrest data and store the first unambiguous N reads up to TRAINING_LIMIT
|
||||
System.out.println("Loading training set from the first " + TRAINING_LIMIT + " unambiguous reads in the raw data...");
|
||||
trainer.loadFirstNUnambiguousReadsTrainingSet();
|
||||
|
||||
|
|
@ -65,7 +76,9 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
|
|||
|
||||
SAMFileHeader sfh = new SAMFileHeader();
|
||||
sfh.setSortOrder(SAMFileHeader.SortOrder.queryname);
|
||||
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, SAM_OUT);
|
||||
|
||||
File unalignedSam = (canAnnotate(SAM_IN)) ? getTempSAMFile("unaligned") : SAM_OUT;
|
||||
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, unalignedSam);
|
||||
|
||||
IlluminaParser iparser = new IlluminaParser(BUSTARD_DIR, LANE);
|
||||
|
||||
|
|
@ -97,21 +110,15 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
|
|||
|
||||
if (canAnnotate(SAM_IN)) {
|
||||
// If we're in annotation mode, annotate the aligned BAM file with the SQ tag
|
||||
System.out.println("Annotating aligned BAM file...");
|
||||
System.out.println("Annotating aligned SAM file...");
|
||||
|
||||
try {
|
||||
File sortedSam = File.createTempFile("sorted", ".sam", SAM_OUT.getParentFile());
|
||||
//System.out.println(" sorted file: " + sortedSam.getAbsolutePath());
|
||||
|
||||
sortBAMByReadName(SAM_IN, sortedSam);
|
||||
System.out.println(" sorting aligned SAM file by read name...");
|
||||
File alignedSam = getTempSAMFile("aligned");
|
||||
sortBAMByReadName(SAM_IN, alignedSam);
|
||||
|
||||
File mergedSam = File.createTempFile("merged", ".sam", SAM_OUT.getParentFile());
|
||||
//System.out.println(" merged file: " + mergedSam.getAbsolutePath());
|
||||
|
||||
mergeUnalignedAndAlignedBams(SAM_OUT, sortedSam, mergedSam);
|
||||
} catch (IOException e) {
|
||||
throw new StingException("There was a problem in trying to merge the unaligned and aligned BAM files.");
|
||||
}
|
||||
System.out.println(" merging unaligned and aligned SAM files...");
|
||||
File mergedSam = SAM_OUT;
|
||||
mergeUnalignedAndAlignedBams(unalignedSam, alignedSam, mergedSam);
|
||||
}
|
||||
|
||||
System.out.println("Done.");
|
||||
|
|
@ -119,6 +126,23 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
|
|||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a tempfile. This is a laziness method so that I don't have to litter my code with try/catch blocks for IOExceptions.
|
||||
*
|
||||
* @param prefix the prefix for the temp file
|
||||
* @return the temp file
|
||||
*/
|
||||
private File getTempSAMFile(String prefix) {
|
||||
try {
|
||||
File tempFile = File.createTempFile(prefix, ".sam", SAM_OUT.getParentFile());
|
||||
tempFile.deleteOnExit();
|
||||
|
||||
return tempFile;
|
||||
} catch (IOException e) {
|
||||
throw new StingException("Unable to create tempfile in directory " + SAM_OUT.getParent());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse the cycle_ranges string that defines the cycle where a read starts and stops.
|
||||
* Comma-separated ranges are interpreted to be the first and second end of a pair.
|
||||
|
|
@ -133,8 +157,8 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
|
|||
|
||||
Pattern p = Pattern.compile("(\\d+)-(\\d+)");
|
||||
|
||||
for (int pieceIndex = 0; pieceIndex < pieces.length; pieceIndex++) {
|
||||
Matcher m = p.matcher(pieces[pieceIndex]);
|
||||
for (String piece : pieces) {
|
||||
Matcher m = p.matcher(piece);
|
||||
|
||||
if (m.find()) {
|
||||
Integer cycleStart = new Integer(m.group(1));
|
||||
|
|
@ -169,25 +193,27 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
|
|||
* Construct a SAMRecord object with the specified information. The secondary bases
|
||||
* will be annotated suchthat they will not conflict with the primary base.
|
||||
*
|
||||
* @param rr the raw Illumina read
|
||||
* @param fpr the four-base distributions for every base in the read
|
||||
* @param sfh the SAM header
|
||||
* @param runBarcode the run barcode of the lane (used to prefix the reads)
|
||||
* @param rr the raw Illumina read
|
||||
* @param fpr the four-base distributions for every base in the read
|
||||
* @param sfh the SAM header
|
||||
* @param runBarcode the run barcode of the lane (used to prefix the reads)
|
||||
* @param isPaired is this a paired-end lane?
|
||||
* @param isSecondEndOfPair is this the second end of the pair?
|
||||
*
|
||||
* @return a fully-constructed SAM record
|
||||
*/
|
||||
private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode, boolean isPaired, boolean secondEndOfPair) {
|
||||
private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode, boolean isPaired, boolean isSecondEndOfPair) {
|
||||
SAMRecord sr = new SAMRecord(sfh);
|
||||
|
||||
sr.setReadName(runBarcode + ":" + rr.getReadKey() + "#0");
|
||||
sr.setMateUnmappedFlag(true);
|
||||
sr.setReadUmappedFlag(true);
|
||||
sr.setReadString(rr.getSequenceAsString());
|
||||
sr.setBaseQualities(rr.getQuals());
|
||||
|
||||
sr.setReadPairedFlag(isPaired);
|
||||
if (isPaired) {
|
||||
sr.setFirstOfPairFlag(!secondEndOfPair);
|
||||
sr.setMateUnmappedFlag(true);
|
||||
sr.setFirstOfPairFlag(!isSecondEndOfPair);
|
||||
}
|
||||
|
||||
sr.setAttribute("SQ", fpr.getSQTag(rr));
|
||||
|
|
@ -220,9 +246,9 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
|
|||
/**
|
||||
* Merges two SAM files that have been sorted in queryname order
|
||||
*
|
||||
* @param queryNameSortedUnalignedSam
|
||||
* @param queryNameSortedAlignedSam
|
||||
* @param mergedSam
|
||||
* @param queryNameSortedUnalignedSam the sorted unaligned SAM file
|
||||
* @param queryNameSortedAlignedSam the sorted aligned SAM file
|
||||
* @param mergedSam the output file where the merged results should be stored
|
||||
*/
|
||||
private void mergeUnalignedAndAlignedBams(File queryNameSortedUnalignedSam, File queryNameSortedAlignedSam, File mergedSam) {
|
||||
SAMFileReader usam = new SAMFileReader(queryNameSortedUnalignedSam);
|
||||
|
|
@ -240,36 +266,54 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
|
|||
SAMRecord asr = asamIt.next();
|
||||
|
||||
do {
|
||||
while (usamIt.hasNext() && asamIt.hasNext() && !usr.getReadName().matches(asr.getReadName()) && usr.getReadNegativeStrandFlag() == asr.getReadNegativeStrandFlag()) {
|
||||
int comp = usr.getReadName().compareTo(asr.getReadName());
|
||||
|
||||
if (comp < 0) {
|
||||
usr = usamIt.next();
|
||||
} else if (comp > 0) {
|
||||
asr = asamIt.next();
|
||||
}
|
||||
}
|
||||
|
||||
if (usr.getReadName().matches(asr.getReadName()) && usr.getReadNegativeStrandFlag() == asr.getReadNegativeStrandFlag()) {
|
||||
// Pull a record from the unaligned file and advance the aligned file until we find the matching record. We
|
||||
// don't have to advance the unaligned file until we find our record because we assume every record we generate
|
||||
// will be in the aligned file (which also contains unaligned reads).
|
||||
//
|
||||
// If Picard ever stops storing the unaligned reads, this logic will need to be rewritten.
|
||||
|
||||
if (usr.getReadName().equals(asr.getReadName()) && usr.getFirstOfPairFlag() == asr.getFirstOfPairFlag()) {
|
||||
byte[] sqtag = (byte[]) usr.getAttribute("SQ");
|
||||
String usrread = usr.getReadString();
|
||||
String asrread = asr.getReadString();
|
||||
|
||||
if (sqtag != null) {
|
||||
if (asr.getReadNegativeStrandFlag()) {
|
||||
QualityUtils.reverseComplementCompressedQualityArray(sqtag);
|
||||
|
||||
asr.setAttribute("SQ", sqtag);
|
||||
}
|
||||
if (asr.getReadNegativeStrandFlag()) {
|
||||
sqtag = QualityUtils.reverseComplementCompressedQualityArray(sqtag);
|
||||
asrread = BaseUtils.simpleReverseComplement(asrread);
|
||||
}
|
||||
|
||||
samOut.addAlignment(asr);
|
||||
if (usrread != null && asrread != null && !usrread.equals(asrread)) {
|
||||
throw new StingException(
|
||||
String.format("Purportedly identical unaligned and aligned reads have different read sequences. Perhaps this lane was reanalyzed by the Illumina software but not the production pipeline?\n '%s:%b:%s'\n '%s:%b:%s'",
|
||||
usr.getReadName(), usr.getFirstOfPairFlag(), usrread,
|
||||
asr.getReadName(), asr.getFirstOfPairFlag(), asrread));
|
||||
}
|
||||
|
||||
asr.setAttribute("SQ", sqtag);
|
||||
|
||||
usr = usamIt.next();
|
||||
} else {
|
||||
asr = asamIt.next();
|
||||
}
|
||||
|
||||
samOut.addAlignment(asr);
|
||||
} while (usamIt.hasNext() && asamIt.hasNext());
|
||||
|
||||
usam.close();
|
||||
asam.close();
|
||||
samOut.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* For debugging purposes. Spits out relevant information for two SAMRecords.
|
||||
*
|
||||
* @param sra first SAMRecord
|
||||
* @param srb second SAMRecord
|
||||
*/
|
||||
private void printRecords(SAMRecord sra, SAMRecord srb) {
|
||||
System.out.println("a: " + sra.getReadName() + " " + sra.getFirstOfPairFlag());
|
||||
System.out.println("b: " + srb.getReadName() + " " + srb.getFirstOfPairFlag());
|
||||
System.out.println();
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,7 +39,9 @@ public class BasecallingBaseModel {
|
|||
private boolean readyToCall = false;
|
||||
|
||||
/**
|
||||
* Constructor for BasecallingBaseModel
|
||||
* Constructor for BasecallingBaseModel.
|
||||
*
|
||||
* @param correctForContext should we attempt to correct for contextual sequence effects?
|
||||
*/
|
||||
public BasecallingBaseModel(boolean correctForContext) {
|
||||
this.correctForContext = correctForContext;
|
||||
|
|
@ -67,8 +69,12 @@ public class BasecallingBaseModel {
|
|||
alg = new Algebra();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Add a single training point to the model to estimate the means.
|
||||
*
|
||||
* @param probMatrix the matrix of probabilities for the base
|
||||
* @param fourIntensity the four raw intensities for the base
|
||||
*/
|
||||
public void addMeanPoint(double[][] probMatrix, double[] fourIntensity) {
|
||||
for (int basePrevIndex = 0; basePrevIndex < numTheories; basePrevIndex++) {
|
||||
|
|
@ -88,6 +94,9 @@ public class BasecallingBaseModel {
|
|||
|
||||
/**
|
||||
* Add a single training point to the model to estimate the covariances.
|
||||
*
|
||||
* @param probMatrix the matrix of probabilities for the base
|
||||
* @param fourIntensity the four raw intensities for the base
|
||||
*/
|
||||
public void addCovariancePoint(double[][] probMatrix, double[] fourIntensity) {
|
||||
for (int basePrevIndex = 0; basePrevIndex < numTheories; basePrevIndex++) {
|
||||
|
|
@ -133,7 +142,7 @@ public class BasecallingBaseModel {
|
|||
}
|
||||
|
||||
/**
|
||||
* Compute the likelihood matrix for a base
|
||||
* Compute the likelihood matrix for a base.
|
||||
*
|
||||
* @param cycle the cycle we're calling right now
|
||||
* @param fourintensity the four intensities of the current cycle's base
|
||||
|
|
@ -163,6 +172,11 @@ public class BasecallingBaseModel {
|
|||
return likedist;
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the model parameters to disk.
|
||||
*
|
||||
* @param outparam the file in which the output parameters should be stored
|
||||
*/
|
||||
public void write(File outparam) {
|
||||
try {
|
||||
PrintWriter writer = new PrintWriter(outparam);
|
||||
|
|
|
|||
|
|
@ -20,16 +20,31 @@ public class BasecallingReadModel {
|
|||
private BasecallingBaseModel[] basemodels = null;
|
||||
private boolean correctForContext = true;
|
||||
|
||||
/**
|
||||
* Constructs a BasecallingReadModel with space for a given read length.
|
||||
*
|
||||
* @param readLength the length of the reads to which this model will apply.
|
||||
*/
|
||||
public BasecallingReadModel(int readLength) {
|
||||
initialize(readLength);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a BasecallingReadModel and trains it using the specified training data.
|
||||
*
|
||||
* @param trainingData a set of RawReads from which the model will be trained.
|
||||
*/
|
||||
public BasecallingReadModel(ArrayList<RawRead> trainingData) {
|
||||
initialize(trainingData.get(0).getReadLength());
|
||||
|
||||
train(trainingData);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the model and set default parameters for each cycle appropriately.
|
||||
*
|
||||
* @param readLength the length of the reads to which this model will apply.
|
||||
*/
|
||||
public void initialize(int readLength) {
|
||||
basemodels = new BasecallingBaseModel[readLength];
|
||||
|
||||
|
|
@ -38,8 +53,12 @@ public class BasecallingReadModel {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Train the model using the specified training data.
|
||||
*
|
||||
* @param trainingData a set of RawReads from which the model will be trained.
|
||||
*/
|
||||
public void train(ArrayList<RawRead> trainingData) {
|
||||
//for (int readIndex = 0; readIndex < trainingData.size(); readIndex++) {
|
||||
for ( RawRead read : trainingData ) {
|
||||
addMeanPoints(read);
|
||||
}
|
||||
|
|
@ -49,10 +68,22 @@ public class BasecallingReadModel {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a training point for the mean intensity values per base and per cycle.
|
||||
*
|
||||
* @param cycle the cycle number (0-based)
|
||||
* @param probMatrix the probability matrix for the base
|
||||
* @param fourintensity the four raw intensities for the base
|
||||
*/
|
||||
public void addMeanPoint(int cycle, double[][] probMatrix, double[] fourintensity) {
|
||||
basemodels[cycle].addMeanPoint(probMatrix, fourintensity);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a training point for the mean intensity values per base in all cycles.
|
||||
*
|
||||
* @param read the raw read
|
||||
*/
|
||||
public void addMeanPoints(RawRead read) {
|
||||
byte[] seqs = read.getSequence();
|
||||
byte[] quals = read.getQuals();
|
||||
|
|
@ -74,10 +105,22 @@ public class BasecallingReadModel {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a training point for the intensity covariance matrix per base and per cycle.
|
||||
*
|
||||
* @param cycle the cycle number (0-based)
|
||||
* @param probMatrix the probability matrix for the base
|
||||
* @param fourintensity the four raw intensities for the base
|
||||
*/
|
||||
public void addCovariancePoint(int cycle, double[][] probMatrix, double[] fourintensity) {
|
||||
basemodels[cycle].addCovariancePoint(probMatrix, fourintensity);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a training point for the intensity covariance matrix per base in all cycles.
|
||||
*
|
||||
* @param read the raw read
|
||||
*/
|
||||
public void addCovariancePoints(RawRead read) {
|
||||
byte[] seqs = read.getSequence();
|
||||
byte[] quals = read.getQuals();
|
||||
|
|
@ -99,10 +142,26 @@ public class BasecallingReadModel {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the likelihoods that a given set of intensities yields each possible base.
|
||||
*
|
||||
* @param cycle the cycle number (0-based)
|
||||
* @param fourintensity the four raw intensities for the base
|
||||
* @return the matrix of likelihoods
|
||||
*/
|
||||
public double[][] computeLikelihoods(int cycle, double[] fourintensity) {
|
||||
return basemodels[cycle].computeLikelihoods(cycle, fourintensity);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the probabilities that a given set of intensities yields each possible base.
|
||||
*
|
||||
* @param cycle the cycle number (0-based)
|
||||
* @param basePrev the previous base
|
||||
* @param qualPrev the previous base's quality score
|
||||
* @param fourintensity the four raw intensities for the base
|
||||
* @return the probability distribution over the four base possibilities
|
||||
*/
|
||||
public FourProb computeProbabilities(int cycle, char basePrev, byte qualPrev, double[] fourintensity) {
|
||||
double[][] likes = computeLikelihoods(cycle, fourintensity);
|
||||
|
||||
|
|
@ -133,6 +192,12 @@ public class BasecallingReadModel {
|
|||
return new FourProb(likes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Call the bases in the given RawRead.
|
||||
*
|
||||
* @param read the RawRead
|
||||
* @return the basecalled read
|
||||
*/
|
||||
public FourProbRead call(RawRead read) {
|
||||
FourProbRead fpr = new FourProbRead(read.getReadLength());
|
||||
|
||||
|
|
@ -152,6 +217,15 @@ public class BasecallingReadModel {
|
|||
return fpr;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the probability matrix given the previous cycle's base, the current cycle's base, and the current base's probability.
|
||||
*
|
||||
* @param cycle the cycle number (0-based)
|
||||
* @param basePrev the previous base
|
||||
* @param baseCur the current base
|
||||
* @param probCur the probability of the current base
|
||||
* @return the probability matrix of the base
|
||||
*/
|
||||
public double[][] getBaseProbabilityMatrix(int cycle, char basePrev, char baseCur, double probCur) {
|
||||
double[][] dist = new double[(correctForContext && cycle > 0) ? 4 : 1][4];
|
||||
|
||||
|
|
@ -169,6 +243,11 @@ public class BasecallingReadModel {
|
|||
return dist;
|
||||
}
|
||||
|
||||
/**
|
||||
* Write model parameters to disk.
|
||||
*
|
||||
* @param dir the directory in which model parameters should be stored.
|
||||
*/
|
||||
public void write(File dir) {
|
||||
for (int cycle = 0; cycle < basemodels.length; cycle++) {
|
||||
File outparam = new File(dir.getPath() + "/param." + cycle + ".r");
|
||||
|
|
|
|||
|
|
@ -78,7 +78,7 @@ public class BasecallingStats {
|
|||
}
|
||||
|
||||
/**
|
||||
* Returns basecalling stats info in a nicely formatted string
|
||||
* Returns basecalling stats info in a nicely formatted string.
|
||||
*
|
||||
* @return nicely formatted string containing basecalling stats
|
||||
*/
|
||||
|
|
@ -87,7 +87,7 @@ public class BasecallingStats {
|
|||
}
|
||||
|
||||
/**
|
||||
* Periodically print a line containing basecalling stats
|
||||
* Periodically print a line containing basecalling stats.
|
||||
*
|
||||
* @param interval the periodicity of the messages given in number of bases observed
|
||||
*/
|
||||
|
|
@ -98,7 +98,7 @@ public class BasecallingStats {
|
|||
}
|
||||
|
||||
/**
|
||||
* Immediately print a line containing basecalling stats
|
||||
* Immediately print a line containing basecalling stats.
|
||||
*/
|
||||
public void notifyNow() {
|
||||
System.out.printf("%s\n", toString());
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ public class BasecallingTrainer {
|
|||
}
|
||||
|
||||
/**
|
||||
* Get the training data array list
|
||||
* Get the training data array list.
|
||||
*
|
||||
* @return the arraylist of raw training reads
|
||||
*/
|
||||
|
|
@ -49,7 +49,7 @@ public class BasecallingTrainer {
|
|||
}
|
||||
|
||||
/**
|
||||
* Set the training data array list
|
||||
* Set the training data array list.
|
||||
*
|
||||
* @param trainingData the arraylist of raw training reads
|
||||
*/
|
||||
|
|
@ -88,7 +88,7 @@ public class BasecallingTrainer {
|
|||
}
|
||||
|
||||
/**
|
||||
* Load a training set from perfect reads in an already-aligned bam file
|
||||
* Load a training set from perfect reads in an already-aligned bam file.
|
||||
*
|
||||
* @param samIn the SAM/BAM file to load the reads from
|
||||
* @param reference the reference to which the reads should be compared
|
||||
|
|
@ -167,6 +167,12 @@ public class BasecallingTrainer {
|
|||
return trainingReads;
|
||||
}
|
||||
|
||||
/**
|
||||
* Correlate the perfect reads with their intensities (at least, theoretically). This doesn't work right now...
|
||||
*
|
||||
* @param trainingReads the set of training reads, hashed by tile
|
||||
* @return the final training set with intensities added in
|
||||
*/
|
||||
private ArrayList<RawRead> correlateReadsAndIntensities(Vector<HashMap<String, SAMRecord>> trainingReads) {
|
||||
ArrayList<RawRead> newTrainingData = new ArrayList<RawRead>(trainingLimit);
|
||||
|
||||
|
|
|
|||
|
|
@ -50,6 +50,7 @@ public class FourProb {
|
|||
|
||||
/**
|
||||
* Returns the base label of the base at the specified rank.
|
||||
*
|
||||
* @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned
|
||||
* @return the base label (A, C, G, T).
|
||||
*/
|
||||
|
|
@ -57,6 +58,7 @@ public class FourProb {
|
|||
|
||||
/**
|
||||
* Returns the probability of the base at the specified rank.
|
||||
*
|
||||
* @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned
|
||||
* @return the probability of the base (0.0-1.0)
|
||||
*/
|
||||
|
|
@ -64,6 +66,7 @@ public class FourProb {
|
|||
|
||||
/**
|
||||
* Returns the quality score of the base at the specified rank.
|
||||
*
|
||||
* @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned
|
||||
* @return the quality score of the base (0-40)
|
||||
*/
|
||||
|
|
@ -71,6 +74,7 @@ public class FourProb {
|
|||
|
||||
/**
|
||||
* A utility method to convert a base index into a base label.
|
||||
*
|
||||
* @param baseIndex the index of the base (0, 1, 2, 3).
|
||||
* @return A, C, G, T, or '.' if the base index can't be understood.
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ import java.util.ArrayList;
|
|||
*/
|
||||
public class FourProbRead extends ArrayList<FourProb> {
|
||||
/**
|
||||
* Initialize the container with the specified capacity
|
||||
* Initialize the container with the specified capacity.
|
||||
*
|
||||
* @param initialCapacity the number of bases in the read
|
||||
*/
|
||||
|
|
@ -23,7 +23,7 @@ public class FourProbRead extends ArrayList<FourProb> {
|
|||
*
|
||||
* @param cycleStart the starting cycle (0-based, inclusive)
|
||||
* @param cycleStop the ending cycle (0-based, inclusive)
|
||||
* @return
|
||||
* @return a FourProbRead that is a subset of this FourProbRead
|
||||
*/
|
||||
public FourProbRead getSubset(int cycleStart, int cycleStop) {
|
||||
FourProbRead subset = new FourProbRead(cycleStop - cycleStart + 1);
|
||||
|
|
@ -36,7 +36,7 @@ public class FourProbRead extends ArrayList<FourProb> {
|
|||
}
|
||||
|
||||
/**
|
||||
* Get the read sequence at a specified rank
|
||||
* Get the read sequence at a specified rank.
|
||||
*
|
||||
* @param rank the rank of the sequence to return (0=best, 1=second-best, 2=third-best, 3=fourth-best)
|
||||
* @return the read sequence at the specified rank
|
||||
|
|
@ -44,9 +44,7 @@ public class FourProbRead extends ArrayList<FourProb> {
|
|||
public String getBaseSequenceAtGivenRank(int rank) {
|
||||
String pseq = "";
|
||||
|
||||
for (int cycle = 0; cycle < this.size(); cycle++) {
|
||||
FourProb fp = this.get(cycle);
|
||||
|
||||
for ( FourProb fp : this ) {
|
||||
pseq += fp.baseAtRank(rank);
|
||||
}
|
||||
|
||||
|
|
@ -54,7 +52,8 @@ public class FourProbRead extends ArrayList<FourProb> {
|
|||
}
|
||||
|
||||
/**
|
||||
* Get the primary read sequence
|
||||
* Get the primary read sequence.
|
||||
*
|
||||
* @return the primary read sequence
|
||||
*/
|
||||
public String getPrimaryBaseSequence() {
|
||||
|
|
@ -62,7 +61,8 @@ public class FourProbRead extends ArrayList<FourProb> {
|
|||
}
|
||||
|
||||
/**
|
||||
* Get the secondary read sequence
|
||||
* Get the secondary read sequence.
|
||||
*
|
||||
* @return the secondary read sequence
|
||||
*/
|
||||
public String getSecondaryBaseSequence() {
|
||||
|
|
@ -73,7 +73,7 @@ public class FourProbRead extends ArrayList<FourProb> {
|
|||
* Get the SAM spec-conformant SQ tag that will be written to the SAM/BAM file.
|
||||
*
|
||||
* @param rr the raw read
|
||||
* @return the byte array for the SQ tag (first two bits: the base identity, the last six bits: -10*log10(p3/p2).
|
||||
* @return the byte array for the SQ tag (first two bits: the base identity, the last six bits: -10*log10(p3/p2)
|
||||
*/
|
||||
public byte[] getSQTag(RawRead rr) {
|
||||
byte[] sqtag = new byte[this.size()];
|
||||
|
|
@ -89,8 +89,6 @@ public class FourProbRead extends ArrayList<FourProb> {
|
|||
double qualdiff = -10.0*Math.log10(fp.probAtRank(2)/fp.probAtRank(1));
|
||||
|
||||
sqtag[cycle] = QualityUtils.baseAndProbDiffToCompressedQuality(fpSecondaryBaseIndex, qualdiff);
|
||||
|
||||
//System.out.println("SQTAG: " + qualdiff + " " + sqtag[cycle] + " " + QualityUtils.compressedQualityToProbDiff(sqtag[cycle]));
|
||||
}
|
||||
|
||||
return sqtag;
|
||||
|
|
|
|||
|
|
@ -9,10 +9,20 @@ import java.io.File;
|
|||
import java.io.FilenameFilter;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* IlluminaParser parses raw Illumina data (raw intensities, basecalled read sequences and quality scores)
|
||||
* and presents it to the developer in an easy-to-use form. It also permits random tile jumping.
|
||||
*
|
||||
* WARNING: This parser does not understand newer GAPipeline data formats, and instead relies on the older
|
||||
* data formats that may have been generated with older Illumina tools. As a result, this may return
|
||||
* suboptimal data. This parser only exists temporarily until the Picard team writes a much more sensible
|
||||
* version. Proceed with caution.
|
||||
*
|
||||
* @author Kiran Garimella
|
||||
*/
|
||||
public class IlluminaParser implements Closeable {
|
||||
private File bustardDir;
|
||||
private File firecrestDir;
|
||||
|
|
@ -26,6 +36,12 @@ public class IlluminaParser implements Closeable {
|
|||
private PasteParser currentTileParser;
|
||||
private String[][] currentParseResult;
|
||||
|
||||
/**
|
||||
* Construct an IlluminaParser given the Bustard directory and lane. Infer the Firecrest directory.
|
||||
*
|
||||
* @param bustardDir the Illumina Bustard directory
|
||||
* @param lane the Illumina lane
|
||||
*/
|
||||
public IlluminaParser(File bustardDir, int lane) {
|
||||
this.bustardDir = bustardDir;
|
||||
this.firecrestDir = bustardDir.getParentFile();
|
||||
|
|
@ -34,6 +50,13 @@ public class IlluminaParser implements Closeable {
|
|||
initializeParser();
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct an IlluminaParser given the Bustard directory, Firecrest directory and lane.
|
||||
*
|
||||
* @param bustardDir the Illumina Bustard directory
|
||||
* @param firecrestDir the Illumina Firecrest directory
|
||||
* @param lane the Illumina lane
|
||||
*/
|
||||
public IlluminaParser(File bustardDir, File firecrestDir, int lane) {
|
||||
this.bustardDir = bustardDir;
|
||||
this.firecrestDir = firecrestDir;
|
||||
|
|
@ -42,6 +65,9 @@ public class IlluminaParser implements Closeable {
|
|||
initializeParser();
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the parser and seek to the first tile.
|
||||
*/
|
||||
private void initializeParser() {
|
||||
intfiles = firecrestDir.listFiles(getFilenameFilter("int"));
|
||||
seqfiles = bustardDir.listFiles(getFilenameFilter("seq"));
|
||||
|
|
@ -65,6 +91,12 @@ public class IlluminaParser implements Closeable {
|
|||
// Todo: put some more consistency checks here
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the filename filter for files of a given type.
|
||||
*
|
||||
* @param suffix the type (i.e. 'int', 'seq', 'prb').
|
||||
* @return the filename filter
|
||||
*/
|
||||
private FilenameFilter getFilenameFilter(final String suffix) {
|
||||
return new FilenameFilter() {
|
||||
public boolean accept(File file, String s) {
|
||||
|
|
@ -76,6 +108,11 @@ public class IlluminaParser implements Closeable {
|
|||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a comparator that sorts by tile.
|
||||
*
|
||||
* @return the comparator that sorts by tile.
|
||||
*/
|
||||
private Comparator<File> getTileSortingComparator() {
|
||||
return new Comparator<File>() {
|
||||
public int compare(File file1, File file2) {
|
||||
|
|
@ -99,8 +136,19 @@ public class IlluminaParser implements Closeable {
|
|||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of tiles.
|
||||
*
|
||||
* @return the number of tiles.
|
||||
*/
|
||||
public int numTiles() { return intfiles.length; }
|
||||
|
||||
/**
|
||||
* Seek to a specified tile.
|
||||
*
|
||||
* @param tile the tile to which we should seek
|
||||
* @return true if we were able to seek to the tile, false if otherwise
|
||||
*/
|
||||
public boolean seekToTile(int tile) {
|
||||
if (tile < intfiles.length - 1) {
|
||||
currentTileIndex = tile - 1;
|
||||
|
|
@ -117,10 +165,20 @@ public class IlluminaParser implements Closeable {
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether the parser has any more data to go through.
|
||||
*
|
||||
* @return true if there's data left, false if otherwise
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return (currentTileParser.hasNext() || seekToTile(currentTileIndex + 1));
|
||||
}
|
||||
|
||||
/**
|
||||
* Advance the parser to the next read.
|
||||
*
|
||||
* @return true if successful, false if otherwise
|
||||
*/
|
||||
public boolean next() {
|
||||
if (hasNext()) {
|
||||
currentParseResult = currentTileParser.next();
|
||||
|
|
@ -131,22 +189,45 @@ public class IlluminaParser implements Closeable {
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the result from the current parse as an matrix of Strings.
|
||||
*
|
||||
* @return the matrix of Strings containing the current parse result
|
||||
*/
|
||||
public String[][] getCurrentParseResult() {
|
||||
return currentParseResult;
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes, um, something, but in reality, does nothing.
|
||||
*/
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("IlluminaParser.remove() method is not supported.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Close the current tile.
|
||||
*/
|
||||
public void close() {
|
||||
currentTileParser.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a raw read containing the raw intensities, read sequence, and quality scores.
|
||||
*
|
||||
* @return the raw read
|
||||
*/
|
||||
public RawRead getRawRead() {
|
||||
return getSubset(0, currentParseResult[1][4].length() - 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a subset of the current parse result as a raw read.
|
||||
*
|
||||
* @param cycleStart the starting cycle for the desired subset
|
||||
* @param cycleStop the ending cycle for the desired subset
|
||||
* @return the subset of the current parse result as a raw read
|
||||
*/
|
||||
public RawRead getSubset(int cycleStart, int cycleStop) {
|
||||
return new RawRead(currentParseResult, cycleStart, cycleStop);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -55,93 +55,93 @@ public class RawRead {
|
|||
}
|
||||
|
||||
/**
|
||||
* Get lane number of read
|
||||
* Get lane number of read.
|
||||
*
|
||||
* @return lane number of read
|
||||
*/
|
||||
public byte getLane() { return lane; }
|
||||
|
||||
/**
|
||||
* Get tile number of read
|
||||
* Get tile number of read.
|
||||
*
|
||||
* @return tile number of read
|
||||
*/
|
||||
public int getTile() { return tile; }
|
||||
|
||||
/**
|
||||
* Get x-coordinate of read
|
||||
* Get x-coordinate of read.
|
||||
*
|
||||
* @return x-coordinate of read
|
||||
*/
|
||||
public int getXCoordinate() { return x; }
|
||||
|
||||
/**
|
||||
* Get y-coordinate of read
|
||||
* Get y-coordinate of read.
|
||||
*
|
||||
* @return y-coordinate of read
|
||||
*/
|
||||
public int getYCoordinate() { return y; }
|
||||
|
||||
/**
|
||||
* Get read key (lane:tile:x:y)
|
||||
* Get read key (lane:tile:x:y).
|
||||
*
|
||||
* @return read key (lane:tile:x:y)
|
||||
*/
|
||||
public String getReadKey() { return String.format("%d:%d:%d:%d", lane, tile, x, y); }
|
||||
|
||||
/**
|
||||
* Get the read sequence between the cycles specified in the constructor as a byte array
|
||||
* Get the read sequence between the cycles specified in the constructor as a byte array.
|
||||
*
|
||||
* @return read sequence
|
||||
*/
|
||||
public byte[] getSequence() { return sequence; }
|
||||
|
||||
/**
|
||||
* Set the read sequence from a byte array
|
||||
* Set the read sequence from a byte array.
|
||||
*
|
||||
* @param sequence the read sequence in byte array form
|
||||
*/
|
||||
public void setSequence(byte[] sequence) { this.sequence = sequence; }
|
||||
|
||||
/**
|
||||
* Get the read sequence as a string
|
||||
* Get the read sequence as a String.
|
||||
*
|
||||
* @return the read sequence in string form
|
||||
* @return the read sequence in String form
|
||||
*/
|
||||
public String getSequenceAsString() {
|
||||
return new String(getSequence());
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the quals
|
||||
* Get the quals.
|
||||
*
|
||||
* @return a byte array of quals
|
||||
*/
|
||||
public byte[] getQuals() { return quals; }
|
||||
|
||||
/**
|
||||
* Set the quals
|
||||
* Set the quals.
|
||||
*
|
||||
* @param quals a byte array of quals
|
||||
*/
|
||||
public void setQuals(byte[] quals) { this.quals = quals; }
|
||||
|
||||
/**
|
||||
* Get the raw read intensities
|
||||
* Get the raw read intensities.
|
||||
*
|
||||
* @return the (readLength)x(numChannels) array of raw intensities
|
||||
*/
|
||||
public short[][] getIntensities() { return intensities; }
|
||||
|
||||
/**
|
||||
* Set the raw intensities
|
||||
* Set the raw intensities.
|
||||
*
|
||||
* @param intensities the (readLength)x(numChannels) array of raw intensities
|
||||
*/
|
||||
public void setIntensities(short[][] intensities) { this.intensities = intensities; }
|
||||
|
||||
/**
|
||||
* Get the read length
|
||||
* Get the read length.
|
||||
*
|
||||
* @return the read length
|
||||
*/
|
||||
|
|
|
|||
Loading…
Reference in New Issue