Added loads of documentation.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@777 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2009-05-21 19:40:47 +00:00
parent 1a9d5cea29
commit 5f67914b08
10 changed files with 320 additions and 94 deletions

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.playground.fourbasecaller;
package org.broadinstitute.sting.secondarybase;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.QualityUtils;

View File

@ -2,18 +2,18 @@ package org.broadinstitute.sting.secondarybase;
import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* AnnotateSecondaryBase computes the second best base for every base in an Illumina lane.
@ -23,24 +23,35 @@ import java.util.regex.Matcher;
* base observation.
*
* Approximately 95% of the time, this method and Illumina's basecalling package, "Bustard",
* agree on the identity of the best base. In these cases, we simply annotate the
* second-best base. In cases where this method and Bustard disagree, we annotate the
* secondary base as this method's primary base.
* agree on the identity of the best base. In these cases, we simply annotate our estimate
* of the second-best base. In cases where this method and Bustard disagree, we annotate
* the secondary base as this method's primary base.
*
* @author Kiran Garimella
*/
/*
An example invocation:
java -Xmx2048m -Djava.io.tmpdir=/broad/hptmp/ -jar /path/to/AnnotateSecondaryBase.jar \
-D /seq/solexaproc2/SL-XAX/analyzed/090217_SL-XAX_0003_FC30R47AAXX/Data/C1-152_Firecrest1.3.2_25-02-2009_prodinfo/Bustard1.3.2_25-02-2009_prodinfo/ \
-L 5 \
-B 30R47AAXX090217
-CR '0-75,76-151' \
-SO ~/test.sam \
-SI /seq/picard/30R47AAXX/C1-152_2009-02-17_2009-04-02/5/Solexa-10265/30R47AAXX.5.aligned.bam \
*/
public class AnnotateSecondaryBase extends CommandLineProgram {
public static AnnotateSecondaryBase Instance = null;
@Argument(fullName="dir", shortName="D", doc="Illumina Bustard directory") public File BUSTARD_DIR;
@Argument(fullName="bustard_dir", shortName="D", doc="Illumina Bustard directory") public File BUSTARD_DIR;
@Argument(fullName="lane", shortName="L", doc="Illumina flowcell lane") public int LANE;
@Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN;
@Argument(fullName="run_barcode", shortName="B", doc="Run barcode (embedded as part of the read name; i.e. 30R47AAXX090217)") public String RUN_BARCODE;
@Argument(fullName="cycle_ranges", shortName="CR", doc="Cycle ranges for single-end or paired reads (i.e. '0-50,56-106') (0-based, inclusive)") public String CYCLE_RANGES;
@Argument(fullName="sam_out", shortName="SO", doc="Output path for sam file") public File SAM_OUT;
@Argument(fullName="reference", shortName="R", doc="Reference sequence to which sam_in is aligned (in fasta format)") public File REFERENCE;
@Argument(fullName="cycle_ranges", shortName="CR", doc="Cycle ranges for single-end or paired reads (i.e. '0-50,56-106') (0-based inclusive)") public String CYCLE_RANGES;
@Argument(fullName="tlim", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 100000;
@Argument(fullName="clim", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE;
@Argument(fullName="runbarcode", shortName="B", doc="Run barcode (embedded as part of the read name)") public String RUN_BARCODE;
@Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN;
@Argument(fullName="training_limit", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 100000;
@Argument(fullName="calling_limit", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE;
public static void main(String[] argv) {
Instance = new AnnotateSecondaryBase();
@ -52,7 +63,7 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
BasecallingTrainer trainer = new BasecallingTrainer(BUSTARD_DIR, LANE, TRAINING_LIMIT);
// Iterate through raw Firecrest data and the first N reads up to TRAINING_LIMIT
// Iterate through raw Firecrest data and store the first unambiguous N reads up to TRAINING_LIMIT
System.out.println("Loading training set from the first " + TRAINING_LIMIT + " unambiguous reads in the raw data...");
trainer.loadFirstNUnambiguousReadsTrainingSet();
@ -65,7 +76,9 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
SAMFileHeader sfh = new SAMFileHeader();
sfh.setSortOrder(SAMFileHeader.SortOrder.queryname);
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, SAM_OUT);
File unalignedSam = (canAnnotate(SAM_IN)) ? getTempSAMFile("unaligned") : SAM_OUT;
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, unalignedSam);
IlluminaParser iparser = new IlluminaParser(BUSTARD_DIR, LANE);
@ -97,21 +110,15 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
if (canAnnotate(SAM_IN)) {
// If we're in annotation mode, annotate the aligned BAM file with the SQ tag
System.out.println("Annotating aligned BAM file...");
System.out.println("Annotating aligned SAM file...");
try {
File sortedSam = File.createTempFile("sorted", ".sam", SAM_OUT.getParentFile());
//System.out.println(" sorted file: " + sortedSam.getAbsolutePath());
sortBAMByReadName(SAM_IN, sortedSam);
System.out.println(" sorting aligned SAM file by read name...");
File alignedSam = getTempSAMFile("aligned");
sortBAMByReadName(SAM_IN, alignedSam);
File mergedSam = File.createTempFile("merged", ".sam", SAM_OUT.getParentFile());
//System.out.println(" merged file: " + mergedSam.getAbsolutePath());
mergeUnalignedAndAlignedBams(SAM_OUT, sortedSam, mergedSam);
} catch (IOException e) {
throw new StingException("There was a problem in trying to merge the unaligned and aligned BAM files.");
}
System.out.println(" merging unaligned and aligned SAM files...");
File mergedSam = SAM_OUT;
mergeUnalignedAndAlignedBams(unalignedSam, alignedSam, mergedSam);
}
System.out.println("Done.");
@ -119,6 +126,23 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
return 0;
}
/**
* Return a tempfile. This is a laziness method so that I don't have to litter my code with try/catch blocks for IOExceptions.
*
* @param prefix the prefix for the temp file
* @return the temp file
*/
private File getTempSAMFile(String prefix) {
try {
File tempFile = File.createTempFile(prefix, ".sam", SAM_OUT.getParentFile());
tempFile.deleteOnExit();
return tempFile;
} catch (IOException e) {
throw new StingException("Unable to create tempfile in directory " + SAM_OUT.getParent());
}
}
/**
* Parse the cycle_ranges string that defines the cycle where a read starts and stops.
* Comma-separated ranges are interpreted to be the first and second end of a pair.
@ -133,8 +157,8 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
Pattern p = Pattern.compile("(\\d+)-(\\d+)");
for (int pieceIndex = 0; pieceIndex < pieces.length; pieceIndex++) {
Matcher m = p.matcher(pieces[pieceIndex]);
for (String piece : pieces) {
Matcher m = p.matcher(piece);
if (m.find()) {
Integer cycleStart = new Integer(m.group(1));
@ -169,25 +193,27 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
* Construct a SAMRecord object with the specified information. The secondary bases
* will be annotated suchthat they will not conflict with the primary base.
*
* @param rr the raw Illumina read
* @param fpr the four-base distributions for every base in the read
* @param sfh the SAM header
* @param runBarcode the run barcode of the lane (used to prefix the reads)
* @param rr the raw Illumina read
* @param fpr the four-base distributions for every base in the read
* @param sfh the SAM header
* @param runBarcode the run barcode of the lane (used to prefix the reads)
* @param isPaired is this a paired-end lane?
* @param isSecondEndOfPair is this the second end of the pair?
*
* @return a fully-constructed SAM record
*/
private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode, boolean isPaired, boolean secondEndOfPair) {
private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode, boolean isPaired, boolean isSecondEndOfPair) {
SAMRecord sr = new SAMRecord(sfh);
sr.setReadName(runBarcode + ":" + rr.getReadKey() + "#0");
sr.setMateUnmappedFlag(true);
sr.setReadUmappedFlag(true);
sr.setReadString(rr.getSequenceAsString());
sr.setBaseQualities(rr.getQuals());
sr.setReadPairedFlag(isPaired);
if (isPaired) {
sr.setFirstOfPairFlag(!secondEndOfPair);
sr.setMateUnmappedFlag(true);
sr.setFirstOfPairFlag(!isSecondEndOfPair);
}
sr.setAttribute("SQ", fpr.getSQTag(rr));
@ -220,9 +246,9 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
/**
* Merges two SAM files that have been sorted in queryname order
*
* @param queryNameSortedUnalignedSam
* @param queryNameSortedAlignedSam
* @param mergedSam
* @param queryNameSortedUnalignedSam the sorted unaligned SAM file
* @param queryNameSortedAlignedSam the sorted aligned SAM file
* @param mergedSam the output file where the merged results should be stored
*/
private void mergeUnalignedAndAlignedBams(File queryNameSortedUnalignedSam, File queryNameSortedAlignedSam, File mergedSam) {
SAMFileReader usam = new SAMFileReader(queryNameSortedUnalignedSam);
@ -240,36 +266,54 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
SAMRecord asr = asamIt.next();
do {
while (usamIt.hasNext() && asamIt.hasNext() && !usr.getReadName().matches(asr.getReadName()) && usr.getReadNegativeStrandFlag() == asr.getReadNegativeStrandFlag()) {
int comp = usr.getReadName().compareTo(asr.getReadName());
if (comp < 0) {
usr = usamIt.next();
} else if (comp > 0) {
asr = asamIt.next();
}
}
if (usr.getReadName().matches(asr.getReadName()) && usr.getReadNegativeStrandFlag() == asr.getReadNegativeStrandFlag()) {
// Pull a record from the unaligned file and advance the aligned file until we find the matching record. We
// don't have to advance the unaligned file until we find our record because we assume every record we generate
// will be in the aligned file (which also contains unaligned reads).
//
// If Picard ever stops storing the unaligned reads, this logic will need to be rewritten.
if (usr.getReadName().equals(asr.getReadName()) && usr.getFirstOfPairFlag() == asr.getFirstOfPairFlag()) {
byte[] sqtag = (byte[]) usr.getAttribute("SQ");
String usrread = usr.getReadString();
String asrread = asr.getReadString();
if (sqtag != null) {
if (asr.getReadNegativeStrandFlag()) {
QualityUtils.reverseComplementCompressedQualityArray(sqtag);
asr.setAttribute("SQ", sqtag);
}
if (asr.getReadNegativeStrandFlag()) {
sqtag = QualityUtils.reverseComplementCompressedQualityArray(sqtag);
asrread = BaseUtils.simpleReverseComplement(asrread);
}
samOut.addAlignment(asr);
if (usrread != null && asrread != null && !usrread.equals(asrread)) {
throw new StingException(
String.format("Purportedly identical unaligned and aligned reads have different read sequences. Perhaps this lane was reanalyzed by the Illumina software but not the production pipeline?\n '%s:%b:%s'\n '%s:%b:%s'",
usr.getReadName(), usr.getFirstOfPairFlag(), usrread,
asr.getReadName(), asr.getFirstOfPairFlag(), asrread));
}
asr.setAttribute("SQ", sqtag);
usr = usamIt.next();
} else {
asr = asamIt.next();
}
samOut.addAlignment(asr);
} while (usamIt.hasNext() && asamIt.hasNext());
usam.close();
asam.close();
samOut.close();
}
/**
* For debugging purposes. Spits out relevant information for two SAMRecords.
*
* @param sra first SAMRecord
* @param srb second SAMRecord
*/
private void printRecords(SAMRecord sra, SAMRecord srb) {
System.out.println("a: " + sra.getReadName() + " " + sra.getFirstOfPairFlag());
System.out.println("b: " + srb.getReadName() + " " + srb.getFirstOfPairFlag());
System.out.println();
}
}

View File

@ -39,7 +39,9 @@ public class BasecallingBaseModel {
private boolean readyToCall = false;
/**
* Constructor for BasecallingBaseModel
* Constructor for BasecallingBaseModel.
*
* @param correctForContext should we attempt to correct for contextual sequence effects?
*/
public BasecallingBaseModel(boolean correctForContext) {
this.correctForContext = correctForContext;
@ -67,8 +69,12 @@ public class BasecallingBaseModel {
alg = new Algebra();
}
/**
* Add a single training point to the model to estimate the means.
*
* @param probMatrix the matrix of probabilities for the base
* @param fourIntensity the four raw intensities for the base
*/
public void addMeanPoint(double[][] probMatrix, double[] fourIntensity) {
for (int basePrevIndex = 0; basePrevIndex < numTheories; basePrevIndex++) {
@ -88,6 +94,9 @@ public class BasecallingBaseModel {
/**
* Add a single training point to the model to estimate the covariances.
*
* @param probMatrix the matrix of probabilities for the base
* @param fourIntensity the four raw intensities for the base
*/
public void addCovariancePoint(double[][] probMatrix, double[] fourIntensity) {
for (int basePrevIndex = 0; basePrevIndex < numTheories; basePrevIndex++) {
@ -133,7 +142,7 @@ public class BasecallingBaseModel {
}
/**
* Compute the likelihood matrix for a base
* Compute the likelihood matrix for a base.
*
* @param cycle the cycle we're calling right now
* @param fourintensity the four intensities of the current cycle's base
@ -163,6 +172,11 @@ public class BasecallingBaseModel {
return likedist;
}
/**
* Write the model parameters to disk.
*
* @param outparam the file in which the output parameters should be stored
*/
public void write(File outparam) {
try {
PrintWriter writer = new PrintWriter(outparam);

View File

@ -20,16 +20,31 @@ public class BasecallingReadModel {
private BasecallingBaseModel[] basemodels = null;
private boolean correctForContext = true;
/**
* Constructs a BasecallingReadModel with space for a given read length.
*
* @param readLength the length of the reads to which this model will apply.
*/
public BasecallingReadModel(int readLength) {
initialize(readLength);
}
/**
* Constructs a BasecallingReadModel and trains it using the specified training data.
*
* @param trainingData a set of RawReads from which the model will be trained.
*/
public BasecallingReadModel(ArrayList<RawRead> trainingData) {
initialize(trainingData.get(0).getReadLength());
train(trainingData);
}
/**
* Initialize the model and set default parameters for each cycle appropriately.
*
* @param readLength the length of the reads to which this model will apply.
*/
public void initialize(int readLength) {
basemodels = new BasecallingBaseModel[readLength];
@ -38,8 +53,12 @@ public class BasecallingReadModel {
}
}
/**
* Train the model using the specified training data.
*
* @param trainingData a set of RawReads from which the model will be trained.
*/
public void train(ArrayList<RawRead> trainingData) {
//for (int readIndex = 0; readIndex < trainingData.size(); readIndex++) {
for ( RawRead read : trainingData ) {
addMeanPoints(read);
}
@ -49,10 +68,22 @@ public class BasecallingReadModel {
}
}
/**
* Add a training point for the mean intensity values per base and per cycle.
*
* @param cycle the cycle number (0-based)
* @param probMatrix the probability matrix for the base
* @param fourintensity the four raw intensities for the base
*/
public void addMeanPoint(int cycle, double[][] probMatrix, double[] fourintensity) {
basemodels[cycle].addMeanPoint(probMatrix, fourintensity);
}
/**
* Add a training point for the mean intensity values per base in all cycles.
*
* @param read the raw read
*/
public void addMeanPoints(RawRead read) {
byte[] seqs = read.getSequence();
byte[] quals = read.getQuals();
@ -74,10 +105,22 @@ public class BasecallingReadModel {
}
}
/**
* Add a training point for the intensity covariance matrix per base and per cycle.
*
* @param cycle the cycle number (0-based)
* @param probMatrix the probability matrix for the base
* @param fourintensity the four raw intensities for the base
*/
public void addCovariancePoint(int cycle, double[][] probMatrix, double[] fourintensity) {
basemodels[cycle].addCovariancePoint(probMatrix, fourintensity);
}
/**
* Add a training point for the intensity covariance matrix per base in all cycles.
*
* @param read the raw read
*/
public void addCovariancePoints(RawRead read) {
byte[] seqs = read.getSequence();
byte[] quals = read.getQuals();
@ -99,10 +142,26 @@ public class BasecallingReadModel {
}
}
/**
* Compute the likelihoods that a given set of intensities yields each possible base.
*
* @param cycle the cycle number (0-based)
* @param fourintensity the four raw intensities for the base
* @return the matrix of likelihoods
*/
public double[][] computeLikelihoods(int cycle, double[] fourintensity) {
return basemodels[cycle].computeLikelihoods(cycle, fourintensity);
}
/**
* Compute the probabilities that a given set of intensities yields each possible base.
*
* @param cycle the cycle number (0-based)
* @param basePrev the previous base
* @param qualPrev the previous base's quality score
* @param fourintensity the four raw intensities for the base
* @return the probability distribution over the four base possibilities
*/
public FourProb computeProbabilities(int cycle, char basePrev, byte qualPrev, double[] fourintensity) {
double[][] likes = computeLikelihoods(cycle, fourintensity);
@ -133,6 +192,12 @@ public class BasecallingReadModel {
return new FourProb(likes);
}
/**
* Call the bases in the given RawRead.
*
* @param read the RawRead
* @return the basecalled read
*/
public FourProbRead call(RawRead read) {
FourProbRead fpr = new FourProbRead(read.getReadLength());
@ -152,6 +217,15 @@ public class BasecallingReadModel {
return fpr;
}
/**
* Return the probability matrix given the previous cycle's base, the current cycle's base, and the current base's probability.
*
* @param cycle the cycle number (0-based)
* @param basePrev the previous base
* @param baseCur the current base
* @param probCur the probability of the current base
* @return the probability matrix of the base
*/
public double[][] getBaseProbabilityMatrix(int cycle, char basePrev, char baseCur, double probCur) {
double[][] dist = new double[(correctForContext && cycle > 0) ? 4 : 1][4];
@ -169,6 +243,11 @@ public class BasecallingReadModel {
return dist;
}
/**
* Write model parameters to disk.
*
* @param dir the directory in which model parameters should be stored.
*/
public void write(File dir) {
for (int cycle = 0; cycle < basemodels.length; cycle++) {
File outparam = new File(dir.getPath() + "/param." + cycle + ".r");

View File

@ -78,7 +78,7 @@ public class BasecallingStats {
}
/**
* Returns basecalling stats info in a nicely formatted string
* Returns basecalling stats info in a nicely formatted string.
*
* @return nicely formatted string containing basecalling stats
*/
@ -87,7 +87,7 @@ public class BasecallingStats {
}
/**
* Periodically print a line containing basecalling stats
* Periodically print a line containing basecalling stats.
*
* @param interval the periodicity of the messages given in number of bases observed
*/
@ -98,7 +98,7 @@ public class BasecallingStats {
}
/**
* Immediately print a line containing basecalling stats
* Immediately print a line containing basecalling stats.
*/
public void notifyNow() {
System.out.printf("%s\n", toString());

View File

@ -40,7 +40,7 @@ public class BasecallingTrainer {
}
/**
* Get the training data array list
* Get the training data array list.
*
* @return the arraylist of raw training reads
*/
@ -49,7 +49,7 @@ public class BasecallingTrainer {
}
/**
* Set the training data array list
* Set the training data array list.
*
* @param trainingData the arraylist of raw training reads
*/
@ -88,7 +88,7 @@ public class BasecallingTrainer {
}
/**
* Load a training set from perfect reads in an already-aligned bam file
* Load a training set from perfect reads in an already-aligned bam file.
*
* @param samIn the SAM/BAM file to load the reads from
* @param reference the reference to which the reads should be compared
@ -167,6 +167,12 @@ public class BasecallingTrainer {
return trainingReads;
}
/**
* Correlate the perfect reads with their intensities (at least, theoretically). This doesn't work right now...
*
* @param trainingReads the set of training reads, hashed by tile
* @return the final training set with intensities added in
*/
private ArrayList<RawRead> correlateReadsAndIntensities(Vector<HashMap<String, SAMRecord>> trainingReads) {
ArrayList<RawRead> newTrainingData = new ArrayList<RawRead>(trainingLimit);

View File

@ -50,6 +50,7 @@ public class FourProb {
/**
* Returns the base label of the base at the specified rank.
*
* @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned
* @return the base label (A, C, G, T).
*/
@ -57,6 +58,7 @@ public class FourProb {
/**
* Returns the probability of the base at the specified rank.
*
* @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned
* @return the probability of the base (0.0-1.0)
*/
@ -64,6 +66,7 @@ public class FourProb {
/**
* Returns the quality score of the base at the specified rank.
*
* @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned
* @return the quality score of the base (0-40)
*/
@ -71,6 +74,7 @@ public class FourProb {
/**
* A utility method to convert a base index into a base label.
*
* @param baseIndex the index of the base (0, 1, 2, 3).
* @return A, C, G, T, or '.' if the base index can't be understood.
*/

View File

@ -10,7 +10,7 @@ import java.util.ArrayList;
*/
public class FourProbRead extends ArrayList<FourProb> {
/**
* Initialize the container with the specified capacity
* Initialize the container with the specified capacity.
*
* @param initialCapacity the number of bases in the read
*/
@ -23,7 +23,7 @@ public class FourProbRead extends ArrayList<FourProb> {
*
* @param cycleStart the starting cycle (0-based, inclusive)
* @param cycleStop the ending cycle (0-based, inclusive)
* @return
* @return a FourProbRead that is a subset of this FourProbRead
*/
public FourProbRead getSubset(int cycleStart, int cycleStop) {
FourProbRead subset = new FourProbRead(cycleStop - cycleStart + 1);
@ -36,7 +36,7 @@ public class FourProbRead extends ArrayList<FourProb> {
}
/**
* Get the read sequence at a specified rank
* Get the read sequence at a specified rank.
*
* @param rank the rank of the sequence to return (0=best, 1=second-best, 2=third-best, 3=fourth-best)
* @return the read sequence at the specified rank
@ -44,9 +44,7 @@ public class FourProbRead extends ArrayList<FourProb> {
public String getBaseSequenceAtGivenRank(int rank) {
String pseq = "";
for (int cycle = 0; cycle < this.size(); cycle++) {
FourProb fp = this.get(cycle);
for ( FourProb fp : this ) {
pseq += fp.baseAtRank(rank);
}
@ -54,7 +52,8 @@ public class FourProbRead extends ArrayList<FourProb> {
}
/**
* Get the primary read sequence
* Get the primary read sequence.
*
* @return the primary read sequence
*/
public String getPrimaryBaseSequence() {
@ -62,7 +61,8 @@ public class FourProbRead extends ArrayList<FourProb> {
}
/**
* Get the secondary read sequence
* Get the secondary read sequence.
*
* @return the secondary read sequence
*/
public String getSecondaryBaseSequence() {
@ -73,7 +73,7 @@ public class FourProbRead extends ArrayList<FourProb> {
* Get the SAM spec-conformant SQ tag that will be written to the SAM/BAM file.
*
* @param rr the raw read
* @return the byte array for the SQ tag (first two bits: the base identity, the last six bits: -10*log10(p3/p2).
* @return the byte array for the SQ tag (first two bits: the base identity, the last six bits: -10*log10(p3/p2)
*/
public byte[] getSQTag(RawRead rr) {
byte[] sqtag = new byte[this.size()];
@ -89,8 +89,6 @@ public class FourProbRead extends ArrayList<FourProb> {
double qualdiff = -10.0*Math.log10(fp.probAtRank(2)/fp.probAtRank(1));
sqtag[cycle] = QualityUtils.baseAndProbDiffToCompressedQuality(fpSecondaryBaseIndex, qualdiff);
//System.out.println("SQTAG: " + qualdiff + " " + sqtag[cycle] + " " + QualityUtils.compressedQualityToProbDiff(sqtag[cycle]));
}
return sqtag;

View File

@ -9,10 +9,20 @@ import java.io.File;
import java.io.FilenameFilter;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* IlluminaParser parses raw Illumina data (raw intensities, basecalled read sequences and quality scores)
* and presents it to the developer in an easy-to-use form. It also permits random tile jumping.
*
* WARNING: This parser does not understand newer GAPipeline data formats, and instead relies on the older
* data formats that may have been generated with older Illumina tools. As a result, this may return
* suboptimal data. This parser only exists temporarily until the Picard team writes a much more sensible
* version. Proceed with caution.
*
* @author Kiran Garimella
*/
public class IlluminaParser implements Closeable {
private File bustardDir;
private File firecrestDir;
@ -26,6 +36,12 @@ public class IlluminaParser implements Closeable {
private PasteParser currentTileParser;
private String[][] currentParseResult;
/**
* Construct an IlluminaParser given the Bustard directory and lane. Infer the Firecrest directory.
*
* @param bustardDir the Illumina Bustard directory
* @param lane the Illumina lane
*/
public IlluminaParser(File bustardDir, int lane) {
this.bustardDir = bustardDir;
this.firecrestDir = bustardDir.getParentFile();
@ -34,6 +50,13 @@ public class IlluminaParser implements Closeable {
initializeParser();
}
/**
* Construct an IlluminaParser given the Bustard directory, Firecrest directory and lane.
*
* @param bustardDir the Illumina Bustard directory
* @param firecrestDir the Illumina Firecrest directory
* @param lane the Illumina lane
*/
public IlluminaParser(File bustardDir, File firecrestDir, int lane) {
this.bustardDir = bustardDir;
this.firecrestDir = firecrestDir;
@ -42,6 +65,9 @@ public class IlluminaParser implements Closeable {
initializeParser();
}
/**
* Initialize the parser and seek to the first tile.
*/
private void initializeParser() {
intfiles = firecrestDir.listFiles(getFilenameFilter("int"));
seqfiles = bustardDir.listFiles(getFilenameFilter("seq"));
@ -65,6 +91,12 @@ public class IlluminaParser implements Closeable {
// Todo: put some more consistency checks here
}
/**
* Get the filename filter for files of a given type.
*
* @param suffix the type (i.e. 'int', 'seq', 'prb').
* @return the filename filter
*/
private FilenameFilter getFilenameFilter(final String suffix) {
return new FilenameFilter() {
public boolean accept(File file, String s) {
@ -76,6 +108,11 @@ public class IlluminaParser implements Closeable {
};
}
/**
* Get a comparator that sorts by tile.
*
* @return the comparator that sorts by tile.
*/
private Comparator<File> getTileSortingComparator() {
return new Comparator<File>() {
public int compare(File file1, File file2) {
@ -99,8 +136,19 @@ public class IlluminaParser implements Closeable {
};
}
/**
* Return the number of tiles.
*
* @return the number of tiles.
*/
public int numTiles() { return intfiles.length; }
/**
* Seek to a specified tile.
*
* @param tile the tile to which we should seek
* @return true if we were able to seek to the tile, false if otherwise
*/
public boolean seekToTile(int tile) {
if (tile < intfiles.length - 1) {
currentTileIndex = tile - 1;
@ -117,10 +165,20 @@ public class IlluminaParser implements Closeable {
return false;
}
/**
* Returns whether the parser has any more data to go through.
*
* @return true if there's data left, false if otherwise
*/
public boolean hasNext() {
return (currentTileParser.hasNext() || seekToTile(currentTileIndex + 1));
}
/**
* Advance the parser to the next read.
*
* @return true if successful, false if otherwise
*/
public boolean next() {
if (hasNext()) {
currentParseResult = currentTileParser.next();
@ -131,22 +189,45 @@ public class IlluminaParser implements Closeable {
return false;
}
/**
* Returns the result from the current parse as an matrix of Strings.
*
* @return the matrix of Strings containing the current parse result
*/
public String[][] getCurrentParseResult() {
return currentParseResult;
}
/**
* Removes, um, something, but in reality, does nothing.
*/
public void remove() {
throw new UnsupportedOperationException("IlluminaParser.remove() method is not supported.");
}
/**
* Close the current tile.
*/
public void close() {
currentTileParser.close();
}
/**
* Returns a raw read containing the raw intensities, read sequence, and quality scores.
*
* @return the raw read
*/
public RawRead getRawRead() {
return getSubset(0, currentParseResult[1][4].length() - 1);
}
/**
* Returns a subset of the current parse result as a raw read.
*
* @param cycleStart the starting cycle for the desired subset
* @param cycleStop the ending cycle for the desired subset
* @return the subset of the current parse result as a raw read
*/
public RawRead getSubset(int cycleStart, int cycleStop) {
return new RawRead(currentParseResult, cycleStart, cycleStop);
}

View File

@ -55,93 +55,93 @@ public class RawRead {
}
/**
* Get lane number of read
* Get lane number of read.
*
* @return lane number of read
*/
public byte getLane() { return lane; }
/**
* Get tile number of read
* Get tile number of read.
*
* @return tile number of read
*/
public int getTile() { return tile; }
/**
* Get x-coordinate of read
* Get x-coordinate of read.
*
* @return x-coordinate of read
*/
public int getXCoordinate() { return x; }
/**
* Get y-coordinate of read
* Get y-coordinate of read.
*
* @return y-coordinate of read
*/
public int getYCoordinate() { return y; }
/**
* Get read key (lane:tile:x:y)
* Get read key (lane:tile:x:y).
*
* @return read key (lane:tile:x:y)
*/
public String getReadKey() { return String.format("%d:%d:%d:%d", lane, tile, x, y); }
/**
* Get the read sequence between the cycles specified in the constructor as a byte array
* Get the read sequence between the cycles specified in the constructor as a byte array.
*
* @return read sequence
*/
public byte[] getSequence() { return sequence; }
/**
* Set the read sequence from a byte array
* Set the read sequence from a byte array.
*
* @param sequence the read sequence in byte array form
*/
public void setSequence(byte[] sequence) { this.sequence = sequence; }
/**
* Get the read sequence as a string
* Get the read sequence as a String.
*
* @return the read sequence in string form
* @return the read sequence in String form
*/
public String getSequenceAsString() {
return new String(getSequence());
}
/**
* Get the quals
* Get the quals.
*
* @return a byte array of quals
*/
public byte[] getQuals() { return quals; }
/**
* Set the quals
* Set the quals.
*
* @param quals a byte array of quals
*/
public void setQuals(byte[] quals) { this.quals = quals; }
/**
* Get the raw read intensities
* Get the raw read intensities.
*
* @return the (readLength)x(numChannels) array of raw intensities
*/
public short[][] getIntensities() { return intensities; }
/**
* Set the raw intensities
* Set the raw intensities.
*
* @param intensities the (readLength)x(numChannels) array of raw intensities
*/
public void setIntensities(short[][] intensities) { this.intensities = intensities; }
/**
* Get the read length
* Get the read length.
*
* @return the read length
*/