diff --git a/java/src/org/broadinstitute/sting/secondarybase/AddFourProbsToSAM.java b/java/src/org/broadinstitute/sting/secondarybase/AddFourProbsToSAM.java index de0544e9e..7d918801d 100755 --- a/java/src/org/broadinstitute/sting/secondarybase/AddFourProbsToSAM.java +++ b/java/src/org/broadinstitute/sting/secondarybase/AddFourProbsToSAM.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.fourbasecaller; +package org.broadinstitute.sting.secondarybase; import net.sf.samtools.*; import org.broadinstitute.sting.utils.QualityUtils; diff --git a/java/src/org/broadinstitute/sting/secondarybase/AnnotateSecondaryBase.java b/java/src/org/broadinstitute/sting/secondarybase/AnnotateSecondaryBase.java index d24b2b80c..31ac2bd17 100755 --- a/java/src/org/broadinstitute/sting/secondarybase/AnnotateSecondaryBase.java +++ b/java/src/org/broadinstitute/sting/secondarybase/AnnotateSecondaryBase.java @@ -2,18 +2,18 @@ package org.broadinstitute.sting.secondarybase; import net.sf.samtools.*; import net.sf.samtools.util.CloseableIterator; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; import java.io.File; import java.io.IOException; -import java.util.HashMap; import java.util.ArrayList; -import java.util.regex.Pattern; import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * AnnotateSecondaryBase computes the second best base for every base in an Illumina lane. @@ -23,24 +23,35 @@ import java.util.regex.Matcher; * base observation. * * Approximately 95% of the time, this method and Illumina's basecalling package, "Bustard", - * agree on the identity of the best base. In these cases, we simply annotate the - * second-best base. In cases where this method and Bustard disagree, we annotate the - * secondary base as this method's primary base. + * agree on the identity of the best base. In these cases, we simply annotate our estimate + * of the second-best base. In cases where this method and Bustard disagree, we annotate + * the secondary base as this method's primary base. * * @author Kiran Garimella */ + +/* + An example invocation: + java -Xmx2048m -Djava.io.tmpdir=/broad/hptmp/ -jar /path/to/AnnotateSecondaryBase.jar \ + -D /seq/solexaproc2/SL-XAX/analyzed/090217_SL-XAX_0003_FC30R47AAXX/Data/C1-152_Firecrest1.3.2_25-02-2009_prodinfo/Bustard1.3.2_25-02-2009_prodinfo/ \ + -L 5 \ + -B 30R47AAXX090217 + -CR '0-75,76-151' \ + -SO ~/test.sam \ + -SI /seq/picard/30R47AAXX/C1-152_2009-02-17_2009-04-02/5/Solexa-10265/30R47AAXX.5.aligned.bam \ + */ + public class AnnotateSecondaryBase extends CommandLineProgram { public static AnnotateSecondaryBase Instance = null; - @Argument(fullName="dir", shortName="D", doc="Illumina Bustard directory") public File BUSTARD_DIR; + @Argument(fullName="bustard_dir", shortName="D", doc="Illumina Bustard directory") public File BUSTARD_DIR; @Argument(fullName="lane", shortName="L", doc="Illumina flowcell lane") public int LANE; - @Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN; + @Argument(fullName="run_barcode", shortName="B", doc="Run barcode (embedded as part of the read name; i.e. 30R47AAXX090217)") public String RUN_BARCODE; + @Argument(fullName="cycle_ranges", shortName="CR", doc="Cycle ranges for single-end or paired reads (i.e. '0-50,56-106') (0-based, inclusive)") public String CYCLE_RANGES; @Argument(fullName="sam_out", shortName="SO", doc="Output path for sam file") public File SAM_OUT; - @Argument(fullName="reference", shortName="R", doc="Reference sequence to which sam_in is aligned (in fasta format)") public File REFERENCE; - @Argument(fullName="cycle_ranges", shortName="CR", doc="Cycle ranges for single-end or paired reads (i.e. '0-50,56-106') (0-based inclusive)") public String CYCLE_RANGES; - @Argument(fullName="tlim", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 100000; - @Argument(fullName="clim", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE; - @Argument(fullName="runbarcode", shortName="B", doc="Run barcode (embedded as part of the read name)") public String RUN_BARCODE; + @Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN; + @Argument(fullName="training_limit", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 100000; + @Argument(fullName="calling_limit", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE; public static void main(String[] argv) { Instance = new AnnotateSecondaryBase(); @@ -52,7 +63,7 @@ public class AnnotateSecondaryBase extends CommandLineProgram { BasecallingTrainer trainer = new BasecallingTrainer(BUSTARD_DIR, LANE, TRAINING_LIMIT); - // Iterate through raw Firecrest data and the first N reads up to TRAINING_LIMIT + // Iterate through raw Firecrest data and store the first unambiguous N reads up to TRAINING_LIMIT System.out.println("Loading training set from the first " + TRAINING_LIMIT + " unambiguous reads in the raw data..."); trainer.loadFirstNUnambiguousReadsTrainingSet(); @@ -65,7 +76,9 @@ public class AnnotateSecondaryBase extends CommandLineProgram { SAMFileHeader sfh = new SAMFileHeader(); sfh.setSortOrder(SAMFileHeader.SortOrder.queryname); - SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, SAM_OUT); + + File unalignedSam = (canAnnotate(SAM_IN)) ? getTempSAMFile("unaligned") : SAM_OUT; + SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, unalignedSam); IlluminaParser iparser = new IlluminaParser(BUSTARD_DIR, LANE); @@ -97,21 +110,15 @@ public class AnnotateSecondaryBase extends CommandLineProgram { if (canAnnotate(SAM_IN)) { // If we're in annotation mode, annotate the aligned BAM file with the SQ tag - System.out.println("Annotating aligned BAM file..."); + System.out.println("Annotating aligned SAM file..."); - try { - File sortedSam = File.createTempFile("sorted", ".sam", SAM_OUT.getParentFile()); - //System.out.println(" sorted file: " + sortedSam.getAbsolutePath()); - - sortBAMByReadName(SAM_IN, sortedSam); + System.out.println(" sorting aligned SAM file by read name..."); + File alignedSam = getTempSAMFile("aligned"); + sortBAMByReadName(SAM_IN, alignedSam); - File mergedSam = File.createTempFile("merged", ".sam", SAM_OUT.getParentFile()); - //System.out.println(" merged file: " + mergedSam.getAbsolutePath()); - - mergeUnalignedAndAlignedBams(SAM_OUT, sortedSam, mergedSam); - } catch (IOException e) { - throw new StingException("There was a problem in trying to merge the unaligned and aligned BAM files."); - } + System.out.println(" merging unaligned and aligned SAM files..."); + File mergedSam = SAM_OUT; + mergeUnalignedAndAlignedBams(unalignedSam, alignedSam, mergedSam); } System.out.println("Done."); @@ -119,6 +126,23 @@ public class AnnotateSecondaryBase extends CommandLineProgram { return 0; } + /** + * Return a tempfile. This is a laziness method so that I don't have to litter my code with try/catch blocks for IOExceptions. + * + * @param prefix the prefix for the temp file + * @return the temp file + */ + private File getTempSAMFile(String prefix) { + try { + File tempFile = File.createTempFile(prefix, ".sam", SAM_OUT.getParentFile()); + tempFile.deleteOnExit(); + + return tempFile; + } catch (IOException e) { + throw new StingException("Unable to create tempfile in directory " + SAM_OUT.getParent()); + } + } + /** * Parse the cycle_ranges string that defines the cycle where a read starts and stops. * Comma-separated ranges are interpreted to be the first and second end of a pair. @@ -133,8 +157,8 @@ public class AnnotateSecondaryBase extends CommandLineProgram { Pattern p = Pattern.compile("(\\d+)-(\\d+)"); - for (int pieceIndex = 0; pieceIndex < pieces.length; pieceIndex++) { - Matcher m = p.matcher(pieces[pieceIndex]); + for (String piece : pieces) { + Matcher m = p.matcher(piece); if (m.find()) { Integer cycleStart = new Integer(m.group(1)); @@ -169,25 +193,27 @@ public class AnnotateSecondaryBase extends CommandLineProgram { * Construct a SAMRecord object with the specified information. The secondary bases * will be annotated suchthat they will not conflict with the primary base. * - * @param rr the raw Illumina read - * @param fpr the four-base distributions for every base in the read - * @param sfh the SAM header - * @param runBarcode the run barcode of the lane (used to prefix the reads) + * @param rr the raw Illumina read + * @param fpr the four-base distributions for every base in the read + * @param sfh the SAM header + * @param runBarcode the run barcode of the lane (used to prefix the reads) + * @param isPaired is this a paired-end lane? + * @param isSecondEndOfPair is this the second end of the pair? * * @return a fully-constructed SAM record */ - private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode, boolean isPaired, boolean secondEndOfPair) { + private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode, boolean isPaired, boolean isSecondEndOfPair) { SAMRecord sr = new SAMRecord(sfh); sr.setReadName(runBarcode + ":" + rr.getReadKey() + "#0"); - sr.setMateUnmappedFlag(true); sr.setReadUmappedFlag(true); sr.setReadString(rr.getSequenceAsString()); sr.setBaseQualities(rr.getQuals()); sr.setReadPairedFlag(isPaired); if (isPaired) { - sr.setFirstOfPairFlag(!secondEndOfPair); + sr.setMateUnmappedFlag(true); + sr.setFirstOfPairFlag(!isSecondEndOfPair); } sr.setAttribute("SQ", fpr.getSQTag(rr)); @@ -220,9 +246,9 @@ public class AnnotateSecondaryBase extends CommandLineProgram { /** * Merges two SAM files that have been sorted in queryname order * - * @param queryNameSortedUnalignedSam - * @param queryNameSortedAlignedSam - * @param mergedSam + * @param queryNameSortedUnalignedSam the sorted unaligned SAM file + * @param queryNameSortedAlignedSam the sorted aligned SAM file + * @param mergedSam the output file where the merged results should be stored */ private void mergeUnalignedAndAlignedBams(File queryNameSortedUnalignedSam, File queryNameSortedAlignedSam, File mergedSam) { SAMFileReader usam = new SAMFileReader(queryNameSortedUnalignedSam); @@ -240,36 +266,54 @@ public class AnnotateSecondaryBase extends CommandLineProgram { SAMRecord asr = asamIt.next(); do { - while (usamIt.hasNext() && asamIt.hasNext() && !usr.getReadName().matches(asr.getReadName()) && usr.getReadNegativeStrandFlag() == asr.getReadNegativeStrandFlag()) { - int comp = usr.getReadName().compareTo(asr.getReadName()); - - if (comp < 0) { - usr = usamIt.next(); - } else if (comp > 0) { - asr = asamIt.next(); - } - } - - if (usr.getReadName().matches(asr.getReadName()) && usr.getReadNegativeStrandFlag() == asr.getReadNegativeStrandFlag()) { + // Pull a record from the unaligned file and advance the aligned file until we find the matching record. We + // don't have to advance the unaligned file until we find our record because we assume every record we generate + // will be in the aligned file (which also contains unaligned reads). + // + // If Picard ever stops storing the unaligned reads, this logic will need to be rewritten. + + if (usr.getReadName().equals(asr.getReadName()) && usr.getFirstOfPairFlag() == asr.getFirstOfPairFlag()) { byte[] sqtag = (byte[]) usr.getAttribute("SQ"); + String usrread = usr.getReadString(); + String asrread = asr.getReadString(); - if (sqtag != null) { - if (asr.getReadNegativeStrandFlag()) { - QualityUtils.reverseComplementCompressedQualityArray(sqtag); - - asr.setAttribute("SQ", sqtag); - } + if (asr.getReadNegativeStrandFlag()) { + sqtag = QualityUtils.reverseComplementCompressedQualityArray(sqtag); + asrread = BaseUtils.simpleReverseComplement(asrread); } - samOut.addAlignment(asr); + if (usrread != null && asrread != null && !usrread.equals(asrread)) { + throw new StingException( + String.format("Purportedly identical unaligned and aligned reads have different read sequences. Perhaps this lane was reanalyzed by the Illumina software but not the production pipeline?\n '%s:%b:%s'\n '%s:%b:%s'", + usr.getReadName(), usr.getFirstOfPairFlag(), usrread, + asr.getReadName(), asr.getFirstOfPairFlag(), asrread)); + } + + asr.setAttribute("SQ", sqtag); usr = usamIt.next(); + } else { asr = asamIt.next(); } + + samOut.addAlignment(asr); } while (usamIt.hasNext() && asamIt.hasNext()); usam.close(); asam.close(); samOut.close(); } + + /** + * For debugging purposes. Spits out relevant information for two SAMRecords. + * + * @param sra first SAMRecord + * @param srb second SAMRecord + */ + private void printRecords(SAMRecord sra, SAMRecord srb) { + System.out.println("a: " + sra.getReadName() + " " + sra.getFirstOfPairFlag()); + System.out.println("b: " + srb.getReadName() + " " + srb.getFirstOfPairFlag()); + System.out.println(); + + } } diff --git a/java/src/org/broadinstitute/sting/secondarybase/BasecallingBaseModel.java b/java/src/org/broadinstitute/sting/secondarybase/BasecallingBaseModel.java index 9ce8a31d7..2afb05f10 100755 --- a/java/src/org/broadinstitute/sting/secondarybase/BasecallingBaseModel.java +++ b/java/src/org/broadinstitute/sting/secondarybase/BasecallingBaseModel.java @@ -39,7 +39,9 @@ public class BasecallingBaseModel { private boolean readyToCall = false; /** - * Constructor for BasecallingBaseModel + * Constructor for BasecallingBaseModel. + * + * @param correctForContext should we attempt to correct for contextual sequence effects? */ public BasecallingBaseModel(boolean correctForContext) { this.correctForContext = correctForContext; @@ -67,8 +69,12 @@ public class BasecallingBaseModel { alg = new Algebra(); } + /** * Add a single training point to the model to estimate the means. + * + * @param probMatrix the matrix of probabilities for the base + * @param fourIntensity the four raw intensities for the base */ public void addMeanPoint(double[][] probMatrix, double[] fourIntensity) { for (int basePrevIndex = 0; basePrevIndex < numTheories; basePrevIndex++) { @@ -88,6 +94,9 @@ public class BasecallingBaseModel { /** * Add a single training point to the model to estimate the covariances. + * + * @param probMatrix the matrix of probabilities for the base + * @param fourIntensity the four raw intensities for the base */ public void addCovariancePoint(double[][] probMatrix, double[] fourIntensity) { for (int basePrevIndex = 0; basePrevIndex < numTheories; basePrevIndex++) { @@ -133,7 +142,7 @@ public class BasecallingBaseModel { } /** - * Compute the likelihood matrix for a base + * Compute the likelihood matrix for a base. * * @param cycle the cycle we're calling right now * @param fourintensity the four intensities of the current cycle's base @@ -163,6 +172,11 @@ public class BasecallingBaseModel { return likedist; } + /** + * Write the model parameters to disk. + * + * @param outparam the file in which the output parameters should be stored + */ public void write(File outparam) { try { PrintWriter writer = new PrintWriter(outparam); diff --git a/java/src/org/broadinstitute/sting/secondarybase/BasecallingReadModel.java b/java/src/org/broadinstitute/sting/secondarybase/BasecallingReadModel.java index 26788fd07..01c726efa 100644 --- a/java/src/org/broadinstitute/sting/secondarybase/BasecallingReadModel.java +++ b/java/src/org/broadinstitute/sting/secondarybase/BasecallingReadModel.java @@ -20,16 +20,31 @@ public class BasecallingReadModel { private BasecallingBaseModel[] basemodels = null; private boolean correctForContext = true; + /** + * Constructs a BasecallingReadModel with space for a given read length. + * + * @param readLength the length of the reads to which this model will apply. + */ public BasecallingReadModel(int readLength) { initialize(readLength); } + /** + * Constructs a BasecallingReadModel and trains it using the specified training data. + * + * @param trainingData a set of RawReads from which the model will be trained. + */ public BasecallingReadModel(ArrayList trainingData) { initialize(trainingData.get(0).getReadLength()); train(trainingData); } + /** + * Initialize the model and set default parameters for each cycle appropriately. + * + * @param readLength the length of the reads to which this model will apply. + */ public void initialize(int readLength) { basemodels = new BasecallingBaseModel[readLength]; @@ -38,8 +53,12 @@ public class BasecallingReadModel { } } + /** + * Train the model using the specified training data. + * + * @param trainingData a set of RawReads from which the model will be trained. + */ public void train(ArrayList trainingData) { - //for (int readIndex = 0; readIndex < trainingData.size(); readIndex++) { for ( RawRead read : trainingData ) { addMeanPoints(read); } @@ -49,10 +68,22 @@ public class BasecallingReadModel { } } + /** + * Add a training point for the mean intensity values per base and per cycle. + * + * @param cycle the cycle number (0-based) + * @param probMatrix the probability matrix for the base + * @param fourintensity the four raw intensities for the base + */ public void addMeanPoint(int cycle, double[][] probMatrix, double[] fourintensity) { basemodels[cycle].addMeanPoint(probMatrix, fourintensity); } + /** + * Add a training point for the mean intensity values per base in all cycles. + * + * @param read the raw read + */ public void addMeanPoints(RawRead read) { byte[] seqs = read.getSequence(); byte[] quals = read.getQuals(); @@ -74,10 +105,22 @@ public class BasecallingReadModel { } } + /** + * Add a training point for the intensity covariance matrix per base and per cycle. + * + * @param cycle the cycle number (0-based) + * @param probMatrix the probability matrix for the base + * @param fourintensity the four raw intensities for the base + */ public void addCovariancePoint(int cycle, double[][] probMatrix, double[] fourintensity) { basemodels[cycle].addCovariancePoint(probMatrix, fourintensity); } + /** + * Add a training point for the intensity covariance matrix per base in all cycles. + * + * @param read the raw read + */ public void addCovariancePoints(RawRead read) { byte[] seqs = read.getSequence(); byte[] quals = read.getQuals(); @@ -99,10 +142,26 @@ public class BasecallingReadModel { } } + /** + * Compute the likelihoods that a given set of intensities yields each possible base. + * + * @param cycle the cycle number (0-based) + * @param fourintensity the four raw intensities for the base + * @return the matrix of likelihoods + */ public double[][] computeLikelihoods(int cycle, double[] fourintensity) { return basemodels[cycle].computeLikelihoods(cycle, fourintensity); } + /** + * Compute the probabilities that a given set of intensities yields each possible base. + * + * @param cycle the cycle number (0-based) + * @param basePrev the previous base + * @param qualPrev the previous base's quality score + * @param fourintensity the four raw intensities for the base + * @return the probability distribution over the four base possibilities + */ public FourProb computeProbabilities(int cycle, char basePrev, byte qualPrev, double[] fourintensity) { double[][] likes = computeLikelihoods(cycle, fourintensity); @@ -133,6 +192,12 @@ public class BasecallingReadModel { return new FourProb(likes); } + /** + * Call the bases in the given RawRead. + * + * @param read the RawRead + * @return the basecalled read + */ public FourProbRead call(RawRead read) { FourProbRead fpr = new FourProbRead(read.getReadLength()); @@ -152,6 +217,15 @@ public class BasecallingReadModel { return fpr; } + /** + * Return the probability matrix given the previous cycle's base, the current cycle's base, and the current base's probability. + * + * @param cycle the cycle number (0-based) + * @param basePrev the previous base + * @param baseCur the current base + * @param probCur the probability of the current base + * @return the probability matrix of the base + */ public double[][] getBaseProbabilityMatrix(int cycle, char basePrev, char baseCur, double probCur) { double[][] dist = new double[(correctForContext && cycle > 0) ? 4 : 1][4]; @@ -169,6 +243,11 @@ public class BasecallingReadModel { return dist; } + /** + * Write model parameters to disk. + * + * @param dir the directory in which model parameters should be stored. + */ public void write(File dir) { for (int cycle = 0; cycle < basemodels.length; cycle++) { File outparam = new File(dir.getPath() + "/param." + cycle + ".r"); diff --git a/java/src/org/broadinstitute/sting/secondarybase/BasecallingStats.java b/java/src/org/broadinstitute/sting/secondarybase/BasecallingStats.java index 908f79c17..c1be06c97 100755 --- a/java/src/org/broadinstitute/sting/secondarybase/BasecallingStats.java +++ b/java/src/org/broadinstitute/sting/secondarybase/BasecallingStats.java @@ -78,7 +78,7 @@ public class BasecallingStats { } /** - * Returns basecalling stats info in a nicely formatted string + * Returns basecalling stats info in a nicely formatted string. * * @return nicely formatted string containing basecalling stats */ @@ -87,7 +87,7 @@ public class BasecallingStats { } /** - * Periodically print a line containing basecalling stats + * Periodically print a line containing basecalling stats. * * @param interval the periodicity of the messages given in number of bases observed */ @@ -98,7 +98,7 @@ public class BasecallingStats { } /** - * Immediately print a line containing basecalling stats + * Immediately print a line containing basecalling stats. */ public void notifyNow() { System.out.printf("%s\n", toString()); diff --git a/java/src/org/broadinstitute/sting/secondarybase/BasecallingTrainer.java b/java/src/org/broadinstitute/sting/secondarybase/BasecallingTrainer.java index 8621bbf4e..8ad85aac4 100755 --- a/java/src/org/broadinstitute/sting/secondarybase/BasecallingTrainer.java +++ b/java/src/org/broadinstitute/sting/secondarybase/BasecallingTrainer.java @@ -40,7 +40,7 @@ public class BasecallingTrainer { } /** - * Get the training data array list + * Get the training data array list. * * @return the arraylist of raw training reads */ @@ -49,7 +49,7 @@ public class BasecallingTrainer { } /** - * Set the training data array list + * Set the training data array list. * * @param trainingData the arraylist of raw training reads */ @@ -88,7 +88,7 @@ public class BasecallingTrainer { } /** - * Load a training set from perfect reads in an already-aligned bam file + * Load a training set from perfect reads in an already-aligned bam file. * * @param samIn the SAM/BAM file to load the reads from * @param reference the reference to which the reads should be compared @@ -167,6 +167,12 @@ public class BasecallingTrainer { return trainingReads; } + /** + * Correlate the perfect reads with their intensities (at least, theoretically). This doesn't work right now... + * + * @param trainingReads the set of training reads, hashed by tile + * @return the final training set with intensities added in + */ private ArrayList correlateReadsAndIntensities(Vector> trainingReads) { ArrayList newTrainingData = new ArrayList(trainingLimit); diff --git a/java/src/org/broadinstitute/sting/secondarybase/FourProb.java b/java/src/org/broadinstitute/sting/secondarybase/FourProb.java index 93ec78794..b0bba9934 100755 --- a/java/src/org/broadinstitute/sting/secondarybase/FourProb.java +++ b/java/src/org/broadinstitute/sting/secondarybase/FourProb.java @@ -50,6 +50,7 @@ public class FourProb { /** * Returns the base label of the base at the specified rank. + * * @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned * @return the base label (A, C, G, T). */ @@ -57,6 +58,7 @@ public class FourProb { /** * Returns the probability of the base at the specified rank. + * * @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned * @return the probability of the base (0.0-1.0) */ @@ -64,6 +66,7 @@ public class FourProb { /** * Returns the quality score of the base at the specified rank. + * * @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned * @return the quality score of the base (0-40) */ @@ -71,6 +74,7 @@ public class FourProb { /** * A utility method to convert a base index into a base label. + * * @param baseIndex the index of the base (0, 1, 2, 3). * @return A, C, G, T, or '.' if the base index can't be understood. */ diff --git a/java/src/org/broadinstitute/sting/secondarybase/FourProbRead.java b/java/src/org/broadinstitute/sting/secondarybase/FourProbRead.java index c2f1ae402..c5e74a1c4 100755 --- a/java/src/org/broadinstitute/sting/secondarybase/FourProbRead.java +++ b/java/src/org/broadinstitute/sting/secondarybase/FourProbRead.java @@ -10,7 +10,7 @@ import java.util.ArrayList; */ public class FourProbRead extends ArrayList { /** - * Initialize the container with the specified capacity + * Initialize the container with the specified capacity. * * @param initialCapacity the number of bases in the read */ @@ -23,7 +23,7 @@ public class FourProbRead extends ArrayList { * * @param cycleStart the starting cycle (0-based, inclusive) * @param cycleStop the ending cycle (0-based, inclusive) - * @return + * @return a FourProbRead that is a subset of this FourProbRead */ public FourProbRead getSubset(int cycleStart, int cycleStop) { FourProbRead subset = new FourProbRead(cycleStop - cycleStart + 1); @@ -36,7 +36,7 @@ public class FourProbRead extends ArrayList { } /** - * Get the read sequence at a specified rank + * Get the read sequence at a specified rank. * * @param rank the rank of the sequence to return (0=best, 1=second-best, 2=third-best, 3=fourth-best) * @return the read sequence at the specified rank @@ -44,9 +44,7 @@ public class FourProbRead extends ArrayList { public String getBaseSequenceAtGivenRank(int rank) { String pseq = ""; - for (int cycle = 0; cycle < this.size(); cycle++) { - FourProb fp = this.get(cycle); - + for ( FourProb fp : this ) { pseq += fp.baseAtRank(rank); } @@ -54,7 +52,8 @@ public class FourProbRead extends ArrayList { } /** - * Get the primary read sequence + * Get the primary read sequence. + * * @return the primary read sequence */ public String getPrimaryBaseSequence() { @@ -62,7 +61,8 @@ public class FourProbRead extends ArrayList { } /** - * Get the secondary read sequence + * Get the secondary read sequence. + * * @return the secondary read sequence */ public String getSecondaryBaseSequence() { @@ -73,7 +73,7 @@ public class FourProbRead extends ArrayList { * Get the SAM spec-conformant SQ tag that will be written to the SAM/BAM file. * * @param rr the raw read - * @return the byte array for the SQ tag (first two bits: the base identity, the last six bits: -10*log10(p3/p2). + * @return the byte array for the SQ tag (first two bits: the base identity, the last six bits: -10*log10(p3/p2) */ public byte[] getSQTag(RawRead rr) { byte[] sqtag = new byte[this.size()]; @@ -89,8 +89,6 @@ public class FourProbRead extends ArrayList { double qualdiff = -10.0*Math.log10(fp.probAtRank(2)/fp.probAtRank(1)); sqtag[cycle] = QualityUtils.baseAndProbDiffToCompressedQuality(fpSecondaryBaseIndex, qualdiff); - - //System.out.println("SQTAG: " + qualdiff + " " + sqtag[cycle] + " " + QualityUtils.compressedQualityToProbDiff(sqtag[cycle])); } return sqtag; diff --git a/java/src/org/broadinstitute/sting/secondarybase/IlluminaParser.java b/java/src/org/broadinstitute/sting/secondarybase/IlluminaParser.java index 611646736..c25fe695e 100755 --- a/java/src/org/broadinstitute/sting/secondarybase/IlluminaParser.java +++ b/java/src/org/broadinstitute/sting/secondarybase/IlluminaParser.java @@ -9,10 +9,20 @@ import java.io.File; import java.io.FilenameFilter; import java.util.Arrays; import java.util.Comparator; -import java.util.Iterator; import java.util.regex.Matcher; import java.util.regex.Pattern; +/** + * IlluminaParser parses raw Illumina data (raw intensities, basecalled read sequences and quality scores) + * and presents it to the developer in an easy-to-use form. It also permits random tile jumping. + * + * WARNING: This parser does not understand newer GAPipeline data formats, and instead relies on the older + * data formats that may have been generated with older Illumina tools. As a result, this may return + * suboptimal data. This parser only exists temporarily until the Picard team writes a much more sensible + * version. Proceed with caution. + * + * @author Kiran Garimella + */ public class IlluminaParser implements Closeable { private File bustardDir; private File firecrestDir; @@ -26,6 +36,12 @@ public class IlluminaParser implements Closeable { private PasteParser currentTileParser; private String[][] currentParseResult; + /** + * Construct an IlluminaParser given the Bustard directory and lane. Infer the Firecrest directory. + * + * @param bustardDir the Illumina Bustard directory + * @param lane the Illumina lane + */ public IlluminaParser(File bustardDir, int lane) { this.bustardDir = bustardDir; this.firecrestDir = bustardDir.getParentFile(); @@ -34,6 +50,13 @@ public class IlluminaParser implements Closeable { initializeParser(); } + /** + * Construct an IlluminaParser given the Bustard directory, Firecrest directory and lane. + * + * @param bustardDir the Illumina Bustard directory + * @param firecrestDir the Illumina Firecrest directory + * @param lane the Illumina lane + */ public IlluminaParser(File bustardDir, File firecrestDir, int lane) { this.bustardDir = bustardDir; this.firecrestDir = firecrestDir; @@ -42,6 +65,9 @@ public class IlluminaParser implements Closeable { initializeParser(); } + /** + * Initialize the parser and seek to the first tile. + */ private void initializeParser() { intfiles = firecrestDir.listFiles(getFilenameFilter("int")); seqfiles = bustardDir.listFiles(getFilenameFilter("seq")); @@ -65,6 +91,12 @@ public class IlluminaParser implements Closeable { // Todo: put some more consistency checks here } + /** + * Get the filename filter for files of a given type. + * + * @param suffix the type (i.e. 'int', 'seq', 'prb'). + * @return the filename filter + */ private FilenameFilter getFilenameFilter(final String suffix) { return new FilenameFilter() { public boolean accept(File file, String s) { @@ -76,6 +108,11 @@ public class IlluminaParser implements Closeable { }; } + /** + * Get a comparator that sorts by tile. + * + * @return the comparator that sorts by tile. + */ private Comparator getTileSortingComparator() { return new Comparator() { public int compare(File file1, File file2) { @@ -99,8 +136,19 @@ public class IlluminaParser implements Closeable { }; } + /** + * Return the number of tiles. + * + * @return the number of tiles. + */ public int numTiles() { return intfiles.length; } + /** + * Seek to a specified tile. + * + * @param tile the tile to which we should seek + * @return true if we were able to seek to the tile, false if otherwise + */ public boolean seekToTile(int tile) { if (tile < intfiles.length - 1) { currentTileIndex = tile - 1; @@ -117,10 +165,20 @@ public class IlluminaParser implements Closeable { return false; } + /** + * Returns whether the parser has any more data to go through. + * + * @return true if there's data left, false if otherwise + */ public boolean hasNext() { return (currentTileParser.hasNext() || seekToTile(currentTileIndex + 1)); } + /** + * Advance the parser to the next read. + * + * @return true if successful, false if otherwise + */ public boolean next() { if (hasNext()) { currentParseResult = currentTileParser.next(); @@ -131,22 +189,45 @@ public class IlluminaParser implements Closeable { return false; } + /** + * Returns the result from the current parse as an matrix of Strings. + * + * @return the matrix of Strings containing the current parse result + */ public String[][] getCurrentParseResult() { return currentParseResult; } + /** + * Removes, um, something, but in reality, does nothing. + */ public void remove() { throw new UnsupportedOperationException("IlluminaParser.remove() method is not supported."); } + /** + * Close the current tile. + */ public void close() { currentTileParser.close(); } + /** + * Returns a raw read containing the raw intensities, read sequence, and quality scores. + * + * @return the raw read + */ public RawRead getRawRead() { return getSubset(0, currentParseResult[1][4].length() - 1); } + /** + * Returns a subset of the current parse result as a raw read. + * + * @param cycleStart the starting cycle for the desired subset + * @param cycleStop the ending cycle for the desired subset + * @return the subset of the current parse result as a raw read + */ public RawRead getSubset(int cycleStart, int cycleStop) { return new RawRead(currentParseResult, cycleStart, cycleStop); } diff --git a/java/src/org/broadinstitute/sting/secondarybase/RawRead.java b/java/src/org/broadinstitute/sting/secondarybase/RawRead.java index 36c716303..1cf7d7e1b 100755 --- a/java/src/org/broadinstitute/sting/secondarybase/RawRead.java +++ b/java/src/org/broadinstitute/sting/secondarybase/RawRead.java @@ -55,93 +55,93 @@ public class RawRead { } /** - * Get lane number of read + * Get lane number of read. * * @return lane number of read */ public byte getLane() { return lane; } /** - * Get tile number of read + * Get tile number of read. * * @return tile number of read */ public int getTile() { return tile; } /** - * Get x-coordinate of read + * Get x-coordinate of read. * * @return x-coordinate of read */ public int getXCoordinate() { return x; } /** - * Get y-coordinate of read + * Get y-coordinate of read. * * @return y-coordinate of read */ public int getYCoordinate() { return y; } /** - * Get read key (lane:tile:x:y) + * Get read key (lane:tile:x:y). * * @return read key (lane:tile:x:y) */ public String getReadKey() { return String.format("%d:%d:%d:%d", lane, tile, x, y); } /** - * Get the read sequence between the cycles specified in the constructor as a byte array + * Get the read sequence between the cycles specified in the constructor as a byte array. * * @return read sequence */ public byte[] getSequence() { return sequence; } /** - * Set the read sequence from a byte array + * Set the read sequence from a byte array. * * @param sequence the read sequence in byte array form */ public void setSequence(byte[] sequence) { this.sequence = sequence; } /** - * Get the read sequence as a string + * Get the read sequence as a String. * - * @return the read sequence in string form + * @return the read sequence in String form */ public String getSequenceAsString() { return new String(getSequence()); } /** - * Get the quals + * Get the quals. * * @return a byte array of quals */ public byte[] getQuals() { return quals; } /** - * Set the quals + * Set the quals. * * @param quals a byte array of quals */ public void setQuals(byte[] quals) { this.quals = quals; } /** - * Get the raw read intensities + * Get the raw read intensities. * * @return the (readLength)x(numChannels) array of raw intensities */ public short[][] getIntensities() { return intensities; } /** - * Set the raw intensities + * Set the raw intensities. * * @param intensities the (readLength)x(numChannels) array of raw intensities */ public void setIntensities(short[][] intensities) { this.intensities = intensities; } /** - * Get the read length + * Get the read length. * * @return the read length */