From ceffefa6a6a4d470f27853017aea66678c60e685 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Tue, 27 Sep 2011 10:18:58 -0400 Subject: [PATCH 01/19] Intermediate version with banded pair HMM --- .../indels/PairHMMIndelErrorModel.java | 184 ++++++++++++------ 1 file changed, 121 insertions(+), 63 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 31e9819ab..706b3abf7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -31,6 +31,7 @@ import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -38,6 +39,8 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import java.io.File; +import java.io.FileWriter; +import java.io.PrintStream; import java.util.Arrays; import java.util.HashMap; import java.util.LinkedHashMap; @@ -478,6 +481,8 @@ public class PairHMMIndelErrorModel { final int X_METRIC_LENGTH = readBases.length+1; final int Y_METRIC_LENGTH = haplotypeBases.length+1; + final int BWIDTH = 10; + // initialize path metric and traceback memories for likelihood computation double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; @@ -507,6 +512,96 @@ public class PairHMMIndelErrorModel { bestActionArrayY[0][j] = bestActionArrayM[0][j] = bestActionArrayX[0][j] = LEFT_GOTO_Y; } + final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH -1; + final int elemsInDiag = Math.min(X_METRIC_LENGTH, Y_METRIC_LENGTH); + + int idxWithMaxElement = 0; + + for (int diag=0; diag < numDiags; diag++) { + + int indI = 0; + int indJ = diag; + if (diag >= Y_METRIC_LENGTH ) { + indI = diag-(Y_METRIC_LENGTH-1); + indJ = Y_METRIC_LENGTH-1; + } + + + double maxElementInDiag = Double.NEGATIVE_INFINITY; + + int idxLow = idxWithMaxElement - BWIDTH; + int idxHigh = idxWithMaxElement + BWIDTH; + + if (idxLow < 0) + idxLow = 0; + + if (idxHigh > elemsInDiag ) + idxHigh = elemsInDiag; + + + // for each diagonal, compute max element, and center band around it for next diagonal. + // + // start point is idxOfMax-BWIDTH, end pt is idxMax+BWIDTH + + for (int el = idxLow; el < idxHigh; el++) { + // first row and first column of all matrices had been pre-filled before + int im1 = indI -1; + int jm1 = indJ - 1; + if (indI > 0 && indJ > 0) { + // update current point + byte x = readBases[im1]; + byte y = haplotypeBases[jm1]; + byte qual = readQuals[im1]; + + double bestMetric = 0.0; + + // compute metric for match/mismatch + // workaround for reads whose bases quality = 0, + if (qual < 1) + qual = 1; + + if (qual > MAX_CACHED_QUAL) + qual = MAX_CACHED_QUAL; + + double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual]; + + matchMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[im1][jm1] + pBaseRead, XMetricArray[im1][jm1] + pBaseRead, + YMetricArray[im1][jm1] + pBaseRead); + + c = currentGOP[jm1]; + d = currentGCP[jm1]; + if (indJ == Y_METRIC_LENGTH-1) + c = d = END_GAP_COST; + + + XMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[im1][indJ] + c, XMetricArray[im1][indJ] + d); + + // update Y array + //c = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentGOP[jm1]); + //d = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentGCP[jm1]); + c = currentGOP[jm1]; + d = currentGCP[jm1]; + if (indI == X_METRIC_LENGTH-1) + c = d = END_GAP_COST; + + YMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[indI][jm1] + c, YMetricArray[indI][jm1] + d); + bestMetric = MathUtils.softMax(matchMetricArray[indI][indJ], XMetricArray[indI][indJ], YMetricArray[indI][indJ]); + + if (bestMetric > maxElementInDiag) { + maxElementInDiag = bestMetric; + idxWithMaxElement = el ; + } + + } + indI++; + if (indI >=X_METRIC_LENGTH ) + break; + indJ--; + if (indJ < 0) + break; + } + } +/* for (int indI=1; indI < X_METRIC_LENGTH; indI++) { int im1 = indI-1; for (int indJ=1; indJ < Y_METRIC_LENGTH; indJ++) { @@ -526,6 +621,8 @@ public class PairHMMIndelErrorModel { if (qual > MAX_CACHED_QUAL) qual = MAX_CACHED_QUAL; + //qual = 30; + double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual]; @@ -613,78 +710,39 @@ public class PairHMMIndelErrorModel { } } - + */ double bestMetric; double metrics[] = new double[3]; - int bestTable=0, bestI=X_METRIC_LENGTH - 1, bestJ=Y_METRIC_LENGTH - 1; + final int bestI=X_METRIC_LENGTH - 1, bestJ=Y_METRIC_LENGTH - 1; metrics[MATCH_OFFSET] = matchMetricArray[bestI][bestJ]; metrics[X_OFFSET] = XMetricArray[bestI][bestJ]; metrics[Y_OFFSET] = YMetricArray[bestI][bestJ]; - if (doViterbi) { - bestTable = MathUtils.maxElementIndex(metrics); - bestMetric = metrics[bestTable]; - } - else - bestMetric = MathUtils.softMax(metrics); + bestMetric = MathUtils.softMax(metrics); - // Do traceback (needed only for debugging!) - if (DEBUG && doViterbi) { + if (DEBUG) { + PrintStream outx, outy, outm; + double[][] sumMetrics = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + try { + outx = new PrintStream("../../UGOptim/datax.txt"); + outy = new PrintStream("../../UGOptim/datay.txt"); + outm = new PrintStream("../../UGOptim/datam.txt"); - int bestAction; - int i = bestI; - int j = bestJ; - - - System.out.println("Affine gap NW"); - - - String haplotypeString = new String (haplotypeBases); - String readString = new String(readBases); - - - while (i >0 || j >0) { - if (bestTable == X_OFFSET) { - // insert gap in Y - haplotypeString = haplotypeString.substring(0,j)+"-"+haplotypeString.substring(j); - bestAction = bestActionArrayX[i][j]; + for (int indI=0; indI < X_METRIC_LENGTH; indI++) { + for (int indJ=0; indJ < Y_METRIC_LENGTH; indJ++) { + metrics[MATCH_OFFSET] = matchMetricArray[indI][indJ]; + metrics[X_OFFSET] = XMetricArray[indI][indJ]; + metrics[Y_OFFSET] = YMetricArray[indI][indJ]; + sumMetrics[indI][indJ] = MathUtils.softMax(metrics); + outx.format("%4.1f ", metrics[X_OFFSET]); + outy.format("%4.1f ", metrics[Y_OFFSET]); + outm.format("%4.1f ", metrics[MATCH_OFFSET]); + } + outx.println(); outm.println();outy.println(); } - else if (bestTable == Y_OFFSET) { - readString = readString.substring(0,i)+"-"+readString.substring(i); - bestAction = bestActionArrayY[i][j]; + outm.close(); outx.close(); outy.close(); + } catch (java.io.IOException e) { throw new UserException("bla");} + } - } - else { - bestAction = bestActionArrayM[i][j]; - } - System.out.print(bestAction); - - - // bestAction contains action to take at next step - // encoding of bestAction: upper 2 bits = direction, lower 2 bits = next table - - // bestTable and nextDirection for next step - bestTable = bestAction & 0x3; - int nextDirection = bestAction >> 2; - if (nextDirection == UP) { - i--; - } else if (nextDirection == LEFT) { - j--; - } else { // if (nextDirection == DIAG) - i--; j--; - } - - } - - - - - System.out.println("\nAlignment: "); - System.out.println("R:"+readString); - System.out.println("H:"+haplotypeString); - System.out.println(); - - - } if (DEBUG) System.out.format("Likelihood: %5.4f\n", bestMetric); From 89544c209c0344fbf9aca9f283daa0fa428f9398 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 28 Sep 2011 11:19:17 -0400 Subject: [PATCH 03/19] Fixing contracts changed return type to Pair, changing contracts accordingly. --- .../java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 5fa410922..14ebbaa6f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -825,7 +825,7 @@ public class ReadUtils { * @return the read coordinate corresponding to the requested reference coordinate. (see warning!) */ @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"}) - @Ensures({"result >= 0", "result < read.getReadLength()"}) + @Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"}) public static Pair getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) { int readBases = 0; int refBases = 0; From 1e32281a15dc773ec41aaf5a17342d37f181fb43 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 28 Sep 2011 11:31:20 -0400 Subject: [PATCH 04/19] Fix to not show -null when missing short name argument --- .../utils/help/GenericDocumentationHandler.java | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java index 7ea77a939..b100b3e1b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java @@ -432,11 +432,17 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { * Returns a Pair of (main, synonym) names for argument with fullName s1 and * shortName s2. The main is selected to be the longest of the two, provided * it doesn't exceed MAX_DISPLAY_NAME, in which case the shorter is taken. - * @param s1 - * @param s2 - * @return + * + * @param s1 the short argument name without -, or null if not provided + * @param s2 the long argument name without --, or null if not provided + * @return A pair of fully qualified names (with - or --) for the argument. The first + * element is the primary display name while the second (potentially null) is a + * synonymous name. */ Pair displayNames(String s1, String s2) { + s1 = s1 == null ? null : "-" + s1; + s2 = s2 == null ? null : "--" + s2; + if ( s1 == null ) return new Pair(s2, null); if ( s2 == null ) return new Pair(s1, null); @@ -510,7 +516,7 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { */ protected Map docForArgument(FieldDoc fieldDoc, ArgumentSource source, ArgumentDefinition def) { Map root = new HashMap(); - Pair names = displayNames("-" + def.shortName, "--" + def.fullName); + Pair names = displayNames(def.shortName, def.fullName); root.put("name", names.getFirst() ); From a5006831d7b52748a77aa86512979d220c9540a7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 28 Sep 2011 11:35:52 -0400 Subject: [PATCH 05/19] Shows "" not empty space when default string value is "" --- .../sting/utils/help/GenericDocumentationHandler.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java index b100b3e1b..20f9eccd3 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java @@ -325,11 +325,14 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { return Arrays.toString((Object[])value); else throw new RuntimeException("Unexpected array type in prettyPrintValue. Value was " + value + " type is " + type); - } else if ( RodBinding.class.isAssignableFrom(value.getClass() ) ) + } else if ( RodBinding.class.isAssignableFrom(value.getClass() ) ) { // annoying special case to handle the UnBound() constructor return "none"; - - return value.toString(); + } else if ( value instanceof String ) { + return value.equals("") ? "\"\"" : value; + } else { + return value.toString(); + } } /** From bb619a9a3c946cef37ccbcadf8f00af57db5dd8e Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 28 Sep 2011 13:13:03 -0400 Subject: [PATCH 06/19] Fixing docs --- .../broadinstitute/sting/gatk/walkers/annotator/SampleList.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java index ff409484d..ee08cfa3b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java @@ -42,7 +42,7 @@ import java.util.List; import java.util.Map; /** - * List all of the samples in the info field + * List all of the polymorphic samples. */ public class SampleList extends InfoFieldAnnotation { From 1b45f217749f84c1af6833886124bc3c6f071ebd Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 28 Sep 2011 13:18:32 -0400 Subject: [PATCH 07/19] Removing this command-line tool. Purposely not doing this in stable so that users who may still use it have time to find other options. But the docs are no longer on the wiki. --- .../gatk/refdata/indexer/RMDIndexer.java | 130 ------------------ 1 file changed, 130 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java deleted file mode 100644 index 9e5a95d10..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java +++ /dev/null @@ -1,130 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata.indexer; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.apache.log4j.Logger; -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.Tribble; -import org.broad.tribble.index.Index; -import org.broad.tribble.index.IndexFactory; -import org.broad.tribble.util.LittleEndianOutputStream; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.CommandLineProgram; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; - -import java.io.File; -import java.io.FileOutputStream; - -/** - * a utility class that can create an index, written to a target location. This is useful when you're unable to write to the directory - * in which an index is located, or if you'd like to pre-index files to save time. - */ -public class RMDIndexer extends CommandLineProgram { - @Argument(shortName="in", fullName="inputFile", doc="The reference meta data file to index", required = true) - File inputFileSource = null; - - @Argument(shortName="t", fullName="type", doc="The reference meta data file format (e.g. vcf, bed)", required = true) - String inputFileType = null; - - @Input(fullName = "referenceSequence", shortName = "R", doc = "The reference to use when indexing; this sequence will be set in the index", required = true) - public File referenceFile = null; - - @Input(shortName = "i", fullName = "indexFile", doc = "Where to write the index to (as a file), if not supplied we write to .idx", required = false) - public File indexFile = null; - - @Argument(shortName = "ba", fullName = "balanceApproach", doc="the index balancing approach to take", required=false) - IndexFactory.IndexBalanceApproach approach = IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME; - - private static Logger logger = Logger.getLogger(RMDIndexer.class); - private IndexedFastaSequenceFile ref = null; - private GenomeLocParser genomeLocParser = null; - - @Override - protected int execute() throws Exception { - - - // check parameters - // --------------------------------------------------------------------------------- - - // check the input parameters - if (referenceFile != null && !referenceFile.canRead()) - throw new IllegalArgumentException("We can't read the reference file: " - + referenceFile + ", check that it exists, and that you have permissions to read it"); - - - // set the index file to the default name if they didn't specify a file - if (indexFile == null && inputFileSource != null) - indexFile = new File(inputFileSource.getAbsolutePath() + Tribble.STANDARD_INDEX_EXTENSION); - - // check that we can create the output file - if (indexFile == null || indexFile.exists()) - throw new IllegalArgumentException("We can't write to the index file location: " - + indexFile + ", the index exists"); - - logger.info(String.format("attempting to index file: %s", inputFileSource)); - logger.info(String.format("using reference: %s", ((referenceFile != null) ? referenceFile.getAbsolutePath() : "(not supplied)"))); - logger.info(String.format("using type: %s", inputFileType)); - logger.info(String.format("writing to location: %s", indexFile.getAbsolutePath())); - - // try to index the file - // --------------------------------------------------------------------------------- - - // setup the reference - ref = new CachingIndexedFastaSequenceFile(referenceFile); - genomeLocParser = new GenomeLocParser(ref); - - // get a track builder - RMDTrackBuilder builder = new RMDTrackBuilder(ref.getSequenceDictionary(),genomeLocParser, ValidationExclusion.TYPE.ALL); - - // find the types available to the track builders - FeatureManager.FeatureDescriptor descriptor = builder.getFeatureManager().getByName(inputFileType); - - // check that the type is valid - if (descriptor == null) - throw new IllegalArgumentException("The type specified " + inputFileType + " is not a valid type. Valid type list: " + builder.getFeatureManager().userFriendlyListOfAvailableFeatures()); - - // create the codec - FeatureCodec codec = builder.getFeatureManager().createCodec(descriptor, "foo", genomeLocParser); - - // check if it's a reference dependent feature codec - if (codec instanceof ReferenceDependentFeatureCodec) - ((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(genomeLocParser); - - // get some timing info - long currentTime = System.currentTimeMillis(); - - Index index = IndexFactory.createIndex(inputFileSource, codec, approach); - - // add writing of the sequence dictionary, if supplied - builder.validateAndUpdateIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary()); - - // create the output stream, and write the index - LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); - index.write(stream); - stream.close(); - - // report and exit - logger.info("Successfully wrote the index to location: " + indexFile + " in " + ((System.currentTimeMillis() - currentTime)/1000) + " seconds"); - return 0; // return successfully - } - - - /** - * the generic call execute main - * @param argv the arguments from the command line - */ - public static void main(String[] argv) { - try { - RMDIndexer instance = new RMDIndexer(); - start(instance, argv); - System.exit(CommandLineProgram.result); - } catch (Exception e) { - exitSystemWithError(e); - } - } -} From 5c9b659c02aef41527ee07cd9c2e472a23de977b Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 28 Sep 2011 12:10:47 -0400 Subject: [PATCH 08/19] clipping both ends of the reads was modifying the original read This goes against the ReadClipper contract, and was affecting the second part of the read that spans over multiple intervals. Fixed. --- .../broadinstitute/sting/utils/clipreads/ReadClipper.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index 83db20238..275f476dc 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -93,8 +93,9 @@ public class ReadClipper { public SAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) { if (left == right) return new SAMRecord(read.getHeader()); - this.read = hardClipByReferenceCoordinates(right, -1); - return hardClipByReferenceCoordinates(-1, left); + SAMRecord leftTailRead = hardClipByReferenceCoordinates(right, -1); + ReadClipper clipper = new ReadClipper(leftTailRead); + return clipper.hardClipByReferenceCoordinatesLeftTail(left); } public SAMRecord hardClipLowQualEnds(byte lowQual) { From 3c7b7f74ef235ed9933c8227b2c89cb68b3374b0 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 28 Sep 2011 14:41:30 -0400 Subject: [PATCH 09/19] Optimized interval iteration Using a TreedSet to manipulate getToolkit.getIntervals() and being smart about which intervals to test makes interval clipping O(1) instead of O(n). --- .../sting/utils/GenomeLocComparator.java | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 public/java/src/org/broadinstitute/sting/utils/GenomeLocComparator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocComparator.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocComparator.java new file mode 100644 index 000000000..7aa9fdd65 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocComparator.java @@ -0,0 +1,56 @@ +package org.broadinstitute.sting.utils; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.Comparator; + +/** + * + * @author Mauricio Carneiro + * @since 9/28/11 + */ +public class GenomeLocComparator implements Comparator { + /** + * compares genomeLoc's contigs + * + * @param gl1 the genome loc to compare contigs + * @param gl2 the genome loc to compare contigs + * @return 0 if equal, -1 if gl2.contig is greater, 1 if gl1.contig is greater + */ + @Requires("gl2 != null") + @Ensures("result == 0 || result == 1 || result == -1") + public final int compareContigs( GenomeLoc gl1, GenomeLoc gl2 ) { + if (gl1.contigIndex == gl2.contigIndex) + return 0; + else if (gl1.contigIndex > gl2.contigIndex) + return 1; + return -1; + } + + @Requires("gl2 != null") + @Ensures("result == 0 || result == 1 || result == -1") + public int compare ( GenomeLoc gl1, GenomeLoc gl2 ) { + int result = 0; + + if ( gl1 == gl2 ) { + result = 0; + } + else if(GenomeLoc.isUnmapped(gl1)) + result = 1; + else if(GenomeLoc.isUnmapped(gl2)) + result = -1; + else { + final int cmpContig = compareContigs(gl1, gl2); + + if ( cmpContig != 0 ) { + result = cmpContig; + } else { + if ( gl1.getStart() < gl2.getStart() ) result = -1; + if ( gl1.getStart() > gl2.getStart() ) result = 1; + } + } + + return result; + } +} From ff2f4df043eae05bfc338898155a34bb50becd06 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 28 Sep 2011 16:07:00 -0400 Subject: [PATCH 10/19] Fixed hardclipping inside indel (right tail) when hard clipping the right tail of a read falls inside a deletion, clipping should fall back to the last base before the deletion to follow the ReadClipper's contract. --- .../src/org/broadinstitute/sting/utils/sam/ReadUtils.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 14ebbaa6f..e0a3a5a53 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -893,6 +893,10 @@ public class ReadUtils { // base before the deletion (see warning in function contracts) else if (fallsInsideDeletion && !endsWithinCigar) readBases += shift - 1; + + // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion + else if (fallsInsideDeletion && endsWithinCigar) + readBases--; } } From 3b73dc89fe94246eeb6f8884e1786f3e30c20678 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 28 Sep 2011 16:17:31 -0400 Subject: [PATCH 11/19] Making several esoteric arguments in the BQSR @Hidden. Adding basic support for Complete Genomics machine cycle. --- .../gatk/walkers/recalibration/CycleCovariate.java | 13 +++++++++---- .../walkers/recalibration/RecalDataManager.java | 6 ++---- .../RecalibrationArgumentCollection.java | 8 ++++++++ 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index 945d02837..5d07922a7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -159,10 +159,11 @@ public class CycleCovariate implements StandardCovariate { */ // todo -- this should be put into a common place in the code base - private static List PACBIO_NAMES = Arrays.asList("PACBIO"); private static List ILLUMINA_NAMES = Arrays.asList("ILLUMINA", "SLX", "SOLEXA"); private static List SOLID_NAMES = Arrays.asList("SOLID"); private static List LS454_NAMES = Arrays.asList("454"); + private static List COMPLETE_GENOMICS_NAMES = Arrays.asList("COMPLETE"); + private static List PACBIO_NAMES = Arrays.asList("PACBIO"); private static boolean isPlatform(SAMRecord read, List names) { String pl = read.getReadGroup().getPlatform().toUpperCase(); @@ -176,11 +177,10 @@ public class CycleCovariate implements StandardCovariate { public void getValues(SAMRecord read, Comparable[] comparable) { //----------------------------- - // ILLUMINA and SOLID + // Illumina, Solid, PacBio, and Complete Genomics //----------------------------- - - if( isPlatform(read, ILLUMINA_NAMES) || isPlatform(read, SOLID_NAMES) || isPlatform(read, PACBIO_NAMES)) { + if( isPlatform(read, ILLUMINA_NAMES) || isPlatform(read, SOLID_NAMES) || isPlatform(read, PACBIO_NAMES) || isPlatform(read, COMPLETE_GENOMICS_NAMES) ) { final int init; final int increment; if( !read.getReadNegativeStrandFlag() ) { @@ -222,6 +222,11 @@ public class CycleCovariate implements StandardCovariate { cycle += increment; } } + + //----------------------------- + // 454 + //----------------------------- + else if ( isPlatform(read, LS454_NAMES) ) { // Some bams have "LS454" and others have just "454" final int readLength = read.getReadLength(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index ac25d4f13..2daa8c025 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -245,8 +245,7 @@ public class RecalDataManager { readGroup.setPlatform( RAC.DEFAULT_PLATFORM ); ((GATKSAMRecord)read).setReadGroup( readGroup ); } else { - throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName() + - " Users must set both the default read group using the --default_read_group argument and the default platform using the --default_platform argument." ); + throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName() ); } } @@ -271,8 +270,7 @@ public class RecalDataManager { } readGroup.setPlatform( RAC.DEFAULT_PLATFORM ); } else { - throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName() + - " Users must set the default platform using the --default_platform argument." ); + throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName() ); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java index f31e2fc5b..75de84cb4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; /** * Created by IntelliJ IDEA. @@ -41,22 +42,29 @@ public class RecalibrationArgumentCollection { ////////////////////////////////// // Shared Command Line Arguments ////////////////////////////////// + @Hidden @Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the provided String.") public String DEFAULT_READ_GROUP = null; + @Hidden @Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") public String DEFAULT_PLATFORM = null; + @Hidden @Argument(fullName="force_read_group", shortName="fRG", required=false, doc="If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.") public String FORCE_READ_GROUP = null; + @Hidden @Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; + @Hidden @Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false) public int WINDOW_SIZE = 5; /** * This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score. */ + @Hidden @Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false) public int HOMOPOLYMER_NBACK = 7; + @Hidden @Argument(fullName = "exception_if_no_tile", shortName="throwTileException", doc="If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required=false) public boolean EXCEPTION_IF_NO_TILE = false; From 7e3cb45093f86a29537c3475b46ca3b919f95f96 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 28 Sep 2011 16:27:28 -0400 Subject: [PATCH 12/19] Further performance optim in banded hmm, about 60% speed improvement over current implementation now --- .../sting/gatk/walkers/indels/PairHMMIndelErrorModel.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 55cc38e04..c8328771b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -260,7 +260,7 @@ public class PairHMMIndelErrorModel { currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray); // update max in diagonal if (bandedLikelihoods) { - final double bestMetric = MathUtils.softMax(matchMetricArray[indI][indJ], XMetricArray[indI][indJ], YMetricArray[indI][indJ]); + final double bestMetric = MathUtils.max(matchMetricArray[indI][indJ], XMetricArray[indI][indJ], YMetricArray[indI][indJ]); // check if we've fallen off diagonal value by threshold if (bestMetric > maxElementInDiag) { @@ -294,7 +294,7 @@ public class PairHMMIndelErrorModel { updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases, currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray); // update max in diagonal - final double bestMetric = MathUtils.softMax(matchMetricArray[indI][indJ], XMetricArray[indI][indJ], YMetricArray[indI][indJ]); + final double bestMetric = MathUtils.max(matchMetricArray[indI][indJ], XMetricArray[indI][indJ], YMetricArray[indI][indJ]); // check if we've fallen off diagonal value by threshold if (bestMetric > maxElementInDiag) { From 0acaf2df6577505768984536d5c1cce337da7132 Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Wed, 28 Sep 2011 21:23:01 -0400 Subject: [PATCH 18/19] Fix an embarrassing issue where a specific configuration of minimal coverage over small intervals could cause reads to be dropped from the pileup. Nothing to see here... --- .../sting/gatk/executive/WindowMaker.java | 67 +++++++------------ .../providers/LocusViewTemplate.java | 14 ++++ 2 files changed, 39 insertions(+), 42 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index cfbce58ee..43ea46002 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Iterator; import java.util.List; @@ -37,7 +38,7 @@ public class WindowMaker implements Iterable, I /** * The data source for reads. Will probably come directly from the BAM file. */ - private final Iterator sourceIterator; + private final PeekableIterator sourceIterator; /** * Stores the sequence of intervals that the windowmaker should be tracking. @@ -69,7 +70,7 @@ public class WindowMaker implements Iterable, I this.sourceInfo = shard.getReadProperties(); this.readIterator = iterator; - this.sourceIterator = new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleData); + this.sourceIterator = new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleData)); this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null; } @@ -100,11 +101,6 @@ public class WindowMaker implements Iterable, I */ private final GenomeLoc locus; - /** - * Signal not to advance the iterator because we're currently sitting at the next element. - */ - private boolean atNextElement = false; - public WindowMakerIterator(GenomeLoc locus) { this.locus = locus; advance(); @@ -124,58 +120,45 @@ public class WindowMaker implements Iterable, I public boolean hasNext() { advance(); - return atNextElement; + return currentAlignmentContext != null; } public AlignmentContext next() { - advance(); - if(!atNextElement) throw new NoSuchElementException("WindowMakerIterator is out of elements for this interval."); + if(!hasNext()) throw new NoSuchElementException("WindowMakerIterator is out of elements for this interval."); - // Prepare object state for no next element. + // Consume this alignment context. AlignmentContext toReturn = currentAlignmentContext; currentAlignmentContext = null; - atNextElement = false; // Return the current element. return toReturn; } private void advance() { - // No shard boundaries specified. If currentAlignmentContext has been consumed, grab the next one. - if(locus == null) { - if(!atNextElement && sourceIterator.hasNext()) { - currentAlignmentContext = sourceIterator.next(); - atNextElement = true; - } - return; - } - - // Can't possibly find another element. Skip out early. - if(currentAlignmentContext == null && !sourceIterator.hasNext()) - return; - // Need to find the next element that is not past shard boundaries. If we travel past the edge of // shard boundaries, stop and let the next interval pick it up. - while(sourceIterator.hasNext()) { - // Seed the current alignment context first time through the loop. - if(currentAlignmentContext == null) - currentAlignmentContext = sourceIterator.next(); + while(currentAlignmentContext == null && sourceIterator.hasNext()) { + // Advance the iterator and try again. + AlignmentContext candidateAlignmentContext = sourceIterator.peek(); - // Found a match. - if(locus.containsP(currentAlignmentContext.getLocation())) { - atNextElement = true; + if(locus == null) { + // No filter present. Return everything that LocusIteratorByState provides us. + currentAlignmentContext = sourceIterator.next(); + } + else if(locus.isPast(candidateAlignmentContext.getLocation())) + // Found a locus before the current window; claim this alignment context and throw it away. + sourceIterator.next(); + else if(locus.containsP(candidateAlignmentContext.getLocation())) { + // Found a locus within the current window; claim this alignment context and call it the next entry. + currentAlignmentContext = sourceIterator.next(); + } + else if(locus.isBefore(candidateAlignmentContext.getLocation())) { + // Whoops. Skipped passed the end of the region. Iteration for this window is complete. Do + // not claim this alignment context in case it is part of the next shard. break; } - // Whoops. Skipped passed the end of the region. Iteration for this window is complete. - if(locus.isBefore(currentAlignmentContext.getLocation())) - break; - - // No more elements to examine. Iteration is complete. - if(!sourceIterator.hasNext()) - break; - - // Advance the iterator and try again. - currentAlignmentContext = sourceIterator.next(); + else + throw new ReviewedStingException("BUG: filtering locus does not contain, is not before, and is not past the given alignment context"); } } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java index acfefd627..e5cf80826 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java @@ -118,6 +118,20 @@ public abstract class LocusViewTemplate extends BaseTest { testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); } + @Test + public void readAndLocusOverlapAtLastBase() { + SAMRecord read = buildSAMRecord("chr1", 1, 5); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 5, 5))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); + } + @Test public void readOverlappingStartTest() { SAMRecord read = buildSAMRecord("chr1", 1, 10); From 4fd5630f6a65c75eaf1a80facfc55cb86b42c390 Mon Sep 17 00:00:00 2001 From: Roger Zurawicki Date: Wed, 28 Sep 2011 23:13:50 -0400 Subject: [PATCH 19/19] Added ReadClipper Unit Test * Includes tests that include HardClip to Read and Reference Coords. * Changed ReadUtils.HardClipByReferenceCoordinates from private to protected to allow for testing --- .../sting/utils/clipreads/ReadClipper.java | 2 +- .../utils/clipreads/ReadClipperUnitTest.java | 225 ++++++++++++++++++ 2 files changed, 226 insertions(+), 1 deletion(-) create mode 100644 public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index 83db20238..4c07f15f6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -68,7 +68,7 @@ public class ReadClipper { return result; } - private SAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { + protected SAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL); int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL); diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java new file mode 100644 index 000000000..1415379db --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.clipreads; + +import net.sf.samtools.*; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.LinkedList; +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: roger + * Date: 9/28/11 + * Time: 9:54 PM + * To change this template use File | Settings | File Templates. + */ +public class ReadClipperUnitTest extends BaseTest { + + // TODO: Add error messages on failed tests + + SAMRecord read, expected; + ReadClipper readClipper; + final static String BASES = "ACTG"; + final static String QUALS = "!+5?"; //ASCII values = 33,43,53,63 + + @BeforeClass + public void init() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length()); + read.setReadUnmappedFlag(true); + read.setReadBases(new String(BASES).getBytes()); + read.setBaseQualityString(new String(QUALS)); + + readClipper = new ReadClipper(read); + } + + @Test + public void testHardClipBothEndsByReferenceCoordinates() { + logger.warn("Executing testHardClipBothEndsByReferenceCoordinates"); + + //Clip whole read + Assert.assertEquals(readClipper.hardClipBothEndsByReferenceCoordinates(0,0), new SAMRecord(read.getHeader())); + //clip 1 base + expected = readClipper.hardClipBothEndsByReferenceCoordinates(0,3); + Assert.assertEquals(expected.getReadBases(), BASES.substring(1,3).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,3)); + Assert.assertEquals(expected.getCigarString(), "1H2M1H"); + + } + + @Test + public void testHardClipByReadCoordinates() { + logger.warn("Executing testHardClipByReadCoordinates"); + + //Clip whole read + Assert.assertEquals(readClipper.hardClipByReadCoordinates(0,3), new SAMRecord(read.getHeader())); + + //clip 1 base at start + expected = readClipper.hardClipByReadCoordinates(0,0); + Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); + Assert.assertEquals(expected.getCigarString(), "1H3M"); + + //clip 1 base at end + expected = readClipper.hardClipByReadCoordinates(3,3); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); + Assert.assertEquals(expected.getCigarString(), "3M1H"); + + //clip 2 bases at start + expected = readClipper.hardClipByReadCoordinates(0,1); + Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); + Assert.assertEquals(expected.getCigarString(), "2H2M"); + + //clip 2 bases at end + expected = readClipper.hardClipByReadCoordinates(2,3); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); + Assert.assertEquals(expected.getCigarString(), "2M2H"); + + } + + @Test + public void testHardClipByReferenceCoordinates() { + logger.warn("Executing testHardClipByReferenceCoordinates"); + + //Clip whole read + Assert.assertEquals(readClipper.hardClipByReferenceCoordinates(1,4), new SAMRecord(read.getHeader())); + + //clip 1 base at start + expected = readClipper.hardClipByReferenceCoordinates(-1,1); + Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); + Assert.assertEquals(expected.getCigarString(), "1H3M"); + + //clip 1 base at end + expected = readClipper.hardClipByReferenceCoordinates(3,-1); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); + Assert.assertEquals(expected.getCigarString(), "3M1H"); + + //clip 2 bases at start + expected = readClipper.hardClipByReferenceCoordinates(-1,2); + Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); + Assert.assertEquals(expected.getCigarString(), "2H2M"); + + //clip 2 bases at end + expected = readClipper.hardClipByReferenceCoordinates(2,-1); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); + Assert.assertEquals(expected.getCigarString(), "2M2H"); + + } + + @Test + public void testHardClipByReferenceCoordinatesLeftTail() { + logger.warn("Executing testHardClipByReferenceCoordinatesLeftTail"); + + //Clip whole read + Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesLeftTail(4), new SAMRecord(read.getHeader())); + + //clip 1 base at start + expected = readClipper.hardClipByReferenceCoordinatesLeftTail(1); + Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); + Assert.assertEquals(expected.getCigarString(), "1H3M"); + + //clip 2 bases at start + expected = readClipper.hardClipByReferenceCoordinatesLeftTail(2); + Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); + Assert.assertEquals(expected.getCigarString(), "2H2M"); + + } + + @Test + public void testHardClipByReferenceCoordinatesRightTail() { + logger.warn("Executing testHardClipByReferenceCoordinatesRightTail"); + + //Clip whole read + Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesRightTail(1), new SAMRecord(read.getHeader())); + + //clip 1 base at end + expected = readClipper.hardClipByReferenceCoordinatesRightTail(3); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); + Assert.assertEquals(expected.getCigarString(), "3M1H"); + + //clip 2 bases at end + expected = readClipper.hardClipByReferenceCoordinatesRightTail(2); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); + Assert.assertEquals(expected.getCigarString(), "2M2H"); + + } + + @Test + public void testHardClipLowQualEnds() { + logger.warn("Executing testHardClipByReferenceCoordinates"); + + + //Clip whole read + Assert.assertEquals(readClipper.hardClipLowQualEnds((byte)64), new SAMRecord(read.getHeader())); + + //clip 1 base at start + expected = readClipper.hardClipLowQualEnds((byte)34); + Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); + Assert.assertEquals(expected.getCigarString(), "1H3M"); + + //clip 2 bases at start + expected = readClipper.hardClipLowQualEnds((byte)44); + Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); + Assert.assertEquals(expected.getCigarString(), "2H2M"); + + // Reverse Quals sequence + readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33 + + //clip 1 base at end + expected = readClipper.hardClipLowQualEnds((byte)34); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); + Assert.assertEquals(expected.getCigarString(), "3M1H"); + + //clip 2 bases at end + expected = readClipper.hardClipLowQualEnds((byte)44); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); + Assert.assertEquals(expected.getCigarString(), "2M2H"); + + // revert Qual sequence + readClipper.getRead().setBaseQualityString(QUALS); + } +}